diff --git a/2007/index.html b/2007/index.html new file mode 100644 index 000000000..21b0e02be --- /dev/null +++ b/2007/index.html @@ -0,0 +1,160 @@ + + + + + + +Posts for year 2007 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2007

+
+
+ + \ No newline at end of file diff --git a/2008/index.html b/2008/index.html new file mode 100644 index 000000000..b584bd233 --- /dev/null +++ b/2008/index.html @@ -0,0 +1,289 @@ + + + + + + +Posts for year 2008 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2008

+
+
+ + \ No newline at end of file diff --git a/2009/index.html b/2009/index.html new file mode 100644 index 000000000..53377bf5a --- /dev/null +++ b/2009/index.html @@ -0,0 +1,217 @@ + + + + + + +Posts for year 2009 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2009

+
+
+ + \ No newline at end of file diff --git a/2010/index.html b/2010/index.html new file mode 100644 index 000000000..602e9d44b --- /dev/null +++ b/2010/index.html @@ -0,0 +1,235 @@ + + + + + + +Posts for year 2010 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2010

+
+
+ + \ No newline at end of file diff --git a/2011/index.html b/2011/index.html new file mode 100644 index 000000000..55262a7cd --- /dev/null +++ b/2011/index.html @@ -0,0 +1,232 @@ + + + + + + +Posts for year 2011 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2011

+
+
+ + \ No newline at end of file diff --git a/2012/index.html b/2012/index.html new file mode 100644 index 000000000..c2d89e399 --- /dev/null +++ b/2012/index.html @@ -0,0 +1,235 @@ + + + + + + +Posts for year 2012 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2012

+
+
+ + \ No newline at end of file diff --git a/2013/index.html b/2013/index.html new file mode 100644 index 000000000..2b6647d35 --- /dev/null +++ b/2013/index.html @@ -0,0 +1,241 @@ + + + + + + +Posts for year 2013 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2013

+
+
+ + \ No newline at end of file diff --git a/2014/index.html b/2014/index.html new file mode 100644 index 000000000..ed7e97a9d --- /dev/null +++ b/2014/index.html @@ -0,0 +1,169 @@ + + + + + + +Posts for year 2014 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2014

+
+
+ + \ No newline at end of file diff --git a/2015/index.html b/2015/index.html new file mode 100644 index 000000000..d55568e73 --- /dev/null +++ b/2015/index.html @@ -0,0 +1,163 @@ + + + + + + +Posts for year 2015 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2015

+
+
+ + \ No newline at end of file diff --git a/2016/index.html b/2016/index.html new file mode 100644 index 000000000..9e35f8764 --- /dev/null +++ b/2016/index.html @@ -0,0 +1,163 @@ + + + + + + +Posts for year 2016 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2016

+
+
+ + \ No newline at end of file diff --git a/2017/index.html b/2017/index.html new file mode 100644 index 000000000..d4fc775e2 --- /dev/null +++ b/2017/index.html @@ -0,0 +1,142 @@ + + + + + + +Posts for year 2017 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2017

+
+
+ + \ No newline at end of file diff --git a/2018/index.html b/2018/index.html new file mode 100644 index 000000000..13f8f1720 --- /dev/null +++ b/2018/index.html @@ -0,0 +1,139 @@ + + + + + + +Posts for year 2018 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2018

+
+
+ + \ No newline at end of file diff --git a/2019/index.html b/2019/index.html new file mode 100644 index 000000000..fd1b7f38c --- /dev/null +++ b/2019/index.html @@ -0,0 +1,139 @@ + + + + + + +Posts for year 2019 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2019

+
+
+ + \ No newline at end of file diff --git a/2020/index.html b/2020/index.html new file mode 100644 index 000000000..35366dbf8 --- /dev/null +++ b/2020/index.html @@ -0,0 +1,130 @@ + + + + + + +Posts for year 2020 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2020

+
+
+ + \ No newline at end of file diff --git a/2021/index.html b/2021/index.html new file mode 100644 index 000000000..34ab3561d --- /dev/null +++ b/2021/index.html @@ -0,0 +1,133 @@ + + + + + + +Posts for year 2021 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2021

+
+
+ + \ No newline at end of file diff --git a/2022/index.html b/2022/index.html new file mode 100644 index 000000000..4a06d4a88 --- /dev/null +++ b/2022/index.html @@ -0,0 +1,142 @@ + + + + + + +Posts for year 2022 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2022

+
+
+ + \ No newline at end of file diff --git a/2023/index.html b/2023/index.html new file mode 100644 index 000000000..2e46f8472 --- /dev/null +++ b/2023/index.html @@ -0,0 +1,121 @@ + + + + + + +Posts for year 2023 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/2024/index.html b/2024/index.html new file mode 100644 index 000000000..4d098d525 --- /dev/null +++ b/2024/index.html @@ -0,0 +1,136 @@ + + + + + + +Posts for year 2024 | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+

Posts for year 2024

+
+
+ + \ No newline at end of file diff --git a/CNAME b/CNAME new file mode 100644 index 000000000..f42493892 --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +pypy.org \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 000000000..d120d4b22 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# gh-pages branch +These pages are generated automatically. Any changes will be overwritten during +the github action CI run. diff --git a/archive.html b/archive.html new file mode 100644 index 000000000..224aabb64 --- /dev/null +++ b/archive.html @@ -0,0 +1,175 @@ + + + + + + +Archive | PyPy + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/assets/css/code.css b/assets/css/code.css new file mode 100644 index 000000000..fc470e482 --- /dev/null +++ b/assets/css/code.css @@ -0,0 +1,86 @@ +/* code.css file generated by Nikola */ +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +pre.code .hll, .code .codetable .hll, .highlight pre .hll { background-color: #ffffcc } +pre.code , .code .codetable , .highlight pre { background: #f8f8f8; } +pre.code .c, .code .codetable .c, .highlight pre .c { color: #3D7B7B; font-style: italic } /* Comment */ +pre.code .err, .code .codetable .err, .highlight pre .err { border: 1px solid #FF0000 } /* Error */ +pre.code .k, .code .codetable .k, .highlight pre .k { color: #008000; font-weight: bold } /* Keyword */ +pre.code .o, .code .codetable .o, .highlight pre .o { color: #666666 } /* Operator */ +pre.code .ch, .code .codetable .ch, .highlight pre .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +pre.code .cm, .code .codetable .cm, .highlight pre .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +pre.code .cp, .code .codetable .cp, .highlight pre .cp { color: #9C6500 } /* Comment.Preproc */ +pre.code .cpf, .code .codetable .cpf, .highlight pre .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +pre.code .c1, .code .codetable .c1, .highlight pre .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +pre.code .cs, .code .codetable .cs, .highlight pre .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +pre.code .gd, .code .codetable .gd, .highlight pre .gd { 
color: #A00000 } /* Generic.Deleted */ +pre.code .ge, .code .codetable .ge, .highlight pre .ge { font-style: italic } /* Generic.Emph */ +pre.code .ges, .code .codetable .ges, .highlight pre .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +pre.code .gr, .code .codetable .gr, .highlight pre .gr { color: #E40000 } /* Generic.Error */ +pre.code .gh, .code .codetable .gh, .highlight pre .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +pre.code .gi, .code .codetable .gi, .highlight pre .gi { color: #008400 } /* Generic.Inserted */ +pre.code .go, .code .codetable .go, .highlight pre .go { color: #717171 } /* Generic.Output */ +pre.code .gp, .code .codetable .gp, .highlight pre .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +pre.code .gs, .code .codetable .gs, .highlight pre .gs { font-weight: bold } /* Generic.Strong */ +pre.code .gu, .code .codetable .gu, .highlight pre .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +pre.code .gt, .code .codetable .gt, .highlight pre .gt { color: #0044DD } /* Generic.Traceback */ +pre.code .kc, .code .codetable .kc, .highlight pre .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +pre.code .kd, .code .codetable .kd, .highlight pre .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +pre.code .kn, .code .codetable .kn, .highlight pre .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +pre.code .kp, .code .codetable .kp, .highlight pre .kp { color: #008000 } /* Keyword.Pseudo */ +pre.code .kr, .code .codetable .kr, .highlight pre .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +pre.code .kt, .code .codetable .kt, .highlight pre .kt { color: #B00040 } /* Keyword.Type */ +pre.code .m, .code .codetable .m, .highlight pre .m { color: #666666 } /* Literal.Number */ +pre.code .s, .code .codetable .s, .highlight pre .s { color: #BA2121 } /* Literal.String */ +pre.code .na, .code .codetable .na, .highlight pre 
.na { color: #687822 } /* Name.Attribute */ +pre.code .nb, .code .codetable .nb, .highlight pre .nb { color: #008000 } /* Name.Builtin */ +pre.code .nc, .code .codetable .nc, .highlight pre .nc { color: #0000FF; font-weight: bold } /* Name.Class */ +pre.code .no, .code .codetable .no, .highlight pre .no { color: #880000 } /* Name.Constant */ +pre.code .nd, .code .codetable .nd, .highlight pre .nd { color: #AA22FF } /* Name.Decorator */ +pre.code .ni, .code .codetable .ni, .highlight pre .ni { color: #717171; font-weight: bold } /* Name.Entity */ +pre.code .ne, .code .codetable .ne, .highlight pre .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +pre.code .nf, .code .codetable .nf, .highlight pre .nf { color: #0000FF } /* Name.Function */ +pre.code .nl, .code .codetable .nl, .highlight pre .nl { color: #767600 } /* Name.Label */ +pre.code .nn, .code .codetable .nn, .highlight pre .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */ +pre.code .nt, .code .codetable .nt, .highlight pre .nt { color: #008000; font-weight: bold } /* Name.Tag */ +pre.code .nv, .code .codetable .nv, .highlight pre .nv { color: #19177C } /* Name.Variable */ +pre.code .ow, .code .codetable .ow, .highlight pre .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */ +pre.code .w, .code .codetable .w, .highlight pre .w { color: #bbbbbb } /* Text.Whitespace */ +pre.code .mb, .code .codetable .mb, .highlight pre .mb { color: #666666 } /* Literal.Number.Bin */ +pre.code .mf, .code .codetable .mf, .highlight pre .mf { color: #666666 } /* Literal.Number.Float */ +pre.code .mh, .code .codetable .mh, .highlight pre .mh { color: #666666 } /* Literal.Number.Hex */ +pre.code .mi, .code .codetable .mi, .highlight pre .mi { color: #666666 } /* Literal.Number.Integer */ +pre.code .mo, .code .codetable .mo, .highlight pre .mo { color: #666666 } /* Literal.Number.Oct */ +pre.code .sa, .code .codetable .sa, .highlight pre .sa { color: #BA2121 } /* Literal.String.Affix */ +pre.code 
.sb, .code .codetable .sb, .highlight pre .sb { color: #BA2121 } /* Literal.String.Backtick */ +pre.code .sc, .code .codetable .sc, .highlight pre .sc { color: #BA2121 } /* Literal.String.Char */ +pre.code .dl, .code .codetable .dl, .highlight pre .dl { color: #BA2121 } /* Literal.String.Delimiter */ +pre.code .sd, .code .codetable .sd, .highlight pre .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +pre.code .s2, .code .codetable .s2, .highlight pre .s2 { color: #BA2121 } /* Literal.String.Double */ +pre.code .se, .code .codetable .se, .highlight pre .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +pre.code .sh, .code .codetable .sh, .highlight pre .sh { color: #BA2121 } /* Literal.String.Heredoc */ +pre.code .si, .code .codetable .si, .highlight pre .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +pre.code .sx, .code .codetable .sx, .highlight pre .sx { color: #008000 } /* Literal.String.Other */ +pre.code .sr, .code .codetable .sr, .highlight pre .sr { color: #A45A77 } /* Literal.String.Regex */ +pre.code .s1, .code .codetable .s1, .highlight pre .s1 { color: #BA2121 } /* Literal.String.Single */ +pre.code .ss, .code .codetable .ss, .highlight pre .ss { color: #19177C } /* Literal.String.Symbol */ +pre.code .bp, .code .codetable .bp, .highlight pre .bp { color: #008000 } /* Name.Builtin.Pseudo */ +pre.code .fm, .code .codetable .fm, .highlight pre .fm { color: #0000FF } /* Name.Function.Magic */ +pre.code .vc, .code .codetable .vc, .highlight pre .vc { color: #19177C } /* Name.Variable.Class */ +pre.code .vg, .code .codetable .vg, .highlight pre .vg { color: #19177C } /* Name.Variable.Global */ +pre.code .vi, .code .codetable .vi, .highlight pre .vi { color: #19177C } /* Name.Variable.Instance */ +pre.code .vm, .code .codetable .vm, .highlight pre .vm { color: #19177C } /* Name.Variable.Magic */ +pre.code .il, .code .codetable .il, .highlight pre .il { color: #666666 } /* 
Literal.Number.Integer.Long */ +.highlight table, .highlight tr, .highlight td, .code table, .code tr, .code td { border-spacing: 0; border-collapse: separate; padding: 0 } +.highlight pre, .code pre { white-space: pre-wrap; line-height: normal } +.highlighttable td.linenos, .codetable td.linenos { vertical-align: top; padding-left: 10px; padding-right: 10px; user-select: none; -webkit-user-select: none } +.highlighttable td.linenos code:before, .codetable td.linenos code:before { content: attr(data-line-number) } +.highlighttable td.code, .codetable td.code { overflow-wrap: normal; border-collapse: collapse } +.highlighttable td.code code, .codetable td.code code { overflow: unset; border: none; padding: 0; margin: 0; white-space: pre-wrap; line-height: unset; background: none } +.highlight .lineno.nonumber, .code .lineno.nonumber { list-style: none } +table.codetable, table.highlighttable { width: 100%;} +.codetable td.linenos, td.linenos { text-align: right; width: 3.5em; padding-right: 0.5em; background: rgba(127, 127, 127, 0.2) } +.codetable td.code, td.code { padding-left: 0.5em; } diff --git a/assets/css/html4css1.css b/assets/css/html4css1.css new file mode 100644 index 000000000..cc2933594 --- /dev/null +++ b/assets/css/html4css1.css @@ -0,0 +1 @@ +@import url("rst_base.css"); diff --git a/assets/css/img/loader.gif b/assets/css/img/loader.gif new file mode 100644 index 000000000..9c97738a2 Binary files /dev/null and b/assets/css/img/loader.gif differ diff --git a/assets/css/img/search.png b/assets/css/img/search.png new file mode 100644 index 000000000..8c6943d42 Binary files /dev/null and b/assets/css/img/search.png differ diff --git a/assets/css/ipython.min.css b/assets/css/ipython.min.css new file mode 100644 index 000000000..c1c6bc4e8 --- /dev/null +++ b/assets/css/ipython.min.css @@ -0,0 +1,9 @@ +/*! 
+* +* IPython base +* +*/.modal.fade .modal-dialog{-webkit-transform:translate(0,0);-ms-transform:translate(0,0);-o-transform:translate(0,0);transform:translate(0,0)}code{color:#000}pre{font-size:inherit;line-height:inherit}label{font-weight:normal}.border-box-sizing{box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box}.corner-all{border-radius:2px}.no-padding{padding:0}.hbox{display:-webkit-box;-webkit-box-orient:horizontal;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:horizontal;-moz-box-align:stretch;display:box;box-orient:horizontal;box-align:stretch;display:flex;flex-direction:row;align-items:stretch}.hbox>*{-webkit-box-flex:0;-moz-box-flex:0;box-flex:0;flex:none}.vbox{display:-webkit-box;-webkit-box-orient:vertical;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:vertical;-moz-box-align:stretch;display:box;box-orient:vertical;box-align:stretch;display:flex;flex-direction:column;align-items:stretch}.vbox>*{-webkit-box-flex:0;-moz-box-flex:0;box-flex:0;flex:none}.hbox.reverse,.vbox.reverse,.reverse{-webkit-box-direction:reverse;-moz-box-direction:reverse;box-direction:reverse;flex-direction:row-reverse}.hbox.box-flex0,.vbox.box-flex0,.box-flex0{-webkit-box-flex:0;-moz-box-flex:0;box-flex:0;flex:none;width:auto}.hbox.box-flex1,.vbox.box-flex1,.box-flex1{-webkit-box-flex:1;-moz-box-flex:1;box-flex:1;flex:1}.hbox.box-flex,.vbox.box-flex,.box-flex{-webkit-box-flex:1;-moz-box-flex:1;box-flex:1;flex:1}.hbox.box-flex2,.vbox.box-flex2,.box-flex2{-webkit-box-flex:2;-moz-box-flex:2;box-flex:2;flex:2}.box-group1{-webkit-box-flex-group:1;-moz-box-flex-group:1;box-flex-group:1}.box-group2{-webkit-box-flex-group:2;-moz-box-flex-group:2;box-flex-group:2}.hbox.start,.vbox.start,.start{-webkit-box-pack:start;-moz-box-pack:start;box-pack:start;justify-content:flex-start}.hbox.end,.vbox.end,.end{-webkit-box-pack:end;-moz-box-pack:end;box-pack:end;justify-content:flex-end}.hbox.center,.vbox.center,.center{-webkit-box-pack:center;-moz
-box-pack:center;box-pack:center;justify-content:center}.hbox.baseline,.vbox.baseline,.baseline{-webkit-box-pack:baseline;-moz-box-pack:baseline;box-pack:baseline;justify-content:baseline}.hbox.stretch,.vbox.stretch,.stretch{-webkit-box-pack:stretch;-moz-box-pack:stretch;box-pack:stretch;justify-content:stretch}.hbox.align-start,.vbox.align-start,.align-start{-webkit-box-align:start;-moz-box-align:start;box-align:start;align-items:flex-start}.hbox.align-end,.vbox.align-end,.align-end{-webkit-box-align:end;-moz-box-align:end;box-align:end;align-items:flex-end}.hbox.align-center,.vbox.align-center,.align-center{-webkit-box-align:center;-moz-box-align:center;box-align:center;align-items:center}.hbox.align-baseline,.vbox.align-baseline,.align-baseline{-webkit-box-align:baseline;-moz-box-align:baseline;box-align:baseline;align-items:baseline}.hbox.align-stretch,.vbox.align-stretch,.align-stretch{-webkit-box-align:stretch;-moz-box-align:stretch;box-align:stretch;align-items:stretch}div.error{margin:2em;text-align:center}div.error>h1{font-size:500%;line-height:normal}div.error>p{font-size:200%;line-height:normal}div.traceback-wrapper{text-align:left;max-width:800px;margin:auto}div.traceback-wrapper pre.traceback{max-height:600px;overflow:auto}/*! 
+* +* IPython notebook +* +*/.ansi-black-fg{color:#3e424d}.ansi-black-bg{background-color:#3e424d}.ansi-black-intense-fg{color:#282c36}.ansi-black-intense-bg{background-color:#282c36}.ansi-red-fg{color:#e75c58}.ansi-red-bg{background-color:#e75c58}.ansi-red-intense-fg{color:#b22b31}.ansi-red-intense-bg{background-color:#b22b31}.ansi-green-fg{color:#00a250}.ansi-green-bg{background-color:#00a250}.ansi-green-intense-fg{color:#007427}.ansi-green-intense-bg{background-color:#007427}.ansi-yellow-fg{color:#ddb62b}.ansi-yellow-bg{background-color:#ddb62b}.ansi-yellow-intense-fg{color:#b27d12}.ansi-yellow-intense-bg{background-color:#b27d12}.ansi-blue-fg{color:#208ffb}.ansi-blue-bg{background-color:#208ffb}.ansi-blue-intense-fg{color:#0065ca}.ansi-blue-intense-bg{background-color:#0065ca}.ansi-magenta-fg{color:#d160c4}.ansi-magenta-bg{background-color:#d160c4}.ansi-magenta-intense-fg{color:#a03196}.ansi-magenta-intense-bg{background-color:#a03196}.ansi-cyan-fg{color:#60c6c8}.ansi-cyan-bg{background-color:#60c6c8}.ansi-cyan-intense-fg{color:#258f8f}.ansi-cyan-intense-bg{background-color:#258f8f}.ansi-white-fg{color:#c5c1b4}.ansi-white-bg{background-color:#c5c1b4}.ansi-white-intense-fg{color:#a1a6b2}.ansi-white-intense-bg{background-color:#a1a6b2}.ansi-bold{font-weight:bold}.ansi-underline{text-decoration:underline}.ansi-inverse{outline:.5px 
dotted}.ansibold{font-weight:bold}.ansiblack{color:black}.ansired{color:darkred}.ansigreen{color:darkgreen}.ansiyellow{color:#c4a000}.ansiblue{color:darkblue}.ansipurple{color:darkviolet}.ansicyan{color:steelblue}.ansigray{color:gray}.ansibgblack{background-color:black}.ansibgred{background-color:red}.ansibggreen{background-color:green}.ansibgyellow{background-color:yellow}.ansibgblue{background-color:blue}.ansibgpurple{background-color:magenta}.ansibgcyan{background-color:cyan}.ansibggray{background-color:gray}div.cell{display:-webkit-box;-webkit-box-orient:vertical;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:vertical;-moz-box-align:stretch;display:box;box-orient:vertical;box-align:stretch;display:flex;flex-direction:column;align-items:stretch;border-radius:2px;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;border-width:1px;border-style:solid;border-color:transparent;width:100%;padding:5px;margin:0;outline:0;position:relative;overflow:visible}div.cell:before{position:absolute;display:block;top:-1px;left:-1px;width:5px;height:calc(100%+2px);content:'';background:transparent}div.cell.jupyter-soft-selected{border-left-color:#e3f2fd;border-left-width:1px;padding-left:5px;border-right-color:#e3f2fd;border-right-width:1px;background:#e3f2fd}@media print{div.cell.jupyter-soft-selected{border-color:transparent}}div.cell.selected,div.cell.selected.jupyter-soft-selected{border-color:#ababab}div.cell.selected:before,div.cell.selected.jupyter-soft-selected:before{position:absolute;display:block;top:-1px;left:-1px;width:5px;height:calc(100%+2px);content:'';background:#42a5f5}@media print{div.cell.selected,div.cell.selected.jupyter-soft-selected{border-color:transparent}}.edit_mode div.cell.selected{border-color:#66bb6a}.edit_mode div.cell.selected:before{position:absolute;display:block;top:-1px;left:-1px;width:5px;height:calc(100%+2px);content:'';background:#66bb6a}@media print{.edit_mode 
div.cell.selected{border-color:transparent}}.prompt{min-width:14ex;padding:.4em;margin:0;font-family:monospace;text-align:right;line-height:1.21429em;-webkit-touch-callout:none;-webkit-user-select:none;-khtml-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;cursor:default}@media(max-width:540px){.prompt{text-align:left}}div.inner_cell{min-width:0;display:-webkit-box;-webkit-box-orient:vertical;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:vertical;-moz-box-align:stretch;display:box;box-orient:vertical;box-align:stretch;display:flex;flex-direction:column;align-items:stretch;-webkit-box-flex:1;-moz-box-flex:1;box-flex:1;flex:1}div.input_area>div.highlight>pre{border:1px solid #cfcfcf;line-height:1.21429em}div.prompt:empty{padding-top:0;padding-bottom:0}div.unrecognized_cell{padding:5px 5px 5px 0;display:-webkit-box;-webkit-box-orient:horizontal;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:horizontal;-moz-box-align:stretch;display:box;box-orient:horizontal;box-align:stretch;display:flex;flex-direction:row;align-items:stretch}div.unrecognized_cell .inner_cell{border-radius:2px;padding:5px;font-weight:bold;color:red;border:1px solid #cfcfcf;background:#eaeaea}div.unrecognized_cell .inner_cell a{color:inherit;text-decoration:none}div.unrecognized_cell .inner_cell a:hover{color:inherit;text-decoration:none}@media(max-width:540px){div.unrecognized_cell>div.prompt{display:none}}@media 
print{div.code_cell{page-break-inside:avoid}}div.input{page-break-inside:avoid;display:-webkit-box;-webkit-box-orient:horizontal;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:horizontal;-moz-box-align:stretch;display:box;box-orient:horizontal;box-align:stretch;display:flex;flex-direction:row;align-items:stretch}@media(max-width:540px){div.input{display:-webkit-box;-webkit-box-orient:vertical;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:vertical;-moz-box-align:stretch;display:box;box-orient:vertical;box-align:stretch;display:flex;flex-direction:column;align-items:stretch}}div.input_prompt{color:#303f9f;border-top:1px solid transparent}div.input_area>div.highlight{margin:0;border:0;padding:0;background-color:transparent}div.input_area>div.highlight>pre{margin:0;border:0;padding:.4em}.CodeMirror{line-height:1.21429em;font-size:14px;height:auto;background:0}.CodeMirror-scroll{overflow-y:hidden;overflow-x:auto}.CodeMirror-lines{padding:.4em}.CodeMirror-linenumber{padding:0 8px 0 4px}.CodeMirror-gutters{border-bottom-left-radius:2px;border-top-left-radius:2px}.CodeMirror pre{padding:0;border:0;border-radius:0}.highlight-base{color:#000}.highlight-variable{color:#000}.highlight-variable-2{color:#1a1a1a}.highlight-variable-3{color:#333}.highlight-string{color:#ba2121}.highlight-comment{color:#408080;font-style:italic}.highlight-number{color:#080}.highlight-atom{color:#88F}.highlight-keyword{color:#008000;font-weight:bold}.highlight-builtin{color:#008000}.highlight-error{color:red}.highlight-operator{color:#a2f;font-weight:bold}.highlight-meta{color:#a2f}.highlight-def{color:#00f}.highlight-string-2{color:#f50}.highlight-qualifier{color:#555}.highlight-bracket{color:#997}.highlight-tag{color:#170}.highlight-attribute{color:#00c}.highlight-header{color:blue}.highlight-quote{color:#090}.highlight-link{color:#00c}.cm-s-ipython span.cm-keyword{color:#008000;font-weight:bold}.cm-s-ipython span.cm-atom{color:#88F}.cm-s-ipython 
span.cm-number{color:#080}.cm-s-ipython span.cm-def{color:#00f}.cm-s-ipython span.cm-variable{color:#000}.cm-s-ipython span.cm-operator{color:#a2f;font-weight:bold}.cm-s-ipython span.cm-variable-2{color:#1a1a1a}.cm-s-ipython span.cm-variable-3{color:#333}.cm-s-ipython span.cm-comment{color:#408080;font-style:italic}.cm-s-ipython span.cm-string{color:#ba2121}.cm-s-ipython span.cm-string-2{color:#f50}.cm-s-ipython span.cm-meta{color:#a2f}.cm-s-ipython span.cm-qualifier{color:#555}.cm-s-ipython span.cm-builtin{color:#008000}.cm-s-ipython span.cm-bracket{color:#997}.cm-s-ipython span.cm-tag{color:#170}.cm-s-ipython span.cm-attribute{color:#00c}.cm-s-ipython span.cm-header{color:blue}.cm-s-ipython span.cm-quote{color:#090}.cm-s-ipython span.cm-link{color:#00c}.cm-s-ipython span.cm-error{color:red}.cm-s-ipython span.cm-tab{background:url();background-position:right;background-repeat:no-repeat}div.output_wrapper{position:relative;display:-webkit-box;-webkit-box-orient:vertical;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:vertical;-moz-box-align:stretch;display:box;box-orient:vertical;box-align:stretch;display:flex;flex-direction:column;align-items:stretch;z-index:1}div.output_scroll{height:24em;width:100%;overflow:auto;border-radius:2px;-webkit-box-shadow:inset 0 2px 8px rgba(0,0,0,0.8);box-shadow:inset 0 2px 8px rgba(0,0,0,0.8);display:block}div.output_collapsed{margin:0;padding:0;display:-webkit-box;-webkit-box-orient:vertical;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:vertical;-moz-box-align:stretch;display:box;box-orient:vertical;box-align:stretch;display:flex;flex-direction:column;align-items:stretch}div.out_prompt_overlay{height:100%;padding:0 .4em;position:absolute;border-radius:2px}div.out_prompt_overlay:hover{-webkit-box-shadow:inset 0 0 1px #000;box-shadow:inset 0 0 1px 
#000;background:rgba(240,240,240,0.5)}div.output_prompt{color:#d84315}div.output_area{padding:0;page-break-inside:avoid;display:-webkit-box;-webkit-box-orient:horizontal;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:horizontal;-moz-box-align:stretch;display:box;box-orient:horizontal;box-align:stretch;display:flex;flex-direction:row;align-items:stretch}div.output_area .MathJax_Display{text-align:left !important}div.output_area .rendered_html table{margin-left:0;margin-right:0}div.output_area .rendered_html img{margin-left:0;margin-right:0}div.output_area img,div.output_area svg{max-width:100%;height:auto}div.output_area img.unconfined,div.output_area svg.unconfined{max-width:none}div.output_area .mglyph>img{max-width:none}.output{display:-webkit-box;-webkit-box-orient:vertical;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:vertical;-moz-box-align:stretch;display:box;box-orient:vertical;box-align:stretch;display:flex;flex-direction:column;align-items:stretch}@media(max-width:540px){div.output_area{display:-webkit-box;-webkit-box-orient:vertical;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:vertical;-moz-box-align:stretch;display:box;box-orient:vertical;box-align:stretch;display:flex;flex-direction:column;align-items:stretch}}div.output_area pre{margin:0;padding:0;border:0;vertical-align:baseline;color:black;background-color:transparent;border-radius:0}div.output_subarea{overflow-x:auto;padding:.4em;-webkit-box-flex:1;-moz-box-flex:1;box-flex:1;flex:1;max-width:calc(100% - 14ex)}div.output_scroll div.output_subarea{overflow-x:visible}div.output_text{text-align:left;color:#000;line-height:1.21429em}div.output_stderr{background:#fdd}div.output_latex{text-align:left}div.output_javascript:empty{padding:0}.js-error{color:darkred}div.raw_input_container{line-height:1.21429em;padding-top:5px}input.raw_input{font-family:monospace;font-size:inherit;color:inherit;width:auto;vertical-align:baseline;padding:0 .25em;margin:0 
.25em}input.raw_input:focus{box-shadow:none}p.p-space{margin-bottom:10px}div.output_unrecognized{padding:5px;font-weight:bold;color:red}div.output_unrecognized a{color:inherit;text-decoration:none}div.output_unrecognized a:hover{color:inherit;text-decoration:none}.rendered_html{color:#000}.rendered_html em{font-style:italic}.rendered_html strong{font-weight:bold}.rendered_html u{text-decoration:underline}.rendered_html :link{text-decoration:underline}.rendered_html :visited{text-decoration:underline}.rendered_html h1{font-size:185.7%;margin:1.08em 0 0 0;font-weight:bold;line-height:1.0}.rendered_html h2{font-size:157.1%;margin:1.27em 0 0 0;font-weight:bold;line-height:1.0}.rendered_html h3{font-size:128.6%;margin:1.55em 0 0 0;font-weight:bold;line-height:1.0}.rendered_html h4{font-size:100%;margin:2em 0 0 0;font-weight:bold;line-height:1.0}.rendered_html h5{font-size:100%;margin:2em 0 0 0;font-weight:bold;line-height:1.0;font-style:italic}.rendered_html h6{font-size:100%;margin:2em 0 0 0;font-weight:bold;line-height:1.0;font-style:italic}.rendered_html h1:first-child{margin-top:.538em}.rendered_html h2:first-child{margin-top:.636em}.rendered_html h3:first-child{margin-top:.777em}.rendered_html h4:first-child{margin-top:1em}.rendered_html h5:first-child{margin-top:1em}.rendered_html h6:first-child{margin-top:1em}.rendered_html ul:not(.list-inline),.rendered_html ol:not(.list-inline){padding-left:2em}.rendered_html ul{list-style:disc}.rendered_html ul ul{list-style:square}.rendered_html ul ul ul{list-style:circle}.rendered_html ol{list-style:decimal}.rendered_html ol ol{list-style:upper-alpha}.rendered_html ol ol ol{list-style:lower-alpha}.rendered_html ol ol ol ol{list-style:lower-roman}.rendered_html ol ol ol ol ol{list-style:decimal}.rendered_html *+ul{margin-top:1em}.rendered_html *+ol{margin-top:1em}.rendered_html hr{color:black;background-color:black}.rendered_html pre{margin:1em 2em}.rendered_html pre,.rendered_html 
code{border:0;background-color:#fff;color:#000;font-size:100%;padding:0}.rendered_html blockquote{margin:1em 2em}.rendered_html table{margin-left:auto;margin-right:auto;border:0;border-collapse:collapse;border-spacing:0;color:black;font-size:12px;table-layout:fixed}.rendered_html thead{border-bottom:1px solid black;vertical-align:bottom}.rendered_html tr,.rendered_html th,.rendered_html td{text-align:right;vertical-align:middle;padding:.5em .5em;line-height:normal;white-space:normal;max-width:none;border:0}.rendered_html th{font-weight:bold}.rendered_html tbody tr:nth-child(odd){background:#f5f5f5}.rendered_html tbody tr:hover{background:rgba(66,165,245,0.2)}.rendered_html *+table{margin-top:1em}.rendered_html p{text-align:left}.rendered_html *+p{margin-top:1em}.rendered_html img{display:block;margin-left:auto;margin-right:auto}.rendered_html *+img{margin-top:1em}.rendered_html img,.rendered_html svg{max-width:100%;height:auto}.rendered_html img.unconfined,.rendered_html svg.unconfined{max-width:none}.rendered_html .alert{margin-bottom:initial}.rendered_html *+.alert{margin-top:1em}div.text_cell{display:-webkit-box;-webkit-box-orient:horizontal;-webkit-box-align:stretch;display:-moz-box;-moz-box-orient:horizontal;-moz-box-align:stretch;display:box;box-orient:horizontal;box-align:stretch;display:flex;flex-direction:row;align-items:stretch}@media(max-width:540px){div.text_cell>div.prompt{display:none}}div.text_cell_render{outline:0;resize:none;width:inherit;border-style:none;padding:.5em .5em .5em .4em;color:#000;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box}a.anchor-link:link{text-decoration:none;padding:0 20px;visibility:hidden}h1:hover .anchor-link,h2:hover .anchor-link,h3:hover .anchor-link,h4:hover .anchor-link,h5:hover .anchor-link,h6:hover .anchor-link{visibility:visible}.text_cell.rendered .input_area{display:none}.text_cell.rendered .rendered_html{overflow-x:auto;overflow-y:hidden}.text_cell.rendered .rendered_html 
tr,.text_cell.rendered .rendered_html th,.text_cell.rendered .rendered_html td{max-width:none}.text_cell.unrendered .text_cell_render{display:none}.text_cell .dropzone .input_area{border:2px dashed #bababa;margin:-1px}.cm-header-1,.cm-header-2,.cm-header-3,.cm-header-4,.cm-header-5,.cm-header-6{font-weight:bold;font-family:"Helvetica Neue",Helvetica,Arial,sans-serif}.cm-header-1{font-size:185.7%}.cm-header-2{font-size:157.1%}.cm-header-3{font-size:128.6%}.cm-header-4{font-size:110%}.cm-header-5{font-size:100%;font-style:italic}.cm-header-6{font-size:100%;font-style:italic} diff --git a/assets/css/nikola_ipython.css b/assets/css/nikola_ipython.css new file mode 100644 index 000000000..aba37da64 --- /dev/null +++ b/assets/css/nikola_ipython.css @@ -0,0 +1,56 @@ +div.prompt { + padding: 0.6em; + font-size: 13px; + background-color: #E9E9E9; + margin-right: 1em; + -webkit-border-radius: 3px; + -moz-border-radius: 3px; + border-radius: 3px; +} + +div.output_prompt { + /* 5px right shift to account for margin in parent container */ + margin: 0 5px 0 0px; +} + +div.output_area pre { + font-size: 13px; +} + +div.text_cell_render { + padding: 0px; + color: #333333; +} + +.rendered_html p { + text-align: left; +} + +.rendered_html ul { + margin: 0 0 12px 25px; +} + +.rendered_html :visited { + text-decoration: none; +} + +.rendered_html :link { + text-decoration: none; +} + +.rendered_html pre, .rendered_html code { + background-color: #DDDDDD; + margin: 1em 0em; + font-size: 14px; +} + +.rendered_html pre { + padding-left: 0.5em; + padding-right: 0.5em; + padding-top: 0.05em; + padding-bottom: 0.05em; +} + +.page-content > .content p { + margin: 0 0 0px; +} diff --git a/assets/css/nikola_rst.css b/assets/css/nikola_rst.css new file mode 100644 index 000000000..71a0f8439 --- /dev/null +++ b/assets/css/nikola_rst.css @@ -0,0 +1,79 @@ +div.admonition, div.attention, div.caution, div.danger, div.error, +div.hint, div.important, div.note, div.tip, div.warning, div.sidebar, 
+div.system-message { +/* stolen from Bootstrap 4 (.card) */ + margin-bottom: 2rem; + position: relative; + display: -webkit-box; + display: -ms-flexbox; + display: flex; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -ms-flex-direction: column; + flex-direction: column; + min-width: 0; + word-wrap: break-word; + background-color: #fff; + color: #212529; + background-clip: border-box; + border: 1px solid rgba(0,0,0,.125); + border-radius: .25rem; + padding: 0; +} + +div.attention, div.caution, div.danger, div.error, div.warning { + /* stolen from Bootstrap 3 (.border-danger) */ + border-color: #dc3545!important; +} + +div.admonition p, div.hint p, +div.important p, div.note p, +div.tip p, div.sidebar p, +div.attention p, div.caution p, +div.danger p, div.error p, +div.warning p, div.system-message p { + padding-left: 1rem; + padding-right: 1rem; +} + +div.admonition p.admonition-title, div.hint p.admonition-title, +div.important p.admonition-title, div.note p.admonition-title, +div.tip p.admonition-title, div.sidebar p.sidebar-title, +div.attention p.admonition-title, div.caution p.admonition-title, +div.danger p.admonition-title, div.error p.admonition-title, +div.warning p.admonition-title, div.system-message p.system-message-title { +/* stolen from Bootstrap 4 (.card .card-header) */ + font-weight: 400; + font-size: 1.25rem; + padding: .75rem 1.25rem; + margin-bottom: 1rem; + background-color: rgba(0,0,0,.03); + border-bottom: 1px solid rgba(0,0,0,.125); +} + +div.attention p.admonition-title, div.caution p.admonition-title, +div.danger p.admonition-title, div.error p.admonition-title, +div.warning p.admonition-title, div.system-message p.system-message-title { + /* stolen from Bootstrap 4 (.card .card-header .bg-danger) */ + background-color: #dc3545; + color: white; +} + +div.sidebar { + margin-right: 0; +} + +/* Improved margin overrides */ +div.topic, +pre.literal-block, +pre.doctest-block, +pre.math, +pre.code, +div.code { + margin-left: 
1rem; + margin-right: 1rem; +} + +div.code { + margin-bottom: 1rem; +} diff --git a/assets/css/rst.css b/assets/css/rst.css new file mode 100644 index 000000000..03424a8dd --- /dev/null +++ b/assets/css/rst.css @@ -0,0 +1,2 @@ +@import url("rst_base.css"); +@import url("nikola_rst.css"); diff --git a/assets/css/rst_base.css b/assets/css/rst_base.css new file mode 100644 index 000000000..51f92b463 --- /dev/null +++ b/assets/css/rst_base.css @@ -0,0 +1,424 @@ +/* Minimal style sheet for the HTML output of Docutils. */ +/* */ +/* :Author: Günter Milde, based on html4css1.css by David Goodger */ +/* :Id: $Id: minimal.css 8642 2021-03-26 13:51:14Z milde $ */ +/* :Copyright: © 2015 Günter Milde. */ +/* :License: Released under the terms of the `2-Clause BSD license`_, */ +/* in short: */ +/* */ +/* Copying and distribution of this file, with or without modification, */ +/* are permitted in any medium without royalty provided the copyright */ +/* notice and this notice are preserved. */ +/* */ +/* This file is offered as-is, without any warranty. */ +/* */ +/* .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause */ + +/* This CSS2.1_ stylesheet defines rules for Docutils elements without */ +/* HTML equivalent. It is required to make the document semantic visible. */ +/* */ +/* .. _CSS2.1: http://www.w3.org/TR/CSS2 */ +/* .. 
_validates: http://jigsaw.w3.org/css-validator/validator$link */ + +/* titles */ +p.topic-title, +p.admonition-title, +p.system-message-title { + font-weight: bold; +} +p.sidebar-title, +p.rubric { + font-weight: bold; + font-size: larger; +} +p.rubric { + color: maroon; +} +p.subtitle, +p.section-subtitle, +p.sidebar-subtitle { + font-weight: bold; + margin-top: -0.5em; +} +h1 + p.subtitle { + font-size: 1.6em; +} +h2 + p.section-subtitle, a.toc-backref { + color: black; + text-decoration: none; +} + +/* Warnings, Errors */ +.system-messages h2, +.system-message-title, +span.problematic { + color: red; +} + +/* Inline Literals */ +.docutils.literal { + font-family: monospace; + white-space: pre-wrap; +} +/* do not wrap at hyphens and similar: */ +.literal > span.pre { white-space: nowrap; } + +/* Lists */ + +/* compact and simple lists: no margin between items */ +.simple li, .simple ul, .simple ol, +.compact li, .compact ul, .compact ol, +.simple > li p, dl.simple > dd, +.compact > li p, dl.compact > dd { + margin-top: 0; + margin-bottom: 0; +} + +/* Table of Contents */ +.topic.contents { margin: 0.5em 0; } +.topic.contents ul.auto-toc { + list-style-type: none; + padding-left: 1.5em; +} + +/* Enumerated Lists */ +ol.arabic { list-style: decimal } +ol.loweralpha { list-style: lower-alpha } +ol.upperalpha { list-style: upper-alpha } +ol.lowerroman { list-style: lower-roman } +ol.upperroman { list-style: upper-roman } + +/* Definition Lists and Derivatives */ +dt .classifier { font-style: italic } +dt .classifier:before { + font-style: normal; + margin: 0.5em; + content: ":"; +} +/* Field Lists and similar */ +/* bold field name, content starts on the same line */ +dl.field-list > dt, +dl.option-list > dt, +dl.docinfo > dt, +dl.footnote > dt, +dl.citation > dt { + font-weight: bold; + clear: left; + float: left; + margin: 0; + padding: 0; + padding-right: 0.5em; +} +/* Offset for field content (corresponds to the --field-name-limit option) */ +dl.field-list > dd, 
+dl.option-list > dd, +dl.docinfo > dd { + margin-left: 9em; /* ca. 14 chars in the test examples, fit all Docinfo fields */ +} +/* start field-body on a new line after long field names */ +dl.field-list > dd > *:first-child, +dl.option-list > dd > *:first-child +{ + display: inline-block; + width: 100%; + margin: 0; +} +/* field names followed by a colon */ +dl.field-list > dt:after, +dl.docinfo > dt:after { + content: ":"; +} + +/* Bibliographic Fields (docinfo) */ +dl.docinfo pre.address { + font: inherit; + margin: 0.5em 0; +} +dl.docinfo > dd.authors > p { margin: 0; } + +/* Option Lists */ +dl.option-list > dt { font-weight: normal; } +span.option { white-space: nowrap; } + +/* Footnotes and Citations */ +dl.footnote.superscript > dd { margin-left: 1em; } +dl.footnote.brackets > dd { margin-left: 2em; } +dl.footnote > dt { font-weight: normal; } +a.footnote-reference.brackets:before, +dt.label > span.brackets:before { content: "["; } +a.footnote-reference.brackets:after, +dt.label > span.brackets:after { content: "]"; } +a.footnote-reference.superscript, +dl.footnote.superscript > dt.label { + vertical-align: super; + font-size: small; +} +dt.label > span.fn-backref { + margin-left: 0.2em; + font-weight: normal; +} +dt.label > span.fn-backref > a { font-style: italic; } + +/* Alignment */ +.align-left { + text-align: left; + margin-right: auto; +} +.align-center { + clear: both; + text-align: center; + margin-left: auto; + margin-right: auto; +} +.align-right { + text-align: right; + margin-left: auto; +} +.align-top { vertical-align: top; } +.align-middle { vertical-align: middle; } +.align-bottom { vertical-align: bottom; } + +/* reset inner alignment in figures and tables */ +figure.align-left, figure.align-right, +table.align-left, table.align-center, table.align-right { + text-align: inherit; +} + +/* Text Blocks */ +blockquote, +div.topic, +aside.topic { + margin: 1em 2em; +} +.sidebar, +.admonition, +.system-message { + border: thin solid; + margin: 
1em 2em; + padding: 0.5em 1em; +} +.sidebar { + width: 30%; + max-width: 26em; + float: right; + clear: right; +} +div.line-block { display: block; } +div.line-block div.line-block, pre { margin-left: 2em; } + +/* Code line numbers: dropped when copying text from the page */ +pre.code .ln { display: none; } +pre.code code:before { + content: attr(data-lineno); /* …, none) fallback not supported by any browser */ + color: gray; +} + +/* Tables */ + +td > p:first-child, th > p:first-child { margin-top: 0; } +td > p, th > p { margin-bottom: 0; } + +.borderless td, .borderless th { + border: 0; + padding: 0; + padding-right: 0.5em /* separate table cells */ +} + +/* CSS31_ style sheet for the output of Docutils HTML writers. */ +/* Rules for easy reading and pre-defined style variants. */ +/* */ +/* :Author: Günter Milde, based on html4css1.css by David Goodger */ +/* :Id: $Id: plain.css 8636 2021-03-19 00:23:33Z milde $ */ +/* :Copyright: © 2015 Günter Milde. */ +/* :License: Released under the terms of the `2-Clause BSD license`_, */ +/* in short: */ +/* */ +/* Copying and distribution of this file, with or without modification, */ +/* are permitted in any medium without royalty provided the copyright */ +/* notice and this notice are preserved. */ +/* */ +/* This file is offered as-is, without any warranty. */ +/* */ +/* .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause */ +/* .. 
_CSS3: http://www.w3.org/TR/CSS3 */ + + +/* Document Structure */ +/* ****************** */ + +/* Transitions */ +hr.docutils { + width: 80%; + margin-top: 1em; + margin-bottom: 1em; + clear: both; +} + +dl > dd { + margin-bottom: 0.5em; +} + +/* Lists */ +/* ===== */ + +/* Separate list entries in compound lists */ +/* (dangling "dl > dd, ol > li," selector list dropped: it had no rule body and merged into the rule below) */ + +/* Definition Lists */ +/* Indent lists nested in definition lists */ +/* (:only-child is new in CSS 3) */ +dd > ul:only-child, dd > ol:only-child { padding-left: 1em; } + +/* Description Lists */ +/* styled like in most dictionaries, encyclopedias etc. */ +dl.description > dt { + font-weight: bold; + clear: left; + float: left; + margin: 0; + padding: 0; + padding-right: 0.5em; +} + +/* Field Lists */ + +/* example for custom field-name width */ +dl.field-list.narrow > dd { + margin-left: 5em; +} +/* run-in: start field-body on same line after long field names */ +dl.field-list.run-in > dd p { + display: block; +} + +/* Bibliographic Fields */ + +/* generally, bibliographic fields use special definition list dl.docinfo */ +/* but dedication and abstract are placed into "topic" divs */ +div.abstract p.topic-title { + text-align: center; +} +div.dedication { + margin: 2em 5em; + text-align: center; + font-style: italic; +} +div.dedication p.topic-title { + font-style: normal; +} + +/* Text Blocks */ +/* =========== */ + +/* Literal Blocks */ +pre.literal-block, pre.doctest-block, +pre.math, pre.code { + font-family: monospace; +} + +/* Block Quotes */ +blockquote > table, +div.topic > table { + margin-top: 0; + margin-bottom: 0; +} +blockquote p.attribution, +div.topic p.attribution { + text-align: right; + margin-left: 20%; +} + +/* Tables */ +/* ====== */ + +/* th { vertical-align: bottom; } */ + +table tr { text-align: left; } + +/* "booktabs" style (no vertical lines) */ +table.booktabs { + border: 0; + border-top: 2px solid; + border-bottom: 2px solid; + border-collapse: collapse; +} +table.booktabs * { + border: 0; +} 
+table.booktabs th { + border-bottom: thin solid; +} + +/* numbered tables (counter defined in div.document) */ +table.numbered > caption:before { + counter-increment: table; + content: "Table " counter(table) ": "; + font-weight: bold; +} + +/* Explicit Markup Blocks */ +/* ====================== */ + +/* Footnotes and Citations */ +/* ----------------------- */ + +/* line on the left */ +dl.footnote { + padding-left: 1ex; + border-left: solid; + border-left-width: thin; +} + +/* Directives */ +/* ---------- */ + +/* Body Elements */ +/* ~~~~~~~~~~~~~ */ + +/* Images and Figures */ + +/* let content flow to the side of aligned images and figures */ +figure.align-left, +img.align-left, +video.align-left, +object.align-left { + clear: left; + float: left; + margin-right: 1em; +} +figure.align-right, +img.align-right, +video.align-right, +object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +/* Numbered figures */ +figure.numbered > figcaption > p:before { + counter-increment: figure; + content: "Figure " counter(figure) ": "; + font-weight: bold; +} + +/* Admonitions and System Messages */ +.caution p.admonition-title, +.attention p.admonition-title, +.danger p.admonition-title, +.error p.admonition-title, +.warning p.admonition-title, +div.error { + color: red; +} + +/* Sidebar */ +/* Move right. In a layout with fixed margins, */ +/* it can be moved into the margin. 
*/ +aside.sidebar { + width: 30%; + max-width: 26em; + margin-left: 1em; + margin-right: -2%; + background-color: #ffffee; +} diff --git a/assets/css/styles.css b/assets/css/styles.css new file mode 100644 index 000000000..bebf1d6e6 --- /dev/null +++ b/assets/css/styles.css @@ -0,0 +1,375 @@ +:root { + --content-width: 1000px; + --breakpoint: 799px; + --nav-height: 70px; + --nav-background: #d0dad5; + --nav-font-color: darkolivegreen; + --link-hover-color: #e0dad5; +} + +/* Links */ +:link { + color: #3344dd; + text-decoration: underline; +} +:visited { + color: #3344dd; +} +:link:hover, :visited:hover { + color: #3344dd; +} + +#brand :link, +#brand :visited { + color: inherit; +} + +pre.literal-block { + margin-left: 0; + margin-right: 0; + padding: 1rem 1.5rem; + border-top: 1px solid black; + border-bottom: 1px solid black; +} + +pre.literal-block *:last-child { + margin-bottom: 0; +} + +.myfooter { + overflow: auto; + display: flex; + flex-direction: row; +} + +.myfooter div { + height: 50px; +} + +#footer { + border-top: 1px solid #4F5151; + clear: both; +} + +#container { + margin: 1em auto; + max-width: 1200px; +} + +.post { + float: left; + width: 68%; +} + + +div.sidebar { + float: left; + width: 27%; +} + +.logotext { + padding-top: 4px; +} + +#toplogo { + padding-top: 18px; +} + +.comment { + border-top: 1px solid #4F5151; + margin-top: 1em; + padding: .5em 0; +} + +/* For flex layout */ +.row { + display: flex; + flex-direction: column; + padding: 0; + width: 100%; } + .row .column { + display: block; + flex: 1 1 auto; + margin-left: 0; + max-width: 100%; + width: 100%; } + +@media (min-width: 40rem) { + .row { + flex-direction: row; + margin-left: -1.0rem; + width: calc(100% + 2.0rem); } + .row .column { + margin-bottom: inherit; + padding: 0 1.0rem; } } + +@media screen and (max-width: 40rem) { + .row .column { + text-align: center; } + div.sidebar { + display: none; + } + #container { + max-width: 100%; + } + div.post { + width: 95%; + } +} + + 
+.pb-4 { + padding-bottom: 6rem; } + +.text-sm { + font-size: 1rem; } + +.text-lg { + font-size: 2rem; } + + +/* For the responsive navbar, adapted from https://www.taniarascia.com/responsive-dropdown-navigation-bar */ + +/* (misplaced '@charset "UTF-8";' removed: it is only honoured as the very first bytes of a stylesheet and was ignored here) */ +.navigation { + height: 70px; + background: var(--nav-background); +} + +.brand { + position: absolute; + padding-left: 20px; + float: left; + line-height: 70px; + text-transform: uppercase; + font-size: 1.4em; +} +.brand a, +.brand a:visited { + color: var(--nav-font-color); + text-decoration: none; +} + +.content { + margin: 0 auto; +} + +.nav-container nav { + float: right; + background: var(--nav-background); +} + +.nav-container th { + padding: 0.5em; +} + +.nav-container ul { + list-style: none; + margin: 0; + padding: 0; +} +.nav-container ul li { + float: left; + position: relative; +} +.nav-container ul li a:link, +.nav-container ul li a, +.nav-container ul li a:visited { + display: block; + padding: 0 20px; + line-height: 70px; + background: var(--nav-background); + color: var(--nav-font-color); + text-decoration: none; +} +.nav-container ul li a:hover, +.nav-container ul li a:link:hover, +.nav-container ul li a:visited:hover { + background: var(--link-hover-color); + color: var(--nav-font-color); +} +.nav-container ul li a:not(:only-child):after, +.nav-container ul li a:visited:not(:only-child):after { + padding-left: 4px; + content: ' ▾'; +} +.nav-container ul li ul li { + min-width: 220px; +} +.nav-container ul li ul li a { + padding: 15px; + line-height: 20px; +} + +.nav-dropdown { + position: absolute; + display: none; + z-index: 1; + box-shadow: 0 3px 12px rgba(0, 0, 0, 0.15); +} + +/* Mobile navigation */ +.nav-mobile { + display: none; + position: absolute; + top: 0; + right: 0; + background: var(--nav-background); + height: 70px; + width: 70px; +} + +body { + color: #383939; + margin: 0; + padding: 0; +} + + +@media only screen and (max-width: 798px) { + #container { + margin: 0; + padding: 0; + max-width: 
100%; + } + + #content { + margin: 10px; + padding: 0; + } + + .nav-mobile { + display: block; + } + + nav { + width: 100%; + padding: 70px 0 15px; + background: var(--nav-background); + } + nav ul { + display: none; + } + nav ul li { + float: none; + } + nav ul li a { + padding: 15px; + line-height: 20px; + } + nav ul li ul li a { + padding-left: 30px; + } + + .nav-dropdown { + position: static; + padding: 0 0 0 15px; + } +} + +@media screen and (min-width: 799px) { + .nav-list { + display: block !important; + } +} +#nav-toggle { + position: absolute; + left: 18px; + top: 22px; + cursor: pointer; + padding: 10px 35px 16px 0px; +} +#nav-toggle span, +#nav-toggle span:before, +#nav-toggle span:after { + cursor: pointer; + border-radius: 1px; + height: 5px; + width: 35px; + background: black; + position: absolute; + display: block; + content: ''; + transition: all 300ms ease-in-out; +} +#nav-toggle span:before { + top: -10px; +} +#nav-toggle span:after { + bottom: -10px; +} +#nav-toggle.active span { + background-color: transparent; +} +#nav-toggle.active span:before, #nav-toggle.active span:after { + top: 0; +} +#nav-toggle.active span:before { + transform: rotate(45deg); +} +#nav-toggle.active span:after { + transform: rotate(-45deg); +} + +#download { + border: 2px solid darkolivegreen; + border-radius: 5px; + text-align: center; + padding: 10px; + font-size: 120%; + font-weight: bold; + display: block; + text-shadow: 0px 0px 5px white; +} + +#download img { + width: 22px; + height: 22px; +} + +table { + border-top: 2px solid darkolivegreen; + border-bottom: 2px solid darkolivegreen; +} + +table caption { + text-align: left; +} + +tbody tr:nth-child(odd) { + background-color: #eeeeee; +} + +.highlight pre, .code pre { + white-space: pre; + font-size: 15px; + margin-left: 1px; + margin-right: 1px; + padding-top: 5px; + padding-bottom: 10px; +} + +/* Header permalinks */ +h1:hover .headerlink, h2:hover .headerlink, +h3:hover .headerlink, h4:hover .headerlink, 
+h5:hover .headerlink, h6:hover .headerlink { + display: inline; +} + +.headerlink { + display: none; + color: #ddd; + margin-left: 0.2em; + padding: 0 0.2em; +} + +.headerlink:hover { + opacity: 1; + background: #ddd; + color: #000; + text-decoration: none; +} + +code { + background-color: #eeeeee; +} diff --git a/assets/css/styles.min.css b/assets/css/styles.min.css new file mode 100644 index 000000000..91362282b --- /dev/null +++ b/assets/css/styles.min.css @@ -0,0 +1 @@ +/*! sanitize.css v8.0.0 | CC0 License | github.com/csstools/sanitize.css */*,::after,::before{background-repeat:no-repeat;box-sizing:border-box}::after,::before{text-decoration:inherit;vertical-align:inherit}html{cursor:default;font-family:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";line-height:1.15;-moz-tab-size:4;tab-size:4;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%;word-break:break-word}body{margin:0}h1{font-size:2em;margin:.67em 0}hr{height:0;overflow:visible}main{display:block}nav ol,nav ul{list-style:none}pre{font-family:Menlo,Consolas,Roboto Mono,Ubuntu Monospace,Noto Mono,Oxygen Mono,Liberation Mono,monospace;font-size:1em}a{background-color:transparent}abbr[title]{text-decoration:underline;text-decoration:underline dotted}b,strong{font-weight:bolder}code,kbd,samp{font-family:Menlo,Consolas,Roboto Mono,Ubuntu Monospace,Noto Mono,Oxygen Mono,Liberation 
Mono,monospace;font-size:1em}small{font-size:80%}::-moz-selection{background-color:#b3d4fc;color:#000;text-shadow:none}::selection{background-color:#b3d4fc;color:#000;text-shadow:none}audio,canvas,iframe,img,svg,video{vertical-align:middle}audio,video{display:inline-block}audio:not([controls]){display:none;height:0}img{border-style:none}svg:not([fill]){fill:currentColor}svg:not(:root){overflow:hidden}table{border-collapse:collapse}button,input,select,textarea{font-family:inherit;font-size:inherit;line-height:inherit}button,input,select{margin:0}button{overflow:visible;text-transform:none}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button}fieldset{padding:.35em .75em .625em}input{overflow:visible}legend{color:inherit;display:table;max-width:100%;white-space:normal}progress{display:inline-block;vertical-align:baseline}select{text-transform:none}textarea{margin:0;overflow:auto;resize:vertical}[type=checkbox],[type=radio]{padding:0}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}::-webkit-input-placeholder{color:inherit;opacity:.54}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}::-moz-focus-inner{border-style:none;padding:0}:-moz-focusring{outline:1px dotted 
ButtonText}details{display:block}dialog{background-color:#fff;border:solid;color:#000;display:block;height:-moz-fit-content;height:-webkit-fit-content;height:fit-content;left:0;margin:auto;padding:1em;position:absolute;right:0;width:-moz-fit-content;width:-webkit-fit-content;width:fit-content}dialog:not([open]){display:none}summary{display:list-item}canvas{display:inline-block}template{display:none}[tabindex],a,area,button,input,label,select,summary,textarea{-ms-touch-action:manipulation;touch-action:manipulation}[hidden]{display:none}[aria-busy=true]{cursor:progress}[aria-controls]{cursor:pointer}[aria-disabled=true],[disabled]{cursor:not-allowed}[aria-hidden=false][hidden]:not(:focus){clip:rect(0,0,0,0);display:inherit;position:absolute}/*! Based on https://github.com/milligram/milligram */*,:after,:before{box-sizing:inherit}html{box-sizing:border-box;font-size:62.5%}body{font-size:1.6em;font-weight:300;letter-spacing:.01em;line-height:1.6;background-color:#fcfcfc}body{color:#4b545c;font-family:Roboto,'Helvetica Neue',Helvetica,Arial,sans-serif}b,strong{font-weight:700}p{margin-top:0}h1,h2,h3,h4,h5,h6{font-weight:300;letter-spacing:-.1rem;margin-bottom:2rem;margin-top:0}h1{font-size:4.6rem;line-height:1.2}h2{font-size:3.6rem;line-height:1.25}h3{font-size:2.8rem;line-height:1.3}h4{font-size:2.2rem;letter-spacing:-.08rem;line-height:1.35}h5{font-size:1.8rem;letter-spacing:-.05rem;line-height:1.5}h6{font-size:1.6rem;letter-spacing:0;line-height:1.4}blockquote{border-left:.3rem solid #d1d1d1;margin-left:0;margin-right:0;padding:1rem 1.5rem}blockquote :last-child{margin-bottom:0}.button,button,input[type=button],input[type=reset],input[type=submit]{background-color:#1890ff;border:.1rem solid #1890ff;border-radius:.4rem;color:#fcfcfc;cursor:pointer;display:inline-block;font-size:1.1rem;font-weight:700;height:3.8rem;letter-spacing:.1rem;line-height:3.8rem;padding:0 
3rem;text-align:center;text-decoration:none;text-transform:uppercase;white-space:nowrap;-webkit-box-shadow:0 5px 10px 0 rgba(24,144,255,.2);-moz-box-shadow:0 5px 10px 0 rgba(24,144,255,.2);box-shadow:0 5px 10px 0 rgba(24,144,255,.2)}.button:focus,.button:hover,button:focus,button:hover,input[type=button]:focus,input[type=button]:hover,input[type=reset]:focus,input[type=reset]:hover,input[type=submit]:focus,input[type=submit]:hover{background-color:#4b545c;border-color:#4b545c;color:#fcfcfc;outline:0;-webkit-box-shadow:0 5px 10px 0 rgba(75,84,92,.2);-moz-box-shadow:0 5px 10px 0 rgba(75,84,92,.2);box-shadow:0 5px 10px 0 rgba(75,84,92,.2)}.button[disabled],button[disabled],input[type=button][disabled],input[type=reset][disabled],input[type=submit][disabled]{cursor:default;opacity:.5}.button[disabled]:focus,.button[disabled]:hover,button[disabled]:focus,button[disabled]:hover,input[type=button][disabled]:focus,input[type=button][disabled]:hover,input[type=reset][disabled]:focus,input[type=reset][disabled]:hover,input[type=submit][disabled]:focus,input[type=submit][disabled]:hover{background-color:#1890ff;border-color:#1890ff}.button.button-outline,button.button-outline,input[type=button].button-outline,input[type=reset].button-outline,input[type=submit].button-outline{background-color:transparent;color:#1890ff}.button.button-outline:focus,.button.button-outline:hover,button.button-outline:focus,button.button-outline:hover,input[type=button].button-outline:focus,input[type=button].button-outline:hover,input[type=reset].button-outline:focus,input[type=reset].button-outline:hover,input[type=submit].button-outline:focus,input[type=submit].button-outline:hover{background-color:transparent;border-color:#4b545c;color:#4b545c}.button.button-outline[disabled]:focus,.button.button-outline[disabled]:hover,button.button-outline[disabled]:focus,button.button-outline[disabled]:hover,input[type=button].button-
outline[disabled]:focus,input[type=button].button-outline[disabled]:hover,input[type=reset].button-outline[disabled]:focus,input[type=reset].button-outline[disabled]:hover,input[type=submit].button-outline[disabled]:focus,input[type=submit].button-outline[disabled]:hover{border-color:inherit;color:#1890ff}.button.button-clear,button.button-clear,input[type=button].button-clear,input[type=reset].button-clear,input[type=submit].button-clear{background-color:transparent;border-color:transparent;color:#1890ff}.button.button-clear:focus,.button.button-clear:hover,button.button-clear:focus,button.button-clear:hover,input[type=button].button-clear:focus,input[type=button].button-clear:hover,input[type=reset].button-clear:focus,input[type=reset].button-clear:hover,input[type=submit].button-clear:focus,input[type=submit].button-clear:hover{background-color:transparent;border-color:transparent;color:#4b545c}.button.button-clear[disabled]:focus,.button.button-clear[disabled]:hover,button.button-clear[disabled]:focus,button.button-clear[disabled]:hover,input[type=button].button-clear[disabled]:focus,input[type=button].button-clear[disabled]:hover,input[type=reset].button-clear[disabled]:focus,input[type=reset].button-clear[disabled]:hover,input[type=submit].button-clear[disabled]:focus,input[type=submit].button-clear[disabled]:hover{color:#1890ff}code{background:#f4f5f6;border-radius:.4rem;font-size:86%;margin:0 .2rem;padding:.8rem 1rem;white-space:normal}pre{border-left:.3rem solid #1890ff;padding-left:2rem;background:#f4f5f6;overflow-y:hidden}pre>code{border-radius:0;display:block;padding:1rem 1.5rem;white-space:pre}hr{border:0;border-top:.1rem solid #d1d1d1;margin:3rem 
0}/* Text-like inputs, select and textarea: native styling is switched off via -webkit-appearance, -moz-appearance and the unprefixed appearance property. */input:not([type]),input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=url],input[type=week],select,textarea{-webkit-appearance:none;-moz-appearance:none;appearance:none;background-color:transparent;color:#4b545c;border:.1rem solid #d1d1d1;border-radius:.4rem;box-shadow:none;box-sizing:inherit;height:3.8rem;padding:.6rem 1rem;width:100%}input:not([type]):focus,input[type=color]:focus,input[type=date]:focus,input[type=datetime-local]:focus,input[type=datetime]:focus,input[type=email]:focus,input[type=month]:focus,input[type=number]:focus,input[type=password]:focus,input[type=search]:focus,input[type=tel]:focus,input[type=text]:focus,input[type=url]:focus,input[type=week]:focus,select:focus,textarea:focus{border-color:#1890ff;outline:0}select{background:url('data:image/svg+xml;utf8,') center right no-repeat;padding-right:3rem}select:focus{background-image:url('data:image/svg+xml;utf8,')}textarea{min-height:6.5rem}label,legend{display:block;font-size:1.6rem;font-weight:700;margin-bottom:.5rem}fieldset{border-width:0;padding:0}input[type=checkbox],input[type=radio]{display:inline}.label-inline{display:inline-block;font-weight:400;margin-left:.5rem}a{color:#1890ff;text-decoration:none}a:focus,a:hover{color:#4b545c}dl,ol,ul{list-style:none;margin-top:0;padding-left:3rem}dl dl,dl ol,dl ul,ol dl,ol ol,ol ul,ul dl,ul ol,ul ul{font-size:90%;margin:1.5rem 0 1.5rem 3rem}ol{list-style:decimal inside;list-style-position:outside}ul{list-style:circle inside;list-style-position:outside}.button,button,dd,dt,li{margin-bottom:1rem}fieldset,input,select,textarea{margin-bottom:1.5rem}blockquote,dl,figure,form,ol,p,pre,table,ul{margin-bottom:2.5rem}table{border-spacing:0;width:100%}td,th{border-bottom:.1rem solid #e1e1e1;padding:1.2rem 
1.5rem;text-align:left}td:first-child,th:first-child{padding-left:0}td:last-child,th:last-child{padding-right:0}@media screen and (max-width:40rem){table{border-spacing:0;display:flex;width:100%}table thead{border-right:solid .1rem #e1e1e1}table thead td,table thead th{padding-left:0}table thead td:first-child,table thead th:first-child{padding-left:0}table thead td:last-child,table thead th:last-child{padding-right:1.2rem}table tbody{display:flex;overflow-x:auto;white-space:nowrap}table tbody tr{border-right:solid .1rem #e1e1e1}table tbody tr:last-child{border-right:none}table td,table th{display:block}table td:first-child,table th:first-child{padding-left:1.2rem}table td:last-child,table th:last-child{padding-right:1.2rem}}img{max-width:100%}.p-0{padding:0}.pt-0{padding-top:0}.pb-0{padding-bottom:0}.pl-0{padding-left:0}.pr-0{padding-right:0}.py-0{padding-top:0;padding-bottom:0}.px-0{padding-left:0;padding-right:0}.p-1{padding:1rem}.pt-1{padding-top:1rem}.pb-1{padding-bottom:1rem}.pl-1{padding-left:1rem}.pr-1{padding-right:1rem}.py-1{padding-top:1rem;padding-bottom:1rem}.px-1{padding-left:1rem;padding-right:1rem}.p-2{padding:2rem}.pt-2{padding-top:2rem}.pb-2{padding-bottom:2rem}.pl-2{padding-left:2rem}.pr-2{padding-right:2rem}.py-2{padding-top:2rem;padding-bottom:2rem}.px-2{padding-left:2rem;padding-right:2rem}.p-3{padding:3rem}.pt-3{padding-top:3rem}.pb-3{padding-bottom:3rem}.pl-3{padding-left:3rem}.pr-3{padding-right:3rem}.py-3{padding-top:3rem;padding-bottom:3rem}.px-3{padding-left:3rem;padding-right:3rem}.p-4{padding:6rem}.pt-4{padding-top:6rem}.pb-4{padding-bottom:6rem}.pl-4{padding-left:6rem}.pr-4{padding-right:6rem}.py-4{padding-top:6rem;padding-bottom:6rem}.px-4{padding-left:6rem;padding-right:6rem}.p-5{padding:8rem}.pt-5{padding-top:8rem}.pb-5{padding-bottom:8rem}.pl-5{padding-left:8rem}.pr-5{padding-right:8rem}.py-5{padding-top:8rem;padding-bottom:8rem}.px-5{padding-left:8rem;padding-right:8rem}.m-0{margin:0}.mt-0{margin-top:0}.mb-0{margin-bottom:0}.ml-0{
margin-left:0}.mr-0{margin-right:0}.my-0{margin-top:0;margin-bottom:0}.mx-0{margin-left:0;margin-right:0}.m-1{margin:1rem}.mt-1{margin-top:1rem}.mb-1{margin-bottom:1rem}.ml-1{margin-left:1rem}.mr-1{margin-right:1rem}.my-1{margin-top:1rem;margin-bottom:1rem}.mx-1{margin-left:1rem;margin-right:1rem}.m-2{margin:2rem}.mt-2{margin-top:2rem}.mb-2{margin-bottom:2rem}.ml-2{margin-left:2rem}.mr-2{margin-right:2rem}.my-2{margin-top:2rem;margin-bottom:2rem}.mx-2{margin-left:2rem;margin-right:2rem}.m-3{margin:3rem}.mt-3{margin-top:3rem}.mb-3{margin-bottom:3rem}.ml-3{margin-left:3rem}.mr-3{margin-right:3rem}.my-3{margin-top:3rem;margin-bottom:3rem}.mx-3{margin-left:3rem;margin-right:3rem}.m-4{margin:6rem}.mt-4{margin-top:6rem}.mb-4{margin-bottom:6rem}.ml-4{margin-left:6rem}.mr-4{margin-right:6rem}.my-4{margin-top:6rem;margin-bottom:6rem}.mx-4{margin-left:6rem;margin-right:6rem}.m-5{margin:8rem}.mt-5{margin-top:8rem}.mb-5{margin-bottom:8rem}.ml-5{margin-left:8rem}.mr-5{margin-right:8rem}.my-5{margin-top:8rem;margin-bottom:8rem}.mx-5{margin-left:8rem;margin-right:8rem}.mx-auto{margin-left:auto;margin-right:auto}/* Text alignment utilities: one class per alignment value. */.text-justify{text-align:justify}.text-left{text-align:left}.text-center{text-align:center}.text-right{text-align:right}.text-sm{font-size:1rem}.text-lg{font-size:2rem}.text-xl{font-size:2.5rem}.display-none{display:none}.display-inline{display:inline}.display-inline-block{display:inline-block}.display-block{display:block}.display-flex{display:flex}.display-inline-flex{display:inline-flex}.display-table{display:table}.clearfix:after{clear:both;content:' ';display:table}.float-left{float:left}.float-right{float:right}.container{margin:0 auto;max-width:112rem;padding:0 
2rem;position:relative;width:100%}.row{display:flex;flex-direction:column;padding:0;width:100%}.row.row-no-padding{padding:0}.row.row-no-padding>.column{padding:0}.row.row-wrap{flex-wrap:wrap}.row.row-top{align-items:flex-start}.row.row-bottom{align-items:flex-end}.row.row-center{align-items:center}.row.row-stretch{align-items:stretch}.row.row-baseline{align-items:baseline}.row .column{display:block;flex:1 1 auto;margin-left:0;max-width:100%;width:100%}.row .column.column-offset-10{margin-left:10%}.row .column.column-offset-20{margin-left:20%}.row .column.column-offset-25{margin-left:25%}.row .column.column-offset-33,.row .column.column-offset-34{margin-left:33.3333%}.row .column.column-offset-50{margin-left:50%}.row .column.column-offset-66,.row .column.column-offset-67{margin-left:66.6666%}.row .column.column-offset-75{margin-left:75%}.row .column.column-offset-80{margin-left:80%}.row .column.column-offset-90{margin-left:90%}.row .column.column-10{flex:0 0 10%;max-width:10%}.row .column.column-20{flex:0 0 20%;max-width:20%}.row .column.column-25{flex:0 0 25%;max-width:25%}.row .column.column-33,.row .column.column-34{flex:0 0 33.3333%;max-width:33.3333%}.row .column.column-40{flex:0 0 40%;max-width:40%}.row .column.column-50{flex:0 0 50%;max-width:50%}.row .column.column-60{flex:0 0 60%;max-width:60%}.row .column.column-66,.row .column.column-67{flex:0 0 66.6666%;max-width:66.6666%}.row .column.column-75{flex:0 0 75%;max-width:75%}.row .column.column-80{flex:0 0 80%;max-width:80%}.row .column.column-90{flex:0 0 90%;max-width:90%}.row .column .column-top{align-self:flex-start}.row .column .column-bottom{align-self:flex-end}.row .column .column-center{align-self:center}@media (min-width:40rem){.row{flex-direction:row;margin-left:-1rem;width:calc(100% + 2rem)}.row .column{margin-bottom:inherit;padding:0 1rem}}/* Icon font: the ico.* files are published under assets/fonts/ (NOTE(review): assumes this stylesheet is served from assets/css/ - confirm). */@font-face{font-family:ico;src:url(../fonts/ico.eot?13319731);src:url(../fonts/ico.eot?13319731#iefix) 
format("embedded-opentype"),url(../fonts/ico.woff2?13319731) format("woff2"),url(../fonts/ico.woff?13319731) format("woff"),url(../fonts/ico.ttf?13319731) format("truetype"),url(../fonts/ico.svg?13319731#ico) format("svg");font-weight:400;font-style:normal}[class*=" icon-"]:before,[class^=icon-]:before{font-family:ico;font-style:normal;font-weight:400;speak:none;display:inline-block;text-decoration:inherit;width:1em;margin-right:.2em;text-align:center;font-variant:normal;text-transform:none;line-height:1em;margin-left:.2em;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.animate-spin{-moz-animation:spin 2s infinite linear;-o-animation:spin 2s infinite linear;-webkit-animation:spin 2s infinite linear;animation:spin 2s infinite linear;display:inline-block}@-moz-keyframes spin{0%{-moz-transform:rotate(0);-o-transform:rotate(0);-webkit-transform:rotate(0);transform:rotate(0)}100%{-moz-transform:rotate(359deg);-o-transform:rotate(359deg);-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@-webkit-keyframes spin{0%{-moz-transform:rotate(0);-o-transform:rotate(0);-webkit-transform:rotate(0);transform:rotate(0)}100%{-moz-transform:rotate(359deg);-o-transform:rotate(359deg);-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@-o-keyframes spin{0%{-moz-transform:rotate(0);-o-transform:rotate(0);-webkit-transform:rotate(0);transform:rotate(0)}100%{-moz-transform:rotate(359deg);-o-transform:rotate(359deg);-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@-ms-keyframes spin{0%{-moz-transform:rotate(0);-o-transform:rotate(0);-webkit-transform:rotate(0);transform:rotate(0)}100%{-moz-transform:rotate(359deg);-o-transform:rotate(359deg);-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes 
spin{0%{-moz-transform:rotate(0);-o-transform:rotate(0);-webkit-transform:rotate(0);transform:rotate(0)}100%{-moz-transform:rotate(359deg);-o-transform:rotate(359deg);-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.icon-menu-outline:before{content:'\e800'}.icon-github-circled:before{content:'\f09b'}.icon-rss:before{content:'\f09e'}.icon-menu:before{content:'\f0c9'}.icon-spinner:before{content:'\f110'}.icon-code:before{content:'\f121'}.icon-bitbucket:before{content:'\f171'}.icon-circle-notch:before{content:'\f1ce'}.icon-github-text:before{content:'\f307'}.navbar{display:flex;flex-wrap:wrap;justify-content:space-between;margin-top:2rem;padding-left:0;padding-inline-start:0}.navbar-item{padding:.5rem .66rem;margin:0 .2rem}.nav-item-active{border-radius:.4rem;border:.1rem solid #1890ff}.nav-item-active:hover{border-color:#4b545c}.navbar-menu{display:none;position:fixed;right:0;top:0;z-index:100;padding:.5rem .66rem;margin-top:2rem;margin-right:2rem;border-radius:50%;background-color:#fff}@media screen and (max-width:60rem){.navbar{justify-content:center}}.navbar-smart{display:flex;flex-wrap:wrap;justify-content:space-between;margin-top:2rem;padding-left:0;padding-inline-start:0}@media screen and (max-width:60rem){.navbar-menu-on{display:block}.navbar-smart{display:none;flex-direction:column;align-items:flex-start;position:fixed;left:0;top:0;width:100%;z-index:100;padding:1rem;background-color:rgba(255,255,255,.98)}.navbar-open{display:flex}}.shadow{-webkit-box-shadow:0 5px 10px 0 rgba(0,0,0,.3);-moz-box-shadow:0 5px 10px 0 rgba(0,0,0,.3);box-shadow:0 5px 10px 0 rgba(0,0,0,.3)}.shadow-lg{-webkit-box-shadow:0 10px 20px 0 rgba(0,0,0,.8);-moz-box-shadow:0 10px 20px 0 rgba(0,0,0,.8);box-shadow:0 10px 20px 0 rgba(0,0,0,.8)}.shadow-primary{-webkit-box-shadow:0 5px 10px 0 rgba(24,144,255,.2);-moz-box-shadow:0 5px 10px 0 rgba(24,144,255,.2);box-shadow:0 5px 10px 0 rgba(24,144,255,.2)}.shadow-secondary{-webkit-box-shadow:0 5px 10px 0 
rgba(75,84,92,.2);-moz-box-shadow:0 5px 10px 0 rgba(75,84,92,.2);box-shadow:0 5px 10px 0 rgba(75,84,92,.2)}.muted{color:#8b9298}@media screen and (max-width:40rem){.row .column{text-align:center}}.person{min-height:190px;margin-bottom:8rem}.person h3{margin-bottom:1rem}.person img{float:left;max-width:150px;margin-right:2rem;margin-bottom:1rem}.person p{margin-left:150px}@media screen and (max-width:40rem){.person img{float:none}.person p{margin-left:0}}.hljs{display:block;overflow-x:auto;padding:.5em;color:#9006b3;background:#f4f5f6}.hljs-comment,.hljs-quote{color:#bbbcc2;font-style:italic}.hljs-doctag,.hljs-formula,.hljs-keyword{color:#dd4600}.hljs-deletion,.hljs-name,.hljs-section,.hljs-selector-tag,.hljs-subst{color:#e45649}.hljs-literal{color:#009c15}.hljs-addition,.hljs-attribute,.hljs-meta-string,.hljs-regexp,.hljs-string{color:#2e912c}.hljs-built_in,.hljs-class .hljs-title{color:#c1ae01}.hljs-attr,.hljs-number,.hljs-selector-attr,.hljs-selector-class,.hljs-selector-pseudo,.hljs-template-variable,.hljs-type,.hljs-variable{color:#b97f00}.hljs-bullet,.hljs-link,.hljs-meta,.hljs-selector-id,.hljs-symbol,.hljs-title{color:#2263f0}.hljs-emphasis{font-style:italic}.hljs-strong{font-weight:700}.hljs-link{text-decoration:underline} \ No newline at end of file diff --git a/assets/css/theme.css b/assets/css/theme.css new file mode 100644 index 000000000..b198c2ced --- /dev/null +++ b/assets/css/theme.css @@ -0,0 +1,421 @@ +@charset "UTF-8"; + +/* + Copyright © 2014-2022 Daniel Aleksandersen and others. 
+ + Permission is hereby granted, free of charge, to any + person obtaining a copy of this software and associated + documentation files (the "Software"), to deal in the + Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the + Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice + shall be included in all copies or substantial portions of + the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS + OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +body { + color: #4F5151; + font-family: Helvetica, Arial, sans-serif; + font-size: 17px; + line-height: 1.4; + padding: 1em; +} + +#container { + margin: 1em auto; + max-width: 770px; +} +#menu ul, +#menu ul li, +.postpromonav .tags, +.postpromonav .tags li, +.pager, +.pager li, +#toptranslations ul, +#toptranslations ul li { + list-style: none; + padding-left: 0; + padding-right: 0; +} + +#toptranslations ul { + display: inline; +} + +#menu ul li, +#toptranslations ul li { + display: inline-block; + margin-right: 1.5em; +} + +#toptranslations h2 { + display: inline; + font-size: 1em; + margin-right: 1.5em; +} + +html[dir="rtl"] #menu ul li, +html[dir="rtl"] #toptranslations ul li, +html[dir="rtl"] #toptranslations h2 { + margin-left: 1.5em; + margin-right: 0; +} + +#toptranslations { + text-align: right; + float: right; +} + +html[dir="rtl"] #toptranslations { + text-align: left; + float: left; +} + +.posttranslations h3, .translationslist h3 { + display: inline; + font-size: 1em; +} + +.entry-title { + font-size: 2em; +} + +.posttranslations h3:last-child, .translationslist h3:last-child { + display: none; +} + +.postindex article { + border-bottom: 1px solid #4F5151; + padding-bottom: 1em; +} +#header { + border-bottom: 1px solid #4F5151; +} +#footer { + border-top: 1px solid #4F5151; +} + +/* Tags */ +.postpromonav { + border-bottom: 1px solid #4F5151; + border-top: 1px solid #4F5151; + margin-top: 1em; + padding: .5em 0; +} +.postpromonav .tags { + text-align: center; +} +.metadata p:before, +.postpromonav .tags li:before, +.postlist .listdate:after, +.translationslist p:before { + content: " — "; +} +.postlist li { + margin-bottom: .33em; +} +.byline a:not(:last-child):after { + content: ","; +} + +/* Post and archive pagers */ +.postindexpager .pager .next:before { + content: "↓ "; +} +.postindexpager .pager .previous:before, +.archivenav .pager .up:before { + content: "↑ "; +} +.postpromonav .pager .next:after, +.archivenav .pager .next:after { + 
content: " →"; +} +html[dir="rtl"] .postpromonav .pager .previous:after, +html[dir="rtl"] .archivenav .pager .previous:after { + content: " →"; +} +.postpromonav .pager .previous:before, +.archivenav .pager .previous:before { + content: "← "; +} +html[dir="rtl"] .postpromonav .pager .next:before, +html[dir="rtl"] .archivenav .pager .next:before { + content: "← "; +} +html[dir="rtl"] .postpromonav .pager .next:after, +html[dir="rtl"] .archivenav .pager .next:after, +html[dir="rtl"] .postpromonav .pager .previous:before, +html[dir="rtl"] .archivenav .pager .previous:before { + content: ""; +} +.metadata p:first-of-type:before, +.postpromonav .tags li:first-of-type:before, +.translationslist p:first-of-type:before { + content: ""; +} +.postpromonav .pager { + clear: both; + height: 1em; +} +.postpromonav .tags li, +.postpromonav .pager li, +.archivenav .pager li { + display: inline-block; +} +.archivenav .pager { + text-align: center +} +.postpromonav .pager .next, +.archivenav .pager .next { + float: right; +} +html[dir="rtl"] .postpromonav .pager .next, +html[dir="rtl"] .archivenav .pager .next { + float: left; +} + +.postpromonav .pager .previous, +.archivenav .pager .previous { + float: left; +} +html[dir="rtl"] .postpromonav .pager .previous, +html[dir="rtl"] .archivenav .pager .previous { + float: right; +} + +.archivenav .pager .disabled, +.archivenav .pager .disabled a, +.archivenav .pager .disabled:link { + color: #888; + cursor: not-allowed; +} + +.metadata p, +.translationslist p { + display: inline; +} + +#brand { + font-size: 3em; + line-height: 1; +} + +/* Links */ +:link { + color: #1168CC; + text-decoration: none; +} +:visited { + color: #6830BB; +} +:link:hover, :visited:hover { + color: #0d53a3; +} + +#brand :link, +#brand :visited { + color: inherit; +} + +/* Images */ +img { + border: none; + line-height: 1; +} + +.postpage img, +.postpage object, +.postindex article img, +.postindex article object { + height: auto; + max-width: 100%; +} + +/* 
Comment helpers */ +#disqus_thread { + min-height: 325px; +} + +.breadcrumb { + padding: 8px 15px; + margin-bottom: 20px; + list-style: none; +} + +.breadcrumb > li { + display: inline-block; + margin-right: 0; + margin-left: 0; +} + +.breadcrumb > li:after { + content: ' / '; + color: #888; +} + +.breadcrumb > li:last-of-type:after { + content: ''; + margin-left: 0; +} + +.thumbnails { + list-style: none; + padding: 0; +} + +.thumbnails > li { + display: inline-block; + margin-right: 10px; +} + +.thumbnails > li:last-of-type { + margin-right: 0; +} + +.sr-only { + position: absolute; + width: 1px; + height: 1px; + padding: 0; + margin: -1px; + overflow: hidden; + clip: rect(0, 0, 0, 0); + border: 0; +} + +.sr-only-focusable:active, +.sr-only-focusable:focus { + position: static; + width: auto; + height: auto; + margin: 0; + overflow: visible; + clip: auto; +} + +pre.code, code { + white-space: pre-wrap; + word-wrap: normal; + overflow: auto; +} + +/* Set a minimum logo height to ensure .svg are displayed if there is no brand text (#3493) */ +#logo { + min-height: 25px; +} + +/* SOURCE: https://github.com/h5bp/html5-boilerplate/blob/master/src/css/main.css */ +@media print { + *, + *:before, + *:after { + background: transparent !important; + color: #000 !important; /* Black prints faster: http://www.sanbeiji.com/archives/953 */ + box-shadow: none !important; + text-shadow: none !important; + font-family: Garamond, Junicode, serif; + } + + body { + font-size: 12pt; + } + + a, + a:visited { + text-decoration: underline; + } + + a[href]:after { + content: " (" attr(href) ")"; + } + + abbr[title]:after { + content: " (" attr(title) ")"; + } + + /* + * Don't show links that are fragment identifiers, + * or use the `javascript:` pseudo protocol + */ + + a[href^="#"]:after, + a[href^="data:"]:after, + a[href^="javascript:"]:after { + content: ""; + } + + pre, + blockquote { + border: 1px solid #999; + page-break-inside: avoid; + } + + /* + * Printing Tables: + * 
http://css-discuss.incutio.com/wiki/Printing_Tables + */ + + thead { + display: table-header-group; + } + + tr, + img { + page-break-inside: avoid; + } + + img { + max-width: 100% !important; + } + + p, + h2, + h3 { + orphans: 3; + widows: 3; + } + + h2, + h3 { + page-break-after: avoid; + } + + .hidden-print { + display: none !important; + } + + article .entry-title a[href]:after, + article .metadata a[href]:after, + article .tags a[href]:after { + content: ""; + } + + article .metadata .sourceline { + display: none; + } + + article .metadata .linkline a[href]:after { + content: " (" attr(href) ")"; + } + + #header { + display: none; + } + + .postpromonav { + padding: 0; + } +} diff --git a/assets/css/tipuesearch.css b/assets/css/tipuesearch.css new file mode 100644 index 000000000..5793c6db6 --- /dev/null +++ b/assets/css/tipuesearch.css @@ -0,0 +1,271 @@ +/* +Tipue Search 7.1 +Copyright (c) 2019 Tipue +Tipue Search is released under the MIT License +http://www.tipue.com/search +*/ + + +/* search box */ + +/* +#tipue_search_input +{ + float: left; + font: 15px 'Open Sans', sans-serif; + color: #333; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; + width: 230px; + background-color: #f3f3f3; + border: none; + padding: 9px 0 10px 18px; + height: 56px; + border-radius: 3px; + -moz-appearance: none; + -webkit-appearance: none; + box-sizing: border-box; + box-shadow: none; + outline: 0; + margin: 0; +} +#tipue_search_input:-webkit-autofill, +#tipue_search_input:-webkit-autofill:hover, +#tipue_search_input:-webkit-autofill:focus +{ + -webkit-box-shadow: 0 0 0px 1000px #f3f3f3 inset; +} +*/ +.tipue_search_button { + position: relative; + float: left; + width: 49px; + height: 56px; + margin-left: -3px; + background-color: #f3f3f3; + border: none; + border-top-right-radius: 3px; + border-bottom-right-radius: 3px; + box-sizing: border-box; + cursor: pointer; + outline: 0; +} + +.tipue_search_icon { + float: left; + font: 24px/1 'Open Sans', 
sans-serif; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; + color: #333; + transform: rotate(-45deg); + -moz-appearance: none; + -webkit-appearance: none; + box-sizing: border-box; + box-shadow: none; + outline: 0; + margin: -1px 0 0 16px; +} + +.tipue_search_group:after { + content: ""; + display: table; + clear: both; +} + + +/* search results */ + + +#tipue_search_content { + max-width: 100%; + margin: 0; +} + +.tipue_search_content_title { + font-weight: 300; + font-size: 2rem; + color: #111; +} + +.tipue_search_content_title a { + color: #111; + text-decoration: none; +} + +.tipue_search_result { + padding-top: 27px; +} + +#tipue_search_results_count, .tipue_search_content_debug { + font: 13px/1.5 'Source Code Pro', monospace; + text-transform: uppercase; + color: #999; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; +} + +#tipue_search_results_count { + padding-top: 9px; +} + +.tipue_search_content_url, .tipue_search_note, .tipue_search_related, #tipue_search_error, #tipue_search_replace { + font-weight: 300; + padding-top: 7px; + word-wrap: break-word; + hyphens: auto; +} + +#tipue_search_replace, .tipue_search_related { + margin-top: 7px; +} + +#tipue_search_error { + color: #333; + margin-top: 17px; +} + +.tipue_search_content_text { + font-weight: 300; + word-wrap: break-word; + hyphens: auto; + margin-top: 9px; +} + +.tipue_search_content_bold { + font-weight: 400; +} + +.tipue_search_content_debug { + margin: 7px 0 2px 0; +} + + +/* images */ + + +.tipue_search_image { + padding: 17px 0 6px 0; +} + +.tipue_search_img { + width: 100%; + max-width: 330px; + height: auto; + transition: 0.5s; + border-radius: 2px; +} + +.tipue_search_img:hover { + opacity: 0.9; +} + +#tipue_search_zoom_text { + font: 12px/1.7 'Source Code Pro', monospace; + color: #ccc; + text-transform: uppercase; + letter-spacing: 1px; + padding-top: 9px; +} + +#tipue_search_zoom_text a { + color: #ccc; + text-decoration: none; + 
border-bottom: 2px solid #f7f7f7; +} + +#tipue_search_zoom_text a:hover { + border: 0; +} + +.tipue_search_image_zoom { + cursor: pointer; +} + +#tipue_search_image_modal { + display: none; + position: fixed; + z-index: 1000; + left: 0; + top: 0; + width: 100%; + height: 100%; + overflow: auto; + background-color: rgba(0, 0, 0, 0.9); +} + +.tipue_search_image_close { + position: absolute; + top: 0; + right: 0; + font: 22px/1 'Source Code Pro', monospace; + color: #ccc; + padding: 25px 30px; + cursor: pointer; +} + +.tipue_search_image_block { + margin: 0 auto; + max-width: 900px; + padding: 73px 30px 30px 30px; + box-sizing: border-box; + color: #fff; +} + +#tipue_search_zoom_img { + max-width: 100%; + height: auto; +} + +#tipue_search_zoom_text, .tipue_search_zoom_options { + padding-top: 9px; +} + + +/* footer */ + + +#tipue_search_foot { + margin: 51px 0 21px 0; +} + +#tipue_search_foot_boxes { + font: 14px 'Source Code Pro', sans-serif; + text-transform: uppercase; + color: #333; + padding: 0; + margin: 0; + cursor: pointer; +} + +#tipue_search_foot_boxes li { + display: inline; + list-style: none; + margin: 0; + padding: 0; +} + +#tipue_search_foot_boxes li a { + background-color: #f7f7f7; + color: #666; + padding: 10px 17px 11px 17px; + border-radius: 3px; + margin-right: 7px; + text-decoration: none; + text-align: center; + transition: 0.3s; +} + +#tipue_search_foot_boxes li.current { + background: #252525; + color: #ccc; + padding: 10px 17px 11px 17px; + border-radius: 3px; + margin-right: 7px; + text-align: center; +} + +#tipue_search_foot_boxes li a:hover { + background: #252525; + color: #ccc; +} + diff --git a/assets/fonts/LICENSE.txt b/assets/fonts/LICENSE.txt new file mode 100644 index 000000000..066dae804 --- /dev/null +++ b/assets/fonts/LICENSE.txt @@ -0,0 +1,30 @@ +Font license info + + +## Typicons + + (c) Stephen Hutchings 2012 + + Author: Stephen Hutchings + License: SIL (http://scripts.sil.org/OFL) + Homepage: http://typicons.com/ + + +## Font 
Awesome + + Copyright (C) 2016 by Dave Gandy + + Author: Dave Gandy + License: SIL () + Homepage: http://fortawesome.github.com/Font-Awesome/ + + +## Brandico + + (C) 2012 by Vitaly Puzrin + + Author: Crowdsourced, for Fontello project + License: SIL (http://scripts.sil.org/OFL) + Homepage: + + diff --git a/assets/fonts/ico.eot b/assets/fonts/ico.eot new file mode 100644 index 000000000..9be00157f Binary files /dev/null and b/assets/fonts/ico.eot differ diff --git a/assets/fonts/ico.svg b/assets/fonts/ico.svg new file mode 100644 index 000000000..640267b79 --- /dev/null +++ b/assets/fonts/ico.svg @@ -0,0 +1,28 @@ + + + +Copyright (C) 2019 by original authors @ fontello.com + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/fonts/ico.ttf b/assets/fonts/ico.ttf new file mode 100644 index 000000000..f1f9a002c Binary files /dev/null and b/assets/fonts/ico.ttf differ diff --git a/assets/fonts/ico.woff b/assets/fonts/ico.woff new file mode 100644 index 000000000..9c26d2056 Binary files /dev/null and b/assets/fonts/ico.woff differ diff --git a/assets/fonts/ico.woff2 b/assets/fonts/ico.woff2 new file mode 100644 index 000000000..bcadeef9c Binary files /dev/null and b/assets/fonts/ico.woff2 differ diff --git a/assets/js/fancydates.js b/assets/js/fancydates.js new file mode 100644 index 000000000..dc7906dd8 --- /dev/null +++ b/assets/js/fancydates.js @@ -0,0 +1,22 @@ +function fancydates(fanciness, luxonDateFormat) { + if (fanciness === 0) { + return; + } + + var dates = document.querySelectorAll('.dt-published, .dt-updated, .listdate'); + + var l = dates.length; + + for (var i = 0; i < l; i++) { + var d = luxon.DateTime.fromISO(dates[i].attributes.datetime.value); + var o; + if (fanciness === 1 && luxonDateFormat.preset) { + o = d.toLocal().toLocaleString(luxon.DateTime[luxonDateFormat.format]); + } else if (fanciness === 1) { + o = d.toLocal().toFormat(luxonDateFormat.format); + } else { + o = d.toRelative(); + } + 
dates[i].innerHTML = o; + } +} diff --git a/assets/js/fancydates.min.js b/assets/js/fancydates.min.js new file mode 100644 index 000000000..bb0b07bea --- /dev/null +++ b/assets/js/fancydates.min.js @@ -0,0 +1 @@ +function fancydates(t,e){if(0!==t)for(var a=document.querySelectorAll(".dt-published, .dt-updated, .listdate"),o=a.length,l=0;l",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=y.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=y.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),y.elements=c+" "+a,j(b)}function f(a){var b=x[a[v]];return b||(b={},w++,a[v]=w,x[w]=b),b}function g(a,c,d){if(c||(c=b),q)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():u.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||t.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),q)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return y.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(y,b.frag)}function j(a){a||(a=b);var d=f(a);return!y.shivCSS||p||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),q||i(a,d),a}function k(a){for(var b,c=a.getElementsByTagName("*"),e=c.length,f=RegExp("^(?:"+d().join("|")+")$","i"),g=[];e--;)b=c[e],f.test(b.nodeName)&&g.push(b.applyElement(l(b)));return g}function l(a){for(var 
b,c=a.attributes,d=c.length,e=a.ownerDocument.createElement(A+":"+a.nodeName);d--;)b=c[d],b.specified&&e.setAttribute(b.nodeName,b.nodeValue);return e.style.cssText=a.style.cssText,e}function m(a){for(var b,c=a.split("{"),e=c.length,f=RegExp("(^|[\\s,>+~])("+d().join("|")+")(?=[[\\s,>+~#.:]|$)","gi"),g="$1"+A+"\\:$2";e--;)b=c[e]=c[e].split("}"),b[b.length-1]=b[b.length-1].replace(f,g),c[e]=b.join("}");return c.join("{")}function n(a){for(var b=a.length;b--;)a[b].removeNode()}function o(a){function b(){clearTimeout(g._removeSheetTimer),d&&d.removeNode(!0),d=null}var d,e,g=f(a),h=a.namespaces,i=a.parentWindow;return!B||a.printShived?a:("undefined"==typeof h[A]&&h.add(A),i.attachEvent("onbeforeprint",function(){b();for(var f,g,h,i=a.styleSheets,j=[],l=i.length,n=Array(l);l--;)n[l]=i[l];for(;h=n.pop();)if(!h.disabled&&z.test(h.media)){try{f=h.imports,g=f.length}catch(o){g=0}for(l=0;g>l;l++)n.push(f[l]);try{j.push(h.cssText)}catch(o){}}j=m(j.reverse().join("")),e=k(a),d=c(a,j)}),i.attachEvent("onafterprint",function(){n(e),clearTimeout(g._removeSheetTimer),g._removeSheetTimer=setTimeout(b,500)}),a.printShived=!0,a)}var p,q,r="3.7.3",s=a.html5||{},t=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,u=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,v="_html5shiv",w=0,x={};!function(){try{var a=b.createElement("a");a.innerHTML="",p="hidden"in a,q=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){p=!0,q=!0}}();var y={elements:s.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time 
video",version:r,shivCSS:s.shivCSS!==!1,supportsUnknownElements:q,shivMethods:s.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=y,j(b);var z=/^$|\b(?:all|print)\b/,A="html5shiv",B=!q&&function(){var c=b.documentElement;return!("undefined"==typeof b.namespaces||"undefined"==typeof b.parentWindow||"undefined"==typeof c.applyElement||"undefined"==typeof c.removeNode||"undefined"==typeof a.attachEvent)}();y.type+=" print",y.shivPrint=o,o(b),"object"==typeof module&&module.exports&&(module.exports=y)}("undefined"!=typeof window?window:this,document); \ No newline at end of file diff --git a/assets/js/html5shiv-printshiv.min.js b/assets/js/html5shiv-printshiv.min.js new file mode 100644 index 000000000..2b43bd062 --- /dev/null +++ b/assets/js/html5shiv-printshiv.min.js @@ -0,0 +1,4 @@ +/** +* @preserve HTML5 Shiv 3.7.3-pre | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed +*/ +!function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=y.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=y.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),y.elements=c+" "+a,j(b)}function f(a){var b=x[a[v]];return b||(b={},w++,a[v]=w,x[w]=b),b}function g(a,c,d){if(c||(c=b),q)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():u.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||t.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),q)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return 
y.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(y,b.frag)}function j(a){a||(a=b);var d=f(a);return!y.shivCSS||p||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),q||i(a,d),a}function k(a){for(var b,c=a.getElementsByTagName("*"),e=c.length,f=RegExp("^(?:"+d().join("|")+")$","i"),g=[];e--;)b=c[e],f.test(b.nodeName)&&g.push(b.applyElement(l(b)));return g}function l(a){for(var b,c=a.attributes,d=c.length,e=a.ownerDocument.createElement(A+":"+a.nodeName);d--;)b=c[d],b.specified&&e.setAttribute(b.nodeName,b.nodeValue);return e.style.cssText=a.style.cssText,e}function m(a){for(var b,c=a.split("{"),e=c.length,f=RegExp("(^|[\\s,>+~])("+d().join("|")+")(?=[[\\s,>+~#.:]|$)","gi"),g="$1"+A+"\\:$2";e--;)b=c[e]=c[e].split("}"),b[b.length-1]=b[b.length-1].replace(f,g),c[e]=b.join("}");return c.join("{")}function n(a){for(var b=a.length;b--;)a[b].removeNode()}function o(a){function b(){clearTimeout(g._removeSheetTimer),d&&d.removeNode(!0),d=null}var d,e,g=f(a),h=a.namespaces,i=a.parentWindow;return!B||a.printShived?a:("undefined"==typeof h[A]&&h.add(A),i.attachEvent("onbeforeprint",function(){b();for(var f,g,h,i=a.styleSheets,j=[],l=i.length,n=Array(l);l--;)n[l]=i[l];for(;h=n.pop();)if(!h.disabled&&z.test(h.media)){try{f=h.imports,g=f.length}catch(o){g=0}for(l=0;g>l;l++)n.push(f[l]);try{j.push(h.cssText)}catch(o){}}j=m(j.reverse().join("")),e=k(a),d=c(a,j)}),i.attachEvent("onafterprint",function(){n(e),clearTimeout(g._removeSheetTimer),g._removeSheetTimer=setTimeout(b,500)}),a.printShived=!0,a)}var 
p,q,r="3.7.3",s=a.html5||{},t=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,u=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,v="_html5shiv",w=0,x={};!function(){try{var a=b.createElement("a");a.innerHTML="",p="hidden"in a,q=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){p=!0,q=!0}}();var y={elements:s.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:r,shivCSS:s.shivCSS!==!1,supportsUnknownElements:q,shivMethods:s.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=y,j(b);var z=/^$|\b(?:all|print)\b/,A="html5shiv",B=!q&&function(){var c=b.documentElement;return!("undefined"==typeof b.namespaces||"undefined"==typeof b.parentWindow||"undefined"==typeof c.applyElement||"undefined"==typeof c.removeNode||"undefined"==typeof a.attachEvent)}();y.type+=" print",y.shivPrint=o,o(b),"object"==typeof module&&module.exports&&(module.exports=y)}("undefined"!=typeof window?window:this,document); \ No newline at end of file diff --git a/assets/js/jquery.min.js b/assets/js/jquery.min.js new file mode 100644 index 000000000..a1c07fd80 --- /dev/null +++ b/assets/js/jquery.min.js @@ -0,0 +1,2 @@ +/*! 
jQuery v3.4.1 | (c) JS Foundation and other contributors | jquery.org/license */ +!function(e,t){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=e.document?t(e,!0):function(e){if(!e.document)throw new Error("jQuery requires a window with a document");return t(e)}:t(e)}("undefined"!=typeof window?window:this,function(C,e){"use strict";var t=[],E=C.document,r=Object.getPrototypeOf,s=t.slice,g=t.concat,u=t.push,i=t.indexOf,n={},o=n.toString,v=n.hasOwnProperty,a=v.toString,l=a.call(Object),y={},m=function(e){return"function"==typeof e&&"number"!=typeof e.nodeType},x=function(e){return null!=e&&e===e.window},c={type:!0,src:!0,nonce:!0,noModule:!0};function b(e,t,n){var r,i,o=(n=n||E).createElement("script");if(o.text=e,t)for(r in c)(i=t[r]||t.getAttribute&&t.getAttribute(r))&&o.setAttribute(r,i);n.head.appendChild(o).parentNode.removeChild(o)}function w(e){return null==e?e+"":"object"==typeof e||"function"==typeof e?n[o.call(e)]||"object":typeof e}var f="3.4.1",k=function(e,t){return new k.fn.init(e,t)},p=/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g;function d(e){var t=!!e&&"length"in e&&e.length,n=w(e);return!m(e)&&!x(e)&&("array"===n||0===t||"number"==typeof t&&0+~]|"+M+")"+M+"*"),U=new RegExp(M+"|>"),X=new RegExp($),V=new RegExp("^"+I+"$"),G={ID:new RegExp("^#("+I+")"),CLASS:new RegExp("^\\.("+I+")"),TAG:new RegExp("^("+I+"|[*])"),ATTR:new RegExp("^"+W),PSEUDO:new RegExp("^"+$),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+M+"*(even|odd|(([+-]|)(\\d*)n|)"+M+"*(?:([+-]|)"+M+"*(\\d+)|))"+M+"*\\)|)","i"),bool:new RegExp("^(?:"+R+")$","i"),needsContext:new RegExp("^"+M+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+M+"*((?:-\\d)?\\d*)"+M+"*\\)|)(?=[^-]|$)","i")},Y=/HTML$/i,Q=/^(?:input|select|textarea|button)$/i,J=/^h\d$/i,K=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,ee=/[+~]/,te=new RegExp("\\\\([\\da-f]{1,6}"+M+"?|("+M+")|.)","ig"),ne=function(e,t,n){var r="0x"+t-65536;return 
r!=r||n?t:r<0?String.fromCharCode(r+65536):String.fromCharCode(r>>10|55296,1023&r|56320)},re=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ie=function(e,t){return t?"\0"===e?"\ufffd":e.slice(0,-1)+"\\"+e.charCodeAt(e.length-1).toString(16)+" ":"\\"+e},oe=function(){T()},ae=be(function(e){return!0===e.disabled&&"fieldset"===e.nodeName.toLowerCase()},{dir:"parentNode",next:"legend"});try{H.apply(t=O.call(m.childNodes),m.childNodes),t[m.childNodes.length].nodeType}catch(e){H={apply:t.length?function(e,t){L.apply(e,O.call(t))}:function(e,t){var n=e.length,r=0;while(e[n++]=t[r++]);e.length=n-1}}}function se(t,e,n,r){var i,o,a,s,u,l,c,f=e&&e.ownerDocument,p=e?e.nodeType:9;if(n=n||[],"string"!=typeof t||!t||1!==p&&9!==p&&11!==p)return n;if(!r&&((e?e.ownerDocument||e:m)!==C&&T(e),e=e||C,E)){if(11!==p&&(u=Z.exec(t)))if(i=u[1]){if(9===p){if(!(a=e.getElementById(i)))return n;if(a.id===i)return n.push(a),n}else if(f&&(a=f.getElementById(i))&&y(e,a)&&a.id===i)return n.push(a),n}else{if(u[2])return H.apply(n,e.getElementsByTagName(t)),n;if((i=u[3])&&d.getElementsByClassName&&e.getElementsByClassName)return H.apply(n,e.getElementsByClassName(i)),n}if(d.qsa&&!A[t+" "]&&(!v||!v.test(t))&&(1!==p||"object"!==e.nodeName.toLowerCase())){if(c=t,f=e,1===p&&U.test(t)){(s=e.getAttribute("id"))?s=s.replace(re,ie):e.setAttribute("id",s=k),o=(l=h(t)).length;while(o--)l[o]="#"+s+" "+xe(l[o]);c=l.join(","),f=ee.test(t)&&ye(e.parentNode)||e}try{return H.apply(n,f.querySelectorAll(c)),n}catch(e){A(t,!0)}finally{s===k&&e.removeAttribute("id")}}}return g(t.replace(B,"$1"),e,n,r)}function ue(){var r=[];return function e(t,n){return r.push(t+" ")>b.cacheLength&&delete e[r.shift()],e[t+" "]=n}}function le(e){return e[k]=!0,e}function ce(e){var t=C.createElement("fieldset");try{return!!e(t)}catch(e){return!1}finally{t.parentNode&&t.parentNode.removeChild(t),t=null}}function fe(e,t){var n=e.split("|"),r=n.length;while(r--)b.attrHandle[n[r]]=t}function pe(e,t){var 
n=t&&e,r=n&&1===e.nodeType&&1===t.nodeType&&e.sourceIndex-t.sourceIndex;if(r)return r;if(n)while(n=n.nextSibling)if(n===t)return-1;return e?1:-1}function de(t){return function(e){return"input"===e.nodeName.toLowerCase()&&e.type===t}}function he(n){return function(e){var t=e.nodeName.toLowerCase();return("input"===t||"button"===t)&&e.type===n}}function ge(t){return function(e){return"form"in e?e.parentNode&&!1===e.disabled?"label"in e?"label"in e.parentNode?e.parentNode.disabled===t:e.disabled===t:e.isDisabled===t||e.isDisabled!==!t&&ae(e)===t:e.disabled===t:"label"in e&&e.disabled===t}}function ve(a){return le(function(o){return o=+o,le(function(e,t){var n,r=a([],e.length,o),i=r.length;while(i--)e[n=r[i]]&&(e[n]=!(t[n]=e[n]))})})}function ye(e){return e&&"undefined"!=typeof e.getElementsByTagName&&e}for(e in d=se.support={},i=se.isXML=function(e){var t=e.namespaceURI,n=(e.ownerDocument||e).documentElement;return!Y.test(t||n&&n.nodeName||"HTML")},T=se.setDocument=function(e){var t,n,r=e?e.ownerDocument||e:m;return r!==C&&9===r.nodeType&&r.documentElement&&(a=(C=r).documentElement,E=!i(C),m!==C&&(n=C.defaultView)&&n.top!==n&&(n.addEventListener?n.addEventListener("unload",oe,!1):n.attachEvent&&n.attachEvent("onunload",oe)),d.attributes=ce(function(e){return e.className="i",!e.getAttribute("className")}),d.getElementsByTagName=ce(function(e){return e.appendChild(C.createComment("")),!e.getElementsByTagName("*").length}),d.getElementsByClassName=K.test(C.getElementsByClassName),d.getById=ce(function(e){return a.appendChild(e).id=k,!C.getElementsByName||!C.getElementsByName(k).length}),d.getById?(b.filter.ID=function(e){var t=e.replace(te,ne);return function(e){return e.getAttribute("id")===t}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n=t.getElementById(e);return n?[n]:[]}}):(b.filter.ID=function(e){var n=e.replace(te,ne);return function(e){var t="undefined"!=typeof e.getAttributeNode&&e.getAttributeNode("id");return 
t&&t.value===n}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n,r,i,o=t.getElementById(e);if(o){if((n=o.getAttributeNode("id"))&&n.value===e)return[o];i=t.getElementsByName(e),r=0;while(o=i[r++])if((n=o.getAttributeNode("id"))&&n.value===e)return[o]}return[]}}),b.find.TAG=d.getElementsByTagName?function(e,t){return"undefined"!=typeof t.getElementsByTagName?t.getElementsByTagName(e):d.qsa?t.querySelectorAll(e):void 0}:function(e,t){var n,r=[],i=0,o=t.getElementsByTagName(e);if("*"===e){while(n=o[i++])1===n.nodeType&&r.push(n);return r}return o},b.find.CLASS=d.getElementsByClassName&&function(e,t){if("undefined"!=typeof t.getElementsByClassName&&E)return t.getElementsByClassName(e)},s=[],v=[],(d.qsa=K.test(C.querySelectorAll))&&(ce(function(e){a.appendChild(e).innerHTML="",e.querySelectorAll("[msallowcapture^='']").length&&v.push("[*^$]="+M+"*(?:''|\"\")"),e.querySelectorAll("[selected]").length||v.push("\\["+M+"*(?:value|"+R+")"),e.querySelectorAll("[id~="+k+"-]").length||v.push("~="),e.querySelectorAll(":checked").length||v.push(":checked"),e.querySelectorAll("a#"+k+"+*").length||v.push(".#.+[+~]")}),ce(function(e){e.innerHTML="";var t=C.createElement("input");t.setAttribute("type","hidden"),e.appendChild(t).setAttribute("name","D"),e.querySelectorAll("[name=d]").length&&v.push("name"+M+"*[*^$|!~]?="),2!==e.querySelectorAll(":enabled").length&&v.push(":enabled",":disabled"),a.appendChild(e).disabled=!0,2!==e.querySelectorAll(":disabled").length&&v.push(":enabled",":disabled"),e.querySelectorAll("*,:x"),v.push(",.*:")})),(d.matchesSelector=K.test(c=a.matches||a.webkitMatchesSelector||a.mozMatchesSelector||a.oMatchesSelector||a.msMatchesSelector))&&ce(function(e){d.disconnectedMatch=c.call(e,"*"),c.call(e,"[s!='']:x"),s.push("!=",$)}),v=v.length&&new RegExp(v.join("|")),s=s.length&&new RegExp(s.join("|")),t=K.test(a.compareDocumentPosition),y=t||K.test(a.contains)?function(e,t){var 
n=9===e.nodeType?e.documentElement:e,r=t&&t.parentNode;return e===r||!(!r||1!==r.nodeType||!(n.contains?n.contains(r):e.compareDocumentPosition&&16&e.compareDocumentPosition(r)))}:function(e,t){if(t)while(t=t.parentNode)if(t===e)return!0;return!1},D=t?function(e,t){if(e===t)return l=!0,0;var n=!e.compareDocumentPosition-!t.compareDocumentPosition;return n||(1&(n=(e.ownerDocument||e)===(t.ownerDocument||t)?e.compareDocumentPosition(t):1)||!d.sortDetached&&t.compareDocumentPosition(e)===n?e===C||e.ownerDocument===m&&y(m,e)?-1:t===C||t.ownerDocument===m&&y(m,t)?1:u?P(u,e)-P(u,t):0:4&n?-1:1)}:function(e,t){if(e===t)return l=!0,0;var n,r=0,i=e.parentNode,o=t.parentNode,a=[e],s=[t];if(!i||!o)return e===C?-1:t===C?1:i?-1:o?1:u?P(u,e)-P(u,t):0;if(i===o)return pe(e,t);n=e;while(n=n.parentNode)a.unshift(n);n=t;while(n=n.parentNode)s.unshift(n);while(a[r]===s[r])r++;return r?pe(a[r],s[r]):a[r]===m?-1:s[r]===m?1:0}),C},se.matches=function(e,t){return se(e,null,null,t)},se.matchesSelector=function(e,t){if((e.ownerDocument||e)!==C&&T(e),d.matchesSelector&&E&&!A[t+" "]&&(!s||!s.test(t))&&(!v||!v.test(t)))try{var n=c.call(e,t);if(n||d.disconnectedMatch||e.document&&11!==e.document.nodeType)return n}catch(e){A(t,!0)}return 0":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(e){return e[1]=e[1].replace(te,ne),e[3]=(e[3]||e[4]||e[5]||"").replace(te,ne),"~="===e[2]&&(e[3]=" "+e[3]+" "),e.slice(0,4)},CHILD:function(e){return e[1]=e[1].toLowerCase(),"nth"===e[1].slice(0,3)?(e[3]||se.error(e[0]),e[4]=+(e[4]?e[5]+(e[6]||1):2*("even"===e[3]||"odd"===e[3])),e[5]=+(e[7]+e[8]||"odd"===e[3])):e[3]&&se.error(e[0]),e},PSEUDO:function(e){var t,n=!e[6]&&e[2];return G.CHILD.test(e[0])?null:(e[3]?e[2]=e[4]||e[5]||"":n&&X.test(n)&&(t=h(n,!0))&&(t=n.indexOf(")",n.length-t)-n.length)&&(e[0]=e[0].slice(0,t),e[2]=n.slice(0,t)),e.slice(0,3))}},filter:{TAG:function(e){var 
t=e.replace(te,ne).toLowerCase();return"*"===e?function(){return!0}:function(e){return e.nodeName&&e.nodeName.toLowerCase()===t}},CLASS:function(e){var t=p[e+" "];return t||(t=new RegExp("(^|"+M+")"+e+"("+M+"|$)"))&&p(e,function(e){return t.test("string"==typeof e.className&&e.className||"undefined"!=typeof e.getAttribute&&e.getAttribute("class")||"")})},ATTR:function(n,r,i){return function(e){var t=se.attr(e,n);return null==t?"!="===r:!r||(t+="","="===r?t===i:"!="===r?t!==i:"^="===r?i&&0===t.indexOf(i):"*="===r?i&&-1:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i;function j(e,n,r){return m(n)?k.grep(e,function(e,t){return!!n.call(e,t,e)!==r}):n.nodeType?k.grep(e,function(e){return e===n!==r}):"string"!=typeof n?k.grep(e,function(e){return-1)[^>]*|#([\w-]+))$/;(k.fn.init=function(e,t,n){var r,i;if(!e)return this;if(n=n||q,"string"==typeof e){if(!(r="<"===e[0]&&">"===e[e.length-1]&&3<=e.length?[null,e,null]:L.exec(e))||!r[1]&&t)return!t||t.jquery?(t||n).find(e):this.constructor(t).find(e);if(r[1]){if(t=t instanceof k?t[0]:t,k.merge(this,k.parseHTML(r[1],t&&t.nodeType?t.ownerDocument||t:E,!0)),D.test(r[1])&&k.isPlainObject(t))for(r in t)m(this[r])?this[r](t[r]):this.attr(r,t[r]);return this}return(i=E.getElementById(r[2]))&&(this[0]=i,this.length=1),this}return e.nodeType?(this[0]=e,this.length=1,this):m(e)?void 0!==n.ready?n.ready(e):e(k):k.makeArray(e,this)}).prototype=k.fn,q=k(E);var H=/^(?:parents|prev(?:Until|All))/,O={children:!0,contents:!0,next:!0,prev:!0};function P(e,t){while((e=e[t])&&1!==e.nodeType);return e}k.fn.extend({has:function(e){var t=k(e,this),n=t.length;return this.filter(function(){for(var e=0;e\x20\t\r\n\f]*)/i,he=/^$|^module$|\/(?:java|ecma)script/i,ge={option:[1,""],thead:[1,"","
"],col:[2,"","
"],tr:[2,"","
"],td:[3,"","
"],_default:[0,"",""]};function ve(e,t){var n;return n="undefined"!=typeof e.getElementsByTagName?e.getElementsByTagName(t||"*"):"undefined"!=typeof e.querySelectorAll?e.querySelectorAll(t||"*"):[],void 0===t||t&&A(e,t)?k.merge([e],n):n}function ye(e,t){for(var n=0,r=e.length;nx",y.noCloneChecked=!!me.cloneNode(!0).lastChild.defaultValue;var Te=/^key/,Ce=/^(?:mouse|pointer|contextmenu|drag|drop)|click/,Ee=/^([^.]*)(?:\.(.+)|)/;function ke(){return!0}function Se(){return!1}function Ne(e,t){return e===function(){try{return E.activeElement}catch(e){}}()==("focus"===t)}function Ae(e,t,n,r,i,o){var a,s;if("object"==typeof t){for(s in"string"!=typeof n&&(r=r||n,n=void 0),t)Ae(e,s,n,r,t[s],o);return e}if(null==r&&null==i?(i=n,r=n=void 0):null==i&&("string"==typeof n?(i=r,r=void 0):(i=r,r=n,n=void 0)),!1===i)i=Se;else if(!i)return e;return 1===o&&(a=i,(i=function(e){return k().off(e),a.apply(this,arguments)}).guid=a.guid||(a.guid=k.guid++)),e.each(function(){k.event.add(this,t,i,r,n)})}function De(e,i,o){o?(Q.set(e,i,!1),k.event.add(e,i,{namespace:!1,handler:function(e){var t,n,r=Q.get(this,i);if(1&e.isTrigger&&this[i]){if(r.length)(k.event.special[i]||{}).delegateType&&e.stopPropagation();else if(r=s.call(arguments),Q.set(this,i,r),t=o(this,i),this[i](),r!==(n=Q.get(this,i))||t?Q.set(this,i,!1):n={},r!==n)return e.stopImmediatePropagation(),e.preventDefault(),n.value}else r.length&&(Q.set(this,i,{value:k.event.trigger(k.extend(r[0],k.Event.prototype),r.slice(1),this)}),e.stopImmediatePropagation())}})):void 0===Q.get(e,i)&&k.event.add(e,i,ke)}k.event={global:{},add:function(t,e,n,r,i){var o,a,s,u,l,c,f,p,d,h,g,v=Q.get(t);if(v){n.handler&&(n=(o=n).handler,i=o.selector),i&&k.find.matchesSelector(ie,i),n.guid||(n.guid=k.guid++),(u=v.events)||(u=v.events={}),(a=v.handle)||(a=v.handle=function(e){return"undefined"!=typeof k&&k.event.triggered!==e.type?k.event.dispatch.apply(t,arguments):void 
0}),l=(e=(e||"").match(R)||[""]).length;while(l--)d=g=(s=Ee.exec(e[l])||[])[1],h=(s[2]||"").split(".").sort(),d&&(f=k.event.special[d]||{},d=(i?f.delegateType:f.bindType)||d,f=k.event.special[d]||{},c=k.extend({type:d,origType:g,data:r,handler:n,guid:n.guid,selector:i,needsContext:i&&k.expr.match.needsContext.test(i),namespace:h.join(".")},o),(p=u[d])||((p=u[d]=[]).delegateCount=0,f.setup&&!1!==f.setup.call(t,r,h,a)||t.addEventListener&&t.addEventListener(d,a)),f.add&&(f.add.call(t,c),c.handler.guid||(c.handler.guid=n.guid)),i?p.splice(p.delegateCount++,0,c):p.push(c),k.event.global[d]=!0)}},remove:function(e,t,n,r,i){var o,a,s,u,l,c,f,p,d,h,g,v=Q.hasData(e)&&Q.get(e);if(v&&(u=v.events)){l=(t=(t||"").match(R)||[""]).length;while(l--)if(d=g=(s=Ee.exec(t[l])||[])[1],h=(s[2]||"").split(".").sort(),d){f=k.event.special[d]||{},p=u[d=(r?f.delegateType:f.bindType)||d]||[],s=s[2]&&new RegExp("(^|\\.)"+h.join("\\.(?:.*\\.|)")+"(\\.|$)"),a=o=p.length;while(o--)c=p[o],!i&&g!==c.origType||n&&n.guid!==c.guid||s&&!s.test(c.namespace)||r&&r!==c.selector&&("**"!==r||!c.selector)||(p.splice(o,1),c.selector&&p.delegateCount--,f.remove&&f.remove.call(e,c));a&&!p.length&&(f.teardown&&!1!==f.teardown.call(e,h,v.handle)||k.removeEvent(e,d,v.handle),delete u[d])}else for(d in u)k.event.remove(e,d+t[l],n,r,!0);k.isEmptyObject(u)&&Q.remove(e,"handle events")}},dispatch:function(e){var t,n,r,i,o,a,s=k.event.fix(e),u=new Array(arguments.length),l=(Q.get(this,"events")||{})[s.type]||[],c=k.event.special[s.type]||{};for(u[0]=s,t=1;t\x20\t\r\n\f]*)[^>]*)\/>/gi,qe=/\s*$/g;function Oe(e,t){return A(e,"table")&&A(11!==t.nodeType?t:t.firstChild,"tr")&&k(e).children("tbody")[0]||e}function Pe(e){return e.type=(null!==e.getAttribute("type"))+"/"+e.type,e}function Re(e){return"true/"===(e.type||"").slice(0,5)?e.type=e.type.slice(5):e.removeAttribute("type"),e}function Me(e,t){var n,r,i,o,a,s,u,l;if(1===t.nodeType){if(Q.hasData(e)&&(o=Q.access(e),a=Q.set(t,o),l=o.events))for(i in delete 
a.handle,a.events={},l)for(n=0,r=l[i].length;n")},clone:function(e,t,n){var r,i,o,a,s,u,l,c=e.cloneNode(!0),f=oe(e);if(!(y.noCloneChecked||1!==e.nodeType&&11!==e.nodeType||k.isXMLDoc(e)))for(a=ve(c),r=0,i=(o=ve(e)).length;r").attr(n.scriptAttrs||{}).prop({charset:n.scriptCharset,src:n.url}).on("load error",i=function(e){r.remove(),i=null,e&&t("error"===e.type?404:200,e.type)}),E.head.appendChild(r[0])},abort:function(){i&&i()}}});var Vt,Gt=[],Yt=/(=)\?(?=&|$)|\?\?/;k.ajaxSetup({jsonp:"callback",jsonpCallback:function(){var e=Gt.pop()||k.expando+"_"+kt++;return this[e]=!0,e}}),k.ajaxPrefilter("json jsonp",function(e,t,n){var r,i,o,a=!1!==e.jsonp&&(Yt.test(e.url)?"url":"string"==typeof e.data&&0===(e.contentType||"").indexOf("application/x-www-form-urlencoded")&&Yt.test(e.data)&&"data");if(a||"jsonp"===e.dataTypes[0])return r=e.jsonpCallback=m(e.jsonpCallback)?e.jsonpCallback():e.jsonpCallback,a?e[a]=e[a].replace(Yt,"$1"+r):!1!==e.jsonp&&(e.url+=(St.test(e.url)?"&":"?")+e.jsonp+"="+r),e.converters["script json"]=function(){return o||k.error(r+" was not called"),o[0]},e.dataTypes[0]="json",i=C[r],C[r]=function(){o=arguments},n.always(function(){void 0===i?k(C).removeProp(r):C[r]=i,e[r]&&(e.jsonpCallback=t.jsonpCallback,Gt.push(r)),o&&m(i)&&i(o[0]),o=i=void 0}),"script"}),y.createHTMLDocument=((Vt=E.implementation.createHTMLDocument("").body).innerHTML="
",2===Vt.childNodes.length),k.parseHTML=function(e,t,n){return"string"!=typeof e?[]:("boolean"==typeof t&&(n=t,t=!1),t||(y.createHTMLDocument?((r=(t=E.implementation.createHTMLDocument("")).createElement("base")).href=E.location.href,t.head.appendChild(r)):t=E),o=!n&&[],(i=D.exec(e))?[t.createElement(i[1])]:(i=we([e],t,o),o&&o.length&&k(o).remove(),k.merge([],i.childNodes)));var r,i,o},k.fn.load=function(e,t,n){var r,i,o,a=this,s=e.indexOf(" ");return-1").append(k.parseHTML(e)).find(r):e)}).always(n&&function(e,t){a.each(function(){n.apply(this,o||[e.responseText,t,e])})}),this},k.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){k.fn[t]=function(e){return this.on(t,e)}}),k.expr.pseudos.animated=function(t){return k.grep(k.timers,function(e){return t===e.elem}).length},k.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=k.css(e,"position"),c=k(e),f={};"static"===l&&(e.style.position="relative"),s=c.offset(),o=k.css(e,"top"),u=k.css(e,"left"),("absolute"===l||"fixed"===l)&&-1<(o+u).indexOf("auto")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),m(t)&&(t=t.call(e,n,k.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),"using"in t?t.using.call(e,f):c.css(f)}},k.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){k.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var 
e,t,n,r=this[0],i={top:0,left:0};if("fixed"===k.css(r,"position"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&"static"===k.css(e,"position"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=k(e).offset()).top+=k.css(e,"borderTopWidth",!0),i.left+=k.css(e,"borderLeftWidth",!0))}return{top:t.top-i.top-k.css(r,"marginTop",!0),left:t.left-i.left-k.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&"static"===k.css(e,"position"))e=e.offsetParent;return e||ie})}}),k.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,i){var o="pageYOffset"===i;k.fn[t]=function(e){return _(this,function(e,t,n){var r;if(x(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),k.each(["top","left"],function(e,n){k.cssHooks[n]=ze(y.pixelPosition,function(e,t){if(t)return t=_e(e,n),$e.test(t)?k(e).position()[n]+"px":t})}),k.each({Height:"height",Width:"width"},function(a,s){k.each({padding:"inner"+a,content:s,"":"outer"+a},function(r,o){k.fn[o]=function(e,t){var n=arguments.length&&(r||"boolean"!=typeof e),i=r||(!0===e||!0===t?"margin":"border");return _(this,function(e,t,n){var r;return x(e)?0===o.indexOf("outer")?e["inner"+a]:e.document.documentElement["client"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body["scroll"+a],r["scroll"+a],e.body["offset"+a],r["offset"+a],r["client"+a])):void 0===n?k.css(e,t,i):k.style(e,t,n,i)},s,n?e:void 0,n)}})}),k.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,n){k.fn[n]=function(e,t){return 0=1){this.items.push(itemData);this.completeLayout(rowWidthWithoutSpacing/itemData.aspectRatio,"justify");return 
true}}}if(newAspectRatiothis.maxAspectRatio){if(this.items.length===0){this.items.push(merge(itemData));this.completeLayout(rowWidthWithoutSpacing/newAspectRatio,"justify");return true}previousRowWidthWithoutSpacing=this.width-(this.items.length-1)*this.spacing;previousAspectRatio=this.items.reduce(function(sum,item){return sum+item.aspectRatio},0);previousTargetAspectRatio=previousRowWidthWithoutSpacing/this.targetRowHeight;if(Math.abs(newAspectRatio-targetAspectRatio)>Math.abs(previousAspectRatio-previousTargetAspectRatio)){this.completeLayout(previousRowWidthWithoutSpacing/previousAspectRatio,"justify");return false}else{this.items.push(merge(itemData));this.completeLayout(rowWidthWithoutSpacing/newAspectRatio,"justify");return true}}else{this.items.push(merge(itemData));this.completeLayout(rowWidthWithoutSpacing/newAspectRatio,"justify");return true}},isLayoutComplete:function(){return this.height>0},completeLayout:function(newHeight,widowLayoutStyle){var itemWidthSum=this.left,rowWidthWithoutSpacing=this.width-(this.items.length-1)*this.spacing,clampedToNativeRatio,clampedHeight,errorWidthPerItem,roundedCumulativeErrors,singleItemGeometry,centerOffset;if(typeof widowLayoutStyle==="undefined"||["justify","center","left"].indexOf(widowLayoutStyle)<0){widowLayoutStyle="left"}clampedHeight=Math.max(this.edgeCaseMinRowHeight,Math.min(newHeight,this.edgeCaseMaxRowHeight));if(newHeight!==clampedHeight){this.height=clampedHeight;clampedToNativeRatio=rowWidthWithoutSpacing/clampedHeight/(rowWidthWithoutSpacing/newHeight)}else{this.height=newHeight;clampedToNativeRatio=1}this.items.forEach(function(item){item.top=this.top;item.width=item.aspectRatio*this.height*clampedToNativeRatio;item.height=this.height;item.left=itemWidthSum;itemWidthSum+=item.width+this.spacing},this);if(widowLayoutStyle==="justify"){itemWidthSum-=this.spacing+this.left;errorWidthPerItem=(itemWidthSum-this.width)/this.items.length;roundedCumulativeErrors=this.items.map(function(item,i){return 
Math.round((i+1)*errorWidthPerItem)});if(this.items.length===1){singleItemGeometry=this.items[0];singleItemGeometry.width-=Math.round(errorWidthPerItem)}else{this.items.forEach(function(item,i){if(i>0){item.left-=roundedCumulativeErrors[i-1];item.width-=roundedCumulativeErrors[i]-roundedCumulativeErrors[i-1]}else{item.width-=roundedCumulativeErrors[i]}})}}else if(widowLayoutStyle==="center"){centerOffset=(this.width-itemWidthSum)/2;this.items.forEach(function(item){item.left+=centerOffset+this.spacing},this)}},forceComplete:function(fitToWidth,rowHeight){if(typeof rowHeight==="number"){this.completeLayout(rowHeight,this.widowLayoutStyle)}else{this.completeLayout(this.targetRowHeight,this.widowLayoutStyle)}},getItems:function(){return this.items}}},{merge:2}],2:[function(require,module,exports){(function(isNode){var Public=function(clone){return merge(clone===true,false,arguments)},publicName="merge";Public.recursive=function(clone){return merge(clone===true,true,arguments)};Public.clone=function(input){var output=input,type=typeOf(input),index,size;if(type==="array"){output=[];size=input.length;for(index=0;index=layoutConfig.maxNumRows){currentRow=null;return true}currentRow=createNewRow(layoutConfig,layoutData);if(!itemAdded){itemAdded=currentRow.addItem(itemData);if(currentRow.isLayoutComplete()){laidOutItems=laidOutItems.concat(addRow(layoutConfig,layoutData,currentRow));if(layoutData._rows.length>=layoutConfig.maxNumRows){currentRow=null;return 
true}currentRow=createNewRow(layoutConfig,layoutData)}}}});if(currentRow&¤tRow.getItems().length&&layoutConfig.showWidows){if(layoutData._rows.length){if(layoutData._rows[layoutData._rows.length-1].isBreakoutRow){nextToLastRowHeight=layoutData._rows[layoutData._rows.length-1].targetRowHeight}else{nextToLastRowHeight=layoutData._rows[layoutData._rows.length-1].height}currentRow.forceComplete(false,nextToLastRowHeight)}else{currentRow.forceComplete(false)}laidOutItems=laidOutItems.concat(addRow(layoutConfig,layoutData,currentRow));layoutConfig._widowCount=currentRow.getItems().length}layoutData._containerHeight=layoutData._containerHeight-layoutConfig.boxSpacing.vertical;layoutData._containerHeight=layoutData._containerHeight+layoutConfig.containerPadding.bottom;return{containerHeight:layoutData._containerHeight,widowCount:layoutConfig._widowCount,boxes:layoutData._layoutItems}}module.exports=function(input,config){var layoutConfig={};var layoutData={};var defaults={containerWidth:1060,containerPadding:10,boxSpacing:10,targetRowHeight:320,targetRowHeightTolerance:.25,maxNumRows:Number.POSITIVE_INFINITY,forceAspectRatio:false,showWidows:true,fullWidthBreakoutRowCadence:false,widowLayoutStyle:"left"};var containerPadding={};var 
boxSpacing={};config=config||{};layoutConfig=merge(defaults,config);containerPadding.top=!isNaN(parseFloat(layoutConfig.containerPadding.top))?layoutConfig.containerPadding.top:layoutConfig.containerPadding;containerPadding.right=!isNaN(parseFloat(layoutConfig.containerPadding.right))?layoutConfig.containerPadding.right:layoutConfig.containerPadding;containerPadding.bottom=!isNaN(parseFloat(layoutConfig.containerPadding.bottom))?layoutConfig.containerPadding.bottom:layoutConfig.containerPadding;containerPadding.left=!isNaN(parseFloat(layoutConfig.containerPadding.left))?layoutConfig.containerPadding.left:layoutConfig.containerPadding;boxSpacing.horizontal=!isNaN(parseFloat(layoutConfig.boxSpacing.horizontal))?layoutConfig.boxSpacing.horizontal:layoutConfig.boxSpacing;boxSpacing.vertical=!isNaN(parseFloat(layoutConfig.boxSpacing.vertical))?layoutConfig.boxSpacing.vertical:layoutConfig.boxSpacing;layoutConfig.containerPadding=containerPadding;layoutConfig.boxSpacing=boxSpacing;layoutData._layoutItems=[];layoutData._awakeItems=[];layoutData._inViewportItems=[];layoutData._leadingOrphans=[];layoutData._trailingOrphans=[];layoutData._containerHeight=layoutConfig.containerPadding.top;layoutData._rows=[];layoutData._orphans=[];layoutConfig._widowCount=0;return computeLayout(layoutConfig,layoutData,input.map(function(item){if(item.width&&item.height){return{aspectRatio:item.width/item.height}}else{return{aspectRatio:item}}}))}},{"./row":1,merge:2}]},{},[]); \ No newline at end of file diff --git a/assets/js/luxon.min.js b/assets/js/luxon.min.js new file mode 100644 index 000000000..678a5b982 --- /dev/null +++ b/assets/js/luxon.min.js @@ -0,0 +1 @@ +var luxon=function(e){"use strict";function t(e,t){for(var n=0;ne.length)&&(t=e.length);for(var n=0,r=new Array(t);n=e.length?{done:!0}:{done:!1,value:e[t++]}};throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() 
method.")}return(t=e[Symbol.iterator]()).next.bind(t)}var l=function(e){function t(){return e.apply(this,arguments)||this}return r(t,e),t}(u(Error)),f=function(e){function t(t){return e.call(this,"Invalid DateTime: "+t.toMessage())||this}return r(t,e),t}(l),d=function(e){function t(t){return e.call(this,"Invalid Interval: "+t.toMessage())||this}return r(t,e),t}(l),h=function(e){function t(t){return e.call(this,"Invalid Duration: "+t.toMessage())||this}return r(t,e),t}(l),m=function(e){function t(){return e.apply(this,arguments)||this}return r(t,e),t}(l),y=function(e){function t(t){return e.call(this,"Invalid unit "+t)||this}return r(t,e),t}(l),v=function(e){function t(){return e.apply(this,arguments)||this}return r(t,e),t}(l),g=function(e){function t(){return e.call(this,"Zone is an abstract class")||this}return r(t,e),t}(l),p="numeric",w="short",k="long",b={year:p,month:p,day:p},O={year:p,month:w,day:p},S={year:p,month:w,day:p,weekday:w},T={year:p,month:k,day:p},M={year:p,month:k,day:p,weekday:k},N={hour:p,minute:p},E={hour:p,minute:p,second:p},D={hour:p,minute:p,second:p,timeZoneName:w},I={hour:p,minute:p,second:p,timeZoneName:k},V={hour:p,minute:p,hour12:!1},L={hour:p,minute:p,second:p,hour12:!1},x={hour:p,minute:p,second:p,hour12:!1,timeZoneName:w},C={hour:p,minute:p,second:p,hour12:!1,timeZoneName:k},F={year:p,month:p,day:p,hour:p,minute:p},Z={year:p,month:p,day:p,hour:p,minute:p,second:p},j={year:p,month:w,day:p,hour:p,minute:p},A={year:p,month:w,day:p,hour:p,minute:p,second:p},z={year:p,month:w,day:p,weekday:w,hour:p,minute:p},_={year:p,month:k,day:p,hour:p,minute:p,timeZoneName:w},q={year:p,month:k,day:p,hour:p,minute:p,second:p,timeZoneName:w},H={year:p,month:k,day:p,weekday:k,hour:p,minute:p,timeZoneName:k},U={year:p,month:k,day:p,weekday:k,hour:p,minute:p,second:p,timeZoneName:k};function R(e){return void 0===e}function P(e){return"number"==typeof e}function W(e){return"number"==typeof e&&e%1==0}function J(){try{return"undefined"!=typeof 
Intl&&Intl.DateTimeFormat}catch(e){return!1}}function Y(){return!R(Intl.DateTimeFormat.prototype.formatToParts)}function G(){try{return"undefined"!=typeof Intl&&!!Intl.RelativeTimeFormat}catch(e){return!1}}function $(e,t,n){if(0!==e.length)return e.reduce(function(e,r){var i=[t(r),r];return e&&n(e[0],i[0])===e[0]?e:i},null)[1]}function B(e,t){return t.reduce(function(t,n){return t[n]=e[n],t},{})}function Q(e,t){return Object.prototype.hasOwnProperty.call(e,t)}function K(e,t,n){return W(e)&&e>=t&&e<=n}function X(e,t){void 0===t&&(t=2);var n=e<0?"-":"",r=n?-1*e:e;return""+n+(r.toString().length=0&&(t=new Date(t)).setUTCFullYear(t.getUTCFullYear()-1900),+t}function ue(e){var t=(e+Math.floor(e/4)-Math.floor(e/100)+Math.floor(e/400))%7,n=e-1,r=(n+Math.floor(n/4)-Math.floor(n/100)+Math.floor(n/400))%7;return 4===t||3===r?53:52}function se(e){return e>99?e:e>60?1900+e:2e3+e}function ce(e,t,n,r){void 0===r&&(r=null);var i=new Date(e),o={hour12:!1,year:"numeric",month:"2-digit",day:"2-digit",hour:"2-digit",minute:"2-digit"};r&&(o.timeZone=r);var a=Object.assign({timeZoneName:t},o),u=J();if(u&&Y()){var s=new Intl.DateTimeFormat(n,a).formatToParts(i).find(function(e){return"timezonename"===e.type.toLowerCase()});return s?s.value:null}if(u){var c=new Intl.DateTimeFormat(n,o).format(i);return new Intl.DateTimeFormat(n,a).format(i).substring(c.length).replace(/^[, \u200e]+/,"")}return null}function le(e,t){var n=parseInt(e,10);Number.isNaN(n)&&(n=0);var r=parseInt(t,10)||0;return 60*n+(n<0||Object.is(n,-0)?-r:r)}function fe(e){var t=Number(e);if("boolean"==typeof e||""===e||Number.isNaN(t))throw new v("Invalid unit value "+e);return t}function de(e,t,n){var r={};for(var i in e)if(Q(e,i)){if(n.indexOf(i)>=0)continue;var o=e[i];if(void 0===o||null===o)continue;r[t(i)]=fe(o)}return r}function he(e,t){var 
n=Math.trunc(Math.abs(e/60)),r=Math.trunc(Math.abs(e%60)),i=e>=0?"+":"-";switch(t){case"short":return""+i+X(n,2)+":"+X(r,2);case"narrow":return""+i+n+(r>0?":"+r:"");case"techie":return""+i+X(n,2)+X(r,2);default:throw new RangeError("Value format "+t+" is out of range for property format")}}function me(e){return B(e,["hour","minute","second","millisecond"])}var ye=/[A-Za-z_+-]{1,256}(:?\/[A-Za-z_+-]{1,256}(\/[A-Za-z_+-]{1,256})?)?/;function ve(e){return JSON.stringify(e,Object.keys(e).sort())}var ge=["January","February","March","April","May","June","July","August","September","October","November","December"],pe=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],we=["J","F","M","A","M","J","J","A","S","O","N","D"];function ke(e){switch(e){case"narrow":return[].concat(we);case"short":return[].concat(pe);case"long":return[].concat(ge);case"numeric":return["1","2","3","4","5","6","7","8","9","10","11","12"];case"2-digit":return["01","02","03","04","05","06","07","08","09","10","11","12"];default:return null}}var be=["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],Oe=["Mon","Tue","Wed","Thu","Fri","Sat","Sun"],Se=["M","T","W","T","F","S","S"];function Te(e){switch(e){case"narrow":return[].concat(Se);case"short":return[].concat(Oe);case"long":return[].concat(be);case"numeric":return["1","2","3","4","5","6","7"];default:return null}}var Me=["AM","PM"],Ne=["Before Christ","Anno Domini"],Ee=["BC","AD"],De=["B","A"];function Ie(e){switch(e){case"narrow":return[].concat(De);case"short":return[].concat(Ee);case"long":return[].concat(Ne);default:return null}}function Ve(e,t){for(var n,r="",i=c(e);!(n=i()).done;){var o=n.value;o.literal?r+=o.val:r+=t(o.val)}return r}var Le={D:b,DD:O,DDD:T,DDDD:M,t:N,tt:E,ttt:D,tttt:I,T:V,TT:L,TTT:x,TTTT:C,f:F,ff:j,fff:_,ffff:H,F:Z,FF:A,FFF:q,FFFF:U},xe=function(){function e(e,t){this.opts=t,this.loc=e,this.systemLoc=null}e.create=function(t,n){return void 0===n&&(n={}),new 
e(t,n)},e.parseFormat=function(e){for(var t=null,n="",r=!1,i=[],o=0;o0&&i.push({literal:r,val:n}),t=null,n="",r=!r):r?n+=a:a===t?n+=a:(n.length>0&&i.push({literal:!1,val:n}),n=a,t=a)}return n.length>0&&i.push({literal:r,val:n}),i},e.macroTokenToFormatOpts=function(e){return Le[e]};var t=e.prototype;return t.formatWithSystemDefault=function(e,t){return null===this.systemLoc&&(this.systemLoc=this.loc.redefaultToSystem()),this.systemLoc.dtFormatter(e,Object.assign({},this.opts,t)).format()},t.formatDateTime=function(e,t){return void 0===t&&(t={}),this.loc.dtFormatter(e,Object.assign({},this.opts,t)).format()},t.formatDateTimeParts=function(e,t){return void 0===t&&(t={}),this.loc.dtFormatter(e,Object.assign({},this.opts,t)).formatToParts()},t.resolvedOptions=function(e,t){return void 0===t&&(t={}),this.loc.dtFormatter(e,Object.assign({},this.opts,t)).resolvedOptions()},t.num=function(e,t){if(void 0===t&&(t=0),this.opts.forceSimple)return X(e,t);var n=Object.assign({},this.opts);return t>0&&(n.padTo=t),this.loc.numberFormatter(n).format(e)},t.formatDateTimeFromString=function(t,n){var r=this,i="en"===this.loc.listingMode(),o=this.loc.outputCalendar&&"gregory"!==this.loc.outputCalendar&&Y(),a=function(e,n){return r.loc.extract(t,e,n)},u=function(e){return t.isOffsetFixed&&0===t.offset&&e.allowZ?"Z":t.isValid?t.zone.formatOffset(t.ts,e.format):""},s=function(){return i?function(e){return Me[e.hour<12?0:1]}(t):a({hour:"numeric",hour12:!0},"dayperiod")},c=function(e,n){return i?function(e,t){return ke(t)[e.month-1]}(t,e):a(n?{month:e}:{month:e,day:"numeric"},"month")},l=function(e,n){return i?function(e,t){return Te(t)[e.weekday-1]}(t,e):a(n?{weekday:e}:{weekday:e,month:"long",day:"numeric"},"weekday")},f=function(e){return i?function(e,t){return Ie(t)[e.year<0?0:1]}(t,e):a({era:e},"era")};return Ve(e.parseFormat(n),function(n){switch(n){case"S":return r.num(t.millisecond);case"u":case"SSS":return r.num(t.millisecond,3);case"s":return r.num(t.second);case"ss":return 
r.num(t.second,2);case"m":return r.num(t.minute);case"mm":return r.num(t.minute,2);case"h":return r.num(t.hour%12==0?12:t.hour%12);case"hh":return r.num(t.hour%12==0?12:t.hour%12,2);case"H":return r.num(t.hour);case"HH":return r.num(t.hour,2);case"Z":return u({format:"narrow",allowZ:r.opts.allowZ});case"ZZ":return u({format:"short",allowZ:r.opts.allowZ});case"ZZZ":return u({format:"techie",allowZ:r.opts.allowZ});case"ZZZZ":return t.zone.offsetName(t.ts,{format:"short",locale:r.loc.locale});case"ZZZZZ":return t.zone.offsetName(t.ts,{format:"long",locale:r.loc.locale});case"z":return t.zoneName;case"a":return s();case"d":return o?a({day:"numeric"},"day"):r.num(t.day);case"dd":return o?a({day:"2-digit"},"day"):r.num(t.day,2);case"c":return r.num(t.weekday);case"ccc":return l("short",!0);case"cccc":return l("long",!0);case"ccccc":return l("narrow",!0);case"E":return r.num(t.weekday);case"EEE":return l("short",!1);case"EEEE":return l("long",!1);case"EEEEE":return l("narrow",!1);case"L":return o?a({month:"numeric",day:"numeric"},"month"):r.num(t.month);case"LL":return o?a({month:"2-digit",day:"numeric"},"month"):r.num(t.month,2);case"LLL":return c("short",!0);case"LLLL":return c("long",!0);case"LLLLL":return c("narrow",!0);case"M":return o?a({month:"numeric"},"month"):r.num(t.month);case"MM":return o?a({month:"2-digit"},"month"):r.num(t.month,2);case"MMM":return c("short",!1);case"MMMM":return c("long",!1);case"MMMMM":return c("narrow",!1);case"y":return o?a({year:"numeric"},"year"):r.num(t.year);case"yy":return o?a({year:"2-digit"},"year"):r.num(t.year.toString().slice(-2),2);case"yyyy":return o?a({year:"numeric"},"year"):r.num(t.year,4);case"yyyyyy":return o?a({year:"numeric"},"year"):r.num(t.year,6);case"G":return f("short");case"GG":return f("long");case"GGGGG":return f("narrow");case"kk":return r.num(t.weekYear.toString().slice(-2),2);case"kkkk":return r.num(t.weekYear,4);case"W":return r.num(t.weekNumber);case"WW":return r.num(t.weekNumber,2);case"o":return 
r.num(t.ordinal);case"ooo":return r.num(t.ordinal,3);case"q":return r.num(t.quarter);case"qq":return r.num(t.quarter,2);case"X":return r.num(Math.floor(t.ts/1e3));case"x":return r.num(t.ts);default:return function(n){var i=e.macroTokenToFormatOpts(n);return i?r.formatWithSystemDefault(t,i):n}(n)}})},t.formatDurationFromString=function(t,n){var r,i=this,o=function(e){switch(e[0]){case"S":return"millisecond";case"s":return"second";case"m":return"minute";case"h":return"hour";case"d":return"day";case"M":return"month";case"y":return"year";default:return null}},a=e.parseFormat(n),u=a.reduce(function(e,t){var n=t.literal,r=t.val;return n?e:e.concat(r)},[]),s=t.shiftTo.apply(t,u.map(o).filter(function(e){return e}));return Ve(a,(r=s,function(e){var t=o(e);return t?i.num(r.get(t),e.length):e}))},e}(),Ce=function(){function e(e,t){this.reason=e,this.explanation=t}return e.prototype.toMessage=function(){return this.explanation?this.reason+": "+this.explanation:this.reason},e}(),Fe=function(){function e(){}var t=e.prototype;return t.offsetName=function(e,t){throw new g},t.formatOffset=function(e,t){throw new g},t.offset=function(e){throw new g},t.equals=function(e){throw new g},n(e,[{key:"type",get:function(){throw new g}},{key:"name",get:function(){throw new g}},{key:"universal",get:function(){throw new g}},{key:"isValid",get:function(){throw new g}}]),e}(),Ze=null,je=function(e){function t(){return e.apply(this,arguments)||this}r(t,e);var i=t.prototype;return i.offsetName=function(e,t){return ce(e,t.format,t.locale)},i.formatOffset=function(e,t){return he(this.offset(e),t)},i.offset=function(e){return-new Date(e).getTimezoneOffset()},i.equals=function(e){return"local"===e.type},n(t,[{key:"type",get:function(){return"local"}},{key:"name",get:function(){return J()?(new Intl.DateTimeFormat).resolvedOptions().timeZone:"local"}},{key:"universal",get:function(){return!1}},{key:"isValid",get:function(){return!0}}],[{key:"instance",get:function(){return null===Ze&&(Ze=new 
t),Ze}}]),t}(Fe),Ae=RegExp("^"+ye.source+"$"),ze={};var _e={year:0,month:1,day:2,hour:3,minute:4,second:5};var qe={},He=function(e){function t(n){var r;return(r=e.call(this)||this).zoneName=n,r.valid=t.isValidZone(n),r}r(t,e),t.create=function(e){return qe[e]||(qe[e]=new t(e)),qe[e]},t.resetCache=function(){qe={},ze={}},t.isValidSpecifier=function(e){return!(!e||!e.match(Ae))},t.isValidZone=function(e){try{return new Intl.DateTimeFormat("en-US",{timeZone:e}).format(),!0}catch(e){return!1}},t.parseGMTOffset=function(e){if(e){var t=e.match(/^Etc\/GMT(0|[+-]\d{1,2})$/i);if(t)return-60*parseInt(t[1])}return null};var i=t.prototype;return i.offsetName=function(e,t){return ce(e,t.format,t.locale,this.name)},i.formatOffset=function(e,t){return he(this.offset(e),t)},i.offset=function(e){var t=new Date(e);if(isNaN(t))return NaN;var n,r=(n=this.name,ze[n]||(ze[n]=new Intl.DateTimeFormat("en-US",{hour12:!1,timeZone:n,year:"numeric",month:"2-digit",day:"2-digit",hour:"2-digit",minute:"2-digit",second:"2-digit"})),ze[n]),i=r.formatToParts?function(e,t){for(var n=e.formatToParts(t),r=[],i=0;i=0?d:1e3+d))/6e4},i.equals=function(e){return"iana"===e.type&&e.name===this.name},n(t,[{key:"type",get:function(){return"iana"}},{key:"name",get:function(){return this.zoneName}},{key:"universal",get:function(){return!1}},{key:"isValid",get:function(){return this.valid}}]),t}(Fe),Ue=null,Re=function(e){function t(t){var n;return(n=e.call(this)||this).fixed=t,n}r(t,e),t.instance=function(e){return 0===e?t.utcInstance:new t(e)},t.parseSpecifier=function(e){if(e){var n=e.match(/^utc(?:([+-]\d{1,2})(?::(\d{2}))?)?$/i);if(n)return new t(le(n[1],n[2]))}return null},n(t,null,[{key:"utcInstance",get:function(){return null===Ue&&(Ue=new t(0)),Ue}}]);var i=t.prototype;return i.offsetName=function(){return this.name},i.formatOffset=function(e,t){return he(this.fixed,t)},i.offset=function(){return 
this.fixed},i.equals=function(e){return"fixed"===e.type&&e.fixed===this.fixed},n(t,[{key:"type",get:function(){return"fixed"}},{key:"name",get:function(){return 0===this.fixed?"UTC":"UTC"+he(this.fixed,"narrow")}},{key:"universal",get:function(){return!0}},{key:"isValid",get:function(){return!0}}]),t}(Fe),Pe=function(e){function t(t){var n;return(n=e.call(this)||this).zoneName=t,n}r(t,e);var i=t.prototype;return i.offsetName=function(){return null},i.formatOffset=function(){return""},i.offset=function(){return NaN},i.equals=function(){return!1},n(t,[{key:"type",get:function(){return"invalid"}},{key:"name",get:function(){return this.zoneName}},{key:"universal",get:function(){return!1}},{key:"isValid",get:function(){return!1}}]),t}(Fe);function We(e,t){var n;if(R(e)||null===e)return t;if(e instanceof Fe)return e;if("string"==typeof e){var r=e.toLowerCase();return"local"===r?t:"utc"===r||"gmt"===r?Re.utcInstance:null!=(n=He.parseGMTOffset(e))?Re.instance(n):He.isValidSpecifier(r)?He.create(e):Re.parseSpecifier(r)||new Pe(e)}return P(e)?Re.instance(e):"object"==typeof e&&e.offset&&"number"==typeof e.offset?e:new Pe(e)}var Je=function(){return Date.now()},Ye=null,Ge=null,$e=null,Be=null,Qe=!1,Ke=function(){function e(){}return e.resetCaches=function(){ct.resetCache(),He.resetCache()},n(e,null,[{key:"now",get:function(){return Je},set:function(e){Je=e}},{key:"defaultZoneName",get:function(){return e.defaultZone.name},set:function(e){Ye=e?We(e):null}},{key:"defaultZone",get:function(){return Ye||je.instance}},{key:"defaultLocale",get:function(){return Ge},set:function(e){Ge=e}},{key:"defaultNumberingSystem",get:function(){return $e},set:function(e){$e=e}},{key:"defaultOutputCalendar",get:function(){return Be},set:function(e){Be=e}},{key:"throwOnInvalid",get:function(){return Qe},set:function(e){Qe=e}}]),e}(),Xe={};function et(e,t){void 0===t&&(t={});var n=JSON.stringify([e,t]),r=Xe[n];return r||(r=new Intl.DateTimeFormat(e,t),Xe[n]=r),r}var tt={};var nt={};function 
rt(e,t){void 0===t&&(t={});var n=t,r=(n.base,function(e,t){if(null==e)return{};var n,r,i={},o=Object.keys(e);for(r=0;r=0||(i[n]=e[n]);return i}(n,["base"])),i=JSON.stringify([e,r]),o=nt[i];return o||(o=new Intl.RelativeTimeFormat(e,t),nt[i]=o),o}var it=null;function ot(e,t,n,r,i){var o=e.listingMode(n);return"error"===o?null:"en"===o?r(t):i(t)}var at=function(){function e(e,t,n){if(this.padTo=n.padTo||0,this.floor=n.floor||!1,!t&&J()){var r={useGrouping:!1};n.padTo>0&&(r.minimumIntegerDigits=n.padTo),this.inf=function(e,t){void 0===t&&(t={});var n=JSON.stringify([e,t]),r=tt[n];return r||(r=new Intl.NumberFormat(e,t),tt[n]=r),r}(e,r)}}return e.prototype.format=function(e){if(this.inf){var t=this.floor?Math.floor(e):e;return this.inf.format(t)}return X(this.floor?Math.floor(e):ne(e,3),this.padTo)},e}(),ut=function(){function e(e,t,n){var r;if(this.opts=n,this.hasIntl=J(),e.zone.universal&&this.hasIntl){var i=e.offset/60*-1,o=i>=0?"Etc/GMT+"+i:"Etc/GMT"+i,a=He.isValidZone(o);0!==e.offset&&a?(r=o,this.dt=e):(r="UTC",n.timeZoneName?this.dt=e:this.dt=0===e.offset?e:sr.fromMillis(e.ts+60*e.offset*1e3))}else"local"===e.zone.type?this.dt=e:(this.dt=e,r=e.zone.name);if(this.hasIntl){var u=Object.assign({},this.opts);r&&(u.timeZone=r),this.dtf=et(t,u)}}var t=e.prototype;return t.format=function(){if(this.hasIntl)return this.dtf.format(this.dt.toJSDate());var e=function(e){switch(ve(B(e,["weekday","era","year","month","day","hour","minute","second","timeZoneName","hour12"]))){case ve(b):return"M/d/yyyy";case ve(O):return"LLL d, yyyy";case ve(S):return"EEE, LLL d, yyyy";case ve(T):return"LLLL d, yyyy";case ve(M):return"EEEE, LLLL d, yyyy";case ve(N):return"h:mm a";case ve(E):return"h:mm:ss a";case ve(D):case ve(I):return"h:mm a";case ve(V):return"HH:mm";case ve(L):return"HH:mm:ss";case ve(x):case ve(C):return"HH:mm";case ve(F):return"M/d/yyyy, h:mm a";case ve(j):return"LLL d, yyyy, h:mm a";case ve(_):return"LLLL d, yyyy, h:mm a";case ve(H):return"EEEE, LLLL d, yyyy, h:mm 
a";case ve(Z):return"M/d/yyyy, h:mm:ss a";case ve(A):return"LLL d, yyyy, h:mm:ss a";case ve(z):return"EEE, d LLL yyyy, h:mm a";case ve(q):return"LLLL d, yyyy, h:mm:ss a";case ve(U):return"EEEE, LLLL d, yyyy, h:mm:ss a";default:return"EEEE, LLLL d, yyyy, h:mm a"}}(this.opts),t=ct.create("en-US");return xe.create(t).formatDateTimeFromString(this.dt,e)},t.formatToParts=function(){return this.hasIntl&&Y()?this.dtf.formatToParts(this.dt.toJSDate()):[]},t.resolvedOptions=function(){return this.hasIntl?this.dtf.resolvedOptions():{locale:"en-US",numberingSystem:"latn",outputCalendar:"gregory"}},e}(),st=function(){function e(e,t,n){this.opts=Object.assign({style:"long"},n),!t&&G()&&(this.rtf=rt(e,n))}var t=e.prototype;return t.format=function(e,t){return this.rtf?this.rtf.format(e,t):function(e,t,n,r){void 0===n&&(n="always"),void 0===r&&(r=!1);var i={years:["year","yr."],quarters:["quarter","qtr."],months:["month","mo."],weeks:["week","wk."],days:["day","day","days"],hours:["hour","hr."],minutes:["minute","min."],seconds:["second","sec."]},o=-1===["hours","minutes","seconds"].indexOf(e);if("auto"===n&&o){var a="days"===e;switch(t){case 1:return a?"tomorrow":"next "+i[e][0];case-1:return a?"yesterday":"last "+i[e][0];case 0:return a?"today":"this "+i[e][0]}}var u=Object.is(t,-0)||t<0,s=Math.abs(t),c=1===s,l=i[e],f=r?c?l[1]:l[2]||l[1]:c?i[e][0]:e;return u?s+" "+f+" ago":"in "+s+" "+f}(t,e,this.opts.numeric,"long"!==this.opts.style)},t.formatToParts=function(e,t){return this.rtf?this.rtf.formatToParts(e,t):[]},e}(),ct=function(){function e(e,t,n,r){var i=function(e){var t=e.indexOf("-u-");if(-1===t)return[e];var n,r=e.substring(0,t);try{n=et(e).resolvedOptions()}catch(e){n=et(r).resolvedOptions()}var i=n;return[r,i.numberingSystem,i.calendar]}(e),o=i[0],a=i[1],u=i[2];this.locale=o,this.numberingSystem=t||a||null,this.outputCalendar=n||u||null,this.intl=function(e,t,n){return 
J()?n||t?(e+="-u",n&&(e+="-ca-"+n),t&&(e+="-nu-"+t),e):e:[]}(this.locale,this.numberingSystem,this.outputCalendar),this.weekdaysCache={format:{},standalone:{}},this.monthsCache={format:{},standalone:{}},this.meridiemCache=null,this.eraCache={},this.specifiedLocale=r,this.fastNumbersCached=null}e.fromOpts=function(t){return e.create(t.locale,t.numberingSystem,t.outputCalendar,t.defaultToEN)},e.create=function(t,n,r,i){void 0===i&&(i=!1);var o=t||Ke.defaultLocale;return new e(o||(i?"en-US":function(){if(it)return it;if(J()){var e=(new Intl.DateTimeFormat).resolvedOptions().locale;return it=e&&"und"!==e?e:"en-US"}return it="en-US"}()),n||Ke.defaultNumberingSystem,r||Ke.defaultOutputCalendar,o)},e.resetCache=function(){it=null,Xe={},tt={},nt={}},e.fromObject=function(t){var n=void 0===t?{}:t,r=n.locale,i=n.numberingSystem,o=n.outputCalendar;return e.create(r,i,o)};var t=e.prototype;return t.listingMode=function(e){void 0===e&&(e=!0);var t=J()&&Y(),n=this.isEnglish(),r=!(null!==this.numberingSystem&&"latn"!==this.numberingSystem||null!==this.outputCalendar&&"gregory"!==this.outputCalendar);return t||n&&r||e?!t||n&&r?"en":"intl":"error"},t.clone=function(t){return t&&0!==Object.getOwnPropertyNames(t).length?e.create(t.locale||this.specifiedLocale,t.numberingSystem||this.numberingSystem,t.outputCalendar||this.outputCalendar,t.defaultToEN||!1):this},t.redefaultToEN=function(e){return void 0===e&&(e={}),this.clone(Object.assign({},e,{defaultToEN:!0}))},t.redefaultToSystem=function(e){return void 0===e&&(e={}),this.clone(Object.assign({},e,{defaultToEN:!1}))},t.months=function(e,t,n){var r=this;return void 0===t&&(t=!1),void 0===n&&(n=!0),ot(this,e,n,ke,function(){var n=t?{month:e,day:"numeric"}:{month:e},i=t?"format":"standalone";return r.monthsCache[i][e]||(r.monthsCache[i][e]=function(e){for(var t=[],n=1;n<=12;n++){var r=sr.utc(2016,n,1);t.push(e(r))}return t}(function(e){return r.extract(e,n,"month")})),r.monthsCache[i][e]})},t.weekdays=function(e,t,n){var r=this;return 
void 0===t&&(t=!1),void 0===n&&(n=!0),ot(this,e,n,Te,function(){var n=t?{weekday:e,year:"numeric",month:"long",day:"numeric"}:{weekday:e},i=t?"format":"standalone";return r.weekdaysCache[i][e]||(r.weekdaysCache[i][e]=function(e){for(var t=[],n=1;n<=7;n++){var r=sr.utc(2016,11,13+n);t.push(e(r))}return t}(function(e){return r.extract(e,n,"weekday")})),r.weekdaysCache[i][e]})},t.meridiems=function(e){var t=this;return void 0===e&&(e=!0),ot(this,void 0,e,function(){return Me},function(){if(!t.meridiemCache){var e={hour:"numeric",hour12:!0};t.meridiemCache=[sr.utc(2016,11,13,9),sr.utc(2016,11,13,19)].map(function(n){return t.extract(n,e,"dayperiod")})}return t.meridiemCache})},t.eras=function(e,t){var n=this;return void 0===t&&(t=!0),ot(this,e,t,Ie,function(){var t={era:e};return n.eraCache[e]||(n.eraCache[e]=[sr.utc(-40,1,1),sr.utc(2017,1,1)].map(function(e){return n.extract(e,t,"era")})),n.eraCache[e]})},t.extract=function(e,t,n){var r=this.dtFormatter(e,t).formatToParts().find(function(e){return e.type.toLowerCase()===n});return r?r.value:null},t.numberFormatter=function(e){return void 0===e&&(e={}),new at(this.intl,e.forceSimple||this.fastNumbers,e)},t.dtFormatter=function(e,t){return void 0===t&&(t={}),new ut(e,this.intl,t)},t.relFormatter=function(e){return void 0===e&&(e={}),new st(this.intl,this.isEnglish(),e)},t.isEnglish=function(){return"en"===this.locale||"en-us"===this.locale.toLowerCase()||J()&&new Intl.DateTimeFormat(this.intl).resolvedOptions().locale.startsWith("en-us")},t.equals=function(e){return this.locale===e.locale&&this.numberingSystem===e.numberingSystem&&this.outputCalendar===e.outputCalendar},n(e,[{key:"fastNumbers",get:function(){var e;return null==this.fastNumbersCached&&(this.fastNumbersCached=(!(e=this).numberingSystem||"latn"===e.numberingSystem)&&("latn"===e.numberingSystem||!e.locale||e.locale.startsWith("en")||J()&&"latn"===new Intl.DateTimeFormat(e.intl).resolvedOptions().numberingSystem)),this.fastNumbersCached}}]),e}();function 
lt(){for(var e=arguments.length,t=new Array(e),n=0;n1?t-1:0),r=1;r3?be.indexOf(e)+1:Oe.indexOf(e)+1),u}var xt=/^(?:(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?(\d{1,2})\s(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s(\d{2,4})\s(\d\d):(\d\d)(?::(\d\d))?\s(?:(UT|GMT|[ECMP][SD]T)|([Zz])|(?:([+-]\d\d)(\d\d)))$/;function Ct(e){var t,n=e[1],r=e[2],i=e[3],o=e[4],a=e[5],u=e[6],s=e[7],c=e[8],l=e[9],f=e[10],d=e[11],h=Lt(n,o,i,r,a,u,s);return t=c?Vt[c]:l?0:le(f,d),[h,new Re(t)]}var Ft=/^(Mon|Tue|Wed|Thu|Fri|Sat|Sun), (\d\d) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{4}) (\d\d):(\d\d):(\d\d) GMT$/,Zt=/^(Monday|Tuesday|Wedsday|Thursday|Friday|Saturday|Sunday), (\d\d)-(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-(\d\d) (\d\d):(\d\d):(\d\d) GMT$/,jt=/^(Mon|Tue|Wed|Thu|Fri|Sat|Sun) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) ( \d|\d\d) (\d\d):(\d\d):(\d\d) (\d{4})$/;function At(e){var t=e[1],n=e[2],r=e[3];return[Lt(t,e[4],r,n,e[5],e[6],e[7]),Re.utcInstance]}function zt(e){var t=e[1],n=e[2],r=e[3],i=e[4],o=e[5],a=e[6];return[Lt(t,e[7],n,r,i,o,a),Re.utcInstance]}var _t=lt(/([+-]\d{6}|\d{4})(?:-?(\d\d)(?:-?(\d\d))?)?/,gt),qt=lt(/(\d{4})-?W(\d\d)(?:-?(\d))?/,gt),Ht=lt(/(\d{4})-?(\d{3})/,gt),Ut=lt(vt),Rt=ft(St,Tt,Mt),Pt=ft(pt,Tt,Mt),Wt=ft(wt,Tt,Mt),Jt=ft(Tt,Mt);var Yt=ft(Tt);var Gt=lt(/(\d{4})-(\d\d)-(\d\d)/,bt),$t=lt(kt),Bt=ft(St,Tt,Mt,Nt),Qt=ft(Tt,Mt,Nt);var 
Kt={weeks:{days:7,hours:168,minutes:10080,seconds:604800,milliseconds:6048e5},days:{hours:24,minutes:1440,seconds:86400,milliseconds:864e5},hours:{minutes:60,seconds:3600,milliseconds:36e5},minutes:{seconds:60,milliseconds:6e4},seconds:{milliseconds:1e3}},Xt=Object.assign({years:{quarters:4,months:12,weeks:52,days:365,hours:8760,minutes:525600,seconds:31536e3,milliseconds:31536e6},quarters:{months:3,weeks:13,days:91,hours:2184,minutes:131040,seconds:7862400,milliseconds:78624e5},months:{weeks:4,days:30,hours:720,minutes:43200,seconds:2592e3,milliseconds:2592e6}},Kt),en=Object.assign({years:{quarters:4,months:12,weeks:52.1775,days:365.2425,hours:8765.82,minutes:525949.2,seconds:525949.2*60,milliseconds:525949.2*60*1e3},quarters:{months:3,weeks:13.044375,days:91.310625,hours:2191.455,minutes:131487.3,seconds:525949.2*60/4,milliseconds:7889237999.999999},months:{weeks:30.436875/7,days:30.436875,hours:730.485,minutes:43829.1,seconds:2629746,milliseconds:2629746e3}},Kt),tn=["years","quarters","months","weeks","days","hours","minutes","seconds","milliseconds"],nn=tn.slice(0).reverse();function rn(e,t,n){void 0===n&&(n=!1);var r={values:n?t.values:Object.assign({},e.values,t.values||{}),loc:e.loc.clone(t.loc),conversionAccuracy:t.conversionAccuracy||e.conversionAccuracy};return new an(r)}function on(e,t,n,r,i){var o,a=e[i][n],u=t[n]/a,s=!(Math.sign(u)===Math.sign(r[i]))&&0!==r[i]&&Math.abs(u)<=1?(o=u)<0?Math.floor(o):Math.ceil(o):Math.trunc(u);r[i]+=s,t[n]-=s*a}var an=function(){function e(e){var t="longterm"===e.conversionAccuracy||!1;this.values=e.values,this.loc=e.loc||ct.create(),this.conversionAccuracy=t?"longterm":"casual",this.invalid=e.invalid||null,this.matrix=t?en:Xt,this.isLuxonDuration=!0}e.fromMillis=function(t,n){return e.fromObject(Object.assign({milliseconds:t},n))},e.fromObject=function(t){if(null==t||"object"!=typeof t)throw new v("Duration.fromObject: argument expected to be an object, got "+(null===t?"null":typeof t));return new 
e({values:de(t,e.normalizeUnit,["locale","numberingSystem","conversionAccuracy","zone"]),loc:ct.fromObject(t),conversionAccuracy:t.conversionAccuracy})},e.fromISO=function(t,n){var r=dt(t,[Dt,It])[0];if(r){var i=Object.assign(r,n);return e.fromObject(i)}return e.invalid("unparsable",'the input "'+t+"\" can't be parsed as ISO 8601")},e.fromISOTime=function(t,n){var r=dt(t,[Et,Yt])[0];if(r){var i=Object.assign(r,n);return e.fromObject(i)}return e.invalid("unparsable",'the input "'+t+"\" can't be parsed as ISO 8601")},e.invalid=function(t,n){if(void 0===n&&(n=null),!t)throw new v("need to specify a reason the Duration is invalid");var r=t instanceof Ce?t:new Ce(t,n);if(Ke.throwOnInvalid)throw new h(r);return new e({invalid:r})},e.normalizeUnit=function(e){var t={year:"years",years:"years",quarter:"quarters",quarters:"quarters",month:"months",months:"months",week:"weeks",weeks:"weeks",day:"days",days:"days",hour:"hours",hours:"hours",minute:"minutes",minutes:"minutes",second:"seconds",seconds:"seconds",millisecond:"milliseconds",milliseconds:"milliseconds"}[e?e.toLowerCase():e];if(!t)throw new y(e);return t},e.isDuration=function(e){return e&&e.isLuxonDuration||!1};var t=e.prototype;return t.toFormat=function(e,t){void 0===t&&(t={});var n=Object.assign({},t,{floor:!1!==t.round&&!1!==t.floor});return this.isValid?xe.create(this.loc,n).formatDurationFromString(this,e):"Invalid Duration"},t.toObject=function(e){if(void 0===e&&(e={}),!this.isValid)return{};var t=Object.assign({},this.values);return e.includeConfig&&(t.conversionAccuracy=this.conversionAccuracy,t.numberingSystem=this.loc.numberingSystem,t.locale=this.loc.locale),t},t.toISO=function(){if(!this.isValid)return null;var e="P";return 
0!==this.years&&(e+=this.years+"Y"),0===this.months&&0===this.quarters||(e+=this.months+3*this.quarters+"M"),0!==this.weeks&&(e+=this.weeks+"W"),0!==this.days&&(e+=this.days+"D"),0===this.hours&&0===this.minutes&&0===this.seconds&&0===this.milliseconds||(e+="T"),0!==this.hours&&(e+=this.hours+"H"),0!==this.minutes&&(e+=this.minutes+"M"),0===this.seconds&&0===this.milliseconds||(e+=ne(this.seconds+this.milliseconds/1e3,3)+"S"),"P"===e&&(e+="T0S"),e},t.toISOTime=function(e){if(void 0===e&&(e={}),!this.isValid)return null;var t=this.toMillis();if(t<0||t>=864e5)return null;e=Object.assign({suppressMilliseconds:!1,suppressSeconds:!1,includePrefix:!1,format:"extended"},e);var n=this.shiftTo("hours","minutes","seconds","milliseconds"),r="basic"===e.format?"hhmm":"hh:mm";e.suppressSeconds&&0===n.seconds&&0===n.milliseconds||(r+="basic"===e.format?"ss":":ss",e.suppressMilliseconds&&0===n.milliseconds||(r+=".SSS"));var i=n.toFormat(r);return e.includePrefix&&(i="T"+i),i},t.toJSON=function(){return this.toISO()},t.toString=function(){return this.toISO()},t.toMillis=function(){return this.as("milliseconds")},t.valueOf=function(){return this.toMillis()},t.plus=function(e){if(!this.isValid)return this;for(var t,n=un(e),r={},i=c(tn);!(t=i()).done;){var o=t.value;(Q(n.values,o)||Q(this.values,o))&&(r[o]=n.get(o)+this.get(o))}return rn(this,{values:r},!0)},t.minus=function(e){if(!this.isValid)return this;var t=un(e);return this.plus(t.negate())},t.mapUnits=function(e){if(!this.isValid)return this;for(var t={},n=0,r=Object.keys(this.values);n=0){i=f;var d=0;for(var h in u)d+=this.matrix[h][f]*u[h],u[h]=0;P(s[f])&&(d+=s[f]);var m=Math.trunc(d);for(var y in a[f]=m,u[f]=d-m,s)tn.indexOf(y)>tn.indexOf(f)&&on(this.matrix,s,y,a,f)}else P(s[f])&&(u[f]=s[f])}for(var v in u)0!==u[v]&&(a[i]+=v===i?u[v]:u[v]/this.matrix[i][v]);return rn(this,{values:a},!0).normalize()},t.negate=function(){if(!this.isValid)return this;for(var 
e={},t=0,n=Object.keys(this.values);te},t.isBefore=function(e){return!!this.isValid&&this.e<=e},t.contains=function(e){return!!this.isValid&&(this.s<=e&&this.e>e)},t.set=function(t){var n=void 0===t?{}:t,r=n.start,i=n.end;return this.isValid?e.fromDateTimes(r||this.s,i||this.e):this},t.splitAt=function(){var t=this;if(!this.isValid)return[];for(var n=arguments.length,r=new Array(n),i=0;i+this.e?this.e:c;a.push(e.fromDateTimes(u,l)),u=l,s+=1}return a},t.splitBy=function(t){var n=un(t);if(!this.isValid||!n.isValid||0===n.as("milliseconds"))return[];for(var r,i=this.s,o=1,a=[];i+this.e?this.e:u,a.push(e.fromDateTimes(i,r)),i=r,o+=1}return a},t.divideEqually=function(e){return this.isValid?this.splitBy(this.length()/e).slice(0,e):[]},t.overlaps=function(e){return this.e>e.s&&this.s=e.e)},t.equals=function(e){return!(!this.isValid||!e.isValid)&&(this.s.equals(e.s)&&this.e.equals(e.e))},t.intersection=function(t){if(!this.isValid)return this;var n=this.s>t.s?this.s:t.s,r=this.e=r?null:e.fromDateTimes(n,r)},t.union=function(t){if(!this.isValid)return this;var n=this.st.e?this.e:t.e;return e.fromDateTimes(n,r)},e.merge=function(e){var t=e.sort(function(e,t){return e.s-t.s}).reduce(function(e,t){var n=e[0],r=e[1];return r?r.overlaps(t)||r.abutsStart(t)?[n,r.union(t)]:[n.concat([r]),t]:[n,t]},[[],null]),n=t[0],r=t[1];return r&&n.push(r),n},e.xor=function(t){for(var n,r,i=null,o=0,a=[],u=t.map(function(e){return[{time:e.s,type:"s"},{time:e.e,type:"e"}]}),s=c((n=Array.prototype).concat.apply(n,u).sort(function(e,t){return e.time-t.time}));!(r=s()).done;){var l=r.value;1===(o+="s"===l.type?1:-1)?i=l.time:(i&&+i!=+l.time&&a.push(e.fromDateTimes(i,l.time)),i=null)}return e.merge(a)},t.difference=function(){for(var t=this,n=arguments.length,r=new Array(n),i=0;i=0){var f;r=c;var 
d,h=l(e,t);(i=e.plus(((f={})[c]=h,f)))>t?(e=e.plus(((d={})[c]=h-1,d)),h-=1):e=i,o[c]=h}}return[e,o,i,r]}(e,t,n),o=i[0],a=i[1],u=i[2],s=i[3],c=t-o,l=n.filter(function(e){return["hours","minutes","seconds","milliseconds"].indexOf(e)>=0});if(0===l.length){var f;if(u0?(d=an.fromMillis(c,r)).shiftTo.apply(d,l).plus(h):h}var mn={arab:"[٠-٩]",arabext:"[۰-۹]",bali:"[᭐-᭙]",beng:"[০-৯]",deva:"[०-९]",fullwide:"[0-9]",gujr:"[૦-૯]",hanidec:"[〇|一|二|三|四|五|六|七|八|九]",khmr:"[០-៩]",knda:"[೦-೯]",laoo:"[໐-໙]",limb:"[᥆-᥏]",mlym:"[൦-൯]",mong:"[᠐-᠙]",mymr:"[၀-၉]",orya:"[୦-୯]",tamldec:"[௦-௯]",telu:"[౦-౯]",thai:"[๐-๙]",tibt:"[༠-༩]",latn:"\\d"},yn={arab:[1632,1641],arabext:[1776,1785],bali:[6992,7001],beng:[2534,2543],deva:[2406,2415],fullwide:[65296,65303],gujr:[2790,2799],khmr:[6112,6121],knda:[3302,3311],laoo:[3792,3801],limb:[6470,6479],mlym:[3430,3439],mong:[6160,6169],mymr:[4160,4169],orya:[2918,2927],tamldec:[3046,3055],telu:[3174,3183],thai:[3664,3673],tibt:[3872,3881]},vn=mn.hanidec.replace(/[\[|\]]/g,"").split("");function gn(e,t){var n=e.numberingSystem;return void 0===t&&(t=""),new RegExp(""+mn[n||"latn"]+t)}var pn="missing Intl.DateTimeFormat.formatToParts support";function wn(e,t){return void 0===t&&(t=function(e){return e}),{regex:e,deser:function(e){var n=e[0];return t(function(e){var t=parseInt(e,10);if(isNaN(t)){t="";for(var n=0;n=a&&r<=u&&(t+=r-a)}}return parseInt(t,10)}return t}(n))}}}var kn="( |"+String.fromCharCode(160)+")",bn=new RegExp(kn,"g");function On(e){return e.replace(/\./g,"\\.?").replace(bn,kn)}function Sn(e){return e.replace(/\./g,"").replace(bn," ").toLowerCase()}function Tn(e,t){return null===e?null:{regex:RegExp(e.map(On).join("|")),deser:function(n){var r=n[0];return e.findIndex(function(e){return Sn(r)===Sn(e)})+t}}}function Mn(e,t){return{regex:e,deser:function(e){return le(e[1],e[2])},groups:t}}function Nn(e){return{regex:e,deser:function(e){return e[0]}}}var 
En={year:{"2-digit":"yy",numeric:"yyyyy"},month:{numeric:"M","2-digit":"MM",short:"MMM",long:"MMMM"},day:{numeric:"d","2-digit":"dd"},weekday:{short:"EEE",long:"EEEE"},dayperiod:"a",dayPeriod:"a",hour:{numeric:"h","2-digit":"hh"},minute:{numeric:"m","2-digit":"mm"},second:{numeric:"s","2-digit":"ss"}};var Dn=null;function In(e,t){if(e.literal)return e;var n=xe.macroTokenToFormatOpts(e.val);if(!n)return e;var r=xe.create(t,n).formatDateTimeParts((Dn||(Dn=sr.fromMillis(1555555555555)),Dn)).map(function(e){return function(e,t,n){var r=e.type,i=e.value;if("literal"===r)return{literal:!0,val:i};var o=n[r],a=En[r];return"object"==typeof a&&(a=a[o]),a?{literal:!1,val:a}:void 0}(e,0,n)});return r.includes(void 0)?e:r}function Vn(e,t,n){var r=function(e,t){var n;return(n=Array.prototype).concat.apply(n,e.map(function(e){return In(e,t)}))}(xe.parseFormat(n),e),i=r.map(function(t){return n=t,i=gn(r=e),o=gn(r,"{2}"),a=gn(r,"{3}"),u=gn(r,"{4}"),s=gn(r,"{6}"),c=gn(r,"{1,2}"),l=gn(r,"{1,3}"),f=gn(r,"{1,6}"),d=gn(r,"{1,9}"),h=gn(r,"{2,4}"),m=gn(r,"{4,6}"),y=function(e){return{regex:RegExp((t=e.val,t.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g,"\\$&"))),deser:function(e){return e[0]},literal:!0};var t},(v=function(e){if(n.literal)return y(e);switch(e.val){case"G":return Tn(r.eras("short",!1),0);case"GG":return Tn(r.eras("long",!1),0);case"y":return wn(f);case"yy":return wn(h,se);case"yyyy":return wn(u);case"yyyyy":return wn(m);case"yyyyyy":return wn(s);case"M":return wn(c);case"MM":return wn(o);case"MMM":return Tn(r.months("short",!0,!1),1);case"MMMM":return Tn(r.months("long",!0,!1),1);case"L":return wn(c);case"LL":return wn(o);case"LLL":return Tn(r.months("short",!1,!1),1);case"LLLL":return Tn(r.months("long",!1,!1),1);case"d":return wn(c);case"dd":return wn(o);case"o":return wn(l);case"ooo":return wn(a);case"HH":return wn(o);case"H":return wn(c);case"hh":return wn(o);case"h":return wn(c);case"mm":return wn(o);case"m":case"q":return wn(c);case"qq":return wn(o);case"s":return 
wn(c);case"ss":return wn(o);case"S":return wn(l);case"SSS":return wn(a);case"u":return Nn(d);case"a":return Tn(r.meridiems(),0);case"kkkk":return wn(u);case"kk":return wn(h,se);case"W":return wn(c);case"WW":return wn(o);case"E":case"c":return wn(i);case"EEE":return Tn(r.weekdays("short",!1,!1),1);case"EEEE":return Tn(r.weekdays("long",!1,!1),1);case"ccc":return Tn(r.weekdays("short",!0,!1),1);case"cccc":return Tn(r.weekdays("long",!0,!1),1);case"Z":case"ZZ":return Mn(new RegExp("([+-]"+c.source+")(?::("+o.source+"))?"),2);case"ZZZ":return Mn(new RegExp("([+-]"+c.source+")("+o.source+")?"),2);case"z":return Nn(/[a-z_+-/]{1,256}?/i);default:return y(e)}}(n)||{invalidReason:pn}).token=n,v;var n,r,i,o,a,u,s,c,l,f,d,h,m,y,v}),o=i.find(function(e){return e.invalidReason});if(o)return{input:t,tokens:r,invalidReason:o.invalidReason};var a=function(e){return["^"+e.map(function(e){return e.regex}).reduce(function(e,t){return e+"("+t.source+")"},"")+"$",e]}(i),u=a[0],s=a[1],c=RegExp(u,"i"),l=function(e,t,n){var r=e.match(t);if(r){var i={},o=1;for(var a in n)if(Q(n,a)){var u=n[a],s=u.groups?u.groups+1:1;!u.literal&&u.token&&(i[u.token.val[0]]=u.deser(r.slice(o,o+s))),o+=s}return[r,i]}return[r,{}]}(t,c,s),f=l[0],d=l[1],h=d?function(e){var t;return t=R(e.Z)?R(e.z)?null:He.create(e.z):new Re(e.Z),R(e.q)||(e.M=3*(e.q-1)+1),R(e.h)||(e.h<12&&1===e.a?e.h+=12:12===e.h&&0===e.a&&(e.h=0)),0===e.G&&e.y&&(e.y=-e.y),R(e.u)||(e.S=te(e.u)),[Object.keys(e).reduce(function(t,n){var r=function(e){switch(e){case"S":return"millisecond";case"s":return"second";case"m":return"minute";case"h":case"H":return"hour";case"d":return"day";case"o":return"ordinal";case"L":case"M":return"month";case"y":return"year";case"E":case"c":return"weekday";case"W":return"weekNumber";case"k":return"weekYear";case"q":return"quarter";default:return null}}(n);return r&&(t[r]=e[n]),t},{}),t]}(d):[null,null],y=h[0],v=h[1];if(Q(d,"a")&&Q(d,"H"))throw new m("Can't include meridiem when specifying 24-hour 
format");return{input:t,tokens:r,regex:c,rawMatches:f,matches:d,result:y,zone:v}}var Ln=[0,31,59,90,120,151,181,212,243,273,304,334],xn=[0,31,60,91,121,152,182,213,244,274,305,335];function Cn(e,t){return new Ce("unit out of range","you specified "+t+" (of type "+typeof t+") as a "+e+", which is invalid")}function Fn(e,t,n){var r=new Date(Date.UTC(e,t-1,n)).getUTCDay();return 0===r?7:r}function Zn(e,t,n){return n+(re(e)?xn:Ln)[t-1]}function jn(e,t){var n=re(e)?xn:Ln,r=n.findIndex(function(e){return eue(n)?(t=n+1,u=1):t=n,Object.assign({weekYear:t,weekNumber:u,weekday:a},me(e))}function zn(e){var t,n=e.weekYear,r=e.weekNumber,i=e.weekday,o=Fn(n,1,4),a=ie(n),u=7*r+i-o-3;u<1?u+=ie(t=n-1):u>a?(t=n+1,u-=ie(n)):t=n;var s=jn(t,u),c=s.month,l=s.day;return Object.assign({year:t,month:c,day:l},me(e))}function _n(e){var t=e.year,n=Zn(t,e.month,e.day);return Object.assign({year:t,ordinal:n},me(e))}function qn(e){var t=e.year,n=jn(t,e.ordinal),r=n.month,i=n.day;return Object.assign({year:t,month:r,day:i},me(e))}function Hn(e){var t=W(e.year),n=K(e.month,1,12),r=K(e.day,1,oe(e.year,e.month));return t?n?!r&&Cn("day",e.day):Cn("month",e.month):Cn("year",e.year)}function Un(e){var t=e.hour,n=e.minute,r=e.second,i=e.millisecond,o=K(t,0,23)||24===t&&0===n&&0===r&&0===i,a=K(n,0,59),u=K(r,0,59),s=K(i,0,999);return o?a?u?!s&&Cn("millisecond",i):Cn("second",r):Cn("minute",n):Cn("hour",t)}function Rn(e){return new Ce("unsupported zone",'the zone "'+e.name+'" is not supported')}function Pn(e){return null===e.weekData&&(e.weekData=An(e.c)),e.weekData}function Wn(e,t){var n={ts:e.ts,zone:e.zone,c:e.c,o:e.o,loc:e.loc,invalid:e.invalid};return new sr(Object.assign({},n,t,{old:n}))}function Jn(e,t,n){var r=e-60*t*1e3,i=n.offset(r);if(t===i)return[r,t];r-=60*(i-t)*1e3;var o=n.offset(r);return i===o?[r,i]:[e-60*Math.min(i,o)*1e3,Math.max(i,o)]}function Yn(e,t){var n=new 
Date(e+=60*t*1e3);return{year:n.getUTCFullYear(),month:n.getUTCMonth()+1,day:n.getUTCDate(),hour:n.getUTCHours(),minute:n.getUTCMinutes(),second:n.getUTCSeconds(),millisecond:n.getUTCMilliseconds()}}function Gn(e,t,n){return Jn(ae(e),t,n)}function $n(e,t){var n=e.o,r=e.c.year+Math.trunc(t.years),i=e.c.month+Math.trunc(t.months)+3*Math.trunc(t.quarters),o=Object.assign({},e.c,{year:r,month:i,day:Math.min(e.c.day,oe(r,i))+Math.trunc(t.days)+7*Math.trunc(t.weeks)}),a=an.fromObject({years:t.years-Math.trunc(t.years),quarters:t.quarters-Math.trunc(t.quarters),months:t.months-Math.trunc(t.months),weeks:t.weeks-Math.trunc(t.weeks),days:t.days-Math.trunc(t.days),hours:t.hours,minutes:t.minutes,seconds:t.seconds,milliseconds:t.milliseconds}).as("milliseconds"),u=Jn(ae(o),n,e.zone),s=u[0],c=u[1];return 0!==a&&(s+=a,c=e.zone.offset(s)),{ts:s,o:c}}function Bn(e,t,n,r,i){var o=n.setZone,a=n.zone;if(e&&0!==Object.keys(e).length){var u=t||a,s=sr.fromObject(Object.assign(e,n,{zone:u,setZone:void 0}));return o?s:s.setZone(a)}return sr.invalid(new Ce("unparsable",'the input "'+i+"\" can't be parsed as "+r))}function Qn(e,t,n){return void 0===n&&(n=!0),e.isValid?xe.create(ct.create("en-US"),{allowZ:n,forceSimple:!0}).formatDateTimeFromString(e,t):null}function Kn(e,t){var n=t.suppressSeconds,r=void 0!==n&&n,i=t.suppressMilliseconds,o=void 0!==i&&i,a=t.includeOffset,u=t.includePrefix,s=void 0!==u&&u,c=t.includeZone,l=void 0!==c&&c,f=t.spaceZone,d=void 0!==f&&f,h=t.format,m=void 0===h?"extended":h,y="basic"===m?"HHmm":"HH:mm";r&&0===e.second&&0===e.millisecond||(y+="basic"===m?"ss":":ss",o&&0===e.millisecond||(y+=".SSS")),(l||a)&&d&&(y+=" "),l?y+="z":a&&(y+="basic"===m?"ZZZ":"ZZ");var v=Qn(e,y);return s&&(v="T"+v),v}var 
Xn={month:1,day:1,hour:0,minute:0,second:0,millisecond:0},er={weekNumber:1,weekday:1,hour:0,minute:0,second:0,millisecond:0},tr={ordinal:1,hour:0,minute:0,second:0,millisecond:0},nr=["year","month","day","hour","minute","second","millisecond"],rr=["weekYear","weekNumber","weekday","hour","minute","second","millisecond"],ir=["year","ordinal","hour","minute","second","millisecond"];function or(e){var t={year:"year",years:"year",month:"month",months:"month",day:"day",days:"day",hour:"hour",hours:"hour",minute:"minute",minutes:"minute",quarter:"quarter",quarters:"quarter",second:"second",seconds:"second",millisecond:"millisecond",milliseconds:"millisecond",weekday:"weekday",weekdays:"weekday",weeknumber:"weekNumber",weeksnumber:"weekNumber",weeknumbers:"weekNumber",weekyear:"weekYear",weekyears:"weekYear",ordinal:"ordinal"}[e.toLowerCase()];if(!t)throw new y(e);return t}function ar(e,t){for(var n,r=c(nr);!(n=r()).done;){var i=n.value;R(e[i])&&(e[i]=Xn[i])}var o=Hn(e)||Un(e);if(o)return sr.invalid(o);var a=Ke.now(),u=Gn(e,t.offset(a),t),s=u[0],l=u[1];return new sr({ts:s,zone:t,o:l})}function ur(e,t,n){var r=!!R(n.round)||n.round,i=function(e,i){return e=ne(e,r||n.calendary?0:2,!0),t.loc.clone(n).relFormatter(n).format(e,i)},o=function(r){return n.calendary?t.hasSame(e,r)?0:t.startOf(r).diff(e.startOf(r),r).get(r):t.diff(e,r).get(r)};if(n.unit)return i(o(n.unit),n.unit);for(var a,u=c(n.units);!(a=u()).done;){var s=a.value,l=o(s);if(Math.abs(l)>=1)return i(l,s)}return i(e>t?-0:0,n.units[n.units.length-1])}var sr=function(){function e(e){var t=e.zone||Ke.defaultZone,n=e.invalid||(Number.isNaN(e.ts)?new Ce("invalid input"):null)||(t.isValid?null:Rn(t));this.ts=R(e.ts)?Ke.now():e.ts;var r=null,i=null;if(!n)if(e.old&&e.old.ts===this.ts&&e.old.zone.equals(t)){var o=[e.old.c,e.old.o];r=o[0],i=o[1]}else{var a=t.offset(this.ts);r=Yn(this.ts,a),r=(n=Number.isNaN(r.year)?new Ce("invalid 
input"):null)?null:r,i=n?null:a}this._zone=t,this.loc=e.loc||ct.create(),this.invalid=n,this.weekData=null,this.c=r,this.o=i,this.isLuxonDateTime=!0}e.now=function(){return new e({})},e.local=function(t,n,r,i,o,a,u){return R(t)?e.now():ar({year:t,month:n,day:r,hour:i,minute:o,second:a,millisecond:u},Ke.defaultZone)},e.utc=function(t,n,r,i,o,a,u){return R(t)?new e({ts:Ke.now(),zone:Re.utcInstance}):ar({year:t,month:n,day:r,hour:i,minute:o,second:a,millisecond:u},Re.utcInstance)},e.fromJSDate=function(t,n){void 0===n&&(n={});var r,i=(r=t,"[object Date]"===Object.prototype.toString.call(r)?t.valueOf():NaN);if(Number.isNaN(i))return e.invalid("invalid input");var o=We(n.zone,Ke.defaultZone);return o.isValid?new e({ts:i,zone:o,loc:ct.fromObject(n)}):e.invalid(Rn(o))},e.fromMillis=function(t,n){if(void 0===n&&(n={}),P(t))return t<-864e13||t>864e13?e.invalid("Timestamp out of range"):new e({ts:t,zone:We(n.zone,Ke.defaultZone),loc:ct.fromObject(n)});throw new v("fromMillis requires a numerical input, but received a "+typeof t+" with value "+t)},e.fromSeconds=function(t,n){if(void 0===n&&(n={}),P(t))return new e({ts:1e3*t,zone:We(n.zone,Ke.defaultZone),loc:ct.fromObject(n)});throw new v("fromSeconds requires a numerical input")},e.fromObject=function(t){var n=We(t.zone,Ke.defaultZone);if(!n.isValid)return e.invalid(Rn(n));var r=Ke.now(),i=n.offset(r),o=de(t,or,["zone","locale","outputCalendar","numberingSystem"]),a=!R(o.ordinal),u=!R(o.year),s=!R(o.month)||!R(o.day),l=u||s,f=o.weekYear||o.weekNumber,d=ct.fromObject(t);if((l||a)&&f)throw new m("Can't mix weekYear/weekNumber units with year/month/day or ordinals");if(s&&a)throw new m("Can't mix ordinal dates with month/day");var h,y,v=f||o.weekday&&!l,g=Yn(r,i);v?(h=rr,y=er,g=An(g)):a?(h=ir,y=tr,g=_n(g)):(h=nr,y=Xn);for(var p,w=!1,k=c(h);!(p=k()).done;){var b=p.value;R(o[b])?o[b]=w?y[b]:g[b]:w=!0}var O=(v?function(e){var t=W(e.weekYear),n=K(e.weekNumber,1,ue(e.weekYear)),r=K(e.weekday,1,7);return 
t?n?!r&&Cn("weekday",e.weekday):Cn("week",e.week):Cn("weekYear",e.weekYear)}(o):a?function(e){var t=W(e.year),n=K(e.ordinal,1,ie(e.year));return t?!n&&Cn("ordinal",e.ordinal):Cn("year",e.year)}(o):Hn(o))||Un(o);if(O)return e.invalid(O);var S=Gn(v?zn(o):a?qn(o):o,i,n),T=new e({ts:S[0],zone:n,o:S[1],loc:d});return o.weekday&&l&&t.weekday!==T.weekday?e.invalid("mismatched weekday","you can't specify both a weekday of "+o.weekday+" and a date of "+T.toISO()):T},e.fromISO=function(e,t){void 0===t&&(t={});var n=dt(e,[_t,Rt],[qt,Pt],[Ht,Wt],[Ut,Jt]);return Bn(n[0],n[1],t,"ISO 8601",e)},e.fromRFC2822=function(e,t){void 0===t&&(t={});var n=dt(function(e){return e.replace(/\([^)]*\)|[\n\t]/g," ").replace(/(\s\s+)/g," ").trim()}(e),[xt,Ct]);return Bn(n[0],n[1],t,"RFC 2822",e)},e.fromHTTP=function(e,t){void 0===t&&(t={});var n=dt(e,[Ft,At],[Zt,At],[jt,zt]);return Bn(n[0],n[1],t,"HTTP",t)},e.fromFormat=function(t,n,r){if(void 0===r&&(r={}),R(t)||R(n))throw new v("fromFormat requires an input string and a format");var i=r,o=i.locale,a=void 0===o?null:o,u=i.numberingSystem,s=void 0===u?null:u,c=function(e,t,n){var r=Vn(e,t,n);return[r.result,r.zone,r.invalidReason]}(ct.fromOpts({locale:a,numberingSystem:s,defaultToEN:!0}),t,n),l=c[0],f=c[1],d=c[2];return d?e.invalid(d):Bn(l,f,r,"format "+n,t)},e.fromString=function(t,n,r){return void 0===r&&(r={}),e.fromFormat(t,n,r)},e.fromSQL=function(e,t){void 0===t&&(t={});var n=dt(e,[Gt,Bt],[$t,Qt]);return Bn(n[0],n[1],t,"SQL",e)},e.invalid=function(t,n){if(void 0===n&&(n=null),!t)throw new v("need to specify a reason the DateTime is invalid");var r=t instanceof Ce?t:new Ce(t,n);if(Ke.throwOnInvalid)throw new f(r);return new e({invalid:r})},e.isDateTime=function(e){return e&&e.isLuxonDateTime||!1};var t=e.prototype;return t.get=function(e){return this[e]},t.resolvedLocaleOpts=function(e){void 0===e&&(e={});var 
t=xe.create(this.loc.clone(e),e).resolvedOptions(this);return{locale:t.locale,numberingSystem:t.numberingSystem,outputCalendar:t.calendar}},t.toUTC=function(e,t){return void 0===e&&(e=0),void 0===t&&(t={}),this.setZone(Re.instance(e),t)},t.toLocal=function(){return this.setZone(Ke.defaultZone)},t.setZone=function(t,n){var r=void 0===n?{}:n,i=r.keepLocalTime,o=void 0!==i&&i,a=r.keepCalendarTime,u=void 0!==a&&a;if((t=We(t,Ke.defaultZone)).equals(this.zone))return this;if(t.isValid){var s=this.ts;if(o||u){var c=t.offset(this.ts);s=Gn(this.toObject(),c,t)[0]}return Wn(this,{ts:s,zone:t})}return e.invalid(Rn(t))},t.reconfigure=function(e){var t=void 0===e?{}:e,n=t.locale,r=t.numberingSystem,i=t.outputCalendar;return Wn(this,{loc:this.loc.clone({locale:n,numberingSystem:r,outputCalendar:i})})},t.setLocale=function(e){return this.reconfigure({locale:e})},t.set=function(e){if(!this.isValid)return this;var t,n=de(e,or,[]),r=!R(n.weekYear)||!R(n.weekNumber)||!R(n.weekday),i=!R(n.ordinal),o=!R(n.year),a=!R(n.month)||!R(n.day),u=o||a,s=n.weekYear||n.weekNumber;if((u||i)&&s)throw new m("Can't mix weekYear/weekNumber units with year/month/day or ordinals");if(a&&i)throw new m("Can't mix ordinal dates with month/day");r?t=zn(Object.assign(An(this.c),n)):R(n.ordinal)?(t=Object.assign(this.toObject(),n),R(n.day)&&(t.day=Math.min(oe(t.year,t.month),t.day))):t=qn(Object.assign(_n(this.c),n));var c=Gn(t,this.o,this.zone);return Wn(this,{ts:c[0],o:c[1]})},t.plus=function(e){return this.isValid?Wn(this,$n(this,un(e))):this},t.minus=function(e){return this.isValid?Wn(this,$n(this,un(e).negate())):this},t.startOf=function(e){if(!this.isValid)return this;var t={},n=an.normalizeUnit(e);switch(n){case"years":t.month=1;case"quarters":case"months":t.day=1;case"weeks":case"days":t.hour=0;case"hours":t.minute=0;case"minutes":t.second=0;case"seconds":t.millisecond=0}if("weeks"===n&&(t.weekday=1),"quarters"===n){var r=Math.ceil(this.month/3);t.month=3*(r-1)+1}return 
this.set(t)},t.endOf=function(e){var t;return this.isValid?this.plus((t={},t[e]=1,t)).startOf(e).minus(1):this},t.toFormat=function(e,t){return void 0===t&&(t={}),this.isValid?xe.create(this.loc.redefaultToEN(t)).formatDateTimeFromString(this,e):"Invalid DateTime"},t.toLocaleString=function(e){return void 0===e&&(e=b),this.isValid?xe.create(this.loc.clone(e),e).formatDateTime(this):"Invalid DateTime"},t.toLocaleParts=function(e){return void 0===e&&(e={}),this.isValid?xe.create(this.loc.clone(e),e).formatDateTimeParts(this):[]},t.toISO=function(e){return void 0===e&&(e={}),this.isValid?this.toISODate(e)+"T"+this.toISOTime(e):null},t.toISODate=function(e){var t=(void 0===e?{}:e).format,n="basic"===(void 0===t?"extended":t)?"yyyyMMdd":"yyyy-MM-dd";return this.year>9999&&(n="+"+n),Qn(this,n)},t.toISOWeekDate=function(){return Qn(this,"kkkk-'W'WW-c")},t.toISOTime=function(e){var t=void 0===e?{}:e,n=t.suppressMilliseconds,r=void 0!==n&&n,i=t.suppressSeconds,o=void 0!==i&&i,a=t.includeOffset,u=void 0===a||a,s=t.includePrefix,c=void 0!==s&&s,l=t.format;return Kn(this,{suppressSeconds:o,suppressMilliseconds:r,includeOffset:u,includePrefix:c,format:void 0===l?"extended":l})},t.toRFC2822=function(){return Qn(this,"EEE, dd LLL yyyy HH:mm:ss ZZZ",!1)},t.toHTTP=function(){return Qn(this.toUTC(),"EEE, dd LLL yyyy HH:mm:ss 'GMT'")},t.toSQLDate=function(){return Qn(this,"yyyy-MM-dd")},t.toSQLTime=function(e){var t=void 0===e?{}:e,n=t.includeOffset,r=void 0===n||n,i=t.includeZone;return Kn(this,{includeOffset:r,includeZone:void 0!==i&&i,spaceZone:!0})},t.toSQL=function(e){return void 0===e&&(e={}),this.isValid?this.toSQLDate()+" "+this.toSQLTime(e):null},t.toString=function(){return this.isValid?this.toISO():"Invalid DateTime"},t.valueOf=function(){return this.toMillis()},t.toMillis=function(){return this.isValid?this.ts:NaN},t.toSeconds=function(){return this.isValid?this.ts/1e3:NaN},t.toJSON=function(){return this.toISO()},t.toBSON=function(){return 
this.toJSDate()},t.toObject=function(e){if(void 0===e&&(e={}),!this.isValid)return{};var t=Object.assign({},this.c);return e.includeConfig&&(t.outputCalendar=this.outputCalendar,t.numberingSystem=this.loc.numberingSystem,t.locale=this.loc.locale),t},t.toJSDate=function(){return new Date(this.isValid?this.ts:NaN)},t.diff=function(e,t,n){if(void 0===t&&(t="milliseconds"),void 0===n&&(n={}),!this.isValid||!e.isValid)return an.invalid(this.invalid||e.invalid,"created by diffing an invalid DateTime");var r,i=Object.assign({locale:this.locale,numberingSystem:this.numberingSystem},n),o=(r=t,Array.isArray(r)?r:[r]).map(an.normalizeUnit),a=e.valueOf()>this.valueOf(),u=hn(a?this:e,a?e:this,o,i);return a?u.negate():u},t.diffNow=function(t,n){return void 0===t&&(t="milliseconds"),void 0===n&&(n={}),this.diff(e.now(),t,n)},t.until=function(e){return this.isValid?ln.fromDateTimes(this,e):this},t.hasSame=function(e,t){if(!this.isValid)return!1;var n=e.valueOf(),r=this.setZone(e.zone,{keepLocalTime:!0});return r.startOf(t)<=n&&n<=r.endOf(t)},t.equals=function(e){return this.isValid&&e.isValid&&this.valueOf()===e.valueOf()&&this.zone.equals(e.zone)&&this.loc.equals(e.loc)},t.toRelative=function(t){if(void 0===t&&(t={}),!this.isValid)return null;var n=t.base||e.fromObject({zone:this.zone}),r=t.padding?thisthis.set({month:1}).offset||this.offset>this.set({month:5}).offset)}},{key:"isInLeapYear",get:function(){return re(this.year)}},{key:"daysInMonth",get:function(){return oe(this.year,this.month)}},{key:"daysInYear",get:function(){return this.isValid?ie(this.year):NaN}},{key:"weeksInWeekYear",get:function(){return this.isValid?ue(this.weekYear):NaN}}],[{key:"DATE_SHORT",get:function(){return b}},{key:"DATE_MED",get:function(){return O}},{key:"DATE_MED_WITH_WEEKDAY",get:function(){return S}},{key:"DATE_FULL",get:function(){return T}},{key:"DATE_HUGE",get:function(){return M}},{key:"TIME_SIMPLE",get:function(){return N}},{key:"TIME_WITH_SECONDS",get:function(){return 
E}},{key:"TIME_WITH_SHORT_OFFSET",get:function(){return D}},{key:"TIME_WITH_LONG_OFFSET",get:function(){return I}},{key:"TIME_24_SIMPLE",get:function(){return V}},{key:"TIME_24_WITH_SECONDS",get:function(){return L}},{key:"TIME_24_WITH_SHORT_OFFSET",get:function(){return x}},{key:"TIME_24_WITH_LONG_OFFSET",get:function(){return C}},{key:"DATETIME_SHORT",get:function(){return F}},{key:"DATETIME_SHORT_WITH_SECONDS",get:function(){return Z}},{key:"DATETIME_MED",get:function(){return j}},{key:"DATETIME_MED_WITH_SECONDS",get:function(){return A}},{key:"DATETIME_MED_WITH_WEEKDAY",get:function(){return z}},{key:"DATETIME_FULL",get:function(){return _}},{key:"DATETIME_FULL_WITH_SECONDS",get:function(){return q}},{key:"DATETIME_HUGE",get:function(){return H}},{key:"DATETIME_HUGE_WITH_SECONDS",get:function(){return U}}]),e}();function cr(e){if(sr.isDateTime(e))return e;if(e&&e.valueOf&&P(e.valueOf()))return sr.fromJSDate(e);if(e&&"object"==typeof e)return sr.fromObject(e);throw new v("Unknown datetime argument: "+e+", of type "+typeof e)}return e.DateTime=sr,e.Duration=an,e.FixedOffsetZone=Re,e.IANAZone=He,e.Info=fn,e.Interval=ln,e.InvalidZone=Pe,e.LocalZone=je,e.Settings=Ke,e.VERSION="1.28.0",e.Zone=Fe,e}({}); \ No newline at end of file diff --git a/assets/js/styles.js b/assets/js/styles.js new file mode 100644 index 000000000..3b4fcd959 --- /dev/null +++ b/assets/js/styles.js @@ -0,0 +1,25 @@ +'use strict'; + +(function($) { // Begin jQuery + $(function() { // DOM ready + // If a link has a dropdown, add sub menu toggle. 
+ $('nav ul li a:not(:only-child)').click(function(eee) { + $(this).siblings('.nav-dropdown').toggle(); + // Close one dropdown when selecting another + $('.nav-dropdown').not($(this).siblings()).hide(); + eee.stopPropagation(); + }); + // Clicking away from dropdown will remove the dropdown class + $('html').click(function() { + $('.nav-dropdown').hide(); + }); + // Toggle open and close nav styles on click + $('#nav-toggle').click(function() { + $('nav ul').slideToggle(); + }); + // Hamburger to X toggle + $('#nav-toggle').on('click', function() { + this.classList.toggle('active'); + }); + }); // end DOM ready +})(jQuery); // end jQuery diff --git a/assets/js/tipuesearch.js b/assets/js/tipuesearch.js new file mode 100644 index 000000000..48245668b --- /dev/null +++ b/assets/js/tipuesearch.js @@ -0,0 +1,611 @@ + +/* +Tipue Search 7.1 +Copyright (c) 2019 Tipue +Tipue Search is released under the MIT License +http://www.tipue.com/search +*/ + + +(function($) { + + $.fn.tipuesearch = function(options) { + + var set = $.extend( { + + 'contextBuffer' : 60, + 'contextLength' : 60, + 'contextStart' : 90, + 'debug' : false, + 'descriptiveWords' : 25, + 'footerPages' : 3, + 'highlightTerms' : true, + 'imageZoom' : true, + 'minimumLength' : 3, + 'newWindow' : false, + 'show' : 10, + 'showContext' : true, + 'showRelated' : true, + 'showTime' : true, + 'showTitleCount' : true, + 'showURL' : true, + 'wholeWords' : true + }, options); + + return this.each(function() { + + var tipuesearch_t_c = 0; + + var tipue_search_w = ''; + if (set.newWindow) + { + tipue_search_w = ' target="_blank"'; + } + + function getURLP(name) + { + var locSearch = location.search; + var splitted = (new RegExp('[?|&]' + name + '=' + '([^&;]+?)(&|#|;|$)').exec(locSearch)||[,""]); + var searchString = splitted[1].replace(/\+/g, '%20'); + try + { + searchString = decodeURIComponent(searchString); + } + catch(e) + { + searchString = unescape(searchString); + } + return searchString || null; + } + + if 
(getURLP('q')) + { + $('#tipue_search_input').val(getURLP('q')); + getTipueSearch(0, true); + } + + $(this).keyup(function(event) + { + if(event.keyCode == '13') + { + getTipueSearch(0, true); + } + }); + + + function getTipueSearch(start, replace) + { + window.scrollTo(0, 0); + + var out = ''; + var show_replace = false; + var show_stop = false; + var standard = true; + var c = 0; + var found = []; + + var d_o = $('#tipue_search_input').val(); + d_o = d_o.replace(/\+/g, ' ').replace(/\s\s+/g, ' '); + + d_o = $.trim(d_o); + var d = d_o.toLowerCase(); + + if ((d.match("^\"") && d.match("\"$")) || (d.match("^'") && d.match("'$"))) + { + standard = false; + } + + var d_w = d.split(' '); + + if (standard) + { + d = ''; + for (var i = 0; i < d_w.length; i++) + { + var a_w = true; + for (var f = 0; f < tipuesearch_stop_words.length; f++) + { + if (d_w[i] == tipuesearch_stop_words[f]) + { + a_w = false; + show_stop = true; + } + } + if (a_w) + { + d = d + ' ' + d_w[i]; + } + } + d = $.trim(d); + d_w = d.split(' '); + } + else + { + d = d.substring(1, d.length - 1); + } + + if (d.length >= set.minimumLength) + { + if (standard) + { + if (replace) + { + var d_r = d; + for (var i = 0; i < d_w.length; i++) + { + for (var f = 0; f < tipuesearch_replace.words.length; f++) + { + if (d_w[i] == tipuesearch_replace.words[f].word) + { + d = d.replace(d_w[i], tipuesearch_replace.words[f].replace_with); + show_replace = true; + } + } + } + d_w = d.split(' '); + } + + var d_t = d; + for (var i = 0; i < d_w.length; i++) + { + for (var f = 0; f < tipuesearch_stem.words.length; f++) + { + if (d_w[i] == tipuesearch_stem.words[f].word) + { + d_t = d_t + ' ' + tipuesearch_stem.words[f].stem; + } + } + } + d_w = d_t.split(' '); + + for (var i = 0; i < tipuesearch.pages.length; i++) + { + var score = 0; + var s_t = tipuesearch.pages[i].text; + for (var f = 0; f < d_w.length; f++) + { + if (set.wholeWords) + { + var pat = new RegExp('\\b' + d_w[f] + '\\b', 'gi'); + } + else + { + var pat = new 
RegExp(d_w[f], 'gi'); + } + if (tipuesearch.pages[i].title.search(pat) != -1) + { + var m_c = tipuesearch.pages[i].title.match(pat).length; + score += (20 * m_c); + } + if (tipuesearch.pages[i].text.search(pat) != -1) + { + var m_c = tipuesearch.pages[i].text.match(pat).length; + score += (20 * m_c); + } + if (tipuesearch.pages[i].tags) + { + if (tipuesearch.pages[i].tags.search(pat) != -1) + { + var m_c = tipuesearch.pages[i].tags.match(pat).length; + score += (10 * m_c); + } + } + if (tipuesearch.pages[i].url.search(pat) != -1) + { + score += 20; + } + + if (score != 0) + { + for (var e = 0; e < tipuesearch_weight.weight.length; e++) + { + if (tipuesearch.pages[i].url == tipuesearch_weight.weight[e].url) + { + score += tipuesearch_weight.weight[e].score; + } + } + } + + if (d_w[f].match('^-')) + { + pat = new RegExp(d_w[f].substring(1), 'i'); + if (tipuesearch.pages[i].title.search(pat) != -1 || tipuesearch.pages[i].text.search(pat) != -1 || tipuesearch.pages[i].tags.search(pat) != -1) + { + score = 0; + } + } + } + + if (score != 0) + { + found.push( + { + "score": score, + "title": tipuesearch.pages[i].title, + "desc": s_t, + "img": tipuesearch.pages[i].img, + "url": tipuesearch.pages[i].url, + "note": tipuesearch.pages[i].note + }); + c++; + } + } + } + else + { + for (var i = 0; i < tipuesearch.pages.length; i++) + { + var score = 0; + var s_t = tipuesearch.pages[i].text; + var pat = new RegExp(d, 'gi'); + if (tipuesearch.pages[i].title.search(pat) != -1) + { + var m_c = tipuesearch.pages[i].title.match(pat).length; + score += (20 * m_c); + } + if (tipuesearch.pages[i].text.search(pat) != -1) + { + var m_c = tipuesearch.pages[i].text.match(pat).length; + score += (20 * m_c); + } + if (tipuesearch.pages[i].tags) + { + if (tipuesearch.pages[i].tags.search(pat) != -1) + { + var m_c = tipuesearch.pages[i].tags.match(pat).length; + score += (10 * m_c); + } + } + if (tipuesearch.pages[i].url.search(pat) != -1) + { + score += 20; + } + + if (score != 0) + { + for 
(var e = 0; e < tipuesearch_weight.weight.length; e++) + { + if (tipuesearch.pages[i].url == tipuesearch_weight.weight[e].url) + { + score += tipuesearch_weight.weight[e].score; + } + } + } + + if (score != 0) + { + found.push( + { + "score": score, + "title": tipuesearch.pages[i].title, + "desc": s_t, + "img": tipuesearch.pages[i].img, + "url": tipuesearch.pages[i].url, + "note": tipuesearch.pages[i].note + }); + c++; + } + } + } + + if (c != 0) + { + if (set.showTitleCount && tipuesearch_t_c == 0) + { + var title = document.title; + document.title = '(' + c + ') ' + title; + tipuesearch_t_c++; + } + + if (c == 1) + { + out += '
' + tipuesearch_string_4; + } + else + { + var c_c = c.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ","); + out += '
' + c_c + ' ' + tipuesearch_string_5; + } + if (set.showTime) + { + var endTimer = new Date().getTime(); + var time = (endTimer - startTimer) / 1000; + out += ' (' + time.toFixed(2) + ' ' + tipuesearch_string_14 + ')'; + set.showTime = false; + } + out += '
'; + + if (set.showRelated && standard) + { + var ront = ''; + f = 0; + for (var i = 0; i < tipuesearch_related.Related.length; i++) + { + if (d == tipuesearch_related.Related[i].search) + { + if (!f) + { + out += ''; + out += ront; + } + } + + if (show_replace) + { + out += '
' + tipuesearch_string_2 + ' ' + d + '. ' + tipuesearch_string_3 + ' ' + d_r + '
'; + } + + found.sort(function(a, b) { return b.score - a.score } ); + + var l_o = 0; + + if (set.imageZoom) + { + out += '
'; + } + + for (var i = 0; i < found.length; i++) + { + if (l_o >= start && l_o < set.show + start) + { + out += '
'; + + out += ''; + + if (set.debug) + { + out += '
Score: ' + found[i].score + '
'; + } + + if (set.showURL) + { + var s_u = found[i].url.toLowerCase(); + if (s_u.indexOf('http://') == 0) + { + s_u = s_u.slice(7); + } + out += ''; + } + + if (found[i].img) + { + if (set.imageZoom) + { + out += '
' + found[i].title + '
'; + } + else + { + out += '
' + found[i].title + '
'; + } + } + + if (found[i].desc) + { + var t = found[i].desc; + + if (set.showContext) + { + d_w = d.split(' '); + var s_1 = found[i].desc.toLowerCase().indexOf(d_w[0]); + if (s_1 > set.contextStart) + { + var t_1 = t.substr(s_1 - set.contextBuffer); + var s_2 = t_1.indexOf(' '); + t_1 = t.substr(s_1 - set.contextBuffer + s_2); + t_1 = $.trim(t_1); + + if (t_1.length > set.contextLength) + { + t = '... ' + t_1; + } + } + } + + if (standard) + { + d_w = d.split(' '); + for (var f = 0; f < d_w.length; f++) + { + if (set.highlightTerms) + { + var patr = new RegExp('(' + d_w[f] + ')', 'gi'); + t = t.replace(patr, "$1"); + } + } + } + else if (set.highlightTerms) + { + var patr = new RegExp('(' + d + ')', 'gi'); + t = t.replace(patr, "$1"); + } + + var t_d = ''; + var t_w = t.split(' '); + if (t_w.length < set.descriptiveWords) + { + t_d = t; + } + else + { + for (var f = 0; f < set.descriptiveWords; f++) + { + t_d += t_w[f] + ' '; + } + } + t_d = $.trim(t_d); + if (t_d.charAt(t_d.length - 1) != '.') + { + t_d += ' ...'; + } + + t_d = t_d.replace(/h0011/g, 'span class=\"tipue_search_content_bold\"'); + t_d = t_d.replace(/h0012/g, '/span'); + + out += '
' + t_d + '
'; + } + + if (found[i].note) + { + out += '
' + found[i].note + '
'; + } + + out += '
'; + } + l_o++; + } + + if (c > set.show) + { + var pages = Math.ceil(c / set.show); + var page = (start / set.show); + if (set.footerPages < 3) + { + set.footerPages = 3; + } + + out += '
    '; + + if (start > 0) + { + out += '
  • ' + tipuesearch_string_6 + '
  • '; + } + + if (page <= 2) + { + var p_b = pages; + if (pages > set.footerPages) + { + p_b = set.footerPages; + } + for (var f = 0; f < p_b; f++) + { + if (f == page) + { + out += ''; + } + else + { + out += '
  • ' + (f + 1) + '
  • '; + } + } + } + else + { + var p_b = page + set.footerPages - 1; + if (p_b > pages) + { + p_b = pages; + } + for (var f = page - 1; f < p_b; f++) + { + if (f == page) + { + out += ''; + } + else + { + out += '
  • ' + (f + 1) + '
  • '; + } + } + } + + if (page + 1 != pages) + { + out += '
  • ' + tipuesearch_string_7 + '
  • '; + } + + out += '
'; + } + + } + else + { + out += '
' + tipuesearch_string_8 + '
'; + } + } + else + { + if (show_stop) + { + out += '
' + tipuesearch_string_8 + ' ' + tipuesearch_string_9 + '
'; + } + else + { + if (set.minimumLength == 1) + { + out += '
' + tipuesearch_string_11 + '
'; + } + else + { + out += '
' + tipuesearch_string_12 + ' ' + set.minimumLength + ' ' + tipuesearch_string_13 + '
'; + } + } + } + + $('#tipue_search_content').hide().html(out).slideDown(200); + + $('#tipue_search_replaced').click(function() + { + getTipueSearch(0, false); + }); + + $('.tipue_search_related_btn').click(function() + { + $('#tipue_search_input').val($(this).attr('id')); + getTipueSearch(0, true); + }); + + $('.tipue_search_image_zoom').click(function() + { + $('#tipue_search_image_modal').fadeIn(300); + $('#tipue_search_zoom_img').attr('src', this.src); + + var z_u = $(this).attr('data-url'); + $('#tipue_search_zoom_url').attr('href', z_u); + + var z_o = this.alt + ''; + + $('#tipue_search_zoom_text').html(z_o); + }); + + $('.tipue_search_image_close').click(function() + { + $('#tipue_search_image_modal').fadeOut(300); + }); + + $('.tipue_search_foot_box').click(function() + { + var id_v = $(this).attr('id'); + var id_a = id_v.split('_'); + + getTipueSearch(parseInt(id_a[0]), id_a[1]); + }); + } + + }); + }; + +})(jQuery); diff --git a/assets/js/tipuesearch.min.js b/assets/js/tipuesearch.min.js new file mode 100644 index 000000000..f50ef6ee6 --- /dev/null +++ b/assets/js/tipuesearch.min.js @@ -0,0 +1,181 @@ +(function($){$.fn.tipuesearch=function(options){var set=$.extend({'contextBuffer':60,'contextLength':60,'contextStart':90,'debug':false,'descriptiveWords':25,'footerPages':3,'highlightTerms':true,'imageZoom':true,'minimumLength':3,'newWindow':false,'show':10,'showContext':true,'showRelated':true,'showTime':true,'showTitleCount':true,'showURL':true,'wholeWords':true},options);return this.each(function(){var tipuesearch_t_c=0;var tipue_search_w='';if(set.newWindow) +{tipue_search_w=' target="_blank"';} +function getURLP(name) +{var locSearch=location.search;var splitted=(new RegExp('[?|&]'+name+'='+'([^&;]+?)(&|#|;|$)').exec(locSearch)||[,""]);var searchString=splitted[1].replace(/\+/g,'%20');try +{searchString=decodeURIComponent(searchString);} +catch(e) +{searchString=unescape(searchString);} +return searchString||null;} +if(getURLP('q')) 
+{$('#tipue_search_input').val(getURLP('q'));getTipueSearch(0,true);} +$(this).keyup(function(event) +{if(event.keyCode=='13') +{getTipueSearch(0,true);}});function getTipueSearch(start,replace) +{window.scrollTo(0,0);var out='';var show_replace=false;var show_stop=false;var standard=true;var c=0;var found=[];var d_o=$('#tipue_search_input').val();d_o=d_o.replace(/\+/g,' ').replace(/\s\s+/g,' ');d_o=$.trim(d_o);var d=d_o.toLowerCase();if((d.match("^\"")&&d.match("\"$"))||(d.match("^'")&&d.match("'$"))) +{standard=false;} +var d_w=d.split(' ');if(standard) +{d='';for(var i=0;i=set.minimumLength) +{if(standard) +{if(replace) +{var d_r=d;for(var i=0;i'+c_c+' '+tipuesearch_string_5;} +if(set.showTime) +{var endTimer=new Date().getTime();var time=(endTimer-startTimer)/ 1000;out+=' ('+time.toFixed(2)+' '+tipuesearch_string_14+')';set.showTime=false;} +out+='
';if(set.showRelated&&standard) +{var ront='';f=0;for(var i=0;i'+tipuesearch_related.Related[i].related+', ';f++;}} +if(f) +{ront=ront.slice(0,-2);ront+='.';out+=ront;}} +if(show_replace) +{out+='
'+tipuesearch_string_2+' '+d+'. '+tipuesearch_string_3+' '+d_r+'
';} +found.sort(function(a,b){return b.score-a.score});var l_o=0;if(set.imageZoom) +{out+='
';} +for(var i=0;i=start&&l_o'+found[i].title+'';if(set.debug) +{out+='
Score: '+found[i].score+'
';} +if(set.showURL) +{var s_u=found[i].url.toLowerCase();if(s_u.indexOf('http://')==0) +{s_u=s_u.slice(7);} +out+='';} +if(found[i].img) +{if(set.imageZoom) +{out+='
'+found[i].title+'
';} +else +{out+='
'+found[i].title+'
';}} +if(found[i].desc) +{var t=found[i].desc;if(set.showContext) +{d_w=d.split(' ');var s_1=found[i].desc.toLowerCase().indexOf(d_w[0]);if(s_1>set.contextStart) +{var t_1=t.substr(s_1-set.contextBuffer);var s_2=t_1.indexOf(' ');t_1=t.substr(s_1-set.contextBuffer+s_2);t_1=$.trim(t_1);if(t_1.length>set.contextLength) +{t='... '+t_1;}}} +if(standard) +{d_w=d.split(' ');for(var f=0;f$1");}}} +else if(set.highlightTerms) +{var patr=new RegExp('('+d+')','gi');t=t.replace(patr,"$1");} +var t_d='';var t_w=t.split(' ');if(t_w.length'+t_d+'';} +if(found[i].note) +{out+='
'+found[i].note+'
';} +out+='';} +l_o++;} +if(c>set.show) +{var pages=Math.ceil(c / set.show);var page=(start / set.show);if(set.footerPages<3) +{set.footerPages=3;} +out+='
    ';if(start>0) +{out+='
  • '+tipuesearch_string_6+'
  • ';} +if(page<=2) +{var p_b=pages;if(pages>set.footerPages) +{p_b=set.footerPages;} +for(var f=0;f';} +else +{out+='
  • '+(f+1)+'
  • ';}}} +else +{var p_b=page+set.footerPages-1;if(p_b>pages) +{p_b=pages;} +for(var f=page-1;f';} +else +{out+='
  • '+(f+1)+'
  • ';}}} +if(page+1!=pages) +{out+='
  • '+tipuesearch_string_7+'
  • ';} +out+='
';}} +else +{out+='
'+tipuesearch_string_8+'
';}} +else +{if(show_stop) +{out+='
'+tipuesearch_string_8+' '+tipuesearch_string_9+'
';} +else +{if(set.minimumLength==1) +{out+='
'+tipuesearch_string_11+'
';} +else +{out+='
'+tipuesearch_string_12+' '+set.minimumLength+' '+tipuesearch_string_13+'
';}}} +$('#tipue_search_content').hide().html(out).slideDown(200);$('#tipue_search_replaced').click(function() +{getTipueSearch(0,false);});$('.tipue_search_related_btn').click(function() +{$('#tipue_search_input').val($(this).attr('id'));getTipueSearch(0,true);});$('.tipue_search_image_zoom').click(function() +{$('#tipue_search_image_modal').fadeIn(300);$('#tipue_search_zoom_img').attr('src',this.src);var z_u=$(this).attr('data-url');$('#tipue_search_zoom_url').attr('href',z_u);var z_o=this.alt+'';$('#tipue_search_zoom_text').html(z_o);});$('.tipue_search_image_close').click(function() +{$('#tipue_search_image_modal').fadeOut(300);});$('.tipue_search_foot_box').click(function() +{var id_v=$(this).attr('id');var id_a=id_v.split('_');getTipueSearch(parseInt(id_a[0]),id_a[1]);});}});};})(jQuery); \ No newline at end of file diff --git a/assets/js/tipuesearch_content.js b/assets/js/tipuesearch_content.js new file mode 100644 index 000000000..002ce6a4f --- /dev/null +++ b/assets/js/tipuesearch_content.js @@ -0,0 +1,2740 @@ +var tipuesearch = { + "pages": [ + { + "title": "How to help?", + "text": "How to help PyPy?\nHere are some ideas to help PyPy move forward:\n\nuse pypy for your projects and provide detailed feedback\ntalk to us about how to support Python 3.x\nwrite blog posts or tweets about your experiences\nhelp porting to new platforms\ncontact us and get involved\ndonate some money to enable others to help\ntake on our consultants and make PyPy work better for your", + "tags": "", + "url": "https://www.pypy.org/howtohelp.html" + }, + { + "title": "Guest Post: How PortaOne uses PyPy for high-performance processing, connecting over 1B of phone calls every month", + "text": "The PyPy project is always happy to hear about industrial use and deployments\nof PyPy. 
For the GC bug\nfinding\ntask earlier this year, we collaborated with PortaOne and we're super happy\nthat Serhii Titov, head of the QA department at PortaOne, was up to writing\nthis guest post to describe their use and experience with the project.\n\nWhat does PortaOne do?\nWe at PortaOne Inc. allow telecom operators to\nlaunch new services (or provide existing services more efficiently) using our\nVoIP platform (PortaSIP) and our real-time charging system (PortaBilling),\nwhich provides additional features for cloud PBX, such as call transfer,\nqueues, interactive voice response (IVR) and more. At this moment our support\nteam manages several thousand servers with our software installed in 100\ncountries, through which over 500 telecommunication service providers connect\nmillions of end users every day. The unique thing about PortaOne is that we\nsupply the source code of our product to our customers - something unheard of\nin the telecom world! Thus we attract \"telco innovators\", who use our APIs to\nbuild around the system and the source code to create unique tweaks of\nfunctionality, which produces amazing products.\nAt the core of PortaSIP is the middle-ware component (the proper name for it is\n\"B2BUA\", but that probably does not say much to anyone outside of experts in\nVoIP), which implements the actual handling of SIP calls, messages, etc. and\nall added features (for instance, trying to send a call via telco operators\nthrough which the cost per minute is lower). It has to be fast (since even a\nsmall delay in establishing a call is noticed by a customer), reliable\n(everyone hates when a call drops or cannot be completed) and yet easily\nexpandable with new functionality. This is why we decided to use Python as\nopposed to C/C++ or similar programming languages, which are often used in\ntelecom equipment.\nThe B2BUA component is a batch of similar Python processes that are looped\ninside a\nasyncore.dispatcher\nwrapper. 
The load balancing between these Python processes is done by our\nstateless SIP proxy server written in C++. All our sockets are served by this\nB2BUA. We have our custom client-wrappers around pymysql, redis,\ncassandra-driver and requests to communicate with external services. Some\nof the Python processes use cffi\nwrappers around C-code to improve their performance (examples: an Oracle DB\ndriver, a client to a radius server, a custom C logger).\nThe I/O operations that block the main thread of the Python processes are\nprocessed in sub-threads. We have custom wrappers around threading.Thread\nand also asyncore.dispatcher. The results of such operations are returned to\nthe main thread.\nImproving our performance with PyPy\nWe started with CPython and then in 2014 switched to PyPy because it was\nfaster. Here's an exact quote from our first testing notes: \"PyPy gives\nsignificant performance boost, ~50%\". Nowadays, after years of changes in all\nthe software involved, PyPy still gives us +50% boost compared to CPython.\nTaking care of real time traffic for so many people around the globe is\nsomething we're really proud of. I hope the PyPy team can be proud of it as\nwell, as the PyPy product is a part of this solution.\nFinding a garbage collector bug: stage 1, the GC hooks\nHowever our path with PyPy wasn't perfectly smooth. There were very rare cases\nof crashes on PyPy that we weren't able to catch. 
That's because to make\ncoredump useful we needed to switch to PyPy with debug, but we cannot let it\nrun in that mode on a production system for an extended period of time, and we\ndid not have any STR (steps-to-reproduce) to make PyPy crash again in our lab.\nThat's why we kept (and still keep) both interpreters installed just in case,\nand we would switch to CPython if we noticed it happening.\nAt the time of updating PyPy from 3.5 to 3.6 our QA started noticing those\ncrashes more often, but we still had no luck with STR or collecting proper\ncoredumps with debug symbols. Then it became even worse after our development\nplayed with the Garbage Collector's\noptions to increase performance\nof our middleware component. The crashes started to affect our regular\nperformance testing (controlled by QA manager Yevhenii Bovda). At that point it\nwas decided that we can no longer live like that and so we started an intense\ninvestigation.\nDuring the first stage of our investigation (following the best practice of\ntroubleshooting) we narrowed down the issue as much as we could. So, it was not\nour code, it was definitely somewhere in PyPy. Eventually our SIP software\nengineer Yevhenii Yatchenko found out\nthat this bug is connected with the use of our custom hooks in the\nGC. Yevhenii created\nticket #4899 and within 2-3 days we\ngot a fix from a member of the PyPy team, in true open-source fashion.\nFinding a garbage collector bug: stage 2, the real bug\nThen came stage 2. In parallel with the previous ticket, Yevhenii created\n#4900 that we still see failing\nwith coredumps quite often, and they are not connected to GC custom hooks. In a\nnutshell, it took us dozens of back and forward emails, three Zoom sessions and\nfour versions of a patch to solve the issue. During the last iteration we got a\nnew set of options to try and a new version of the patch. Surprisingly, that\nhelped! What a relief! 
So, the next logical step was to remove all debug\noptions and run PyPy only with the patch. Unfortunately, it started to fail\nagain and we came to the obvious conclusion that what will help us is not a\npatch, but one of options we were testing out. At that point we found out that\nPYPY_GC_MAX_PINNED=0\nis a necessary and sufficient condition to solve our issue. This points to\nanother bug in the garbage collector, somehow related to object pinning.\nHere's our current state: we have to add PYPY_GC_MAX_PINNED=0, but we do not\nface the crashes anymore.\nConclusion and next steps\nGratitude is extended to Carl for his invaluable assistance in resolving the\nnasty bugss, because it seems we're the only ones who suffered from the last\none and we really did not want to fall back to CPython due to its performance\ndisadvantage.\nSerhii Titov, head of the QA department at PortaOne Inc.\nP.S. If you are a perfectionist and at this point you have mixed feelings and\nyou are still bothered by the question \"But there might still be a bug in the\nGC, what about that?\" - Carl has some ideas about it and he will sort it out\n(we will help with the testing/verification part).", + "tags": "casestudy,guestpost", + "url": "https://www.pypy.org/posts/2024/08/portaone.html" + }, + { + "title": "PyPy v7.3.17 release", + "text": "PyPy v7.3.17: release of python 2.7 and 3.10\nThe PyPy team is proud to release version 7.3.17 of PyPy.\nThis release includes a new RISC-V JIT backend, an improved REPL based on\nwork by the CPython team, and better JIT optimizations of integer\noperations. 
Special shout-outs to Logan Chien for the RISC-V backend\nwork, to Nico Rittinghaus for better integer optimization in the JIT, and\nthe CPython team that has worked on the repl.\nThe release includes two different interpreters:\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.10, which is an interpreter supporting the syntax and the features of\nPython 3.10, including the stdlib for CPython 3.10.14.\n\nThe interpreters are based on much the same codebase, thus the dual\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases. It follows after 7.3.16 release on April 23, 2024.\nWe recommend updating. You can find links to download the releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear\nabout it and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: bug fixes,\nPyPy and RPython documentation improvements, or general help with\nmaking RPython's JIT even better.\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a HPy / CFFI / cppyy version of your library that would be performant\non PyPy. In any case, both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nRISC-V backend for the JIT\nPyPy's JIT has added support for generating 64-bit RISC-V machine code at\nruntime (RV64-IMAD, specifically). 
So far we are not releasing binaries for any\nRISC-V platforms, but there are instructions on how to cross-compile binaries.\n\n\nREPL Improvements\nThe biggest user-visible change of the release is new features in the repl of\nPyPy3.10. CPython 3.13 has adopted and extended PyPy's pure-Python repl, adding\na number of features and fixing a number or bugs in the process. We have\nbackported and added the following features:\n\nPrompts and tracebacks use terminal colors, as well as terminal hyperlinks\nfor file names.\nBracketed paste enable pasting several lines of input into the terminal\nwithout auto-indentation getting in the way.\nA special interactive help browser (F1), history browser (F2), explicit paste\nmode (F3).\nSupport for Ctrl- to jump over whole words at a time.\n\nSee the CPython documentation for further details. Thanks to \u0141ukasz Langa,\nPablo Galindo Salgado and the other CPython devs involved in this work.\n\n\nBetter JIT optimizations of integer operations\nThe optimizers of PyPy's JIT have become much better at reasoning about and\noptimizing integer operations. This is done with a new \"knownbits\" abstract\ndomain. In many programs that do bit-manipulation of integers, some of the\nbits of the integer variables of the program can be statically known. Here's a\nsimple example:\nx = a | 1\n...\nif x & 1:\n ...\nelse:\n ...\n\nWith the new abstract domain, the JIT can optimize the if-condition to\nTrue, because it already knows that the lowest bit of x must be set.\nThis optimization applies to all Python-integers that fit into a machine word\n(PyPy optimistically picks between two different representations for int,\ndepending on the size of the value). Unfortunately there is very little impact\nof this change on almost all Python code, because intensive bit-manipulation is\nrare in Python. 
However, the change leads to significant performance\nimprovements in Pydrofoil (the RPython-based RISC-V/ARM emulators that are\nautomatically generated from high-level Sail specifications of the respective\nISAs, and that use the RPython JIT to improve performance).\n\n\nPyPy versions and speed.pypy.org\nThe keen-eyed will have noticed no mention of Python version 3.9 in the\nreleases above. Typically we will maintain only one version of Python3, but due\nto PyPy3.9 support on conda-forge we maintained multiple versions from the\nfirst release of PyPy3.10 in PyPy v7.3.12 (Dec 2022). Conda-forge is\nsunsetting its PyPy support, which means we can drop PyPy3.9. Since that was\nthe major driver of benchmarks at https://speed.pypy.org, we revamped the site\nto showcase PyPy3.9, PyPy3.10, and various versions of cpython on the home\npage. For historical reasons, the \"baseline\" for comparison is still cpython\n3.7.19.\nWe will keep the buildbots building PyPY3.9 until the end of August, these\nbuilds will still be available on the nightly builds tab of the buildbot.\n\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython\nIt's fast (PyPy and CPython performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nWe provide binary builds for:\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)\n64-bit ARM machines running Linux (aarch64) and macos (macos_arm64).\n\nPyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, Linux ARM\n32 bit, RISC-V RV64IMAFD Linux, and s390x Linux but does not release binaries.\nPlease reach out to us if you wish to sponsor binary releases for those\nplatforms. 
Downstream packagers provide binary builds for debian, Fedora,\nconda, OpenBSD, FreeBSD, Gentoo, and more.\n\n\nWhat else is new?\nFor more information about the 7.3.17 release, see the full changelog.\nPlease update, and continue to help us make pypy better.\nCheers,\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2024/08/pypy-v7317-release.html" + }, + { + "title": "Conda-forge proposes sunsetting support for PyPy", + "text": "Conda-forge has kindly been providing support for PyPy since 2019. The\nconda-forge team has been very patient and generous with resources, but it\nseems the uptake of PyPy has not justified the effort. Major packages still\nare not available on PyPy,\nothers find it hard to update\nversions. We don't\nget much feedback at all about people using PyPy, and even less about PyPy on\nconda-forge. The conda-forge team has proposed sunsetting\nPyPy going\nforward, which means current packages would remain but no new packages would be\nbuilt. If you have an opinion, you can comment on that PR, or on this blog post.\nSince conda-forge supports PyPy3.9 but not PyPy3.10, we have continued\nreleasing PyPy3.9 even though we typically support only one version of PyPy3.\nWith the sunsetting proposal, we will not release any more updates to PyPy3.9.\nI opened a poll about the\nintention to drop PyPy3.9. If you have an opinion, please chime in.", + "tags": "conda-forge", + "url": "https://www.pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.html" + }, + { + "title": "A Knownbits Abstract Domain for the Toy Optimizer, Correctly", + "text": "After Max' introduction to abstract interpretation for the toy optimizer in the\nlast post, I want to present a more complicated abstract domain in this post.\nThis abstract domain reasons about the individual bits of a variable in a trace.\nEvery bit can be either \"known zero\", \"known one\" or \"unknown\". 
The abstract\ndomain is useful for optimizing integer operations, particularly the bitwise operations.\nThe abstract domain follows quite closely the tristate abstract domain of the\neBPF verifier in the Linux\nKernel, as\ndescribed by the paper\nSound, Precise, and Fast Abstract Interpretation with Tristate\nNumbers by Harishankar Vishwanathan, Matan\nShachnai, Srinivas Narayana, and Santosh Nagarakatte.\nThe presentation in this post will still be in the context of the\ntoy optimizer. We'll spend a significant part of\nthe post convincing ourselves that the abstract domain transfer functions that\nwe're writing are really correct, using both property-based testing and\nautomated proofs (again using Z3).\nPyPy has implemented and merged a more complicated version of the same abstract\ndomain for the \"real\" PyPy JIT. A more thorough explanation of that real world\nimplementation will follow.\nI'd like to thank Max Bernstein and Armin Rigo for lots of great feedback on\ndrafts of this post. The PyPy implementation was mainly done by Nico\nRittinghaus and me.\nContents:\n\n\nMotivation\nThe Knownbits Abstract Domain\nTransfer Functions\nProperty-based Tests with Hypothesis\nWhen are Transfer Functions Correct? How do we test them?\nImplementing Binary Transfer Functions\nAddition and Subtraction\nProving correctness of the transfer functions with Z3\nCases where this style of Z3 proof doesn't work\nMaking Statements about Precision\nUsing the Abstract Domain in the Toy Optimizer for Generalized Constant Folding\nUsing the KnownBits Domain for Conditional Peephole Rewrites\nConclusion\n\n\nMotivation\nIn many programs that do bit-manipulation of integers, some of the bits of the\ninteger variables of the program can be statically known. 
Here's a simple\nexample:\nx = a | 1\n...\nif x & 1:\n ...\nelse:\n ...\n\n\nAfter the assignment x = a | 1, we know that the lowest bit of x must be 1\n(the other bits are unknown) and an optimizer could remove the condition x & 1 by\nconstant-folding it to 1.\nAnother (more complicated) example is:\nassert i & 0b111 == 0 # check that i is a multiple of 8\nj = i + 16\nassert j & 0b111 == 0\n\n\nThis kind of code could e.g. happen in a CPU\nemulator, where i and j are\nintegers that represent emulated pointers, and the asserts are alignment\nchecks. The first assert implies that the lowest three bits of i must be 0.\nAdding 16 to such a number produces a result where the lowest three bits are\nagain all 0, therefore the second assert is always true. So we would like a\ncompiler to remove the second assert.\nBoth of these optimizations are doable with the help of the knownbits\nabstract domain that we'll discuss in the rest of the post.\nThe Knownbits Abstract Domain\nAn abstract value of the knownbits domain needs to be able to store, for every\nbit of an integer variable in a program, whether it is known 0, known 1, or\nunknown. To represent\nthree different states, we need 2 bits, which we will call one and unknown.\nHere's the encoding:\n\n\n\none\nunknown\nknownbit\n\n\n\n\n0\n0\n0\n\n\n1\n0\n1\n\n\n0\n1\n?\n\n\n1\n1\nillegal\n\n\n\nThe unknown bit is set if we don't know the value of the bit (\"?\"), the one\nbit is set if the bit is known to be a 1. Since two bits are enough to encode\nfour different states, but we only need three, the combination of a set one\nbit and a set unknown is not allowed.\nWe don't just want to encode a single bit, however. Instead, we want to do this\nfor all the bits of an integer variable. 
Therefore the instances of the abstract\ndomain get two integer fields ones and unknowns, where each pair of\ncorresponding bits encodes the knowledge about the corresponding bit of the\ninteger variable in the program.\nWe can start implementing a Python class that works like this:\nfrom dataclasses import dataclass\n\n@dataclass(eq=False)\nclass KnownBits:\n ones : int\n unknowns : int\n\n def __post_init__(self):\n if isinstance(self.ones, int):\n assert self.is_well_formed()\n\n def is_well_formed(self):\n # a bit cannot be both 1 and unknown\n return self.ones & self.unknowns == 0\n\n @staticmethod\n def from_constant(const : int):\n \"\"\" Construct a KnownBits corresponding to a constant, where all bits\n are known.\"\"\"\n return KnownBits(const, 0)\n\n def is_constant(self):\n \"\"\" Check if the KnownBits instance represents a constant. \"\"\"\n # it's a constant if there are no unknowns\n return self.unknowns == 0\n\n\nWe can also add some convenience properties. Sometimes it is easier to work\nwith an integer where all the known bits are set, or one where the positions\nof all the known zeros have a set bit:\nclass KnownBits:\n ...\n\n @property\n def knowns(self):\n \"\"\" return an integer where the known bits are set. \"\"\"\n # the knowns are just the unknowns, inverted\n return ~self.unknowns\n\n @property\n def zeros(self):\n \"\"\" return an integer where the places that are known zeros have a bit\n set. \"\"\"\n # it's a 0 if it is known, but not 1\n return self.knowns & ~self.ones\n\n\nAlso, for debugging and for writing tests we want a way to print the known bits\nin a human-readable form, and also to have a way to construct a KnownBits\ninstance from a string. 
It's not important to understand the details of\n__str__ or from_str for the rest of the post, so I'm putting them into a fold:\n\nKnownBits from and to string conversions\n\n\nclass KnownBits:\n ...\n\n def __repr__(self):\n if self.is_constant():\n return f\"KnownBits.from_constant({self.ones})\"\n return f\"KnownBits({self.ones}, {self.unknowns})\"\n\n def __str__(self):\n res = []\n ones, unknowns = self.ones, self.unknowns\n # construct the string representation right to left\n while 1:\n if not ones and not unknowns:\n break # we leave off the leading known 0s\n if ones == -1 and not unknowns:\n # -1 has all bits set in two's complement, so the leading\n # bits are all 1\n res.append('1')\n res.append(\"...\")\n break\n if unknowns == -1:\n # -1 has all bits set in two's complement, so the leading bits\n # are all ?\n assert not ones\n res.append(\"?\")\n res.append(\"...\")\n break\n if unknowns & 1:\n res.append('?')\n elif ones & 1:\n res.append('1')\n else:\n res.append('0')\n ones >>= 1\n unknowns >>= 1\n if not res:\n res.append('0')\n res.reverse()\n return \"\".join(res)\n\n @staticmethod\n def from_str(s):\n \"\"\" Construct a KnownBits instance that from a string. String can start\n with ...1 to mean that all higher bits are 1, or ...? to mean that all\n higher bits are unknown. Otherwise it is assumed that the higher bits\n are all 0. 
\"\"\"\n ones, unknowns = 0, 0\n startindex = 0\n if s.startswith(\"...?\"):\n unknowns = -1\n startindex = 4\n elif s.startswith(\"...1\"):\n ones = -1\n startindex = 4\n for index in range(startindex, len(s)):\n ones <<= 1\n unknowns <<= 1\n c = s[index]\n if c == '1':\n ones |= 1\n elif c == '?':\n unknowns |= 1\n return KnownBits(ones, unknowns)\n\n @staticmethod\n def all_unknown():\n \"\"\" convenience constructor for the \"all bits unknown\" abstract value\n \"\"\"\n return KnownBits.from_str(\"...?\")\n\n\n\n\n\n\nAnd here's a pytest-style unit test for str:\ndef test_str():\n assert str(KnownBits.from_constant(0)) == '0'\n assert str(KnownBits.from_constant(5)) == '101'\n assert str(KnownBits(5, 0b10)) == '1?1'\n assert str(KnownBits(~0b1111, 0b10)) == '...100?0'\n assert str(KnownBits(1, ~0b1)) == '...?1'\n\n\nAn instance of KnownBits represents a set of integers, namely those that match\nthe known bits stored in the instance. We can write a method contains that\ntakes a concrete int value and returns True if the value matches the\npattern of the known bits:\nclass KnownBits:\n ...\n\n def contains(self, value : int):\n \"\"\" Check whether the KnownBits instance contains the concrete integer\n `value`. \"\"\"\n # check whether value matches the bit pattern. in the places where we\n # know the bits, the value must agree with ones.\n return value & self.knowns == self.ones\n\n\nand a test:\ndef test_contains():\n k1 = KnownBits.from_str('1?1')\n assert k1.contains(0b111)\n assert k1.contains(0b101)\n assert not k1.contains(0b110)\n assert not k1.contains(0b011)\n\n k2 = KnownBits.from_str('...?1') # all odd numbers\n for i in range(-101, 100):\n assert k2.contains(i) == (i & 1)\n\n\nTransfer Functions\nNow that we have implemented the basics of the KnownBits class, we need to\nstart implementing the transfer functions. 
They are for computing what we know\nabout the results of an operation, given the knowledge we have about the bits\nof the arguments.\nWe'll start with a simple unary operation, invert(x) (which is ~x in Python\nand C syntax), which flips all the bits of an integer. If we know some bits of\nthe arguments, we can compute the corresponding bits of the result. The unknown\nbits remain unknown.\nHere's the code:\nclass KnownBits:\n ...\n\n def abstract_invert(self):\n # self.zeros has bits set where the known 0s are in self\n return KnownBits(self.zeros, self.unknowns)\n\n\nAnd a unit-test:\ndef test_invert():\n k1 = KnownBits.from_str('01?01?01?')\n k2 = k1.abstract_invert()\n assert str(k2) == '...10?10?10?'\n\n k1 = KnownBits.from_str('...?')\n k2 = k1.abstract_invert()\n assert str(k2) == '...?'\n\n\nBefore we continue with further transfer functions, we'll think about\ncorrectness of the transfer functions and build up some test infrastructure. To\ntest transfer functions, it's quite important to move beyond simple example-style\nunit tests. The state-space for more complicated binary transfer functions is\nextremely large and it's too easy to do something wrong in a corner case.\nTherefore we'll look at property-based tests for KnownBits next.\nProperty-based Tests with Hypothesis\nWe want to do property-based tests of KnownBits, to try to\nmake it less likely that we'll get a corner-case in the implementation wrong.\nWe'll use Hypothesis for that.\nI can't give a decent introduction to Hypothesis here, but want to give a few\nhints about the API. Hypothesis is a way to run unit tests with randomly\ngenerated input. It provides strategies to describe the data that the test\nfunction expects. 
Hypothesis provides primitive strategies (for things like\nintegers, strings, floats, etc) and ways to build composite strategies out of\nthe primitive ones.\nTo be able to write the tests, we need to generate random KnownBits instances,\nand we also want an int instance that is a member of the KnownBits instance.\nWe generate tuples of (KnownBits, int) together, to ensure this property.\nWe'll ask Hypothesis to generate us a random concrete int as the concrete\nvalue, and then we'll also generate a second random int to use as the\nunknown masks (i.e. which bits of the concrete int we don't know in the\nKnownBits instance). Here's a function that takes two such ints and builds the\ntuple:\ndef build_knownbits_and_contained_number(concrete_value : int, unknowns : int):\n # to construct a valid KnownBits instance, we need to mask off the unknown\n # bits\n ones = concrete_value & ~unknowns\n return KnownBits(ones, unknowns), concrete_value\n\n\nWe can turn this function into a hypothesis strategy to generate input data\nusing the strategies.builds function:\nfrom hypothesis import strategies, given, settings\n\nints = strategies.integers()\n\nrandom_knownbits_and_contained_number = strategies.builds(\n build_knownbits_and_contained_number,\n ints, ints\n)\n\n\nOne important special case of KnownBits are the constants, which contain only\na single concrete value. 
We'll also generate some of those specifically, and\nthen combine the random_knownbits_and_contained_number strategy with it:\nconstant_knownbits = strategies.builds(\n lambda value: (KnownBits.from_constant(value), value),\n ints\n)\n\nknownbits_and_contained_number = constant_knownbits | random_knownbits_and_contained_number\n\n\nNow we can write the first property-based tests, for the KnownBits.contains\nmethod:\n@given(knownbits_and_contained_number)\ndef test_contains(t):\n k, n = t\n assert k.contains(t)\n\n\nThe @given decorator is used to tell Hypothesis which strategy to use to\ngenerate random data for the test function. Hypothesis will run the test with a\nnumber of random examples (100 by default). If it finds an error, it will try to\nminimize the example needed that demonstrates the problem, to try to make it\neasier to understand what is going wrong. It also saves all failing cases into\nan example database and tries them again on subsequent runs.\nThis test is as much a check for whether we got the strategies right as it is\nfor the logic in KnownBits.contains. 
Here's an example output of random\nconcrete and abstract values that we are getting here:\n110000011001101 ...?0???1\n...1011011 ...1011011\n...1001101110101000010010011111011 ...1001101110101000010010011111011\n...1001101110101000010010011111011 ...100110111010100001?010?1??1??11\n1000001101111101001011010011111101000011000111011001011111101 1000001101111101001011010011111101000011000111011001011111101\n1000001101111101001011010011111101000011000111011001011111101 1000001101111101001011010011111101000011000111????01?11?????1\n1111100000010 1111100000010\n1111100000010 ...?11111?00000??\n110110 110110\n110110 ...?00?00????11??10\n110110 ??0??0\n...100010111011111 ...?100?10111??111?\n...1000100000110001 ...?000?00000??000?\n110000001110 ...?0?0??000?00?0?0000000?00???0000?????00???000?0?00?01?000?0??1??\n110000001110 ??000000???0\n1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000 1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000\n...1011010010010100 ...1011010010010100\n...1011111110110011 ...1011111110110011\n101000011110110 101000011?10?1?\n100101 ?00?0?\n\n\nThat looks suitably random, but we might want to bias our random numbers a\nlittle bit towards common error values like small constants, powers of two, etc.\nLike this:\nINTEGER_WIDTH = 64\n# some small integers\nints_special = set(range(100))\n# powers of two\nints_special = ints_special.union(1 << i for i in range(INTEGER_WIDTH - 2))\n# powers of two - 1\nints_special = ints_special.union((1 << i) - 1 for i in range(INTEGER_WIDTH - 2))\n# negative versions of what we have so far\nints_special = ints_special.union(-x for x in ints_special)\n# bit-flipped versions of what we have so far\nints_special = ints_special.union(~x for x in ints_special)\nints_special = list(ints_special)\n# sort them (because hypothesis simplifies towards earlier elements in 
the list)\nints_special.sort(key=lambda element: (abs(element), element < 0))\n\nints = strategies.sampled_from(ints_special) | strategies.integers()\n\n\nNow we get data like this:\n1110 1110\n...10000000000000000001 ...10000??0??0000??00?1\n1 ??0??0000??00?1\n1 ?\n...10101100 ...10101100\n110000000011001010111011111111111111011110010001001100110001011 ...?0?101?\n110000000011001010111011111111111111011110010001001100110001011 ??00000000??00?0?0???0??????????????0????00?000?00??00??000?0??\n...1011111111111111111111111111 ...?11?11??\n...1011111111111111111111111111 ...?0??????????????????????????\n0 ...?0??????????????????????????\n101101 101101\n111111111111111111111111111111111111111111111 111111111111111111111111111111111111111111111\n10111 10111\n...101100 ...1?111011?0\n101000 ?001010?0\n101000 ?0?000\n110010 110010\n...100111 ...100111\n1111011010010 1111011010010\n...1000000000000000000000000000000000000 ...1000000000000000000000000000000000000\n\n\nWe can also write a test that checks that the somewhat tricky logic in\n__str__ and from_str is correct, by making sure that the two functions\nround-trip (ie converting a KnownBits to a string and then back to a\nKnownBits instance produces the same abstract value).\n@given(knownbits_and_contained_number)\ndef test_hypothesis_str_roundtrips(t1):\n k1, n1 = t1\n s = str(k1)\n k2 = KnownBits.from_str(s)\n assert k1.ones == k2.ones\n assert k1.unknowns == k2.unknowns\n\n\nNow let's actually apply this infrastructure to test abstract_invert.\nWhen are Transfer Functions Correct? How do we test them?\nAbstract values, i.e. instances of KnownBits represent sets of concrete\nvalues. We want the transfer functions to compute overapproximations of the\nconcrete values. 
So if we have an arbitrary abstract value k, with a concrete\nnumber n that is a member of the abstract values (i.e.\nk.contains(n) == True) then the result of the concrete operation op(n)\nmust be a member of the result of the abstract operation k.abstract_op()\n(i.e. k.abstract_op().contains(op(n)) == True).\nChecking the correctness/overapproximation property is a good match for\nhypothesis. Here's what the test for abstract_invert looks like:\n@given(knownbits_and_contained_number)\ndef test_hypothesis_invert(t):\n k1, n1 = t1\n n2 = ~n1 # compute the real result\n k2 = k1.abstract_invert() # compute the abstract result\n assert k2.contains(n2) # the abstract result must contain the real result\n\n\nThis is the only condition needed for abstract_invert to be correct. If\nabstract_invert fulfils this property for every combination of abstract and\nconcrete value then abstract_invert is correct. Note however, that this test\ndoes not actually check whether abstract_invert gives us precise results. A\ncorrect (but imprecise) implementation of abstract_invert would simply return\na completely unknown result, regardless of what is known about the input\nKnownBits.\nThe \"proper\" CS term for this notion of correctness is called soundness. The\ncorrectness condition on the transfer functions is called a Galois\nconnection. I won't go into any mathematical/technical details here, but\nwanted to at least mention the terms. I found Martin\nKellogg's\nslides\nto be quite an approachable introduction to the Galois connection and how to\nshow soundness.\nImplementing Binary Transfer Functions\nNow we have infrastructure in place for testing transfer functions with random\ninputs. With that we can start thinking about the more complicated case, that of\nbinary operations. Let's start with the simpler ones, and and or. 
For and,\nwe can know a 0 bit in the result if either of the input bits are known 0;\nor we can know a 1 bit in the result if both input bits are known 1.\nOtherwise the resulting bit is unknown. Let's look at all the combinations:\nand\ninput1: 000111???\ninput2: 01?01?01? \nresult: 00001?0??\n\n\nclass KnownBits:\n ...\n\n def abstract_and(self, other):\n ones = self.ones & other.ones # known ones\n knowns = self.zeros | other.zeros | ones\n return KnownBits(ones, ~knowns)\n\n\nHere's an example unit-test and a property-based test for and:\ndef test_and():\n # test all combinations of 0, 1, ? in one example\n k1 = KnownBits.from_str('01?01?01?')\n k2 = KnownBits.from_str('000111???')\n res = k1.abstract_and(k2) # should be: 0...00001?0??\n assert str(res) == \"1?0??\"\n\n@given(knownbits_and_contained_number, knownbits_and_contained_number)\ndef test_hypothesis_and(t1, t2):\n k1, n1 = t1\n k2, n2 = t2\n k3 = k1.abstract_and(k2)\n n3 = n1 & n2\n assert k3.contains(n3)\n\n\nTo implement or is pretty similar. The result is known 1 where either of the\ninputs is 1. The result is known 0 where both inputs are known 0, and ?\notherwise.\nor\ninput1: 000111???\ninput2: 01?01?01? 
\nresult: 01?111?1?\n\n\nclass KnownBits:\n ...\n\n def abstract_or(self, other):\n ones = self.ones | other.ones\n zeros = self.zeros & other.zeros\n knowns = ones | zeros\n return KnownBits(ones, ~knowns)\n\n\nHere's an example unit-test and a property-based test for or:\ndef test_or():\n k1 = KnownBits.from_str('01?01?01?')\n k2 = KnownBits.from_str('000111???')\n res = k1.abstract_or(k2) # should be: 0...01?111?1?\n assert str(res) == \"1?111?1?\"\n\n@given(knownbits_and_contained_number, knownbits_and_contained_number)\ndef test_hypothesis_or(t1, t2):\n k1, n1 = t1\n k2, n2 = t2\n k3 = k1.abstract_or(k2)\n n3 = n1 | n2\n assert k3.contains(n3)\n\n\nImplementing support for abstract_xor is relatively simple, and left as an\nexercise :-).\nAddition and Subtraction\ninvert, and, and or are relatively simple transfer functions to write,\nbecause they compose over the individual bits of the integers. The arithmetic\nfunctions add and sub are significantly harder, because of carries and\nborrows. Coming up with the formulas for them and gaining an intuitive\nunderstanding is quite tricky and involves carefully going through a few\nexamples with pen and paper. When implementing this in PyPy, Nico and I didn't\ncome up with the implementation ourselves, but instead took them from the\nTristate Numbers paper. 
Here's the code,\nwith example tests and hypothesis tests:\nclass KnownBits:\n ...\n\n def abstract_add(self, other):\n sum_ones = self.ones + other.ones\n sum_unknowns = self.unknowns + other.unknowns\n all_carries = sum_ones + sum_unknowns\n ones_carries = all_carries ^ sum_ones\n unknowns = self.unknowns | other.unknowns | ones_carries\n ones = sum_ones & ~unknowns\n return KnownBits(ones, unknowns)\n\n def abstract_sub(self, other):\n diff_ones = self.ones - other.ones\n val_borrows = (diff_ones + self.unknowns) ^ (diff_ones - other.unknowns)\n unknowns = self.unknowns | other.unknowns | val_borrows\n ones = diff_ones & ~unknowns\n return KnownBits(ones, unknowns)\n\n\ndef test_add():\n k1 = KnownBits.from_str('0?10?10?10')\n k2 = KnownBits.from_str('0???111000')\n res = k1.abstract_add(k2)\n assert str(res) == \"?????01?10\"\n\ndef test_sub():\n k1 = KnownBits.from_str('0?10?10?10')\n k2 = KnownBits.from_str('0???111000')\n res = k1.abstract_sub(k2)\n assert str(res) == \"...?11?10\"\n k1 = KnownBits.from_str( '...1?10?10?10')\n k2 = KnownBits.from_str('...10000???111000')\n res = k1.abstract_sub(k2)\n assert str(res) == \"111?????11?10\"\n\n@given(knownbits_and_contained_number, knownbits_and_contained_number)\ndef test_hypothesis_add(t1, t2):\n k1, n1 = t1\n k2, n2 = t2\n k3 = k1.abstract_add(k2)\n n3 = n1 + n2\n assert k3.contains(n3)\n\n@given(knownbits_and_contained_number, knownbits_and_contained_number)\ndef test_hypothesis_sub(t1, t2):\n k1, n1 = t1\n k2, n2 = t2\n k3 = k1.abstract_sub(k2)\n n3 = n1 - n2\n assert k3.contains(n3)\n\n\nNow we are in a pretty good situation, and have implemented abstract versions\nfor a bunch of important arithmetic and binary functions. What's also surprising\nis that the implementation of all of the transfer functions is quite efficient.\nWe didn't have to write loops over the individual bits at all, instead we found\nclosed form expressions using primitive operations on the underlying integers\nones and unknowns. 
This means that computing the results of abstract\noperations is quite efficient, which is important when using the abstract domain\nin the context of a JIT compiler.\nProving correctness of the transfer functions with Z3\nAs one can probably tell from my recent posts, I've been thinking about\ncompiler correctness a lot. Getting the transfer functions absolutely\ncorrect is really crucial, because a bug in them would lead to miscompilation of\nPython code when the abstract domain is added to the JIT. While the randomized\ntests are great, it's still entirely possible for them to miss bugs. The state\nspace for the arguments of a binary transfer function is 3**64 * 3**64, and if\nonly a small part of that contains wrong behaviour it would be really unlikely\nfor us to find it with random tests by chance. Therefore I was reluctant to\nmerge the PyPy branch that contained the new abstract domain for a long time.\nTo increase our confidence in the correctness of the transfer functions further,\nwe can use Z3 to prove their correctness, which gives us much stronger\nguarantees (not 100%, obviously). 
In this subsection I will show how to do that.\nHere's an attempt to do this manually in the Python repl:\n>>>> import z3\n>>>> solver = z3.Solver()\n>>>> # like last blog post, proof by failing to find counterexamples\n>>>> def prove(cond): assert solver.check(z3.Not(cond)) == z3.unsat\n>>>>\n>>>> # let's set up a z3 bitvector variable for an arbitrary concrete value\n>>>> n1 = z3.BitVec('concrete_value', 64)\n>>>> n1\nconcrete_value\n>>>> # due to operator overloading we can manipulate z3 formulas\n>>>> n2 = ~n1\n>>>> n2\n~concrete_value\n>>>> \n>>>> # now z3 bitvector variables for the ones and zeros fields\n>>>> ones = z3.BitVec('abstract_ones', 64)\n>>>> unknowns = z3.BitVec('abstract_unknowns', 64)\n>>>> # we construct a KnownBits instance with the z3 variables\n>>>> k1 = KnownBits(ones, unknowns)\n>>>> # due to operator overloading we can call the methods on k1:\n>>>> k2 = k1.abstract_invert()\n>>>> k2.ones\n~abstract_unknowns & ~abstract_ones\n>>>> k2.unknowns\nabstract_unknowns\n>>>> # here's the correctness condition that we want to prove:\n>>>> k2.contains(n2)\n~concrete_value & ~abstract_unknowns ==\n~abstract_unknowns & ~abstract_ones\n>>>> # let's try\n>>>> prove(k2.contains(n2))\nTraceback (most recent call last):\n File \"\", line 1, in \n File \"\", line 1, in prove\nAssertionError\n>>>> # it doesn't work! let's look at the counterexample to see why:\n>>>> solver.model()\n[abstract_unknowns = 0,\n abstract_ones = 0,\n concrete_value = 1]\n>>>> # we can build a KnownBits instance with the values in the\n>>>> # counterexample:\n>>>> ~1 # concrete result\n-2\n>>>> counter_example_k1 = KnownBits(0, 0)\n>>>> counter_example_k1\nKnownBits.from_constant(0)\n>>>> counter_example_k2 = counter_example_k1.abstract_invert()\n>>>> counter_example_k2\nKnownBits.from_constant(-1)\n>>>> # let's check the failing condition\n>>>> counter_example_k2.contains(~1)\nFalse\n\n\nWhat is the problem here? We didn't tell Z3 that n1 was supposed to be a\nmember of k1. 
We can add this as a precondition to the solver, and then the\nprove works:\n>>>> solver.add(k1.contains(n1))\n>>>> prove(k2.contains(n2)) # works!\n\n\nThis is super cool! It's really a proof about the actual implementation, because\nwe call the implementation methods directly, and due to the operator overloading\nthat Z3 does we can be sure that we are actually checking a formula that\ncorresponds to the Python code. This eliminates one source of errors in formal\nmethods.\nDoing the proof manually on the Python REPL is kind of annoying though, and we\nalso would like to make sure that the proofs are re-done when we change the\ncode. What we would really like to do is writing the proofs as a unit-test that\nwe can run while developing and in CI. Doing this is possible, and the unit\ntests that really perform proofs look pleasingly similar to the\nHypothesis-based ones.\nFirst we need to set up a bit of infrastructure:\nINTEGER_WIDTH = 64\n\ndef BitVec(name):\n return z3.BitVec(name, INTEGER_WIDTH)\n\ndef BitVecVal(val):\n return z3.BitVecVal(val, INTEGER_WIDTH)\n\ndef z3_setup_variables():\n # instantiate a solver\n solver = z3.Solver()\n\n # a Z3 variable for the first concrete value\n n1 = BitVec(\"n1\")\n # a KnownBits instances that uses Z3 variables as its ones and unknowns,\n # representing the first abstract value\n k1 = KnownBits(BitVec(\"n1_ones\"), BitVec(\"n1_unkowns\"))\n # add the precondition to the solver that the concrete value n1 must be a\n # member of the abstract value k1\n solver.add(k1.contains(n1))\n\n # a Z3 variable for the second concrete value\n n2 = BitVec(\"n2\")\n # a KnownBits instances for the second abstract value\n k2 = KnownBits(BitVec(\"n2_ones\"), BitVec(\"n2_unkowns\"))\n # add the precondition linking n2 and k2 to the solver\n solver.add(k2.contains(n2))\n return solver, k1, n1, k2, n2\n\ndef prove(cond, solver):\n z3res = solver.check(z3.Not(cond))\n if z3res != z3.unsat:\n assert z3res == z3.sat # can't be timeout, we set 
no timeout\n # make the model with the counterexample global, to make inspecting the\n # bug easier when running pytest --pdb\n global model\n model = solver.model()\n print(f\"n1={model.eval(n1)}, n2={model.eval(n2)}\")\n counter_example_k1 = KnownBits(model.eval(k1.ones).as_signed_long(),\n model.eval(k1.unknowns).as_signed_long())\n counter_example_k2 = KnownBits(model.eval(k2.ones).as_signed_long(),\n model.eval(k2.unknowns).as_signed_long())\n print(f\"k1={counter_example_k1}, k2={counter_example_k2}\")\n print(f\"but {cond=} evaluates to {model.eval(cond)}\")\n raise ValueError(solver.model())\n\n\nAnd then we can write proof-unit-tests like this:\ndef test_z3_abstract_invert():\n solver, k1, n1, _, _ = z3_setup_variables()\n k2 = k1.abstract_invert()\n n2 = ~n1\n prove(k2.contains(n2), solver)\n\ndef test_z3_abstract_and():\n solver, k1, n1, k2, n2 = z3_setup_variables()\n k3 = k1.abstract_and(k2)\n n3 = n1 & n2\n prove(k3.contains(n3), solver)\n\ndef test_z3_abstract_or():\n solver, k1, n1, k2, n2 = z3_setup_variables()\n k3 = k1.abstract_or(k2)\n n3 = n1 | n2\n prove(k3.contains(n3), solver)\n\ndef test_z3_abstract_add():\n solver, k1, n1, k2, n2 = z3_setup_variables()\n k3 = k1.abstract_add(k2)\n n3 = n1 + n2\n prove(k3.contains(n3), solver)\n\ndef test_z3_abstract_sub():\n solver, k1, n1, k2, n2 = z3_setup_variables()\n k3 = k1.abstract_sub(k2)\n n3 = n1 - n2\n prove(k3.contains(n3), solver)\n\n\nIt's possible to write a bit more Python-metaprogramming-magic and unify the\nHypothesis and Z3 tests into the same test definition.1\nCases where this style of Z3 proof doesn't work\nUnfortunately the approach described in the previous section only works for a\nvery small number of cases. It breaks down as soon as the KnownBits methods\nthat we're calling contain any if conditions (including hidden ones like\nthe short-circuiting and and or in Python). Let's look at an example and\nimplement abstract_eq. 
eq is supposed to be an operation that compares two\nintegers and returns 0 or 1 if they are different or equal, respectively.\nImplementing this in knownbits looks like this (with example and hypothesis\ntests):\nclass KnownBits:\n ...\n\n def abstract_eq(self, other):\n # the result is a 0, 1, or ?\n\n # if they are both the same constant, they must be equal\n if self.is_constant() and other.is_constant() and self.ones == other.ones:\n return KnownBits.from_constant(1)\n # check whether we have known disagreeing bits, then we know the result\n # is 0\n if self._disagrees(other):\n return KnownBits.from_constant(0)\n return KnownBits(0, 1) # an unknown boolean\n\n def _disagrees(self, other):\n # check whether the bits disagree in any place where both are known\n both_known = self.knowns & other.knowns\n return self.ones & both_known != other.ones & both_known\n\ndef test_eq():\n k1 = KnownBits.from_str('...?')\n k2 = KnownBits.from_str('...?')\n assert str(k1.abstract_eq(k2)) == '?'\n k1 = KnownBits.from_constant(10)\n assert str(k1.abstract_eq(k1)) == '1'\n k1 = KnownBits.from_constant(10)\n k2 = KnownBits.from_constant(20)\n assert str(k1.abstract_eq(k2)) == '0'\n\n@given(knownbits_and_contained_number, knownbits_and_contained_number)\ndef test_hypothesis_eq(t1, t2):\n k1, n1 = t1\n k2, n2 = t2\n k3 = k1.abstract_eq(k2)\n assert k3.contains(int(n1 == n2))\n\n\nTrying to do the proof in the same style as before breaks:\n>>>> k3 = k1.abstract_eq(k2)\nTraceback (most recent call last):\n File \"\", line 1, in \n File \"knownbits.py\", line 246, in abstract_eq\n if self._disagrees(other):\n File \"venv/site-packages/z3/z3.py\", line 381, in __bool__\n raise Z3Exception(\"Symbolic expressions cannot be cast to concrete Boolean values.\")\nz3.z3types.Z3Exception: Symbolic expressions cannot be cast to concrete Boolean values.\n\n\nWe cannot call abstract_eq on a KnownBits with Z3 variables as fields,\nbecause once we hit an if statement, the whole approach of relying 
on the\noperator overloading breaks down. Z3 doesn't actually parse the Python code or\nanything advanced like that, we rather build an expression only by running the\ncode and letting the Z3 formulas build up.\nTo still prove the correctness of abstract_eq we need to manually transform\nthe control flow logic of the function into a Z3 formula that uses the z3.If\nexpression, using a small helper function:\ndef z3_cond(b, trueval=1, falseval=0):\n return z3.If(b, BitVecVal(trueval), BitVecVal(falseval))\n\ndef z3_abstract_eq(k1, k2):\n # follow the *logic* of abstract_eq, we can't call it due to the ifs in it\n case1cond = z3.And(k1.is_constant(), k2.is_constant(), k1.ones == k2.ones)\n case2cond = k1._disagrees(k2)\n\n # ones is 1 in the first case, 0 otherwise\n ones = z3_cond(case1cond, 1, 0)\n\n # in the first two cases, unknowns is 0, 1 otherwise\n unknowns = z3_cond(z3.Or(case1cond, case2cond), 0, 1)\n return KnownBits(ones, unknowns)\n\ndef test_z3_abstract_eq_logic():\n solver, k1, n1, k2, n2 = z3_setup_variables()\n n3 = z3_cond(n1 == n2) # concrete result\n k3 = z3_abstract_eq(k1, k2)\n prove(k3.contains(n3), solver)\n\n\nThis proof works. It is a lot less satisfying than the previous ones though,\nbecause we could have made an error in the manual transcription from Python code\nto Z3 formulas (there are possibly more heavy-handed approaches where we do\nthis transformation more automatically using e.g. the ast module to analyze\nthe source code, but that's a much more complicated researchy project). 
To\nlessen this problem somewhat we can factor out the parts of the logic that don't\nhave any conditions into small helper methods (like _disagrees in this\nexample) and use them in the manual conversion of the code to Z3 formulas.2\nThe final condition that Z3 checks, btw, is this one:\nIf(n1 == n2, 1, 0) &\n~If(Or(And(n1_unkowns == 0,\n n2_unkowns == 0,\n n1_ones == n2_ones),\n n1_ones & ~n1_unkowns & ~n2_unkowns !=\n n2_ones & ~n1_unkowns & ~n2_unkowns),\n 0, 1) ==\nIf(And(n1_unkowns == 0, n2_unkowns == 0, n1_ones == n2_ones),\n 1, 0)\n\n\nMaking Statements about Precision\nSo far we have only used Z3 to prove statements about correctness, i.e. that\nour abstract operations overapproximate what can happen with concrete values.\nWhile proving this property is essential if we want to avoid miscompilation,\ncorrectness alone is not a very strong constraint on the implementation of our\nabstract transfer functions. We could simply return KnownBits.all_unknown() for\nevery abstract_* method and the resulting overapproximation would be correct,\nbut useless in practice.\nIt's much harder to make statements about whether the transfer functions are\nmaximally precise. There are two aspects of precision I want to discuss in this\nsection, however.\nThe first aspect is that we would really like it if the transfer functions\ncompute the maximally precise results for singleton sets. If all abstract\narguments of an operation are constants, i.e. contain only a single concrete\nelement, then we know that the resulting set also has only a single element. 
We\ncan prove that all our transfer functions have this property:\ndef test_z3_prove_constant_folding():\n solver, k1, n1, k2, n2 = z3_setup_variables()\n k3 = k1.abstract_invert()\n prove(z3.Implies(k1.is_constant(),\n k3.is_constant()), solver)\n\n k3 = k1.abstract_and(k2)\n prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),\n k3.is_constant()), solver)\n\n k3 = k1.abstract_or(k2)\n prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),\n k3.is_constant()), solver)\n\n k3 = k1.abstract_sub(k2)\n prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),\n k3.is_constant()), solver)\n\n k3 = z3_abstract_eq(k1, k2)\n prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),\n k3.is_constant()), solver)\n\n\nProving with Z3 that the transfer functions are maximally precise for\nnon-constant arguments seems to be relatively hard. I tried a few completely\nrigorous approaches and failed. The paper Sound, Precise, and Fast Abstract\nInterpretation with Tristate Numbers\ncontains an optimality proof for the transfer functions of addition and\nsubtraction, so we can be certain that they are as precise as is\npossible.\nI still want to show an approach for trying to find concrete examples of\nabstract values that are less precise than they could be, using a combination\nof Hypothesis and Z3. The idea is to use hypothesis to pick random abstract\nvalues. Then we compute the abstract result using our transfer function.\nAfterwards we can ask Z3 to find us an abstract result that is better than the\none our transfer function produced. If Z3 finds a better abstract result, we\nhave a concrete example of imprecision for our transfer function. 
Those tests\naren't strict proofs, because they rely on generating random abstract values,\nbut they can still be valuable (not for the transfer functions in this blog\npost, which are all optimal).\nHere is what the code looks like (this is a little bit bonus content, I'll not\nexplain the details and can only hope that the comments are somewhat helpful):\n@given(random_knownbits_and_contained_number, random_knownbits_and_contained_number)\n@settings(deadline=None)\ndef test_check_precision(t1, t2):\n k1, n1 = t1\n k2, n2 = t2\n # apply transfer function\n k3 = k1.abstract_add(k2)\n example_res = n1 + n2\n\n # try to find a better version of k3 with Z3\n solver = z3.Solver()\n solver.set(\"timeout\", 8000)\n\n var1 = BitVec('v1')\n var2 = BitVec('v2')\n\n ones = BitVec('ones')\n unknowns = BitVec('unknowns')\n better_k3 = KnownBits(ones, unknowns)\n print(k1, k2, k3)\n\n # we're trying to find an example for a better k3, so we use check, without\n # negation:\n res = solver.check(z3.And(\n # better_k3 should be a valid knownbits instance\n better_k3.is_well_formed(),\n # it should be better than k3, ie there are known bits in better_k3\n # that we don't have in k3\n better_k3.knowns & ~k3.knowns != 0,\n # now encode the correctness condition for better_k3 with a ForAll:\n # for all concrete values var1 and var2, it must hold that if\n # var1 is in k1 and var2 is in k2 it follows that var1 + var2 is in\n # better_k3\n z3.ForAll(\n [var1, var2],\n z3.Implies(\n z3.And(k1.contains(var1), k2.contains(var2)),\n better_k3.contains(var1 + var2)))))\n # if this query is satisfiable, we have found a better result for the\n # abstract_add\n if res == z3.sat:\n model = solver.model()\n rk3 = KnownBits(model.eval(ones).as_signed_long(), model.eval(unknowns).as_signed_long())\n print(\"better\", rk3)\n assert 0\n if res == z3.unknown:\n print(\"timeout\")\n\n\nIt does not actually fail for abstract_add (nor the other abstract\nfunctions). 
To see the test failing we can add some imprecision to the\nimplementation of abstract_add to see Hypothesis and Z3 find examples of\nvalues that are not optimally precise (for example by setting some bits\nof unknowns in the implementation of abstract_add unconditionally).\nUsing the Abstract Domain in the Toy Optimizer for Generalized Constant Folding\nNow after all this work we can finally actually use the knownbits abstract\ndomain in the toy optimizer. The code for this follows Max' intro post about\nabstract interpretation\nquite closely.\nFor completeness sake, in the fold there's the basic infrastructure classes\nthat make up the IR again (they are identical or at least extremely close to\nthe previous toy posts).\n\ntoy infrastructure\n\nclass Value:\n def find(self):\n raise NotImplementedError(\"abstract\")\n\n\n@dataclass(eq=False)\nclass Operation(Value):\n name : str\n args : list[Value]\n\n forwarded : Optional[Value] = None\n\n def find(self) -> Value:\n op = self\n while isinstance(op, Operation):\n next = op.forwarded\n if next is None:\n return op\n op = next\n return op\n\n def arg(self, index):\n return self.args[index].find()\n\n def make_equal_to(self, value : Value):\n self.find().forwarded = value\n\n\n@dataclass(eq=False)\nclass Constant(Value):\n value : object\n\n def find(self):\n return self\n\n\nclass Block(list):\n def __getattr__(self, opname):\n def wraparg(arg):\n if not isinstance(arg, Value):\n arg = Constant(arg)\n return arg\n def make_op(*args):\n op = Operation(opname,\n [wraparg(arg) for arg in args])\n self.append(op)\n return op\n return make_op\n\n\ndef bb_to_str(l : Block, varprefix : str = \"var\"):\n def arg_to_str(arg : Value):\n if isinstance(arg, Constant):\n return str(arg.value)\n else:\n return varnames[arg]\n\n varnames = {}\n res = []\n for index, op in enumerate(l):\n # give the operation a name used while\n # printing:\n var = f\"{varprefix}{index}\"\n varnames[op] = var\n arguments = \", \".join(\n 
arg_to_str(op.arg(i))\n for i in range(len(op.args))\n )\n strop = f\"{var} = {op.name}({arguments})\"\n res.append(strop)\n return \"\\n\".join(res)\n\n\n\n\n\n\nNow we can write some first tests, the first one simply checking constant\nfolding:\ndef test_constfold_two_ops():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.int_add(5, 4)\n var2 = bb.int_add(var1, 10)\n var3 = bb.int_add(var2, var0)\n\n opt_bb = simplify(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = int_add(19, optvar0)\"\"\"\n\n\nCalling the transfer functions on constant KnownBits produces a constant\nresult, as we have seen. Therefore \"regular\" constant folding should hopefully\nbe achieved by optimizing with the KnownBits abstract domain too.\nThe next two tests are slightly more complicated and can't be optimized by\nregular constant-folding. They follow the motivating examples from the start of\nthis blog post, a hundred years ago:\ndef test_constfold_via_knownbits():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.int_or(var0, 1)\n var2 = bb.int_and(var1, 1)\n var3 = bb.dummy(var2)\n\n opt_bb = simplify(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = int_or(optvar0, 1)\noptvar2 = dummy(1)\"\"\"\n\ndef test_constfold_alignment_check():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.int_invert(0b111)\n # mask off the lowest three bits, thus var2 is aligned\n var2 = bb.int_and(var0, var1)\n # add 16 to aligned quantity\n var3 = bb.int_add(var2, 16)\n # check alignment of result\n var4 = bb.int_and(var3, 0b111)\n var5 = bb.int_eq(var4, 0)\n # var5 should be const-folded to 1\n var6 = bb.dummy(var5)\n\n opt_bb = simplify(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = int_and(optvar0, -8)\noptvar2 = int_add(optvar1, 16)\noptvar3 = dummy(1)\"\"\"\n\n\nHere is simplify to make these tests pass:\ndef unknown_transfer_functions(*abstract_args):\n return 
KnownBits.all_unknown()\n\n\ndef simplify(bb: Block) -> Block:\n abstract_values = {} # dict mapping Operation to KnownBits\n\n def knownbits_of(val : Value):\n if isinstance(val, Constant):\n return KnownBits.from_constant(val.value)\n return abstract_values[val]\n\n opt_bb = Block()\n for op in bb:\n # apply the transfer function on the abstract arguments\n name_without_prefix = op.name.removeprefix(\"int_\")\n method_name = f\"abstract_{name_without_prefix}\"\n transfer_function = getattr(KnownBits, method_name, unknown_transfer_functions)\n abstract_args = [knownbits_of(arg.find()) for arg in op.args]\n abstract_res = abstract_values[op] = transfer_function(*abstract_args)\n # if the result is a constant, we optimize the operation away and make\n # it equal to the constant result\n if abstract_res.is_constant():\n op.make_equal_to(Constant(abstract_res.ones))\n continue\n # otherwise emit the op\n opt_bb.append(op)\n return opt_bb\n\n\nThe code follows the approach from the previous blog post very closely. The\nonly difference is that we apply the transfer function first, to be able to\ndetect whether the abstract domain can tell us that the result has to always be\na constant. This code makes all three tests pass.\nUsing the KnownBits Domain for Conditional Peephole Rewrites\nSo far we are only using the KnownBits domain to find out that certain\noperations have to produce a constant. We can also use the KnownBits domain\nto check whether certain operation rewrites are correct. Let's use one of the\nexamples from the Mining JIT traces for missing optimizations with\nZ3\npost, where Z3 found the inefficiency (x << 4) & -0xf == x << 4 in PyPy JIT\ntraces. We don't have shift operations, but we want to generalize this optimization\nanyway. 
The general form of this rewrite is that under some circumstances x &\ny == x, and we can use the KnownBits domain to detect situations where this\nmust be true.\nTo understand when x & y == x is true, we can think about individual pairs of\nbits a and b. If a == 0, then a & b == 0 & b == 0 == a. If b == 1\nthen a & b == a & 1 == a. So if either a == 0 or b == 1 is true,\na & b == a follows. And if either of these conditions is true for all the\nbits of x and y, we can know that x & y == x.\nWe can write a method on KnownBits to check for this condition:\nclass KnownBits:\n ...\n\n def is_and_identity(self, other):\n \"\"\" Return True if n1 & n2 == n1 for any n1 in self and n2 in other.\n (or, equivalently, return True if n1 | n2 == n2)\"\"\"\n return self.zeros | other.ones == -1\n\n\nSince my reasoning about this feels ripe for errors, let's check that our\nunderstanding is correct with Z3:\ndef test_prove_is_and_identity():\n solver, k1, n1, k2, n2 = z3_setup_variables()\n prove(z3.Implies(k1.is_and_identity(k2), n1 & n2 == n1), solver)\n\n\nNow let's use this in the toy optimizer. 
Here are two tests for this rewrite:\ndef test_remove_redundant_and():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.int_invert(0b1111)\n # mask off the lowest four bits\n var2 = bb.int_and(var0, var1)\n # applying the same mask is now redundant\n var3 = bb.int_and(var2, var1)\n var4 = bb.dummy(var3)\n\n opt_bb = simplify(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = int_and(optvar0, -16)\noptvar2 = dummy(optvar1)\"\"\"\n\ndef test_remove_redundant_and_more_complex():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.getarg(1)\n # var2 has bit pattern ????\n var2 = bb.int_and(var0, 0b1111)\n # var3 has bit pattern ...?1111\n var3 = bb.int_or(var1, 0b1111)\n # var4 is just var2\n var4 = bb.int_and(var2, var3)\n var5 = bb.dummy(var4)\n\n opt_bb = simplify(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = getarg(1)\noptvar2 = int_and(optvar0, 15)\noptvar3 = int_or(optvar1, 15)\noptvar4 = dummy(optvar2)\"\"\"\n\n\nThe first test could also be made to pass by implementing a reassociation\noptimization that turns (x & c1) & c2 into x & (c1 & c2) and then constant-folds the second and. But here we want to\nuse KnownBits and conditionally rewrite int_and to its first argument. 
So to make the tests pass,\nwe can change simplify like this:\ndef simplify(bb: Block) -> Block:\n abstract_values = {} # dict mapping Operation to KnownBits\n\n def knownbits_of(val : Value):\n ...\n\n opt_bb = Block()\n for op in bb:\n # apply the transfer function on the abstract arguments\n name_without_prefix = op.name.removeprefix(\"int_\")\n method_name = f\"abstract_{name_without_prefix}\"\n transfer_function = getattr(KnownBits, method_name, unknown_transfer_functions)\n abstract_args = [knownbits_of(arg.find()) for arg in op.args]\n abstract_res = abstract_values[op] = transfer_function(*abstract_args)\n # if the result is a constant, we optimize the operation away and make\n # it equal to the constant result\n if abstract_res.is_constant():\n op.make_equal_to(Constant(abstract_res.ones))\n continue\n # <<<< new code\n # conditionally rewrite int_and(x, y) to x\n if op.name == \"int_and\":\n k1, k2 = abstract_args\n if k1.is_and_identity(k2):\n op.make_equal_to(op.arg(0))\n continue\n # >>>> end changes\n opt_bb.append(op)\n return opt_bb\n\n\nAnd with that, the new tests pass as well. A real implementation would also\ncheck the other argument order, but we leave that out for the sake of brevity.\nThis rewrite also generalizes the rewrites int_and(0, x) -> 0 and\nint_and(-1, x) -> x, let's add a test for those:\ndef test_remove_and_simple():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.getarg(1)\n var2 = bb.int_and(0, var0) # == 0\n var3 = bb.int_invert(var2) # == -1\n var4 = bb.int_and(var1, var3) # == var1\n var5 = bb.dummy(var4)\n\n opt_bb = simplify(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = getarg(1)\noptvar2 = dummy(optvar1)\"\"\"\n\n\nThis test just passes. 
And that's it for this post!\nConclusion\nIn this post we've seen the implementation, testing and proofs about a 'known\nbits' abstract domain, as well as its use in the toy optimizer to generalize\nconstant folding, and to implement conditional peephole rewrites.\nIn the next posts I'll write about the real implementation of a knownbits\ndomain in PyPy's JIT, its combination with the existing interval abstract\ndomain, how to deal with gaining information from conditions in the program,\nand some loose ends.\nSources:\n\nKnown bits in LLVM\nTristate numbers for known bits in Linux eBPF\nSound, Precise, and Fast Abstract Interpretation with Tristate Numbers\nVerifying the Verifier: eBPF Range Analysis Verification\nBit-Twiddling: Addition with Unknown\n Bits\n is a super readable blog post by Dougall J. I've taken the ones and\n unknowns naming from this post, which I find significantly clearer than\n value and mask, which the Linux kernel uses.\nBits, Math and Performance(?), a fantastic\n blog by Harold Aptroot. There are a\n lot of relevant posts about known bits, range analysis etc. Harold is also\n the author of Haroldbot, a website that can be used\n for bitvector calculations, and also checks bitvector identities.\nSharpening Constraint Programming approaches for Bit-Vector Theory\nDeriving Abstract Transfer Functions for Analyzing Embedded Software\nSynthesizing Abstract Transformers\n\n\n\n\n\nThere's a subtlety about the Z3 proofs that I'm sort of\nglossing over here. Python integers are of arbitrary width, and the\nKnownBits code is actually carefully written to work for integers of any\nsize. This property is tested by the Hypothesis tests, which don't limit\nthe sizes of the generated random integers. However, the Z3 proofs only\ncheck bitvectors of a fixed bitwidth of 64. There are various ways to deal\nwith this situation. For most \"real\" compilers, the bitwidth of integers\nwould be fixed anyway. 
Then the components ones and unknowns of the\nKnownBits class would use the number of bits the corresponding integer\nvariable has, and the Z3 proofs would use the same width. This is what we\ndo in the PyPy JIT.\u00a0\u21a9\n\n\nThe less close connection between implementation and proof\nfor abstract_eq is one of the reasons why it makes sense to do\nunit-testing in addition to proofs. For a more detailed explanation of\nwhy both tests and proofs are good to\nhave, see Jeremy Siek's blog\npost,\nas well as the Knuth\nquote.\u00a0\u21a9", + "tags": "toy-optimizer,z3", + "url": "https://www.pypy.org/posts/2024/08/toy-knownbits.html" + }, + { + "title": "Abstract interpretation in the Toy Optimizer", + "text": "This is a cross-post\nfrom Max Bernstein from his excellent blog where he writes about programming\nlanguages, compilers, optimizations, virtual machines. He's looking for a\n(dynamic language runtime or compiler related) job too.\n\nCF Bolz-Tereick wrote some excellent posts in which they introduce a small IR\nand optimizer and extend it with allocation\nremoval. We also did a live stream together in which\nwe did some more heap optimizations.\nIn this blog post, I'm going to write a small abstract interpreter for the Toy\nIR and then show how we can use it to do some simple optimizations. It assumes\nthat you are familiar with the little IR, which I have reproduced unchanged in\na GitHub Gist.\nAbstract interpretation is a general framework for efficiently computing\nproperties that must be true for all possible executions of a program. It's a\nwidely used approach both in compiler optimizations as well as offline static\nanalysis for finding bugs. I'm writing this post to pave the way for CF's next\npost on proving abstract interpreters correct for range analysis and known bits\nanalysis inside PyPy.\nBefore we begin, I want to note a couple of things:\n\nThe Toy IR is in SSA form, which means that every variable is defined exactly\n once. 
This means that abstract properties of each variable are easy to track.\nThe Toy IR represents a linear trace without control flow, meaning we won't\n talk about meet/join or fixpoints. They only make sense if the IR has a\n notion of conditional branches or back edges (loops).\n\nAlright, let's get started.\nWelcome to abstract interpretation\nAbstract interpretation means a couple different things to different people.\nThere's rigorous mathematical formalism thanks to Patrick and Radhia Cousot,\nour favorite power couple, and there's also sketchy hand-wavy stuff like what\nwill follow in this post. In the end, all people are trying to do is reason\nabout program behavior without running it.\nIn particular, abstract interpretation is an over-approximation of the\nbehavior of a program. Correctly implemented abstract interpreters never lie,\nbut they might be a little bit pessimistic. This is because instead of using\nreal values and running the program---which would produce a concrete result and\nsome real-world behavior---we \"run\" the program with a parallel universe of\nabstract values. This abstract run gives us information about all possible\nruns of the program.1\nAbstract values always represent sets of concrete values. Instead of literally\nstoring a set (in the world of integers, for example, it could get pretty\nbig...there are a lot of integers), we group them into a finite number of named\nsubsets.2\nLet's learn a little about abstract interpretation with an example program and\nexample abstract domain. Here's the example program:\nv0 = 1\nv1 = 2\nv2 = add(v0, v1)\n\n\nAnd our abstract domain is \"is the number positive\" (where \"positive\" means\nnonnegative, but I wanted to keep the words distinct):\n top\n / \\\npositive negative\n \\ /\n bottom\n\n\nThe special top value means \"I don't know\" and the special bottom value\nmeans \"empty set\" or \"unreachable\". 
The positive and negative values\nrepresent the sets of all positive and negative numbers, respectively.\nWe initialize all the variables v0, v1, and v2 to bottom and then walk\nour IR, updating our knowledge as we go.\n# here\nv0:bottom = 1\nv1:bottom = 2\nv2:bottom = add(v0, v1)\n\n\nIn order to do that, we have to have transfer functions for each operation.\nFor constants, the transfer function is easy: determine if the constant is\npositive or negative. For other operations, we have to define a function that\ntakes the abstract values of the operands and returns the abstract value of the\nresult.\nIn order to be correct, transfer functions for operations have to be compatible\nwith the behavior of their corresponding concrete implementations. You can\nthink of them having an implicit universal quantifier forall in front of\nthem.\nLet's step through the constants at least:\nv0:positive = 1\nv1:positive = 2\n# here\nv2:bottom = add(v0, v1)\n\n\nNow we need to figure out the transfer function for add. It's kind of tricky\nright now because we haven't specified our abstract domain very well. I keep\nsaying \"numbers\", but what kinds of numbers? Integers? Real numbers? Floating\npoint? Some kind of fixed-width bit vector (int8, uint32, ...) like an\nactual machine \"integer\"?\nFor this post, I am going to use the mathematical definition of integer, which\nmeans that the values are not bounded in size and therefore do not overflow.\nActual hardware memory constraints aside, this is kind of like a Python int.\nSo let's look at what happens when we add two abstract numbers:\n\n\n\n\ntop\npositive\nnegative\nbottom\n\n\n\n\ntop\ntop\ntop\ntop\nbottom\n\n\npositive\ntop\npositive\ntop\nbottom\n\n\nnegative\ntop\ntop\nnegative\nbottom\n\n\nbottom\nbottom\nbottom\nbottom\nbottom\n\n\n\nAs an example, let's try to add two numbers a and b, where a is positive\nand b is negative. We don't know anything about their values other than their\nsigns. 
They could be 5 and -3, where the result is 2, or they could be\n1 and -100, where the result is -99. This is why we can't say anything\nabout the result of this operation and have to return top.\nThe short of this table is that we only really know the result of an addition\nif both operands are positive or both operands are negative. Thankfully, in\nthis example, both operands are known positive. So we can learn something about\nv2:\nv0:positive = 1\nv1:positive = 2\nv2:positive = add(v0, v1)\n# here\n\n\nThis may not seem useful in isolation, but analyzing more complex programs even\nwith this simple domain may be able to remove checks such as if (v2 < 0) { ... }.\nLet's take a look at another example using a sample absval (absolute value)\nIR operation:\nv0 = getarg(0)\nv1 = getarg(1)\nv2 = absval(v0)\nv3 = absval(v1)\nv4 = add(v2, v3)\nv5 = absval(v4)\n\n\nEven though we have no constant/concrete values, we can still learn something\nabout the states of values throughout the program. Since we know that absval\nalways returns a positive number, we learn that v2, v3, and v4 are all\npositive. This means that we can optimize out the absval operation on v5:\nv0:top = getarg(0)\nv1:top = getarg(1)\nv2:positive = absval(v0)\nv3:positive = absval(v1)\nv4:positive = add(v2, v3)\nv5:positive = v4\n\n\nOther interesting lattices include:\n\nConstants (where the middle row is pretty wide)\nRange analysis (bounds on min and max of a number)\nKnown bits (using a bitvector representation of a number, which bits are\n always 0 or 1)\n\nFor the rest of this blog post, we are going to do a very limited version of\n\"known bits\", called parity. 
This analysis only tracks the least significant\nbit of a number, which indicates if it is even or odd.\nParity\nThe lattice is pretty similar to the positive/negative lattice:\n top\n / \\\neven odd\n \\ /\n bottom\n\n\nLet's define a data structure to represent this in Python code:\nclass Parity:\n def __init__(self, name):\n self.name = name\n\n def __repr__(self):\n return self.name\n\n\nAnd instantiate the members of the lattice:\nTOP = Parity(\"top\")\nEVEN = Parity(\"even\")\nODD = Parity(\"odd\")\nBOTTOM = Parity(\"bottom\")\n\n\nNow let's write a forward flow analysis of a basic block using this lattice.\nWe'll do that by assuming that a method on Parity is defined for each IR\noperation. For example, Parity.add, Parity.lshift, etc.\ndef analyze(block: Block) -> None:\n parity = {v: BOTTOM for v in block}\n\n def parity_of(value):\n if isinstance(value, Constant):\n return Parity.const(value)\n return parity[value]\n\n for op in block:\n transfer = getattr(Parity, op.name)\n args = [parity_of(arg.find()) for arg in op.args]\n parity[op] = transfer(*args)\n\n\nFor every operation, we compute the abstract value---the parity---of the\narguments and then call the corresponding method on Parity to get the\nabstract result.\n\nWe need to special case Constants due to a quirk of how the Toy IR is\nconstructed: the constants don't appear in the instruction stream and instead\nare free-floating.\nLet's start by looking at the abstraction function for concrete\nvalues---constants:\nclass Parity:\n # ...\n @staticmethod\n def const(value):\n if value.value % 2 == 0:\n return EVEN\n else:\n return ODD\n\n\nSeems reasonable enough. 
Let's pause on operations for a moment and consider an\nexample program:\nv0 = getarg(0)\nv1 = getarg(1)\nv2 = lshift(v0, 1)\nv3 = lshift(v1, 1)\nv4 = add(v2, v3)\nv5 = dummy(v4)\n\n\nThis function (which is admittedly a little contrived) takes two inputs, shifts\nthem left by one bit, adds the result, and then checks the least significant\nbit of the addition result. It then passes that result into a dummy function,\nwhich you can think of as \"return\" or \"escape\".\nTo do some abstract interpretation on this program, we'll need to implement the\ntransfer functions for lshift and add (dummy will just always return\nTOP). We'll start with add. Remember that adding two even numbers returns\nan even number, adding two odd numbers returns an even number, and mixing even\nand odd returns an odd number.\nclass Parity:\n # ...\n def add(self, other):\n if self is BOTTOM or other is BOTTOM:\n return BOTTOM\n if self is TOP or other is TOP:\n return TOP\n if self is EVEN and other is EVEN:\n return EVEN\n if self is ODD and other is ODD:\n return EVEN\n return ODD\n\n\nWe also need to fill in the other cases where the operands are top or\nbottom. In this case, they are both \"contagious\"; if either operand is\nbottom, the result is as well. If neither is bottom but either operand is top,\nthe result is as well.\nNow let's look at lshift. Shifting any number left by a non-zero number of\nbits will always result in an even number, but we need to be careful about the\nzero case! Shifting by zero doesn't change the number at all. Unfortunately,\nsince our lattice has no notion of zero, we have to over-approximate here:\nclass Parity:\n # ...\n def lshift(self, other):\n # self << other\n if other is ODD:\n return EVEN\n return TOP\n\n\nThis means that we will miss some opportunities to optimize, but it's a\ntradeoff that's just part of the game. 
(We could also add more elements to our\nlattice, but that's a topic for another day.)\nNow, if we run our abstract interpretation, we'll collect some interesting\nproperties about the program. If we temporarily hack on the internals of\nbb_to_str, we can print out parity information alongside the IR operations:\nv0:top = getarg(0)\nv1:top = getarg(1)\nv2:even = lshift(v0, 1)\nv3:even = lshift(v1, 1)\nv4:even = add(v2, v3)\nv5:top = dummy(v4)\n\n\nThis is pretty awesome, because we can see that v4, the result of the\naddition, is always even. Maybe we can do something with that information.\nOptimization\nOne way that a program might check if a number is odd is by checking the least\nsignificant bit. This is a common pattern in C code, where you might see code\nlike y = x & 1. Let's introduce a bitand IR operation that acts like the\n& operator in C/Python. Here is an example of use of it in our program:\nv0 = getarg(0)\nv1 = getarg(1)\nv2 = lshift(v0, 1)\nv3 = lshift(v1, 1)\nv4 = add(v2, v3)\nv5 = bitand(v4, 1) # new!\nv6 = dummy(v5)\n\n\nWe'll hold off on implementing the transfer function for it---that's left as an\nexercise for the reader---and instead do something different.\nInstead, we'll see if we can optimize operations of the form bitand(X, 1). 
If\nwe statically know the parity as a result of abstract interpretation, we can\nreplace the bitand with a constant 0 or 1.\nWe'll first modify the analyze function (and rename it) to return a new\nBlock containing optimized instructions:\ndef simplify(block: Block) -> Block:\n parity = {v: BOTTOM for v in block}\n\n def parity_of(value):\n if isinstance(value, Constant):\n return Parity.const(value)\n return parity[value]\n\n result = Block()\n for op in block:\n # TODO: Optimize op\n # Emit\n result.append(op)\n # Analyze\n transfer = getattr(Parity, op.name)\n args = [parity_of(arg.find()) for arg in op.args]\n parity[op] = transfer(*args)\n return result\n\n\nWe're approaching this the way that PyPy does things under the hood, which is\nall in roughly a single pass. It tries to optimize an instruction away, and if\nit can't, it copies it into the new block.\nNow let's add in the bitand optimization. It's mostly some gross-looking\npattern matching that checks if the right hand side of a bitwise and\noperation is 1 (TODO: the left hand side, too). 
CF had some neat ideas on how\nto make this more ergonomic, which I might save for later.3\nThen, if we know the parity, optimize the bitand into a constant.\ndef simplify(block: Block) -> Block:\n parity = {v: BOTTOM for v in block}\n\n def parity_of(value):\n if isinstance(value, Constant):\n return Parity.const(value)\n return parity[value]\n\n result = Block()\n for op in block:\n # Try to simplify\n if isinstance(op, Operation) and op.name == \"bitand\":\n arg = op.arg(0)\n mask = op.arg(1)\n if isinstance(mask, Constant) and mask.value == 1:\n if parity_of(arg) is EVEN:\n op.make_equal_to(Constant(0))\n continue\n elif parity_of(arg) is ODD:\n op.make_equal_to(Constant(1))\n continue\n # Emit\n result.append(op)\n # Analyze\n transfer = getattr(Parity, op.name)\n args = [parity_of(arg.find()) for arg in op.args]\n parity[op] = transfer(*args)\n return result\n\n\nRemember: because we use union-find to rewrite instructions in the optimizer\n(make_equal_to), later uses of the same instruction get the new\noptimized version \"for free\" (find).\nLet's see how it works on our IR:\nv0 = getarg(0)\nv1 = getarg(1)\nv2 = lshift(v0, 1)\nv3 = lshift(v1, 1)\nv4 = add(v2, v3)\nv6 = dummy(0)\n\n\nHey, neat! bitand disappeared and the argument to dummy is now the constant\n0 because we know the lowest bit.\nWrapping up\nHopefully you have gained a little bit of an intuitive understanding of\nabstract interpretation. Last year, being able to write some code made me more\ncomfortable with the math. Now being more comfortable with the math is helping\nme write the code. It's a nice upward spiral.\nThe two abstract domains we used in this post are simple and not very useful in\npractice but it's possible to get very far using slightly more complicated\nabstract domains. Common domains include: constant propagation, type inference,\nrange analysis, effect inference, liveness, etc. 
For example, here is a\nsample lattice for constant propagation:\n\n\n \n \n\n\nIt has multiple levels to indicate more and less precision. For example, you\nmight learn that a variable is either 1 or 2 and be able to encode that as\nnonnegative instead of just going straight to top.\nCheck out some real-world abstract interpretation in open source projects:\n\nKnown bits in LLVM\nConstant range in LLVM\nBut I am told that the ranges don't form a lattice (see Interval Analysis and Machine Arithmetic: Why Signedness Ignorance Is Bliss)\nTristate numbers for known bits in Linux eBPF\nRange analysis in Linux eBPF\nGDB prologue analysis\n of assembly to understand the stack and find frame pointers without using\n DWARF (some\n docs)\n\nIf you have some readable examples, please share them so I can add.\nAcknowledgements\nThank you to CF Bolz-Tereick for the toy optimizer and\nhelping edit this post!\n\n\n\n\nIn the words of abstract interpretation researchers Vincent Laviron\nand Francesco Logozzo in their paper Refining Abstract\nInterpretation-based Static Analyses with Hints (APLAS 2009):\n\nThe three main elements of an abstract interpretation are: (i) the\nabstract elements (\"which properties am I interested in?\"); (ii) the\nabstract transfer functions (\"which is the abstract semantics of basic\nstatements?\"); and (iii) the abstract operations (\"how do I combine the\nabstract elements?\").\n\nWe don't have any of these \"abstract operations\" in this post because\nthere's no control flow but you can read about them elsewhere!\u00a0\u21a9\n\n\nThese abstract values are arranged in a lattice, which is a\nmathematical structure with some properties but the most important ones are\nthat it has a top, a bottom, a partial order, a meet operation, and values\ncan only move in one direction on the lattice.\nUsing abstract values from a lattice promises two things:\n\nThe analysis will terminate\nThe analysis will be correct for any run of the program, not just 
one\n sample run\n\n\u21a9\n\n\nSomething about __match_args__ and @property...\u00a0\u21a9", + "tags": "toy-optimizer", + "url": "https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html" + }, + { + "title": "Mining JIT traces for missing optimizations with Z3", + "text": "In my last post I've described how to use Z3 to find simple local peephole\noptimization patterns\nfor the integer operations in PyPy's JIT. An example is int_and(x, 0) ->\n0. In this post I want to scale up the problem of identifying possible\noptimizations to much bigger instruction sequences, also using Z3. For that, I\nam starting with the JIT traces of real benchmarks, after they have been\noptimized by the optimizer of PyPy's JIT. Then we can ask Z3 to find\ninefficient integer operations in those traces.\nStarting from the optimized traces of real programs has some big\nadvantages over the \"classical\" superoptimization approach of generating and\nthen trying all possible sequences of instructions. It avoids the\ncombinatorial explosion that happens with the latter approach. Also, starting\nfrom the traces of benchmarks or (even better) actual programs makes sure that\nwe actually care about the missing optimizations\nthat are found in this way. And because the traces are analyzed after they have\nbeen optimized by PyPy's optimizer, we only get reports for missing\noptimizations, that the JIT isn't able to do (yet).\nThe techniques and experiments I describe in this post are again the result of\na bunch of discussions with John Regehr at a conference a few weeks ago, as\nwell as reading his blog posts and papers. Thanks John! Also thanks to Max\nBernstein for super helpful feedback on the drafts\nof this blog post (and for poking me to write things in general).\nHigh-Level Approach\nThe approach that I took works as follows:\n\nRun benchmarks or other interesting programs and then dump the IR of the JIT\n traces into a file. 
The traces have at that point been already optimized by\n the PyPy JIT's optimizer.\nFor every trace, ignore all the operations on non-integer variables.\nTranslate every integer operation into a Z3 formula.\nFor every operation, use Z3 to find out whether the operation is redundant\n (how that is done is described below).\nIf the operation is redundant, the trace is less efficient than it could have\n been, because the optimizer could also have removed the operation. Report the\n inefficiency.\nMinimize the inefficient programs by removing as many operations as possible\n to make the problem easier to understand.\n\nIn the post I will describe the details and show some pseudocode of the\napproach. I'll also make the proper code public eventually (but it needs a\nhealthy dose of cleanups first).\nDumping PyPy Traces\nPyPy will write its JIT traces into the file out if the environment variable\nPYPYLOG is set as follows:\nPYPYLOG=jit-log-opt:out pypy \n\n\nThis environment variable works for PyPy, but also for other virtual machines\nbuilt with RPython.\n(This is really a side point for the rest of the blog post, but since the\nquestion came up I wanted to clarify it: Operations on integers in the Python\nprogram that the JIT is running don't all correspond 1-to-1 with the int_...\noperations in the traces. The int_... trace operations always operate on\nmachine words. The Python int type supports arbitrarily large integers. PyPy\nwill optimistically try to lower the operations on Python integers into machine\nword operations, but adds the necessary guards into the trace to make sure that\noverflow outside of the range of machine words is caught. In case one of these\nguards fails the interpreter switches to a big integer heap-allocated\nrepresentation.)\nEncoding Traces as Z3 formulas\nThe last blog post already contained the code to encode the results of\nindividual trace operations into Z3 formulas, so we don't need to repeat that\nhere. 
To encode traces of operations we introduce a Z3 variable for every\noperation in the trace and then call the z3_expression function for every\nsingle one of the operations in the trace.\nFor example, for the following trace:\n[i1]\ni2 = uint_rshift(i1, 32)\ni3 = int_and(i2, 65535)\ni4 = uint_rshift(i1, 48)\ni5 = int_lshift(i4, 16)\ni6 = int_or(i5, i3)\njump(i6, i2) # equal\n\n\nWe would get the Z3 formula:\nz3.And(i2 == LShR(i1, 32),\n i3 == i2 & 65535,\n i4 == LShR(i1, 48),\n i5 == i4 << 16)\n\n\nUsually we won't ask for the formula of the whole trace at once. Instead we go\nthrough the trace operation by operation and try to find inefficiencies in the\ncurrent one we are looking at. Roughly like this (pseudo-)code:\ndef newvar(name):\n return z3.BitVec(name, INTEGER_WIDTH)\n\ndef find_inefficiencies(trace):\n solver = z3.Solver()\n var_to_z3var = {}\n for input_argument in trace.inputargs:\n var_to_z3var[input_argument] = newz3var(input_argument)\n for op in trace:\n var_to_z3var[op] = z3resultvar = newz3var(op.resultvarname)\n arg0 = op.args[0]\n z3arg0 = var_to_z3var[arg0]\n if len(op.args) == 2:\n arg1 = op.args[1]\n z3arg1 = var_to_z3var[arg1]\n else:\n z3arg1 = None\n res, valid_if = z3_expression(op.name, z3arg0, z3arg1)\n # checking for inefficiencies, see the next sections\n ...\n if ...:\n return \"inefficient\", op\n\n # not inefficient, assert op into the solver and continue with the next op\n solver.add(z3resultvar == res)\n return None # no inefficiency found\n\n\nIdentifying constant booleans with Z3\nTo get started finding inefficiencies in a trace, we can\nfirst focus on boolean variables. For every operation in the trace that\nreturns a bool we can ask Z3 to prove that this variable must be always True or\nalways False. Most of the time, neither of these proofs will succeed. 
But if Z3\nmanages to prove one of them, we know we have found an inefficiency: instead of\ncomputing the boolean result (eg by executing a comparison) the JIT's optimizer\ncould have replaced the operation with the corresponding boolean constant.\nHere's an example of an inefficiency found that way: if x < y and y < z are\nboth true, PyPy's JIT could conclude that x < z must also\nbe true. However, currently the JIT cannot make that conclusion because it\nonly reasons about the concrete ranges (lower and upper bounds) for every\ninteger variable, but it has no way to remember anything about relationships\nbetween different variables. This kind of reasoning would quite often be useful\nto remove list/string bounds checks. Here's a talk about how LLVM does\nthis (but it might be\ntoo heavyweight for a JIT setting).\nHere are some more examples found that way:\n\nx - 1 == x is always False\nx - (x == -1) == -1 is always False. The pattern x - (x == -1) happens a\n lot in PyPy's hash computations: To be compatible with the CPython hashes we\n need to make sure that no object's hash is -1 (CPython uses -1 as an error\n value on the C level).\n\nHere's pseudo-code for how to implement checking boolean operations for\ninefficiencies:\ndef find_inefficiencies(trace):\n ...\n for op in trace:\n ...\n res, valid_if = z3_expression(op.name, z3arg0, z3arg1)\n # check for boolean constant result\n if op.has_boolean_result():\n if prove(solver, res == 0):\n return \"inefficient\", op, 0\n if prove(solver, res == 1):\n return \"inefficient\", op, 1\n # checking for other inefficiencies, see the next sections\n ...\n\n # not inefficient, add op to the solver and continue with the next op\n solver.add(z3resultvar == res)\n return None # no inefficiency found\n\n\nIdentifying redundant operations\nA more interesting class of redundancy is to try to find two operations in a\ntrace that compute the same result. 
We can do that by asking Z3 to prove for\neach pair of different operations in the trace that the result is\nalways the same. If a previous operation returns the same result, the JIT could\nhave re-used that result instead of re-computing it, saving time. Doing this\nsearch for equivalent operations with Z3 is quadratic in the number of\noperations, but since traces have a maximum length it is not too bad in\npractice.\nThis is the real workhorse of my script so far, it's what finds most of the\ninefficiencies. Here are a few examples:\n\nThe very first and super useful example the script found is int_eq(b, 1) ==\n b if b is known to be a boolean (ie an integer 0 or 1). I have already\n implemented this optimization in the JIT.\nSimilarly, int_and(b, 1) == b for booleans.\n(x << 4) & -0xf == x << 4\n(((x >> 63) << 1) << 2) >> 3 == x >> 63. In general the JIT is quite bad at\n optimizing repeated shifts (the infrastructure for doing better with that is\n already in place, so this will be a relatively easy fix).\n(x & 0xffffffff) | ((x >> 32) << 32) == x. Having the JIT optimize this\n would maybe require first recognizing that (x >> 32) << 32 can be expressed\n as a mask: (x & 0xffffffff00000000), and then using (x & c1) | (x & c2) ==\n x & (c1 | c2)\nA commonly occurring pattern is variations of this one:\n ((x & 1345) ^ 2048) - 2048 == x & 1345 (with different constants, of\n course). xor is add without carry, and x & 1345 does not have the bit\n 2048 set. Therefore the ^ 2048 is equivalent to + 2048, which the -\n 2048 cancels. More generally, if a & b == 0, then a + b == a | b == a ^ b.\n I don't understand at all why this appears so often in the traces, but I\n see variations of it a lot. 
LLVM can optimize this, but GCC\n can't, thanks to\n Andrew Pinski for filing the\n bug!\n\nAnd here's some implementation pseudo-code again:\ndef find_inefficiencies(trace):\n ...\n for op in trace:\n ...\n res, valid_if = z3_expression(op.name, z3arg0, z3arg1)\n # check for boolean constant result\n ...\n # searching for redundant operations\n for previous_op in trace:\n if previous_op is op:\n break # done, reached the current op\n previous_op_z3var = var_to_z3var[previous_op]\n if prove(solver, previous_op_z3var == res):\n return \"inefficient\", op, previous_op\n ...\n # more code here later\n ...\n\n # not inefficient, add op to the solver and continue with the next op\n solver.add(z3resultvar == res)\n return None # no inefficiency found\n\n\nSynthesizing more complicated constants with exists-forall\nTo find out whether some integer operations always return a constant result, we\ncan't simply use the same trick as for those operations that return boolean\nresults, because enumerating 2\u2076\u2074 possible constants and checking them all\nwould take too long. Like in the last post, we can use z3.ForAll to find out\nwhether Z3 can synthesize a constant for the result of an operation for us.\nIf such a constant exists, the JIT could have removed the operation,\nand replaced it with the constant that Z3 provides.\nHere are a few examples of inefficiencies found this way:\n\n(x ^ 1) ^ x == 1 (or, more generally: (x ^ y) ^ x == y)\nif x | y == 0, it follows that x == 0 and y == 0\nif x != MAXINT, then x + 1 > x\n\nImplementing this is actually slightly annoying. The solver.add calls for\nnon-inefficient ops add assertions to the solver, which are now confusing the\nz3.ForAll query. We could remove all assertions from the solver, then do the\nForAll query, then add the assertions back. 
What I ended doing instead was\ninstantiating a second solver object that I'm using for the ForAll queries,\nthat remains empty the whole time.\ndef find_inefficiencies(trace):\n solver = z3.Solver()\n empty_solver = z3.Solver()\n var_to_z3var = {}\n ...\n for op in trace:\n ...\n res, valid_if = z3_expression(op.name, z3arg0, z3arg1)\n # check for boolean constant result\n ...\n # searching for redundant operations\n ...\n # checking for constant results\n constvar = z3.BitVec('find_const', INTEGER_WIDTH)\n condition = z3.ForAll(\n var_to_z3var.values(),\n z3.Implies(\n *solver.assertions(),\n expr == constvar\n )\n )\n if empty_solver.check(condition) == z3.sat:\n model = empty_solver.model()\n const = model[constvar].as_signed_long()\n return \"inefficient\", op, const\n\n # not inefficient, add op to the solver and continue with the next op\n solver.add(z3resultvar == res)\n return None # no inefficiency found\n\n\nMinimization\nAnalyzing an inefficiency by hand in the context of a larger trace is quite\ntedious. Therefore I've implemented a (super inefficient) script to try to make\nthe examples smaller. Here's how that works:\n\nFirst throw out all the operations that occur after the inefficient operation\n in the trace.\nThen we remove all \"dead\" operations, ie operations that don't have their\n results used (all the operations that we can analyze with Z3 are without side\n effects).\nNow we try to remove every guard in the trace one by one and check\n afterwards, whether the resulting trace still has an inefficiency.\nWe also try to replace every single operation with a new argument to the\n trace, to see whether the inefficiency is still present.\n\nThe minimization process is sort of inefficient and I should probably be using\n shrinkray or\n C-Reduce instead. 
However, it\n seems to work well in practice and the runtime isn't too bad.\nResults\nSo far I am using the JIT traces of three programs: 1) Booting Linux on the\nPydrofoil RISC-V emulator, 2) booting Linux on the Pydrofoil ARM emulator, and 3)\nrunning the PyPy bootstrap process on top of PyPy.\nI picked these programs because most Python programs don't contain interesting\namounts of integer operations, and the traces of the emulators\ncontain a lot of them. I also used the bootstrap process because I still wanted\nto try a big Python program and personally care about the runtime of this\nprogram a lot.\nThe script identifies 94\ninefficiencies in the traces, a lot of them come from repeating\npatterns. My next steps will be to manually inspect them all, categorize them, and\nimplement easy optimizations identified that way. I also want a way to sort the\nexamples by execution count in the benchmarks, to get a feeling for which of\nthem are most important.\nI didn't investigate the full set of Python\nbenchmarks that PyPy uses yet, because I don't expect\nthem to contain interesting amounts of integer operations, but maybe I am wrong\nabout that? Will have to try eventually.\nConclusion\nThis was again much easier to do than I would have expected! Given that I had\nthe translation of trace ops to Z3 already in place, it was a matter of about a\nday of programming to use this infrastructure to find the first problems and\nminimize them.\nReusing the results of existing operations or replacing operations by constants\ncan be seen as \"zero-instruction superoptimization\". I'll probably be rather\nbusy for a while to add the missing optimizations identified by my simple\nscript. 
But later extensions to actually synthesize one or several operations\nin the attempt to optimize the traces more and find more opportunities should\nbe possible.\nFinding inefficiencies in traces with Z3 is significantly less\nannoying and also less error-prone than just manually inspecting traces and\ntrying to spot optimization opportunities.\nRandom Notes and Sources\nAgain, John's blog posts:\n\nLet\u2019s Work on an LLVM Superoptimizer\nEarly Superoptimizer Results\nA Few Synthesizing Superoptimizer Results\nSynthesizing Constants\n\nand papers:\n\nA Synthesizing Superoptimizer\nHydra: Generalizing Peephole Optimizations with Program Synthesis\n\nI remembered recently that I had seen the approach of optimizing the traces of\na tracing JIT with Z3 a long time ago, as part of the (now long dead, I think)\nSPUR\nproject.\nThere's a workshop\npaper\nfrom 2010 about this. SPUR was trying to use Z3 built into the actual JIT (as\nopposed to using Z3 only to find places where the regular optimizers could be\nimproved). In addition to bitvectors, SPUR also used the Z3 support for arrays\nto model the C# heap and remove redundant stores. This is still another future\nextension for all the Z3 work I've been doing in the context of the PyPy JIT.", + "tags": "jit,z3", + "url": "https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html" + }, + { + "title": "Finding Simple Rewrite Rules for the JIT with Z3", + "text": "In June I was at the PLDI conference in\nCopenhagen to present a paper\nI co-authored with Max Bernstein. I also finally\nmet John Regehr, who I'd been talking on social\nmedia for ages but had never met. John has been working on compiler correctness\nand better techniques for building compilers and optimizers since a very long\ntime. The blog post Finding JIT Optimizer Bugs using SMT Solvers and\nFuzzing\nwas heavily inspired by this work. 
We talked a lot about his and his group's\nwork on using Z3 for\nsuperoptimization and for\nfinding missing optimizations. I have applied some of the things John told me\nabout to the traces of PyPy's JIT, and wanted to blog about that. However, my\ndraft felt quite hard to understand. Therefore I have now written this current\npost, to at least try to provide a somewhat gentler on-ramp to the topic.\nIn this post we will use the Python-API to Z3 to find local peephole rewrite\nrules for the operations in the intermediate representation of PyPy's tracing\nJIT. The code for this is simple enough that we can go through all of it.\nThe PyPy JIT produces traces of machine level instructions, which are optimized\nand then turned into machine code. The optimizer uses a number of approaches to\nmake the traces more efficient. For integer operations it applies a number of\narithmetic simplification rules, for example int_add(x, 0) -> x. When\nimplementing these rules in the JIT there are two problems: How do we know\nthat the rules are correct? And how do we know that we haven't forgotten any\nrules? We'll try to answer both of these, but the first one in particular.\nWe'll be using Z3, a satisfiability modulo theories (SMT) solver which has good\nbitvector support and most importantly an excellent Python API. We can use the\nsolver to reason about bitvectors, which are how we will model machine\nintegers.\nTo find rewrite rules, we will consider the binary operations (i.e. those\ntaking two arguments) in PyPy traces that take and produce integers. The\ncompletely general form op(x, y) is not simplifiable on its own. But if\neither x == y\nor if one of the arguments is a constant, we can potentially simplify the\noperation into a simpler form. The results are either the variable x, or a\n(potentially different) constant. We'll ignore constant-folding where both\narguments of the binary operation are constants. 
The possible results for a\nsimplifiable binary operation are the variable x or another constant. This\nleaves the following patterns as possibilities:\n\nop(x, x) == x\nop(x, x) == c1\nop(x, c1) == x\nop(c1, x) == x\nop(x, c1) == c2\nop(c1, x) == c2\n\nOur approach will be to take every single supported binary integer operation,\ninstantiate all of these patterns, and try to ask Z3 whether the resulting\nsimplification is valid for all values of x.\nQuick intro to the Z3 Python-API\nHere's a terminal session showing the use of the Z3 Python API:\n>>>> import z3\n>>>> # construct a Z3 bitvector variable of width 8, with name x:\n>>>> x = z3.BitVec('x', 8)\n>>>> # construct a more complicated formula by using operator overloading:\n>>>> x + x\nx + x\n>>>> x + 1\nx + 1\n\n\nZ3 checks the \"satisfiability\" of a formula. This means that it tries to find\nan example set of concrete values for the variables that occur in a formula,\nsuch that the formula becomes true. Examples:\n>>>> solver = z3.Solver()\n>>>> solver.check(x * x == 3)\nunsat\n>>>> # meaning no x fulfils this property\n>>>>\n>>>> solver.check(x * x == 9)\nsat\n>>>> model = solver.model()\n>>>> model\n[x = 253]\n>>>> model[x].as_signed_long()\n-3\n>>>> # 253 is the same as -3 in two's complement arithmetic with 8 bits\n\n\nIn order to use Z3 to prove something, we can ask Z3 to find counterexamples\nfor the statement, meaning concrete values that would make the negation of the\nstatement true:\n>>>> solver.check(z3.Not(x ^ -1 == ~x))\nunsat\n\n\nThe result unsat means that we just proved that x ^ -1 == ~x is true for\nall x, because there is no value for x that makes not (x ^ -1 == ~x)\ntrue (this works because -1 has all the bits set).\nIf we try to prove something incorrect in this way, the following happens:\n>>>> solver.check(z3.Not(x ^ -1 == x))\nsat\n\n\nsat shows that x ^ -1 == x is (unsurprisingly) not always true, and we can\nask for a counterexample:\n>>>> solver.model()\n[x = 0]\n\n\nThis way of proving 
this works because the check calls try to solve an\n(implicit) \"exists\" quantifier, over all the Z3 variables used in the formula.\ncheck will either return z3.unsat, which means that no concrete values make\nthe formula true; or z3.sat, which means that you can get some concrete\nvalues that make the formula true by calling solver.model().\nIn math terms we prove things using check by de-Morgan's rules for quantifiers:\n$$ \\lnot \\exists x: \\lnot f(x) \\implies \\forall x: f(x) $$\nNow that we've seen the basics of using the Z3 API on a few small examples,\nwe'll use it in a bigger program.\nEncoding the integer operations of RPython's JIT into Z3 formulas\nNow we'll use the API to reason about the integer operations of the PyPy JIT\nintermediate representation (IR). The binary integer operations are:\nopnames2 = [\n\"int_add\",\n\"int_sub\",\n\"int_mul\",\n\"int_and\",\n\"int_or\",\n\"int_xor\",\n\"int_eq\",\n\"int_ne\",\n\"int_lt\",\n\"int_le\",\n\"int_gt\",\n\"int_ge\",\n\"uint_lt\",\n\"uint_le\",\n\"uint_gt\",\n\"uint_ge\",\n\"int_lshift\",\n\"int_rshift\",\n\"uint_rshift\",\n\"uint_mul_high\",\n\"int_pydiv\",\n\"int_pymod\",\n]\n\n\nThere's not much special about the integer operations. Like in LLVM, most of\nthem are signedness-independent: int_add, int_sub, int_mul, ... work\ncorrectly for unsigned integers but also for\ntwo's-complement signed\nintegers. Exceptions for that are order comparisons like int_lt etc. for\nwhich we have unsigned variants uint_lt etc. 
All operations that produce a\nboolean result return a full-width integer 0 or 1 (the PyPy JIT supports\nonly word-sized integers in its intermediate representation).\nIn order to reason about the IR operations, some ground work:\nimport z3\n\nINTEGER_WIDTH = 64\nsolver = z3.Solver()\nsolver.set(\"timeout\", 10000) # milliseconds, ie 10s\nxvar = z3.BitVec('x', INTEGER_WIDTH)\nconstvar = z3.BitVec('const', INTEGER_WIDTH)\nconstvar2 = z3.BitVec('const2', INTEGER_WIDTH)\nTRUEBV = z3.BitVecVal(1, INTEGER_WIDTH)\nFALSEBV = z3.BitVecVal(0, INTEGER_WIDTH)\n\n\nAnd here's a function to turn an integer IR operation of PyPy's JIT into Z3\nformulas:\ndef z3_expression(opname, arg0, arg1=None):\n \"\"\" computes a tuple of (result, valid_if) of Z3 formulas. `result` is the\n formula representing the result of the operation, given argument formulas\n arg0 and arg1. `valid_if` is a pre-condition that must be true for the\n result to be meaningful. \"\"\"\n result = None\n valid_if = True # the precondition is mostly True, with few exceptions\n if opname == \"int_add\":\n result = arg0 + arg1\n elif opname == \"int_sub\":\n result = arg0 - arg1\n elif opname == \"int_mul\":\n result = arg0 * arg1\n elif opname == \"int_and\":\n result = arg0 & arg1\n elif opname == \"int_or\":\n result = arg0 | arg1\n elif opname == \"int_xor\":\n result = arg0 ^ arg1\n elif opname == \"int_eq\":\n result = cond(arg0 == arg1)\n elif opname == \"int_ne\":\n result = cond(arg0 != arg1)\n elif opname == \"int_lt\":\n result = cond(arg0 < arg1)\n elif opname == \"int_le\":\n result = cond(arg0 <= arg1)\n elif opname == \"int_gt\":\n result = cond(arg0 > arg1)\n elif opname == \"int_ge\":\n result = cond(arg0 >= arg1)\n elif opname == \"uint_lt\":\n result = cond(z3.ULT(arg0, arg1))\n elif opname == \"uint_le\":\n result = cond(z3.ULE(arg0, arg1))\n elif opname == \"uint_gt\":\n result = cond(z3.UGT(arg0, arg1))\n elif opname == \"uint_ge\":\n result = cond(z3.UGE(arg0, arg1))\n elif opname == 
\"int_lshift\":\n result = arg0 << arg1\n valid_if = z3.And(arg1 >= 0, arg1 < INTEGER_WIDTH)\n elif opname == \"int_rshift\":\n result = arg0 << arg1\n valid_if = z3.And(arg1 >= 0, arg1 < INTEGER_WIDTH)\n elif opname == \"uint_rshift\":\n result = z3.LShR(arg0, arg1)\n valid_if = z3.And(arg1 >= 0, arg1 < INTEGER_WIDTH)\n elif opname == \"uint_mul_high\":\n # zero-extend args to 2*INTEGER_WIDTH bit, then multiply and extract\n # highest INTEGER_WIDTH bits\n zarg0 = z3.ZeroExt(INTEGER_WIDTH, arg0)\n zarg1 = z3.ZeroExt(INTEGER_WIDTH, arg1)\n result = z3.Extract(INTEGER_WIDTH * 2 - 1, INTEGER_WIDTH, zarg0 * zarg1)\n elif opname == \"int_pydiv\":\n valid_if = arg1 != 0\n r = arg0 / arg1\n psubx = r * arg1 - arg0\n result = r + (z3.If(arg1 < 0, psubx, -psubx) >> (INTEGER_WIDTH - 1))\n elif opname == \"int_pymod\":\n valid_if = arg1 != 0\n r = arg0 % arg1\n result = r + (arg1 & z3.If(arg1 < 0, -r, r) >> (INTEGER_WIDTH - 1))\n elif opname == \"int_is_true\":\n result = cond(arg0 != FALSEBV)\n elif opname == \"int_is_zero\":\n result = cond(arg0 == FALSEBV)\n elif opname == \"int_neg\":\n result = -arg0\n elif opname == \"int_invert\":\n result = ~arg0\n else:\n assert 0, \"unknown operation \" + opname\n return result, valid_if\n\ndef cond(z3expr):\n \"\"\" helper function to turn a Z3 boolean result z3expr into a 1 or 0\n bitvector, using z3.If \"\"\"\n return z3.If(z3expr, TRUEBV, FALSEBV)\n\n\nWe map the semantics of a PyPy JIT operation to Z3 with the z3_expression\nfunction. It takes the name of a JIT operation and its two (or one) arguments\ninto a pair of Z3 formulas, result and valid_if. The resulting formulas are\nconstructed with the operator overloading of Z3 variables/formulas.\nThe first element result of the result of z3_expression represents the result\nof performing the operation. valid_if is a bool that represents a condition that\nneeds to be True in order for the result of the operation to be defined. E.g.\nint_pydiv(a, b) is only valid if b != 0. 
Most operations are always valid,\nso they return True as that condition (we'll ignore valid_if for a bit, but it\nwill become more relevant further down in the post).\nWe can define a helper function to prove things by finding counterexamples:\ndef prove(cond):\n \"\"\" Try to prove a condition cond by searching for counterexamples of its negation. \"\"\"\n z3res = solver.check(z3.Not(cond))\n if z3res == z3.unsat:\n return True\n elif z3res == z3.unknown: # eg on timeout\n return False\n elif z3res == z3.sat:\n return False\n assert 0, \"should be unreachable\"\n\n\nFinding rewrite rules\nNow we can start finding our first rewrite rules, following the first pattern\nop(x, x) -> x. We do this by iterating over all the supported binary\noperation names, getting the z3 expression for op(x, x) and then asking Z3 to\nprove op(x, x) == x.\nfor opname in opnames2:\n result, valid_if = z3_expression(opname, xvar, xvar)\n if prove(result == xvar):\n print(f\"{opname}(x, x) -> x, {result}\")\n\n\nThis yields the simplifications:\nint_and(x, x) -> x\nint_or(x, x) -> x\n\n\nSynthesizing constants\nSupporting the next patterns is harder: op(x, x) == c1, op(x, c1) == x, and\nop(c1, x) == x. We don't know which constants to pick to try to get Z3 to\nprove the equality. We could iterate over common constants like 0, 1,\nMAXINT, etc, or even over all the 256 values for a bitvector of length 8.\nHowever, we will instead ask Z3 to find the constants for us too.\nThis can be done by using quantifiers, in this case z3.ForAll. The query we\npose to Z3 is \"does there exist a constant c1 such that for all x the\nfollowing is true: op(x, c1) == x? Note that the constant c1 is not\nnecessarily unique, there could be many of them. 
We generate several matching\nconstant, and add that they must be different to the condition of the second\nand further queries.\nWe can express this in a helper function:\ndef find_constant(z3expr, number_of_results=5):\n condition = z3.ForAll(\n [xvar],\n z3expr\n )\n for i in range(number_of_results):\n checkres = solver.check(condition)\n if checkres == z3.sat:\n # if a solver check succeeds, we can ask for a model, which is\n # concrete values for the variables constvar\n model = solver.model()\n const = model[constvar].as_signed_long()\n yield const\n # make sure we don't generate the same constant again on the\n # next call\n condition = z3.And(constvar != const, condition)\n else:\n # no (more) constants found\n break\n\n\nWe can use this new function for the three mentioned patterns:\n# try to find constants for op(x, x) == c\nfor opname in opnames2:\n result, valid_if = z3_expression(opname, xvar, xvar)\n for const in find_constant(result == constvar):\n print(f\"{opname}(x, x) -> {const}\")\n# try to find constants for op(x, c) == x and op(c, x) == x\nfor opname in opnames2:\n result, valid_if = z3_expression(opname, xvar, constvar)\n for const in find_constant(result == xvar):\n print(f\"{opname}(x, {const}) -> x\")\n result, valid_if = z3_expression(opname, constvar, xvar)\n for const in find_constant(result == xvar):\n print(f\"{opname}({const}, x) -> x\")\n# this code is not quite correct, we'll correct it later\n\n\nTogether this yields the following new simplifications:\n# careful, these are not all correct!\nint_sub(x, x) -> 0\nint_xor(x, x) -> 0\nint_eq(x, x) -> 1\nint_ne(x, x) -> 0\nint_lt(x, x) -> 0\nint_le(x, x) -> 1\nint_gt(x, x) -> 0\nint_ge(x, x) -> 1\nuint_lt(x, x) -> 0\nuint_le(x, x) -> 1\nuint_gt(x, x) -> 0\nuint_ge(x, x) -> 1\nuint_rshift(x, x) -> 0\nint_pymod(x, x) -> 0\nint_add(x, 0) -> x\nint_add(0, x) -> x\nint_sub(x, 0) -> x\nint_mul(x, 1) -> x\nint_mul(1, x) -> x\nint_and(x, -1) -> x\nint_and(-1, x) -> x\nint_or(x, 0) -> 
x\nint_or(0, x) -> x\nint_xor(x, 0) -> x\nint_xor(0, x) -> x\nint_lshift(x, 0) -> x\nint_rshift(x, 0) -> x\nuint_rshift(x, 0) -> x\nint_pydiv(x, 1) -> x\nint_pymod(x, 0) -> x\n\n\nMost of these look good at first glance, but the last one reveals a problem:\nwe've been ignoring the valid_if expression up to now. We can stop doing that by\nchanging the code like this, which adds z3.And(valid_if, ...) to the argument of\nthe calls to find_constant:\n# try to find constants for op(x, x) == c, op(x, c) == x and op(c, x) == x\nfor opname in opnames2:\n result, valid_if = z3_expression(opname, xvar, xvar)\n for const in find_constant(z3.And(valid_if, result == constvar)):\n print(f\"{opname}(x, x) -> {const}\")\n# try to find constants for op(x, c) == x and op(c, x) == x\nfor opname in opnames2:\n result, valid_if = z3_expression(opname, xvar, constvar)\n for const in find_constant(z3.And(result == xvar, valid_if)):\n print(f\"{opname}(x, {const}) -> x\")\n result, valid_if = z3_expression(opname, constvar, xvar)\n for const in find_constant(z3.And(result == xvar, valid_if)):\n print(f\"{opname}({const}, x) -> x\")\n\n\nAnd we get this list instead:\nint_sub(x, x) -> 0\nint_xor(x, x) -> 0\nint_eq(x, x) -> 1\nint_ne(x, x) -> 0\nint_lt(x, x) -> 0\nint_le(x, x) -> 1\nint_gt(x, x) -> 0\nint_ge(x, x) -> 1\nuint_lt(x, x) -> 0\nuint_le(x, x) -> 1\nuint_gt(x, x) -> 0\nuint_ge(x, x) -> 1\nint_add(x, 0) -> x\nint_add(0, x) -> x\nint_sub(x, 0) -> x\nint_mul(x, 1) -> x\nint_mul(1, x) -> x\nint_and(x, -1) -> x\nint_and(-1, x) -> x\nint_or(x, 0) -> x\nint_or(0, x) -> x\nint_xor(x, 0) -> x\nint_xor(0, x) -> x\nint_lshift(x, 0) -> x\nint_rshift(x, 0) -> x\nuint_rshift(x, 0) -> x\nint_pydiv(x, 1) -> x\n\n\nSynthesizing two constants\nFor the patterns op(x, c1) == c2 and op(c1, x) == c2 we need to synthesize\ntwo constants. 
We can again write a helper method for that:\ndef find_2consts(z3expr, number_of_results=5):\n condition = z3.ForAll(\n [xvar],\n z3expr\n )\n for i in range(number_of_results):\n checkres = solver.check(condition)\n if checkres == z3.sat:\n model = solver.model()\n const = model[constvar].as_signed_long()\n const2 = model[constvar2].as_signed_long()\n yield const, const2\n condition = z3.And(z3.Or(constvar != const, constvar2 != const2), condition)\n else:\n return\n\n\nAnd then use it like this:\nfor opname in opnames2:\n # try to find constants c1, c2 such that op(c1, x) -> c2\n result, valid_if = z3_expression(opname, constvar, xvar)\n consts = find_2consts(z3.And(valid_if, result == constvar2))\n for const, const2 in consts:\n print(f\"{opname}({const}, x) -> {const2}\")\n # try to find constants c1, c2 such that op(x, c1) -> c2\n result, valid_if = z3_expression(opname, xvar, constvar)\n consts = find_2consts(z3.And(valid_if, result == constvar2))\n for const, const2 in consts:\n print(\"%s(x, %s) -> %s\" % (opname, const, const2))\n\n\nWhich yields some straightforward simplifications:\nint_mul(0, x) -> 0\nint_mul(x, 0) -> 0\nint_and(0, x) -> 0\nint_and(x, 0) -> 0\nuint_lt(x, 0) -> 0\nuint_le(0, x) -> 1\nuint_gt(0, x) -> 0\nuint_ge(x, 0) -> 1\nint_lshift(0, x) -> 0\nint_rshift(0, x) -> 0\nuint_rshift(0, x) -> 0\nuint_mul_high(0, x) -> 0\nuint_mul_high(1, x) -> 0\nuint_mul_high(x, 0) -> 0\nuint_mul_high(x, 1) -> 0\nint_pymod(x, 1) -> 0\nint_pymod(x, -1) -> 0\n\n\nA few require a bit more thinking:\nint_or(-1, x) -> -1\nint_or(x, -1) -> -1\n\n\nThese are true because in two's complement, -1 has all bits set.\nThe following ones require recognizing that -9223372036854775808 == -2**63 is\nthe most negative signed 64-bit integer, and 9223372036854775807 == 2 ** 63 -\n1 is the most positive one:\nint_lt(9223372036854775807, x) -> 0\nint_lt(x, -9223372036854775808) -> 0\nint_le(-9223372036854775808, x) -> 1\nint_le(x, 9223372036854775807) -> 
1\nint_gt(-9223372036854775808, x) -> 0\nint_gt(x, 9223372036854775807) -> 0\nint_ge(9223372036854775807, x) -> 1\nint_ge(x, -9223372036854775808) -> 1\n\n\nThe following ones are true because the bitpattern for -1 is the largest\nunsigned number:\nuint_lt(-1, x) -> 0\nuint_le(x, -1) -> 1\nuint_gt(x, -1) -> 0\nuint_ge(-1, x) -> 1\n\n\nStrength Reductions\nAll the patterns so far only had a variable or a constant on the target of the\nrewrite. We can also use the machinery to do strength reductions where we\ngenerate a single-argument operation op1(x) for input operations op(x, c1)\nor op(c1, x). To achieve this, we try all combinations of binary and unary\noperations. (We won't consider strength reductions where a binary operation\ngets turned into a \"cheaper\" other binary operation here.)\nopnames1 = [\n\"int_is_true\",\n\"int_is_zero\",\n\"int_neg\",\n\"int_invert\",\n]\n\nfor opname in opnames2:\n for opname1 in opnames1:\n result, valid_if = z3_expression(opname, xvar, constvar)\n # try to find a constant op(x, c) == g(x)\n result1, valid_if1 = z3_expression(opname1, xvar)\n consts = find_constant(z3.And(valid_if, valid_if1, result == result1))\n for const in consts:\n print(f\"{opname}(x, {const}) -> {opname1}(x)\")\n\n # try to find a constant op(c, x) == g(x)\n result, valid_if = z3_expression(opname, constvar, xvar)\n result1, valid_if1 = z3_expression(opname1, xvar)\n consts = find_constant(z3.And(valid_if, valid_if1, result == result1))\n for const in consts:\n print(f\"{opname}({const}, x) -> {opname1}(x)\")\n\n\nWhich yields the following new simplifications:\nint_sub(0, x) -> int_neg(x)\nint_sub(-1, x) -> int_invert(x)\nint_mul(x, -1) -> int_neg(x)\nint_mul(-1, x) -> int_neg(x)\nint_xor(x, -1) -> int_invert(x)\nint_xor(-1, x) -> int_invert(x)\nint_eq(x, 0) -> int_is_zero(x)\nint_eq(0, x) -> int_is_zero(x)\nint_ne(x, 0) -> int_is_true(x)\nint_ne(0, x) -> int_is_true(x)\nuint_lt(0, x) -> int_is_true(x)\nuint_lt(x, 1) -> int_is_zero(x)\nuint_le(1, x) -> 
int_is_true(x)\nuint_le(x, 0) -> int_is_zero(x)\nuint_gt(x, 0) -> int_is_true(x)\nuint_gt(1, x) -> int_is_zero(x)\nuint_ge(x, 1) -> int_is_true(x)\nuint_ge(0, x) -> int_is_zero(x)\nint_pydiv(x, -1) -> int_neg(x)\n\n\nConclusions\nWith not very much code we managed to generate a whole lot of local\nsimplifications for integer operations in the IR of PyPy's JIT. The rules\ndiscovered that way are \"simple\", in the sense that they only require looking\nat a single instruction, and not where the arguments of that instruction came\nfrom. They also don't require any knowledge about the properties of the\narguments of the instructions (e.g. that they are positive).\nThe rewrites in this post have mostly been in PyPy's JIT already. But now we\nmechanically confirmed that they are correct. I've also added the remaining\nuseful looking ones, in particular int_eq(x, 0) -> int_is_zero(x) etc.\nIf we wanted to scale this approach up, we would have to work much harder!\nThere are a bunch of problems that come with generalizing the approach to\nlooking at sequences of instructions:\n\n\nCombinatorial explosion: if we look at sequences of instructions, we very\n quickly get a combinatorial explosion and it becomes intractable to try all\n combinations.\n\n\nFinding non-minimal patterns: Some complicated simplifications can be\n instances of simpler ones. For example, because int_add(x, 0) -> x, it's\n also true that int_add(int_sub(x, y), 0) -> int_sub(x, y). If we simply\n generate all possible sequences, we will find the latter simplification rule,\n which we would usually not care about.\n\n\nUnclear usefulness: if we simply generate all rewrites up to a certain number\n of instructions, we will get a lot of patterns that are useless in the sense\n that they typically aren't found in realistic programs. 
It would be much\n better to somehow focus on the patterns that real benchmarks are using.\n\n\nIn the next blog post I'll discuss an alternative approach to simply generating\nall possible sequences of instructions, that tries to address these problems.\nThis works by analyzing the real traces of benchmarks and mining those for\ninefficiencies, which only shows problems that occur in actual programs.\nSources\nI've been re-reading a lot of blog posts from John's blog:\n\nLet\u2019s Work on an LLVM Superoptimizer\nEarly Superoptimizer Results\nA Few Synthesizing Superoptimizer Results\nSynthesizing Constants\n\nbut also papers:\n\nA Synthesizing Superoptimizer\nHydra: Generalizing Peephole Optimizations with Program Synthesis\n\nAnother of my favorite blogs has been Philipp Zucker's\nblog in the last year or two, lots of excellent\nposts about/using Z3 on there.", + "tags": "jit,z3", + "url": "https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html" + }, + { + "title": "Profiling PyPy using the Firefox profiler user interface", + "text": "Introduction\nIf you ever wanted to profile your Python code on PyPy, you probably came across VMProf \u2014 a statistical profiler for PyPy.\nVMProf's console output can already give some insights into where your code spends time, \nbut it is far from showing all the information captured while profiling.\nThere have been some tools around to visualize VMProf's output.\nUnfortunately the vmprof.com user interface is no longer available and vmprof-server is not as easy to use, you may want to take a look at a local viewer or converter.\nThose so far could give you some general visualizations of your profile, but do not show any PyPy related context like PyPy's log output (PyPyLog, which is output when using the PYPYLOG environment variable to log JIT actions).\nTo bring all of those features together in one tool, you may take a look at the vmprof-firefox-converter.\nCreated in the context of my bachelor's 
thesis, the vmprof-firefox-converter is a tool for analyzing VMProf profiles with the Firefox profiler user interface. \nInstead of building a new user interface from scratch, this allows us to reuse the user interface work Mozilla put into the Firefox profiler.\nThe Firefox profiler offers a timeline where you can zoom into profiles and work with different visualizations like a flame graph or a stack chart.\nTo understand why there is time spent inside a function, you can revisit the source code and even dive into the intermediate representation of functions executed by PyPy's just-in-time compiler.\nAdditionally, there is a visualization for PyPy's log output, to keep track whether PyPy spent time inside the interpreter, JIT or GC throughout the profiling time.\nProfiling word count\nIn this blog post, I want to show an example of how to use the vmprof-firefox-converter for a simple Python program.\nBased on Ben Hoyt's blog Performance comparison: counting words in Python, Go, C++, C, AWK, Forth, and Rust we will profile two python versions of a word counter running on PyPy. One being a bit more optimized. For this, VMProf will be used, but instead of just going with the console output, we will use the Firefox profiler user interface.\nAt first, we are going to look at a simple way of counting words with Collections.Counter.\nThis will read one line from the standard input at a time and count the words with counter.update()\ncounts = collections.Counter()\nfor line in sys.stdin:\n words = line.lower().split()\n counts.update(words)\n\nfor word, count in counts.most_common():\n print(word, count)\n\n\nTo start profiling, simply execute:\npypy -m vmprofconvert -run simple.py \n\n\nBut unfortunately the crash never occurred when running in gdb.\nAfterwards I tried the next best thing, which was configuring the CI runner to\ndump a core file and upload it as a build\nartifact, which worked. 
Looking\nat the cores locally only sort of worked, because I am running a different\nversion of Ubuntu than the CI runners. So I used\ntmate to be able to log into the\nCI runner after a crash and interactively used gdb there. Unfortunately what I\nlearned from that was that the bug was some kind of memory corruption,\nwhich is always incredibly unpleasant to debug. Basically the header word of a\nPython object had been corrupted somehow at the point of the crash, which means\nthat its vtable wasn't\nusable any more.\n(Sidenote: PyPy doesn't really use a vtable\npointer,\ninstead it uses half a word in the header for the vtable, and the other half\nfor flags that the GC needs to keep track of the state of the object.\nCorrupting all this is still bad.)\nReproducing Locally\nAt that point it was clear that I had to push to reproduce the problem on my\nlaptop, to allow me to work on the problem more directly and not to always have\nto go via the CI runner. Memory corruption bugs often have a lot of randomness\n(depending on which part of memory gets modified, things might crash or more\nlikely just happily keep running). Therefore I decided to try to brute-force\nreproducing the crash by simply running the tests many many times. Since the\ncrash happened in the AST rewriting phase of pytest, and that happens only if\nno pyc\nfiles\nof the bytecode-compiled rewritten ASTs exist, I made sure to delete them\nbefore every test run.\nTo repeat the test runs I used\nmultitime, which is a simple program\nthat runs a command repeatedly. It's meant for lightweight benchmarking\npurposes, but it also halts the execution of the command if that command exits\nwith an error (and it sleeps a small random time between runs, which might help\nwith randomizing the situation, maybe). 
Here's a demo:\n\n\n(Max pointed out\nautoclave to me when reviewing\nthis post, which is a more dedicated tool for this job.)\nThankfully, running the tests repeatedly eventually led to a crash, solving my\n\"only happens on CI\" problem. I then tried various variants to exclude possible\nsources of errors. The first source of errors to exclude in PyPy bugs is the\njust-in-time compiler, so I reran the tests with --jit off to see whether I\ncould still get it to crash, and thankfully I eventually could (JIT bugs are\noften very annoying).\nThe next source of bugs to exclude were C-extensions. Since those were the tests\nof nanobind, a framework for creating C-extension modules, I was a bit worried\nthat the bug might be in our emulation of CPython's C-API. But running PyPy\nwith the -v option (which will print all the imports as they happen)\nconfirmed that at the point of crash no C-extension had been imported yet.\nUsing rr\nI still couldn't get the bug to happen in GDB, so the tool I tried next was\nrr, the \"reverse debugger\". rr can record the execution of a program and\nlater replay it arbitrarily often. This gives you a time-traveling debugger\nthat allows you to execute the program backwards in addition to forwards.\nEventually I managed to get the crash to happen when running the tests with\nrr record --chaos (--chaos randomizes some decisions that rr takes, to try to\nincrease the chance of reproducing bugs).\nUsing rr well is quite hard, and I'm not very good at it. The main approach I\nuse with rr to debug memory corruption is to replay the crash, then set a\nwatchpoint\nfor the corrupted memory location, then use the command reverse-continue to\nfind the place in the code that mutated the memory location. reverse-continue\nis like continue, except that it will execute the program backwards from the\ncurrent point. 
Here's a little demo of this:\n\n\nDoing this for my bug revealed that the object that was being corrupted was\nerroneously collected by the garbage collector. For some reason the GC had\nwrongly decided that the object was no longer reachable and therefore put the\nobject into a freelist by writing a pointer to the next entry in the freelist\ninto the first word of the object, overwriting the object's header. The next\ntime the object was used things crashed.\nSide-quest: wrong GC assertions\nAt this point in the process, I got massively side-tracked. PyPy's GC has a\nnumber of debug modes that you can optionally turn on. Those slow down the\nprogram execution a lot, but they should in theory help to understand why the\nGC goes wrong. When I turned them on, I was getting a failing assertion really\nearly in the test execution, complaining about an invariant violation in the GC\nlogic. At first this made me very happy. I thought that this would help me fix\nthe bug more quickly.\nExtremely frustratingly, after two days of work I concluded that the assertion\nlogic itself was wrong. I have fixed that in the meantime too, the details\nof that are in the bonus section at the end of the post.\nUsing GDB scripting to find the real bug\nAfter that disaster I went back to the earlier rr recording without GC assertions\nand tried to understand in more detail why the GC decided to free an object\nthat was still being referenced. To be able to do that I used the GDB Python\nscripting\nAPI to\nwrite some helper commands to understand the state of the GC heap (rr is an\nextension of GDB, so the GDB scripting API works in rr too).\nThe first (small) helper command I wrote with the GDB scripting API was a way\nto pretty-print the currently active GC flags of a random PyPy object, starting\njust from the pointer. The more complex command I wrote was an object tracer,\nwhich follows pointers to GC objects starting from a root object to explore the\nobject graph. 
The object tracer isn't complete, it doesn't deal with all the\ncomplexities of PyPy's GC. But it was good enough to help me with my problem, I\nfound out that the corrupted object was stored in an array.\nAs an example, here's a function that uses the GDB API to walk one of the\nhelper data structures of the GC, a stack of pointers:\ndef walk_addr_stack(obj):\n \"\"\" walk an instance of the AddressStack class (which is a linked list of\n arrays of 1019 pointers).\n\n the first of the arrays is only partially filled with used_in_last_chunk\n items, all the other chunks are full.\"\"\"\n if obj.type.code == gdb.TYPE_CODE_PTR:\n obj = obj.dereference()\n used_in_last_chunk = lookup(obj, \"used_in_last_chunk\")\n chunk = lookup(obj, \"inst_chunk\").dereference()\n while 1:\n items = lookup(chunk, \"items\")\n for i in range(used_in_last_chunk):\n yield items[i]\n chunk = lookup(chunk, \"next\")\n if not chunk:\n break\n chunk = chunk.dereference()\n used_in_last_chunk = 1019\n\n\nThe full file of supporting code I wrote can be found in this\ngist. This is\npretty rough throw-away code, however.\nIn the following recording I show a staged debugging session with some of the\nextra commands I wrote with the Python API. The details aren't important, I\njust wanted to give a bit of a flavor of what inspecting objects looks like:\n\n\nThe next step was to understand why the array content wasn't being correctly\ntraced by the GC, which I eventually managed with some conditional\nbreakpoints,\nmore watchpoints, and using reverse-continue. It turned out to be a bug that\noccurs when the content of one array was memcopied into another array. The\ntechnical details of why the array wasn't traced correctly are described in\ndetail in the next section.\nWriting a unit test\nTo try to make sure I really understood the bug correctly I then wrote a GC\nunit test that shows the problem. 
Like most of PyPy, our GC is written in\nRPython, a (somewhat strange) subset/dialect of Python2, which can be compiled\nto C code. However, since it is also valid Python2 code, it can be unit-tested\non top of a Python2\nimplementation\n(which is one of the reasons why we keep maintaining PyPy2).\nIn the GC unit tests you have a lot of control about what order things happen\nin, e.g. how objects are allocated, when garbage collection phases happen, etc.\nAfter some trying I managed to write a test that crashes with the same kind of\nmemory corruption that my original crash exhibited: an object that is still\nreachable via an array is collected by the GC. To give you a flavor of what\nthis kind of test looks like, here's an (edited for clarity) version of the\ntest I eventually managed to write\ndef test_incrementality_bug_arraycopy(self):\n source = self.malloc(VAR, 8) # first array\n # the stackroots list emulates the C stack\n self.stackroots.append(source)\n target = self.malloc(VAR, 8) # second array\n self.stackroots.append(target)\n node = self.malloc(S) # unrelated object, will be collected\n node.x = 5\n # store reference into source array, calling the write barrier\n self.writearray(source, 0, node)\n val = self.gc.collect_step()\n source = self.stackroots[0] # reload arrays, they might have moved\n target = self.stackroots[1]\n # this GC step traces target\n val = self.gc.collect_step()\n\n # emulate what a memcopy of arrays does\n res = self.gc.writebarrier_before_copy(source, target, 0, 0, 2)\n assert res\n target[0] = source[0] # copy two elements of the arrays\n target[1] = source[1]\n # now overwrite the reference to node in source\n self.writearray(source, 0, lltype.nullptr(S))\n # this GC step traces source\n self.gc.collect_step()\n # some more collection steps, crucially target isn't traced again\n # but node is deleted\n for i in range(3):\n self.gc.collect_step()\n # used to crash, node got collected\n assert target[0].x == 5\n\n\nOne of the 
good properties of testing our GC that way is that all the memory is\nemulated. The crash in the last line of the test isn't a segfault at all,\ninstead you get a nice exception saying that you tried to access a freed chunk\nof memory and you can then debug this with a python2 debugger.\nFixing the Bug\nWith the unit test in hand, fixing the bug was relatively straightforward (the\ndiff in its simplest form is anyway only a single line\nchange).\nAfter this first version of my fix, I\ntalked to Armin\nRigo who\nhelped me find a different case that was still wrong, in the same area of the\ncode.\nI also got help from the developers at PortaOne\nwho are using PyPy on their servers and had seen some mysterious PyPy\ncrashes\nrecently, that looked related to the GC. They did test deployments of my fixes\nin their various stages to their servers to try to see whether stability\nimproved for them. Unfortunately in the end it turned out that their crashes\nare an unrelated GC bug related to object pinning, which we haven't resolved\nyet.\nWriting a GC fuzzer/property based test\nFinding bugs in the GC is always extremely disconcerting, particularly since\nthis one managed to hide for so long (more than ten years!). Therefore I wanted\nto use these bugs as motivation to try to find more problems in PyPy's GC. Given\nthe ridiculous effectiveness of fuzzing, I used\nhypothesis to write a\nproperty-based test. 
Every test performs a sequence of randomly chosen steps\nfrom the following list:\n\nallocate an object\nread a random field from a random object\nwrite a random reference into a random object\ndrop a random stack reference\nperform one GC step\nallocate an array\nread a random index from a random array\nwrite to an array\nmemcopy between two arrays\n\nThis approach of doing a sequence of steps is pretty close to the stateful\ntesting approach of\nhypothesis, but I just implemented it manually with the data\nstrategy.\nEvery one of those steps is always performed on both the tested GC, and on some\nregular Python objects. The Python objects provide the \"ground truth\" of what\nthe heap should look like, so we can compare the state of the GC objects\nwith the state of the Python objects to find out whether the GC made a mistake.\nIn order to check whether the test is actually useful, I reverted my bug fixes\nand made sure that the test re-finds both the spurious GC assertion error and the\nproblems with memcopying an array.\nIn addition, the test also found corner cases in my fix. There was a situation\nthat I hadn't accounted for, which the test eventually found.\nI also plan on adding a bunch of other GC features as steps in the\ntest to stress them too (for example weakrefs, identity hashes, pinning, maybe\nfinalization).\nAt the point of publishing this post, the fixes got merged to the 2.7/3.9/3.10\nbranches of PyPy, and will be part of the next release (v7.3.16).\nThe technical details of the bug\nIn order to understand the technical details of the bug, I need to give some\nbackground explanations about PyPy's GC.\nPyPy's incremental GC\nPyPy uses an incremental generational mark-sweep GC. It's\ngenerational\nand therefore has minor collections (where only young objects get collected)\nand major collections (collecting long-lived objects eventually, using a\nmark-and-sweep\nalgorithm). 
Young objects are allocated in a nursery using a\nbump-pointer allocator, which makes allocation quite efficient. They are moved\nout of the nursery by minor collections. In order to find references from old\nto young objects the GC uses a write barrier to detect writes into old objects.\nThe GC is also\nincremental,\nwhich means that its major collections aren't done all at once (which would\nlead to long pauses). Instead, major collections are sliced up into small\nsteps, which are done directly after a minor collection (the GC isn't\nconcurrent though, which would mean that the GC does work in a separate\nthread).\nThe incremental GC uses tri-color\nmarking\nto reason about the reachable part of the heap during the marking phase, where\nevery old object can be:\n\nblack: already marked, reachable, definitely survives the collection\ngrey: will survive, but still needs to be marked\nwhite: potentially dead\n\nThe color of every object is encoded by setting flags\nin the object header.\nThe GC maintains the invariant that black objects must never point to white\nobjects. At the start of a major collection cycle the stack roots are turned\ngray. During the mark phase of a major collection cycle, the GC will trace gray\nobjects, until\nnone are left. To trace a gray object, all the objects it references have to be\nmarked grey if they are white so far. After a grey object is traced, it can be\nmarked black (because all the referenced objects are now either black or gray).\nEventually, there are no gray objects left. At that point (because no white\nobject can be reached from a black one) all the white objects are known to be\nunreachable and can therefore be freed.\nThe GC is incremental because every collection step will only trace a limited\nnumber of gray objects, before giving control back to the program. 
This leads to\na problem: if an already traced (black) object is changed between two marking\nsteps of the GC, the program can mutate that object and write a new reference\ninto one of its fields. This could lead to an invariant violation, if the\nreferenced object is white. Therefore, the GC uses the write barrier (which it\nneeds anyway to find references from old to young objects) to mark all black\nobjects that are modified gray, and then trace them again at one of the\nlater collection steps.\nThe special write barrier of memcopy\nArrays use a different kind of write barrier than normal objects. Since they\ncan be arbitrarily large, tracing them can take a long time. Therefore it's\npotentially wasteful to trace them fully at a minor collection. To fix this,\nthe array write barrier keeps more granular information about which parts of\nthe array have been modified since the last collection step. Then only the\nmodified parts of the array need to be traced, not the whole array.\nIn addition, there is another optimization for arrays, which is that memcopy is\ntreated specially by the GC. If memcopy is implemented by simply writing a loop\nthat copies the content of one array to the other, that will invoke the write\nbarrier every single loop iteration for the write of every array element,\ncosting a lot of overhead. Here's some pseudo-code:\ndef arraycopy(source, dest, source_start, dest_start, length):\n for i in range(length):\n value = source[source_start + i]\n dest[dest_start + i] = value # <- write barrier inserted here\n\n\nTherefore the GC has a special memcopy-specific\nwrite barrier that will perform the GC logic once before the memcopy loop, and\nthen use a regular (typically SIMD-optimized) memcopy implementation from\nlibc. 
Roughly like this:\ndef arraycopy(source, dest, source_start, dest_start, length):\n gc_writebarrier_before_array_copy(source, dest, source_start, dest_start, length)\n raw_memcopy(cast_to_voidp(source) + source_start,\n cast_to_voidp(dest) + dest_start,\n sizeof(itemtype(source)) * length)\n\n\n(this is really a rough sketch. The real\ncode\nis much more complicated.)\nThe bug\nThe bugs turned out to be precisely in this memcopy write barrier. When we\nimplemented the current GC, we adapted our previous GC, which was a\ngenerational mark-sweep GC but not incremental. We started with most of the\nprevious GC's code, including the write barriers. The regular write barriers\nwere adapted to the new incremental assumptions, in particular the need for the\nwrite barrier to also turn black objects back to gray when they are modified\nduring a marking phase. This was simply not done at all for the memcopy write\nbarrier, at least in two of the code paths. Fixing this problem fixes the unit\ntests and stops the crashes.\nReflections\nThe way the bug was introduced is really typical. A piece of code (the memcopy\nwrite barrier) was written under a set of assumptions. Then those assumptions\nchanged later. Not all the code pieces that relied on these assumptions to be\ncorrect were updated. It's pretty hard to prevent this in all situations.\nI still think we could have done more to prevent the bug occurring. Writing a\nproperty-based test for the GC would have been a good idea given the complexity\nof the GC, and definitely something we did in other parts of our code at the\ntime (just using the random module mostly, we started using hypothesis\nlater).\nIt's a bit of a mystery to me why this bug managed to be undetected for so\nlong. Memcopy happens in a lot of pretty core operations of e.g. lists in\nPython (list.extend, to name just one example). 
To speculate, I would suspect\nthat all the other preconditions for the bug occurring made it pretty rare:\n\nthe content of an old list that is not yet marked needs to be copied into\n another old list that is marked already\nthe source of the copy needs to also store an object that has no other\n references\nthe source of the copy then needs to be overwritten with other data\nthen the next collection steps need to be happening at the right points\n...\n\nGiven the complexity of the GC logic I also wonder whether some lightweight\nformal methods would have been a good idea. Formalizing some of the core\ninvariants in B or\nTLA+ and then model\nchecking them up to some number\nof\nobjects would have found this problem pretty quickly. There are also correctness\nproofs for GC algorithms in some research papers, but I don't have a good\noverview of the literature to point to any that are particularly good or bad.\nGoing such a more formal route might have fixed this and probably a whole bunch\nof other bugs, but of course it's a pretty expensive (and tedious) approach.\nWhile it was super annoying to track this down, it was definitely good to learn\na bit more about how to use rr and the GDB scripting interface.\nBonus Section: The Wrong Assertion\nSome more technical information about the wrong assertion is in this section.\nBackground: pre-built objects\nPyPy's VM-building bootstrapping process can \"freeze\" a bunch of heap objects\ninto the final binary. This allows the VM to start up quickly, because those\nfrozen objects are loaded by the OS as part of the binary.\nThose frozen pre-built objects are parts of the 'roots' of the garbage\ncollector and need to be traced. However, tracing all the pre-built objects at\nevery collection would be very expensive, because there are a lot of them\n(about 150,000 in a PyPy 3.10 binary). Tracing them all is also not necessary,\nbecause most of them are never modified. 
Unmodified pre-built objects can only reference\nother pre-built objects, which can never be deallocated anyway. Therefore we\nhave an optimization that uses the write barrier (which we need anyway to find\nold-to-young pointers) to notice when a pre-built object gets modified for the\nvery first time. If that happens, it gets added to the set of pre-built objects\nthat gets counted as a root, and is traced as a root at collections\nfrom then on.\nThe wrong assertion\nThe assertion that triggered when I turned on the GC debug mode was saying that\nthe GC found a reference from a black to a white object, violating its\ninvariant. Unmodified pre-built objects count as black, and they aren't roots,\nbecause they can only ever reference other pre-built objects. However, when a\npre-built object gets modified for the first time, it becomes part of the root\nset and will be marked gray. This logic works fine.\nThe wrong assertion triggers if a pre-built object is mutated for the very\nfirst time in the middle of an incremental marking phase. While the pre-built\nobject gets added to the root set just fine, and will get traced before the\nmarking phase ends, this is encoded slightly differently for pre-built objects,\ncompared to \"regular\" old objects. Therefore, the invariant checking code\nwrongly reported a black->white pointer in this situation.\nTo fix it I also wrote a unit test checking the problem, made sure that the GC\nhypothesis test also found the bug, and then fixed the wrong assertion to take\nthe color encoding of pre-built objects into account.\nThe bug managed to be invisible because we don't tend to turn on the GC\nassertions very often. We only do that when we find a GC bug, which is of\ncourse also when we need it the most to be correct.\nAcknowledgements\nThanks to Matti Picus, Max Bernstein, Wouter van Heyst for giving me feedback on drafts of the\npost. Thanks to Armin Rigo for reviewing the code and pointing out holes in my\nthinking. 
Thanks to the original reporters of the various forms of the bug,\nincluding Lily Foote, David Hewitt, Wenzel Jakob.", + "tags": "", + "url": "https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.html" + }, + { + "title": "PyPy v7.3.15 release", + "text": "PyPy v7.3.15: release of python 2.7, 3.9, and 3.10\nThe PyPy team is proud to release version 7.3.15 of PyPy.\nThis is primarily a bug-fix release, and includes work done to migrate PyPy to\nGit and Github.\nThe release includes three different interpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.9, which is an interpreter supporting the syntax and the features of\nPython 3.9, including the stdlib for CPython 3.9.18.\nPyPy3.10, which is an interpreter supporting the syntax and the features of\nPython 3.10, including the stdlib for CPython 3.10.13.\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases. It follows after 7.3.14 release on Dec 25, 2023\nWe recommend updating. You can find links to download the v7.3.15 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. 
PyPy has many layers and we need help with all of them: bug fixes,\nPyPy and RPython documentation improvements, or general help with\nmaking RPython's JIT even better.\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a HPy / CFFI / cppyy version of your library that would be performant\non PyPy. In any case, both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython.\nIt's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nWe provide binary builds for:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)\n64-bit ARM machines running Linux (aarch64).\nApple M1 arm64 machines (macos_arm64).\ns390x running Linux\n\n\nPyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM\n32 bit, but does not release binaries. Please reach out to us if you wish to\nsponsor binary releases for those platforms. Downstream packagers provide\nbinary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.\n\n\nWhat else is new?\nFor more information about the 7.3.15 release, see the full changelog.\nPlease update, and continue to help us make pypy better.\nCheers,\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2024/01/pypy-v7315-release.html" + }, + { + "title": "PyPy has moved to Git, GitHub", + "text": "PyPy has moved its canonical repo and issue tracker from\nhttps://foss.heptapod.net/pypy/pypy to https://github.com/pypy/pypy. Obviously,\nthis means development will now be tracked in Git rather than Mercurial.\nMotivation\nWe still feel Mercurial is a better version control system. The named branch\nmodel and user interface are superior. 
But\n\n\nfoss.heptapod.net is not well indexed in google/bing/duckduckgo\n search, so people find it harder to search for issues in the project.\n\n\nSince Heptapod has tightened its spam control, we get reports that\n users create issues only to have them flagged as spam.\n\n\nOpen Source has become synonymous with GitHub, and we are too small to\n change that.\n\n\nMuch of the current development comes as a reaction to fixing issues.\n Tracking interlocking issues is easier if all the code is on the same\n platform.\n\n\nThe FAQ\n presents two arguments against the move. Github notes\n solves much of point (1): the difficulty of discovering provenance of\n commits, although not entirely. But the main problem is point (2), it turns\n out that not moving to GitHub is an impediment to contribution and issue\n reporting.\n\n\nPeople who wish to continue to use Mercurial can use the same method below to\n push to GitHub.\n\n\nGitHub is more resource rich than foss.heptapod.net. We could add CI\n jobs to replace some of our aging buildbot\n infrastructure.\n\n\nMethod\nThe migration required two parts: migrating the code and then migrating the\nissues and merge requests.\nCode migration 1: code and notes\nI used a fork of git-remote-hg to\ncreate a local Git repo with all the changesets. Then I wanted to add a Git\nnote to each commit with the branch it came from. So I prepared a file with two\ncolumns: the Git commit hash, and the corresponding branch from Mercurial.\nMercurial can describe each commit in two ways: either the commit hash or by a\nnumber index. 
I used hg log to convert an index i to the Mercurial hash,\nand then git-hg-helper from git-remote-hg to convert the Mercurial hash to\na Git hash:\n$(cd pypy-git; git-hg-helper git-rev $(cd ../pypy-hg; hg log -r $i -T\"{node}\\n\"))\n\n\nThen I used hg log again to print the Mercurial branch for the index i:\n$(cd pypy-hg; hg log -r $i -T'{branch}\\n')\n\n\nPutting these two together, I could loop over all the commits by their\nnumerical index to prepare the file. Then I iterated over each line in the\nfile, and added the Git note. Since the git note add command works on the\ncurrent HEAD, I needed to checkout each commit in turn and then add the note:\ngit checkout -q && git notes --ref refs/notes/branch add -m branch:\n\n\nI could then use git push --all to push to GitHub.\nCode migration 2: prepare the branches\nPyPy has almost 500 open branches. The code migration created all the branch\nHEADs, but git push --all did not push them. I needed to check them out and\npush each one. So I created a file with all the branch names\ncd pypy-hg; hg branches | cut -f1 -d\" \" > branches.txt\n\n\nand then push each one to the GitHub repo\nwhile read branch; do git checkout branches/$branch && git push origin branches/$branch; done < branches.txt\n\n\nNote that the branches were named branches/XXX by the migration, not branch/XXX. This confuses the merge request migration, more about that later.\nIssue and merge request migration\nI used the solution from\nnode-gitlab-2-github which\nworked almost perfectly. It is important to do the conversion on a private\nrepo otherwise every mention of a successfully mapped user name notifies\nthe user about the transfer. This can be quite annoying for a repo the size of\nPyPy with 600 merge requests and over 4000 issues. Issues transferred without a\nproblem: the script properly retained the issue numbers. 
However the script\ndoes not convert the Mercurial hashes to Git hashes, so the bare hashes in\ncomments show up without a link to the commit. Merge requests are more of a problem:\n\nThe Mercurial named branch \"disappears\" once it is merged, so a merge request\n to a merged branch does not find the target branch name in Git. The\n conversion creates an issue instead with the label gitlab merge request.\nFor some reason, the branches created by git-remote-hg are called\n branches/XXX and not branch/XXX as expected by GitLab. This messes up the\n merge request/PR conversion. For some of the branches (open PRs and main\n target branches) I manually created additional branches without the es. The\n net result is that open merge requests became open PRs, merged merge requests\n became issues, and closed-not-merged merge requests were not migrated.\n\nLayered conversions\nPyPy already migrated once from Bitbucket to Heptapod. Many of the issues\nreflect the multiple transitions: they have lines like \"Created originally on\nBitbucket by XXX\" from the first transition, and an additional line \"In\nHeptapod\" from this transition.\nCredits\nWe would like to express our gratitude to the Octobus\nteam who support Heptapod. The transition from Bitbucket was quite an effort,\nand they have generously hosted our development since then. We wish them all\nthe best, and still believe that Mercurial should have \"won\".\nNext steps\nWhile the repo at GitHub is live, there are still a few more things we need to\ndo:\n\nDocumentation needs an update for the new repo and the build automation from\n readthedocs must be adjusted.\nThe wiki should be copied from Heptapod.\nbuildbot.pypy.org should also look at the new repo. 
I hope the code is up to\n the task of interacting with a Git repo.\nspeed.pypy.org tracks changes, it too needs to reference the new location\nTo keep tracking branches with Git notes on new commits, I activated a\n github action by Julian to\n add a Git branch note to each commit. Please see the README there for\n directions on using Git notes.\nSome of the merge requests were not migrated. If someone wants to, they could\n migrate those once they figure out the branch naming problems.\n\nAdditionally, now is the time for all of you to prove the move is worthwhile:\n\nStar the repo, let others know how to find it,\nHelp fix some of the open issues or file new ones,\nTake advantage of the more familiar workflow to get involved in the project,\nSuggest ways to improve the migration: are there things I missed or could\n have done better?\n\nHow will development change?\nHeptapod did not allow personal forks, so we were generous with a commit bit to\nthe main repo. Additionally, we (well, me) have been using a\ncommit-directly-to-main workflow. We will now be adopting a more structured\nworkflow. Please fork the repo and submit a pull request for any changes. We\ncan now add some pre-merge CI to check that the PR at least passes the first\nstage of translation. The live and active branches will be:\n\nmain: what was \"default\" in Mercurial, it is the Python2.7 interpreter and\n the base of the RPython interpreter,\npy3.9: the Python3.9 interpreter, which also includes all RPython changes\n from main. This is exactly like on Mercurial, and\npy3.10: the Python3.10 interpreter, which also includes all RPython changes\n from main and all bugfixes from py3.9. This is exactly like on Mercurial.\n\nWorking between the repos\nFinding commits\nIf you want to figure out how a Mercurial commit relates to a Git commit, you\ncan use git-hg-helper. You run it in the Git repo. 
It takes the full long\nhash from one repo and gives you the corresponding hash of the other repo:\n$ git-hg-helper git-rev d64027c4c2b903403ceeef2c301f5132454491df\n4527e62ad94b0e940a5b0f9f20d29428672f93f7\n$ git-hg-helper hg-rev 4527e62ad94b0e940a5b0f9f20d29428672f93f7\nd64027c4c2b903403ceeef2c301f5132454491df\n\n\nFinding branches\nBranches migrated from Mercurial will have a branches prefix, not branch.\nWhile GitLab uses branch for its prefix, the git-remote-hg script uses\nbranches. New work should be in a PR targeting main, py3.9 or py3.10.\nThanks for helping to make PyPy better.\nMatti\nUpdate\nIn the meantime we found out that unfortunately something went wrong in the\nmigration of the issues. The old issue\n3655 got lost in the\nmigration. This means that after number 3655 the numbers are different between\ngithub and heptapod, with heptapod being one larger. E.g. issue 3700 on\nheptapod is issue 3699 on\ngithub. We are investigating\noptions.", + "tags": "", + "url": "https://www.pypy.org/posts/2023/12/pypy-moved-to-git-github.html" + }, + { + "title": "PyPy v7.3.14 release", + "text": "PyPy v7.3.14: release of python 2.7, 3.9, and 3.10\nThe PyPy team is proud to release version 7.3.14 of PyPy.\nHighlights of this release are compatibility with HPy-0.9, cffi 1.16,\nadditional C-API interfaces, and more python3.10 fixes.\nThe release includes three different interpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.9, which is an interpreter supporting the syntax and the features of\nPython 3.9, including the stdlib for CPython 3.9.18.\nPyPy3.10, which is an interpreter supporting the syntax and the features of\nPython 3.10, including the stdlib for CPython 3.10.13.\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. 
This is a micro release, all APIs are compatible with the other 7.3\nreleases. It follows after 7.3.13 release on Sept 29, 2023.\nWe recommend updating. You can find links to download the v7.3.14 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. Since the last release we have contributions from three new\ncontributors. PyPy has many layers and we need help with all of them: bug\nfixes, PyPy and RPython documentation improvements, or general help\nwith making RPython's JIT even better.\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a HPy / CFFI / cppyy version of your library that would be performant\non PyPy. In any case, both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython\nIt's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nWe provide binary builds for:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)\n64-bit ARM machines running Linux (aarch64).\nApple M1 arm64 machines (macos_arm64).\ns390x running Linux\n\n\nPyPy support Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM\n32 bit, but does not release binaries. Please reach out to us if you wish to\nsponsor binary releases for those platforms. 
Downstream packagers provide\nbinary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.\n\n\nWhat else is new?\nFor more information about the 7.3.14 release, see the full changelog.\nPlease update, and continue to help us make pypy better.\nCheers,\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2023/12/pypy-v7314-release.html" + }, + { + "title": "PyPy v7.3.13 release", + "text": "PyPy v7.3.13: release of python 2.7, 3.9, and 3.10\nThe PyPy team is proud to release version 7.3.13 of PyPy.\nThis is primarily a security/bug-fix release. CPython released security\npatches, and this release also improves the ability to use type\nspecifications via PyType_FromSpec and friends. There are also some\nsmall speed-ups.\nThe release includes three different interpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.9, which is an interpreter supporting the syntax and the features of\nPython 3.9, including the stdlib for CPython 3.9.18.\nPyPy3.10, which is an interpreter supporting the syntax and the features of\nPython 3.10, including the stdlib for CPython 3.10.13. Note it requires at\nleast cython 0.29.35 or cython 3.0.0b3.\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases. It follows after 7.3.12 release on June 16, 2023.\nWe recommend updating. You can find links to download the v7.3.13 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. 
If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: bug fixes,\nPyPy and RPython documentation improvements, or general help with making\nRPython's JIT even better.\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a HPy / CFFI / cppyy version of your library that would be performant\non PyPy. In any case, both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython\nIt's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nWe provide binary builds for:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)\n64-bit ARM machines running Linux (aarch64).\nApple M1 arm64 machines (macos_arm64).\ns390x running Linux\n\n\nPyPy support Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM\n32 bit, but does not release binaries. Please reach out to us if you wish to\nsponsor binary releases for those platforms. 
Downstream packagers provide\nbinary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.\n\n\nWhat else is new?\nFor more information about the 7.3.13 release, see the full changelog.\nPlease update, and continue to help us make pypy better.\nCheers,\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2023/09/pypy-v7313-release.html" + }, + { + "title": "PyPy v7.3.12 release", + "text": "PyPy v7.3.12: release of python 2.7, 3.9, and 3.10.\nThe PyPy team is proud to release version 7.3.12 of PyPy.\nThis release includes a new string-to-int algorithm (also appearing in CPython\n3.12) that is faster than the older one; support for symlinks in Windows; and\nour first Python3.10 version.\nThe release includes three different interpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.9, which is an interpreter supporting the syntax and the features of\nPython 3.9, including the stdlib for CPython 3.9.17.\nPyPy3.10, which is an interpreter supporting the syntax and the features of\nPython 3.10, including the stdlib for CPython 3.10.12. This is our first\nrelease of 3.10, but based on past experience we are quite confident in\nits compatibility with upstream. Of course, we recommend testing your code\nwith this new version before putting it into production. Note it does\nrequire at least cython 0.29.35 or cython 3.0.0b3\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases. It follows after 7.3.11 release on Dec 29, 2022\nWe recommend updating. You can find links to download the v7.3.12 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. 
If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: bug fixes,\nPyPy and RPython documentation improvements, or general help with making\nRPython's JIT even better. Since the previous release, we have accepted\ncontributions from one new contributor, thanks for pitching in, and welcome\nto the project!\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a HPy / CFFI / cppyy version of your library that would be performant\non PyPy. In any case, both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.9 and\n3.10. It's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nWe provide binary builds for:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)\n64-bit ARM machines running Linux (aarch64).\nApple M1 arm64 machines (macos_arm64).\ns390x running Linux\n\n\nPyPy support Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM\n32 bit, but does not release binaries. Please reach out to us if you wish to\nsponsor binary releases for those platforms. 
Downstream packagers provide\nbinary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.\n\n\nWhat else is new?\nFor more information about the 7.3.12 release, see the full changelog.\nPlease update, and continue to help us make pypy better.\nCheers,\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2023/06/pypy-v7312-release.html" + }, + { + "title": "RPython-based emulator speeds up RISC-V simulation over 15x", + "text": "In cooperation with RISC-V International, who funded a part of this project,\nwe recently created a workflow to\nuse RPython to take a Sail RISC-V model and automatically create a RISC-V ISA\nemulator from it, which we call Pydrofoil. The simulator sped up booting a\nlinux emulator from 35 minutes (using the standard Sail-generated emulator in\nC) to 2 minutes, a speedup of 17.5x. More details about the process are in the\nRISC-V blog post.\nA few take-aways from the project:\n\nWhile PyPy has shown it can speed up generic python code about 4x, the\ntechnology behind PyPy can really shine in other areas.\nRPython is malleable and can be molded to many tasks, the RPython meta-JIT is\nvery flexible.\nA JIT is well-suited for the problem of emulation, because it can\nperform dynamic binary translation.\n\nPyPy can solve real world performance problems, even somewhat unusual ones.\nPlease get in touch and let us know how we can help you solve yours!", + "tags": "casestudy,performance", + "url": "https://www.pypy.org/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.html" + }, + { + "title": "Repeated string concatenation is quadratic in PyPy (and CPython)", + "text": "This is a super brief blog post responding to an issue that we got on the PyPy\nissue tracker. I am moving my response to the blog (with permission of the\nsubmitter) to have a post to point to, since it's a problem that comes up with\nsome regularity. 
It's also documented on our page of differences between PyPy\nand CPython but I thought an additional blog post might be good.\nThe issue pointed out that a small program that operates on strings is much\nslower on PyPy compared to CPython. The program is a solution for 2016's\nAdvent of Code Day 16 and looks like this:\ndef dragon(a):\n b = a[::-1].replace('0','r').replace('1','0').replace('r','1')\n return a+'0'+b\n\ndef diffstr(a):\n b = \"\"\n for i in range(0,len(a),2):\n b += ['0','1'][a[i] == a[i+1]]\n return b\n\ndef iterdiff(a):\n b = a\n while(len(b) % 2 == 0):\n b = diffstr(b)\n return b\n\nsize = 35651584\ninitstate = '10010000000110000'\nwhile(len(initstate) < size):\n initstate = dragon(initstate)\ninitstate = initstate[:size]\nprint(iterdiff(initstate))\n\nThe submitter pointed out, that the program is fast on CPython (~8s on my\nlaptop) and slow (didn't finish) on PyPy.\nThe reason for the performance difference is that += on strings in a loop\nhas quadratic complexity in PyPy, which is what diffstr does. To see the\nquadraticness, consider that to add a character at the end of the string, the\nbeginning of the string needs to be copied into a new chunk of memory. If the\nloop runs n times, that means there are\n1 + 2 + 3 + ... 
+ n = n * (n + 1) // 2\ncharacter copies.\nRepeated string concatenations are in principle also quadratic in CPython, but\nCPython has an optimization that makes them sometimes not quadratic, which is\nwhat makes this program not too slow in CPython.\nIn order to fix the problem on PyPy it's best to use a list for the string\nparts, which has the right amortized O(1) complexity for .append calls, and\nthen use str.join after the loop:\ndef diffstr(a):\n b = []\n for i in range(0,len(a),2):\n b.append(['0','1'][a[i] == a[i+1]])\n return \"\".join(b)\n\nWith this change the program becomes a little bit faster on CPython for me, and\non PyPy it stops being quadratic and runs in ~3.5s.\nIn general, it's best not to rely on the presence of this optimization in\nCPython either. Sometimes, a small innocent-looking change will break CPython's\noptimization. E.g. this useless change makes CPython also take ages:\ndef diffstr(a):\n b = \"\"\n for i in range(0,len(a),2):\n b += ['0','1'][a[i] == a[i+1]]\n c = b\n return b\n\nThe reason why this change breaks the optimization in CPython is that it only\ntriggers if the reference count of b is 1, in which case it uses realloc\non the string. The change is unrealistic of course, but you could imagine a\nrelated change that keeps an extra reference to b for a sensible reason.\nAnother situation in which the optimization doesn't work is discussed in this\nStackOverflow question with an answer by Tim Peters.\nIt's unlikely that PyPy will fix this. 
We had a prototype how to do it, but it\nseems very little \"production\" code uses += on strings in a loop, and the fix\nmakes the strings implementation quite a bit more complex.\nSo, in summary, don't use repeated concatenations in a loop!", + "tags": "performance", + "url": "https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html" + }, + { + "title": "PyPy v7.3.11 release", + "text": "PyPy v7.3.11: release of python 2.7, 3.8, and 3.9\nThe PyPy team is proud to release version 7.3.11 of PyPy. As could be expected,\nthe first release of macOS arm64 impacted the macOS x86-64 build, so this is\na bug release to restore the ability of macOS users to run PyPy on\nmacOS < 11.0. It also incorporates the latest CPython stdlib updates\nreleased the day after 7.3.10 went out, and a few more bug fixes. The release\nincludes three different interpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.8, which is an interpreter supporting the syntax and the features of\nPython 3.8, including the stdlib for CPython 3.8.16. Note we intend to drop\nsupport for this version in an upcoming release as soon as we release\nPython 3.10.\nPyPy3.9, which is an interpreter supporting the syntax and the features of\nPython 3.9, including the stdlib for CPython 3.9.16.\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases and follows quickly on the heels of the 7.3.10 release on Dec 6.\nWe recommend updating. You can find links to download the v7.3.11 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. 
If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: bug fixes,\nPyPy and RPython documentation improvements, or general help with making\nRPython's JIT even better. Since the previous release, we have accepted\ncontributions from one new contributor, thanks for pitching in, and welcome\nto the project!\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a HPy / CFFI / cppyy version of your library that would be performant\non PyPy.\nIn any case, both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and\n3.9. It's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nWe provide binary builds for:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)\n64-bit ARM machines running Linux (aarch64).\nApple M1 arm64 machines (macos_arm64).\ns390x running Linux\n\n\nPyPy support Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM\n32 bit, but does not release binaries. Please reach out to us if you wish to\nsponsor binary releases for those platforms. 
Downstream packagers provide\nbinary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.\n\n\nWhat else is new?\nFor more information about the 7.3.11 release, see the full changelog.\nPlease update, and continue to help us make pypy better.\nCheers,\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2022/12/pypy-v7311-release.html" + }, + { + "title": "Finding JIT Optimizer Bugs using SMT Solvers and Fuzzing", + "text": "In this blog post I want to describe a recent bug finding technique that I've\nadded to the PyPy JIT testing infrastructure. This technique uses the Z3\ntheorem prover to find bugs in the optimizer of PyPy's JIT, in particular its\ninteger operation optimizations. The approach is\nbased on things I have learned from John Regehr's blog (this post is a\ngood first one to read), Twitter, and on\nhis (et al) paper Alive2: Bounded Translation Validation for LLVM. The work\nwas triggered by a recent miscompilation bug my current bachelor student Nico\nRittinghaus found.\n\nBackground: Python Integers in the PyPy JIT\nThe optimizer of PyPy's JITs operates on traces, which are linear sequences of\ninstructions with guards. The instructions in the traces operate on different\nmachine-level data types, machine integers, doubles, pointers, bools, etc. In\nthis post we'll be mostly concerned with machine integers.\nTo give some wider context I'll explain a bit how Python ints in the user code\nrelate to the types that are used in traces when the PyPy Python implementation\nis used.\nWhen PyPy turns a regular Python 3 function into a trace, there is a lot of work\nhappening in the JIT frontend to try to observe and infer the types that the\nPython function concretely uses at runtime. The traces are generated under these\ntyping assumptions. Therefore, code that uses ints in the Python code can\ntypically be translated into traces that operate on machine integers. 
In order\nto make sure that the Python integer semantics are upheld, many of the\noperations in the traces need to check that the integer results of some\noperations still fit into a machine integer. If that is not the case (a rare\nsituation for most programs), the trace is left via a guard, execution falls\nback to the interpreter, and there a big integer representation is chosen for\nthe too big value (the big integer representation is done via a pointer and\nsome storage on the heap).\nAll of this machinery is not going to be too relevant for the rest of the\npost. For the post it's important to know that trace instructions operate on\nmachine integers and other low-level types, and some of the operations can\noptionally check whether the\nresults still fit into a machine integer. These trace operations are improved by\nthe optimizer, which tries to transform the trace into one that behaves the\nsame, but is less costly to execute.\n\n\nBackground: Bounds Analysis in PyPy's JIT\nThe optimizer of PyPy's JIT has an analysis based on abstract interpretation\nthat tries to find out whether the integer values stored in a variable are\nactually not using the full 64 bit (or 32 bit) range, but instead fit into some\nsmaller range. This means that for every integer variable x in a trace, the\nJIT compiler tracks upper and lower bounds of the runtime value of that\nvariable: a range [a, b] such that for every concrete runtime value v\nthat gets stored in variable x, a <= v <= b must be true.\na and b start out\nas the most general MININT and MAXINT, but sometimes there is extra\ninformation that makes it possible to improve these known bounds, and that is\noften useful to optimize the code.\nA typical example is that the JIT knows that the length of a string is\nnon-negative, so for this kind of code: x = len(s) where s is a string,\nx gets a range [0, MAXINT] assigned. 
With this information we could for\nexample remove a check x + 10 < 0 completely, because it can never be true.\nThe bounds information is useful for optimization, but the analysis of the\nbounds is also a source of bugs in the JIT, because the reasoning is often\nsubtle and easy to get wrong in corner cases. We already use a number of testing\ntechniques to try to make sure that it is correct. A simple one is\nproperty-based testing using Hypothesis on the operations on bounds. Even\nthough Hypothesis is fantastic, it unfortunately does not catch\nabsolutely all the bugs even if we'd like it to, as we'll see in the next\nsection.\n\n\nMotivation: A JIT Miscompilation\nI am currently supervising a Bachelor thesis by Nico Rittinghaus, who is\nextending the integer analysis in the JIT. He'll probably write a separate blog\npost about that soon. In the process of his work, the current bounds analysis\ncode got a lot of scrutiny, and we found out that one of the unit tests of the\nbounds analysis was actually incorrect, and the example code in that unit test\nwas optimized incorrectly. This case of incorrect optimization is not a big deal\nfor regular Python code, because it involved a \"wrapping integer addition\noperation\", i.e. one where overflowing results just wrap around to negative\nvalues. All the additions and other arithmetic operations that the PyPy Python\nfrontend generates actually have\noverflow checks (to be able to switch to a big integer representation if\nneeded).\nHowever, it's still possible to trigger the problem with the\n__pypy__.intop.int_add API which is a function that exposes wraparound\narithmetic on Python ints.\nHere's the miscompilation. 
The JIT optimizes the following function:\nimport __pypy__\n\ndef wrong(x):\n a = __pypy__.intop.int_add(x, 10)\n if a < 15:\n if x < 6:\n return 0\n return 1\n return 2\n\nInto the following code:\nimport __pypy__\n\ndef wrong(x):\n a = __pypy__.intop.int_add(x, 10)\n if a < 15:\n return 0\n return 2\n\nBasically the faulty reasoning of the JIT looks like this: if int_add(x, 10) < 15\nthen it must follow that x < 5, which is stronger than x < 6, so the\nsecond if is always true. This sounds good, but is actually wrong\nif the addition + 10 wrapped around. So if x == MAXINT, then\nint_add(x, 10) == MININT + 9 < 15. But MAXINT < 5 is not\ncorrect.\nNote how the same reasoning with overflow-checking addition is correct! If x +\n10 < 15 and the + didn't overflow, then indeed x < 6. And if your\nmind bends starting to think about all this, you understand some of the\ndifficulty of getting the JIT correct in this area.\n\n\nHow could we have avoided this bug?\nOne exercise I try to do after finding bugs is to reflect on ways that the\nbug could have been avoided. I think this is particularly important in the JIT,\nwhere bugs are potentially really annoying to find and can cause very strange\nbehaviour in basically arbitrary Python code.\nIt's easy to always answer this question with \"try to think more carefully\nwhen working\", but that approach cannot be relied on in complicated situations,\nbecause humans don't concentrate perfectly for long stretches of time.\nA situation-specific problem I identified was the bad design of the range analysis API.\nA range is not just represented by two numbers, instead it's two numbers\nand two bools that are supposed to represent that some operation did or did not\nunderflow/overflow. 
The meaning of these bools was quite hard to grasp and easy\nto get wrong, so probably they should never have been introduced in the first\nplace (and my bugfix indeed removed them).\nBut in the rest of this blog post I want to talk about another, systematic\napproach that can be applied to the problem of mis-optimizations of integer\noperations, and that is done by applying an SMT solver to the problem.\nAn SMT solver (Satisfiability Modulo Theories) is a tool that can be used to\nfind out whether mathematical formulas are \"satisfiable\", i.e. whether\nsome chosen set of inputs exists that will make the formulas evaluate to true. SMT solvers are\ncommonly used in a wide range of CS applications including program correctness\nproofs, program synthesis, etc. The most widely known one is probably Z3 by\nMicrosoft Research which has the nice advantage of coming with an easy-to-use\nPython binding.\nGoing into this I basically knew next to nothing about SMT solvers (despite\nhaving been embedded in a formal methods research group for years!) so it was an\ninteresting new world to learn about.\nAs briefly mentioned in the introduction, the approach I took followed a similar\n(but much more properly executed) one applied to LLVM operations, called\nAlive2. Krister Walfridsson has done similar work for GCC recently,\ndescribed on his blog.\n\n\nZ3 Proof of Concept\nThe first thing I did was to try to get Z3 to find the above bug, by encoding the\ninput program into an SMT formula by hand and trying to get Z3 to prove the condition\nthat the JIT thinks is always true. The Z3 code for this looks as follows:\nfrom z3 import BitVec, Implies, prove\nx = BitVec('x', 64)\na = x + 10\ncond1 = a < 15\ncond2 = x < 6\nprove(Implies(cond1, cond2))\n\nHere, x is defined to be a bit vector variable of width 64, which is a\ndatatype that can be used to represent bounded machine integers. 
Addition on\nbit vectors performs wraparound arithmetic, like the __pypy__.intop.int_add\ncall in the original code. The JIT optimized the second condition away, so\nessentially it was convinced that the first condition implies the second one.\nThe above snippet tries to get Z3 to confirm this.\nWhen run, the above program prints:\ncounterexample\n[x = 9223372036854775803]\nWhich shows the bug. As a small side-note, I thought it was cool that the\nprocess of \"proving\" something in Z3 basically means trying to find an example\nfor the negation of the formula. If no counterexample can be found for the\nnegation, the original formula is true. If the original formula turns out to be\nfalse (like here) we get a nice example that shows the problem to go with it.\nIt's not realistic to hand-translate all the hundreds of\nunit-tests into Z3 formulas and then ask Z3 to prove the optimizations. Instead,\nwe want to have a program that does this for us.\n\n\nSMT Checking of the JIT Optimizer\nWhat we want from this program is the following: given an unoptimized trace and\nits optimized version, we want to use Z3 to check whether the optimized trace\nbehaves identically to the unoptimized one. One question is what \"behaves\nidentically\" means. What we care about is the outputs of the trace being the\nsame values, no matter how they are computed. Also, for every guard we want to\nmake sure that it fails in identical ways in the optimized and unoptimized\nversions. A guard is only allowed to be optimized away if it can never fail.\nThe code that comes after a guard can assume that the guard has not failed,\nbecause otherwise execution would have left the trace. All of this should be\ntrue regardless of the values of the input variables of the trace.\nSo in order to check that the two traces are behaving identically, we do the\nfollowing:\n\nWe create Z3 variables for every input variable. 
We use the same input\nvariables both for the unoptimized as well as the optimized trace.\nWe align the two traces at the corresponding guards. Thankfully the optimizer\nkeeps track of which optimized guard corresponds to which unoptimized input\nguard.\nAll the operations before a guard are translated into Z3 formulas, for both\nversions of the trace.\nFor two corresponding guards, we ask Z3 to prove that the guard conditions are\nidentical.\nFor a guard that was optimized away we ask Z3 to prove that the condition is\nalways true.\nAfter a guard, we tell Z3 that from now on it can assume that the guard\ncondition is true.\nWe repeat this, guard for guard, until we reach the end of the trace. There,\nwe ask Z3 to prove that the output variables in the unoptimized trace and the\noptimized trace are identical (every trace can return one or many values).\n\nI implemented this, it's not a lot of code, basically a couple of hundred lines\nof (somewhat hacky) Python code. So far I only support integer\noperations. Here are some parts of the code to give you a flavor of what this\nlooks like.\nThis is the code that translates operations into Z3 formulas:\ndef add_to_solver(self, ops, state):\n for op in ops:\n if op.type != 'v': # is it an operation with a result\n res = self.newvar(op)\n else: # or does it return void\n res = None\n\n # ...\n\n # convert arguments\n if op.numargs() == 1:\n arg0 = self.convertarg(op, 0)\n elif op.numargs() == 2:\n arg0 = self.convertarg(op, 0)\n arg1 = self.convertarg(op, 1)\n\n # compute results\n if opname == \"int_add\":\n expr = arg0 + arg1\n elif opname == \"int_sub\":\n expr = arg0 - arg1\n elif opname == \"int_mul\":\n expr = arg0 * arg1\n elif opname == \"int_and\":\n expr = arg0 & arg1\n elif opname == \"int_or\":\n expr = arg0 | arg1\n elif opname == \"int_xor\":\n expr = arg0 ^ arg1\n\n # ... 
more operations, some shown below\n\n self.solver.add(res == expr)\n\nNew Z3 variables are defined by the helper function newvar, which adds the\noperation to a dictionary box_to_z3 mapping boxes (=variables) to Z3\nvariables. Due to the SSA property that traces have, a variable must be defined\nbefore its first use.\nHere's what newvar looks like (LONG_BIT is a constant that is either\n64 or 32, depending on the target architecture):\ndef newvar(self, box, repr=None):\n # ... some logic around making the string representation\n # somewhat nicer omitted\n result = z3.BitVec(repr, LONG_BIT)\n self.box_to_z3[box] = result\n return result\n\nThe convert method turns an operation argument (either a constant or a\nvariable) into a Z3 formula (either a constant bit vector or an already defined\nZ3 variable). convertarg is a helper function that takes an operation, reads\nits nth argument and converts it.\ndef convert(self, box):\n if isinstance(box, ConstInt):\n return z3.BitVecVal(box.getint(), LONG_BIT)\n return self.box_to_z3[box]\n\ndef convertarg(self, box, arg):\n return self.convert(box.getarg(arg))\n\nThe lookup of variables in box_to_z3 that convert does cannot fail,\nbecause the variable must have been defined before use.\nComparisons return the bit vector 0 or bit vector 1, we use a helper function\ncond to turn the Z3 truth value of the comparison into a bit vector:\ndef cond(self, z3expr):\n return z3.If(z3expr, TRUEBV, FALSEBV)\n\n\ndef add_to_solver(self, ops, state):\n # ... 
start as above\n\n # more cases\n elif opname == \"int_eq\":\n expr = self.cond(arg0 == arg1)\n elif opname == \"int_ne\":\n expr = self.cond(arg0 != arg1)\n elif opname == \"int_lt\":\n expr = self.cond(arg0 < arg1)\n elif opname == \"int_le\":\n expr = self.cond(arg0 <= arg1)\n elif opname == \"int_gt\":\n expr = self.cond(arg0 > arg1)\n elif opname == \"int_ge\":\n expr = self.cond(arg0 >= arg1)\n elif opname == \"int_is_true\":\n expr = self.cond(arg0 != FALSEBV)\n elif opname == \"uint_lt\":\n expr = self.cond(z3.ULT(arg0, arg1))\n elif opname == \"uint_le\":\n expr = self.cond(z3.ULE(arg0, arg1))\n elif opname == \"uint_gt\":\n expr = self.cond(z3.UGT(arg0, arg1))\n elif opname == \"uint_ge\":\n expr = self.cond(z3.UGE(arg0, arg1))\n elif opname == \"int_is_zero\":\n expr = self.cond(arg0 == FALSEBV)\n\n # ... rest as above\n\nSo basically for every trace operation that operates on integers I had to give a\ntranslation into Z3 formulas, which is mostly straightforward.\nGuard operations get converted into a Z3 boolean by their own helper function,\nwhich looks like this:\ndef guard_to_condition(self, guard, state):\n opname = guard.getopname()\n if opname == \"guard_true\":\n return self.convertarg(guard, 0) == TRUEBV\n elif opname == \"guard_false\":\n return self.convertarg(guard, 0) == FALSEBV\n elif opname == \"guard_value\":\n return self.convertarg(guard, 0) == self.convertarg(guard, 1)\n\n # ... some more exist, shown below\n\nSome operations are a bit trickier. An important example in the context of\nthis blog post are integer operations that check for overflow. The overflow\noperations return a result, but also a boolean whether the operation overflowed\nor not.\ndef add_to_solver(self, ops, state):\n\n # ... 
more cases\n\n elif opname == \"int_add_ovf\":\n expr = arg0 + arg1\n m = z3.SignExt(LONG_BIT, arg0) + z3.SignExt(LONG_BIT, arg1)\n state.no_ovf = m == z3.SignExt(LONG_BIT, expr)\n elif opname == \"int_sub_ovf\":\n expr = arg0 - arg1\n m = z3.SignExt(LONG_BIT, arg0) - z3.SignExt(LONG_BIT, arg1)\n state.no_ovf = m == z3.SignExt(LONG_BIT, expr)\n elif opname == \"int_mul_ovf\":\n expr = arg0 * arg1\n m = z3.SignExt(LONG_BIT, arg0) * z3.SignExt(LONG_BIT, arg1)\n state.no_ovf = m == z3.SignExt(LONG_BIT, expr)\n\n # ...\n\nThe boolean is computed by comparing the result of the bit vector operation with\nthe result of converting the input bit vectors into an abstract (arbitrary\nprecision) integer and the result back to bit vectors. Let's go through the\naddition case step by step, the other cases work analogously.\nThe addition in the first elif that computes expr is an addition on bit\nvectors, therefore it is performing wraparound arithmetic.\nz3.SignExt(LONG_BIT, arg0) sign-extends arg0 from a bit vector of\nLONG_BIT bits to an abstract, arbitrary precision integer. The addition in\nthe second line is therefore an addition between abstract integers, so it will\nnever overflow and just compute the correct result as an integer.\nThe condition to check for overflow is now: if the results of the two different\nways to do the addition are the same, then overflow did not occur. So in order\nto compute state.no_ovf in the addition case the\ncode converts the result of the bit vector wraparound addition to\nan abstract integer (using SignExt again), and then compares that to the integer\nresult.\nThis boolean can then be checked by the guard operations guard_no_overflow\nand guard_overflow.\ndef guard_to_condition(self, guard, state):\n\n # ... more cases\n\n elif opname == \"guard_no_overflow\":\n assert state.no_ovf is not None\n return state.no_ovf\n elif opname == \"guard_overflow\":\n assert state.no_ovf is not None\n return z3.Not(state.no_ovf)\n\n # ... 
more cases\n\n\n\nFinding the Bug, Again\nLet's actually make all of this more concrete by applying it to the trace of our\noriginal bug. The input trace and the incorrectly optimized trace for that look\nlike this (differences highlighted):\n# input # optimized\n[i0] [i0]\ni1 = int_add(i0, 10) i1 = int_add(i0, 10)\ni2 = int_lt(i1, 15) i2 = int_lt(i1, 15)\nguard_true(i2) guard_true(i2)\ni3 = int_lt(i0, 6) jump(0)\nguard_true(i3)\njump(0)\n\nNote that the trace represents just one of the paths through the control flow\ngraph of the original function, which is typical for tracing JITs (the other\npaths could incrementally get added later).\nThe first guards in both these traces correspond to each other, so the first\nchunks to check are the first three operations (lines 1-4). Those operations\ndon't get changed by the optimizer at all.\nThese two identical traces get translated to the following Z3 formulas:\ni1unoptimized == input_i0 + 10\ni2unoptimized == If(i1unoptimized < 15, 1, 0)\ni1optimized == input_i0 + 10\ni2optimized == If(i1optimized < 15, 1, 0)\n\nTo check that the two corresponding guards are the same, the solver is asked to\nprove that (i2unoptimized == 1) == (i2optimized == 1). This is\ncorrect, because the formulas for i2unoptimized and i2optimized are\ncompletely identical.\nAfter checking that the guards behave the same, we add the knowledge to the\nsolver that the guards passed. 
So the Z3 formulas become:\ni1unoptimized == input_i0 + 10\ni2unoptimized == If(i1unoptimized < 15, 1, 0)\ni1optimized == input_i0 + 10\ni2optimized == If(i1optimized < 15, 1, 0)\ni2unoptimized == 1\ni2optimized == 1\n\nNow we continue with the remaining operations of the two traces (lines 6-8).\nWe start by adding the int_lt operation in the unoptimized trace to the Z3\nformulas:\n...\ni3unoptimized == If(input_i0 < 6, 1, 0)\n\nBecause the second guard was optimized away, we need to ask Z3 to prove that\ni3unoptimized == 1 is always true, which fails and gives the following\ncounterexample:\ninput_i0 = 9223372036854775800\ni1unoptimized = 9223372036854775810\ni2unoptimized = 0\ni1optimized = 9223372036854775810\ni2optimized = 1\ni3unoptimized = 0\n\nThus demonstrating the bug. The fact that the Z3-based equivalence check also\nmanaged to find the original motivating bug without manually translating it to\na formula is a good confirmation that the approach works.\n\n\nSecond bug\nSo with this code I applied the Z3-based equivalence check to all our optimizer\nunit tests. In addition to the bug we've been discussing the whole post, it also\nfound another buggy test! I had found it too by hand by staring at all the tests\nin the process of writing all the Z3 infrastructure, but it was still a good\nconfirmation that the process worked. This bug was in the range analysis for\nint_neg, integer negation. It failed to account for the fact that -MININT == MININT\nand therefore did a mis-optimization along the following lines:\nimport __pypy__\n\ndef wrong(x):\n a = __pypy__.intop.int_sub(0, x)\n if a < 0:\n if x > 0:\n return 0\n return 1\n return 2\n\nWhich was wrongly optimized into:\nimport __pypy__\n\ndef wrong(x):\n a = __pypy__.intop.int_sub(0, x)\n if a < 0:\n return 0\n return 2\n\nThis is wrong precisely for x == MININT.\n\n\nGenerating Random Traces\nThese two bugs were the only two that the Z3 checker found for existing unit\ntests. 
To try to find some more bugs I combined PyPy's existing random trace\ngenerator with the Z3 optimization checker. The random trace generator has so\nfar been mostly used to find bugs in the machine code backends, particularly\nalso in the register allocator. So far we haven't used it with our optimizer,\nbut my experiments show that we should have!\nI'm going to describe a little bit how the random trace generator works. It's\nactually not that complicated, but there's one neat trick to it.\nThe basic idea is straightforward, it starts out with an empty trace with a\nrandom number of input variables. Then it adds some number of operations to the\ntrace, either regular operations or guards. Every operation takes already\nexisting variables as input.\nThe neat trick is that our random trace generator keeps a concrete random\nexample value for every one of the input variables, and an example result for\nevery operation. In this way, it is possible to generate guards that are\nconsistent with the example values to ensure that running the trace to its end\nis possible with at least one set of values.\nHere's an example random trace that is generated, together with the random\nexample inputs and the results of every operation at the end of every line:\n[i0, i1, i2, i3, i4, i5] # example values: 9, 11, -8, -95, 46, 57\ni6 = int_add_ovf(i3, i0) # -86\nguard_no_overflow()\ni7 = int_sub(i2, -35/ci) # 27\ni8 = uint_ge(i3, i5) # 1\nguard_true(i8)\ni9 = int_lt(i7, i8) # 0\ni10 = int_mul_ovf(34/ci, i7) # 918\nguard_no_overflow()\ni11 = int_and(i10, 63/ci) # 22\ni12 = int_rshift(i3, i11) # -1\ni13 = int_is_zero(i7) # 0\ni14 = int_is_true(i13) # 0\nguard_false(i13)\ni15 = int_lt(i8, i4) # 1\ni16 = int_and(i6, i0) # 8\ni17 = uint_ge(i6, -6/ci) # 0\nfinish()\nNote how every guard generated is true for the example values.\nI have been running this combination of random trace generation and Z3 checking\nfor many nights and it has found some bugs, which I'll describe in the 
next\nsection. It should probably be run for a lot longer, but still a useful\nexercise already.\nIn this mode, I'm giving every Z3 call a time limit to make sure that the random\ntests don't just take arbitrarily long. This means that asking Z3 to prove\nsomething can have three outcomes, either it's proved, or Z3 finds a\ncounterexample, or Z3 times out.\n\n\nBugs Found\nIn addition to the two bugs I've already described, I'll briefly list the\nadditional bugs that were found by optimizing random traces and then trying to\nprove the equivalence with Z3.\nMost of the bugs were actually identified by optimizing random traces alone, not\nby the Z3 component. They manifested as assert failures in the JIT compiler.\n\nThe JIT concluded after 12 == int_mul(x, 12) that x == 1, which is\nincorrect if overflow occurred (a counterexample is 0x8000000000000001).\nAn amusing bug, where from 0 == int_lshift(0x1000000000000000, x) with\n0 <= x <= 15, the JIT concluded that 0x1000000000000000 == 0,\ntriggering an assert. This wrong conclusion was again caused by not taking the\npossibility of overflow into account.\nA corner case in an optimization for chained integer additions with a\nconstant, where in complex enough expressions, the wrong IR API was used\n(which works correctly in simple cases). Again, this triggered an assert.\n\nThis shows that we should have been fuzzing our JIT optimizer already (not a\nsurprising observation in hindsight, fuzz all the things!).\nThankfully, there was also one further bug that really failed in the Z3\nverifier. 
It's a bug in common subexpression elimination / arithmetic\nsimplification, which again does not take overflow correctly into account.\nThe buggy trace looks like this (unfortunately it's not easily possible to show\nthis bug in Python code).\n[a, b]\nc = int_add(a, b)\nr = int_sub_ovf(c, b)\nguard_no_ovf()\nfinish(r)\n\nThis was optimized to:\n[a, b]\nfinish(a)\n\nWhich is incorrect, because the guard can fail given the right inputs.\nBut the optimizer concluded that the subtraction is safe, because it's the\ninverse of an earlier addition, not taking into account that this earlier\naddition can have overflowed.\nNote that a related optimization is actually correct. Given this code:\n[a, b]\nc = int_add_ovf(a, b)\nguard_no_ovf()\nr = int_sub(c, b)\nfinish(r)\n\nIt can be optimized to:\n[a, b]\nc = int_add_ovf(a, b)\nguard_no_ovf()\nfinish(a)\n\n\n\nFuture Work and Conclusion\nIn the current form the Z3 checker is only a start, even though it has already\nbeen concretely useful. There are various directions into which we could extend\nit. In addition to generating random tests completely from scratch, we could also\nstart from the existing manually written unit-tests and randomly mutate those.\nI also want to extend the Z3 checker with support for more operations, heap\noperations in particular (but it's not quite clear to me how to model garbage\ncollection).\nI also want to try to switch the code away from the Z3 API and use the more\ngeneral smtlib interface directly, in order to be able to use other SMT\ncheckers than Z3, eg CVC4.\nBut all in all this was a fun and not too hard way to find a bunch of bugs in\nour optimizer! And the infrastructure is now in place, which means that we run\nsome random test cases every time we execute our tests. This is going to be\nparticularly useful when we do further work on the integer reasoning of the JIT\n(like Nico is doing, for example). 
As of time of writing of this post, all the\nbugs mentioned have been fixed and the Z3 code has landed on the default branch\nand runs as part of PyPy's CI infrastructure.\n\n\nAcknowledgements\nThanks to Saam Barati, Max Bernstein, Joshua Schmidt and Martin\nBerger, for great feedback on drafts of this post!", + "tags": "jit,testing", + "url": "https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html" + }, + { + "title": "PyPy v7.3.10 release", + "text": "PyPy v7.3.10: release of python 2.7, 3.8, and 3.9\nThe PyPy team is proud to release version 7.3.10 of PyPy. We have some nice\nspeedups and bugfixes we wish to share. The release includes three different\ninterpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.8, which is an interpreter supporting the syntax and the features of\nPython 3.8, including the stdlib for CPython 3.8.15.\nPyPy3.9, which is an interpreter supporting the syntax and the features of\nPython 3.9, including the stdlib for CPython 3.9.15. We have gained\nconfidence in the stability of this version, and are removing the \"beta\"\nlabel.\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases. Highlights of the release, since the release of 7.3.9 in March 2022\ninclude:\n\n\nA release of Apple Silicon M1 arm64 versions. This work was sponsored by\nan anonymous donor and is tested on our buildbots.\nMany improvements to the basic interpreter to make it 15-20% faster\nThe conda-forge community has built over 1000 packages for PyPy3.8 and 3.9,\nmaking it easier than ever to use PyPy.\nUpdate the packaged OpenSSL to 1.1.1s, sqlite3 to 3.39.4, and apply\napplicable security fixes from CPython 3.9.15 to PyPy2.7\nUpdate the HPy backend in PyPy3.8 and PyPy3.9 to 0.0.4\n\n\nWe recommend updating. 
You can find links to download the v7.3.10 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: bug fixes,\nPyPy and RPython documentation improvements, or general help with making\nRPython's JIT even better. Since the previous release, we have accepted\ncontributions from five new contributors, thanks for pitching in, and welcome\nto the project!\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a HPy / CFFI / cppyy version of your library that would be performant\non PyPy.\nIn any case, both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and\n3.9. It's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nWe provide binary builds for:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)\n64-bit ARM machines running Linux (aarch64).\nApple M1 arm64 machines (macos_arm64).\ns390x running Linux\n\n\nPyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM\n32 bit, but does not release binaries. Please reach out to us if you wish to\nsponsor binary releases for those platforms. 
Downstream packagers provide\nbinary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.\n\n\nWhat else is new?\nFor more information about the 7.3.10 release, see the full changelog.\nPlease update, and continue to help us make pypy better.\nCheers,\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2022/12/pypy-v7310-release.html" + }, + { + "title": "PyPy and conda-forge", + "text": "You can use PyPy as your python interpreter in a conda environment. The\nconda-forge team has graciously provided this service.\nThe conda-forge tips-and-tricks\npage says:\n\nThe conda-forge channel supports creating and installing packages into\nenvironments using the PyPy interpreter. Many packages are already available.\nYou need to enable the conda-forge channel and use the pypy identifier when\ncreating your environment:\n\n $ conda create -c conda-forge -n my-pypy-env pypy python=3.8\n $ conda activate my-pypy-env\n\n\n\nCurrently supported python versions are 3.8 and 3.9. Support for pypy3.7 has\nbeen dropped. While you can still create a python 3.7 environment, you\nwill not be getting updates as new package versions are released (including\npypy itself).\nIf you are using defaults as a low priority channel, then you need to use\nstrict channel priority as the metadata in defaults has not been patched yet\nwhich allows cpython extension packages to be installed alongside pypy.\n\n $ conda config --set channel_priority strict\n\n\nThe work required some out-of-the-box thinking on the part of conda-forge since\nthey needed to add the idea of a pypy identifier to the python version and\nthe whole conda team has been very supportive of the effort needed. 
Binary\npackages are on offer for the usual platforms:\n\nx86_64 windows, macos, linux\nppc64le and aarch64 linux.\n\nThere are currently over 1000 packages available for download via the\nconda-forge channel, and more are being added as the kind package maintainers\nwork around various differences between CPython and PyPy. Please let us know if\nyour favorite package is not supported.", + "tags": "extension modules", + "url": "https://www.pypy.org/posts/2022/11/pypy-and-conda-forge.html" + }, + { + "title": "The PyPy Blog Turns 15 Years", + "text": "Exactly 15 years ago today we wrote the first blog post on the PyPy blog!\nOver the years, we have written 423 posts, from the shortest to the\nlongest. In 2021 we moved from blogger to our own domain.\nThe topics over the years varied widely, we published release announcements;\nroadmaps; JIT, GC and STM updates; benchmarks; sprint, trip and\nconference reports; technical deep dives; case studies; april fool's\njokes; research projects; other languages using RPython; finished PhD,\nBachelor and Master theses; pictures:\n\n\n\nand diagrams:\n\n\n\nQuite a number of blog posts were very early iterations of papers that we\npublished later, here are a few that I can remember:\n\nApplying a Tracing JIT to an Interpreter became Tracing the meta-level:\nPyPy's tracing JIT compiler at ICOOOLPS 2009, by far our most successful\npaper.\nEscape Analysis in PyPy's JIT became Allocation removal by partial\nevaluation in a tracing JIT at PEPM 2010.\nControlling the Tracing of an Interpreter With Hints was a draft of the\npaper Runtime feedback in a meta-tracing JIT for efficient dynamic\nlanguages at ICOOOLPS 2011\nUsing Escape Analysis Across Loop Boundaries for Specialization was the\nnucleus of Loop-aware optimizations in PyPy's tracing JIT at DLS 2012.\nList Strategies was eventually turned into the paper Storage strategies\nfor collections in dynamically typed languages at OOPSLA 2013.\n\n\nGreatest Hits\nIn terms of visitors, 
the top five posts on the old blog were \u2013 on the new blog\nwe simply don't have stats (yet?):\n\nLet's remove the global interpreter lock\nTutorial: Writing an Interpreter with PyPy, Part 1\nPyPy's new JSON parser\nPyPy gets funding from Mozilla for Python 3.5 support\nHow to make your code 80 times faster\n\nThe number of posts per year developed like this:\n\nThe most prolific authors are:\n\nMaciej Fija\u0142kowski\nCarl Friedrich Bolz-Tereick\nArmin Rigo\nAntonio Cuni\nMatti Picus\n\nSeveral blog posts have made it to the Hacker News front page, three of them to\nnumber 1:\n\nPyPy-STM: first \u201cinteresting\u201d release (discussion)\nLet's Remove the Global Interpreter Lock (discussion)\nInside cpyext: Why emulating CPython C API is so Hard (discussion)\n\n\n\nPersonal Favourites\nWhile looking through the posts, there were a few that stood out to me in some\nway, so here's a subjective list of ones that I had fun looking at again:\n\n2008: Sprint Discussions: JIT Generator Planning\n2009: PyPy gets a new compiler\n2010: Oh, and btw: PyPy gets funding through \"Eurostars\"\n2011: Realtime image processing in Python\n2012: Architecture of Cppyy\n2013: 10 years of PyPy\n2014: PyPy IO Improvements\n2015: Automatic SIMD vectorization support in PyPy\n2016: PyPy Enterprise Edition\n2017: Async HTTP benchmarks on PyPy3\n2018: Improving SyntaxError in PyPy\n2018: The First 15 Years of PyPy \u2014 a Personal Retrospective\n2019: PyPy for low-latency systems\n2020: PyPy and CFFI have moved to Heptapod\n2021: Some Ways that PyPy uses Graphviz\n\nWe'd like to thank our authors, guest authors, commenters, users and readers who\nhave stuck with us through one and a half decades! 
If there's any particular\ntopics you would like to read something about, or any guest posts you'd like to\nwrite, let us know!", + "tags": "meta", + "url": "https://www.pypy.org/posts/2022/10/blog-15-years.html" + }, + { + "title": "Allocation Removal in the Toy Optimizer", + "text": "One of the workhorse optimizations of RPython's tracing JIT is allocation\nremoval, which removes short-lived object allocation from traces. Many Python\nprograms create a lot of objects that only live for a short time, and whose\nlifespan is fully predictable (common examples are integer and float boxes, but\nalso tuples, frames, intermediate string results, etc). Allocation removal will\ntry (and very often succeed) to remove these allocations from traces. In\nthis blog post I want to show a toy version of how allocation removal is\nimplemented.\nIn the previous blog post of this series I showed the complete code for\nwriting a toy one-pass optimizer that does constant folding, common\nsubexpression elimination and strength reduction. In this\nsecond post, I want to use allocation removal as a more advanced optimization\npass. The basic optimization framework is the same, we will use the same\ndatastructures for intermediate representation and also keep using the same\nunion find data structure to store equivalences between IR operations. Here's
Here's\nthe infrastructure code from the last post:\nimport pytest\nfrom typing import Optional, Any\n\n\nclass Value:\n def find(self):\n raise NotImplementedError(\"abstract\")\n\n def _set_forwarded(self, value):\n raise NotImplementedError(\"abstract\")\n\n\nclass Operation(Value):\n def __init__(\n self, name: str, args: list[Value]\n ):\n self.name = name\n self.args = args\n self.forwarded = None\n self.info = None\n\n def __repr__(self):\n return (\n f\"Operation({self.name}, \"\n f\"{self.args}, {self.forwarded}, \"\n f\"{self.info})\"\n )\n\n def find(self) -> Value:\n op = self\n while isinstance(op, Operation):\n next = op.forwarded\n if next is None:\n return op\n op = next\n return op\n\n def arg(self, index):\n return self.args[index].find()\n\n def make_equal_to(self, value: Value):\n self.find()._set_forwarded(value)\n\n def _set_forwarded(self, value: Value):\n self.forwarded = value\n\n\nclass Constant(Value):\n def __init__(self, value: Any):\n self.value = value\n\n def __repr__(self):\n return f\"Constant({self.value})\"\n\n def find(self):\n return self\n\n def _set_forwarded(self, value: Value):\n assert (\n isinstance(value, Constant)\n and value.value == self.value\n )\n\nclass Block(list):\n def opbuilder(opname):\n def wraparg(arg):\n if not isinstance(arg, Value):\n arg = Constant(arg)\n return arg\n def build(self, *args):\n # construct an Operation, wrap the\n # arguments in Constants if necessary\n op = Operation(opname,\n [wraparg(arg) for arg in args])\n # add it to self, the basic block\n self.append(op)\n return op\n return build\n\n # a bunch of operations we support\n add = opbuilder(\"add\")\n mul = opbuilder(\"mul\")\n getarg = opbuilder(\"getarg\")\n dummy = opbuilder(\"dummy\")\n lshift = opbuilder(\"lshift\")\n # some new one for this post\n alloc = opbuilder(\"alloc\")\n load = opbuilder(\"load\")\n store = opbuilder(\"store\")\n print = opbuilder(\"print\")\n\ndef bb_to_str(bb: Block, varprefix: str = \"var\"):\n def 
arg_to_str(arg: Value):\n if isinstance(arg, Constant):\n return str(arg.value)\n else:\n return varnames[arg]\n\n varnames = {}\n res = []\n for index, op in enumerate(bb):\n var = f\"{varprefix}{index}\"\n varnames[op] = var\n arguments = \", \".join(\n arg_to_str(op.arg(i))\n for i in range(len(op.args))\n )\n strop = f\"{var} = {op.name}({arguments})\"\n res.append(strop)\n return \"\\n\".join(res)\n\nThere are two changes to the code from the last post: Operation instances\nhave a new .info field, which is set to None by default. We will learn\nhow the info field is used a bit further down. Also, we define some new\noperations.\n\nInterpreter\nIn this post we will mainly concern ourselves with optimizing\nprograms that allocate memory. We assume that our language is garbage collected\nand memory safe. The new operations that we will optimize are alloc\n(allocates some new object), store (stores a value into a fixed field of an\nobject), load (loads the value from a field in the object).\nWe are leaving out a lot of details of a \"real\" system here, usually an\nalloc operation would get some extra information, for example the type of\nthe freshly allocated object or at least its size. load and store would\ntypically have some kind of field offset and maybe some information about the\nfield's type\nHere's a simple program that uses these operations:\nvar0 = getarg(0)\nobj0 = alloc()\nstore(obj0, 0, var0)\nvar1 = load(obj0, 0)\nprint(var1)\nThe code allocates a new object obj0, stores var0 into field 0 of\nthe object, the loads the same field and prints the result of the load.\nBefore we get started in writing the optimizer for these operations, let's try\nto understand the semantics of the new operations a bit better. 
To do this, we\ncan sketch a small interpreter for basic blocks, supporting only getarg,\nalloc, store, load, print:\ndef test_interpret():\n bb = Block()\n var0 = bb.getarg(0)\n obj = bb.alloc()\n sto = bb.store(obj, 0, var0)\n var1 = bb.load(obj, 0)\n bb.print(var1)\n assert interpret(bb, 17) == 17\n\nclass Object:\n def __init__(self):\n self.contents: dict[int, Any] = {}\n\n def store(self, idx : int, value : Any):\n self.contents[idx] = value\n\n def load(self, idx : int):\n return self.contents[idx]\n\ndef get_num(op, index=1):\n assert isinstance(op.arg(index), Constant)\n return op.arg(index).value\n\ndef interpret(bb : Block, *args : tuple[Any]):\n def argval(op, i):\n arg = op.arg(i)\n if isinstance(arg, Constant):\n return arg.value\n else:\n assert isinstance(arg, Operation)\n return arg.info\n\n for index, op in enumerate(bb):\n if op.name == \"getarg\":\n res = args[get_num(op, 0)]\n elif op.name == \"alloc\":\n res = Object()\n elif op.name == \"load\":\n fieldnum = get_num(op)\n res = argval(op, 0).load(fieldnum)\n elif op.name == \"store\":\n obj = argval(op, 0)\n fieldnum = get_num(op)\n fieldvalue = argval(op, 2)\n obj.store(fieldnum, fieldvalue)\n # no result, only side effect\n continue\n elif op.name == \"print\":\n res = argval(op, 0)\n print(res)\n return res\n else:\n raise NotImplementedError(\n f\"{op.name} not supported\")\n op.info = res\n\nThe interpreter walks the operations of a block, executing each one in turn. It\nuses the info field to store the result of each already executed\nOperation. In this interpreter sketch we stop at the first print that\nwe execute and return its argument for the simple but bad reason that it makes\ntest_interpret easier to write.\nObjects in the interpreter are represented using a class Object, which\nstores the object's field into a Python dictionary. 
As written above, this is a\nsimplification, in a real system the alloc operation might for example take\nsome kind of type as an argument, that describes which kinds of fields an\nobject has and how they are laid out in memory, which would allow more\nefficient storage of the content. But we don't want to care about this level of\ndetail in the post, so using a dict in the interpreter is good enough.\n\n\nVersion 1: Naive Attempt\nIn many programs, some allocated objects don't live for very long and have a\ncompletely predictable lifetime. They get allocated, used for a while, and then\nthere is no way to reference them any more, so the garbage collector will\nreclaim them. The very first example block had such an allocation:\nvar0 = getarg(0)\nobj0 = alloc()\nstore(obj0, 0, var0)\nvar1 = load(obj0, 0)\nprint(var1)\nHere obj0 is written to, then read from, and then it's no longer used. We\nwant to optimize such programs to remove this alloc operation. The optimized\nversion of this program would look like this:\nvar0 = getarg(0)\nprint(var0)\nThe alloc, store and load operations have been completely removed.\nThis is a pretty important optimization for PyPy's JIT: Allocations, memory\nreads and writes are quite costly and occur a lot in Python, so getting rid\nof as many of them as possible is instrumental for performance.\nImplementing the optimization is not a lot of code! However, understanding all\nthe corner cases of the\noptimization and making sure that the resulting program behaves correctly is not\ncompletely trivial. Therefore we will develop the optimization step by step, in\na test driven fashion: I will start each section with a new test that shows a\nbug in the version of the optimization that we have so far.\nLet's start in a really naive way. 
Here's the first test we would like to\npass, using the example program above:\ndef test_remove_unused_allocation():\n bb = Block()\n var0 = bb.getarg(0)\n obj = bb.alloc()\n sto = bb.store(obj, 0, var0)\n var1 = bb.load(obj, 0)\n bb.print(var1)\n opt_bb = optimize_alloc_removal(bb)\n # the virtual object looks like this:\n # obj\n # \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n # \u2502 0: var0 \u2502\n # \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = print(optvar0)\"\"\"\n\nWe will define a class VirtualObject that is basically identical to\nObject above. But it will not be used by the interpreter, instead we will\nuse it during optimization.\nclass VirtualObject:\n def __init__(self):\n self.contents: dict[int, Value] = {}\n\n def store(self, idx, value):\n self.contents[idx] = value\n\n def load(self, idx):\n return self.contents[idx]\n\nThe structure of the optimizer is going to be like those in the first blog post.\nThe optimizer makes a single pass over all operations. It removes some and\nemits others.\nThis first version of the allocation removal optimizer is going to be extremely\noptimistic. It simply assumes that all the allocations in the program can be\noptimized away. That is not realistic in practice. We will have to\nrefine this approach later, but it's a good way to start. That means whenever\nthe optimizer sees an alloc operation, it removes it and creates a\nVirtualObject object which stores the information that is known during\noptimization about the result of the alloc. 
Like in the interpreter, the\nVirtualObject is stored in the .info field of the Operation instance\nthat represents the alloc.\nWhen the optimizer sees a store operation, it will also remove it and\ninstead execute the store by calling the VirtualObject.store method.\nHere is one important difference between the interpreter and the optimizer: In\nthe interpreter, the values that were stored into an Object (and thus\nput into the object's .contents dictionary) were runtime values, for\nexample integers or other objects. In the optimizer however, the\nfields of the VirtualObject store Value instances, either Constant\ninstances or Operation instances.\nWhen the optimizer sees a load operation, it also removes it, and replaces\nthe load with the Operation (or Constant) that is stored in the\nVirtualObject at that point:\ndef optimize_alloc_removal(bb):\n opt_bb = Block()\n for op in bb:\n if op.name == \"alloc\":\n op.info = VirtualObject()\n continue\n if op.name == \"load\":\n info = op.arg(0).info\n field = get_num(op)\n op.make_equal_to(info.load(field))\n continue\n if op.name == \"store\":\n info = op.arg(0).info\n field = get_num(op)\n info.store(field, op.arg(2))\n continue\n opt_bb.append(op)\n return opt_bb\n\nThis is the first version of the optimization. It doesn't handle all kinds of\ndifficult cases, and we'll have to do something about its optimism.\nBut, already in this minimalistic form, we can write a slightly more complicated\ntest with two allocations, one object pointing to the other. 
It works correctly\ntoo, both allocations are removed:\ndef test_remove_two_allocations():\n bb = Block()\n var0 = bb.getarg(0)\n obj0 = bb.alloc()\n sto1 = bb.store(obj0, 0, var0)\n obj1 = bb.alloc()\n sto2 = bb.store(obj1, 0, obj0)\n var1 = bb.load(obj1, 0)\n var2 = bb.load(var1, 0)\n bb.print(var2)\n # the virtual objects look like this:\n # obj0\n # \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n # \u2502 0: \u2577 \u2502\n # \u2514\u2500\u2500\u2500\u2500\u253c\u2500\u2518\n # \u2502\n # \u25bc\n # obj1\n # \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n # \u2502 0: var0 \u2502\n # \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n # therefore\n # var1 is the same as obj0\n # var2 is the same as var0\n opt_bb = optimize_alloc_removal(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = print(optvar0)\"\"\"\n\n\n\nVersion 2: Re-Materializing Allocations\nTo make it easier to talk about how the optimizer operates, let's introduce\nsome terminology. As already seen by the choice\nof the class name VirtualObject, we will call an object virtual if the\noptimizer has optimized away the alloc operation that creates the object.\nOther objects are equivalently not virtual, for example those that have\nexisted before we enter the current code block.\nThe first problem that we need to fix is the assumption that every\nallocation can be removed. So far we only looked at small programs where every\nallocation could be removed, or equivalently, where every object is virtual.\nA program that creates virtual objects, stores into and loads from them, and\nthen forgets the objects. 
In this simple case removing the allocations is fine.\nAs we saw in the previous section, it's also fine to have a virtual object\nreference another virtual, both allocations can be removed.\nWhat are the cases were we can't remove an allocation?\nThe first version of the optimizer simply assumed that every allocation can be\nremoved. This can't work. We will replace this assumption with the following\nsimple heuristic:\nIf a reference to a virtual object a is stored into an object b\nthat is not virtual, then a will also stop being virtual. If an object a\nthat was virtual stops being virtual, we say that it escapes. \u00b9\nThe simplest test case for this happening looks like this:\ndef test_materialize():\n bb = Block()\n var0 = bb.getarg(0)\n obj = bb.alloc()\n sto = bb.store(var0, 0, obj)\n opt_bb = optimize_alloc_removal(bb)\n # obj is virtual, without any fields\n # \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n # \u2502 empty \u2502\n # \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n # then we store a reference to obj into\n # field 0 of var0. Since var0 is not virtual,\n # obj escapes, so we have to put it back\n # into the optimized basic block\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = alloc()\noptvar2 = store(optvar0, 0, optvar1)\"\"\"\n # so far, fails like this:\n # the line:\n # info.store(field, op.arg(2))\n # produces an AttributeError because info\n # is None\n\nIf the optimizer reaches a point where a virtual object escapes (like the\nstore operation in the test), the optimizer has already removed the alloc\noperation that created the virtual object. If the object escapes, we don't want\nto go back in the operations list and re-insert the alloc operation, that\nsounds potentially very complicated. 
Instead, we re-insert the alloc\noperation that will recreate the virtual object at the point of escape using a\nhelper function materialize.\ndef materialize(opt_bb, value: Operation) -> None:\n assert not isinstance(value, Constant)\n assert isinstance(value, Operation)\n info = value.info\n assert isinstance(info, VirtualObject)\n assert value.name == \"alloc\"\n # put the alloc operation back into the trace\n opt_bb.append(value)\n\nI've added a number of fairly strong assertions to materialize to encode our\ncurrent assumptions about the situations in which it expects to be called. We\nwill remove some of them later as we generalize the code.\nNow that we have materialize we need to change optimize_alloc_removal to\nrecognize the case of storing a virtual object into a non-virtual one. We can\nrecognize Operation instances that produced a virtual object by looking at\ntheir .info field. If it is None, the object is not virtual, otherwise\nit is. If we store something into a virtual object, we leave the code as above.\nIf we store a virtual object into an object that is not virtual, we will first\nmaterialize the virtual object, and then emit the store.\ndef optimize_alloc_removal(bb):\n opt_bb = Block()\n for op in bb:\n if op.name == \"alloc\":\n op.info = VirtualObject()\n continue\n if op.name == \"load\":\n info = op.arg(0).info\n field = get_num(op)\n op.make_equal_to(info.load(field))\n continue\n if op.name == \"store\":\n info = op.arg(0).info\n if info: # virtual\n field = get_num(op)\n info.store(field, op.arg(2))\n continue\n else: # not virtual\n # first materialize the\n # right hand side\n materialize(opt_bb, op.arg(2))\n # then emit the store via\n # the general path below\n opt_bb.append(op)\n return opt_bb\n\nThis is the general idea, and it is enough to pass test_materialize. 
But of\ncourse there are still a number of further problems that we now need to solve.\n\n\nVersion 3: Don't Materialize Twice\nThe first problem is the fact that after we materialize a virtual object, it is\nno longer virtual. So if it escapes a second time, it should not be\nmaterialized a second time. A test for that case could simply repeat the\nstore operation:\ndef test_dont_materialize_twice():\n # obj is again an empty virtual object,\n # and we store it into var0 *twice*.\n # this should only materialize it once\n bb = Block()\n var0 = bb.getarg(0)\n obj = bb.alloc()\n sto0 = bb.store(var0, 0, obj)\n sto1 = bb.store(var0, 0, obj)\n opt_bb = optimize_alloc_removal(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = alloc()\noptvar2 = store(optvar0, 0, optvar1)\noptvar3 = store(optvar0, 0, optvar1)\"\"\"\n # fails so far: the operations that we get\n # at the moment are:\n # optvar0 = getarg(0)\n # optvar1 = alloc()\n # optvar2 = store(optvar0, 0, optvar1)\n # optvar3 = alloc()\n # optvar4 = store(optvar0, 0, optvar3)\n # ie the object is materialized twice,\n # which is incorrect\n\nWe solve the problem by setting the .info field of an object that we\nmaterialize to None to mark it as no longer being virtual.\ndef materialize(opt_bb, value: Operation) -> None:\n assert not isinstance(value, Constant)\n assert isinstance(value, Operation)\n info = value.info\n if info is None:\n return # already materialized\n assert value.name == \"alloc\"\n # put the alloc operation back into the trace\n opt_bb.append(value)\n # but only once\n value.info = None\n\n# optimize_alloc_removal unchanged\n\nThis fixes the problem, only one alloc is created. 
This fix also allows\nanother test case to pass, one where we store a non-virtual into another\nnon-virtual, code which we cannot optimize at all:\ndef test_materialize_non_virtuals():\n # in this example we store a non-virtual var1\n # into another non-virtual var0\n # this should just lead to no optimization at\n # all\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.getarg(1)\n sto = bb.store(var0, 0, var1)\n opt_bb = optimize_alloc_removal(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = getarg(1)\noptvar2 = store(optvar0, 0, optvar1)\"\"\"\n\n\n\nVersion 4: Materialization of Constants\nAnother straightforward extension is to support materializing constants. A\nconstant is never virtual, so materializing it should do nothing.\ndef test_materialization_constants():\n # in this example we store the constant 17\n # into the non-virtual var0\n # again, this will not be optimized\n bb = Block()\n var0 = bb.getarg(0)\n sto = bb.store(var0, 0, 17)\n opt_bb = optimize_alloc_removal(bb)\n # the previous line fails so far, triggering\n # the assert:\n # assert not isinstance(value, Constant)\n # in materialize\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = store(optvar0, 0, 17)\"\"\"\n\nTo implement that case, we check for value being a constant and return\nearly:\ndef materialize(opt_bb, value: Operation) -> None:\n if isinstance(value, Constant):\n return\n assert isinstance(value, Operation)\n info = value.info\n if info is None:\n return # already materialized\n assert value.name == \"alloc\"\n # put the alloc operation back into the trace\n opt_bb.append(value)\n # but only once\n value.info = None\n\n# optimize_alloc_removal unchanged\n\n\n\nVersion 5: Materializing Fields\nNow we need to solve a more difficult problem. So far, the virtual objects that\nwe have materialized have all been empty, meaning they didn't have any fields\nwritten to at the point of materialization. 
Let's write a test for this:\ndef test_materialize_fields():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.getarg(1)\n obj = bb.alloc()\n contents0 = bb.store(obj, 0, 8)\n contents1 = bb.store(obj, 1, var1)\n sto = bb.store(var0, 0, obj)\n\n # the virtual obj looks like this\n # obj\n # \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n # \u2502 0: 8 \u2502 1: var1 \u2502\n # \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n # then it needs to be materialized\n # this is the first example where a virtual\n # object that we want to materialize has any\n # content and is not just an empty object\n opt_bb = optimize_alloc_removal(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = getarg(1)\noptvar2 = alloc()\noptvar3 = store(optvar2, 0, 8)\noptvar4 = store(optvar2, 1, optvar1)\noptvar5 = store(optvar0, 0, optvar2)\"\"\"\n # fails so far! the operations we get\n # at the moment are:\n # optvar0 = getarg(0)\n # optvar1 = getarg(1)\n # optvar2 = alloc()\n # optvar3 = store(optvar0, 0, optvar2)\n # which is wrong, because the store operations\n # into optvar1 got lost\n\nTo fix this problem, we need to re-create a store operation for every\nelement of the .contents dictionary of the virtual object we are\nmaterializing. 
\u00b2\ndef materialize(opt_bb, value: Operation) -> None:\n if isinstance(value, Constant):\n return\n assert isinstance(value, Operation)\n info = value.info\n if info is None:\n return # already materialized\n assert value.name == \"alloc\"\n # put the alloc operation back into the trace\n opt_bb.append(value)\n # put the content back\n for idx, val in info.contents.items():\n # re-create store operation\n opt_bb.store(value, idx, val)\n # only materialize once\n value.info = None\n\n# optimize_alloc_removal unchanged\n\nThis is enough to pass the test.\n\n\nVersion 6: Recursive Materialization\nIn the above example, the fields of the virtual objects contained\nonly constants or non-virtual objects. However, we could have a situation where\na whole tree of virtual objects is built, and then the root of the tree escapes.\nThis makes it necessary to escape the whole tree. Let's write a test for a small\ntree of two virtual objects:\ndef test_materialize_chained_objects():\n bb = Block()\n var0 = bb.getarg(0)\n obj0 = bb.alloc()\n obj1 = bb.alloc()\n contents = bb.store(obj0, 0, obj1)\n const = bb.store(obj1, 0, 1337)\n sto = bb.store(var0, 0, obj0)\n # obj0\n # \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n # \u2502 0: \u2577 \u2502\n # \u2514\u2500\u2500\u2500\u2500\u253c\u2500\u2518\n # \u2502\n # \u25bc\n # obj1\n # \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n # \u2502 0: 1337 \u2502\n # \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n # now obj0 escapes\n opt_bb = optimize_alloc_removal(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = alloc()\noptvar2 = alloc()\noptvar3 = store(optvar2, 0, 1337)\noptvar4 = store(optvar1, 0, optvar2)\noptvar5 = store(optvar0, 0, optvar1)\"\"\"\n # fails in an annoying way! the resulting\n # basic block is not in proper SSA form\n # so printing it fails. 
The optimized\n # block would look like this:\n # optvar0 = getarg(0)\n # optvar1 = alloc()\n # optvar3 = store(optvar1, 0, optvar2)\n # optvar4 = store(optvar0, 0, optvar1)\n # where optvar2 is an ``alloc`` Operation\n # that is not itself in the output block\n\nTo fix it, materialize needs to call itself recursively for all the field\nvalues of the virtual object:\ndef materialize(opt_bb, value: Operation) -> None:\n if isinstance(value, Constant):\n return\n assert isinstance(value, Operation)\n info = value.info\n if info is None:\n return # already materialized\n assert value.name == \"alloc\"\n # put the alloc operation back into the trace\n opt_bb.append(value)\n # put the content back\n for idx, val in sorted(info.contents.items()):\n # materialize recursively\n materialize(opt_bb, val)\n opt_bb.store(value, idx, val)\n # only materialize once\n value.info = None\n\n# optimize_alloc_removal unchanged\n\nGetting there, the materialization logic is almost done. We need to fix a\nsubtle remaining problem though.\n\n\nVersion 7: Dealing with Object Cycles\nThe bug we need to fix in this section is a bit tricky, and does not immediately\noccur in a lot of programs. In\nfact, in PyPy a variant of it was hiding out in our optimizer\nuntil we found it much later (despite us being aware of the general problem and\ncorrectly dealing with it in other cases).\nThe problem is this: a virtual object can (directly or indirectly) point to\nitself, and we must carefully deal with that case to avoid infinite recursion in\nmaterialize. 
Here's the simplest test:\ndef test_object_graph_cycles():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.alloc()\n var2 = bb.store(var1, 0, var1)\n var3 = bb.store(var0, 1, var1)\n # \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n # \u25bc \u2502\n # obj0 \u2502\n # \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2510 \u2502\n # \u2502 0: \u2577 \u2502 \u2502\n # \u2514\u2500\u2500\u2500\u2500\u253c\u2500\u2518 \u2502\n # \u2502 \u2502\n # \u2514\u2500\u2500\u2500\u2500\u2500\u2518\n # obj0 points to itself, and then it is\n # escaped\n opt_bb = optimize_alloc_removal(bb)\n # the previous line fails with an\n # InfiniteRecursionError\n # materialize calls itself, infinitely\n\n # what we want is instead this output:\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = alloc()\noptvar2 = store(optvar1, 0, optvar1)\noptvar3 = store(optvar0, 1, optvar1)\"\"\"\n\nThe fix is not a big change, but a little bit subtle nevertheless.\nWe have to change the\norder in which things are done in materialize. Right after emitting the\nalloc, we set the .info to None, to mark the object as not virtual.\nOnly afterwards do we re-create the stores and call materialize recursively.\nIf a recursive call reaches the same object, it's already marked as non-virtual,\nso materialize won't recurse further:\ndef materialize(opt_bb, value: Operation) -> None:\n if isinstance(value, Constant):\n return\n assert isinstance(value, Operation)\n info = value.info\n if info is None:\n return # already materialized\n assert value.name == \"alloc\"\n # put the alloc operation back into the trace\n opt_bb.append(value)\n # only materialize once\n value.info = None\n # put the content back\n for idx, val in sorted(info.contents.items()):\n # materialize recursively\n materialize(opt_bb, val)\n opt_bb.store(value, idx, val)\n\n\n\nVersion 8: Loading from non-virtual objects\nNow materialize is done. 
We need to go back to optimize_alloc_removal and\nimprove it further. The last time we changed it, we added a case analysis to the\ncode dealing with store, distinguishing between storing to a virtual and to\na non-virtual object. We need to add an equivalent distinction to the load\ncase, because right now loading from a non-virtual crashes.\ndef test_load_non_virtual():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.load(var0, 0)\n bb.print(var1)\n # the next line fails in the line\n # op.make_equal_to(info.load(field))\n # because info is None\n opt_bb = optimize_alloc_removal(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = load(optvar0, 0)\noptvar2 = print(optvar1)\"\"\"\n\nTo fix it, we split the load code into two cases, leaving the virtual path\nas before, and letting the load from a non-virtual fall through to the\ngeneral code at the end of the function.\ndef optimize_alloc_removal(bb):\n opt_bb = Block()\n for op in bb:\n if op.name == \"alloc\":\n op.info = VirtualObject()\n continue\n if op.name == \"load\":\n info = op.arg(0).info\n if info: # virtual\n field = get_num(op)\n op.make_equal_to(info.load(field))\n continue\n # otherwise not virtual, use the\n # general path below\n if op.name == \"store\":\n info = op.arg(0).info\n if info: # virtual\n field = get_num(op)\n info.store(field, op.arg(2))\n continue\n else: # not virtual\n # first materialize the\n # right hand side\n materialize(opt_bb, op.arg(2))\n # then emit the store via\n # the general path below\n opt_bb.append(op)\n return opt_bb\n\n\n\nVersion 9 (Final): Materialize on Other Operations\nWe're almost at the end now. There's one final generalization left to do. We\nstarted with the heuristic that storing a virtual into a non-virtual would\nescape it. This should be generalized. 
Every time we pass a virtual into any\noperation where it is not the first argument of a load and a store\nshould also escape it (imagine passing the virtual to some function call).\nLet's test this as usual with our print operation:\ndef test_materialize_on_other_ops():\n # materialize not just on store\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.alloc()\n var2 = bb.print(var1)\n opt_bb = optimize_alloc_removal(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = alloc()\noptvar2 = print(optvar1)\"\"\"\n # again, the resulting basic block is not in\n # valid SSA form\n\nTo fix this, we will take the call to materialize out of the store code\npath and instead put it into the generic code path the end of the while\nloop:\n# materialize is unchanged\ndef materialize(opt_bb, value: Value) -> None:\n if isinstance(value, Constant):\n return\n assert isinstance(value, Operation)\n info = value.info\n if not info:\n # Already materialized\n return\n assert value.name == \"alloc\"\n opt_bb.append(value)\n value.info = None\n for idx, val in sorted(info.contents.items()):\n materialize(opt_bb, val)\n opt_bb.store(value, idx, val)\n\ndef optimize_alloc_removal(bb):\n opt_bb = Block()\n for op in bb:\n if op.name == \"alloc\":\n op.info = VirtualObject()\n continue\n if op.name == \"load\":\n info = op.arg(0).info\n if info: # virtual\n field = get_num(op)\n op.make_equal_to(info.load(field))\n continue\n if op.name == \"store\":\n info = op.arg(0).info\n if info: # virtual\n field = get_num(op)\n info.store(field, op.arg(2))\n continue\n # materialize all the arguments of\n # operations that are put into the\n # output basic block\n for arg in op.args:\n materialize(opt_bb, arg.find())\n opt_bb.append(op)\n return opt_bb\n\nThat's it, we're done. It's not a lot of code, but actually quite a powerful\noptimization. 
In addition to removing allocations for objects that are only used\nbriefly and in predictable ways, it also has another effect. If an object is\nallocated, used in a number of operations and then escapes further down in the\nblock, the operations in between can often be optimized away. This is\ndemonstrated by the next test (which already passes):\ndef test_sink_allocations():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.alloc()\n var2 = bb.store(var1, 0, 123)\n var3 = bb.store(var1, 1, 456)\n var4 = bb.load(var1, 0)\n var5 = bb.load(var1, 1)\n var6 = bb.add(var4, var5)\n var7 = bb.store(var1, 0, var6)\n var8 = bb.store(var0, 1, var1)\n opt_bb = optimize_alloc_removal(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = add(123, 456)\noptvar2 = alloc()\noptvar3 = store(optvar2, 0, optvar1)\noptvar4 = store(optvar2, 1, 456)\noptvar5 = store(optvar0, 1, optvar2)\"\"\"\n\nNote that the addition is not optimized away, because the code from this blog\npost does not contain constant folding and the other optimizations from\nthe last one. Combining them would not be too hard though.\n\n\nConclusion\nThat's it! The core idea of PyPy's allocation removal optimization in one or\ntwo screens of code. The real implementation has a number of refinements,\nbut the core ideas are all here.\nI'm not going to show any benchmark numbers or anything like that here, if you\nare interested in numbers you could look at the evaluation Section 6.\n\"Implementation and Evaluation\" of the paper that describes the work.\nThere's a complementary optimization that improves load and store\noperations for objects that are not virtual. I'll probably not write that\ndown as another post, but Max Bernstein and I developed that together on a\nPyPy Twitch channel channel a few weeks ago, here's the recording:\n\n\nFootnotes\n\u00b9 This is how PyPy uses the terminology, not really used consistently by other\nprojects. 
The term \"escape\" is fairly standard throughout the escape\nanalysis literature. The term \"virtual\" was used originally in Armin Rigo's\nPsyco but is e.g. also used by the paper Partial Escape Analysis and Scalar\nReplacement for Java.\n\u00b2 The order in which we put the store operations back is relying on\ndictionary iteration order, which is insertion order. That's not a bad\nordering, we could also be explicit and sort the fields in some order (ideally\nthe order in which the object lays them out in memory).", + "tags": "toy-optimizer", + "url": "https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html" + }, + { + "title": "PyPy Sponsors and Consultants", + "text": "Keeping a project as ambitious as PyPy requires resources. Sometimes the\nproblems encountered are general, like updating python versions or supporting\nvarious c-extensions. Sometimes the problems are specific and require precise\nsolutions that may not generalize to all users. Likewise, sponsorship of PyPy\ncan be general or specific.\nGeneral PyPy Sponsorship\nPyPy has had many financial contributors in the\npast. We are grateful to them, and to the following current sponsors:\n\n\nBaroque Software who hosts the buildbot\n master as well as our benchmark\n runner\n\n\nThe recurring and once-off fiscal sponsors via\n OpenCollective\n\n\nOthers providing buildbot workers\n\n\nPyPy Consulting Work\n\n\nBaroque Software is an innovative company that\n has been doing performance oriented consulting for a variety of biggest\n players on the market since 2007. Please reach out to their team for\n help making PyPy fulfill its potential in your application.\n\n\nMatti Picus, the PyPy release manager, has been\n pushing PyPy into the Python ecosystem since 2016: dealing with\n packaging, compatibility, and performance. 
He works at\n Quansight and is available for projects.", + "tags": "", + "url": "https://www.pypy.org/pypy-sponsors.html" + }, + { + "title": "D\u00fcsseldorf HPy/PyPy/GraalPy sprint September 19-23rd 2022", + "text": "The programming language group of the Computer Science department of\nHeinrich-Heine Universit\u00e4t D\u00fcsseldorf is happy to invite everybody to another\nsprint in D\u00fcsseldorf, from the 19th to the 23rd of September 2022. This is a\nfully public sprint, everyone and particularly newcomers are welcome to join\nus! The goal is to bring together people from the HPy, PyPy, GraalPy and\nCPython communities.\n\nTopics and goals\n\nwork on HPy APIs, discussions around next steps for the project\ncontinuing new and ongoing ports to HPy, including Cython, NumPy, Pillow, Matplotlib\n3.10 support on PyPy and GraalPy\npreparing the next PyPy release\ndiscussions around ways to improve collaboration between the different Python\nimplementations\n\n\n\nWhat is a sprint?\nThe experience of the PyPy project has shown the benefits of regular\nsprints. They are focussed one week physical meetings where people pair-program\non new features and discuss future plans. Coming to one is a great way to get\nstarted with a project!\n\n\nLocation\nThe sprint will take place in a seminar room of the computer science\ndepartment. It is in the building 25.12, room 02.50 (second floor) of the\nuniversity campus. 
For travel instructions see\n\nhttps://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/kontakt/service/lage-und-anreise\n\nWe ask participants to wear masks during the indoor working hours.\n\n\n\n\n\nWiegels, CC BY 3.0, via Wikimedia Commons\n\n\n\n\nExact times\nWork days: starting September 19th (~morning), ending September 23rd (~afternoon).\nWe will do a to-be-planned social activity on Wednesday afternoon.\n\n\nRegistration\nPlease register by editing this file or by opening a pull request:\n\nhttps://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/ddorf2022/people.txt\n\nor by sending a quick mail to the pypy-dev mailing list:\n\nhttp://mail.python.org/mailman/listinfo/pypy-dev", + "tags": "sprints", + "url": "https://www.pypy.org/posts/2022/07/ddorf-sprint-sep-2022.html" + }, + { + "title": "M1 support for PyPy", + "text": "The PyPy team is happy to announce that we can now target the macOS ARM64\nplatform. Much of the work was executed by Maciej Fija\u0142kowski (fijal) and\nfunded via a generous contribution to our OpenCollective. The work is based\non our existing support for aarch64 (arm64 on linux) with some twists\nto support the differences between the CPUs and the operating system. There\nare nightly builds for pypy3.8 and pypy3.9 (look for macos_arm64), and\nthe architecture will be part of our next release.\nPlease try it out and let us know how it is useful for you or how we could\nimprove.\nWe still need help improving our macOS support. We have an open issue to\nhelp our packaging story. Help is welcome.\nThe PyPy team.", + "tags": "", + "url": "https://www.pypy.org/posts/2022/07/m1-support-for-pypy.html" + }, + { + "title": "Implementing a Toy Optimizer", + "text": "In this blog post I want to show the complete code (in Python3) of how a very\nsimple optimizer for sequences of operations can work. These algorithms could\nbe part of a (really simple) compiler, or a JIT. 
The architecture of the code in\nthis blog post is very similar to that of the trace optimizer of the PyPy JIT:\nAfter a trace is produced, it is optimized before being sent to the machine code\nbackend that produces binary instructions for the CPU architecture that PyPy is\nrunning on.\nTo get started, the first thing we need to do is define how our operations are\nstored. The\nformat that a compiler uses to store the program while it is being optimized\nis usually called its intermediate representation (IR). Many production\ncompilers use IRs that are in the Static Single-Assignment Form (SSA), and\nwe will also use that. SSA form has the property that every variable is\nassigned to exactly once, and every variable is defined before it is used. This\nsimplifies many things.\nLet's make this concrete. If our input program is a complex expression, such\nas a * (b + 17) + (b + 17) the intermediate representation of that (or at\nleast its text representation) would maybe be something like:\nvar1 = add(b, 17)\nvar2 = mul(a, var1)\nvar3 = add(b, 17)\nvar4 = add(var2, var3)\nThis sequence of instructions is inefficient. The operation add(b, 17) is\ncomputed twice and we can save time by removing the second one and only\ncomputing it once. In this post I want to show an optimizer that can do this\n(and some related) optimizations.\nLooking at the IR we notice that the input expression has been linearized\ninto a sequence of operations, and all the intermediate results have been given\nunique variable names. The value that every variable is assigned is computed\nby the right hand side, which is some operation consisting of an operator and an\narbitrary number of arguments. The arguments of an operation are either\nthemselves variables or constants.\nI will not at all talk about the process of translating the input program\ninto the IR. Instead, I will assume we have some component that does this\ntranslation already. 
The tests in this blog post will construct small\nsnippets of IR by hand. I also won't talk about what happens after the\noptimization (usually the optimized IR is translated into machine code).\n\nImplementing the Intermediate Representation\nLet's start modelling the intermediate representation with Python classes.\nFirst we define a base class of all values that can be used as arguments in\noperations, and let's also add a class that represents constants:\nimport pytest\nfrom typing import Optional, Any\n\nclass Value:\n pass\n\nclass Constant(Value):\n def __init__(self, value: Any):\n self.value = value\n\n def __repr__(self):\n return f\"Constant({self.value})\"\n\nOne consequence of the fact that every variable is assigned to only once is\nthat variables are in a one-to-one correspondence with the right-hand-side of\ntheir unique assignments. That means that we don't need a class that represents\nvariables at all. Instead, it's sufficient to have a class that represents an\noperation (the right-hand side), and that by definition is the same as the variable (left-hand side) that it defines:\nclass Operation(Value):\n def __init__(self, name: str, args: list[Value]):\n self.name = name\n self.args = args\n\n def __repr__(self):\n return f\"Operation({self.name}, {self.args})\"\n\n def arg(self, index: int):\n return self.args[index]\n\nNow we can instantiate these two classes to represent the example sequence of\noperations above:\ndef test_construct_example():\n # first we need something to represent\n # \"a\" and \"b\". In our limited view, we don't\n # know where they come from, so we will define\n # them with a pseudo-operation called \"getarg\"\n # which takes a number n as an argument and\n # returns the n-th input argument. 
The proper\n # SSA way to do this would be phi-nodes.\n\n a = Operation(\"getarg\", [Constant(0)])\n b = Operation(\"getarg\", [Constant(1)])\n # var1 = add(b, 17)\n var1 = Operation(\"add\", [b, Constant(17)])\n # var2 = mul(a, var1)\n var2 = Operation(\"mul\", [a, var1])\n # var3 = add(b, 17)\n var3 = Operation(\"add\", [b, Constant(17)])\n # var4 = add(var2, var3)\n var4 = Operation(\"add\", [var2, var3])\n\n sequence = [a, b, var1, var2, var3, var4]\n # nothing to test really, it shouldn't crash\n\nUsually, complicated programs are represented as a control flow graph in a\ncompiler, which represents all the possible paths that control can take while\nexecuting the program. Every node in the control flow graph is a basic\nblock. A basic block is a linear sequence of operations with no control flow\ninside of it.\nWhen optimizing a program, a compiler usually looks at the whole control flow\ngraph of a function. However, that is still too complicated! So let's\nsimplify further and look at only at optimizations we can do when looking at\na single basic block and its sequence of instructions (they are called local\noptimizations).\nLet's define a class representing basic blocks and let's also add some\nconvenience functions for constructing sequences of operations, because the\ncode in test_construct_example is a bit annoying.\nclass Block(list):\n def opbuilder(opname):\n def wraparg(arg):\n if not isinstance(arg, Value):\n arg = Constant(arg)\n return arg\n def build(self, *args):\n # construct an Operation, wrap the\n # arguments in Constants if necessary\n op = Operation(opname,\n [wraparg(arg) for arg in args])\n # add it to self, the basic block\n self.append(op)\n return op\n return build\n\n # a bunch of operations we support\n add = opbuilder(\"add\")\n mul = opbuilder(\"mul\")\n getarg = opbuilder(\"getarg\")\n dummy = opbuilder(\"dummy\")\n lshift = opbuilder(\"lshift\")\n\ndef test_convencience_block_construction():\n bb = Block()\n # a again with 
getarg, the following line\n # defines the Operation instance and\n # immediately adds it to the basic block bb\n a = bb.getarg(0)\n assert len(bb) == 1\n assert bb[0].name == \"getarg\"\n\n # it's a Constant\n assert bb[0].args[0].value == 0\n\n # b with getarg\n b = bb.getarg(1)\n # var1 = add(b, 17)\n var1 = bb.add(b, 17)\n # var2 = mul(a, var1)\n var2 = bb.mul(a, var1)\n # var3 = add(b, 17)\n var3 = bb.add(b, 17)\n # var4 = add(var2, var3)\n var4 = bb.add(var2, var3)\n assert len(bb) == 6\n\nThat's a good bit of infrastructure to make the tests easy to write. One\nthing we are lacking though is a way to print the basic blocks into a nicely\nreadable textual representation. Because in the current form, the repr of a\nBlock is very annoying, the output of pretty-printing bb in the test above\nlooks like this:\n[Operation('getarg', [Constant(0)]),\n Operation('getarg', [Constant(1)]),\n Operation('add',\n [Operation('getarg',\n [Constant(1)]),\n Constant(17)]),\n Operation('mul',\n [Operation('getarg',\n [Constant(0)]),\n Operation('add',\n [Operation('getarg',\n [Constant(1)]),\n Constant(17)])]),\n Operation('add',\n [Operation('getarg',\n [Constant(1)]),\n Constant(17)]),\n Operation('add',\n [Operation('mul',\n [Operation('getarg',\n [Constant(0)]),\n Operation('add',\n [Operation('getarg',\n [Constant(1)]),\n Constant(17)])]),\n Operation('add',\n [Operation('getarg',\n [Constant(1)]),\n Constant(17)])])]\n\nIt's impossible to see what is going on here, because the Operations in the\nbasic block appear several times, once as elements of the list but then also as\narguments to operations further down in the list. 
So we need some code that\nturns things back into a readable textual representation, so we have a chance\nto debug.\ndef bb_to_str(bb: Block, varprefix: str = \"var\"):\n # the implementation is not too important,\n # look at the test below to see what the\n # result looks like\n\n def arg_to_str(arg: Value):\n if isinstance(arg, Constant):\n return str(arg.value)\n else:\n # the key must exist, otherwise it's\n # not a valid SSA basic block:\n # the variable must be defined before\n # its first use\n return varnames[arg]\n\n varnames = {}\n res = []\n for index, op in enumerate(bb):\n # give the operation a name used while\n # printing:\n var = f\"{varprefix}{index}\"\n varnames[op] = var\n arguments = \", \".join(\n arg_to_str(op.arg(i))\n for i in range(len(op.args))\n )\n strop = f\"{var} = {op.name}({arguments})\"\n res.append(strop)\n return \"\\n\".join(res)\n\ndef test_basicblock_to_str():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.add(5, 4)\n var2 = bb.add(var1, var0)\n\n assert bb_to_str(bb) == \"\"\"\\\nvar0 = getarg(0)\nvar1 = add(5, 4)\nvar2 = add(var1, var0)\"\"\"\n\n # with a different prefix for the invented\n # variable names:\n assert bb_to_str(bb, \"x\") == \"\"\"\\\nx0 = getarg(0)\nx1 = add(5, 4)\nx2 = add(x1, x0)\"\"\"\n\n # and our running example:\n bb = Block()\n a = bb.getarg(0)\n b = bb.getarg(1)\n var1 = bb.add(b, 17)\n var2 = bb.mul(a, var1)\n var3 = bb.add(b, 17)\n var4 = bb.add(var2, var3)\n\n assert bb_to_str(bb, \"v\") == \"\"\"\\\nv0 = getarg(0)\nv1 = getarg(1)\nv2 = add(v1, 17)\nv3 = mul(v0, v2)\nv4 = add(v1, 17)\nv5 = add(v3, v4)\"\"\"\n # Note the re-numbering of the variables! We\n # don't attach names to Operations at all, so\n # the printing will just number them in\n # sequence, can sometimes be a source of\n # confusion.\n\nThis is much better. Now we're done with the basic infrastructure, we can\ndefine sequences of operations and print them in a readable way. 
Next we need a\ncentral data structure that is used when actually optimizing basic blocks.\n\n\nStoring Equivalences between Operations Using a Union-Find Data Structure\nWhen optimizing a sequence of operations, we want to make it less costly to\nexecute. For that we typically want to remove operations (and sometimes\nreplace operations with less expensive ones). We can remove operations if\nthey do redundant computation, like in the case of the duplicate add(v1, 17) in\nthe example. So what we want to do is to turn the running input sequence:\nv0 = getarg(0)\nv1 = getarg(1)\nv2 = add(v1, 17)\nv3 = mul(v0, v2)\nv4 = add(v1, 17)\nv5 = add(v3, v4)\nInto the following optimized output sequence:\noptvar0 = getarg(0)\noptvar1 = getarg(1)\noptvar2 = add(optvar1, 17)\noptvar3 = mul(optvar0, optvar2)\noptvar4 = add(optvar3, optvar2)\nWe left out the second add (which defines v4), and then replaced the\nusage of v4 with v2 in the final operation that defines v5.\nWhat we effectively did was discover that v2 and v4 are equivalent and then\nreplaced v4 with v2. In general, we might discover more such equivalences,\nand we need a data structure to store them. A good data structure to store\nthese equivalences is Union Find (also called Disjoint-set data structure),\nwhich stores a collection of disjoint sets. Disjoint means that no operation\ncan appear in more than one set. The sets in our concrete case are the sets of\noperations that compute the same result.\nWhen we start out, every operation is in its own singleton set, with no other\nmember. As we discover more equivalences, we will unify sets into larger sets\nof operations that all compute the same result. So one operation the data\nstructure supports is union, to unify two sets, we'll call that\nmake_equal_to in the code below.\nThe other operation the data structure supports is find, which takes an\noperation and returns a \"representative\" of the set of all equivalent\noperations. 
Two operations are in the same set, if the representative that\nfind returns for them is the same.\nThe exact details of how the data structure works are only sort of important\n(even though it's very cool, I promise!). It's OK to skip over the\nimplementation. We will add the data structure right into our Value,\nConstant and Operation classes:\nclass Value:\n def find(self):\n raise NotImplementedError(\"abstract\")\n def _set_forwarded(self, value):\n raise NotImplementedError(\"abstract\")\n\n\nclass Operation(Value):\n def __init__(self, name: str, args: list[Value]):\n self.name = name\n self.args = args\n self.forwarded = None\n\n def __repr__(self):\n return (\n f\"Operation({self.name},\"\n f\"{self.args}, {self.forwarded})\"\n )\n\n def find(self) -> Value:\n # returns the \"representative\" value of\n # self, in the union-find sense\n op = self\n while isinstance(op, Operation):\n # could do path compression here too\n # but not essential\n next = op.forwarded\n if next is None:\n return op\n op = next\n return op\n\n def arg(self, index):\n # change to above: return the\n # representative of argument 'index'\n return self.args[index].find()\n\n def make_equal_to(self, value: Value):\n # this is \"union\" in the union-find sense,\n # but the direction is important! 
The\n # representative of the union of Operations\n # must be either a Constant or an operation\n # that we know for sure is not optimized\n # away.\n\n self.find()._set_forwarded(value)\n\n def _set_forwarded(self, value: Value):\n self.forwarded = value\n\n\nclass Constant(Value):\n def __init__(self, value: Any):\n self.value = value\n\n def __repr__(self):\n return f\"Constant({self.value})\"\n\n def find(self):\n return self\n\n def _set_forwarded(self, value: Value):\n # if we found out that an Operation is\n # equal to a constant, it's a compiler bug\n # to find out that it's equal to another\n # constant\n assert isinstance(value, Constant) and \\\n value.value == self.value\n\ndef test_union_find():\n # construct three operation, and unify them\n # step by step\n bb = Block()\n a1 = bb.dummy(1)\n a2 = bb.dummy(2)\n a3 = bb.dummy(3)\n\n # at the beginning, every op is its own\n # representative, that means every\n # operation is in a singleton set\n # {a1} {a2} {a3}\n assert a1.find() is a1\n assert a2.find() is a2\n assert a3.find() is a3\n\n # now we unify a2 and a1, then the sets are\n # {a1, a2} {a3}\n a2.make_equal_to(a1)\n # they both return a1 as the representative\n assert a1.find() is a1\n assert a2.find() is a1\n # a3 is still different\n assert a3.find() is a3\n\n # now they are all in the same set {a1, a2, a3}\n a3.make_equal_to(a2)\n assert a1.find() is a1\n assert a2.find() is a1\n assert a3.find() is a1\n\n # now they are still all the same, and we\n # also learned that they are the same as the\n # constant 6\n # the single remaining set then is\n # {6, a1, a2, a3}\n c = Constant(6)\n a2.make_equal_to(c)\n assert a1.find() is c\n assert a2.find() is c\n assert a3.find() is c\n\n # union with the same constant again is fine\n a2.make_equal_to(c)\n\n\n\nConstant Folding\nNow comes the first actual optimization, a simple constant folding pass. 
It\nwill remove operations where all the arguments are constants and replace them\nwith the constant result.\nEvery pass has the same structure: we go over all operations in the basic\nblock in order and decide for each operation whether it can be removed. For the\nconstant folding pass, we can remove all the operations with constant\narguments (but we'll implement only the add case here).\nI will show a buggy version of the constant folding pass first. It has a\nproblem that is related to why we need the union-find data structure. We will\nfix it a bit further down.\ndef constfold_buggy(bb: Block) -> Block:\n opt_bb = Block()\n\n for op in bb:\n # basic idea: go over the list and do\n # constant folding of add where possible\n if op.name == \"add\":\n arg0 = op.args[0]\n arg1 = op.args[1]\n if isinstance(arg0, Constant) and \\\n isinstance(arg1, Constant):\n # can constant-fold! that means we\n # learned a new equality, namely\n # that op is equal to a specific\n # constant\n value = arg0.value + arg1.value\n op.make_equal_to(Constant(value))\n # don't need to have the operation\n # in the optimized basic block\n continue\n # otherwise the operation is not\n # constant-foldable and we put into the\n # output list\n opt_bb.append(op)\n return opt_bb\n\n\ndef test_constfold_simple():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.add(5, 4)\n var2 = bb.add(var1, var0)\n\n opt_bb = constfold_buggy(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = add(9, optvar0)\"\"\"\n\n@pytest.mark.xfail\ndef test_constfold_buggy_limitation():\n # this test fails! 
it shows the problem with\n # the above simple constfold_buggy pass\n\n bb = Block()\n var0 = bb.getarg(0)\n # this is folded\n var1 = bb.add(5, 4)\n # we want this folded too, but it doesn't work\n var2 = bb.add(var1, 10)\n var3 = bb.add(var2, var0)\n\n opt_bb = constfold_buggy(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = add(19, optvar0)\"\"\"\n\nWhy does the test fail? The opt_bb printed output looks like this:\noptvar0 = getarg(0)\noptvar1 = add(9, 10)\noptvar2 = add(optvar1, optvar0)\nThe problem is that when we optimize the second addition in constfold_buggy,\nthe argument of that operation is an Operation not a Constant, so\nconstant-folding is not applied to the second add. However, we have already\nlearned that the argument var1 to the operation var2 is equal to\nConstant(9). This information is stored in the union-find data structure.\nSo what we are missing are suitable find calls in the constant folding pass, to\nmake use of the previously learned equalities.\nHere's the fixed version:\ndef constfold(bb: Block) -> Block:\n opt_bb = Block()\n\n for op in bb:\n # basic idea: go over the list and do\n # constant folding of add where possible\n if op.name == \"add\":\n # >>> changed\n arg0 = op.arg(0) # uses .find()\n arg1 = op.arg(1) # uses .find()\n # <<< end changes\n if isinstance(arg0, Constant) and \\\n isinstance(arg1, Constant):\n # can constant-fold! 
that means we\n # learned a new equality, namely\n # that op is equal to a specific\n # constant\n value = arg0.value + arg1.value\n op.make_equal_to(Constant(value))\n # don't need to have the operation\n # in the optimized basic block\n continue\n # otherwise the operation is not\n # constant-foldable and we put into the\n # output list\n opt_bb.append(op)\n return opt_bb\n\n\ndef test_constfold_two_ops():\n # now it works!\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.add(5, 4)\n var2 = bb.add(var1, 10)\n var3 = bb.add(var2, var0)\n opt_bb = constfold(bb)\n\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = add(19, optvar0)\"\"\"\n\n\n\nCommon Subexpression Elimination\nThe constfold pass only discovers equalities between Operations and\nConstants. Let's do a second pass that also discovers equalities between\nOperations and other Operations.\nA simple optimization that does that has this property common subexpression\nelimination (CSE), which will finally optimize away the problem in the\nintroductory example code that we had above.\ndef cse(bb: Block) -> Block:\n # structure is the same, loop over the input,\n # add some but not all operations to the\n # output\n\n opt_bb = Block()\n\n for op in bb:\n # only do CSE for add here, but it\n # generalizes\n if op.name == \"add\":\n arg0 = op.arg(0)\n arg1 = op.arg(1)\n # Check whether we have emitted the\n # same operation already\n prev_op = find_prev_add_op(\n arg0, arg1, opt_bb)\n if prev_op is not None:\n # if yes, we can optimize op away\n # and replace it with the earlier\n # result, which is an Operation\n # that was already emitted to\n # opt_bb\n op.make_equal_to(prev_op)\n continue\n opt_bb.append(op)\n return opt_bb\n\n\ndef eq_value(val0, val1):\n if isinstance(val0, Constant) and \\\n isinstance(val1, Constant):\n # constants compare by their value\n return val0.value == val1.value\n # everything else by identity\n return val0 is val1\n\n\ndef find_prev_add_op(arg0: 
Value, arg1: Value,\n opt_bb: Block) -> Optional[Operation]:\n # Really naive and quadratic implementation.\n # What we do is walk over the already emitted\n # operations and see whether we emitted an add\n # with the current arguments already. A real\n # implementation might use a hashmap of some\n # kind, or at least only look at a limited\n # window of instructions.\n for opt_op in opt_bb:\n if opt_op.name != \"add\":\n continue\n # It's important to call arg here,\n # for the same reason why we\n # needed it in constfold: we need to\n # make sure .find() is called\n if eq_value(arg0, opt_op.arg(0)) and \\\n eq_value(arg1, opt_op.arg(1)):\n return opt_op\n return None\n\n\ndef test_cse():\n bb = Block()\n a = bb.getarg(0)\n b = bb.getarg(1)\n var1 = bb.add(b, 17)\n var2 = bb.mul(a, var1)\n var3 = bb.add(b, 17)\n var4 = bb.add(var2, var3)\n\n opt_bb = cse(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = getarg(1)\noptvar2 = add(optvar1, 17)\noptvar3 = mul(optvar0, optvar2)\noptvar4 = add(optvar3, optvar2)\"\"\"\n\n\n\nStrength Reduction\nNow we have one pass that replaces Operations with Constants and one that\nreplaces Operations with previously existing Operations. Let's now do one\nfinal pass that replaces Operations by newly invented Operations, a simple\nstrength reduction. 
This one will be simple.\ndef strength_reduce(bb: Block) -> Block:\n opt_bb = Block()\n for op in bb:\n if op.name == \"add\":\n arg0 = op.arg(0)\n arg1 = op.arg(1)\n if arg0 is arg1:\n # x + x turns into x << 1\n newop = opt_bb.lshift(arg0, 1)\n op.make_equal_to(newop)\n continue\n opt_bb.append(op)\n return opt_bb\n\ndef test_strength_reduce():\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.add(var0, var0)\n\n opt_bb = strength_reduce(bb)\n\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = lshift(optvar0, 1)\"\"\"\n\n\n\nPutting Things Together\nLet's combine the passes into one single pass, so that we are going over all\nthe operations only exactly once, instead of having to look at every operation\nonce for all the different passes.\ndef optimize(bb: Block) -> Block:\n opt_bb = Block()\n\n for op in bb:\n if op.name == \"add\":\n arg0 = op.arg(0)\n arg1 = op.arg(1)\n\n # constant folding\n if isinstance(arg0, Constant) and \\\n isinstance(arg1, Constant):\n value = arg0.value + arg1.value\n op.make_equal_to(Constant(value))\n continue\n\n # cse\n prev_op = find_prev_add_op(\n arg0, arg1, opt_bb)\n if prev_op is not None:\n op.make_equal_to(prev_op)\n continue\n\n # strength reduce:\n # x + x turns into x << 1\n if arg0 is arg1:\n newop = opt_bb.lshift(arg0, 1)\n op.make_equal_to(newop)\n continue\n\n # and while we are at it, let's do some\n # arithmetic simplification:\n # a + 0 => a\n if eq_value(arg0, Constant(0)):\n op.make_equal_to(arg1)\n continue\n if eq_value(arg1, Constant(0)):\n op.make_equal_to(arg0)\n continue\n opt_bb.append(op)\n return opt_bb\n\n\ndef test_single_pass():\n bb = Block()\n # constant folding\n var0 = bb.getarg(0)\n var1 = bb.add(5, 4)\n var2 = bb.add(var1, 10)\n var3 = bb.add(var2, var0)\n\n opt_bb = optimize(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = add(19, optvar0)\"\"\"\n\n # cse + strength reduction\n bb = Block()\n var0 = bb.getarg(0)\n var1 = 
bb.getarg(1)\n var2 = bb.add(var0, var1)\n var3 = bb.add(var0, var1) # the same as var3\n var4 = bb.add(var2, 2)\n var5 = bb.add(var3, 2) # the same as var4\n var6 = bb.add(var4, var5)\n\n opt_bb = optimize(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = getarg(1)\noptvar2 = add(optvar0, optvar1)\noptvar3 = add(optvar2, 2)\noptvar4 = lshift(optvar3, 1)\"\"\"\n\n # removing + 0\n bb = Block()\n var0 = bb.getarg(0)\n var1 = bb.add(16, -16)\n var2 = bb.add(var0, var1)\n var3 = bb.add(0, var2)\n var4 = bb.add(var2, var3)\n\n opt_bb = optimize(bb)\n assert bb_to_str(opt_bb, \"optvar\") == \"\"\"\\\noptvar0 = getarg(0)\noptvar1 = lshift(optvar0, 1)\"\"\"\n\n\n\nConclusion\nThat's it for now. Why is this architecture cool? From a software engineering\npoint of view, sticking everything into a single function like in optimize\nabove is obviously not great, and if you wanted to do this for real you would\ntry to split the cases into different functions that are individually\ndigestible, or even use a DSL that makes the pattern matching much more\nreadable. But the advantage of the architecture is that it's quite efficient,\nit makes it possible to pack a lot of good optimizations into a single pass\nover a basic block.\nOf course this works even better if you are in a tracing context, where\neverything is put into a trace, which is basically one incredibly long basic\nblock. In a JIT context it's also quite important that the\noptimizer itself runs quickly.\nVarious other optimizations are possible in this model. 
There is a\nfollow-up post that shows how to implement what is arguably PyPy's most\nimportant optimization.\n\n\nSome Further Pointers\nThis post is only a short introduction and is taking some shortcuts, I wanted to\nalso give some (non-exhaustive) pointers to more general literature about the\ntouched topics.\nThe approach to CSE described here can usually be seen as value\nnumbering, it's normally really implemented with a hashmap though. Here's a\npaper that describes various styles of implementing that, even beyond a\nsingle basic block. The paper also partly takes the perspective of discovering\nequivalence classes of operations that compute the same result.\nA technique that leans even more fully into finding equivalences between\noperations is using e-graphs and then applying equality saturation (this is\nsignificantly more advanced than what I described here though). A cool modern\nproject that applies this technique is egg.\nIf you squint a bit, you can generally view a constant folding pass as a very\nsimple form of Partial Evaluation: every operation that has constant\narguments is constant-folded away, and the remaining ones are \"residualized\",\ni.e. put into the output program. This point of view is not super important for\nthe current post, but will become important in the next one.\nAcknowledgements: Thanks to Thorsten Ball for getting me to write\nthis and for his enthusiastic feedback. I also got great feedback from Max\nBernstein, Matti Picus and Per Vognsen. A conversation with Peng Wu that\nwe had many many years ago and that stuck with me made me keep thinking about\nvarious ways to view compiler optimizations.", + "tags": "toy-optimizer", + "url": "https://www.pypy.org/posts/2022/07/toy-optimizer.html" + }, + { + "title": "How is PyPy Tested?", + "text": "How is PyPy Tested?\nIn this post I want to give an overview of how the PyPy project does and thinks\nabout testing. 
PyPy takes testing quite seriously and has done some from the\nstart of the project. Here I want to present the different styles of\ntests that PyPy has, when we use them and how I think about them.\n\nBackground\nTo make the blog post self-contained, I am going to start with a small overview\nabout PyPy's architecture. If you already know what PyPy is and how it works,\nyou can skip this section.\nPyPy means \"Python in Python\". It is an alternative implementation of the Python\nlanguage. Usually, when we speak of \"Python\", we can mean two different things.\nOn the one hand it means \"Python as an abstract programming language\". On the\nother hand, the main implementation of that language is also often called\n\"Python\". To more clearly distinguish the two, the implementation is often also\ncalled \"CPython\", because it is an interpreter implemented in C code.\nNow we can make the statement \"PyPy is Python in Python\" more precise: PyPy is\nan interpreter for Python 3.9, implemented in RPython. RPython (\"Restricted\nPython\") is a subset of Python 2, which is statically typed (using type\ninference, not type annotations) and can be compiled\nto C code. That means we can take our Python 3.9 interpreter, and compile it\ninto a C binary that can run Python 3.9 code. The final binary behaves pretty\nsimilarly to CPython.\nThe main thing that makes PyPy interesting is that during the translation of our\ninterpreter to C, a number of components are automatically inserted into the\nfinal binary. One component is a reasonably good garbage collector.\nThe more exciting component that is inserted into the binary is a just-in-time\ncompiler. 
The insertion of this component is not fully automatic, instead it is\nguided by a small number of annotations in the source code of the interpreter.\nThe effect of inserting this JIT compiler into the binary is that the resulting\nbinary can run Python code significantly faster than CPython, in many cases.\nHow this works is not important for the rest of the post, if you want to see an\nexample of concretely doing that to a small interpreter you can look at this\nvideo.\n\n\nPyPy Testing History\nA few historical notes on the PyPy project and its relationship to testing: The\nPyPy project was started in 2004. At the time when the project was started,\nExtreme Programming and Agile Software Development were up and coming. On the\nmethodology side, PyPy was heavily influenced by these, and started using\nTest-Driven Development and pair programming right from the start.\nAlso technologically, PyPy has been influential on testing in the Python world.\nOriginally, PyPy had used the unittest testing framework, but pretty soon\nthe developers got frustrated with it. Holger Krekel, one of the original\ndevelopers who started PyPy, started the pytest testing framework soon\nafterwards.\n\n\nInterpreter-Level Tests\nSo, how are tests for PyPy written, concretely? The tests for the interpreter\nare split into two different kinds, which we call \"interpreter level tests\" and\n\"application level tests\". The former are tests that can be used to test the\nobjects and functions that are used in the implementation of the Python\ninterpreter. Since the interpreter is written in Python 2, those tests are also\nwritten in Python 2, using pytest. They tend to be more on the unit test side of\nthings. 
They are in files with the pattern test_*.py.\nHere is an example that tests the implementation of integers (very slightly\nsimplified):\nclass TestW_IntObject:\n ...\n\n def test_hash(self):\n w_x = W_IntObject(42)\n w_result = w_x.descr_hash(self.space)\n assert isinstance(w_result, W_IntObject)\n assert w_result.intval == 42\n\nThis test checks that if you take an object that represents integers in the\nPython language (using the class W_IntObject, a \"wrapped integer object\")\nwith the value 42, computing the hash of that object returns another instance of\nthe same class, also with the value 42.\nThese tests can be run on top of any Python 2 implementation, either CPython or\nPyPy. We can then test and debug the internals of the PyPy interpreter using\nfamiliar tools like indeed pytest and the Python debuggers. They can be run,\nbecause all the involved code like the tests and the class W_IntObject are\njust completely regular Python 2 classes that behave in the regular way when\nrun on top of a Python interpreter.\nIn CPython, these tests don't really have an equivalent. They would correspond\nto tests that are written in C and that can test the logic of all the C\nfunctions of CPython that execute certain functionality, accessing the internals\nof C structs in the process. \u00b9\n\n\nApplication-Level Tests\nThere is also a second class of tests for the interpreter. Those are tests that\ndon't run on the level of the implementation. Instead, they are executed by\nthe PyPy Python interpreter, thus running on the level of the applications run\nby PyPy. Since the interpreter is running Python 3, the tests are also written\nin Python 3. They are stored in files with the pattern apptest_*.py and\nlook like \"regular\" Python 3 tests. 
\u00b2\nHere's an example of how you could write a test equivalent to the one above:\ndef test_hash():\n assert hash(42) == 42\n\nThis style of test looks more \"natural\" and is the preferred one in cases where\nthe test does not need to access the internals of the logic or the objects of\nthe interpreter.\nApplication level tests can be run in two different ways. On the one hand, we\ncan simply run them on CPython 3. This is very useful! Since we want PyPy to\nbehave like CPython, running the tests that we write on CPython is useful to\nmake sure that the tests themselves aren't wrong.\nOn the other hand, the main way to run these tests is on top of PyPy, itself\nrunning on top of a Python 2 implementation. This makes it possible to run the\ntest without first bootstrapping PyPy to C. Since bootstrapping to C is a\nrelatively slow operation (can take up to an hour) it is crucially important to\nbe able to run tests without bootstrapping first. It also again makes it\npossible to debug crashes in the interpreter using the regular Python 2\ndebugger. Of course running tests in this way is unfortunately itself not super\nfast, given that they run on a stack of two different interpreters.\nApplication-level tests correspond quite closely to CPython's tests suite (which\nis using the unittest framework). Of course in CPython it is not possible to run\nthe test suite without building the CPython binary using a C compiler. \u00b3\nSo when do we write application-level tests, and when interpreter-level tests?\nInterpreter-level tests are necessary to test internal data structures that\ntouch data and logic that is not directly exposed to the Python language. If\nthat is not necessary, we try to write application-level tests. 
App-level tests\nare however by their nature always more on the integration test side of things.\nTo be able to run the test_hash function above, many parts of PyPy need to\nwork correctly, the parser, the bytecode compiler, the bytecode interpreter, the\nhash builtin, calling the __hash__ special method, etc, etc.\nThis observation is also true for CPython! One could argue that CPython has no\nunit tests at all, because in order to be able to even run the tests, most of\nPython needs to be in working order already, so all the tests are really\nimplicitly integration tests.\n\n\nThe CPython Test Suite\nWe also use the CPython Test suite as a final check to see whether our\ninterpreter correctly implements all the features of the Python language. In\nthat sense it acts as some kind of compliance test suite that checks whether we\nimplement the language correctly. The test suite is not perfect for this.\nSince it is written for CPython's purposes during its development, a\nlot of the tests check really specific CPython implementation details. Examples\nfor these are tests that check that __del__ is called immediately after\nobjects go out of scope (which only happens if you use reference counting as a\ngarbage collection strategy, PyPy uses a different approach to garbage\ncollection). Other examples are checking\nfor exception error messages very explicitly. However, the CPython test suite\nhas gotten a lot better in these regards over time, by adding\nsupport.gc_collect() calls to fix the former problem, and by marking some\nvery specific tests with the @impl_detail decorator. Thanks to all the\nCPython developers who have worked on this!\nIn the process of re-implementing CPython's functionality and running CPython's\ntests suite, PyPy can often also be a good way to find bugs in CPython. 
While we\nthink about the corner cases of some Python feature we occasionally find\nsituations where CPython didn't get everything completely correct either, which\nwe then report back.\n\n\nTesting for Performance Regressions\nAll the tests we described so far are checking behaviour. But one of PyPy's\nimportant goals is to be a fast implementation not \"just\" a correct one. Some\naspects of performance can be tested by regular unit tests, either application-\nor interpreter-level. In order to check whether some performance shortcut is\ntaken in the interpreter, we sometimes can write tests that monkeypatch the slow\ndefault implementation to always error. Then, if the fast path is taken\nproperly, that slow default implementation is never reached.\nBut we also have additional tests that test the correct interaction with the JIT\nexplicitly. For that, we have a special style of test that checks that the JIT\nwill produce the correct machine code for a small snippet of Python code. To\nmake this kind of test somewhat more robust, we don't check the machine code\ndirectly, but instead the architecture independent intermediate\nrepresentation that the JIT uses to produce machine code from.\nAs an example, here is a small test that loading the attribute of a constant\nglobal instance can be completely constant folded away:\ndef test_load_attr(self):\n src = '''\n class A(object):\n pass\n a = A()\n a.x = 1\n def main(n):\n i = 0\n while i < n:\n i = i + a.x\n return i\n '''\n log = self.run(src, [1000])\n assert log.result == 1000\n loop, = log.loops_by_filename(self.filepath)\n assert loop.match(\"\"\"\n i9 = int_lt(i5, i6)\n guard_true(i9, descr=...)\n guard_not_invalidated(descr=...)\n i10 = int_add(i5, 1)\n --TICK--\n jump(..., descr=...)\n \"\"\")\n\nThe string passed to the loop.match function is a string representation of\nthe intermediate representation code that is generated for the while loop in\nthe main function given in the source. 
The important part of that\nintermediate representation is that the i = i + a.x addition is optimized\ninto an int_add(x, 1) operation. The second argument for the addition is the\nconstant 1, because the JIT noted that the global a is a constant, and\nthe attribute x of that instance is always 1. The test thus checks that\nthis optimization still works.\nThose tests are again more on the unit test side of things (and can thus\nunfortunately be a bit brittle sometimes and break). The integration test\nequivalent for performance is the PyPy Speed Center which tracks the\nperformance of micro- and macro-benchmarks over time and lets us see when big\nperformance regressions are happening. The speed center is not really an\nautomatic test and does not produce pass/fail outcomes. Instead, it requires\nhuman judgement and intervention in order to interpret the performance changes.\nHaving a real pass/fail mechanism is something that would be great to have\nbut is probably quite tricky in practice.\n\n\nConclusion\nThis concludes my overview of some of the different styles of tests that we use\nto develop the PyPy Python interpreter.\nThere is a whole other set of tests for the development of the RPython language,\nthe garbage collectors it provides as well as the code that does the automatic\nJIT insertion, maybe I'll cover these in a future post.\n\nFootnotes\n\u00b9 CPython has the _testcapimodule.c and related modules, that are used to\nunit-test the C-API. However, these are still driven from Python tests using\nthe unittest framework and wouldn't run without the Python interpreter\nalready working.\n\u00b2 There is also a deprecated different way to write these tests, by putting\nthem in the test_*.py files that interpreter level tests are using and\nthen having a test class with the pattern class AppTest*. 
We haven't\nconverted all of them to the new style yet, even though the old style is\nquite weird: since the test_*.py files are themselves parsed by\nPython 2, the tests methods in AppTest* classes need to be written in the\nsubset of Python 3 syntax that is also valid Python 2 syntax, leading to a lot\nof confusion.\n\u00b3 Nit-picky side-note: C interpreters are a thing! But not that\nwidely used in practice, or only in very specific situations.", + "tags": "", + "url": "https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html" + }, + { + "title": "PyPy v7.3.9 security release", + "text": "PyPy v7.3.9 security release\nThe PyPy team is proud to release version 7.3.9 of PyPy. This is a security\nrelease to match the recent CPython release and updates the portable pypy\ntarballs with bzip2 1.0.8, openssl1.1.1n, and libexpat 2.4.7. Along\nthe way this release fixes some issues discovered after the 7.3.8 release and\nupdates sqlite3 to 3.38.2. It includes:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.7, which is an interpreter supporting the syntax and the features of\nPython 3.7, including the stdlib for CPython 3.7.13. This will be the last\nrelease of PyPy3.7.\nPyPy3.8, which is an interpreter supporting the syntax and the features of\nPython 3.8, including the stdlib for CPython 3.8.13.\nPyPy3.9, which is an interpreter supporting the syntax and the features of\nPython 3.9, including the stdlib for CPython 3.9.12. We relate to this as\n\"beta\" quality. We welcome testing of this version, if you discover\nincompatibilities, please report them so we can gain confidence in the version.\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases. 
Highlights of the release, since the release of 7.3.8 in February 2022,\ninclude:\n\n\nFixed some failing stdlib tests on PyPy3.9\nUpdate the bundled libexpat to 2.4.6 and sqlite3 to 3.38.2\n\n\nWe recommend updating. You can find links to download the v7.3.9 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non PyPy, or general help with making RPython's JIT even better. Since the\n7.3.7 release, we have accepted contributions from 6 new contributors,\nthanks for pitching in, and welcome to the project!\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a HPy / CFFI / cppyy version of your library that would be performant\non PyPy.\nIn any case both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and\n3.9. It's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThis PyPy release supports:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)\n64-bit ARM machines running Linux. 
A shoutout to Huawei for sponsoring\nthe VM running the tests.\ns390x running Linux\nbig- and little-endian variants of PPC64 running Linux,\n\n\nPyPy supports Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but\ndoes not release binaries. Please reach out to us if you wish to sponsor\nreleases for those platforms.\n\n\nKnown Issues with PyPy3.9\n\nWe slightly modified the concurrent future's ProcessPoolExecutor to\nstart all the worker processes when the first task is received (like on\nPython3.8) to avoid an apparent race condition when using fork and\nthreads (issue 3650).\n\n\n\nWhat else is new?\nFor more information about the 7.3.9 release, see the full changelog.\nPlease update, and continue to help us make PyPy better.\nCheers,\nThe PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2022/03/pypy-v738-release.html" + }, + { + "title": "PyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9", + "text": "PyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9-beta\nThe PyPy team is proud to release version 7.3.8 of PyPy. It has been only a few\nmonths since our last release, but we have some nice speedups and bugfixes we\nwish to share. The release includes four different interpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.7, which is an interpreter supporting the syntax and the features of\nPython 3.7, including the stdlib for CPython 3.7.12. This will be the last\nrelease of PyPy3.7.\nPyPy3.8, which is an interpreter supporting the syntax and the features of\nPython 3.8, including the stdlib for CPython 3.8.12. This is our third\nrelease of this interpreter, and we are removing the \"beta\" tag.\nPyPy3.9, which is an interpreter supporting the syntax and the features of\nPython 3.9, including the stdlib for CPython 3.9.10. 
As this is our first\nrelease of this interpreter, we relate to this as \"beta\" quality. We\nwelcome testing of this version, if you discover incompatibilities, please\nreport them so we can gain confidence in the version.\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases. Highlights of the release, since the release of 7.3.7 in late October 2021,\ninclude:\n\n\nPyPy3.9 uses an RPython version of the PEG parser which brought with it a\ncleanup of the lexer and parser in general\nFixed a regression in PyPy3.8 when JITting empty list comprehensions\nTweaked some issues around changing the file layout after packaging to make\nthe on-disk layout of PyPy3.8 more compatible with CPython. This requires\nsetuptools>=58.1.0\nRPython now allows the target executable to have a . in its name, so\nPyPy3.9 will produce a pypy3.9-c and libpypy3.9-c.so. Changing the\nname of the shared object to be version-specific (it used to be\nlibpypy3-c.so) will allow it to live alongside other versions.\nBuilding PyPy3.9+ accepts a --platlibdir argument like CPython.\nImprovement in ssl's use of CFFI buffers to speed up recv and recvinto\nUpdate the packaged OpenSSL to 1.1.1m\n\n\nWe recommend updating. You can find links to download the v7.3.8 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. 
PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non PyPy, or general help with making RPython's JIT even better. Since the\nprevious release, we have accepted contributions from 6 new contributors,\nthanks for pitching in, and welcome to the project!\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a HPy / CFFI / cppyy version of your library that would be performant\non PyPy.\nIn any case both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and\n3.9. It's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThis PyPy release supports:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)\n64-bit ARM machines running Linux. A shoutout to Huawei for sponsoring\nthe VM running the tests.\ns390x running Linux\nbig- and little-endian variants of PPC64 running Linux,\n\n\nPyPy supports Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but\ndoes not release binaries. 
Please reach out to us if you wish to sponsor\nreleases for those platforms.\n\n\nKnown Issues with PyPy3.9\n\nThere is still a known speed regression around **kwargs handling\nWe slightly modified the concurrent future's ProcessExcecutorPool to\nstart all the worker processes when the first task is received (like on\nPython3.8) to avoid an apparent race condition when using fork and\nthreads (issue 3650).\n\n\n\nWhat else is new?\nFor more information about the 7.3.8 release, see the full changelog.\nPlease update, and continue to help us make PyPy better.\nCheers,\nThe PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2022/02/pypy-v738-release.html" + }, + { + "title": "Natural Language Processing for Icelandic with PyPy: A Case Study", + "text": "Natural Language Processing for Icelandic with PyPy: A Case Study\nIcelandic is one\nof the smallest languages of the world, with about 370.000 speakers. It\nis a language in the Germanic family, most similar to Norwegian, Danish\nand Swedish, but closer to the original Old\nNorse spoken throughout\nScandinavia until about the 14th century CE.\nAs with other small languages, there are worries that the language may\nnot\nsurvive\nin a digital world, where all kinds of fancy applications are developed\nfirst - and perhaps only - for the major languages. Voice assistants,\nchatbots, spelling and grammar checking utilities, machine translation,\netc., are increasingly becoming staples of our personal and professional\nlives, but if they don\u2019t exist for Icelandic, Icelanders will gravitate\ntowards English or other languages where such tools are readily\navailable.\nIceland is a technology-savvy country, with world-leading adoption\nrates of the\nInternet,\nPCs and smart devices, and a thriving software industry. So the\ngovernment figured that it would be worthwhile to fund a 5-year\nplan to build natural\nlanguage processing (NLP) resources and other infrastructure for the\nIcelandic language. 
The project focuses on collecting data and\ndeveloping open source software for a range of core applications, such\nas tokenization, vocabulary lookup, n-gram statistics, part-of-speech\ntagging, named entity recognition, spelling and grammar checking, neural\nlanguage models and speech processing.\n\nMy name is Vilhj\u00e1lmur \u00deorsteinsson, and I\u2019m the founder and CEO of a\nsoftware startup Mi\u00f0eind in Reykjav\u00edk,\nIceland, that employs 10 software engineers and linguists and focuses on\nNLP and AI for the Icelandic language. The company participates in the\ngovernment\u2019s language technology program, and has contributed\nsignificantly to the program\u2019s core tools (e.g., a tokenizer and a\nparser), spelling and grammar checking modules, and a neural machine\ntranslation stack.\nWhen it came to a choice of programming languages and development tools\nfor the government program, the requirements were for a major, well\nsupported, vendor-and-OS-agnostic FOSS platform with a large and diverse\ncommunity, including in the NLP space. The decision to select Python as\na foundational language for the project was a relatively easy one. That\nsaid, there was a bit of trepidation around the well known fact that\nCPython can be slow for inner-core tasks, such as tokenization and\nparsing, that can see heavy workloads in production.\nI first became aware of PyPy in early 2016 when I was developing a\ncrossword game Netskrafl in Python 2.7\nfor Google App Engine. I had a utility program that compressed a\ndictionary into a Directed Acyclic Word Graph and was taking 160\nseconds\u00a0 to run on CPython 2.7, so I tried PyPy and to my amazement saw\na 4x speedup (down to 38 seconds), with literally no effort besides\ndownloading the PyPy runtime.\nThis led me to select PyPy as the default Python interpreter for my\ncompany\u2019s Python development efforts as well as for our production\nwebsites and API servers, a role in which it remains to this day. 
We\nhave followed PyPy\u2019s upgrades along the way, being just about to migrate\nour minimally required language version from 3.6 to 3.7.\nIn NLP, speed and memory requirements can be quite important for\nsoftware usability. On the other hand, NLP logic and algorithms are\noften complex and challenging to program, so programmer productivity and\ncode clarity are also critical success factors. A pragmatic approach\nbalances these factors, avoids premature optimization and seeks a\ncareful compromise between maximal run-time efficiency and minimal\nprogramming and maintenance effort.\nTurning to our use cases, our Icelandic text\ntokenizer \"Tokenizer\" is fairly light,\nruns tight loops and performs a large number of small, repetitive\noperations. It runs very well on PyPy\u2019s JIT and has not required further\noptimization.\nOur Icelandic parser Greynir\n(known on PyPI as reynir) is,\nif I may say so myself, a piece of work. It parses natural language\ntext according to a\nhand-written context-free\ngrammar,\nusing an Earley-type\nalgorithm as enhanced\nby Scott and\nJohnstone.\nThe CFG contains almost 7,000 nonterminals and 6,000 terminals, and the\nparser handles ambiguity as well as left, right and middle recursion. It\nreturns a packed parse forest for each input sentence, which is then\npruned by a scoring heuristic down to a single best result tree.\nThis parser was originally coded in pure Python and turned out to be\nunusably slow when run on CPython - but usable on PyPy, where it was\n3-4x faster. However, when we started applying it to heavier production\nworkloads, it\u00a0 became apparent that it needed to be faster still. We\nthen proceeded to convert the innermost Earley parsing loop from Python\nto tight\nC++\nand to call it from PyPy via\nCFFI, with callbacks for\ntoken-terminal matching functions (\u201cbusiness logic\u201d) that remained on\nthe Python side. 
This made the parser much faster (on the order of 100x\nfaster than the original on CPython) and quick enough for our production\nuse cases. Even after moving much of the heavy processing to C++ and using CFFI, PyPy still gives a significant speed boost over CPython.\nConnecting C++ code with PyPy proved to be quite painless using CFFI,\nalthough we had to figure out a few magic incantations in our build\nmodule\nto make it compile smoothly during setup from source on Windows and\nMacOS in addition to Linux. Of course, we build binary PyPy and CPython\nwheels for the most common targets so most users don\u2019t have to worry\nabout setup requirements.\nWith the positive experience from the parser project, we proceeded to\ntake a similar approach for two other core NLP packages: our compressed\nvocabulary package BinPackage\n(known on PyPI as islenska) and our\ntrigrams database package Icegrams.\nThese packages both take large text input (3.1 million word forms with\ninflection data in the vocabulary case; 100 million tokens in the\ntrigrams case) and compress it into packed binary structures. These\nstructures are then memory-mapped at run-time using\nmmap and queried via\nPython functions with a lookup time in the microseconds range. The\nlow-level data structure navigation is done in\nC++,\ncalled from Python via CFFI. The ex-ante preparation, packing,\nbit-fiddling and data structure generation is fast enough with PyPy, so\nwe haven\u2019t seen a need to optimize that part further.\nTo showcase our tools, we host public (and open source) websites such as\ngreynir.is for our parsing, named entity\nrecognition and query stack and\nyfirlestur.is for our spell and grammar\nchecking stack. The server code on these sites is all Python running on\nPyPy using Flask,\nwrapped in gunicorn and hosted on\nnginx. The underlying database is\nPostgreSQL accessed via\nSQLAlchemy and\npsycopg2cffi. 
This setup\nhas served us well for 6 years and counting, being fast, reliable and\nhaving helpful and supporting communities.\nAs can be inferred from the above, we are avid fans of PyPy and\ncommensurately thankful for the great work by the PyPy team over the\nyears. PyPy has enabled us to use Python for a larger part of our\ntoolset than CPython alone would have supported, and its smooth\nintegration with C/C++ through CFFI has helped us attain a better\ntradeoff between performance and programmer productivity in our\nprojects. We wish for PyPy a great and bright future and also look\nforward to exciting related developments on the horizon, such as\nHPy.", + "tags": "casestudy", + "url": "https://www.pypy.org/posts/2022/02/nlp-icelandic-case-study.html" + }, + { + "title": "Error Message Style Guides of Various Languages", + "text": "Error Message Style Guides of Various Languages\nPyPy has been trying to produce good SyntaxErrors and other errors for\na long time. CPython has also made an enormous push to improve its\nSyntaxErrors in the last few releases. These improvements are great, but the process\nfeels somewhat arbitrary sometimes. To see what other languages are doing, I\nasked people on Twitter whether they know of error message style guides for\nother programming languages.\nWonderfully, people answered me with lots of helpful links (full list at the\nend of the post), thank you everybody! All those sources are very interesting\nand contain many great points, I recommend reading them directly! In this\npost, I'll try to summarize some common themes or topics that I thought were\nparticularly interesting.\n\nLanguage Use\nAlmost all guides stress the need for plain and simple English, as well as\nconciseness and clarity [Flix, Racket, Rust, Flow]. 
Flow suggests to put coding\neffort into making the grammar correct, for example in the case of plurals or\nto distinguish between \"a\" and \"an\".\nThe suggested tone should be friendly and neutral, the messages should not\nblame the Programmer [Flow]. Rust and Flix suggest to not use the term\n'illegal' and use something like 'invalid' instead.\nFlow suggests to avoid \"compiler speak\". For example terms like 'token' and\n'identifier' should be avoided and terms that are more familiar to programmers\nbe used (eg \"name\" is better). The Racket guide goes further and has a list of\nallowed technical terms and some prohibited terms.\n\n\nStructure\nSeveral guides (such as Flix and Flow) point out a 80/20 rule: 80% of the times an error message is\nread, the developer knows that message well and knows exactly what to do. For\nthis use case it's important that the message is short. On the other hand, 20%\nof the times this same message will have to be understood by a developer who\nhas never seen it before and is confused, and so the message needs to contain\nenough information\nto allow them to find out what is going on. So the error message needs to strike\na balance between brevity and clarity.\nThe Racket guide proposes to use the following general structure for errors:\n'State the constraint that was violated (\"expected a\"), followed by what was\nfound instead.'\nThe Rust guide says to avoid \"Did you mean?\" and questions in general, and\nwants the compiler to instead be explicit about why something was suggested. The\nexample the Rust guide gives is: 'Compare \"did you mean: Foo\" vs. \"there is a\nstruct with a similar name: Foo\".' 
Racket goes further and forbids\nsuggestions altogether because \"Students will follow well\u2010meaning\u2010but\u2010wrong\nadvice uncritically, if only because they have no reason to doubt the\nauthoritative voice of the tool.\"\n\n\nFormatting and Source Positions\nThe Rust guide suggests to put all identifiers into backticks (like in\nMarkdown), Flow formats the error messages using full Markdown.\nThe Clang, Flow and Rust guides point out the importance of using precise\nsource code spans to point to errors, which is especially important if the\ncompiler information is used in the context of an IDE to show a red squiggly\nunderline or some other highlighting. The spans should be as small as possible to point out the source of\nthe error [Flow].\n\n\nConclusion\nI am quite impressed how advanced and well-thought out the approaches are. I wonder whether it would make sense for\nPython to adopt a (probably minimal, to get started) subset of these ideas as guidelines for its own errors.\n\n\nSources\n\nRust: https://rustc-dev-guide.rust-lang.org/diagnostics.html\nClang: https://clang.llvm.org/diagnostics.html\nFlix: https://flix.dev/principles/\nRacket: https://cs.brown.edu/~kfisler/Misc/error-msg-guidelines-racket-studlangs.pdf\nMore about the research that led to the Racket guidelines (including the referenced papers): https://twitter.com/ShriramKMurthi/status/1451688982761381892\nFlow: https://calebmer.com/2019/07/01/writing-good-compiler-error-messages.html\nElm: https://elm-lang.org/news/compiler-errors-for-humans\nElm's error message catalog: https://github.com/elm/error-message-catalog\nReason: https://reasonml.github.io/blog/2017/08/25/way-nicer-error-messages.html", + "tags": "", + "url": "https://www.pypy.org/posts/2021/12/error-message-style-guides.html" + }, + { + "title": "PyPy v7.3.7: bugfix release of python 3.7 and 3.8", + "text": "PyPy v7.3.7: bug-fix release of 3.7, 3.8\nWe are releasing a PyPy 7.3.7 to fix the recent 7.3.6 release's 
binary\nincompatibility with the previous 7.3.x releases. We mistakenly added fields\nto PyFrameObject and PyDateTime_CAPI that broke the promise of binary\ncompatibility, which means that c-extension wheels compiled for 7.3.5 will not\nwork with 7.3.6 and vice versa. Please do not use 7.3.6.\nWe have added a cursory test for binary API breakage to the\nhttps://github.com/pypy/binary-testing repo which hopefully will prevent such\nmistakes in the future.\nAdditionally, a few smaller bugs were fixed:\n\nUse uint for the request argument of fcntl.ioctl (issue 3568)\nFix incorrect tracing of while True body in 3.8 (issue 3577)\nProperly close resources when using a concurrent.futures.ProcessPool\n(issue 3317)\nFix the value of LIBDIR in _sysconfigdata in 3.8 (issue 3582)\n\nYou can find links to download the v7.3.7 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog site via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non PyPy, or general help with making RPython's JIT even better.\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a CFFI / cppyy version of your library that would be performant on PyPy.\nIn any case both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and\n3.8. 
It's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThis PyPy release supports:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)\n64-bit ARM machines running Linux.\ns390x running Linux\n\n\nPyPy does support ARM 32 bit and PPC64 processors, but does not release binaries.", + "tags": "release", + "url": "https://www.pypy.org/posts/2021/10/pypy-v737-release.html" + }, + { + "title": "PyPy v7.3.6: release of python 2.7, 3.7, and 3.8", + "text": "PyPy v7.3.6: release of python 2.7, 3.7, and 3.8-beta\nThe PyPy team is proud to release version 7.3.6 of PyPy, which includes\nthree different interpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.7, which is an interpreter supporting the syntax and the features of\nPython 3.7, including the stdlib for CPython 3.7.12.\nPyPy3.8, which is an interpreter supporting the syntax and the features of\nPython 3.8, including the stdlib for CPython 3.8.12. Since this is our\nfirst release of the interpreter, we relate to this as \"beta\" quality. We\nwelcome testing of this version, if you discover incompatibilities, please\nreport them so we can gain confidence in the version.\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases. Highlights of the release, since the release of 7.3.5 in May 2021,\ninclude:\n\n\nWe have merged a backend for HPy, the better C-API interface. The backend\nimplements HPy version 0.0.3.\nTranslation of PyPy into a binary, known to be slow, is now about 40%\nfaster. 
On a modern machine, PyPy3.8 can translate in about 20 minutes.\nPyPy Windows 64 is now available on conda-forge, along with nearly 700\ncommonly used binary packages. This new offering joins the more than 1000\nconda packages for PyPy on Linux and macOS. Many thanks to the conda-forge\nmaintainers for pushing this forward over the past 18 months.\nSpeed improvements were made to io, sum, _ssl and more. These\nwere done in response to user feedback.\nThe 3.8 version of the release contains a beta-quality improvement to the\nJIT to better support compiling huge Python functions by breaking them\nup into smaller pieces.\nThe release of Python3.8 required a concerted effort. We were greatly\nhelped by @isidentical (Batuhan Taskaya) and other new contributors.\nThe 3.8 package now uses the same layout as CPython, and many of the\nPyPy-specific changes to sysconfig, distutils.sysconfig, and\ndistutils.commands.install.py have been removed. The stdlib now\nis located in /lib/pypy3.8 on posix systems, and in\n/Lib on Windows. The include files on windows remain the same.\nOn posix they are in /include/pypy3.8. Note we still use the\npypy prefix to prevent mixing the files with CPython (which uses\npython).\n\n\nWe recommend updating. You can find links to download the v7.3.6 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our blog via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non PyPy, or general help with making RPython's JIT even better. 
Since the\nprevious release, we have accepted contributions from 7 new contributors,\nthanks for pitching in, and welcome to the project!\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a CFFI / cppyy version of your library that would be performant on PyPy.\nIn any case both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and\nsoon 3.8. It's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThis PyPy release supports:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n64-bit ARM machines running Linux.\n\n\nPyPy does support Windows 32-bit and ARM 32 bit processors, but does not\nrelease binaries. Please reach out to us if you wish to sponsor releases for\nthose platforms.\n\n\nWhat else is new?\nFor more information about the 7.3.6 release, see the full changelog.\nPlease update, and continue to help us make PyPy better.\nCheers,\nThe PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2021/10/pypy-v736-release.html" + }, + { + "title": "Better JIT Support for Auto-Generated Python Code", + "text": "Performance Cliffs\nA common bad property of many different JIT compilers is that of a \"performance\ncliff\": A seemingly reasonable code change, leading to massively reduced\nperformance due to hitting some weird property of the JIT compiler that's not\neasy to understand for the programmer (e.g. here's a blog post about the fix of\na performance cliff when running React on\nV8). Hitting a performance cliff as a\nprogrammer can be intensely frustrating and turn people off from using PyPy\naltogether. 
Recently we've been working on trying to remove some of PyPy's\nperformance cliffs, and this post describes one such effort.\nThe problem showed up in an issue\nwhere somebody found the performance\nof their website using Tornado a lot\nworse than what various benchmarks suggested. It took some careful digging to\nfigure out what caused the problem: The slow performance was caused by the huge\nfunctions that the Tornado templating engine creates. These functions lead the\nJIT to behave in unproductive ways. In this blog post I'll describe why the\nproblem occurs and how we fixed it.\nProblem\nAfter quite a bit of debugging we narrowed down the problem to the following\nreproducer: If you render a big HTML template\n(example)\nusing the Tornado templating engine, the template rendering is really not any\nfaster than CPython. A small template doesn't show this behavior, and other\nparts of Tornado seem to perform well. So we looked into how the templating\nengine works, and it turns out that the templates are compiled into Python\nfunctions. This means that a big template can turn into a really enormous Python\nfunction (Python version of the\nexample).\nFor some reason really enormous Python functions aren't handled particularly\nwell by the JIT, and in the next section I'll explain some of the background that's\nnecessary to understand why this happens.\nTrace Limits and Inlining\nTo understand why the problem occurs, it's necessary to understand how PyPy's\ntrace limit and inlining work. The tracing JIT has a maximum trace length built\nin, the reason for that is some limitation in the compact encoding of traces in\nthe JIT. Another reason is that we don't want to generate arbitrarily large chunks\nof machine code. Usually, when we hit the trace limit, it is due to inlining.\nWhile tracing, the JIT will inline many of the functions called from the\noutermost one. 
This is usually good and improves performance greatly, however,\ninlining can also lead to the trace being too long. If that happens, we\nwill mark a called function as uninlinable. The next time we trace the outer\nfunction we won't inline it, leading to a shorter trace, which hopefully fits\nthe trace limit.\n\nIn the diagram above we trace a function f, which calls a function g, which\nis inlined into the trace. The trace ends up being too long, so the JIT\ndisables inlining of g. The next time we try to trace f the trace will\ncontain a call to g instead of inlining it. The trace ends up being not too\nlong, so we can turn it into machine code when tracing finishes.\nNow we know enough to understand what the problem with automatically generated\ncode is: sometimes, the outermost function itself\ndoesn't fit the trace limit, without any inlining going on at all. This is\nusually not the case for normal, hand-written Python functions. However, it can\nhappen for automatically generated Python code, such as the code that the\nTornado templating engine produces.\nSo, what happens when the JIT hits such a huge function? The function is traced\nuntil the trace is too long. Then the trace limit stops further tracing. Since\nnothing was inlined, we cannot make the trace shorter the next time by disabling\ninlining. Therefore, this happens again and again, the next time we trace the\nfunction we run into exactly the same problem. The net effect is that the\nfunction is even slowed down: we spend time tracing it, then stop tracing and\nthrow the trace away. Therefore, that effort is never useful, so the resulting\nexecution can be slower than not using the JIT at all!\nSolution\nTo get out of the endless cycle of useless retracing we first had the idea of\nsimply disabling all code generation for such huge functions, that produce too long\ntraces even if there is no inlining at all. 
However, that led to disappointing\nperformance in the example Tornado program, because important parts of the code\nremain always interpreted.\nInstead, our solution is now as follows: After we have hit the trace limit and\nno inlining has happened so far, we mark the outermost function as a source of huge\ntraces. The next time we trace such a function, we do so in a special mode. In\nthat mode, hitting the trace limit behaves differently: Instead of stopping the\ntracer and throwing away the trace produced so far, we will use the unfinished\ntrace to produce machine code. This trace corresponds to the first part of the\nfunction, but stops at a basically arbitrary point in the middle of the\nfunction.\nThe question is what should happen when execution\nreaches the end of this unfinished trace. We want to be able to cover more of\nthe function with machine code and therefore need to extend the trace\nfrom that point on. But we don't want to do that too\neagerly to prevent lots and lots of machine code being generated. To achieve\nthis behaviour we add a guard to the end of the unfinished trace, which will\nalways fail. This has the right behaviour: a failing guard will transfer control\nto the interpreter, but if it fails often enough, we can patch it to jump to\nmore machine code, that starts from this position. In that way, we can slowly\nexplore the full gigantic function and add all those parts of the control flow\ngraph that are actually commonly executed at runtime.\n\nIn the diagram we are trying to trace a huge function f, which leads to\nhitting the trace limit. However, nothing was inlined into the trace, so\ndisabling inlining won't ensure a successful trace attempt the next time.\nInstead, we mark f as \"huge\". 
This has the effect that when we trace it again\nand are about to hit the trace limit, we end the trace at an arbitrary point by\ninserting a guard that always fails.\n\nIf this guard failure is executed often enough, we might patch the guard and\nadd a jump to a further part of the function f. This can continue potentially\nseveral times, until the trace really hits an end point (for example by\nclosing the loop and jumping back to trace 1, or by returning from f).\nEvaluation\nSince this is a performance cliff that we didn't observe in any of our\nbenchmarks ourselves, it's pointless to look at the\neffect that this improvement has on existing benchmarks \u2013 there shouldn't and\nindeed there isn't any.\nInstead, we are going to look at a micro-benchmark that came out of the\noriginal bug report, one that simply renders a big artificial Tornado template\n200 times. The code of the micro-benchmark can be found\nhere.\nAll benchmarks were run 10 times in new processes. The means and standard\ndeviations of the benchmark runs are:\n\n\n\nImplementation\nTime taken (lower is better)\n\n\n\n\nCPython 3.9.5\n14.19 \u00b1 0.35s\n\n\nPyPy3 without JIT\n59.48 \u00b1 5.41s\n\n\nPyPy3 JIT old\n14.47 \u00b1 0.35s\n\n\nPyPy3 JIT new\n4.89 \u00b1 0.10s\n\n\n\nWhat we can see is that while the old JIT is very helpful for this\nmicro-benchmark, it only brings the performance up to CPython levels, not\nproviding any extra benefit. The new JIT gives an almost 3x speedup.\nAnother interesting number we can look at is how often the JIT started a trace,\nand for how many traces we produced actual machine code:\n\n\n\nImplementation\nTraces Started\nTraces sent to backend\nTime spent in JIT\n\n\n\n\nPyPy3 JIT old\n216\n24\n0.65s\n\n\nPyPy3 JIT new\n30\n25\n0.06s\n\n\n\nHere we can clearly see the problem: The old JIT would try tracing the\nauto-generated templating code again and again, but would never actually produce\nany machine code, wasting lots of time in the process. 
The new JIT still traces a\nfew times uselessly, but then eventually converges and stops emitting machine\ncode for all the paths through the auto-generated Python code.\n\n\nRelated Work\nTim Felgentreff pointed me to the fact that\nTruffle also has a\nmechanism\nto slice huge methods into smaller compilation units (and I am sure other JITs\nhave such mechanisms as well).\nConclusion\nIn this post we've described a performance cliff in PyPy's JIT, that of really\nbig auto-generated functions which hit the trace limit without inlining, that we\nstill want to generate machine code for. We achieve this by chunking up the\ntrace into several smaller traces, which we compile piece by piece. This is not\na super common thing to be happening \u2013 otherwise we would have run into and\nfixed it earlier \u2013 but it's still good to have a fix now.\nThe work\ndescribed in this post is a tiny bit experimental still, but we will release it as\npart of the upcoming 3.8 beta release, to get some more experience with it.\nPlease grab a 3.8 release\ncandidate,\ntry it out and let us know your observations, good and bad!", + "tags": "", + "url": "https://www.pypy.org/posts/2021/09/jit-auto-generated-code.html" + }, + { + "title": "#pypy IRC moves to Libera.Chat", + "text": "Following the example of many other FOSS projects, the PyPy team has\ndecided to move its official #pypy IRC channel from Freenode to\nLibera.Chat: irc.libera.chat/pypy\nThe core devs will no longer be present on the Freenode channel, so we recommend to\njoin the new channel as soon as possible.\nwikimedia.org has a\nnice guide on\nhow to setup your client to migrate from Freenode to Libera.Chat.", + "tags": "", + "url": "https://www.pypy.org/posts/2021/05/pypy-irc-moves-to-libera-chat.html" + }, + { + "title": "PyPy v7.3.5: bugfix release of python 2.7 and 3.7", + "text": "PyPy v7.3.5: release of 2.7 and 3.7\nWe are releasing a PyPy 7.3.5 with bugfixes for PyPy 7.3.4, released April 4.\nPyPy 7.3.4 was the first 
release that runs on windows 64-bit, so that support\nis still \"beta\". We are releasing it in the hopes that we can garner momentum\nfor its continued support, but are already aware of some problems, for instance\nit errors in the NumPy test suite (issue 3462). Please help out with testing\nthe release and reporting successes and failures, financially supporting our\nongoing work, and helping us find the source of these problems.\n\nThe new windows 64-bit builds improperly named c-extension modules\nwith the same extension as the 32-bit build (issue 3443)\nUse the windows-specific PC/pyconfig.h rather than the posix one\nFix the return type for _Py_HashDouble which impacts 64-bit windows\nA change to the python 3.7 sysconfig.get_config_var('LIBDIR') was wrong,\nleading to problems finding libpypy3-c.so for embedded PyPy (issue 3442).\nInstantiate distutils.command.install schema for PyPy-specific\nimplementation_lower\nDelay thread-checking logic in greenlets until the thread is actually started\n(continuation of issue 3441)\nFour upstream (CPython) security patches were applied:\n\nBPO 42988 to remove pydoc.getfile\nBPO 43285 to not trust the PASV response in ftplib.\nBPO 43075 to remove a possible ReDoS in urllib AbstractBasicAuthHandler\nBPO 43882 to sanitize urls containing ASCII newline and tabs in\nurllib.parse\n\n\nFix for json-specialized dicts (issue 3460)\nSpecialize ByteBuffer.setslice which speeds up binary file reading by a\nfactor of 3\nWhen assigning the full slice of a list, evaluate the rhs before clearing the\nlist (issue 3440)\nOn Python2, PyUnicode_Contains accepts bytes as well as unicode.\nFinish fixing _sqlite3 - untested _reset() was missing an argument\n(issue 3432)\nUpdate the packaged sqlite3 to 3.35.5 on windows. While not a bugfix, this\nseems like an easy win.\n\nWe recommend updating. 
These fixes are the direct result of end-user bug\nreports, so please continue reporting issues as they crop up.\nYou can find links to download the v7.3.5 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our renovated blog site via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non PyPy, or general help with making RPython's JIT even better.\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a CFFI / cppyy version of your library that would be performant on PyPy.\nIn any case both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and\nsoon 3.8. 
It's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThis PyPy release supports:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 32/64 bits, OpenBSD, FreeBSD)\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n64-bit ARM machines running Linux.\n\n\nPyPy does support ARM 32 bit processors, but does not release binaries.", + "tags": "release", + "url": "https://www.pypy.org/posts/2021/05/pypy-v735-release.html" + }, + { + "title": "Checksums", + "text": "Here are the checksums\npypy3.10-v7.3.17 sha256:\n53b6e5907df869c49e4eae7aca09fba16d150741097efb245892c1477d2395f2 pypy3.10-v7.3.17-aarch64.tar.bz2\ne534110e1047da37c1d586c392f74de3424f871d906a2083de6d41f2a8cc9164 pypy3.10-v7.3.17-linux32.tar.bz2\nfdcdb9b24f1a7726003586503fdeb264fd68fc37fbfcea022dcfe825a7fee18b pypy3.10-v7.3.17-linux64.tar.bz2\na050e25e8d686853dd5afc363e55625165825dacfb55f8753d8225ebe417cfd2 pypy3.10-v7.3.17-macos_arm64.tar.bz2\n6c2c5f2300d7564e711421b4968abd63243cb96f76e363975dd648ebf4a362ee pypy3.10-v7.3.17-macos_x86_64.tar.bz2\n6ad74bc578e9c6d3a8a1c51503313058e3c58c35df86f7485453c4be6ab24bf7 pypy3.10-v7.3.17-src.tar.bz2\n00857673af7d92144a5e134c723891953a1e99ac002eff440330de23a8147e85 pypy3.10-v7.3.17-src.zip\ncab794a03ddda26238c72942ea6f225612e0dc17c76cac6652da83a95024e6e8 pypy3.10-v7.3.17-win64.zip\npypy2.7-v7.3.17 sha256:\na8df5ce1650f4756933f8780870c91a0a40e7c9870d74629bf241392bcb5c2e3 pypy2.7-v7.3.17-aarch64.tar.bz2\na3aa0867cc837a34941047ece0fbb6ca190410fae6ad35fae4999d03bf178750 pypy2.7-v7.3.17-linux32.tar.bz2\n9f3497f87b3372d17e447369e0016a4bec99a6b4d2a59aba774a25bfe4353474 pypy2.7-v7.3.17-linux64.tar.bz2\n8573172db377ee0831bf20492cdee9bac4e0b194e3dfe8bf7c44ee257a824766 pypy2.7-v7.3.17-macos_arm64.tar.bz2\ne3e1af1d6ad15e51d8d19ea36e1ac65c4c792314cc8b8dc5cf771ec4353b50f8 
pypy2.7-v7.3.17-macos_x86_64.tar.bz2\n50e06840f4bbde91448080a4118068a89b8fbcae25ff8da1e2bb1402dc9a0346 pypy2.7-v7.3.17-src.tar.bz2\n593cedd368a59bd5ed5dc8df00961a42a50c5d75d2614a96b1c75d25612dadf1 pypy2.7-v7.3.17-src.zip\n2ce2f4c205819902ee3ea2e80f8fc9ae9b18647bcfc8046ba83fe46b4139f734 pypy2.7-v7.3.17-win64.zip\npypy3.10-v7.3.16 sha256:\nfc720999bc5050e1d3706b3b6445e695cf42bfc71ebc7c88ed6bb88828b1d385 pypy3.10-v7.3.16-aarch64.tar.bz2\n0df48aa780159e879ac89a805d143e4a6cd1b842f98046f5a3f865814bfaa2a4 pypy3.10-v7.3.16-linux32.tar.bz2\n404e6180d6caf9258eaab0c02c72018e9aa8eb03ab9094a0ff17ee5e3b265ac1 pypy3.10-v7.3.16-linux64.tar.bz2\n6c003376667a95c7a228544649677b9927b8210d6444b901817aad24b8719b93 pypy3.10-v7.3.16-macos_arm64.tar.bz2\n490f2c6ba2489f405444f3b4ad42166da6e2eb73489a9535b206067eaaf21737 pypy3.10-v7.3.16-macos_x86_64.tar.bz2\naf97efe498a209ba18c7bc7d084164a9907fb3736588b6864955177e19d5216a pypy3.10-v7.3.16-s390x.tar.bz2\n4a3a3177d0a1f51d59982bb981d1d485403bda3419d5437b9e077f55f59424ff pypy3.10-v7.3.16-src.tar.bz2\n8f59b6859d7d49036afce8156ea52f9c6a1e8d1e08af01bd6c70444d092841f5 pypy3.10-v7.3.16-src.zip\ne08415a2f35c6ecf2342b504bdfde11e4c5eca3fc5ef7fd2214ff064a5a54396 pypy3.10-v7.3.16-win64.zip\npypy3.9-v7.3.16 sha256:\nde3f2ed3581b30555ac0dd3e4df78a262ec736a36fb2e8f28259f8539b278ef4 pypy3.9-v7.3.16-aarch64.tar.bz2\n583b6d6dd4e8c07cbc04da04a7ec2bdfa6674825289c2378c5e018d5abe779ea pypy3.9-v7.3.16-linux32.tar.bz2\n16f9c5b808c848516e742986e826b833cdbeda09ad8764e8704595adbe791b23 pypy3.9-v7.3.16-linux64.tar.bz2\n88f824e7a2d676440d09bc90fc959ae0fd3557d7e2f14bfbbe53d41d159a47fe pypy3.9-v7.3.16-macos_arm64.tar.bz2\nfda015431621e7e5aa16359d114f2c45a77ed936992c1efff86302e768a6b21c pypy3.9-v7.3.16-macos_x86_64.tar.bz2\n7a56ebb27dba3110dc1ff52d8e0449cdb37fe5c2275f7faf11432e4e164833ba pypy3.9-v7.3.16-s390x.tar.bz2\n5b75af3f8e76041e79c1ef5ce22ce63f8bd131733e9302081897d8f650e81843 pypy3.9-v7.3.16-src.tar.bz2\ndef4dae720dd09b868b9b8a7a1255f07f925d88a4543f99cd9ae1aeb0a49ff5e 
pypy3.9-v7.3.16-src.zip\n06ec12a5e964dc0ad33e6f380185a4d295178dce6d6df512f508e7aee00a1323 pypy3.9-v7.3.16-win64.zip\npypy2.7-v7.3.16 sha256:\nbe44e65dd8c00d2388b2580dbe2af6a5179f951a8f4979efc74360f92f3c7e96 pypy2.7-v7.3.16-aarch64.tar.bz2\na19712d7a6bd4f6d113e352c5271803c583b5129b76a357d387b1fa85204f8e5 pypy2.7-v7.3.16-linux32.tar.bz2\n04b2fceb712d6f811274825b8a471ee392d3d1b53afc83eb3f42439ce00d8e07 pypy2.7-v7.3.16-linux64.tar.bz2\n9cc13f4d6c4096820e1e0ddabb3959f853e45150ce0166a39aa23867e99f0145 pypy2.7-v7.3.16-macos_arm64.tar.bz2\ne8744c1cef8b9e4eb2d2b6b368ed19a1c5cde482c7ef750f2d9f0807bb77fd1c pypy2.7-v7.3.16-macos_x86_64.tar.bz2\n09eb70b932e6aac484cf4b5f2de5845f71589f2cbb53e5ed37a497613b43cd53 pypy2.7-v7.3.16-s390x.tar.bz2\n43721cc0c397f0f3560b325c20c70b11f7c76c27910d3df09f8418cec4f9c2ad pypy2.7-v7.3.16-src.tar.bz2\n54c5f8405bb28e3a48d8962ad1765e8536d53546e1c352bcabab36e5727dd609 pypy2.7-v7.3.16-src.zip\na51ac82cc0374f86b5eba571d4e5f23cdce5ac7cd3bd5b2d2d726c0d98684d7d pypy2.7-v7.3.16-win64.zip\npypy3.10-v7.3.15 sha256:\n52146fccaf64e87e71d178dda8de63c01577ec3923073dc69e1519622bcacb74 pypy3.10-v7.3.15-aarch64.tar.bz2\n75dd58c9abd8b9d78220373148355bc3119febcf27a2c781d64ad85e7232c4aa pypy3.10-v7.3.15-linux32.tar.bz2\n33c584e9a70a71afd0cb7dd8ba9996720b911b3b8ed0156aea298d4487ad22c3 pypy3.10-v7.3.15-linux64.tar.bz2\nd927c5105ea7880f7596fe459183e35cc17c853ef5105678b2ad62a8d000a548 pypy3.10-v7.3.15-macos_arm64.tar.bz2\n559b61ba7e7c5a5c23cef5370f1fab47ccdb939ac5d2b42b4bef091abe3f6964 pypy3.10-v7.3.15-macos_x86_64.tar.bz2\n209e57596381e13c9914d1332f359dc4b78de06576739747eb797bdbf85062b8 pypy3.10-v7.3.15-s390x.tar.bz2\n837622130b36603a1893899bd9f529961a8e4a56c9eb67268d72ddf8920c9579 pypy3.10-v7.3.15-src.tar.bz2\n67432b82dd7e436d818bd6cd38115564f13fc226ffd2940f3915ad68b0fc683b pypy3.10-v7.3.15-src.zip\nb378b3ab1c3719aee0c3e5519e7bff93ff67b2d8aa987fe4f088b54382db676c pypy3.10-v7.3.15-win64.zip\npypy3.9-v7.3.15 
sha256:\n03e35fcba290454bb0ccf7ee57fb42d1e63108d10d593776a382c0a2fe355de0 pypy3.9-v7.3.15-aarch64.tar.bz2\nc6209380977066c9e8b96e8258821c70f996004ce1bc8659ae83d4fd5a89ff5c pypy3.9-v7.3.15-linux32.tar.bz2\nf062be307200bde434817e1620cebc13f563d6ab25309442c5f4d0f0d68f0912 pypy3.9-v7.3.15-linux64.tar.bz2\n300541c32125767a91b182b03d9cc4257f04971af32d747ecd4d62549d72acfd pypy3.9-v7.3.15-macos_arm64.tar.bz2\n18ad7c9cb91c5e8ef9d40442b2fd1f6392ae113794c5b6b7d3a45e04f19edec6 pypy3.9-v7.3.15-macos_x86_64.tar.bz2\ndeeb5e54c36a0fd9cfefd16e63a0d5bed4f4a43e6bbc01c23f0ed8f7f1c0aaf3 pypy3.9-v7.3.15-s390x.tar.bz2\n6bb9537d85aa7ad13c0aad2e41ff7fd55080bc9b4d1361b8f502df51db816e18 pypy3.9-v7.3.15-src.tar.bz2\n06dd38124b873343bdf566ca9076ff8e38ad82fd7f2feecd942480c2200a13c0 pypy3.9-v7.3.15-src.zip\na156dad8b58570597eaaabe05663f00f80c60bc11df4a9c46d0953b6c5eb9209 pypy3.9-v7.3.15-win64.zip\npypy2.7-v7.3.15 sha256:\n31b41fca7280636d7818713b7a0fab8f34ece9c82cc88e51d305d43b3e6306d6 pypy2.7-v7.3.15-aarch64.tar.bz2\ncb5c1da62a8ca31050173c4f6f537bc3ff316026895e5f1897b9bb526babae79 pypy2.7-v7.3.15-linux32.tar.bz2\ne857553bdc4f25ba9670a5c173a057a9ff71262d5c5da73a6ddef9d7dc5d4f5e pypy2.7-v7.3.15-linux64.tar.bz2\n618d33df7ac6570d88a58183e3e15c56f63f862968cecbd2ee896eac6255cea6 pypy2.7-v7.3.15-macos_arm64.tar.bz2\n72dac262fc63115b6ccd2c3c15e7afd1b2e7a65d7e97265c116246d1cf2cdffd pypy2.7-v7.3.15-macos_x86_64.tar.bz2\neb442279ec3f1eb17da296e38b531d3ca50c6418eab208a020bca4646a1dea46 pypy2.7-v7.3.15-s390x.tar.bz2\na66ddaed39544a35bb7ab7a17dbf673a020c7cb3a614bd2b61a54776888daf2c pypy2.7-v7.3.15-src.tar.bz2\na424a065d42b49f6e7f3576cdc3acb60778dd578be8d59f04eccd35c2ef15dc8 pypy2.7-v7.3.15-src.zip\nca3c813aec8f9304c7bdc0f69d8ea2a263d4247224ee094e0017338da84c75f2 pypy2.7-v7.3.15-win64.zip\npypy3.10-v7.3.14 sha256:\nfbef65dfc69dcd6006d843553d268b331f1b13dfc3938492bd35f0f477b5bcf4 pypy3.10-v7.3.14-aarch64.tar.bz2\nd37e7c7a03bed5dceca2ab7f821ad7655808cccf6908155f78f0effd811b7f4f 
pypy3.10-v7.3.14-linux32.tar.bz2\na83879891dc0a6c1504da0954fba1125b21a2591782897231a8168100ea72b94 pypy3.10-v7.3.14-linux64.tar.bz2\n0f09584e21ed8f45e8ff1e3dd1582f077ebdd23a1068298f45006f67bc692632 pypy3.10-v7.3.14-macos_arm64.tar.bz2\n31ce62b7ea3b5b5bde68241ae9404f0a68f5a7d0094ef651062b7a64caecfd4e pypy3.10-v7.3.14-macos_x86_64.tar.bz2\n363e87ad3b6547cc68981c665cf049449bed44cf9e49cabbbcc61df73ea2d40b pypy3.10-v7.3.14-s390x.tar.bz2\na3481af466103fa13740db4e27780e0423dcf8626b3340f60d3d3c28fbc11ae0 pypy3.10-v7.3.14-src.tar.bz2\n95db3e9d22a4820ad9a683d4f6895fa611b16ed02bd709c86a4ac903f9b36721 pypy3.10-v7.3.14-src.zip\n1713edd310f400935fe9a9f8fa0fd9da1a405eaf7b69564d00f437fb498327f8 pypy3.10-v7.3.14-win64.zip\npypy3.9-v7.3.14 sha256:\n14b842f32f60ce2d9d130971f9bcbdb6875824a0e78fac36806d267e0982179c pypy3.9-v7.3.14-aarch64.tar.bz2\n4ad89a22369a6f2f83a7d8d047e0fc4cf5597f0921fa7afa23499ed05f663503 pypy3.9-v7.3.14-linux32.tar.bz2\nfebd770a616641ca8419c381c7fb224e515b892551d0db49a1231397ed38859d pypy3.9-v7.3.14-linux64.tar.bz2\n4f8f2464a743f855b8fc8bda7ce7994a674616db3b5c2c1955cd08502fa782ca pypy3.9-v7.3.14-macos_arm64.tar.bz2\n0e2fea9b2dadb82b7acf05f21c0144f7bb1cfaaa39c693ab1eba4aef5ed52680 pypy3.9-v7.3.14-macos_x86_64.tar.bz2\nba2451e9081db5bc724a05530a7f98817231de83ff6fdf15bad21a4e9b6dfeae pypy3.9-v7.3.14-s390x.tar.bz2\n560fe6161e159557e1fe612aaadf9b293eefded1da372e70b8e3b23bba598366 pypy3.9-v7.3.14-src.tar.bz2\n16336170410dd13eb39fbacb412b640c9e3ab4dcdd3e2a8f3ba7978edae1dc2d pypy3.9-v7.3.14-src.zip\n9b3d8496f2a4729fdf20d9f835299902048950baad3a42019b67da75ca5b38b7 pypy3.9-v7.3.14-win64.zip\npypy2.7-v7.3.14 sha256:\n98468f4cc704a2821401afdd001ebddd367e594e05a70c7767fb86f1364fb21a pypy2.7-v7.3.14-aarch64.tar.bz2\nb12b4b587da55c8f212ae854e31d29258451e069c65aca596e577644e520bc8b pypy2.7-v7.3.14-linux32.tar.bz2\n5938c3c6cddb2e8eb5e435cd3bf61d15134b94a9ac026e26a533bdda6c28a4a0 pypy2.7-v7.3.14-linux64.tar.bz2\na428e18fcf1470b032fb1f4d75795aeed9216b4314a4c8a3e4d7e13f10f8607e 
pypy2.7-v7.3.14-macos_arm64.tar.bz2\n8af24683621937e65c518fbca1eb34e17ffc741c2ac917e4ca20694348157d78 pypy2.7-v7.3.14-macos_x86_64.tar.bz2\n5abc6a0f55a89c08def13b5f410b8e7bd706fe1b472f31db01ecbc4d0a49e8dc pypy2.7-v7.3.14-s390x.tar.bz2\ne096fe67ce2d8d4d5e7dceb84fe1ca854498f00766d31b27d32c8d8833131373 pypy2.7-v7.3.14-src.tar.bz2\n680df6e172c5e5778fe3f7bd0a1f8902148f5de9decc5ec9252e72e94eb49bff pypy2.7-v7.3.14-src.zip\na4c6d35e5ae68dfb773ec34b7d8f1503c8fbfcad817e6147babd6cfd3c8eb071 pypy2.7-v7.3.14-win64.zip\npypy3.10-v7.3.13 sha256:\nac476f01c9653358404f2e4b52f62307b2f64ccdb8c96dadcbfe355824d81a63 pypy3.10-v7.3.13-aarch64.tar.bz2\nbfba57eb1f859dd0ad0d6fe841bb12e1256f1f023c7fbca083b536cccbc1233b pypy3.10-v7.3.13-linux32.tar.bz2\n54936eeafd9350a5ea0375b036272a260871b9bca82e1b0bb3201deea9f5a442 pypy3.10-v7.3.13-linux64.tar.bz2\nefb3007b7aace0af6e3b30d381088a5bbc175973a6627b6b0d624a2ca2dc63ce pypy3.10-v7.3.13-macos_arm64.tar.bz2\n2c6238b9ece7b94ffdfd1d9b50619edef4b169a5c78adcdb691fce6709cd6610 pypy3.10-v7.3.13-macos_x86_64.tar.bz2\n3c813c7efa6a026b281313b299c186c585155fc164c7538e65d41efdabff87c9 pypy3.10-v7.3.13-s390x.tar.bz2\n4ac1733c19d014d3193c804e7f40ffccbf6924bcaaee1b6089b82b9bf9353a6d pypy3.10-v7.3.13-src.tar.bz2\n828fc66eca1c097e44bc910c78ab773a98747268c7ce264da97022e5aca358dc pypy3.10-v7.3.13-src.zip\n5b99422fb8978b2f4bbf97961bca49963a82dc47c2fa51b7d23c493db3a2e0f0 pypy3.10-v7.3.13-win64.zip\npypy3.9-v7.3.13 sha256:\n317d7876c5825a086f854253648b967a432b993ce87695d2895d3ad6ed0d2716 pypy3.9-v7.3.13-aarch64.tar.bz2\nac695238b4a3635ac6b482e74e04e2ea78b31acca0decd5de601dfd2f4ebf35a pypy3.9-v7.3.13-linux32.tar.bz2\n323b05a9f607e932cda1995cbe77a96e4ea35994631aa6d734c8035e8479b74e pypy3.9-v7.3.13-linux64.tar.bz2\na07b17a790a1952b551e69d47d77a5546ad5e666ed1bd90b9ad60baaca6aa51e pypy3.9-v7.3.13-macos_arm64.tar.bz2\n180802aa0122d4a05ec480bf3130c78591ba88fdde25d8e65a92d4a798b318a3 
pypy3.9-v7.3.13-macos_x86_64.tar.bz2\n213c88f652a99c4dc4e8e00b4b5b58f381c7f7e9ea1a9b65801fc0eb1e50df0a pypy3.9-v7.3.13-s390x.tar.bz2\nbc6147268105e7cb3bd57b401e6d97f66aa4ede269104b2712a7cdd9f02f68cd pypy3.9-v7.3.13-src.tar.bz2\n5036ba37fb07116754f3eab2df6d41f405f947ffbf8d99d62bf743dc1d2c195f pypy3.9-v7.3.13-src.zip\n85745a2055c4a8cefac9b6d3f7f305b1edaaf62468c8f640b4511d9dd21d091c pypy3.9-v7.3.13-win64.zip\npypy2.7-v7.3.13 sha256:\nf1e20f833cc86a097c1f1318069fc17d01c3988678c1438fe27ed567fcb5cfd0 pypy2.7-v7.3.13-aarch64.tar.bz2\nb727d2e759a740f45bab1e333029d001c4384b52949bcbb4bd2ad7912eae8dad pypy2.7-v7.3.13-linux32.tar.bz2\ne41ceb5dc6c4d3a9311ed5f88edfeedbf3e8abbd1ed3c4f2e151a90a5cf4e1d7 pypy2.7-v7.3.13-linux64.tar.bz2\n5b86cf0750abc188a0355380d10c7bab1dec51b610cde23ce78f30a9ef296618 pypy2.7-v7.3.13-macos_arm64.tar.bz2\n50769df0091e8fa51c9e4356e0cb204e6f6aa54f502ec5a6e55aef03d0ac5675 pypy2.7-v7.3.13-macos_x86_64.tar.bz2\nfbb2f3d92831c02b094f17e9609b95a6202d4bdcddae437e380ab14388d4556e pypy2.7-v7.3.13-s390x.tar.bz2\n976984bc6ca5ec9d37ae4e219b020cbed2751d1a02267033f59ed700ba8cec40 pypy2.7-v7.3.13-src.tar.bz2\n34976f32358349b535081d5b5d48759d6f112a31352dc11c15dcfea44bb041d8 pypy2.7-v7.3.13-src.zip\n0dc9c18f91f2aee97b95eaec2244e3b22e0183095f359c410d0090c54413dadc pypy2.7-v7.3.13-win64.zip\npypy3.10-v7.3.12 sha256:\n26208b5a134d9860a08f74cce60960005758e82dc5f0e3566a48ed863a1f16a1 pypy3.10-v7.3.12-aarch64.tar.bz2\n811667825ae58ada4b7c3d8bc1b5055b9f9d6a377e51aedfbe0727966603f60e pypy3.10-v7.3.12-linux32.tar.bz2\n6c577993160b6f5ee8cab73cd1a807affcefafe2f7441c87bd926c10505e8731 pypy3.10-v7.3.12-linux64.tar.bz2\n45671b1e9437f95ccd790af10dbeb57733cca1ed9661463b727d3c4f5caa7ba0 pypy3.10-v7.3.12-macos_arm64.tar.bz2\ndbc15d8570560d5f79366883c24bc42231a92855ac19a0f28cb0adeb11242666 pypy3.10-v7.3.12-macos_x86_64.tar.bz2\n043c13a585479428b463ab69575a088db74aadc16798d6e677d97f563585fee3 pypy3.10-v7.3.12-s390x.tar.bz2\n86e4e4eacc36046c6182f43018796537fe33a60e1d2a2cc6b8e7f91a5dcb3e42 
pypy3.10-v7.3.12-src.tar.bz2\n191c275e3f6f2785da783cc7e951cc53cdf9df3b42d4533cd121c526e0b79991 pypy3.10-v7.3.12-src.zip\n8c3b1d34fb99100e230e94560410a38d450dc844effbee9ea183518e4aff595c pypy3.10-v7.3.12-win64.zip\npypy3.9-v7.3.12 sha256:\ne9327fb9edaf2ad91935d5b8563ec5ff24193bddb175c1acaaf772c025af1824 pypy3.9-v7.3.12-aarch64.tar.bz2\naa04370d38f451683ccc817d76c2b3e0f471dbb879e0bd618d9affbdc9cd37a4 pypy3.9-v7.3.12-linux32.tar.bz2\n84c89b966fab2b58f451a482ee30ca7fec3350435bd0b9614615c61dc6da2390 pypy3.9-v7.3.12-linux64.tar.bz2\n0e8a1a3468b9790c734ac698f5b00cc03fc16899ccc6ce876465fac0b83980e3 pypy3.9-v7.3.12-macos_arm64.tar.bz2\n64f008ffa070c407e5ef46c8256b2e014de7196ea5d858385861254e7959f4eb pypy3.9-v7.3.12-macos_x86_64.tar.bz2\n20d84658a6899bdd2ca35b00ead33a2f56cff2c40dce1af630466d27952f6d4f pypy3.9-v7.3.12-s390x.tar.bz2\ne7a2046c7e6c25fc386abbb5132e92a7cc2491e3935699a946cb5dcbb342c2aa pypy3.9-v7.3.12-src.tar.bz2\nc65e4082b6da1660041ccb23823e1cbd7759377c391f050e7c1ccad2220f08c0 pypy3.9-v7.3.12-src.zip\n0996054207b401aeacace1aa11bad82cfcb463838a1603c5f263626c47bbe0e6 pypy3.9-v7.3.12-win64.zip\npypy2.7-v7.3.12 sha256:\ne04dcb6286a7b4724ec3f0e50d3cc1ba8583301dd1658c06d7f37599e4201c59 pypy2.7-v7.3.12-aarch64.tar.bz2\nabf3ae477bd0e526ac6dcefe0bfa845e1535aa053342c0d641219bfcde4b9b56 pypy2.7-v7.3.12-linux32.tar.bz2\n1a61a2574b79466f606010f2999a2b995bd96cd085f91a78ebdd3d5c2c40e81d pypy2.7-v7.3.12-linux64.tar.bz2\n6b747aa076ae8597e49603c5dec4ca5935a1a0a132d7404a559be96a260d9bf7 pypy2.7-v7.3.12-macos_arm64.tar.bz2\n6e89ffdd15537ce4ffce3145b65ee57c2e9c952892bd95b934012d2f009f503b pypy2.7-v7.3.12-macos_x86_64.tar.bz2\n80c0154d8b0949f9dc6a227c322abbc9590c8ae4c9f11c13bf4022aa38b82064 pypy2.7-v7.3.12-s390x.tar.bz2\ndd61d88da274c2ce2cec77667d4a3df9a652bcc50e26f90991d4dd0af66bccf4 pypy2.7-v7.3.12-src.tar.bz2\n99cfea9862299cb043914167f4ddc69171c3f38462b6e1ab170df0aab423ca0f pypy2.7-v7.3.12-src.zip\n84cd3b98812d47a1ddb36f3417cc96b3dbdfa32c2b4e16438f205e1253f7ccea 
pypy2.7-v7.3.12-win64.zip\npypy3.10-v7.3.12rc2 sha256:\na6dc89b8100f423d5f8f5f579db3691e0ec5f14c2d92d529d70054263e202810 pypy3.10-v7.3.12rc2-aarch64.tar.bz2\n5607812d1fc9ec62956555a88b75f9178fadba090759f7c0941341b9d761e6ef pypy3.10-v7.3.12rc2-linux32.tar.bz2\n6be46911c20152de7d317cf8b2b7c83933a18a9d4193c41e0b70810381fc8d09 pypy3.10-v7.3.12rc2-linux64.tar.bz2\n7c353cce25d76482e6b03e298891e7a5433b1c825391bc9f14b93abdd365276b pypy3.10-v7.3.12rc2-macos_arm64.tar.bz2\n098e408004813c126f09989588d586428982278c2a79a5f216f55b29db2f05de pypy3.10-v7.3.12rc2-macos_x86_64.tar.bz2\n2a842af10a5b1f3be97866af21a7108951c45af7b0ffb757a8e1e1ffd2c76718 pypy3.10-v7.3.12rc2-s390x.tar.bz2\nd8c51b7bb88dd1343195d088c95b4b53c704ae2c7a517ba8d8f8c728bf150683 pypy3.10-v7.3.12rc2-src.tar.bz2\ncc695d4e48bc29867e171071524d97cd4cd903ec965ee0748c3dde2b012ae36a pypy3.10-v7.3.12rc2-src.zip\ncd3b1b409d41ea694a2d22f15afcab12305c058b8fa2a197c49e96b1c5fb776c pypy3.10-v7.3.12rc2-win64.zip\npypy3.9-v7.3.12rc2 sha256:\n0e50aafa4e92413573cff9d579613175e5cdc128bda91a47154c9909b47c2f4c pypy3.9-v7.3.12rc2-aarch64.tar.bz2\n37335affc962acd79fcd1f08cce19c3d2a3d2d2f6e9ba73d6c804160fd42b471 pypy3.9-v7.3.12rc2-linux32.tar.bz2\n79a3d32a21534d784f2ac4934d157354aba4871b72c39ac7908e9d853c16c3ad pypy3.9-v7.3.12rc2-linux64.tar.bz2\n4b4adfb435c3677bf7c518413c2c53282789ceadd747bec19ed42ce0eb7192ed pypy3.9-v7.3.12rc2-macos_arm64.tar.bz2\n3b29d34919f53136a2272363d819eb4e678368a01d5a182feae04a78a505d15d pypy3.9-v7.3.12rc2-macos_x86_64.tar.bz2\n9d760b96db54f8d51c47c78397d70dbf61e1144de5afe6840deb3b9a7c265381 pypy3.9-v7.3.12rc2-s390x.tar.bz2\n4835d2f3814c92851f7930398f397cd0e938de165329c019d86561d9482c9daf pypy3.9-v7.3.12rc2-src.tar.bz2\n453d84e4104216c23a466fc58f58231c051eafabf258c1c907b41ffe9955219b pypy3.9-v7.3.12rc2-src.zip\n559fa00f89eab23c87ac2132ef30fb456631f4ff4bb8009d60900be57594dbea pypy3.9-v7.3.12rc2-win64.zip\npypy2.7-v7.3.12rc2 sha256:\n561c6496251fbdf36ecfeaa08bc2dc89f24ef3044dde6d9f297efc798726e49d 
pypy2.7-v7.3.12rc2-aarch64.tar.bz2\na66cfb8fd8a88a60bcefca14364c7e87f2932f978b81187572064e1df16c0285 pypy2.7-v7.3.12rc2-linux32.tar.bz2\n03d68b7d43751807cc4e7743a3977f2359cc4b6f0acaad00057b1b4158efe51a pypy2.7-v7.3.12rc2-linux64.tar.bz2\n0cd0fc59894325ab30585fc2bee1d244b2b788d04e3aec46dafb0e2b3b232657 pypy2.7-v7.3.12rc2-macos_arm64.tar.bz2\n75587e171ea77ccbdcc9e0f062c9bd55bc374083ac106eeb788321dc5f031aa6 pypy2.7-v7.3.12rc2-macos_x86_64.tar.bz2\n5968a009c19bf723eda722e9ff1b95986a1b5c79247269532f99e0b25819089a pypy2.7-v7.3.12rc2-s390x.tar.bz2\n6c69d4260554ef677d9dfb3b81a1dbd6f4d7302ef0170d1c66616865a711317f pypy2.7-v7.3.12rc2-src.tar.bz2\na4cbe00a2bef9181929b4577c535f327021ee6af596ac0ad8d577e2a67b44a5f pypy2.7-v7.3.12rc2-src.zip\n2bcab9251209b44eb0f7059f91c070d1de19abcfc42397e437ebe3be2faaaf5d pypy2.7-v7.3.12rc2-win64.zip\npypy3.10-v7.3.12rc1 sha256:\n3e92ba4977c1937913c5a4cb04ee31fa809cb44d12eefcfcd5b7ef64fa2d1a45 pypy3.10-v7.3.12rc1-aarch64.tar.bz2\n889f887eada150cdbf3bfce5bb209fae90a64ad99590047c1123452431d43659 pypy3.10-v7.3.12rc1-linux32.tar.bz2\ncbc86894e22bd06f5d99dbd274dcfe0c2cacfb213f6522e06153010f40423dcc pypy3.10-v7.3.12rc1-linux64.tar.bz2\n9e135570058c227515f742b0600c1a209f154a72247ba52073c0026f6bdc5022 pypy3.10-v7.3.12rc1-macos_arm64.tar.bz2\n3f423b794962e0ddbf95a1f40591f008e7b62a603206f65a239b25953308fbf6 pypy3.10-v7.3.12rc1-macos_x86_64.tar.bz2\n94d25c8777eff222e4cdb7419db7e49ad1b789e88fb6d59ab930e474180c74c1 pypy3.10-v7.3.12rc1-s390x.tar.bz2\n8952f17d401babd69f9bd4f7a417c19f07e1ed7bd078721eadf90f55914793e4 pypy3.10-v7.3.12rc1-src.tar.bz2\nc11b44ab9396bc6ce2a1ff5be514c27b1b327f79da6ba2cad635ea90e590ab5c pypy3.10-v7.3.12rc1-src.zip\n2a2c285909f67984691f7861637a633c06cb30e59374744de08c0dbfbd89a151 pypy3.10-v7.3.12rc1-win64.zip\npypy3.9-v7.3.12rc1 sha256:\n4be87ceb5d522e8f0619a06660a7b68252add41b60ab4957d8f899d4893f6a15 pypy3.9-v7.3.12rc1-aarch64.tar.bz2\n0219d3353eda1526828d4b48e773d045469c0b0dafd95b0bfae72b4ef258bd02 
pypy3.9-v7.3.12rc1-linux32.tar.bz2\n298ab60c5e1d56924767a4c2fcb5b3c66561c2128ca385c207193b2b3c61a5f9 pypy3.9-v7.3.12rc1-linux64.tar.bz2\n759b5d4de479b67c01df168c482f00cfdc75475f8401bfecd4f6bd7f0be2df23 pypy3.9-v7.3.12rc1-macos_arm64.tar.bz2\n5d3286920bba60af7bf8a4047b879a04302d2d0e7038965bef26f2dabd235b88 pypy3.9-v7.3.12rc1-macos_x86_64.tar.bz2\n77a27d2cde6e101b94acbc663c3c530568ed509fcdb0eaec149a195410c6efba pypy3.9-v7.3.12rc1-s390x.tar.bz2\n7ef838e96bdd6e672868e705eb04cfbe67a5e4495e7bf374e6fc0d68fa285f7f pypy3.9-v7.3.12rc1-src.tar.bz2\n4bf7eeb2263051838e38ff483f734994c0e1cfd2b818eddbe9e30ae8d9f6fd83 pypy3.9-v7.3.12rc1-src.zip\na78186a26590d87c48a81902a0118f6c3c70f4ef895f3ceb2fcc714a338832a7 pypy3.9-v7.3.12rc1-win64.zip\npypy2.7-v7.3.12rc1 sha256:\n79a87e1e7b3e6bd77117bedb2efe45c0de3cf9e055f688fc7a038969d058de1f pypy2.7-v7.3.12rc1-aarch64.tar.bz2\n0aef12d0a4fe998125c3e6758325905c7b7fc9b348915c4241af89953e04fdc0 pypy2.7-v7.3.12rc1-linux32.tar.bz2\neb7f8be5f120edc29211c2ccaff4be219dcfb82030db3f667ce2c88e859217f1 pypy2.7-v7.3.12rc1-linux64.tar.bz2\n0552074ff977ea860b1989e298dd27d54f5d59e180b9b605922c0ba8becfcf6e pypy2.7-v7.3.12rc1-macos_arm64.tar.bz2\n6dc763c8d25b00c4931e1989e09a429065b41eccf1d39cf85eb09b35846615b4 pypy2.7-v7.3.12rc1-macos_x86_64.tar.bz2\nb2a498c7d10150ad416b27be30b7ec38a61b208eecf2d58eadb6ce822e9d5ca3 pypy2.7-v7.3.12rc1-s390x.tar.bz2\n23c1ecf2b28aae2aa676a1b2eb2bdbf7db18d8718489db6d8501fb9a4b232f49 pypy2.7-v7.3.12rc1-src.tar.bz2\n60cf43bae08c87dfdd3e70be54604c6ca559c14ecf53181dc162c3befd5f8df0 pypy2.7-v7.3.12rc1-src.zip\n5f0786c0c797700458ff0cb9cfe750dd5b81a7ca3175d9ffcb55b5418b707e9c pypy2.7-v7.3.12rc1-win64.zip\npypy3.9-v7.3.11 sha256:\n09175dc652ed895d98e9ad63d216812bf3ee7e398d900a9bf9eb2906ba8302b9 pypy3.9-v7.3.11-aarch64.tar.bz2\n0099d72c2897b229057bff7e2c343624aeabdc60d6fb43ca882bff082f1ffa48 pypy3.9-v7.3.11-linux32.tar.bz2\nd506172ca11071274175d74e9c581c3166432d0179b036470e3b9e8d20eae581 
pypy3.9-v7.3.11-linux64.tar.bz2\n91ad7500f1a39531dbefa0b345a3dcff927ff9971654e8d2e9ef7c5ae311f57e pypy3.9-v7.3.11-macos_arm64.tar.bz2\nd33f40b207099872585afd71873575ca6ea638a27d823bc621238c5ae82542ed pypy3.9-v7.3.11-macos_x86_64.tar.bz2\ne1f30f2ddbe3f446ddacd79677b958d56c07463b20171fb2abf8f9a3178b79fc pypy3.9-v7.3.11-s390x.tar.bz2\nb0f3166fb2a5aadfd5ceb9db5cdd5f7929a0eccca02b4a26c0dae0492f7ca8ea pypy3.9-v7.3.11-src.tar.bz2\n3d2f473590b1390478e281a2e0d209b5df7cc9f26c33e73baecf7bd0f62bc848 pypy3.9-v7.3.11-src.zip\n57faad132d42d3e7a6406fcffafffe0b4f390cf0e2966abb8090d073c6edf405 pypy3.9-v7.3.11-win64.zip\npypy3.8-v7.3.11 sha256:\n9a2fa0b8d92b7830aa31774a9a76129b0ff81afbd22cd5c41fbdd9119e859f55 pypy3.8-v7.3.11-aarch64.tar.bz2\na79b31fce8f5bc1f9940b6777134189a1d3d18bda4b1c830384cda90077c9176 pypy3.8-v7.3.11-linux32.tar.bz2\n470330e58ac105c094041aa07bb05676b06292bc61409e26f5c5593ebb2292d9 pypy3.8-v7.3.11-linux64.tar.bz2\n78cdc79ff964c4bfd13eb45a7d43a011cbe8d8b513323d204891f703fdc4fa1a pypy3.8-v7.3.11-macos_arm64.tar.bz2\n194ca0b4d91ae409a9cb1a59eb7572d7affa8a451ea3daf26539aa515443433a pypy3.8-v7.3.11-macos_x86_64.tar.bz2\neab7734d86d96549866f1cba67f4f9c73c989f6a802248beebc504080d4c3fcd pypy3.8-v7.3.11-s390x.tar.bz2\n4d6769bfca73734e8666fd70503b7ceb06a6e259110e617331bb3899ca4e6058 pypy3.8-v7.3.11-src.tar.bz2\n3e635c7d4d5ded1c5f41f7a9f277a0ee3dfd21a545516fb68e90240dca66fa07 pypy3.8-v7.3.11-src.zip\n0f46fb6df32941ea016f77cfd7e9b426d5ac25a2af2453414df66103941c8435 pypy3.8-v7.3.11-win64.zip\npypy2.7-v7.3.11 sha256:\nea924da1defe9325ef760e288b04f984614e405580f5321eb6a5c8f539bd415a pypy2.7-v7.3.11-aarch64.tar.bz2\n30fd245fab7068c96a75b9ff1323ac55174c64fc8c4751cceb4b7a9bedc1851e pypy2.7-v7.3.11-linux32.tar.bz2\nba8ed958a905c0735a4cfff2875c25089954dc020e087d982b0ffa5b9da316cd pypy2.7-v7.3.11-linux64.tar.bz2\ncc5696ab4f93cd3481c1e4990b5dedd7ba60ac0602fa1890d368889a6c5bf771 pypy2.7-v7.3.11-macos_arm64.tar.bz2\n56deee9c22640f5686c35b9d64fdb1ce3abd044583e4078f0b171ca2fd2a198e 
pypy2.7-v7.3.11-macos_x86_64.tar.bz2\n8fe9481c473178e53266983678684a70fe0c42bafc95f1807bf3ef28770316d4 pypy2.7-v7.3.11-s390x.tar.bz2\n1117afb66831da4ea6f39d8d2084787a74689fd0229de0be301f9ed9b255093c pypy2.7-v7.3.11-src.tar.bz2\n6df2ddd9a925eac5294ae5a5f8916baefbc4bc3298d7cdada18fc1fa71aa0670 pypy2.7-v7.3.11-src.zip\n106942702de0df148e39fa44a33e76b8a362341e1460d4e5e61b3ff0e64e5514 pypy2.7-v7.3.11-win64.zip\npypy3.9-v7.3.10 sha256:\n657a04fd9a5a992a2f116a9e7e9132ea0c578721f59139c9fb2083775f71e514 pypy3.9-v7.3.10-aarch64.tar.bz2\nb6db59613b9a1c0c1ab87bc103f52ee95193423882dc8a848b68850b8ba59cc5 pypy3.9-v7.3.10-linux32.tar.bz2\n95cf99406179460d63ddbfe1ec870f889d05f7767ce81cef14b88a3a9e127266 pypy3.9-v7.3.10-linux64.tar.bz2\ne2a6bec7408e6497c7de8165aa4a1b15e2416aec4a72f2578f793fb06859ccba pypy3.9-v7.3.10-macos_arm64.tar.bz2\nf90c8619b41e68ec9ffd7d5e913fe02e60843da43d3735b1c1bc75bcfe638d97 pypy3.9-v7.3.10-macos_x86_64.tar.bz2\nca6525a540cf0c682d1592ae35d3fbc97559a97260e4b789255cc76dde7a14f0 pypy3.9-v7.3.10-s390x.tar.bz2\n3738d32575ed2513e3e66878e4e4c6c208caed267570f3f9f814748830002967 pypy3.9-v7.3.10-src.tar.bz2\ne3e2c41db0a5590d31233fd2909feeb83b1e7f997a473d74a11ad87ba4bbdc30 pypy3.9-v7.3.10-src.zip\n07e18b7b24c74af9730dfaab16e24b22ef94ea9a4b64cbb2c0d80610a381192a pypy3.9-v7.3.10-win64.zip\n\n2775f1eca62dd1eab0af09f8e4b1640b5c86f18a766ed46ff9aa7dc8aa916c13 pypy3.9-v7.3.10rc3-aarch64.tar.bz2\n68b2f1b986217475fc98bc0e5a98b4bb0c602ec1d603abbeef9ada89c9ff7048 pypy3.9-v7.3.10rc3-linux32.tar.bz2\n1cf9db691cadbf870c9af4a6af7ab89cbf24fef0469d63fd0d857656ee4adee6 pypy3.9-v7.3.10rc3-linux64.tar.bz2\nb585ab42f95aa7f0e713c6c22aba030e5d49d78ba79e8d005e754384d33cfaa4 pypy3.9-v7.3.10rc3-macos_arm64.tar.bz2\n73550941c02349c5d1051331f590962da9a0eff52e793295c1a3bd2a72dc461e pypy3.9-v7.3.10rc3-macos_x86_64.tar.bz2\nabb736466180c3cc68ff5cd0d9b07cfabebc26989eb7fc5e9a9512e1bbe234c2 pypy3.9-v7.3.10rc3-s390x.tar.bz2\na313e85a073f3a4d9c592e142e69c856b40afd29473665d7f41fe07d50ecbad2 
pypy3.9-v7.3.10rc3-src.tar.bz2\n6f5ead6ccdf7544eb5a7e33e352a361bfd19f6bfcd91f9e121843b4e2ae9c590 pypy3.9-v7.3.10rc3-src.zip\nf5ae260d8557d7380d595c93ccd2b7bbaff718d8dd82051034444479a89e1c37 pypy3.9-v7.3.10rc3-win64.zip\npypy3.8-v7.3.10 sha256:\ne4caa1a545f22cfee87d5b9aa6f8852347f223643ad7d2562e0b2a2f4663ad98 pypy3.8-v7.3.10-aarch64.tar.bz2\nb70ed7fdc73a74ebdc04f07439f7bad1a849aaca95e26b4a74049d0e483f071c pypy3.8-v7.3.10-linux32.tar.bz2\nceef6496fd4ab1c99e3ec22ce657b8f10f8bb77a32427fadfb5e1dd943806011 pypy3.8-v7.3.10-linux64.tar.bz2\n6cb1429371e4854b718148a509d80143f801e3abfc72fef58d88aeeee1e98f9e pypy3.8-v7.3.10-macos_arm64.tar.bz2\n399eb1ce4c65f62f6a096b7c273536601b7695e3c0dc0457393a659b95b7615b pypy3.8-v7.3.10-macos_x86_64.tar.bz2\nc294f8e815158388628fe77ac5b8ad6cd93c8db1359091fa02d41cf6da4d61a1 pypy3.8-v7.3.10-s390x.tar.bz2\n218a1e062f17aba89f61bc398e8498f13c048b9fcf294343f5d9d56c3ac9b882 pypy3.8-v7.3.10-src.tar.bz2\n0e4dd55729a2bf8c9bf963c769004b287ef57576ddb402e71e387847a7c31c0a pypy3.8-v7.3.10-src.zip\n362dd624d95bd64743190ea2539b97452ecb3d53ea92ceb2fbe9f48dc60e6b8f pypy3.8-v7.3.10-win64.zip\n\nd7feab3fd0e670dc66277ad710d2a26dd5ec3def68cb4fdf2697e570b74ab62e pypy3.8-v7.3.10rc3-aarch64.tar.bz2\n4a33b7e08033527e9f8c6dc2a3d6a8d0163c381b9e75813cfe1a7865caf335ae pypy3.8-v7.3.10rc3-linux32.tar.bz2\n7ab218ab7f05a156ad3ea3b498e6da94dd7e7e77dfe03ee77e5827af755a6207 pypy3.8-v7.3.10rc3-linux64.tar.bz2\nd77a5f94690e8e74d3ae57d1f65ef657c670614559447a196da001de943e1fa5 pypy3.8-v7.3.10rc3-macos_arm64.tar.bz2\nfa15127affd9dbc6d447cf48a99fe4795423132070b84b802d2dc8cbecd9607e pypy3.8-v7.3.10rc3-macos_x86_64.tar.bz2\n8d3e07840be537b6b879add1b34a082dde156f7c2a8c5d75be60e9192393533d pypy3.8-v7.3.10rc3-s390x.tar.bz2\n5284dfba00f4ffcdf29b732cf7f2e63f29d1f33295f826a2caefb1f782cedaef pypy3.8-v7.3.10rc3-src.tar.bz2\nd8a2992734463e8db5ca4209c5ce7f9fcc2965f9fbd975cb04a4e173b6d2411b pypy3.8-v7.3.10rc3-src.zip\nfab16618e7adf8c268c7f48032f51d6d4985734d672d18712fe8b557fe9c9abe 
pypy3.8-v7.3.10rc3-win64.zip\npypy2.7-v7.3.10 sha256:\n274342f0e75e99d60ba7a0cfb0e13792e7664163e01450d2f7f2f7825603a0ae pypy2.7-v7.3.10-aarch64.tar.bz2\n0b17132f62d2a0c3c4572c57eb53820f25611afad71f3d6a310202942baed6e1 pypy2.7-v7.3.10-linux32.tar.bz2\n461fb6df524208af9e94ffb16989f628b585bdb4b9e97d81e668899fc3a064a3 pypy2.7-v7.3.10-linux64.tar.bz2\n14b178f005603e3df6db7574b77b9c65ae79feda1a629214cafcb4eee7da679d pypy2.7-v7.3.10-macos_arm64.tar.bz2\n188551185ee945d5e42a3a619205d02ac31db77bdd5d98b6c11469e125c3bdb5 pypy2.7-v7.3.10-macos_x86_64.tar.bz2\n0fac1ec1e05c70941f758be05d40ce7ffe6a42c0416e70b55d40a7523e3e70ae pypy2.7-v7.3.10-s390x.tar.bz2\n35e2cf4519cb51c4d5ffb4493ee24f0c7f42b4b04944903ca4b33981a04a3bc5 pypy2.7-v7.3.10-src.tar.bz2\nece8975f49b192cc6e3169301a3c3ef71822cc7b52e70d7d8b506f54f917e14e pypy2.7-v7.3.10-src.zip\n2915b5201a5f71546951bc41efd80f40b2ed709511bc526219a70f3ae37b918e pypy2.7-v7.3.10-win64.zip\n\n85f0b2f0bffe9a9a0fe17382c25f595be7c7ca9a4d070eaf98cb4258bdc8f703 pypy2.7-v7.3.10rc3-aarch64.tar.bz2\n38f0fe020ac7880ae4e843d2cacdfcceecd0d7dca5fd2769f13b60a1e6bf8e86 pypy2.7-v7.3.10rc3-linux32.tar.bz2\ne6d7330c16f503e1c21dacb22c525974f1d81fea86ef32e0d21239d9d372b4d5 pypy2.7-v7.3.10rc3-linux64.tar.bz2\n5f62122884e87b263ce3f416513e1f380276fc327570cff07daac864907b1d1e pypy2.7-v7.3.10rc3-macos_arm64.tar.bz2\n6de0c73285378cae79ee92566e38296e91382cd5df0322224d006dd2e2429489 pypy2.7-v7.3.10rc3-macos_x86_64.tar.bz2\n0c350a480a928c9ed0fca0a531f333946269c32f9673c9d461772c48eccc5380 pypy2.7-v7.3.10rc3-s390x.tar.bz2\n2514df50aeb2dafd8fd13b299dd3a1a30986e5e396a7ea253410d3126b7ad245 pypy2.7-v7.3.10rc3-src.tar.bz2\ndbd30ad54104ffb9ada8717cec068958b15c4ad9a22e37b192acdd1495e9ec44 pypy2.7-v7.3.10rc3-src.zip\nf95114991fbe1bc6aa87466a62efbba6d6e4e1a8c95b5efd43a402ece0371357 pypy2.7-v7.3.10rc3-win64.zip\npypy3.9-v7.3.9 sha256:\n2e1ae193d98bc51439642a7618d521ea019f45b8fb226940f7e334c548d2b4b9 
pypy3.9-v7.3.9-aarch64.tar.bz2\n0de4b9501cf28524cdedcff5052deee9ea4630176a512bdc408edfa30914bae7 pypy3.9-v7.3.9-linux32.tar.bz2\n46818cb3d74b96b34787548343d266e2562b531ddbaf330383ba930ff1930ed5 pypy3.9-v7.3.9-linux64.tar.bz2\n59c8852168b2b1ba1f0211ff043c678760380d2f9faf2f95042a8878554dbc25 pypy3.9-v7.3.9-osx64.tar.bz2\n774dca83bcb4403fb99b3d155e7bd572ef8c52b9fe87a657109f64e75ad71732 pypy3.9-v7.3.9-s390x.tar.bz2\n2abaa1e9fe1ec0e233c9fbc377a0c8e9a0634080a8f4f30eb6898301f6618c12 pypy3.9-v7.3.9-src.tar.bz2\n1c67e33882052ab53e464e398898abefd6df7ff7127bf754be88bb17938759f2 pypy3.9-v7.3.9-src.zip\nbe48ab42f95c402543a7042c999c9433b17e55477c847612c8733a583ca6dff5 pypy3.9-v7.3.9-win64.zip\npypy3.8-v7.3.9 sha256:\n5e124455e207425e80731dff317f0432fa0aba1f025845ffca813770e2447e32 pypy3.8-v7.3.9-aarch64.tar.bz2\n4b261516c6c59078ab0c8bd7207327a1b97057b4ec1714ed5e79a026f9efd492 pypy3.8-v7.3.9-linux32.tar.bz2\n08be25ec82fc5d23b78563eda144923517daba481a90af0ace7a047c9c9a3c34 pypy3.8-v7.3.9-linux64.tar.bz2\n91a5c2c1facd5a4931a8682b7d792f7cf4f2ba25cd2e7e44e982139a6d5e4840 pypy3.8-v7.3.9-osx64.tar.bz2\nc6177a0016c9145c7b99fddb5d74cc2e518ccdb216a6deb51ef6a377510cc930 pypy3.8-v7.3.9-s390x.tar.bz2\n5b5d9d9256f12a129af8384e2f581bdfab3bc0fbbe3a0a480d9c1d2e95490eb1 pypy3.8-v7.3.9-src.tar.bz2\nd4f716f324ebbd7ec3c0e0e309c2d7dd76846f693f50b7796820acf346147401 pypy3.8-v7.3.9-src.zip\n05022baaa55db2b60880f2422312d9e4025e1267303ac57f33e8253559d0be88 pypy3.8-v7.3.9-win64.zip\npypy3.7-v7.3.9 sha256:\ndfc62f2c453fb851d10a1879c6e75c31ffebbf2a44d181bb06fcac4750d023fc pypy3.7-v7.3.9-aarch64.tar.bz2\n3398cece0167b81baa219c9cd54a549443d8c0a6b553ec8ec13236281e0d86cd pypy3.7-v7.3.9-linux32.tar.bz2\nc58195124d807ecc527499ee19bc511ed753f4f2e418203ca51bc7e3b124d5d1 pypy3.7-v7.3.9-linux64.tar.bz2\n12d92f578a200d50959e55074b20f29f93c538943e9a6e6522df1a1cc9cef542 pypy3.7-v7.3.9-osx64.tar.bz2\nfcab3b9e110379948217cf592229542f53c33bfe881006f95ce30ac815a6df48 
pypy3.7-v7.3.9-s390x.tar.bz2\n70426163b194ee46009986eea6d9426098a3ffb552d9cdbd3dfaa64a47373f49 pypy3.7-v7.3.9-src.tar.bz2\n3643392817cfd0826f70be3d026c2f119904b2bfb40c39c32bad84f5a6aa02f5 pypy3.7-v7.3.9-src.zip\n8acb184b48fb3c854de0662e4d23a66b90e73b1ab73a86695022c12c745d8b00 pypy3.7-v7.3.9-win64.zip\npypy2.7-v7.3.9 sha256:\naff4e4dbab53448f662cd01acb2251571d60f836d2f48382a7d8da54ca5b3442 pypy2.7-v7.3.9-aarch64.tar.bz2\nbbf4e7343d43c8217099a9bffeed6a1781f4b5a3e186ed1a0befca65e647aeb9 pypy2.7-v7.3.9-linux32.tar.bz2\n172a928b0096a7e00b7d58f523f57300c35c3de7f822491e2a7bc845375c23f8 pypy2.7-v7.3.9-linux64.tar.bz2\n77314f5a6b2cc35d24e6f952bef89f5da612b90e4127a8034aed708d9ae483c4 pypy2.7-v7.3.9-osx64.tar.bz2\n62481dd3c6472393ca05eb3a0880c96e4f5921747157607dbaa772a7369cab77 pypy2.7-v7.3.9-s390x.tar.bz2\n39b0972956f6548ce5828019dbae12503c32d6cbe91a2becf88d3e42cc52197b pypy2.7-v7.3.9-src.tar.bz2\n3400e6b03cfcecd0a2f90271e4dd44e5fe862c7bf82a43535114ad57b57af555 pypy2.7-v7.3.9-src.zip\nca7b0f4c576995b388cfb4c796e3f6f20b037e5314571bf267daa068a3a2af31 pypy2.7-v7.3.9-win64.zip\npypy3.9-v7.3.8 sha256:\n89d7ee12a8c416e83fae80af82482531fc6502321e75e5b7a0cc01d756ee5f0e pypy3.9-v7.3.8-aarch64.tar.bz2\nb7282bc4484bceae5bc4cc04e05ee4faf51cb624c8fc7a69d92e5fdf0d0c96aa pypy3.9-v7.3.8-aarch64-portable.tar.bz2\na0d18e4e73cc655eb02354759178b8fb161d3e53b64297d05e2fff91f7cf862d pypy3.9-v7.3.8-linux32.tar.bz2\n129a055032bba700cd1d0acacab3659cf6b7180e25b1b2f730e792f06d5b3010 pypy3.9-v7.3.8-linux64.tar.bz2\n95bd88ac8d6372cd5b7b5393de7b7d5c615a0c6e42fdb1eb67f2d2d510965aee pypy3.9-v7.3.8-osx64.tar.bz2\n37b596bfe76707ead38ffb565629697e9b6fa24e722acc3c632b41ec624f5d95 pypy3.9-v7.3.8-s390x.tar.bz2\n546b7fc3789728869d5ada7b6a95ce9d03047e8489b92ada84613c900e431ee9 pypy3.9-v7.3.8-src.tar.bz2\nc5cece54ce0444943ae43fe672b13b21b3915d1e71ac730589de8204ec6f417a pypy3.9-v7.3.8-src.zip\nc1b2e4cde2dcd1208d41ef7b7df8e5c90564a521e7a5db431673da335a1ba697 
pypy3.9-v7.3.8-win64.zip\n\n81c58e0c0eb0f76801d0ac8cb528dd8a0b1e4138a4062e3e64e71beeadeccb79 pypy3.9-v7.3.8rc2-linux32.tar.bz2\n22ec1af269d68f7288a48f49ca58cb55fb9cb78f6ae58341cd13484838327751 pypy3.9-v7.3.8rc2-linux64.tar.bz2\nb49e569944f712f257e7557e61e21b36b388c9af09ce8a09085e93a51a8e3f95 pypy3.9-v7.3.8rc2-osx64.tar.bz2\n47824c665d7992dafbe8f00749f72b606bc3478c80adaaea340100f349e7b207 pypy3.9-v7.3.8rc2-s390x.tar.bz2\n53d47b101a6ff31b07b79429b0cf62e06efb29c3147799ab5aaac270ff17581b pypy3.9-v7.3.8rc2-src.tar.bz2\nc84e8094ecca6f90930d527e2c2ca6c37d1da6009ba16d8eef4d02d02a5b05b5 pypy3.9-v7.3.8rc2-src.zip\nb118fd06197e1218917fa9577874d6bc31a7488f057d5000377c63ee6cd0beca pypy3.9-v7.3.8rc2-win64.zip\n\n89dd0399a89a04b58c22e9b773747258807996bd5071dbf996a85bf8af432393 pypy3.9-v7.3.8rc1-linux32.tar.bz2\nf3f90203afcf7ee359e8c8a871bfaa06d96f926781fd94fb81f471dcd32f7332 pypy3.9-v7.3.8rc1-linux64.tar.bz2\n9a5d7217d8173bbdf1c7351b34651fee0596b0bcfe6fe4becae150d4a5469487 pypy3.9-v7.3.8rc1-osx64.tar.bz2\n4651d804341046be824af0ca35b7ebbbb6d5cdcef0d4a373891398dba182d010 pypy3.9-v7.3.8rc1-src.tar.bz2\nc4db62a854c2cc994d46fac0105a8e3bd4273093b9844c1f7cb69118fae6df72 pypy3.9-v7.3.8rc1-src.zip\nad214e4a44c893dc503e7e0b6f6bdfa7523db80b9d4890523f8ee96339d05fc9 pypy3.9-v7.3.8rc1-win64.zip\npypy3.8-v7.3.8 sha256:\nfe41df391f87239925e573e195e631a9d03d37f471eb1479790ee13ca47a28af pypy3.8-v7.3.8-aarch64.tar.bz2\n0210536e9f1841ba283c13b04783394050837bb3e6f4091c9f1bd9c7f2b94b55 pypy3.8-v7.3.8-aarch64-portable.tar.bz2\nbea4b275decd492af6462157d293dd6fcf08a949859f8aec0959537b40afd032 pypy3.8-v7.3.8-linux32.tar.bz2\n089f8e3e357d6130815964ddd3507c13bd53e4976ccf0a89b5c36a9a6775a188 pypy3.8-v7.3.8-linux64.tar.bz2\nde1b283ff112d76395c0162a1cf11528e192bdc230ee3f1b237f7694c7518dee pypy3.8-v7.3.8-osx64.tar.bz2\nad53d373d6e275a41ca64da7d88afb6a17e48e7bfb2a6fff92daafdc06da6b90 pypy3.8-v7.3.8-s390x.tar.bz2\nf1a378b264cdbfb0e03d77dfc4d105d02f91d542bd7c9c957d1f8083a9808f1f 
pypy3.8-v7.3.8-src.tar.bz2\n7abf870044c95b31c8e1a0a32e887485b56f3c0a3151401446b113a0a65111b4 pypy3.8-v7.3.8-src.zip\n0894c468e7de758c509a602a28ef0ba4fbf197ccdf946c7853a7283d9bb2a345 pypy3.8-v7.3.8-win64.zip\n\n475883e59f6d2a90d273142da27f999a227d510f51b7cdec3f53ceaf832b6b4b pypy3.8-v7.3.8rc2-linux32.tar.bz2\n141abedd8f0f46f61d9f05243c4fe32a88c6d9f2219cd3cd6a1312f56d4bd5eb pypy3.8-v7.3.8rc2-linux64.tar.bz2\n3bd390bfa30f4225cc379d592c822b9bb2dea9530451904fa215b8649d614375 pypy3.8-v7.3.8rc2-osx64.tar.bz2\n735751d124140cb75c24848199230fe41110761fcb830ba2a253baa5846ec86f pypy3.8-v7.3.8rc2-s390x.tar.bz2\n0ae9515b964865d5946bb48c41e1248cac00ba6f145f10ff230163f4a3c47c91 pypy3.8-v7.3.8rc2-src.tar.bz2\n973ec5dab8b1243b71d25acca4d6db3d1545e62e0984a5d43d407052e4767662 pypy3.8-v7.3.8rc2-src.zip\n089cbb1491eaf921bf905dc79936a95a90b0b5a06ebde3e26d1d2e98bdd2dcdd pypy3.8-v7.3.8rc2-win64.zip\n\n56b62c57df91b4a04036535a94814da3c682ac5208d4a565f230fbc657d949e3 pypy3.8-v7.3.8rc1-linux32.tar.bz2\nfac68364acdebed2a11f6d5a62fc10e7c44985bfe9baafdb991f65e25a375998 pypy3.8-v7.3.8rc1-linux64.tar.bz2\ned62e2f5e25bda752463e2acd881de5876ccd383ce3589630b880de204d8ad75 pypy3.8-v7.3.8rc1-osx64.tar.bz2\n70aa9380fe19a3694d38aab92d46b96427dd8a98952a4d4637043739a485be4f pypy3.8-v7.3.8rc1-src.tar.bz2\n9abb90bc11c5ba53aa7f8c23ab95eba864bb253082d23aa8552d23b322ecef85 pypy3.8-v7.3.8rc1-src.zip\n6a4d2405adc13c68140a48492178829a11ff8d3a22a27b9730166486be2688d0 pypy3.8-v7.3.8rc1-win64.zip\npypy3.7-v7.3.8 sha256:\n4fb2f8281f3aaca72e6fe62ecc5fc054fcc79cd061ca3e0eea730f7d82d610d4 pypy3.7-v7.3.8-aarch64.tar.bz2\n639c76f128a856747aee23a34276fa101a7a157ea81e76394fbaf80b97dcf2f2 pypy3.7-v7.3.8-aarch64-portable.tar.bz2\n38429ec6ea1aca391821ee4fbda7358ae86de4600146643f2af2fe2c085af839 pypy3.7-v7.3.8-linux32.tar.bz2\n409085db79a6d90bfcf4f576dca1538498e65937acfbe03bd4909bdc262ff378 pypy3.7-v7.3.8-linux64.tar.bz2\n76b8eef5b059a7e478f525615482d2a6e9feb83375e3f63c16381d80521a693f 
pypy3.7-v7.3.8-osx64.tar.bz2\n5c2cd3f7cf04cb96f6bcc6b02e271f5d7275867763978e66651b8d1605ef3141 pypy3.7-v7.3.8-s390x.tar.bz2\n35752be62b148fa6f7fb69e58e1f993c7cc319bea54928eb03ed2e75b8248d5f pypy3.7-v7.3.8-src.tar.bz2\n089fd12039ef92256fc218fc45652a93bbef1f5291181d07a4b55dad3f6987b9 pypy3.7-v7.3.8-src.zip\n96df67492bc8d62b2e71dddf5f6c58965a26cac9799c5f4081401af0494b3bcc pypy3.7-v7.3.8-win64.zip\n\na85189cdbf717928a4c5c90f05ccf48668e38291d2ac438e644d06aa6fa1fb7e pypy3.7-v7.3.8rc2-linux32.tar.bz2\nb8fe346d90561f34db1f23b0213ce247c148b7922d3b9acbfb7fdb1824c708b0 pypy3.7-v7.3.8rc2-linux64.tar.bz2\n480ad018194096736c47a2735ad453bbc0bd60117e7326508a723befe9543c28 pypy3.7-v7.3.8rc2-osx64.tar.bz2\nebc8d34d5b4c546cb2bdb22a848def94b07d23cc6833fd54b76226eb658126a2 pypy3.7-v7.3.8rc2-s390x.tar.bz2\n2d3059daaaaae35ffd70387b37e9bfe91224a24951be20e5edfbe836300fbdb3 pypy3.7-v7.3.8rc2-src.tar.bz2\n25df8cfc7510470c525e35d4a465499d0284ea4a895b08a1f75de3fb3a1698b3 pypy3.7-v7.3.8rc2-src.zip\n3fe66039537920d141cd5fca018e9778e283613dd791dab41122223224585db0 pypy3.7-v7.3.8rc2-win64.zip\n\n6db124cda7eb9ee54dbdaf8e5edc052bc32bd59c1a535faf34b175e3e5cd855d pypy3.7-v7.3.8rc1-linux32.tar.bz2\n9f239262bcf31609b758a70dcf3c8aba4bfa9d1639285afba707414639ee5871 pypy3.7-v7.3.8rc1-linux64.tar.bz2\ned208dac960650f52c69cfc38d17af5e978acd1ad6f09de6aaac1603dea32ffa pypy3.7-v7.3.8rc1-osx64.tar.bz2\n9c2ec87b0c827f9d37ce7c11a9b7b4c1cc9a2182b7f86a1bb36ee209dffda49d pypy3.7-v7.3.8rc1-src.tar.bz2\n4cc32f99e4dbda8a20f1b9e0e95cdba59963a173e00a02baa574e4d00739b58f pypy3.7-v7.3.8rc1-src.zip\n6eb5a637534dbcaa496208061ad19faf5f4413c941a450e091e22ef49e3af9ec pypy3.7-v7.3.8rc1-win64.zip\npypy2.7-v7.3.8 sha256:\nca1f8d3146c83002ee97615906b0930e821297dcce3063b5b28933a0690ef298 pypy2.7-v7.3.8-aarch64.tar.bz2\nb5edfc995d83feea8b4c8aeffccb89753b4b182f076126550bd07cc35faa6208 pypy2.7-v7.3.8-aarch64-portable.tar.bz2\n7c84f173bbcd73d0eb10909259d11b5cc253d4c6ea4492e6da8f2532df9b3da5 
pypy2.7-v7.3.8-linux32.tar.bz2\n1f2e84fb539ffce233c34769d2f11647955f894be091e85419e05f48011e8940 pypy2.7-v7.3.8-linux64.tar.bz2\ne5c1ff39ad9916ea23e3deb8012fe42367b6b19284cf13b1a1ea2b2f53a43add pypy2.7-v7.3.8-osx64.tar.bz2\nb4ae4e708ba84602d976ad6ae391ef2eef4b1896d831b8f2b2ec69927dd92014 pypy2.7-v7.3.8-s390x.tar.bz2\n0cdad270c62d3ccc53cc87eeb069a6dc46acaf95521b584624bcd6697d94fa1c pypy2.7-v7.3.8-src.tar.bz2\n13f70c6a0d4e5a59eb368c11d6b581ae09aa9715f96f84b890c5c9fa24cdaa93 pypy2.7-v7.3.8-src.zip\n806a29a6c5550b1e669d8870683d3379138d3d43eb1e07bdf26d65a0691265f2 pypy2.7-v7.3.8-win64.zip\n\n3e9744307a60740191341df2b4feb42ca08452eff354156322b760e1aac3ef54 pypy2.7-v7.3.8rc2-linux32.tar.bz2\na13ceb4a881a8da75475feea3d55dc337b7e2c6cf58e1e33924fa17012ace4e5 pypy2.7-v7.3.8rc2-linux64.tar.bz2\n6413048a6ab1ec5d7702a08f482443be0604a6f2019f32024a35e27c42ed7210 pypy2.7-v7.3.8rc2-osx64.tar.bz2\nb015012ac2f72a3971d4b4691df2a6f2dc478f2abb2252dec79ad2b4c66c18ed pypy2.7-v7.3.8rc2-s390x.tar.bz2\n8b08ace5f402fe7b8b18416082534d2463409b6891ffa426a6989448c5d95064 pypy2.7-v7.3.8rc2-src.tar.bz2\nb507dac295d94972c62c1faf2206db6333993df60864d0c23be0206d8560e278 pypy2.7-v7.3.8rc2-src.zip\n270d289a6b32a83db1e0b1078801b2f36fce6d12e238346a2b8354bf31a64e1e pypy2.7-v7.3.8rc2-win64.zip\n\n5ab938f2b0cff62be3869076f1fb99c859ef2df165ed33d329e2de4d32aaafef pypy2.7-v7.3.8rc1-linux32.tar.bz2\n124de0f3d327e39e0344b70d71298315714fe0b1115db80b463dda06bd618c58 pypy2.7-v7.3.8rc1-linux64.tar.bz2\n183a9c0aa5c9ced4ce071ddedf6ae203a752574f06e96722077eb5708f583405 pypy2.7-v7.3.8rc1-osx64.tar.bz2\n96c9f5a85759cc92000064d3b32ce89748870b35a48e631f713be3f29bf64f3c pypy2.7-v7.3.8rc1-src.tar.bz2\na11e32d93da35a5ab7bf0a6cd37abce4f1697ef22c0bb46957f2360526c20c7b pypy2.7-v7.3.8rc1-src.zip\ne3b2e88b5785538ac3f7bccf3122e400b7d42f3871201fbfb2110b9eb93473be pypy2.7-v7.3.8rc1-win64.zip\npypy3.8-v7.3.7 sha256:\ncbd44e0a9146b3c03a9d14b265774a848f387ed846316c3e984847e278d0efd3 
pypy3.8-v7.3.7-aarch64.tar.bz2\ndfb9d005f0fc917edc60fd618143e4934c412f9168b55166f5519ba0a3b1a835 pypy3.8-v7.3.7-linux32.tar.bz2\n5dee37c7c3cb8b160028fbde3a5901c68043dfa545a16794502b897d4bc40d7e pypy3.8-v7.3.7-linux64.tar.bz2\n1f044fe7bbdd443b7913ecf554683dab6dade5dcd7f47d4e6d01f4bb4cf84836 pypy3.8-v7.3.7-osx64.tar.bz2\nae7d6a76490b317a74b87788d596610c7ffd0ae2d3ffa2433d5bb5300f6b4b77 pypy3.8-v7.3.7-s390x.tar.bz2\n21ae339f4f5016d6ca7300305f3e3b554373835cb3c39a9041fe30e6811c80c6 pypy3.8-v7.3.7-src.tar.bz2\naa9aa0a800d06048d301fbafa7892ff8978e2d63b23cc23a147f2fd1fd288baf pypy3.8-v7.3.7-src.zip\n8ceb03d2f7b73c6ce0758290bc42ba366a45c46e033eda36f1779d957a905735 pypy3.8-v7.3.7-win64.zip\npypy3.7-v7.3.7 sha256:\na1a84882525dd574c4b051b66e9b7ef0e132392acc2f729420d7825f96835216 pypy3.7-v7.3.7-aarch64.tar.bz2\n0ab9e2e8ae1ac463bb811b9d3ba24d138f41f7378c17ca9e2d8dee51bf151d19 pypy3.7-v7.3.7-linux32.tar.bz2\n8332f923755441fedfe4767a84601c94f4d6f8475384406cb5f259ad8d0b2002 pypy3.7-v7.3.7-linux64.tar.bz2\nedc9df7d0f7c56f7ee05b24117bdb6c03aa65e768471e210c05ccdbbfd11a866 pypy3.7-v7.3.7-osx64.tar.bz2\n7f91efc65a69e727519cc885ca6351f4bfdd6b90580dced2fdcc9ae1bf10013b pypy3.7-v7.3.7-s390x.tar.bz2\n2ed02ac9e710859c41bc82deafb08619792bb9a27eeaa1676c741ededd214dd7 pypy3.7-v7.3.7-src.tar.bz2\n240ecf56c50b190cc7b728b07fc535be4b3d70a65406d0d8440edc02df4cce17 pypy3.7-v7.3.7-src.zip\n53505dc0b57590290efd7656117ee5384bcd036f7f7c4f0bc3f5cd10299037d1 pypy3.7-v7.3.7-win64.zip\npypy3.8-v7.3.6 sha256:\n704d5303096e8a3173e73435f3bb204e31a8bf02ed5ba617a4a0f1e7491edf50 pypy3.8-v7.3.6-aarch64.tar.bz2\ne857a04a76285f0ef5bae84f6f5e9943ca415d499204c531b1c33fe8f015b48d pypy3.8-v7.3.6-linux32.tar.bz2\n8579ea990e95d2b7e101ef47fd9ebf25a9500d5086e8f708c43f9bae83306ece pypy3.8-v7.3.6-linux64.tar.bz2\n8195e52a20cf2a4f42c2d7e4969fbf44fe349c1f80f758e20525dd0f8c134bec pypy3.8-v7.3.6-osx64.tar.bz2\na36208d5e950ec4b630b33d0aede8ca3da383d973fc5ca387082c7e5bad8d245 
pypy3.8-v7.3.6-s390x.tar.bz2\nf234c56eb0d4ab0afb196232fb38cd1ca8e19b1c65cf7b65eb691695499be259 pypy3.8-v7.3.6-src.tar.bz2\n055caaab4171e29915aaad602c9a49fa46e2b50a3f56c650772e31467c541858 pypy3.8-v7.3.6-src.zip\n1b216fd75f8f0a48633cc21dce7d6f25ba65016142df758842e1df661269b458 pypy3.8-v7.3.6-win64.zip\n\n\n59c299e9657334d651e2154c77490a743cb507f4f39344f934b2975ca91b4b2f pypy3.8-v7.3.6rc3-aarch64.tar.bz2\n6cd36eb9857d6f7022099300c70666eb706f1e06b404234ea929a341fee40b68 pypy3.8-v7.3.6rc3-linux32.tar.bz2\nacdbc39ade2ef2cf2b4bcf0eb387ec0ef0d257175751d32e9d730886405439d0 pypy3.8-v7.3.6rc3-linux64.tar.bz2\n18fdba4a6c54c7df6fe2521858046ba865261c0e89557c4b53ef37eb7e562806 pypy3.8-v7.3.6rc3-osx64.tar.bz2\n128ede0f5565b626431755d58eb632362c748508e53777d32184eba5da8fdb6d pypy3.8-v7.3.6rc3-s390x.tar.bz2\n0cb9c517a96850c4fba0494ee10b35e87861d71d8b1387e0588c316fa21230ee pypy3.8-v7.3.6rc3-src.tar.bz2\n54704168785a6b22580d46a4a39f5a2c3f81e5d9f0c8e5ba906ac01603d42cbf pypy3.8-v7.3.6rc3-src.zip\n1bd65ab6c82a696f2dcecd9b37679b474eadd149d96aab30438642236a1f7136 pypy3.8-v7.3.6rc3-win64.zip\n\n8ec2b28c6f1558a6abd0ce0a6fb504253b43b013a750c08c1e74470631afc1dd pypy3.8-v7.3.6rc2-aarch64.tar.bz2\n008e9a9336108821f0080011aafe54a71e42ffffb7223d5183e610f689a0f8aa pypy3.8-v7.3.6rc2-linux32.tar.bz2\nb1069fc7b08c2a230630f55f155c3ea016038471490ff0be020f850c5a8ec0cc pypy3.8-v7.3.6rc2-linux64.tar.bz2\n4298d6b1a8333746c43dd313eb6ccd64f11b3dde795921d07f02c8e32d1ac44b pypy3.8-v7.3.6rc2-osx64.tar.bz2\n9f3f7bb2842e626a85c8b314a3af959f98dc4a57fc0169c98b566b6fe645ea39 pypy3.8-v7.3.6rc2-s390x.tar.bz2\na9c3835e37e84a7667e3e548a176986a77663612d30594c7c4877ce0e712c6c9 pypy3.8-v7.3.6rc2-src.tar.bz2\ncae1f0a13b0da3b9db87141e662c3db73564f8fa4e4f1dab2d838341bf8bacc1 pypy3.8-v7.3.6rc2-src.zip\n6415bfd8afb6cef9cd7666de60f58d7fbbabae92042a9c1f3ce5e8ffe9ba4a26 pypy3.8-v7.3.6rc2-win64.zip\n\n18308f227c02ecb84ad21ed4a51bba8472acafe20386caef7ada0058d2d5a243 
pypy3.8-v7.3.6rc1-aarch64.tar.bz2\n9b16a894477cbdb1275ab253d7bc71e8d64ad7d12dd61c835242fdac2cdf6cc7 pypy3.8-v7.3.6rc1-linux32.tar.bz2\n2abcd2a21f17216613c941a6bf6e26b395b089b9aa8f227af9e1b55c86d6d732 pypy3.8-v7.3.6rc1-linux64.tar.bz2\nd3aebc5c862e223606e3a79c245a748da7b9aa7d0206a2400e6c7d906676ef34 pypy3.8-v7.3.6rc1-osx64.tar.bz2\ne5013c21d21ca0eb16bc2e12c4093ec3095150b606830fb10f0c588629412b37 pypy3.8-v7.3.6rc1-s390x.tar.bz2\n999747cb4eacbc23c14e9f71d42c784c35cf45b52a7de9113c6db0811300e526 pypy3.8-v7.3.6rc1-src.tar.bz2\n3c9010fb3d1074c1ac350f0dbc8b215c53b2ab8ca3440d9ca4e903800e2ef1ce pypy3.8-v7.3.6rc1-src.zip\ncef32837d4ab2cd9fbb6173472b633c6996f6a7915d89c66f87f0f0c69edcda2 pypy3.8-v7.3.6rc1-win64.zip\npypy3.7-v7.3.6 sha256:\nd446b6987eeaa03d706603863e83d6b99df69232cf1e06d3ee5706add6a84cd6 pypy3.7-v7.3.6-aarch64.tar.bz2\n459e77c845b31fa9367f7b1b1122155f0ba7888b1d4ce4455c35d2111eeeb275 pypy3.7-v7.3.6-linux32.tar.bz2\nc41d07063b1d002a91ad2a0763b4baaca2b306ec635889c2e4826e706cc7f9ca pypy3.7-v7.3.6-linux64.tar.bz2\n26f0c5c2a5f4a2ce35281d2fa760aa10715300dd110387eac43699a78ed32365 pypy3.7-v7.3.6-osx64.tar.bz2\n3659bf96a177a53426ffc38d3619c6ee307e600c80e924edc9cee604680c141d pypy3.7-v7.3.6-s390x.tar.bz2\n9252ccaa130094205b3c7f0a2cad5adc0d9dfba31658ff3172f788dec1fdb348 pypy3.7-v7.3.6-src.tar.bz2\nc2385436004d7d8d8978650efff1c22512ed9f9808c83ddfd68fe8fe812eb879 pypy3.7-v7.3.6-src.zip\n341e69a369da5a1f4f69dbbd47e7dff5e745439b203e28c7afcf98308a24b003 pypy3.7-v7.3.6-win64.zip\n\n742fc6fa7bdc377e8a8c976f57ef643a9068a0427a5ffbb50f8ba32aa6986392 pypy3.7-v7.3.6rc3-aarch64.tar.bz2\nb5382404935dd09b8a7ac160b593729151c9c907e6df029e3a7f312c53b5038a pypy3.7-v7.3.6rc3-linux32.tar.bz2\n33db78a3c9c9f78eaaf7f52c9c174b1e4c795e5d3294e8364002470a3ced0986 pypy3.7-v7.3.6rc3-linux64.tar.bz2\n3218ef597290ec2983c692a01a6fe9ba5ebf05b8e95fed5e8431b750ec588544 pypy3.7-v7.3.6rc3-osx64.tar.bz2\n4f555251083f633bf044a1bc68d6c50629a374d90f1bee66e245cfac0fdd86f5 
pypy3.7-v7.3.6rc3-s390x.tar.bz2\nf0f047f046bec43e433ee08db460c267518eb5b7df1f4d4d6bc3fd735c06a3bc pypy3.7-v7.3.6rc3-src.tar.bz2\na27d35e75c2486029502590ee862e02af2a3453fa685b42916d618cdbc250fd0 pypy3.7-v7.3.6rc3-src.zip\n67c2e0676b04bbb3bbcf13f5c1f6c97a420b576e362c4948bed0fcbbf64419ee pypy3.7-v7.3.6rc3-win64.zip\n\n7c5877b27ece045af7603436d64c8589eadc920045341bb16c9a773b924b1dfc pypy3.7-v7.3.6rc2-aarch64.tar.bz2\n1afe2650a79ea2f234576986e599d504c1f4ab7928a50e3360cdac3b900c04b3 pypy3.7-v7.3.6rc2-linux32.tar.bz2\nd590359ea1a674b51ea13c2a79d883db38b21c43494c986f90af1f34053111a6 pypy3.7-v7.3.6rc2-linux64.tar.bz2\nbd9a96b9c5c542ef36e1e01f0e1987140d54f7bf04f0434bf3a3b9efe166c912 pypy3.7-v7.3.6rc2-osx64.tar.bz2\n22cab4d077f39dc2ff74ebb0d4505e5e3a5b88f2b909643181f57d7b810391da pypy3.7-v7.3.6rc2-s390x.tar.bz2\n064e4f9fa408bacb67829782d95e2206b20319ae5b15e85993c76532350f57e8 pypy3.7-v7.3.6rc2-src.tar.bz2\n4071597a7450fb0d886005c82c52ed7773e9b0c2015bc93968850071d3195f6d pypy3.7-v7.3.6rc2-src.zip\n6c6ac71a616882a53648d49e3b20dd1991c08e39a422e650cd58e2f12eecf19c pypy3.7-v7.3.6rc2-win64.zip\n\n7cfb96afb7aa7478516c1747da77616edf92b46fda56570bcc3117bed46364c1 pypy3.7-v7.3.6rc1-aarch64.tar.bz2\n8079707602a24ab1b61f8982c8ef858f2780e60c08e02354c377d428326f57dd pypy3.7-v7.3.6rc1-linux32.tar.bz2\nc40b7859933e14ca398e4eba0f70f9dbd521def5279acb4fc7c897d41ac0ac60 pypy3.7-v7.3.6rc1-linux64.tar.bz2\n8d9fde2810f84564902cb37d2d8f7294e5c3ea1fd664ab186864c71edb517d83 pypy3.7-v7.3.6rc1-osx64.tar.bz2\n8c4db2df86239c3e1fa5fb8a4efa5f5ec1f4d55f48ea92a01bd73bdce7fdf9bb pypy3.7-v7.3.6rc1-s390x.tar.bz2\n25b980da5a5ca89a67e3752dfb1bb6ee3cd0804b7961d0a12e2f9180afe5bd07 pypy3.7-v7.3.6rc1-src.tar.bz2\nc2d21937db476d9c2d86f1e8622998278599f0cadda43a6335c6c7ada5403fec pypy3.7-v7.3.6rc1-src.zip\na8d8a861dbff630f902d167da202b654e700b802b1c77643723cd246cef0b2ff pypy3.7-v7.3.6rc1-win64.zip\npypy2.7-v7.3.6 sha256:\n90e9aafb310314938f54678d4d6d7db1163b57c9343e640b447112f74d7f9151 
pypy2.7-v7.3.6-aarch64.tar.bz2\n7a1145f3a278ffab4da0e2d4c4bd024ab8d67106a502e4bb7f6d67337e7af2b7 pypy2.7-v7.3.6-linux32.tar.bz2\n82127f43fae6ce75d47d6c4539f8c1ea372e9c2dbfa40fae8b58351d522793a4 pypy2.7-v7.3.6-linux64.tar.bz2\n9a97de82037d4be1949ec0c35a4d638ba635e8b34948549ae2fa08abd2cbaa8c pypy2.7-v7.3.6-osx64.tar.bz2\nbb29ecbe1f4a05045f0804b3e741267fc2db742249747b36cdbbd18866c15f04 pypy2.7-v7.3.6-s390x.tar.bz2\n0114473c8c57169cdcab1a69c60ad7fef7089731fdbe6f46af55060b29be41e4 pypy2.7-v7.3.6-src.tar.bz2\ncd88f99eccce3b9921a3c7fa452b25d7b60d87ff580bb03237bb1cd0fe2dd031 pypy2.7-v7.3.6-src.zip\nfcc8f6b3b472a77eaa754951f288fe234b4953bfba845888dd839b9b862cb891 pypy2.7-v7.3.6-win64.zip\n\n\ne92e4ba12a62f053e70799e463c7fcb2663b9fa270a16764250385024180cde4 pypy2.7-v7.3.6rc3-aarch64.tar.bz2\n918cf465e1339adcc66d9829b711e30d6a78d764ce74d79407ce35222f24e569 pypy2.7-v7.3.6rc3-linux32.tar.bz2\n21d9ed5a80aee8c320321b32eb3ca0bc89d630646a7371ee560c15296e68e4aa pypy2.7-v7.3.6rc3-linux64.tar.bz2\ndcb0f049626b47d0bef1ff4f6d19c43b92f7c99a2cf2032afcbf3456b0e00425 pypy2.7-v7.3.6rc3-osx64.tar.bz2\n648e6e02e31d0ee17428f90da7fc938c2b6d0a8bd790ca73887c94a1016013d7 pypy2.7-v7.3.6rc3-s390x.tar.bz2\n0b868fe3b6c5a1a498b558395876a5d9cd3f0add649d5c281542db31a086c16b pypy2.7-v7.3.6rc3-src.tar.bz2\neec6ec44cb9e4da0a29118fe98d4c289374af617e5279a77f6759a9713b68d2d pypy2.7-v7.3.6rc3-src.zip\n47f9003c5909271c3ee4ce81de3703e2f17e20d7eba7d7328e8dc29407107b3d pypy2.7-v7.3.6rc3-win64.zip\n\n9de5474ae55d31b02b9d43be26d7b3ea70e24e6e8a24bdc1d2ee396e191f315d pypy2.7-v7.3.6rc2-aarch64.tar.bz2\n85a57d385a0e6072dfcf979654160fecb3f7d3d7a43352a28dff2c9dd63c7b01 pypy2.7-v7.3.6rc2-linux32.tar.bz2\n5e5800b1dcc705476bdc1bb6a195e857390d3fafc6406ba27513bff461cfadf7 pypy2.7-v7.3.6rc2-linux64.tar.bz2\nc6cb5bc6107bdbbf18a18db5b143a9d0476c6578f2d35792c49274d14f6f55ab pypy2.7-v7.3.6rc2-osx64.tar.bz2\na490ab50a846c5587d525aba6ec6cbaeca758e9c6c6941ea0a1738bb78d32b22 
pypy2.7-v7.3.6rc2-s390x.tar.bz2\n1e3870ba5ca5567e4808893ca3361e79f1ba02424059e4459936810ff304ba63 pypy2.7-v7.3.6rc2-src.tar.bz2\n38d18c15a64950822a404e98b9fba8aac671671e4d51553a60923de5992a6ddd pypy2.7-v7.3.6rc2-src.zip\n965f3581e53de1d55f150d78aa9d90b7717a243be494b78d9b88b30ab4a1a8be pypy2.7-v7.3.6rc2-win64.zip\n\nb2957fc3a3fe3957529fdb3e0e85965d46f4b7c09e4101237869f34ddfe5f0d4 pypy2.7-v7.3.6rc1-aarch64.tar.bz2\n37b9c8d41b5ba85b8ab9defd86da98b842f975d72c473bf92c3c1143a9c293cf pypy2.7-v7.3.6rc1-linux32.tar.bz2\nb83967849db84c6e7b7c80b2135788da9c235a89a689729fd044b58d1d92c12f pypy2.7-v7.3.6rc1-linux64.tar.bz2\n63a57129987f54ee692129b53fdf13d635cb6097dc0a1c8cd77f255fc95edda4 pypy2.7-v7.3.6rc1-osx64.tar.bz2\n187e9de4fc4d7edc332275031a40f0de8dc882050b14d5e9b588808c51efedf9 pypy2.7-v7.3.6rc1-s390x.tar.bz2\nbe979c8742181d5646ee1b78eac467612cf61484713ae6862e2b3475b4325b98 pypy2.7-v7.3.6rc1-src.tar.bz2\nc746176c507128e8e5aca14e5a0eaa101955b7cc860ceeba8b20f4f011da4061 pypy2.7-v7.3.6rc1-src.zip\nc515b46bccf1b56fd2f7761a9e3984aa6d56843e848eae67a28fd58fb158a5a9 pypy2.7-v7.3.6rc1-win64.zip\npypy3.7-v7.3.5 sha256:\n85d83093b3ef5b863f641bc4073d057cc98bb821e16aa9361a5ff4898e70e8ee pypy3.7-v7.3.5-aarch64.tar.bz2\n3dd8b565203d372829e53945c599296fa961895130342ea13791b17c84ed06c4 pypy3.7-v7.3.5-linux32.tar.bz2\n9000db3e87b54638e55177e68cbeb30a30fe5d17b6be48a9eb43d65b3ebcfc26 pypy3.7-v7.3.5-linux64.tar.bz2\nb3a7d3099ad83de7c267bb79ae609d5ce73b01800578ffd91ba7e221b13f80db pypy3.7-v7.3.5-osx64.tar.bz2\ndffdf5d73613be2c6809dc1a3cf3ee6ac2f3af015180910247ff24270b532ed5 pypy3.7-v7.3.5-s390x.tar.bz2\nd920fe409a9ecad9d074aa8568ca5f3ed3581be66f66e5d8988b7ec66e6d99a2 pypy3.7-v7.3.5-src.tar.bz2\n61bb9740eaac5dd93577e6b76e8bb1a998daa1df5314bc3b192e6803552e12ea pypy3.7-v7.3.5-src.zip\n072bd22427178dc4e65d961f50281bd2f56e11c4e4d9f16311c703f69f46ae24 pypy3.7-v7.3.5-win64.zip\n\ndbf579f7eb5c527d37ecd43da88cbad02920881b608eb7486d70b4fa31bfc146 
pypy3.7-v7.3.5rc3-aarch64.tar.bz2\nd2daf8b1966497d09be703b939bd0020394e0738095243396b3d5f87cef0d815 pypy3.7-v7.3.5rc3-linux32.tar.bz2\n1f9712fa86a50b1de00eb776f3e99033c2a7911dceaa8bc9daf77aa3d2a95842 pypy3.7-v7.3.5rc3-linux64.tar.bz2\nff1d1ce25f60d9474a950ccc90c5c4af376cba2b8af83b4e30cf33de97611c7e pypy3.7-v7.3.5rc3-osx64.tar.bz2\n8e1c4035ba05161083105f452dfcd463c657085405444afc0acf26ceedb1e8a3 pypy3.7-v7.3.5rc3-s390x.tar.bz2\n9f7215f77106a6df0c201b6025dffdc605cd0731d60ee85a81343a51e64edc76 pypy3.7-v7.3.5rc3-src.tar.bz2\n21cae47ec47bead5d0c5e7a902a1bec85cab1eb30bf7190bd140309c20602110 pypy3.7-v7.3.5rc3-src.zip\n8e40ddc6e4360602597bed44f3ae227d20f8eaa0adfb6a728d10805f76456b74 pypy3.7-v7.3.5rc3-win64.zip\n\n\nc01e59167a26976e764f7b230f6febe0af59982911cd727c551191aed0a843c4 pypy3.7-v7.3.5rc2-aarch64.tar.bz2\n7f8e55f34bf9422576a501c22ae8b82d5d6ffcbf40251a9daf53b5d8d96c2f43 pypy3.7-v7.3.5rc2-linux32.tar.bz2\n93f9ccf44ec92145cf2fe17ac98a07f0adc08866b001c7f023b64a3729ed9710 pypy3.7-v7.3.5rc2-linux64.tar.bz2\n4902ac65329447f2451d2b2b264a12fb95d97a4bb734c75410d2b5abc6e6de52 pypy3.7-v7.3.5rc2-osx64.tar.bz2\nf0d4bbbe4000c836c17168cc709b233b6184039aad69bc9929c415a92bc462a9 pypy3.7-v7.3.5rc2-s390x.tar.bz2\nb1ac30e5e7cd8d04c4472b5c4a71a414d6b0cf08a2026fd1bfc84994598abfda pypy3.7-v7.3.5rc2-src.tar.bz2\nc6c004550444c2f8749d7e34bcdfe404333b5f4bdf08af7745e28371c8358050 pypy3.7-v7.3.5rc2-src.zip\nea41d9e5cb94c7b9e7df2652b74fcc1018ce3e786c9636791b70e46d90e7e8ac pypy3.7-v7.3.5rc2-win64.zip\n\n8dcd20e35e26bf92ce08fc8c97350acb4c773e19a78a89d3b4f28a8be63006d3 pypy3.7-v7.3.5rc1-aarch64.tar.bz2\n04573fd71618d5c26b0828dd306fa02e9eece8a33a020081e55b60d9a6bc6240 pypy3.7-v7.3.5rc1-linux32.tar.bz2\n97c1142f7ac99af03b2c56eb379af6e9ed4eef7d0d37675f4ca5ec33c841d62f pypy3.7-v7.3.5rc1-linux64.tar.bz2\nf4893667f0b978deb891b0b7d91a1117e25299f19c65b31281c40e87dea523d3 pypy3.7-v7.3.5rc1-osx64.tar.bz2\n2880cfa6349aebc5c28aff5df06cabb8c8733dc7090f7f36410eb9ff3def37bc 
pypy3.7-v7.3.5rc1-s390x.tar.bz2\nddccb7e8b24523f3f0e31e6c34b3a61c260b895ac9c7567f560f8ceda675fef8 pypy3.7-v7.3.5rc1-src.tar.bz2\nf39baa99eb0cb4d1505cd43676f86c54cae142f88b9b875542520b8596368ba7 pypy3.7-v7.3.5rc1-src.zip\nab8c5e6bf756f6dda2eba5c2e8d65d8d5de9b3a2c54f2f7a3dfb4f111e40ba0d pypy3.7-v7.3.5rc1-win64.zip\npypy2.7-7.3.5 sha256:\n8dc2c753f8a94eca1a304d7736c99b439c09274f492eaa3446770c6c32ed010e pypy2.7-v7.3.5-aarch64.tar.bz2\n35bb5cb1dcca8e05dc58ba0a4b4d54f8b4787f24dfc93f7562f049190e4f0d94 pypy2.7-v7.3.5-linux32.tar.bz2\n4858b347801fba3249ad90af015b3aaec9d57f54d038a58d806a1bd3217d5150 pypy2.7-v7.3.5-linux64.tar.bz2\n8b10442ef31c3b28048816f858adde6d6858a190d9367001a49648e669cbebb6 pypy2.7-v7.3.5-osx64.tar.bz2\nb91aaa5819ba8af90799eed8eaaba87ceca1fd4dbcbcdb2defc6d313d663b5dd pypy2.7-v7.3.5-s390x.tar.bz2\nc0444fd9873058c1c0d99e13a934e92285cb05992c9968bf523c32bf9bec0a9d pypy2.7-v7.3.5-src.tar.bz2\nc67214acee357d383bb2716269663406611e17cee580026d6d7baa7891afa85b pypy2.7-v7.3.5-src.zip\n0b90eded11ba89a526c4288f17fff7e75000914ac071bd6d67912748ae89d761 pypy2.7-v7.3.5-win64.zip\n\n0f83212202d51835dcedfdfe607fe157d1111a368f7f28738792417acd987c37 pypy2.7-v7.3.5rc3-aarch64.tar.bz2\n6dc2fec9894121cc75500c84509c869648e6fa95c8e8084c81bf17191d80ba8c pypy2.7-v7.3.5rc3-linux32.tar.bz2\n8a918307a51a02ae222e71e2973a4d0dc520a3bae2d510a6571aaf53cf7cead7 pypy2.7-v7.3.5rc3-linux64.tar.bz2\n9376ba404009ce435e7b04a3c194f783b841464031607081081429f079797faa pypy2.7-v7.3.5rc3-osx64.tar.bz2\nc95f5d5cef6181fe08f54824872c94f27177feb5d156fa6dae279a5b8228b13c pypy2.7-v7.3.5rc3-s390x.tar.bz2\nb643dd908e6d07d703f388798e0355e3378a8157833680cbea55c3cf3e4256e2 pypy2.7-v7.3.5rc3-src.tar.bz2\nbaeafa81e445a5b6c8da8ec92c8587a11104f7e125478d669d9eaa45492b7b90 pypy2.7-v7.3.5rc3-src.zip\n21b21873124572043749bb5b19cc33a14ffbf6d8ea5e538006689cc4e3af3d5a pypy2.7-v7.3.5rc3-win64.zip\n\n8250c8db8f227aec3d85f8866f8ad78d925ed338a5622f64c22d6a7fb0963b5a 
pypy2.7-v7.3.5rc2-aarch64.tar.bz2\n978ed1e445809adbaa0ca593abd445384c28d72344bf67184b5cee5e0f76fc3c pypy2.7-v7.3.5rc2-linux32.tar.bz2\na933976a2adc840d07be9ed4ac1dc1b1986fd68f875c4258ed214a2ce9f5f659 pypy2.7-v7.3.5rc2-linux64.tar.bz2\ncbdfe3f9e49cb96b5b182b19ce257a086dbb7204ba01c178db13b4e6272a3260 pypy2.7-v7.3.5rc2-osx64.tar.bz2\nda2bf8e5e8f03f10ffd8c7e970e20ff702a91fc44a6bd0de51f1a79401804e79 pypy2.7-v7.3.5rc2-s390x.tar.bz2\nb47ce66e8d716b22e7b78f1ec0e2d212a27afd355adcb94e00b6d76ffa9a513f pypy2.7-v7.3.5rc2-src.tar.bz2\nb031352443dff2202fcc0ee131887a232214363af1d87ba35886dc683b18eb85 pypy2.7-v7.3.5rc2-src.zip\n47a355033a4c61e679f5ed34274a320adda8df2c27ed313bda0841dc8e11a354 pypy2.7-v7.3.5rc2-win64.zip\n\n4431bc2193f76b97add9726420c6d6ab14b46178e9cfeade5f596016b66b6549 pypy2.7-v7.3.5rc1-aarch64.tar.bz2\nb0d2432bf50bfeeb00e91e048db6df1bba40ca54b0d19d9f61db0f3a4e6e2bf5 pypy2.7-v7.3.5rc1-linux32.tar.bz2\n5a81b1e5733351a1e27e8072f474c60d24ab987dc1355873861b69961da425f5 pypy2.7-v7.3.5rc1-linux64.tar.bz2\nd2e3077b6c0a84e07af5e4c5eb9c883e54bf649ef982dd5310b3e8e68dfffc0e pypy2.7-v7.3.5rc1-osx64.tar.bz2\n5d6a52bbed77855303dadf10a44c1f5e07920ad28948ecf6f13c57eed0c95f8b pypy2.7-v7.3.5rc1-s390x.tar.bz2\n45639e3b398f1dbac54f35e2aebc4770432519dd8838e0190708f1dcfa945356 pypy2.7-v7.3.5rc1-src.tar.bz2\n67329cae37163b4838bb5768dd04ebc75ce1bbb0a62b74da404587f7344d80fc pypy2.7-v7.3.5rc1-src.zip\n6d36595d6cf6f61c33c0e36ae47d9f84abe1ab99cee6cb910a2517d4d3db6cb0 pypy2.7-v7.3.5rc1-win64.zip\npypy3.7-7.3.4 sha256:\na4148fa73b74a091e004e1f378b278c0b8830984cbcb91e10fa31fd915c43efe pypy3.7-v7.3.4-aarch64.tar.bz2\n04de1a2e80530f3d74abcf133ec046a0fb12d81956bc043dee8ab4799f3b77eb pypy3.7-v7.3.4-linux32.tar.bz2\n09d7298b44a38648a87995ec06e1e093761644e50f547c8bb0b2d7f4fe433548 pypy3.7-v7.3.4-linux64.tar.bz2\n8a4f0e6c7e3845820202bf7f46b48e36886ceb820ff0767963fd74091c4f5d13 pypy3.7-v7.3.4-osx64.tar.bz2\n7d6fb180c359a66a158ef6e81eeca88fbabbb62656a1700f425a70db18de2a0f 
pypy3.7-v7.3.4-s390x.tar.bz2\n74d3c1e79f3fc7d384ffb32d3d2a95c2d5f61b81091eccce12ac76030d96ad08 pypy3.7-v7.3.4-src.tar.bz2\n80d4da3aaeb8b4cc5e4e4ea747f2e468e9f448da549aa7ada4d59c24380cda43 pypy3.7-v7.3.4-src.zip\n0ff4e4653f1ff0653f105680eb101c64c857fa8f828a54a61b02f65c94b5d262 pypy3.7-v7.3.4-win64.zip\n\n647e34857d181e7560205eb877915b787836237929c7bd52860de626d5e85e9d pypy3.7-v7.3.4rc2-aarch64.tar.bz2\ncfc661034347d79ba907078b4e3acea4f09d0de0eaf474c5bde173666319780c pypy3.7-v7.3.4rc2-linux32.tar.bz2\ndcf1fa6dd5da4076f040ed4302a22c8da3838335e64cd118c29d69eb7d443d6b pypy3.7-v7.3.4rc2-linux64.tar.bz2\nc9ecc213cdc3169ef230d85e49d9d073ffc1ba0a36bc1d8483f724e31b9d9d12 pypy3.7-v7.3.4rc2-osx64.tar.bz2\nfcc5c02382f67c7ee6f267b459131519b6a72e60ae370d6e398d54c0e07080f9 pypy3.7-v7.3.4rc2-s390x.tar.bz2\nf1257d4d8a3d84e84ff85c83f4f5bc2e126727d7595c536ccbe1a03a280c0df6 pypy3.7-v7.3.4rc2-src.tar.bz2\ndfab9881e2c42ae61115aa6ed77389f835094fd783dc08cf4dee1ebfdd4c1d47 pypy3.7-v7.3.4rc2-src.zip\nb62b7aad962a8c42895a13b08d68b32254934d6d1b1f5f1f02f762cbe111b035 pypy3.7-v7.3.4rc2-win64.zip\n\n958a562528d24fdb33b9fd12f2076f4b546dc218e0793324558560823234adb1 pypy3.7-v7.3.4rc1-aarch64.tar.bz2\nd05299744ac8c6f12bb3587541ce106f3a93d9ed64b0529c46e79b56efd27b24 pypy3.7-v7.3.4rc1-linux32.tar.bz2\nbb7ee16bdf7c1bbbca45d1228502a5c276be33e27e849525aa5a61c0eaec5b4a pypy3.7-v7.3.4rc1-linux64.tar.bz2\n6d3aea12b744413c874e33ff456f6591049e12dc1a356d975dc0e29a047a151e pypy3.7-v7.3.4rc1-osx64.tar.bz2\n8deb01eb54b95e480d2ee03ee9148ba0c1684b410165c198e9f68a015656246e pypy3.7-v7.3.4rc1-src.tar.bz2\nbf247839954a4518327d5cbc9ab1a1b4296982c2fe78671d59a58373239e675e pypy3.7-v7.3.4rc1-src.zip\n0819de5a5212bddef0f615f7ced03dfd9f5d4ee115ec3564119d45b6b447843f pypy3.7-v7.3.4rc1-win64.zip\npypy2.7-7.3.4 sha256:\n9e741162ce486b14fbcf5aa377796d26b0529a9352fb602ee8b66c005f8420d1 pypy2.7-v7.3.4-aarch64.tar.bz2\n653cc3f0612399e494021027f4463d62639dffa4345736a16d0704f3f8a61d5f 
pypy2.7-v7.3.4-linux32.tar.bz2\nd3f7b0625e770d9be62201765d7d2316febc463372fba9c93a12969d26ae03dd pypy2.7-v7.3.4-linux64.tar.bz2\nee7bf42ce843596521e02c763408a5164d18f23c9617f1b8e032ce0675686582 pypy2.7-v7.3.4-osx64.tar.bz2\nf19b70ca5bd918d1349444be775bc2194c8165b0140e6e8b87c3ee101765a5ba pypy2.7-v7.3.4-s390x.tar.bz2\nff9b928237767efe08ccfba79dae489519b3c768fb6e3af52d39c2a8a1c21ca4 pypy2.7-v7.3.4-src.tar.bz2\ne0811ecc272fee58e01b95c4c12f23b115a3e64075a1b50dcefe8faaa6cca869 pypy2.7-v7.3.4-src.zip\n1080012d7a3cea65182528259b51d52b1f61a3717377c2d9ba11ef36e06162d5 pypy2.7-v7.3.4-win64.zip\n\nf0a11bd48a01b27595e659c3a1b7fb936ac6e0a21574f1fc2f57fd032830342a pypy2.7-v7.3.4rc2-aarch64.tar.bz2\n81dd5ac16b11f6f9ba0ff2536306dd85997a6cad86aa4e7971e7805264d61716 pypy2.7-v7.3.4rc2-linux32.tar.bz2\n077acdb14e797878341fc6f50d87a2f0c9b7d25215c6b2f73541bacb7730f64d pypy2.7-v7.3.4rc2-linux64.tar.bz2\n6a220785a962c56db26dd56245aacb7cb6658879ecaad9ada04d26df56da172c pypy2.7-v7.3.4rc2-osx64.tar.bz2\na3201493550457f932ddf743118635a7e8ff6b5c5fd69d0b8596dfeabcc5bffd pypy2.7-v7.3.4rc2-s390x.tar.bz2\n1965dfc3de6fdae83bd954fed206111a020898708d8754705fb1312473be35bf pypy2.7-v7.3.4rc2-src.tar.bz2\n1072727a4a948b16ccebb165015e43716ffc586f5249356c97c454b24aacb2dd pypy2.7-v7.3.4rc2-src.zip\ne20f206ba8751d2c17ad80c66b7f4bd63c2f500cbfa9e8a3906cd7d77955e00f pypy2.7-v7.3.4rc2-win64.zip\n\nee4894169260d3e4c55e06232c96d690e41d13e9f82f1512edcf6b8d960b695d pypy2.7-v7.3.4rc1-aarch64.tar.bz2\nfd736003d5a7f5f2744269d67dc9a96005a5a2ceac8987007bd27ab57681c0f2 pypy2.7-v7.3.4rc1-linux32.tar.bz2\nec1cd67c28416c359dbe1caddf7ae7a0be10e3fbe6435150d39d4b7492469852 pypy2.7-v7.3.4rc1-linux64.tar.bz2\ncce4e360b31010e415e397ce8982535db482e36c0f13934eaa6d9e1e30eb2bc3 pypy2.7-v7.3.4rc1-osx64.tar.bz2\n84930e433a81f16dcf81b678c12167ef951cd74534ee1ee8e6b0b27b0a128e1d pypy2.7-v7.3.4rc1-src.tar.bz2\n7bdc1e5431a7429bd2ec2853c86a68f09069f080b9765a87084904f52adab789 
pypy2.7-v7.3.4rc1-src.zip\n02befc534dbcc2da6ad4c7e60735d977dc8b4f6901630eb599d1684cb86a58c7 pypy2.7-v7.3.4rc1-win64.zip\npypy3.7-7.3.3 sha256:\nee4aa041558b58de6063dd6df93b3def221c4ca4c900d6a9db5b1b52135703a8 pypy3.7-v7.3.3-aarch64.tar.bz2\n7d81b8e9fcd07c067cfe2f519ab770ec62928ee8787f952cadf2d2786246efc8 pypy3.7-v7.3.3-linux32.tar.bz2\n37e2804c4661c86c857d709d28c7de716b000d31e89766599fdf5a98928b7096 pypy3.7-v7.3.3-linux64.tar.bz2\nd72b27d5bb60813273f14f07378a08822186a66e216c5d1a768ad295b582438d pypy3.7-v7.3.3-osx64.tar.bz2\n92000d90b9a37f2e9cb7885f2a872adfa9e48e74bf7f84a8b8185c8181f0502d pypy3.7-v7.3.3-s390x.tar.bz2\nf6c96401f76331e474cca2d14437eb3b2f68a0f27220a6dcbc537445fe9d5b78 pypy3.7-v7.3.3-src.tar.bz2\n9e4756903b14c5f971989a2f5a4de6ee19b21a59f2a798b3ad2ad0e71b2582a5 pypy3.7-v7.3.3-src.zip\na282ce40aa4f853e877a5dbb38f0a586a29e563ae9ba82fd50c7e5dc465fb649 pypy3.7-v7.3.3-win32.zip\n\n54a1697d39f136c3e3961afbd58a049e10a5ed10e6d230e6729d696c226d5185 pypy3.7-v7.3.3rc2-aarch64.tar.bz2\n796c0b57b28850f9a212593f30baf7c241c0ed3fe857048d2ea50b3e13b9773b pypy3.7-v7.3.3rc2-linux32.tar.bz2\nbe427afe0434ac42b4da997c841250c499286c57f1c1e9a764d49787bbeeda38 pypy3.7-v7.3.3rc2-linux64.tar.bz2\ne670772077ea400c8f276f8bea301a0c3fa0f037f7e174ae08b34d46e43ce433 pypy3.7-v7.3.3rc2-osx64.tar.bz2\nb230bfd935d6a4ecfaf890c91431b56cb53325ad988899542b178610f94d5970 pypy3.7-v7.3.3rc2-s390x.tar.bz2\nc4a7f8c8a00073de1f987562bed486c372005e021505d3847562966541e0ea6f pypy3.7-v7.3.3rc2-src.tar.bz2\n26ba0babe260fbc9264c15070b129593ca871c7658a661eacf4c5e27507542f7 pypy3.7-v7.3.3rc2-src.zip\n53959607ea55de6ec5cf15227c195e3356d56629e91279ce26744cb3e392a863 pypy3.7-v7.3.3rc2-win32.zip\n\n45357c23a05bc4e4828c0c0964142a7c45f0bcc6653cae67837ff00a02ececb2 pypy3.7-v7.3.3rc1-aarch64.tar.bz2\n22c04f6984c986895999c73d845e57957d86ab788137e482b60f83aa4983e278 pypy3.7-v7.3.3rc1-linux32.tar.bz2\n2069912448749295537c2b381957c5e07dec103fc9a3322f2ce8a57b3fa6e60c 
pypy3.7-v7.3.3rc1-linux64.tar.bz2\n9fbbf9cfb9ca699e00ea08aaec6248625541998c251033aa3e6d8c592c0a6ff9 pypy3.7-v7.3.3rc1-osx64.tar.bz2\nf502ed792c9da1531a413cd8a7c4c8158c649d7820cb4a910a5852866579c365 pypy3.7-v7.3.3rc1-s390x.tar.bz2\n6780d79e205768a5b2c1d6ecc9e1c4a8c05811cc6b130ed728ba1a53088e0406 pypy3.7-v7.3.3rc1-src.tar.bz2\nedaed54347b69d2a3037e427c60eb88050226cf082d26fff594221cbedab9cd8 pypy3.7-v7.3.3rc1-src.zip\n3c82f4569293dcff5085f0c61af1ba2671217256c58b6e6092629a406eee4fc5 pypy3.7-v7.3.3rc1-win32.zip\npypy3.6-7.3.3 sha256:\nbc82cf7f0182b942a2cfad4a0d167f364bfbf18f434e100a2fe62bc88547ac9b pypy3.6-v7.3.3-aarch64.tar.bz2\nf183c61e66fd2c536a65695bd7ff770748c2884c235a589b9c6ac63690770c69 pypy3.6-v7.3.3-linux32.tar.bz2\n4fb85fdd516482cab727bb9473b066ff8fb672940dedf7ccc32bf92957d29e0a pypy3.6-v7.3.3-linux64.tar.bz2\n84126fcb957f260de221244222152c981643144df1d817329781f555daa52e35 pypy3.6-v7.3.3-osx64.tar.bz2\n0de9c33ff3500c6e7fd273d0a6d341bc839b0298f697c4d6fe141f2b54c5c3e2 pypy3.6-v7.3.3-s390x.tar.bz2\na23d21ca0de0f613732af4b4abb0b0db1cc56134b5bf0e33614eca87ab8805af pypy3.6-v7.3.3-src.tar.bz2\ndf534213c27c6ecc8e7d4f2a6950305301711ea3e132ec7a836959146761c9d8 pypy3.6-v7.3.3-src.zip\nb935253877b703d29b1b11f79e66944f1f88adb8a76f871abf765d4de9d25f8a pypy3.6-v7.3.3-win32.zip\n\n58a35d069bc887c09f8106aec1c0da18241f887dc227bd9e31bd2819496b8256 pypy3.6-v7.3.3rc2-aarch64.tar.bz2\ne171477f56ada45ce64df6f91ad4961c13b674d268b8b16850d1bae5eda43393 pypy3.6-v7.3.3rc2-linux32.tar.bz2\ndf2f421c3782e09ca304f00afd79d7ac24224c3346b41ddae9ab919f4b243538 pypy3.6-v7.3.3rc2-linux64.tar.bz2\n1b2715c8bdf97bbe2135a13562aaeab3408c1459d714412a0b0c607309c5c48b pypy3.6-v7.3.3rc2-osx64.tar.bz2\nd1eaa8ea52f8ce7b02ddc08cff56a64405cfdc7f657edd9bfbb8788484ab9c01 pypy3.6-v7.3.3rc2-s390x.tar.bz2\n3c91a1e911eee1baf9093dcb66899bd06a9ddc095ee60c51c2bca1626497148f pypy3.6-v7.3.3rc2-src.tar.bz2\ne9e5dc879afcddc7ffea09500a092fe00c9070d8fd5008ef0342e0b77c9f9161 
pypy3.6-v7.3.3rc2-src.zip\n7bfdc3544216003b96e76f133073084f2918c5cd29642211735c8507142d107a pypy3.6-v7.3.3rc2-win32.zip\n\n9e65dff7a5bc34d32ea88b9436a9f9629542dd3eb8f948f49ecce40112530199 pypy3.6-v7.3.3rc1-aarch64.tar.bz2\n13a67079e78eaa01dcc2a8aa986a50944bc4bf42469c3c39e3ecb0f0cee31439 pypy3.6-v7.3.3rc1-linux32.tar.bz2\n17fb6dff3a5fd9d9e791ce1cd8ae9076e5f47b8b463b7575e4403f01656b0735 pypy3.6-v7.3.3rc1-linux64.tar.bz2\n2f62a9c9876d83a2bf04d8e5e1373aa7e0dcd1e523a58216e60f20329a536b9b pypy3.6-v7.3.3rc1-osx64.tar.bz2\na652572f3c783c4c9cfae477a6a64584f2df39e4df75773131ab512e486d61f3 pypy3.6-v7.3.3rc1-s390x.tar.bz2\nbd5e6d6ba3bd9bc1a233c2dd77b518fd1d337a37670fe0e23edf837852254ee7 pypy3.6-v7.3.3rc1-src.tar.bz2\ne26c8c95e2d131507a08c3e8b8010e6dd366e8e9bf6e77db6844bc5145be1932 pypy3.6-v7.3.3rc1-src.zip\n773ffcabddc3bdc626318f24f0ba256153eca517775425b618c1c7b8b10f1680 pypy3.6-v7.3.3rc1-win32.zip\npypy2.7-7.3.3 sha256:\n23b145b7cfbaeefb6ee76fc8216c83b652ab1daffac490558718edbbd60082d8 pypy2.7-v7.3.3-aarch64.tar.bz2\nbfbc81874b137837a8ba8c517b97de29f5a336f7ec500c52f2bfdbd3580d1703 pypy2.7-v7.3.3-linux32.tar.bz2\nf412b602ccd6912ddee0e7523e0e38f4b2c7a144449c2cad078cffbdb66fd7b1 pypy2.7-v7.3.3-linux64.tar.bz2\nf34dc4f5ded1f6bcea05841aa9781b9307329e3ab755607917148568824ae0b0 pypy2.7-v7.3.3-osx64.tar.bz2\n8254a7fb98ea66c33324a403d06ccb052d616a4176ce0130591693ceeb011cf7 pypy2.7-v7.3.3-s390x.tar.bz2\nf63488051ba877fd65840bf8d53822a9c6423d947839023b8720139f4b6e2336 pypy2.7-v7.3.3-src.tar.bz2\n5ce67ea6afb0cf1a3e20bbd4bbd375e375f572d5325524f9c7760edf8521f029 pypy2.7-v7.3.3-src.zip\nb3e660dae8d25d8278fd6a0db77e76a16ac9a8c1dca22e7e103d39ed696dc69e pypy2.7-v7.3.3-win32.zip\n\n4f2eee1d8ae2571d6fde76141237cf7717324dd6b6a1aa50036c42266d92cbce pypy2.7-v7.3.3rc2-aarch64.tar.bz2\n79c741bd28f293820382f4ecd81414a327745956fa402a5dcfe38900e7520214 pypy2.7-v7.3.3rc2-linux32.tar.bz2\nb227698c4797170b7fdb427a56632fa7733695dd3b31fd404ce4c0939505f918 
pypy2.7-v7.3.3rc2-linux64.tar.bz2\n451fca86c965e498ce2ada9474c36d316a627bd6aeeeb808b952a447c938c936 pypy2.7-v7.3.3rc2-osx64.tar.bz2\n83147a40ecc2ab39679129f7898756febd09422ee63a0074fb7f844964c189d8 pypy2.7-v7.3.3rc2-s390x.tar.bz2\n1d60d7f9662278ba59f34cd20c0332993c0bb117009309bc06bd3cb651318c36 pypy2.7-v7.3.3rc2-src.tar.bz2\n4810fb6761eccf6f3e6a14f7a8e4010548e551928fef27fb9482b0c7e3e501d5 pypy2.7-v7.3.3rc2-src.zip\n72a43db2c5bd639023adad2a5c9fd7d4db639c5269dcfeb19ef5b0576771ea9b pypy2.7-v7.3.3rc2-win32.zip\n\n061be51e14fc5f16ce38a61b3873239a0a74b02af51be5930b52941bbb3e6eb2 pypy2.7-v7.3.3rc1-aarch64.tar.bz2\n395113ae0a9d1e352e5aef22b1d9e272b029b186d5e1c7e204dd6df044647fc1 pypy2.7-v7.3.3rc1-linux32.tar.bz2\n1e160ff884fdcdc3388b3c88a00ee54d0b11e7b3c94c4787a217eeea76da63e3 pypy2.7-v7.3.3rc1-linux64.tar.bz2\n761b6e9485dd218e63d231f351f908e74c6cc6bb38cc3b61992b92a0e5384f02 pypy2.7-v7.3.3rc1-osx64.tar.bz2\n72d62a3d0bfcb1693f44d5bc3601d528188838df9fbb885e3e18770f81f97e5a pypy2.7-v7.3.3rc1-s390x.tar.bz2\n39fa3f6f0921785c4b44ab2e47777d64480737c710672f09913b2306a1430281 pypy2.7-v7.3.3rc1-src.tar.bz2\n6b5b466e74505e59985ff9583587a417a200ab2d41829b8c72c74daef4c0d44c pypy2.7-v7.3.3rc1-src.zip\n403bce17882ca7f305fedd9f604f5657364e4ef76086064bbed0a31dfbf47155 pypy2.7-v7.3.3rc1-win32.zip\npypy3.6-7.3.2 sha256:\n164d6a0503c83dd328e1a6bf7fcb2b2e977c1d27c6fcc491a7174fd37bc32a12 pypy3.6-v7.3.2-aarch64.tar.bz2\n6fa871dedf5e60372231362d2ccb0f28f623d42267cabb49be11a3e10bee2726 pypy3.6-v7.3.2-linux32.tar.bz2\nd7a91f179076aaa28115ffc0a81e46c6a787785b2bc995c926fe3b02f0e9ad83 pypy3.6-v7.3.2-linux64.tar.bz2\nfd457bfeaf54aa69417b6aa4817df40e702dc8aaaf7e83ba005d391a1bddfa96 pypy3.6-v7.3.2-osx64.tar.bz2\n16afbaa245c016c054d9300c19433efcc76c50664ff2c86d913ff76ed0a729dc pypy3.6-v7.3.2-s390x.tar.bz2\nfd6175fed63ff9fccd7886068078853078948d98afae9bd4f5554c6f7873c10d pypy3.6-v7.3.2-src.tar.bz2\nedcbcd3598a91de3115f86550d1bc76ac46fc0a3e86a1e951769a993f6fbcbf0 
pypy3.6-v7.3.2-src.zip\n13a39d46340afed20f11de24e9068968386e4bb7c8bd168662711916e2bf1da6 pypy3.6-v7.3.2-win32.zip\n\n62e525c6c71c8264c8476e2c4afe11d2aa07b71f9bcf6d694fc4aae27bfcbb66 pypy3.6-v7.3.2rc2-aarch64.tar.bz2\ne9de7036c663f08f06f760340c5d165d8bdecad159abd14d0d93d1bde714ed38 pypy3.6-v7.3.2rc2-linux32.tar.bz2\ne3ac3cf1560f8aee41e542bd999214cbbe0645a4786e4d8a5dc3d58b219429f3 pypy3.6-v7.3.2rc2-linux64.tar.bz2\n7995b74b190f619feb3f393620f63dd0f7cae9e8e298c0616bd184090c356c90 pypy3.6-v7.3.2rc2-osx64.tar.bz2\n9c09100e3302221dbe9776bb3f99e870a8404a2f6afd7a056fa3b7116f5ab013 pypy3.6-v7.3.2rc2-s390x.tar.bz2\nb7d4b3cf3ba7e7749421b1eb857be32d8e5fede124cb2a1d1e1bc606a437b4c5 pypy3.6-v7.3.2rc2-src.tar.bz2\nf5c4f219a974c69b949221082b789a455a67f9f6a37c173cb48a6246ab57f05c pypy3.6-v7.3.2rc2-src.zip\n0555340fdd2e2fcbf114d1f2b57d798269dfccddf1b6419dbe3ce937927b0504 pypy3.6-v7.3.2rc2-win32.zip\n\n1c69cca7292e3c3ffcb7a09f5cdeb51d45e24dc75510b2c9bb410b8ffc57a579 pypy3.6-v7.3.2rc1-aarch64.tar.bz2\nd5738cffc11b364b5f0bf4883c2e1fd46431822f3bd126c7d8c83e9b5f0e6543 pypy3.6-v7.3.2rc1-linux32.tar.bz2\n41cab069841cfc713cc2d0526034f04fcbd741d67d70212926a3ff90754a39f5 pypy3.6-v7.3.2rc1-linux64.tar.bz2\nafabd1ea5a7da31df547c1d4b7028caef1dfaad0ba7e9dda81da2884dfe3062c pypy3.6-v7.3.2rc1-osx64.tar.bz2\n9202fa080d821cca5fe788acfdee3020449e3c36df720ede89ef7389ad6d4a37 pypy3.6-v7.3.2rc1-src.tar.bz2\n8dc4d906720208d590133d580bc7976f7aca1fedf49c3dec1eba1fccb39e0bdc pypy3.6-v7.3.2rc1-src.zip\n29d47b72cf417d12b23161d898dae38f48e48788733623ffb09807e913fbeb44 pypy3.6-v7.3.2rc1-win32.zip\npypy3.7-7.3.2 sha256:\nc5c35a37917f759c19e2a6b3df3b4d56298faa2fae83c143469bcbda42ca5dd2 pypy3.7-v7.3.2-aarch64.tar.bz2\n34c7e1c7bd06e437ad43cc90a20f9444be1f0a264d0955e32098294c30274784 pypy3.7-v7.3.2-linux32.tar.bz2\na285ddcbc909d68c648585fae4f33b0ba24961bb4e8fafe5874cf725d6e83df6 pypy3.7-v7.3.2-linux64.tar.bz2\n337dd4d9e529d2f221e0beb092236c18430e0564ab835c6bba425a1daf7c9958 
pypy3.7-v7.3.2-osx64.tar.bz2\nd4ce71ebba148bf83c24fc963e8282c9b7f0c81fcf6b612301b8efe6bd7658d1 pypy3.7-v7.3.2-s390x.tar.bz2\n9274186eb0c28716a8c6134803b1df857bc3f496e25e50e605c4d95201c8817d pypy3.7-v7.3.2-src.tar.bz2\n23363123c607058dac29995cf281c4609a8d8d278841a8f05ea8559bdb1678a8 pypy3.7-v7.3.2-src.zip\ne3c589be07760bc3042981c379b7fd1603e832a4db426075f09e090473846a96 pypy3.7-v7.3.2-win32.zip\n\n78fe46fa8706e325bd0bdb81d6f0865b7dae0ffb22a77c533a24fa960e885b1b pypy3.7-v7.3.2rc2-aarch64.tar.bz2\n2ed3489e1ea42b1807e79ba46a2dfb2c763bdd4d15efac0fd8ba9cf05ab436bb pypy3.7-v7.3.2rc2-linux32.tar.bz2\n6c67701914b7885e67d282c1286e9109fc79e73ab65b5c164492fb024b8deb7f pypy3.7-v7.3.2rc2-linux64.tar.bz2\n28b48a691276a806bcf0009df5e367d90159b9b4a4161ad9857454999e6915ec pypy3.7-v7.3.2rc2-osx64.tar.bz2\n544023b22670be740970bfc8d67a102dfa045cb229e40271a4197a9e8d3bc5da pypy3.7-v7.3.2rc2-s390x.tar.bz2\n9a3f29338340ab5e006300b68369745bd16f99943a7d48d8440c5a0ad67a5c68 pypy3.7-v7.3.2rc2-src.tar.bz2\n73a6c2241d0a5ce7741a15f8cfd205a6f1eb10310799d912c069d6be58907ba7 pypy3.7-v7.3.2rc2-src.zip\n9a44c694f9c642a7a127241466f72ca58f303d3e148bf5488e34a162c7d7a55b pypy3.7-v7.3.2rc2-win32.zip\n\na7e2376f5e64256aa2e3cf3d403b4c48753c9c2588c57e0fc6bddebefacb3a9d pypy3.7-v7.3.2rc1-aarch64.tar.bz2\ne2b2fa3f83f4a3cc138eb88c3bbf4fde395faec6bc04cd72721623865a366d96 pypy3.7-v7.3.2rc1-linux32.tar.bz2\n8173935a5d1cae7238cb27e35bf881ab0ed0d8bd978d3cf6c80311ed596324ba pypy3.7-v7.3.2rc1-linux64.tar.bz2\ne730cf9e5be8566544a478bf2da4bc4ab84428ac4f4a7bb8e001ea4516a3f3be pypy3.7-v7.3.2rc1-osx64.tar.bz2\n209c2136654ea116c316c6d5305659e8e33d49b9f9f61eee36c06330bb3214ba pypy3.7-v7.3.2rc1-src.tar.bz2\n419020e81793030cb6d011e7c0b75183163a7586a31ae88a6a52689e9c45926e pypy3.7-v7.3.2rc1-src.zip\na6fc9d568c05504759e945e70b94fc55f5e99748eb01da4fb5192231238fa1d7 pypy3.7-v7.3.2rc1-win32.zip\npypy2.7-7.3.2 sha256:\nfce1f06f20ab8bcacb9ac1c33572d6425033de53c3a93fbd5391189cc3e106cb 
pypy2.7-v7.3.2-aarch64.tar.bz2\n78f30ac17abe3cc077fc2456ef55adb51b052c5126011b2a32bacc858acaca7d pypy2.7-v7.3.2-linux32.tar.bz2\n8d4f08116a97153a0f739de8981874d544b564cbc87dd064cca33f36c29da13b pypy2.7-v7.3.2-linux64.tar.bz2\n10ca57050793923aea3808b9c8669cf53b7342c90c091244e9660bf797d397c7 pypy2.7-v7.3.2-osx64.tar.bz2\n042d5e99f660de098de979c4b27f7f8c1332d904db379bb2bf2c3402729749bb pypy2.7-v7.3.2-s390x.tar.bz2\n8189480d8350ad6364d05c2b39fd7d832644d4b1cd018f785126389df45928d1 pypy2.7-v7.3.2-src.tar.bz2\nd891c55f4e657b5e3fe609cee02b2288790abb5554a544ca047f088310d129c4 pypy2.7-v7.3.2-src.zip\n0fd62265e0421a02432f10a294a712a5e784a8e061375e6d8ea5fd619be1be62 pypy2.7-v7.3.2-win32.zip\n\nfa76bfc65200eeb3b32253e674a9339a417aef23f5a5c54e0c519bbbfefcdc7e pypy2.7-v7.3.2rc2-aarch64.tar.bz2\n40ff311202eca98ef3d6edeac4171470135087a8de34296f486c17ec376ebe51 pypy2.7-v7.3.2rc2-linux32.tar.bz2\n379d458c1a9d38c2b3a6a32bd805786fc584739548a697a4ef7b683bcfdfda3e pypy2.7-v7.3.2rc2-linux64.tar.bz2\n3d515a233c83cbc833bcdd0b75354b20dc79b9f6ca892a5db9cadaea36c6bb5b pypy2.7-v7.3.2rc2-osx64.tar.bz2\n41344e1e4d27d774780e9cace6e70c5025b510c82de708ea55b64d21ed0c2f40 pypy2.7-v7.3.2rc2-s390x.tar.bz2\n144bfc9607e6319ba950de9a4d1587020e3f1311cc25a79d1711de78c5992f4f pypy2.7-v7.3.2rc2-src.tar.bz2\nf9de3fe464ca11dfcdd6816b64051f03bdba7c66755b17ddd4f071c4d08cc0fb pypy2.7-v7.3.2rc2-src.zip\n01a9b5b266fde443698cb01c7bac843cc0ed8747f47f1e8930666a4303bf83b2 pypy2.7-v7.3.2rc2-win32.zip\n\n925543a3161153d9b15df49000e96ce2625bf4371619667b5f37616b699acc21 pypy2.7-v7.3.2rc1-linux32.tar.bz2\n6216e1bbac3b86bfd38d16f0685c34c8c9c7aaf908ebd00388844ec295b89c17 pypy2.7-v7.3.2rc1-linux64.tar.bz2\na6fcdb44f12379eb1a547750322bd4c154b6e0c5ee30f9de2d9e2b86b2f2f319 pypy2.7-v7.3.2rc1-osx64.tar.bz2\n9f58b5bacab010d945d9c31e8b7a2539034858f4cdf048f016d8d04430688cc6 pypy2.7-v7.3.2rc1-src.tar.bz2\n0c86b52f6ad09dce1275427c18a216a0cbb5cf0db89eba2389e97ae81416eef7 
pypy2.7-v7.3.2rc1-src.zip\nbbb737f4ce714af0e7797fc951f5231b26ee10f8bca3d969c5b732982f952957 pypy2.7-v7.3.2rc1-win32.zip\npypy2.7-7.3.1 sha256:\n094f23ab262e666d8740bf27459a6b1215a628dad9b6c2a88f1ed5c793fab267 pypy2.7-v7.3.1-aarch64.tar.bz2\ncd155d06cd0956d9de4a16e8a6bdf0722cb45b5bc4bbf805825d393ebd6690ad pypy2.7-v7.3.1-linux32.tar.bz2\nbe74886547df7bf7094096a11fc0a48496779d0d1b71901797b0c816f92caca3 pypy2.7-v7.3.1-linux64.tar.bz2\ndfd4651243441d2f8f1c348e9ecc09848642d0c31bb323aa8ac320e5b9f232f0 pypy2.7-v7.3.1-osx64.tar.bz2\n1b65e085118e44ac57d38a9ba79516c68bf1fdcd65c81c66b5b5ffff06b4463b pypy2.7-v7.3.1-ppc64.tar.bz2\nd81c7177e25bd8b1c99081e32362a29ee467ccd310b17a11161f4a9b96222b20 pypy2.7-v7.3.1-ppc64le.tar.bz2\n71ad5132a6fd32af0b538c17ebd1e0bfe5f5dfa74b129bce242bd28357bf35fc pypy2.7-v7.3.1-s390x.tar.bz2\nfa3771514c8a354969be9bd3b26d65a489c30e28f91d350e4ad2f4081a9c9321 pypy2.7-v7.3.1-src.tar.bz2\n71d764c94f467f9dd75b6af086e2b69e0d520bf6227bcb39055c24c799c135be pypy2.7-v7.3.1-src.zip\ne3c0dfb385d9825dd7723f26576d55d43ed92f1178f2399ab39e9fa11621a47b pypy2.7-v7.3.1-win32.zip\npypy3.6-7.3.1 sha256:\n0069bc3c1570b935f1687f5e128cf050cd7229309e48fad2a2bf2140d43ffcee pypy3.6-v7.3.1-aarch64.tar.bz2\n2e7a818c67f3ac0708e4d8cdf1961f30cf9586b3f3ca2f215d93437c5ea4567b pypy3.6-v7.3.1-linux32.tar.bz2\nf67cf1664a336a3e939b58b3cabfe47d893356bdc01f2e17bc912aaa6605db12 pypy3.6-v7.3.1-linux64.tar.bz2\nd9c1778cd1ba37e129b495ea0f35ccdd9b68f5cd9d33ef0ce24e955c16d8840b pypy3.6-v7.3.1-osx64.tar.bz2\nee02b3e65f0ca49dc09850b57835c2b65d1234f26f7991027ca6d65fadbaa4d9 pypy3.6-v7.3.1-ppc64.tar.bz2\n089fd806629ebf79cb0cb4b0c303d8665f360903b79f0df9214b58dbc42e8231 pypy3.6-v7.3.1-ppc64le.tar.bz2\n147592888e25678c1ae1c2929dc7420b3a0990117fdb25f235cb22476b4e4b5a pypy3.6-v7.3.1-s390x.tar.bz2\n0c2cc3229da36c6984baee128c8ff8bb4516d69df1d73275dc4622bf249afa83 pypy3.6-v7.3.1-src.tar.bz2\n91e7ba30519f2c4c1833280acfb660b48392ef57c5ed0fa4e8af78587a7b8f20 
pypy3.6-v7.3.1-src.zip\n752fbe8c4abee6468e5ce22af82818f821daded36faa65f3d69423f9c217007a pypy3.6-v7.3.1-win32.zip\npypy2.7-7.3.0 sha256:\na3dd8d5e2a656849fa344dce4679d854a19bc4a096a0cf62b46a1be127a5d56c pypy2.7-v7.3.0-aarch64.tar.bz2\neac1308b7d523003a5f6d20f58406d52ab14611bcec750122ae513a5a35110db pypy2.7-v7.3.0-linux32.tar.bz2\nf4950a54378ac637da2a6defa52d6ffed96af12fcd5d74e1182fb834883c9826 pypy2.7-v7.3.0-linux64.tar.bz2\nca7b056b243a6221ad04fa7fc8696e36a2fb858396999dcaa31dbbae53c54474 pypy2.7-v7.3.0-osx64.tar.bz2\n82e62869812aa2953a4f83e96c813cbc52973dfa5e42605e72b6610ac13f2481 pypy2.7-v7.3.0-ppc64.tar.bz2\n592a6db77270b922ffa13cbeced9eabbc36c532ded9fc145f6a19073d3e78499 pypy2.7-v7.3.0-ppc64le.tar.bz2\nd254b82a00021339762198e41ba7f72316010d0f9bd4dcd7b0755185da9c005e pypy2.7-v7.3.0-s390x.tar.bz2\nb0b25c7f8938ab0fedd8dedf26b9e73c490913b002b484c1b2f19d5844a518de pypy2.7-v7.3.0-src.tar.bz2\n42dc84a277e7a5e635fe39bbd745f06135902c229a257123332b7555800d915b pypy2.7-v7.3.0-src.zip\na9e3c5c983edba0313a41d3c1ab55b080816c4129e67a6c272c53b9dbcdd97ec pypy2.7-v7.3.0-win32.zip\npypy3.6-7.3.0 sha256:\nb900241bca7152254c107a632767f49edede99ca6360b9a064141267b47ef598 pypy3.6-v7.3.0-aarch64.tar.bz2\n7045b295d38ba0b5ee65bd3f078ca249fcf1de73fedeaab2d6ad78de2eab0f0e pypy3.6-v7.3.0-linux32.tar.bz2\nd3d549e8f43de820ac3385b698b83fa59b4d7dd6cf3fe34c115f731e26ad8856 pypy3.6-v7.3.0-linux64.tar.bz2\n87b2545dad75fe3027b4b2108aceb9fdadcdd24e61ae312ac48b449fdd452bf3 pypy3.6-v7.3.0-osx64.tar.bz2\ne2587e8da2abb12a86bf75941ce739124d2a1156367a9a3d729ac31d0841c300 pypy3.6-v7.3.0-ppc64.tar.bz2\nd6f3b701313df69483b43ebdd21b9652ae5e808b2eea5fbffe3b74b82d2e7433 pypy3.6-v7.3.0-ppc64le.tar.bz2\n0fe2f7bbf42ea88b40954d7de773a43179a44f40656f2f58201524be70699544 pypy3.6-v7.3.0-s390x.tar.bz2\n48d12c15fbcbcf4a32882a883195e1f922997cde78e7a16d4342b9b521eefcfa pypy3.6-v7.3.0-src.tar.bz2\n8ae9efd0a2aadb19e892bbd07eca8ef51536296a3ef93964149aceba511e79ca 
pypy3.6-v7.3.0-src.zip\n30e6870c4f3d8ef91890a6556a98080758000ba7c207cccdd86a8f5d358998c1 pypy3.6-v7.3.0-win32.zip\npypy2.7-7.2.0 sha256:\n57b0be053c6a5f069e23b843f38863cf7920f5eef7bc89f2e086e5c3a28a2ba9 pypy2.7-v7.2.0-aarch64.tar.bz2\n76d666e5aee54b519d6ec1af4ef0cbdc85f7f9276dd554e97deb026adfd0c936 pypy2.7-v7.2.0-linux32.tar.bz2\n05acf28e6a243026ecad933b9361d8f74b41f00818071b76b38c4694cc4c9599 pypy2.7-v7.2.0-linux64.tar.bz2\n36aa2f2440e762333569118dd0b3d5371d575c40966effa194d116c5453ddb52 pypy2.7-v7.2.0-osx64.tar.bz2\nfb51150a4ce94b0ca8587899ba69c41fc58a6b35c5340ea6926376ecb9cfcac4 pypy2.7-v7.2.0-ppc64.tar.bz2\n5c4224525657c29b815cb2c6b3f9bc5a267368cc6adf0fedb235a6052929f65f pypy2.7-v7.2.0-ppc64le.tar.bz2\nbb7ae585ecb4d904c890e28a2c5b6bd379f57cc3d9e38ff45597ff54fa935eaa pypy2.7-v7.2.0-s390x.tar.bz2\n55cb7757784fbe3952102447f65b27d80e6c885a464a7af1a9ce264492439dcc pypy2.7-v7.2.0-src.tar.bz2\n897038550614d558f9f6718409b107e27903ef2b2b57ec250939d1b1ebdf0aba pypy2.7-v7.2.0-src.zip\n956eeaaaac053e5d0917e77a3d2ad1933ab5561eb3e6e71235780b5aa5fd2bb7 pypy2.7-v7.2.0-win32.zip\npypy2.7-7.1.1 sha256:\n41ca390a76ca0d47b8353a0d6a20d5aab5fad8b0bb647b960d8c33e873d18ef5 pypy2.7-v7.1.1-linux32.tar.bz2\n73b09ef0860eb9ad7997af3030b22909806a273d90786d78420926df53279d66 pypy2.7-v7.1.1-linux64.tar.bz2\n31a17294dec96c2191885c776b4ee02112957dc874f7ba03e570537a77b78c35 pypy2.7-v7.1.1-osx64.tar.bz2\n1ef94c3a9c67c2335cee0b21753036b4696ed588b9d54b7b8036a6ae47f7001d pypy2.7-v7.1.1-s390x.tar.bz2\n5f06bede6d71dce8dfbfe797aab26c8e35cb990e16b826914652dc093ad74451 pypy2.7-v7.1.1-src.tar.bz2\nd9b07a2954ad6dbde94feffd848311e2b5169563d33e3e9f17969579b01a4158 pypy2.7-v7.1.1-src.zip\n9c59226311f216a181e70ee7b5aa4d9665a15d00f24ae02acec9af7d96355f63 pypy2.7-v7.1.1-win32.zip\npypy2.7-7.1.0 sha256:\n44ec91e8cb01caab289d8763c203f3aaf288d14325a6c42692bd1ac4e870d758 pypy2.7-v7.1.0-linux32.tar.bz2\nfef176a29a2ef068c00c8098e59dab935ca6e956f089672b3f7351da95a034f5 
pypy2.7-v7.1.0-linux64.tar.bz2\n8be43685ce718b0768387450fc6dc395d60809b778b6146c353ef67826022153 pypy2.7-v7.1.0-osx64.tar.bz2\nb065f55741bcb37863f1eca30ce91c9d79159371a6994100930cdc2ede3237bc pypy2.7-v7.1.0-s390x.tar.bz2\nb051a71ea5b4fa27d0a744b28e6054661adfce8904dcc82500716b5edff5ce4b pypy2.7-v7.1.0-src.tar.bz2\ne60ce30f9947844da43daaa7658adc0c05330681305225954114772f42df06ec pypy2.7-v7.1.0-src.zip\n76658c9ad679d562b8b6a09d006caa666406337b9834ff56db16980c5e549f20 pypy2.7-v7.1.0-win32.zip\npypy3.6-7.2.0 sha256:\nf82dc9dc6c692417ee9727f23beae75364a5757ebdc657a2a1d0010ac3ad17ab pypy3.6-v7.2.0-aarch64.tar.bz2\n45e99de197cb3e974cfc8d45e0076ad2066852e61e56b3eafd1237efafd2c43e pypy3.6-v7.2.0-linux32.tar.bz2\naa128e555ad0fe5c4c15104ae0903052bd232b6e3a73f5fe023d27b8fd0d6089 pypy3.6-v7.2.0-linux64.tar.bz2\n836abb0ec303b90a684533711ed3b8269d3e8c64805b595e410920abdea678ac pypy3.6-v7.2.0-osx64.tar.bz2\n14021d196e393b3a6d2395ab94ceec347753715e37223efe4c50b7c141b351a2 pypy3.6-v7.2.0-ppc64.tar.bz2\n6aef73a3b68e9a6c062cadd83d3db16790960cf97401ca6f2aad2195e9b05c35 pypy3.6-v7.2.0-ppc64le.tar.bz2\na11da8118064db102d159e9221319c428b298c4a87f26166fd6ae94be8d6ae0d pypy3.6-v7.2.0-s390x.tar.bz2\n0d7c707df5041f1593fe82f29c40056c21e4d6cb66554bbd66769bd80bcbfafc pypy3.6-v7.2.0-src.tar.bz2\n405ac35695dd374d5ea192cb44cb47231f9a65812cc7b6549df33df12ffe54db pypy3.6-v7.2.0-src.zip\nc926f622bec24a8b348591d631717ace83b3a6c3c2dac02b157b622b97d1fc9c pypy3.6-v7.2.0-win32.zip\npypy3.6-7.1.1 sha256:\ncb11ef4b0df569c28390b1ee93029159e1b90bfbad98df6abd629d5203b2abd9 pypy3.6-v7.1.1-linux32.tar.bz2\n8014f63b1a34b155548852c7bf73aab2d41ebddf2c8fb603dc9dd8509be93db0 pypy3.6-v7.1.1-linux64.tar.bz2\na5c2f2bfa2b4a4d29e8a67baab95699b169054066df218a14f171bb84a6df0c0 pypy3.6-v7.1.1-osx64.tar.bz2\n4a91bf2d9a142b6dbf82b5301cb510535ae9a54e1645546b2e0735a7b5ed85ba pypy3.6-v7.1.1-s390x.tar.bz2\n6a3ef876e3691a54f4cff045028ec3be94ab9beb2e99f051b83175302c1899a8 
pypy3.6-v7.1.1-src.tar.bz2\n4a3ebeb767740f2dc0b886d02797d21d7d69f154cf951bb991c19bd485e6cae1 pypy3.6-v7.1.1-src.zip\n8b513b254de5f31890f5956569de9aec3a0a91d7aba72fc89d66901f4a8ccf49 pypy3.6-v7.1.1-win32.zip\npypy 3.6-v7.1.0 sha256:\n031bfac61210a6e161bace0691b854dc15d01b0e624dc0588c544ee5e1621a83 pypy3.6-v7.1.0-linux32.tar.bz2\n270dd06633cf03337e6f815d7235e790e90dabba6f4b6345c9745121006925fc pypy3.6-v7.1.0-linux64.tar.bz2\nd46e005ba095cb4a7006079ffbf4fe63c18cf5e9d8ce9ce8383efc1a4863ab5b pypy3.6-v7.1.0-osx64.tar.bz2\n243cd0cc188a94c1f064f402ae72b8ba4303eb3137eac53c53826472b8005098 pypy3.6-v7.1.0-s390x.tar.bz2\nfaa81f469bb2a7cbd22c64f22d4b4ddc5a1f7c798d43b7919b629b932f9b1c6f pypy3.6-v7.1.0-src.tar.bz2\n4858e7e8a0007bc3b381bd392208b28d30889a4e5a88a3c28e3d9dc4f25b654e pypy3.6-v7.1.0-src.zip\n77a0576a3d518210467f0df2d0d9a1892c664566dc02f25d974c2dbc6b4749e7 pypy3.6-v7.1.0-win32.zip", + "tags": "", + "url": "https://www.pypy.org/checksums.html" + }, + { + "title": "Some Ways that PyPy uses Graphviz", + "text": "Some way that PyPy uses Graphviz\nSomebody wrote this super cool thread on Twitter about using Graphviz to make\nsoftware visualize its internal state:\n\ud83e\uddf5 Make yours and everybody else's lives slightly less terrible by having all your programs print out their internal stuff as pictures; \u2728 a thread \u2728 pic.twitter.com/NjQ42bXN2E\u2014 Kate (@thingskatedid) April 24, 2021 PyPy is using this approach a lot too and I collected a few screenshots of that\ntechnique on Twitter and I thought it would make a nice blog post too!\nThe most important view early in the project, and the way that our Graphviz\nvisualizations got started was that we implemented a way to look at the control\nflow graphs of our RPython functions after type inference. They are in static\nsingle information form (SSI), a variant of SSA form. 
Hovering over the\nvariables shows the inferred types in the footer:\n\nThere's another view that shows the inferred call graph of the program:\n\nA related viewer shows the inferred class hierarchy (in this case the exception\nhierarchy) and you can focus on a single class, which will show you its base\nclasses and all the methods and instance attributes that were found:\n\n\nWe also have a view to show us the traces that are produced by the tracing JIT\ntests. this viewer doesn't really scale to the big traces that the full Python\ninterpreter produces, but it's really useful during testing:\n\nThen there are more traditional tree views, eg here is a parse tree for a small\npiece of Python source code:\n\nParsing-related we have visualized the DFAs of the parser in the past,\nthough the code is unfortunately lost.\nAll these visualizations are made by walking the relevant data structures and\nproducing a Graphviz input file using a bit of string manipulation, which is\nquite easy to do. Knowing a bit of Graphviz is a really useful skill, it's\nsuper easy to make throwaway visualizations.\nFor example here is a one-off thing I did when debugging our JSON parser to\nshow the properties of the objects used in a huge example json file:\n\nOn top of graphviz, we have a custom tool called the dotviewer, which is\nwritten in Python and uses Pygame to give you a zoomable, pannable, searchable\nway to look at huge Graphviz graphs. All the images in this post are\nscreenshots of that tool. 
In its simplest form it takes any .dot files as\ninput.\nHere's a small video of the dotviewer, moving around and searching in the json graph.\nBy writing a bit of extra Python code the dotviewer can also be extended to add\nhyperlinks in the graphs to navigate to different views (for example, we did\nthat for the callgraphs above).\nAll in all this is a really powerful approach to understand the behaviour of\nsome code, or when debugging complicated problems and we have gotten a\nhuge amount of mileage out of this over the years. It can be seen as an instance\nof moldable development (\"a way of programming through which you construct\ncustom tools for each problem\"). And it's really easy to get into! The Graphviz\nlanguage is quite a simple text-based language that can be applied to a huge\namount of different visualization situations.", + "tags": "", + "url": "https://www.pypy.org/posts/2021/04/ways-pypy-graphviz.html" + }, + { + "title": "PyPy v7.3.4: release of python 2.7 and 3.7", + "text": "PyPy v7.3.4: release of python 2.7 and 3.7\nThe PyPy team is proud to release the version 7.3.4 of PyPy, which includes\ntwo different interpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18+ (the + is for\nbackported security updates)\nPyPy3.7, which is an interpreter supporting the syntax and the features of\nPython 3.7, including the stdlib for CPython 3.7.10. We no longer refer to\nthis as beta-quality as the last incompatibilities with CPython (in the\nre module) have been fixed.\n\n\nWe are no longer releasing a Python3.6 version, as we focus on updating to\nPython 3.8. We have begun streaming the advances towards this goal on Saturday\nevenings European time on https://www.twitch.tv/pypyproject. 
If Python3.6 is\nimportant to you, please reach out as we could offer sponsored longer term\nsupport.\nThe two interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the other 7.3\nreleases. Highlights of the release include binary Windows 64 support,\nfaster numerical instance fields, and a preliminary HPy backend.\nA new contributor (Ondrej Baranovi\u010d - thanks!) took us up on the challenge to get\nwindows 64-bit support. The work has been merged and for the first time we\nare releasing a 64-bit Windows binary package.\nThe release contains the biggest change to PyPy's implementation of the\ninstances of user-defined classes in many years. The optimization was\nmotivated by the report of performance problems running a numerical particle\nemulation. We implemented an optimization that stores int and float\ninstance fields in an unboxed way, as long as these fields are type-stable\n(meaning that the same field always stores the same type, using the principle\nof type freezing). This gives significant performance improvements on\nnumerical pure-Python code, and other code where instances store many integers\nor floating point numbers.\nThere were also a number of optimizations for methods around strings and bytes,\nfollowing user reported performance problems. If you are unhappy with PyPy's\nperformance on some code of yours, please report an issue!\nA major new feature is preliminary support for the Universal mode of HPy: a\nnew way of writing c-extension modules to totally encapsulate PyObject*.\nThe goal, as laid out in the HPy documentation and recent HPy blog post,\nis to enable a migration path\nfor c-extension authors who wish their code to be performant on alternative\ninterpreters like GraalPython (written on top of the Java virtual machine),\nRustPython, and PyPy. 
Thanks to Oracle and IBM for sponsoring work on HPy.\nSupport for the vmprof statistical profiler has been extended to ARM64 via a\nbuilt-in backend.\nSeveral issues exposed in the 7.3.3 release were fixed. Many of them came from the\ngreat work ongoing to ship PyPy-compatible binary packages in conda-forge.\nA big shout out to them for taking this on.\nDevelopment of PyPy takes place on https://foss.heptapod.net/pypy/pypy.\nWe have seen an increase in the number of drive-by contributors who are able to\nuse gitlab + mercurial to create merge requests.\nThe CFFI backend has been updated to version 1.14.5 and the cppyy backend\nto 1.14.2. We recommend using CFFI rather than C-extensions to interact with C,\nand using cppyy for performant wrapping of C++ code for Python.\nAs always, we strongly recommend updating to the latest versions. Many fixes\nare the direct result of end-user bug reports, so please continue reporting\nissues as they crop up.\nYou can find links to download the v7.3.4 releases here:\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work. If PyPy is helping you out, we would love to hear about\nit and encourage submissions to our renovated blog site via a pull request\nto https://github.com/pypy/pypy.org\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non PyPy, or general help with making RPython's JIT even better. 
Since the\nprevious release, we have accepted contributions from 10 new contributors,\nthanks for pitching in, and welcome to the project!\nIf you are a python library maintainer and use C-extensions, please consider\nmaking a cffi / cppyy version of your library that would be performant on PyPy.\nIn any case both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and\nsoon 3.8. It's fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThis PyPy release supports:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 32/64 bits, OpenBSD, FreeBSD)\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n64-bit ARM machines running Linux.\n\n\nPyPy does support ARM 32 bit processors, but does not release binaries.\n\n\nWhat else is new?\nFor more information about the 7.3.4 release, see the full changelog.\nPlease update, and continue to help us make PyPy better.\nCheers,\nThe PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2021/04/pypy-v734-release-of-python-27-and-37.html" + }, + { + "title": "New HPy blog", + "text": "Regular readers of this blog\nalready know\nabout HPy, a project which aims to develop a new C\nAPI for Python to make it easier/faster to support C extensions on alternative\nPython implementations, including PyPy.\nThe HPy team just published the\nfirst post of HPy new\nblog, so if you are interested in its development, make sure to check it out!", + "tags": "", + "url": "https://www.pypy.org/posts/2021/03/new-hpy-blog.html" + }, + { + "title": "PyPy's blog has moved", + "text": "For many years, PyPy has been publishing blog posts at\nhttps://morepypy.blogspot.com. 
From now on,\nthe posts will be here, at https://pypy.org/blog. The\nRSS feed is https://pypy.org/rss.xml. The original\ncontent has been migrated to the newer site, including comments.\n\n\nAmong the motivations for the move were:\nOne site to rule them all\nAdding the blog posts here seems like a natural extension of the web site\nrather than outsourcing it to a third-party. Since the site is generated using\nthe static site generator nikola from the github repo\nhttps://github.com/pypy/pypy.org, we also\nhave good source control for the content.\nCI previews, and github\nThose of you who follow PyPy may note something new in the URL for the repo:\nuntil now PyPy has been using mercurial as hosted\non https://foss.heptapod.net. While\nheptapod (a community driven effort to bring mercurial\nsupport to GitLab\u2122) does provide a GitLab CI runner for the open source\noffering, on github it is easier to integrate netlify\nfor previews. Hopefully the move to the more popular github platform will\nencourage new contributors to publish their success stories around using PyPy\nand the RPython toolchain.\nComments\nComments to blog posts are generated via the utterances\njavascript plugin. The comments appear as issues in the repo.\nWhen viewing the site, a query is made to fetch the comments to the issue with\nthat name. To comment, users must authorize the utterances app to post on their\nbehalf using the GitHub\nOAuth flow.\nAlternatively, users can comment on the GitHub issue directly. The interaction\nwith github for authentication and moderation seems more natural than the\nmanual moderation required on blogspot.\nPlease prove to us that the move is worth it\nHelp us with guest blog posts, and PRs to improve the styling of the site. One\nalready open issue is that the\nnavbar needlessly uses javascript, help to keep the responsive style in pure\nCSS is welcome. The theme could also use tweaking.\nBut more importantly, we want to hear from you. 
Guest blog posts about\nPyPy are welcome. Just follow the directions in the repo's README to create a\nPR with your favorite PyPy story.\nThe PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2021/03/pypys-blog-has-moved.html" + }, + { + "title": "Mac meets Arm64", + "text": "Looking for sponsorship\n\nApple now ships Macs which are running on an arm64 variant machine with the\nlatest version of MacOS, Big Sur M1. We are getting requests for PyPy to\nsupport this new architecture. Here is our position on this topic (or at least\nmine, Armin Rigo's), and how you can help.\n\nPorting PyPy is harder than just re-running the compiler, because PyPy contains\na few big architecture-dependent \"details\", like the JIT compiler and the\nforeign function interfaces (CFFI and ctypes).\n\nFixing the JIT compiler should not be too much work: we already support arm64,\njust the Linux one. But Apple made various details different (like the calling\nconventions). A few other parts need to be fixed too, notably CFFI and ctypes,\nagain because of the calling conventions.\n\nFixing that would be a reasonable amount of work. I would do it myself for a\nsmall amount of money. However, the story doesn't finish here. Obviously, the\nstart of the story would be to get ssh access to a Big Sur M1 machine. (If at\nthis point you're thinking \"sure, I can give you ssh access for three months\",\nthen please read on.) The next part of the story is that we need a machine\navailable long term. It can be either a machine provided and maintained by a\nthird party, or alternatively a pot of money big enough to support the\nacquisition of a machine and ongoing work of one of us.\n\nIf we go with the provided-machine solution: What we need isn't a lot of\nresources. Our CI requires maybe 10 GB of disk space, and a few hours of CPU\nper run. It should fit into 8 GB of RAM. We normally do a run every night but\nwe can certainly lower the frequency a bit if that would help. 
However, we'd\nideally like some kind of assurance that you are invested into maintaining the\nmachine for the next 3-5 years (I guess, see below). We had far too many\nmachines that disappeared after a few months.\n\nIf we go with the money-supported solution: it's likely that after 3-5 years\nthe whole Mac base will have switched to arm64, we'll drop x86-64 support for\nMac, and we'll be back to the situation of the past where there was only one\nkind of Mac machine to care about. In the meantime, we are looking at 3-5\nyears of lightweight extra maintenance. We have someone that has said he would\ndo it, but not for free.\n\nIf either of these two solutions occurs, we'll still have, I quote, \"probably\nsome changes in distutils-type stuff to make python happy\", and then some\npackaging/deployment changes to support the \"universal2\" architecture, i.e.\nincluding both versions inside a single executable (which will not be just an\nextra switch to clang, because the two versions need a different JIT backend\nand so must be translated separately).\n\nSo, now all the factors are on the table. We won't do the minimal \"just the\nJIT compiler fixes\" if we don't have a plan that goes farther. Either we get\nsufficient money, and maybe support, and then we can do it quickly; or PyPy\nwill just remain not natively available on M1 hardware for the next 3-5 years.\nWe are looking forward to supporting M1, and view resources contributed by\nthe community as a vote of confidence in assuring the future of PyPy on this\nhardware. 
Contact us: pypy-dev@python.org, or our private mailing\nlist pypy-z@python.org.\n\nThanks for reading!\n\nArmin Rigo", + "tags": "", + "url": "https://www.pypy.org/posts/2020/12/mac-meets-arm64-940822335619099039.html" + }, + { + "title": "PyPy 7.3.3 triple release: python 3.7, 3.6, and 2.7", + "text": "The PyPy team is proud to release the version 7.3.3 of PyPy, which includes\nthree different interpreters:\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.18 (updated from the\nprevious version)PyPy3.6: which is an interpreter supporting the syntax and the features of\nPython 3.6, including the stdlib for CPython 3.6.12 (updated from the\nprevious version).PyPy3.7 beta: which is our second release of an interpreter supporting the\nsyntax and the features of Python 3.7, including the stdlib for CPython\n3.7.9. We call this beta quality software, there may be issues about\ncompatibility with new and changed features in CPython 3.7.\nPlease let us know what is broken or missing. We have not implemented the\ndocumented changes in the re module, and a few other pieces are also\nmissing. For more information, see the PyPy 3.7 wiki page\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the 7.3\nreleases, but read on to find out what is new.\nSeveral issues found in the 7.3.2 release were fixed. Many of them came from the\ngreat work by conda-forge to ship PyPy binary packages. A big shout out\nto them for taking this on.\nDevelopment of PyPy has moved to https://foss.heptapod.net/pypy/pypy.\nThis was covered more extensively in this blog post. We have seen an\nincrease in the number of drive-by contributors who are able to use gitlab +\nmercurial to create merge requests.\nThe CFFI backend has been updated to version 1.14.3. 
We recommend using CFFI\nrather than c-extensions to interact with C, and using cppyy for performant\nwrapping of C++ code for Python.\nA new contributor took us up on the challenge to get windows 64-bit support.\nThe work is proceeding on the win64 branch, more help in coding or\nsponsorship is welcome. In anticipation of merging this large change, we fixed\nmany test failures on windows.\nAs always, this release fixed several issues and bugs. We strongly recommend\nupdating. Many of the fixes are the direct result of end-user bug reports, so\nplease continue reporting issues as they crop up.\nYou can find links to download the v7.3.3 releases here:\n\nhttps://pypy.org/download.html\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work.\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non pypy, or general help with making RPython\u2019s JIT even better. Since the\nprevious release, we have accepted contributions from 2 new contributors,\nthanks for pitching in.\nIf you are a python library maintainer and use c-extensions, please consider\nmaking a cffi / cppyy version of your library that would be performant on PyPy.\nIn any case both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.6, and\n3.7. 
It\u2019s fast (PyPy and CPython 3.7.4 performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThis PyPy release supports:\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)big- and little-endian variants of PPC64 running Linux,s390x running Linux64-bit ARM machines running Linux.\n\nPyPy does support ARM 32 bit processors, but does not release binaries.\u00a0\nWhat else is new?\nFor more information about the 7.3.3 release, see the full changelog.\n\nPlease update, and continue to help us make PyPy better.\n\nCheers,\nThe PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2020/11/pypy-733-triple-release-python-37-36-3446596804408262749.html" + }, + { + "title": "Download (advanced)", + "text": "Contents\n\n\"JIT Compiler\" version\nLinux binaries and common distributions\nPyPy-STM 2.5.1\nOther versions\nInstalling\nInstalling more modules\nBuilding from source\nPackaging\nChecksums\n\n\nWe provide pre-compiled binaries for many platforms and OSes:\n\nthe Python2.7 compatible release \u2014 PyPy2.7 v7.3.17\nthe Python3.10 compatible release \u2014 PyPy3.10 v7.3.17\n\n\nNote\nOur nightly binary builds have the most recent bugfixes and performance\nimprovements, though they can be less stable than the official releases. See\nthis link for older versions.\n\n\n\n\nPyPy latest\n\n\n\n\n\n\n\nOS\nPyPy3.10\nPyPy2.7\nNotes\n\n\n\nLinux x86 64 bit\nDownload\nDownload\ncompatible with CentOS7 and later.\n\nWindows 64 bit\nDownload\nDownload\ncompatible with any windows 64-bit\nyou might need the VC runtime library installer vcredist.x64.exe\n\nMacOS arm64\nDownload\nDownload\nMacOS >= 11. Not signed, for signed packages use Homebrew.\n\nMacOS x86_64\nDownload\nDownload\nMacOS >= 10.15, not for Mojave and below. 
Not signed, for signed\npackages use Homebrew.\n\nLinux ARM64\nDownload\nDownload\ncompatible with CentOS7 and later.\n\n\n\n\n\nOther Platforms\n\n\n\n\n\n\n\nOS\nPyPy3.10\nPyPy2.7\nNotes\n\n\n\nLinux x86 32 bit\nDownload\nDownload\ncompatible with CentOS7 and later\n\n\n\n\n\"JIT Compiler\" version\nThe binaries above include a Just-in-Time compiler. On x86-32, they only work on\nCPUs that have the SSE2 instruction set (most of them do, nowadays). They also\ncontain stackless extensions, like greenlets.\n\n\nLinux binaries and common distributions\nSince version 7.3, the linux x86 binaries ship with versions\nof OpenSSL, SQLite3, libffi, expat, and TCL/TK binary libraries linked in. This\nmakes the binaries \"portable\" so that they should run on any current glibc-based\nlinux platform. The ideas were adopted from the portable-pypy package.\nThis solution to the portability problem means that the versions of the\npackaged libraries are frozen to the version shipped, so updating your system\nlibraries will not affect this installation of PyPy. Also see the note about\nSSL certificates below.\nThere are other solutions:\n\ndownload PyPy from your release vendor (usually an outdated\nversion): Ubuntu (PPA), Debian, Homebrew, MacPorts,\nFedora, Gentoo and Arch are known to package PyPy, with various\ndegrees of being up-to-date. FreshPorts packages for FreeBSD.\nrecompile the CFFI-based TCL/TK, OpenSSL, or sqlite3 modules, using system\nlibraries and the scripts in pypy/lib_pypy/pypy_tools. This solution will\nnot solve compatibility issues with libffi, since that is baked into PyPy.\nor translate your own PyPy.\n\n\n\nNote\nSSL Certificates\nWhile the linux binaries ship an OpenSSL library, they do not ship a\ncertificate store for SSL certificates. If you wish to use SSL module,\nyou will need a valid certificate store. 
You can use the certifi package\nand set SSL_CERT_FILE to certifi.where() or install your platform\ncertificates which should be discovered by the _ssl module.\n\n\nPrevious version can be downloaded from here, or directly from the buildbot's\nmirror.\nIf your CPU is really, really old, it may be a x86-32 without SSE2.\nThere is untested support for manually translating PyPy's JIT without\nSSE2 (--jit-backend=x86-without-sse2) but note that your machine\nis probably low-spec enough that running CPython on it is a better\nidea in the first place.\n\n\nPyPy-STM 2.5.1\nThis is a special version of PyPy! See the Software Transactional\nMemory (STM) documentation.\n\nPyPy-STM Linux x86-64 binary (64bit, tar.bz2 built on Ubuntu 12.04 - 16.04)\n\n\n\nOther versions\nThe other versions of PyPy are:\n\nTry the most up-to-date nightly binary builds , if the official\nrelease is too old for what you want to do.\nReverse debugger: This version enables debugging your Python\nprograms by going forward and backward in time. See the RevDB\ndocumentation.\n\n\nOld-style sandboxing: A special safe version.\nThis is NOT the version announced in-development during 2019!\nRead the docs about sandboxing.\nThis version is not supported and not actively maintained. You\nwill likely have to fix some issues yourself, or checkout an old\nversion, or otherwise play around on your own. We provide this\ndocumentation only for historical reasons. Please do not use in\nproduction. For reference, there are some very old, unmaintained\nbinaries for Linux (32bit, 64bit).\n\n\n\nInstalling\nAll binary versions are packaged in a tar.bz2 or zip file. When\nuncompressed, they run in-place. You can uncompress them\neither somewhere in your home directory or, say, in /opt.\nIf you want, put a symlink from somewhere like\n/usr/local/bin/pypy to /path/to/pypy_expanded/bin/pypy. 
Do\nnot move or copy the executable pypy outside the tree --- put\na symlink to it, otherwise it will not find its libraries.\n\n\nInstalling more modules\nThe typical pip workflow for packages with binary extensions\nrequires that the package maintainers provide a wheel for PyPy, which is\nsometimes too much work for the overburdened maintainers. For more information\nsee the installation documentation_\nIf you use your distribution's PyPy package we recommend you install packages\ninto a virtualenv. If you try to build a module and the build process complains\nabout \"missing Python.h\", you may need to install the pypy-dev package.\n\n\nBuilding from source\n(see more build instructions)\n\nGet the source code. The preferred way is to checkout the current\ntrunk using git. The trunk usually works and is of course\nmore up-to-date:\ngit clone https://github.com/pypy/pypy\n\nThe trunk contains PyPy 2. For PyPy 3, switch to the correct branch:\n# switch to the branch that implements Python 3.10\ngit checkout branches/py3.10\n\nAlternatively, get one of the following smaller packages for the source at\nthe same revision as the above binaries:\n\npypy2.7-v7.3.17-src.tar.bz2 (sources, PyPy 2.7 only)\npypy3.10-v7.3.17-src.tar.bz2 (sources, PyPy 3.10 only)\n\n\nMake sure you installed the dependencies. See the list here.\n\nEnter the goal directory:\ncd pypy/pypy/goal\n\n\nRun the rpython script. Here are the common combinations\nof options (works also with python instead of pypy;\nrequires CPython 2.7 or PyPy 2, even to build PyPy 3):\n# get the JIT version\npypy ../../rpython/bin/rpython -Ojit targetpypystandalone\n# get the no-jit version\npypy ../../rpython/bin/rpython -O2 targetpypystandalone\n# get the sandbox version\npypy ../../rpython/bin/rpython -O2 --sandbox targetpypystandalone\n\n\nEnjoy Mandelbrot :-) It takes on the order of half an hour to\nfinish the translation, and about 3GB of RAM on a 32-bit system\nand about 5GB on 64-bit systems. 
(Do not start a translation on a\nmachine with insufficient RAM! It will just swap forever. See\nnotes below in that case.)\nIf you want to install this PyPy as root, please read the next section,\nPackaging.\n\nNotes:\n\nIt is recommended to use PyPy to do translations, instead of using CPython,\nbecause it is twice as fast. You should just start by downloading an\nofficial release of PyPy (with the JIT). If you really have to use CPython\nthen note that we are talking about CPython 2.7 here, not CPython 3.x.\n(Older versions like 2.6 are out.)\nOn some 32-bit systems, the address space limit of 2 or 3 GB of RAM\ncan be an issue. More generally you may be just a little bit low of\nRAM. First note that 2 GB is really not enough nowadays; on Windows\nyou first need to refer to the Windows build instructions. More\nprecisely, translation on 32-bit takes at this point 2.7 GB if PyPy is\nused and 2.9 GB if CPython is used. There are two workarounds:\n1. use PyPy, not CPython. If you don't have any PyPy so far, not even\nan older version, then you need to build one first, with some parts\nremoved. So, first translate with:\ncpython2 rpython -Ojit targetpypystandalone \\\n--withoutmod-micronumpy --withoutmod-cpyext\n\nthen copy pypy-c and libpypy_c.so somewhere else, and finally\ncall it with ...pypy-c ../../rpython/bin/rpython -Ojit.\n2. if even using PyPy instead of CPython is not enough, try to tweak\nsome internal parameters. Example (slower but saves around 400MB):\nPYPY_DONT_RUN_SUBPROCESS=1 PYPY_GC_MAX_DELTA=200MB \\\npypy --jit loop_longevity=300 ../../rpython/bin/rpython \\\n-Ojit --source\n# then read the next point about --source\n\n\nYou can run translations with --source, which only builds the C\nsource files (and prints at the end where). Then you can cd there\nand execute make. 
This is another way to reduce memory usage.\nNote that afterwards, you have to run manually pypy-c\n.../pypy/tool/build_cffi_imports.py if you want to be able to import\nthe cffi-based modules.\nLike other JITs, PyPy doesn't work out of the box on some Linux\ndistributions that trade full POSIX compliance for extra security\nfeatures. E.g. with PAX, you have to run PyPy with paxctl -cm.\nThis also applies to translation (unless you use CPython to run the\ntranslation and you specify --source).\n\n\n\nPackaging\nOnce PyPy is translated from source, a binary package similar to those\nprovided in the section Default (with a JIT Compiler) above can be\ncreated with the package.py script:\ncd ./pypy/pypy/tool/release/\npython package.py --help # for information\npython package.py --archive-name pypy-my-own-package-name\n\nIt is recommended to use package.py because custom scripts will\ninvariably become out-of-date. If you want to write custom scripts\nanyway, note an easy-to-miss point: some modules are written with CFFI,\nand require some compilation. If you install PyPy as root without\npre-compiling them, normal users will get errors:\n\nPyPy 2.5.1 or earlier: normal users would see permission errors.\nInstallers need to run pypy -c \"import gdbm\" and other similar\ncommands at install time; the exact list is in package.py. Users\nseeing a broken installation of PyPy can fix it after-the-fact if they\nhave sudo rights, by running once e.g. sudo pypy -c \"import gdbm\".\nPyPy 2.6 and later: anyone would get ImportError: no module named\n_gdbm_cffi. Installers need to run pypy _gdbm_build.py in the\nlib_pypy directory during the installation process (plus others;\nsee the exact list in package.py). Users seeing a broken\ninstallation of PyPy can fix it after-the-fact, by running pypy\n/path/to/lib_pypy/_gdbm_build.py. This command produces a file\ncalled _gdbm_cffi.pypy-41.so locally, which is a C extension\nmodule for PyPy. 
You can move it at any place where modules are\nnormally found: e.g. in your project's main directory, or in a\ndirectory that you add to the env var PYTHONPATH.\n\n\n\nChecksums\nChecksums for the downloads are here", + "tags": "", + "url": "https://www.pypy.org/download_advanced.html" + }, + { + "title": "PyPy 7.3.2 triple release: python 2.7, 3.6, and 3.7", + "text": "The PyPy team is proud to release version 7.3.2 of PyPy, which includes\nthree different interpreters:\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.13PyPy3.6: which is an interpreter supporting the syntax and the features of\nPython 3.6, including the stdlib for CPython 3.6.9.PyPy3.7 alpha: which is our first release of an interpreter supporting the\nsyntax and the features of Python 3.7, including the stdlib for CPython\n3.7.9. We call this an alpha release since it is our first. It is based off PyPy 3.6 so\nissues should be around compatibility and not stability. Please try it out\nand let us know what is broken or missing. We have not implemented some of the\ndocumented changes in the re module, and other pieces are also\nmissing. For more information, see the PyPy 3.7 wiki page\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, all APIs are compatible with the 7.3.0 (Dec\n2019) and 7.3.1 (April 2020) releases, but read on to find out what is new.\nConda Forge now supports PyPy as a python interpreter. The support is quite\ncomplete for linux and macOS. This is the result of a lot of\nhard work and good will on the part of the Conda Forge team. A big shout out\nto them for taking this on.\nDevelopment of PyPy has transitioned to https://foss.heptapod.net/pypy/pypy.\nThis move was covered more extensively in this blog post. 
We have seen an\nincrease in the number of drive-by contributors who are able to use gitlab +\nmercurial to create merge requests.\nThe CFFI backend has been updated to version 1.14.2. We recommend using CFFI\nrather than c-extensions to interact with C, and using cppyy for performant\nwrapping of C++ code for Python.\nNumPy has begun shipping wheels on PyPI for PyPy, currently for linux 64-bit\nonly. Wheels for PyPy windows will be available from the next NumPy release. Thanks to NumPy for their support.\nA new contributor took us up on the challenge to get windows 64-bit support.\nThe work is proceeding on the win64 branch, more help in coding or\nsponsorship is welcome.\nAs always, this release fixed several issues and bugs. We strongly recommend\nupdating. Many of the fixes are the direct result of end-user bug reports, so\nplease continue reporting issues as they crop up.You can find links to download the v7.3.2 releases here:\n\nhttps://pypy.org/download.html\nWe would like to thank our donors for the continued support of the PyPy\nproject. Please help support us at Open Collective. If PyPy is not yet good enough for your needs, we are available for\ndirect consulting work.\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non pypy, or general help with making RPython\u2019s JIT even better. Since the\nprevious release, we have accepted contributions from 8 new contributors,\nthanks for pitching in.\nIf you are a python library maintainer and use c-extensions, please consider\nmaking a cffi / cppyy version of your library that would be performant on PyPy.\nIn any case both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7, 3.6, and 3.7. 
It\u2019s fast (PyPy and CPython 2.7.x performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThis PyPy release supports:\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)big- and little-endian variants of PPC64 running Linux,s390x running Linux64-bit ARM machines running Linux.\n\nPyPy does support ARM 32 bit processors, but does not release binaries.\n\n\n\n\nWhat else is new?\nFor more information about the 7.3.2 release, see the full changelog.\n\nPlease update, and continue to help us make PyPy better.\n\nCheers,\nThe PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2020/09/pypy-732-triple-release-python-27-36-3980901335490872787.html" + }, + { + "title": "PyPy is on Open Collective", + "text": "Hi all,\n\nPyPy is now a member of Open Collective, a fiscal host. We have been thinking about switching to this organization for a couple of years; we like it for various reasons, like the budget transparency and the lightweight touch. We can now officially announce our membership!\n\nWith this, we are now again free to use PyPy for all financial issues, like receiving funds professionally, paying parts of sprint budgets as we like, and so on. We will shortly be reintroducing buttons that link to Open Collective from the PyPy web site.\n\nAlthough the old donation buttons were removed last year, we believe that there are still a few people that send regularly money to the SFC, the not-for-profit charity we were affiliated with. If you do, please stop doing it now (and, if you like to do so, please set up an equivalent donation to PyPy on Open Collective).\n\nAnd by the way, sorry for all of you who were getting mixed feelings from the previous blog post (co-written with the SFC). PyPy is committed to continue being Open Source just like before. 
This was never in question. What these two blog posts mean is only that we switched to a different organization for our internal finances.\n\nWe're looking forward to how this new relationship will go!\n\nArmin Rigo, for the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2020/08/pypy-is-on-open-collective-5673322428814364737.html" + }, + { + "title": "A new chapter for PyPy", + "text": "PyPy winds down its membership in the Software Freedom Conservancy\n\nConservancy and PyPy's great work together\n\nPyPy joined Conservancy in\nthe second half of 2010, shortly after the release of\nPyPy 1.2, the first version to contain a fully functional JIT. In 2013, PyPy\nstarted supporting ARM, bringing its just-in-time speediness to many more devices and began working toward supporting NumPy to help\nscientists crunch their numbers faster. Together, PyPy and Conservancy ran successful fundraising drives and facilitated payment\nand oversight for contractors and code sprints.\n\nConservancy supported PyPy's impressive growth as it expanded support for\ndifferent hardware platforms, greatly improved the performance of C extensions,\nand added support for Python 3 as the language itself evolved.\n\nThe road ahead\n \nConservancy provides a fiscal and organizational home for projects that find the\nfreedoms and guardrails that come along with a charitable home advantageous for\ntheir community goals. 
While this framework was a great fit for the early PyPy\ncommunity, times change and all good things must come to an end.\n\nPyPy will remain a free and open source project, but the community's structure\nand organizational underpinnings will be changing and the PyPy community will be\nexploring options outside of the charitable realm for its next phase of growth\n(\"charitable\" in the legal sense -- PyPy will remain a community project).\n\nDuring the last year PyPy and Conservancy have worked together to properly\nutilise the generous donations made by stalwart PyPy enthusiasts over the years\nand to wrap up PyPy's remaining charitable obligations. PyPy is grateful for\nthe Conservancy's help in shepherding the project toward its next chapter.\n\nThank yousFrom Conservancy: \"We are happy that Conservancy was able to help PyPy bring important software\nfor the public good during a critical time in its history. We wish the\ncommunity well and look forward to seeing it develop and succeed in new ways.\" \u2014 Karen Sandler, Conservancy's Executive DirectorFrom PyPy:\"PyPy would like to thank Conservancy for their decade long support in\nbuilding the community and wishes Conservancy continued success in their\njourney promoting, improving, developing and defending free and open source\nsoftware.\" \u2014 Simon Cross & Carl Friedrich Bolz-Tereick, on behalf of PyPy.\n\n\nAbout\n\nPyPy is a multi-layer python interpreter with a built-in JIT compiler that runs\nPython quickly across different computing environments.\nSoftware Freedom Conservancy (Conservancy) is a charity that provides a home\nto over forty free and open source software projects.", + "tags": "pypy", + "url": "https://www.pypy.org/posts/2020/08/a-new-chapter-for-pypy-8388322709667328389.html" + }, + { + "title": "PyPy 7.3.1 released", + "text": "The PyPy team is proud to release the version 7.3.1 of PyPy, which includes\ntwo different interpreters:\n\n\n\n\nPyPy2.7, which is an interpreter supporting 
the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.13\nPyPy3.6: which is an interpreter supporting the syntax and the features of\nPython 3.6, including the stdlib for CPython 3.6.9.\n\n\n\nThe interpreters are based on much the same codebase, thus the multiple\nrelease. This is a micro release, no APIs have changed since the 7.3.0 release\nin December, but read on to find out what is new.\n\n\nConda Forge now supports PyPy as a Python interpreter. The support right now\nis being built out. After this release, many more c-extension-based\npackages can be successfully built and uploaded. This is the result of a lot of\nhard work and good will on the part of the Conda Forge team. A big shout out\nto them for taking this on.\n\n\nWe have worked with the Python packaging group to support tooling around\nbuilding third party packages for Python, so this release updates the pip and\nsetuptools installed when executing pypy -mensurepip to pip>=20. This\ncompletes the work done to update the PEP 425 python tag from pp373 to\nmean \u201cPyPy 7.3 running python3\u201d to pp36 meaning \u201cPyPy running Python\n3.6\u201d (the format is recommended in the PEP). The tag itself was\nchanged in 7.3.0, but older pip versions build their own tag without querying\nPyPy. This means that wheels built for the previous tag format will not be\ndiscovered by pip from this version, so library authors should update their\nPyPy-specific wheels on PyPI.\n\n\nDevelopment of PyPy is transitioning to https://foss.heptapod.net/pypy/pypy.\nThis move was covered more extensively in the blog post from last month.\n\n\nThe CFFI backend has been updated to version 14.0. We recommend using CFFI\nrather than c-extensions to interact with C, and using cppyy for performant\nwrapping of C++ code for Python. 
The cppyy backend has been enabled\nexperimentally for win32, try it out and let us know how it works.\n\n\nEnabling cppyy requires a more modern C compiler, so win32 is now built\nwith MSVC160 (Visual Studio 2019). This is true for PyPy 3.6 as well as for 2.7.\n\n\nWe have improved warmup time by up to 20%, performance of io.StringIO to\nmatch if not be faster than CPython, and improved JIT code generation for\ngenerators (and generator expressions in particular) when passing them to\nfunctions like sum, map, and map that consume them. Performance of closures has also been improved in certain situations.\n\n\nAs always, this release fixed several issues and bugs raised by the growing\ncommunity of PyPy users. We strongly recommend updating. Many of the fixes are\nthe direct result of end-user bug reports, so please continue reporting issues\nas they crop up.\n\nYou can find links to download the v7.3.1 releases here:\n\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work.\n\n\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non PyPy, or general help with making RPython\u2019s JIT even better. Since the\nprevious release, we have accepted contributions from 13 new contributors,\nthanks for pitching in.\n\n\nIf you are a Python library maintainer and use c-extensions, please consider\nmaking a cffi / cppyy version of your library that would be performant on PyPy.\nIn any case both cibuildwheel and the multibuild system support\nbuilding wheels for PyPy.\n\n\n\n\n\u00a0\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7, 3.6, and soon 3.7. 
It\u2019s fast (PyPy and CPython 2.7.x performance\ncomparison) due to its integrated tracing JIT compiler.\n\n\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\n\n\nThis PyPy release supports:\n\n\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n64-bit ARM machines running Linux.\n\n\n\n\n\n\nWhat else is new?\nFor more information about the 7.3.1 release, see the full changelog.\n\nPlease update, and continue to help us make PyPy better.\n\nCheers,\nThe PyPy team\n\n\n\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2020/04/pypy-731-released-6266451647387657480.html" + }, + { + "title": "Leysin 2020 Sprint Report", + "text": "At the end of February ten of us gathered in Leysin, Switzerland to work on\na variety of topics including HPy, PyPy Python 3.7 support and the PyPy\nmigration to Heptapod.\n\n\n\n\nWe had a fun and productive week. The snow was beautiful. There was skiing\nand lunch at the top of Berneuse, cooking together, some late nights at\nthe pub next door, some even later nights coding, and of course the\nobligatory cheese fondue outing.\n\nThere were a few of us participating in a PyPy sprint for the first time\nand a few familiar faces who had attended many sprints. Many different\nprojects were represented including PyPy, HPy, GraalPython,\nHeptapod, and rust-cpython. The atmosphere was relaxed and welcoming, so if\nyou're thinking of attending the next one -- please do!\n\nTopics worked on:\n\n\nHPy\nHPy is a new project to design and implement a better API for extending\nPython in C. If you're unfamiliar with it you can read more about it at\nHPy.\n\nA lot of attention was devoted to the Big HPy Design Discussion which\ntook up two full mornings. 
So much was decided that this will likely\nget its own detailed write-up, but bigger topics included:\n\nthe HPy GetAttr, SetAttr, GetItem and SetItem methods,\nHPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions\nthat pass void* pointers to callbacks,\navoiding having va_args as part of the ABI,\nexception handling,\nsupport for creating custom types.\n\nQuite a few things got worked on too:\n\nimplemented support for writing methods that take keyword arguments with\nHPy_METH_KEYWORDS,\nimplemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,\nstarted implementing support for adding custom types,\nstarted implementing dumping JSON objects in ultrajson-hpy,\nrefactored the PyPy GIL to improve the interaction between HPy and\nPyPy's cpyext,\nexperimented with adding HPy support to rust-cpython.\n\nAnd there was some discussion of the next steps of the HPy initiative\nincluding writing documentation, setting up websites and funding, and\npossibly organising another HPy gathering later in the year.\n\n\nPyPy\n\nGeorges gave a presentation on the Heptapod topic and branch workflows\nand showed everyone how to use hg-evolve.\nWork was done on improving the PyPy CI buildbot post the move to\nheptapod, including a light-weight pre-merge CI and restricting\nwhen the full CI is run to only branch commits.\nA lot of work was done improving the -D tests. \n\n\n\nMiscellaneous\n\nArmin demoed VRSketch and NaN Industries in VR, including an implementation\nof the Game of Life within NaN Industries!\nSkiing!\n\n\n\nAftermath\nImmediately after the sprint large parts of Europe and the world were\nhit by the COVID-19 epidemic. 
It was good to spend time together before\ntravelling ceased to be a sensible idea and many gatherings were cancelled.\n\nKeep safe out there everyone.\n\nThe HPy & PyPy Team & Friends\n\nIn joke for those who attended the sprint: Please don't replace this blog post\nwith its Swedish translation (or indeed a translation to any other language :).", + "tags": "cpyext,CPython,GraalPython,Heptapod,hpy,pypy,pypy3", + "url": "https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.html" + }, + { + "title": "PyPy and CFFI have moved to Heptapod", + "text": "It has been a very busy month, not so much because of deep changes in the JIT of PyPy but more around the development, deployment, and packaging of the project.\n\n\n\u00a0\n\nHosting\nThe biggest news is that we have moved the center of our development off Bitbucket and to the new https://foss.heptapod.net/pypy. This is a friendly fork of Gitlab called heptapod that understands Mercurial and is hosted by Clever Cloud. When Atlassian decided to close down Mercurial hosting on bitbucket.org, PyPy debated what to do. Our development model is based on long-lived branches, and we want to keep the ability to immediately see which branch each commit came from. Mercurial has this, git does not (see our FAQ). Octobus, whose business is Mercurial, developed a way to use Mercurial with Gitlab called heptapod. The product is still under development, but quite usable (i.e., it doesn't get in the way). Octobus partnered with Clever Cloud hosting to offer community FOSS projects hosted on Bitbucket who wish to remain with Mercurial a new home. PyPy took them up on the offer, and migrated its repos to https://foss.heptapod.net/pypy. We were very happy with how smooth it was to import the repos to heptapod/GitLab, and are learning the small differences between Bitbucket and GitLab. 
All the pull requests, issues, and commits kept the same ids, but work is still being done to attribute the issues, pull requests, and comments to the correct users. So from now on, when you want to contribute to PyPy, you do so at the new home.\n\nCFFI, which previously was also hosted on Bitbucket, has joined the PyPy group at https://foss.heptapod.net/pypy/cffi.\n\n\n\u00a0\n\nWebsite\nSecondly, thanks to work by https://baroquesoftware.com/ in leading a redesign and updating the logo, the https://www.pypy.org website has undergone a facelift. It should now be easier to use on small-screen devices. Thanks also to the PSF for hosting the site.\n\n\n\u00a0\n\nPackaging\nAlso, building PyPy from source takes a fair amount of time. While we provide downloads in the form of tarballs or zipfiles, and some platforms such as debian and Homebrew provide packages, traditionally the downloads have only worked on a specific flavor of operating system. A few years ago squeaky-pl started providing portable builds. We have adopted that build system for our linux offerings, so the nightly downloads and release downloads should now work on any glibc platform that has not gone EndOfLife. So there goes another excuse not to use PyPy. And the \"but does it run scipy\" excuse also no longer holds, although \"does it speed up scipy\" still has the wrong answer. For that we are working on HPy, and will be sprinting soon.\nThe latest versions of pip, wheel, and setuptools, together with the manylinux2010 standard for linux wheels and tools such as multibuild or cibuildwheels (well, from the next version) make it easier for library developers to build binary wheels for PyPy. If you are having problems getting going with this, please reach out.\n\n\n\n\u00a0\n\nGive it a try\nThanks to all the folks who provide the infrastructure PyPy depends on. We hope the new look will encourage more involvement and engagement. 
Help prove us right!\n\nThe PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2020/02/pypy-and-cffi-have-moved-to-heptapod-5791595152472747032.html" + }, + { + "title": "Leysin Winter sprint 2020: Feb 29 - March 8th", + "text": "The next PyPy sprint will be in Leysin, Switzerland, for the fourteenth\ntime. This is a fully public sprint: newcomers and topics other than\nthose proposed below are welcome.\n\n\n\n\nGoals and topics of the sprint\nThe list of topics is open.\u00a0 For reference, we would like to work at least partially on the following topics:\n\nHPy \nPython 3.7 support (buildbot status)\n\nAs usual, the main side goal is to have fun in winter sports :-)\nWe can take a day off (for ski or anything else).\n\n\nTimes and accommodation\nThe sprint will occur for one week starting on Saturday, the 29th of February, to Sunday, the 8th of March 2020\u00a0(dates were pushed back one day!)\u00a0 It will occur in Les Airelles, a different bed-and-breakfast place from the traditional one in Leysin.\u00a0 It is a nice old house at the top of the village.\n\nWe have a 4- or 5-people room as well as up to three double-rooms.\u00a0 Please register early!\u00a0 These rooms are not booked for the sprint in advance, and might be already taken if you end up announcing yourself late.\u00a0 We have a big room for up to 7 people with nice view, which might be split in two or three sub-rooms; plus possibly separately-booked double rooms if needed. 
(But it is of course always possible to book at a different place in Leysin.)\n\nFor more information, see our repository or write to me directly at armin.rigo@gmail.com.", + "tags": "", + "url": "https://www.pypy.org/posts/2020/01/leysin-winter-sprint-2020-feb-28-march-6349761524797409012.html" + }, + { + "title": "Python compatibility", + "text": "The goal of this page is to point out some of the differences between running\npython with PyPy and with CPython\n\nTL;DR\nPure python code works, but there are a few differences with object lifetime\nmanagement. Modules that use the CPython C API will probably work, but will\nnot achieve a speedup via the JIT. We encourage library authors to use CFFI\nand HPy instead.\nIf you are looking for how to use PyPy with the scientific python ecosystem,\nwe encourage you to use conda, since they repackage common libraries like\nscikit-learn and SciPy for PyPy.\n\n\nRefcounting, __del__, and resource use\nThe main difference in pure-python code that is not going to be fixed is that\nPyPy does\nnot support refcounting semantics for \"automatically\" releasing state when\nan object's __del__ is called. The following code won't fill the\nfile immediately, but only after a certain period of time, when the GC\ndoes a collection and flushes the output, since the file is only closed when\nthe __del__ method is called:\nopen(\"filename\", \"w\").write(\"stuff\")\n\nThe proper fix is\nwith open(\"filename\", \"w\") as f:\n f.write(\"stuff\")\n\nThe same problem---not closing your files---can also show up if your\nprogram opens a large number of files without closing them explicitly.\nIn that case, you can easily hit the system limit on the number of file\ndescriptors that are allowed to be opened at the same time.\nPyPy can be run with the command-line option -X track-resources (as in,\npypy -X track-resources myprogram.py). This produces a ResourceWarning\nwhen the GC closes a non-closed file or socket. 
The traceback for the place\nwhere the file or socket was allocated is given as well, which aids finding\nplaces where close() is missing.\nSimilarly, remember that you must close() a non-exhausted\ngenerator in order to have its pending finally or with\nclauses executed immediately:\ndef mygen():\n with foo:\n yield 42\n\nfor x in mygen():\n if x == 42:\n break # foo.__exit__ is not run immediately!\n\n# fixed version:\ngen = mygen()\ntry:\n for x in gen:\n if x == 42:\n break\nfinally:\n gen.close()\n\nMore generally, __del__() methods are not executed as predictably\nas on CPython: they run \"some time later\" in PyPy (or not at all if\nthe program finishes running in the meantime). See more details\nhere.\n\n\nWhy is memory usage so high?\nNote that PyPy returns unused memory to the operating system only after\na madvise() system call (at least Linux, OS X, BSD) or on Windows. It is\nimportant to realize that you may not see this in top. The unused\npages are marked with MADV_FREE, which tells the system \"if you\nneed more memory at some point, grab this page\". As long as memory is\nplentiful, the RES column in top might remain high. (Exceptions to\nthis rule are systems with no MADV_FREE, where we use\nMADV_DONTNEED, which forcefully lowers the RES. This includes\nLinux <= 4.4.)\n\n\nMore info\nA more complete list of known differences is available at our dev site.", + "tags": "", + "url": "https://www.pypy.org/compat.html" + }, + { + "title": "Contact", + "text": "irc: #pypy on irc.libera.chat\nmailing list: pypy-dev at python.org\nfor security related issues, non-public funding enquiries etc. 
please contact pypy-z@python.org\nthe issue tracker (registration required to open new issues or to comment)\nmore on our dev site.\ncode on https://github.com/pypy/pypy\nReach out to our consultants for specific projects", + "tags": "", + "url": "https://www.pypy.org/contact.html" + }, + { + "title": "Download and Install", + "text": "We provide pre-compiled binaries for many platforms and OSes. There are also\npre-compiled binaries available on conda-forge. We have found conda-forge\na convenient and cooperative community for distributing not only the\ninterpreter, but many packages like SciPy that are difficult to build and\nwhich do not yet have binary PyPy builds available on PyPI.\n\nNote\nOur nightly binary builds have the most recent bugfixes and performance\nimprovements, though they can be less stable than the official releases. See\nthese links for other versions or more information including other\nplatforms.\n\n\n\nPyPy latest\n\n\n\n\n\n\n\nOS\nPyPy3.10\nPyPy2.7\nNotes\n\n\n\nLinux x86 64 bit\nDownload\nDownload\ncompatible with CentOS7 and later.\n\nWindows 64 bit\nDownload\nDownload\ncompatible with any windows 64-bit\nyou might need the VC runtime library installer vcredist.x64.exe\n\nMacOS arm64\nDownload\nDownload\nMacOS >= 11. Not signed, for signed packages use Homebrew.\n\nMacOS x86_64\nDownload\nDownload\nMacOS >= 10.15, not for Mojave and below. Not signed, for signed\npackages use Homebrew.\n\nLinux ARM64\nDownload\nDownload\ncompatible with CentOS7 and later.\n\n\n\n\n\nNote\nSSL Certificates\nWhile the linux binaries ship an OpenSSL library, they do not ship a\ncertificate store for SSL certificates. If you wish to use SSL module,\nyou will need a valid certificate store. 
You can use the certifi package\nand set SSL_CERT_FILE to certifi.where() or install your platform\ncertificates which should be discovered by the _ssl module.\n\n\n\nSource\n\n3.10 Source (tar.bz2); 3.10 Source (zip).\n2.7 Source (tar.bz2); 2.7 Source (zip).\n\n\n\nMore information\nVisit the more information page for other platforms, information about\nrunning PyPy, STM, instructions on building from source and more.\n\n\nChecksums\nChecksums for the downloads are here", + "tags": "", + "url": "https://www.pypy.org/download.html" + }, + { + "title": "PyPy - Features", + "text": "PyPy is a replacement for CPython. It is built using the RPython\nlanguage that was co-developed with it. The main reason to use it\ninstead of CPython is speed: it runs generally faster (see next section).\nPyPy implements Python 2.7.18 and 3.10.14.\nIt supports all of the core language. It supports most of\nthe commonly used Python standard library modules. For known differences with\nCPython, see our compatibility page.\nThe following CPU architectures are supported and maintained:\n\nx86 (IA-32) and x86_64\nARM platforms (ARMv6 or ARMv7, with VFPv3, and Apple Silicon arm64)\nAArch64\nRISCV\nPowerPC 64bit both little and big endian\nSystem Z (s390x)\n\nPyPy's x86 version runs on several operating systems, such as Linux\n(32/64 bits), MacOS (64 bits), Windows (32 bits), OpenBSD, FreeBSD.\nNon-x86 versions are supported on Linux, and ARM64 is supported on MacOS.\nIf you are interested in helping, see our howtohelp page.\n\nThe main features of PyPy:\n\nSpeed\nOur main executable comes with a Just-in-Time compiler. 
It is\nreally fast in running most benchmarks\u2014including very large and\ncomplicated Python applications, not just 10-liners.\nThere are two cases that you should be aware where PyPy will not be\nable to speed up your code:\n\nShort-running processes: if it doesn't run for at least a few seconds,\nthen the JIT compiler won't have enough time to warm up.\nIf all the time is spent in run-time libraries (i.e. in C functions),\nand not actually running Python code, the JIT compiler will not help.\n\nSo the case where PyPy works best is when executing long-running\nprograms where a significant fraction of the time is spent executing\nPython code. This is the case covered by the majority of our\nbenchmarks, but not all of them --- the goal of PyPy is to get speed\nbut still support (ideally) any Python program.\n\n\nMemory usage\nMemory-hungry Python programs (several hundreds of MBs or more) might\nend up taking less space than they do in CPython. It is not always\nthe case, though, as it depends on a lot of details. Also note that\nthe baseline is higher than CPython's.\n\n\nStackless\nSupport for Stackless and greenlets are now integrated in the normal\nPyPy. More detailed information is available here.\n\n\nOther features\nPyPy has many secondary features and semi-independent\nprojects. We will mention here:\n\nOther languages: we also implemented other languages that makes\nuse of our RPython toolchain: Prolog (almost complete), as\nwell as Smalltalk, JavaScript, Io, Scheme and Gameboy.\nThere is also a Ruby implementation called Topaz and a PHP implementation\ncalled HippyVM.\n\n\n\n\nSandboxing\nPyPy's sandboxing is a working prototype for the idea of running untrusted\nuser programs. Unlike other sandboxing approaches for Python, PyPy's does not\ntry to limit language features considered \"unsafe\". 
Instead we replace all\ncalls to external libraries (C or platform) with a stub that communicates\nwith an external process handling the policy.\n\nNote\nPlease be aware that it is a prototype only. It needs work to become\nmore complete, and you are welcome to help. In particular, almost none\nof the extension modules work (not even time ), and pypy_interact\nis merely a demo. Also, a more complete system would include a way\nto do the same as pypy_interact from other languages than Python,\nto embed a sandboxed interpreter inside programs written in other\nlanguages.\n\nTo run the sandboxed process, you need to get the full sources and\nbuild pypy-sandbox from it (see Building from source). These\ninstructions give you a pypy-c that you should rename to\npypy-sandbox to avoid future confusion. Then run:\ncd pypy/sandbox\npypy_interact.py path/to/pypy-sandbox\n# don't confuse it with pypy/goal/pyinteractive.py!\n\nYou get a fully sandboxed interpreter, in its own filesystem hierarchy\n(try os.listdir('/')). For example, you would run an untrusted\nscript as follows:\nmkdir virtualtmp\ncp untrusted.py virtualtmp/\npypy_interact.py --tmp=virtualtmp pypy-sandbox /tmp/untrusted.py\n\nNote that the path /tmp/untrusted.py is a path inside the sandboxed\nfilesystem. You don't have to put untrusted.py in the real /tmp\ndirectory at all.\nTo read more about its features, try pypy_interact.py --help or go to\nour documentation site.", + "tags": "", + "url": "https://www.pypy.org/features.html" + }, + { + "title": "PyPy", + "text": "A fast, compliant alternative implementation of Python\n\n \n Download PyPy\nWhat is PyPy ?\nDocumentation (external link)\n\n\n\nOn average, PyPy is 4.4 times faster than CPython 3.7. We currently support python 3.10 and 2.7.\n\n\n\nPyPy (with JIT) benchmark times normalized to CPython. Smaller is\nbetter. Based on the geometric average of all benchmarks\n\n\n\"... 
we are avid fans of PyPy and\ncommensurately thankful for the great work by the PyPy team over the\nyears. PyPy has enabled us to use Python for a larger part of our\ntoolset than CPython alone would have supported, and its smooth\nintegration with C/C++ through CFFI has helped us attain a better\ntradeoff between performance and programmer productivity in our\nprojects\"\n-- Vilhj\u00e1lmur \u00deorsteinsson, founder and CEO of Mi\u00f0eind, Feb 2022\nAdvantages and distinct Features\n\nSpeed: thanks to its Just-in-Time compiler, Python programs\noften run faster on PyPy. (What is a JIT compiler?)\nMemory usage: memory-hungry Python programs (several hundreds of\nMBs or more) might end up taking less space than they do in CPython.\nCompatibility: PyPy is highly compatible with existing python code.\nIt supports cffi, cppyy, and can run popular python libraries like\ntwisted, and django. It can also run NumPy, Scikit-learn and more via a\nc-extension compatibility layer.\nStackless: PyPy comes by default with support for stackless mode,\nproviding micro-threads for massive concurrency.\nAs well as other features.", + "tags": "", + "url": "https://www.pypy.org/" + }, + { + "title": "The PyPy Team (from 2008)", + "text": "Armin Rigo\n\nArmin Rigo is a former researcher at the Heinrich-Heine Universitat\nD\u00fcsseldorf (Germany). He studied Mathematics at the University\nof Lausanne (Switzerland), obtained his Ph.D. in Logic and Set\nTheory at the Free University of Brussels (Belgium) in 2002, and\nworked at the University of Southampton (UK) until 2005. He is\nthe author of Psyco, the first just-in-time compiler for Python.\nHe is one of the founders and lead developers of the PyPy project\nwhich began in 2003. 
He has taken part in all areas, from the Python\nlanguage definition to the RPython translation framework,\nincluding the garbage collector and the tracing just-in-time\ncompiler.\n\n\nMaciej Fija\u0142kowski\n\nMaciej is a freelancer working mostly on PyPy for the past several years.\nHe's a core developer since 2006, working on all kinds of parts in\nthe entire codebase including JIT, GC and assembler backends.\nMaciej has been going to many conferences, advertising PyPy to a broader\naudience for the past several years, including a keynote at Pycon 2010.\nHe's also the main maintainer of\njitviewer, a tool for analyzing performance of your python programs under\nPyPy.\n\n\nCarl Friedrich Bolz\n\nCarl Friedrich is a core developer since 2005, currently doing his PhD at the\nHeinrich-Heine Universit\u00e4t D\u00fcsseldorf (Germany). He has worked on most aspects\nof PyPy, from the core interpreter to the GC to the JIT. He has published\nseveral papers about the inner workings of PyPy, presenting them at various\nscientific conferences. Carl Friedrich is also interested in other dynamic\nlanguage implementation and was the original author of the Prolog\nimplementation.\nCarl Friedrich likes science fiction novels and sometimes plays the bassoon.\n\n\nAntonio Cuni\n\nAntonio Cuni loves skiing, mountains and programming languages. He studied\nComputer Science at the University of Genova (Italy), and then at the same\nuniversity he obtained his Ph.D. in Computer Science in 2010, with a\ndissertation about the PyPy CLI JIT backend. He has been a core PyPy\ndeveloper since 2006, working in various areas including the \"object oriented\nbackends\" for the CLI and JVM, the RPython translation framework, the Python\ninterpreter and the JIT compiler generator. Apart from PyPy, he is the author of\nother popular tools such as pdb++.\n\n\nBenjamin Peterson\nBoth a PyPy and CPython core developer, Benjamin knows way too much about the\nnooks and cranies of the Python language. 
He is driven by a fascination with\ninterpreters and compilers of all shapes and sizes. Around the PyPy project, he\ntries to be generally useful and has taken on major projects including rewriting\nPyPy's Python compiler and porting PyPy to Python 2.7.\n\n\nAlex Gaynor\n\nAlex is a software engineer living in Washington, DC. He's been a PyPy developer\nsince 2010, and has worked on many parts of the codebase, including the JIT\ncompiler's optimizers, the RPython translation toolchain, and the Python\ninterpreter. In addition to his work on PyPy, Alex is also the creator of\nTopaz, a Ruby VM built on RPython and a core developer of Django (a Python web\nframework) and CPython, as well as a retired member of the board of directors\nof the Python Software Foundation.\n\n\nH\u00e5kan Ard\u00f6\n\nH\u00e5kan Ard\u00f6 received his master of science degree in electrical\nengineering from Lund University in 2002. He specialized in\nVLSI-design and Image Processing. He worked as a software\nengineer at Axis Communications 2002-2003 before doing his\nPhD at the Centre for Mathematical Sciences of Lund University\n2003-2009 in the Mathematical Imaging Group. His thesis work consisted\nof designing image processing algorithms for traffic surveillance,\naiming for a system that automatically measures the safety of an\nintersection or road segment. He is currently working part-time as a\npostdoc at the Centre for Mathematical Sciences of Lund University\ncontinuing this work and part-time as CTO with a spinoff company\nCognimatics. His contributions to PyPy started in 2010 and consist of\nthe array module as well as work on the JIT compiler's trace optimizers.\n\n\nHolger Krekel\n\nHolger Krekel is a founder of the PyPy project and has participated in\nPyPy core development for several years as well as maintained much of\nits infrastructure. 
He also is the author of the popular py.test and\ntox testing tools as well as execnet, a library for easily deploying\ndifferent interacting Python interpreters side by side. He helped\nmanage multiple PyPy funding contracts through his company merlinux and was a\nPyPy representative within the Software Freedom Conservancy (SFC). He\nholds a summa cum laude degree in computer science with a thesis about\nartificial intelligence applied to the game of Go. As of 2011 he is on\nanother sabbatical-ish leave, caring for his newborn son, travelling\nand pondering what comes next. Other than that he continues to care\nfor testing and some PyPy co-ordination bits behind the scene.\n\n\nSamuele Pedroni\nSamuele Pedroni got involved with PyPy almost at its inception in the\nspring of 2003. One of the design contributors to PyPy, his help has\nranged from infrastructure and processes, through building out\nRPython... optimizing the Python interpreter, to compressing resume\ndata in the last incarnation of the JIT compiler. Tempted away into the\napplication side of the software equation, these days he contributes\nsome words and wisdom to PyPy's paper writing.\n\n\nMany more people\nPyPy is and has always been an effort of many volunteers. Consult the LICENSE\nfile for details.", + "tags": "", + "url": "https://www.pypy.org/people.html" + }, + { + "title": "Performance", + "text": "Contents\n\nProfiling: vmprof\nOptimization strategy\nMicro-tuning tips\n\n\nThis document collects strategies, tactics and tricks for making your\ncode run faster under PyPy. Many of these are also useful hints for\nstock Python and other languages. 
For contrast, we also describe some\nCPython (stock Python) optimizations that are not needed in PyPy.\n\n\nProfiling: vmprof\nAs a general rule, when considering performance issues, follow these\nthree points: first measure them (it is counter-productive to fight\nimaginary performance issues); then profile your code (it is useless\nto optimize the wrong parts). Only optimize then.\nPyPy 2.6 introduced vmprof, a very-low-overhead statistical profiler.\nThe standard, non-statistical cProfile is also supported, and can be\nenabled without turning off the JIT. We do recommend vmprof anyway\nbecause turning on cProfile can distort the result (sometimes massively,\nthough hopefully this should not be too common).\n\n\n\nOptimization strategy\nThese suggestions apply to all computer languages. They're here as\nreminders of things to try before any Python or PyPy-specific tweaking.\n\nBuild a regression-test suite\nBefore you start tuning, build a regression-test suite for your code.\nThis front-loads a significant amount of work, but it means you can\ntry lots of optimizations without worrying so much about introducing\nfunctional bugs.\n\n\nMeasure, don't guess\nHuman beings are bad at guessing or intuiting where the hotspots in code are.\nMeasure, don't guess; use a profiler to pin down the 20% of the\ncode where the code is spending 80% of its time, then speed-tune that.\nMeasuring will save you a lot of effort wasted on tuning parts of the code\nthat aren't actually bottlenecks.\nAs you tune, re-profile frequently so you can see how the hottest spots\nare shifting around.\n\n\nI/O-bound is different from compute-bound\nBe aware of the difference between code that is compute-bound (slow\nbecause it's doing a huge number of instructions) and code that is I/O\nbound (slow because of disk or network delays).\nExpect to get most of your gains from optimizing compute-bound code.\nIt's usually (though not always) a sign that you're near the end of\nworthwhile tuning when 
profiling shows that the bulk of the\napplication's time is spent on network and disk I/O.\n\n\nTune your algorithms first\nGenerally, when your code is doing things that are O(n**2) or larger\nin the size of your data set, the cost of those operations is going\nto swamp any small gains you can pick up with the tricks we describe\nhere.\nTune your algorithms first. It's time to think about applying our\nlist of micro-tuning tips after you think you've optimized out\nintrinsically expensive operations.\nThat said, be prepared for the possibility that you will discover\nbetter-hidden algorithmic problems as you micro-tune. Likely\nyou will go through this cycle more than once.\n\n\nFocus on tight loops\nIt's extremely common for high time costs to lurk within some\ninnocuous-looking code inside a tight loop - especially in code\nthat does something like a searching/matching/lookup operation\nor any kind of graph traversal.\nProbably the most common kind of performance-killer in compute-bound\ncode is an O(n**2) operation that is disguised by being some sort of\nO(n) lookup or match inside an O(n) loop.\nAnother common time-sink is relatively expensive common-setup\noperations that are performed inside tight loops but could be moved\nto before they start. (For a representative case of this, see the\nmicro-tuning tip on regexp compilation.)\n\n\nSmaller is faster\nModern computers have multiple levels of memory caching, some directly\non the processor chip. Causing a cache miss at any level incurs a\nperformance penalty proportional to random-access time for the next\noutward (and much slower) layer of cache.\nAccordingly, smaller is faster. Programs or routines with a small\nenough working set to fit inside a fast cache will be as fast as\nthat cache is. 
To make your code fast, reduce the length of the\nseries of Python or JIT-compiler opcodes it generates by making\nit simpler.\nThe tradeoff here is that algorithmic tuning often trades time for\nspace - that is, it increases the size of an algorithm's working set\nby including pre-computations or tables or reverse maps in order to\navoid O(n**2) operations.\nIt's impossible to predict in advance where the sweet spot in that\ntradeoff will be. You have to try different things and measure -\nwhich takes us right back to \"Measure, don't guess\". And another\nfunction of your regression test suite can be as a speed benchmark.\n\n\n\n\nMicro-tuning tips\nThese are in no particular order.\n\nKeep it simple\nSimple is better than complex. The PyPy JIT is not very smart; the\nsimpler your code is the better it will run. Here again, though, you face\na tradeoff: you may need to pay with more algorithmic complexity in order\nto avoid brute-force operations that are O(n**2) or worse.\nWrite plain-vanilla code in plain-vanilla ways. The PyPy JIT has many\nproductions that optimize a common usage pattern against an uncommon\nusage pattern.\n\n\nGlobal variables\nIn CPython, global variables and functions (including package imports)\nare much more expensive to reference than locals; avoid them. (This\nis also good modularity practice).\nThe cost of CPython global references is high enough that, for example, if you\nhave code in a frequently-visited inner loop that uses int() a lot, it\nmay be worthwhile to create a local copy of the reference with \"int =\nint\" in an enclosing block.\nHowever, this in not true in JITted PyPy code. The \"int = int\" hack\nwon't buy you performance, it's just an extra copy. The modularity\nreason for avoiding globals are still valid.\n\n\nRegular expressions\nRegular-expression compilation is expensive. 
If the regexp pattern in\na search, match, or replace operation is static (doesn't mutate at\nruntime) refactor so it's only done once.\nIf the regexp compilation is in a class method, consider doing it as\nthe initializer of a regexp-valued static (shared) class member and\nusing that class member in your operation.\nIf the regexp compilation is in a free function, consider moving it\nto module level and referencing the resulting regexp object\n(but see the warning above about global variables).\n\n\nOld- vs. new-style classes\nNew-style classes allow faster attribute access and take up less core\nper instance than old-style classes. Much of this advantage may be\nlost, however, if attribute names are not constant. For example: x.a\n= y or even setattr(x, 'a', y) will be much faster than a dynamic\nversion: setattr(x, 'a' + some_variable, y).\nClasses that inherit from both new- and old-style classes are\nextremely slow; avoid at all costs.\nIn PyPy, isinstance() called against an old-style class was very slow\nuntil 2.0.\n\n\nString concatenation is expensive\nIn CPython, you may want to replace:\ns = head + body + maybe + tail\n\nwith the admittedly less readable:\ns = \"%(head)s%(body)s%(maybe)s%(tail)s\" % locals()\n\nor even:\ns = \"{head}{body}{maybe}{tail}\".format(**locals())\n\nBoth of the latter forms avoid multiple-allocation overhead.\nBut PyPy's JIT makes the overhead of intermediate concatenations\ngo away in linear code that keeps the number of concatenations\nsmall, bound and constant. (And locals() is rather slow\nwith PyPy's JIT.)\nOn the other hand, in code like this with a string-valued foo() function:\nfor x in mylist:\n s += foo(x)\n\nthe JIT cannot optimize out intermediate copies. This code is\nactually quadratic in the total size of the mylist strings due to\nrepeated string copies of ever-larger prefix segments. 
(Such code\nis always fine for bytearrays, because in this case += is an\nin-place operation.)\nThis:\nparts = []\nfor x in mylist:\n parts.append(foo(x))\ns = \"\".join(parts)\n\ncan be much faster because all the string concatenation in the last\nline creates exactly one new string object with one C-level copy\nsequence (and list operations are relatively cheap).\n\n\nFrame introspection and tracing are slow\nCertain function calls can disable PyPy's speed options over\nstretches of surrounding code called \"JIT scopes\".\nA JIT like PyPy's works based on the assumption that the only thing\nworth optimizing are loops that are executed often. Whenever the\ninterpreter enters a loop in the interpreted program, the JIT records\nwhat the interpreter does, creating a trace. This trace is optimized,\ncompiled to machine code and executed when the loop is hit with the\nconditions observed during tracing. This trace is one kind of JIT scope.\nAnother kind of JIT scope that matters is a function, considered as\na unit for inlining.\nNote that a JIT scope is a run-time phenomenon, not a compile-time\none. It's not confined by source-code module boundaries. A library-\nor foreign-module call in a frequently-called loop or inlined function\nwill be part of its JIT scope.\nlocals(), globals(), sys._getframe(), sys.exc_info(), and sys.settrace\nwork in PyPy, but they incur a performance penalty that can be huge by\ndisabling the JIT over the enclosing JIT scope.\n(Thanks Eric S. Raymond for the text above)\n\nInsider's point of view\nThis section describes performance issues from the point of view of\ninsiders of the project; it should be particularly interesting if you\nplan to contribute in that area.\nOne of the goals of the PyPy project is to provide a fast and compliant\npython interpreter. Some of the ways we achieve this are by providing a\nhigh-performance garbage collector (GC) and a high-performance\nJust-in-Time compiler (JIT). 
Results of comparing PyPy and CPython can\nbe found on the speed website. Those benchmarks are not a random\ncollection: they are a combination of real-world Python programs ---\nbenchmarks originally included with the (now dead) Unladen Swallow\nproject --- and benchmarks for which we found PyPy to be slow (and improved).\nConsult the descriptions of each for details.\nThe JIT, however, is not a magic bullet. There are several characteristics\nthat might surprise people who are not used to JITs in\ngeneral or to the PyPy JIT in particular. The JIT is generally good at\nspeeding up straight-forward Python code that spends a lot of time in the\nbytecode dispatch loop, i.e., running actual Python code --- as opposed\nto running things that only are invoked by Python code. Good\nexamples include numeric calculations or any kind of heavily\nobject-oriented program. Bad examples include doing computations with\nlarge longs --- which is performed by unoptimizable support code. When the\nJIT cannot help, PyPy is generally slower than CPython.\nMore specifically, the JIT is known not to work on:\n\nTests: The ideal unit tests execute each piece of tested code\nonce. This leaves no time for the JIT to warm up.\nReally short-running scripts: A rule of thumb is if something runs below\n0.2s the JIT has no chance, but it depends a lot on the program in question.\nIn general, make sure you warm up your program before running benchmarks, if\nyou're measuring something long-running like a server. The time required\nto warm up the JIT varies; give it at least a couple of seconds. 
(PyPy's\nJIT takes an especially long time to warm up.)\nLong-running runtime functions: These are the functions provided\nby the runtime of PyPy that do a significant amount of work.\nPyPy's runtime is generally not as optimized as CPython's and we expect those\nfunctions to take somewhere between the same time as CPython to twice as long.\nThis includes, for example, computing with longs, or sorting large lists.\nA counterexample is regular expressions: although they take time, they\ncome with their own JIT.\n\nUnrelated things that we know PyPy to be slow at (note that we're probably\nworking on it):\n\nCPython C extension modules: Any C extension module recompiled\nwith PyPy takes a very large hit in performance. PyPy supports C\nextension modules solely to provide basic functionality.\nIf the extension module is for speedup purposes only, then it\nmakes no sense to use it with PyPy at the moment. Instead, remove it\nand use a native Python implementation, which also allows opportunities\nfor JIT optimization. If the extension module is\nboth performance-critical and an interface to some C library, then it\nmight be worthwhile to consider rewriting it as a pure Python version\nthat uses CFFI for the interface.\nMissing RPython modules: A few modules of the standard library\n(like csv and cPickle) are written in C in CPython, but written\nnatively in pure Python in PyPy. Sometimes the JIT is able to do a\ngood job on them, and sometimes not. In most cases (like csv and\ncPickle), we're slower than CPython, with the notable exception of\njson and heapq.\nAbuse of itertools: The itertools module is often \"abused\" in the\nsense that it is used for the wrong purposes. From our point of view,\nitertools is great if you have iterations over millions of items, but\nnot for most other cases. It gives you 3 lines in functional style\nthat replace 10 lines of Python loops (longer but arguably much easier\nto read). 
The pure Python version is generally not slower even on\nCPython, and on PyPy it allows the JIT to work much better --- simple\nPython code is fast. The same argument also applies to filter(),\nreduce(), and to some extend map() (although the simple case\nis JITted), and to all usages of the operator module we can think\nof.\nCtypes: Ctypes is slower than on CPython. Consider CFFI or HPy\ninstead which have special paths inside the JIT.\n\nWe generally consider things that are slower on PyPy than CPython to be bugs\nof PyPy. If you find some issue that is not documented here,\nplease report it to our bug tracker for investigation.", + "tags": "", + "url": "https://www.pypy.org/performance.html" + }, + { + "title": "PyPy 7.3.0 released", + "text": "The PyPy team is proud to release the version 7.3.0 of PyPy, which includes\ntwo different interpreters:\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.13\nPyPy3.6: which is an interpreter supporting the syntax and the features of\nPython 3.6, including the stdlib for CPython 3.6.9.\n\n\n\n\n\nThe interpreters are based on much the same codebase, thus the double\nrelease.\n\n\nWe have worked with the python packaging group to support tooling around\nbuilding third party packages for python, so this release changes the ABI tag\nfor PyPy.\n\n\nBased on the great work done in portable-pypy, the linux downloads we\nprovide are now built on top of the manylinux2010 CentOS6 docker image.\nThe tarballs include the needed shared objects to run on any platform that\nsupports manylinux2010 wheels, which should include all supported versions of\ndebian- and RedHat-based distributions (including Ubuntu, CentOS, and Fedora).\n\n\nThe CFFI backend has been updated to version 1.13.1. 
We recommend using CFFI\nrather than c-extensions to interact with C.\n\nThe built-in cppyy module was upgraded to 1.10.6, which\nprovides, among others, better template resolution, stricter enum handling,\nanonymous struct/unions, cmake fragments for distribution, optimizations for\nPODs, and faster wrapper calls. We recommend using cppyy for performant\nwrapping of C++ code for Python.\n\n\nThe vendored pyrepl package for interaction inside the REPL was updated.\n\n\nSupport for codepage encoding and decoding was added for Windows.\n\n\nAs always, this release fixed several issues and bugs raised by the growing\ncommunity of PyPy users. We strongly recommend updating. Many of the fixes are\nthe direct result of end-user bug reports, so please continue reporting issues\nas they crop up.\n\nYou can download the v7.3 releases here:\n\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work.\n\n\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular packages to run\non pypy, or general help with making RPython\u2019s JIT even better. Since the\nprevious release, we have accepted contributions from 3 new contributors,\nthanks for pitching in.\n\nIf you are a python library maintainer and use c-extensions, please consider making a cffi / cppyy version of your library that would be performant on PyPy. If you are stuck with using the C-API, you can use docker images with PyPy built in or the multibuild system to build wheels.\n\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7, 3.6. 
It\u2019s fast (PyPy and CPython 2.7.x performance\ncomparison) due to its integrated tracing JIT compiler.\n\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\n\nThis PyPy release supports:\n\nx86 machines on most common operating systems\n(Linux 32/64 bit, Mac OS X 64-bit, Windows 32-bit, OpenBSD, FreeBSD)\n\n\n\n\nbig- and little-endian variants of PPC64 running Linux \n\n\n\n\ns390x running Linux\n\n\n\n\n64-bit ARM machines running Linux\n\nUnfortunately at the moment of writing our ARM buildbots are out of service,\nso for now we are not releasing any binary for the ARM architecture (32-bit), although PyPy does support ARM 32-bit processors.\n\n\n\nWhat else is new?\nPyPy 7.2 was released in October, 2019.\nThere are many incremental improvements to RPython and PyPy, For more information about the 7.3.0 release, see the full changelog.\n\nPlease update, and continue to help us make PyPy better.\n\nCheers,\nThe PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2019/12/pypy-730-released-3614026620096963655.html" + }, + { + "title": "HPy kick-off sprint report", + "text": "Recently Antonio, Armin and Ronan had a small internal sprint in the beautiful\ncity of Gda\u0144sk to kick-off the development of HPy. Here is a brief report of\nwhat was accomplished during the sprint.\n\nWhat is HPy?\nThe TL;DR answer is \"a better way to write C extensions for Python\".\nThe idea of HPy was born during EuroPython 2019 in Basel, where there was an\ninformal meeting which included core developers of PyPy, CPython (Victor\nStinner and Mark Shannon) and Cython (Stefan Behnel). 
The ideas were later also\ndiscussed with Tim Felgentreff of GraalPython, to make sure they would also be\napplicable to this very different implementation; Windel Bouwman of RustPython\nis following the project as well.\nAll of us agreed that the current design of the CPython C API is problematic\nfor various reasons and, in particular, because it is too tied to the current\ninternal design of CPython. The end result is that:\n\n\nalternative implementations of Python (such as PyPy, but not only) have a\nhard time loading and executing existing C extensions;\nCPython itself is unable to change some of its internal implementation\ndetails without breaking the world. For example, as of today it would be\nimpossible to switch from using reference counting to using a real GC,\nwhich in turn makes it hard for example to remove the GIL, as gilectomy\nattempted.\n\n\nHPy tries to address these issues by following two major design guidelines:\n\nobjects are referenced and passed around using opaque handles, which are\nsimilar to e.g., file descriptors in spirit. Multiple, different handles\ncan point to the same underlying object, handles can be duplicated and\neach handle must be released independently of any other duplicate.\nThe internal data structures and C-level layout of objects are not\nvisible nor accessible using the API, so each implementation is free to\nuse what fits best.\n\nThe other major design goal of HPy is to allow incremental transition and\nporting, so existing modules can migrate their codebase one method at a time.\nMoreover, Cython is considering to optionally generate HPy code, so extension\nmodules written in Cython would be able to benefit from HPy automatically.\nMore details can be found in the README of the official HPy repository.\n\n\nTarget ABI\nWhen compiling an HPy extension you can choose one of two different target ABIs:\n\n\nHPy/CPython ABI: in this case, hpy.h contains a set of macros and\nstatic inline functions. 
At compilation time this translates the HPy API\ninto the standard C-API. The compiled module will have no performance\npenalty, and it will have a \"standard\" filename like\nfoo.cpython-37m-x86_64-linux-gnu.so.\nUniversal HPy ABI: as the name implies, extension modules compiled\nthis way are \"universal\" and can be loaded unmodified by multiple Python\ninterpreters and versions. Moreover, it will be possible to dynamically\nenable a special debug mode which will make it easy to find e.g., open\nhandles or memory leaks, without having to recompile the extension.\n\n\nUniversal modules can also be loaded on CPython, thanks to the\nhpy_universal module which is under development. An extra layer of\nindirection enables loading extensions compiled with the universal ABI. Users\nof hpy_universal will face a small performance penalty compared to the ones\nusing the HPy/CPython ABI.\nThis setup gives several benefits:\n\n\nExtension developers can use the extra debug features given by the\nUniversal ABI with no need to use a special debug version of Python.\nProjects which need the maximum level of performance can compile their\nextension for each relevant version of CPython, as they are doing now.\nProjects for which runtime speed is less important will have the choice of\ndistributing a single binary which will work on any version and\nimplementation of Python.\n\n\n\n\nA simple example\nThe HPy repo contains a proof of concept module. 
Here is a simplified\nversion which illustrates what a HPy module looks like:\n\n#include \"hpy.h\"\n\nHPy_DEF_METH_VARARGS(add_ints)\nstatic HPy add_ints_impl(HPyContext ctx, HPy self, HPy *args, HPy_ssize_t nargs)\n{\n long a, b;\n if (!HPyArg_Parse(ctx, args, nargs, \"ll\", &a, &b))\n return HPy_NULL;\n return HPyLong_FromLong(ctx, a+b);\n}\n\n\nstatic HPyMethodDef PofMethods[] = {\n {\"add_ints\", add_ints, HPy_METH_VARARGS, \"\"},\n {NULL, NULL, 0, NULL}\n};\n\nstatic HPyModuleDef moduledef = {\n HPyModuleDef_HEAD_INIT,\n .m_name = \"pof\",\n .m_doc = \"HPy Proof of Concept\",\n .m_size = -1,\n .m_methods = PofMethods\n};\n\n\nHPy_MODINIT(pof)\nstatic HPy init_pof_impl(HPyContext ctx)\n{\n HPy m;\n m = HPyModule_Create(ctx, &moduledef);\n if (HPy_IsNull(m))\n return HPy_NULL;\n return m;\n}\n\nPeople who are familiar with the current C-API will surely notice many\nsimilarities. The biggest differences are:\n\n\nInstead of PyObject *, objects have the type HPy, which as\nexplained above represents a handle.\nYou need to explicitly pass an HPyContext around: the intent is\nprimary to be future-proof and make it easier to implement things like\nsub- interpreters.\nHPy_METH_VARARGS is implemented differently than CPython's\nMETH_VARARGS: in particular, these methods receive an array of HPy\nand its length, instead of a fully constructed tuple: passing a tuple\nmakes sense on CPython where you have it anyway, but it might be an\nunnecessary burden for alternate implementations. Note that this is\nsimilar to the new METH_FASTCALL which was introduced in CPython.\nHPy relies a lot on C macros, which most of the time are needed to support\nthe HPy/CPython ABI compilation mode. 
For example, HPy_DEF_METH_VARARGS\nexpands into a trampoline which has the correct C signature that CPython\nexpects (i.e., PyObject *(*)(PyObject *self, PyObject *args)) and\nwhich calls add_ints_impl.\n\n\n\n\nSprint report and current status\nAfter this long preamble, here is a rough list of what we accomplished during\nthe week-long sprint and the days immediately after.\nOn the HPy side, we kicked off the code in the repo: at the moment of writing\nthe layout of the directories is a bit messy because we moved things around\nseveral times, but we identified several main sections:\n\n\nA specification of the API which serves both as documentation and as an\ninput for parts of the projects which are automatically\ngenerated. Currently, this lives in public_api.h.\n\nA set of header files which can be used to compile extension modules:\ndepending on whether the flag -DHPY_UNIVERSAL_ABI is passed to the\ncompiler, the extension can target the HPy/CPython ABI or the HPy\nUniversal ABI\n\nA CPython extension module called hpy_universal which makes it\npossible to import universal modules on CPython\n\nA set of tests which are independent of the implementation and are meant\nto be an \"executable specification\" of the semantics. Currently, these\ntests are run against three different implementations of the HPy API:\n\n\nthe headers which implement the \"HPy/CPython ABI\"\nthe hpy_universal module for CPython\nthe hpy_universal module for PyPy (these tests are run in the PyPy repo)\n\n\n\n\n\nMoreover, we started a PyPy branch in which to implement the\nhpy_universal module: at the moment of writing PyPy can pass all the HPy\ntests apart from the ones which allow conversion to and from PyObject *.\nAmong the other things, this means that it is already possible to load the\nvery same binary module in both CPython and PyPy, which is impressive on its\nown :).\nFinally, we wanted a real-life use case to show how to port a module to HPy\nand to do benchmarks. 
After some searching, we chose ultrajson, for the\nfollowing reasons:\n\n\nit is a real-world extension module which was written with performance in\nmind\nwhen parsing a JSON file it does a lot of calls to the Python API to\nconstruct the various parts of the result message\nit uses only a small subset of the Python API\n\n\nThis repo contains the HPy port of ultrajson. This commit shows an example\nof what the porting looks like.\nujson_hpy is also a very good example of incremental migration: so far\nonly ujson.loads is implemented using the HPy API, while ujson.dumps\nis still implemented using the old C-API, and both can coexist nicely in the\nsame compiled module.\n\n\nBenchmarks\nOnce we have a fully working ujson_hpy module, we can finally run\nbenchmarks! We tested several different versions of the module:\n\n\nujson: this is the vanilla implementation of ultrajson using the\nC-API. On PyPy this is executed by the infamous cpyext compatibility\nlayer, so we expect it to be much slower than on CPython\nujson_hpy: our HPy port compiled to target the HPy/CPython ABI. We\nexpect it to be as fast as ujson\nujson_hpy_universal: same as above but compiled to target the\nUniversal HPy ABI. We expect it to be slightly slower than ujson on\nCPython, and much faster on PyPy.\n\n\nFinally, we also ran the benchmark using the builtin json module. This is\nnot really relevant to HPy, but it might still be interesting as a\nreference data point.\nThe benchmark is very simple and consists of parsing a big JSON file 100\ntimes. 
Here is the average time per iteration (in milliseconds) using the\nvarious versions of the module, CPython 3.7 and the latest version of the hpy\nPyPy branch:\n\n\n\n\n\n\n\n\u00a0\nCPython\nPyPy\n\nujson\n154.32\n633.97\n\nujson_hpy\n152.19\n\u00a0\n\nujson_hpy_universal\n168.78\n207.68\n\njson\n224.59\n135.43\n\n\n\nAs expected, the benchmark proves that when targeting the HPy/CPython ABI, HPy\ndoesn't impose any performance penalty on CPython. The universal version is\n~10% slower on CPython, but gives an impressive 3x speedup on PyPy! It is\nworth noting that the PyPy hpy module is not fully optimized yet, and we\nexpect to be able to reach the same performance as CPython for this particular\nexample (or even more, thanks to our better GC).\nAll in all, not a bad result for two weeks of intense hacking :)\nIt is also worth noting that PyPy's builtin json module does really\nwell in this benchmark, thanks to the recent optimizations that were described\nin an earlier blog post.\n\n\nConclusion and future directions\nWe think we can be very satisfied about what we have got so far. The\ndevelopment of HPy is quite new, but these early results seem to indicate that\nwe are on the right track to bring Python extensions into the future.\nAt the moment, we can anticipate some of the next steps in the development of\nHPy:\n\n\nThink about a proper API design: what we have done so far has\nbeen a \"dumb\" translation of the API we needed to run ujson. However,\none of the declared goals of HPy is to improve the design of the API. There\nwill be a trade-off between the desire of having a clean, fresh new API\nand the need to be not too different than the old one, to make porting\neasier. 
Finding the sweet spot will not be easy!\nImplement the \"debug\" mode, which will help developers to find\nbugs such as leaking handles or using invalid handles.\nInstruct Cython to emit HPy code on request.\nEventually, we will also want to try to port parts of numpy to HPy to\nfinally solve the long-standing problem of sub-optimal numpy\nperformance in PyPy.\n\n\nStay tuned!", + "tags": "", + "url": "https://www.pypy.org/posts/2019/12/hpy-kick-off-sprint-report-1840829336092490938.html" + }, + { + "title": "PyPy v7.2 released", + "text": "The PyPy team is proud to release the version 7.2.0 of PyPy, which includes\ntwo different interpreters:\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7 including the stdlib for CPython 2.7.13\n\n\nPyPy3.6: which is an interpreter supporting the syntax and the features of\nPython 3.6, including the stdlib for CPython 3.6.9.\n\n\n\n\n\nThe interpreters are based on much the same codebase, thus the double\nrelease.\n\n\nAs always, this release is 100% compatible with the previous one and fixed\nseveral issues and bugs raised by the growing community of PyPy users.\nWe strongly recommend updating. Many of the fixes are the direct result of\nend-user bug reports, so please continue reporting issues as they crop up.\n\n\nYou can download the v7.2 releases here:\n\n\n\nhttps://pypy.org/download.html\n\nWith the support of Arm Holdings Ltd. and Crossbar.io, this release supports\nthe 64-bit aarch64 ARM architecture. More about the work and the\nperformance data around this welcome development can be found in the blog\npost.\n\n\nThis release removes the \u201cbeta\u201d tag from PyPy3.6. 
While there may still be some\nsmall corner-case incompatibilities (around the exact error messages in\nexceptions and the handling of faulty codec errorhandlers) we are happy with\nthe quality of the 3.6 series and are looking forward to working on a Python\n3.7 interpreter.\n\n\nWe updated our benchmark runner at https://speed.pypy.org to a more modern\nmachine and updated the baseline python to CPython 2.7.11. Thanks to Baroque\nSoftware for maintaining the benchmark runner.\n\n\nThe CFFI-based _ssl module was backported to PyPy2.7 and updated to use\ncryptography version 2.7. Additionally, the _hashlib, and crypt (or\n_crypt on Python3) modules were converted to CFFI. This has two\nconsequences: end users and packagers can more easily update these libraries\nfor their platform by executing (cd lib_pypy; ../bin/pypy _*_build.py).\nMore significantly, since PyPy itself links to fewer system shared objects\n(DLLs), on platforms with a single runtime namespace like linux, different CFFI\nand c-extension modules can load different versions of the same shared object\ninto PyPy without collision (issue 2617).\n\n\nUntil downstream providers begin to distribute c-extension builds with PyPy, we\nhave made packages for some common packages available as wheels.\n\n\nThe CFFI backend has been updated to version 1.13.0. We recommend using CFFI\nrather than c-extensions to interact with C, and cppyy for interacting with\nC++ code.\n\n\nThanks to Anvil, we revived the PyPy Sandbox, (soon to be released) which allows total control\nover a Python interpreter\u2019s interactions with the external world.\n\n\nWe implemented a new JSON decoder that is much faster, uses less memory, and\nuses a JIT-friendly specialized dictionary. More about that in the recent blog post\n\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. 
If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work.\n\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non PyPy, or general help with making RPython\u2019s JIT even better. Since the\nprevious release, we have accepted contributions from 27 new contributors,\nso thanks for pitching in.\n\n\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7, 3.6. It\u2019s fast (PyPy and CPython 2.7.x performance\ncomparison) due to its integrated tracing JIT compiler.\n\n\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\n\n\nThis PyPy release supports:\n\nx86 machines on most common operating systems\n(Linux 32/64 bit, Mac OS X 64-bit, Windows 32-bit, OpenBSD, FreeBSD)\n\n\n\n\nbig- and little-endian variants of PPC64 running Linux \n\n\n\n\ns390x running Linux\n\n\n\n\n64-bit ARM machines running Linux\n\n\n\n\n\nUnfortunately at the moment of writing our ARM buildbots are out of service,\nso for now we are not releasing any binary for the ARM architecture (32-bit), although PyPy does support ARM 32-bit processors.\n\n\n\nWhat else is new?\nPyPy 7.1 was released in March, 2019.\nThere are many incremental improvements to RPython and PyPy, For more information about the 7.2.0 release, see the full changelog.\n\nPlease update, and continue to help us make PyPy better.\n\nCheers,\nThe PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2019/10/pypy-v72-released-1090406556726313495.html" + }, + { + "title": "PyPy's new JSON parser", + "text": "Introduction\nIn the last year or two I have worked on and off on making PyPy's\nJSON faster, particularly when parsing large\nJSON files. 
In this post I am going to document those techniques and\nmeasure their performance impact. Note that I am quite a lot more\nconstrained in what optimizations I can apply here, compared to some of\nthe much more advanced approaches like\nMison,\nSparser or\nSimdJSON because I don't want to\nchange the json.loads API that Python programs expect, and because I\ndon't want to only support CPUs with wide SIMD extensions. With a more\nexpressive API, more optimizations would be possible.\nThere are a number of problems of working with huge JSON files:\ndeserialization takes a long time on the one hand, and the resulting\ndata structures often take a lot of memory (usually they can be many\ntimes bigger than the size of the file they originated from). Of course\nthese problems are related, because allocating and initializing a big\ndata structure takes longer than a smaller data structure. Therefore I\nalways tried to attack both of these problems at the same time.\nOne common theme of the techniques I am describing is that of optimizing\nthe parser for how JSON files are typically used, not how they could\ntheoretically be used. This is a similar approach to the way dynamic\nlanguages are optimized more generally: most JITs will optimize for\ntypical patterns of usage, at the cost of less common usage patterns,\nwhich might even become slower as a result of the optimizations.\n\nMaps\nThe first technique I investigated is to use maps in the JSON parser.\nMaps, also called hidden classes or shapes, are a fairly common way to\n(generally, not just in the context of JSON parsing) optimize instances\nof\nclasses\nin dynamic language VMs. Maps exploit the fact that while it is in\ntheory possible to add arbitrary fields to an instance, in practice most\ninstances of a class are going to have the same set of fields (or one of\na small number of different sets). 
Since JSON dictionaries or objects\noften come from serialized instances of some kind, this property often\nholds in JSON files as well: dictionaries often have the same fields in\nthe same order, within a JSON file.\nThis property can be exploited in two ways: on the one hand, it can be\nused to again store the deserialized dictionaries in a more memory\nefficient way by not using a hashmap in most cases, but instead\nsplitting the dictionary into a shared description of the set of keys\n(the map) and an array of storage with the values. This makes the\ndeserialized dictionaries smaller if the same set of keys is repeated a\nlot. This is completely transparent to the Python programmer, the\ndictionary will look completely normal to the Python program but its\ninternal representation is different.\nOne downside of using maps is that sometimes files will contain many\ndictionaries that have unique key sets. Since maps themselves are quite\nlarge data structures and since dictionaries that use maps contain an\nextra level of indirection we want to fall back to using normal hashmaps\nto represent the dictionaries where that is the case. To prevent this we\nperform some statistics at runtime, how often every map (i.e. set of\nkeys) is used in the file. For uncommonly used maps, the map is\ndiscarded and the dictionaries that used the map converted into using a\nregular hashmap.\n\nUsing Maps to Speed up Parsing\nAnother benefit of using maps to store deserialized dictionaries is that\nwe can use them to speed up the parsing process itself. To see how this\nworks, we need to understand maps a bit better. All the maps produced as\na side-effect of parsing JSON form a tree. The tree root is a map that\ndescribes the object without any attributes. 
From every tree node we\nhave a number of edges going to other nodes, each edge for a specific\nnew attribute added:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n \n \n \n \n\n\n\n \n\n\n\n\n \n \n \n \n \n\n\n\n \n \n \n \n\n\n\n \n \n \n \n \n \n \n\n\n\n \n \n \n \n \n \n \n\n\n\n\n\n\n\n\n \n \n \n \n \n\n\n \n \n \n \n \n\n\n \n \n \n \n \n\n\n\n \n\n\n\n \n \n \n \n\n\n\n\n\n\n \n \n \n \n \n\n\n \n \n \n \n \n\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n\n\nThis map tree is the result of parsing a file that has dictionaries with\nthe keys a, b, c many times, the keys a, b, f less often, and also some\nobjects with the keys x, y.\nWhen parsing a dictionary we traverse this tree from the root, according\nto the keys that we see in the input file. While doing this, we\npotentially add new nodes, if we get key combinations that we have never\nseen before. The set of keys of a dictionary parsed so far are\nrepresented by the current tree node, while we can store the values into\nan array. We can use the tree of nodes to speed up parsing. A lot of the\nnodes only have one child, because after reading the first few keys of\nan object, the remaining ones are often uniquely determined in a given\nfile. If we have only one child map node, we can speculatively parse the\nnext key by doing a memcmp between the key that the map tree says is\nlikely to come next and the characters that follow the ',' that started\nthe next entry in the dictionary. 
If the memcmp returns true this\nmeans that the speculation paid off, and we can transition to the new map\nthat the edge points to, and parse the corresponding value. If not, we\nfall back to general code that parses the string, handles escaping rules\netc. This trick was explained to me by some V8 engineers, the same trick\nis supposedly used as part of the V8 JSON parser.\nThis scheme doesn't immediately work for map tree nodes that have more\nthan one child. However, since we keep statistics anyway about how often\neach map is used as the map of a parsed dictionary, we can speculate\nthat the most common map transition is taken more often than the others\nin the future, and use that as the speculated next node.\nSo for the example transition tree shown in the figure above the key\nspeculation would succeed for objects with keys a, b, c. For objects\nwith keys a, b, f the speculation would succeed for the first two\nkeys, but not for the third key f. For objects with the keys\nx, y the speculation would fail for the first key x but succeed\nfor the second key y.\nFor real-world datasets these transition trees can become a lot more\ncomplicated, for example here is a visualization of a part of the\ntransition tree generated for parsing a New York Times dataset:\n\n\n\n\nCaching Strings\nA rather obvious observation we can use to improve performance of the\nparser is the fact that string values repeat a lot in most JSON files.\nFor strings that are used as dictionary keys this is pretty obvious.\nHowever it happens also for strings that are used as values in\ndictionaries (or are stored in lists). We can use this fact to\nintern/memoize strings and save memory. This is an approach that many\nJSON parsers use, including\nCPython's.\nTo do this, I keep a dictionary of strings that we have seen so far\nduring parsing and look up new strings that are deserialized. 
If we have\nseen the string before, we can re-use the deserialized previous string.\nRight now I only consider utf-8 strings for caching that do not contain\nany escapes (whether stuff like \\\", \\n or escaped unicode chars).\nThis simple approach works extremely well for dictionary keys, but needs\na number of improvements to be a win in general. The first observation\nis that computing the hash to look up the string in the dictionary of\nstrings we've seen so far is basically free. We can compute the hash\nwhile scanning the input for the end of the string we are currently\ndeserializing. Computing the hash while scanning doesn't increase the\ntime spent scanning much. This is not a new idea, I am sure many other\nparsers do the same thing (but CPython doesn't seem to).\nAnother improvement follows from the observation that inserting every\nsingle deserialized non-key string into a hashmap is too expensive.\nInstead, we insert strings into the cache more conservatively, by\nkeeping a small ring buffer of hashes of recently deserialized strings.\nThe hash is looked for in the ring buffer, and only if the hash is\npresent we insert the string into the memoization hashmap. This has the\neffect of only inserting strings into the memoization hashmap that\nre-occur a second time not too far into the file. This seems to give a\ngood trade-off between still re-using a lot of strings but keeping the\ntime spent updating and the size of the memoization hashmap low.\nAnother twist is that in a lot of situations caching strings is not\nuseful at all, because it will almost never succeed. Examples of this\nare UUIDs (which are unique), or the content of a tweet in a JSON file\nwith many tweets (which is usually unique). However, in the same file it\nmight be useful to cache e.g. the user name of the Twitter user, because\nmany tweets from the same person could be in such a file. 
Therefore the\nusefulness of the string cache depends on which fields of objects we are\ndeserializing the value off. Therefore we keep statistics per map field\nand disable string memoization per individual field if the cache hit\nrate falls below a certain threshold. This gives the best of both\nworlds: in the cases where string values repeat a lot in certain fields\nwe use the cache to save time and memory. But for those fields that\nmostly contain unique strings we don't waste time looking up and adding\nstrings in the memoization table. Strings outside of dictionaries are\nquite rare anyway, so we just always try to use the cache for them.\nThe following pseudocode sketches the code to deserialize a string in\nthe input at a given position. The function also takes a map, which is\nthe point in the map tree that we are currently deserializing a field\noff (if we are deserializing a string in another context, some kind of\ndummy map can be used there).\n\ndef deserialize_string(pos, input, map):\n # input is the input string, pos is the position of the starting \" of\n # the string\n\n # find end of string, check whether it contains escape codes,\n # compute hash, all at the same time\n end, escapes, hash = find_end_of_string(pos + 1, input)\n if end == -1:\n raise ParseError\n if escapes:\n # need to be much more careful with escaping\n return deserialize_string_escapes(pos, input)\n \n # should we cache at all?\n if map.cache_disabled():\n return input[pos + 1:end]\n\n # if string is in cache, return it\n if hash in cache:\n map.cache_hit += 1\n return cache[hash]\n\n result = input[pos + 1:end]\n map.cache_miss += 1\n\n # if hash is in the ring buffer of recently seen hashes,\n # add the string to the cache\n if hash in ring_buffer:\n cache[hash] = result\n else:\n ring_buffer.write(hash)\n return result\n\n\n\n\nEvaluation\nTo find out how much the various techniques help, I implemented a number\nof JSON parsers in PyPy with different combinations of the 
techniques\nenabled. I compared the numbers with the JSON parser of CPython 3.7.3\n(simplejson), with ujson, with the JSON parser of Node 12.11.1 (V8) and with\nRapidJSON (in DOM mode).\nI collected a number of medium-to-large JSON files to try the JSON\nparsers on:\n\nCensys: A subset of the Censys port and\nprotocol scan data for websites in the Alexa top million domains\nGharchive: Github activity from\nJanuary 15-23, 2015 from Github Archive\nReddit: Reddit\ncomments from May 2009\nRosie: The nested matches produced using the Rosie pattern\nlanguage all.things pattern on a log\nfile\nNytimes: Metadata of a collection of New York Times articles\nTpch: The TPC-H database benchmark's deals table as a JSON file\nTwitter: A JSON export of the @pypyproject Twitter account data\nWikidata: A file storing a subset of the Wikidata fact dump from Nov\n11, 2014\nYelp: A file of yelp\nbusinesses\n\nHere are the file sizes of the benchmarks:\n\n \n\n Benchmark\n File Size [MiB]\n \n\n \n\n Censys\n 898.45\n \n\n Gharchive\n 276.34\n \n\n NYTimes\n 12.98\n \n\n Reddit\n 931.65\n \n\n Rosie\n 388.88\n \n\n TPCH\n 173.86\n \n\n Wikidata\n 119.75\n \n\n Yelp\n 167.61\n \n\n\nI measured the times of each benchmark with a number of variations\nof the improved PyPy algorithms:\n\nPyPyBaseline: The PyPy JSON parser as it was before my work with JSON\nparsing started (PyPy version 5.8)\nPyPyKeyStringCaching: Memoizing the key strings of dictionaries, but\nnot the other strings in a json file, and not using maps to represent\ndictionaries (this is the JSON parser that PyPy has been shipping since\nversion 5.9, in the benchmarks I used 7.1).\nPyPyMapNoCache: Like PyPyKeyStringCaching, but using maps to\nrepresent dictionaries. This includes speculatively parsing the next\nkey using memcmp, but does not use string caching of non-key strings.\nPyPyFull: Like PyPyMapNoCache but uses a string cache for all\nstrings, not just keys. 
This is equivalent to what will be released soon as part of PyPy 7.2\n\nIn addition to wall clock time of parsing, I also measured the increase\nin memory use of each implementation after the input string has been\ndeserialized, i.e. the size of the in-memory representation of every\nJSON file.\n\n\nContributions of Individual Optimizations\nLet's first look at the contributions of the individual optimizations to the\noverall performance and memory usage.\n\n\n\n\nAll the benchmarks were run 30 times in new processes, all the numbers are\nnormalized to PyPyFull.\nThe biggest individual improvement to both parsing time and memory used comes\nfrom caching just the keys in parsed dictionaries. This is the optimization in\nPyPy's JSON parser that has been implemented for a while already. To understand\nwhy this optimization is so useful, let's look at some numbers about each\nbenchmark, namely the number of total keys across all dictionaries in each\nfile, as well as the number of unique keys. As we can see, for all benchmarks\nthe number of unique keys is significantly smaller than the number of keys in\ntotal.\n\n \n\n Benchmark\n Number of keys\n Number of unique keys\n \n\n \n\n Censys\n 14\u2009404\u2009234\n 163\n \n\n Gharchive\n 6\u2009637\u2009881\n 169\n \n\n NYTimes\n 417\u2009337\n 60\n \n\n Reddit\n 25\u2009226\u2009397\n 21\n \n\n Rosie\n 28\u2009500\u2009101\n 5\n \n\n TPCH\n 6\u2009700\u2009000\n 45\n \n\n Wikidata\n 6\u2009235\u2009088\n 1\u2009602\n \n\n Yelp\n 5\u2009133\u2009914\n 61\n \n\n\nThe next big jump in deserialization time and memory comes from introducing\nmaps to represent deserialized dictionaries. With PyPyMapNoCache\ndeserialization time goes down because it's much cheaper to walk the tree\nof maps and store all deserialized objects into an array of values than to\nbuild hashmaps with the same keys again and again. 
Memory use goes down\nfor the same reason: it takes a lot less memory to store the shared\nstructure of each set of keys in the map, as opposed to repeating it again\nand again in every hashmap.\nWe can look at some numbers about every benchmark again. The table shows how\nmany map-based dictionaries are deserialized for every benchmark, and how many\nhashmap-backed dictionaries. We see that the number of hashmap-backed\ndictionaries is often zero, or at most a small percentage of all dictionaries\nin each benchmark. Yelp has the biggest number of hashmap-backed dictionaries.\nThe reason for this is that the input file contains hashmaps that store\ncombinations of various features of Yelp businesses, and a lot of these\ncombinations are totally unique to a business. Therefore the heuristics\ndetermine that it's better to store these using hashmaps.\n\n \n \n Benchmark\n Map Dicts\n Regular Dicts\n % Regular Dicts\n \n \n \n \n Censys\n 4\u2009049\u2009235\n 1\u2009042\n 0.03\n \n \n Gharchive\n 955\u2009301\n 0\n 0.00\n \n \n NYTimes\n 80\u2009393\n 0\n 0.00\n \n \n Reddit\n 1\u2009201\u2009257\n 0\n 0.00\n \n \n Rosie\n 6\u2009248\u2009966\n 0\n 0.00\n \n \n TPCH\n 1\u2009000\u2009000\n 0\n 0.00\n \n \n Wikidata\n 1\u2009923\u2009460\n 46\u2009905\n 2.38\n \n \n Yelp\n 443\u2009140\n 52\u2009051\n 10.51\n \n \n\n\nWe can also look at numbers about how often the memcmp-based speculative\nparsing of the next key of a given map succeeds. 
Looking at statistics\nabout each benchmark, we can see that the speculation of what key we\nexpect next pays off in a significant percentage of cases, between 63% for\nWikidata where the dictionary structures are quite irregular, and 99% for\nReddit, where all the dictionaries have the same set of keys.\n\n \n\n Benchmark\n Number of Keys\n Map Transitions\n % Successful Speculation\n \n\n \n\n Censys\n 14\u2009404\u2009234\n 14\u2009403\u2009243\n 65.79\n \n\n Gharchive\n 6\u2009637\u2009881\n 6\u2009637\u2009881\n 86.71\n \n\n NYTimes\n 417\u2009337\n 417\u2009337\n 79.85\n \n\n Reddit\n 25\u2009226\u2009397\n 25\u2009226\u2009397\n 100.00\n \n\n Rosie\n 28\u2009500\u2009101\n 28\u2009500\u2009101\n 90.37\n \n\n TPCH\n 6\u2009700\u2009000\n 6\u2009700\u2009000\n 86.57\n \n\n Wikidata\n 6\u2009235\u2009088\n 5\u2009267\u2009744\n 63.68\n \n\n Yelp\n 5\u2009133\u2009914\n 4\u2009593\u2009980\n 90.43\n \n\n geomean\n \n \n 82.04\n \n\n\nGeneral string caching is the most unclear optimization. On the one hand its\nimpact on memory usage is quite substantial, leading to a 20% reduction for\nGharchive and Reddit, up to a 2\u00d7 improvement for Yelp. On the other hand, the\neffect on performance is less clear, since it even leads to a slowdown in\nGharchive and Reddit, and generally only a small improvement. Choosing the\nright heuristic for when to disable the cache also has somewhat unclear effects\nand is definitely a topic worthy of further investigation.\n\nComparison against other JSON Decoders\nTo get a more general feeling of the performance and memory usage of the\nimproved PyPy parser, we compare it against CPython's built-in json\nparser, ujson for CPython, Node's (V8) JSON parser and RapidJSON. For\nbetter context for the memory usage I also show the file size of the input\nfiles.\nThese benchmarks are not really an apples-to-apple comparison. 
All of the\nimplementations use different in-memory representations of strings in\nthe deserialized data-structure (Node uses two bytes per character in\na string, in CPython it\ndepends but 4 bytes on my\nmachine), PyPyBaseline uses four bytes, PyPy and RapidJSON use utf-8). But\nit's still interesting to get some ballpark numbers. The results are as\nfollows:\n\n\n\n\nAs we can see, PyPyFull handily beats CPython and ujson, with a geometric\nmean of the improvement of about 2.5\u00d7. The memory improvement can be even\nmore extreme, with an improvement of over 4\u00d7 against CPython/ujson in some\ncases (CPython gives better memory sizes, because its parser caches the\nkeys of dictionaries as well). Node is often more than 50% slower, whereas\nRapidJSON beats us easily, by a factor of 2\u00d7 on average.\n\nConclusions\nWhile the speedup I managed to achieve over the course of this project is\nnice and I am certainly happy to beat both CPython and Node, I am\nultimately still annoyed that RapidJSON manages to maintain such a clear\nlead over PyPyFull, and would like to get closer to it. One problem that\nPyPy suffers compared to RapidJSON is the overhead of garbage collection.\nDeserializing large JSON files is pretty much the worst case for the\ngenerational GC that PyPy uses, since none of the deserialized objects die\nyoung (and the GC expects that most objects do). That means that a lot of\nthe deserialization time of PyPy is wasted allocating the resulting\nobjects in the nursery, and then copying them into the old generation.\nSomehow, this should be done in better ways, but all my attempts to not\nhave to do the copy did not seem to help much. So maybe more improvements\nare possible, if I can come up with more ideas.\nOn the memory side of things, Node/V8 is beating PyPy clearly which might\nindicate more general problems in how we represent Python objects in\nmemory. 
On the other hand, I think it's cool that we are competitive with\nRapidJSON in terms of memory and often within 2\u00d7 of the file size.\nAn effect that I didn't consider at all in this blog post is the fact that\naccessing the deserialized objects with constant strings is also faster\nthan with regular dictionaries, due to them being represented with maps.\nMore benchmarking work to do in the future!\nIf you have your own programs that run on PyPy and use the json parser\na lot, please measure them on the new code and let me know whether you see\nany difference!", + "tags": "", + "url": "https://www.pypy.org/posts/2019/10/pypys-new-json-parser-492911724084305501.html" + }, + { + "title": "A second life for the Sandbox", + "text": "Hi all,\n\nAnvil is a UK-based company sponsoring one month of work to revive PyPy's\n\"sandbox\" mode and upgrade it to PyPy3. Thanks to them, sandboxing will be\ngiven a second life!\n\nThe sandboxed PyPy is a special version of PyPy that runs\nfully isolated. It gives a safe way to execute arbitrary Python\nprograms (whole programs, not small bits of code inside your larger Python\nprogram). Such scripts can be fully untrusted, and they can try to do\nanything\u2014there are no syntax-based restrictions, for example\u2014but whatever\nthey do, any communication with the external world is not actually done but\ndelegated to the parent process. This is similar but much more flexible than\nLinux's Seccomp approach, and it is more lightweight than setting up a full\nvirtual machine. It also works without operating system support.\n\nHowever, during the course of the years the sandbox mode of PyPy has been\nmostly unmaintained and unsupported by the core developers, mostly because of\na lack of interest by users and because it took too much effort to maintain\nit.\n\nNow we have found that we have an actual user, Anvil. As far as I can tell\nthey are still using a very old version of PyPy, the last one that supported\nsandboxing. 
This is where this contract comes from: the goal is to modernize sandboxing and port it to PyPy3.\n\nPart of my motivation for accepting this work is that I may have found a way to\ntweak the protocol on the pipe between the sandboxed PyPy and the parent\ncontroller process. This should make the sandboxed PyPy more resilient against\nfuture developments and easier to maintain; at most, in the future some tweaks will be needed in the\ncontroller process but hopefully not deep inside the guts of the sandboxed\nPyPy. Among the advantages, such a more robust solution should mean that we\ncan actually get a working sandboxed PyPy\u2014or sandboxed PyPy3 or sandboxed\nversion of any other interpreter written in RPython\u2014with just an extra\nargument when calling rpython to translate this interpreter. If everything\nworks as planned, sandboxing may be given a second life.\n\nArmin Rigo", + "tags": "", + "url": "https://www.pypy.org/posts/2019/08/a-second-life-for-sandbox-6848726729476245390.html" + }, + { + "title": "PyPy JIT for Aarch64", + "text": "Hello everyone.\nWe are pleased to announce the availability of the new PyPy for AArch64. This\nport brings PyPy's high-performance just-in-time compiler to the AArch64\nplatform, also known as 64-bit ARM. With the addition of AArch64, PyPy now\nsupports a total of 6 architectures: x86 (32 & 64bit), ARM (32 & 64bit), PPC64,\nand s390x. The AArch64 work was funded by ARM Holdings Ltd. and Crossbar.io.\nPyPy has a good record of boosting the performance of Python programs on the\nexisting platforms. To show how well the new PyPy port performs, we compare the\nperformance of PyPy against CPython on a set of benchmarks. As a point of\ncomparison, we include the results of PyPy on x86_64.\nNote, however, that the results presented here were measured on a Graviton A1\nmachine from AWS, which comes with a very serious word of warning: Graviton A1's\nare virtual machines, and, as such, they are not suitable for benchmarking. 
If\nsomeone has access to a beefy enough (16G) ARM64 server and is willing to give\nus access to it, we are happy to redo the benchmarks on a real machine. One\nmajor concern is that while a virtual CPU is 1-to-1 with a real CPU, it is not\nclear to us how CPU caches are shared across virtual CPUs. Also, note that by no\nmeans is this benchmark suite representative enough to average the results. Read\nthe numbers individually per benchmark.\nThe following graph shows the speedups on AArch64 of PyPy (hg id 2417f925ce94) compared to\nCPython (2.7.15), as well as the speedups on a x86_64 Linux laptop\ncomparing the most recent release, PyPy 7.1.1, to CPython 2.7.16.\n\n\n\nIn the majority of benchmarks, the speedups achieved on AArch64 match those\nachieved on the x86_64 laptop. Over CPython, PyPy on AArch64 achieves speedups\nbetween 0.6x to 44.9x. These speedups are comparable to x86_64, where the\nnumbers are between 0.6x and 58.9x.\nThe next graph compares between the speedups achieved on AArch64 to the speedups\nachieved on x86_64, i.e., how great the speedup is on AArch64 vs. the same\nbenchmark on x86_64. This comparison should give a rough idea about the\nquality of the generated code for the new platform.\n\n\n\nNote that we see a large variance: There are generally three groups of\nbenchmarks - those that run at more or less the same speed, those that\nrun at 2x the speed, and those that run at 0.5x the speed of x86_64.\nThe variance and disparity are likely related to a variety of issues, mostly due\nto differences in architecture. What is however interesting is that, compared\nto measurements performed on older ARM boards, the branch predictor on the\nGraviton A1 machine appears to have improved. As a result, the speedups achieved\nby PyPy over CPython are smaller than on older ARM boards: sufficiently branchy\ncode, like CPython itself, simply runs a lot faster. 
Hence, the advantage\nof the non-branchy code generated by PyPy's just-in-time compiler is smaller.\nOne takeaway here is that many possible improvements for PyPy have yet to be\nimplemented. This is true for both of the above platforms, but probably more so\nfor AArch64, which comes with a large number of CPU registers. The PyPy backend\nwas written with x86 (the 32-bit variant) in mind, which has a really low number\nof registers. We think that we can improve in the area of emitting more modern\nmachine code, which may have a higher impact on AArch64 than on x86_64. There is\nalso a number of missing features in the AArch64 backend. These features are\ncurrently implemented as expensive function calls instead of inlined native\ninstructions, something we intend to improve.\nBest,\nMaciej Fijalkowski, Armin Rigo and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.html" + }, + { + "title": "PyPy 7.1.1 Bug Fix Release", + "text": "The PyPy team is proud to release a bug-fix release version 7.1.1 of PyPy, which\nincludes two different interpreters:\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.\nPyPy3.6-beta: the second official release of PyPy to support 3.6\nfeatures.\n\n\n\n\n\nThe interpreters are based on much the same codebase, thus the double\nrelease.\n\n\nThis bugfix fixes bugs related to large lists, dictionaries, and sets, some corner cases with unicode, and PEP 3118 memory views of ctype structures. It also fixes a few issues related to the ARM 32-bit backend. 
For the complete list see the changelog.\n\nYou can download the v7.1.1 releases here:\n\n\n\nhttps://pypy.org/download.html\n\n\nAs always, this release is 100% compatible with the previous one and fixed\nseveral issues and bugs raised by the growing community of PyPy users.\nWe strongly recommend updating.\n\nThe PyPy3.6 release is rapidly maturing, but is still considered beta-quality.\n\nThe PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2019/04/pypy-711-bug-fix-release-6539023630991217367.html" + }, + { + "title": "An RPython JIT for LPegs", + "text": "The following is a guest post by Stefan Troost, he describes the work he did in his bachelor thesis:\n\nIn this project we have used the RPython infrastructure to generate an RPython\nJIT for a\nless-typical use-case: string pattern matching. The work in this project is\nbased on Parsing Expression Grammars and\nLPeg, an implementation of PEGs\ndesigned to be used in Lua. In this post I will showcase some of the work that\nwent into this project, explain PEGs in general and LPeg in particular, and\nshow some benchmarking results.\nParsing Expression Grammars\nParsing Expression Grammas (PEGs) are a type of formal grammar similar to\ncontext-free grammars, with the main difference being that they are unambiguous.\nThis is achieved by redefining the ambiguous choice operator of CFGs (usually\nnoted as |) as an ordered choice operator. In practice this means that if a\nrule in a PEG presents a choice, a PEG parser should prioritize the leftmost\nchoice. Practical uses include parsing and pattern-searching. In comparison to\nregular expressions PEGs stand out as being able to be parsed in linear time,\nbeing strictly more powerful than REs, as well as being arguably more readable.\nLPeg\nLPeg is an implementation of PEGs written in C to be used in the Lua\nprogramming language. 
A crucial detail of this implementation is that it parses\nhigh level function calls, translating them to bytecode, and interpreting that\nbytecode. Therefore, we are able to improve that implementation by replacing\nLPegs C-interpreter with an RPython JIT. I use a modified version of LPeg to\nparse PEGs and pass the generated Intermediate Representation, the LPeg\nbytecode, to my VM.\nThe LPeg Library\nThe LPeg Interpreter executes bytecodes created by parsing a string of commands\nusing the LPeg library. Our JIT supports a subset of the LPeg library, with\nsome of the more advanced or obscure features being left out. Note that this\nsubset is still powerful enough to do things like parse JSON.\n\n\n\nOperator\nDescription\n\n\n\n\nlpeg.P(string)\nMatches string literally\n\n\nlpeg.P(n)\nMatches exactly n characters\n\n\nlpeg.P(-n)\nMatches at most n characters\n\n\nlpeg.S(string)\nMatches any character in string (Set)\n\n\nlpeg.R(\u201cxy\u201d)\nMatches any character between x and y (Range)\n\n\npatternn\nMatches at least n repetitions of pattern\n\n\npattern-n\nMatches at most n repetitions of pattern\n\n\npattern1 * pattern2\nMatches pattern1 followed by pattern2\n\n\npattern1 + pattern2\nMatches pattern1 or pattern2 (ordered choice)\n\n\npattern1 - pattern2\nMatches pattern1 if pattern2 does not match\n\n\n-pattern\nEquivalent to (\"\" - pattern)\n\n\n\nAs a simple example, the pattern lpeg.P\"ab\"+lpeg.P\"cd\" would match either the\nstring ab or the string cd.\nTo extract semantic information from a pattern, captures are needed. 
These are\nthe following operations supported for capture creation.\n\n\n\nOperation\nWhat it produces\n\n\n\n\nlpeg.C(pattern)\nthe match for patten plus all captures made by pattern\n\n\nlpeg.Cp()\nthe current position (matches the empty string)\n\n\n\n(tables taken from the LPeg documentation)\nThese patterns are translated into bytecode by LPeg, at which point we are able\nto pass them into our own VM.\nThe VM\nThe state of the VM at any point is defined by the following variables:\n\nPC: program counter indicating the current instruction\nfail: an indicator that some match failed and the VM must backtrack\nindex: counter indicating the current character of the input string\nstackentries: stack of return addresses and choice points\ncaptures: stack of capture objects\n\nThe execution of bytecode manipulates the values of these variables in order to\nproduce some output. How that works and what that output looks like will be\nexplained now.\nThe Bytecode\nFor simplicity\u2019s sake I will not go over every individual bytecode, but instead\nchoose some that exemplify the core concepts of the bytecode set.\ngeneric character matching bytecodes\n\n\nany: Checks if there\u2019s any characters left in the inputstring. If it succeeds\nit advances the index and PC by 1, if not the bytecode fails.\n\n\nchar c: Checks if there is another bytecode in the input and if that\ncharacter is equal to c. Otherwise the bytecode fails.\n\n\nset c1-c2: Checks if there is another bytecode in the input and if that\ncharacter is between (including) c1 and c2. Otherwise the bytecode fails.\n\n\nThese bytecodes are the easiest to understand with very little impact on the\nVM. 
What it means for a bytecode to fail will be explained when\nwe get to control flow bytecodes.\nTo get back to the example, the first half of the pattern lpeg.P\"ab\" could be\ncompiled to the following bytecodes:\nchar a\nchar b\n\ncontrol flow bytecodes\n\n\njmp n: Sets PC to n, effectively jumping to the n\u2019th bytecode. Has no defined\nfailure case.\n\n\ntestchar c n: This is a lookahead bytecode. If the current character is equal\nto c it advances the PC but not the index. Otherwise it jumps to n.\n\n\ncall n: Puts a return address (the current PC + 1) on the stackentries stack\nand sets the PC to n. Has no defined failure case.\n\n\nret: Opposite of call. Removes the top value of the stackentries stack (if\nthe string of bytecodes is valid this will always be a return address) and\nsets the PC to the removed value. Has no defined failure case.\n\n\nchoice n: Puts a choice point on the stackentries stack. Has no defined\nfailure case.\n\n\ncommit n: Removes the top value of the stackentries stack (if the string of\nbytecodes is valid this will always be a choice point) and jumps to n. Has no\ndefined failure case.\n\n\nUsing testchar we can implement the full pattern lpeg.P\"ab\"+lpeg.P\"cd\" with\nbytecode as follows:\ntestchar a -> L1\nany\nchar b\nend\nany\nL1: char c\nchar d\nend\n\nThe any bytecode is needed because testchar does not consume a character\nfrom the input.\nFailure Handling, Backtracking and Choice Points\nA choice point consist of the VM\u2019s current index and capturestack as well as a\nPC. This is not the VM\u2019s PC at the time of creating the\nchoicepoint, but rather the PC where we should continue trying to find\nmatches when a failure occurs later.\nNow that we have talked about choice points, we can talk about how the VM\nbehaves in the fail state. If the VM is in the fail state, it removed entries\nfrom the stackentries stack until it finds a choice point. 
Then it backtracks\nby restoring the VM to the state defined by the choice point. If no choice\npoint is found this way, no match was found in the string and the VM halts.\nUsing choice points we could implement the example lpeg.P\"ab\" + lpeg.P\"cd\" in\nbytecodes in a different way (LPEG uses the simpler way shown above, but for\nmore complex patterns it can\u2019t use the lookahead solution using testchar):\nchoice L1\nchar a\nchar b\ncommit\nend\nL1: char c\nchar d\nend\n\nCaptures\nSome patterns require the VM to produce more output than just \u201cthe pattern\nmatched\u201d or \u201cthe pattern did not match\u201d. Imagine searching a document for an\nIPv4 address and all your program responded was \u201cI found one\u201d. In order to\nrecieve additional information about our inputstring, captures are used.\nThe capture object\nIn my VM, two types of capture objects are supported, one of them being the\nposition capture. It consists of a single index referencing the point in the\ninputstring where the object was created.\nThe other type of capture object is called simplecapture. It consists of an\nindex and a size value, which are used to reference a substring of the\ninputstring. In addition, simplecaptures have a variable status indicating they\nare either open or full. 
If a simplecapture object is open, that means that its\nsize is not yet determined, since the pattern we are capturing is of variable\nlength.\nCapture objects are created using the following bytecodes:\n\n\nFullcapture Position: Pushes a positioncapture object with the current index\nvalue to the capture stack.\n\n\nFullcapture Simple n: Pushes a simplecapture object with current index value\nand size=n to the capture stack.\n\n\nOpencapture Simple: Pushes an open simplecapture object with current index\nvalue and undetermined size to the capture stack.\n\n\nclosecapture: Sets the top element of the capturestack to full and sets its\nsize value using the difference between the current index and the index of\nthe capture object.\n\n\nThe RPython Implementation\nThese, and many more bytecodes were implemented in an RPython-interpreter.\nBy adding jit hints, we were able to generate an efficient JIT.\nWe will now take a closer look at some implementations of bytecodes.\n...\n elif instruction.name == \"any\":\n if index >= len(inputstring):\n fail = True\n else:\n pc += 1\n index += 1\n\n...\n\nThe code for the any-bytecode is relatively straight-forward. It either\nadvances the pc and index or sets the VM into the fail state,\ndepending on whether the end of the inputstring has been reached or not.\n...\n if instruction.name == \"char\":\n if index >= len(inputstring):\n fail = True\n elif instruction.character == inputstring[index]:\n pc += 1\n index += 1\n else:\n fail = True\n...\n\nThe char-bytecode also looks as one would expect. If the VM\u2019s string index is\nout of range or the character comparison fails, the VM is put into the\nfail state, otherwise the pc and index are advanced by 1. 
As you can see, the\ncharacter we\u2019re comparing the current inputstring to is stored in the\ninstruction object (note that this code-example has been simplified for\nclarity, since the actual implementation includes a jit-optimization that\nallows the VM to execute multiple successive char-bytecodes at once).\n...\n elif instruction.name == \"jmp\":\n pc = instruction.goto\n...\n\nThe jmp-bytecode comes with a goto value which is a pc that we want\nexecution to continue at.\n...\n elif instruction.name == \"choice\":\n pc += 1\n choice_points = choice_points.push_choice_point(\n instruction.goto, index, captures)\n...\n\nAs we can see here, the choice-bytecode puts a choice point onto the stack that\nmay be backtracked to if the VM is in the fail-state. This choice point\nconsists of a pc to jump to which is determined by the bytecode.\nBut it also includes the current index and captures values at the time the choice\npoint was created. An ongoing topic of jit optimization is which data structure\nis best suited to store choice points and return addresses. Besides naive\nimplementations of stacks and single-linked lists, more case-specific\nstructures are also being tested for performance.\nBenchmarking Result\nIn order to find out how much it helps to JIT LPeg patterns we ran a small\nnumber of benchmarks. We used an otherwise idle Intel Core i5-2430M CPU with\n3072 KiB of cache and 8 GiB of RAM, running with 2.40GHz. The machine was\nrunning Ubuntu 14.04 LTS, Lua 5.2.3 and we used GNU grep 2.16 as a point of\ncomparison for one of the benchmarks. The benchmarks were run 100 times in\na new process each. We measured the full runtime of the called process,\nincluding starting the process.\nNow we will take a look at some plots generated by measuring the runtime of\ndifferent iterations of my JIT compared to lua and using bootstrapping to\ngenerate a sampling distribution of mean values. 
The plots contain a few different\nvariants of pypeg, only the one called \"fullops\" is important for this blog post, however.\n\n\n\nThis is the plot for a search pattern that searches a text file for valid URLs.\nAs we can see, if the input file is as small as 100 kb, the benefits of JIT\noptimizations do not outweigh the time required to generate the\nmachine code. As a result, all of our attempts perform significantly slower\nthan LPeg.\n\n\n\nThis is the plot for the same search pattern on a larger input file. As we can\nsee, for input files as small as 500 kb our VM already outperforms LPeg\u2019s. An\nongoing goal of continued development is to get this lower boundary as small as\npossible.\n\n\n\nThe benefits of a JIT compared to an Interpreter become more and more relevant\nfor larger input files. Searching a file as large as 5 MB makes this fairly\nobvious and is exactly the behavior we expect.\n\n\n\nThis time we are looking at a different more complicated pattern, one that parses JSON used on a\n50 kb input file. As expected, LPeg outperforms us, however, something\nunexpected happens as we increase the filesize.\n\n\n\nSince LPeg has a defined maximum depth of 400 for the choicepoints and\nreturnaddresses Stack, LPeg by default refuses to parse files as small as\n100kb. This raises the question if LPeg was intended to be used for parsing.\nUntil a way to increase LPeg\u2019s maximum stack depth is found, no comparisons to\nLPeg can be performed at this scale. This has been a low priority in the past\nbut may be addressed in the future.\nTo conclude, we see that at sufficiently high filesizes, our JIT outperforms\nthe native LPeg-interpreter. 
This lower boundary is currently as low as 100kb\nin filesize.\nConclusion\nWriting a JIT for PEG\u2019s has proven itself to be a challenge worth pursuing, as\nthe expected benefits of a JIT compared to an Interpreter have been achieved.\nFuture goals include getting LPeg to be able to use parsing patterns on larger\nfiles, further increasing the performance of our JIT and comparing it to other\nwell-known programs serving a similar purpose, like grep.\nThe prototype implementation that I described in this post can be found\non Github\n(it's a bit of a hack in some places, though).", + "tags": "", + "url": "https://www.pypy.org/posts/2019/04/an-rpython-jit-for-lpegs-4779548053359386284.html" + }, + { + "title": "PyPy v7.1 released; now uses utf-8 internally for unicode strings", + "text": "The PyPy team is proud to release version 7.1.0 of PyPy, which includes\ntwo different interpreters:\n\n\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7\nPyPy3.6-beta: this is the second official release of PyPy to support 3.6\nfeatures, although it is still considered beta quality.\n\n\n\nThe interpreters are based on much the same codebase, thus the double\nrelease.\n\n\nThis release, coming fast on the heels of 7.0 in February, finally merges the\ninternal refactoring of unicode representation as UTF-8. Removing the\nconversions from strings to unicode internally lead to a nice speed bump. We merged the utf-8 changes to the py3.5 branch (Python3.5.3) but will concentrate on 3.6 going forward.\n\n\nWe also improved the ability to use the buffer protocol with ctype structures\nand arrays.\n\n\nThe CFFI backend has been updated to version 1.12.2. We recommend using CFFI\nrather than c-extensions to interact with C, and cppyy for interacting with\nC++ code.\n\u00a0You can download the v7.1 releases here:\n\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. 
If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work.\n\n\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non pypy, or general help with making RPython\u2019s JIT even better.\n\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7, 3.6. It\u2019s fast (PyPy and CPython 2.7.x performance\ncomparison) due to its integrated tracing JIT compiler.\n\n\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\n\nThis PyPy release supports:\n\u00a0\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nbig- and little-endian variants of PPC64 running Linux\n\u00a0ARM32 although we do not supply downloadable binaries at this time\ns390x running Linux\n\n\n\nWhat else is new?\nPyPy 7.0 was released in February, 2019.\nThere are many incremental improvements to RPython and PyPy, for more information see the changelog.\n\nPlease update, and continue to help us make PyPy better.\n\n\nCheers, The PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2019/03/pypy-v71-released-now-uses-utf-8-451324088028792912.html" + }, + { + "title": "PyPy v7.0.0: triple release of 2.7, 3.5 and 3.6-alpha", + "text": "The PyPy team is proud to release the version 7.0.0 of PyPy, which includes\nthree different interpreters:\n\n\nPyPy2.7, which is an interpreter supporting the syntax and the features of\nPython 2.7\nPyPy3.5, which supports Python 3.5\nPyPy3.6-alpha: this is the first official release of PyPy to support 3.6\nfeatures, although it is still considered alpha quality.\n\n\nAll the interpreters are based on much the same codebase, thus the triple\nrelease.\nUntil we can work with downstream 
providers to distribute builds with PyPy, we\nhave made packages for some common packages available as wheels.\nThe GC hooks , which can be used to gain more insights into its\nperformance, has been improved and it is now possible to manually manage the\nGC by using a combination of gc.disable and gc.collect_step. See the\nGC blog post.\nWe updated the cffi module included in PyPy to version 1.12, and the\ncppyy backend to 1.4. Please use these to wrap your C and C++ code,\nrespectively, for a JIT friendly experience.\nAs always, this release is 100% compatible with the previous one and fixed\nseveral issues and bugs raised by the growing community of PyPy users.\nWe strongly recommend updating.\nThe PyPy3.6 release and the Windows PyPy3.5 release are still not production\nquality so your mileage may vary. There are open issues with incomplete\ncompatibility and c-extension support.\nThe utf8 branch that changes internal representation of unicode to utf8 did not\nmake it into the release, so there is still more goodness coming.\nYou can download the v7.0 releases here:\n\nhttps://pypy.org/download.html\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work.\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non pypy, or general help with making RPython's JIT even better.\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7, 3.5 and 3.6. 
It's fast (PyPy and CPython 2.7.x performance\ncomparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThe PyPy release supports:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\nUnfortunately at the moment of writing our ARM buildbots are out of service,\nso for now we are not releasing any binary for the ARM architecture.\n\n\nWhat else is new?\nPyPy 6.0 was released in April, 2018.\nThere are many incremental improvements to RPython and PyPy, the complete listing is here.\n\nPlease update, and continue to help us make PyPy better.\n\n\nCheers, The PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2019/02/pypy-v700-triple-release-of-27-35-and-606875333356156076.html" + }, + { + "title": "D\u00fcsseldorf Sprint Report 2019", + "text": "Hello everyone!\nWe are happy to report a successful and well attended sprint that is wrapping up\nin D\u00fcsseldorf, Germany. 
In the last week we had eighteen people sprinting\nat the Heinrich-Heine-Universit\u00e4t D\u00fcsseldorf on various topics.\nTotally serious work going on here constantly.\nA big\nchunk of the sprint was dedicated to various discussions, since we did not\nmanage to gather the core developers in one room in quite a while.\nDiscussion topics included:\n\nFunding and general sustainability of open source.\nCatching up with CPython 3.7/3.8 \u2013 we are planning to release 3.6 some time\nin the next few months and we will continue working on 3.7/3.8.\nWhat to do with VMprof\nHow can we support Cython inside PyPy in a way that will be understood\nby the JIT, hence fast.\nThe future of supporting the numeric stack on pypy \u2013 we have made significant\nprogress in the past few years and most of the numeric stack works out of the box,\nbut deployment and performance remain problems. Improving on those problems\nremains a very important focus for PyPy as a project.\nUsing the presence of a CPython developer (\u0141ukasz Langa) and a Graal Python developer\n(Tim Felgentreff) we discussed ways to collaborate in order to improve Python\necosystem across implementations.\nPierre-Yves David and Georges Racinet from octobus gave us an exciting demo\non Heptapod, which adds mercurial support to gitlab.\nMaciej and Armin gave demos of their current (non-PyPy-related) project VRSketch.\n\n\n\nVisiting the Landschaftspark Duisburg Nord on the break day\n\nSome highlights of the coding tasks worked on:\n\nAarch64 (ARM64) JIT backend work has been started, we are able to run the first\ntest! Tobias Oberstein from Crossbar GmbH and Rodolph Perfetta from ARM joined the\nsprint to help kickstart the project.\nThe long running math-improvements branch that was started by Stian Andreassen got merged\nafter bugfixes done by Alexander Schremmer. 
It should improve operations on large integers.\nThe arcane art of necromancy was used to revive long dormant regalloc branch started\nand nearly finished by Carl Friedrich Bolz-Tereick. The branch got merged and gives\nsome modest speedups across the board.\nAndrew Lawrence worked on MSI installer for PyPy on windows.\n\u0141ukasz worked on improving failing tests on the PyPy 3.6 branch. He knows very obscure\ndetails of CPython (e.g. how pickling works), hence we managed to progress very quickly.\nMatti Picus set up a new benchmarking server for PyPy 3 branches.\nThe Utf8 branch, which changes the internal representation of unicode might be finally\nmerged at some point very soon. We discussed and improved upon the last few\nblockers. It gives significant speedups in a lot of cases handling strings.\nZlib was missing couple methods, which were added by Ronan Lamy and Julian Berman.\nManuel Jacob fixed RevDB failures.\nAntonio Cuni and Matti Picus worked on 7.0 release which should happen in a few days.\n\nNow we are all quite exhausted, and are looking forward to catching up on sleep.\nBest regards,\nMaciej Fija\u0142kowski, Carl Friedrich Bolz-Tereick and the whole PyPy team.", + "tags": "", + "url": "https://www.pypy.org/posts/2019/02/dusseldorf-sprint-report-2019-6107623654916313905.html" + }, + { + "title": "PyPy for low-latency systems", + "text": "PyPy for low-latency systems\nRecently I have merged the gc-disable branch, introducing a couple of features\nwhich are useful when you need to respond to certain events with the lowest\npossible latency. This work has been kindly sponsored by Gambit Research\n(which, by the way, is a very cool and geeky place where to work, in case you\nare interested). 
Note also that this is a very specialized use case, so these\nfeatures might not be useful for the average PyPy user, unless you have the\nsame problems as described here.\n\nThe PyPy VM manages memory using a generational, moving Garbage Collector.\nPeriodically, the GC scans the whole heap to find unreachable objects and\nfrees the corresponding memory. Although at a first look this strategy might\nsound expensive, in practice the total cost of memory management is far less\nthan e.g. on CPython, which is based on reference counting. While maybe\ncounter-intuitive, the main advantage of a non-refcount strategy is\nthat allocation is very fast (especially compared to malloc-based allocators),\nand deallocation of objects which die young is basically for free. More\ninformation about the PyPy GC is available here.\n\nAs we said, the total cost of memory managment is less on PyPy than on\nCPython, and it's one of the reasons why PyPy is so fast. However, one big\ndisadvantage is that while on CPython the cost of memory management is spread\nall over the execution of the program, on PyPy it is concentrated into GC\nruns, causing observable pauses which interrupt the execution of the user\nprogram.\nTo avoid excessively long pauses, the PyPy GC has been using an incremental\nstrategy since 2013. The GC runs as a series of \"steps\", letting the user\nprogram to progress between each step.\n\nThe following chart shows the behavior of a real-world, long-running process:\n\n\n\n\nThe orange line shows the total memory used by the program, which\nincreases linearly while the program progresses. Every ~5 minutes, the GC\nkicks in and the memory usage drops from ~5.2GB to ~2.8GB (this ratio is controlled\nby the PYPY_GC_MAJOR_COLLECT env variable).\nThe purple line shows aggregated data about the GC timing: the whole\ncollection takes ~1400 individual steps over the course of ~1 minute: each\npoint represent the maximum time a single step took during the past 10\nseconds. 
Most steps take ~10-20 ms, although we see a horrible peak of ~100 ms\ntowards the end. We have not investigated yet what it is caused by, but we\nsuspect it is related to the deallocation of raw objects.\n\nThese multi-millesecond pauses are a problem for systems where it is important\nto respond to certain events with a latency which is both low and consistent.\nIf the GC kicks in at the wrong time, it might causes unacceptable pauses during\nthe collection cycle.\n\nLet's look again at our real-world example. This is a system which\ncontinuously monitors an external stream; when a certain event occurs, we want\nto take an action. The following chart shows the maximum time it takes to\ncomplete one of such actions, aggregated every minute:\n\n\n\n\nYou can clearly see that the baseline response time is around ~20-30\nms. However, we can also see periodic spikes around ~50-100 ms, with peaks up\nto ~350-450 ms! After a bit of investigation, we concluded that most (although\nnot all) of the spikes were caused by the GC kicking in at the wrong time.\n\nThe work I did in the gc-disable branch aims to fix this problem by\nintroducing two new features to the gc module:\n\n\ngc.disable(), which previously only inhibited the execution of\nfinalizers without actually touching the GC, now disables the GC major\ncollections. After a call to it, you will see the memory usage grow\nindefinitely.\ngc.collect_step() is a new function which you can use to manually\nexecute a single incremental GC collection step.\n\n\nIt is worth to specify that gc.disable() disables only the major\ncollections, while minor collections still runs. Moreover, thanks to the\nJIT's virtuals, many objects with a short and predictable lifetime are not\nallocated at all. 
The end result is that most objects with short lifetime are\nstill collected as usual, so the impact of gc.disable() on memory growth\nis not as bad as it could sound.\n\nCombining these two functions, it is possible to take control of the GC to\nmake sure it runs only when it is acceptable to do so. For an example of\nusage, you can look at the implementation of a custom GC inside pypytools.\nThe peculiarity is that it also defines a \"with nogc():\" context manager\nwhich you can use to mark performance-critical sections where the GC is not\nallowed to run.\n\nThe following chart compares the behavior of the default PyPy GC and the new\ncustom GC, after a careful placing of nogc() sections:\n\n\n\n\nThe yellow line is the same as before, while the purple line shows the new\nsystem: almost all spikes have gone, and the baseline performance is about 10%\nbetter. There is still one spike towards the end, but after some investigation\nwe concluded that it was not caused by the GC.\n\nNote that this does not mean that the whole program became magically\nfaster: we simply moved the GC pauses in some other place which is not\nshown in the graph: in this specific use case this technique was useful\nbecause it allowed us to shift the GC work in places where pauses are more\nacceptable.\n\nAll in all, a pretty big success, I think. 
These functionalities are already\navailable in the nightly builds of PyPy, and will be included in the next\nrelease: take this as a New Year present :)\n\nAntonio Cuni and the PyPy team", + "tags": "gc,sponsors", + "url": "https://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.html" + }, + { + "title": "PyPy Winter Sprint Feb 4-9 in D\u00fcsseldorf", + "text": "PyPy Sprint February 4th-9th 2019 in D\u00fcsseldorf\n\n\nThe next PyPy sprint will be held in the Computer Science department of Heinrich-Heine Universit\u00e4t D\u00fcsseldorf from the 4th to the 9th of February 2019 (nine years after the last sprint there). This is a fully public sprint, everyone is welcome to join us.\n\nTopics and goals\n\n\n\nimprove Python 3.6 support\ndiscuss benchmarking situation\nprogress on utf-8 branches\ncpyext performance and completeness\npackaging: are we ready to upload to PyPI?\n\nissue 2617\u00a0 - we expose too many functions from lib-pypy.so\nmanylinux2010 - will it solve our build issues?\nformulate an ABI name and upgrade policy\n\n\n\nmemoryview(ctypes.Structure) does not create the correct format string\ndiscussing the state and future of PyPy and the wider Python ecosystem\n\n\n\n\nLocation\n\nThe sprint will take place in seminar room 25.12.02.55 of the computer science department.\u00a0 It is in the building 25.12 of the university campus, second floor. Travel instructions\n\n\nExact times\n\nWork days: starting February 4th (10:00), ending February 9th (~afternoon). 
The break day will probably be Thursday.\n\nRegistration\n\n\nPlease register by Mercurial::\nhttps://bitbucket.org/pypy/extradoc/\n\nhttps://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/ddorf2019/people.txt\n\nor on the pypy-dev mailing list if you do not yet have check-in rights:\n\n\nhttps://mail.python.org/mailman/listinfo/pypy-dev\n\n\n\nLooking forward to seeing everyone there!", + "tags": "", + "url": "https://www.pypy.org/posts/2018/12/pypy-winter-sprint-feb-4-9-in-dusseldorf-7199110498451574074.html" + }, + { + "title": "Funding for 64-bit Armv8-a support in PyPy", + "text": "Hello everyone\n\nAt PyPy we are trying to support a relatively wide range of platforms. We have PyPy working on OS X, Windows and various flavors of linux (and unofficially various flavors of BSD) on the software side, with hardware side having x86, x86_64, PPC, 32-bit Arm (v7) and even zarch. This is harder than for other projects, since PyPy emits assembler on the fly from the just in time compiler and it requires significant amount of work to port it to a new platform.\n\nWe are pleased to inform that Arm Limited, together with Crossbar.io GmbH, are sponsoring the development of 64-bit Armv8-a architecture support through Baroque Software OU, which would allow PyPy to run on a new variety of low-power, high-density servers with that architecture. 
We believe this will be beneficial for the funders, for the PyPy project as well as to the wider community.\n\nThe work will commence soon and will be done some time early next year with expected speedups either comparable to x86 speedups or, if our current experience with ARM holds, more significant than x86 speedups.\n\nBest,\nMaciej Fijalkowski and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2018/11/hello-everyone-at-pypy-we-are-trying-to-5336557946798583063.html" + }, + { + "title": "Guest Post: Implementing a Calculator REPL in RPython", + "text": "This is a tutorial style post that walks through using the RPython translation\ntoolchain to create a REPL that executes basic math expressions. \n\nWe will do that by scanning the user's input into tokens, compiling those \ntokens into bytecode and running that bytecode in our own virtual machine. Don't\nworry if that sounds horribly complicated, we are going to explain it step by\nstep. \n\nThis post is a bit of a diversion while on my journey to create a compliant \nlox implementation\nusing the RPython translation toolchain. The \nmajority of this work is a direct RPython translation of the low level C \nguide from Bob Nystrom (@munificentbob) in the\nexcellent book craftinginterpreters.com\nspecifically the chapters 14 \u2013 17.\n\nThe road ahead\n\nAs this post is rather long I'll break it into a few major sections. In each section we will\nhave something that translates with RPython, and at the end it all comes together. 
\n\n\nREPL\n\nVirtual Machine\n\nScanning the source\n\nCompiling Expressions\n\nEnd to end\n\n\nA REPL\n\nSo if you're a Python programmer you might be thinking this is pretty trivial right?\n\nI mean if we ignore input errors, injection attacks etc couldn't we just do something\nlike this:\n\n\"\"\"\nA pure python REPL that can parse simple math expressions\n\"\"\"\nwhile True:\n print(eval(raw_input(\"> \")))\n\n\nWell it does appear to do the trick:\n\n$ python2 section-1-repl/main.py\n> 3 + 4 * ((1.0/(2 * 3 * 4)) + (1.0/(4 * 5 * 6)) - (1.0/(6 * 7 * 8)))\n3.1880952381\n\n\nSo can we just ask RPython to translate this into a binary that runs magically\nfaster?\n\nLet's see what happens. We need to add two functions for RPython to\nget its bearings (entry_point and target) and call the file targetXXX:\n\ntargetrepl1.py\n\ndef repl():\n while True:\n print eval(raw_input('> '))\n\n\ndef entry_point(argv):\n repl()\n return 0\n\n\ndef target(driver, *args):\n return entry_point, None\n\n\nWhich at translation time gives us this admonishment that accurately tells us\nwe are trying to call a Python built-in raw_input that is unfortunately not \nvalid RPython.\n\n$ rpython ./section-1-repl/targetrepl1.py\n...SNIP...\n[translation:ERROR] AnnotatorError: \n\nobject with a __call__ is not RPython: \nProcessing block:\n block@18 is a \n in (target1:2)repl \n containing the following operations: \n v0 = simple_call((builtin_function raw_input), ('> ')) \n v1 = simple_call((builtin_function eval), v0) \n v2 = str(v1) \n v3 = simple_call((function rpython_print_item), v2) \n v4 = simple_call((function rpython_print_newline)) \n\n\nOk so we can't use raw_input or eval but that doesn't faze us. 
Let's get \nthe input from a stdin stream and just print it out (no evaluation).\n\ntargetrepl2.py\n\nfrom rpython.rlib import rfile\n\nLINE_BUFFER_LENGTH = 1024\n\n\ndef repl(stdin):\n while True:\n print \"> \",\n line = stdin.readline(LINE_BUFFER_LENGTH)\n print line\n\n\ndef entry_point(argv):\n stdin, stdout, stderr = rfile.create_stdio()\n try:\n repl(stdin)\n except:\n return 0\n\n\ndef target(driver, *args):\n return entry_point, None\n\n\nTranslate targetrepl2.py \u2013 we can add an optimization level if we\nare so inclined:\n\n$ rpython --opt=2 section-1-repl/targetrepl2.py\n...SNIP...\n[Timer] Timings:\n[Timer] annotate --- 1.2 s\n[Timer] rtype_lltype --- 0.9 s\n[Timer] backendopt_lltype --- 0.6 s\n[Timer] stackcheckinsertion_lltype --- 0.0 s\n[Timer] database_c --- 15.0 s\n[Timer] source_c --- 1.6 s\n[Timer] compile_c --- 1.9 s\n[Timer] =========================================\n[Timer] Total: --- 21.2 s\n\n\nNo errors!? Let's try it out:\n\n$ ./target2-c \n1 + 2\n> 1 + 2\n\nC\n\n\nAhh our first success \u2013 let's quickly deal with the flushing fail by using the \nstdout stream directly as well. Let's print out the input in quotes:\n\nfrom rpython.rlib import rfile\n\nLINE_BUFFER_LENGTH = 1024\n\n\ndef repl(stdin, stdout):\n while True:\n stdout.write(\"> \")\n line = stdin.readline(LINE_BUFFER_LENGTH)\n print '\"%s\"' % line.strip()\n\n\ndef entry_point(argv):\n stdin, stdout, stderr = rfile.create_stdio()\n try:\n repl(stdin, stdout)\n except:\n pass\n return 0\n\n\ndef target(driver, *args):\n return entry_point, None\n\n\nTranslation works, and the test run too:\n\n$ ./target3-c \n> hello this seems better\n\"hello this seems better\"\n> C\n\n\nSo we are in a good place with taking user input and printing output... What about\nthe whole math evaluation thing we were promised? 
For that we can probably leave\nour RPython REPL behind for a while and connect it up at the end.\n\nA virtual machine\n\nA virtual machine is the execution engine of our basic math interpreter. It will be very simple,\nonly able to do simple tasks like addition. I won't go into any depth to describe why we want\na virtual machine, but it is worth noting that many languages including Java and Python make \nthis decision to compile to an intermediate bytecode representation and then execute that with\na virtual machine. Alternatives are compiling directly to native machine code like (earlier versions of) the V8\nJavaScript engine, or at the other end of the spectrum executing an abstract syntax tree \u2013 \nwhich is what the Truffle approach to building VMs is based on. \n\nWe are going to keep things very simple. We will have a stack where we can push and pop values,\nwe will only support floats, and our VM will only implement a few very basic operations.\n\nOpCodes\n\nIn fact our entire instruction set is:\n\nOP_CONSTANT\nOP_RETURN\nOP_NEGATE\nOP_ADD\nOP_SUBTRACT\nOP_MULTIPLY\nOP_DIVIDE\n\n\nSince we are targeting RPython we can't use the nice enum module from the Python standard\nlibrary, so instead we just define a simple class with class attributes.\n\nWe should start to get organized, so we will create a new file \nopcodes.py and add this:\n\nclass OpCode:\n OP_CONSTANT = 0\n OP_RETURN = 1\n OP_NEGATE = 2\n OP_ADD = 3\n OP_SUBTRACT = 4\n OP_MULTIPLY = 5\n OP_DIVIDE = 6\n\n\nChunks\n\nTo start with we need to get some infrastructure in place before we write the VM engine.\n\nFollowing craftinginterpreters.com\nwe start with a Chunk object which will represent our bytecode. In RPython we have access \nto Python-esque lists so our code object will just be a list of OpCode values \u2013 which are \njust integers. 
A list of ints, couldn't get much simpler.\n\nsection-2-vm/chunk.py\n\nclass Chunk:\n code = None\n\n def __init__(self):\n self.code = []\n\n def write_chunk(self, byte):\n self.code.append(byte)\n\n def disassemble(self, name):\n print \"== %s ==\\n\" % name\n i = 0\n while i < len(self.code):\n i = disassemble_instruction(self, i)\n\n\nFrom here on I'll only present minimal snippets of code instead of the whole lot, but \nI'll link to the repository with the complete example code. For example the \nvarious debugging including disassemble_instruction isn't particularly interesting\nto include verbatim. See the github repo for full details\n\nWe need to check that we can create a chunk and disassemble it. The quickest way to do this\nis to use Python during development and debugging then every so often try to translate it.\n\nGetting the disassemble part through the RPython translator was a hurdle for me as I\nquickly found that many str methods such as format are not supported, and only very basic\n% based formatting is supported. I ended up creating helper functions for string manipulation\nsuch as:\n\ndef leftpad_string(string, width, char=\" \"):\n l = len(string)\n if l > width:\n return string\n return char * (width - l) + string\n\n\nLet's write a new entry_point that creates and disassembles a chunk of bytecode. 
We can\nset the target output name to vm1 at the same time:\n\ntargetvm1.py\n\ndef entry_point(argv):\n bytecode = Chunk()\n bytecode.write_chunk(OpCode.OP_ADD)\n bytecode.write_chunk(OpCode.OP_RETURN)\n bytecode.disassemble(\"hello world\")\n return 0\n\ndef target(driver, *args):\n driver.exe_name = \"vm1\"\n return entry_point, None\n\n\nRunning this isn't going to be terribly interesting, but it is always nice to\nknow that it is doing what you expect:\n\n$ ./vm1 \n== hello world ==\n\n0000 OP_ADD \n0001 OP_RETURN \n\n\nChunks of data\n\nRef: https://www.craftinginterpreters.com/chunks-of-bytecode.html#constants\n\nSo our bytecode is missing a very crucial element \u2013 the values to operate on!\n\nAs with the bytecode we can store these constant values as part of the chunk\ndirectly in a list. Each chunk will therefore have a constant data component,\nand a code component. \n\nEdit the chunk.py file and add the new instance attribute constants as an\nempty list, and a new method add_constant.\n\n def add_constant(self, value):\n self.constants.append(value)\n return len(self.constants) - 1\n\n\nNow to use this new capability we can modify our example chunk\nto write in some constants before the OP_ADD:\n\n bytecode = Chunk()\n constant = bytecode.add_constant(1.0)\n bytecode.write_chunk(OpCode.OP_CONSTANT)\n bytecode.write_chunk(constant)\n\n constant = bytecode.add_constant(2.0)\n bytecode.write_chunk(OpCode.OP_CONSTANT)\n bytecode.write_chunk(constant)\n\n bytecode.write_chunk(OpCode.OP_ADD)\n bytecode.write_chunk(OpCode.OP_RETURN)\n\n bytecode.disassemble(\"adding constants\")\n\n\nWhich still translates with RPython and when run gives us the following disassembled\nbytecode:\n\n== adding constants ==\n\n0000 OP_CONSTANT (00) '1'\n0002 OP_CONSTANT (01) '2'\n0004 OP_ADD \n0005 OP_RETURN\n\n\nWe won't go down the route of serializing the bytecode to disk, but this bytecode chunk\n(including the constant data) could be saved and executed on our VM later 
\u2013 like a Java\n.class file. Instead we will pass the bytecode directly to our VM after we've created\nit during the compilation process. \n\nEmulation\n\nSo those four instructions of bytecode combined with the constant value mapping\n00 -> 1.0 and 01 -> 2.0 describe individual steps for our virtual machine\nto execute. One major point in favor of defining our own bytecode is we can \ndesign it to be really simple to execute \u2013 this makes the VM really easy to implement.\n\nAs I mentioned earlier this virtual machine will have a stack, so let's begin with that.\nNow the stack is going to be a busy little beast \u2013 as our VM takes instructions like \nOP_ADD it will pop off the top two values from the stack, and push the result of adding \nthem together back onto the stack. Although dynamically resizing Python lists \nare marvelous, they can be a little slow. RPython can take advantage of a constant sized\nlist which doesn't make our code much more complicated.\n\nTo do this we will define a constant sized list and track the stack_top directly. Note\nhow we can give the RPython translator hints by adding assertions about the state that\nthe stack_top will be in.\n\nclass VM(object):\n STACK_MAX_SIZE = 256\n stack = None\n stack_top = 0\n\n def __init__(self):\n self._reset_stack()\n\n def _reset_stack(self):\n self.stack = [0] * self.STACK_MAX_SIZE\n self.stack_top = 0\n\n def _stack_push(self, value):\n assert self.stack_top < self.STACK_MAX_SIZE\n self.stack[self.stack_top] = value\n self.stack_top += 1\n\n def _stack_pop(self):\n assert self.stack_top >= 0\n self.stack_top -= 1\n return self.stack[self.stack_top]\n\n def _print_stack(self):\n print \" \",\n if self.stack_top <= 0:\n print \"[]\",\n else:\n for i in range(self.stack_top):\n print \"[ %s ]\" % self.stack[i],\n print\n\n\nNow we get to the main event, the hot loop, the VM engine. Hope I haven't built it up too\nmuch, it is actually really simple! 
We loop until the instructions tell us to stop \n(OP_RETURN), and dispatch to other simple methods based on the instruction.\n\n def _run(self):\n while True:\n instruction = self._read_byte()\n\n if instruction == OpCode.OP_RETURN:\n print \"%s\" % self._stack_pop()\n return InterpretResultCode.INTERPRET_OK\n elif instruction == OpCode.OP_CONSTANT:\n constant = self._read_constant()\n self._stack_push(constant)\n elif instruction == OpCode.OP_ADD:\n self._binary_op(self._stack_add) \n\n\nNow the _read_byte method will have to keep track of which instruction we are up \nto. So add an instruction pointer (ip) to the VM with an initial value of 0.\nThen _read_byte is simply getting the next bytecode (int) from the chunk's code:\n\n def _read_byte(self):\n instruction = self.chunk.code[self.ip]\n self.ip += 1\n return instruction\n\n\n\n\nIf the instruction is OP_CONSTANT we take the constant's address from the next byte\nof the chunk's code, retrieve that constant value and add it to the VM's stack.\n\n def _read_constant(self):\n constant_index = self._read_byte()\n return self.chunk.constants[constant_index]\n\n\nFinally our first arithmetic operation OP_ADD, what it has to achieve doesn't \nrequire much explanation: pop two values from the stack, add them together, push \nthe result. But since a few operations all have the same template we introduce a\nlayer of indirection \u2013 or abstraction \u2013 by introducing a reusable _binary_op \nhelper method.\n\n @specialize.arg(1)\n def _binary_op(self, operator):\n op2 = self._stack_pop()\n op1 = self._stack_pop()\n result = operator(op1, op2)\n self._stack_push(result)\n\n @staticmethod\n def _stack_add(op1, op2):\n return op1 + op2\n\n\n\n\nNote we tell RPython to specialize _binary_op on the first argument. 
This causes\nRPython to make a copy of _binary_op for every value of the first argument passed,\nwhich means that each copy contains a call to a particular operator, which can then be\ninlined.\n\nTo be able to run our bytecode the only thing left to do is to pass in the chunk \nand call _run():\n\n def interpret_chunk(self, chunk):\n if self.debug_trace:\n print \"== VM TRACE ==\"\n self.chunk = chunk\n self.ip = 0\n try:\n result = self._run()\n return result\n except:\n return InterpretResultCode.INTERPRET_RUNTIME_ERROR\n\n\ntargetvm3.py connects the pieces:\n\ndef entry_point(argv):\n bytecode = Chunk()\n constant = bytecode.add_constant(1)\n bytecode.write_chunk(OpCode.OP_CONSTANT)\n bytecode.write_chunk(constant)\n constant = bytecode.add_constant(2)\n bytecode.write_chunk(OpCode.OP_CONSTANT)\n bytecode.write_chunk(constant)\n bytecode.write_chunk(OpCode.OP_ADD)\n bytecode.write_chunk(OpCode.OP_RETURN)\n\n vm = VM()\n vm.interpret_chunk(bytecode)\n\n return 0\n\n\nI've added some trace debugging so we can see what the VM and stack is doing.\n\nThe whole thing translates with RPython, and when run gives us:\n\n./vm3\n== VM TRACE ==\n []\n0000 OP_CONSTANT (00) '1'\n [ 1 ]\n0002 OP_CONSTANT (01) '2'\n [ 1 ] [ 2 ]\n0004 OP_ADD \n [ 3 ]\n0005 OP_RETURN \n3\n\n\nYes we just computed the result of 1+2. Pat yourself on the back. \n\nAt this point it is probably valid to check that the translated executable is actually\nfaster than running our program directly in Python. For this trivial example under \nPython2/pypy this targetvm3.py file runs in the 20ms \u2013 90ms region, and the \ncompiled vm3 runs in <5ms. Something useful must be happening during the translation.\n\nI won't go through the code adding support for our other instructions as they are\nvery similar and straightforward. Our VM is ready to execute our chunks of bytecode,\nbut we haven't yet worked out how to take the entered expression and turn that into\nthis simple bytecode. 
This is broken into two steps, scanning and compiling.\n\nScanning the source\n\nAll the source for this section can be found in \nsection-3-scanning.\n\nThe job of the scanner is to take the raw expression string and transform it into\na sequence of tokens. This scanning step will strip out whitespace and comments, \ncatch errors with invalid tokens and tokenize the string. For example the input \n\"( 1 + 2 )\" would get tokenized into LEFT_PAREN, NUMBER(1), PLUS, NUMBER(2), RIGHT_PAREN.\n\nAs with our OpCodes we will just define a simple Python class to define an int\nfor each type of token:\n\nclass TokenTypes:\n ERROR = 0\n EOF = 1\n LEFT_PAREN = 2\n RIGHT_PAREN = 3\n MINUS = 4\n PLUS = 5\n SLASH = 6\n STAR = 7\n NUMBER = 8\n\n\nA token has to keep some other information as well \u2013 keeping track of the location and \nlength of the token will be helpful for error reporting. The NUMBER token clearly needs \nsome data about the value it is representing: we could include a copy of the source lexeme \n(e.g. the string 2.0), or parse the value and store that, or \u2013 what we will do in this \nblog \u2013 use the location and length information as pointers into the original source \nstring. Every token type (except perhaps ERROR) will use this simple data structure: \n\nclass Token(object):\n\n def __init__(self, start, length, token_type):\n self.start = start\n self.length = length\n self.type = token_type\n\n\nOur soon to be created scanner will create these Token objects which refer back to \naddresses in some source. If the scanner sees the source \"( 1 + 2.0 )\" it would emit\nthe following tokens:\n\nToken(0, 1, TokenTypes.LEFT_PAREN)\nToken(2, 1, TokenTypes.NUMBER)\nToken(4, 1, TokenTypes.PLUS)\nToken(6, 3, TokenTypes.NUMBER)\nToken(10, 1, TokenTypes.RIGHT_PAREN)\n\n\nScanner\n\nLet's walk through the scanner implementation method\nby method. 
The scanner will take the source and pass through it once, creating tokens\nas it goes.\n\nclass Scanner(object):\n\n def __init__(self, source):\n self.source = source\n self.start = 0\n self.current = 0\n\n\nThe start and current variables are character indices in the source string that point to \nthe current substring being considered as a token. \n\nFor example in the string \"(51.05+2)\" while we are tokenizing the number 51.05\nwe will have start pointing at the 5, and advance current character by character\nuntil the character is no longer part of a number. Midway through scanning the number \nthe start and current values might point to 1 and 4 respectively:\n\n\n\n\n0\n1\n2\n3\n4\n5\n6\n7\n8\n\n\n\n\n\"(\"\n\"5\"\n\"1\"\n\".\"\n\"0\"\n\"5\"\n\"+\"\n\"2\"\n\")\"\n\n\n\n\u00a0\n\n\n\u00a0\n\n\n\n\n\n\n\n\nFrom current=4 the scanner peeks ahead and sees that the next character (5) is\na digit, so will continue to advance.\n\n\n\n\n0\n1\n2\n3\n4\n5\n6\n7\n8\n\n\n\n\n\"(\"\n\"5\"\n\"1\"\n\".\"\n\"0\"\n\"5\"\n\"+\"\n\"2\"\n\")\"\n\n\n\n\u00a0\n\n\n\n\u00a0\n\n\n\n\n\n\n\nWhen the scanner peeks ahead and sees the \"+\" it will create the number\ntoken and emit it. The method that carries out this tokenizing is _number:\n\n def _number(self):\n while self._peek().isdigit():\n self.advance()\n\n # Look for decimal point\n if self._peek() == '.' 
and self._peek_next().isdigit():\n self.advance()\n while self._peek().isdigit():\n self.advance()\n\n return self._make_token(TokenTypes.NUMBER)\n\n\nIt relies on a few helpers to look ahead at the upcoming characters:\n\n def _peek(self):\n if self._is_at_end():\n return '\\0'\n return self.source[self.current]\n\n def _peek_next(self):\n if self._is_at_end():\n return '\\0'\n return self.source[self.current+1]\n\n def _is_at_end(self):\n return len(self.source) == self.current\n\n\nIf the character at current is still part of the number we want to call advance\nto move on by one character.\n\n def advance(self):\n self.current += 1\n return self.source[self.current - 1]\n\n\nOnce the isdigit() check fails in _number() we call _make_token() to emit the\ntoken with the NUMBER type.\n\n def _make_token(self, token_type):\n return Token(\n start=self.start,\n length=(self.current - self.start),\n token_type=token_type\n )\n\n\nNote again that the token is linked to an index address in the source, rather than \nincluding the string value.\n\nOur scanner is pull based, a token will be requested via scan_token. 
First we skip \npast whitespace and depending on the characters emit the correct token:\n\n def scan_token(self):\n # skip any whitespace\n while True:\n char = self._peek()\n if char in ' \\r\\t\\n':\n self.advance()\n break\n\n self.start = self.current\n\n if self._is_at_end():\n return self._make_token(TokenTypes.EOF)\n\n char = self.advance()\n\n if char.isdigit():\n return self._number()\n\n if char == '(':\n return self._make_token(TokenTypes.LEFT_PAREN)\n if char == ')':\n return self._make_token(TokenTypes.RIGHT_PAREN)\n if char == '-':\n return self._make_token(TokenTypes.MINUS)\n if char == '+':\n return self._make_token(TokenTypes.PLUS)\n if char == '/':\n return self._make_token(TokenTypes.SLASH)\n if char == '*':\n return self._make_token(TokenTypes.STAR)\n\n return ErrorToken(\"Unexpected character\", self.current)\n\n\n\n\nIf this was a real programming language we were scanning, this would be the point where we \nadd support for different types of literals and any language identifiers/reserved words.\n\nAt some point we will need to parse the literal value for our numbers, but we leave that\njob for some later component, for now we'll just add a get_token_string helper. 
To make\nsure that RPython is happy to index arbitrary slices of source we add range assertions:\n\n def get_token_string(self, token):\n if isinstance(token, ErrorToken):\n return token.message\n else:\n end_loc = token.start + token.length\n assert end_loc < len(self.source)\n assert end_loc > 0\n return self.source[token.start:end_loc]\n\n\nA simple entry point can be used to test our scanner with a hard coded \nsource string:\n\ntargetscanner1.py\n\nfrom scanner import Scanner, TokenTypes, TokenTypeToName\n\n\ndef entry_point(argv):\n\n source = \"( 1 + 2.0 )\"\n\n scanner = Scanner(source)\n t = scanner.scan_token()\n while t.type != TokenTypes.EOF and t.type != TokenTypes.ERROR:\n print TokenTypeToName[t.type],\n if t.type == TokenTypes.NUMBER:\n print \"(%s)\" % scanner.get_token_string(t),\n print\n t = scanner.scan_token()\n return 0\n\n\nRPython didn't complain, and lo it works:\n\n$ ./scanner1 \nLEFT_PAREN\nNUMBER (1)\nPLUS\nNUMBER (2.0)\nRIGHT_PAREN\n\n\nLet's connect our REPL to the scanner.\n\ntargetscanner2.py\n\nfrom rpython.rlib import rfile\nfrom scanner import Scanner, TokenTypes, TokenTypeToName\n\nLINE_BUFFER_LENGTH = 1024\n\n\ndef repl(stdin, stdout):\n while True:\n stdout.write(\"> \")\n source = stdin.readline(LINE_BUFFER_LENGTH)\n\n scanner = Scanner(source)\n t = scanner.scan_token()\n while t.type != TokenTypes.EOF and t.type != TokenTypes.ERROR:\n print TokenTypeToName[t.type],\n if t.type == TokenTypes.NUMBER:\n print \"(%s)\" % scanner.get_token_string(t),\n print\n t = scanner.scan_token()\n\n\ndef entry_point(argv):\n stdin, stdout, stderr = rfile.create_stdio()\n try:\n repl(stdin, stdout)\n except:\n pass\n return 0\n\n\nWith our REPL hooked up we can now scan tokens from arbitrary input:\n\n$ ./scanner2\n> (3 *4) - -3\nLEFT_PAREN\nNUMBER (3)\nSTAR\nNUMBER (4)\nRIGHT_PAREN\nMINUS\nMINUS\nNUMBER (3)\n> C\n\n\nCompiling 
expressions\n\nReferences\n\n\nhttps://www.craftinginterpreters.com/compiling-expressions.html\n\nhttps://effbot.org/zone/simple-top-down-parsing.htm\n\n\nThe final piece is to turn this sequence of tokens into our low level \nbytecode instructions for the virtual machine to execute. Buckle up, \nwe are about to write us a compiler.\n\nOur compiler will take a single pass over the tokens using \nVaughan Pratt\u2019s \nparsing technique, and output a chunk of bytecode \u2013 if we do it\nright it will be compatible with our existing virtual machine.\n\nRemember the bytecode we defined above is really simple \u2013 by relying \non our stack we can transform a nested expression into a sequence of\nour bytecode operations.\n\nTo make this more concrete let's go through by hand translating an\nexpression into bytecode.\n\nOur source expression:\n\n(3 + 2) - (7 * 2)\n\n\nIf we were to make an abstract syntax tree we'd get something \nlike this:\n\n\n\nNow if we start at the first sub expression (3+2) we can clearly\nnote from the first open bracket that we must see a close bracket,\nand that the expression inside that bracket must be valid on its \nown. Not only that but regardless of the inside we know that the whole\nexpression still has to be valid. Let's focus on this first bracketed\nexpression, let our attention recurse into it so to speak.\n\nThis gives us a much easier problem \u2013 we just want to get our virtual\nmachine to compute 3 + 2. In this bytecode dialect we would load the \ntwo constants, and then add them with OP_ADD like so: \n\nOP_CONSTANT (00) '3.000000'\nOP_CONSTANT (01) '2.000000'\nOP_ADD\n\n\nThe effect of our vm executing these three instructions is that sitting\npretty at the top of the stack is the result of the addition. Winning.\n\nJumping back out from our bracketed expression, our next token is MINUS,\nat this point we have a fair idea that it must be used in an infix position. 
\nIn fact whatever token followed the bracketed expression it must be a \nvalid infix operator, if not the expression is over or had a syntax error. \n\nAssuming the best from our user (naive), we handle MINUS the same way\nwe handled the first PLUS. We've already got the first operand on the\nstack, now we compile the right operand and then write out the bytecode\nfor OP_SUBTRACT.\n\nThe right operand is another simple three instructions:\n\nOP_CONSTANT (02) '7.000000'\nOP_CONSTANT (03) '2.000000'\nOP_MULTIPLY\n\n\nThen we finish our top level binary expression and write a OP_RETURN to\nreturn the value at the top of the stack as the execution's result. Our\nfinal hand compiled program is:\n\nOP_CONSTANT (00) '3.000000'\nOP_CONSTANT (01) '2.000000'\nOP_ADD\nOP_CONSTANT (02) '7.000000'\nOP_CONSTANT (03) '2.000000'\nOP_MULTIPLY\nOP_SUBTRACT\nOP_RETURN\n\n\nOk that wasn't so hard was it? Let's try make our code do that.\n\nWe define a parser object which will keep track of where we are, and\nwhether things have all gone horribly wrong:\n\nclass Parser(object):\n def __init__(self):\n self.had_error = False\n self.panic_mode = False\n self.current = None\n self.previous = None\n\n\nThe compiler will also be a class, we'll need one of our Scanner instances\nto pull tokens from, and since the output is a bytecode Chunk let's go ahead\nand make one of those in our compiler initializer:\n\nclass Compiler(object):\n\n def __init__(self, source):\n self.parser = Parser()\n self.scanner = Scanner(source)\n self.chunk = Chunk()\n\n\nSince we have this (empty) chunk of bytecode we will make a helper method\nto add individual bytes. 
Every instruction will pass from our compiler into\nan executable program through this simple method.\n\n def emit_byte(self, byte):\n self.current_chunk().write_chunk(byte)\n\n\nTo quote from Bob Nystrom on the Pratt parsing technique:\n\n\n the implementation is a deceptively-simple handful of deeply intertwined code\n\n\nI don't actually think I can do justice to this section. Instead I suggest \nreading his treatment in \nPratt Parsers: Expression Parsing Made Easy\nwhich explains the magic behind the parsing component. Our only major difference is \ninstead of creating an AST we are going to directly emit bytecode for our VM.\n\nNow that I've absolved myself from taking responsibility in explaining this somewhat\ntricky concept, I'll discuss some of the code from \ncompiler.py, and walk through what happens \nfor a particular rule.\n\nI'll jump straight to the juicy bit: the table of parse rules. We define a ParseRule\nfor each token, and each rule comprises:\n\n\nan optional handler for when the token is used as a prefix (e.g. the minus in (-2)),\n\nan optional handler for when the token is used infix (e.g. the slash in 2/47)\n\na precedence value (a number that determines what is of higher precedence)\n\n\nrules = [\n ParseRule(None, None, Precedence.NONE), # ERROR\n ParseRule(None, None, Precedence.NONE), # EOF\n ParseRule(Compiler.grouping, None, Precedence.CALL), # LEFT_PAREN\n ParseRule(None, None, Precedence.NONE), # RIGHT_PAREN\n ParseRule(Compiler.unary, Compiler.binary, Precedence.TERM), # MINUS\n ParseRule(None, Compiler.binary, Precedence.TERM), # PLUS\n ParseRule(None, Compiler.binary, Precedence.FACTOR), # SLASH\n ParseRule(None, Compiler.binary, Precedence.FACTOR), # STAR\n ParseRule(Compiler.number, None, Precedence.NONE), # NUMBER\n]\n\n\nThese rules really are the magic of our compiler. When we get to a particular\ntoken such as MINUS we see if it is an infix operator and if so we've gone and\ngot its first operand ready. 
At all times we rely on the relative precedence; consuming \neverything with higher precedence than the operator we are currently evaluating.\n\nIn the expression:\n\n2 + 3 * 4\n\n\nThe * has higher precedence than the +, so 3 * 4 will be parsed together\nas the second operand to the first infix operator (the +) which follows\nthe BEDMAS \norder of operations I was taught at high school.\n\nTo encode these precedence values we make another Python object moonlighting\nas an enum:\n\nclass Precedence(object):\n NONE = 0\n DEFAULT = 1\n TERM = 2 # + -\n FACTOR = 3 # * /\n UNARY = 4 # ! - +\n CALL = 5 # ()\n PRIMARY = 6\n\n\nWhat happens in our compiler when turning -2.0 into bytecode? Assume we've just \npulled the token MINUS from the scanner. Every expression has to start with some\ntype of prefix \u2013 whether that is:\n\n\na bracket group (, \n\na number 2, \n\nor a prefix unary operator -. \n\n\nKnowing that, our compiler assumes there is a prefix handler in the rule table \u2013 in\nthis case it points us at the unary handler.\n\n def parse_precedence(self, precedence):\n # parses any expression of a given precedence level or higher\n self.advance()\n prefix_rule = self._get_rule(self.parser.previous.type).prefix\n prefix_rule(self)\n\n\n\n\nunary is called:\n\n def unary(self):\n op_type = self.parser.previous.type\n # Compile the operand\n self.parse_precedence(Precedence.UNARY)\n # Emit the operator instruction\n if op_type == TokenTypes.MINUS:\n self.emit_byte(OpCode.OP_NEGATE)\n\n\nHere \u2013 before writing the OP_NEGATE opcode we recurse back into parse_precedence\nto ensure that whatever follows the MINUS token is compiled \u2013 provided it has \nhigher precedence than unary \u2013 e.g. a bracketed group. \nCrucially at run time this recursive call will ensure that the result is left \non top of our stack. 
Armed with this knowledge, the unary method just\nhas to emit a single byte with the OP_NEGATE opcode.\n\nTest compilation\n\nNow we can test our compiler by outputting disassembled bytecode\nof our user entered expressions. Create a new entry_point \ntargetcompiler:\n\nfrom rpython.rlib import rfile\nfrom compiler import Compiler\n\nLINE_BUFFER_LENGTH = 1024\n\n\ndef entry_point(argv):\n stdin, stdout, stderr = rfile.create_stdio()\n\n try:\n while True:\n stdout.write(\"> \")\n source = stdin.readline(LINE_BUFFER_LENGTH)\n compiler = Compiler(source, debugging=True)\n compiler.compile()\n except:\n pass\n return 0\n\n\nTranslate it and test it out:\n\n$ ./compiler1 \n> (2/4 + 1/2)\n== code ==\n\n0000 OP_CONSTANT (00) '2.000000'\n0002 OP_CONSTANT (01) '4.000000'\n0004 OP_DIVIDE \n0005 OP_CONSTANT (02) '1.000000'\n0007 OP_CONSTANT (00) '2.000000'\n0009 OP_DIVIDE \n0010 OP_ADD \n0011 OP_RETURN\n\n\nNow if you've made it this far you'll be eager to finally connect everything\ntogether by executing this bytecode with the virtual machine.\n\nEnd to end\n\nAll the pieces slot together rather easily at this point, create a new \nfile targetcalc.py and define our \nentry point:\n\nfrom rpython.rlib import rfile\nfrom compiler import Compiler\nfrom vm import VM\n\nLINE_BUFFER_LENGTH = 4096\n\n\ndef entry_point(argv):\n stdin, stdout, stderr = rfile.create_stdio()\n vm = VM()\n try:\n while True:\n stdout.write(\"> \")\n source = stdin.readline(LINE_BUFFER_LENGTH)\n if source:\n compiler = Compiler(source, debugging=False)\n compiler.compile()\n vm.interpret_chunk(compiler.chunk)\n except:\n pass\n return 0\n\n\ndef target(driver, *args):\n driver.exe_name = \"calc\"\n return entry_point, None\n\n\n\n\nLet's try catch it out with a double negative:\n\n$ ./calc \n> 2--3\n== VM TRACE ==\n []\n0000 OP_CONSTANT (00) '2.000000'\n [ 2.000000 ]\n0002 OP_CONSTANT (01) '3.000000'\n [ 2.000000 ] [ 3.000000 ]\n0004 OP_NEGATE \n [ 2.000000 ] [ -3.000000 ]\n0005 OP_SUBTRACT \n [ 
5.000000 ]\n0006 OP_RETURN \n5.000000\n\n\nOk well let's evaluate the first 50 terms of the \nNilakantha Series:\n\n$ ./calc\n> 3 + 4 * ((1/(2 * 3 * 4)) + (1/(4 * 5 * 6)) - (1/(6 * 7 * 8)) + (1/(8 * 9 * 10)) - (1/(10 * 11 * 12)) + (1/(12 * 13 * 14)) - (1/(14 * 15 * 16)) + (1/(16 * 17 * 18)) - (1/(18 * 19 * 20)) + (1/(20 * 21 * 22)) - (1/(22 * 23 * 24)) + (1/(24 * 25 * 26)) - (1/(26 * 27 * 28)) + (1/(28 * 29 * 30)) - (1/(30 * 31 * 32)) + (1/(32 * 33 * 34)) - (1/(34 * 35 * 36)) + (1/(36 * 37 * 38)) - (1/(38 * 39 * 40)) + (1/(40 * 41 * 42)) - (1/(42 * 43 * 44)) + (1/(44 * 45 * 46)) - (1/(46 * 47 * 48)) + (1/(48 * 49 * 50)) - (1/(50 * 51 * 52)) + (1/(52 * 53 * 54)) - (1/(54 * 55 * 56)) + (1/(56 * 57 * 58)) - (1/(58 * 59 * 60)) + (1/(60 * 61 * 62)) - (1/(62 * 63 * 64)) + (1/(64 * 65 * 66)) - (1/(66 * 67 * 68)) + (1/(68 * 69 * 70)) - (1/(70 * 71 * 72)) + (1/(72 * 73 * 74)) - (1/(74 * 75 * 76)) + (1/(76 * 77 * 78)) - (1/(78 * 79 * 80)) + (1/(80 * 81 * 82)) - (1/(82 * 83 * 84)) + (1/(84 * 85 * 86)) - (1/(86 * 87 * 88)) + (1/(88 * 89 * 90)) - (1/(90 * 91 * 92)) + (1/(92 * 93 * 94)) - (1/(94 * 95 * 96)) + (1/(96 * 97 * 98)) - (1/(98 * 99 * 100)) + (1/(100 * 101 * 102)))\n\n== VM TRACE ==\n []\n0000 OP_CONSTANT (00) '3.000000'\n [ 3.000000 ]\n0002 OP_CONSTANT (01) '4.000000'\n...SNIP...\n0598 OP_CONSTANT (101) '102.000000'\n [ 3.000000 ] [ 4.000000 ] [ 0.047935 ] [ 1.000000 ] [ 10100.000000 ] [ 102.000000 ]\n0600 OP_MULTIPLY \n [ 3.000000 ] [ 4.000000 ] [ 0.047935 ] [ 1.000000 ] [ 1030200.000000 ]\n0601 OP_DIVIDE \n [ 3.000000 ] [ 4.000000 ] [ 0.047935 ] [ 0.000001 ]\n0602 OP_ADD \n [ 3.000000 ] [ 4.000000 ] [ 0.047936 ]\n0603 OP_MULTIPLY \n [ 3.000000 ] [ 0.191743 ]\n0604 OP_ADD \n [ 3.191743 ]\n0605 OP_RETURN \n3.191743\n\n\nWe just executed 605 virtual machine instructions to compute pi to 1dp!\n\nThis brings us to the end of this tutorial. 
To recap we've walked through the whole \ncompilation process: from the user providing an expression string on the REPL, scanning\nthe source string into tokens, parsing the tokens while accounting for relative \nprecedence via a Pratt parser, generating bytecode, and finally executing the bytecode \non our own VM. RPython translated what we wrote into C and compiled it, meaning\nour resulting calc REPL is really fast.\n\n\n \u201cThe world is a thing of utter inordinate complexity and richness and strangeness that is absolutely awesome.\u201d\n \n \u2015 Douglas Adams \n\n\nMany thanks to Bob Nystrom for writing the book that inspired this post, and thanks to \nCarl Friedrich and Matt Halverson for reviewing.\n\n\u2015 Brian (@thorneynzb)", + "tags": "", + "url": "https://www.pypy.org/posts/2018/11/guest-post-implementing-calculator-repl-6271483514675006846.html" + }, + { + "title": "Inside cpyext: Why emulating CPython C API is so Hard", + "text": "cpyext is PyPy's subsystem which provides a compatibility\nlayer to compile and run CPython C extensions inside PyPy. Often people ask\nwhy a particular C extension doesn't work or is very slow on PyPy.\nUsually it is hard to answer without going into technical details. The goal of\nthis blog post is to explain some of these technical details, so that we can\nsimply link here instead of explaining again and again :).\nFrom a 10.000 foot view, cpyext is PyPy's version of \"Python.h\". Every time\nyou compile an extension which uses that header file, you are using cpyext.\nThis includes extension explicitly written in C (such as numpy) and\nextensions which are generated from other compilers/preprocessors\n(e.g. Cython).\nAt the time of writing, the current status is that most C extensions \"just\nwork\". Generally speaking, you can simply pip install them,\nprovided they use the public, official C API instead of poking at private\nimplementation details. However, the performance of cpyext is generally\npoor. 
A Python program which makes heavy use of cpyext extensions\nis likely to be slower on PyPy than on CPython.\nNote: in this blog post we are talking about Python 2.7 because it is still\nthe default version of PyPy: however most of the implementation of cpyext is\nshared with PyPy3, so everything applies to that as well.\n\n\nC API Overview\nIn CPython, which is written in C, Python objects are represented as PyObject*,\ni.e. (mostly) opaque pointers to some common \"base struct\".\nCPython uses a very simple memory management scheme: when you create an\nobject, you allocate a block of memory of the appropriate size on the heap.\nDepending on the details, you might end up calling different allocators, but\nfor the sake of simplicity, you can think that this ends up being a call to\nmalloc(). The resulting block of memory is initialized and cast to\nPyObject*: this address never changes during the object lifetime, and the\nC code can freely pass it around, store it inside containers, retrieve it\nlater, etc.\nMemory is managed using reference counting. When you create a new reference to\nan object, or you discard a reference you own, you have to increment or\ndecrement the reference counter accordingly. When the reference counter goes to\n0, it means that the object is no longer used and can safely be\ndestroyed. Again, we can simplify and say that this results in a call to\nfree(), which finally releases the memory which was allocated by malloc().\nGenerally speaking, the only way to operate on a PyObject* is to call the\nappropriate API functions. For example, to convert a given PyObject* to a C\ninteger, you can use PyInt_AsLong(); to add two objects together, you can\ncall PyNumber_Add().\nInternally, PyPy uses a similar approach. 
All Python objects are subclasses of\nthe RPython W_Root class, and they are operated by calling methods on the\nspace singleton, which represents the interpreter.\nAt first, it looks very easy to write a compatibility layer: just make\nPyObject* an alias for W_Root, and write simple RPython functions\n(which will be translated to C by the RPython compiler) which call the\nspace accordingly:\ndef PyInt_AsLong(space, o):\n return space.int_w(o)\n\ndef PyNumber_Add(space, o1, o2):\n return space.add(o1, o2)\n\nActually, the code above is not too far from the real\nimplementation. However, there are tons of gory details which make it much\nharder than it looks, and much slower unless you pay a lot of attention\nto performance.\n\n\nThe PyPy GC\nTo understand some of cpyext challenges, you need to have at least a rough\nidea of how the PyPy GC works.\nContrarily to the popular belief, the \"Garbage Collector\" is not only about\ncollecting garbage: instead, it is generally responsible for all memory\nmanagement, including allocation and deallocation.\nWhereas CPython uses a combination of malloc/free/refcounting to manage\nmemory, the PyPy GC uses a completely different approach. It is designed\nassuming that a dynamic language like Python behaves the following way:\n\n\nYou create, either directly or indirectly, lots of objects.\nMost of these objects are temporary and very short-lived. Think e.g. 
of\ndoing a + b + c: you need to allocate an object to hold the temporary\nresult of a + b, then it dies very quickly because you no longer need it\nwhen you do the final + c part.\nOnly small fraction of the objects survive and stay around for a while.\n\n\nSo, the strategy is: make allocation as fast as possible; make deallocation of\nshort-lived objects as fast as possible; find a way to handle the remaining\nsmall set of objects which actually survive long enough to be important.\nThis is done using a Generational GC: the basic idea is the following:\n\n\nWe have a nursery, where we allocate \"young objects\" very quickly.\nWhen the nursery is full, we start what we call a \"minor collection\".\nWe do a quick scan to determine the small set of objects which survived so\nfar\nWe move these objects out of the nursery, and we place them in the\narea of memory which contains the \"old objects\". Since the address of the\nobjects changes, we fix all the references to them accordingly.\n\n\n\n\nnow the nursery contains only objects which \"died young\". We can\ndiscard all of them very quickly, reset the nursery, and use the same area\nof memory to allocate new objects from now.\n\n\nIn practice, this scheme works very well and it is one of the reasons why PyPy\nis much faster than CPython. However, careful readers have surely noticed\nthat this is a problem for cpyext. On one hand, we have PyPy objects which\ncan potentially move and change their underlying memory address; on the other\nhand, we need a way to represent them as fixed-address PyObject* when we\npass them to C extensions. We surely need a way to handle that.\n\n\nPyObject* in PyPy\nAnother challenge is that sometimes, PyObject* structs are not completely\nopaque: there are parts of the public API which expose to the user specific\nfields of some concrete C struct. 
For example the definition of PyTypeObject\nwhich exposes many of the tp_* slots to the user.\nSince the low-level layout of PyPy W_Root objects is completely different\nthan the one used by CPython, we cannot simply pass RPython objects to C; we\nneed a way to handle the difference.\nSo, we have two issues so far: objects can move, and incompatible\nlow-level layouts. cpyext solves both by decoupling the RPython and the C\nrepresentations. We have two \"views\" of the same entity, depending on whether\nwe are in the PyPy world (the movable W_Root subclass) or in the C world\n(the non-movable PyObject*).\nPyObject* are created lazily, only when they are actually needed. The\nvast majority of PyPy objects are never passed to any C extension, so we don't\npay any penalty in that case. However, the first time we pass a W_Root to\nC, we allocate and initialize its PyObject* counterpart.\nThe same idea applies also to objects which are created in C, e.g. by calling\nPyObject_New(). At first, only the PyObject* exists and it is\nexclusively managed by reference counting. As soon as we pass it to the PyPy\nworld (e.g. as a return value of a function call), we create its W_Root\ncounterpart, which is managed by the GC as usual.\nHere we start to see why calling cpyext modules is more costly in PyPy than in\nCPython. We need to pay some penalty for all the conversions between\nW_Root and PyObject*.\nMoreover, the first time we pass a W_Root to C we also need to allocate\nthe memory for the PyObject* using a slowish \"CPython-style\" memory\nallocator. In practice, for all the objects which are passed to C we pay more\nor less the same costs as CPython, thus effectively \"undoing\" the speedup\nguaranteed by PyPy's Generational GC under normal circumstances.\n\n\nMaintaining the link between W_Root and PyObject*\nWe now need a way to convert between W_Root and PyObject* and\nvice-versa; also, we need to ensure that the lifetimes of the two entities\nare in sync. 
In particular:\n\n\nas long as the W_Root is kept alive by the GC, we want the\nPyObject* to live even if its refcount drops to 0;\nas long as the PyObject* has a refcount greater than 0, we want to\nmake sure that the GC does not collect the W_Root.\n\n\nThe PyObject* \u21e8 W_Root link is maintained by the special field\nob_pypy_link which is added to all PyObject*. On a 64 bit machine this\nmeans that all PyObject* have 8 bytes of overhead, but then the\nconversion is very quick, just reading the field.\nFor the other direction, we generally don't want to do the same: the\nassumption is that the vast majority of W_Root objects will never be\npassed to C, and adding an overhead of 8 bytes to all of them is a\nwaste. Instead, in the general case the link is maintained by using a\ndictionary, where W_Root are the keys and PyObject* the values.\nHowever, for a few selected W_Root subclasses we do maintain a\ndirect link using the special _cpy_ref field to improve performance. In\nparticular, we use it for W_TypeObject (which is big anyway, so a 8 bytes\noverhead is negligible) and W_NoneObject. None is passed around very\noften, so we want to ensure that the conversion to PyObject* is very\nfast. Moreover it's a singleton, so the 8 bytes overhead is negligible as\nwell.\nThis means that in theory, passing an arbitrary Python object to C is\npotentially costly, because it involves doing a dictionary lookup. 
We assume\nthat this cost will eventually show up in the profiler: however, at the time\nof writing there are other parts of cpyext which are even more costly (as we\nwill show later), so the cost of the dict lookup is never evident in the\nprofiler.\n\n\nCrossing the border between RPython and C\nThere are two other things we need to care about whenever we cross the border\nbetween RPython and C, and vice-versa: exception handling and the GIL.\nIn the C API, exceptions are raised by calling PyErr_SetString() (or one of\nmany other functions which have a similar effect), which basically works by\ncreating an exception value and storing it in some global variable. The\nfunction then signals that an exception has occurred by returning an error value,\nusually NULL.\nOn the other hand, in the PyPy interpreter, exceptions are propagated by raising the\nRPython-level OperationError exception, which wraps the actual app-level\nexception values. To harmonize the two worlds, whenever we return from C to\nRPython, we need to check whether a C API exception was raised and if so turn it\ninto an OperationError.\nWe won't dig into details of how the GIL is handled in cpyext.\nFor the purpose of this post, it is enough to know that whenever we enter\nC land, we store the current thread id into a global variable which is\naccessible also from C; conversely, whenever we go back from RPython to C, we\nrestore this value to 0.\nSimilarly, we need to do the inverse operations whenever you need to cross the\nborder between C and RPython, e.g. by calling a Python callback from C code.\nAll this complexity is automatically handled by the RPython function\ngeneric_cpy_call. 
If you look at the code you see that it takes care of 4\nthings:\n\n\nHandling the GIL as explained above.\nHandling exceptions, if they are raised.\nConverting arguments from W_Root to PyObject*.\nConverting the return value from PyObject* to W_Root.\n\n\nSo, we can see that calling C from RPython introduces some overhead.\nCan we measure it?\nAssuming that the conversion between W_Root and PyObject* has a\nreasonable cost (as explained by the previous section), the overhead\nintroduced by a single border-cross is still acceptable, especially if the\ncallee is doing some non-negligible amount of work.\nHowever this is not always the case. There are basically three problems that\nmake (or used to make) cpyext super slow:\n\n\nPaying the border-crossing cost for trivial operations which are called\nvery often, such as Py_INCREF.\nCrossing the border back and forth many times, even if it's not strictly\nneeded.\nPaying an excessive cost for argument and return value conversions.\n\n\nThe next sections explain in more detail each of these problems.\n\n\nAvoiding unnecessary roundtrips\nPrior to the 2017 Cape Town Sprint, cpyext was horribly slow, and we were\nwell aware of it: the main reason was that we never really paid too much\nattention to performance. As explained in the blog post, emulating all the\nCPython quirks is basically a nightmare, so better to concentrate on\ncorrectness first.\nHowever, we didn't really know why it was so slow. We had theories and\nassumptions, usually pointing at the cost of conversions between W_Root\nand PyObject*, but we never actually measured it.\nSo, we decided to write a set of cpyext microbenchmarks to measure the\nperformance of various operations. 
The result was somewhat surprising: the\ntheory suggests that when you do a cpyext C call, you should pay the\nborder-crossing costs only once, but what the profiler told us was that we\nwere paying the cost of generic_cpy_call several times more than what we expected.\nAfter a bit of investigation, we discovered this was ultimately caused by our\n\"correctness-first\" approach. For simplicity of development and testing, when\nwe started cpyext we wrote everything in RPython: thus, every single API call\nmade from C (like the omnipresent PyArg_ParseTuple(), PyInt_AsLong(), etc.)\nhad to cross back the C-to-RPython border. This was especially daunting for\nvery simple and frequent operations like Py_INCREF and Py_DECREF,\nwhich CPython implements as a single assembly instruction!\nAnother source of slow down was the implementation of PyTypeObject slots.\nAt the C level, these are function pointers which the interpreter calls to do\ncertain operations, e.g. tp_new to allocate a new instance of that type.\nAs usual, we have some magic to implement slots in RPython; in particular,\n_make_wrapper does the opposite of generic_cpy_call: it takes a\nRPython function and wraps it into a C function which can be safely called\nfrom C, handling the GIL, exceptions and argument conversions automatically.\nThis was very handy during the development of cpyext, but it might result in\nsome bad nonsense; consider what happens when you call the following C\nfunction:\nstatic PyObject* foo(PyObject* self, PyObject* args)\n{\n PyObject* result = PyInt_FromLong(1234);\n return result;\n}\n\n\nyou are in RPython and do a cpyext call to foo: RPython-to-C;\nfoo calls PyInt_FromLong(1234), which is implemented in RPython:\nC-to-RPython;\nthe implementation of PyInt_FromLong indirectly calls\nPyIntType.tp_new, which is a C function pointer: RPython-to-C;\nhowever, tp_new is just a wrapper around an RPython function, created\nby _make_wrapper: C-to-RPython;\nfinally, we create our RPython 
W_IntObject(1234); at some point\nduring the RPython-to-C crossing, its PyObject* equivalent is\ncreated;\nafter many layers of wrappers, we are again in foo: after we do\nreturn result, during the C-to-RPython step we convert it from\nPyObject* to W_IntObject(1234).\n\nPhew! After we realized this, it was not so surprising that cpyext was very\nslow :). And this was a simplified example, since we are not passing a\nPyObject* to the API call. When we do, we need to convert it back and\nforth at every step. Actually, I am not even sure that what I described was\nthe exact sequence of steps which used to happen, but you get the general\nidea.\nThe solution is simple: rewrite as much as we can in C instead of RPython,\nto avoid unnecessary roundtrips. This was the topic of most of the Cape Town\nsprint and resulted in the cpyext-avoid-roundtrip branch, which was\neventually merged.\nOf course, it is not possible to move everything to C: there are still\noperations which need to be implemented in RPython. For example, think of\nPyList_Append: the logic to append an item to a list is complex and\ninvolves list strategies, so we cannot replicate it in C. However, we\ndiscovered that a large subset of the C API can benefit from this.\nMoreover, the C API is huge. While we invented this new way of writing\ncpyext code, we still need to\nconvert many of the functions to the new paradigm. Sometimes the rewrite is\nnot automatic\nor straighforward. cpyext is a delicate piece of software, so it happens often\nthat we make a mistake and end up staring at a segfault in gdb.\nHowever, the most important takeaway is that the performance improvements we got\nfrom this optimization are impressive, as we will detail later.\n\n\nConversion costs\nThe other potential big source of slowdown is the conversion of arguments\nbetween W_Root and PyObject*.\nAs explained earlier, the first time you pass a W_Root to C, you need to\nallocate its PyObject* counterpart. 
Suppose you have a foo function\ndefined in C, which takes a single int argument:\nfor i in range(N):\n foo(i)\n\nTo run this code, you need to create a different PyObject* for each value\nof i: if implemented naively, it means calling N times malloc()\nand free(), which kills performance.\nCPython has the very same problem, which is solved by using a free list to\nallocate ints. So, what we did was to simply steal the code from CPython\nand do the exact same thing. This was also done in the\ncpyext-avoid-roundtrip branch, and the benchmarks show that it worked\nperfectly.\nEvery type which is converted often to PyObject* must have a very fast\nallocator. At the moment of writing, PyPy uses free lists only for ints and\ntuples: one of the next steps on our TODO list is certainly to use this\ntechnique with more types, like float.\nConversely, we also need to optimize the conversion from PyObject* to\nW_Root: this happens when an object is originally allocated in C and\nreturned to Python. Consider for example the following code:\nimport numpy as np\nmyarray = np.random.random(N)\nfor i in range(len(arr)):\n myarray[i]\n\nAt every iteration, we get an item out of the array: the return type is an\ninstance of numpy.float64 (a numpy scalar), i.e. a PyObject*: this is\nsomething which is implemented by numpy entirely in C, so completely\nopaque to cpyext. We don't have any control on how it is allocated,\nmanaged, etc., and we can assume that allocation costs are the same as on\nCPython.\nAs soon as we return these PyObject* to Python, we need to allocate\ntheir W_Root equivalent. 
If you do it in a small loop like in the example\nabove, you end up allocating all these W_Root inside the nursery, which is\na good thing since allocation is super fast (see the section above about the\nPyPy GC).\nHowever, we also need to keep track of the W_Root to PyObject* link.\nCurrently, we do this by putting all of them in a dictionary, but it is very\ninefficient, especially because most of these objects die young and thus it\nis wasted work to do that for them. Currently, this is one of the biggest\nunresolved problems in cpyext, and it is what causes the two microbenchmarks\nallocate_int and allocate_tuple to be very slow.\nWe are well aware of the problem, and we have a plan for how to fix it. The\nexplanation is too technical for the scope of this blog post as it requires a\ndeep knowledge of the GC internals to be understood, but the details are\nhere.\n\n\nC API quirks\nFinally, there is another source of slowdown which is beyond our control. Some\nparts of the CPython C API are badly designed and expose some of the\nimplementation details of CPython.\nThe major example is reference counting. The Py_INCREF / Py_DECREF API\nis designed in such a way that it forces other implementations to emulate\nrefcounting even in the presence of other GC management schemes, as explained\nabove.\nAnother example is borrowed references. There are API functions which do\nnot incref an object before returning it, e.g. PyList_GetItem(). This is\ndone for performance reasons because we can avoid a whole incref/decref pair,\nif the caller needs to handle the returned item only temporarily: the item is\nkept alive because it is in the list anyway.\nFor PyPy, this is a challenge: thanks to list strategies, lists are often\nrepresented in a compact way. For example, a list containing only integers is\nstored as a C array of long. How to implement PyList_GetItem? 
We\ncannot simply create a PyObject* on the fly, because the caller will never\ndecref it and it will result in a memory leak.\nThe current solution is very inefficient. The first time we do a\nPyList_GetItem, we convert the whole list to a list of\nPyObject*. This is bad in two ways: the first is that we potentially pay a\nlot of unneeded conversion cost in case we will never access the other items\nof the list. The second is that by doing that we lose all the performance\nbenefit granted by the original list strategy, making it slower for the\nrest of the pure-python code which will manipulate the list later.\nPyList_GetItem is an example of a bad API because it assumes that the list\nis implemented as an array of PyObject*: after all, in order to return a\nborrowed reference, we need a reference to borrow, don't we?\nFortunately, (some) CPython developers are aware of these problems, and there\nis an ongoing project to design a better C API which aims to fix exactly\nthis kind of problem.\nNonetheless, in the meantime we still need to implement the current\nhalf-broken APIs. There is no easy solution for that, and it is likely that\nwe will always need to pay some performance penalty in order to implement them\ncorrectly.\nHowever, what we could potentially do is to provide alternative functions\nwhich do the same job but are more PyPy friendly: for example, we could think\nof implementing PyList_GetItemNonBorrowed or something like that: then, C\nextensions could choose to use it (possibly hidden inside some macro and\n#ifdef) if they want to be fast on PyPy.\n\n\nCurrent performance\nDuring the whole blog post we claimed cpyext is slow. How\nslow it is, exactly?\nWe decided to concentrate on microbenchmarks for now. 
It should be evident\nby now there are simply too many issues which can slow down a cpyext\nprogram, and microbenchmarks help us to concentrate on one (or few) at a\ntime.\nThe microbenchmarks measure very simple things, like calling functions and\nmethods with the various calling conventions (no arguments, one arguments,\nmultiple arguments); passing various types as arguments (to measure conversion\ncosts); allocating objects from C, and so on.\nHere are the results from the old PyPy 5.8 relative and normalized to CPython\n2.7, the lower the better:\n\n\n\n\n\n\n\n\n\n\n\nPyPy was horribly slow everywhere, ranging from 2.5x to 10x slower. It is\nparticularly interesting to compare simple.noargs, which measures the cost\nof calling an empty function with no arguments, and simple.onearg(i),\nwhich measures the cost calling an empty function passing an integer argument:\nthe latter is ~2x slower than the former, indicating that the conversion cost\nof integers is huge.\nPyPy 5.8 was the last release before the famous Cape Town sprint, when we\nstarted to look at cpyext performance seriously. Here are the performance data for\nPyPy 6.0, the latest release at the time of writing:\n\n\n\n\nThe results are amazing! PyPy is now massively faster than before, and for\nmost benchmarks it is even faster than CPython: yes, you read it correctly:\nPyPy is faster than CPython at doing CPython's job, even considering all the\nextra work it has to do to emulate the C API. 
This happens thanks to the JIT,\nwhich produces speedups high enough to counterbalance the slowdown caused by\ncpyext.\nThere are two microbenchmarks which are still slower though: allocate_int\nand allocate_tuple, for the reasons explained in the section about\nConversion costs.\n\n\nNext steps\nDespite the spectacular results we got so far, cpyext is still slow enough to\nkill performance in most real-world code which uses C extensions extensively\n(e.g., the omnipresent numpy).\nOur current approach is something along these lines:\n\n\nrun a real-world small benchmark which exercises cpyext\nmeasure and find the major bottleneck\nwrite a corresponding microbenchmark\noptimize it\nrepeat\n\n\nOn one hand, this is a daunting task because the C API is huge and we need to\ntackle functions one by one. On the other hand, not all the functions are\nequally important, and it is enough to optimize a relatively small subset to\nimprove many different use cases.\nWhere a year ago we announced we have a working answer to run c-extension in\nPyPy, we now have a clear picture of what are the performance bottlenecks, and\nwe have developed some technical solutions to fix them. It is \"only\" a matter\nof tackling them, one by one. It is worth noting that most of the work was\ndone during two sprints, for a total of 2-3 person-months of work.\nWe think this work is important for the Python ecosystem. PyPy has established\na baseline for performance in pure python code, providing an answer for the\n\"Python is slow\" detractors. The techniques used to make cpyext performant\nwill let PyPy become an alternative for people who mix C extensions with\nPython, which, it turns out, is just about everyone, in particular those using\nthe various scientific libraries. Today, many developers are forced to seek\nperformance by converting code from Python to a lower-level language. 
We feel there\nis no reason to do this, but in order to prove it we must be able to run both\ntheir python and their C extensions performantly, then we can begin to educate\nthem how to write JIT-friendly code in the first place.\nWe envision a future in which you can run arbitrary Python programs on PyPy,\nwith the JIT speeding up the pure Python parts and the C parts running as fast\nas today: the best of both worlds!", + "tags": "cpyext,profiling,speed", + "url": "https://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.html" + }, + { + "title": "The First 15 Years of PyPy \u2014 a Personal Retrospective", + "text": "A few weeks ago I (=Carl Friedrich Bolz-Tereick) gave a keynote at ICOOOLPS in\nAmsterdam with the above title. I was very happy to have been given that\nopportunity, since a number of our papers have been published at ICOOOLPS,\nincluding the very first one I published when I'd just started my PhD. I decided\nto turn the talk manuscript into a (longish) blog post, to make it available to a wider audience.\nNote that this blog post describes my personal recollections and research, it is\nthus necessarily incomplete and coloured by my own experiences.\nPyPy has turned 15 years old this year, so I decided that that's a good reason\nto dig into and talk about the history of the project so far. I'm going to do\nthat using the lens of how performance developed over time, which is from\nsomething like 2000x slower than CPython, to roughly 7x faster. In this post\nI am going to present the history of the project, and also talk about some\nlessons that we learned.\nThe post does not make too many assumptions about any prior knowledge of what\nPyPy is, so if this is your first interaction with it, welcome! 
I have tried to\nsprinkle links to earlier blog posts and papers into the writing, in case you\nwant to dive deeper into some of the topics.\nAs a disclaimer, in this post I am going to mostly focus on ideas, and not\nexplain who had or implemented them. A huge number of people contributed to the\ndesign, the implementation, the funding and the organization of PyPy over the\nyears, and it would be impossible to do them all justice.\n\nContents\n\n2003: Starting the Project\n2003: Implementing the Interpreter\nEarly organizational ideas\n2004-2007: EU-Funding\n2005: Bootstrapping PyPy\nRPython's Modularity Problems\n2006: The Meta-JIT\nThe First JIT Generator\nPromote\nVirtuals\nJIT Status 2007\n2007: RSqueak and other languages\n2008-2009: Four More JIT Generators\n2009: Meta-Tracing\nWhy did we Abandon Partial Evaluation?\n2009-2011: The PyJIT Eurostars Project\nTracing JIT improvements\n2010: speed.pypy.org\nContinuous Integration\n2010: Implementing Python Objects with Maps\n2011: Container Storage Strategies\nDeep Changes in the Runtime are Necessary\nJIT Status 2011\n2012-2017: Engineering and Incremental Progress\nCPyExt\nPython 3\nIncentives of OSS compared to Academia\nMeta-Tracing really works!\nAcknowledgements\n\n\n\n2003: Starting the Project\nOn the technical level PyPy is a Python interpreter written in Python, which is\nwhere the name comes from. It also has an automatically generated JIT compiler,\nbut I'm going to introduce that gradually over the rest of the blog post, so\nlet's not worry about it too much yet. On the social level PyPy is an\ninteresting mixture of an open source project, that sometimes had research done\nin it.\nThe project got started in late 2002 and early 2003. To set the stage, at that\npoint Python was a significantly less popular language than it is today. 
Python\n2.2 was the version at the time, Python didn't even have a bool type yet.\nIn fall 2002 the PyPy project was started by a number of Python programmers on a\nmailing list who said\nsomething like (I am exaggerating somewhat) \"Python is the greatest most\nwonderful most perfect language ever, we should use it for absolutely\neverything. Well, what aren't we using it for? The Python virtual machine itself\nis written in C, that's bad. Let's start a project to fix that.\"\nOriginally that project was called \"minimal python\", or \"ptn\", later gradually\nrenamed to PyPy. Here's the mailing list post to announce the project more\nformally:\nMinimal Python Discussion, Coding and Sprint\n--------------------------------------------\n\nWe announce a mailinglist dedicated to developing\na \"Minimal Python\" version. Minimal means that\nwe want to have a very small C-core and as much\nas possible (re)implemented in python itself. This\nincludes (parts of) the VM-Code.\nWhy would that kind of project be useful? Originally it wasn't necessarily meant\nto be useful as a real implementation at all, it was more meant as a kind of\nexecutable explanation of how Python works, free of the low level details of\nCPython. But pretty soon there were then also plans for how the virtual machine\n(VM) could be bootstrapped to be runnable without an existing Python\nimplementation, but I'll get to that further down.\n\n\n\n\n2003: Implementing the Interpreter\nIn early 2003 a group of Python people met in Hildesheim (Germany) for the first\nof many week long development sprints, organized by Holger Krekel. During that\nweek a group of people showed up and started working on the core interpreter.\nIn May 2003 a second sprint was organized by Laura Creighton and Jacob Hal\u00e9n in\nGothenburg (Sweden). 
And already at that sprint enough of the Python bytecodes\nand data structures were implemented to make it possible to run a program that\ncomputed how much money everybody had to pay for the food bills of the week. And\neverybody who's tried that for a large group of people knows that that\u2019s an\namazingly complex mathematical problem.\nIn the next two years, the project continued as a open source project with\nvarious contributors working on it in their free time, and meeting for the\noccasional sprint. In that time, the rest of the core interpreter and the core\ndata types were implemented.\nThere's not going to be any other code in this post, but to give a bit of a\nflavor of what the Python interpreter at that time looked like, here's the\nimplementation of the DUP_TOP bytecode after these first sprints. As you can\nsee, it's in Python, obviously, and it has high level constructs such as method\ncalls to do the stack manipulations:\ndef DUP_TOP(f):\n w_1 = f.valuestack.top()\n f.valuestack.push(w_1)\nHere's the early code for integer addition:\ndef int_int_add(space, w_int1, w_int2):\n x = w_int1.intval\n y = w_int2.intval\n try:\n z = x + y\n except OverflowError:\n raise FailedToImplement(space.w_OverflowError,\n space.wrap(\"integer addition\"))\n return W_IntObject(space, z)\n(the current implementations look slightly but not fundamentally different.)\n\n\n\n\nEarly organizational ideas\nSome of the early organizational ideas of the project were as follows. Since the\nproject was started on a sprint and people really liked that style of working\nPyPy continued to be developed on various subsequent sprints.\nFrom early on there was a very heavy emphasis on testing. All the parts of the\ninterpreter that were implemented had a very careful set of unit tests to make\nsure that they worked correctly. 
From early on, there was a continuous\nintegration infrastructure, which grew over time (nowadays it is very natural\nfor people to have automated tests, and the concept of green/red builds: but\nembracing this workflow in the early 2000s was not really mainstream yet, and\nit is probably one of the reasons behind PyPy's success).\nAt the sprints there was also an emphasis on doing pair programming to make\nsure that everybody understood the codebase\nequally. There was also a heavy emphasis on writing good code and on regularly\ndoing refactorings to make sure that the codebase remained nice, clean and\nunderstandable. Those ideas followed from the early thoughts that PyPy would be\na sort of readable explanation of the language.\nThere was also a pretty fundamental design decision made at the time. That was\nthat the project should stay out of language design completely. Instead it would\nfollow CPython's lead and behave exactly like that implementation in all cases.\nThe project therefore committed to being almost quirk-to-quirk compatible and to\nimplement even the more obscure (and partially unnecessary) corner cases of\nCPython.\nAll of these principles continue pretty much still today (There are a few places\nwhere we had to deviate from being completely compatible, they are documented\nhere).\n\n\n\n\n2004-2007: EU-Funding\nWhile all this coding was going on it became clear pretty soon that the goals\nthat various participants had for the project would be very hard to achieve with\njust open source volunteers working on the project in their spare time.\nParticularly also the sprints became expensive given that those were just\nvolunteers doing this as a kind of weird hobby. Therefore a couple of people of\nthe project got together to apply for an EU grant in the framework programme 6\nto solve these money problems. 
In mid-2004 that application proved to be\nsuccessful.\u00a0And so the project got a grant of 1.3 million Euro for\ntwo years to be able to employ some of the core developers and to make it\npossible for them to work on the project full time. The EU grant went to seven\nsmall-to-medium companies and Uni D\u00fcsseldorf. The budget also contained money to\nfund sprints, both for the employed core devs as well as other open source\ncontributors.\n\nThe EU project started in December 2004 and that was a fairly heavy change in\npace for the project. Suddenly a lot of people were working full time on it, and\nthe pace and the pressure picked up quite a lot. Originally it had been a\nleisurely project people worked on for fun. But afterwards people discovered\nthat doing this kind of work full time becomes slightly less fun, particularly\nalso if you have to fulfill the ambitious technical goals that the EU proposal\ncontained. And the proposal indeed contained a bit of everything to increase its\nchance of acceptance, such as aspect oriented programming, semantic web, logic\nprogramming, constraint programming, and so on. Unfortunately it\nturned out that those things then have to be implemented, which can be called\nthe first thing we learned: if you promise something to the EU, you'll have to\nactually go do it (After the funding ended, a lot of these features were\nactually removed from the project again, at a cleanup sprint).\n\n\n\n\n2005: Bootstrapping PyPy\nSo what were the actually useful things done as part of the EU project?\nOne of the most important goals that the EU project was meant to solve was the\nquestion of how to turn PyPy into an actually useful VM for Python. The\nbootstrapping plans were taken quite directly from Squeak, which is a Smalltalk\nVM written in a subset of Smalltalk called Slang, which can then be bootstrapped\nto C code. 
The plan for PyPy was to do something similar, to define a restricted\nsubset of Python called RPython, restricted in such a way that it should be\npossible to statically compile RPython programs to C code. Then the Python\ninterpreter should only use that subset, of course.\nThe main difference from the Squeak approach is that Slang, the subset of Squeak\nused there, is actually quite a low level language. In a way, you could almost\ndescribe it as C with Smalltalk syntax. RPython was really meant to be a\nmuch higher level language, much closer to Python, with full support for single\ninheritance classes, and most of Python's built-in data structures.\n\n\n(BTW, you don\u2019t have to understand any of the illustrations in this blog post,\nthey are taken from talks and project reports we did over the years so they are\nof archaeological interest only and I don\u2019t understand most of them myself.)\nFrom 2005 on, work on the RPython type inference engine and C backend started in\nearnest, which was sort of co-developed with the RPython language definition and\nthe PyPy Python interpreter. This is also roughly the time that I joined the\nproject as a volunteer.\nAnd at the second sprint I went to, in July 2005, two and a half years after the\nproject got started, we managed to bootstrap the PyPy interpreter to C for the\nfirst time. When we ran the compiled program, it of course immediately\nsegfaulted. The reason for that was that the C backend had turned characters\ninto signed chars in C, while the rest of the infrastructure assumed that they\nwere unsigned chars. After we fixed that, the second attempt worked and we\nmanaged to run an incredibly complex program, something like 6 * 7. 
That\nfirst bootstrapped version was really really slow, a couple of hundred times\nslower than CPython.\n\n\nThe bootstrapping process of RPython has a number of nice benefits, a big one\nbeing that a number of the properties of the generated virtual machine don't\nhave to be expressed in the interpreter. The biggest example of this is garbage\ncollection. RPython is a garbage collected language, and the interpreter does\nnot have to care much about GC in most cases. When the C source code is\ngenerated, a GC is automatically inserted. This is a source of great\nflexibility. Over time we experimented with a number of different GC\napproaches, from reference counting to Boehm to our current incremental\ngenerational collector. As an aside, for a long time we were also working on\nother backends to the RPython language and hoped to be able to target Java and\n.NET as well. Eventually we abandoned this strand of work, however.\n\n\n\n\nRPython's Modularity Problems\nNow we come to the first thing I would say we learned in the project, which is\nthat the quality of tools we thought of as internal things still matters a lot.\nOne of the biggest technical mistakes we've made in the project was that we\ndesigned RPython without any kind of story for modularity. There is no concept\nof modules in the language or any other way to break up programs into smaller\ncomponents. We always thought that it would be ok for RPython to be a little bit\ncrappy. It was meant to be this sort of internal language with not too many\nexternal users. And of course that turned out to be completely wrong later.\nThat lack of modularity led to various problems that persist until today. The\nbiggest one is that there is no separate compilation for RPython programs at\nall! 
You always need to compile all the parts of your VM together, which leads\nto infamously bad compilation times.\nAlso by not considering the modularity question we were never forced to fix\nsome internal structuring issues of the RPython compiler itself.\nVarious layers of the compiler keep very badly defined and porous interfaces between\nthem. This was made possible by being able to work with all the program information in one heap,\nmaking the compiler less approachable and maintainable than it maybe could be.\nOf course this mistake just got more and more costly to fix over time,\nand so it means that so far nobody has actually done it.\nNot thinking more carefully about RPython's design, particularly its\nmodularity story, is in my opinion the biggest technical mistake the project\nmade.\n\n\n\n\n2006: The Meta-JIT\nAfter successfully bootstrapping the VM we did some fairly straightforward\noptimizations on the interpreter and the C backend and managed to reduce the\nslowdown versus CPython to something like 2-5 times slower. That's great! But of\ncourse not actually useful in practice. So where do we go from here?\nOne of the not so secret goals of Armin Rigo, one of the PyPy founders, was to\nuse PyPy together with some advanced partial evaluation magic sauce to\nsomehow automatically generate a JIT compiler from the interpreter. The goal was\nsomething like, \"you write your interpreter in RPython, add a few annotations\nand then we give you a JIT for free for the language that that interpreter\nimplements.\"\nWhere did the wish for that approach come from, why not just write a JIT for\nPython manually in the first place? Armin had actually done just that before he\nco-founded PyPy, in a project called Psyco. Psyco was an extension module for\nCPython that contained a method-based JIT compiler for Python code.\u00a0And Psyco\nproved to be an amazingly frustrating compiler to write. There were two main\nreasons for that. 
The first reason was that Python is actually quite a complex\nlanguage underneath its apparent simplicity. The second reason for the\nfrustration was that Python was and is very much an alive language, that gains\nnew features in the language core in every version. So every time a new Python\nversion came out, Armin had to do fundamental changes and rewrites to Psyco, and\nhe was getting pretty frustrated with it. So he hoped that that effort could be\ndiminished by not writing the JIT for PyPy by hand at all. Instead, the goal was\nto generate a method-based JIT from the interpreter automatically. By taking the\ninterpreter, and applying a kind of advanced transformation to it, that would\nturn it into a method-based JIT. And all that would still be translated into a\nC-based VM, of course.\n\nSlide from Psyco presentation at EuroPython 2002\n\n\n\n\nThe First JIT Generator\nFrom early 2006 on until the end of the EU project a lot of work went into\nwriting such a JIT generator. The idea was to base it on runtime partial\nevaluation. Partial evaluation is an old idea in computer science. It's supposed\nto be a way to automatically turn interpreters for a language into a compiler\nfor that same language. Since PyPy was trying to generate a JIT compiler, which\nis in any case necessary to get good performance for a dynamic language like\nPython, the partial evaluation was going to happen at runtime.\nThere are various ways to look at partial evaluation, but if you've never heard\nof it before, a simple way to view it is that it will compile a Python function\nby gluing\u00a0together the implementations of the bytecodes of that function and\noptimizing the result.\nThe main new ideas of PyPy's partial-evaluation based JIT generator as opposed\nto earlier partial-evaluation approaches are the ideas of \"promote\" and the idea\nof \"virtuals\". 
Both of these techniques had already been present (in a slightly\nless general form) in Psyco, and the goal was to keep using them in PyPy. Both\nof these techniques also still remain in use today in PyPy. I'm\ngoing on a slight technical diversion now, to give a high level explanation of\nwhat those ideas are for.\n\n\n\n\n\nPromote\nOne important ingredient of any JIT compiler is the ability to do runtime\nfeedback. Runtime feedback is most commonly used to know something about which\nconcrete types are used by a program in practice. Promote is basically a way to\neasily introduce runtime feedback into the JIT produced by the JIT generator.\nIt's an annotation the implementer of a language can use to express their wish\nthat specialization should happen at this point. This mechanism can be used to\nexpress all kinds of runtime feedback, moving values from the interpreter\ninto the compiler, whether they be types or other things.\n\n\n\n\nVirtuals\nVirtuals are a very aggressive form of partial escape analysis. A dynamic\nlanguage often puts a lot of pressure on the garbage collector, since most\nprimitive types (like integers, floats and strings) are boxed in the heap, and\nnew boxes are allocated all the time.\nWith the help of virtuals a very significant portion of all allocations in the\ngenerated machine code can be completely removed. 
Even if they can't be removed,\noften the allocation can be delayed or moved into an error path, or even\ninto a deoptimization path, and thus disappear from the generated machine code\ncompletely.\nThis optimization really is the super-power of PyPy's optimizer, since it\ndoesn't work only for primitive boxes but for any kind of object allocated on\nthe heap with a predictable lifetime.\nAs an aside, while this kind of partial escape analysis is sort of new for\nobject-oriented languages, it has actually existed in Prolog-based partial\nevaluation systems since the 80s, because it's just extremely natural there.\n\n\n\n\nJIT Status 2007\nSo, back to our history. We're now in 2007, at the end of the EU project (you\ncan find the EU-reports we wrote during the projects here). The EU project\nsuccessfully finished, we survived the final review with the EU. So, what's the\n2007 status of the JIT generator? It works kind of, it can be applied to PyPy. It\nproduces a VM with a JIT that will turn Python code into machine code at runtime\nand run it. However, that machine code is not particularly fast. Also, it tends\nto generate many megabytes of machine code even for small Python programs. While\nit's always faster than PyPy without JIT, it's only sometimes faster than\nCPython, and most of the time Psyco still beats it. On the one hand, this is\nstill an amazing achievement! It's arguably the biggest application of partial\nevaluation at this point in time! On the other hand, it was still quite\ndisappointing in practice, particularly since some of us had believed at the\ntime that it should have been possible to reach and then surpass the speed of\nPsyco with this approach.\n\n\n\n\n2007: RSqueak and other languages\nAfter the EU project ended we did all kinds of things. Like sleep for a month\nfor example, and have the cleanup sprint that I already mentioned. 
We also had a\nslightly unusual sprint in Bern, with members of the Software Composition\nGroup of Oscar Nierstrasz. As I wrote above, PyPy had been heavily influenced\nby Squeak Smalltalk, and that group is a heavy user of Squeak, so we wanted to\nsee how to collaborate with them. At the beginning of the sprint, we decided\ntogether that the goal of that week should be to try to write a Squeak virtual\nmachine in RPython, and at the end of the week we'd gotten surprisingly far with\nthat goal. Basically most of the bytecodes and the Smalltalk object system\nworked, we had written an image loader and could run some benchmarks (during the\nsprint we also regularly updated a blog, the success of which led us to start\nthe PyPy blog).\n\n\nThe development of the Squeak interpreter was very interesting for the project,\nbecause it was the first real step that moved RPython from being an\nimplementation detail of PyPy to being a more interesting project in its own right.\nBasically a language to write interpreters in, with the eventual promise to get\na JIT for that language almost for free. That Squeak implementation is now\ncalled RSqueak (\"Research Squeak\").\nI'll not go into more details about any of the other language implementations in\nRPython in this post, but over the years we've had a large variety\nof them done by various people and groups, most of them as research vehicles,\nbut also some as real language implementations. Some very cool research results\ncame out of these efforts, here's a slightly outdated list of some of them.\nThe use of RPython for other languages complicated the PyPy narrative a lot, and\nin a way we never managed to recover the simplicity of the original project\ndescription \"PyPy is Python in Python\". Because now it's something like \"we have\nthis somewhat strange language, a subset of Python, that's called RPython, and\nit's good to write interpreters in. And if you do that, we'll give you a JIT for\nalmost free. 
And also, we used that language to write a Python implementation,\ncalled PyPy.\". It just doesn't roll off the tongue as nicely.\n\n\n\n\n2008-2009: Four More JIT Generators\nBack to the JIT. After writing the first JIT generator as part of the EU\nproject, with somewhat mixed results, we actually wrote several more JIT\ngenerator prototypes with different architectures to try to solve some of the\nproblems of the first approach. To give an impression of these prototypes,\nhere\u2019s a list of them.\n\nThe second JIT generator we started working on in 2008 behaved exactly like\nthe first one, but had a meta-interpreter based architecture, to make it more\nflexible and easier to experiment with. The meta-interpreter was called\nthe \"rainbow interpreter\", and in general the JIT is an area where we went\nsomewhat overboard with borderline silly terminology, with notable\noccurrences of \"timeshifter\", \"blackhole interpreter\" etc.\nThe third JIT generator was an experiment based on the second one which\nchanged\ncompilation strategy. While the previous two had compiled many control flow\npaths of the currently compiled function eagerly, that third JIT was sort of\nmaximally lazy and stopped compilation at every control flow split to avoid\nguessing which path would actually be useful later when executing the code.\nThis was an attempt to reduce the problem of the first JIT generating way too\nmuch machine code. Only later, when execution went down one of the not yet\ncompiled paths would it continue compiling more code. This gives an effect\nsimilar to that of lazy basic block versioning.\nThe fourth JIT generator was a pretty strange prototype, a runtime partial\nevaluator for Prolog, to experiment with various specialization trade-offs. It\nhad an approach that we gave a not at all humble name, called \"perfect\nspecialization\".\nThe fifth JIT generator is the one that we are still using today. 
Instead of\ngenerating a method-based JIT compiler from our interpreter we switched to\ngenerating a tracing JIT compiler. Tracing JIT compilers were sort of the\nlatest fashion at the time, at least for a little while.\n\n\n\n\n\n2009: Meta-Tracing\nSo, how did that tracing JIT generator work? A tracing JIT generates code by\nobserving and logging the execution of the running program. This yields a\nstraight-line trace of operations, which are then optimized and compiled into\nmachine code.\u00a0Of course most tracing systems mostly focus on tracing loops.\nAs we discovered, it's actually quite simple to apply a tracing JIT to a generic\ninterpreter, by not tracing the execution of the user program directly, but by\ninstead tracing the execution of the interpreter while it is running the user\nprogram (here's the paper we wrote about this approach).\nSo that's what we implemented. Of course we kept the two successful parts of the\nfirst JIT, promote and virtuals (both links go to the papers about these\nfeatures in the meta-tracing context).\n\n\n\n\n\nWhy did we Abandon Partial Evaluation?\nSo one question I get sometimes asked when telling this story is, why did\nwe think that tracing would work better than partial evaluation (PE)? One of the\nhardest parts of compilers in general and partial evaluation based systems in\nparticular is the decision when and how much to inline, how much to specialize,\nas well as the decision when to split control flow paths. In the PE based JIT\ngenerator we never managed to control that question. Either the JIT would\ninline too much, leading to useless compilation of all kinds of unlikely error\ncases. Or it wouldn't inline enough, preventing necessary optimizations.\nMeta tracing solves this problem with a hammer, it doesn't make particularly\ncomplex inlining decisions at all. It instead decides what to inline by\nprecisely following what a real execution through the program is doing. 
Its\ninlining decisions are therefore very understandable and predictable, and it\nbasically only has one heuristic based on whether the called function contains a\nloop or not: If the called function contains a loop, we'll never inline it, if\nit doesn't we always try to inline it. That predictability is actually what was\nthe most helpful, since it makes it possible for interpreter authors to\nunderstand why the JIT did what it did and to actually influence its inlining\ndecisions by changing the annotations in the interpreter source. It turns out\nthat simple is better than complex.\n\n\n\n\n2009-2011: The PyJIT Eurostars Project\nWhile we were writing all these JIT prototypes, PyPy had sort of reverted back\nto being a volunteer-driven open source project (although some of us, like\nAntonio Cuni and I, had started working for universities and other project\nmembers had other sources of funding). But again, while we did the work it\nbecame clear that to get an actually working fast PyPy with generated JIT we\nwould need actual funding again for the project. So we applied to the EU again,\nthis time for a much smaller project with less money, in the Eurostars\nframework. We got a grant for three participants, merlinux, OpenEnd and Uni\nD\u00fcsseldorf, on the order of a bit more than half a million euro. That money was\nspecifically for JIT development and JIT testing infrastructure.\n\n\n\n\n\nTracing JIT improvements\nWhen writing the grant we had sat together at a sprint and discussed extensively\nand decided that we would not switch JIT generation approaches any more. We all\nliked the tracing approach well enough and thought it was promising. So instead\nwe agreed to try in earnest to make the tracing JIT really practical. 
So in the\nEurostars project we started with implementing sort of fairly standard JIT\ncompiler optimizations for the meta-tracing JIT, such as:\n\nconstant folding\ndead code elimination\nloop invariant code motion (using LuaJIT's approach)\nbetter heap optimizations\nfaster deoptimization (which is actually a bit of a mess in the\nmeta-approach)\nand dealing more efficiently with Python frames objects and the\nfeatures of Python's debugging facilities\n\n\n\n\n\n2010: speed.pypy.org\nIn 2010, to make sure that we wouldn't accidentally introduce speed regressions\nwhile working on the JIT, we implemented infrastructure to build PyPy and run\nour benchmarks nightly. Then, the https://speed.pypy.org website was implemented\nby Miquel Torres, a volunteer. The website shows the changes in benchmark\nperformance compared to the previous n days. It didn't sound too important at\nfirst, but this was (and is) a fantastic tool, and an amazing motivator over the\nnext years, to keep continually improving performance.\n\n\n\n\n\nContinuous Integration\nThis actually leads me to something else that I'd say we learned, which is that\ncontinuous integration is really awesome, and completely transformative to have\nfor a project. This is not a particularly surprising insight nowadays in the\nopen source community, it's easy to set up continuous integration on Github\nusing Travis or some other CI service. But I still see a lot of research\nprojects that don't have tests, that don't use CI, so I wanted to mention it\nanyway. As I mentioned earlier in the post, PyPy has a quite serious testing\nculture, with unit tests written for new code, regression tests for all bugs,\nand integration tests using the CPython test suite. Those tests are run\nnightly on a number of architectures and operating systems.\nHaving all this kind of careful testing is of course necessary, since PyPy is\nreally trying to be a Python implementation that people actually use, not just\nwrite papers about. 
But having all this infrastructure also had other benefits,\nfor example it allows us to trust newcomers to the project very quickly.\nBasically after your first patch gets accepted, you immediately get commit\nrights to the PyPy repository. If you screw up, the tests (or the code reviews)\nare probably going to catch it, and that reduction to the barrier to\ncontributing is just super great.\nThis concludes my advertisement for testing in this post.\n\n\n\n\n2010: Implementing Python Objects with Maps\nSo, what else did we do in the Eurostars project, apart from adding traditional\ncompiler optimizations to the tracing JIT and setting up CI infrastructure?\nAnother strand of work, that went on sort of concurrently to the JIT generator\nimprovements, were deep rewrites in the Python runtime, and the Python data\nstructures. I am going to write about two exemplary ones here, maps and storage strategies.\nThe first such rewrite is fairly standard. Python instances are similar to\nJavascript objects, in that you can add arbitrary attributes to them at runtime.\nOriginally Python instances were backed by a dictionary in PyPy, but of course\nin practice most instances of the same class have the same set of attribute\nnames. Therefore we went and implemented Self style maps, which are often\ncalled hidden classes in the JS world to represent instances instead. This\nhas two big benefits, it allows you to generate much better machine code for\ninstance attribute access and makes instances use a lot less memory.\n\n\n\n\n\n2011: Container Storage Strategies\nAnother important change in the PyPy runtime was rewriting the Python container\ndata structures, such as lists, dictionaries and sets. A fairly straightforward\nobservation about how those are used is that in a significant percentage of\ncases they contain type-homogeneous data. As an example it's quite common to\nhave lists of only integers, or lists of only strings. 
So we changed the list,\ndict and set implementations to use something we called storage strategies. With\nstorage strategies these data structures use a more efficient representation if\nthey contain only primitives of the same type, such as ints, floats, strings.\nThis makes it possible to store the values without boxing them in the underlying\ndata structure. Therefore read and write access are much faster for such type\nhomogeneous containers. Of course when later another data type gets added to\nsuch a list, the existing elements need to all be boxed at that point, which is\nexpensive. But we did a study and found out that that happens quite rarely in\npractice. A lot of that work was done by Lukas Diekmann.\n\n\n\n\n\nDeep Changes in the Runtime are Necessary\nThese two are just two examples for a number of fairly fundamental changes in\nthe PyPy runtime and PyPy data structures, probably the two most important ones,\nbut we did many others. That leads me to another thing we learned. If you want\nto generate good code for a complex dynamic language such as Python, it's\nactually not enough at all to have a good code generator and good compiler\noptimizations. That's not going to help you, if your runtime data-structures\naren't in a shape where it's possible to generate efficient machine code to\naccess them.\nMaybe this is well known in the VM and research community. However it's the main\nmistake that in my opinion every other Python JIT effort has made in the last 10\nyears, where most projects said something along the lines of \"we're not\nchanging the existing CPython data structures at all, we'll just let LLVM\ninline enough C code of the runtime and then it will optimize all the overhead\naway\". That never works very well.\n\n\n\n\nJIT Status 2011\nSo, here we are at the end of the Eurostars project, what's the status of the JIT? Well, it\nseems this meta-tracing stuff really works! 
We finally started actually\nbelieving in it, when we reached the point in 2010 where self-hosting PyPy was\nactually faster than bootstrapping the VM on CPython. Speeding up the\nbootstrapping process is something that Psyco never managed at all, so we\nconsidered this a quite important achievement. At the end of\nEurostars, we were about 4x faster than CPython on our set of benchmarks.\n\n\n\n\n2012-2017: Engineering and Incremental Progress\nIn 2012 the Eurostars project was finished and PyPy reverted yet another time back\nto be an open source project. From then on, we've had a more diverse set of\nsources of funding: we received some crowd funding via the Software Freedom\nConservancy and contracts of various sizes from companies to implement various\nspecific features, often handled by Baroque Software. Over the next couple of\nyears\nwe revamped various parts of the VM. We improved the GC in major ways. We\noptimized the implementation of the JIT compiler to improve warmup times. We\nimplemented backends for various CPU architectures (including PowerPC and\ns390x). We tried to reduce the number of performance cliffs and make the JIT\nuseful in a broader set of cases.\nAnother strand of work was to push quite significantly to be more\ncompatible with CPython, particularly the Python 3 line as well as extension\nmodule support. Other compatibility improvements we did were making sure that\nvirtualenv works with PyPy, better support for distutils and setuptools and\nsimilar improvements. The continually improving performance as well as better\ncompatibility with the ecosystem tools led to the first few users of PyPy in\nindustry.\n\n\n\n\nCPyExt\nAnother very important strand of work that took a lot of effort in recent years\nwas CPyExt. 
One of the main blockers of PyPy adoption had always been the fact\nthat a lot of people need specific C-extension modules at least in some parts of\ntheir program, and telling them to reimplement everything in Python is just not\na practical solution. Therefore we worked on CPyExt, an emulation layer to make\nit possible to run CPython C-extension modules in PyPy. Doing that was a very\npainful process, since the CPython extension API leaks a lot of CPython\nimplementation details, so we had to painstakingly emulate all of these details\nto make it possible to run extensions. That this works at all remains completely\namazing to me! But nowadays CPyExt is even getting quite good, a lot of the big\nnumerical libraries such as Numpy and Pandas are now supported (for a while\nwe had worked hard on a reimplementation of Numpy called NumPyPy, but\neventually realized that it would never be complete and useful enough).\nHowever, calling CPyExt modules from PyPy can still be very slow,\nwhich makes it impractical for some applications;\nthat's why we are working on it.\nNot thinking about C-extension module emulation earlier in the project history\nwas a pretty bad strategic mistake. It had been clear for a long time that\ngetting people to just stop using all their C-extension modules was never going\nto work, despite our efforts to give them alternatives, such as cffi. So we\nshould have thought of a story for all the existing C-extension modules earlier\nin the project. Not starting CPyExt earlier was mostly a failure of our\nimagination (and maybe a too high pain threshold): We didn't believe this kind\nof emulation was going to be practical, until somebody went and tried it.\n\n\n\n\nPython 3\nAnother main\nfocus of the last couple of years has been to catch up with the CPython 3 line.\nOriginally we had ignored Python 3 for a little bit too long, and were trailing\nseveral versions behind. 
In 2016 and 2017 we had a grant from the Mozilla open\nsource support program of $200'000 to be able to catch up with Python 3.5. This\nwork is now basically done, and we are starting to target CPython 3.6 and will\nhave to look into 3.7 in the near future.\n\n\n\n\nIncentives of OSS compared to Academia\nSo, what can be learned from those more recent years? One thing we can observe\nis that a lot of the engineering work we did in that time is not really science\nas such. A lot of the VM techniques we implemented are kind of well known, and\ncatching up with new Python features is also not particularly deep researchy\nwork. Of course this kind of work is obviously super necessary if you want\npeople to use your VM, but it would be very hard to try to get research funding\nfor it. PyPy managed quite well over its history to balance phases of more\nresearch oriented work, and more product oriented ones. But getting this balance\nsomewhat right is not easy, and definitely also involves a lot of luck. And, as\nhas been discussed a lot, it's actually very hard to find funding for open\nsource work, both within and outside of academia.\n\n\nMeta-Tracing really works!\nLet me end with what, in my opinion, is the main positive technical result of PyPy the\nproject. Which is that the whole idea of using a meta-tracing JIT can really\nwork! Currently PyPy is about 7 times faster than CPython on a broad set of\nbenchmarks. Also, one of the very early motivations for using a meta-jitting\napproach in PyPy, which was to not have to adapt the JIT to new versions of\nCPython proved to work: indeed we didn't have to change anything in the JIT\ninfrastructure to support Python 3.\nRPython has also worked and improved performance for a number of other\nlanguages. Some of these interpreters had wildly different architectures.\nAST-based interpreters, bytecode based, CPU emulators, really inefficient\nhigh-level ones that allocate continuation objects all the time, and so on. 
This\nshows that RPython also gives you a lot of freedom in deciding how you want to\nstructure the interpreter and that it can be applied to languages of quite\ndifferent paradigms.\nI'll end with a list of the people that have contributed code to PyPy over its\nhistory, more than 350 of them. I'd like to thank all of them and the various\nroles they played. To the next 15 years!\n\n\n\n\n\n\nAcknowledgements\nA lot of people helped me with this blog post. Tim Felgentreff made me give the\nkeynote, which led me to start collecting the material. Samuele Pedroni\ngave essential early input when I just started planning the talk, and also gave\nfeedback on the blog post. Maciej Fija\u0142kowski gave me feedback on the post, in\nparticular important insight about the more recent years of the project. Armin\nRigo discussed the talk slides with me, and provided details about the early\nexpectations about the first JIT's hoped-for performance. Antonio Cuni gave\nsubstantial feedback and many very helpful suggestions for the blog post.\nMichael Hudson-Doyle also fixed a number of mistakes in the post and rightfully\ncomplained about the lack of mention of the GC. Christian Tismer provided\naccess to his copy of early Python-de mailing list posts. 
Matti Picus pointed\nout a number of things I had forgotten and fixed a huge number of typos and\nawkward English, including my absolute inability to put commas correctly.\nAll remaining errors are of course my own.\n\n\nupdate: fixed confusing wording in the maps section.", + "tags": "roadmap", + "url": "https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html" + }, + { + "title": "Repeating a Matrix Multiplication Benchmark", + "text": "I watched the Hennessy & Patterson's Turing award lecture recently:\n\n\n\nIn it, there's a slide comparing the performance of various matrix\nmultiplication implementations, using Python (presumably CPython) as a baseline\nand comparing that against various C implementations (I couldn't find the\nlinked paper yet):\n\n\n\nI expected the baseline speedup of switching from CPython to C to be\nhigher and I also wanted to know what performance PyPy gets, so I did my own\nbenchmarks. This is a problem that Python is completely unsuited for, so it\nshould give very exaggerated results.\nThe usual disclaimers apply: All benchmarks are lies, benchmarking of\nsynthetic workloads even more so. My implementation is really naive (though I\ndid optimize it a little bit to help CPython), don't use any\nof this code\nfor anything real. The benchmarks ran on my rather old Intel i5-3230M laptop\nunder Ubuntu 17.10.\nWith that said, my results were as follows:\n\n\n\nImplementation\ntime\nspeedup over CPython\nspeedup over PyPy\n\n\n\n\nCPython\n512.588 \u00b1 2.362 s\n1 \u00d7\n\n\n\nPyPy\n8.167 \u00b1 0.007 s\n62.761 \u00b1 0.295 \u00d7\n1 \u00d7\n\n\n'naive' C\n2.164 \u00b1 0.025 s\n236.817 \u00b1 2.918 \u00d7\n3.773 \u00b1 0.044 \u00d7\n\n\nNumPy\n0.171 \u00b1 0.002 s\n2992.286 \u00b1 42.308 \u00d7\n47.678 \u00b1 0.634 \u00d7\n\nThis is running 1500x1500 matrix multiplications with (the same) random matrices. Every\nimplementation is run 50 times in a fresh process. 
The results are averaged,\nthe errors are bootstrapped 99% confidence intervals.\nSo indeed the speedup that I got of switching from CPython to C is quite a bit higher than\n47x! PyPy is much better than CPython, but of course can't really compete\nagainst GCC. And then the real professionals (numpy/OpenBLAS) are in a whole\n'nother league. The speedup of the AVX numbers in the slide above is even\nhigher than my NumPy numbers, which I assume is the result of my old CPU with\ntwo cores, vs. the 18 core CPU with AVX support.\nLesson confirmed: leave matrix multiplication to people who\nactually know what they are doing.", + "tags": "", + "url": "https://www.pypy.org/posts/2018/06/repeating-matrix-multiplication-8641748742577945875.html" + }, + { + "title": "How to ignore the annoying Cython warnings in PyPy 6.0", + "text": "If you install any Cython-based module in PyPy 6.0.0, it is very likely that you get a warning like this:\n>>>> import numpy\n/data/extra/pypy/6.0.0/site-packages/numpy/random/__init__.py:99: UserWarning: __builtin__.type size changed, may indicate binary incompatibility. Expected 888, got 408\n from .mtrand import *\n\n\nThe TL;DR version is: the warning is a false alarm, and you can hide it by doing:\n$ pypy -m pip install pypy-fix-cython-warning\n\n\nThe package does not contain any module, only a\u00a0.pth\u00a0file which installs a warning filter at startup.\n\nTechnical details\n\nThis happens because whenever Cython compiles a pyx file, it generates C code which does a sanity check on the C size of\u00a0PyType_Type. 
PyPy versions up to 5.10 are buggy and report the incorrect size, so Cython includes a workaround to compare it with the incorrect value, when on PyPy.\n\nPyPy 6 fixed the bug and now\u00a0PyType_Type\u00a0reports the correct size; however, Cython still tries to compare it with the old, buggy value, so it (wrongly) emits the warning.\n\nCython 0.28.2 includes a fix for it, so that C files generated by it no longer emit the warning. However, most packages are distributed with pre-cythonized C files. For example,\u00a0numpy-1.14.2.zip\u00a0includes C files which were generated by Cython 0.26.1: if you compile it you still get the warning, even if you locally installed a newer version of Cython.\n\nThere is not much that we can do on the PyPy side, apart from waiting for all the Cython-based packages to do a new release which includes C files generated by a newer Cython.\u00a0 In the meantime, installing this module will silence the\u00a0warning.", + "tags": "", + "url": "https://www.pypy.org/posts/2018/04/how-to-ignore-annoying-cython-warnings-1007636731207810779.html" + }, + { + "title": "PyPy2.7 and PyPy3.5 v6.0 dual release", + "text": "The PyPy team is proud to release both PyPy2.7 v6.0 (an interpreter supporting\nPython 2.7 syntax), and a PyPy3.5 v6.0 (an interpreter supporting Python\n3.5 syntax). The two releases are both based on much the same codebase, thus\nthe dual release.\nThis release is a feature release following our previous 5.10 incremental\nrelease in late December 2017. Our C-API compatibility layer cpyext is\nnow much faster (see the blog post) as well as more complete. We have made\nmany other improvements in speed and CPython compatibility. Since the changes\naffect the included python development header files, all c-extension modules must\nbe recompiled for this version.\nUntil we can work with downstream providers to distribute builds with PyPy, we\nhave made packages for some common packages available as wheels. 
You may\ncompile yourself using pip install --no-build-isolation , the\nno-build-isolation is currently needed for pip v10.\nFirst-time python users are often stumped by silly typos and omissions when\ngetting started writing code. We have improved our parser to emit more friendly\nsyntax errors, making PyPy not only faster but more friendly.\nThe GC now has hooks to gain more insights into its performance\nThe default Matplotlib TkAgg backend now works with PyPy, as do pygame and pygobject.\nWe updated the cffi module included in PyPy to version 1.11.5, and the\ncppyy backend to 0.6.0. Please use these to wrap your C and C++ code,\nrespectively, for a JIT friendly experience.\nAs always, this release is 100% compatible with the previous one and fixed\nseveral issues and bugs raised by the growing community of PyPy users.\nWe strongly recommend updating.\nThe Windows PyPy3.5 release is still considered beta-quality. There are open\nissues with unicode handling especially around system calls and c-extensions.\nThe utf8 branch that changes internal representation of unicode to utf8 did not\nmake it into the release, so there is still more goodness coming. We also\nbegan working on a Python3.6 implementation, help is welcome.\nYou can download the v6.0 releases here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject. If PyPy is not quite good enough for your needs, we are available for\ndirect consulting work.\nWe would also like to thank our contributors and encourage new people to join\nthe project. PyPy has many layers and we need help with all of them: PyPy\nand RPython documentation improvements, tweaking popular modules to run\non pypy, or general help with making RPython\u2019s JIT even better.\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7 and CPython 3.5. 
It\u2019s fast (PyPy and CPython 2.7.x performance comparison)\ndue to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThe PyPy release supports:\n\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\n\n\n\n\nWhat else is new?\n\nPyPy 5.10 was released in Dec, 2017.\n\nThere are many incremental improvements to RPython and PyPy, the complete listing is here.\n\n\u00a0 \nPlease update, and continue to help us make PyPy better.\n\nCheers, The PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2018/04/pypy27-and-pypy35-v60-dual-release-7416552143474607997.html" + }, + { + "title": "Improving SyntaxError in PyPy", + "text": "For the last year, my halftime job has been to teach non-CS uni students\nto program in Python. While doing that, I have been trying to see what common\nstumbling blocks exist for novice programmers. There are many\nthings that could be said here, but a common theme that emerges is\nhard-to-understand error messages. One source of such error messages,\nparticularly when starting out, is SyntaxErrors.\nPyPy's parser (mostly following the architecture of CPython) uses a\nregular-expression-based tokenizer with some cleverness to deal with\nindentation, and a simple LR(1) parser. Both of these components obviously\nproduce errors for invalid syntax, but the messages are not very helpful. Often,\nthe message is just \"invalid syntax\", without any hint of what exactly is wrong.\nIn the last couple of weeks I have invested a little bit of effort to make them a\ntiny bit better. They will be part of the upcoming PyPy 6.0 release. 
Here are\nsome examples of what changed.\n\nMissing Characters\nThe first class of errors occurs when a token is missing, often there is only one\nvalid token that the parser expects. This happens most commonly by leaving out\nthe ':' after control flow statements (which is the syntax error I personally\nstill make at least a few times a day). In such situations, the parser will now\ntell you which character it expected:\n\n>>>> # before\n>>>> if 1\n File \"\", line 1\n if 1\n \nSyntaxError: invalid syntax\n>>>>\n\n>>>> # after\n>>>> if 1\n File \"\", line 1\n if 1\n \nSyntaxError: invalid syntax (expected ':')\n>>>>\n\nAnother example of this feature:\n\n>>>> # before\n>>>> def f:\n File \"\", line 1\n def f:\n \nSyntaxError: invalid syntax\n>>>>\n\n>>>> # after\n>>>> def f:\n File \"\", line 1\n def f:\n \nSyntaxError: invalid syntax (expected '(')\n>>>>\n\n\n\nParentheses\nAnother source of errors are unmatched parentheses. Here, PyPy has always had\nslightly better error messages than CPython:\n\n>>> # CPython\n>>> )\n File \"\", line 1\n )\n \nSyntaxError: invalid syntax\n>>>\n\n>>>> # PyPy\n>>> )\n File \"\", line 1\n )\n \nSyntaxError: unmatched ')'\n>>>>\n\nThe same is true for parentheses that are never closed (the call to eval is\nneeded to get the error, otherwise the repl will just wait for more input):\n\n>>> # CPython\n>>> eval('(')\n File \"\", line 1\n (\n \nSyntaxError: unexpected EOF while parsing\n>>>\n\n>>>> # PyPy\n>>>> eval('(')\n File \"\", line 1\n (\n \nSyntaxError: parenthesis is never closed\n>>>>\n\nWhat I have now improved is the case of parentheses that are matched wrongly:\n\n>>>> # before\n>>>> (1,\n.... 2,\n.... ]\n File \"\", line 3\n ]\n \nSyntaxError: invalid syntax\n>>>>\n\n>>>> # after\n>>>> (1,\n.... 2,\n.... 
]\n File \"\", line 3\n ]\n \nSyntaxError: closing parenthesis ']' does not match opening parenthesis '(' on line 1\n>>>>\n\n\n\nConclusion\nObviously these are just some very simple cases, and there is still a lot of\nroom for improvement (one huge problem is that only a single SyntaxError is\never shown per parse attempt, but fixing that is rather hard).\nIf you have a favorite unhelpful SyntaxError message you love to hate, please\ntell us in the comments and we might try to improve it. Other kinds of\nnon-informative error messages are also always welcome!", + "tags": "", + "url": "https://www.pypy.org/posts/2018/04/improving-syntaxerror-in-pypy-5733639208090522433.html" + }, + { + "title": "Leysin Winter Sprint 2018: review", + "text": "Like every year, the PyPy developers and a couple of newcomers\n gathered in Leysin, Switzerland, to share their thoughts and\n contribute to the development of PyPy.\n As always, we had interesting discussions about how we could\n improve PyPy, to make it the first choice for even more\n developers. We also made some progress with current issues, like\n compatibility with Python 3.6 and improving the performance of\n CPython extension modules, where we fixed a lot of bugs and gained\n new insights about where and how we could tweak PyPy.\n \n We were very happy about the number of new people who joined us\n for the first time, and hope they enjoyed it as much as everyone\n else. \n \n Topics\n We worked on the following topics (and more!):\n \n Introductions for newcomers\n Python 3.5 and 3.6 improvements\n CPyExt performance improvements and GC implementation\n \n JIT: guard-compatible implementation\n \n Pygame performance improvements\n Unicode/UTF8 implementation\n \n CFFI tutorial/overview rewrite\n \n py3 test runners refactoring\n RevDB improvements\n \n \n The weather was really fine for most of the week, with only\n occasional snow and fog. 
We started our days with a short (and\n sometimes not so short) planning session and enjoyed our dinners in\n the great restaurants in the area. Some of us even started earlier\n and continued till late night. It was a relaxed, but also very\n productive atmosphere. On our break day on Wednesday, we enjoyed the\n great conditions and went skiing and hiking.\n Attendees\n \n Arianna\n Jean-Daniel\n \n Stefan Beyer\n Floris Bruynooghe\n \n Antonio Cuni\n Ren\u00e9 Dudfield\n Manuel Jacob\n Ronan Lamy\n Remi Meier\n Matti Picus\n \n Armin Rigo\n Alexander Schremmer\n \n \n Leysin is easily reachable by Geneva Airport, so feel free to join\n us next time!\n \n \n Cheers,\n Stefan", + "tags": "", + "url": "https://www.pypy.org/posts/2018/03/leysin-winter-sprint-2018-review-3988364248531980164.html" + }, + { + "title": "PyPy 5.10.1 bugfix release for python 3.5", + "text": "We have released a bug fix PyPy3.5-v5.10.1\ndue to the following issues:\n\n\n\nFix time.sleep(float('nan')) which would hang on Windows\nFix missing errno constants on Windows\nFix issue 2718 for the REPL on Linux\nFix an overflow in converting int secs to nanosecs (issue 2717 )\nUsing kwarg 'flag' to os.setxattr had no effect\nFix the winreg module for unicode entries in the registry on Windows\n\n\n\nNote that many of these fixes are for our new beta version of PyPy3.5 on Windows. There may be more unicode problems in the Windows beta version,\nespecially concerning directory- and file-names with non-ASCII\ncharacters.\n\nOn macOS, we recommend you wait for the\nHomebrew package to prevent issues with third-party packages. For other supported platforms our downloads are available now.\nThanks to those who reported the issues.\n\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7 and CPython 3.5. 
It\u2019s fast (PyPy and CPython 2.7.x performance comparison)\ndue to its integrated tracing JIT compiler.\n\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\n\nThis PyPy 3.5 release supports:\n\n\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, macOS 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\n\nPlease update, and continue to help us make PyPy better.\n\nCheers\n\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2018/01/pypy-5101-bugfix-release-for-python-35-8485250762789380657.html" + }, + { + "title": "Leysin Winter sprint: 17-24 March 2018", + "text": "The next PyPy sprint will be in Leysin, Switzerland, for the thirteenth\ntime. This is a fully public sprint: newcomers and topics other than\nthose proposed below are welcome.\n\n(Note: this sprint is independent from the suggested April-May sprint in\nPoland.)\n\nGoals and topics of the sprint\n\nThe list of topics is open, but here is our current list:\n\n\n\n\n\n\n\n\n\n cffi tutorial/overview rewrite\n py3 test runners are too complicated\n make win32 builds green\n make packaging more like cpython/portable builds\n get CI builders for PyPy into mainstream projects (Numpy, Scipy, lxml, uwsgi)\n get more of scientific stack working (tensorflow?)\n cpyext performance improvements\n General 3.5 and 3.6 improvements\n JIT topics: guard-compatible, and the subsequent research project to save and reuse traces across processes\n finish unicode-utf8\n update www.pypy.org, speed.pypy.org (web devs needed)\n\n\nAs usual, the main side goal is to have fun in winter sports :-)\nWe can take a day off (for ski or anything else).\n\nExact times\n\nWork days: starting March 18th (~noon), ending March 24th (~noon).\n\nPlease see announcement.txt for more information.", + "tags": "", + 
"url": "https://www.pypy.org/posts/2018/01/leysin-winter-sprint-17-24-march-2018-7141092581585849418.html" + }, + { + "title": "PyPy2.7 and PyPy3.5 v5.10 dual release", + "text": "The PyPy team is proud to release both PyPy2.7 v5.10 (an interpreter supporting\nPython 2.7 syntax), and a final PyPy3.5 v5.10 (an interpreter for Python\n3.5 syntax). The two releases are both based on much the same codebase, thus\nthe dual release.\nThis release is an incremental release with very few new features, the main\nfeature being the final PyPy3.5 release that works on linux and OS X with beta\nwindows support. It also includes fixes for vmprof cooperation with greenlets.\nCompared to 5.9, the 5.10 release contains mostly bugfixes and small improvements.\nWe have in the pipeline big new features coming for PyPy 6.0 that did not make\nthe release cut and should be available within the next couple months.\nAs always, this release is 100% compatible with the previous one and fixed\nseveral issues and bugs raised by the growing community of PyPy users.\nAs always, we strongly recommend updating.\nThere are quite a few important changes that are in the pipeline that did not\nmake it into the 5.10 release. Most important are speed improvements to cpyext\n(which will make numpy and pandas a bit faster) and utf8 branch that changes\ninternal representation of unicode to utf8, which should help especially the\nPython 3.5 version of PyPy.\nThis release concludes the Mozilla Open Source grant for having a compatible\nPyPy 3.5 release and we're very grateful for that. Of course, we will continue\nto improve PyPy 3.5 and probably move to 3.6 during the course of 2018.\nYou can download the v5.10 releases here:\n\nhttps://pypy.org/download.html\nWe would like to thank our donors for the continued support of the PyPy\nproject.\nWe would also like to thank our contributors and\nencourage new people to join the project. 
PyPy has many\nlayers and we need help with all of them: PyPy and RPython documentation\nimprovements, tweaking popular modules to run on pypy, or general help\nwith making RPython's JIT even better.\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7 and CPython 3.5. It's fast (PyPy and CPython 2.7.x performance comparison)\ndue to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\nThe PyPy release supports:\n\n\nx86 machines on most common operating systems\n(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\n\n\nChangelog\n\nimprove ssl handling on windows for pypy3 (makes pip work)\nimprove unicode handling in various error reporters\nfix vmprof cooperation with greenlets\nfix some things in cpyext\ntest and document the cmp(nan, nan) == 0 behaviour\ndon't crash when calling sleep with inf or nan\nfix bugs in _io module\ninspect.isbuiltin() now returns True for functions implemented in C\nallow the sequences future-import, docstring, future-import for CPython bug compatibility\nIssue #2699: non-ascii messages in warnings\nposix.lockf\nfixes for FreeBSD platform\nadd .debug files, so builds contain debugging info, instead of being stripped\nimprovements to cppyy\nissue #2677 copy pure c PyBuffer_{From,To}Contiguous from cpython\nissue #2682, split firstword on any whitespace in sqlite3\nctypes: allow ptr[0] = foo when ptr is a pointer to struct\nmatplotlib will work with tkagg backend once matplotlib pr #9356 is merged\nimprovements to utf32 surrogate handling\ncffi version bump to 1.11.2\n\nMaciej Fijalkowski, Matti Picus and the whole PyPy team", + "tags": "release", + "url": 
"https://www.pypy.org/posts/2017/12/pypy27-and-pypy35-v510-dual-release-3223396318213306071.html" + }, + { + "title": "How to make your code 80 times faster", + "text": "I often hear people who are happy because PyPy makes their code 2 times faster\nor so. Here is a short personal story which shows PyPy can go well beyond\nthat.\n\nDISCLAIMER: this is not a silver bullet or a general recipe: it worked in\nthis particular case, it might not work so well in other cases. But I think it\nis still an interesting technique. Moreover, the various steps and\nimplementations are showed in the same order as I tried them during the\ndevelopment, so it is a real-life example of how to proceed when optimizing\nfor PyPy.\n\nSome months ago I played a bit with evolutionary algorithms: the ambitious\nplan was to automatically evolve a logic which could control a (simulated)\nquadcopter, i.e. a PID controller (spoiler: it doesn't fly).\n\nThe idea is to have an initial population of random creatures: at each\ngeneration, the ones with the best fitness survive and reproduce with small,\nrandom variations.\n\nHowever, for the scope of this post, the actual task at hand is not so\nimportant, so let's jump straight to the code. To drive the quadcopter, a\nCreature has a run_step method which runs at each delta_t (full\ncode):\nclass Creature(object):\n INPUTS = 2 # z_setpoint, current z position\n OUTPUTS = 1 # PWM for all 4 motors\n STATE_VARS = 1\n ...\n\n def run_step(self, inputs):\n # state: [state_vars ... inputs]\n # out_values: [state_vars, ... outputs]\n self.state[self.STATE_VARS:] = inputs\n out_values = np.dot(self.matrix, self.state) + self.constant\n self.state[:self.STATE_VARS] = out_values[:self.STATE_VARS]\n outputs = out_values[self.STATE_VARS:]\n return outputs\n\n\ninputs is a numpy array containing the desired setpoint and the current\nposition on the Z axis;\noutputs is a numpy array containing the thrust to give to the motors. 
To\nstart easy, all the 4 motors are constrained to have the same thrust, so\nthat the quadcopter only travels up and down the Z axis;\nself.state contains arbitrary values of unknown size which are passed from\none step to the next;\nself.matrix and self.constant contains the actual logic. By putting\nthe \"right\" values there, in theory we could get a perfectly tuned PID\ncontroller. These are randomly mutated between generations.\n\nrun_step is called at 100Hz (in the virtual time frame of the simulation). At each\ngeneration, we test 500 creatures for a total of 12 virtual seconds each. So,\nwe have a total of 600,000 executions of run_step at each generation.\n\nAt first, I simply tried to run this code on CPython; here is the result:\n$ python -m ev.main\nGeneration 1: ... [population = 500] [12.06 secs]\nGeneration 2: ... [population = 500] [6.13 secs]\nGeneration 3: ... [population = 500] [6.11 secs]\nGeneration 4: ... [population = 500] [6.09 secs]\nGeneration 5: ... [population = 500] [6.18 secs]\nGeneration 6: ... [population = 500] [6.26 secs]\n\nWhich means ~6.15 seconds/generation, excluding the first.\n\nThen I tried with PyPy 5.9:\n$ pypy -m ev.main\nGeneration 1: ... [population = 500] [63.90 secs]\nGeneration 2: ... [population = 500] [33.92 secs]\nGeneration 3: ... [population = 500] [34.21 secs]\nGeneration 4: ... [population = 500] [33.75 secs]\n\nOuch! We are ~5.5x slower than CPython. This was kind of expected: numpy is\nbased on cpyext, which is infamously slow. (Actually, we are working on\nthat and on the cpyext-avoid-roundtrip branch we are already faster than\nCPython, but this will be the subject of another blog post.)\n\nSo, let's try to avoid cpyext. The first obvious step is to use numpypy\ninstead of numpy (actually, there is a hack to use just the micronumpy\npart). Let's see if the speed improves:\n$ pypy -m ev.main # using numpypy\nGeneration 1: ... [population = 500] [5.60 secs]\nGeneration 2: ... 
[population = 500] [2.90 secs]\nGeneration 3: ... [population = 500] [2.78 secs]\nGeneration 4: ... [population = 500] [2.69 secs]\nGeneration 5: ... [population = 500] [2.72 secs]\nGeneration 6: ... [population = 500] [2.73 secs]\n\nSo, ~2.7 seconds on average: this is 12x faster than PyPy+numpy, and more than\n2x faster than the original CPython. At this point, most people would be happy\nand go tweeting how PyPy is great.\n\nIn general, when talking of CPython vs PyPy, I am rarely satisfied with a 2x\nspeedup: I know that PyPy can do much better than this, especially if you\nwrite code which is specifically optimized for the JIT. For a real-life\nexample, have a look at capnpy benchmarks, in which the PyPy version is\n~15x faster than the heavily optimized CPython+Cython version (both have been\nwritten by me, and I tried hard to write the fastest code for both\nimplementations).\n\nSo, let's try to do better. As usual, the first thing to do is to profile and\nsee where we spend most of the time. Here is the vmprof profile. We spend a\nlot of time inside the internals of numpypy, and allocating tons of temporary\narrays to store the results of the various operations.\n\nAlso, let's look at the jit traces and search for the function run:\nthis is the loop in which we spend most of the time, and it is composed of 1796\noperations. The operations emitted for the line np.dot(...) +\nself.constant are listed between lines 1217 and 1456. 
Here is the excerpt\nwhich calls np.dot(...); most of the ops are cheap, but at line 1232 we\nsee a call to the RPython function descr_dot; by looking at the\nimplementation we see that it creates a new W_NDimArray to store the\nresult, which means it has to do a malloc():\n\n\n\nThe implementation of the + self.constant part is also interesting:\ncontrary to the former, the call to W_NDimArray.descr_add has been inlined by\nthe JIT, so we have a better picture of what's happening; in particular, we\ncan see the call to __0_alloc_with_del____ which allocates the\nW_NDimArray for the result, and the raw_malloc which allocates the\nactual array. Then we have a long list of 149 simple operations which set the\nfields of the resulting array, construct an iterator, and finally do a\ncall_assembler: this is the actual logic to do the addition, which was\nJITted independently; call_assembler is one of the operations to do\nJIT-to-JIT calls:\n\n\n\nAll of this is very suboptimal: in this particular case, we know that the\nshape of self.matrix is always (2, 3): so, we are doing an incredible\namount of work, including calling\u00a0malloc() twice for the temporary arrays, just to\ncall two functions which ultimately do a total of 6 multiplications\nand 6 additions. Note also that this is not a fault of the JIT: CPython+numpy\nhas to do the same amount of work, just hidden inside C calls.\n\nOne possible solution to this nonsense is a well known compiler optimization:\nloop unrolling. From the compiler point of view, unrolling the loop is always\nrisky because if the matrix is too big you might end up emitting a huge blob\nof code, possibly useless if the shape of the matrices changes frequently: this\nis the main reason why the PyPy JIT does not even try to do it in this case.\n\nHowever, we know that the matrix is small, and always of the same\nshape. 
So, let's unroll the loop manually:\nclass SpecializedCreature(Creature):\n\n def __init__(self, *args, **kwargs):\n Creature.__init__(self, *args, **kwargs)\n # store the data in a plain Python list\n self.data = list(self.matrix.ravel()) + list(self.constant)\n self.data_state = [0.0]\n assert self.matrix.shape == (2, 3)\n assert len(self.data) == 8\n\n def run_step(self, inputs):\n # state: [state_vars ... inputs]\n # out_values: [state_vars, ... outputs]\n k0, k1, k2, q0, q1, q2, c0, c1 = self.data\n s0 = self.data_state[0]\n z_sp, z = inputs\n #\n # compute the output\n out0 = s0*k0 + z_sp*k1 + z*k2 + c0\n out1 = s0*q0 + z_sp*q1 + z*q2 + c1\n #\n self.data_state[0] = out0\n outputs = [out1]\n return outputs\n\nIn the actual code there is also a sanity check which asserts that the\ncomputed output is the very same as the one returned by Creature.run_step.\n\nSo, let's try to see how it performs. First, with CPython:\n$ python -m ev.main\nGeneration 1: ... [population = 500] [7.61 secs]\nGeneration 2: ... [population = 500] [3.96 secs]\nGeneration 3: ... [population = 500] [3.79 secs]\nGeneration 4: ... [population = 500] [3.74 secs]\nGeneration 5: ... [population = 500] [3.84 secs]\nGeneration 6: ... [population = 500] [3.69 secs]\n\nThis looks good: 60% faster than the original CPython+numpy\nimplementation. Let's try on PyPy:\nGeneration 1: ... [population = 500] [0.39 secs]\nGeneration 2: ... [population = 500] [0.10 secs]\nGeneration 3: ... [population = 500] [0.11 secs]\nGeneration 4: ... [population = 500] [0.09 secs]\nGeneration 5: ... [population = 500] [0.08 secs]\nGeneration 6: ... [population = 500] [0.12 secs]\nGeneration 7: ... [population = 500] [0.09 secs]\nGeneration 8: ... [population = 500] [0.08 secs]\nGeneration 9: ... [population = 500] [0.08 secs]\nGeneration 10: ... [population = 500] [0.08 secs]\nGeneration 11: ... [population = 500] [0.08 secs]\nGeneration 12: ... [population = 500] [0.07 secs]\nGeneration 13: ... 
[population = 500] [0.07 secs]\nGeneration 14: ... [population = 500] [0.08 secs]\nGeneration 15: ... [population = 500] [0.07 secs]\n\nYes, it's not an error. After a couple of generations, it stabilizes at around\n~0.07-0.08 seconds per generation. This is around 80 (eighty) times faster\nthan the original CPython+numpy implementation, and around 35-40x faster than\nthe naive PyPy+numpypy one.\n\nLet's look at the trace again: it no longer contains expensive calls, and\ncertainly no more temporary malloc() s. The core of the logic is between\nlines 386-416, where we can see that it does fast C-level multiplications and\nadditions: float_mul and float_add are translated straight into\nmulsd and addsd x86 instructions.\n\nAs I said before, this is a very particular example, and the techniques\ndescribed here do not always apply: it is not realistic to expect an 80x\nspeedup on arbitrary code, unfortunately. However, it clearly shows the potential of PyPy when\nit comes to high-speed computing. 
And most importantly, it's not a toy\nbenchmark which was designed specifically to have good performance on PyPy:\nit's a real world example, albeit small.\n\nYou might be also interested in the talk I gave at last EuroPython, in which I\ntalk about a similar topic: \"The Joy of PyPy JIT: abstractions for free\"\n(abstract, slides and video).\n\n\n\nHow to reproduce the results\n$ git clone https://github.com/antocuni/evolvingcopter\n$ cd evolvingcopter\n$ {python,pypy} -m ev.main --no-specialized --no-numpypy\n$ {python,pypy} -m ev.main --no-specialized\n$ {python,pypy} -m ev.main", + "tags": "jit,profiling,speed", + "url": "https://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.html" + }, + { + "title": "(Cape of) Good Hope for PyPy", + "text": "Hello from the other side of the world (for most of you)!\n\nWith the excuse of coming to PyCon ZA during the last two weeks Armin,\nRonan, Antonio and sometimes Maciek had a very nice and productive sprint in\nCape Town, as pictures show :). We would like to say a big thank you to\nKiwi.com, which sponsored part of the travel costs via its awesome Sourcelift\nprogram to help Open Source projects.\n\n\n\nArmin, Anto and Ronan at Cape Point\n\n\nArmin, Ronan and Anto spent most of the time hacking at cpyext, our CPython\nC-API compatibility layer: during the last years, the focus was to make it\nworking and compatible with CPython, in order to run existing libraries such\nas numpy and pandas. However, we never paid too much attention to performance,\nso the net result is that with the latest released version of PyPy, C\nextensions generally work but their speed ranges from \"slow\" to \"horribly\nslow\".\n\nFor example, these very simple microbenchmarks measure the speed of\ncalling (empty) C functions, i.e. the time you spend to \"cross the border\"\nbetween RPython and C. (Note: this includes the time spent doing the loop in regular Python code.) 
These are the results on CPython, on PyPy 5.8, and on\nour newest in-progress version:\n\n$ python bench.py # CPython\nnoargs : 0.41 secs\nonearg(None): 0.44 secs\nonearg(i) : 0.44 secs\nvarargs : 0.58 secs\n\n\n\n$ pypy-5.8 bench.py # PyPy 5.8\nnoargs : 1.01 secs\nonearg(None): 1.31 secs\nonearg(i) : 2.57 secs\nvarargs : 2.79 secs\n\n\n\n$ pypy bench.py # cpyext-refactor-methodobject branch\nnoargs : 0.17 secs\nonearg(None): 0.21 secs\nonearg(i) : 0.22 secs\nvarargs : 0.47 secs\n\n\n\n\n\nSo yes: before the sprint, we were ~2-6x slower than CPython. Now, we are\nfaster than it!\nTo reach this result, we did various improvements, such as:\n\n\n\nteach the JIT how to look (a bit) inside the cpyext module;\nwrite specialized code for calling METH_NOARGS, METH_O and\nMETH_VARARGS functions; previously, we always used a very general and\nslow logic;\nimplement freelists to allocate the cpyext versions of int and\ntuple objects, as CPython does;\nthe cpyext-avoid-roundtrip branch: crossing the RPython/C border is\nslowish, but the real problem was (and still is for many cases) we often\ncross it many times for no good reason. So, depending on the actual API\ncall, you might end up in the C land, which calls back into the RPython\nland, which goes to C, etc. etc. (ad libitum).\n\n\nThe branch tries to fix such nonsense: so far, we fixed only some cases, which\nare enough to speed up the benchmarks shown above. But most importantly, we\nnow have a clear path and an actual plan to improve cpyext more and\nmore. Ideally, we would like to reach a point in which cpyext-intensive\nprograms run at worst at the same speed of CPython.\n\nThe other big topic of the sprint was Armin and Maciej doing a lot of work on the\nunicode-utf8 branch: the goal of the branch is to always use UTF-8 as the\ninternal representation of unicode strings. 
The advantages are various:\n\n\n\ndecoding a UTF-8 stream is super fast, as you just need to check that the\nstream is valid;\nencoding to UTF-8 is almost a no-op;\nUTF-8 is always more compact representation than the currently\nused UCS-4. It's also almost always more compact than CPython 3.5 latin1/UCS2/UCS4 combo;\nsmaller representation means everything becomes quite a bit faster due to lower cache pressure.\n\n\nBefore you ask: yes, this branch contains special logic to ensure that random\naccess of single unicode chars is still O(1), as it is on both CPython and the\ncurrent PyPy.\nWe also plan to improve the speed of decoding even more by using modern processor features, like SSE and AVX. Preliminary results show that decoding can be done 100x faster than the current setup.\n\n\nIn summary, this was a long and profitable sprint, in which we achieved lots\nof interesting results. However, what we liked even more was the privilege of\ndoing commits from awesome places such as the top of Table Mountain:\n\n\n\nOur sprint venue today #pypy pic.twitter.com/o38IfTYmAV\n\u2014 Ronan Lamy (@ronanlamy) 4 ottobre 2017\n\n\n\n\n\nThe panorama we looked at instead of staring at cpyext code", + "tags": "cpyext,profiling,speed,sprint,unicode", + "url": "https://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html" + }, + { + "title": "PyPy v5.9 Released, Now Supports Pandas, NumPy", + "text": "The PyPy team is proud to release both PyPy3.5 v5.9 (a beta-quality interpreter for Python\n3.5 syntax) and PyPy2.7 v5.9 (an interpreter supporting\nPython 2.7 syntax). \n\n\n\nNumPy and Pandas now work on PyPy2.7 (together with Cython 0.27.1). Many other modules\nbased on C-API extensions work on PyPy as well.\n\n\n\nCython 0.27.1 (released very recently) supports more projects with PyPy, both\non PyPy2.7 and PyPy3.5 beta. 
Note version 0.27.1 is now the minimum\nversion that supports this version of PyPy, due to some interactions with\nupdated C-API interface code.\n\n\n\n\nWe optimized the JSON parser for recurring string keys, which should decrease\nmemory use by up to 50% and increase parsing speed by up to 15% for large JSON files\nwith many repeating dictionary keys (which is quite common).\n\n\n\nCFFI, which is part of the PyPy release, has been updated to 1.11.1,\nimproving an already great package for interfacing with C. CFFI now supports\ncomplex arguments in API mode, as well as char16_t and char32_t and has\nimproved support for callbacks.\n\n\n\nIssues in the C-API compatibility layer that appeared as excessive memory\nuse were cleared up and other incompatibilities were resolved. The C-API\ncompatibility layer does slow down code which crosses the python-c interface\noften. Some fixes are in the pipelines for some of the performance issues, and we still recommend\nusing pure python on PyPy or interfacing via CFFI.\u00a0 \n\n\nPlease let us know if your use case is slow, we have ideas how to make things\nfaster but need real-world examples (not micro-benchmarks) of problematic code.\n\n\nWork sponsored by a Mozilla grant continues on PyPy3.5; we continue on the path to the goal of a complete python 3.5 implementation. Of course the bug fixes and performance enhancements\nmentioned above are part of both PyPy2.7 and PyPy3.5 beta.\n\n\nAs always, this release fixed many other issues and bugs raised by the\ngrowing community of PyPy users. We strongly recommend updating.\n\n\nYou can download the v5.9 releases here (note that we provide PyPy3.5 binaries for only Linux 64bit for now):\n\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors and contributors, and\nencourage new people to join the project. 
PyPy has many\nlayers and we need help with all of them: PyPy and RPython documentation\nimprovements, tweaking popular modules to run on PyPy, or general help\nwith making RPython\u2019s JIT even better.\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 (stdlib version 2.7.13), and CPython 3.5 (stdlib version 3.5.3). It\u2019s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\n\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\n\nThe PyPy 2.7 release supports:\n\n\n\nx86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux \n\n\n\n\nWhat else is new?\n\nPyPy 5.8 was released in June, 2017.\n\nThere are many incremental improvements to RPython and PyPy, the complete listing is here.\n\n\u00a0 \nPlease update, and continue to help us make PyPy better.\n\nCheers, The PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2017/10/pypy-v59-released-now-supports-pandas-2261195727261691228.html" + }, + { + "title": "Let's remove the Global Interpreter Lock", + "text": "Hello everyone\nThe Python community has been discussing removing the Global Interpreter Lock for\na long time.\nThere have been various attempts at removing it:\nJython or IronPython successfully removed it with the help of the underlying\nplatform, and some have yet to bear fruit, like gilectomy. Since our February sprint in Leysin,\nwe have experimented with the topic of GIL removal in the PyPy project.\nWe believe that the work done in IronPython or Jython can be reproduced with\nonly a bit more effort in PyPy. 
Compared to that, removing the GIL in CPython is a much\nharder topic, since it also requires tackling the problem of multi-threaded reference\ncounting. See the section below for further details.\nAs we announced at EuroPython, what we have so far is a GIL-less PyPy\nwhich can run very simple multi-threaded, nicely parallelized, programs.\nAt the moment, more complicated programs probably segfault. The\nremaining 90% (and another 90%) of work is with putting locks in strategic\nplaces so PyPy does not segfault during concurrent accesses to\ndata structures.\nSince such work would complicate the PyPy code base and our day-to-day work,\nwe would like to judge the interest of the community and the commercial\npartners to make it happen (we are not looking for individual\ndonations at this point). We estimate a total cost of $50k,\nout of which we already have backing for about 1/3 (with a possible 1/3\nextra from the STM money, see below). This would give us a good\nshot at delivering a good proof-of-concept working PyPy with no GIL. If we can get a $100k\ncontract, we will deliver a fully working PyPy interpreter with no GIL as a release,\npossibly separate from the default PyPy release.\nPeople asked several questions, so I'll try to answer the technical parts\nhere.\nWhat would the plan entail?\nWe've already done the work on the Garbage Collector to allow doing multi-\nthreaded programs in RPython. \"All\" that is left is adding locks on mutable\ndata structures everywhere in the PyPy codebase. Since it would significantly complicate\nour workflow, we require real interest in that topic, backed up by\ncommercial contracts in order to justify the added maintenance burden.\nWhy did the STM effort not work out?\nSTM was a research project that proved that the idea is possible. However,\nthe amount of user effort that is required to make programs run in a\nparallelizable way is significant, and we never managed to develop tools\nthat would help in doing so. 
At the moment we're not sure if more work\nspent on tooling would improve the situation or if the whole idea is really doomed.\nThe approach also ended up adding significant overhead on single threaded programs,\nso in the end it is very easy to make your programs slower. (We have some money\nleft in the donation pot for STM which we are not using; according to the rules, we\ncould declare the STM attempt failed and channel that money towards the present\nGIL removal proposal.)\nWouldn't subinterpreters be a better idea?\nPython is a very mutable language - there are tons of mutable state and\nbasic objects (classes, functions,...) that are compile-time in other\nlanguage but runtime and fully mutable in Python. In the end, sharing\nthings between subinterpreters would be restricted to basic immutable\ndata structures, which defeats the point. Subinterpreters suffers from the same problems as\nmultiprocessing with no additional benefits.\nWe believe that reducing mutability to implement subinterpreters is not viable without seriously impacting the\nsemantics of the language (a conclusion which applies to many other\napproaches too).\nWhy is it easier to do in PyPy than CPython?\nRemoving the GIL in CPython has two problems:\n\nhow do we guard access to mutable data structures with locks and\nwhat to do with reference counting that needs to be guarded.\n\nPyPy only has the former problem; the latter doesn't exist,\ndue to a different garbage collector approach. Of course the first problem\nis a mess too, but at least we are already half-way there. Compared to Jython\nor IronPython, PyPy lacks some data structures that are provided by JVM or .NET,\nwhich we would need to implement, hence the problem is a little harder\nthan on an existing multithreaded platform. 
However, there is good research\nand we know how that problem can be solved.\nBest regards,\nMaciej Fijalkowski", + "tags": "", + "url": "https://www.pypy.org/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.html" + }, + { + "title": "Binary wheels for PyPy", + "text": "Hi,\n\nthis is a short blog post, just to announce the existence of this GitHub repository, which contains binary PyPy wheels for some selected packages. The availability of binary wheels means that you can install the packages much more quickly, without having to wait for compilation.\n\n\nAt the moment of writing, these packages are available:\n\n\nnumpy\nscipy\npandas\npsutil\nnetifaces\n\n\nFor now, we provide only wheels built on Ubuntu, compiled for PyPy 5.8.\nIn particular, it is worth noting that they are not\u00a0manylinux1 wheels, which means they might not work on other Linux distributions. For more information, see the explanation in the README of the above repo.\n\nMoreover, the existence of the wheels does not guarantee that they work correctly 100% of the time. They still depend on cpyext, our C-API emulation layer, which is still work-in-progress, although it has become better and better during the last months. Again, the wheels are there only to save compilation time.\n\nTo install a package from the wheel repository, you can invoke pip like this:\n\n$ pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy\n\n\n\nHappy installing!", + "tags": "", + "url": "https://www.pypy.org/posts/2017/07/binary-wheels-for-pypy-8718353804433344916.html" + }, + { + "title": "PyPy v5.8 released", + "text": "The PyPy team is proud to release both PyPy2.7 v5.8 (an interpreter supporting\nPython 2.7 syntax), and a beta-quality PyPy3.5 v5.8 (an interpreter for Python\n3.5 syntax). The two releases are both based on much the same codebase, thus\nthe dual release. 
Note that PyPy3.5 supports Linux 64bit only for now.\n\nThis new PyPy2.7 release includes the upstream stdlib version 2.7.13, and\nPyPy3.5 includes the upstream stdlib version 3.5.3.\n\nWe fixed critical bugs in the shadowstack rootfinder garbage collector\nstrategy that crashed multithreaded programs and very rarely showed up\neven in single threaded programs.\n\nWe added native PyPy support to profile frames in the vmprof statistical\nprofiler.\n\nThe struct module functions pack* and unpack* are now much faster,\nespecially on raw buffers and bytearrays. Microbenchmarks show a 2x to 10x\nspeedup. Thanks to Gambit Research for sponsoring this work.\n\nThis release adds (but disables by default) link-time optimization and\nprofile guided optimization of the base interpreter, which may make\nunjitted code run faster. To use these, translate with appropriate\noptions. Be aware of issues with gcc toolchains, though.\n\nPlease let us know if your use case is slow, we have ideas how to make things\nfaster but need real-world examples (not micro-benchmarks) of problematic code.\n\nWork sponsored by a Mozilla grant continues on PyPy3.5; numerous fixes from\nCPython were ported to PyPy and PEP 489 was fully implemented. Of course the\nbug fixes and performance enhancements mentioned above are part of both PyPy\n2.7 and PyPy 3.5.\n\nCFFI, which is part of the PyPy release, has been updated to an unreleased 1.10.1,\nimproving an already great package for interfacing with C.\n\nAnyone using NumPy 1.13.0, must upgrade PyPy to this release since we implemented some previously missing C-API functionality. Many other c-extension modules now work with PyPy, let us know if yours does not.\n\nAs always, this release fixed many issues and bugs raised by the\ngrowing community of PyPy users. 
We strongly recommend updating.\n\nYou can download the v5.8 release here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors and contributors, and\nencourage new people to join the project. PyPy has many\nlayers and we need help with all of them: PyPy and RPython documentation\nimprovements, tweaking popular modules to run on PyPy, or general help\nwith making RPython\u2019s JIT even better.\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. It\u2019s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\nThe PyPy 2.7 release supports:\n\n\n\nx86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux \n\n\n\n\n\nWhat else is new?\n\nPyPy 5.7 was released in March, 2017.\n\nThere are many incremental improvements to RPython and PyPy, the complete listing is here.\n\n\u00a0 \nPlease update, and continue to help us make PyPy better.\n\nCheers, The PyPy team", + "tags": "release,sponsors", + "url": "https://www.pypy.org/posts/2017/06/pypy-v58-released-739876359584854017.html" + }, + { + "title": "PyPy 5.7.1 bugfix released", + "text": "We have released a bugfix PyPy2.7-v5.7.1 and PyPy3.5-v5.7.1 beta (Linux 64bit),\ndue to the following issues:\n\n\n\ncorrectly handle an edge case in dict.pop (issue 2508)\nfix a regression to correctly handle multiple inheritance in a C-API type\nwhere the second base is an app-level class with a __new__ function\nfix a regression to fill a C-API type\u2019s tp_getattr slot from a\n__getattr__ method (issue 2523)\n\n\n\nThanks to those who reported issues and helped test out the fixes\n\nYou can download the 
v5.7.1 release here:\n\n\nhttps://pypy.org/download.html\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. It\u2019s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\nThe PyPy 2.7 release supports:\n\n\n\nx86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\n\nPlease update, and continue to help us make PyPy better.\n\nCheers, The PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2017/04/pypy-571-bugfix-released-8519267986159880133.html" + }, + { + "title": "Native profiling in VMProf", + "text": "We are happy to announce a new release for the PyPI package vmprof.\nIt is now able to capture native stack frames on Linux and Mac OS X to show you bottle necks in compiled code (such as CFFI modules, Cython or C Python extensions). It supports PyPy, CPython versions 2.7, 3.4, 3.5 and 3.6. Special thanks to Jetbrains for funding the native profiling support.\n\n\n\n\n\n\nWhat is vmprof?\n\nIf you have already worked with vmprof you can skip the next two section. If not, here is a short introduction:\n\nThe goal of vmprof package is to give you more insight into your program. It is a statistical profiler. Another prominent profiler you might already have worked with is cProfile. It is bundled with the Python standard library.\n\nvmprof's distinct feature (from most other profilers) is that it does not significantly slow down your program execution. The employed strategy is statistical, rather than deterministic. 
Not every function call is intercepted, but it samples stack traces and memory usage at a configured sample rate (usually around 100 Hz). You can imagine that this creates a lot less contention than doing work before and after each function call.\n\nAs mentioned earlier cProfile gives you a complete profile, but it needs to intercept every function call (it is a deterministic profiler). Usually this means that you have to capture and record every function call, but this takes a significant amount of time.\n\n The overhead vmprof consumes is roughly 3-4% of your total program runtime or even less if you reduce the sampling frequency. Indeed it lets you sample and inspect much larger programs. If you failed to profile a large application with cProfile, please give vmprof a shot.\n\nvmprof.com or PyCharm\n\n\n\nThere are two major alternatives to the command-line tools shipped with vmprof:\n\nA web service on vmprof.com\nPyCharm Professional Edition \n\n\nWhile the command line tool is only good for quick inspections, vmprof.com\n and PyCharm complement each other providing deeper insight into your \nprogram. With PyCharm you can view the per-line profiling results inside\n the editor. With vmprof.com you get a\u00a0handy visualization of the profiling results as a flame chart and memory usage graph.\n\n\n\n\n\nSince the PyPy Team runs and maintains the service on vmprof.com (which is by the way free and open-source), I\u2019ll explain some more details here. On vmprof.com you can inspect the generated profile interactively instead of looking at console output. What is sent to vmprof.com? You can find details here.\n\nFlamegraph: Accumulates and displays the most frequent codepaths. It allows you to quickly and accurately identify hot spots in your code. 
The flame graph below is a very short run of richards.py (Thus it shows a lot of time spent in PyPy's JIT compiler).\n\n\n\n\n\nList all functions (optionally sorted): the equivalent of the vmprof command line output in the web.\n\n\n\n\n\u00a0Memory curve: A line plot that shows how many MBytes have been consumed over the lifetime of your program (see more info in the section below).\n\n\n\nNative programs\n\nThe new feature introduced in vmprof 0.4.x allows you to look beyond the Python level. As you might know, Python maintains a stack of frames to save the execution. Up to now the vmprof profiles only contained that level of information. But what if your program jumps to native code (such as calling gzip compression on a large file)? Up to now you would not see that information.\n\nMany packages make use of the CPython C API (which we discourage; please look up cffi for a better way to call C). Have you ever had the issue that you know that your performance problems reach down to the C level, but you could not profile it properly? Now you can!\n\n Let's inspect a very simple Python program to find out why a program is significantly slower on Linux than on Mac:\n\nimport numpy as np\nn = 1000\na = np.random.random((n, n))\nb = np.random.random((n, n))\nc = np.dot(np.abs(a), b)\n\n\nTake two NxN random matrix objects and create a dot product. The first argument to the dot product provides the absolute value of the random matrix.\n\n\nRunPythonNumPyOSn=... Took \n [1]CPython 3.5.2NumPy 1.12.1Mac OS X, 10.12.3n=5000~9 sec\n [2]CPython 3.6.0NumPy 1.12.1Linux 64, Kernel 4.9.14n=1000~26 sec\n\n\nNote that the Linux machine operates on a 5 times smaller matrix, still it takes much longer. What is wrong? Is Linux slow? CPython 3.6.0? 
Well no, let's inspect [1] and [2] (shown below in that order).\n\n\n\n\n\n[2] runs on Linux and spends nearly all of the time in PyArray_MatrixProduct2; if you compare to [1] on Mac OS X, you'll see that a lot of time is spent in generating the random numbers and the rest in cblas_matrixproduct.\n\nBlas has a very efficient implementation so you can achieve the same on Linux if you install a blas implementation (such as openblas).\n\nUsually you can spot potential program source locations that take a lot of time and might be the first starting point to resolve performance issues.\n\nBeyond Python programs \n\nIt is not unthinkable that the strategy can be reused for native programs. Indeed this can already be done by creating a small cffi wrapper around an entry point of a compiled C program. It would even work for programs compiled from other languages (e.g. C++ or Fortran). The resulting function names are the full symbol name embedded into either the executable symboltable or extracted from the dwarf debugging information. Most of those will be compiler specific and contain some cryptic information.\n\nMemory profiling\nWe thankfully received a code contribution from the company Blue Yonder. They have built a memory profiler (for Linux and Mac OS X) on top of vmprof.com that displays the memory consumption for the runtime of your process.\n\nYou can run it the following way:\n\n$ python -m vmprof --mem --web script.py\n\nBy adding --mem, vmprof will capture memory information and display it in the dedicated view on vmprof.com. 
You can view it by clicking the 'Memory' switch in the flamegraph view.\n\nThere is more\n\nSome more minor highlights contained in 0.4.x:\n\nVMProf support for Windows 64 bit (No native profiling)\nVMProf can read profiles generated by another host system\nVMProf is now bundled in several binary wheels for fast and easy installation (Mac OS X, Linux 32/64 for CPython 2.7, 3.4, 3.5, 3.6)\n\nFuture plans - Profile Streaming\n\nvmprof has not reached the end of development. There are many features we could implement. But there is one feature that could be a great asset to many Python developers.\n\nContinuous delivery of your statistical profile, or in short, profile streaming. One of the great strengths of vmprof is that it consumes very little overhead. It is not a crazy idea to run this in production.\n\nIt would require a smart way to stream the profile in the background to vmprof.com and new visualizations to look at much more data your Python service produces.\n\nIf that sounds like a solid vmprof improvement, don't hesitate to get in touch with us (e.g. IRC #pypy, mailing list pypy-dev, or comment below)\n\nYou can help! \n\nThere are some immediate things other people could help with. Either by donating time or money (yes we have occasional contributors which is great)!\n\nWe gladly received a code contribution for the memory profiler. But there was not enough time to finish the migration completely. Sadly it is a bit brittle right now.\nWe would like to spend more time on other visualizations. This should include to give a much better user experience on vmprof.com (like a tutorial that explains the visualization that we already have).\u00a0\nBuild Windows 32/64 bit wheels (for all CPython versions we currently support)\n\nWe are also happy to accept Google Summer of Code projects on vmprof for new visualizations and other improvements. 
If you qualify and are interested, don't hesitate to ask!\n\nRichard Plangger (plan_rich) and the PyPy Team\n\n[1] Mac OS X https://vmprof.com/#/567aa150-5927-4867-b22d-dbb67ac824ac\n[2] Linux64 https://vmprof.com/#/097fded2-b350-4d68-ae93-7956cd10150c", + "tags": "", + "url": "https://www.pypy.org/posts/2017/04/native-profiling-in-vmprof-6949065546884243105.html" + }, + { + "title": "PyPy2.7 and PyPy3.5 v5.7 - two in one release", + "text": "The PyPy team is proud to release both PyPy2.7 v5.7 (an interpreter supporting\nPython v2.7 syntax), and a beta-quality PyPy3.5 v5.7 (an interpreter for Python\nv3.5 syntax). The two releases are both based on much the same codebase, thus\nthe dual release. Note that PyPy3.5 only supports Linux 64bit for now.\n\nThis new PyPy2.7 release includes the upstream stdlib version 2.7.13, and PyPy3.5 (our first in the 3.5 series) includes the upstream stdlib version 3.5.3.\n\nWe continue to make incremental improvements to our C-API compatibility layer (cpyext). PyPy2 can now import and run many C-extension packages, among the most notable are Numpy, Cython, and Pandas. Performance may be slower than CPython, especially for frequently-called short C functions. Please let us know if your use case is slow, we have ideas how to make things faster but need real-world examples (not micro-benchmarks) of problematic code.\n\nWork proceeds at a good pace on the PyPy3.5 version due to a grant from the Mozilla Foundation, hence our first 3.5.3 beta release. Thanks Mozilla !!! While we do not pass all tests yet, asyncio works and as these benchmarks show it already gives a nice speed bump. We also backported the f\"\" formatting from 3.6 (as an exception; otherwise \u201cPyPy3.5\u201d supports the Python 3.5 language).\n\nCFFI has been updated to 1.10, improving an already great package for interfacing with C.\n\nWe now use shadowstack as our default gcrootfinder even on Linux. The alternative, asmgcc, will be deprecated at some future point. 
While about 3% slower, shadowstack is much more easily maintained and debuggable. Also, the performance of shadowstack has been improved in general: this should close the speed gap between other platforms and Linux.\n\nAs always, this release fixed many issues and bugs raised by the growing community of PyPy users. We strongly recommend updating.\n\nYou can download the v5.7 release here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy project.\nWe would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython\u2019s JIT even better.\n\n\n\u00a0\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. It\u2019s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\nThe PyPy 2.7 release supports:\n\n\n\nx86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\n\n\n\n\u00a0\n\nWhat else is new?\n\n(since the releases of PyPy 2.7 and 3.3 at the end of 2016)\n\nThere are many incremental improvements to RPython and PyPy, the complete listing is here.\n\n\u00a0 \nPlease update, and continue to help us make PyPy better.\n\nCheers, The PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2017/03/pypy27-and-pypy35-v57-two-in-one-release-4736633226245374150.html" + }, + { + "title": "Leysin Winter Sprint Summary", + "text": "Today\n is the last day of our yearly sprint event in Leysin. 
We had lots of \nideas on how to enhance the current state of PyPy, we went skiing and \nhad interesting discussions around virtual machines, the Python \necosystem, and other real world problems.\n\u00a0\n\n\nWhy don't you join us next time?\n\n\nA usual PyPy sprint day goes through the following stages:\n\n\n\n\n\u00a0Planning Session: Tasks from previous days that have seen progress or \nare completed are noted in a shared document. Everyone adds new tasks \nand then assigns themselves to one or more tasks (usually in pairs). As \nsoon as everybody is happy with their task and has a partner to work \nwith, the planning session is concluded and the work can start.\nDiscussions: A sprint is a good occasion to discuss difficult \nand important topics in person. We usually sit down in a separate area \nin the sprint room and discuss until a) nobody wants to discuss anymore \nor b) we found a solution to the problem. The good thing is that usually \nthe outcome is b).\nLunch: For lunch we prepare sandwiches and other finger food.\nContinue working until dinner, which we eat at a random restaurant in Leysin.\nGoto 1 the next day, if sprint has not ended.\n\n\n\nSprints\n are open to everybody and help newcomers to get started with PyPy (we usually\n pair you with a developer familiar with PyPy). They are perfect to \ndiscuss and find solutions to problems we currently face. If you are \neager to join next year, please don't hesitate to register next year \naround January.\n\n\u00a0\n\n\nSprint Summary\u00a0 \u00a0\nSprint goals included to work on the following topics: \n\n\nWork towards releasing PyPy 3.5 (it will be released soon)\nCPython Extension (CPyExt) modules on PyPy\nHave fun in winter sports (a side goal)\n\n\n\n\nHighlights\n\n\n\n\n\n\n\nWe have spent lots of time debugging and fixing memory issues on CPyExt.\n In particular, we fixed a serious memory leak where taking a memoryview\n would prevent numpy arrays from ever being freed. 
More work is still required to ensure that our GC always releases arrays in a timely \nmanner.\nFruitful discussions and progress about how to flesh out some details about the unicode representation in PyPy. Our current goal is to use utf-8 as the unicode representation internally and have fast vectorized operations (indexing, check if valid, ...).\nPyPy will participate in GSoC 2017 and we will try to allocate more resources to that than last year.\nProfile and think about some details how to reduce the starting size of the interpreter. The starting point would be to look at the parser and reduce the amount of strings to keep alive.\nFound a topic for a student's master thesis: correctly freeing cpyext reference cycles.\nRun lots of Python3 code on top of PyPy3 and resolve issues we found along the way.\nInitial work on making RPython thread-safe without a GIL.\n\n\n\n\nList of attendees\n\n\n- Stefan Beyer\n\n- Antonio Cuni\n\n- Maciej Fijalkowski\n\n- Manuel Jacob\n\n- Ronan Lamy\n\n- Remi Meier\n\n- Richard Plangger\n\n- Armin Rigo\n\n- Robert Zaremba\n\n\u00a0\n\n\u00a0 \n\n\n\n\n\n\n\n\nWe\n would like to thank our donors for the continued support of the PyPy \nproject and we are looking forward to next year's sprint in Leysin.\n\n\n\n\nThe PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2017/03/leysin-winter-sprint-summary-4587213628578490701.html" + }, + { + "title": "Async HTTP benchmarks on PyPy3", + "text": "Hello everyone,\n\n\n\nSince Mozilla announced funding, we've been working quite hard on delivering you a working Python 3.5.\n\n\u00a0\n\nWe are almost ready to release an alpha version of PyPy 3.5. Our goal is to release it shortly after the sprint. Many modules have already been ported and\u00a0 it can probably run many Python 3 programs already. We are happy to receive any feedback after the next release.\u00a0 \n\n\n\nTo show that the heart (asyncio) of Python 3 is already working we have prepared some benchmarks. 
They are done by Pawe\u0142 Piotr Przeradowski @squeaky_pl for an HTTP workload on several asynchronous IO libraries, namely the relatively new asyncio and curio libraries and the battle-tested tornado, gevent and Twisted libraries. To see the benchmarks check out https://github.com/squeaky-pl/zenchmarks and the instructions for reproducing can be found inside README.md in the repository. Raw results can be obtained from https://github.com/squeaky-pl/zenchmarks/blob/master/results.csv.\n\n\n\nThe\n purpose of the presented benchmarks is showing that the upcoming PyPy release \nis already working with unmodified code that runs on CPython 3.5. PyPy \nalso manages to make them run significantly faster.\n\n\n\nThe\n benchmarks consist of HTTP servers implemented on the top of the mentioned \nlibraries. All the servers are single-threaded relying on underlying \nevent loops to provide concurrency. Access logging was disabled to \nexclude terminal I/O from the results. The view code consists of a \nlookup in a dictionary mapping ASCII letters to verses from the famous \nZen of Python. If a verse is found the view returns it, otherwise a 404 \nNot Found response is served. The 400 Bad Request and 500 Internal \nServer Error cases are also handled.\n\n\n\nThe workload was generated with the wrk HTTP benchmarking tool. It is run with one thread opening up to 100 \nconcurrent connections for 2 seconds and repeated 1010 times to get \nconsecutive measures. There is a Lua script provided\n that instructs wrk to continuously send 24 different requests that hit \ndifferent execution paths (200, 404, 400) in the view code. Also it is \nworth noting that wrk will only count 200 responses as successful so the actual request per second throughput is higher.\n\n\n\nFor your convenience all the used libraries versions are vendored into the benchmark repository. 
There is also a precompiled portable version of wrk provided\n that should run on any reasonably recent (10 year old or newer) Linux \nx86_64 distribution. The benchmark was performed on a public cloud scaleway x86_64 server launched in a Paris data center. The server was running \nUbuntu 16.04.01 LTS and reported Intel(R) Xeon(R) CPU D-1531 @ 2.20GHz \nCPU. CPython 3.5.2 (shipped by default in Ubuntu) was benchmarked \nagainst a pypy-c-jit-90326-88ef793308eb-linux64 snapshot of the 3.5 compatibility branch of PyPy.\n\n\n\n\n\n\u00a0\n\n\u00a0\n\n\u00a0\n\n\u00a0\n\nWe want to thank Mozilla for supporting our work!\n\n\n\nCheers,\n\nfijal, squeaky_pl and the PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2017/03/async-http-benchmarks-on-pypy3-1092124994927894138.html" + }, + { + "title": "Leysin Winter Sprint: 25/26th Feb. - 4th March 2017", + "text": "The next PyPy sprint will be in Leysin, Switzerland, for the twelfth time.\nThis is a fully public sprint: newcomers and topics other than those\nproposed below are welcome.\n\nGoals and topics of the sprint\nThe list of topics is very open.\n\nThe main topic is Python 3.5 support in PyPy, as most py3.5\ncontributors should be present. It is also a good topic if you have\nno or limited experience with PyPy contribution: we can easily find\nsomething semi-independent that is not done in py3.5 so far, and\ndo pair-programming with you.\nAny other topic is fine too: JIT compiler optimizations, CFFI,\nthe RevDB reverse debugger, improving the speed of your program on\nPyPy, etc.\nAnd as usual, the main side goal is to have fun in winter sports :-)\nWe can take a day off (for ski or anything else).\n\n\n\nExact times\nWork days: starting 26th Feb (~noon), ending March 4th (~noon).\nI have pre-booked the week from Saturday Feb 25th to Saturday March 4th.\nIf it is possible for you to arrive Sunday before mid-afternoon, then\nyou should get a booking from Sunday only. 
The break day should be\naround Wednesday.\nIt is fine to stay a few more days on either side, or conversely to book\nfor a part of that time only.\n\n\nLocation & Accommodation\n\nLeysin, Switzerland, \"same place as before\".\n\n\n\nLet me refresh your\nmemory: both the sprint venue and the lodging will be in a\npair of chalets built specifically for bed & breakfast:\nhttps://www.ermina.ch/. The place has a good ADSL Internet connection\nwith wireless installed. You can also arrange your own lodging\nelsewhere (as long as you are in Leysin, you cannot be more than a 15\nminutes walk away from the sprint venue).\nPlease confirm that you are coming so that we can adjust the\nreservations as appropriate.\nThe options of rooms are a bit more limited than on previous years\nbecause the place for bed-and-breakfast is shrinking; but we should\nstill have enough room for us. The price is around 60 CHF, breakfast\nincluded, in shared rooms (3 or 4 people). If there are people that\nwould prefer a double or single room, please contact me and we'll see\nwhat choices you have. There is also a choice of hotels in Leysin.\nPlease register by Mercurial:\n\nhttps://bitbucket.org/pypy/extradoc/\nhttps://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2017/\nor on the pypy-dev mailing list if you do not yet have check-in rights:\n\nhttps://mail.python.org/mailman/listinfo/pypy-dev\nYou need a Swiss-to-(insert country here) power adapter. There will be\nsome Swiss-to-EU adapters around, and at least one EU-format power strip.", + "tags": "", + "url": "https://www.pypy.org/posts/2017/01/leysin-winter-sprint-2526th-feb-4th-3831779797804484935.html" + }, + { + "title": "PyPy2.7 v5.6 released - stdlib 2.7.12 support, C-API improvements, and more", + "text": "We have released PyPy2.7 v5.6 [0], about two months after PyPy2.7 v5.4. 
This new PyPy2.7 release includes the upstream stdlib version 2.7.12.\n\nWe continue to make incremental improvements to our C-API compatibility layer (cpyext). We pass all but 12 of the over-6000 tests in the upstream NumPy test suite, and have begun examining what it would take to support Pandas and PyQt. \n\nWork proceeds at a good pace on the PyPy3.5 version due to a grant from the Mozilla Foundation, and some of those changes have been backported to PyPy2.7 where relevant.\n\nThe PowerPC and s390x backend have been enhanced with the capability to use SIMD instructions for micronumpy loops.\n\nWe changed timeit to now report average +/- standard deviation, which is better than the misleading minimum value reported in CPython.\n\nWe now support building PyPy with OpenSSL 1.1 in our built-in _ssl module, as well as maintaining support for previous versions.\n\nCFFI has been updated to 1.9, improving an already great package for interfacing with C.\n\nAs always, this release fixed many issues and bugs raised by the growing community of PyPy users. We strongly recommend updating. You can download the PyPy2.7 v5.6 release here:\n\n\nhttps://pypy.org/download.html\n\nDownstream packagers have been hard at work. The Debian package is already available, and the portable PyPy versions are also ready, for those who wish to run PyPy on other Linux distributions like RHEL/Centos 5.\n\nWe would like to thank our donors for the continued support of the PyPy project.\n\nWe would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython\u2019s JIT even better.\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. 
It\u2019s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\nThis release supports:\n\n\n\nx86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\n\n\n\n\nWhat else is new?\n\n(since the release of PyPy 5.4 in August, 2016)\n\nThere are many incremental improvements to RPython and PyPy, the complete listing is here.\n\n\u00a0 \nPlease update, and continue to help us make PyPy better.\n\nCheers, The PyPy team\n\n[0] We skipped 5.5 since we share a code base with PyPy3, and PyPy3.3-v.5.5-alpha was released last month", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/11/pypy27-v56-released-stdlib-2712-support-5671090852400583673.html" + }, + { + "title": "Vectorization extended. PowerPC and s390x", + "text": "We are happy to announce that JIT support in both the PowerPC backend and the\ns390x backend have been enhanced. Both can now vectorize loops via SIMD\ninstructions. Special thanks to IBM for funding this work.\n\nIf you are not familiar with this topic you can read more details\u00a0here.\n\n\nThere are many more enhancements under the hood. Most notably, all pure operations are now delayed until the latest possible point. In some cases indices have been calculated more than once or they needed an additional register, because the old value is still used. Additionally it is now possible to load quadword-aligned memory in both PPC and s390x (x86 currently cannot do that).\n\nNumPy & CPyExt\nThe community and core developers have been moving CPyExt towards a complete, but emulated, layer for CPython C extensions. 
This is great, because the one restriction preventing the wider deployment of PyPy in several scenarios will hopefully be removed. However, we advocate not to use CPyExt, but rather to not write C code at all (let PyPy speed up your Python code) or use cffi.\n\nThe work done here to support vectorization helps micronumpy (NumPyPy) to speed up operations for PPC and s390x. So why is PyPy supporting both NumPyPy and NumPy, do we actually need both? Yes, there are places where gcc can beat the JIT, and places where the tight integration between NumPyPy and PyPy is more performant. We do have plans to integrate both, hijacking the C-extension method calls to use NumPyPy where we know NumPyPy can be faster.\n\nJust to give you an idea why this is a benefit:\n\nNumPy arrays can carry custom dtypes and apply user defined python functions on the arrays. How could one optimize this kind of scenario? In a traditional setup, you cannot. But as soon as NumPyPy is turned on, you can suddenly JIT compile this code and vectorize it.\n\nAnother example is element access that occurs frequently, or any other calls that cross between Python and the C level frequently.\n\nBenchmarks\nLet's have a look at some benchmarks reusing\u00a0mikefc's numpy benchmark suite\u00a0(find the forked version here).\u00a0I only ran a subset of microbenchmarks, showing that the core functionality is\nfunctioning properly. Additionally it has been rewritten to use\u00a0perf\u00a0instead of the timeit stdlib module.\n\n\nSetup\nx86 runs on a Intel i7-2600 clocked at 3.40GHz using 4 cores. PowerPC runs on the Power 8 clocked at 3.425GHz providing 160 cores. Last but not least the mainframe machine clocked up to 4 GHz, but fully virtualized (as it is common for such machines). Note that PowerPC is a non private remote machine. It is used by many users and it is crowded with processes. 
It is hard to extract a stable benchmark there.\n\nx86 ran on Fedora 24 (kernel version of 4.8.4), PPC ran on Fedora 21 (kernel version 3.17.4) and s390x ran on Redhat Linux 7.2 (kernel version 3.10.0). Respectively, numpy on cpython had openblas available on x86, no blas implementation was present on s390x and PPC provided blas and lapack.\n\nAs you can see all machines run very different configurations. It does not make sense to compare across platforms, but rather implementations on the same platform.\n\n\n\n\n\n\n\nBlue shows CPython 2.7.10+ available on that platform using the latest NumPy (1.11). Micro NumPy is used for PyPy. PyPy+ indicates that the vectorization optimization is turned on.\nAll bar charts show the median value of all runs (5 samples, 100 loops, 10 inner loops, for the operations on vectors (not matrices) the loops are set to 1000). PyPy additionally gets 3 extra executions to warmup the JIT.\n\nThe comparison is really comparing speed of machine code. It compares the PyPy's JIT output vs GCC's output. It has little to do with the speed of the interpreter.\n\nBoth new SIMD backends speedup the numeric kernels. Some times it is near to the speed of CPython, some times it is faster. The maximum parallelism very much depends on the extension emitted by the compiler. All three SIMD backends have the same vector register size (which is 128 bit). This means that all three behave similar but ppc and s390x gain more because they can load 128bit of memory from quadword aligned memory.\n\n\nFuture directions\nPython is achieving rapid adoption in data science. This is currently a trend emerging in Europe, and Python is already heavily used for data science in the USA and many other places around the world.\n\n\nPyPy can make a valuable contribution for data scientists, helping them to rapidly write scientific programs in Python and run them at near native speed. 
If you happen to be in that situation, we are eager to hear your feedback or resolve your issues and also work together to improve the performance of your\ncode. Just get in touch!\n\n\nRichard Plangger (plan_rich) and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2016/11/vectorization-extended-powerpc-and-s390x-4042433015460084057.html" + }, + { + "title": "PyPy3 5.5.0 released", + "text": "We're pleased to announce the release of PyPy3 v5.5.0. Coming four months after PyPy3.3 v5.2, it improves compatibility with Python 3.3 (3.3.5). We strongly recommend updating from previous PyPy3 versions.\n\nWe would like to thank all of the people who donated to the py3k proposal for supporting the work that went into this release.\n\nYou can download the PyPy3.3 v5.5.0 release here:\u00a0https://pypy.org/download.html\n\nImproved Python 3.3.5 support.\n\nos.get_terminal_size(), time.monotonic(), str.casefold()\u00a0\nfaulthandler module\nThere are still some missing features such as a PEP 393-like space efficient string representation and including performance regressions (e.g. issue #2305). The focus for this release has been updating to 3.3 compatibility. Windows is also not yet supported.\n\nensurepip is also included (it's only included in CPython 3 >= 3.4).\nBuffer interface improvements (numpy on top of cpyext)\nSeveral JIT improvements (force-virtual-state, residual calls)\nSearch path for libpypy-c.so has changed (helps with cffi embedding on linux distributions)\nImprove the error message when the user forgot the \"self\" argument of a method\nMany more small improvements, please head over to our documentation for more information\n\n\nTowards Python 3.5\n\n\nWe have started to work on Python 3.5, which is a version used by many software projects. It seems to get wide adoption. 
We are happy to be part of the\u00a0Mozilla Open Source Support (MOSS) initiative.\n\n\n\nNevertheless we want to give our users the chance to use PyPy in their Python 3 projects, thus we have prepared this release.\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\u00a0CPython 2.7.10 and 3.3.5. It's fast due to its integrated tracing JIT\u00a0compiler.\n We also welcome developers of other dynamic languages to see what RPython can do for them.\n\nThis release supports:\n\nx86 machines on most common operating systems except Windows\u00a0\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux\u00a0\nbig- and little-endian variants of PPC64 running Linux\u00a0\ns390x running Linux\n\nPlease try it out and let us know what you think. We welcome feedback, we know\nyou are using PyPy, please tell us about it!\n\nCheers\n\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/10/pypy3-550-released-8069558680221199646.html" + }, + { + "title": "RevDB released, v5.4.1", + "text": "Hi all,\n\n\nThe first beta version of RevDB is out! Remember that RevDB is a reverse debugger for Python. The idea is that it is a debugger that can run forward and backward in time, letting you more easily understand your subtle bug in your big Python program.\n\n\nRevDB should work on almost any Python program. Even if you are normally only using CPython, trying to reproduce the bug with RevDB is similar to trying to run the program on a regular PyPy---usually it just works, even if not quite always.\n\n\nNews from the alpha version in the previous blog post include notably support for:\n\nThreads.\nCPyExt, the compatibility layer of PyPy that can run CPython C extension modules.\n\nas well as many other improvements.\n\n\nYou need to build it yourself for now. It is tested on 64-bit Linux. 
32-bit Linux, OS/X, and other POSIX platforms should all either work out of the box or be just a few fixes away (contributions welcome). Win32 support is a lot more involved but not impossible.\n\n\nSee https://bitbucket.org/pypy/revdb/ for more information!\n\nArmin", + "tags": "releaserevdb", + "url": "https://www.pypy.org/posts/2016/09/revdb-released-v541-6719768292347391304.html" + }, + { + "title": "PyPy 5.4.1 bugfix released", + "text": "We have released a bugfix for PyPy2.7-v5.4.0, released last week, due to the following issues:\n\n\n\nUpdate list of contributors in documentation and LICENSE file, this was unfortunately left out of 5.4.0. My apologies to the new contributors\nAllow tests run with -A to find libm.so even if it is a script not a dynamically loadable file\nBump sys.setrecursionlimit() when translating PyPy, for translating with CPython\nTweak a float comparison with 0 in backendopt.inline to avoid rounding errors\nFix for an issue for translating the sandbox\nFix for an issue where unicode.decode('utf8', 'custom_replace') messed up the last byte of a unicode string sometimes\nUpdate built-in cffi to version 1.8.1\nExplicitly detect that we found as-yet-unsupported OpenSSL 1.1, and crash translation with a message asking for help porting it\nFix a regression where a PyBytesObject was forced (converted to a RPython object) when not required, reported as issue #2395\n\n\nThanks to those who reported the issues.\n\n\nWhat is PyPy?\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. 
It's fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\n\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\n\nThis release supports:\n\nx86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\nPlease update, and continue to help us make PyPy better.\n\nCheers\n\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/09/pypy-541-bugfix-released-3217566297258542810.html" + }, + { + "title": "PyPy2 v5.4 released - incremental improvements and enhancements", + "text": "We have released PyPy2.7 v5.4, a little under two months after PyPy2.7 v5.3.\nThis new PyPy2.7 release includes incremental improvements to our C-API\ncompatibility layer (cpyext), enabling us to pass over 99% of the upstream\nnumpy test suite.\n\nWe updated built-in cffi support to version 1.8,\nwhich now supports the \u201climited API\u201d mode for c-extensions on\nCPython >=3.2.\n\n\nWe improved tooling for the PyPy JIT, and expanded VMProf\nsupport to OpenBSD and Dragon Fly BSD\n\n\nAs always, this release fixed many issues and bugs raised by the\ngrowing community of PyPy users. We strongly recommend updating.\n\n\nYou can download the PyPy2 v5.4 release here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for their continued support of the PyPy\nproject. We would also like to thank our contributors and\nencourage new people to join the project. PyPy has many\nlayers and we need help with all of them: PyPy and RPython documentation\nimprovements, testing and adapting popular modules to run on PyPy, or general help\nwith making RPython\u2019s JIT even better.\n\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. 
It\u2019s fast (PyPy and CPython 2.7 performance comparison) due to its integrated tracing JIT compiler.\n\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\n\nThis release supports:\n\nx86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux\nbig- and little-endian variants of PPC64 running Linux\ns390x running Linux\n\n\n\nWhat is New?\n\n(since the release of PyPy 5.3 in June, 2016)\nThere are many incremental improvements to RPython and PyPy, the complete listing is here. Mozilla generously sponsored work toward python 3.5 compatibility, and we are beginning to see some cross-over improvements of RPython and PyPy2.7 as a result.\n\nPlease update, and continue to help us make PyPy better.\nCheers\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/08/pypy2-v54-released-incremental-3611318295736669599.html" + }, + { + "title": "PyPy Tooling Upgrade: JitViewer and VMProf", + "text": "We are happy to announce a major JitViewer (JV) update.\nJV allows you to inspect RPython's internal compiler representation (the language in which PyPy is implemented) including the generated machine code of your program. 
It can graphically show you details of the JIT compiled code and helps you pinpoint issues in your program.\n\nVMProf is a statistical CPU profiler for python imposing very little overhead at runtime.\n\nBoth VMProf and JitViewer share a common goal: Present useful information for your python program.\nThe combination of both can reveal more information than either alone.\nThat is the reason why they are now both packaged together.\nWe also updated vmprof.com\u00a0with various bug fixes and changes including an all new interface to JV.\n\nThis work was done with the goal of improving tooling and libraries around the Python/PyPy/RPython ecosystem.\nSome of the tools we have developed:\n\n\nCFFI - Foreign Function Interface that avoids CPyExt (CFFI docs)\nRevDB - A reverse debugger for python (RevDB blog post)\n\n\nand of course the tools we discuss here:\n\n\nVMProf - A statistical CPU profiler (VMProf docs)\nJitViewer - Visualization of the log file produced by RPython (JitLog docs)\n\n\n\nA \"brand new\" JitViewer\n\nJitViewer has two pieces: you create a log file when running your program, and then use a graphic tool to view what happened.\n\nThe old logging format was a hard-to-maintain, plain-text-logging facility. Frequent changes often broke internal tools.\nAdditionally, the logging output of a long running program required a lot of disk space.\n\nOur new binary format encodes data densely, makes use of some compression (gzip), and tries to remove repetition where possible.\nIt also supports versioning for future proofing and can be extended easily.\n\nAnd *drumroll* you no longer need to install a tool to view the log yourself\nanymore! The whole system moved to vmprof.com and you can use it any time.\n\nSounds great. But what can you do with it? Here are two examples for a PyPy user:\n\nPyPy crashed? Did you discover a bug?\n\nFor some hard to find bugs it is often necessary to look at the compiled code. 
The old\nprocedure often required you to upload a plain text file which was hard to parse and to look through.\n\nA better way to share a crash report is to install the ``vmprof`` module from PyPi and execute either of the two commands:\n\n# this program does not crash, but has some weird behaviour\n$ pypy -m jitlog --web \n...\nPyPy Jitlog: https://vmprof.com/#//traces\n# this program segfaults\n$ pypy -m jitlog -o /tmp/log \n...\n\n$ pypy -m jitlog --upload /tmp/log\nPyPy Jitlog: https://vmprof.com/#//traces\n\n\nProviding the link in the bug report allows PyPy developers to browse and identify potential issues.\n\n\nSpeed issues\n\nVMProf is a great tool to find hot spots that consume a lot of time in your program. As soon as you have identified code that runs slowly, you can switch to jitlog and maybe pinpoint certain aspects that do not behave as expected. You will find an overview, and are able to browse the generated code. If you cannot make sense of all that, you can just share the link with us and we can have a look too.\n\nFuture direction\n\nWe hope that the new release will help both PyPy developers and PyPy users resolve potential issues and easily point them out.\n\nHere are a few ideas what might come in the next few releases:\n\n\n\n\u00a0Combination of CPU profiles and the JITLOG (sadly did not make it into the current release).\nExtend vmprof.com to be able to query vmprof/jitlog. An example query for vmprof: 'methods.callsites() > 5' andfor the jitlog would be 'traces.contains('call_assembler').hasbridge('*my_func_name*')'.\nExtend the jitlog to capture the information of the optimization stage.\n\n\n\nRichard Plangger (plan_rich) and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2016/08/pypy-tooling-upgrade-jitviewer-and-5107430577468391432.html" + }, + { + "title": "PyPy gets funding from Mozilla for Python 3.5 support", + "text": "\"Python 2.x versus Python 3.x\": this is by now an old question. 
In the eyes of some people Python 2 is here to stay, and in the eyes of others Python has long been 3 only.\n\nPyPy's own position is that PyPy will support Python 2.7 forever---the RPython language in which PyPy is written is a subset of 2.7, and we have no plan to upgrade that. But at the same time, we want to support 3.x. This is particularly true now: a relatively recent development is that Python 3.5 seems to attract more and more people. The \"switch\" to Python 3.x might be starting to happen.\n\nCorrespondingly, PyPy has been searching for a while for a way to support a larger-scale development effort. The goal is to support not just any old version of Python 3.x, but Python 3.5, as this seems to be the version that people are switching to. PyPy is close to supporting all of Python 3.3 now; but the list of what is new in Python 3.4 and 3.5 is far, far longer than anyone imagines. The long-term goal is also to get a version of \"PyPy3\" that is as good as \"PyPy2\" is, including its performance and its cpyext layer (CPython C API interoperability), for example.\n\nSo, the end result: Mozilla recently decided to award $200,000 to Baroque Software to work on PyPy as part of its Mozilla Open Source Support (MOSS) initiative. This money will be used to implement the Python 3.5 features in PyPy. Within the next year, we plan to use the money to pay four core PyPy developers half-time to work on the missing features and on some of the big performance and cpyext issues. This should speed up the progress of catching up with Python 3.x significantly. We are extremely thankful to Mozilla for supporting us in this way, and will keep you updated on the progress via this blog.", + "tags": "sponsors", + "url": "https://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.html" + }, + { + "title": "Reverse debugging for Python", + "text": "RevPDB\nA \"reverse debugger\" is a debugger where you can go forward and\nbackward in time. 
It is an uncommon feature, at least in the open\nsource world, but I have no idea why. I have used undodb-gdb and\nrr, which are reverse debuggers for C code, and I can only say that\nthey saved me many, many days of poking around blindly in gdb.\nThe PyPy team is pleased to give you \"RevPDB\", a reverse-debugger\nsimilar to rr but for Python.\nAn example is worth a thousand words. Let's say your big Python\nprogram has a bug that shows up inconsistently. You have nailed it\ndown to something like:\n\nstart x.py, which does stuff (maybe involving processing files,\nanswering some web requests that you simulate from another terminal,\netc.);\nsometimes, after a few minutes, your program's state becomes\ninconsistent and you get a failing assert or another exception.\n\nThis is the case where RevPDB is useful.\nRevPDB is available only on 64-bit Linux and OS/X right now, but should\nnot be too hard to port to other OSes. It is very much alpha-level!\n(It is a debugger full of bugs. Sorry about that.) I believe it is\nstill useful---it helped me in one real use case already.\n\n\nHow to get RevPDB\nThe following demo was done with an alpha version for 64-bit Linux,\ncompiled for Arch Linux. I won't provide the binary; it should be\neasy enough to retranslate (much faster than a regular PyPy because it\ncontains neither a JIT nor a custom GC). 
Grab the PyPy sources from\nMercurial, and then:\n\nhg update reverse-debugger\n# or \"hg update ff376ccacb36\" for exactly this demo\ncd pypy/goal\n../../rpython/bin/rpython -O2 --revdb targetpypystandalone.py \\\n --withoutmod-cpyext --withoutmod-micronumpy\n\nand possibly rename the final pypy-c to pypy-revdb to avoid\nconfusion.\nOther platforms than 64-bit Linux and OS/X need some fixes before they work.\n\n\nDemo\nFor this demo, we're going to use this x.py as the \"big program\":\n\nimport os\n\nclass Foo(object):\n value = 5\n\nlst1 = [Foo() for i in range(100)]\nlst1[50].value += 1\nfor x in lst1:\n x.value += 1\n\nfor x in lst1:\n if x.value != 6:\n print 'oops!'\n os._exit(1)\n\nOf course, it is clear what occurs in this small example: the check\nfails on item 50. For this demo, the check has been written with\nos._exit(1), because this exits immediately the program. If it\nwas written with an assert, then its failure would execute things\nin the traceback module afterwards, to print the traceback; it\nwould be a minor mess just to find the exact point of the failing\nassert. (This and other issues are supposed to be fixed in the\nfuture, but for now it is alpha-level.)\nAnyway, with a regular assert and a regular post-mortem pdb,\nwe could observe that x.value is indeed 7 instead of 6 when the\nassert fails. Imagine that the program is much bigger: how would we\nfind the exact chain of events that caused this value 7 to show up on\nthis particular Foo object? This is what RevPDB is for.\nFirst, we need for now to disable Address Space Layout Randomization\n(ASLR), otherwise replaying will not work. 
This is done once with the\nfollowing command line, which changes the state until the next\nreboot:\n\necho 0 | sudo tee /proc/sys/kernel/randomize_va_space\n\nUPDATE: the above is no longer necessary from revision ff376ccacb36.\nRun x.py with RevPDB's version of PyPy instead of the regular\ninterpreter (CPython or PyPy):\n\nPYPYRDB=log.rdb ./pypy-revdb x.py\n\nThis pypy-revdb executable is like a slow PyPy executable, running\n(for now) without a JIT. This produces a file log.rdb which\ncontains a complete log of this execution. (If the bug we are\ntracking occurs rarely, we need to re-run it several times until we\nget the failure. But once we got the failure, then we're done with\nthis step.)\nStart:\n\nrpython/translator/revdb/revdb.py log.rdb\n\nWe get a pdb-style debugger. This revdb.py is a normal Python\nprogram, which you run with an unmodified Python; internally, it looks\ninside the log for the path to pypy-revdb and run it as needed (as\none forking subprocess, in a special mode).\nInitially, we are at the start of the program---not at the end, like\nwe'd get in a regular debugger:\n\nFile \"/app_main.py\", line 787 in setup_bootstrap_path:\n(1)$\n\nThe list of commands is available with help.\nGo to the end with continue (or c):\n\n(1)$ continue\nFile \"/tmp/x.py\", line 14 in :\n...\n lst1 = [Foo() for i in range(100)]\n lst1[50].value += 1\n for x in lst1:\n x.value += 1\n\n for x in lst1:\n if x.value != 6:\n print 'oops!'\n> os._exit(1)\n(19727)$\n\nWe are now at the beginning of the last executed line. The number\n19727 is the \"time\", measured in number of lines executed. We can go\nbackward with the bstep command (backward step, or bs), line\nby line, and forward again with the step command. There are also\ncommands bnext, bcontinue and bfinish and their forward\nequivalents. There is also \"go TIME\" to jump directly to the specified\ntime. 
(Right now the debugger only stops at \"line start\"\nevents, not at function entry or exit, which makes some cases a bit\nsurprising: for example, a step from the return statement of\nfunction foo() will jump directly to the caller's caller, if the\ncaller's current line was return foo() + 2, because no \"line\nstart\" event occurs in the caller after foo() returns to it.)\nWe can print Python expressions and statements using the p\ncommand:\n\n(19727)$ p x\n$0 = <__main__.Foo object at 0xfffffffffffeab3e>\n(19727)$ p x.value\n$1 = 7\n(19727)$ p x.value + 1\n8\n\nThe \"$NUM =\" prefix is only shown when we print an object that\nreally exists in the debugged program; that's why the last line does\nnot contain it. Once a $NUM has been printed, then we can use\nit in further expressions---even at a different point time. It\nbecomes an anchor that always refers to the same object:\n\n(19727)$ bstep\n\nFile \"/tmp/x.py\", line 13 in :\n...\n\n lst1 = [Foo() for i in range(100)]\n lst1[50].value += 1\n for x in lst1:\n x.value += 1\n\n for x in lst1:\n if x.value != 6:\n> print 'oops!'\n os._exit(1)\n(19726)$ p $0.value\n$1 = 7\n\nIn this case, we want to know when this value 7 was put in this\nattribute. This is the job of a watchpoint:\n\n(19726)$ watch $0.value\nWatchpoint 1 added\nupdating watchpoint value: $0.value => 7\n\nThis watchpoint means that $0.value will be evaluated at each line.\nWhen the repr() of this expression changes, the watchpoint activates\nand execution stops:\n\n(19726)$ bcontinue\n[searching 19629..19726]\n[searching 19338..19629]\n\nupdating watchpoint value: $0.value => 6\nReverse-hit watchpoint 1: $0.value\nFile \"/tmp/x.py\", line 9 in :\n import os\n\n class Foo(object):\n value = 5\n\n lst1 = [Foo() for i in range(100)]\n lst1[50].value += 1\n for x in lst1:\n> x.value += 1\n\n for x in lst1:\n if x.value != 6:\n print 'oops!'\n os._exit(1)\n(19524)$\n\nNote that using the $NUM syntax is essential in watchpoints. 
You\ncan't say \"watch x.value\", because the variable x will go out\nof scope very soon when we move forward or backward in time. In fact\nthe watchpoint expression is always evaluated inside an environment\nthat contains the builtins but not the current locals and globals.\nBut it also contains all the $NUM, which can be used to refer to\nknown objects. It is thus common to watch $0.attribute if $0\nis an object, or to watch len($1) if $1 is some list. The\nwatch expression can also be a simple boolean: for example, \"watch\n$2 in $3\" where $3 is some dict and $2 is some object that\nyou find now in the dict; you would use this to find out the time when\n$2 was put inside $3, or removed from it.\nUse \"info watchpoints\" and \"delete \" to manage\nwatchpoints.\nThere are also regular breakpoints, which you set with \"b\nFUNCNAME\". It breaks whenever there is a call to a function that\nhappens to have the given name. (It might be annoying to use for a\nfunction like __init__() which has many homonyms. There is no\nsupport for breaking on a fully-qualified name or at a given line\nnumber for now.)\nIn our demo, we stop at the line x.value += 1, which is where the\nvalue was changed from 6 to 7. Use bcontinue again to stop at the\nline lst1[50].value += 1, which is where the value was changed from\n5 to 6. Now we know how this value attribute ends up being 7.\n\n(19524)$ bcontinue\n[searching 19427..19524]\n[searching 19136..19427]\n\nupdating watchpoint value: $0.value => 5\nReverse-hit watchpoint 1: $0.value\nFile \"/tmp/x.py\", line 7 in :\n import os\n\n class Foo(object):\n value = 5\n\n lst1 = [Foo() for i in range(100)]\n> lst1[50].value += 1\n for x in lst1:\n x.value += 1\n\n for x in lst1:\n if x.value != 6:\n...\n(19422)$\n\nTry to use bcontinue yet another time. It will stop now just before\n$0 is created. 
At that point in time, $0 refers to\nan object that does not exist yet, so the watchpoint now evaluates to\nan error message (but it continues to work as before, with that error\nmessage as the string it currently evaluates to).\n\n(19422)$ bcontinue\n[searching 19325..19422]\n\nupdating watchpoint value: $0.value => RuntimeError:\n '$0' refers to an object created later in time\nReverse-hit watchpoint 1: $0.value\nFile \"/tmp/x.py\", line 6 in :\n import os\n\n class Foo(object):\n value = 5\n\n> lst1 = [Foo() for i in range(100)]\n lst1[50].value += 1\n for x in lst1:\n x.value += 1\n\n for x in lst1:\n...\n(19371)$\n\nIn big programs, the workflow is similar, just more complex. Usually\nit works this way: we find interesting points in time with some\ncombination of watchpoints and some direct commands to move around.\nWe write down on a piece of (real or virtual) paper these points in\nhistory, including most importantly their time, so that we can\nconstruct an ordered understanding of what is going on.\nThe current revdb can be annoying and sometimes even crash; but\nthe history you reconstruct can be kept. All the times and\nexpressions printed are still valid when you restart revdb. The\nonly thing \"lost\" is the $NUM objects, which you need to print\nagain. (Maybe instead of $0, $1, ... we should use $, where the big number identifies uniquely the object by its\ncreation time. These numbers would continue to be valid even after\nrevdb is restarted. They are more annoying to use than just\n$0 though.)\nScreencast: Here's a (slightly typo-y) screencast of cfbolz using the reverse debugger:\n\n\n\nCurrent issues\nGeneral issues:\n\nIf you are using revdb on a log that took more than a few\nminutes to record, then it can be painfully slow. 
This is because\nrevdb needs to replay again big parts of the log for some\noperations.\nThe pypy-revdb is currently missing the following modules:\nthread (implementing multithreading is possible, but not done\nyet);\ncpyext (the CPython C API compatibility layer);\nmicronumpy (minor issue only);\n_continuation (for greenlets).\n\n\nDoes not contain a JIT, and does not use our fast garbage\ncollectors. You can expect pypy-revdb to be maybe 3 times\nslower than CPython.\nOnly works on Linux and OS/X. There is no fundamental reason for\nthis restriction, but it is some work to fix.\nReplaying a program uses a lot more memory; maybe 15x as much than\nduring the recording. This is because it creates many forks. If\nyou have a program that consumes 10% of your RAM or more, you will\nneed to reduce MAX_SUBPROCESSES in process.py.\n\nReplaying also comes with a bunch of user interface issues:\n\nAttempted to do I/O or access raw memory: we get this whenever\ntrying to print some expression that cannot be evaluated with\nonly the GC memory---or which can, but then the __repr__()\nmethod of the result cannot. We need to reset the state with\nbstep + step before we can print anything else. However,\nif only the __repr__() crashes, you still see the $NUM =\nprefix, and you can use that $NUM afterwards.\nid() is globally unique, returning a reproducible 64-bit number,\nso sometimes using id(x) is a workaround for when using x\ndoesn't work because of Attempted to do I/O issues (e.g. p\n[id(x) for x in somelist]).\nas explained in the demo, next/bnext/finish/bfinish might jump\naround a bit non-predictably.\nsimilarly, breaks on watchpoints can stop at apparently unexpected\nplaces (when going backward, try to do \"step\" once). The issue is\nthat it can only stop at the beginning of every line. In the\nextreme example, if a line is foo(somelist.pop(getindex())),\nthen somelist is modified in the middle. 
Immediately before\nthis modification occurs, we are in getindex(), and\nimmediately afterwards we are in foo(). The watchpoint will\nstop the program at the end of getindex() if running backward,\nand at the start of foo() if running forward, but never\nactually on the line doing the change.\nwatchpoint expressions must not have any side-effect at all. If\nthey do, the replaying will get out of sync and revdb.py will\ncomplain about that. Regular p expressions and statements can\nhave side-effects; these effects are discarded as soon as you move\nin time again.\nsometimes even \"p import foo\" will fail with Attempted to do\nI/O. Use instead \"p import sys; foo = sys.modules['foo']\".\nuse help to see all commands. backtrace can be useful.\nThere is no up command; you have to move in time instead,\ne.g. using bfinish to go back to the point where the current\nfunction was called.\n\n\n\nHow RevPDB is done\nIf I had to pick the main advantage of PyPy over CPython, it is that\nwe have got with the RPython translation toolchain a real place for\nexperimentation. Every now and then, we build inside RPython some\nfeature that gives us an optionally tweaked version of the PyPy\ninterpreter---tweaked in a way that would be hard to do with CPython,\nbecause it would require systematic changes everywhere. The most\nobvious and successful examples are the GC and the JIT. But there\nhave been many other experiments along the same lines, from the\nso-called stackless transformation in the early days, to the STM\nversion of PyPy.\nRevPDB works in a similar way. It is a version of PyPy in which some\noperations are systematically replaced with other operations.\nTo keep the log file at a reasonable size, we duplicate the content of\nall GC objects during replaying---by repeating the same actions on\nthem, without writing anything in the log file. So that means that in\nthe pypy-revdb binary, the operations that do arithmetic or\nread/write GC-managed memory are not modified. 
Most operations are\nlike that. However, the other operations, the ones that involve\neither non-GC memory or calls to external C functions, are tweaked.\nEach of these operations is replaced with code that works in two\nmodes, based on a global flag:\n\nin \"recording\" mode, we log the result of the operation (but not the\narguments);\nin \"replaying\" mode, we don't really do the operation at all, but\ninstead just fetch the result from the log.\n\nHopefully, all remaining unmodified operations (arithmetic and GC\nload/store) are completely deterministic. So during replaying, every\ninteger or non-GC pointer variable will have exactly the same value as\nit had during recording. Interestingly, it means that if the\nrecording process had a big array in non-GC memory, then in the\nreplaying process, the array is not allocated at all; it is just\nrepresented by the same address, but there is nothing there. When we\nrecord \"read item 123 from the array\", we record the result of the\nread (but not the \"123\"). When we replay, we're seeing again the same\n\"read item 123 from the array\" operation. At that point, we don't\nread anything; we just return the result from the log. Similarly,\nwhen recording a \"write\" to the array, we record nothing (this write\noperation has no result); so that when replaying, we redo nothing.\nNote how that differs from anything managed by GC memory: GC objects\n(including GC arrays) are really allocated, writes really occur, and\nreads are redone. We don't touch the log in this case.\n\n\nOther reverse debuggers for Python\nThere are already some Python experiments about reverse debugging.\nThis is also known as \"omniscient debugging\". However, I claim that\nthe result they get to is not very useful (for the purpose presented\nhere). How they work is typically by recording changes to some\nobjects, like lists and dictionaries, in addition to recording the\nhistory of where your program passed through. 
However, the problem of\nPython is that lists and dictionaries are not the end of the story.\nThere are many, many, many types of objects written in C which are\nmutable---in fact, the immutable ones are the exception. You can try\nto systematically record all changes, but it is a huge task and easy\nto forget a detail.\nIn other words it is a typical use case for tweaking the RPython\ntranslation toolchain, rather than tweaking the CPython (or PyPy)\ninterpreter directly. The result that we get here with RevPDB is more\nsimilar to rr anyway, in that only a relatively small number of\nexternal events are recorded---not every single change to every single\nlist and dictionary.\nSome links:\n\nepdb: https://github.com/native-human/epdb\npode: https://github.com/rodsenra/pode\n\nFor C:\n\nrr: https://rr-project.org/\nundodb-gdb: https://undo.io/\n\n\n\nFuture work\nAs mentioned above, it is alpha-level, and only works on Linux and OS/X.\nSo the plans for the immediate future are to fix the various\nissues described above, and port to more operating systems. The core of the system\nis in the C file and headers in rpython/translator/revdb/src-revdb.\nFor interested people, there is also the Duhton interpreter and its\nreverse-debugger branch, which is where I prototyped the RPython\nconcept before moving to PyPy. The basics should work for any\ninterpreter written in RPython, but they require some specific code to\ninterface with the language; in the case of PyPy, it is in\npypy/interpreter/reverse_debugging.py.\nIn parallel, there are various user interface improvements that people\ncould be interested in, like a more \"pdb++\" experience. 
(And the script\nat rpython/translator/revdb/revdb.py should be moved out into some\nmore \"official\" place, and the reverse-debugger branch should be\nmerged back to default.)\nI would certainly welcome any help!\n-+- Armin", + "tags": "revdb", + "url": "https://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html" + }, + { + "title": "PyPy2 v5.3 released - major C-extension support improvements", + "text": "We have released PyPy2.7 v5.3, about six weeks after PyPy 5.1 and a week after\nPyPy3.3 v5.2 alpha 1, the first PyPy release targeting 3.3\ncompatibility. This new PyPy2.7 release includes major improvements for the\nC-API compatibility layer. In addition to complete support\nfor lxml, we now pass most (more than 95%) of the upstream numpy test suite. We can build and run scipy and matplotlib as well. Most of the failures have to do with (ab) use of the C-API, for instance writing to a read-only pointer obtained from PyString_AsString().\n\nNote that the C-API compatibility layer is significantly slower than CPython, as explained in the blog post about the new strategy for reflection of C objects into the PyPy interpreter.\n\nWe updated cffi to version 1.7 (incremental changes which provide a nicer developer experience, documented here). We would encourage developers to move their C-extension modules to cffi, but are willing to help you work through issues with existing code; come to #pypy on IRC and let us know how we can help you help us do better.\n\nYou can download the PyPy2 v5.3 release here:\n\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for their continued support of the PyPy\nproject. We would also like to thank our contributors and\nencourage new people to join the project. 
PyPy has many\nlayers and we need help with all of them: PyPy and RPython documentation\nimprovements, tweaking popular modules to run on PyPy, or general help\nwith making RPython\u2019s JIT even better.\n\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It\u2019s fast (PyPy and CPython 2.7 performance comparison) due to its integrated tracing JIT compiler.\n\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\n\nThis release supports:\n\nx86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD)\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux\nbig- and little-endian variants of PPC64 running Linux\ns390x running Linux\n\n\n\nOther Highlights\n\n(since the release of PyPy 5.1 in April, 2016)\n\n\nNew features:\n\n\nMerge a major expansion of the C-API support in cpyext, also expand cpyext tests to allow running them after translation as well as untranslated\n\n\nInstead of \u201cGIL not held when a CPython C extension module\ncalls PyXxx\u201d, we now silently acquire/release the GIL. Helps with\nC extension modules that call some PyXxx() functions without\nholding the GIL (arguably, they are theoretically buggy).\n\n\nSupport command line -v to trace import statements\n\n\nRevive traceviewer, a tool to use pygame to view traces\n\n\n\n\n\n\nNumpy via our internal _numpypy module:\n\nImplement ufunc.outer\nMove PyPy-specific numpypy headers to a subdirectory (also changed the repo\naccordingly)\n\n\u00a0\n\n\nPerformance improvements:\n\nUse bitstrings to compress lists of descriptors that are attached to an\nEffectInfo\nRemove most of the _ovf, _zer and _val operations from RPython. 
Kills\nquite some code internally, and allows the JIT to do better\noptimizations: for example, app-level code like x / 2 or x % 2\ncan now be turned into x >> 1 or x & 1, even if x is possibly\nnegative.\nRework the way registers are moved/spilled in before_call()\n\n\n\n\nInternal refactorings:\n\nRefactor code to better support Python3-compatible syntax\nReduce the size of generated C sources during translation by\neliminating many many unused struct declarations (Issue #2281)\nReduce the size of generated code by using the same function objects in\nall generated subclasses\nShare cpyext Py* function wrappers according to the signature, shrinking the\ntranslated libpypy.so by about 10% (without the JIT)\n\n\n\nPlease update, and continue to help us make PyPy better.\nCheers\n\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/06/pypy2-v53-released-major-c-extension-7708576047190172431.html" + }, + { + "title": "PyPy3.3 v5.2 alpha 1 released", + "text": "We're pleased to announce the first alpha release of PyPy3.3 v5.2. This is the\nfirst release of PyPy which targets Python 3.3 (3.3.5) compatibility.We would like to thank all of the people who donated to the py3k proposal\nfor supporting the work that went into this and future releases.You can download the PyPy3.3 v5.2 alpha 1 release here:https://pypy.org/download.html#python-3-3-5-compatible-pypy3-3-v5-2HighlightsPython 3.3.5 support!Being an early alpha release, there are some missing features such as a\nPEP 393-like space efficient string representation and known issues\nincluding performance issues (e.g. issue #2305). The focus for this\nrelease has been updating to 3.3 compatibility. Windows is also not yet\nsupported.\n\nensurepip is also included (it's only included in CPython 3 >= 3.4).\nWhat is PyPy?PyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7.10 and one day 3.3.5. 
It's fast due to its integrated tracing JIT\ncompiler.\n\nWe also welcome developers of other dynamic languages to see what RPython\ncan do for them.\n\nThis release supports:\n\nx86 machines on most common operating systems except Windows\n(Linux 32/64, Mac OS X 64, OpenBSD, FreeBSD),\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\nPlease try it out and let us know what you think. We welcome feedback, we know\nyou are using PyPy, please tell us about it!\n\nWe'd especially like to thank these people for their contributions to this\nrelease:\n\nManuel Jacob, Ronan Lamy, Mark Young, Amaury Forgeot d'Arc, Philip Jenvey,\nMartin Matusiak, Vasily Kuznetsov, Matti Picus, Armin Rigo and many others.\n\nCheers\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/05/pypy33-v52-alpha-1-released-1725927506363370346.html" + }, + { + "title": "PyPy 5.1.1 bugfix released", + "text": "We have released a bugfix for PyPy 5.1, due to a regression in installing third-party packages depending on numpy (using our numpy fork available at https://bitbucket.org/pypy/numpy ).\n\nThanks to those who reported the issue. We also fixed a regression in translating PyPy which increased the memory required to translate. Improvement will be noticed by downstream packagers and those who translate rather than download pre-built binaries.\n\nWhat is PyPy?\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. 
It's fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\n\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\n\nThis release supports:\n\nx86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\nPlease update, and continue to help us make PyPy better.\nCheers\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/05/pypy-511-bugfix-released-7586640750680293200.html" + }, + { + "title": "PyPy 5.1 released", + "text": "We have released PyPy 5.1, about a month after PyPy 5.0.\n\nThis release includes more improvement to warmup time and memory requirements, extending the work done on PyPy 5.0. We have seen an additional reduction of about 20% in memory requirements, and up to 30% warmup time improvement, more detail in the blog post.\n\nWe also now have full support for the IBM s390x. Since this support is in RPython, any dynamic language written using RPython, like PyPy, will automagically be supported on that architecture.\n\nWe updated cffi to 1.6 (cffi 1.6 itself will be released shortly), and continue to improve support for the wider python ecosystem using the PyPy interpreter.\n\nYou can download the PyPy 5.1 release here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy project.\nWe would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython\u2019s JIT even better.\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. 
It\u2019s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\n\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\n\nThis release supports:\n\nx86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,\nbig- and little-endian variants of PPC64 running Linux,\ns390x running Linux\n\n\n\n\nOther Highlights\n\n(since the release of PyPy 5.0 in March, 2016)\n\n\n\nNew features:\n\nA new jit backend for the IBM s390x, which was a large effort over the past few months.\nAdd better support for PyUnicodeObject in the C-API compatibility layer\nSupport GNU/kFreeBSD Debian ports in vmprof\nAdd __pypy__._promote\nMake attrgetter a single type for CPython compatibility\n\n\n\n\nBug Fixes\n\nCatch exceptions raised in an exit function\nFix a corner case in the JIT\nFix edge cases in the cpyext refcounting-compatible semantics (more work on cpyext compatibility is coming in the cpyext-ext branch, but isn\u2019t ready yet)\nTry harder to not emit NEON instructions on ARM processors without NEON support\nImprove the rpython posix module system interaction function calls\nDetect a missing class function implementation instead of calling a random function\nCheck that PyTupleObjects do not contain any NULLs at the point of conversion to W_TupleObjects\nIn ctypes, fix _anonymous_ fields of instances\nFix JIT issue with unpack() on a Trace which contains half-written operations\nFix sandbox startup (a regression in 5.0)\nFix possible segfault for classes with mangled mro or __metaclass__\nFix isinstance(deque(), Hashable) on the pure python deque\nFix an issue with forkpty()\nIssues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy\n\n\n\n\nNumpy:\n\nImplemented numpy.where for a 
single argument\nIndexing by a numpy scalar now returns a scalar\nFix transpose(arg) when arg is a sequence\nRefactor include file handling, now all numpy ndarray, ufunc, and umath functions exported from libpypy.so are declared in pypy_numpy.h, which is included only when building our fork of numpy\nAdd broadcast\n\n\n\n\nPerformance improvements:\n\nImprove str.endswith([tuple]) and str.startswith([tuple]) to allow JITting\nMerge another round of improvements to the warmup performance\nCleanup history rewriting in pyjitpl\nRemove the forced minor collection that occurs when rewriting the assembler at the start of the JIT backend\nPort the resource module to cffi\n\n\u00a0\n\n\nInternal refactorings:\n\nUse a simpler logger to speed up translation\nDrop vestiges of Python 2.5 support in testing\nUpdate rpython functions with ones needed for py3k\n\n\n\n\n\n\n\n\n\n\nPlease update, and continue to help us make PyPy better.\nCheers\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/04/pypy-51-released-4979856639628970409.html" + }, + { + "title": "PyPy Enterprise Edition", + "text": "With the latest additions, PyPy's JIT now supports the Z architecture on Linux. The newest architecture revision (also known as s390x, or colloquially referred to as \"big iron\") is the 64-bit extension for IBM mainframes. Currently only Linux 64 bit is supported (not z/OS nor TPF).\nThis is the fourth assembler backend supported by PyPy in addition to x86 (32 and 64), ARM (32-bit only) and PPC64 (both little- and big-endian). It might seem that we kind of get a hang of new architectures. Thanks to IBM for funding this work!\n\n\nHistory \nWhen I went to university one lecture covered the prediction of Thomas Watson in 1943. His famous quote \"I think there is a world market for maybe five computers ...\", turned out not to be true. \n\nHowever, even 70 years later, mainframes are used more often than you think. 
They back critical tasks requiring a high level of stability/security and offer high hardware and computational utilization rates by virtualization.\n\nWith the new PyPy JIT backend we are happy to present a fast Python virtual machine for mainframes and contribute more free software running on s390x.\n\n\nMeta tracing\nEven though the JIT backend has been tested on PyPy, it is not restricted to\u00a0 the Python programming language. Do you have a great idea for a DSL, or another language that should run on mainframes? Go ahead and just implement your interpreter using RPython.\n\n\nHow do I get a copy?\nPyPy can be built using the usual instructions found here. As soon as the next PyPy version has been released we will provide binaries. Until then you can just grab a nightly here.We are currently busy to get the next version of PyPy ready, so an official release will be rolled out soon.\n\n\nComparing s390x to x86\nThe goal of this comparison is not to scientifically evaluate the benefits/disadvantages on s390x, but rather to see that PyPy's architecture delivers the same benefits as it does on other platforms. Similar to the comparison done for PPC I ran the benchmarks using the same setup. The first column is the speedup of the PyPy JIT VM compared to the speedup of a pure PyPy interpreter 1). 
Note that the s390x's OS was virtualized.\n\n\u00a0 Label \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 x86\u00a0\u00a0\u00a0\u00a0 s390x\u00a0\u00a0\u00a0\u00a0\u00a0 s390x (run 2)\n\n\u00a0 ai\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 13.7 \u00a0\u00a0\u00a0\u00a0 12.4\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 11.9\u00a0 bm_chameleon\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 8.5 \u00a0\u00a0\u00a0\u00a0\u00a0 6.3\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 6.8\u00a0 bm_dulwich_log\u00a0\u00a0\u00a0\u00a0\u00a0 5.1\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 5.0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 5.1\u00a0 bm_krakatau\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 5.5 \u00a0\u00a0\u00a0\u00a0\u00a0 2.0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 2.0\u00a0 bm_mako\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 8.4 \u00a0\u00a0\u00a0\u00a0\u00a0 5.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 5.9\u00a0 bm_mdp\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 2.0 \u00a0\u00a0\u00a0\u00a0\u00a0 3.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3.8\u00a0 chaos\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 56.9 \u00a0\u00a0\u00a0\u00a0 52.6\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 53.4\u00a0 crypto_pyaes\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 62.5 \u00a0\u00a0\u00a0\u00a0 64.2\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 64.2\u00a0 deltablue\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3.3 \u00a0\u00a0\u00a0\u00a0\u00a0 3.9\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3.6\u00a0 django\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 28.8 \u00a0\u00a0\u00a0\u00a0 22.6\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 21.7\u00a0 eparse\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 2.3 \u00a0\u00a0\u00a0\u00a0\u00a0 2.5\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 2.6\u00a0 
fannkuch\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 9.1 \u00a0\u00a0\u00a0\u00a0\u00a0 9.9\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 10.1\u00a0 float\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 13.8 \u00a0\u00a0\u00a0\u00a0 12.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 13.8\u00a0 genshi_text\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 16.4 \u00a0\u00a0\u00a0\u00a0 10.5\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 10.9\u00a0 genshi_xml\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 8.2 \u00a0\u00a0\u00a0\u00a0\u00a0 7.9\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 8.2\u00a0 go\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 6.7 \u00a0\u00a0\u00a0\u00a0\u00a0 6.2\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 11.2\u00a0 hexiom2\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 24.3 \u00a0\u00a0\u00a0\u00a0 23.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 23.5\u00a0 html5lib\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 5.4 \u00a0\u00a0\u00a0\u00a0\u00a0 5.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 5.7\u00a0 json_bench\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 28.8 \u00a0\u00a0\u00a0\u00a0 27.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 28.1\u00a0 meteor-contest\u00a0\u00a0\u00a0\u00a0\u00a0 5.1 \u00a0\u00a0\u00a0\u00a0\u00a0 4.2\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 4.4\u00a0 nbody_modified\u00a0\u00a0\u00a0\u00a0 20.6 \u00a0\u00a0\u00a0\u00a0 19.3\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 19.4\u00a0 pidigits\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1.0 \u00a0\u00a0\u00a0\u00a0 -1.1\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 -1.0\u00a0 pyflate-fast\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 9.0 \u00a0\u00a0\u00a0\u00a0\u00a0 8.7\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 8.5\u00a0 pypy_interp\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3.3 \u00a0 \u00a0\u00a0\u00a0 4.2\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 4.4\u00a0 
raytrace-simple\u00a0\u00a0\u00a0 69.0 \u00a0\u00a0\u00a0 100.9\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 93.4\u00a0 richards\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 94.1 \u00a0\u00a0\u00a0\u00a0 96.6\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 84.3\u00a0 rietveld\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3.2 \u00a0\u00a0\u00a0\u00a0\u00a0 2.5\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 2.7\u00a0 slowspitfire\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 2.8 \u00a0\u00a0\u00a0\u00a0\u00a0 3.3\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 4.2\u00a0 spambayes\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 5.0 \u00a0\u00a0\u00a0\u00a0\u00a0 4.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 4.8\u00a0 spectral-norm\u00a0\u00a0\u00a0\u00a0\u00a0 41.9 \u00a0\u00a0\u00a0\u00a0 39.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 42.6\u00a0 spitfire\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 3.8 \u00a0\u00a0\u00a0\u00a0\u00a0 3.9\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 4.3\u00a0 spitfire_cstringio\u00a0 7.6 \u00a0\u00a0\u00a0\u00a0\u00a0 7.9\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 8.2\u00a0 sympy_expand\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 2.9 \u00a0\u00a0\u00a0\u00a0\u00a0 1.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1.8\u00a0 sympy_integrate\u00a0\u00a0\u00a0\u00a0 4.3 \u00a0\u00a0\u00a0\u00a0\u00a0 3.9\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 4.0\u00a0 sympy_str\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1.5 \u00a0\u00a0\u00a0\u00a0\u00a0 1.3\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 1.3\u00a0 sympy_sum\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 6.2 \u00a0\u00a0\u00a0\u00a0\u00a0 5.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 5.9\u00a0 telco\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 61.2 \u00a0\u00a0\u00a0\u00a0 48.5\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 54.8\u00a0 twisted_iteration\u00a0 55.5 \u00a0\u00a0\u00a0\u00a0 
41.9\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 43.8\u00a0 twisted_names\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 8.2 \u00a0\u00a0\u00a0\u00a0\u00a0 9.3\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 9.7\u00a0 twisted_pb\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 12.1 \u00a0\u00a0\u00a0\u00a0 10.4\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 10.2\u00a0 twisted_tcp\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 4.9 \u00a0\u00a0\u00a0\u00a0\u00a0 4.8\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 5.2\n\u00a0 Geometric mean:\u00a0\u00a0\u00a0 9.31\u00a0\u00a0\u00a0\u00a0\u00a0 9.10\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 9.43\n\nAs you can see the benefits are comparable on both platforms.\nOf course this is scientifically not good enough, but it shows a tendency. s390x can achieve the same results as you can get on x86. \n\nAre you running your business application on a mainframe? We would love to get some feedback. Join us in IRC tell us if PyPy made your application faster! \n\nplan_rich & the PyPy Team\n\n1) PyPy revision for the benchmarks: 4b386bcfee54", + "tags": "", + "url": "https://www.pypy.org/posts/2016/04/pypy-enterprise-edition-3688275697656890948.html" + }, + { + "title": "Warmup improvements: more efficient trace representation", + "text": "Hello everyone.\nI'm pleased to inform that we've finished another round of\nimprovements to the warmup performance of PyPy. Before I go\ninto details, I'll recap the achievements that we've done since we've started\nworking on the warmup performance. I picked a random PyPy from November 2014\n(which is definitely before we started the warmup work) and compared it with\na recent one, after 5.0. The exact revisions are respectively ffce4c795283\nand cfbb442ae368. First let's compare pure warmup benchmarks that\ncan be found in our benchmarking suite. 
Out of those,\npypy-graph-alloc-removal numbers should be taken with a grain of salt,\nsince other work could have influenced the results.\nThe rest of the benchmarks mentioned is bottlenecked purely by warmup times.\nYou can see how much your program spends in warmup running\nPYPYLOG=jit-summary:- pypy your-program.py under \"tracing\" and \"backend\"\nfields (in the first three lines). An example looks like that:\n\n[e00c145a41] {jit-summary\nTracing: 71 0.053645 <- time spent tracing & optimizing\nBackend: 71 0.028659 <- time spent compiling to assembler\nTOTAL: 0.252217 <- total run time of the program\n\nThe results of the benchmarks\n\n\n\n\n\n\n\n\n\n\nbenchmark\ntime - old\ntime - new\nspeedup\nJIT time - old\nJIT time - new\n\nfunction_call\n1.86\n1.42\n1.3x\n1.12s\n0.57s\n\nfunction_call2\n5.17s\n2.73s\n1.9x\n4.2s\n1.6s\n\nbridges\n2.77s\n2.07s\n1.3x\n1.5s\n0.8s\n\npypy-graph-alloc-removal\n2.06s\n1.65s\n1.25x\n1.25s\n0.79s\n\n\n\nAs we can see, the overall warmup benchmarks got up to 90% faster with\nJIT time dropping by up to 2.5x. We have more optimizations in the pipeline,\nwith an idea how to transfer some of the JIT gains into more of a total program\nruntime by jitting earlier and more eagerly.\n\nDetails of the last round of optimizations\nNow the nitty gritty details - what did we actually do? I covered a lot of\nwarmup improvements in the past blog posts so I'm going to focus on\nthe last change, the jit-leaner-frontend branch. This last change is simple, instead of using\npointers to store the \"operations\" objects created during tracing, we use a compact list of\n16-bit integers (with 16bit pointers in between). On 64bit machine the memory wins are\ntremendous - the new representation is 4x more efficient to use 16bit pointers than full 64bit pointers.\nAdditionally, the smaller representation has much better cache behavior and much less\npointer chasing in memory. 
It also has a better defined lifespan, so we don't need to\nbother tracking them by the GC, which also saves quite a bit of time.\nThe change sounds simple, but the details in the underlying data mean that\neverything in the JIT had to be changed which took quite a bit of effort :-)\nGoing into the future on the JIT front, we have an exciting set of optimizations,\nranging from faster loops through faster warmup to using better code generation\ntechniques and broadening the kind of program that PyPy speeds up. Stay tuned\nfor the updates.\nWe would like to thank our commercial partners for making all of this possible.\nThe work has been performed by baroquesoftware and would not be possible\nwithout support from people using PyPy in production. If your company uses\nPyPy and want it to do more or does not use PyPy but has performance problems\nwith the Python installation, feel free to get in touch with me, trust me using\nPyPy ends up being a lot cheaper than rewriting everything in go :-)\nBest regards,\nMaciej Fijalkowski", + "tags": "", + "url": "https://www.pypy.org/posts/2016/04/warmup-improvements-more-efficient-7082900097299909512.html" + }, + { + "title": "PyPy 5.0.1 bugfix released", + "text": "PyPy 5.0.1\n\nWe have released a bugfix for PyPy 5.0, after reports that the newly released\nlxml 3.6.0, which now supports PyPy 5.0 +, can crash on large files.\nThanks to those who reported the crash. Please update, downloads are available\nat\n\npypy.org/download.html\n\n\nThe changes between PyPy 5.0 and 5.0.1 are only two bug fixes: one in\ncpyext, which fixes notably (but not only) lxml; and another for a\ncorner case of the JIT.\n\n\nWhat is PyPy?\n\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. 
It\u2019s fast (PyPy and CPython 2.7.x performance comparison)\ndue to its integrated tracing JIT compiler.\n\nWe also welcome developers of other\ndynamic languages to see what RPython can do for them.\n\nThis release supports x86 machines on most common operating systems\n(Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),\nnewer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux, and the\nbig- and little-endian variants of PPC64 running Linux.\n\n\nPlease update, and continue to help us make PyPy better.\n\n\nCheers\n\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/03/pypy-501-bugfix-released-2218405735970044084.html" + }, + { + "title": "PyPy 5.0 released", + "text": "PyPy 5.0\nWe have released PyPy 5.0, about three months after PyPy 4.0.1. We encourage all users of PyPy to update to this version.\n\nYou can download the PyPy 5.0 release here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy project.\nWe would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython\u2019s JIT even better.\n\n\u00a0\n\nFaster and Leaner\n\nWe continue to improve the warmup time and memory usage of JIT-related metadata. The exact effects depend vastly on the program you\u2019re running and can range from insignificant to warmup being up to 30% faster and memory dropping by about 30%. \n\n\u00a0\n\nC-API Upgrade\n\nWe also merged a major upgrade to our C-API layer (cpyext), simplifying the interaction between c-level objects and PyPy interpreter level objects. As a result, lxml (prerelease) with its cython compiled component passes all tests on PyPy. The new cpyext is also much faster. 
This major refactoring will soon be followed by an expansion of our C-API compatibility.\n\n\u00a0\n\nProfiling with vmprof supported on more platforms\n\n\nvmprof has been a go-to profiler for PyPy on linux for a few releases and we\u2019re happy to announce that thanks to the cooperation with jetbrains, vmprof now works on Linux, OS X and Windows on both PyPy and CPython.\n\n\n\u00a0\n\nCFFI\nWhile not applicable only to PyPy, cffi is arguably our most significant contribution to the python ecosystem. PyPy 5.0 ships with cffi-1.5.2 which now allows embedding PyPy (or CPython) in a C program.\n\n\n\u00a0\n\nWhat is PyPy?\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It\u2019s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\nThis release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux, and 64 bit PowerPC hardware, specifically Linux running the big- and little-endian variants of ppc64.\n\n\n\u00a0\n\nOther Highlights (since 4.0.1 released in November 2015)\n\nNew features:\nSupport embedding PyPy in a C-program via cffi and static callbacks in cffi.\nThis deprecates the old method of embedding PyPy\nRefactor vmprof to work cross-operating-system, deprecate using buggy\nlibunwind on Linux platforms. Vmprof even works on Windows now.\nSupport more of the C-API type slots, like tp_getattro, and fix C-API\nmacros, functions, and structs such as _PyLong_FromByteArray(),\nPyString_GET_SIZE, f_locals in PyFrameObject, Py_NAN, co_filename in\nPyCodeObject\nUse a more stable approach for allocating PyObjects in cpyext. (see\nblog post). 
Once the PyObject corresponding to a PyPy object is created,\nit stays around at the same location until the death of the PyPy object.\nDone with a little bit of custom GC support. It allows us to kill the\nnotion of \u201cborrowing\u201d inside cpyext, reduces 4 dictionaries down to 1, and\nsignificantly simplifies the whole approach (which is why it is a new\nfeature while technically a refactoring) and allows PyPy to support the\npopulart lxml module (as of the next release) with no PyPy specific\npatches needed\nMake the default filesystem encoding ASCII, like CPython\nUse hypothesis in test creation, which is great for randomizing tests\n\n\u00a0\n\n\nBug Fixes\nBackport always using os.urandom for uuid4 from cpython and fix the JIT as well\n(issue #2202)\nMore completely support datetime, optimize timedelta creation\nFix for issue #2185 which caused an inconsistent list of operations to be\ngenerated by the unroller, appeared in a complicated DJango app\nFix an elusive issue with stacklets on shadowstack which showed up when\nforgetting stacklets without resuming them\nFix entrypoint() which now acquires the GIL\nFix direct_ffi_call() so failure does not bail out before setting CALL_MAY_FORCE\nFix (de)pickling long values by simplifying the implementation\nFix RPython rthread so that objects stored as threadlocal do not force minor\nGC collection and are kept alive automatically. 
This improves perfomance of\nshort-running Python callbacks and prevents resetting such object between\ncalls\nSupport floats as parameters to itertools.isslice()\nCheck for the existence of CODESET, ignoring it should have prevented PyPy\nfrom working on FreeBSD\nFix for corner case (likely shown by Krakatau) for consecutive guards with\ninterdependencies\nFix applevel bare class method comparisons which should fix pretty printing\nin IPython\nIssues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy\n\n\u00a0\n\n\nNumpy:\nUpdates to numpy 1.10.2 (incompatibilities and not-implemented features\nstill exist)\nSupport dtype=((\u2018O\u2019, spec)) union while disallowing record arrays with\nmixed object, non-object values\nRemove all traces of micronumpy from cpyext if \u2013withoutmod-micronumpy option used\nSupport indexing filtering with a boolean ndarray\nSupport partition() as an app-level function, together with a cffi wrapper\nin pypy/numpy, this now provides partial support for partition()\n\n\u00a0\n\n\nPerformance improvements:\nOptimize global lookups\nImprove the memory signature of numbering instances in the JIT. This should\nmassively decrease the amount of memory consumed by the JIT, which is\nsignificant for most programs. Also compress the numberings using variable-\nsize encoding\nOptimize string concatenation\nUse INT_LSHIFT instead of INT_MUL when possible\nImprove struct.unpack by casting directly from the underlying buffer.\nUnpacking floats and doubles is about 15 times faster, and integer types\nabout 50% faster (on 64 bit integers). 
This was then subsequently\nimproved further in optimizeopt.py.\nOptimize two-tuple lookups in mapdict, which improves warmup of instance\nvariable access somewhat\nReduce all guards from int_floordiv_ovf if one of the arguments is constant\nIdentify permutations of attributes at instance creation, reducing the\nnumber of bridges created\nGreatly improve re.sub() performance\n\n\u00a0\n\n\nInternal refactorings:\nRefactor and improve exception analysis in the annotator\nRemove unnecessary special handling of space.wrap().\nSupport list-resizing setslice operations in RPython\nTweak the trace-too-long heuristic for multiple jit drivers\nRefactor bookkeeping (such a cool word - three double letters) in the\nannotater\nRefactor wrappers for OS functions from rtyper to rlib and simplify them\nSimplify backend loading instructions to only use four variants\nSimplify GIL handling in non-jitted code\nRefactor naming in optimizeopt\nChange GraphAnalyzer to use a more precise way to recognize external\nfunctions and fix null pointer handling, generally clean up external\nfunction handling\nRemove pure variants of getfield_gc_* operations from the JIT by\ndetermining purity while tracing\nRefactor databasing\nSimplify bootstrapping in cpyext\nRefactor rtyper debug code into python.rtyper.debug\nSeperate structmember.h from Python.h Also enhance creating api functions\nto specify which header file they appear in (previously only pypy_decl.h)\nFix tokenizer to enforce universal newlines, needed for Python 3 support\n\n\n\nPlease try it out and let us know what you think. We welcome feedback, we know you are using PyPy, please tell us about it!\nCheers\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2016/03/pypy-50-released-5730569530415927220.html" + }, + { + "title": "C-API Support update", + "text": "As you know, PyPy can emulate the CPython C API to some extent. 
In this post I will describe an important optimization that we merged to improve the performance and stability of the C-API emulation layer.\n\nThe C-API is implemented by passing around PyObject * pointers in the C code. The problem with providing the same interface with PyPy is that\nobjects don't natively have the same PyObject * structure at all; and\nadditionally their memory address can change. PyPy handles the\ndifference by maintaining two sets of objects. More precisely, starting\nfrom a PyPy object, it can allocate on demand a PyObject structure\nand fill it with information that points back to the original PyPy\nobjects; and conversely, starting from a C-level object, it can allocate\na PyPy-level object and fill it with information in the opposite\ndirection.\n\nI have merged a rewrite of the interaction between C-API C-level objects\nand PyPy's interpreter level objects. This is mostly a simplification\nbased on a small hack in our garbage collector. This hack makes the\ngarbage collector aware of the reference-counted PyObject\nstructures. When it considers a pair consisting of a PyPy object and a\nPyObject, it will always free either none or both of them at the\nsame time. They both stay alive if either there is a regular GC\nreference to the PyPy object, or the reference counter in the\nPyObject is bigger than zero.\n\nThis gives a more stable result. Previously, a PyPy object might grow a\ncorresponding PyObject, lose it (when its reference counter goes to\nzero), and later have another corresponding PyObject re-created at a\ndifferent address. 
Now, once a link is created, it remains alive until\nboth objects die.\n\nThe rewrite significantly simplifies our previous code (which used to be\nbased on at least 4 different dictionaries), and should make using the\nC-API somewhat faster (though it is still slower than using pure\npython or cffi).\n\nA side effect of this work is that now PyPy actually supports the upstream lxml package---which is one\nof the most popular packages on PyPI. (Specifically, you need version\n3.5.0 with this pull\nrequest to remove old PyPy-specific hacks that were not really\nworking. See\ndetails.) At this point, we no longer recommend using the\ncffi-lxml alternative: although it may still be faster, it might be\nincomplete and old.\n\nWe are actively working on extending our C-API support, and hope to soon\nmerge a branch to support more of the C-API functions (some numpy news\ncoming!). Please try\nit out and let us know how it works for you.\n\nArmin Rigo and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2016/02/c-api-support-update-8582726091670983181.html" + }, + { + "title": "Using CFFI for embedding", + "text": "Introduction\n\nCFFI has been a great success so far to call C libraries in your\nPython programs, in a way that is both simple and that works across\nCPython 2.x and 3.x and PyPy.\n\nThis post assumes that you know what CFFI is and how to use it in\nAPI mode (ffi.cdef(), ffi.set_source(), ffi.compile()).\nA quick overview can be found in this paragraph.\n\nThe major news of CFFI 1.4, released last december, was that you can\nnow declare C functions with extern \"Python\" in the cdef().\nThese magic keywords make the function callable from C (where it is\ndefined automatically), but calling it will call some Python code\n(which you attach with the @ffi.def_extern() decorator). This is\nuseful because it gives a more straightforward, faster and\nlibffi-independent way to write callbacks. 
For more details, see the\ndocumentation.\n\nYou are, in effect, declaring a static family of C functions which\ncall Python code. The idea is to take pointers to them, and pass them\naround to other C functions, as callbacks. However, the idea of a set\nof C functions which call Python code opens another path: embedding\nPython code inside non-Python programs.\n\nEmbedding\n\nEmbedding is traditionally done using the CPython C API: from C code,\nyou call Py_Initialize() and then some other functions like\nPyRun_SimpleString(). In the simple cases it is, indeed, simple\nenough; but it can become a complicated story if you throw in\nsupporting application-dependent object types; and a messy story if\nyou add correctly running on multiple threads, for example.\nMoreover, this approach is specific to CPython (2.x or 3.x). It does\nnot work at all on PyPy, which has its own very different, minimal\nembedding API.\n\nThe new-and-coming thing about CFFI 1.5, meant as replacement of the\nabove solutions, is direct embedding support---with no fixed API at\nall. The idea is to write some Python script with a cdef() which\ndeclares a number of extern \"Python\" functions. When running the\nscript, it creates the C source code and compiles it to a\ndynamically-linked library (.so on Linux). This is the same as in\nthe regular API-mode usage. What is new is that these extern\n\"Python\" can now also be exported from the .so, in the C\nsense. You also give a bit of initialization-time Python code\ndirectly in the script, which will be compiled into the .so too.\nThis library can now be used directly from any C program (and it is\nstill importable in Python). It exposes the C API of your choice,\nwhich you specified with the extern \"Python\" declarations. You\ncan use it to make whatever custom API makes sense in your particular\ncase. 
You can even directly make a \"plug-in\" for any program that\nsupports them, just by exporting the API expected for such plugins.\n\nTrying it out on CPython\n\nThis is still being finalized, but please try it out. You can\nsee embedding.py directly online for a quick glance. Or\nsee below the instructions on Linux with CPython 2.7 (CPython 3.x and\nnon-Linux platforms are still a work in progress right now, but this\nshould be quickly fixed):\n\nget the branch static-callback-embedding of CFFI:\n\nhg clone https://foss.heptapod.net/cffi/cffi\nhg up static-callback-embedding\n\n\nmake the _cffi_backend.so:\n\npython setup_base.py build_ext -f -i\n\n\nrun embedding.py in the demo directory:\n\ncd demo\nPYTHONPATH=.. python embedding.py\n\n\nthis produces _embedding_cffi.c. Run gcc to build it. On Linux:\n\ngcc -shared -fPIC _embedding_cffi.c -o _embedding_cffi.so \\\n -lpython2.7 -I/usr/include/python2.7\n\n\ntry out the demo C program in embedding_test.c:\n\ngcc embedding_test.c _embedding_cffi.so\nPYTHONPATH=.. LD_LIBRARY_PATH=. ./a.out\n\n\n\nNote that if you get ImportError: cffi extension module\n'_embedding_cffi' has unknown version 0x2701, it means that the\n_cffi_backend module loaded is a pre-installed one instead of the\nmore recent one in \"..\". Be sure to use PYTHONPATH=.. for now. (Some installations manage to be confused enough to load the system-wide cffi even if another version is in the PYTHONPATH. I think a virtualenv can be used to work around this issue.)\n\nTry it out on PyPy\n\nVery similar steps can be followed on PyPy, but it requires the\ncffi-static-callback-embedding branch of PyPy, which you must\nfirst translate from sources. 
The difference is then that you need to\nadapt the first gcc command line: replace -lpython2.7 with\n-lpypy-c and to fix the -I path (and possibly add a -L\npath).\n\nMore details\n\nHow it works, more precisely, is by automatically initializing CPython/PyPy\nthe first time any of the extern \"Python\"\nfunctions is called from the C program. This is done using locks in case of multi-threading,\nso several threads can concurrently do this \"first call\". This should work even if two\ndifferent threads call the first time a function from two different\nembedded CFFI extensions that happen to be linked with the same program. Explicit initialization is\nnever needed.\n\nThe custom initialization-time Python code you put in\nffi.embedding_init_code() is executed at that time. If this code\nstarts to be big, you can move it to independent modules or packages.\nThen the initialization-time Python code only needs to import them. In\nthat case, you have to carefully set up sys.path if the modules are\nnot installed in the usual Python way.\nIf the Python code is big and full of dependencies, a better alternative\nwould be to use virtualenv. How to do that is not fully fleshed out so\nfar. You can certainly run the whole program with the environment\nvariables set up by the virtualenv's activate script first. There\nare probably other solutions that involve using gcc's\n-Wl,-rpath=\\$ORIGIN/ or -Wl,-rpath=/fixed/path/ options to load\na specific libpython or libpypy-c library. If you try it out and it\ndoesn't work the way you would like, please complain :-)\nAnother point: right now this does not support CPython's notion of\nmultiple subinterpreters. The logic creates a single global Python\ninterpreter, and runs everything in that context. 
Maybe a future\nversion would have an explicit API to do that \u2014 or maybe it should be\nthe job of a 3rd-party extension module to provide a Python interface\nover the notion of subinterpreters...\nMore generally, any feedback is appreciated.\nHave fun,\nArmin", + "tags": "", + "url": "https://www.pypy.org/posts/2016/01/using-cffi-for-embedding-8493496761738752124.html" + }, + { + "title": "Leysin Winter Sprint (20-27th February 2016)", + "text": "The next PyPy sprint will be in Leysin, Switzerland, for the eleventh time.\nThis is a fully public sprint: newcomers and topics other than those\nproposed below are welcome.\n\nGoals and topics of the sprint\nThe details depend on who is here and ready to work. The list of\ntopics is mostly the same as last year (did PyPy become a mature\nproject with only long-term goals?):\n\ncpyext (CPython C API emulation layer): various speed and\ncompleteness topics\ncleaning up the optimization step in the JIT, change the register\nallocation done by the JIT's backend, or more improvements to the\nwarm-up time\nfinish vmprof - a statistical profiler for CPython and PyPy\nPy3k (Python 3.x support), NumPyPy (the numpy module)\nSTM (Software Transaction Memory), notably: try to come up with\nbenchmarks, and measure them carefully in order to test and improve\nthe conflict reporting tools, and more generally to figure out how\npractical it is in large projects to avoid conflicts\nAnd as usual, the main side goal is to have fun in winter sports :-)\nWe can take a day off for ski.\n\n\n\nExact times\nI have booked the week from Saturday 20 to Saturday 27. It is fine to\nleave either the 27 or the 28, or even stay a few\nmore days on either side. The plan is to work full days between the 21\nand the 27. You are of course allowed to show up for a part of that\ntime only, too.\n\n\nLocation & Accommodation\nLeysin, Switzerland, \"same place as before\". 
Let me refresh your\nmemory: both the sprint venue and the lodging will be in a\npair of chalets built specifically for bed & breakfast:\nhttps://www.ermina.ch/. The place has a good ADSL Internet connection\nwith wireless installed. You can also arrange your own lodging\nelsewhere (as long as you are in Leysin, you cannot be more than a 15\nminutes walk away from the sprint venue).\nPlease confirm that you are coming so that we can adjust the\nreservations as appropriate.\nThe options of rooms are a bit more limited than on previous years\nbecause the place for bed-and-breakfast is shrinking: what is\nguaranteed is only one double-bed room and a bigger room with 5-6\nindividual beds (the latter at 50-60 CHF per night, breakfast\nincluded). If there are more people that would prefer a single room,\nplease contact me and we'll see what choices you have. There are a\nchoice of hotels, many of them reasonably priced for Switzerland.\nPlease register by Mercurial:\n\nhttps://bitbucket.org/pypy/extradoc/\nhttps://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2016\nor on the pypy-dev mailing list if you do not yet have check-in rights:\n\nhttps://mail.python.org/mailman/listinfo/pypy-dev\nYou need a Swiss-to-(insert country here) power adapter. There will be\nsome Swiss-to-EU adapters around, and at least one EU-format power strip.", + "tags": "", + "url": "https://www.pypy.org/posts/2016/01/leysin-winter-sprint-20-27th-february-1737200016169608469.html" + }, + { + "title": "PyPy 4.0.1 released please update", + "text": "PyPy 4.0.1\n\nWe have released PyPy 4.0.1, three weeks after PyPy 4.0.0. We have fixed a few critical bugs in the JIT compiled code, reported by users. We therefore encourage all users of PyPy to update to this version. 
There are a few minor enhancements in this version as well.\n\nYou can download the PyPy 4.0.1 release here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy project.\nWe would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython\u2019s JIT even better.\n\n \n\n\n\u00a0\n\nCFFI update\n\nWhile not applicable only to PyPy, cffi is arguably our most significant contribution to the python ecosystem. PyPy 4.0.1 ships with cffi-1.3.1 with the improvements it brings.\n\n\n\u00a0\n\nWhat is PyPy?\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It\u2019s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\nThis release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux, and the big- and little-endian variants of ppc64 running Linux.\n\n\n\u00a0\n\nOther Highlights (since 4.0.0 released three weeks ago)\n\n\n\nBug Fixes\nFix a bug when unrolling double loops in JITted code\nFix multiple memory leaks in the ssl module, one of which affected CPython as well (thanks to Alex Gaynor for pointing those out)\nUse pkg-config to find ssl headers on OS-X\nIssues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy\n\n\nNew features\nInternal cleanup of RPython class handling\nSupport stackless and greenlets on PPC machines\nImprove debug logging in subprocesses: use PYPYLOG=jit:log.%d for example to have all 
subprocesses write the JIT log to a file called \u2018log.%d\u2019, with \u2018%d\u2019 replaced with the subprocess\u2019 PID.\nSupport PyOS_double_to_string in our cpyext capi compatibility layer\n\n\nNumpy\nImprove support for __array_interface__\nPropagate most NAN mantissas through float16-float32-float64 conversions\n\n\nPerformance improvements and refactorings\nImprovements in slicing byte arrays\nImprovements in enumerate()\nSilence some warnings while translating\n\n\n\nPlease update, and continue to help us make PyPy better.\n\nCheers \nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2015/11/pypy-401-released-please-update-2652340737298251005.html" + }, + { + "title": "PyPy 4.0.0 Released - A Jit with SIMD Vectorization and More", + "text": "PyPy 4.0.0\nWe\u2019re pleased and proud to unleash PyPy 4.0.0, a major update of the PyPy python 2.7.10 compatible interpreter with a Just In Time compiler. We have improved warmup time and memory overhead used for tracing, added vectorization for numpy and general loops where possible on x86 hardware (disabled by default), refactored rough edges in rpython, and increased functionality of numpy.\nYou can download the PyPy 4.0.0 release here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy project.\nWe would also like to thank our contributors (7 new ones since PyPy 2.6.0) and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on PyPy, or general help with making RPython\u2019s JIT even better.\n\n\nNew Version Numbering\n\n\nSince the past release, PyPy 2.6.1, we decided to update the PyPy 2.x.x versioning directly to PyPy 4.x.x, to avoid confusion with CPython 2.7 and 3.5. 
Note that this version of PyPy uses the stdlib and implements the syntax of CPython 2.7.10.\n\n\nVectorization\n\n\nRichard Plangger began work in March and continued over a Google Summer of Code to add a vectorization step to the trace optimizer. The step recognizes common constructs and emits SIMD code where possible, much as any modern compiler does. This vectorization happens while tracing running code, so it is actually easier at run-time to determine the availability of possible vectorization than it is for ahead-of-time compilers.\nAvailability of SIMD hardware is detected at run time, without needing to precompile various code paths into the executable.\nThe first version of the vectorization has been merged in this release, since it is so new it is off by default. To enable the vectorization in built-in JIT drivers (like numpy ufuncs), add \u2013jit vec=1, to enable all implemented vectorization add \u2013jit vec_all=1\nBenchmarks and a summary of this work appear here\n\n\nInternal Refactoring: Warmup Time Improvement and Reduced Memory Usage\n\n\nMaciej Fijalkowski and Armin Rigo refactored internals of Rpython that now allow PyPy to more efficiently use guards in jitted code. They also rewrote unrolling, leading to a warmup time improvement of 20% or so. The reduction in guards also means a reduction in the use of memory, also a savings of around 20%.\n\n\n\nNumpy\n\nOur implementation of numpy continues to improve. ndarray and the numeric dtypes are very close to feature-complete; record, string and unicode dtypes are mostly supported. We have reimplemented numpy linalg, random and fft as cffi-1.0 modules that call out to the same underlying libraries that upstream numpy uses. Please try it out, especially using the new vectorization (via \u2013jit vec=1 on the command line) and let us know what is missing for your code.\n\n\n\nCFFI\n\nWhile not applicable only to PyPy, cffi is arguably our most significant contribution to the python ecosystem. 
Armin Rigo continued improving it, and PyPy reaps the benefits of cffi-1.3: improved management of object lifetimes, __stdcall on Win32, ffi.memmove(), and percolate const, restrict keywords from cdef to C code.\n\n\n\nWhat is PyPy?\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It\u2019s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\nWe also welcome developers of other dynamic languages to see what RPython can do for them.\nThis release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.\nWe also introduce support for the 64 bit PowerPC hardware, specifically Linux running the big- and little-endian variants of ppc64.\n\n\nOther Highlights (since 2.6.1 release two months ago)\n\nBug Fixes\nApplied OPENBSD downstream fixes\nFix a crash on non-linux when running more than 20 threads\nIn cffi, ffi.new_handle() is more cpython compliant\nAccept unicode in functions inside the _curses cffi backend exactly like cpython\nFix a segfault in itertools.islice()\nUse gcrootfinder=shadowstack by default, asmgcc on linux only\nFix ndarray.copy() for upstream compatibility when copying non-contiguous arrays\nFix assumption that lltype.UniChar is unsigned\nFix a subtle bug with stacklets on shadowstack\nImprove support for the cpython capi in cpyext (our capi compatibility layer). 
Fixing these issues inspired some thought about cpyext in general, stay tuned for more improvements\nWhen loading dynamic libraries, in case of a certain loading error, retry loading the library assuming it is actually a linker script, like on Arch and Gentoo\nIssues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy\n\n\nNew features:\nAdd an optimization pass to vectorize loops using x86 SIMD intrinsics.\nSupport __stdcall on Windows in CFFI\nImprove debug logging when using PYPYLOG=???\nDeal with platforms with no RAND_egd() in OpenSSL\n\n\nNumpy:\nAdd support for ndarray.ctypes\nFast path for mixing numpy scalars and floats\nAdd support for creating Fortran-ordered ndarrays\nFix casting failures in linalg (by extending ufunc casting)\nRecognize and disallow (for now) pickling of ndarrays with objects embedded in them\n\n\nPerformance improvements and refactorings:\nReuse hashed keys across dictionaries and sets\nRefactor JIT interals to improve warmup time by 20% or so at the cost of a minor regression in JIT speed\nRecognize patterns of common sequences in the JIT backends and optimize them\nMake the garbage collecter more incremental over external_malloc() calls\nShare guard resume data where possible which reduces memory usage\nFast path for zip(list, list)\nReduce the number of checks in the JIT for lst[a:]\nMove the non-optimizable part of callbacks outside the JIT\nFactor in field immutability when invalidating heap information\nUnroll itertools.izip_longest() with two sequences\nMinor optimizations after analyzing output from vmprof and trace logs\nRemove many class attributes in rpython classes\nHandle getfield_gc_pure* and getfield_gc_* uniformly in heap.py\nImprove simple trace function performance by lazily calling fast2locals and locals2fast only if truly necessary \n\n\n\n\n\n\n\n\nPlease try it out and let us know what you think. 
We welcome feedback, we know you are using PyPy, please tell us about it!\nCheers\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2015/10/pypy-400-released-jit-with-simd-8282134928733384063.html" + }, + { + "title": "Automatic SIMD vectorization support in PyPy", + "text": "Hi everyone,\n\nit took some time to catch up with the JIT refactorings merged in this summer. But, (drums) we are happy to announce that:\n\n\nThe next release of PyPy,\u00a0 \"PyPy 4.0.0\", will ship the new auto vectorizer\nThe goal of this project was to increase the speed of numerical applications in both the NumPyPy library and for arbitrary Python programs. In PyPy we have focused a lot on improvements in the 'typical python workload', which usually involves object and string manipulations, mostly for web development. We're hoping with this work that we'll continue improving the other very important Python use case - numerics.\n\n\nWhat it can do! \nIt targets numerics only. It \nwill not execute object manipulations faster, but it is capable of \nenhancing common vector and matrix operations.\nGood news is that it is not specifically targeted for the NumPy library and the PyPy \nvirtual machine. Any interpreter (written in RPython) is able to make use \nof the vectorization. For more information about that take a look here, or consult the documentation. For the time being it is not turned on by default, so be sure to enable it by specifying --jit vec=1\u00a0before running your program.\n\nIf your language (written in RPython) contains many array/matrix operations, you can easily integrate the optimization by adding the parameter 'vec=1' to the JitDriver.\n\n\nNumPyPy Improvements\n\nLet's take a look at the core functions of the NumPyPy library (*). 
\nThe following tests show the speedup of the core functions commonly used in Python code interfacing with NumPy, on CPython with NumPy, on the PyPy 2.6.1 released several weeks ago, and on PyPy 15.11 to be released soon. Timeit was used to test the time needed to run the operation in the plot title on various vector (lower case) and square matrix (upper case) sizes displayed on the X axis. The Y axis shows the speedup compared to CPython 2.7.10. This means that higher is better.\u00a0\n\n\n\n\n\n\n\n\nIn comparison to PyPy 2.6.1, the speedup greatly improved. The hardware support really strips down the runtime of the vector and matrix operations. There is another operation we would like to highlight: the dot product.\nIt is a very common operation in numerics and PyPy now (given a moderate sized matrix and vector) decreases the time spent in that operation. See for yourself:\n\n\n\n\n\nThese are nice improvements in the NumPyPy library and we got to a competitive level only making use of SSE4.1.\n\n\nFuture work\u00a0\u00a0 \n\nThis is not the end of the road. The GSoC project showed that it is possible to implement this optimization in PyPy. 
There might be other improvements we can make to carry this further:\n\nCheck alignment at runtime to increase the memory throughput of the CPU\nSupport the AVX vector extension which (at least) doubles the size of the vector register\nHandle each and every corner case in Python traces to enable it\u00a0 globally\nDo not rely only on loading operations to trigger the analysis, there might be cases where combination of floating point values could be done in parallel \n\nCheers,\nThe PyPy Team\n\n(*) The benchmark code can be found here it was run using this configuration: i7-2600 CPU @ 3.40GHz (4 cores).", + "tags": "", + "url": "https://www.pypy.org/posts/2015/10/automatic-simd-vectorization-support-in-639063580401330508.html" + }, + { + "title": "PowerPC backend for the JIT", + "text": "Hi all,\n\nPyPy's JIT now supports the 64-bit PowerPC architecture! This is the\nthird architecture supported, in addition to x86 (32 and 64) and ARM\n(32-bit only). More precisely, we support Linux running the big- and the\nlittle-endian variants of ppc64. Thanks to IBM for funding this work!\n\nThe new JIT backend has been merged into \"default\". You should be able\nto translate PPC versions\nas usual\ndirectly on the machines. For\nthe foreseeable future, I will compile and distribute binary versions\ncorresponding to the official releases (for Fedora), but of course I'd\nwelcome it if someone else could step in and do it. Also, it is unclear\nyet if we will run a buildbot.\n\nTo check that the result performs well, I logged in a ppc64le machine\nand ran the usual benchmark suite of PyPy (minus sqlitesynth: sqlite\nwas not installed on that machine). I ran it twice at a difference of\n12 hours, as an attempt to reduce risks caused by other users suddenly\nusing the machine. The machine was overall relatively quiet. 
Of\ncourse, this is scientifically not good enough; it is what I could come\nup with given the limited resources.\n\nHere are the results, where the numbers are speed-up factors between the\nnon-jit and the jit version of PyPy. The first column is x86-64, for\nreference. The second and third columns are the two ppc64le runs. All\nare Linux. A few benchmarks are not reported here because the runner\ndoesn't execute them on non-jit (however, apart from sqlitesynth, they\nall worked).\n\n\n ai 13.7342 16.1659 14.9091\n bm_chameleon 8.5944 8.5858 8.66\n bm_dulwich_log 5.1256 5.4368 5.5928\n bm_krakatau 5.5201 2.3915 2.3452\n bm_mako 8.4802 6.8937 6.9335\n bm_mdp 2.0315 1.7162 1.9131\n chaos 56.9705 57.2608 56.2374\n sphinx\n crypto_pyaes 62.505 80.149 79.7801\n deltablue 3.3403 5.1199 4.7872\n django 28.9829 23.206 23.47\n eparse 2.3164 2.6281 2.589\n fannkuch 9.1242 15.1768 11.3906\n float 13.8145 17.2582 17.2451\n genshi_text 16.4608 13.9398 13.7998\n genshi_xml 8.2782 8.0879 9.2315\n go 6.7458 11.8226 15.4183\n hexiom2 24.3612 34.7991 33.4734\n html5lib 5.4515 5.5186 5.365\n json_bench 28.8774 29.5022 28.8897\n meteor-contest 5.1518 5.6567 5.7514\n nbody_modified 20.6138 22.5466 21.3992\n pidigits 1.0118 1.022 1.0829\n pyflate-fast 9.0684 10.0168 10.3119\n pypy_interp 3.3977 3.9307 3.8798\n raytrace-simple 69.0114 108.8875 127.1518\n richards 94.1863 118.1257 102.1906\n rietveld 3.2421 3.0126 3.1592\n scimark_fft\n scimark_lu\n scimark_montecarlo\n scimark_sor\n scimark_sparsematmul\n slowspitfire 2.8539 3.3924 3.5541\n spambayes 5.0646 6.3446 6.237\n spectral-norm 41.9148 42.1831 43.2913\n spitfire 3.8788 4.8214 4.701\n spitfire_cstringio 7.606 9.1809 9.1691\n sqlitesynth\n sympy_expand 2.9537 2.0705 1.9299\n sympy_integrate 4.3805 4.3467 4.7052\n sympy_str 1.5431 1.6248 1.5825\n sympy_sum 6.2519 6.096 5.6643\n telco 61.2416 54.7187 55.1705\n trans2_annotate\n trans2_rtype\n trans2_backendopt\n trans2_database\n trans2_source\n twisted_iteration 55.5019 51.5127 
63.0592\n twisted_names 8.2262 9.0062 10.306\n twisted_pb 12.1134 13.644 12.1177\n twisted_tcp 4.9778 1.934 5.4931\n\n GEOMETRIC MEAN 9.31 9.70 10.01\n\n\nThe last line reports the geometric mean of each column. We see that\nthe goal was reached: PyPy's JIT actually improves performance by a\nfactor of around 9.7 to 10 times on ppc64le. By comparison, it \"only\"\nimproves performance by a factor 9.3 on Intel x86-64. I don't know why,\nbut I'd guess it mostly means that a non-jitted PyPy performs slightly\nbetter on Intel than it does on PowerPC.\n\nWhy is that? Actually, if we do the same comparison with an ARM\ncolumn too, we also get higher numbers there than on Intel.\nWhen we discovered that a few years ago, we guessed that\non ARM running the whole interpreter in\nPyPy takes up a lot of resources, e.g. of instruction cache, which the\nJIT's assembler doesn't need any more after the process is warmed up.\nAnd caches are much bigger on Intel. However, PowerPC is much closer\nto Intel, so this argument doesn't work for PowerPC.\nBut there are other more subtle\nvariants of it. Notably, Intel is doing crazy things about branch\nprediction, which likely helps a big interpreter---both the non-JITted\nPyPy and CPython, and both for the interpreter's main loop itself and\nfor the numerous indirect branches that depend on the types of the\nobjects. Maybe the PowerPC is as good as Intel, and so this argument\ndoesn't work either. Another one would be:\non PowerPC I did notice that gcc itself is not\nperfect at optimization. During development of this backend, I often\nlooked at assembler produced by gcc, and there are a number of small\ninefficiencies there. All these are factors that slow down the\nnon-JITted version of PyPy, but don't influence the speed of the\nassembler produced just-in-time.\n\nAnyway, this is just guessing. The fact remains that PyPy can now\nbe used on PowerPC machines. 
Have fun!\n\nA bient\u00f4t,\n\nArmin.", + "tags": "", + "url": "https://www.pypy.org/posts/2015/10/powerpc-backend-for-jit-3014100267884692148.html" + }, + { + "title": "PyPy memory and warmup improvements (2) - Sharing of Guards", + "text": "Hello everyone!\nThis is the second part of the series of improvements in warmup time and\nmemory consumption in the PyPy JIT. This post covers recent work on sharing guard\nresume data that was recently merged to trunk. It will be a part\nof the next official PyPy release. To understand what it does, let's\nstart with a loop for a simple example:\n\nclass A(object):\n def __init__(self, x, y):\n self.x = x\n self.y = y\n\n def call_method(self, z):\n return self.x + self.y + z\n\ndef f():\n s = 0\n for i in range(100000):\n a = A(i, 1 + i)\n s += a.call_method(i)\n\nAt the entrance of the loop, we have the following set of operations:\n\nguard(i5 == 4)\nguard(p3 is null)\np27 = p2.co_cellvars\np28 = p2.co_freevars\nguard_class(p17, 4316866008, descr=)\np30 = p17.w_seq\nguard_nonnull(p30, descr=)\ni31 = p17.index\np32 = p30.strategy\nguard_class(p32, 4317041344, descr=)\np34 = p30.lstorage\ni35 = p34..item0\n\nThe above operations gets executed at the entrance, so each time we call f(). They ensure\nall the optimizations done below stay valid. Now, as long as nothing\nout of the ordinary happens, they only ensure that the world around us never changed. However, if e.g. someone puts new\nmethods on class A, any of the above guards might fail. Despite the fact that it's a very unlikely\ncase, PyPy needs to track how to recover from such a situation. 
Each of those points needs to keep the full\nstate of the optimizations performed, so we can safely deoptimize them and reenter the interpreter.\nThis is vastly wasteful since most of those guards never fail, hence some sharing between guards\nhas been performed.\nWe went a step further - when two guards are next to each other or the\noperations in between them don't have side effects, we can safely redo the operations or to simply\nput, resume in the previous guard. That means every now and again we execute a few\noperations extra, but not storing extra info saves quite a bit of time and memory. This is similar to the approach that LuaJIT takes, which is called sparse snapshots.\n\n\nI've done some measurements on annotating & rtyping translation of pypy, which\nis a pretty memory hungry program that compiles a fair bit. I measured, respectively:\n\ntotal time the translation step took (annotating or rtyping)\ntime it took for tracing (that excludes backend time for the total JIT time) at\nthe end of rtyping.\nmemory the GC feels responsible for after the step. The real amount of memory\nconsumed will always be larger and the coefficient of savings is in 1.5-2x mark\n\nHere is the table:\n\n\n\n\n\n\n\n\n\n\nbranch\ntime annotation\ntime rtyping\nmemory annotation\nmemory rtyping\ntracing time\n\n\n\ndefault\n317s\n454s\n707M\n1349M\n60s\n\nsharing\n302s\n430s\n595M\n1070M\n51s\n\nwin\n4.8%\n5.5%\n19%\n26%\n17%\n\n\n\nObviously pypy translation is an extreme example - the vast majority of the code out there\ndoes not have that many lines of code to be jitted. 
However, it's at the very least\na good win for us :-)\nWe will continue to improve the warmup performance and keep you posted!\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.html" + }, + { + "title": "PyPy warmup improvements", + "text": "Hello everyone!\nI'm very pleased to announce that we've just managed to merge\nthe optresult branch.\nUnder this cryptic name is the biggest JIT refactoring we've done in a couple\nyears, mostly focused on the warmup time and memory impact of PyPy.\nTo understand why we did that, let's look back in time - back when we\ngot the first working JIT prototype in 2009 we were focused exclusively\non achieving peak performance with some consideration towards memory usage, but\nwithout serious consideration towards warmup time. This means we accumulated\nquite a bit of technical debt over time that we're trying, with difficulty,\nto address right now. This branch mostly does not affect the peak performance\n- it should however help you with short-living scripts, like test runs.\nWe identified warmup time to be one of the major pain points for pypy users,\nalong with memory impact and compatibility issues with CPython C extension\nworld. While we can't address all the issues at once, we're trying to address\nthe first two in the work contributing to this blog post. 
I will write\na separate article on the last item.\nTo see how much of a problem warmup is for your program, you can run your\nprogram with PYPYLOG=jit-summary:- environment variable set.\nThis should show you something like this:\n\n(pypy-optresult)fijal@hermann:~/src/botbot-web$ PYPYLOG=jit-summary:- python orm.py 1500\n[d195a2fcecc] {jit-summary\nTracing: 781 2.924965\nBackend: 737 0.722710\nTOTAL: 35.912011\nops: 1860596\nrecorded ops: 493138\n calls: 81022\nguards: 131238\nopt ops: 137263\nopt guards: 35166\nforcings: 4196\nabort: trace too long: 22\nabort: compiling: 0\nabort: vable escape: 22\nabort: bad loop: 0\nabort: force quasi-immut: 0\nnvirtuals: 183672\nnvholes: 25797\nnvreused: 116131\nTotal # of loops: 193\nTotal # of bridges: 575\nFreed # of loops: 6\nFreed # of bridges: 75\n[d195a48de18] jit-summary}\n\nThis means that the total (wall clock) time was 35.9s, out of which we spent\n2.9s tracing 781 loops and 0.72s compiling them. The remaining couple were\naborted (trace too long is normal, vable escape means someone called\nsys._getframe() or equivalent). You can do the following things:\n\ncompare the numbers with pypy --jit off and see at which number of\niterations pypy jit kicks in\nplay with the thresholds:\npypy --jit threshold=500,function_threshold=400,trace_eagerness=50 was\nmuch better in this example. What this does is to lower the threshold\nfor tracing loops from default of 1039 to 500, threshold for tracing\nfunctions from the start from 1619 to 400 and threshold for tracing bridges\nfrom 200 to 50. Bridges are \"alternative paths\" that JIT did not take that\nare being additionally traced. We believe in sane defaults, so we'll try\nto improve upon those numbers, but generally speaking there is no one-size\nfits all here.\nif the tracing/backend time stays high, come and complain to us with\nbenchmarks, we'll try to look at them\n\nWarmup, as a number, is notoriously hard to measure. 
It's a combination of:\n\npypy running interpreter before jitting\npypy needing time to JIT the traces\nadditional memory allocations needed during tracing to accomodate bookkeeping\ndata\nexiting and entering assembler until there is enough coverage of assembler\n\nWe're working hard on making a better assesment at this number, stay tuned :-)\n\nSpeedups\nOverall we measured about 50% speed improvement in the optimizer, which reduces\nthe overall warmup time between 10% and 30%. The very\nobvious warmup benchmark got a speedup from 4.5s to 3.5s, almost\n30% improvement. Obviously the speedups on benchmarks would vastly\ndepend on how much warmup time is there in those benchmarks. We observed\nannotation of pypy to decreasing by about 30% and the overall translation\ntime by about 7%, so your mileage may vary.\nOf course, as usual with the large refactoring of a crucial piece of PyPy,\nthere are expected to be bugs. We are going to wait for the default branch\nto stabilize so you should see warmup improvements in the next release.\nIf you're not afraid to try, nightlies will already have them.\nWe're hoping to continue improving upon warmup time and memory impact in the\nfuture, stay tuned for improvements.\n\n\nTechnical details\nThe branch does \"one\" thing - it changes the underlying model of how operations\nare represented during tracing and optimizations. Let's consider a simple\nloop like:\n\n[i0, i1]\ni2 = int_add(i0, i1)\ni3 = int_add(i2, 1)\ni4 = int_is_true(i3)\nguard_true(i4)\njump(i3, i2)\n\nThe original representation would allocate a Box for each of i0 - i4\nand then store those boxes in instances of ResOperation. The list of such\noperations would then go to the optimizer. Those lists are big - we usually\nremove 90% of them during optimizations, but they can be a couple thousand\nelements. Overall, allocating those big lists takes a toll on warmup time,\nespecially due to the GC pressure. 
The branch removes the existence of Box\ncompletely, instead using a link to ResOperation itself. So say in the above\nexample, i2 would refer to its producer - i2 = int_add(i0, i1) with\narguments getting special treatment.\nThat alone reduces the GC pressure slightly, but a reduced number\nof instances also lets us store references on them directly instead\nof going through expensive dictionaries, which were used to store optimizing\ninformation about the boxes.\nCheers!\nfijal & arigo", + "tags": "", + "url": "https://www.pypy.org/posts/2015/09/pypy-warmup-improvements-8349465374608676233.html" + }, + { + "title": "PyPy 2.6.1 released", + "text": "PyPy 2.6.1\nWe\u2019re pleased to announce PyPy 2.6.1, an update to PyPy 2.6.0 released June 1.\nWe have fixed many issues, updated stdlib to 2.7.10, cffi to version 1.3, extended support for\nthe new vmprof statistical profiler for multiple threads, and increased\nfunctionality of numpy.\nYou can download the PyPy 2.6.1 release here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject, and our volunteers and contributors.\n\nWe would also like to encourage new people to join the project. PyPy has many\nlayers and we need help with all of them: PyPy and RPython documentation\nimprovements, tweaking popular modules to run on pypy, or general help with making\nRPython\u2019s JIT even better.\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. 
It\u2019s fast (pypy and cpython 2.7.x performance comparison)\ndue to its integrated tracing JIT compiler.\n\nThis release supports x86 machines on most common operating systems\n(Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd),\nas well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.\n\nWe also welcome developers of other\ndynamic languages to see what RPython can do for them.\n\n\n\nHighlights\n\nBug Fixes\nRevive non-SSE2 support\nFixes for detaching _io.Buffer*\nOn Windows, close (and flush) all open sockets on exiting\nDrop support for ancient macOS v10.4 and before\nClear up contention in the garbage collector between trace-me-later and pinning\nIssues reported with our previous release were resolved after reports from users on\nour issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at\n#pypy.\n\n\nNew features:\ncffi was updated to version 1.3\nThe python stdlib was updated to 2.7.10 from 2.7.9\nvmprof now supports multiple threads and OS X\nThe translation process builds cffi import libraries for some stdlib\npackages, which should prevent confusion when package.py is not used\nbetter support for gdb debugging\nfreebsd should be able to translate PyPy \u201cout of the box\u201d with no patches\n\n\nNumpy:\nBetter support for record dtypes, including the align keyword\nImplement casting and create output arrays accordingly (still missing some corner cases)\nSupport creation of unicode ndarrays\nBetter support ndarray.flags\nSupport axis argument in more functions\nRefactor array indexing to support ellipses\nAllow the docstrings of built-in numpy objects to be set at run-time\nSupport the buffered nditer creation keyword\n\n\nPerformance improvements:\nDelay recursive calls to make them non-recursive\nSkip loop unrolling if it compiles too much code\nTweak the heapcache\nAdd a list strategy for lists that store both floats and 32-bit integers.\nThe latter are encoded as nonstandard NaNs. 
Benchmarks show that the speed\nof such lists is now very close to the speed of purely-int or purely-float\nlists.\nSimplify implementation of ffi.gc() to avoid most weakrefs\nMassively improve the performance of map() with more than\none sequence argument\n\n\n\nPlease try it out and let us know what you think. We welcome\nsuccess stories, experiments, or benchmarks, we know you are using PyPy, please tell us about it!\nCheers\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2015/08/pypy-261-released-3638960649983103796.html" + }, + { + "title": "PyPy and ijson - a guest blog post", + "text": "This gem was posted in the ijson issue tracker after some discussion on #pypy, and Dav1dde kindly allowed us to repost it here:\n\n\"So, I was playing around with parsing huge JSON files (19GiB, testfile is ~520MiB) and wanted to try a sample code with PyPy, turns out, PyPy needed ~1:30-2:00 whereas CPython 2.7 needed ~13 seconds (the pure python implementation on both pythons was equivalent at ~8 minutes). \"Apparantly ctypes is really bad performance-wise, especially on PyPy. 
So I made a quick CFFI mockup: https://gist.github.com/Dav1dde/c509d472085f9374fc1d\n\nBefore:\nCPython 2.7: \u00a0\u00a0\u00a0 python -m emfas.server size dumps/echoprint-dump-1.json \u00a0\u00a0\u00a0 11.89s user 0.36s system 98% cpu 12.390 total\u00a0\nPYPY: \u00a0\u00a0\u00a0 python -m emfas.server size dumps/echoprint-dump-1.json \u00a0\u00a0\u00a0 117.19s user 2.36s system 99% cpu 1:59.95 total \nAfter (CFFI): CPython 2.7: \u00a0\u00a0\u00a0\u00a0 python jsonsize.py ../dumps/echoprint-dump-1.json\u00a0\u00a0\u00a0\u00a0 8.63s user 0.28s system 99% cpu 8.945 total\u00a0\nPyPy: \u00a0\u00a0\u00a0\u00a0 python jsonsize.py ../dumps/echoprint-dump-1.json \u00a0\u00a0\u00a0\u00a0 4.04s user 0.34s system 99% cpu 4.392 total\n\"\n\n\nDav1dde goes into more detail in the issue itself, but we just want to emphasize a few significant points from this brief interchange:\n\nHis CFFI implementation is faster than the ctypes one even on CPython 2.7.\nPyPy + CFFI is faster than CPython even when using C code to do the heavy parsing.\n\n\u00a0The PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2015/06/pypy-and-ijson-guest-blog-post-8143007374752482637.html" + }, + { + "title": "PyPy 2.6.0 release", + "text": "PyPy 2.6.0 - Cameo Charm\n\nWe\u2019re pleased to announce PyPy 2.6.0, only two months after PyPy 2.5.1. We are particularly happy to update cffi to version 1.1, which makes the popular ctypes-alternative even easier to use, and to support the new vmprof statistical profiler.\n\n\n\nYou can download the PyPy 2.6.0 release here:\n\n\n\n\nhttps://pypy.org/download.html\n\n\n\n\nWe would like to thank our donors for the continued support of the PyPy project, and for those who donate to our three sub-projects, as well as our volunteers and contributors.\n\n\n\nThanks also to Yury V. 
Zaytsev and David Wilson who recently started running nightly builds on Windows and MacOSX buildbots.\n\n\n\nWe\u2019ve shown quite a bit of progress, but we\u2019re slowly running out of funds. Please consider donating more, or even better convince your employer to donate, so we can finish those projects! The three sub-projects are:\n\n\n\nPy3k (supporting Python 3.x): We have released a Python 3.2.5 compatible version we call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version\nSTM (software transactional memory): We have released a first working version, and continue to try out new promising paths of achieving a fast multithreaded Python\nNumPy which requires installation of our fork of upstream numpy, available on bitbucket\n\n\n\n\nWe would also like to encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython\u2019s JIT even better. Nine new people contributed since the last release, you too could be one of them.\n\n\nWhat is PyPy?\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It\u2019s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\n\n\n\nThis release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows, OpenBSD, freebsd), as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.\n\n\n\nWhile we support 32 bit python on Windows, work on the native Windows 64 bit python is still stalling, we would welcome a volunteer to handle that. 
We also welcome developers with other operating systems or dynamic languages to see what RPython can do for them.\n\n\n\n\n\nHighlights\n\nPython compatibility:\n\nImprove support for TLS 1.1 and 1.2\nWindows downloads now package a pypyw.exe in addition to pypy.exe\nSupport for the PYTHONOPTIMIZE environment variable (impacting builtin\u2019s __debug__ property)\nIssues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy.\n\n\nNew features:\n\nAdd preliminary support for a new lightweight statistical profiler vmprof, which has been designed to accommodate profiling JITted code\n\n\nNumpy:\n\nSupport for object dtype via a garbage collector hook\nSupport for .can_cast and .min_scalar_type as well as beginning a refactoring of the internal casting rules\nBetter support for subtypes, via the __array_interface__, __array_priority__, and __array_wrap__ methods (still a work-in-progress)\nBetter support for ndarray.flags\n\n\nPerformance improvements:\n\nSlight improvement in frame sizes, improving some benchmarks\nInternal refactoring and cleanups leading to improved JIT performance\n\n\nImproved IO performance of zlib and bz2 modules\nWe continue to improve the JIT\u2019s optimizations. Our benchmark suite is now over 7 times faster than cpython\n\n\n\n\n\n\n\nPlease try it out and let us know what you think. We welcome success stories, experiments, or benchmarks, we know you are using PyPy, please tell us about it!\nCheers\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2015/06/pypy-260-release-8983050552628070433.html" + }, + { + "title": "CFFI 1.0.1 released", + "text": "CFFI 1.0.1 final has now been released for CPython! CFFI is a (CPython and PyPy) module to interact with C code from Python.\nThe main news from CFFI 0.9 is the new way to build extension modules:\nthe \"out-of-line\" mode, where you have a separate build script. 
When\nthis script is executed, it produces the extension module. This comes\nwith associated Setuptools support that fixes the headache of\ndistributing your own CFFI-using packages. It also massively cuts\ndown the import times.\nAlthough this is a major new version, it should be fully\nbackward-compatible: existing projects should continue to work, in\nwhat is now called the \"in-line mode\".\nThe documentation has been reorganized and split into a few pages.\nFor more information about this new \"out-of-line\" mode, as well as\nmore general information about what CFFI is and how to use it, read the Goals and proceed to\nthe Overview.\nUnlike the 1.0 beta 1 version (ffi.dlopen(), instead of only\nffi.verify().\nPyPy support: PyPy needs integrated support for efficient JITting,\nso you cannot install a different version of CFFI on top of an\nexisting PyPy. You need to wait for the upcoming PyPy 2.6 to use\nCFFI 1.0---or get a nightly build.\nMy thanks again to the PSF (Python Software Foundation) for their\nfinancial support!\n\nUPDATE:Bug with the first example \"ABI out-of-line\": variadic functions (like printf, ending in a \"...\" argument) crash. Fixed in CFFI 1.0.2.", + "tags": "release", + "url": "https://www.pypy.org/posts/2015/05/cffi-101-released-756545636419794802.html" + }, + { + "title": "CFFI 1.0 beta 1", + "text": "Finally! CFFI 1.0 is almost ready. CFFI gives Python developers a convenient way to call external C libraries. Here \"Python\" == \"CPython or PyPy\", but this post is mostly about the CPython side of CFFI, as the PyPy version is not ready yet.\nOn CPython, you can download the version\n\"1.0.0b1\" either by looking for the cffi-1.0 branch in\nthe repository, or by\nsaying\n\npip install \"cffi>=1.0.dev0\"\n\n(Until 1.0 final is ready,\npip install cffi will still give you version 0.9.2.)\nThe main news: you can now explicitly generate and compile a CPython C\nextension module from a \"build\" script. 
Then in the rest of your\nprogram or library, you no longer need to import cffi at all.\nInstead, you simply say:\n\nfrom _my_custom_module import ffi, lib\n\nThen you use ffi and lib just like you did in your\nverify()-based project in CFFI 0.9.2. (The lib is what used to\nbe the result of verify().) The details of how you use them\nshould not have changed at all, so that the rest of your program should\nnot need any update.\n\nBenefits\nThis is a big step towards standard practices for making and\ndistributing Python packages with C extension modules:\n\non the one hand, you need an explicit compilation step, triggered\nhere by running the \"build\" script;\non the other hand, what you gain in return is better control over\nwhen and why the C compilation occurs, and more standard ways to write\ndistutils- or setuptools-based setup.py files (see below).\n\nAdditionally, this completely removes one of the main drawbacks of using\nCFFI to interface with large C APIs: the start-up time. In some cases\nit could be extreme on slow machines (cases of 10-20 seconds on ARM\nboards occur commonly). Now, the import above is instantaneous.\nIn fact, none of the pure Python cffi package is needed any more at\nruntime (it needs only an internal extension module from CFFI, which\ncan be installed by doing \"pip install cffi-runtime\" [*] if you only need that).\nThe ffi object you get by the import above is of a\ncompletely different class written entirely in C. The two\nimplementations might get merged in the future; for now they are\nindependent, but give two compatible APIs. 
The differences are that\nsome methods like cdef() and verify() and set_source() are\nomitted from the C version, because it is supposed to be a complete FFI\nalready; and other methods like new(), which take as parameter a\nstring describing a C type, are faster now because that string is parsed\nusing a custom small-subset-of-C parser, written in C too.\n\n\nIn practice\nCFFI 1.0 beta 1 was tested on CPython 2.7 and 3.3/3.4, on Linux and to\nsome extent on Windows and OS/X. Its PyPy version is not ready yet,\nand the only docs available so far are those below.\nThis is beta software, so there might be bugs and details may change. We are interested in hearing any feedback (irc.freenode.net #pypy) or bug reports.\nTo use the new features, create a source file that is not imported by the rest of\nyour project, in which you place (or move) the code to build the FFI\nobject:\n\n# foo_build.py\nimport cffi\nffi = cffi.FFI()\n\nffi.cdef(\"\"\"\n int printf(const char *format, ...);\n\"\"\")\n\nffi.set_source(\"_foo\", \"\"\"\n #include <stdio.h>\n\"\"\") # and other arguments like libraries=[...]\n\nif __name__ == '__main__':\n ffi.compile()\n\nThe ffi.set_source() replaces the ffi.verify() of CFFI 0.9.2.\nCalling it attaches the given source code to the ffi object, but this call doesn't\ncompile or return anything by itself. It may be placed above the ffi.cdef()\nif you prefer. Its first argument is the name of the C extension module\nthat will be produced.\nActual compilation (including generating the complete C sources) occurs\nlater, in one of two places: either in ffi.compile(), shown above,\nor indirectly from the setup.py, shown next.\nIf you directly execute the file foo_build.py above, it will\ngenerate a local file _foo.c and compile it to _foo.so (or the\nappropriate extension, like _foo.pyd on Windows). 
This is the\nextension module that can be used in the rest of your program by saying\n\"from _foo import ffi, lib\".\n\n\nDistutils\nIf you want to distribute your program, you write a setup.py using\neither distutils or setuptools. Using setuptools is generally\nrecommended nowadays, but using distutils is possible too. We show it\nfirst:\n\n# setup.py\nfrom distutils.core import setup\nimport foo_build\n\nsetup(\n name=\"example\",\n version=\"0.1\",\n py_modules=[\"example\"],\n ext_modules=[foo_build.ffi.distutils_extension()],\n)\n\nThis is similar to the CFFI 0.9.2 way. It only works if cffi was\ninstalled previously, because otherwise foo_build cannot be\nimported. The difference is that you use ffi.distutils_extension()\ninstead of ffi.verifier.get_extension(), because there is no longer\nany verifier object if you use set_source().\n\n\nSetuptools\nThe modern way is to write setup.py files based on setuptools, which\ncan (among lots of other things) handle dependencies. It is what you\nnormally get with pip install, too. Here is how you'd write it:\n\n# setup.py\nfrom setuptools import setup\n\nsetup(\n name=\"example\",\n version=\"0.1\",\n py_modules=[\"example\"],\n setup_requires=[\"cffi>=1.0.dev0\"],\n cffi_modules=[\"foo_build:ffi\"],\n install_requires=[\"cffi-runtime\"], # see [*] below\n)\n\nNote that \"cffi\" is mentioned on three lines here:\n\nthe first time is in setup_requires, which means that cffi will\nbe locally downloaded and used for the setup.\nthe second mention is a custom cffi_modules argument. This\nargument is handled by cffi as soon as it is locally downloaded. It\nshould be a list of \"module:ffi\" strings, where the ffi part\nis the name of the global variable in that module.\nthe third mention is in install_requires. It means that in\norder to install this example package, \"cffi-runtime\" must also be\ninstalled. 
This is (or will be) a PyPI entry that only contains a\ntrimmed down version of CFFI, one that does not include the pure\nPython \"cffi\" package and its dependencies. None of it is needed at\nruntime.\n\n[*] NOTE: The \"cffi-runtime\" PyPI entry is not ready yet. For now, use \"cffi>=1.0.dev0\" instead. Considering PyPy, which has got a built-in \"_cffi_backend\" module, the \"cffi-runtime\" package could never be upgraded there; but it would still be nice if we were able to upgrade the \"cffi\" pure Python package on PyPy. This might require some extra care in writing the interaction code. We need to sort it out now...\n\n\nThanks\nSpecial thanks go to the PSF (Python Software Foundation) for their\nfinancial support, without which this work---er... it might likely have occurred anyway, but at an unknown future date :-)\n(For reference, the amount I asked for (and got) is equal to one\nmonth of what a Google Summer of Code student gets, for work that will\ntake a bit longer than one month. At least I personally am running mostly\non such money, and so I want to thank the PSF again for their\ncontribution to CFFI---and while I'm at it, thanks to all other\ncontributors to PyPy---for making this job more than an unpaid hobby on\nthe side :-)\n\nArmin Rigo", + "tags": "", + "url": "https://www.pypy.org/posts/2015/05/cffi-10-beta-1-4375652711495636911.html" + }, + { + "title": "PyPy-STM 2.5.1 released", + "text": "PyPy-STM 2.5.1 - Mawhrin-Skel\n\nWe're pleased to announce PyPy-STM 2.5.1, codenamed Mawhrin-Skel.\nThis is the second official release of PyPy-STM. You can download\nthis release here (64-bit Linux only):\n\nhttps://pypy.org/download.html\nDocumentation:\n\nhttps://pypy.readthedocs.org/en/latest/stm.html\nPyPy is an implementation of the Python programming language which focuses\non performance. So far we've been relentlessly optimizing for the single\ncore/process scenario. 
PyPy STM brings to the table a version of PyPy\nthat does not have the infamous Global Interpreter Lock, hence can run\nmultiple threads on multiple cores. Additionally it comes with a set\nof primitives that make writing multithreaded applications a lot easier,\nas explained below (see TransactionQueue) and in the documentation.\nInternally, PyPy-STM is based on the Software Transactional Memory\nplug-in called stmgc-c7. This version comes with a relatively\nreasonable single-core overhead but scales only up to around 4 cores\non some examples; the next version of the plug-in, stmgc-c8, is in\ndevelopment and should address that limitation (as well as reduce the\noverhead). These versions only support 64-bit Linux; we'd welcome\nsomeone to port the upcoming stmgc-c8 to other (64-bit) platforms.\nThis release passes all regular PyPy tests, except for a few\nspecial cases. In other words, you should be able to drop in\nPyPy-STM instead of the regular PyPy and your program should still\nwork. See current status for more information.\nThis work was done by Remi Meier and Armin Rigo. Thanks to all donors\nfor crowd-funding the STM work so far! As usual, it took longer\nthan we would have thought. I really want to thank the people that\nkept making donations anyway. Your trust is greatly appreciated!\n\n\nWhat's new?\nCompared to the July 2014 release, the main addition is a way to\nget reports about STM conflicts. This is an essential new feature.\nTo understand why this is so important, consider that if you already\nplayed around with the previous release, chances are that you didn't\nget very far. It probably felt like a toy: on very small examples it\nwould nicely scale, but on any larger example it would not scale at\nall. You didn't get any feedback about why, but the underlying reason\nis that, in a typical large example, there are some STM conflicts that\noccur all the time and that won't be immediately found just by\nthinking. 
This prevents any parallelization.\nNow PyPy-STM is no longer a black box: you have a way to learn about\nthese conflicts, fix them, and try again. The tl;dr version is to run:\n\n PYPYSTM=stmlog ./pypy-stm example.py\n ./print_stm_log.py stmlog\n\nMore details in the STM user guide.\n\n\n\nPerformance\nThe performance is now more stable than it used to be. More\nprecisely, the best case is still \"25%-40% single-core slow-down with\nvery good scaling up to 4 threads\", but the average performance seems\nnot too far from that. There are still dark spots --- notably, the\nJIT is still slower to warm up, though it was improved a lot. These\nare documented in the current status section. Apart from\nthat, we should not get more than 2x single-core slow-down in the\nworst case. Please report such cases as bugs!\n\n\n\nTransactionQueue\nAs explained before, PyPy-STM is more than \"just\" a Python without\nGIL. It is a Python in which you can do minor tweaks to your\nexisting, non-multithreaded programs and get them to use multiple\ncores. You identify medium- or large-sized, likely-independent parts\nof the code and ask PyPy-STM to run these parts in parallel. An\nexample would be every iteration of some outermost loop over all items\nof a dictionary. This is done with a new API:\ntransaction.TransactionQueue(). See help(TransactionQueue) or\nread more about it in the STM user guide.\nThis is not a 100% mechanical change: very likely, you need to hunt\nfor and fix \"STM conflicts\" that prevent parallel execution (see\ndocs). However, at all points your program runs correctly, and you\ncan stop the hunt when you get acceptable performance. 
You don't get\ndeadlocks or corrupted state.\n\nThanks for reading!\nArmin, Remi, Fijal", + "tags": "release", + "url": "https://www.pypy.org/posts/2015/03/pypy-stm-251-released-1342113838236225773.html" + }, + { + "title": "PyPy 2.5.1 Released", + "text": "PyPy 2.5.1 - Pineapple Bromeliad\nWe\u2019re pleased to announce PyPy 2.5.1, Pineapple Bromeliad following on the heels of 2.5.0. You can download the PyPy 2.5.1 release here:\n\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject, and for those who donate to our three sub-projects, as well as our\nvolunteers and contributors.\nWe\u2019ve shown quite a bit of progress, but we\u2019re slowly running out of funds.\nPlease consider donating more, or even better convince your employer to donate,\nso we can finish those projects! The three sub-projects are:\n\n\n\n\nPy3k (supporting Python 3.x): We have released a Python 3.2.5 compatible version we call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version\n\u00a0\n\n\n\nSTM (software transactional memory): We have released a first working version,\nand continue to try out new promising paths of achieving a fast multithreaded Python\n\n\n\n\nNumPy which requires installation of our fork of upstream numpy,\navailable on bitbucket\n\n\nWe would also like to encourage new people to join the project. PyPy has many\nlayers and we need help with all of them: PyPy and Rpython documentation\nimprovements, tweaking popular modules to run on pypy, or general help with making\nRpython\u2019s JIT even better.\n\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. 
It\u2019s fast (pypy and cpython 2.7.x performance comparison)\ndue to its integrated tracing JIT compiler.\n\n\nThis release supports x86 machines on most common operating systems\n(Linux 32/64, Mac OS X 64, Windows, and OpenBSD),\nas well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.\n\n\nWhile we support 32 bit python on Windows, work on the native Windows 64\nbit python is still stalling, we would welcome a volunteer\nto handle that.\n\n\n\n\nHighlights\n\nThe past months have seen pypy mature and grow, as RPython becomes the go-to\nsolution for writing fast dynamic language interpreters. Our separation of\nRPython from the python interpreter PyPy is now much clearer in the\nPyPy documentation and we now have separate RPython documentation.\nTell us what still isn\u2019t clear, or even better help us improve the documentation. \n\n\n\n\nWe merged version 2.7.9 of python\u2019s stdlib. From the python release notice:\nThe entirety of Python 3.4\u2019s ssl module has been backported.\nSee PEP 466 for justification.\nHTTPS certificate validation using the system\u2019s certificate store is now\nenabled by default. See PEP 476 for details.\nSSLv3 has been disabled by default in httplib and its reverse dependencies\ndue to the POODLE attack.\nThe ensurepip module has been backported, which provides the pip\npackage manager in every Python 2.7 installation. 
See PEP 477.\n\n\n\nThe garbage collector now ignores parts of the stack which did not change\nsince the last collection, another performance boost \n\n\nerrno and LastError are saved around cffi calls so things like pdb will not\noverwrite it \n\n\nWe continue to asymptotically approach a score of 7 times faster than cpython\non our benchmark suite, we now rank 6.98 on latest runs \n\n\nIssues reported with our previous release were resolved after reports from users on\nour issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at\n#pypy.\n\nPlease try it out and let us know what you think. We welcome\nsuccess stories, experiments, or benchmarks, we know you are using PyPy, please tell us about it!\n\nCheers\n\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2015/03/pypy-251-released-5657064769385723517.html" + }, + { + "title": "Pydgin: Using RPython to Generate Fast Instruction-Set Simulators", + "text": "Note: This is a guest blog post by Derek Lockhart and Berkin Ilbeyi from\nComputer Systems Laboratory of Cornell University.\nIn this blog post I'd like to describe some recent work on using the RPython\ntranslation toolchain to generate fast instruction set simulators.\nOur open-source framework, Pydgin [a], provides a domain-specific\nlanguage (DSL) embedded in Python for concisely describing instruction set\narchitectures [b] and then uses these descriptions to generate fast,\nJIT-enabled simulators.\nPydgin will be presented at the IEEE International Symposium on Performance\nAnalysis of Systems and Software (ISPASS) and in this post we provide a\npreview of that work.\nIn addition, we discuss some additional progress updates that occurred after\nthe publishing deadline and will not appear in the final paper [1].\nOur area of research expertise is computer architecture, which is perhaps an\nunfamiliar topic for some readers of the PyPy blog.\nBelow we provide some brief background on hardware simulation in the 
field of\ncomputer architecture, as well as some context as to why instruction set\nsimulators in particular are such an important tool.\n\nSimulators: Designing Hardware with Software\nFor computer architects in both academia and industry, a key step in designing\nnew computational hardware (e.g., CPUs, GPUs, and mobile system-on-chips) is\nsimulation [c] of the target system.\nWhile numerous models for simulation exist, three classes are particularly\nimportant in hardware design.\nFunctional Level models simulate the behavior of the target system.\nThese models are useful for creating a \"golden\" reference which can serve as an\nexecutable specification or alternatively as an emulation platform for software\ndevelopment.\nCycle Level models aim to simulate both the behavior and the approximate\ntiming of a hardware component.\nThese models help computer architects explore design tradeoffs and quickly\ndetermine things like how big caches should be, how many functional units are\nneeded to meet throughput targets, and how the addition of a custom accelerator\nblock may impact total system performance.\nRegister-Transfer Level (RTL) models specify the behavior, timing, and\nresources (e.g., registers, wires, logic gates) of a hardware component.\nRTL models are bit-accurate hardware specifications typically written in a\nhardware description language (HDL) such as Verilog or VHDL.\nOnce verified through extensive simulation, HDL specifications can be passed\ninto synthesis and place-and-route tools to estimate area/energy/timing or to\ncreate FPGA or ASIC prototypes.\nAn instruction set simulator (ISS) is a special kind of\nfunctional-level model that simulates the behavior of a processor or\nsystem-on-chip (SOC). 
ISSs serve an important role in hardware design\nbecause they model the instruction set architecture (ISA) interface: the\ncontractual boundary between hardware designers and software developers.\nISSs allow hardware designers to quickly experiment with adding new processor\ninstructions while also allowing software developers to build new compilers,\nlibraries, and applications long before physical silicon is available.\n\n\nInstruction-Set Simulators Must be Fast and Productive\nInstruction-set simulators are more important than ever because the ISA\nboundary has become increasingly fluid.\nWhile Moore's law has continued to deliver larger numbers of transistors\nwhich computer architects can use to build increasingly complex chips, limits\nin Dennard scaling have restricted how these transistors can be used [d].\nIn more simple terms, thermal constraints (and energy constraints in mobile\ndevices) have resulted in a growing interest in pervasive specialization:\nusing custom accelerators to more efficiently perform compute intensive tasks.\nThis is already a reality for designers of mobile SOCs who continually add new\naccelerator blocks and custom processor instructions in order to achieve higher\nperformance with less energy consumption.\nISSs are indispensable tools in this SOC design process for both hardware\narchitects building the silicon and software engineers developing the software\nstack on top of it.\nAn instruction set simulator has two primary responsibilities: 1) accurately\nemulating the external execution behavior of the target, and 2) providing\nobservability by accurately reproducing the target's internal state (e.g.,\nregister values, program counter, status flags) at each time step.\nHowever, other qualities critical to an effective ISS are simulation\nperformance and designer productivity.\nSimulation performance is important because shorter simulation times allow\ndevelopers to more quickly execute and verify large software 
applications.\nDesigner productivity is important because it allows hardware architects to\neasily experiment with adding new instructions and estimate their impact on\napplication performance.\nTo improve simulation performance, high-performance ISSs use dynamic binary\ntranslation (DBT) as a mechanism to translate frequently visited blocks of\ntarget instructions into optimized sequences of host instructions.\nTo improve designer productivity, many design toolchains automatically generate\nISSs from an architectural description language (ADL): a special\ndomain-specific language for succinctly specifying instruction encodings and\ninstruction semantics of an ISA.\nVery few existing systems have managed to encapsulate the design complexity of\nDBT engines such that high-performance, DBT-accelerated ISSs could be\nautomatically generated from ADLs [e].\nUnfortunately, tools which have done so are either proprietary software or\nleave much to be desired in terms of performance or productivity.\n\n\nWhy RPython?\nOur research group learned of the RPython translation toolchain through our\nexperiences with PyPy, which we had used in conjunction with our Python\nhardware modeling framework to achieve significant improvements in simulation\nperformance [2].\nWe realized that the RPython translation toolchain could potentially be adapted\nto create fast instruction set simulators since the process of interpreting\nexecutables comprised of binary instructions shared many similarities with the\nprocess of interpreting bytecodes in a dynamic-language VM.\nIn addition, we were inspired by PyPy's meta-tracing approach to JIT-optimizing\nVM design which effectively separates the process of specifying a language\ninterpreter from the optimization machinery needed to achieve good performance.\nExisting ADL-driven ISS generators have tended to use domain-specific\nlanguages that require custom parsers or verbose C-based syntax that\ndistracts from the instruction 
specification.\nCreating an embedded-ADL within Python provides several benefits over these\nexisting approaches including a gentler learning curve for new users, access to\nbetter debugging tools, and easier maintenance and extension by avoiding a\ncustom parser.\nAdditionally, we have found that the ability to directly execute Pydgin\nISA descriptions in a standard Python interpreter such as CPython or PyPy\nsignificantly helps debugging and testing during initial ISA exploration.\nPython's concise, pseudocode-like syntax also manages to map quite closely to\nthe pseudocode specifications provided by many ISA manuals [f].\n\n\nThe Pydgin embedded-ADL\nDefining a new ISA in the Pydgin embedded-ADL requires four primary pieces of\ninformation: the architectural state (e.g. register file, program counter,\ncontrol registers), the bit encodings of each instruction, the instruction\nfields, and the semantic definitions for each instruction. Pydgin aims to make\nthis process as painless as possible by providing helper classes and functions\nwhere possible.\nFor example, below we provide a truncated example of the ARMv5 instruction\nencoding table. Pydgin maintains encodings of all instructions in a centralized\nencodings data structure for easy maintenance and quick lookup. The\nuser-provided instruction names and bit encodings are used to automatically\ngenerate decoders for the simulator. 
Unlike many ADLs, Pydgin does not require\nthat the user explicitly specify instruction types or mask bits for field\nmatching because the Pydgin decoder generator can automatically infer decoder\nfields from the encoding table.\n\nencodings = [\n ['adc', 'xxxx00x0101xxxxxxxxxxxxxxxxxxxxx'],\n ['add', 'xxxx00x0100xxxxxxxxxxxxxxxxxxxxx'],\n ['and', 'xxxx00x0000xxxxxxxxxxxxxxxxxxxxx'],\n ['b', 'xxxx1010xxxxxxxxxxxxxxxxxxxxxxxx'],\n ['bl', 'xxxx1011xxxxxxxxxxxxxxxxxxxxxxxx'],\n ['bic', 'xxxx00x1110xxxxxxxxxxxxxxxxxxxxx'],\n ['bkpt', '111000010010xxxxxxxxxxxx0111xxxx'],\n ['blx1', '1111101xxxxxxxxxxxxxxxxxxxxxxxxx'],\n ['blx2', 'xxxx00010010xxxxxxxxxxxx0011xxxx'],\n # ...\n ['teq', 'xxxx00x10011xxxxxxxxxxxxxxxxxxxx'],\n ['tst', 'xxxx00x10001xxxxxxxxxxxxxxxxxxxx'],\n]\n\nA major goal of Pydgin was ensuring instruction semantic definitions map to ISA\nmanual specifications as much as possible. The code below shows one such\ndefinition for the ARMv5 add instruction.\nA user-defined Instruction class (not shown) specifies field names that can\nbe used to conveniently access bit positions within an instruction (e.g.\nrd, rn, S).\nAdditionally, users can choose to define their own helper functions, such as\nthe condition_passed function, to create more concise syntax that better\nmatches the ISA manual.\n\ndef execute_add( s, inst ):\n if condition_passed( s, inst.cond() ):\n a, = s.rf[ inst.rn() ]\n b, _ = shifter_operand( s, inst )\n result = a + b\n s.rf[ inst.rd() ] = trim_32( result )\n\n if inst.S():\n if inst.rd() == 15:\n raise FatalError('Writing SPSR not implemented!')\n s.N = (result >> 31)&1\n s.Z = trim_32( result ) == 0\n s.C = carry_from( result )\n s.V = overflow_from_add( a, b, result )\n\n if inst.rd() == 15:\n return\n\n s.rf[PC] = s.fetch_pc() + 4\n\nCompared to the ARM ISA Reference manual shown below, the Pydgin instruction\ndefinition is a fairly close match. 
Pydgin's definitions could certainly be\nmade more concise by using a custom DSL, however, this would lose many of the\ndebugging benefits afforded to a well-supported language such as Python and\nadditionally require using a custom parser that would likely need modification\nfor each new ISA.\n\nif ConditionPassed(cond) then\n Rd = Rn + shifter_operand\n if S == 1 and Rd == R15 then\n if CurrentModeHasSPSR() then CPSR = SPSR\n else UNPREDICTABLE else if S == 1 then\n N Flag = Rd[31]\n Z Flag = if Rd == 0 then 1 else 0\n C Flag = CarryFrom(Rn + shifter_operand)\n V Flag = OverflowFrom(Rn + shifter_operand)\n\nCreating an ISS that can run real applications is a rather complex task, even\nfor a bare metal simulator with no operating system such as Pydgin.\nEach system call in the C library must be properly implemented, and\nbootstrapping code must be provided to set up the program stack and\narchitectural state.\nThis is a very tedious and error prone process which Pydgin tries to\nencapsulate so that it remains as transparent to the end user as possible.\nIn future versions of Pydgin we hope to make bootstrapping more painless and\nsupport a wider variety of C libraries.\n\n\n\n\nPydgin Performance\nIn order to achieve good simulation performance from Pydgin ISSs, significant\nwork went into adding appropriate JIT annotations to the Pydgin library\ncomponents.\nThese optimization hints, which allow the JIT generated by the RPython\ntranslation toolchain to produce more efficient code, have been specifically\nselected for the unique properties of ISSs.\nFor the sake of brevity, we do not talk about the exact optimizations here but\na detailed discussion can be found in the ISPASS paper [1].\nIn the paper we evaluate two ISSs, one for a simplified MIPS ISA and another\nfor the ARMv5 ISA, whereas below we only discuss results for the ARMv5 ISS.\nThe performance of Pydgin-generated ARMv5 ISSs were compared against\nseveral reference ISSs: the gem5 ARM atomic simulator 
(gem5),\ninterpretive and JIT-enabled versions of SimIt-ARM (simit-nojit and\nsimit-jit), and QEMU.\nAtomic models from the gem5 simulator were chosen for comparison due to their wide\nusage amongst computer architects [g].\nSimIt-ARM was selected because it is currently the highest performance\nADL-generated DBT-ISS publicly available.\nQEMU has long been held as the gold-standard for DBT simulators due to its\nextremely high performance, however, QEMU is generally intended for usage as an\nemulator rather than a simulator [c] and therefore achieves its excellent\nperformance at the cost of observability.\nUnlike QEMU, all other simulators in our study faithfully track architectural\nstate at an instruction level rather than block level.\nPydgin ISSs were generated with and without JITs using the RPython translation\ntoolchain in order to help quantify the performance benefit of the meta-tracing\nJIT.\nThe figure below shows the performance of each ISS executing applications from\nthe SPEC CINT2006 benchmark suite [h].\nBenchmarks were run to completion on the high-performance DBT-ISSs\n(simit-jit, pydgin-jit, and QEMU), but were terminated after only\n10 billion simulated instructions for the non-JITed interpretive ISSs\n(these would require many hours, in some cases days, to run to completion).\nSimulation performance is measured in MIPS [i] and plotted on a log\nscale due to the wide variance in performance.\nThe WHMEAN group summarizes each ISS's performance across all benchmarks\nusing the weighted harmonic mean.\n\n\n\nA few points to take away from these results:\n\nISSs without JITs (gem5, simit-nojit, and pydgin-nojit) demonstrate\nrelatively consistent performance across applications, whereas ISSs with JITs\n(simit-jit, pydgin-jit, and QEMU) demonstrate much greater\nperformance variability from application-to-application.\nThe gem5 atomic model demonstrates particularly miserable performance, only\n2-3 MIPS!\nQEMU lives up to its reputation as a 
gold-standard for simulator performance,\nleading the pack on nearly every benchmark and reaching speeds of 240-1120\nMIPS.\npydgin-jit is able to outperform simit-jit on four of the\napplications, including considerable performance improvements of 1.44\u20131.52\u00d7\nfor the applications 456.hmmer, 462.libquantum, and 471.omnetpp\n(managing to even outperform QEMU on 471.omnetpp).\nsimit-jit is able to obtain much more consistent performance (230-459\nMIPS across all applications) than pydgin-jit (9.6-659 MIPS). This is\ndue to simit-jit's page-based approach to JIT optimization compared to\npydgin-jit's tracing-based approach.\n464.h264ref displays particularly bad pathological behavior in Pydgin\u2019s\ntracing JIT and is the only application to perform worse on pydgin-jit\nthan pydgin-nojit (9.6 MIPS vs. 21 MIPS).\n\nThe pathological behavior demonstrated by 464.h264ref was of particular\nconcern because it caused pydgin-jit to perform even worse than having no\nJIT at all. RPython JIT logs indicated that the reason for this performance\ndegradation was a large number of tracing aborts due to JIT traces growing too\nlong. 
However, time limitations before the publication deadline prevented us\nfrom investigating this issue thoroughly.\nSince the deadline we've applied some minor bug fixes and made some small\nimprovements in the memory representation.\nMore importantly, we've addressed the performance degradation in 464.h264ref\nby increasing trace lengths for the JIT.\nBelow we show how the performance of 464.h264ref changes as the\ntrace_limit parameter exposed by the RPython JIT is varied from the default\nsize of 6000 operations.\n\n\n\n\nBy quadrupling the trace limit we achieve an 11x performance improvement in\n464.h264ref.\nThe larger trace limit allows the JIT to optimize long code paths that were\npreviously triggering trace aborts, greatly helping amortize the costs of\ntracing.\nNote that arbitrarily increasing this limit can potentially hurt performance if\nlonger traces are not able to detect optimizable code sequences.\nAfter performing similar experiments across the applications in the SPEC\nCINT2006 benchmark suite, we settled on a trace limit of 400,000 operations.\nIn the figure below we show how the updated Pydgin ISS (pydgin-400K) improves\nperformance across all benchmarks and fixes the performance degradation\npreviously seen in 464.h264ref. Note that the non-JITted simulators have been\nremoved for clarity, and simulation performance is now plotted on a\nlinear scale to more clearly distinguish the performance gap between\neach ISS.\n\n\n\nWith these improvements, we are now able to beat simit-jit on all but two\nbenchmarks. 
In future work we hope to further close the gap with QEMU as well.\n\n\nConclusions and Future Work\nPydgin demonstrates that the impressive work put into the RPython translation\ntoolchain, designed to simplify the process of building fast dynamic-language\nVMs, can also be leveraged to build fast instruction set simulators.\nOur prototype ARMv5 ISS shows that Pydgin can generate ISSs with performance\ncompetitive to SimIt-ARM while also providing a more productive development\nexperience: RPython allowed us to develop Pydgin with only four person-months\nof work.\nAnother significant benefit of the Pydgin approach is that any performance\nimprovements applied to the RPython translation toolchain immediately benefit\nPydgin ISSs after a simple software download and retranslation.\nThis allows Pydgin to track the continual advances in JIT technology introduced\nby the PyPy development team.\nPydgin is very much a work in progress. There are many features we would like\nto add, including:\n\nmore concise syntax for accessing arbitrary instruction bits\nsupport for other C libraries such as glibc, uClibc, and musl\n(we currently only support binaries compiled with newlib)\nsupport for self-modifying code\nfeatures for more productive debugging of target applications\nISS descriptions for other ISAs such as RISC-V, ARMv8, and x86\nautomatic generation of compilers and toolchains from Pydgin descriptions\n\nIn addition, we think there are opportunities for even greater performance\nimprovements with more advanced techniques such as:\n\nautomatic generation of optimized instruction decoders\noptimizations for floating-point intensive applications\nmultiple tracing-JITs for parallel simulation of multicore SOCs\na parallel JIT compilation engine as proposed by Bo\u0308hm et al. [3]\n\nWe hope that Pydgin can be of use to others, so if you try it out please let us\nknow what you think. 
Feel free to contact us if you find any of the above\ndevelopment projects interesting, or simply fork the project on GitHub and hack\naway!\n-- Derek Lockhart and Berkin Ilbeyi\n\n\nAcknowledgements\n We would like to sincerely thank Carl Friedrich Bolz and Maciej Fijalkowski for their feedback on the Pydgin publication and their guidance on improving the JIT performance of our simulators. We would also like to thank the whole PyPy team for their incredible work on PyPy and the RPython translation toolchain. Finally, thank you to our research advisor, Prof. Christopher Batten, and the sponsors of this work which include the National Science Foundation, the Defense Advanced Research Projects Agency, and Intel Corporation.\n\n\nFootnotes\n\n\n\n[a]Pydgin loosely stands for [Py]thon [D]SL for [G]enerating\n[In]struction set simulators and is pronounced the same as \u201cpigeon\u201d. The\nname is inspired by the word \u201cpidgin\u201d which is a grammatically simplified\nform of language and captures the intent of the Pydgin embedded-ADL.\nhttps://github.com/cornell-brg/pydgin\n\n\n\n\n\n[b]Popular instruction set architectures (ISAs) include MIPS, ARM,\nx86, and more recently RISC-V\n\n\n\n\n\n[c](1, 2) For a good discussion of simulators vs. 
emulators, please see the\nfollowing post on StackOverflow:\nhttps://stackoverflow.com/questions/1584617/simulator-or-emulator-what-is-the-difference\n\n\n\n\n\n[d]https://en.wikipedia.org/wiki/Dark_silicon\n\n\n\n\n\n[e]Please see the Pydgin paper for a more detailed discussion of prior work.\n\n\n\n\n\n[f]For more examples of Pydgin ISA specifications, please see the ISPASS\npaper [1] or the Pydgin source code on GitHub.\nPydgin instruction definitions for a simple MIPS-inspired ISA can be\nfound here:\n\nhttps://github.com/cornell-brg/pydgin/blob/master/parc/isa.py\n\nPydgin instruction definitions for a simplified ARMv5 ISA can be found\nhere:\n\nhttps://github.com/cornell-brg/pydgin/blob/master/arm/isa.py\n\n\n\n\n\n\n\n[g]gem5 is a cycle-level simulation framework that contains both\nfunctional-level (atomic) and cycle-level processor models. Although\nprimarily used for detailed, cycle-approximate processor simulation,\ngem5's atomic model is a popular tool for many ISS tasks.\n\nhttps://www.m5sim.org/SimpleCPU\n\n\n\n\n\n\n\n[h]All performance measurements were taken on an unloaded server-class\nmachine.\n\n\n\n\n\n[i]Millions of instructions per second.\n\n\n\n\nReferences\n\n\n\n[1](1, 2, 3) Derek Lockhart, Berkin Ilbeyi, and Christopher Batten. \"Pydgin:\nGenerating Fast Instruction Set Simulators from Simple Architecture\nDescriptions with Meta-Tracing JIT Compilers.\" IEEE Int'l Symp. on\nPerformance Analysis of Systems and Software (ISPASS), Mar. 2015.\n\nhttps://csl.cornell.edu/~cbatten/pdfs/lockhart-pydgin-ispass2015.pdf\nhttps://github.com/cornell-brg/pydgin\n\n\n\n\n\n\n\n[2]Derek Lockhart, Gary Zibrat, and Christopher Batten. \"PyMTL: A Unified\nFramework for Vertically Integrated Computer Architecture Research.\" 47th\nACM/IEEE Int'l Symp. on Microarchitecture (MICRO-47), Dec. 2014.\n\nhttps://csl.cornell.edu/~cbatten/pdfs/lockhart-pymtl-micro2014.pdf\nhttps://github.com/cornell-brg/pymtl\n\n\n\n\n\n\n\n[3]I. Bo\u0308hm, B. Franke, and N. 
Topham. Generalized Just-In-Time Trace\nCompilation Using a Parallel Task Farm in a Dynamic Binary Translator.\nACM SIGPLAN Conference on Programming Language Design and Implementation\n(PLDI), Jun 2011.", + "tags": "", + "url": "https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html" + }, + { + "title": "Experiments in Pyrlang with RPython", + "text": "Pyrlang is an Erlang BEAM bytecode interpreter written in RPython.\nIt implements approximately 25% of BEAM instructions. It can support\ninteger calculations (but not bigint), closures, exception handling,\nsome operators to atom, list and tuple, user modules, and multi-process\nin single core. Pyrlang is still in development.\nThere are some differences between BEAM and the VM of PyPy:\n\nBEAM is a register-based VM, whereas the VM in PyPy is stack-based.\nThere is no traditional call-stack in BEAM. The Y register in BEAM is\nsimilar to a call-stack, but the Y register can sometimes store some\nvariables.\nThere are no typical language-level threads and OS-level threads in\nBEAM; only language-level processes, whose behavior is very similar\nto the actor model.\n\nRegarding bytecode dispatch loop, Pyrlang uses a while loop to fetch\ninstructions and operands, call the function corresponding to every\ninstruction, and jump back to the head of the while loop. Due to the\ndifferences between the RPython call-stack and BEAM\u2019s Y register, we\ndecided to implement and manage the Y register by hand. On the other\nhand, PyPy uses RPython\u2019s call stack to implement Python\u2019s call stack.\nAs a result, the function for the dispatch loop in PyPy calls itself\nrecursively. This does not happen in Pyrlang.\nThe Erlang compiler (erlc) usually compiles the bytecode instructions\nfor function invocation into CALL (for normal invocation) and CALL_ONLY\n(for tail recursive invocation). 
You can use a trampoline semantic to\nimplement it:\n\nCALL instruction: The VM pushes the current instruction pointer (or\nprogram counter, as it is called in PyPy) to the Y register, and jumps to the\ndestination label. When encountering a RETURN instruction, the VM\npops the instruction pointer from the Y register and returns to the\nlocation of the instruction pointer to continue executing the outer\nfunction.\nCALL_ONLY instruction: The VM simply jumps to the destination label,\nwithout any modification of the Y register. As a result, the tail\nrecursive invocation never increases the Y register.\n\nThe current implementation only inserts the JIT hint of can_enter_jit\nfollowing the CALL_ONLY instruction. This means that the JIT only\ntraces the tail-recursive invocation in Erlang code, which has a very\nsimilar semantic to the loop in imperative programming languages like\nPython.\nWe have also written a single scheduler to implement the language level\nprocess in a single core. There is a runnable queue in the scheduler. On\neach iteration, the scheduler pops one element (which is a process\nobject with dispatch loop) from the queue, and executes the dispatch\nloop of the process object. In the dispatch loop, however, there is a\ncounter called \u201creduction\u201d inside the dispatch loop. The reduction\ndecrements during the execution of the loop, and when the reduction\nbecomes 0, the dispatch loop terminates. Then the scheduler pushes that\nelement into the runnable queue again, and pops the next element from the\nqueue, and so on.\nWe are planning to implement a multi-process scheduler for multi-core\nCPUs, which will require multiple schedulers and even multiple runnable\nqueues for each core, but that will be another story. 
:-)\n\nMethods\nWe wrote two benchmark programs of Erlang:\n\nFACT: A benchmark to calculate the factorial in a tail-recursive\nstyle, but because we haven\u2019t implemented big int, we do a remainder\ncalculation to the argument for the next iteration, so the number\nnever overflows.\nREVERSE: The benchmark creates a reversed list of numbers, such as\n[20000, 19999, 19998, \u2026], and applies a bubble sort to it.\n\n\n\nResults\n\nThe Value of Reduction\nWe used REVERSE to evaluate the JIT with different values of\nreduction:\n\n\nThe X axis is the value of reduction, and the Y axis is the execution\ntime (by second).\nIt seems that when the value of reduction is small, the reduction\ninfluences the performance significantly, but when reduction becomes\nlarger, it only increases the speed very slightly. In fact, we use 2000\nas the default reduction value (as well as the reduction value in the\nofficial Erlang interpreter).\nSurprisingly, the trace is always generated even when the reduction is\nvery small, such as 0, which means the dispatch loop can only run for a\nvery limited number of iterations, and the language level process\nexecutes fewer instructions than an entire loop in one switch of the\nscheduler). 
The generated trace is almost the same, regardless of\ndifferent reduction values.\nActually, the RPython JIT only cares what code it meets, but does not\ncare who executes it, thus the JIT always generates the results above.\nThe trace even can be shared among different threads if they execute the\nsame code.\nThe overhead at low reduction value may be due to the scheduler, which\nswitches from different processes too frequently, or from the\ntoo-frequent switching between bytecode interpreter and native code, but\nnot from JIT itself.\nHere is more explanation from Armin Rigo:\n\n\u201cThe JIT works well because you\u2019re using a scheme where some counter\nis decremented (and the soft-thread interrupted when it reaches\nzero) only once in each app-level loop. The soft-thread switch is\ndone by returning to some scheduler, which will resume a different\nsoft-thread by calling it. It means the JIT can still compile each\nof the loops as usual, with the generated machine code containing\nthe decrease-and-check-for-zero operation which, when true, exits\nthe assembler.\"\n\n\nFair Process Switching vs. Unfair Process Switching\nWe are also concerned about the timing for decreasing reduction value.\nIn our initial version of Pyrlang, we decrease reduction value at every\nlocal function invocation, module function invocation, and BIF (built-in\nfunction) invocation, since this is what the official Erlang interpreter\ndoes. However, since the JIT in RPython basically traces the target\nlanguage loop (which is the tail recursive invocation in Pyrlang) it is\ntypically better to keep the loop whole during a switch of the language\nlevel process. We modified Pyrlang, and made the reduction decrement\nonly occur after CALL_ONLY, which is actually the loop boundary of the\ntarget language.\nOf course, this strategy may cause an \u201cunfair\u201d execution among language\nlevel processes. 
For example, if one process has only a single\nlong-sequence code, it executes until the end of the code. On the other\nhand, if a process has a very short loop, it may be executed by very\nlimited steps then be switched out by the scheduler. However, in the\nreal world, this \u201cunfairness\u201d is usually considered acceptable, and is\nused in many VM implementations including PyPy for improving the overall\nperformance.\nWe compared these two versions of Pyrlang in the FACT benchmark. The\nreduction decrement is quite different because there are some BIF\ninvocations inside the loop. In the old version the process can be\nsuspended at loop boundaries or other function invocation, but in the\nnew version, it can be suspended only at loop boundaries.\nWe show that the strategy is effective, removing around 7% of the\noverhead. We have also compared it in REVERSE, but since there are no\nextra invocations inside the trace, it cannot provide any performance\nimprovement. In the real world, we believe there is usually more than\none extra invocation inside a single loop, so this strategy is effective\nfor most cases.\n\n\nComparison with Default Erlang and HiPE\nWe compared the performance of Pyrlang with the default Erlang\ninterpreter and the HiPE (High Performance Erlang) compiler. HiPE is an\nofficial Erlang compiler that can compile Erlang source code to native\ncode. The speed of Erlang programs obviously improves but loses its\ngenerality instead.\nPlease note that Pyrlang is still in development, so in some situations\nit does less work than the default Erlang interpreter, such as not\nchecking integer overflow when dealing with big integer, and not\nchecking and adding locks when accessing message queues in the\nlanguage-level process, so is therefore faster. The final version of\nPyrlang may be slower.\nWe used the two benchmark programs above, and made sure both of them are\nexecuted for more than five seconds to cover the JIT warm-up time for\nRPython. 
The experiment environment is an OS X 10.10 machine with a 3.5GHz\n6-core Intel Xeon E5 CPU and 14GB 1866 MHz DDR3 ECC memory.\nLet\u2019s look at the result of FACT. The graph shows that Pyrlang runs\n177.41% faster on average than Erlang, and runs at almost the same speed\nas HiPE. However, since we haven\u2019t implemented big integer in Pyrlang,\nthe arithmetical operators do not do any extra overflow checking. It is\nreasonable that the final version for Pyrlang will be slower than the\ncurrent version and HiPE.\n\nAs for REVERSE, the graph shows that Pyrlang runs 45.09% faster than\nErlang, but 63.45% slower than HiPE on average. We think this is\nreasonable because there are only a few arithmetical operators in this\nbenchmark so the speeds of these three implementations are closer.\nHowever, we observed that at the scale of 40,000, the speed of Pyrlang\nslowed down significantly (111.35% slower than HiPE) compared with the\nother two scales (56.38% and 22.63% slower than HiPE).\nUntil now we can only hypothesize why Pyrlang slows down at that scale.\nWe guess that the overhead might be from GC. This is because the BEAM\nbytecode provides some GC hints to help the default Erlang compiler to\nperform some GC operations immediately. For example, using GC_BIF\ninstead of a BIF instruction tells the VM that there may be a GC\nopportunity, and tells the VM how many live variables should be around\none instruction. In Pyrlang we do not use these kinds of hints but rely\non RPython\u2019s GC totally. 
When there are a huge number of objects during\nruntime, (as for REVERSE, it should be the Erlang list object) the speed\ntherefore slows down.\n\nRuochen Huang", + "tags": "", + "url": "https://www.pypy.org/posts/2015/02/experiments-in-pyrlang-with-rpython-8103387814587972227.html" + }, + { + "title": "linalg support in pypy/numpy", + "text": "Introduction\nPyPy's numpy support has matured enough that it can now support the lapack/blas libraries through the numpy.linalg module. To install the version of numpy this blog post refers to, install PyPy version 2.5.0 or newer, and run this:\n\npypy -m pip install git+https://bitbucket.org/pypy/numpy.git\n\n\nThis update is a major step forward for PyPy's numpy support. Many of the basic matrix operations depend on linalg, even matplotlib requires it to display legends (a pypy-friendly version of matplotlib 1.3 is available at https://github.com/mattip/matplotlib).\n\nA number of improvements and adaptations, some of which are in the newly-released PyPy 2.5.0, made this possible:\n\nSupport for an extended frompyfunc(), which in the PyPy version supports much of the ufunc API (signatures, multiple dtypes) allowing creation of pure-python, jit-friendly ufuncs. An additional keyword allows choosing between out = func(in) or func(in, out) ufunc signatures. More explanation follows.\nSupport for GenericUfuncs via PyPy's (slow) capi-compatibility layer. The underlying mechanism actually calls the internal implementation of frompyfunc().\nA cffi version of _umath_linalg. Since cffi uses dlopen() to call into shared objects, we added support in the numpy build system to create non-python shared libraries from source code in the numpy tree. We also rewrote parts of the c-based _umath_linalg.c.src in python, renamed numpy's umath_linalg capi module to umath_linag_capi, and use it as a shared object through cffi.\n\n\n\nStatus\nWe have not completely implemented all the linalg features. 
dtype resolution via casting is missing, especially for complex ndarrays, which leads to slight numerical errors where numpy uses a more precise type for intermediate calculations. Other missing features in PyPy's numpy support may have implications for complete linalg support.\n\nSome OSX users have noticed they need to update pip to version 6.0.8 to overcome a regression in pip, and it is not clear if we support all combinations of blas/lapack implementations on all platforms.\n\nOver the next few weeks we will be ironing out these issues.\n\n\nPerformance\nA simple benchmark is shown below, but let's state the obvious: PyPy's JIT and the iterators built into PyPy's ndarray implementation will in most cases be no faster than CPython's numpy. The JIT can help where there is a mixture of python and numpy-array code. We do have plans to implement lazy evaluation and to further optimize PyPy's support for numeric python, but numpy is quite good at what it does.\n\n\nHowTo for PyPy's extended frompyfunc \nThe magic enabling blas support is a rewrite of the _umath_linalg c-based module as a cffi-python module that creates ufuncs via frompyfunc. We extended the numpy frompyfunc to allow it to function as a replacement for the generic ufunc available in numpy only through the c-api.\n\nWe start with the basic frompyfunc, which wraps a python function into a ufunc:\n\u00a0\ndef times2(in0):\n return in0 * 2\nufunc = frompyfunc(times2, 1, 1)\n\n\nIn cpython's numpy the dtype of the result is always object, which is not implemented (yet) in PyPy, so this example will fail. While the utility of object dtypes can be debated, in the meantime we add a non-numpy-compatible keyword argument dtypes to frompyfunc. 
If dtype=['match'] the output dtype will match the dtype of the first input ndarray:\n\nufunc = frompyfunc(times2, 1, 1, dtype=['match'])\nai = arange(24).reshape(3, 4, 2)\nao = ufunc(ai)\nassert (ao == ai * 2).all()\n\n\nI hear you ask \"why is the dtypes keyword argument a list?\" This is so we can support the Generalized Universal Function API, which allows specifying a number of specialized functions and the input-output dtypes each specialized function accepts.\nNote that the function feeds the values of ai one at a time, the function operates on scalar values. To support more complicated ufunc calls, the generalized ufunc API allows defining a signature, which specifies the layout of the ndarray inputs and outputs. So we extended frompyfunc with a signature keyword as well.\nWe add one further extension to frompyfunc: we allow a Boolean keyword stack_inputs to specify the argument layout of the function itself. If the function is of the form:\n\u00a0\nout0, out1, ... = func(in0, in1,...)\n\n\nthen stack_inputs is False. If it is True the function is of the form:\n\u00a0\nfunc(in0, in1, ... out0, out1, ...)\n\n\nHere is a complete example of using frompyfunc to create a ufunc, based on this link:\n\u00a0\ndef times2(in_array, out_array):\n in_flat = in_array.flat\n out_flat = out_array.flat\n for i in range(in_array.size):\n out_flat[i] = in_flat[i] * 2\nufunc = frompyfunc([times2, times2], 1, 1,\n signature='(i)->(i)',\n dtypes=[dtype(int), dtype(int),\n dtype(float), dtype(float),\n ],\n stack_inputs=True,\n )\nai = arange(10, dtype=int)\nai2 = ufunc(ai)\nassert all(ai2 == ai * 2)\n\n\nUsing this extended syntax, we rewrote the lapack calls into the blas functions in pure python, no c needed. Benchmarking this approach actually was much slower than using the upstream umath_linalg module via cpyext, as can be seen in the following benchmarks. This is due to the need to copy c-aligned data into Fortran-aligned format. 
Our __getitem__ and __setitem__ iterators are not as fast as pointer arithmetic in C. So we next tried a hybrid approach: compile and use numpy's umath_linalg python module as a shared object, and call the optimized specific wrapper function from it.\n\n\nBenchmarks\nHere are some benchmarks, running a tight loop of the different versions of linalg.inv(a), where a is a 10x10 double ndarray. The benchmark ran on an i7 processor running ubuntu 14.04 64 bit:\n\n Impl. Time after warmup \n \n CPython 2.7 + numpy 1.10.dev + lapack 8.9 msec/1000 loops \n PyPy 2.5.0 + numpy + lapack via cpyext 8.6 msec/1000 loops \n PyPy 2.5.0 + numpy + lapack via pure python + cffi 19.9 msec/1000 loops \n PyPy 2.5.0 + numpy + lapack via python + c + cffi 9.5 msec/1000 loops\n\n\n\n\n\n\n\n\nWhile no general conclusions may be drawn from a single micro-benchmark, it does indicate that there is some merit in the approach taken. \n\nConclusion\nPyPy's numpy now includes a working linalg module. There are still some rough corners, but hopefully we have implemented the parts you need. While the speed of the isolated linalg function is no faster than CPython and upstream numpy, it should not be significantly slower either. Your use case may see an improvement if you use a mix of python and lapack, which is the usual case.\n\nPlease let us know how it goes. 
We love to hear success stories too.\n\nWe still have challenges at all levels of programming, and are always looking for people willing to contribute, so stop by on IRC at #pypy.\n\nmattip and the PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2015/02/linalg-support-in-pypynumpy-1131217944329711855.html" + }, + { + "title": "NumPyPy status - January 2015", + "text": "Hi Everyone\n\nHere is what has been done in January thanks to the funding of NumPyPy,\u00a0I would like to thank all the donors and tell you that you can still donate\u00a0:\n\nI have focused on implementing the object dtype this month, it is now possible to store objects inside ndarrays using the object dtype\nIt is also possible to add an object ndarray to any other ndarray (implementing other operators is trivial)\n\n\nThe things I plan on working on next are :\n\n\nImplementing the missing operations for object arrays\nImplementing garbage collection support for object arrays (currently, storing an object inside an ndarray doesn't keep the object alive)\nPackaging NumPyPy on PyPI\n\n\nCheers\n\n\nRomain", + "tags": "", + "url": "https://www.pypy.org/posts/2015/02/numpypy-status-january-2015-5092986229783279944.html" + }, + { + "title": "PyPy 2.5.0 released", + "text": "PyPy 2.5.0 - Pincushion Protea\nWe\u2019re pleased to announce PyPy 2.5, which contains significant performance\nenhancements and bug fixes.\nYou can download the PyPy 2.5.0 release here:\n\n\nhttps://pypy.org/download.html\n\nWe would like to thank our donors for the continued support of the PyPy\nproject, and for those who donate to our three sub-projects, as well as our\nvolunteers and contributors (10 new committers joined PyPy since the last\nrelease).\nWe\u2019ve shown quite a bit of progress, but we\u2019re slowly running out of funds.\nPlease consider donating more, or even better convince your employer to donate,\nso we can finish those projects! 
The three sub-projects are:\n\n\n\nPy3k (supporting Python 3.x): We have released a Python 3.2.5 compatible version\n\nwe call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version\n\n\n\nSTM (software transactional memory): We have released a first working version,\nand continue to try out new promising paths of achieving a fast multithreaded Python\n\n\nNumPy which requires installation of our fork of upstream numpy,\navailable on bitbucket\n\n\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. It\u2019s fast (pypy and cpython 2.7.x performance comparison)\ndue to its integrated tracing JIT compiler.\nThis release supports x86 machines on most common operating systems\n(Linux 32/64, Mac OS X 64, Windows, and OpenBSD),\nas well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.\nWhile we support 32 bit python on Windows, work on the native Windows 64\nbit python is still stalling, we would welcome a volunteer\nto handle that.\n\n\nHighlights\n\nThe past months have seen pypy mature and grow, as rpython becomes the goto\nsolution for writing fast dynamic language interpreters. Our separation of\nrpython and the python interpreter PyPy is now much clearer in the\nPyPy documentation and we now have separate RPython documentation.\nWe have improved warmup time as well as jitted code performance: more than 10%\ncompared to pypy-2.4.0.\nWe no longer zero-out memory allocated in the gc nursery by default, work that\nwas started during a GSoC.\nPassing objects between C and PyPy has been improved. We are now able to pass\nraw pointers to C (without copying) using pinning. This improves I/O;\nbenchmarks that use networking intensively improved by about 50%. File()\noperations still need some refactoring but are already showing a 20%\nimprovement on our benchmarks. 
Let us know if you see similar improvements.\nOur integrated numpy support gained much of the GenericUfunc api in order to\nsupport the lapack/blas linalg module of numpy. This dovetails with work in the\npypy/numpy repository to support linalg both through the (slower) cpyext capi\ninterface and also via (the faster) pure python cffi interface, using an\nextended frompyfunc() api. We will soon post a separate blog post specifically\nabout linalg and PyPy.\nDictionaries are now ordered by default, see the blog post\nOur nightly translations use --shared by default, including on OS/X and linux\nWe now more carefully handle errno (and GetLastError, WSAGetLastError) tying\nthe handlers as close as possible to the external function call, in non-jitted\nas well as jitted code.\nIssues reported with our previous release were resolved after reports from users on\nour issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at\n#pypy.\n\nWe have further improvements on the way: rpython file handling,\nfinishing numpy linalg compatibility, numpy object dtypes, a better profiler,\nas well as support for Python stdlib 2.7.9.\nPlease try it out and let us know what you think. We especially welcome\nsuccess stories, we know you are using PyPy, please tell us about it!\nCheers\nThe PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2015/02/pypy-250-released-247160062953533060.html" + }, + { + "title": "Faster, more memory efficient and more ordered dictionaries on PyPy", + "text": "Hello everyone!\nAs of today, we merged the latest branch that brings better dictionaries to PyPy by default. 
The work is based on an idea by Raymond Hettinger on python-dev, with prior work done notably in Java.\u00a0 It was done by Maciej Fija\u0142kowski and Armin Rigo, with Laurence Tratt recently prodding us to finish it.\u00a0 (Earlier work going in a similar direction include Alex Gaynor's work on ordered dicts in Topaz, which was also used in the Hippy VM.\u00a0 Each of these pieces of work is itself based on the original dict implementation in RPython, whose origins fade in the Subversion prehistory of PyPy.)\u00a0 Coincidentally, a very similar idea has been implemented in Zend PHP very recently. Zend implementation description.\nThis post covers the basics of design and implementation as well as some basic benchmarks.\n\n\nDictionaries are now ordered!\nOne surprising part is that the new design, besides being more\nmemory efficient, is ordered by design: it preserves the\ninsertion order.\u00a0 This is not forbidden by the Python language, which allows any order.\u00a0 It makes the collections.OrderedDict subclass much faster than before: it is now a thin subclass of dict.\u00a0 Obviously, we recommend that any portable Python program continues to use OrderedDict when ordering is important.\u00a0 Note that a non-portable program might rely on more: for example, a **keywords argument now receives the keywords in the same order as the one in which they were given in the call.\u00a0 (Whether such a thing might be called a language design change or not is a bit borderline.)\u00a0 The point is that Python programs that work on CPython or previous versions of PyPy should continue to work on PyPy.\nThere is one exception, though.\u00a0 The iterators of the OrderedDict subclass are now working just like the ones of the dict builtin: they will raise RuntimeError when iterating if the dictionary was modified.\u00a0 In the CPython design, the class OrderedDict explicitly doesn't worry about that, and instead you get some result that might range from correct to incorrect 
to crashes (i.e. random Python exceptions).\n\n\nOriginal PyPy dictionary design\nOriginally, PyPy dictionaries, as well as CPython dictionaries\nare implemented as follows (simplified view):\n\nstruct dict {\n long num_items;\n dict_entry* items;\u00a0\u00a0 /* pointer to array */\n}\n\nstruct dict_entry {\n long hash;\n PyObject* key;\n PyObject* value;\n}\n\nWhere items is a sparse array, with 1/3 to 1/2 of the items being NULL.\nThe average space occupied by a dictionary is 3 * WORD * 12/7 plus some small constant (the smallest dict has 8 entries, which is\n8 * 3 * WORD + 2 * WORD = 26 WORDs).\n\n\nNew PyPy dictionary design\nThe new PyPy dictionary is split in two arrays:\n\nstruct dict {\n long num_items;\n variable_int *sparse_array;\n dict_entry* compact_array;\n}\n\nstruct dict_entry {\n long hash;\n PyObject *key;\n PyObject *value;\n}\n\nHere, compact_array stores all the items in order of insertion, while sparse_array is a 1/2 to 2/3 full array of integers. The integers themselves are of the smallest size necessary for indexing the compact_array. So if compact_array has less than 256 items, then sparse_array will be made of bytes; if less than 2^16, it'll be two-byte integers; and so on.\nThis design saves quite a bit of memory. For example, on 64bit systems we can, but almost never, use indexing of more than 4 billion elements; and for small dicts, the extra sparse_array takes very little space.\u00a0 For example a 100 element dict, would be on average for the original design on 64bit: 100 * 12/7 * WORD * 3 =~ 4100 bytes, while on new design it's 100 * 12/7 + 3 * WORD * 100 =~ 2600 bytes, quite a significant saving.\n\n\nGC friendliness\nThe obvious benefit of having more compact dictionaries is an increased cache friendliness. In modern CPUs cache misses are much more costly than doing additional simple work, like having an additional level of (in-cache) indirection. Additionally, there is a GC benefit coming from it. 
When doing a minor collection, the GC has to visit all the GC fields in old objects that can point to young objects. In the case of large arrays, this can prove problematic since the array grows and with each minor collection we need to visit more and more GC pointers. In order to avoid it, large arrays in PyPy employ a technique called \"card marking\" where the GC only visits \"cards\" or subsets of arrays that were modified between collections. The problem with dictionaries was that by design modifications in a dictionary occur randomly, hence a lot of cards used to get invalidated. In the new design, however, new items are typically appended to the compact_array, hence invalidate much fewer cards --- which improves GC performance.\u00a0 (The new sparse_array is an array of integers, so it does not suffer from the same problems.)\n\n\nDeletion\nDeleting entries from dictionaries is not very common, but important in a few use cases.\u00a0 To preserve order, when we delete an entry, we mark the entry as removed but don't otherwise shuffle the remaining entries.\u00a0 If we repeat this operation often enough, there will be a lot of removed entries in the (originally compact) array.\u00a0 At this point, we need to do a \"packing\" operation, which moves all live entries to the start of the array (and then reindexes the sparse array, as the positions changed).\u00a0 This works well, but there are use cases where previously no reindexing was ever needed, so it makes these cases a bit slower (for example when repeatedly adding and removing keys in equal number).\n\n\nBenchmarks\nThe PyPy speed benchmarks show mostly small effect, see changes. The microbenchmarks that we did show large improvements on large and very large dictionaries (particularly, building dictionaries of at least a couple 100s of items is now twice faster) and break-even on small ones (between 20% slower and 20% faster depending very much on the usage patterns and sizes of dictionaries). 
The new dictionaries enable various optimization possibilities which we're going to explore in the near future.\nCheers,\nfijal, arigo and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2015/01/faster-more-memory-efficient-and-more-4096950404745375390.html" + }, + { + "title": "Leysin Winter Sprint (20-28th February 2015)", + "text": "The next PyPy sprint will be in Leysin, Switzerland, for the tenth time.\nThis is a fully public sprint: newcomers and topics other than those\nproposed below are welcome.\n\nGoals and topics of the sprint\n\nThe details depend on who is here and ready to work. We might touch\ntopics such as:\n\n\ncleaning up the optimization step in the JIT, change the register\nallocation done by the JIT's backend, or improvements to the\nwarm-up time\n\nSTM (Software Transactional Memory), notably: try to come up with\nbenchmarks, and measure them carefully in order to test and improve\nthe conflict reporting tools, and more generally to figure out how\npractical it is in large projects to avoid conflicts\n\nvmprof - a statistical profiler for CPython and PyPy work, including\nmaking it more user friendly.\n\nPy3k (Python 3.x support), NumPyPy (the numpy module)\n\nadded: cffi 1.0, trying out pygame+cffi on Raspberry Pi devices\n\nAnd as usual, the main side goal is to have fun in winter sports :-)\nWe can take a day off for skiing.\n\n\n\n\nExact times\n\nFor a change, and as an attempt to simplify things, I specified the\ndates as 20-28 February 2015, where 20 and 28 are travel days. We will\nwork full days between the 21 and the 27. You are of course allowed to\nshow up for a part of that time only, too.\n\nLocation and Accommodation\n\nLeysin, Switzerland, \"same place as before\". Let me refresh your\nmemory: both the sprint venue and the lodging will be in a very spacious\npair of chalets built specifically for bed & breakfast:\nErmina. The place has a good ADSL Internet connection\nwith wireless installed. 
You can of course arrange your own lodging\nanywhere (as long as you are in Leysin, you cannot be more than a 15\nminutes walk away from the sprint venue), but I definitely recommend\nlodging there too -- you won't find a better view anywhere else (though\nyou probably won't get much worse ones easily, either :-)\n\nPlease confirm that you are coming so that we can adjust the\nreservations as appropriate. In the past, the rates were around 60 CHF a\nnight all included in 2-person rooms, with breakfast. Now, the rooms\navailable are either single-person (or couple), or rooms for 3 persons.\nThe latter choice is recommended and should be under 60 CHF per person.\n\nPlease register by Mercurial, or on the pypy-dev mailing list if you do not yet have check-in rights.\n\nYou need a Swiss-to-(insert country here) power adapter. There will be\nsome Swiss-to-EU adapters around, and at least one EU-format power strip.", + "tags": "", + "url": "https://www.pypy.org/posts/2015/01/leysin-winter-sprint-20-28th-february-2590212640945547308.html" + }, + { + "title": "September donations and thank you to the Python Software Foundation!", + "text": "Hello everyone!\nWe would like to show you a short update on the PyPy funding.\nWe gathered a total of $15,986 in the month of September and as per\nearlier agreement, the Python Software Foundation donated $10,000\nto PyPy. We would like to thank everyone participating and the PSF in\nparticular for supporting the PyPy project and making our work possible!\nWe've been working hard on the goals outlined in the funding proposals.\n\nPyPy Python 3 support has been in beta for a while and it's already\nbeing used by many people, as seen per the number of reported bugs.\nWe're currently supporting 3.2, planning on moving towards 3.4 in the\nfuture.\nSoftware Transactional Memory has been a successful research project,\nwith first real world results shown during the Warsaw sprint.\nMore detailed update on numpy will be published soon. 
A little spoiler is\nthat we're planning on addressing matplotlib, scipy and the larger ecosystem\nto some extent. Stay tuned!\n\nAgain, thanks to everyone who donated and happy Thanksgiving to everyone\non that side of the world!\nCheers,\nfijal and the entire PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2014/11/september-donations-and-thank-you-to-4531550307707104017.html" + }, + { + "title": "Tornado without a GIL on PyPy STM", + "text": "This post is by Konstantin Lopuhin, who tried PyPy STM during the\nWarsaw sprint.\nPython has a GIL, right? Not quite - PyPy STM is a python implementation\nwithout a GIL, so it can scale CPU-bound work to several cores.\nPyPy STM is developed by Armin Rigo and Remi Meier,\nand supported by community donations.\nYou can read more about it in the\ndocs.\nAlthough PyPy STM is still a work in progress, in many cases it can already\nrun CPU-bound code faster than regular PyPy, when using multiple cores.\nHere we will see how to slightly modify Tornado IO loop to use\ntransaction\nmodule.\nThis module is described\nin the docs and is really simple to use - please see an example there.\nAn event loop of Tornado, or any other asynchronous\nweb server, looks like this (with some simplifications):\n\nwhile True:\n for callback in list(self._callbacks):\n self._run_callback(callback)\n event_pairs = self._impl.poll()\n self._events.update(event_pairs)\n while self._events:\n fd, events = self._events.popitem()\n handler = self._handlers[fd]\n self._handle_event(fd, handler, events)\n\nWe get IO events, and run handlers for all of them, these handlers can\nalso register new callbacks, which we run too. When using such a framework,\nit is very nice to have a guaranty that all handlers are run serially,\nso you do not have to put any locks. This is an ideal case for the\ntransaction module - it gives us guaranties that things appear\nto be run serially, so in user code we do not need any locks. 
We just\nneed to change the code above to something like:\n\nwhile True:\n for callback in list(self._callbacks):\n transaction.add( # added\n self._run_callback, callback)\n transaction.run() # added\n event_pairs = self._impl.poll()\n self._events.update(event_pairs)\n while self._events:\n fd, events = self._events.popitem()\n handler = self._handlers[fd]\n transaction.add( # added\n self._handle_event, fd, handler, events)\n transaction.run() # added\n\nThe actual commit is\nhere,\n- we had to extract a little function to run the callback.\n\nPart 1: a simple benchmark: primes\nNow we need a simple benchmark, lets start with\nthis\n- just calculate a list of primes up to the given number, and return it\nas JSON:\n\ndef is_prime(n):\n for i in xrange(2, n):\n if n % i == 0:\n return False\n return True\n\nclass MainHandler(tornado.web.RequestHandler):\n def get(self, num):\n num = int(num)\n primes = [n for n in xrange(2, num + 1) if is_prime(n)]\n self.write({'primes': primes})\n\nWe can benchmark it with siege:\n\nsiege -c 50 -t 20s https://localhost:8888/10000\n\nBut this does not scale. The CPU load is at 101-104 %, and we handle 30 %\nless request per second. The reason for the slowdown is STM overhead,\nwhich needs to keep track of all writes and reads in order to detect conflicts.\nAnd the reason for using only one core is, obviously, conflicts!\nFortunately, we can see what this conflicts are, if we run code like this\n(here 4 is the number of cores to use):\n\nPYPYSTM=stm.log ./primes.py 4\n\nThen we can use print_stm_log.py\nto analyse this log. 
It lists the most expensive conflicts:\n\n14.793s lost in aborts, 0.000s paused (1258x STM_CONTENTION_INEVITABLE)\nFile \"/home/ubuntu/tornado-stm/tornado/tornado/httpserver.py\", line 455, in __init__\n self._start_time = time.time()\nFile \"/home/ubuntu/tornado-stm/tornado/tornado/httpserver.py\", line 455, in __init__\n self._start_time = time.time()\n...\n\nThere are only three kinds of conflicts, they are described in\nstm source,\nHere we see that two threads call into external function to get current time,\nand we can not rollback any of them, so one of them must wait till the other\ntransaction finishes.\nFor now we can hack around this by disabling this timing - this is only\nneeded for internal profiling in tornado.\nIf we do it, we get the following results (but see caveats below):\n\n\n\n\n\n\n\n\nImpl.\nreq/s\n\n\n\nPyPy 2.4\n14.4\n\nCPython\u00a02.7\n3.2\n\nPyPy-STM 1\n9.3\n\nPyPy-STM 2\n16.4\n\nPyPy-STM 3\n20.4\n\nPyPy\u00a0STM\u00a04\n24.2\n\n\n\n\n\u00a0\u00a0\u00a0\n\n\n\nAs we can see, in this benchmark PyPy STM using just two cores\ncan beat regular PyPy!\nThis is not linear scaling, there are still conflicts left, and this\nis a very simple example but still, it works!\nBut its not that simple yet :)\nFirst, these are best-case numbers after long (much longer than for regular\nPyPy) warmup. Second, it can sometimes crash (although removing old pyc files\nfixes it). Third, benchmark meta-parameters are also tuned.\nHere we get relatively good results only when there are a lot of concurrent\nclients - as a results, a lot of requests pile up, the server is not keeping\nwith the load, and transaction module is busy with work running this piled up\nrequests. 
If we decrease the number of concurrent clients, results get slightly worse.\nAnother thing we can tune is how heavy is each request - again, if we ask\nprimes up to a lower number, then less time is spent doing calculations,\nmore time is spent in tornado, and results get much worse.\nBesides the time.time() conflict described above, there are a lot of others.\nThe bulk of time is lost in these two conflicts:\n\n14.153s lost in aborts, 0.000s paused (270x STM_CONTENTION_INEVITABLE)\nFile \"/home/ubuntu/tornado-stm/tornado/tornado/web.py\", line 1082, in compute_etag\n hasher = hashlib.sha1()\nFile \"/home/ubuntu/tornado-stm/tornado/tornado/web.py\", line 1082, in compute_etag\n hasher = hashlib.sha1()\n\n13.484s lost in aborts, 0.000s paused (130x STM_CONTENTION_WRITE_READ)\nFile \"/home/ubuntu/pypy/lib_pypy/transaction.py\", line 164, in _run_thread\n got_exception)\n\nThe first one is presumably calling into some C function from stdlib, and we get\nthe same conflict as for time.time() above, but is can be fixed on PyPy\nside, as we can be sure that computing sha1 is pure.\nIt is easy to hack around this one too, just removing etag support, but if\nwe do it, performance is much worse, only slightly faster than regular PyPy,\nwith the top conflict being:\n\n83.066s lost in aborts, 0.000s paused (459x STM_CONTENTION_WRITE_WRITE)\nFile \"/home/arigo/hg/pypy/stmgc-c7/lib-python/2.7/_weakrefset.py\", line 70, in __contains__\nFile \"/home/arigo/hg/pypy/stmgc-c7/lib-python/2.7/_weakrefset.py\", line 70, in __contains__\n\nComment by Armin: It is unclear why this happens so far. 
We'll investigate...\nThe second conflict (without etag tweaks) originates\nin the transaction module, from this piece of code:\n\nwhile True:\n self._do_it(self._grab_next_thing_to_do(tloc_pending),\n got_exception)\n counter[0] += 1\n\nComment by Armin: This is a conflict in the transaction module itself; ideally,\nit shouldn't have any, but in order to do that we might need a little bit\nof support from RPython or C code. So this is pending improvement.\nTornado modification used in this blog post is based on 3.2.dev2.\nAs of now, the latest version is 4.0.2, and if we\napply\nthe same changes to this version, then we no longer get any scaling on this benchmark,\nand there are no conflicts that take any substantial time.\nComment by Armin: There are two possible reactions to a conflict. We can either\nabort one of the two threads, or (depending on the circumstances) just\npause the current thread until the other one commits, after which the\nthread will likely be able to continue. The tool ``print_stm_log.py``\ndid not report conflicts that cause pauses. It has been fixed very\nrecently. Chances are that on this test it would report long pauses and\npoint to locations that cause them.\n\n\nPart 2: a more interesting benchmark: A-star\nAlthough we have seen that PyPy STM is not all moonlight and roses,\nit is interesting to see how it works on a more realistic application.\nastar.py\nis a simple game where several players move on a map\n(represented as a list of lists of integers),\nbuild and destroy walls, and ask server to give them\nshortest paths between two points\nusing A-star search, adopted from ActiveState recipie.\nThe benchmark bench_astar.py\nis simulating players, and tries to put the main load on A-star search,\nbut also does some wall building and destruction. 
There are no locks\naround map modifications, as normal tornado is executing all callbacks\nserially, and we can keep this guaranty with atomic blocks of PyPy STM.\nThis is also an example of a program that is not trivial\nto scale to multiple cores with separate processes (assuming\nmore interesting shared state and logic).\nThis benchmark is very noisy due to randomness of client interactions\n(also it could be not linear), so just lower and upper bounds for\nnumber of requests are reported\n\n\n\n\n\n\nImpl.\nreq/s\n\n\n\nPyPy 2.4\n5 .. 7\n\nCPython 2.7\n0.5 .. 0.9\n\nPyPy-STM 1\n2 .. 4\n\nPyPy STM 4\n2 .. 6\n\n\n\nClearly this is a very bad benchmark, but still we can see that scaling is worse\nand STM overhead is sometimes higher.\nThe bulk of conflicts come from the transaction module (we have seen it\nabove):\n\n91.655s lost in aborts, 0.000s paused (249x STM_CONTENTION_WRITE_READ)\nFile \"/home/ubuntu/pypy/lib_pypy/transaction.py\", line 164, in _run_thread\n got_exception)\n\nAlthough it is definitely not ready for production use, you can already try\nto run things, report bugs, and see what is missing in user-facing tools\nand libraries.\nBenchmarks setup:\n\nAmazon c3.xlarge (4 cores) running Ubuntu 14.04\npypy-c-r74011-stm-jit for the primes benchmark (but it has more bugs\nthan more recent versions), and\npypy-c-r74378-74379-stm-jit\nfor astar benchmark (put it inside pypy source checkout at 38c9afbd253c)\nhttps://bitbucket.org/kostialopuhin/tornado-stm-bench at 65144cda7a1f", + "tags": "", + "url": "https://www.pypy.org/posts/2014/11/tornado-without-gil-on-pypy-stm-7284102716557557428.html" + }, + { + "title": "PyPy IO improvements", + "text": "Hello everyone!\nWe've wrapped up the Warsaw sprint, so I would like to describe some\nbranches which have been recently merged and which improved the I/O and the\nGC: gc_no_cleanup_nursery and gc-incminimark-pinning.\nThe first branch was started by Wenzhu Man for her Google Summer of Code\nand finished by 
Maciej Fija\u0142kowski and Armin Rigo.\nThe PyPy GC works by allocating new objects in the young object\narea (the nursery), simply by incrementing a pointer. After each minor\ncollection, the nursery has to be cleaned up. For simplicity, the GC used\nto do it by zeroing the whole nursery.\nThis approach has bad effects on the cache, since you zero a large piece of\nmemory at once and do unnecessary work for things that don't require zeroing\nlike large strings. We mitigated the first problem somewhat with incremental\nnursery zeroing, but this branch removes the zeroing completely, thus\nimproving the string handling and recursive code (since jitframes don't\nrequires zeroed memory either). I measured the effect on two examples:\na recursive implementation of fibonacci and gcbench,\nto measure GC performance.\nThe results for fibonacci and gcbench are below (normalized to cpython\n2.7). Benchmarks were run 50 times each (note that the big standard\ndeviation comes mostly from the warmup at the beginning, true figures\nare smaller):\n\n\n\n\n\n\n\n\n\nbenchmark\nCPython\nPyPy 2.4\nPyPy non-zero\n\nfibonacci\n4.8+-0.15 (1.0x)\n0.59+-0.07 (8.1x)\n0.45+-0.07 (10.6x)\n\ngcbench\n22+-0.36 (1.0x)\n1.34+-0.28 (16.4x)\n1.02+-0.15 (21.6x)\n\n\n\n\nThe second branch was done by Gregor Wegberg for his master thesis and finished\nby Maciej Fija\u0142kowski and Armin Rigo. Because of the way it works, the PyPy GC from\ntime to time moves the objects in memory, meaning that their address can change.\nTherefore, if you want to pass pointers to some external C function (for\nexample, write(2) or read(2)), you need to ensure that the objects they are\npointing to will not be moved by the GC (e.g. 
when running a different thread).\nPyPy up to 2.4 solves the problem by copying the data into or from a non-movable buffer, which\nis obviously inefficient.\nThe branch introduce the concept of \"pinning\", which allows us to inform the\nGC that it is not allowed to move a certain object for a short period of time.\nThis introduces a bit of extra complexity\nin the garbage collector, but improves the I/O performance quite drastically,\nbecause we no longer need the extra copy to and from the non-movable buffers.\nIn this benchmark, which does I/O in a loop,\nwe either write a number of bytes from a freshly allocated string into\n/dev/null or read a number of bytes from /dev/full. I'm showing the results\nfor PyPy 2.4, PyPy with non-zero-nursery and PyPy with non-zero-nursery and\nobject pinning. Those are wall times for cases using os.read/os.write\nand file.read/file.write, normalized against CPython 2.7.\nBenchmarks were done using PyPy 2.4 and revisions 85646d1d07fb for\nnon-zero-nursery and 3d8fe96dc4d9 for non-zero-nursery and pinning.\nThe benchmarks were run once, since the standard deviation was small.\n\n\n\nThe Y axis is speed, normalized to CPython, the more the better\n\nWhat we can see is that os.read and os.write both improved greatly\nand outperforms CPython now for each combination. file operations are\na little more tricky, and while those branches improved the situation a bit,\nthe improvement is not as drastic as in os versions. It really should not\nbe the case and it showcases how our file buffering is inferior to CPython.\nWe plan on removing our own buffering and using FILE* in C in the near future,\nso we should outperform CPython on those too (since our allocations are cheaper).\nIf you look carefully in the benchmark, the write function is copied three times.\nThis hack is intended to avoid JIT overspecializing the assembler code, which happens\nbecause the buffering code was written way before the JIT was done. 
In fact, our buffering\nis hilariously bad, but if stars align correctly it can be JIT-compiled to something\nthat's not half bad. Try removing the hack and seeing how the performance of the last\nbenchmark drops :-) Again, this hack should be absolutely unnecessary once we remove\nour own buffering, stay tuned for more.\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2014/11/pypy-io-improvements-1042070332447047674.html" + }, + { + "title": "PyPy3 2.4.0 released", + "text": "We're pleased to announce the availability of PyPy3 2.4.0!\n\nThis release contains several bugfixes and enhancements. Among the user-facing improvements specific to PyPy3:\nBetter Windows compatibility, e.g. the nt module functions _getfinalpathname\u00a0& _getfileinformation are now supported (the former is required for the popular pathlib library for example)\nVarious fsencode PEP 383 related fixes to the posix module (readlink, uname,\u00a0ttyname and ctermid) and improved locale handling\nSwitched the default binary name on POSIX distributions from 'pypy' to 'pypy3' (which symlinks to 'pypy3.2')\nFixed a couple of different crashes related to parsing Python 3 source code\n\nAnd improvements shared with the recent PyPy 2.4.0 release:\ninternal refactoring in string and GIL handling which led to significant speedups\nimproved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. In simpler terms - we closed what looked like a memory leak\nWindows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i\nMany issues were resolved since the 2.3.1 release in June\n\nYou can download PyPy3 2.4.0 here https://pypy.org/download.html.\n\nPyPy\u00a0is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and 3.2.5. 
It's fast (pypy 2.4 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\n\nThis release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.\u00a0 \nWe would like to thank our donors for the continued support of the PyPy project.\n\nThe complete release notice is here.\n\nPlease try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!\n\nCheers, The PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2014/10/pypy3-240-released-5007750685927360190.html" + }, + { + "title": "Couchbase contribution to PyPy", + "text": "Hello everyone!\nWe always offer to put on the blog info about our sponsors who donate substantial amounts of money. So far most people decided to stay anonymous, so this is the first blog post describing our sponsor and his relationship to PyPy, hopefully not the last. We'll also publish a full blog post about the PSF-matched fundraiser soon. This is a guest post by Brent Woodruff from Couchbase.\n\n\n\n\nCouchbase is a leading NoSQL document database that provides a flexible data model, high performance, scalability, and high availability. Couchbase is a commercially supported open source project. Visit us at https://www.couchbase.com and https://github.com/couchbase.\n\n\nCouchbase Inc. donated $2000.00, and employees of Couchbase personally contributed a disclosed additional $230.00, towards Pypy progress during the September funding drive. These funds will see a match from the Python Software Foundation.\n\nPypy is primarily used by Couchbase employees to perform product analysis and troubleshooting using internally developed tools. 
Every customer of Couchbase benefits from the use of Pypy; both due to the rapid development provided by Python, and the speed of the resulting tools provided by the Pypy JIT interpreter.\n\n\u201cPyPy is great - it gave us a 4x speedup in our CPU-intensive internal application over CPython\u201d\n-Dave Rigby and Daniel Owen, Couchbase Engineers\n\n\nAdditionally, Couchbase has a preliminary CFFI based Couchbase client available for Pypy users.", + "tags": "sponsors", + "url": "https://www.pypy.org/posts/2014/10/couchbase-contribution-to-pypy-2360892117372790069.html" + }, + { + "title": "PyPy 2.4.0 released, 9 days left in funding drive", + "text": "We're pleased to announce the availability of PyPy 2.4.0; faster, fewer bugs, and updated to the python 2.7.8 stdlib.\n\nThis release contains several bugfixes and enhancements. Among the user-facing improvements:\n\ninternal refactoring in string and GIL handling which led to significant speedups\nimproved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. In simpler terms - we closed what looked like a memory leak\nWindows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i\nMany issues were resolved since the 2.3.1 release in June \n\n\nYou can download PyPy 2.4.0 here https://pypy.org/download.html.\n\nWe would like to also point out that in September, the Python Software Foundation will match funds for any donations up to $10k, so head over to our website and help this mostly-volunteer effort out.\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and 3.2.5. It's fast (pypy 2.4 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler. 
\n\nThis release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.\u00a0 \nWe would like to thank our donors for the continued support of the PyPy project.\n\nThe complete release notice is here.\n\nPlease try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!\n\nCheers, The PyPy Team", + "tags": "release", + "url": "https://www.pypy.org/posts/2014/09/pypy-240-released-9-days-left-in-7722154416024407111.html" + }, + { + "title": "PyPy 2.4-beta just in time for PSF's funding drive", + "text": "We're pleased to announce the availability of PyPy 2.4-beta1; faster, fewer bugs, and updated to the python 2.7.8 stdlib.\n\nThis release contains several bugfixes and enhancements. Among the user-facing improvements:\n\ninternal refactoring in string and GIL handling which led to significant speedups\nimproved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. In simpler terms - we closed what looked like a memory leak\nWindows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i\nMany issues were resolved since the 2.3.1 release in June \n\n\nYou can download the PyPy 2.4-beta1 release here https://pypy.org/download.html.\n\nWe would like to also point out that in\nSeptember, the Python Software Foundation will match funds for\nany donations up to $10k, so head over to our website and help this mostly-volunteer effort out.\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and 3.2.5. 
It's fast (pypy 2.4 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\n\nThis\n release supports x86 machines running Linux 32/64, Mac OS X 64, \nWindows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, \nwith VFPv3) running Linux.\u00a0 \nWe would like to thank our donors for the continued support of the PyPy project.\n\nThe complete release notice is here.\n\nPlease\n try it out and let us know what you think. We especially welcome \nsuccess stories, please tell us about how it has helped you!\n\nCheers, The PyPy Team\n\nNews Flash from the beta release cycle:\n\nNote that the beta release mistakenly identifies itself in sys.pypy_version_info as releaselevel=='final', please do not mistake this for a final version\nThe beta can hit a \"Illegal instruction\" exception in jitted code on ARMv6 processors like the RaspberryPi. This will be fixed for the release.", + "tags": "", + "url": "https://www.pypy.org/posts/2014/09/pypy-24-beta-just-in-time-for-psfs-5956090195665204063.html" + }, + { + "title": "Python Software Foundation Matching Donations this Month", + "text": "We're extremely excited to announce that for the month of September, any amount\nyou donate to PyPy will be match (up to $10,000) by the Python Software\nFoundation.This includes any of our ongoing fundraisers: NumPyPy, STM, Python3, or our\ngeneral fundraising.Here are some of the things your previous donations have helped accomplish:Getting PyPy3 completed (currently 3.2, with 3.3 work underway)\nNew research and production engineering on STM for PyPy\nLots of progress on NumPy for PyPy\nSignificant performance improvements\nYou can see a preview of what's coming in our next 2.4 release in the draft\nrelease notes.Thank you to all the individuals and companies which have donated so far.So please, donate today: https://pypy.org/(Please be aware that the donation progress bars are not live updating, so\ndon't be afraid if your donation doesn't show up 
immediately).", + "tags": "", + "url": "https://www.pypy.org/posts/2014/09/python-software-foundation-matching-2230529993193139046.html" + }, + { + "title": "A Field Test of Software Transactional Memory Using the RSqueak Smalltalk VM", + "text": "Extending the Smalltalk RSqueakVM with STM\nby Conrad Calmez, Hubert Hesse, Patrick Rein and Malte Swart supervised by Tim Felgentreff and Tobias Pape\n\nIntroduction\nAfter pypy-stm we can announce that through the RSqueakVM (which used to be called SPyVM) a second VM implementation supports software transactional memory. RSqueakVM is a Smalltalk implementation based on the RPython toolchain. We have added STM support based on the STM tools from RPython (rstm). The benchmarks indicate that linear scale up is possible, however in some situations the STM overhead limits speedup.\nThe work was done as a master's project at the Software Architecture Group of Professor Robert Hirschfeld at the Hasso Plattner Institut at the University of Potsdam. We - four students - worked about one and a half days per week for four months on the topic. The RSqueakVM was originally developed during a sprint at the University of Bern. When we started the project we were new to the topic of building VMs / interpreters.\nWe would like to thank Armin, Remi and the #pypy IRC channel who supported us over the course of our project. We would also like to thank Toni Mattis and Eric Seckler, who have provided us with an initial code base.\n\nIntroduction to RSqueakVM\nAs the original Smalltalk implementation, the RSqueakVM executes a given Squeak Smalltalk image, containing the Smalltalk code and a snapshot of formerly created objects and active execution contexts. These execution contexts are scheduled inside the image (greenlets) and not mapped to OS threads. Thereby the non-STM RSqueakVM runs on only one OS thread.\n\nChanges to RSqueakVM\nThe core adjustments to support STM were inside the VM and transparent from the view of a Smalltalk user. 
Additionally we added Smalltalk code to influence the behavior of the STM. As the RSqueakVM has run in one OS thread so far, we added the capability to start OS threads. Essentially, we added an additional way to launch a new Smalltalk execution context (thread). But in contrast to the original one this one creates a new native OS thread, not a Smalltalk internal green thread.\n\nSTM (with automatic transaction boundaries) already solves the problem of concurrent access on one value as this is protected by the STM transactions (to be more precise one instruction). But there are cases where the application relies on the fact that a bigger group of changes is executed either completely or not at all (atomic). Without further information transaction borders could be in the middle of such a set of atomic statements. rstm allows aggregating multiple statements into one higher level transaction. To let the application mark the beginning and the end of these atomic blocks (high-level transactions), we added two more STM specific extensions to Smalltalk.\n\n\nBenchmarks\nRSqueak was executed in a single OS thread so far. rstm enables us to execute the VM using several OS threads. Using OS threads we expected a speed-up in benchmarks which use multiple threads. We measured this speed-up by using two benchmarks: a simple parallel summation where each thread sums up a predefined interval and an implementation of Mandelbrot where each thread computes a range of predefined lines.\n\nTo assess the speed-up, we used one RSqueakVM compiled with rstm enabled, but once running the benchmarks with OS threads and once with Smalltalk green threads. The workload always remained the same and only the number of threads increased. To assess the overhead imposed by the STM transformation we also ran the green threads version on an unmodified RSqueakVM. All VMs were translated with the JIT optimization and all benchmarks were run once before the measurement to warm up the JIT. 
As the JIT optimization is working it is likely to be adoped by VM creators (the baseline RSqueakVM did that) so that results with this optimization are more relevant in practice than those without it. We measured the execution time by getting the system time in Squeak. The results are:\n\nParallel Sum Ten Million\n\n\n\n\n\n\nBenchmark Parallel Sum 10,000,000\n\n\n Thread Count RSqueak green threads RSqueak/STM green threads RSqueak/STM OS threads Slow down from RSqueak green threads to RSqueak/STM green threads Speed up from RSqueak/STM green threads to RSQueak/STM OS Threads \n \n 1 168.0 ms 240.0 ms 290.9 ms 0.70 0.83 \n 2 167.0 ms 244.0 ms 246.1 ms 0.68 0.99 \n 4 167.8 ms 240.7 ms 366.7 ms 0.70 0.66 \n 8 168.1 ms 241.1 ms 757.0 ms 0.70 0.32 \n 16 168.5 ms 244.5 ms 1460.0 ms 0.69 0.17 \n \n\n\n\nParallel Sum One Billion\n\n\n\n\n\n\nBenchmark Parallel Sum 1,000,000,000\n\n\n\nThread CountRSqueak green threadsRSqueak/STM green threadsRSqueak/STM OS threadsSlow down from RSqueak green threads to RSqueak/STM green threadsSpeed up from RSqueak/STM green threads to RSQueak/STM OS Threads\n\n 1 16831.0 ms 24111.0 ms 23346.0 ms 0.70 1.03 \n 2 17059.9 ms 24229.4 ms 16102.1 ms 0.70 1.50 \n 4 16959.9 ms 24365.6 ms 12099.5 ms 0.70 2.01 \n 8 16758.4 ms 24228.1 ms 14076.9 ms 0.69 1.72 \n 16 16748.7 ms 24266.6 ms 55502.9 ms 0.69 0.44 \n\n\n\n\n\nMandelbrot Iterative\n\n\n\n\n\n\nBenchmark Mandelbrot\n\n\n Thread Count RSqueak green threads RSqueak/STM green threads RSqueak/STM OS threads Slow down from RSqueak green threads to RSqueak/STM green threads Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads \n \n 1 724.0 ms 983.0 ms 1565.5 ms 0.74 0.63 \n 2 780.5 ms 973.5 ms 5555.0 ms 0.80 0.18 \n 4 781.0 ms 982.5 ms 20107.5 ms 0.79 0.05 \n 8 779.5 ms 980.0 ms 113067.0 ms 0.80 0.01\n\n\n\n\n\nDiscussion of benchmark results\nFirst of all, the ParallelSum benchmarks show that the parallelism is actually paying off, at least for sufficiently large embarrassingly 
parallel problems. Thus RSqueak can also benefit from rstm.\nOn the other hand, our Mandelbrot implementation shows the limits of our current rstm integration. We implemented two versions of the algorithm one using one low-level array and one using two nested collections. In both versions, one job only calculates a distinct range of rows and both lead to a slowdown. The summary of the state of rstm transactions shows that there are a lot of inevitable transactions (transactions which must be completed). One reason might be the interactions between the VM and its low-level extensions, so called plugins. We have to investigate this further.\n\nLimitations\nAlthough the current VM setup is working well enough to support our benchmarks, the VM still has limitations. First of all, as it is based on rstm, it has the current limitation of only running on 64-bit Linux.\nBesides this, we also have two major limitations regarding the VM itself. First, the atomic interface exposed in Smalltalk is currently not working, when the VM is compiled using the just-in-time compiler transformation. Simple examples such as concurrent parallel sum work fine while more complex benchmarks such as chameneos fail. The reasons for this are currently beyond our understanding. Second, Smalltalk supports green threads, which are threads which are managed by the VM and are not mapped to OS threads. We currently support starting new Smalltalk threads as OS threads instead of starting them as green threads. However, existing threads in a Smalltalk image are not migrated to OS threads, but remain running as green threads.\n\nFuture work for STM in RSqueak\nThe work we presented showed interesting problems, we propose the following problem statements for further analysis:\n\nInevitable transactions in benchmarks. 
This looks like it could limit other applications too so it should be solved.\nCollection implementation aware of STM: The current implementation of collections can cause a lot of STM collisions due to their internal memory structure. We believe it could bear potential for performance improvements, if we replace these collections in an STM enabled interpreter with implementations with fewer STM collisions. As already proposed by Remi Meier, bags, sets and lists are of particular interest.\nFinally, we exposed STM through language features such as the atomic method, which is provided through the VM. Originally, it was possible to model STM transaction barriers implicitly by using clever locks, now it's exposed via the atomic keyword. From a language design point of view, the question arises whether this is a good solution and what features an stm-enabled interpreter must provide to the user in general? Of particular interest are for example, access to the transaction length and hints for transaction borders and their performance impact.\n\n\n\nDetails for the technically inclined\n\nAdjustments to the interpreter loop were minimal.\nSTM works on bytecode granularity; that means there is an implicit transaction border after every bytecode executed. Possible alternatives: only break transactions after certain bytecodes, break transactions on one abstraction layer above, e.g. object methods (setter, getter).\nrstm calls were exposed using primitives (a way to expose native code in Smalltalk), this was mainly used for atomic.\nStarting and stopping OS threads is exposed via primitives as well. Threads are started from within the interpreter.\nFor Smalltalk enabled STM code we currently have different image versions. 
However another way to add, load and replace code to the Smalltalk code base is required to make a switch between STM and non-STM code simple.\n\n\n\nDetails on the project setup\nFrom a non-technical perspective, a problem we encountered was the huge roundtrip times (on our machines up to 600s, 900s with JIT enabled). This led to a tendency of bigger code changes (\"Before we compile, let's also add this\"), lost flow (\"What were we doing before?\") and different compiled interpreters in parallel testing (\"How is this version different from the others?\"). As a consequence it was harder to test and correct errors. While this is not as much of a problem for other RPython VMs, RSqueakVM needs to execute the entire image, which makes running it untranslated even slower.\n\nSummary\nThe benchmarks show that speed up is possible, but also that the STM overhead in some situations can eat up the speedup. The resulting STM-enabled VM still has some limitations: As rstm is currently only running on 64-bit Linux the RSqueakVM is doing so as well. Even though it is possible for us now to create new threads that map to OS threads within the VM, the migration of existing Smalltalk threads keeps being problematic.\nWe showed that an existing VM code base can benefit from STM in terms of scaling up. Further it was relatively easy to enable STM support. This may also be valuable to VM developers considering to get STM support for their VMs.", + "tags": "Smalltalk,Squeak,stm", + "url": "https://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.html" + }, + { + "title": "PyPy-STM: first \"interesting\" release", + "text": "Hi all,\n\nPyPy-STM is now reaching a point where we can say it's good enough to be\na GIL-less Python. 
(We don't guarantee there are no more bugs, so please\nreport them :-) The first official STM release:\n\n\npypy-stm-2.3-r2-linux64\n(UPDATE: this is release r2, fixing a systematic segfault at start-up on some systems)\n\n\nThis corresponds roughly to PyPy 2.3 (not 2.3.1). It requires 64-bit\nLinux. More precisely, this release is built for Ubuntu 12.04 to 14.04;\nyou can also rebuild it\nfrom source by getting the branch stmgc-c7. You need\nclang to compile, and you need a patched\nversion of llvm.\n\nThis version's performance can reasonably be compared with a regular\nPyPy, where both include the JIT. Thanks for following the meandering progress of PyPy-STM over the past three years --- we're finally getting somewhere really interesting! We cannot thank enough all contributors to the previous PyPy-STM money pot that made this possible. And, although this blog post is focused on the results from that period of time, I have of course to remind you that we're running a second call for donation for future work, which I will briefly mention again later.\n\nA recap of what we did to get there: around the start of the year we found a new model, a \"redo-log\"-based STM which uses a couple of hardware tricks to not require chasing pointers, giving it (in this context) exceptionally cheap read barriers. This idea was developed over the following months and (relatively) easily integrated with the JIT compiler. The most recent improvements on the Garbage Collection side are closing the gap with a regular PyPy (there is still a bit more to do there). There is some preliminary user documentation.\n\nToday, the result of this is a PyPy-STM that is capable of running pure Python code on multiple threads in parallel, as we will show in the benchmarks that follow. A quick warning: this is only about pure Python code. 
We didn't try so far to optimize the case where most of the time is spent in external libraries, or even manipulating \"raw\" memory like array.array or numpy arrays. To some extent there is no point because the approach of CPython works well for this case, i.e. releasing the GIL around the long-running operations in C. Of course it would be nice if such cases worked as well in PyPy-STM --- which they do to some extent; but checking and optimizing that is future work.\n\nAs a starting point for our benchmarks, when running code that\nonly uses one thread, we get a slow-down between 1.2 and 3: at worst,\nthree times as slow; at best only 20% slower than a regular\nPyPy. This worst case has been brought down --it used to be 10x-- by\nrecent work on \"card marking\", a useful GC technique that is also\npresent in the regular PyPy (and about which I don't find any blog post;\nmaybe we should write one :-) The main remaining issue is fork(), or\nany function that creates subprocesses: it works, but is very slow. To\nremind you of this fact, it prints a line to stderr when used.\n\nNow the real main part: when you run multithreaded code, it scales very nicely with two\nthreads, and less-than-linearly but still not badly with three or four\nthreads. Here is an artificial example:\n\n total = 0\n lst1 = [\"foo\"]\n for i in range(100000000):\n lst1.append(i)\n total += lst1.pop()\n\nWe run this code N times, once in each of N threads\n(full\nbenchmark). Run times, best of three:\n\n\n\nNumber of threads\n Regular PyPy (head)\n PyPy-STM\nN = 1\n real 0.92s \nuser+sys 0.92s\n real 1.34s \nuser+sys 1.34s\nN = 2\n real 1.77s \nuser+sys 1.74s\n real 1.39s \nuser+sys 2.47s\nN = 3\n real 2.57s \nuser+sys 2.56s\n real 1.58s \nuser+sys 4.106s\nN = 4\n real 3.38s \nuser+sys 3.38s\n real 1.64s \nuser+sys 5.35s\n\n\n(The \"real\" time is the wall clock time. The \"user+sys\" time is the\nrecorded CPU time, which can be larger than the wall clock time if\nmultiple CPUs run in parallel. 
This was run on a 4x2 cores machine.\nFor direct comparison, avoid loops that are so trivial\nthat the JIT can remove all allocations from them: right now\nPyPy-STM does not handle this case well. It has to force a dummy allocation\nin such loops, which makes minor collections occur much more frequently.)\n\nFour threads is the limit so far: only four threads can be executed in\nparallel. Similarly, the memory usage is limited to 2.5 GB of GC\nobjects. These two limitations are not hard to increase, but at least\nincreasing the memory limit requires fighting against more LLVM bugs.\n(Include here snark remarks about LLVM.)\n\nHere are some measurements from more real-world benchmarks. This time,\nthe amount of work is fixed and we parallelize it on T threads. The first benchmark is just running translate.py on a trunk PyPy. The last\nthree benchmarks are here.\n\n\n\nBenchmark\n PyPy 2.3\n (PyPy head)\n PyPy-STM, T=1\n T=2\n T=3\n T=4\ntranslate.py --no-allworkingmodules\n(annotation step)\n 184s\n (170s)\n 386s (2.10x)\n n/a\nmultithread-richards\n5000 iterations\n 24.2s\n (16.8s)\n 52.5s (2.17x)\n 37.4s (1.55x)\n 25.9s (1.07x)\n 32.7s (1.35x)\nmandelbrot\ndivided in 16-18 bands\n 22.9s\n (18.2s)\n 27.5s (1.20x)\n 14.4s (0.63x)\n 10.3s (0.45x)\n 8.71s (0.38x)\nbtree\n 2.26s\n (2.00s)\n 2.01s (0.89x)\n 2.22s (0.98x)\n 2.14s (0.95x)\n 2.42s (1.07x)\n\n\nThis shows various cases that can occur:\n\nThe mandelbrot example runs with minimal overhead and very good parallelization.\nIt's dividing the plane to compute in bands, and each of the T threads receives the\nsame number of bands.\n\nRichards, a classical benchmark for PyPy (tweaked to run the iterations\nin multiple threads), is hard to beat on regular PyPy:\nwe suspect that the difference is due to the fact that a lot of\npaths through the loops don't allocate, triggering the issue already\nexplained above. 
Moreover, the speed of Richards was again improved\ndramatically recently, in trunk.\n\nThe translation benchmark measures the time translate.py\ntakes to run the first phase only, \"annotation\" (for now it consumes too much memory\nto run translate.py to the end). Moreover the timing starts only after the large number of\nsubprocesses spawned at the beginning (mostly gcc). This benchmark is not parallel, but we\ninclude it for reference here. The slow-down factor of 2.1x is still too much, but\nwe have some idea about the reasons: most likely, again the Garbage Collector, missing the regular PyPy's\nvery fast small-object allocator for old objects. Also, translate.py\nis an example of application that could, with\nreasonable efforts, be made largely parallel in the future using atomic blocks.\n\nAtomic blocks are also present in the btree benchmark. I'm not completely sure\nbut it seems that, in this case, the atomic blocks create too many\nconflicts between the threads for actual parallization: the base time is very good,\nbut running more threads does not help at all.\n\n\nAs a summary, PyPy-STM looks already useful to run CPU-bound multithreaded\napplications. We are certainly still going to fight slow-downs, but it\nseems that there are cases where 2 threads are enough to outperform a regular\nPyPy, by a large margin. Please try it out on your own small examples!\n\nAnd, at the same time, please don't attempt to retrofit threads inside\nan existing large program just to benefit from PyPy-STM!\nOur goal is not to send everyone down the obscure route of multithreaded\nprogramming and its dark traps. We are going finally to shift our main\nfocus on the phase 2 of our\nresearch (donations welcome): how to enable a better way of writing multi-core programs.\nThe starting point is to fix and test atomic blocks. 
Then we will have to\ndebug common causes of conflicts and fix them or work around them; and\ntry to see how common frameworks like Twisted can be adapted.\n\nLots of work ahead, but lots of work behind too :-)\n\nArmin (thanks Remi as well for the work).", + "tags": "releasestm", + "url": "https://www.pypy.org/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html" + }, + { + "title": "PyPy3 2.3.1 - Fulcrum", + "text": "We're pleased to announce the first stable release of PyPy3. PyPy3\ntargets Python 3 (3.2.5) compatibility.We would like to thank all of the people who donated to the py3k proposal\nfor supporting the work that went into this.You can download the PyPy3 2.3.1 release here:https://pypy.org/download.html#pypy3-2-3-1HighlightsThe first stable release of PyPy3: support for Python 3!\nThe stdlib has been updated to Python 3.2.5\nAdditional support for the u'unicode' syntax (PEP 414) from Python 3.3\nUpdates from the default branch, such as incremental GC and various JIT\nimprovements\nResolved some notable JIT performance regressions from PyPy2:\nRe-enabled the previously disabled collection (list/dict/set) strategies\nResolved performance of iteration over range objects\nResolved handling of Python 3's exception __context__ unnecessarily forcing\nframe object overhead\nWhat is PyPy?PyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7.6 or 3.2.5. It's fast due to its integrated tracing JIT compiler.This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows,\nand OpenBSD,\nas well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.While we support 32 bit python on Windows, work on the native Windows 64\nbit python is still stalling, we would welcome a volunteer\nto handle that.How to use PyPy?We suggest using PyPy from a virtualenv. Once you have a virtualenv\ninstalled, you can follow instructions from pypy documentation on how\nto proceed. 
This document also covers other installation schemes.Cheers,\nthe PyPy team", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2014/06/pypy3-231-fulcrum-3765964217640322884.html" + }, + { + "title": "PyPy 2.3.1 - Terrestrial Arthropod Trap Revisited", + "text": "We're pleased to announce PyPy 2.3.1, a feature-and-bugfix improvement over our recent 2.3 release last month.\n\nThis release contains several bugfixes and enhancements among the user-facing improvements:\nThe built-in struct module was renamed to _struct, solving issues with IDLE and other modules\nSupport for compilation with gcc-4.9\nA CFFI-based version of the gdbm module is now included in our binary bundle\nMany issues were resolved since the 2.3 release on May 8 \n\nYou can download the PyPy 2.3.1 release here:\n\nhttps://pypy.org/download.html\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (pypy 2.3.1 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.\n\nThis release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.\u00a0 \nWe would like to thank our donors for the continued support of the PyPy project.\n\nThe complete release notice is here.\n\nPlease try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!\n\nCheers, The PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2014/06/pypy-231-terrestrial-arthropod-trap-5076300474324870908.html" + }, + { + "title": "PyPy 2.3 - Terrestrial Arthropod Trap", + "text": "We\u2019re pleased to announce PyPy 2.3, which targets version 2.7.6 of the Python language. 
This release updates the stdlib from 2.7.3, jumping directly to 2.7.6.\n\nThis release also contains several bugfixes and performance improvements, many generated by real users finding corner cases.\u00a0CFFI\u00a0has made it easier than ever to use existing C code with both cpython and PyPy, easing the transition for packages like\u00a0cryptography,\u00a0Pillow(Python Imaging Library [Fork]), a basic port of\u00a0pygame-cffi, and others.\n\nPyPy can now be embedded in a hosting application, for instance inside\u00a0uWSGI\n\nYou can download the PyPy 2.3 release here:\n\nhttps://pypy.org/download.html\n\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (pypy 2.3 and cpython 2.7.x performance comparison; note that cpython's speed has not changed since 2.7.2) due to its integrated tracing JIT compiler.\n\nThis release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.\u00a0\n\nWe would like to thank our donors for the continued support of the PyPy project.\n\nThe complete release notice is here\n\nCheers, The PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2014/05/pypy-23-terrestrial-arthropod-trap-9057496904945555741.html" + }, + { + "title": "NumPy on PyPy - Status Update", + "text": "Work on NumPy on PyPy continued in March, though at a lighter pace than the previous few months. Progress was made on both compatibility and speed fronts. Several behavioral issues reported to the bug tracker were resolved. The most significant of these was probably the correction of casting to built-in Python types. Previously, int/long conversions of numpy scalars such as inf/nan/1e100 would return bogus results. Now, they raise or return values, as appropriate.\n\nOn the speed front, enhancements to the PyPy JIT were made to support virtualizing the raw_store/raw_load memory operations used in numpy arrays. 
Further work remains here in virtualizing the alloc_raw_storage when possible. This will allow scalars to have storages but still be virtualized when possible in loops.\n\nAside from continued work on compatibility/speed of existing code, we also hope to begin implementing the C-level components of other numpy modules such as mtrand, nditer, linalg, and so on. Several approaches could be taken to get C-level code in these modules working, ranging from reimplementing in RPython to interfacing with existing code with CFFI, if possible. The appropriate approach depends on many factors and will probably vary from module to module.To try out PyPy + NumPy, grab a nightly PyPy and install our NumPy fork. Feel free to report comments/issues to IRC, our mailing list, or bug tracker. Thanks to the contributors to the NumPy on PyPy\u00a0proposal for supporting this work.", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2014/04/numpy-on-pypy-status-update-1103134247318103282.html" + }, + { + "title": "STM results and Second Call for Donations", + "text": "Hi all,\n\nWe now have a preliminary version of PyPy-STM\nwith the JIT, from the new STM documentation\npage. This PyPy-STM is still not quite useful, failing to top the\nperformance of a regular PyPy by a small margin on most benchmarks, but\nit's definitely getting there :-) The overheads with the JIT are still\na bit too high. (I've been tracking an obscure bug since days.\nIt turned out to be a simple buffer overflow. But if anybody has\na clue about why a hardware watchpoint in gdb, set on one of the garbled\nmemory locations, fails to trigger but the memory ends up being modified\nanyway... and, it turns out, by just a regular pointer write... ideas\nwelcome.)\n\nBut I go off-topic :-) The main point of this post is to announce the\n2nd Call for Donation about\nSTM. We achieved most of the goals laid out in the first call. 
We\neven largely overachieved them in terms of raw performance, even if\nthere are many cases that are unreasonably slow for now. So, after the\nsuccessful research, we are launching a second proposal about the\ndevelopment part of the project:\n\nPolish PyPy-STM to get a consistently reasonable speed, 25%-40%\nslower than a regular JITted PyPy when running single-threaded code. Of\ncourse it is supposed to scale nicely as long as there are no\nuser-visible conflicts.\n\nFocus on developing the Python-facing interface: both internal things\n(e.g. do dictionaries need to be more TM-friendly in general?) as well\nas directly visible things (e.g. some profiler-like interface to explore\ncommon conflicts in a program).\n\nRegular multithreaded code should benefit out of the box, but the\nfinal goal is to explore and tweak some existing non-multithreaded\nframeworks and improve their TM-friendliness. So existing programs\nusing Twisted or Stackless, for example, should run on multiple cores\nwithout any major change.\n\nSee the full call for more\ndetails! I'd like to thank Remi Meier for getting involved. And a big\nthank you to everybody who contributed money on the first call. It\ntook more time than anticipated, but it's there in good but rough shape.\nNow it needs a lot of polishing :-)\n\nArmin", + "tags": "stm", + "url": "https://www.pypy.org/posts/2014/04/stm-results-and-second-call-for-1767845182888902777.html" + }, + { + "title": "pygame_cffi: pygame on PyPy", + "text": "The Raspberry Pi aims to be a low-cost educational tool that anyone can use to learn about electronics and programming. Python and pygame are included in the Pi's programming toolkit. And since last year, thanks in part to sponsorship from the Raspberry Pi Foundation, PyPy also works on the Pi (read more here).\nWith PyPy working on the Pi, game logic written in Python stands to gain an awesome performance boost. However, the original pygame is a Python C extension. 
This means it performs poorly on PyPy and negates any speedup in the Python parts of the game code.\nOne solution to making pygame games run faster on PyPy, and eventually on the Raspberry Pi, comes in the form of pygame_cffi. pygame_cffi uses CFFI to wrap the underlying SDL library instead of a C extension. A few months ago, the Raspberry Pi Foundation sponsored a Cape Town Python User Group hackathon to build a proof-of-concept pygame using CFFI. This hackathon was a success and it produced an early working version of pygame_cffi.\nSo for the last 5 weeks Raspberry Pi has been funding work on pygame_cffi. The goal was a complete implementation of the core modules. We also wanted benchmarks to illuminate performance differences between pygame_cffi on PyPy and pygame on CPython. We are happy to report that those goals were met. So without further ado, here's a rundown of what works.\n\nCurrent functionality\n\nSurfaces support all the usual flags for SDL and OpenGL rendering (more about OpenGL below).\nThe graphics-related modules color, display, font and image, and parts of draw and transform are mostly complete.\nEvents! No fastevent module yet, though.\nMouse and keyboard functionality, as provided by the mouse and key modules, is complete.\nSound functionality, as provided by the mixer and music modules, is complete.\nMiscellaneous modules, cursors, rect, sprite and time are also complete.\n\n\nInvention screenshot:\n\n\n\nMutable mamba screenshot:\n\n\n\nWith the above-mentioned functionality in place we could get 10+ of the pygame examples to work, and a number of PyWeek games. At the time of writing, if a game doesn't work it is most likely due to an unimplemented transform or draw function. That will be remedied soon.\n\n\nPerformance\nIn terms of performance, pygame_cffi on PyPy is showing a lot of promise. 
It beats pygame on CPython by a significant margin in our events processing and collision detection benchmarks, while blit and fill benchmarks perform similarly. The pygame examples we checked also perform better.\n\n\n\n\n\nHowever, there is still work to be done to identify and eliminate bottlenecks. On the Raspberry Pi performance is markedly worse compared to pygame (barring collision detection). The PyWeek games we tested also performed slightly worse. Fortunately there is room for improvement in various places.\n\nInvention & Mutable Mamba (x86)\n\n\n\nStandard pygame examples (Raspberry Pi)\n\n\n\nHere's a summary of some of the benchmarks. Relative speed refers to the frame rate obtained in pygame_cffi on PyPy relative to pygame on CPython.\n\n\n\n\n\n\nBenchmark\nRelative speed (pypy speedup)\n\n\n\nEvents (x86)\n1.41\n\nEvents (Pi)\n0.58\n\nN2 collision detection on 100 sprites (x86)\n4.14\n\nN2 collision detection on 100 sprites (Pi)\n1.01\n\nBlit 100 surfaces (x86)\n1.06\n\nBlit 100 surfaces (Pi)\n0.60\n\nInvention (x86)\n0.95\n\nMutable Mamba (x86)\n0.72\n\nstars example (x86)\n1.95\n\nstars example (Pi)\n0.84\n\n\n\n\nOpenGL\nSome not-so-great news is that PyOpenGL performs poorly on PyPy since PyOpenGL uses ctypes. This translates into a nasty reduction in frame rate for games that use OpenGL surfaces. It might be worthwhile creating a CFFI-powered version of PyOpenGL as well.\n\n\n\nWhere to now?\nWork on pygame_cffi is ongoing. Here are some things that are in the pipeline:\n\nGet pygame_cffi on PyPy to a place where it is consistently faster than pygame on CPython.\nImplement the remaining modules and functions, starting with draw and transform.\nImprove test coverage.\nReduce the time it takes for CFFI to parse the cdef. 
This makes the initial pygame import slow.\n\nIf you want to contribute you can find pygame_cffi on Github.\nFeel free to find us on #pypy on freenode or post issues on github.\nCheers,\nRizmari Versfeld", + "tags": "sponsors", + "url": "https://www.pypy.org/posts/2014/03/pygamecffi-pygame-on-pypy-8679802461301121984.html" + }, + { + "title": "STMGC-C7 with PyPy", + "text": "Hi all,\n\nHere is one of the first full PyPy's\n(edit: it was r69967+, but the general list of versions is currently here)\ncompiled with the new StmGC-c7\nlibrary. It has no JIT so far, but it runs some small\nsingle-threaded benchmarks by taking around 40% more time than a\ncorresponding non-STM, no-JIT version of PyPy. It scales --- up to two\nthreads only, which is the hard-coded maximum so far in the c7 code.\nBut the scaling looks perfect in these small benchmarks without\nconflict: starting two threads each running a copy of the benchmark\ntakes almost exactly the same amount of total time, simply using two\ncores.\n\nFeel free to try it! It is not actually useful so far, because it is\nlimited to two cores and CPython is something like 2.5x faster. One of\nthe important next steps is to re-enable the JIT. Based on our current\nunderstanding of the \"40%\" figure, we can probably reduce it with\nenough efforts; but also, the JIT should be able to easily produce\nmachine code that suffers a bit less than the interpreter from these\neffects. This seems to mean that we're looking at 20%-ish slow-downs\nfor the future PyPy-STM-JIT.\n\nInteresting times :-)\n\nFor reference, this is what you get by downloading the\nPyPy binary linked above: a Linux 64 binary (Ubuntu 12.04) that\nshould behave mostly like a regular PyPy. (One main missing feature is\nthat destructors are never called.) It uses two cores, but obviously\nonly if the Python program you run is multithreaded. 
The only new\nbuilt-in feature is with __pypy__.thread.atomic: this gives\nyou a way to enforce that a block of code runs \"atomically\", which means\nwithout any operation from any other thread randomly interleaved.\n\nIf you want to translate it yourself, you need a trunk version of clang\nwith three patches applied. That's the number of bugs that we couldn't\nfind workarounds for, not the total number of bugs we found by (ab)using\nthe address_space feature...\n\nStay tuned for more!\n\nArmin & Remi", + "tags": "stm", + "url": "https://www.pypy.org/posts/2014/03/hi-all-here-is-one-of-first-full-pypys-8725931424559481728.html" + }, + { + "title": "PyPy on uWSGI", + "text": "Hello everyone\nThere is an interview with Roberto De Ioris (from uWSGI fame) about embedding PyPy in uWSGI that covers recent addition of a PyPy embedding interface using cffi and the experience with using it. Read The full interview\nCheers\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2014/03/hello-everyone-there-is-interview-with-7561523711224053700.html" + }, + { + "title": "NumPy on PyPy - Progress in February", + "text": "More progress was made on the NumPy front in the past month. On the compatibility front, we now pass ~130 more tests from NumPy's suite since the end of January. Currently, we pass 2336 tests out of 3265 tests run, with many of the failures representing portions of NumPy that we don't plan to implement in the near future (object dtypes, unicode, etc). There are still some failures that do represent issues, such as special indexing cases and failures to respect subclassed ndarrays in return values, which we do plan to resolve. There are also some unimplemented components and ufuncs remaining which we hope to implement, such as nditer and mtrand. Overall, the most common array functionality should be working.\n\nAdditionally, I began to take a look at some of the loops generated by our code. 
One widely used loop is dot, and we were running about 5x slower than NumPy's C version. I was able to optimize the dot loop and also the general array iterator to get us to ~1.5x NumPy C time on dot operations of various sizes. Further progress in this area could be made by using CFFI to tie into BLAS libraries, when available. Also, work remains in examining traces generated for our other loops and checking for potential optimizations.\n\nTo try out PyPy + NumPy, grab a nightly PyPy and install our NumPy fork. Feel free to report comments/issues to IRC, our mailing list, or bug tracker. Thanks to the contributors to the NumPy on PyPy proposal for supporting this work.\n\nCheers,\nBrian", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2014/03/numpy-status-update-february-1245769841736493525.html" + }, + { + "title": "Py3k status update #13", + "text": "This is the 13th status update about our work on the py3k branch, which we\ncan work on thanks to all of the people who donated to the py3k proposal.We're just finishing up a cleanup of int/long types. This work helps the py3k\nbranch unify these types into the Python 3 int and restore JIT compilation of\nmachine sized integers.This cleanup also removes multimethods from these types. PyPy has\nhistorically used a clever implementation of multimethod dispatch for declaring\nmethods of the __builtin__ types in RPython.This multimethod scheme provides some convenient features for doing this,\nhowever we've come to the conclusion that it may be more trouble than it's\nworth. A major problem of multimethods is that they generate a large amount of\nstub methods which burden the already lengthy and memory hungry RPython\ntranslation process. Also, their implementation and behavior can be somewhat\ncomplicated/obscure.The alternative to multimethods involves doing the work of the type checking\nand dispatching rules in a more verbose, manual way. 
It's a little more work in\nthe end but less magical.Recently, Manuel Jacob finished a large cleanup effort of the\nunicode/string/bytearray types that also removed their multimethods. This work\nalso benefits the py3k branch: it'll help with future PEP 393 (or PEP 393\nalternative) work. This effort was partly sponsored by Google's Summer of\nCode: thanks Manuel and Google!Now there's only a couple major pieces left in the multimethod removal (the\nfloat/complex types and special marshaling code) and a few minor pieces that\nshould be relatively easy.In conclusion, there's been some good progress made on py3k and multimethod\nremoval this winter, albeit a bit slower than we would have liked.cheers,\nPhil", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2014/02/py3k-status-update-13-4630607029125647100.html" + }, + { + "title": "Rewrites of the STM core model -- again", + "text": "Hi all,\n\nA quick note about the Software Transactional Memory (STM) front.\n\nSince the previous\npost, we believe we progressed a lot by discovering an alternative\ncore model for software transactions. Why do I say \"believe\"? It's\nbecause it means again that we have to rewrite from scratch the C\nlibrary handling STM. This is currently work in progress. Once this is\ndone, we should be able to adapt the existing pypy-stm to run on top of\nit without much rewriting efforts; in fact it should simplify the\ndifficult issues we ran into for the JIT. So while this is basically\nyet another restart similar to last\nJune's, the difference is that the work that we have already put in the PyPy\npart (as opposed to the C library) remains.\n\nYou can read about the basic ideas of this new C library here.\nIt is still STM-only, not HTM, but because it doesn't constantly move\nobjects around in memory, it would be easier to adapt an HTM version.\nThere are even potential ideas about a hybrid TM, like using HTM but\nonly to speed up the commits. 
It is based on a Linux-only system call, remap_file_pages()\n(poll: who heard about it before? :-). As previously, the work is done\nby Remi Meier and myself.\n\nCurrently, the C library is incomplete, but early experiments show good\nresults in running duhton,\nthe interpreter for a minimal language created for the purpose of\ntesting STM. Good results means we brought down the slow-downs from\n60-80% (previous version) to around 15% (current version). This number\nmeasures the slow-down from the non-STM-enabled to the STM-enabled\nversion, on one CPU core; of course, the idea is that the STM version\nscales up when using more than one core.\n\nThis means that we are looking forward to a result that is much better\nthan originally predicted. The pypy-stm has chances to run at a\none-thread speed that is only \"n%\" slower than the regular pypy-jit, for\na value of \"n\" that is optimistically 15 --- but more likely some number\naround 25 or 50. This is seriously better than the original estimate,\nwhich was \"between 2x and 5x\". It would mean that using pypy-stm is\nquite worthwhile even with just two cores.\n\nMore updates later...\n\nArmin", + "tags": "stm", + "url": "https://www.pypy.org/posts/2014/02/rewrites-of-stm-core-model-again-633249729751034512.html" + }, + { + "title": "NumPy Status Update - December/January", + "text": "Work continued on the NumPy + PyPy front steadily in December and more lightly in January. The continued focus was compatibility, targeting incorrect or unimplemented features that appeared in multiple NumPy test suite failures. We now pass ~2/3 of the NumPy test suite. 
The biggest improvements were made in these areas:\n\n- Bugs in conversions of arrays/scalars to/from native types\n- Fix cases where we would choose incorrect dtypes when initializing or computing results\n- Improve handling of subclasses of ndarray through computations\n- Support some optional arguments for array methods that are used in the pure-python part of NumPy\n- Support additional attributes in arrays, array.flags, and dtypes\n- Fix some indexing corner cases that arise in NumPy testing\n- Implemented part of\u00a0numpy.fft (cffti and cfftf)\n\nLooking forward, we plan to continue improving the correctness of the existing implemented NumPy functionality, while also beginning to look at performance. The initial focus for performance will be to look at areas where we are significantly worse than CPython+NumPy. Those interested in trying these improvements out will need a PyPy nightly, and an install of the PyPy NumPy fork. Thanks again to the NumPy on PyPy donors for funding this work.", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2014/02/numpy-status-update-decemberjanuary-4292961614234099787.html" + }, + { + "title": "NumPy Status Update - November", + "text": "Since the PyPy 2.2 release last month, more progress has been made on the NumPy compatibility front. Initial work has been directed by running the NumPy test suite and targeting failures that appear most frequently, along with fixing the few bugs reported on the bug tracker.\n\nImprovements were made in these areas:\n- Many missing/broken scalar functionalities were added/fixed. 
The scalar API should match up more closely with arrays now.\n- Some missing dtype functionality was added (newbyteorder, hasobject, descr, etc)\n- Support for optional arguments (axis, order) was added to some ndarray functions\n- Fixed some corner cases for string/record types\n\nMost of these improvements went onto trunk after 2.2 was split, so if you're interested in trying them out or running into problems on 2.2, try the\nnightly.\n\nThanks again to the NumPy on PyPy donors who make this continued progress possible.\n\nCheers,\nBrian", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/12/numpy-status-update-november-364321959153372759.html" + }, + { + "title": "PyGame CFFI", + "text": "One of the RaspberryPi's goals is to be a fun toolkit for school children (and adults!) to learn programming and electronics with. Python and pygame are part of this toolkit. Recently the RaspberryPi Foundation funded parts of the effort of porting of pypy to the Pi -- making Python programs on the Pi faster!\nUnfortunately pygame is written as a Python C extension that wraps SDL which means performance of pygame under pypy remains mediocre. To fix this pygame needs to be rewritten using cffi to wrap SDL instead.\nRaspberryPi sponsored a CTPUG (Cape Town Python User Group) hackathon to put together a proof-of-concept pygame-cffi. The day was quite successful - we got a basic version of the bub'n'bros client working on pygame-cffi (and on PyPy). 
The results can be found on github with contributions from the five people present at the sprint.\nWhile far from complete, the proof of concept does show that there are no major obstacles to porting pygame to cffi and that cffi is a great way to bind your Python package to C libraries.\nAmazingly, we managed to have machines running all three major platforms (OS X, Linux and Windows) at the hackathon so the code runs on all of them!\nWe would like to thank the Praekelt foundation for providing the venue and The Raspberry Pi foundation for providing food and drinks!\nCheers,\nSimon Cross, Jeremy Thurgood, Neil Muller, David Sharpe and fijal.", + "tags": "", + "url": "https://www.pypy.org/posts/2013/12/pygame-cffi-8991437796535033699.html" + }, + { + "title": "PyPy Leysin Winter Sprint (11-19st January 2014)", + "text": "The next PyPy sprint will be in Leysin, Switzerland, for the ninth time.\nThis is a fully public sprint: newcomers and topics other than those\nproposed below are welcome.\nGoals and topics of the sprint\n\nPy3k: work towards supporting Python 3 in PyPy\nNumPyPy: work towards supporting the numpy module in PyPy\nSTM: work towards supporting Software Transactional Memory\nAnd as usual, the main side goal is to have fun in winter sports :-)\nWe can take a day off for ski.\n\n\nExact times\nFor a change, and as an attempt to simplify things, I specified the\ndates as 11-19 January 2014, where 11 and 19 are travel days. We will\nwork full days between the 12 and the 18. You are of course allowed to\nshow up for a part of that time only, too.\nLocation & Accommodation\nLeysin, Switzerland, \"same place as before\". Let me refresh your\nmemory: both the sprint venue and the lodging will be in a very spacious\npair of chalets built specifically for bed & breakfast:\nhttps://www.ermina.ch/. The place has a good ADSL Internet connexion\nwith wireless installed. 
You can of course arrange your own lodging\nanywhere (as long as you are in Leysin, you cannot be more than a 15\nminutes walk away from the sprint venue), but I definitely recommend\nlodging there too -- you won't find a better view anywhere else (though\nyou probably won't get much worse ones easily, either :-)\nPlease confirm that you are coming so that we can adjust the\nreservations as appropriate. The rate so far has been around 60 CHF a\nnight all included in 2-person rooms, with breakfast. There are larger\nrooms too (less expensive per person) and maybe the possibility to get a\nsingle room if you really want to.\nPlease register by Mercurial:\n\nhttps://bitbucket.org/pypy/extradoc/\nhttps://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2014\n\nor on the pypy-dev mailing list if you do not yet have check-in rights:\n\nhttps://mail.python.org/mailman/listinfo/pypy-dev\nYou need a Swiss-to-(insert country here) power adapter. There will be\nsome Swiss-to-EU adapters around -- bring a EU-format power strip if you\nhave one.", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/11/pypy-leysin-winter-sprint-11-19st-8860782754173653661.html" + }, + { + "title": "PyPy 2.2.1 - Incrementalism.1", + "text": "We're pleased to announce PyPy 2.2.1, which targets version 2.7.3 of the Python\nlanguage. This is a bugfix release over 2.2.\nYou can download the PyPy 2.2.1 release here:\n\nhttps://pypy.org/download.html\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. It's fast (pypy 2.2 and cpython 2.7.2 performance comparison)\ndue to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64, Mac OS X 64, Windows\n32, or ARM (ARMv6 or ARMv7, with VFPv3).\nWork on the native Windows 64 is still stalling, we would welcome a volunteer\nto handle that.\nHighlights\nThis is a bugfix release. 
The most important bugs fixed are:\n\nan issue in sockets' reference counting emulation, showing up\nnotably when using the ssl module and calling makefile().\nTkinter support on Windows.\nIf sys.maxunicode==65535 (on Windows and maybe OS/X), the json\ndecoder incorrectly decoded surrogate pairs.\nsome FreeBSD fixes.\n\nNote that CFFI 0.8.1 was released. Both versions 0.8 and 0.8.1 are\ncompatible with both PyPy 2.2 and 2.2.1.\nCheers,\nArmin Rigo & everybody", + "tags": "", + "url": "https://www.pypy.org/posts/2013/11/pypy-221-incrementalism1-9197847629771910947.html" + }, + { + "title": "CFFI 0.8", + "text": "Hi all,\n\nCFFI 0.8 for CPython (2.6-3.x) has been released.\n\nQuick download: pip install cffi --upgrade\nDocumentation: https://cffi.readthedocs.org/en/release-0.8/\n\nWhat's new: a number of small fixes; ffi.getwinerror(); integrated support for C99 variable-sized structures; multi-thread safety.\n\n--- Armin\n\nUpdate: CFFI 0.8.1, with fixes on Python 3 on OS/X, and some FreeBSD fixes (thanks Tobias).", + "tags": "", + "url": "https://www.pypy.org/posts/2013/11/cffi-08-6086756821078041950.html" + }, + { + "title": "NumPy status update", + "text": "Here is what has been happening with NumPy in PyPy in October thanks to the people who donated to the NumPyPy proposal:\n\n The biggest change is that we shifted to using an external fork of numpy rather than a minimal numpypy module. The idea is that we will be able to\u00a0reuse\u00a0most of the upstream pure-python numpy\u00a0components, replacing the C modules with appropriate RPython micronumpy pieces at the correct places in the module namespace.\n\n The numpy fork should work just as well as the old numpypy for functionality that existed previously, and also include much new functionality from the pure-python numpy pieces that simply hadn't been imported yet in numpypy. 
However, this new functionality will not have been \"hand picked\" to only include pieces that work, so you may run into functionality that relies on unimplemented components (which should fail with user-level exceptions).\n\n This setup also allows us to run the entire numpy test suite, which will help in directing future compatibility development. The recent PyPy release includes these changes, so download it and let us know how it works! And if you want to live on the edge, the nightly includes even more numpy progress made in November.\n\n To install the fork, download the latest release, and then install numpy either separately with a virtualenv: pip install git+https://bitbucket.org/pypy/numpy.git; or directly: git clone https://bitbucket.org/pypy/numpy.git; cd numpy; pypy setup.py install.\n\nEDIT: if you install numpy as root, you may need to also import it once as root before it works: sudo pypy -c 'import numpy'\n\n\n Along with this change, progress was made in fixing internal micronumpy bugs and increasing compatibility:\nFixed a bug with strings in record dtypes\nFixed a bug where the multiplication of an ndarray with a Python int or float resulted in loss of the array's dtype\nFixed several segfaults encountered in the numpy test suite (suite should run now without segfaulting)\n\n We also began working on __array_prepare__ and __array_wrap__, which are necessary pieces for a working matplotlib module.\n\n Cheers,\nRomain and Brian", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/11/numpy-status-update-1609808546418002632.html" + }, + { + "title": "PyPy 2.2 - Incrementalism", + "text": "We're pleased to announce PyPy 2.2, which targets version 2.7.3 of the Python language. 
This release main highlight is the introduction of the incremental garbage collector, sponsored by the Raspberry Pi Foundation.\nThis release also contains several bugfixes and performance improvements.\nYou can download the PyPy 2.2 release here:\nhttps://pypy.org/download.htmlWe would like to thank our donors for the continued support of the PyPy project. We showed quite a bit of progress on all three projects (see below) and we're slowly running out of funds. Please consider donating more so we can finish those projects! The three projects are:\nPy3k (supporting Python 3.x): the release PyPy3 2.2 is imminent.\nSTM (software transactional memory): a preview will be released very soon, as soon as we fix a few bugs\nNumPy: the work done is included in the PyPy 2.2 release. More details below.\n\nWhat is PyPy?PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (pypy 2.2 and cpython 2.7.2 performance comparison) due to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64, Mac OS X 64, Windows 32, or ARM (ARMv6 or ARMv7, with VFPv3).\nWork on the native Windows 64 is still stalling, we would welcome a volunteer to handle that.HighlightsOur Garbage Collector is now \"incremental\". It should avoid almost all pauses due to a major collection taking place. Previously, it would pause the program (rarely) to walk all live objects, which could take arbitrarily long if your process is using a whole lot of RAM. Now the same work is done in steps. This should make PyPy more responsive, e.g. in games. There are still other pauses, from the GC and the JIT, but they should be on the order of 5 milliseconds each.\nThe JIT counters for hot code were never reset, which meant that a process running for long enough would eventually JIT-compile more and more rarely executed code. 
Not only is it useless to compile such code, but as more compiled code means more memory used, this gives the impression of a memory leak. This has been tentatively fixed by decreasing the counters from time to time.\nNumPy has been split: now PyPy only contains the core module, called _numpypy. The numpy module itself has been moved to https://bitbucket.org/pypy/numpy and numpypy disappeared. You need to install NumPy separately with a virtualenv: pip install git+https://bitbucket.org/pypy/numpy.git; or directly: git clone https://bitbucket.org/pypy/numpy.git;\u00a0cd numpy;\u00a0pypy setup.py install.\nnon-inlined calls have less overhead\nThings that use sys.set_trace are now JITted (like coverage)\nJSON decoding is now very fast (JSON encoding was already very fast)\nvarious buffer copying methods experience speedups (like list-of-ints to int[] buffer from cffi)\nWe finally wrote (hopefully) all the missing os.xxx() functions, including os.startfile() on Windows and a handful of rare ones on Posix.\nnumpy has a rudimentary C API that cooperates with cpyext\nCheers,\nArmin Rigo and Maciej Fijalkowski", + "tags": "", + "url": "https://www.pypy.org/posts/2013/11/pypy-22-incrementalism-4723643710897639332.html" + }, + { + "title": "Py3k status update #12", + "text": "This is the 12th status update about our work on the py3k branch, which we\ncan work on thanks to all of the people who donated to the py3k proposal.Here's an update on the recent progress:Thank you to everyone who has provided initial feedback on the PyPy3 2.1 beta\n1 release. We've gotten a number of bug reports, most of which have been\nfixed.\nAs usual, we're continually keeping up with changes from the default\nbranch. 
Oftentimes these merges come at a cost (conflicts and or\nreintegration of py3k changes) but occasionally we get goodies for free, such\nas the recent JIT optimizations and incremental garbage collection.\nWe've been focusing on re-optimizing Python 2 int sized (machine sized)\nintegers:\nWe have a couple of known, notable speed regressions in the PyPy3 beta release\nvs regular PyPy. The major one being with Python 2.x int sized (or machine\nsized) integers.Python 3 drops the distinction between int and long types. CPython 3.x\naccomplishes this by removing the old int type entirely and renaming the long\ntype to int. Initially, we've done the same for PyPy3 for the sake of\nsimplicity and getting everything working.However PyPy's JIT is capable of heavily optimizing these machine sized integer\noperations, so this came with a regression in performance in this area.We're now in the process of solving this. Part of this work also involves some\nhouse cleaning on these numeric types which also benefits the default branch.cheers,\nPhil", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2013/11/py3k-status-update-12-5307085693947812769.html" + }, + { + "title": "Making coverage.py faster under PyPy", + "text": "If you've ever tried to run your programs with coverage.py under PyPy,\nyou've probably experienced some incredible slowness. Take this simple\nprogram:def f():\n return 1\n\n\ndef main():\n i = 10000000\n while i:\n i -= f()\n\nmain()\nRunning time coverage.py run test.py five times, and looking at the best\nrun, here's how PyPy 2.1 stacks up against CPython 2.7.5:\n\n\n\n\n\nPython\nTime\nNormalized to CPython\n\n\n\nCPython 2.7.5\n3.879s\n1.0x\n\nPyPy 2.1\n53.330s\n13.7x slower\n\n\nTotally ridiculous. I got turned onto this problem because on one of my\nprojects CPython takes about 1.5 minutes to run our test suite on the build\nbot, but PyPy takes 8-10 minutes.So I sat down to address it. 
And the results:\n\n\n\n\n\nPython\nTime\nNormalized to CPython\n\n\n\nCPython 2.7.5\n3.879s\n1.0x\n\nPyPy 2.1\n53.330s\n13.7x slower\n\nPyPy head\n1.433s\n2.7x faster\n\n\nNot bad.Technical detailsSo how'd we do it? Previously, using sys.settrace() (which coverage.py\nuses under the hood) disabled the JIT. Except it didn't just disable the JIT,\nit did it in a particularly insidious way \u2014 the JIT had no idea it was being\ndisabled!Instead, every time PyPy discovered that one of your functions was a hotspot,\nit would start tracing to observe what the program was doing, and right when it\nwas about to finish, coverage would run and cause the JIT to abort. Tracing\nis a slow process, it makes up for it by generating fast machine code at the\nend, but tracing is still incredibly slow. But we never actually got to the\n\"generate fast machine code\" stage. Instead we'd pay all the cost of tracing,\nbut then we'd abort, and reap none of the benefits.To fix this, we adjusted some of the heuristics in the JIT, to better show it\nhow sys.settrace() works. Previously the JIT saw it as an opaque\nfunction which gets the frame object, and couldn't tell whether or not it\nmessed with the frame object. Now we let the JIT look inside the\n function, so it's able to see that coverage.py isn't\nmessing with the frame in any weird ways, it's just reading the line number and\nfile path out of it.I asked several friends in the VM implementation and research field if they\nwere aware of any other research into making VMs stay fast when debugging tools\nlike coverage.py are running. No one I spoke to was aware of any (but I\ndidn't do a particularly exhaustive review of the literature, I just tweeted at\na few people), so I'm pleased to say that PyPy is quite possibly the first VM\nto work on optimizing code in debugging mode! 
This is possible because of our\nyears spent investing in meta-tracing research.Happy testing,\nAlex", + "tags": "", + "url": "https://www.pypy.org/posts/2013/10/making-coveragepy-faster-under-pypy-935409618297062344.html" + }, + { + "title": "Update on STM", + "text": "Hi all,\nThe sprint in London was a lot of fun and very fruitful. In the last\nupdate on STM, Armin was working on improving and specializing the\nautomatic barrier placement. There is still a lot to do in that area,\nbut that work is merged now. Specializing and improving barrier placement\nis still to be done for the JIT.\nBut that is not all. Right after the sprint, we were able to squeeze\nthe last obvious bugs in the STM-JIT combination. However, the performance\nwas nowhere near to what we want. So until now, we fixed some of the most\nobvious issues. Many come from RPython erring on the side of caution\nand e.g. making a transaction inevitable even if that is not strictly\nnecessary, thereby limiting parallelism. Another problem came from\nincreasing counters every time a guard fails, which caused transactions\nto conflict on these counter updates. Since these counters do not have\nto be completely accurate, we update them non-transactionally now with\na chance of small errors.\nThere are still many such performance issues of various complexity left\nto tackle: we are nowhere near done. So stay tuned or contribute :)\n\nPerformance\nNow, since the JIT is all about performance, we want to at least\nshow you some numbers that are indicative of things to come.\nOur set of STM benchmarks is very small unfortunately\n(something you can help us out with), so this is\nnot representative of real-world performance. 
We tried to\nminimize the effect of JIT warm-up in the benchmark results.\nThe machine these benchmarks were executed on has 4 physical\ncores with Hyper-Threading (8 hardware threads).\nRaytracer from stm-benchmarks:\nRender times in seconds for a 1024x1024 image:\n\n\n\n\n\n\n\nInterpreter\nBase time: 1 thread\n8 threads (speedup)\n\n\n\nPyPy-2.1\n2.47\n2.56 (0.96x)\n\nCPython\n81.1\n73.4 (1.1x)\n\nPyPy-STM\n50.2\n10.8 (4.6x)\n\n\n\nFor comparison, disabling the JIT gives 148s on PyPy-2.1 and 87s on\nPyPy-STM (with 8 threads).\nRichards from PyPy repository on the stmgc-c4\nbranch:\nAverage time per iteration in milliseconds:\n\n\n\n\n\n\n\nInterpreter\nBase time: 1 thread\n8 threads (speedup)\n\n\n\nPyPy-2.1\n15.6\n15.4 (1.01x)\n\nCPython\n239\n237 (1.01x)\n\nPyPy-STM\n371\n116 (3.2x)\n\n\n\nFor comparison, disabling the JIT gives 492ms on PyPy-2.1 and 538ms on\nPyPy-STM.\n\nTry it!\nAll this can be found in the PyPy repository on the stmgc-c4\nbranch.\nTry it for yourself, but keep in mind that this is still experimental\nwith a lot of things yet to come. Only Linux x64 is supported right\nnow, but contributions are welcome.\nYou can download a prebuilt binary from here:\nhttps://bitbucket.org/pypy/pypy/downloads/pypy-oct13-stm.tar.bz2\n(Linux x64 Ubuntu >= 12.04). This was made at revision bafcb0cdff48.\n\nSummary\nWhat the numbers tell us is that PyPy-STM is, as expected,\nthe only of the three interpreters where multithreading gives a large\nimprovement in speed. What they also tell us is that, obviously, the\nresult is not good enough yet: it still takes longer on a 8-threaded\nPyPy-STM than on a regular single-threaded PyPy-2.1. However, as you\nshould know by now, we are good at promising speed and delivering it...\nyears later :-)\nBut it has been two years already since PyPy-STM started, and this is\nour first preview of the JIT integration. 
Expect major improvements\nsoon: with STM, the JIT generates code that is completely suboptimal in\nmany cases (barriers, allocation, and more). Once we improve this, the\nperformance of the STM-JITted code should come much closer to PyPy 2.1.\nCheers\nRemi & Armin", + "tags": "stm", + "url": "https://www.pypy.org/posts/2013/10/update-on-stm-7145890443443707910.html" + }, + { + "title": "Incremental Garbage Collector in PyPy", + "text": "Hello everyone.\nWe're pleased to announce that as of today,\nthe default PyPy comes with a GC that has much smaller pauses than yesterday.\nLet's start with explaining roughly what GC pauses are. In CPython each\nobject has a reference count, which is incremented each time we create\nreferences and decremented each time we forget them. This means that objects\nare freed each time they become unreachable. That is only half of the story\nthough. First note that when the last reference to a large tree of\nobjects goes away, you have a pause: all the objects are freed. Your\nprogram is not progressing at all during this pause, and this pause's\nduration can be arbitrarily large. This occurs at deterministic times,\nthough. But consider code like this:\n\nclass A(object):\n pass\n\na = A()\nb = A()\na.item = b\nb.item = a\ndel a\ndel b\n\nThis creates a reference cycle. It means that while we deleted references to\na and b from the current scope, they still have a reference count of 1,\nbecause they point to each other, even though the whole group has no references\nfrom the outside. CPython employs a cyclic garbage collector which is used to\nfind such cycles. It walks over all objects in memory, starting from some known\nroots, such as type objects, variables on the stack, etc. 
This solves the\nproblem, but can create noticeable, nondeterministic GC pauses as the heap\nbecomes large and convoluted.\nPyPy essentially has only the cycle finder - it does not bother with reference\ncounting, instead it walks alive objects every now and then (this is a big\nsimplification, PyPy's GC is much more complex than this). Although this might\nsound like a missing feature, it is really one of the reasons why PyPy is so\nfast, because at the end of the day the total time spent in managing the\nmemory is lower in PyPy than CPython. However, as a result, PyPy also has the\nproblem of GC pauses.\nTo alleviate this problem, which is essential for\napplications like games, we started to work on incremental GC, which spreads\nthe walking of objects and cleaning them across the execution time in smaller\nintervals. The work was sponsored by the Raspberry Pi foundation, started\nby Andrew Chambers and finished by Armin Rigo and Maciej Fija\u0142kowski.\n\n\nBenchmarks\nEveryone loves benchmarks. We did not measure any significant speed difference\non our quite extensive benchmark suite on speed.pypy.org. The main\nbenchmark that we used for other comparisons was translating the topaz\nruby interpreter using various versions of PyPy and CPython. The exact\ncommand was python /bin/rpython -O2 --rtype targettopaz.py.\nVersions:\n\ntopaz - dce3eef7b1910fc5600a4cd0afd6220543104823\npypy source - defb5119e3c6\npypy compiled with minimark (non-incremental GC) - d1a0c07b6586\npypy compiled with incminimark (new, incremental GC) - 417a7117f8d7\nCPython - 2.7.3\n\nThe memory usage of CPython, PyPy with minimark and PyPy with incminimark is\nshown here. Note that this benchmark is quite bad for PyPy in general, the\nmemory usage is higher and the amount of time taken is longer. 
This is due\nto the JIT warmup being both memory hungry and inefficient (see below).\nBut first, the new GC is not worse than the old one.\n\n\nEDIT:Red line is CPython, blue is incminimark (new), green is minimark (old)\n\nThe image was obtained by graphing the output of memusage.py.\nHowever, the GC pauses are significantly smaller. For PyPy the way to\nget GC pauses is to measure time between start and stop while running stuff\nwith PYPYLOG=gc-collect:log pypy program.py, for CPython, the magic\nincantation is gc.set_debug(gc.DEBUG_STATS) and parsing the output.\nFor what is worth, the average and total for CPython, as well as the total\nnumber of events are not directly comparable since it only shows the cyclic\ncollector, not the reference counts. The only comparable thing is the\namount of long pauses and their duration. In the table below, pause duration\nis sorted into 8 buckets, each meaning \"below that or equal to the threshold\".\nThe output is generated using the gcanalyze tool.\nCPython:\n\n\n\n\n\n\n\n\n\n\n\n\n150.1ms\n300.2ms\n450.3ms\n600.5ms\n750.6ms\n900.7ms\n1050.8ms\n1200.9ms\n\n5417\n5\n3\n2\n1\n1\n0\n1\n\n\n\nPyPy minimark (non-incremental GC):\n\n\n\n\n\n\n\n\n\n\n\n\n216.4ms\n432.8ms\n649.2ms\n865.6ms\n1082.0ms\n1298.4ms\n1514.8ms\n1731.2ms\n\n27\n14\n6\n4\n6\n5\n3\n3\n\n\n\nPyPy incminimark (new incremental GC):\n\n\n\n\n\n\n\n\n\n\n\n\n15.7ms\n31.4ms\n47.1ms\n62.8ms\n78.6ms\n94.3ms\n110.0ms\n125.7ms\n\n25512\n122\n4\n1\n0\n0\n0\n2\n\n\n\nAs we can see, while there is still work to be done (the 100ms ones could\nbe split among several steps), we did improve the situation quite drastically\nwithout any actual performance difference.\nNote about the benchmark - we know it's a pretty extreme case of JIT\nwarmup, we know we suck on it, we're working on it and we're not afraid of\nshowing PyPy is not always the best ;-)\n\n\nNitty gritty details\nHere are some nitty gritty details for people really interested in\nGarbage Collection. 
This was done as a patch to \"minimark\", our current\nGC, and called \"incminimark\" for now. The former is a generational\nstop-the-world GC. New objects are allocated \"young\", which means that\nthey initially live in the \"nursery\", a special zone of a few MB of\nmemory. When the nursery is full, a \"minor collection\" step moves the\nsurviving objects out of the nursery. This can be done quickly (a few\nmilliseconds) because we only need to walk through the young objects that\nsurvive --- usually a small fraction of all young objects; and also by\nfar not all objects that are alive at this point, but only the young\nones. However, from time to time this minor collection is followed by a\n\"major collection\": in that step, we really need to walk all objects to\nclassify which ones are still alive and which ones are now dead\n(\"marking\") and free the memory occupied by the dead ones (\"sweeping\").\nYou can read more details here.\nThis \"major collection\" is what gives the long GC pauses. To fix this\nproblem we made the GC incremental: instead of running one complete\nmajor collection, we split its work into a variable number of pieces and\nrun each piece after every minor collection for a while, until there are\nno more pieces. The pieces are each doing a fraction of marking, or a\nfraction of sweeping. It adds a few milliseconds after each of these\nminor collections, rather than requiring hundreds of milliseconds in one\ngo.\nThe main issue is that splitting the major collections means that the\nmain program is actually running between the pieces, and so it can\nchange the pointers in the objects to point to other objects. This is\nnot a problem for sweeping: dead objects will remain dead whatever the\nmain program does. However, it is a problem for marking. Let us see\nwhy.\nIn terms of the incremental GC literature, objects are either \"white\",\n\"gray\" or \"black\". This is called tri-color marking. 
See for example\nthis blog post about Rubinius, or this page about LuaJIT or the wikipedia description. The\nobjects start as \"white\" at the beginning of marking; become \"gray\" when\nthey are found to be alive; and become \"black\" when they have been fully\ntraversed. Marking proceeds by scanning grey objects for pointers to\nwhite objects. The white objects found are turned grey, and the grey\nobjects scanned are turned black. When there are no more grey objects,\nthe marking phase is complete: all remaining white objects are truly\nunreachable and can be freed (by the following sweeping phase).\nIn this model, the important part is that a black object can never point\nto a white object: if the latter remains white until the end, it will be\nfreed, which is incorrect because the black object itself can still be\nreached. How do we ensure that the main program, running in the middle\nof marking, will not try to write a pointer to white object into a black\nobject? This requires a \"write barrier\", i.e. a piece of code that runs\nevery time we set a pointer into an object or array. This piece of code\nchecks if some (hopefully rare) condition is met, and calls a function\nif that is the case.\nThe trick we used in PyPy is to consider minor collections as part of\nthe whole, rather than focus only on major collections. The existing\nminimark GC had always used a write barrier of its own to do its job,\nlike any generational GC. This existing write barrier is used to detect\nwhen an old object (outside the nursery) is modified to point to a young\nobject (inside the nursery), which is essential information for minor\ncollections. Actually, although this was the goal, the actual write\nbarrier code is simpler: it just records all old objects into which we\nwrite any pointer --- to a young or old object. 
As we found out over\ntime, doing so is not actually slower, and might actually be a\nperformance improvement: for example, if the main program does a lot of\nwrites into the same old object, we don't need to check over and over\nagain if the written pointer points to a young object or not. We just\nrecord the old object in some list the first time, and that's it.\nThe trick is that this unmodified write barrier works for incminimark\ntoo. Imagine that we are in the middle of the marking phase, running\nthe main program. The write barrier will record all old objects that\nare being modified. Then at the next minor collection, all surviving\nyoung objects will be moved out of the nursery. At this point, as we're\nabout to continue running the major collection's marking phase, we\nsimply add to the list of pending gray objects all the objects that we\njust considered --- both the objects listed as \"old objects that are\nbeing modified\", and the objects that we just moved out of the nursery.\nA fraction from the former list were black objects; so this means that\nthey are turned back from the black to the gray color. This technique\nimplements nicely, if indirectly, what is called a \"backward write\nbarrier\" in the literature. The backwardness is about the color that\nneeds to be changed in the opposite of the usual direction \"white ->\ngray -> black\", thus making more work for the GC. (This is as opposed\nto \"forward write barrier\", where we would also detect \"black -> white\"\nwrites but turn the white object gray.)\nIn summary, I realize that this description is less about how we turned\nminimark into incminimark, and more about how we differ from the\nstandard way of making a GC incremental. What we really had to do to\nmake incminimark was to write logic that says \"if the major collection\nis in the middle of the marking phase, then add this object to the list\nof gray objects\", and put it at a few places throughout minor\ncollection. 
Then we simply split a major collection into increments,\ndoing marking or sweeping of some (relatively arbitrary) number of\nobjects before returning. That's why, after we found that the existing\nwrite barrier would do, it was not much actual work, and could be done\nwithout major changes. For example, not a single line from the JIT\nneeded adaptation. All in all it was relatively painless work. ;-)\nCheers,armin and fijal", + "tags": "", + "url": "https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html" + }, + { + "title": "Numpy Status Update", + "text": "Hi everyone\n\nThanks to the people who donated money to the numpy proposal, here is what I've been working on recently :\n\n- Fixed conversion from a numpy complex number to a python complex number\n- Implement the rint ufunc\n-\u00a0Make numpy.character usable as a dtype\n-\u00a0Fix ndarray(dtype=str).fill()\n- Various fixes on boolean and fancy indexing\n\nCheers\nRomain", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/09/numpy-status-update-5160363918470470887.html" + }, + { + "title": "PyCon South Africa & sprint", + "text": "Hi all,\n\nFor those of you that happen to be from South Africa: don't miss\nPyCon ZA 2013, next October 3rd and 4th!\nLike last year, a few of us will be there. There will be the first talk\nabout STM getting ready (a\nblog post about that should follow soon).\n\nMoreover, general sprints will continue on the weekend (5th and 6th).\nAfterwards, Fijal will host a longer PyPy sprint (marathon?) with me\nuntil around the 21st. You are welcome to it as well! 
Write to the mailing list or to fijal directly (fijall\nat gmail.com), or simply in comments of this post.\n\n--- Armin", + "tags": "", + "url": "https://www.pypy.org/posts/2013/09/pycon-south-africa-sprint-6630788654105016762.html" + }, + { + "title": "Slides of the PyPy London Demo Evening", + "text": "The slides of the London demo evening are now online:", + "tags": "", + "url": "https://www.pypy.org/posts/2013/08/slides-of-pypy-london-demo-evening-5157052112396009739.html" + }, + { + "title": "NumPy road forward", + "text": "Hello everyone.\nThis is the roadmap for numpy effort in PyPy as discussed on the London sprint.\nFirst, the highest on our priority list is to finish the low-level part\nof the numpy module. What\nwe'll do is to finish the RPython part of numpy and provide a pip installable\nnumpypy repository that includes the pure python part of Numpy. This would\ncontain the original Numpy with a few minor changes.\nSecond, we need to work on the JIT support that will make NumPy on PyPy\nfaster. In detail:\n\nreenable the lazy loop evaluation\noptimize bridges, which is depending on optimizer refactorings\nSSE support\n\nOn the compatibility front, there were some independent attempts into\nmaking the following stuff working:\n\nf2py\nC API (in fact, PyArray_* API is partly present in the nightly builds of\nPyPy)\nmatplotlib (both using PyArray_* API and embedding CPython runtime in PyPy)\nscipy\n\nIn order to make all of the above happen faster, it would be helpful to raise\nmore funds. You can donate to PyPy's NumPy project on our website. 
Note\nthat PyPy is a member of SFC which is a 501(c)(3) US non-profit, so donations\nfrom US companies can be tax-deducted.\nCheers,\nfijal, arigo, ronan, rguillebert, anto and others", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/08/numpy-road-forward-4210065750776753500.html" + }, + { + "title": "Preliminary London Demo Evening Agenda", + "text": "We now have a preliminary agenda for the demo evening in London next week. It takes place on Tuesday, August 27 2013, 18:30-19:30 (BST) at King's College London, Strand. The preliminary agenda is as follows:\n\n\nLaurence Tratt: Welcome from the Software Development Team\nCarl Friedrich Bolz: A Short Introduction to PyPy\nMaciej Fija\u0142kowski: Numpy on PyPy, Present State and Outlook\nLukas Diekmann: Collection Strategies for Fast Containers in PyPy\nArmin Rigo: Software Transactional Memory for PyPy\nEdd Barrett: Unipycation: Combining Prolog and Python\n \n\nAll the talks are lightning talks. Afterwards there will be plenty of time for discussion.\n\nThere's still free spots, if you want to come, please register on the Eventbrite page. Hope to see you there!", + "tags": "", + "url": "https://www.pypy.org/posts/2013/08/preliminary-london-demo-evening-agenda-5254002451136674320.html" + }, + { + "title": "Update on STM", + "text": "Hi all,\n\nA quick update on Software Transactional Memory. We are\nworking on two fronts.\n\nOn the one hand, the integration of the \"c4\" C library with PyPy is done\nand works well, but is still subject to improvements. The \"PyPy-STM\"\nexecutable (without the JIT)\nseems to be stable, as far as it has been tested. It runs a simple\nbenchmark like Richards with a 3.2x slow-down over a regular JIT-less\nPyPy.\n\nThe main factor of this slow-down: the numerous \"barriers\" in\nthe code --- checks that are needed a bit everywhere to verify that a\npointer to an object points to a recent enough version, and if not, to\ngo to the most recent version. 
These barriers are inserted automatically\nduring the translation; there is no need for us to manually put 42 million\nbarriers in the source code of PyPy. But this automatic insertion uses a\nprimitive algorithm right now, which usually ends up putting more barriers than the\ntheoretical optimum. I (Armin) am trying to improve that --- and progressing:\nlast week the slow-down was around 4.5x. This is done in the branch\nstmgc-static-barrier.\n\nOn the other hand, Remi is progressing on the JIT integration in\nthe branch stmgc-c4. \nThis has been working in simple cases since a couple of weeks by now, but the\nresulting \"PyPy-JIT-STM\" often crashes. This is because while the\nbasics are not really hard, we keep hitting new issues that must be\nresolved.\n\nThe basics are that whenever the JIT is about to generate\nassembler corresponding to a load or a store in a GC object, it must\nfirst generate a bit of extra assembler that corresponds to the barrier\nthat we need. This works fine by now (but could benefit from the same\nkind of optimizations described above, to reduce the number of barriers).\nThe additional issues are all more subtle. I will describe the current\none as an example: it is how to write constant pointers inside the assembler.\n\nRemember that the STM library classifies objects as either\n\"public\" or \"protected/private\". A \"protected/private\" object\nis one which has not been seen by another thread so far.\nThis is essential as an optimization, because we know that no\nother thread will access our protected or private objects in parallel,\nand thus we are free to modify their content in place. By contrast,\npublic objects are frozen, and to do any change, we first need to\nbuild a different (protected) copy of the object. See this\nblog\npost for more details.\n\nSo far so good, but the JIT will sometimes (actually often) hard-code\nconstant pointers into the assembler it produces. 
For example, this is the\ncase when the Python code being JITted creates an instance of a known class;\nthe corresponding assembler produced by the JIT will reserve the memory for\nthe instance and then write the constant type pointer in it. This type\npointer is a GC object (in the simple model, it's the Python class object;\nin PyPy it's actually the \"map\" object, which is\na different story).\n\nThe problem right now is that this constant pointer may point to a\nprotected object. This is a problem because the same piece of assembler\ncan later be executed by a different thread. If it does, then this\ndifferent thread will create instances whose type pointer is bogus: looking\nlike a protected object, but actually protected by a different thread.\nAny attempt to use this type pointer to change anything on the class\nitself will likely crash: the threads will all think they can safely change it\nin-place. To fix this, we need to make sure we only write pointers to\npublic objects in the assembler. This is a bit involved because we need\nto ensure that there is a public version of the object to start with.\n\nWhen this is done, we will likely hit the next problem, and the next one;\nbut at some point it should converge (hopefully!) and we'll give you our first\nPyPy-JIT-STM ready to try. Stay tuned :-)\n\nA bient\u00f4t,\n\nArmin.", + "tags": "stm", + "url": "https://www.pypy.org/posts/2013/08/update-on-stm-8705514488940872802.html" + }, + { + "title": "NumPyPy Status Update", + "text": "Hello everyone\n\nAs expected, nditer is a lot of work. 
I'm going to pause my work on it for now and focus on simpler and more important things, here is a list of what I implemented :\n\nFixed a bug on 32 bit that made int32(123).dtype == dtype(\"int32\") fail\nFixed a bug on the pickling of array slices\nThe external loop flag is implemented on the nditer class\nThe c_index, f_index and multi_index flags are also implemented\nAdd dtype(\"double\") and dtype(\"str\")\nC-style iteration is available for nditer\n\nCheers\nRomain Guillebert", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/08/numpypy-status-update-3401163348519734658.html" + }, + { + "title": "PyPy 2.1 - Considered ARMful", + "text": "We're pleased to announce PyPy 2.1, which targets version 2.7.3 of the Python\nlanguage. This is the first release with official support for ARM processors in the JIT.\nThis release also contains several bugfixes and performance improvements.You can download the PyPy 2.1 release here:https://pypy.org/download.htmlWe would like to thank the Raspberry Pi Foundation for supporting the work\nto finish PyPy's ARM support.The first beta of PyPy3 2.1, targeting version 3 of the Python language, was\njust released, more details can be found here.\nWhat is PyPy?PyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. It's fast (pypy 2.1 and cpython 2.7.2 performance comparison)\ndue to its integrated tracing JIT compiler.This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows\n32. This release also supports ARM machines running Linux 32bit - anything with\nARMv6 (like the Raspberry Pi) or ARMv7 (like the Beagleboard,\nChromebook, Cubieboard, etc.) that supports VFPv3 should work. Both\nhard-float armhf/gnueabihf and soft-float armel/gnueabi builds are\nprovided. The armhf builds for Raspbian are created using the Raspberry Pi\ncustom cross-compilation toolchain\nbased on gcc-arm-linux-gnueabihf and should work on ARMv6 and\nARMv7 devices running Debian or Raspbian. 
The armel builds are built\nusing the gcc-arm-linux-gnuebi toolchain provided by Ubuntu and\ncurrently target ARMv7.Windows 64 work is still stalling, we would welcome a volunteer\nto handle that.HighlightsJIT support for ARM, architecture versions 6 and 7, hard- and soft-float ABI\nStacklet support for ARM\nSupport for os.statvfs and os.fstatvfs on unix systems\nImproved logging performance\nFaster sets for objects\nInterpreter improvements\nDuring packaging, compile the CFFI based TK extension\nPickling of numpy arrays and dtypes\nSubarrays for numpy\nBugfixes to numpy\nBugfixes to cffi and ctypes\nBugfixes to the x86 stacklet support\nFixed issue 1533: fix an RPython-level OverflowError for space.float_w(w_big_long_number).\nFixed issue 1552: GreenletExit should inherit from BaseException.\nFixed issue 1537: numpypy __array_interface__\nFixed issue 1238: Writing to an SSL socket in PyPy sometimes failed with a \"bad write retry\" message.\nCheers,David Schneider for the PyPy team.", + "tags": "", + "url": "https://www.pypy.org/posts/2013/08/pypy-21-considered-armful-7177475722033479233.html" + }, + { + "title": "PyPy Demo Evening in London, August 27, 2013", + "text": "As promised in the London sprint announcement we are organising a PyPy demo\nevening during the London sprint on Tuesday, August 27 2013, 18:30-19:30 (BST). The\ndescription of the event is below. If you want to come, please register on the\nEventbrite page.\n\nPyPy is a fast Python VM. Maybe you've never used PyPy and want to find out\nwhat use it might be for you? Or you and your organisation have been using it\nand you want to find out more about how it works under the hood? If so, this\ndemo session is for you!\nMembers of the PyPy team will give a series of lightning talks on PyPy: its\nbenefits; how it works; research currently being undertaken to make it\nfaster; and unusual uses it can be put to. Speakers will be available\nafterwards for informal discussions. 
This is the first time an event like\nthis has been held in the UK, and is a unique opportunity to speak to core\npeople. Speakers confirmed thus far include: Armin Rigo, Maciej Fija\u0142kowski,\nCarl Friedrich Bolz, Lukas Diekmann, Laurence Tratt, Edd Barrett.\nThe venue for this talk is the Software Development Team, King's College\nLondon. The main entrance is on the Strand, from where the room for the event\nwill be clearly signposted. Travel directions can be found at\nhttps://www.kcl.ac.uk/campuslife/campuses/directions/strand.aspx\nIf you have any questions about the event, please contact Laurence Tratt", + "tags": "", + "url": "https://www.pypy.org/posts/2013/07/pypy-demo-evening-in-london-august-27-3640213278969666664.html" + }, + { + "title": "PyPy3 2.1 beta 1", + "text": "We're pleased to announce the first beta of the upcoming 2.1 release of\nPyPy3. This is the first release of PyPy which targets Python 3 (3.2.3)\ncompatibility.We would like to thank all of the people who donated to the py3k proposal\nfor supporting the work that went into this and future releases.You can download the PyPy3 2.1 beta 1 release here:https://pypy.org/download.html#pypy3-2-1-beta-1HighlightsThe first release of PyPy3: support for Python 3, targetting CPython 3.2.3!There are some known issues including performance regressions (issues\n#1540 & #1541) slated to be resolved before the final release.\n\nWhat is PyPy?PyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7.3 or 3.2.3. It's fast due to its integrated tracing JIT compiler.This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows\n32. Also this release supports ARM machines running Linux 32bit - anything with\nARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard,\nChromebook, Cubieboard, etc.) 
that supports VFPv3 should work.Windows 64 work is still stalling and we would welcome a volunteer to handle\nthat.How to use PyPy?We suggest using PyPy from a virtualenv. Once you have a virtualenv\ninstalled, you can follow instructions from pypy documentation on how\nto proceed. This document also covers other installation schemes.Cheers,\nthe PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2013/07/pypy3-21-beta-1-8647445024868663902.html" + }, + { + "title": "PyPy 2.1 beta 2", + "text": "We're pleased to announce the second beta of the upcoming 2.1 release of PyPy.\nThis beta adds one new feature to the 2.1 release and contains several bugfixes listed below.You can download the PyPy 2.1 beta 2 release here:https://pypy.org/download.htmlHighlightsSupport for os.statvfs and os.fstatvfs on unix systems.\nFixed issue 1533: fix an RPython-level OverflowError for space.float_w(w_big_long_number).\nFixed issue 1552: GreenletExit should inherit from BaseException.\nFixed issue 1537: numpypy __array_interface__\nFixed issue 1238: Writing to an SSL socket in pypy sometimes failed with a \"bad write retry\" message.\ndistutils: copy CPython's implementation of customize_compiler, don't call\nsplit on environment variables, honour CFLAGS, CPPFLAGS, LDSHARED and\nLDFLAGS.\nDuring packaging, compile the CFFI tk extension.\nWhat is PyPy?PyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7.3. It's fast due to its integrated tracing JIT compiler.This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows\n32. Also this release supports ARM machines running Linux 32bit - anything with\nARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard,\nChromebook, Cubieboard, etc.) that supports VFPv3 should work.Windows 64 work is still stalling, we would welcome a volunteer\nto handle that.How to use PyPy?We suggest using PyPy from a virtualenv. 
Once you have a virtualenv\ninstalled, you can follow instructions from pypy documentation on how\nto proceed. This document also covers other installation schemes.Cheers,\nThe PyPy Team.", + "tags": "", + "url": "https://www.pypy.org/posts/2013/07/pypy-21-beta-2-264349571160808803.html" + }, + { + "title": "PyPy San Francisco Sprint July 27th 2013", + "text": "The next PyPy sprint will be in San Francisco, California. It is a public\nsprint, suitable for newcomers. It will run on Saturday July 27th.Some possible things people will be hacking on the sprint:running your software on PyPy\nmaking your software fast on PyPy\nimproving PyPy's JIT\nimproving Twisted on PyPy\nany exciting stuff you can think of\nIf there are newcomers, we'll run an introduction to hacking on PyPy.Location\nThe sprint will be held at the Rackspace Office:620 Folsom St, Ste 100The doors will open at 10AM and run until 6PM.", + "tags": "", + "url": "https://www.pypy.org/posts/2013/07/pypy-san-francisco-sprint-july-27th-2012-3064530444396960172.html" + }, + { + "title": "PyPy London Sprint (August 26 - September 1 2013)", + "text": "The next PyPy sprint will be in London, United Kingdom for the first\ntime. This is a fully public sprint. PyPy sprints are a very good way\nto get into PyPy development and no prior PyPy knowledge is necessary.\nGoals and topics of the sprint\nFor newcomers:\n\nbring your application/library and we'll help you port it to PyPy,\nbenchmark and profile\ncome and write your favorite missing numpy function\nhelp us work on developer tools like jitviewer\n\nWe'll also work on:\n\nrefactoring the JIT optimizations\nSTM and STM-related topics\nanything else attendees are interested in\n\nExact times\nThe work days should be August 26 - September 1 2013 (Monday-Sunday).\nThe official plans are for people to arrive on the 26th, and\nto leave on the 2nd. 
There will be a break day in the middle.\nWe'll typically start at 10:00 in the morning.\nLocation\nThe sprint will happen within a room of King's College's Strand\nCampus in Central London, UK. There are some travel instructions how to\nget there. We are being hosted by Laurence Tratt and the Software\nDevelopment Team.\nDemo Session\nIf you don't want to come to the full sprint, but still want to chat a\nbit, we are planning to have a demo session on Tuesday August 27. We\nwill announce this separately on the blog. If you are interested, please\nleave a comment.\nRegistration\nIf you want to attend, please register by adding yourself to the\n\"people.txt\" file in Mercurial:\n\nhttps://bitbucket.org/pypy/extradoc/\nhttps://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/london-2013\n\nor on the pypy-dev mailing list if you do not yet have check-in rights:\n\nhttps://mail.python.org/mailman/listinfo/pypy-dev\n\nRemember that you may need a (insert country here)-to-UK power adapter.\nPlease note that UK is not within the Schengen zone, so non-EU and\nnon-Switzerland citizens may require a specific visa. Please check travel\nregulations. Also, the UK uses pound sterling (GBP).", + "tags": "sprint", + "url": "https://www.pypy.org/posts/2013/07/pypy-london-sprint-august-26-september-5156945690440578388.html" + }, + { + "title": "Software Transactional Memory lisp experiments", + "text": "As covered in the previous blog post, the STM subproject of PyPy has been\nback on the drawing board. The result of this experiment is an STM-aware\ngarbage collector written in C. This is finished by now, thanks to Armin's\nand Remi's work, we have a fully functional garbage collector and a STM system\nthat can be used from any C program with enough effort. Using it is more than\na little mundane, since you have to insert write and read barriers by hand\neverywhere in your code that reads or writes to garbage collector controlled\nmemory. 
In the PyPy integration, this manual work is done automatically\nby the STM transformation in the interpreter.\nHowever, to experiment some more, we created a minimal\nlisp-like/scheme-like interpreter\n(called Duhton), that follows closely CPython's implementation strategy.\nFor anyone familiar with CPython's source code, it should be pretty\nreadable. This interpreter works like a normal and very basic lisp variant,\nhowever it comes with a transaction builtin, that lets you spawn transactions\nusing the STM system. We implemented a few demos that let you play with the\ntransaction system. All the demos are running without conflicts, which means\nthere are no conflicting writes to global memory and hence the demos are very\namenable to parallelization. They exercise:\n\narithmetics - demo/many_sqare_roots.duh\nread-only access to globals - demo/trees.duh\nread-write access to local objects - demo/trees2.duh\n\nWith the latter ones being very similar to the classic gcbench. STM-aware\nDuhton can be found in the stmgc repo, while the STM-less Duhton,\nthat uses refcounting, can be found in the duhton repo under the base\nbranch.\nBelow are some benchmarks. Note that this is a little comparing apples to\noranges since the single-threaded duhton uses refcounting GC vs generational\nGC for STM version. Future pypy benchmarks will compare more apples to apples.\nMoreover none of the benchmarks has any conflicts. Time is the total time\nthat the benchmark took (not the CPU time) and there was very little variation\nin the consecutive runs (definitely below 5%).\n\n\n\n\n\n\n\n\n\nbenchmark\n1 thread (refcount)\n1 thread (stm)\n2 threads\n4 threads\n\nsquare\n1.9s\n3.5s\n1.8s\n0.9s\n\ntrees\n0.6s\n1.0s\n0.54s\n0.28s\n\ntrees2\n1.4s\n2.2s\n1.1s\n0.57s\n\n\n\nAs you can see, the slowdown for STM vs single thread is significant\n(1.8x, 1.7x, 1.6x respectively), but still lower than 2x. 
However the speedup\nfrom running on multiple threads parallelizes the problem almost perfectly.\nWhile a significant milestone, we hope the next blog post will cover\nSTM-enabled pypy that's fully working with JIT work ongoing.\nCheers,\nfijal on behalf of Remi Meier and Armin Rigo", + "tags": "stm", + "url": "https://www.pypy.org/posts/2013/07/software-transactional-memory-lisp-7777576128992250197.html" + }, + { + "title": "PyPy 2.1 beta", + "text": "We're pleased to announce the first beta of the upcoming 2.1 release of PyPy.\u00a0This beta contains many bugfixes and improvements, numerous improvements to the\u00a0numpy in pypy effort. The main feature being that the ARM processor support is\u00a0no longer considered alpha level.\n\nWe would like to thank the Raspberry Pi\u00a0Foundation for supporting the work to finish PyPy's ARM support.\n\n\nYou can download the PyPy 2.1 beta release here:\n\nhttps://pypy.org/download.html\n\n\n\n\nHighlights\n\nBugfixes to the ARM JIT backend, so that ARM is now an officially\nsupported processor architecture\nStacklet support on ARM\nInterpreter improvements\nVarious numpy improvements\nBugfixes to cffi and ctypes\nBugfixes to the stacklet support\nImproved logging performance\nFaster sets for objects\n\n\n\n\n\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\u00a0CPython 2.7.3. It's fast due to its integrated tracing JIT compiler.\u00a0This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows\u00a032. Also this release supports ARM machines running Linux 32bit - anything with\u00a0ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard,\u00a0Chromebook, Cubieboard, etc.) that supports VFPv3 should work. Both\u00a0hard-float armhf/gnueabihf and soft-float\u00a0armel/gnueabi builds are\u00a0provided. 
armhf builds for Raspbian are created using the Raspberry Pi\ncustom cross-compilation toolchain\u00a0based on gcc-arm-linux-gnueabihf and should work on ARMv6\u00a0and\u00a0ARMv7 devices running Debian or Raspbian. armel builds are built\u00a0using the gcc-arm-linux-gnuebi toolchain provided by Ubuntu and\u00a0currently target ARMv7.\n\nWindows 64 work is still stalling, we would welcome a volunteer\u00a0to handle that.\n\n\n\n\nHow to use PyPy?\nWe suggest using PyPy from a virtualenv. Once you have a virtualenv\u00a0installed, you can follow instructions from pypy documentation on how\u00a0to proceed. This document also covers other installation schemes.\n\nCheers,\n\nthe PyPy team.", + "tags": "", + "url": "https://www.pypy.org/posts/2013/07/pypy-21-beta-1351105697755187196.html" + }, + { + "title": "EuroPython", + "text": "Hi all,\n\nA short note: if you're at EuroPython right now and wondering if PyPy is\ndead because you don't see the obviously expected talk about PyPy, don't\nworry. PyPy is still alive and kicking. The truth is two-fold: (1) we\nmissed the talk deadline (duh!)... but as importantly, (2) for various\nreasons we chose not to travel to Florence this year after our trip to\nPyCon US. (Antonio Cuni is at Florence but doesn't have a talk about PyPy\neither.)\n\nArmin", + "tags": "", + "url": "https://www.pypy.org/posts/2013/07/europython-8992114341185888806.html" + }, + { + "title": "Py3k status update #11", + "text": "This is the 11th status update about our work on the py3k branch, which we\ncan work on thanks to all of the people who donated to the py3k proposal.Here's some highlights of the progress made since the previous update:PyPy py3k now matches CPython 3's hash code for\nint/float/complex/Decimal/Fraction\nVarious outstanding unicode identifier related issues were\nresolved. E.g. test_importlib/pep263/ucn/unicode all now fully pass. 
Various\nusage of identifiers (in particular type and module names) have been fixed to\nhandle non-ascii names -- mostly around display of reprs and exception\nmessages.\nThe unicodedata database has been upgraded to 6.0.0.\nWindows support has greatly improved, though it could still use some more\nhelp (but so does the default branch to a certain degree).\nProbably the last of the parsing related bugs/features have been taken care\nof.\nOf course various other smaller miscellaneous fixes\nThis leaves the branch w/ only about 5 outstanding failures of the stdlib test\nsuite:test_float1 failing test about containment of floats in collections.\ntest_memoryviewVarious failures: requires some bytes/str changes among other things (Manuel\nJacob's has some progress on this on the py3k-memoryview branch)\ntest_multiprocessing1 or more tests deadlock on some platforms\ntest_sys and test_threading2 failing tests for the New GIL's new API\nProbably the biggest feature left to tackle is the New GIL.We're now pretty close to pushing an initial release. We had planned for one\naround PyCon, but having missed that we've put some more effort into the branch\nto provide a more fully-fledged initial release.Thanks to the following for their contributions: Manuel Jacob, Amaury Forgeot\nd'Arc, Karl Ramm, Jason Chu and Christian Hudon.cheers,\nPhil", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2013/06/py3k-status-update-11-133025715908408072.html" + }, + { + "title": "STM on the drawing board", + "text": "Hi all!\n\nThis is an update about the Software Transactional Memory subproject of\nPyPy. I have some good news of progress. Also,\nRemi Meier will\nlikely help me this summer. He did various\ninvestigations with PyPy-STM for his Master's Thesis and contributed back\na lot of ideas and some code. Welcome again Remi!\n\nI am also sorry that it seems to advance so slowly. Beyond the usual\nexcuses --- I was busy with other things, e.g. 
releasing PyPy 2.0 --- I\nwould like to reassure people: I'm again working on it, and the financial\ncontributions are still there and reserved for STM (almost half the money is\nleft, a big thank you again if you contributed!).\n\nThe real reason for the apparent slowness, though, is that it is really\na research project. It's possible to either have hard deadlines, or to\nfollow various tracks and keep improving the basics, but not both at the\nsame time.\n\nDuring the past month where I have worked again on STM, I worked still on\nthe second option; and I believe it was worth every second of it. Let me try\nto convince you :-)\n\nThe main blocker was that the STM subsystem, written in C, and the\nGarbage Collection (GC) subsystem, written in RPython, were getting\nharder and harder to coordinate. So what I did instead is to give up\nusing RPython in favor of using only C for both. C is a good language\nfor some things, which includes low-level programming where we must take\ncare of delicate multithreading issues; RPython is not a good fit in\nthat case, and wasn't designed to be.\n\nI started a fresh Mercurial repo\nwhich is basically a stand-alone C library. This library (in heavy development\nright now!) gives any C\nprogram some functions to allocate and track GC-managed objects, and\ngives an actual STM+GC combination on these objects. It's possible\n(though rather verbose) to use it directly in C programs, like in a\nsmall example interpreter. Of course the eventual purpose is to link it\nwith PyPy during translation to C, with all the verbose calls\nautomatically generated.\n\nSince I started this, bringing the GC closer to the STM, I kept finding\nnew ways that the two might interact to improve the performance, maybe\nradically. 
Here is a summary of the current ideas.\n\nWhen we run\nmultiple threads, there are two common cases: one is to access (read and write)\nobjects that have only been seen by the current thread; the other is to read\nobjects seen by all threads, like in Python the modules/functions/classes,\nbut not to write to them. Of course, writing to the same object from\nmultiple threads occurs too, and it is handled correctly (that's the whole\npoint), but it is a relatively rare case.\n\nSo each object is classified as \"public\" or \"protected\" (or \"private\",\nwhen they belong to the current transaction). Newly created objects, once\nthey are no longer private, remain protected until\nthey are read by a different thread. Now, the point is to use very\ndifferent mechanisms for public and for protected objects. Public\nobjects are visible by all threads, but read-only in memory; to change\nthem, a copy must be made, and the changes are written to the copy (the\n\"redolog\" approach to STM). Protected objects, on the other hand, are\nmodified in-place, with (if necessary) a copy of them being made\nfor the sole purpose of a possible abort of the transaction (the \"undolog\"\napproach).\n\nThis is combined with a generational GC similar to PyPy's --- but here,\neach thread gets its own nursery and does its own \"minor collections\",\nindependently of the others.\n\nSo objects are by default protected; when another thread tries to follow a\npointer to them, then it is that other thread's job to carefully \"steal\"\nthe object and turn it public (possibly making a copy of it if needed,\ne.g. if it was still a young object living in the original nursery).\n\nThe same object can exist temporarily in multiple versions: any number\nof public copies; at most one active protected copy; and optionally one\nprivate copy per thread (this is the copy as currently seen by the\ntransaction in progress on that thread). 
The GC cleans up the\nunnecessary copies.\n\nThese ideas are variants and extensions of the same basic idea\nof keeping multiple copies with revision numbers to track them.\nMoreover, \"read barriers\" and \"write barriers\" are used by the C program\ncalling into this library in order to be sure that it is accessing the\nright version of the object. In the currently investigated variant\nI believe it should be possible to have rather cheap\nread barriers, which would definitely be a major speed improvement over\nthe previous variants. Actually, as far as I know, it would be a major\nimprovement over most of the other existing STMs: in them, the typical read barrier\ninvolves following chains of pointers, and checking some dictionary to see if this\nthread has a modified local copy of the object. The difference with a\nread barrier that can resolve most cases in a few CPU cycles should be\nhuge.\n\nSo, this is research :-) It is progressing, and at some point I'll be\nsatisfied with it and stop rewriting everything; and then the actual\nintegration into PyPy should be straightforward (there is already code\nto detect where the read and write barriers need to be inserted, where\ntransactions can be split, etc.). Then there is support for the\nJIT to be written, and so on. But more about it later.\n\nThe purpose of this post was to give you some glimpses into what I'm\nworking on right now. As usual, no plan for release yet. But you can\nlook forward to seeing the C library progress. 
I'll probably also start\nsoon some sample interpreter in C, to test the waters (likely a\nrevival of duhton).\nIf you know nothing about Python but all about the C-level\nmultithreading issues, now is a good time to get involved :-)\n\nThanks for reading!\n\nArmin", + "tags": "stm", + "url": "https://www.pypy.org/posts/2013/06/stm-on-drawing-board-1028082727566254104.html" + }, + { + "title": "NumPyPy status update", + "text": "Hello everyone,\n\nMay was the first month I was paid to work on NumPyPy (thanks to all who donated!), here is what I worked on during this period :\n\n\nIt is now possible to use subarrays.\nIt is now possible to pickle ndarrays (including those using subarrays), dtypes and scalars, the pickling protocol is the same as numpy's.\n\n\n\n\nFor June, I plan to work on the nditer class, it seems that there's enough work for an entire month.\n\nCheers\nRomain Guillebert", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/06/numpypy-status-update-3846626188716521472.html" + }, + { + "title": "PyPy 2.0.2 - Fermi Panini", + "text": "We're pleased to announce PyPy 2.0.2. This is a stable bugfix release\nover 2.0 and 2.0.1. You can download it here:\n\nhttps://pypy.org/download.html\nIt fixes a crash in the JIT when calling external C functions (with\nctypes/cffi) in a multithreaded context.\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. It's fast (pypy 2.0 and cpython 2.7.3 performance comparison)\ndue to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64, Mac OS X 64 or\nWindows 32. Support for ARM is progressing but not bug-free yet.\n\n\nHighlights\nThis release contains only the fix described above. 
A crash (or wrong\nresults) used to occur if all these conditions were true:\n\nyour program is multithreaded;\nit runs on a single-core machine or a heavily-loaded multi-core one;\nit uses ctypes or cffi to issue external calls to C functions.\n\nThis was fixed in the branch emit-call-x86 (see the example file\nbug1.py).\nCheers,\narigo et. al. for the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2013/05/pypy-202-fermi-panini-1917947221142595738.html" + }, + { + "title": "PyPy 2.0.1 - Bohr Sm\u00f8rrebr\u00f8d", + "text": "We're pleased to announce PyPy 2.0.1. This is a stable bugfix release\nover 2.0. You can download it here:\n\nhttps://pypy.org/download.html\nThe fixes are mainly about fatal errors or crashes in our stdlib. See\nbelow for more details.\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. It's fast (pypy 2.0 and cpython 2.7.3 performance comparison)\ndue to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64, Mac OS X 64 or\nWindows 32. Support for ARM is progressing but not bug-free yet.\n\n\nHighlights\n\nfix an occasional crash in the JIT that ends in RPython Fatal error:\nNotImplementedError.\nid(x) is now always a positive number (except on int/float/long/complex).\nThis fixes an issue in _sqlite.py (mostly for 32-bit Linux).\nfix crashes of callback-from-C-functions (with cffi) when used together\nwith Stackless features, on asmgcc (i.e. Linux only). Now gevent should\nwork better.\nwork around an eventlet issue with socket._decref_socketios().\n\nCheers,\narigo et. al. 
for the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2013/05/pypy-201-bohr-smrrebrd-6316445093061941482.html" + }, + { + "title": "Numpy Status Update", + "text": "Hello Everyone,\n\nI've started to work on NumPyPy since the end of April and here is a short update :\n\n\nI implemented pickling support on ndarrays and dtypes, it will be compatible with numpy's pickling protocol when the \"numpypy\" module will be renamed to \"numpy\".\nI am now working on subarrays.\n\n\n\n\nI would also like to thank everyone who donated and allowed me to work on this.\n\n\n\nCheers,\n\nRomain Guillebert", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/05/numpy-status-update-4176018422530420763.html" + }, + { + "title": "PyPy 2.0 - Einstein Sandwich", + "text": "We're pleased to announce PyPy 2.0. This is a stable release that brings\na swath of bugfixes, small performance improvements and compatibility fixes.\nPyPy 2.0 is a big step for us and we hope in the future we'll be able to\nprovide stable releases more often.\nYou can download the PyPy 2.0 release here:\n\nhttps://pypy.org/download.html\nThe two biggest changes since PyPy 1.9 are:\n\nstackless is now supported including greenlets, which means eventlet\nand gevent should work (but read below about gevent)\nPyPy now contains release 0.6 of cffi as a builtin module, which\nis preferred way of calling C from Python that works well on PyPy\n\nIf you're using PyPy for anything, it would help us immensely if you fill out\nthe following survey: https://bit.ly/pypysurvey This is for the developers\neyes and we will not make any information public without your agreement.\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. It's fast (pypy 2.0 and cpython 2.7.3 performance comparison)\ndue to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64, Mac OS X 64 or\nWindows 32. 
Windows 64 work is still stalling, we would welcome a volunteer\nto handle that. ARM support is on the way, as you can see from the recently\nreleased alpha for ARM.\n\n\nHighlights\n\nStackless including greenlets should work. For gevent, you need to check\nout pypycore and use the pypy-hacks branch of gevent.\ncffi is now a module included with PyPy. (cffi also exists for\nCPython; the two versions should be fully compatible.) It is the\npreferred way of calling C from Python that works on PyPy.\nCallbacks from C are now JITted, which means XML parsing is much faster.\nA lot of speed improvements in various language corners, most of them small,\nbut speeding up some particular corners a lot.\nThe JIT was refactored to emit machine code which manipulates a \"frame\"\nthat lives on the heap rather than on the stack. This is what makes\nStackless work, and it could bring another future speed-up (not done yet).\nA lot of stability issues fixed.\nRefactoring much of the numpypy array classes, which resulted in removal of\nlazy expression evaluation. On the other hand, we now have more complete\ndtype support and support more array attributes.\n\nCheers,\nfijal, arigo and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2013/05/pypy-20-einstein-sandwich-635158782365435530.html" + }, + { + "title": "PyPy 2.0 alpha for ARM", + "text": "Hello.\nWe're pleased to announce an alpha release of PyPy 2.0 for ARM. This is mostly\na technology preview, as we know the JIT is not yet stable enough for the\nfull release. However please try your stuff on ARM and report back.\nThis is the first release that supports a range of ARM devices - anything with\nARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard, Chromebook,\nCubieboard, etc.) that supports VFPv3 should work. 
We provide builds with\nsupport for both ARM EABI variants: hard-float and some older operating\nsystems soft-float.\nThis release comes with a list of limitations, consider it alpha quality,\nnot suitable for production:\n\nstackless support is missing.\nassembler produced is not always correct, but we successfully managed to\nrun large parts of our extensive benchmark suite, so most stuff should work.\n\nYou can download the PyPy 2.0 alpha ARM release here (including a deb for raspbian):\n\nhttps://pypy.org/download.html\nPart of the work was sponsored by the Raspberry Pi foundation.\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7.3. It's fast due to its integrated tracing JIT compiler.\nThis release supports ARM machines running Linux 32bit. Both hard-float\narmhf and soft-float armel builds are provided. armhf builds are\ncreated using the Raspberry Pi custom cross-compilation toolchain based on\ngcc-arm-linux-gnueabihf and should work on ARMv6 and ARMv7 devices running at\nleast debian or ubuntu. armel builds are built using gcc-arm-linux-gnuebi\ntoolchain provided by ubuntu and currently target ARMv7. If there is interest\nin other builds, such as gnueabi for ARMv6 or without requiring a VFP let us\nknow in the comments or in IRC.\n\n\nBenchmarks\nEverybody loves benchmarks. Here is a table of our benchmark suite\n(for ARM we don't provide it yet on https://speed.pypy.org,\nunfortunately).\nThis is a comparison of Cortex A9 processor with 4M cache and Xeon W3580 with\n8M of L3 cache. The set of benchmarks is a subset of what we run for\nhttps://speed.pypy.org that finishes in reasonable time. 
The ARM machine\nwas provided by Calxeda.\nColumns are respectively:\n\nbenchmark name\nPyPy speedup over CPython on ARM (Cortex A9)\nPyPy speedup over CPython on x86 (Xeon)\nspeedup on Xeon vs Cortex A9, as measured on CPython\nspeedup on Xeon vs Cortex A9, as measured on PyPy\nrelative speedup (how much bigger the x86 speedup is over ARM speedup)\n\n\n\n\n\n\n\n\n\n\n\nBenchmark\nPyPy vs CPython (arm)\nPyPy vs CPython (x86)\nx86 vs arm (pypy)\nx86 vs arm (cpython)\nrelative speedup\n\nai\n3.61\n3.16\n7.70\n8.82\n0.87\n\nbm_mako\n3.41\n2.11\n8.56\n13.82\n0.62\n\nchaos\n21.82\n17.80\n6.93\n8.50\n0.82\n\ncrypto_pyaes\n22.53\n19.48\n6.53\n7.56\n0.86\n\ndjango\n13.43\n11.16\n7.90\n9.51\n0.83\n\neparse\n1.43\n1.17\n6.61\n8.12\n0.81\n\nfannkuch\n6.22\n5.36\n6.18\n7.16\n0.86\n\nfloat\n5.22\n6.00\n9.68\n8.43\n1.15\n\ngo\n4.72\n3.34\n5.91\n8.37\n0.71\n\nhexiom2\n8.70\n7.00\n7.69\n9.56\n0.80\n\nhtml5lib\n2.35\n2.13\n6.59\n7.26\n0.91\n\njson_bench\n1.12\n0.93\n7.19\n8.68\n0.83\n\nmeteor-contest\n2.13\n1.68\n5.95\n7.54\n0.79\n\nnbody_modified\n8.19\n7.78\n6.08\n6.40\n0.95\n\npidigits\n1.27\n0.95\n14.67\n19.66\n0.75\n\npyflate-fast\n3.30\n3.57\n10.64\n9.84\n1.08\n\nraytrace-simple\n46.41\n29.00\n5.14\n8.23\n0.62\n\nrichards\n31.48\n28.51\n6.95\n7.68\n0.91\n\nslowspitfire\n1.28\n1.14\n5.91\n6.61\n0.89\n\nspambayes\n1.93\n1.27\n4.15\n6.30\n0.66\n\nsphinx\n1.01\n1.05\n7.76\n7.45\n1.04\n\nspitfire\n1.55\n1.58\n5.62\n5.49\n1.02\n\nspitfire_cstringio\n9.61\n5.74\n5.43\n9.09\n0.60\n\nsympy_expand\n1.42\n0.97\n3.86\n5.66\n0.68\n\nsympy_integrate\n1.60\n0.95\n4.24\n7.12\n0.60\n\nsympy_str\n0.72\n0.48\n3.68\n5.56\n0.66\n\nsympy_sum\n1.99\n1.19\n3.83\n6.38\n0.60\n\ntelco\n14.28\n9.36\n3.94\n6.02\n0.66\n\ntwisted_iteration\n11.60\n7.33\n6.04\n9.55\n0.63\n\ntwisted_names\n3.68\n2.83\n5.01\n6.50\n0.77\n\ntwisted_pb\n4.94\n3.02\n5.10\n8.34\n0.61\n\n\n\nIt seems that Cortex A9, while significantly slower than Xeon, has higher\nslowdowns with a large interpreter (CPython) than a JIT compiler 
(PyPy). This\ncomes as a surprise to me, especially that our ARM assembler is not nearly\nas polished as our x86 assembler. As for the causes, various people mentioned\nbranch predictor, but I would not like to speculate without actually knowing.\n\n\nHow to use PyPy?\nWe suggest using PyPy from a virtualenv. Once you have a virtualenv\ninstalled, you can follow instructions from pypy documentation on how\nto proceed. This document also covers other installation schemes.\nWe would not recommend using in production PyPy on ARM just quite yet,\nhowever the day of a stable PyPy ARM release is not far off.\nCheers,\nfijal, bivab, arigo and the whole PyPy team", + "tags": "arm,sponsors", + "url": "https://www.pypy.org/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.html" + }, + { + "title": "PyPy 2.0 beta 2 released", + "text": "We're pleased to announce the 2.0 beta 2 release of PyPy. This is a major\nrelease of PyPy and we're getting very close to 2.0 final, however it includes\nquite a few new features that require further testing. Please test and report\nissues, so we can have a rock-solid 2.0 final. It also includes a performance\nregression of about 5% compared to 2.0 beta 1 that we hope to fix before\n2.0 final. The ARM support is not working yet and we're working hard to\nmake it happen before the 2.0 final. The new major features are:\n\nJIT now supports stackless features, that is greenlets and stacklets. This\nmeans that JIT can now optimize the code that switches the context. It enables\nrunning eventlet and gevent on PyPy (although gevent requires some\nspecial support that's not quite finished, read below).\nThis is the first PyPy release that includes cffi as a core library.\nVersion 0.6 comes included in the PyPy library. cffi has seen a lot of\nadoption among library authors and we believe it's the best way to wrap\nC libraries. 
You can see examples of cffi usage in _curses.py and\n_sqlite3.py in the PyPy source code.\n\nYou can download the PyPy 2.0 beta 2 release here:\n\nhttps://pypy.org/download.html\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7.3. It's fast (pypy 2.0 beta 2 and cpython 2.7.3\nperformance comparison) due to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64, Mac OS X 64 or\nWindows 32. It also supports ARM machines running Linux, however this is\ndisabled for the beta 2 release.\nWindows 64 work is still stalling, we would welcome a volunteer\nto handle that.\n\n\nHow to use PyPy?\nWe suggest using PyPy from a virtualenv. Once you have a virtualenv\ninstalled, you can follow instructions from pypy documentation on how\nto proceed. This document also covers other installation schemes.\n\n\nHighlights\n\ncffi is officially supported by PyPy. It comes included in the standard\nlibrary, just use import cffi\nstackless support - eventlet just works and gevent requires pypycore\nand pypy-hacks branch of gevent (which mostly disables cython-based\nmodules)\ncallbacks from C are now much faster. 
pyexpat is about 3x faster, cffi\ncallbacks around the same\n__length_hint__ is implemented (PEP 424)\na lot of numpy improvements\n\n\n\nImprovements since 1.9\n\nJIT hooks are now a powerful tool to introspect the JITting process that\nPyPy performs\nvarious performance improvements compared to 1.9 and 2.0 beta 1\noperations on long objects are now as fast as in CPython (from\nroughly 2x slower)\nwe now have special strategies for dict/set/list which contain\nunicode strings, which means that now such collections will be both faster\nand more compact.", + "tags": "release", + "url": "https://www.pypy.org/posts/2013/04/pypy-20-beta-2-released-4858660312787995512.html" + }, + { + "title": "So, you want to try PyPy", + "text": "Hello.\nDuring the PyCon trip multiple people asked me how exactly they could run\ntheir stuff on PyPy to get the speedups. Now, in an ideal world,\nyou would just swap CPython with PyPy, everything would run tons of times\nfaster and everyone would live happily ever after. However, we don't live in\nan ideal world and PyPy does not speed up everything you could\npotentially run. Chances are that you can run your stuff quite a bit faster, but\nit requires quite a bit more R&D than just that. This blog post is an attempt to\nexplain certain steps that might help. So here we go:\n\nDownload and install PyPy. 2.0 beta 1 or upcoming 2.0 beta 2 would be a good\ncandidate; it's not called a beta for stability reasons.\nRun your tests on PyPy. There is absolutely no need for fast software that\ndoes not work. There might be some failures. Usually they're harmless (e.g.\nyou forgot to close the file); either fix them or at least inspect them. In\nshort, make sure stuff works.\nInspect your stack. In particular, C extensions, while sometimes working, are\na potential source of instability and slowness. Fortunately,\nsince the introduction of cffi, the ecosystem of PyPy-compatible software\nhas been growing. 
Things I know are written with PyPy in mind:\nthe new version of pyOpenSSL will support PyPy via cffi\npsycopg2cffi is the most actively maintained postgres binding for PyPy,\nwith pg8000 reported working\nmysql has a ctypes based implementation (although a cffi-based one would\nbe definitely better)\nPyPy 2.0 beta 2 will come with sqlite-using-cffi\nlxml-cffi\nuWSGI, while working, is almost certainly not the best choice. Try\ntornado, twisted.web, cyclone.io, gunicorn or gevent\n(note: gevent support for PyPy is not quite finished; will write about it\nin a separate blog post, but you can't just use the main branch of gevent)\nconsult (and contribute to) pypy compatibility wiki for details (note\nthat it's community maintained, might be out of date)\n\n\n\n\nHave benchmarks. If you don't have benchmarks, then performance does not\nmatter for you. Since PyPy's warm-up time is bad (and yes, we know, we're\nworking on it), you should leave ample time for warm-ups. Five to ten seconds\nof continuous computation should be enough.\nTry them. If you get lucky, the next step might be to deploy and be happy.\nIf you're unlucky, profile and try to isolate bottlenecks. They might be in\na specific library or they might be in your code. The better you can isolate\nthem, the higher your chances of understanding what's going on.\nDon't take it for granted. PyPy's JIT is very good, but there is a variety\nof reasons that it might not work how you expect it to. A lot of times it\nstarts off slow, but a little optimization can improve the speed as much as\n10x. Since PyPy's runtime is less mature than CPython, there are higher\nchances of finding an obscure corner of the standard library that might be\natrociously slow.\nMost importantly, if you run out of options and you have a reproducible\nexample, please report it. 
A pypy-dev email, popping into #pypy\non irc.freenode.net, or getting hold of me on twitter are good ways.\nYou can also contact me directly at fijall at gmail.com as well. While\nit's cool if the example is slow, a lot of problems only show up on large\nand convoluted examples. As long as I can reproduce it on my machine or I can\nlog in somewhere, I am usually happy to help.\nI typically use a combination of jitviewer, valgrind and\nlsprofcalltree to try to guess what's going on. These tools are all\nuseful, but use them with care. They usually require quite a bit of\nunderstanding before being useful. Also sometimes they're just plain useless\nand you need to write your own analysis.\n\nI hope this summary of steps to take is useful. We hear a lot of stories\nof people trying PyPy, most of them positive, but some of them negative.\nIf you just post \"PyPy didn't work for me\" on your blog, that's\ncool too, but you're missing an opportunity. The reasons may vary from\nsomething serious like \"this is a bad pattern for PyPy GC\" to something\ncompletely hilarious like \"oh, I left this sys._getframe() somewhere\nin my hot loops for debugging\" or \"I used the logging module which uses\nsys._getframe() all over the place\".\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2013/03/so-you-want-to-try-pypy-4702482800824669595.html" + }, + { + "title": "Numpy status update and developer announcement", + "text": "Hello, some good news!\nFirst the update:\n\ndtype support - NumPy on PyPy now supports non-native storage formats.\nDue to a lack of true support for longdoubles in rpython, we decided to back\nout the support of longdouble-as-double which was misleading.\nmissing ndarray attributes - work has been made toward supporting the\ncomplete set of attributes\non ndarrays. 
We are progressing alphabetically, and have made it to d.\nUnsupported attributes, and unsupported arguments to attribute calls\nwill raise a NotImplementedError.\npickling support for numarray - hasn't started yet, but next on the list\nThere has been some work on exposing FFI routines in numpypy.\nBrian Kearns has made progress in improving the numpypy namespace.\nThe python numpypy submodules now more closely resemble their numpy\ncounterparts. Also, translated _numpypy submodules are now more properly\nmapped to the numpy core c-based submodules, furthering the goal of being\nable to install numpy as a pure-python module with few modifications.\n\nAnd now the good news:\nWhile our funding drive over 2012 did not reach our goal, we still managed to\nraise a fair amount of money in donations. So far we only managed to spend around $10 000 of it.\nWe issued a call for additional developers, and are glad to welcome Romain Guillebert and Ronan Lamy\nto the numpypy team. Hopefully we will be able to report on speedier progress soon.\nCheers,\nMatti Picus, Maciej Fijalkowski", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/03/numpy-status-update-and-developer-1503421654591696377.html" + }, + { + "title": "Py3k status update #10", + "text": "This is the tenth status update about our work on the py3k branch, which we\ncan work on thanks to all of the people who donated to the py3k proposal.There's been significant progress since the last update: the linux x86-32\nbuildbot now passes 289 out of approximately 354 modules (with 39 skips) of\nCPython's regression test suite.That means there's only 26 test module failures left! 
The list of major items\nremaining for 3.2 compatibility is now short enough to list here, with their\nrelated tests: Tokenizer support for non-ascii identifiers\ntest_importlib\ntest_pep263\nmemoryview (Manuel Jacob's tackling this on the py3k-memoryview branch)\ntest_memoryview\nmultiprocessing module currently deadlocks\ntest_multiprocessing\nBuggy handling of the new extended unpacking syntax by the compiler:\ntest_unpack_ex\nThe new Global Interpreter Lock and new thread signal handling\ntest_threading\ntest_threadsignals\ntest_sys\nUpgrade unicodedata to 6.0.0 (requires updates to the actual unicodedata\ngeneration script)\ntest_ucn\ntest_unicode\ntest_unicodedata\nCPyExt\ntest_capi (currently crashes)\nUpdate int's hash code to match CPython (float's is already updated on the\npy3k-newhash branch. Note that PyPy 2.x doesn't even totally match\nCPython's hashing)\ntest_decimal\ntest_fractions\ntest_numeric_tower\nMiscellaneous:\ntest_complex\ntest_float\ntest_peepholer\ntest_range\ntest_sqlite (a new cffi based version seems to be coming)\ntest_ssl\ntest_struct\ntest_subprocess\ntest_sys_settrace\ntest_time\nAdditionally there are still a number of failures in PyPy's internal test\nsuite. These tests are usually run against untranslated versions of PyPy during\ndevelopment. However we've now begun running them against a fully translated\nversion of PyPy on the buildbot too (thanks to Amaury for setting this\nup). This further ensures that our tests and implementation are sane. We're getting closer to producing an initial alpha release. Before that happens\nwe'd like to see: further test fixes\nthe results of test runs on other major platforms (e.g. 
linux x86-64 and osx\nseem to have some additional failures as of now)\nsome basic real world testing\nFinally I'd like to thank Manuel Jacob for his various contributions over the\npast month, including fixing the array and ctypes modules among other things,\nand also Amaury Forgeot d'Arc for his ongoing excellent contributions.cheers,\nPhil", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2013/03/py3k-status-update-10-6681398990092286007.html" + }, + { + "title": "10 years of PyPy", + "text": "From a software engineering perspective, 10 years is indistinguishable\nfrom infinity, so I don't care what happens 10 years from now -- as\nlong as you don't blame me. :-) - Guido van Rossum, Python creator.\n10 years is indeed a long time. PyPy was created approximately 10 years ago,\nwith the exact date being lost in the annals of the version control system.\nWe've come a long way during those 10 years, from a \"minimal Python\" that\nwas supposed to serve mostly as an educational tool, through to a vehicle for\nacademic research to a high performance VM for Python and beyond.\nSome facts from the PyPy timeline:\n\nIn 2007, at the end of the EU funding period, we promised the JIT was just around the corner.\nIt turned out we misjudged it pretty badly -- the first usable PyPy was released in 2010.\nAt some point we decided to have a JavaScript backend so one could compile RPython programs\nto JavaScript and run them in a browser. Turned out it was a horrible idea.\nAnother option we tried was using RPython to write CPython C extensions. Again, it turned out RPython\nis a bad language and instead we made a fast JIT, so you don't have to write C extensions.\nWe made N attempts to use LLVM. Seriously, N is 4 or 5. But we haven't fully given up yet :-)\nThey all run into issues one way or another.\nWe were huge fans of ctypes at the beginning. Up to the point where we tried to make\na restricted subset with static types, called rctypes for RPython. 
Turned out to be horrible.\nTwice.\nWe were very hopeful about creating a JIT generator from the beginning. But the first one failed miserably,\ngenerating too much assembler. The second failed too. The third first burned down and then failed.\nHowever, we managed to release a working JIT in 2010, against all odds.\nMartijn Faassen used to ask us \"how fast is PyPy\" so we decided to name an option enabling all\noptimizations \"--faassen\". Then \"--no-faassen\" was naturally added too. Later we\ndecided to grow up and renamed it to \"-O2\", and now \"-Ojit\".\nThe first time the Python interpreter successfully compiled to C, it segfaulted because the code generator used signed chars instead of unsigned chars...\nTo make it more likely to be accepted, the proposal for the EU project contained basically every feature under the sun a language could have. This proved to be annoying, because we had to actually implement all that stuff. Then we had to do a cleanup sprint where we deleted 30% of codebase and 70% of features.\nAt one sprint someone proposed a new software development methodology: 'Terminology-Driven Programming' means to pick a fancy name, then discuss what it could mean, then implement it. Examples: timeshifter, rainbow interpreter, meta-space bubble, hint annotations (all but one of these really existed).\nThere is a conspiracy theory that the reason why translation is so slow is because time is stored away during it, which is later retrieved when an actual program runs to make them appear faster\n\nOverall, it was a really long road. However, 10 years later we are in\ngood shape. 
A quick look at the immediate future: we are approaching\nPyPy 2.0 with stackless+JIT and cffi support,\nthe support for Python 3 is taking shape, non-standard\nextensions like STM are slowly getting ready (more soon), and there are\nseveral non-Python interpreters around the corner (Hippy, Topaz and more).\nCheers,\nfijal, arigo, hodgestar, cfbolz and the entire pypy team.", + "tags": "", + "url": "https://www.pypy.org/posts/2013/02/10-years-of-pypy-634401291726575821.html" + }, + { + "title": "cppyy status update", + "text": "The cppyy module\nprovides C++ bindings for PyPy by using the reflection information extracted\nfrom C++ header files by means of the\nReflex package.\nIn order to support C++11, the goal is to move away from Reflex and instead use\ncling, an interactive\nC++ interpreter, as the backend.\nCling is based on llvm's\nclang.\n\nThe use of a real compiler under the hood has the advantage that it is now\npossible to cover every conceivable corner case.\nThe disadvantage, however, is that every corner case actually has to be\ncovered.\nLife is somewhat easier when calls come in from the python interpreter, as\nthose calls have already been vetted for syntax errors and all lookups are\nwell scoped.\nFurthermore, the real hard work of getting sane responses from and for C++\nin an interactive environment is done in cling, not in the bindings.\nNevertheless, it is proving a long road (but for that matter clang does not\nsupport all of C++11 yet), so here's a quick status update showing that good \nprogress is being made.\n\nThe following example is on CPython, not PyPy, but moving a third\n(after Reflex and\nCINT) backend into place\nunderneath cppyy is straightforward compared to developing the backend\nin the first place.\n\nTake this snippet of C++11 code\n(cpp11.C):\n\n constexpr int data_size() { return 5; }\n\n auto N = data_size();\n\n template\n struct MyMath {\n static auto add(L l, R r) -> decltype(l+r) { return l + r; }\n };\n\n template 
class MyMath;\n\nAs a practical matter, most usage of new C++11 features will live in\nimplementations, not in declarations, and are thus never seen by the bindings.\nThe above example is therefore somewhat contrived, but it will serve to show\nthat these new declarations actually work.\nThe new features used here are\nconstexpr,\nauto, and\ndecltype.\nHere is how you could use these from CPython, using the\nPyROOT\npackage, which has more than a passing resemblance to cppyy, as one is based\non the other:\n\n import ROOT as gbl\n gbl.gROOT.LoadMacro('cpp11.C')\n\n print 'N =', gbl.N\n print '1+1 =', gbl.MyMath(int, int).add(1,1)\n\nwhich, when entered into a file\n(cpp11.py) and executed,\nprints the expected results:\n\n $ python cpp11.py\n N = 5\n 1+1 = 2\n\nIn the example, the C++ code is compiled on-the-fly, rather than first generating\na dictionary as is needed with Reflex.\nA deployment model that utilizes stored pre-compiled information is foreseen\nto work with larger projects, which may have to pull in headers from many places.\n\nWork is going to continue first on C++03 on cling with CPython (about 85% of\nunit tests currently pass), with a bit of work on C++11 support on the side.\nOnce fully in place, it can be brought into a new backend for cppyy, after \nwhich the remaining parts of C++11 can be fleshed out for both interpreters.\n\nCheers,\nWim Lavrijsen", + "tags": "", + "url": "https://www.pypy.org/posts/2013/02/cppyy-status-update-808802896237239604.html" + }, + { + "title": "PyCon Silicon Valley and San Francisco visit", + "text": "Hello everyone.\nWe (Armin Rigo and Maciej Fijalkowski) are visiting San Francisco/Silicon Valley\nfor PyCon and beyond. Alex Gaynor, another core PyPy dev is living there\npermanently. My visiting dates are 12-28 of March, Armin's 11-21st.\nIf you want us to give a talk at your company or simply catch up with us\nfor a dinner\nplease get in touch. 
Write to pypy-dev@python.org, if you want this publicly\nknown or simply send me a mail at fijall@gmail.com if you don't want it public.\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2013/02/hello-everyone-4718797989680066222.html" + }, + { + "title": "Announcing Topaz, an RPython powered Ruby interpreter", + "text": "Hello everyone\n\nLast week, Alex Gaynor announced the first public release of\nTopaz,\na Ruby interpreter written in RPython. This is the culmination of a\npart-time effort over the past 10 months to provide a Ruby interpreter\nthat implements enough interesting constructs in Ruby to show that the\nRPython toolchain can produce a Ruby implementation fast enough to\nbeat what is out there.\n\nDisclaimer\n\nObviously the implementation is very incomplete currently in terms of\navailable standard library. We are working on getting it useable. If\nyou want to try it, grab a\nnightly build.\n\nWe have run some benchmarks from the\nRuby benchmark suite\nand the\nmetatracing VMs experiment. The\npreliminary results are promising, but at this point we are missing so\nmany method implementations that most benchmarks won't run yet. So instead of\nperformance, I'm going to talk about the high-level structure of the\nimplementation.\n\nArchitecture\n\nTopaz interprets a custom bytecode set. The basics are similar to\nSmalltalk VMs, with bytecodes for loading and storing locals and\ninstance variables, sending messages, and stack management. Some\nsyntactical features of Ruby, such as defining classes and modules,\nliteral regular expressions, hashes, ranges, etc also have their own\nbytecodes. 
The third kind of bytecodes are for control flow constructs\nin Ruby, such as loops, exception handling, break, continue, etc.\n\nIn trying to get from Ruby source code to bytecode, we found that the\neasiest way to support all of the Ruby syntax is to write a custom\nlexer and use an RPython port of PLY\n(fittingly called RPly) to create the\nparser from the Ruby yacc grammar.\n\nThe Topaz interpreter uses an ObjectSpace (similar to how PyPy does\nit), to interact with the Ruby world. The object space contains all\nthe logic for wrapping and interacting with Ruby objects from the\nVM. It's __init__ method sets up the core classes, initial globals,\nand creates the main thread (the only one right now, as we do not have\nthreading, yet).\n\nClasses are mostly written in Python. We use ClassDef objects to\ndefine the Ruby hierarchy and attach RPython methods to Ruby via\nClassDef decorators. These two points warrant a little explanation.\n\nHierarchies\n\nAll Ruby classes ultimately inherit from BasicObject. However, most\nobjects are below Object (which is a direct subclass of\nBasicObject). This includes objects of type Fixnum, Float,\nClass, and Module, which may not need all of the facilities of\nfull objects most of the time.\n\nMost VMs treat such objects specially, using tagged pointers to\nrepresent Fixnums, for example. Other VMs (for example from the\nSOM Family)\ndon't. In the latter case, the implementation hierarchy matches the\nlanguage hierarchy, which means that objects like Fixnum share a\nrepresentation with all other objects (e.g. they have class pointers\nand some kind of instance variable storage).\n\nIn Topaz, implementation hierarchy and language hierarchy are\nseparate. The first is defined through the Python inheritance. The\nother is defined through the ClassDef for each Python class, where the\nappropriate Ruby superclass is chosen. The diagram below shows how the\nimplementation class W_FixnumObject inherits directly from\nW_RootObject. 
Note that W_RootObject doesn't have any attrs,\nspecifically no storage for instance variables and no map (for\ndetermining the class - we'll get to that). These attributes are\ninstead defined on W_Object, which is what most other implementation\nclasses inherit from. However, on the Ruby side, Fixnum correctly\ninherits (via Numeric and Integer) from Object.\n\n\n\n\nThis simple structural optimization gives a huge speed boost, but\nthere are VMs out there that do not have it and suffer performance\nhits for it.\n\nDecorators\n\nRuby methods can have symbols in its names that are not allowed as\npart of Python method names, for example !, ?, or =, so we\ncannot simply define Python methods and expose them to Ruby by the\nsame name. \n\nFor defining the Ruby method name of a function, as well as argument\nnumber checking, Ruby type coercion and unwrapping of Ruby objects to\ntheir Python equivalents, we use decorators defined on ClassDef. When\nthe ObjectSpace initializes, it builds all Ruby classes from their\nrespective ClassDef objects. For each method in an implementation\nclass that has a ClassDef decorator, a wrapper method is generated and\nexposed to Ruby. These wrappers define the name of the Ruby method,\ncoerce Ruby arguments, and unwrap them for the Python method.\n\nHere is a simple example:\n\n@classdef.method(\"*\", times=\"int\")\ndef method_times(self, space, times):\n return self.strategy.mul(space, self.str_storage, times)\n\n\nThis defines the method * on the Ruby String class. When this is\ncalled, the first argument is converted into a Ruby Fixnum object\nusing the appropriate coercion method, and then unwrapped into a plain\nPython int and passed as argument to method_times. 
The wrapper\nmethod also supplies the space argument.\n\nObject Structure\n\nRuby objects have dynamically defined instance variables and may\nchange their class at any time in the program (a concept called\nsingleton class\nin Ruby - it allows each object to have unique behaviour). To still\nefficiently access instance variables, you want to avoid dictionary\nlookups and let the JIT know about objects of the same class that have\nthe same instance variables. Topaz, like PyPy (which got it from\nSelf), implements instances using maps, which transforms dictionary\nlookups into array accesses. See the\nblog post\nfor the details.\n\nThis is only a rough overview of the architecture. If you're\ninterested, get in touch on\n#topaz.freenode.net, follow the\nTopaz Twitter account or contribute\non GitHub.\n\nTim Felgentreff", + "tags": "", + "url": "https://www.pypy.org/posts/2013/02/announcing-topaz-rpython-powered-ruby-6662407703061538341.html" + }, + { + "title": "CFFI 0.5", + "text": "Hi all,\n\nA short notice to tell you that CFFI 0.5 was released. This\ncontains a number of small improvements from 0.4, but seems to otherwise\nbe quite stable since a couple of months --- no change since January 10,\napart from the usual last-minute fixes for Python 3 and for Windows.\n\nHave fun!\n\nArmin", + "tags": "", + "url": "https://www.pypy.org/posts/2013/02/cffi-05-1630643916751622710.html" + }, + { + "title": "NumPyPy 2013 Developer Position", + "text": "Introduction\nProposed herein is a part-time fellowship for developing NumPy in PyPy.\nThe work will initially consist of 100 hours\nwith the possibility of extension, until the funds run out.\nDevelopment and improvement of PyPy's NumPyPy (as\nwith most Open Source and Free Software) is done as a collaborative process\nbetween volunteer, paid, and academic contributors. 
Due to a successful funding\ndrive but a lack of contributors willing to work directly for PyPy, we find\nourselves in the enviable situation of being able to offer this position.\n\n\nBackground\nPyPy's developers make all PyPy software available to the public\nwithout charge, under PyPy's Open Source copyright license, the\npermissive MIT License. PyPy's license assures that PyPy is equally\navailable to everyone freely on terms that allow both non-commercial\nand commercial activity. This license allows for academics, for-profit\nsoftware developers, volunteers and enthusiasts alike to collaborate\ntogether to make a better Python implementation for everyone.\nNumPy support for PyPy is licensed similarly, and therefore NumPy in\nPyPy support can directly help researchers and developers who seek to\ndo numeric computing but want an easier programming language to use\nthan Fortran or C, which is typically used for these\napplications. Being licensed freely to the general public means that\nopportunities to use, improve and learn about how NumPy in PyPy works\nitself will be generally available to everyone.\n\n\nThe Need for a Part-Time Developer\nNumPy project in PyPy has seen some slow, but steady progress since we started\nworking about a year ago. On one hand,\nit's actually impressive what we could deliver with the effort undertaken,\non the other hand, we would like to see the development accelerated.\nPyPy has strict coding, testing, documentation, and review standards,\nwhich ensures excellent code quality, continually improving\ndocumentation and code test coverage, and minimal regressions. A\npart-time developer will be able to bring us closer to the goal of\nfull numpy-api implementation and speed improvements.\n\n\nWork Plan\nThe current proposal is split into two parts:\n\nCompatibility:\nThis part covers the core NumPy Python API. 
We'll implement most NumPy APIs\nthat are officially documented and we'll pass most of NumPy's tests that\ncover documented APIs and are not implementation details.\nSpecifically, we don't plan to:\n\nimplement NumPy's C API\nimplement other scientific libraries, like SciPy, matplotlib or biopython\nimplement details that are otherwise agreed by consensus to not have a place\nin PyPy's implementation of NumPy or agreed with NumPy community\nto be implementation details\n\n\nSpeed:\nThis part will cover significant speed improvements in the JIT that would\nmake numeric computations faster. This includes, but is not necessarily\nlimited to:\n\nwrite a set of benchmarks covering various use cases\nteaching the JIT backend (or multiple backends) how to deal with vector\noperations, like SSE\nexperiments with automatic parallelization using multiple threads, akin\nto numexpr\nimproving the JIT register allocator that will make a difference, especially\nfor tight loops\n\nAs with all speed improvements, it's relatively hard to predict exactly\nhow it'll cope, however we expect the results to be within an order\nof magnitude of handwritten C equivalent.\n\n\n\n\nPosition Candidate\nWe would like people who are proficient in NumPy and PyPy (but don't have to be\ncore developers of either) to step up. The developer selection will be done\nby consensus of PyPy core developers and consulted with the Software Freedom\nConservancy for lack of conflict of interest. 
The main criterion will be\npast contributions to the PyPy project, but they don't have to be significant\nin size.\nA candidate for the Developer position will demonstrate the following:\n\nThe ability to write clear, stable, suitable and tested code\nThe ability to understand and extend the JIT capabilities used in NumPyPy.\nA positive presence in PyPy's online community on IRC and the mailing\nlist.\n\nIdeally the Developer will also:\n\nHave familiarity with the infrastructure of the PyPy project (including\nbug tracker and buildbot).\nHave worked to provide education or outreach on PyPy in other forums such as\nworkshops, conferences, and user groups.\n\nConservancy and PyPy are excited to announce the Developer Position.\nRemuneration for the position will be at the rate of 60 USD per hour, through\nthe Software Freedom Conservancy.\nPyPy community is promising to provide necessary guidance and help into\nthe current codebase, however we expect a successful candidate to be able\nto review code and incorporate external patches within two months of the\nstarting date of the contract.\nCandidates should submit their proposal (including their CV) to:\npypy-z@python.org\nThe deadline for this initial round of proposals is February 1, 2013.", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2013/01/numpypy-2013-developer-position-1547805593757893630.html" + }, + { + "title": "Py3k status update #9", + "text": "This is the ninth status update about our work on the py3k branch, which\nwe can work on thanks to all of the people who donated to the py3k\nproposal.Just a very short update on December's work: we're now passing about 223 of\napproximately 355 modules of CPython's regression test suite, up from passing\n194 last month.Some brief highlights:More encoding related issues were addressed. e.g. 
now most if not all the\nmultibytecodec test modules pass.\nFixed some path handling issues (test_os, test_ntpath and\ntest_posixpath now pass)\nWe now pass test_class, test_descr and almost test_builtin (among\nother things): these are notable as they are fairly extensive test suites of\ncore aspects of the language.\nAmaury Forgeot d'Arc continued making progress on CPyExt (thanks again!)\ncheers,\nPhil", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2013/01/py3k-status-update-9-98332471264591773.html" + }, + { + "title": "PyPy related internship at NCAR", + "text": "Hello everyone\nI would like to advertise a PyPy-related summer internship at\nthe National Center for Atmospheric Research, which is located in lovely\nBoulder, Colorado. As for the last year, the mentor will be Davide del Vento,\nwith my possible support on the PyPy side.\nThe full details of the application are to be found on\nthe internship description and make sure you read the requirements\nfirst. Important requirements:\n\nMust currently be enrolled in a United States university.\nOnly students authorized to work for any employer in the United\nStates will be considered for the SIParCS program.\nMust be a graduate or undergraduate who has completed their sophomore year.\n\nIf you happen to fulfill the requirements, to me this sounds like\na great opportunity to spend a summer at NCAR in Boulder hacking on atmospheric\nmodels using PyPy.\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2012/12/pypy-related-internship-at-ncar-7412729710421119926.html" + }, + { + "title": "Py3k status update #8", + "text": "This is the eighth status update about our work on the py3k branch, which\nwe can work on thanks to all of the people who donated to the py3k\nproposal.Just a short update on November's work: we're now passing about 194 of\napproximately 355 modules of CPython's regression test suite, up from passing\n160 last month. 
Many test modules only fail a small number of individual tests\nnow.We'd like to thank Amaury Forgeot d'Arc for his contributions, in particular he\nhas made significant progress on updating CPyExt for Python 3 this month.Some other highlights:test_marshal now passes, and there's been significant progress on\npickling (thanks Kenny Levinsen and Amaury for implementing\nint.{to,from}_bytes)\nWe now have a _posixsubprocess module\nMore encoding related fixes, which affects many failing tests\n_sre was updated and now test_re almost passes\nException behavior is almost complete per the Python 3 specs, what's mostly\nmissing now are the new __context__ and __traceback__ attributes (PEP\n3134)\nFixed some crashes and deadlocks occurring during the regression tests\nWe merged the unicode-strategies branch both to default and to py3k: now we\nhave versions of lists, dictionaries and sets specialized for unicode\nelements, as we already had for strings.\nHowever, for string-specialized containers are still faster in some cases\nbecause there are shortcuts which have not been implemented for unicode yet\n(e.g., constructing a set of strings from a list of strings). The plan is to\ncompletely kill the shortcuts and improve the JIT to produce the fast\nversion automatically for both the string and unicode versions, to have a\nmore maintainable codebase without sacrificing the speed. The autoreds\nbranch (already merged) was a first step in this direction.\ncheers,\nPhilip&Antonio", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2012/12/py3k-status-update-8-3932232806458251730.html" + }, + { + "title": "PyPy San Francisco Sprint Dec 1st - Dec 2nd 2012", + "text": "The next PyPy sprint will be in San Francisco, California. It is a\npublic sprint, suitable for newcomers. It will run on Saturday December 1st and\nSunday December 2nd. 
The goals for the sprint are continued work towards the\n2.0 release as well as code cleanup, we of course welcome any topic which\ncontributors are interested in working on.Some other possible topics are:running your software on PyPy\nwork on PyPy's numpy (status)\nwork on STM (status)\nJIT improvements\nany exciting stuff you can think of\nIf there are newcomers, we'll run the usual introduction to hacking on\nPyPy.\nLocationThe sprint will be held at the Rackspace Office:620 Folsom St, Ste 100\nSan FranciscoThe doors will open at 10AM both days, and run until 6PM both days.Thanks to David Reid for helping get everything set up!", + "tags": "", + "url": "https://www.pypy.org/posts/2012/11/pypy-san-francisco-sprint-dec-1st-dec-5133109101989613355.html" + }, + { + "title": "PyPy 2.0 beta 1", + "text": "We're pleased to announce the 2.0 beta 1 release of PyPy. This release is\nnot a typical beta, in a sense the stability is the same or better than 1.9\nand can be used in production. It does however include a few performance\nregressions documented below that don't allow us to label it as 2.0 final.\n(It also contains many performance improvements.)\nThe main features of this release are support for ARM processor and\ncompatibility with CFFI. It also includes\nnumerous improvements to the numpy in pypy effort, cpyext and performance.\nYou can download the PyPy 2.0 beta 1 release here:\n\nhttps://pypy.org/download.html\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7.3. It's fast (pypy 2.0 beta 1 and cpython 2.7.3\nperformance comparison) due to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64, Mac OS X 64 or\nWindows 32. It also supports ARM machines running Linux.\nWindows 64 work is still stalling, we would welcome a volunteer\nto handle that.\n\n\nHow to use PyPy?\nWe suggest using PyPy from a virtualenv. 
Once you have a virtualenv\ninstalled, you can follow instructions from pypy documentation on how\nto proceed. This document also covers other installation schemes.\n\n\nRegressions\nReasons why this is not PyPy 2.0:\n\nthe ctypes fast path is now slower than it used to be. In PyPy\n1.9 ctypes was either incredibly faster or slower than CPython depending on whether\nyou hit the fast path or not. Right now it's usually simply slower. We're\nprobably going to rewrite ctypes using cffi, which will make it\nuniversally faster.\ncffi (an alternative to interfacing with C code) is very fast, but\nit is missing one optimization that will make it as fast as a native\ncall from C.\nnumpypy lazy computation was disabled for the sake of simplicity.\nWe should reenable this for the final 2.0 release.\n\n\n\nHighlights\n\ncffi is officially supported by PyPy. You can install it normally by\nusing pip install cffi once you have installed PyPy and pip.\nThe corresponding 0.4 version of cffi has been released.\nARM is now an officially supported processor architecture.\nPyPy now works on soft-float ARM/Linux builds. Currently ARM processors\nsupporting the ARMv7 and later ISA that include a floating-point unit are\nsupported.\nThis release contains the latest Python standard library 2.7.3 and is fully\ncompatible with Python 2.7.3.\nIt does not however contain hash randomization, since the solution present\nin CPython is not solving the problem anyway. The reason can be\nfound on the CPython issue tracker.\ngc.get_referrers() is now faster.\nVarious numpy improvements. 
The list includes:\naxis argument support in many places\nfull support for fancy indexing\ncomplex128 and complex64 dtypes\n\n\nJIT hooks are now a powerful tool to introspect the JITting process that\nPyPy performs.\n**kwds usage is much faster in the typical scenario\noperations on long objects are now as fast as in CPython (from\nroughly 2x slower)\nWe now have special strategies for dict/set/list which contain\nunicode strings, which means that now such collections will be both faster\nand more compact.\n\n\n\nThings we're working on\nThere are a few things that did not make it to the 2.0 beta 1, which\nare being actively worked on. Greenlets support in the JIT is one\nthat we would like to have before 2.0 final. Two important items that\nwill not make it to 2.0, but are being actively worked on, are:\n\nFaster JIT warmup time.\nSoftware Transactional Memory.\n\nCheers,\nMaciej Fijalkowski, Armin Rigo and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2012/11/pypy-20-beta-1-2702952243260181341.html" + }, + { + "title": "Py3k status update #7", + "text": "This is the seventh status update about our work on the py3k branch, which\nwe can work on thanks to all of the people who donated to the py3k\nproposal.The biggest news is that this month Philip started to work on py3k in parallel\nto Antonio. 
As such, there was an increased amount of activity.The py3k buildbots now fully translate the branch every night and run the\nPython standard library tests.We currently pass 160 out of approximately 355 modules of CPython's standard\ntest suite, fail 144 and skip approximately 51.Some highlights:dictviews (the objects returned by dict.keys/values/items) has been greatly\nimproved, and now they full support set operators\na lot of tests has been fixed wrt complex numbers (and in particular the\n__complex__ method)\n_csv has been fixed and now it correctly handles unicode instead of bytes\nmore parser fixes, py3k list comprehension semantics; now you can no longer\naccess the list comprehension variable after it finishes\n2to3'd most of the lib_pypy modules (pypy's custom standard lib\nreplacements/additions)\npy3-enabled pyrepl: this means that finally readline works at the command\nprompt, as well as builtins.input(). pdb seems to work, as well as\nfancycompleter to get colorful TAB completions :-)\npy3 round\nfurther tightening/cleanup of the unicode handling (more usage of\nsurrogateescape, surrogatepass among other things)\nas well as keeping up with some big changes happening on the default branch\nand of course various other fixes.\nFinally, we would like to thank Amaury Forgeot d'Arc for his significant\ncontributions.cheers,\nPhilip&Antonio", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2012/11/py3k-status-update-7-6182140595418083307.html" + }, + { + "title": "NumPy status update #5", + "text": "Hello.\nI'm quite excited to inform that work on NumPy in PyPy has been restarted\nand there has been quite a bit of progress on the NumPy front in PyPy in the\npast two months. 
Things that happened:\n\ncomplex dtype support - thanks to matti picus, NumPy on PyPy now supports\ncomplex dtype (only complex128 so far, there is work on the other part)\nbig refactoring - probably the biggest issue we did was finishing\na big refactoring that disabled some speedups (notably lazy computation\nof arrays), but lowered the barrier of implementing cool new features.\nfancy indexing support - all fancy indexing tricks should now work,\nincluding a[b] where b is an array of integers.\nnewaxis support - now you can use newaxis features\nimprovements to ``intp``, ``uintp``, ``void``, ``string`` and record dtypes\n\nFeatures that have active branches, but haven't been merged:\n\nfloat16 dtype support\nmissing ndarray attributes - this is a branch to finish all attributes\non ndarray, hence ending one chapter.\npickling support for numarray - hasn't started yet, but next on the list\n\nMore importantly, we're getting very close to being able to import the python part\nof the original numpy with only import modifications and running its tests.\nMost tests will fail at this point, however it'll be a good start for another\nchapter :-)\nCheers,\nfijal", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2012/11/numpy-status-update-5-5489198414356844587.html" + }, + { + "title": "Cape Town 2012 sprint report", + "text": "Hello.\nWe're about to finish a PyPy sprint in Cape Town, South Africa that was\none of the smallest done so far, only having Armin Rigo and Maciej Fijalkowski\nwith Alex Gaynor joining briefly at the beginning, however also one of the\nlongest, lasting almost 3 weeks. The sprint theme seems to be predominantly\n\"no new features\" and \"spring cleaning\". We overall removed about 20k lines\nof code in the PyPy source tree. The breakdown of things done and worked on:\n\nWe killed SomeObject support in annotation and rtyper. 
This is a modest\ncode saving, however, it reduces the complexity of RPython and also,\nhopefully, improves compile errors from RPython. We're far from done\non the path to have comprehensible compile-time errors, but the first\nstep is always the hardest :)\n\nWe killed some magic in specifying the interface between builtin functions\nand Python code. It used to be possible to write builtin functions like this:\n\ndef f(space, w_x='xyz'):\n\nwhich will magically wrap 'xyz' into a W_StringObject. Right now, instead,\nyou have to write:\n\n@unwrap_spec(w_x=WrappedDefault('xyz'))\ndef f(space, w_x):\n\nwhich is more verbose, but less magical.\n\nWe killed the CExtModuleBuilder which is the last remaining part of\ninfamous extension compiler that could in theory build C extensions\nfor CPython in RPython. This was never working very well and the main\npart was killed long ago.\n\nWe killed various code duplications in the C backend.\n\nWe killed microbench and a bunch of other small-to-medium unused\ndirectories.\n\nWe killed llgraph JIT backend and rewrote it from scratch. Now the llgraph\nbackend is not translatable, but this feature was rarely used and caused\na great deal of complexity.\n\nWe progressed on continulet-jit-3 branch, up to the point of merging\nit into result-in-resops branch, which also has seen a bit of progress.\nPurpose of those two branches:\n\ncontinulet-jit-3: enable stackless to interact with the JIT by killing\nglobal state while resuming from the JIT into the interpreter. This has\nmultiple benefits. For example it's one of the stones on the path to\nenable STM for PyPy. It also opens new possibilities for other optimizations\nincluding Python-Python calls and generators.\nresult-in-resops: the main goal is to speed up the tracing time of PyPy.\nWe found out the majority of time is spent in the optimizer chain,\nwhich faces an almost complete rewrite. 
It also simplifies the storage\nof the operations as well as the number of implicit invariants that have\nto be kept in mind while developing.\n\n\nWe finished and merged the excellent work by Ronan Lamy which makes the\nflow object space (used for abstract interpretation during RPython\ncompilation) independent from the Python interpreter. This means\nwe've achieved an important milestone on the path of separating the RPython\ntranslation toolchain from the PyPy Python interpreter.\n\n\nCheers,\nfijal & armin", + "tags": "", + "url": "https://www.pypy.org/posts/2012/10/cape-town-2012-sprint-report-1612771358321767072.html" + }, + { + "title": "Py3k status update #6", + "text": "This is the sixth status update about our work on the py3k branch, which we\ncan work on thanks to all of the people who donated to the py3k proposal.The coolest news is not about what we did in the past weeks, but what we will\ndo in the next: I am pleased to announce that Philip Jenvey has been\nselected by the PyPy communitiy to be funded for his upcoming work on py3k,\nthanks to your generous donations. He will start to work on it shortly, and he\nwill surely help the branch to make faster progress. I am also particularly\nhappy of this because Philip is the first non-core developer who is getting\npaid with donations: he demonstrated over the past months to be able to work\neffectively on PyPy, and so we were happy to approve his application for the\njob. This means that anyone can potentially be selected in the future, the\nonly strict requirement is to have a deep interest in working on PyPy and to\nprove to be able to do so by contributing to the project.Back to the status of the branch. Most of the work since the last status\nupdate has been done in the area of, guess what? Unicode strings. As usual,\nthis is one of the most important changes between Python 2 and Python 3, so\nit's not surprising. 
The biggest news is that now PyPy internally supports\nunicode identifiers (such as names of variables, functions, attributes, etc.),\nwhereas earlier it supported only ASCII bytes strings. The changes is still\nbarely visible from the outside, because the parser still rejects non-ASCII\nidentifiers, however you can see it with a bit of creativity:>>>> def foo(x): pass\n>>>> foo(**{'\u00e0\u00e8\u00ec\u00f2\u00f9': 42})\nTraceback (most recent call last):\n File \"\", line 1, in \nTypeError: foo() got an unexpected keyword argument '\u00e0\u00e8\u00ec\u00f2\u00f9'\nBefore the latest changes, you used to get question marks instead of the\nproper name for the keyword argument. Although this might seem like a small\ndetail, it is a big step towards a proper working Python 3 interpreter and it\nrequired a couple of days of headaches. A spin-off of this work is that now\nRPython has better built-in support for unicode (also in the default branch):\nfor example, it now supports unicode string formatting (using the percent\noperator) and the methods .encode/.decode('utf-8').Other than that there is the usual list of smaller issues and bugs that got\nfixed, including (but not limited to):teach the compiler when to emit the new opcode DELETE_DEREF (and\nimplement it!)\ndetect when we use spaces and TABs inconsistently in the source code, as\nCPython does\nfix yet another bug related to the new lexically scoped exceptions (this\nis the last one, hopefully)\nport some of the changes that we did to the standard CPython 2.7 tests to\n3.2, to mark those which are implementation details and should not be run on\nPyPy\nFinally, I would like to thank Amaury Forgeot d'Arc and Ariel Ben-Yehuda for\ntheir work on the branch; among other things, Amaury recently worked on\ncpyext and on the PyPy _cffi_backend, while Ariel submitted a patch to\nimplement PEP 3138.", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2012/09/py3k-status-update-6-4049281716377789914.html" + }, + { + 
"title": "PyPy Cape Town Sprint Oct 7th - Oct 21st 2012", + "text": "Hello everyone!\nThe next PyPy sprint will be in Cape Town, South Africa. It is a\npublic sprint, suitable for newcomers. It starts a couple of days\nafter PyCon South Africa, which is on the 4th and 5th of October.\nThis is a relatively unusual sprint in that it is hosted halfway\nacross the world from where most contributors live, so we plan to\nspend some time during those two weeks doing sprinting and some time\ndoing touristy stuff. The goals for the sprint are general progress\nand whatever people are interested in.\nPossible topics:\n\nPyPy release 2.0\nrunning your software on PyPy\nwork on PyPy's numpy (status)\nwork on STM (status)\nJIT improvements\nany exciting stuff you can think of\n\nIf there are newcomers, we'll run the usual introduction to hacking on\nPyPy.\n\nLocation\nThe sprint will be held either in the apartment of fijal, which is in\nTamboerskloof, Cape Town, or in the offices of the Praekelt\nFoundation, located in Woodstock, Cape Town. The Praekelt Foundation\nhas offered to host us, if needed.\nCape Town, as a very touristy place, has tons of accomodation ranging\nin quality from good to amazing. Depending on the sprint location you\nmight need a car.\n\n\nGood to Know\nYou probably don't need visa for South Africa -- consult Wikipedia.\nSouth Africa is a lovely place with lots of stuff to do. You can see\npenguins, elephants, lions and sharks all on one day (or better yet,\non multiple days).\nThere is a wide selection of good restaurants within a reasonable\ndistance of the sprint venue (depending on the venue, either walking\nor driving).\nThe power plug is some weird derivative of an old-english standard,\nbut adapters are easily acquired.\n\n\nWho's Coming?\nIf you'd like to come, please let us know when you will be arriving\nand leaving, as well as what your interests are. 
We'll keep a list of\npeople which we'll update (or you can do so yourself if you have\nbitbucket pypy commit rights).\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2012/09/pypy-cape-town-sprint-oct-7th-oct-21st-5757682347636918027.html" + }, + { + "title": "NumPy on PyPy status update", + "text": "Hello everyone.\nIt's been a while since we posted a numpy work update, but I'm pleased to\ninform you that work on it has been restarted. A lot of the work has been\ndone by Matti Picus, who is one of the newest contributors to the PyPy\nproject. None of the work below has been merged so far, it's work in progress:\n\nComplex dtype support.\nFixing incompatibilities between numpy and pypy's version.\nRefactoring numpypy to simplify the code and make it easier for new\ncontributors.\nReuse most of the numpy's pure python code without modifications.\n\nFinishing this is also the plan for the next month.\nCheers,\nfijal", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2012/09/numpy-on-pypy-status-update-1605312600799448094.html" + }, + { + "title": "CFFI release 0.3", + "text": "Hi everybody,\nWe released CFFI 0.3. This is the first release that supports more\nthan CPython 2.x :-)\n\nCPython 2.6, 2.7, and 3.x are supported (3.3 definitely, but maybe 3.2 or earlier too)\nPyPy trunk is supported.\n\nIn more details, the main news are:\n\nsupport for PyPy. You need to get a trunk version of PyPy, which\ncomes with the built-in module _cffi_backend to use with the CFFI\nrelease. For testing, you can download the Linux 32/64 versions of\nPyPy trunk. The OS/X and Windows versions of _cffi_backend\nare not tested at all so far, so probably don't work yet.\nsupport for Python 3. It is unknown which exact version is\nrequired; probably 3.2 or even earlier, but we need 3.3 to run the\ntests. The 3.x version is not a separate source; it runs out of the same sources. 
Thanks Amaury for starting this port.\nthe main change in the API is that you need to use ffi.string(cdata)\ninstead of str(cdata) or unicode(cdata). The motivation for this\nchange was the Python 3 compatibility. If your Python 2 code used to\ncontain str(), it would interpret the memory content\nas a null-terminated string; but on Python 3 it would just return a\ndifferent string, namely \"\", and proceed without even\na crash, which is bad. So ffi.string() solves it by always returning\nthe memory content as an 8-bit string (which is a str in Python 2 and\na bytes in Python 3).\nother minor API changes are documented at\nhttps://cffi.readthedocs.org/ (grep for version 0.3).\n\nUpcoming work, to be done before release 1.0:\n\nexpose to the user the module cffi.model in a possibly refactored\nway, for people that don't like (or for some reason can't easily use)\nstrings containing snippets of C declarations. We are thinking about\nrefactoring it in such a way that it has a ctypes-compatible\ninterface, to ease porting existing code from ctypes to cffi. Note\nthat this would concern only the C type and function declarations, not\nall the rest of ctypes.\nCFFI 1.0 will also have a corresponding PyPy release. We are thinking\nabout calling it PyPy 2.0 and including the whole of CFFI (instead of\njust the _cffi_backend module like now). 
In other words it will\nsupport CFFI out of the box --- we want to push forward usage of CFFI\nin PyPy :-)\n\nCheers,\nArmin Rigo and Maciej Fija\u0142kowski", + "tags": "releasecffi", + "url": "https://www.pypy.org/posts/2012/08/cffi-release-03-4740491796308953732.html" + }, + { + "title": "C++ objects in cppyy, part 1: Data Members", + "text": "The cppyy module makes it possible to call into C++ from PyPy through the\nReflex package.\nDocumentation and setup instructions are\navailable here.\nRecent work has focused on STL, low-level buffers, and code quality, but also\na lot on pythonizations for the\nCINT backend, which is\nmostly for High Energy Physics (HEP) use only.\nA\nprevious posting walked\nthrough the high-level structure and organization of the module, where it was\nargued why it is necessary to write cppyy in RPython and generate bindings at\nrun-time for the best performance.\nThis posting details how access to C++ data structures is provided and is part\nof a series of 3 postings on C++ object representation in Python: the second\nposting will be about method dispatching, the third will tie up several odds\nand ends by showing how the choices presented here and in part 2 work together\nto make features such as auto-casting possible.\n\n\nWrapping Choices\n\nSay we have a plain old data type (POD), which is the simplest possible\ndata structure in C++.\nLike for example:\n\n struct A {\n int m_i;\n double m_d;\n };\n\nWhat should such a POD look like when represented in Python?\nLet's start by looking at a Python data structure that is functionally\nsimilar, in that it also carries two public data members of the desired\ntypes.\nSomething like this:\n\n class A(object):\n def __init__(self):\n self.m_i = 0\n self.m_d = 0.\n\nAlright, now how to go about connecting this Python class with the former\nC++ POD?\nOr rather, how to connect instances of either.\nThe exact memory layout of a Python\nA\ninstance is up to Python, and likewise the layout of a 
C++\nA instance is up\nto C++.\nBoth layouts are implementation details of the underlying language, language\nimplementation, language version, and the platform used.\nIt should be no surprise then, that for example an\nint in C++ looks\nnothing like a\nPyIntObject, even\nthough it is perfectly possible, in both cases, to point out in memory where\nthe integer value is.\nThe two representations can thus not make use of the same block of memory\ninternally.\nHowever, the requirement is that the access to C++ from Python looks and feels\nnatural in its use, not that the mapping is exact.\nAnother requirement is that we want access to the actual object from both\nPython and C++.\nIn practice, it is easier to provide natural access to C++ from Python than\nthe other way around, because the choices of memory layout in C++ are far more\nrestrictive: the memory layout defines the access, as the actual class\ndefinition is gone at run-time.\nThe best choice then, is that the Python object will act as a proxy to the C++\nobject, with the actual data always being in C++.\n\nFrom here it follows that if the\nm_i data member\nlives in C++, then Python needs some kind of helper to access it.\nConveniently, since version 2.2, Python has a\nproperty construct\nthat can take a getter and setter function that are called when the property\nis used in Python code, and present it to the programmer as if it were a data\nmember.\nSo we arrive at this (note how the\nproperty instance\nis a variable at the class level):\n\n class A(object):\n def __init__(self):\n self._cppthis = construct_new_A()\n m_i = property(get_m_i, set_m_i)\n m_d = property(get_m_d, set_m_d)\n\nThe\nconstruct_new_A\nhelper is not very interesting (the reflection layer can provide for it\ndirectly), and methods are a subject for part 2 of this posting, so focus on\nget_m_i\nand set_m_i.\nIn order for the getter to work, the method needs to have access to the C++\ninstance for which the Python object is a proxy.\nOn 
access, Python will call the getter function with the proxy instance for\nwhich it is called.\nThe proxy has a\n_cppthis data\nmember from which the C++ instance can be accessed (think of it as a pointer)\nand all is good, at least for\nm_i.\nThe second data member\nm_d, however,\nrequires some more work: it is located at some offset into\n_cppthis.\nThis offset can be obtained from the reflection information, which lets the\nC++ compiler calculate it, so details such as\nbyte padding\nare fully accounted for.\nSince the setter also needs the offset, and since both share some more details\nsuch as the containing class and type information of the data member, it is\nnatural to create a custom property class.\nThe getter and setter methods then become bound methods of an instance of that\ncustom property,\nCPPDataMember, and\nthere is one such instance per data member.\nThink of something along these lines:\n\n def make_datamember(cppclass, name):\n cppdm = cppyy.CPPDataMember(cppclass, name)\n return property(cppdm.get, cppdm.set)\n\nwhere the\nmake_datamember\nfunction replaces the call to\nproperty in the\nclass definition above.\n\nNow hold on a minute!\nBefore it was argued that Python and C++ can not share the same underlying\nmemory structure, because of choices internal to the language.\nBut if on the Python side choices are being made by the developer of the\nlanguage bindings, that is no longer a limitation.\nIn other words, why not go through e.g. 
the Python extension API, and do\nthis:\n\n struct A_pyproxy {\n PyObject_HEAD\n int m_i;\n double m_d;\n };\n\nDoing so would save on\nmalloc overhead and remove\na pointer indirection.\nThere are some technical issues specific to PyPy for such a choice: there is\nno such thing as\nPyPyObject_HEAD\nand the layout of objects is not a given as that is decided only at\ntranslation time.\nBut assume that those issues can be solved, and also accept that there is no\nproblem in creating structure definitions like this at run-time, since the\nreflection layer can provide both the required size and access to the\nplacement\nnew operator\n(compare e.g. CPython's\nstruct module).\nThere is then still a more fundamental problem: it must be possible to take\nover ownership in Python from instances created in C++ and vice-versa.\nWith a proxy scheme, that is trivial: just pass the pointer and do the\nnecessary bookkeeping.\nWith an embedded object, however, not every use case can be implemented: e.g.\nif an object is created in Python, passed to C++, and deleted in C++, it\nmust have been allocated independently.\nThe proxy approach is therefore still the best choice, although embedding\nobjects may provide for optimizations in some use cases.\n\n\nInheritance\n\nThe next step, is to take a more complicated C++ class, one with inheritance\n(I'm leaving out details such as constructors etc., for brevity):\n\n class A {\n public:\n virtual ~A() {}\n int m_i;\n double m_d;\n };\n\n class B : public A {\n public:\n virtual ~B() {}\n int m_j;\n };\n\nFrom the previous discussion, it should already be clear what this will look\nlike in Python:\n\n class A(object):\n def __init__(self):\n self._cppthis = construct_new_A()\n m_i = make_datamember('A', 'm_i')\n m_d = make_datamember('A', 'm_d')\n\n class B(A):\n def __init__(self):\n self._cppthis = construct_new_B()\n m_j = make_datamember('B', 'm_j')\n\nThere are some minor adjustments needed, however.\nFor one, the offset of 
the\nm_i data member\nmay be no longer zero: it is possible that a virtual function dispatch table\n(vtable)\npointer is added at the beginning of\nA (an alternative\nis to have the vtable pointer at the end of the object).\nBut if\nm_i is handled the\nsame way as\nm_d, with the\noffset provided by the compiler, then the compiler will add the bits, if any,\nfor the vtable pointer and all is still fine.\nA real problem could come in however, with a call of the\nm_i property on\nan instance of\nB: in that case,\nthe _cppthis\npoints to a B\ninstance, whereas the getter/setter pair expect an\nA instance.\nIn practice, this is usually not a problem: compilers will align\nA and\nB and calculate\nan offset for\nm_j from the start\nof A.\nStill, that is an implementation detail (even though it is one that can be\ndetermined at run-time and thus taken advantage of by the JIT), so it can not\nbe relied upon.\nThe m_i getter\nthus needs to take into account that it can be called with a derived type,\nand so it needs to add an additional offset.\nWith that modification, the code looks something like this (as you would have\nguessed, this is getting more and more into pseudo-code territory, although it\nis conceptually close to the actual implementation in cppyy):\n\n def get_m_i(self):\n return int(self._cppthis + offset(A, m_i) + offset(self.__class__, A))\n\nWhich is a shame, really, because the offset between\nB and\nA is going\nto be zero most of the time in practice, and the JIT can not completely\nelide\nthe offset calculation (as we will see later; it is easy enough to elide if\nself.__class__ is\nA, though).\nOne possible solution is to repeat the properties for each derived class, i.e.\nto have a\nget_B_m_i etc., but\nthat looks ugly on the Python side and anyway\ndoes not work in all cases: e.g. 
with multiple inheritance where there are\ndata members with the same name in both bases, or if\nB itself has a\npublic data member called\nm_i that shadows\nthe one from A.\nThe optimization then, is achieved by making\nB in charge of the\noffset calculations, by making\noffset a method of\nB, like so:\n\n def get_m_i(self):\n return int(self._cppthis + offset(A, m_i) + self.offset(A))\n\nThe insight is that by scanning the inheritance hierarchy of a derived\nclass like B, you\ncan know statically whether it may sometimes need offsets, or whether the\noffsets are always going to be zero.\nHence, if the offsets are always zero, the method\noffset on\nB will\nsimply return the literal\n0 as its\nimplementation, with the JIT taking care of the rest through inlining and\nconstant folding.\nIf the offset could be non-zero, then the method will perform an actual\ncalculation, and it will let the JIT elide the call only if possible.\n\n\nMultiple Virtual Inheritance\n\nNext up would be multiple inheritance, but that is not very interesting: we\nalready have the offset calculation between the actual and base class, which\nis all that is needed to resolve any multiple inheritance hierarchy.\nSo, skip that and move on to multiple virtual inheritance.\nThat that is going to be a tad more complicated will be clear if you show the\nfollowing code snippet to any old C++ hand and see how they respond.\nMost likely you will be told: \"Don't ever do that.\"\nBut if code can be written, it will be written, and so for the sake of the\nargument, what would this look like in Python:\n\n class A {\n public:\n virtual ~A() {}\n int m_a;\n };\n\n class B : public virtual A {\n public:\n virtual ~B() {}\n int m_b;\n };\n\n class C : public virtual A {\n public:\n virtual ~C() {}\n int m_c;\n };\n\n class D : public virtual B, public virtual C {\n public:\n virtual ~D() {}\n int m_d;\n };\n\nActually, nothing changes from what we have seen so far: the scheme as laid\nout above is fully 
sufficient.\nFor example, D\nwould simply look like:\n\n class D(B, C):\n def __init__(self):\n self._cppthis = construct_new_D()\n m_d = make_datamember('D', 'm_d')\n\nPoint being, the only complication added by the multiple virtual\ninheritance, is that navigation of the C++ instance happens with pointers\ninternal to the instance rather than with offsets.\nHowever, it is still a fixed offset from any location to any other location\nwithin the instance as its parts are laid out consecutively in memory (this is\nnot a requirement, but it is the most efficient, so it is what is used in\npractice).\nBut what you can not do, is determine the offset statically: you need a live\n(i.e. constructed) object for any offset calculations.\nIn Python, everything is always done dynamically, so that is of itself not a\nlimitation.\nFurthermore,\nself is already\npassed to the offset calculation (remember that this was done to put the\ncalculation in the derived class, to optimize the common case of zero\noffset), thus a live C++ instance is there precisely when it is needed.\nThe call to the offset calculation is hard to elide, since the instance will\nbe passed to a C++ helper and so the most the JIT can do is guard on the\ninstance's memory address, which is likely to change between traces.\nInstead, explicit caching is needed on the base and derived types, allowing\nthe JIT to elide the lookup in the explicit cache.\n\n\nStatic Data Members and Global Variables\n\nThat, so far, covers all access to instance data members.\nNext up are static data members and global variables.\nA complication here is that a Python\nproperty needs to\nlive on the class in order to work its magic.\nOtherwise, if you get the property, it will simply return the getter function,\nand if you set it, it will dissappear.\nThe logical conclusion then, is that a\nproperty\nrepresenting a static or global variable, needs to live on the class of the\nclass, or the metaclass.\nIf done directly though, that 
would mean that every static data member is\navailable from every class, since all Python classes have the same metaclass,\nwhich is class\ntype (and which is\nits own metaclass).\nTo prevent that from happening and because\ntype is actually\nimmutable, each proxy class needs to have its own custom metaclass.\nFurthermore, since static data can also be accessed on the instance, the\nclass, too, gets a\nproperty object\nfor each static data member.\nExpressed in code, for a basic C++ class, this looks as follows:\n\n class A {\n public:\n static int s_i;\n };\n\nPaired with some Python code such as this, needed to expose the static\nvariable both on the class and the instance level:\n\n meta_A = type(CppClassMeta, 'meta_A', [CPPMetaBase], {})\n meta_A.s_i = make_datamember('A', 's_i')\n\n class A(object):\n __metaclass__ = meta_A\n s_i = make_datamember('A', 's_i')\n\nInheritance adds no complications for the access of static data per se, but\nthere is the issue that the metaclasses must follow the same hierarchy as the\nproxy classes, for the Python method resolution order (MRO) to work.\nIn other words, there are two complete, parallel class hierarchies that map\none-to-one: a hierarchy for the proxy classes and one for their metaclasses.\n\nA parallel class hierarchy is used also in other highly dynamic,\nobject-oriented environments, such as for example\nSmalltalk.\nIn Smalltalk as well, class-level constructs, such as class methods and data\nmembers, are defined for the class in the metaclass.\nA metaclass hierarchy has further uses, such as lazy loading of nested\nclasses and member templates (this would be coded up in the base class of all\nmetaclasses:\nCPPMetaBase), and\nmakes it possible to distribute these over different reflection libraries.\nWith this in place, you can write Python codes like so:\n\n >>>> from cppyy.gbl import A\n >>>> a = A()\n >>>> a.s_i = 42\n >>>> print A.s_i == a.s_i\n True\n >>>> # etc.\n\nThe implementation of the getter for\ns_i 
is a lot\neasier than for instance data: the static data lives at a fixed, global,\naddress, so no offset calculations are needed.\nThe same is done for global data or global data living in namespaces:\nnamespaces are represented as Python classes, and global data are implemented\nas properties on them.\nThe need for a metaclass is one of the reasons why it is easier for namespaces\nto be classes: module objects are too restrictive.\nAnd even though namespaces are not modules, you still can, with\nsome limitations,\nimport from\nthem anyway.\n\nIt is common that global objects themselves are pointers, and therefore it\nis allowed that the stored\n_cppthis is not a\npointer to a C++ object, but rather a pointer to a pointer to a C++ object.\nA double pointer, as it were.\nThis way, if the C++ code updates the global pointer, it will automatically\nreflect on the Python side in the proxy.\nLikewise, if on the Python side the pointer gets set to a different variable,\nit is the pointer that gets updated, and this will be visible on the C++ side.\nIn general, however, the same caveat as for normal Python code applies: in\norder to set a global object, it needs to be set within the scope of that\nglobal object.\nAs an example, consider the following code for a C++ namespace\nNS with\nglobal variable\ng_a, which behaves\nthe same as Python code for what concerns the visibility of changes to the\nglobal variable:\n\n >>>> from cppyy.gbl import NS, A\n >>>> from NS import g_a\n >>>> g_a = A(42) # does NOT update C++ side\n >>>> print NS.g_a.m_i\n 13 # the old value happens to be 13\n >>>> NS.g_a = A(42) # does update C++ side\n >>>> print NS.g_a.m_i\n 42\n >>>> # etc.\n\n\nConclusion\n\nThat covers all there is to know about data member access of C++ classes in\nPython through a reflection layer!\nA few final notes: RPython does not support metaclasses, and so the\nconstruction of proxy classes (code like\nmake_datamember\nabove) happens in Python code instead.\nThere is 
an overhead penalty of about 2x over pure RPython code associated\nwith that, due to extra guards that get inserted by the JIT.\nA factor of 2 sounds like a lot, but the overhead is tiny to begin with, and\n2x of tiny is still tiny and it's not easy to measure.\nThe class definition of the custom property,\nCPPDataMember, is\nin RPython code, to be transparent to the JIT.\nThe actual offset calculations are in the reflection layer.\nHaving the proxy class creation in Python, with structural code in RPython,\ncomplicates matters if proxy classes need to be constructed on-demand.\nFor example, if an instance of an as-of-yet unseen type is returned by a\nmethod.\nExplaining how that is solved is a topic of part 2, method calls, so stay\ntuned.\n\nThis posting laid out the reasoning behind the object representation of C++\nobjects in Python by cppyy for the purpose of data member access.\nIt explained how the chosen representation of offsets gives rise to a very\npythonic representation, which allows Python introspection tools to work as\nexpected.\nIt also explained some of the optimizations done for the benefit of the JIT.\nNext up are method calls, which will be described in part 2.", + "tags": "", + "url": "https://www.pypy.org/posts/2012/08/c-objects-in-cppyy-part-1-data-members-1105848719513737614.html" + }, + { + "title": "Multicore Programming in PyPy and CPython", + "text": "Hi all,\nThis is a short \"position paper\" kind of post about my view (Armin\nRigo's) on the future of multicore programming in high-level languages.\nIt is a summary of the\nkeynote presentation at EuroPython. As I learned by talking with people\nafterwards, I am not a good enough speaker to manage to convey a deeper\nmessage in a 20-minutes talk. I will try instead to convey it in a\n250-lines post...\nThis is about three points:\n\nWe often hear about people wanting a version of Python running without\nthe Global Interpreter Lock (GIL): a \"GIL-less Python\". 
But what we\nprogrammers really need is not just a GIL-less Python --- we need a\nhigher-level way to write multithreaded programs than directly using\nthreads and locks. One way is Automatic Mutual Exclusion (AME), which\nwould give us an \"AME Python\".\nA good enough Software Transactional Memory (STM) system can be used\nas an internal tool to do that.\nThis is what we are building into an \"AME PyPy\".\nThe picture is darker for CPython, though there is a way too. The\nproblem is that when we say STM, we think about either GCC 4.7's STM\nsupport, or Hardware Transactional Memory (HTM). However, both\nsolutions are enough for a \"GIL-less CPython\", but not\nfor \"AME CPython\", due to capacity limitations. For the latter, we\nneed somehow to add some large-scale STM into the compiler.\n\nLet me explain these points in more detail.\n\nGIL-less versus AME\nThe first point is in favor of the so-called Automatic Mutual Exclusion\napproach. The issue with using threads (in any language with or without\na GIL) is that threads are fundamentally non-deterministic. In other\nwords, the programs' behaviors are not reproducible at all, and worse,\nwe cannot even reason about it --- it quickly becomes messy. We would\nhave to consider all possible combinations of code paths and timings,\nand we cannot hope to write tests that cover all combinations. This\nfact is often documented as one of the main blockers towards writing\nsuccessful multithreaded applications.\nWe need to solve this issue with a higher-level solution. Such\nsolutions exist theoretically, and Automatic Mutual Exclusion (AME) is\none of them. The idea of AME is that we divide the execution of each\nthread into a number of \"atomic blocks\". Each block is well-delimited\nand typically large. Each block runs atomically, as if it acquired a\nGIL for its whole duration. 
The trick is that internally we use\nTransactional Memory, which is a technique that lets the system run the\natomic blocks from each thread in parallel, while giving the programmer\nthe illusion that the blocks have been run in some global serialized\norder.\nThis doesn't magically solve all possible issues, but it helps a lot: it\nis far easier to reason in terms of a random ordering of large atomic\nblocks than in terms of a random ordering of lines of code --- not to\nmention the mess that multithreaded C is, where even a random ordering\nof instructions is not a sufficient model any more.\nWhat do such atomic blocks look like? For example, a program might\ncontain a loop over all keys of a dictionary, performing some\n\"mostly-independent\" work on each value. This is a typical example:\neach atomic block is one iteration through the loop. By using the\ntechnique described here, we can run the iterations in parallel\n(e.g. using a thread pool) but using AME to ensure that they appear to\nrun serially.\nIn Python, we don't care about the order in which the loop iterations\nare done, because we are anyway iterating over the keys of a dictionary.\nSo we get exactly the same effect as before: the iterations still run in\nsome random order, but --- and that's the important point --- they\nappear to run in a\nglobal serialized order. In other words, we introduced parallelism, but\nonly under the hood: from the programmer's point of view, his program\nstill appears to run completely serially. Parallelisation as a\ntheoretically invisible optimization... more about the \"theoretically\"\nin the next paragraph.\nNote that randomness of order is not fundamental: there are techniques\nbuilding on top of AME that can be used to force the order of the\natomic blocks, if needed.\n\n\nPyPy and STM/AME\nTalking more precisely about PyPy: the current prototype pypy-stm is\ndoing precisely this. 
In pypy-stm, the length of the atomic blocks is\nselected in one of two ways: either explicitly or automatically.\nThe automatic selection gives blocks corresponding to some small number\nof bytecodes, in which case we have merely a GIL-less Python: multiple\nthreads will appear to run serially, with the execution randomly\nswitching from one thread to another at bytecode boundaries, just like\nin CPython.\nThe explicit selection is closer to what was described in the previous\nsection: someone --- the programmer or the author of some library that\nthe programmer uses --- will explicitly put with thread.atomic: in\nthe source, which delimitates an atomic block. For example, we can use\nit to build a library that can be used to iterate over the keys of a\ndictionary: instead of iterating over the dictionary directly, we would\nuse some custom utility which gives the elements \"in parallel\". It\nwould give them by using internally a pool of threads, but enclosing\nevery handling of an element into such a with thread.atomic block.\nThis gives the nice illusion of a global serialized order, and thus\ngives us a well-behaving model of the program's behavior.\nRestating this differently,\nthe only semantical difference between pypy-stm and\na regular PyPy or CPython is that it has thread.atomic, which is a\ncontext manager that gives the illusion of forcing the GIL to not be\nreleased during the execution of the corresponding block of code. Apart\nfrom this addition, they are apparently identical.\nOf course they are only semantically identical if we ignore performance:\npypy-stm uses multiple threads and can potentially benefit from that\non multicore machines. The drawback is: when does it benefit, and how\nmuch? The answer to this question is not immediate. The programmer\nwill usually have to detect and locate places that cause too many\n\"conflicts\" in the Transactional Memory sense. 
A conflict occurs when\ntwo atomic blocks write to the same location, or when A reads it,\nB writes it, but B finishes first and commits. A conflict\ncauses the execution of one atomic block to be aborted and restarted,\ndue to another block committing. Although the process is transparent,\nif it occurs more than occasionally, then it has a negative impact on\nperformance.\nThere is no out-of-the-box perfect solution for solving all conflicts.\nWhat we will need is more tools to detect them and deal with them, data\nstructures that are made aware of the risks of \"internal\" conflicts when\nexternally there shouldn't be one, and so on. There is some work ahead.\nThe point here is that from the point of view of the final programmer,\nwe get conflicts that we should resolve --- but at any point, our\nprogram is correct, even if it may not be yet as efficient as it could\nbe. This is the opposite of regular multithreading, where programs are\nefficient but not as correct as they could be. In other words, as we\nall know, we only have resources to do the easy 80% of the work and not\nthe remaining hard 20%. So in this model we get a program that has 80%\nof the theoretical maximum of performance and it's fine. In the regular\nmultithreading model we would instead only manage to remove 80% of the\nbugs, and we are left with obscure rare crashes.\n\n\nCPython and HTM\nCouldn't we do the same for CPython? The problem here is that\npypy-stm is implemented as a transformation step during translation,\nwhich is not directly possible in CPython. Here are our options:\n\nWe could review and change the C code everywhere in CPython.\nWe use GCC 4.7, which supports some form of STM.\nWe wait until Intel's next generation of CPUs comes out (\"Haswell\")\nand use HTM.\nWe write our own C code transformation within a compiler (e.g. LLVM).\n\nI will personally file the first solution in the \"thanks but no thanks\"\ncategory. 
If anything, it will give us another fork of CPython that\nwill painfully struggle to keep not more than 3-4 versions behind, and\nthen eventually die. It is very unlikely to be ever merged into the\nCPython trunk, because it would need changes everywhere. Not to\nmention that these changes would be very experimental: tomorrow we might\nfigure out that different changes would have been better, and have to\nstart from scratch again.\nLet us turn instead to the next two solutions. Both of these solutions\nare geared toward small-scale transactions, but not long-running ones.\nFor example, I have no clue how to give GCC rules about performing I/O\nin a transaction --- this seems not supported at all; and moreover\nlooking at the STM library that is available so far to be linked with\nthe compiled program, it assumes short transactions only. By contrast,\nwhen I say \"long transaction\" I mean transactions that can run for 0.1\nseconds or more. To give you an idea, in 0.1 seconds a PyPy program\nallocates and frees on the order of ~50MB of memory.\nIntel's Hardware Transactional Memory solution is both more flexible and\ncomes with a stricter limit. In one word, the transaction boundaries\nare given by a pair of special CPU instructions that make the CPU enter\nor leave \"transactional\" mode. If the transaction aborts, the CPU\ncancels any change, rolls back to the \"enter\" instruction and causes\nthis instruction to return an error code instead of re-entering\ntransactional mode (a bit like a fork()). The software then detects\nthe error code. Typically, if transactions are rarely cancelled, it is\nfine to fall back to a GIL-like solution just to redo these cancelled\ntransactions.\nAbout the implementation: this is done by recording all the changes that\na transaction wants to do to the main memory, and keeping them invisible\nto other CPUs. 
This is \"easily\" achieved by keeping them inside this\nCPU's local cache; rolling back is then just a matter of discarding a\npart of this cache without committing it to memory. From this point of\nview, there is a lot to bet that we are actually talking about the\nregular per-core Level 1 and Level 2 caches --- so any transaction that\ncannot fully store its read and written data in the 64+256KB of the L1+L2\ncaches will abort.\nSo what does it mean? A Python interpreter overflows the L1 cache of\nthe CPU very quickly: just creating new Python function frames takes a\nlot of memory (on the order of magnitude of 1/100 of the whole L1\ncache). Adding a 256KB L2 cache into the picture helps, particularly\nbecause it is highly associative and thus avoids a lot of fake conflicts.\nHowever, as long as the HTM support is limited to L1+L2 caches,\nit is not going to be enough to run an \"AME Python\" with any sort of\nmedium-to-long transaction. It can\nrun a \"GIL-less Python\", though: just running a few hundred or even\nthousand bytecodes at a time should fit in the L1+L2 caches, for most\nbytecodes.\nI would vaguely guess that it will take on the order of 10 years until\nCPU cache sizes grow enough for a CPU in HTM mode to actually be able to\nrun 0.1-second transactions. (Of course in 10 years' time a lot of other\nthings may occur too, including the whole Transactional Memory model\nbeing displaced by something else.)\n\n\nWrite your own STM for C\nLet's discuss now the last option: if neither GCC 4.7 nor HTM are\nsufficient for an \"AME CPython\", then we might want to\nwrite our own C compiler patch (as either extra work on GCC 4.7, or an\nextra pass to LLVM, for example).\nWe would have to deal with the fact that we get low-level information,\nand somehow need to preserve interesting high-level bits through the\ncompiler up to the point at which our pass runs: for example, whether\nthe field we read is immutable or not. 
(This is important because some\ncommon objects are immutable, e.g. PyIntObject. Immutable reads don't\nneed to be recorded, whereas reads of mutable data must be protected\nagainst other threads modifying them.) We can also have custom code to\nhandle the reference counters: e.g. not consider it a conflict if\nmultiple transactions have changed the same reference counter, but just\nresolve it automatically at commit time. We are also free to handle I/O\nin the way we want.\nMore generally, the advantage of this approach over both the current GCC\n4.7 and over HTM is that we control the whole process. While this still\nlooks like a lot of work, it looks doable. It would be possible to come\nup with a minimal patch of CPython that can be accepted into core\nwithout too much trouble (e.g. to mark immutable fields and tweak the\nrefcounting macros), and keep all the cleverness inside the compiler\nextension.\n\n\nConclusion\nI would assume that a programming model specific to PyPy and not\napplicable to CPython has little chance of catching on, as long as PyPy is\nnot the main Python interpreter (which looks unlikely to change anytime\nsoon). Thus as long as only PyPy has AME, it looks like it will not\nbecome the main model of multicore usage in Python. However, I can\nconclude with a more positive note than during the EuroPython\nconference: it is a lot of work, but there is a more-or-less reasonable\nway forward to have an AME version of CPython too.\nIn the meantime, pypy-stm is around the corner, and together with\ntools developed on top of it, it might become really useful and used. I\nhope that in the next few years this work will trigger enough motivation\nfor CPython to follow the ideas.", + "tags": "stm", + "url": "https://www.pypy.org/posts/2012/08/multicore-programming-in-pypy-and-6595343388141556320.html" + }, + { + "title": "NumPyPy non-progress report", + "text": "Hello everyone.\nNot much has happened in the past few months with numpypy development. 
A part\nof the reason was doing other stuff for me, a part of the reason was\nvarious unexpected visa-related admin, a part of the reason was EuroPython\nand a part was long-awaited holiday.\nThe thing that's maybe worth mentioning is that it does not mean the donations\ndisappeared in the mist. PyPy developers are being paid to work on NumPyPy on\nan hourly basis - that means if I decide to take holidays or work on something\nelse, the money is simply staying in the account until later.\nThanks again for all the donations, I hope to get back to this topic soon!\nCheers,\nfijal", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2012/08/hello-everyone-5492331040603503642.html" + }, + { + "title": "CFFI release 0.2.1", + "text": "Hi everybody,We released CFFI 0.2.1 (expected to be 1.0 soon). CFFI is a way to call C from Python.EDIT: Win32 was broken in 0.2. Fixed.This release is only for CPython 2.6 or 2.7. PyPy support is coming in\nthe ffi-backend branch, but not finished yet. CPython 3.x would be\neasy but requires the help of someone.The package is available on bitbucket as well as documented. You\ncan also install it straight from the python package index: pip install cffiContains numerous small changes and support for more C-isms.\nThe biggest news is the support for installing packages that use\nffi.verify() on machines without a C compiler. Arguably, this\nlifts the last serious restriction for people to use CFFI.\nPartial list of smaller changes:mappings between 'wchar_t' and Python unicodes\nthe introduction of ffi.NULL\na possibly clearer API for ffi.new(): e.g. to allocate a single int and obtain a pointer to it, use ffi.new(\"int *\") instead of the old\nffi.new(\"int\")\nand of course a plethora of smaller bug fixes\n\nCFFI uses pkg-config to install itself if available. This helps\nlocate libffi on modern Linuxes. Mac OS/X support is available too\n(see the detailed installation instructions). Win32 should work out\nof the box. 
Win64 has not been really tested yet.\nCheers,\nArmin Rigo and Maciej Fija\u0142kowski", + "tags": "releasecffi", + "url": "https://www.pypy.org/posts/2012/07/cffi-release-02-4800000428934604295.html" + }, + { + "title": "Prototype PHP interpreter using the PyPy toolchain - Hippy VM", + "text": "Hello everyone.\nI'm proud to release the result of a Facebook-sponsored study on the feasibility of\nusing the RPython toolchain to produce a PHP interpreter. The rules were\nsimple: two months; one person; get as close to PHP as possible, implementing\nenough warts and corner cases to be reasonably sure that it answers hard\nproblems in the PHP language. The outcome is called Hippy VM and implements\nmost of the PHP 1.0 language (functions, arrays, ints, floats and strings).\nThis should be considered an alpha release.\nThe resulting interpreter is obviously incomplete \u2013 it does not support all\nmodern PHP constructs (classes are completely unimplemented), builtin functions,\ngrammar productions, web server integration, builtin libraries\netc., etc.. It's just complete enough for me to reasonably be able to\nsay that \u2013 given some engineering effort \u2013 it's possible to provide a rock-solid\nand fast PHP VM using PyPy technologies.\nThe result is available in a Bitbucket repo and is released under the MIT\nlicense.\n\nPerformance\nThe table below shows a few benchmarks comparing Hippy VM to Zend (a standard\nPHP interpreter available in Linux distributions) and HipHop VM (a PHP-to-C++\noptimizing compiler developed by Facebook). 
The versions used were Zend 5.3.2\n(Zend Engine v2.3.0) and HipHop VM heads/vm-0-ga4fbb08028493df0f5e44f2bf7c042e859e245ab\n(note that you need to check out the vm branch to get the newest version).\nThe run was performed on 64-bit Linux running on a Xeon W3580 with 8M of\nL2 cache, which was otherwise unoccupied.\nUnfortunately, I was not able to run it on the JITted version of HHVM, the new effort by Facebook,\nbut people involved with the project told me it's usually slower or comparable with the compiled HipHop.\nTheir JITted VM is still alpha software, so I'll update it as soon as I have the info.\n\n\n\n\n\n\n\n\n\n\n\nbenchmark\nZend\nHipHop VM\nHippy VM\nHippy / Zend\nHippy / HipHop\n\narr\n2.771\n0.508+-0%\n0.274+-0%\n10.1x\n1.8x\n\nfannkuch\n21.239\n7.248+-0%\n1.377+-0%\n15.4x\n5.3x\n\nheapsort\n1.739\n0.507+-0%\n0.192+-0%\n9.1x\n2.6x\n\nbinary_trees\n3.223\n0.641+-0%\n0.460+-0%\n7.0x\n1.4x\n\ncache_get_scb\n3.350\n0.614+-0%\n0.267+-2%\n12.6x\n2.3x\n\nfib\n2.357\n0.497+-0%\n0.021+-0%\n111.6x\n23.5x\n\nfasta\n1.499\n0.233+-4%\n0.177+-0%\n8.5x\n1.3x\n\n\n\n\nThe PyPy compiler toolchain provides a way to implement a dynamic\nlanguage interpreter in a high-level language called RPython. This is\na language which is lower-level than Python, but still higher-level than\nC or C++: for example, RPython is a garbage-collected language. The killer\nfeature is that the toolchain will generate a JIT for your interpreter which\nwill be able to leverage most of the work that has been done on speeding up Python\nin the PyPy project. The resulting JIT is generated for your interpreter, and is not Python-specific.\nThis was one of the toolchain's original design decisions \u2013 in contrast to e.g. 
the JVM,\nwhich was initially only used to interpret Java and later adjusted to serve as a platform for\ndynamic languages.\nAnother important difference is that there is no common bytecode to which you compile both your\nlanguage and Python, so you don't inherit problems presented when implementing language X on top of,\nsay, Parrot VM or the JVM. The PyPy toolchain does not impose constraints on the semantics of\nyour language, whereas the benefits of the JVM only apply to languages that map well onto Java concepts.\nTo read more about creating your own interpreters using the PyPy toolchain,\nread more blog posts or an excellent article by Laurence Tratt.\n\n\nPHP deviations\nThe project's biggest deviation from the PHP specification is probably\nthat GC is no longer reference counting. That means that the object finalizer, when\nimplemented, will not be called directly at the moment of object death, but\nat some later point. There are possible future developments to alleviate that\nproblem, by providing \"refcounted\" objects when leaving the current scope.\nResearch has to be done in order to achieve that.\n\n\nAssessment\nThe RPython toolchain seems to be a cost-effective choice for writing\ndynamic language VMs. It both provides a fast JIT and gives you\naccess to low-level primitives when you need them. A good example is\nin the directory hippy/rpython which contains the implementation\nof an ordered dictionary. An ordered dictionary is not a primitive\nthat RPython provides \u2013 it's not necessary for the goal of\nimplementing Python. Now, implementing it on top of a normal dictionary\nis possible, but inefficient. 
RPython provides a way to work\ndirectly at a lower level, if you desire to do so.\nThings that require improvements in RPython:\n\nLack of mutable strings on the RPython level ended up being a problem.\nI ended up using lists of characters; which are efficient, but inconvenient,\nsince they don't support any string methods.\nFrame handling is too conservative and too Python-specific, especially around\nthe calls. It's possible to implement less general, but simpler and faster\nframe handling implementation in RPython.\n\n\n\nStatus of the implementation\nDon't use it! It's a research prototype intended to assess the feasibility\nof using RPython to create dynamic language VMs. The most notable\nfeature that's missing is reasonable error reporting. That said, I'm\nconfident it implements enough of the PHP language to prove that the full\nimplementation will present the same performance characteristics.\n\n\nBenchmarks\nThe benchmarks are a selection of computer language shootout benchmarks, as well\nas cache_get_scb, which is a part of old Facebook code. All benchmarks other\nthan this one (which is not open source, but definitely the most interesting :( ) are\navailable in the bench directory. The Python program to run them is called\nrunner.py and is in the same directory. It runs them 10 times, cutting off the first\n3 runs (to ignore the JIT warm-up time) and averaging the rest. As you can see\nthe standard deviation is fairly minimal for all interpreters and runs; if\nit's omitted it means it's below 0.5%.\nThe benchmarks were not selected for their ease of optimization \u2013 the optimizations\nin the interpreter were written specifically for this set of benchmarks. No special JIT\noptimizations were added, and barring what's mentioned below a vanilla PyPy 1.9 checkout\nwas used for compilation.\n\n\nSo, how fast will my website run if this is completed?\nThe truth is that I lack the benchmarks to be able to answer that right now. 
The core\nof the PHP language is implemented up to the point where I'm confident\nthat the performance will not change as we get more of the PHP going.\n\n\nHow do I run it?\nGet a PyPy checkout, apply the diff if you want to squeeze out the last\nbits of performance and run pypy-checkout/pypy/bin/rpython targethippy.py to\nget an executable that resembles a PHP interpreter. You can also directly run\npython targethippy.py file.php, but this will be about 2000x slower.\n\n\nRPython modifications\nThere was a modification that I did to the PyPy source code; the diff\nis available. It's trivial, and should simply be made optional in the\nRPython JIT generator, but it was easier just to do it, given the very constrained time\nframe.\n\ngen_store_back_in_virtualizable was disabled. This feature is\nnecessary for Python frames but not for PHP frames. PHP frames\ndo not have to be kept alive after we exit a function.\n\n\n\nFuture\nHippy is a cool prototype that presents a very interesting path towards a fast\nPHP VM. However, at the moment I have too many other open source commitments\nto take on the task of completing it in my spare time. I do think that this project\nhas a lot of potential, but I will not commit to any further development at\nthis time. If you send pull requests I'll try to review them. 
I'm also open\nto having further development on this project funded, so if you're interested\nin this project and the potential of a fast PHP interpreter, please get in\ntouch.\n\nCheers,\nfijal\nEDIT: Fixed the path to the rpython binary", + "tags": "", + "url": "https://www.pypy.org/posts/2012/07/hello-everyone-6869934374873967346.html" + }, + { + "title": "Py3k status update #5", + "text": "This is the fifth status update about our work on the py3k branch, which we\ncan work on thanks to all of the people who donated to the py3k proposal. Apart from the usual \"fix shallow py3k-related bugs\" part, most of my work in\nthis iteration has been to fix the bootstrap logic of the interpreter, in\nparticular to set up the initial sys.path. Until a few weeks ago, the logic to determine sys.path was written entirely\nat app-level in pypy/translator/goal/app_main.py, which is automatically\nincluded inside the executable during translation. The algorithm is more or\nless like this:\nfind the absolute path of the executable by looking at sys.argv[0]\nand cycling through all the directories in PATH\nstarting from there, go up in the directory hierarchy until we find a\ndirectory which contains lib-python and lib_pypy\nThis works fine for Python 2 where the paths and filenames are represented as\n8-bit strings, but it is a problem for Python 3 where we want to use unicode\ninstead. In particular, whenever we try to encode an 8-bit string into a\nunicode, PyPy asks the _codecs built-in module to find the suitable\ncodec. Then, _codecs tries to import the encodings package, to list\nall the available encodings. encodings is a package of the standard\nlibrary written in pure Python, so it is located inside\nlib-python/3.2. But at this point in time we have yet to add\nlib-python/3.2 to sys.path, so the import fails. 
Bootstrap problem! The hard part was to find the problem: since it is an error which happens so\nearly, the interpreter is not even able to display a traceback, because it\ncannot yet import traceback.py. The only way to debug it was through some\ncarefully placed print statements and the help of gdb. Once the problem\nwas found, the solution was as easy as moving part of the logic to RPython,\nwhere we don't have bootstrap problems. Once the problem was fixed, I was able to finally run all the CPython tests\nagainst the compiled PyPy. As expected there are lots of failures, and fixing\nthem will be the topic of my next months.", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2012/07/py3k-status-update-5-359698189825543897.html" + }, + { + "title": "EuroPython sprint", + "text": "Hi all,\n\nEuroPython is next week. We will actually be giving a presentation on Monday, in one of the plenary talks: PyPy: current status and GIL-less future. This is the first international PyPy keynote we give, as far as I know, but not the first keynote about PyPy [David Beazley's video] :-)\n\nThe other talks are PyPy JIT under the hood and to some extent Performance analysis tools for JITted VMs. This year we are also trying out a help desk. 
Finally, we will have the usual sprint after EuroPython on Saturday and Sunday.\n\nSee you soon!\n\nArmin.", + "tags": "", + "url": "https://www.pypy.org/posts/2012/06/europython-sprint-5668923199392472912.html" + }, + { + "title": "Architecture of Cppyy", + "text": "The cppyy module makes it possible to call into C++ from PyPy through the\nReflex package.\nWork started about two years ago, with a follow-up sprint a year later.\nThe module has now reached an acceptable level of maturity and initial\ndocumentation with setup instructions, as well as a list of the currently\nsupported language features, are now available here.\nThere is a sizable (non-PyPy) set of unit and application tests that is still\nbeing worked through, not all of them of general applicability, so development\ncontinues its current somewhat random walk towards full language coverage.\nHowever, if you find that cppyy by and large works for you except for certain\nspecific features, feel free to ask for them to be given higher priority.\nCppyy handles bindings differently than what is typically found in other\ntools with a similar objective, so this update walks through some of these\ndifferences, and explains why choices were made as they are.\nThe most visible difference, is from the viewpoint of the Python programmer\ninteracting with the module.\nThe two canonical ways of making Python part of a larger environment, are to\neither embed or extend it.\nThe latter is done with so-called extension modules, which are explicitly\nconstructed to be very similar in their presentation to the Python programmer\nas normal Python modules.\nIn cppyy, however, the external C++ world is presented from a single entrance\npoint, the global C++ namespace (in the form of the variable cppyy.gbl).\nThus, instead of importing a package that contains your C++ classes, usage\nlooks like this (assuming class MyClass in the global namespace):\n\n>>>> import cppyy\n>>>> m = cppyy.gbl.MyClass()\n>>>> # etc.\n\nThis is 
more natural than it appears at first: C++ classes and functions are,\nonce compiled, represented by unique linker symbols, so it makes sense to give\nthem their own unique place on the Python side as well.\nThis organization allows pythonizations of C++ classes to propagate from one\ncode to another, ensures that all normal Python introspection (such as\nissubclass and isinstance) works as expected in all cases, and that it\nis possible to represent C++ constructs such as typedefs simply by Python\nreferences.\nAchieving this unified presentation would clearly require a lot of internal\nadministration to track all C++ entities if they each lived in their own,\npre-built extension modules.\nSo instead, cppyy generates the C++ bindings at run-time, which brings us to\nthe next difference.\nThen again, that is not really a difference: when writing or generating a\nPython extension module, the result is some C code that consists of calls into\nPython, which then gets compiled.\nHowever, it is not the bindings themselves that are compiled; it is the code\nthat creates the bindings that gets compiled.\nIn other words, any generated or hand-written extension module does exactly\nwhat cppyy does, except that they are much more specific in that the bound\ncode is hard-wired with e.g. fixed strings and external function calls.\nThe upshot is that in Python, where all objects are first-class and run-time\nconstructs, there is no difference whatsoever between bindings generated at\nrun-time, and bindings generated at ... 
well, run-time really.\nThere is a difference in organization, though, which goes back to the first\npoint of structuring the C++ class proxies in Python: given that a class will\nsettle in a unique place once bound, instead of inside a module that has no\nmeaning in the C++ world, it follows that it can also be uniquely located in\nthe first place.\nIn other words, cppyy can, and does, make use of a class loader to\nauto-load classes on-demand.\nIf at this point, this all reminds you a bit of ctypes, just with some extra\nbells and whistles, you would be quite right.\nIn fact, internally cppyy makes heavy use of the RPython modules that form the\nguts of ctypes.\nThe difficult part of ctypes, however, is the requirement to annotate\nfunctions and structures.\nThat is not very pleasant in C, but in C++ there is a whole other level of\ncomplexity in that the C++ standard specifies many low-level details, that are\nrequired for dispatching calls and understanding object layout, as\n\"implementation defined.\"\nOf course, in the case of Open Source compilers, getting at those details is\ndoable, but having to reverse engineer closed-source compilers gets old rather\nquickly in more ways than one.\nMore generally, these implementation defined details prevent a clean interface,\ni.e. 
without a further dependency on the compiler, into C++ like the one that\nthe CFFI module provides for C.\nStill, once internal pointers have been followed, offsets have been calculated,\nthis objects have been provided, etc., etc., the final dispatch into binary\nC++ is no different than that into C, and cppyy will therefore be able to make\nuse of CFFI internally, like it does with ctypes today.\nThis is especially relevant in the CLang/LLVM world, where stub functions\nare done away with.\nTo get the required low-level details then, cppyy relies on a back-end, rather\nthan getting it from the programmer, and this is where Reflex (together with\nthe relevant C++ compiler) comes in, largely automating this tedious process.\nThere is nothing special about Reflex per se, other than that it is relatively\nlightweight, available, and has proven to be able to handle huge code bases.\nIt was a known quantity when work on cppyy started, and given the number\nof moving parts in learning PyPy, that was a welcome relief.\nReflex is based on gccxml, and can therefore handle pretty much any C or\nC++ code that you care to throw at it.\nIt is also technically speaking obsolete as it will not support C++11, since\ngccxml won't, but its expected replacement, based on CLang/LLVM, is not\nquite there yet (we are looking at Q3 of this year).\nIn cppyy, access to Reflex, or any back-end for that matter, is through a\nthin C API (see the schematic below): cppyy asks high level questions to the\nback-end, and receives low-level results, some of which are in the form of\nopaque handles.\nThis ensures that cppyy is not tied to any specific back-end.\nIn fact, currently it already supports another, CINT, but that back-end is\nof little interest outside of High Energy Physics (HEP).\nThe Python side is always the same, however, so any Python code based on cppyy\ndoes not have to change if the back-end changes.\nTo use the system, a back-end specific tool (genreflex for Reflex) is\nfirst 
run on a set of header files with a selection file for choosing the\nrequired classes.\nThis produces a C++ file that must be compiled into a shared library, and a\ncorresponding map file for the class loader.\nThese shared libraries, with their map files alongside, can be put anywhere\nas long as they can be located through the standard paths for the dynamic\nloader.\nWith that in place, the setup is ready, and the C++ classes are available to\nbe used from cppyy.\n\nSo far, nothing that has been described is specific to PyPy.\nIn fact, most of the technologies described have been used for a long time\non CPython already, so why the need for a new, PyPy-specific, module?\nTo get to that, it is important to first understand how a call is mediated\nbetween Python and C++.\nIn Python, there is the concept of a PyObject, which has a reference count, a\npointer to a type object, and some payload.\nThere are APIs to extract the low-level information from the payload for use\nin the C++ call, and to repackage any results from the call.\nThis marshalling is where the bulk of the time is spent when dispatching.\nTo be absolutely precise, most C++ extension module generators produce slow\ndispatches because they don't handle overloads efficiently, but even in there,\nthey still spend most of their time in the marshalling code, albeit in calls\nthat fail before trying the next overload.\nIn PyPy, speed is gained by having the JIT unbox objects into the payload only,\nallowing it to become part of compiled traces.\nIf the same marshalling APIs were used, the JIT is forced to rebox the payload,\nhand it over through the API, only to have it unboxed again by the binding.\nDoing so is dreadfully inefficient.\nThe objective of cppyy, then, is to keep all code transparent to the JIT until\nthe absolute last possible moment, i.e. 
the call into C++ itself, therefore\nallowing it to (more or less) directly pass the payload it already has, with\nan absolute minimal amount of extra work.\nIn the extreme case when the binding is not to a call, but to a data member of\nan object (or to a global variable), the memory address is delivered to the\nJIT and this results in direct access with no overhead.\nNote the interplay: cppyy in PyPy does not work like a binding in the CPython\nsense that is a back-and-forth between the interpreter and the extension.\nInstead, it does its work by being transparent to the JIT, allowing the JIT to\ndissolve the binding.\nAnd with that, we have made a full circle: if to work well with the JIT, and\nin so doing achieve the best performance, you can not have marshalling or do\nany other API-based driving, then the concept of compiled extension modules is\nout, and the better solution is in run-time generated bindings.\nThat leaves one final point.\nWhat if you do want to present an extension module-like interface to\nprogrammers that use your code?\nBut of course, this is Python: everything consists of first-class objects,\nwhose behavior can be changed on the fly.\nIn CPython, you might hesitate to make such changes, as every overlay or\nindirection results in quite a bit of overhead.\nWith PyPy, however, these layers are all optimized out of existence, making\nthat a non-issue.\nThis posting laid out the reasoning behind the organization of cppyy.\nA follow-up is planned, to explain how C++ objects are handled and\nrepresented internally.\nWim Lavrijsen", + "tags": "", + "url": "https://www.pypy.org/posts/2012/06/architecture-of-cppyy-9077100041707701102.html" + }, + { + "title": "Release 0.1 of CFFI", + "text": "Hi. We're pleased to announce the first public release, 0.1 of CFFI, a way to call C from Python.\n(This release does not support PyPy yet --- but we announce it here as it is planned for the\nnext release :-) The package is available on bitbucket as well as 
documented. You can also install it\nstraight from the python package index (pip). The aim of this project is to provide a convenient and reliable way of calling C code from Python.\nThe interface is based on LuaJIT's FFI and follows a few principles:\nThe goal is to call C code from Python. You should be able to do so\nwithout learning a 3rd language: every alternative requires you to learn\ntheir own language (Cython, SWIG) or API (ctypes). So we tried to\nassume that you know Python and C and minimize the extra bits of API that\nyou need to learn.\nKeep all the Python-related logic in Python so that you don't need to\nwrite much C code (unlike CPython native C extensions).\nWork either at the level of the ABI (Application Binary Interface)\nor the API (Application Programming Interface). Usually, C\nlibraries have a specified C API but often not an ABI (e.g. they may\ndocument a \"struct\" as having at least these fields, but maybe more).\n(ctypes works at the ABI level, whereas Cython or native C extensions\nwork at the API level.)\nWe try to be complete. For now some C99 constructs are not supported,\nbut all C89 should be, including macros (and including macro \"abuses\",\nwhich you can manually wrap in saner-looking C functions).\nWe attempt to support both PyPy and CPython (although PyPy support is not\ncomplete yet) with a reasonable path for other Python implementations like\nIronPython and Jython.\nNote that this project is not about embedding executable C code in\nPython, unlike Weave. This is about calling existing C libraries\nfrom Python.\nStatus of the project\nConsider this as a beta release. Creating CPython extensions is fully supported and the API should\nbe relatively stable; however, minor adjustments of the API are possible. PyPy support is not yet done and this is a goal for the next release. 
There are vague plans to make this the\npreferred way to call C from Python that can reliably work between PyPy and CPython.Right now CFFI's verify() requires a C compiler and header files to be available at run-time.\nThis limitation will be lifted in the near future and it'll contain a way to cache the resulting binary.Cheers,\n\nArmin Rigo and Maciej Fija\u0142kowski", + "tags": "releasecffi", + "url": "https://www.pypy.org/posts/2012/06/release-01-of-cffi-4760622823232463868.html" + }, + { + "title": "STM with threads", + "text": "Hi all,A quick update. The first version of pypy-stm based on regular\nthreads is ready. Still having no JIT and a 4-or-5-times performance\nhit, it is not particularly fast, but I am happy that it turns out not\nto be much slower than the previous thread-less attempts. It is at\nleast fast enough to run faster (in real time) than an equivalent no-STM\nPyPy, if fed with an eight-threaded program on an eight-core machine\n(provided, of course, you don't mind it eating all 8 cores' CPU power\ninstead of just one :-).You can download and play around with this binary for Linux 64. It\nwas made from the stm-thread branch of the PyPy repository (translate.py --stm -O2 targetpypystandalone.py). (Be sure\nto put it where it can find its stdlib, e.g. by putting it inside the\ndirectory from the official 1.9 release.)This binary supports the thread module and runs without the GIL.\nSo, despite the factor-of-4 slow-down issue, it should be the fourth\ncomplete Python interpreter in which we can reasonably claim to have\nresolved the problem of the GIL. (The first one was Greg Stein's Python\n1.4, re-explored here; the second one is Jython; the third one is\nIronPython.) Unlike the previous three, it is also the first one to\noffer full GIL semantics to the programmer, and additionally\nthread.atomic (see below). 
I should also add that we're likely to\nsee in the next year a 5th such interpreter, too, based on Hardware\nTransactional Memory (same approach as with STM, but using e.g.\nIntel's HTM).The binary I linked to above supports all built-in modules from PyPy,\napart from signal, still being worked on (which can be a bit\nannoying because standard library modules like subprocess depend on\nit). The sys.get/setcheckinterval() functions can be used to tweak\nthe frequency of the automatic commits. Additionally, it offers\nthread.atomic, described in the previous blog post as a way to\ncreate longer atomic sections (with the observable effect of preventing\nthe \"GIL\" to be released during that time). A complete\ntransaction.py module based on it is available from the sources.The main missing features are:the signal module;\nthe Garbage Collector, which does not do major collections so far, only\nminor ones;\nand finally, the JIT, which needs some amount of integration to generate\nthe correctly-tweaked assembler.\nHave fun!Armin.", + "tags": "stm", + "url": "https://www.pypy.org/posts/2012/06/stm-with-threads-7818875111634541910.html" + }, + { + "title": "PyPy 1.9 - Yard Wolf", + "text": "We're pleased to announce the 1.9 release of PyPy. This release brings mostly\nbugfixes, performance improvements, other small improvements and overall\nprogress on the numpypy effort.\nIt also brings an improved situation on Windows and OS X.You can download the PyPy 1.9 release here:https://pypy.org/download.htmlWhat is PyPy?PyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. It's fast (pypy 1.9 and cpython 2.7.2 performance comparison)\ndue to its integrated tracing JIT compiler.This release supports x86 machines running Linux 32/64, Mac OS X 64 or\nWindows 32. 
Windows 64 work is still stalling, we would welcome a volunteer\nto handle that.Thanks to our donorsBut first of all, we would like to say thank you to all people who\ndonated some money to one of our four calls:NumPy in PyPy (got so far $44502 out of $60000, 74%)\nPy3k (Python 3) (got so far $43563 out of $105000, 41%)\nSoftware Transactional Memory (got so far $21791 of $50400, 43%)\nas well as our general PyPy pot.\nThank you all for proving that it is indeed possible for a small team of\nprogrammers to get funded like that, at least for some\ntime. We want to include this thank you in the present release\nannouncement even though most of the work is not finished yet. More\nprecisely, neither Py3k nor STM are ready to make it in an official release\nyet: people interested in them need to grab and (attempt to) translate\nPyPy from the corresponding branches (respectively py3k and\nstm-thread).HighlightsThis release still implements Python 2.7.2.\nMany bugs were corrected for Windows 32 bit. This includes new\nfunctionality to test the validity of file descriptors; and\ncorrect handling of the calling convensions for ctypes. (Still not\nmuch progress on Win64.) A lot of work on this has been done by Matti Picus\nand Amaury Forgeot d'Arc.\nImprovements in cpyext, our emulator for CPython C extension modules.\nFor example PyOpenSSL should now work. We thank various people for help.\nSets now have strategies just like dictionaries. This means for example\nthat a set containing only ints will be more compact (and faster).\nA lot of progress on various aspects of numpypy. See the numpy-status\npage for the automatic report.\nIt is now possible to create and manipulate C-like structures using the\nPyPy-only _ffi module. The advantage over using e.g. ctypes is that\n_ffi is very JIT-friendly, and getting/setting of fields is translated\nto few assembler instructions by the JIT. 
However, this is mostly intended\nas a low-level backend to be used by more user-friendly FFI packages, and\nthe API might change in the future. Use it at your own risk.\nThe non-x86 backends for the JIT are progressing but are still not\nmerged (ARMv7 and PPC64).\nJIT hooks for inspecting the created assembler code have been improved.\nSee JIT hooks documentation for details.\nselect.kqueue has been added (BSD).\nHandling of keyword arguments has been drastically improved in the best-case\nscenario: proxy functions which simply forwards *args and **kwargs\nto another function now performs much better with the JIT.\nList comprehension has been improved.\nJitViewerThere will be a corresponding 1.9 release of JitViewer which is guaranteed to work\nwith PyPy 1.9. See the JitViewer docs for details.Cheers,\nThe PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2012/06/pypy-19-yard-wolf-7006180436602667005.html" + }, + { + "title": "Py3k status update #4", + "text": "This is the fourth status update about our work on the py3k branch, which we\ncan work on thanks to all of the people who donated to the py3k proposal.For various reasons, less work than usual has been done since the last status\nupdate. However, some interesting things happened anyway.As readers know, so far we spent most of the effort in fixing all PyPy's own\ntests which started to fail for various py2/py3 differences. Most of them\nfailed for shallow reasons, e.g. syntactic changes or the int/long\nunifications. Others failed for subtle differences and needed a bit more care,\nfor example the fact that unbound methods are gone in Py3k.The good news is that finally we are seeing the light at the end of the\ntunnel. Most of them have been fixed. For sine other tests, we introduced the\nconcept of \"py3k-skipping\": some optimizations and modules are indeed failing,\nbut right now we are concentrating on completing the core language and so we\nare not interested in those. 
When the core language will be done, we will be\nable to easily find and work on the py3k-skipped tests. In particular, for\nnow we disabled the Int and String dict strategies, which are broken\nbecause of the usual int/long unification and str vs bytes. As for modules,\nfor now _continuation (needed for stackless) and _multiprocessing do\nnot work yet.Another non-trivial feature we implemented is the proper cleaning of exception\nvariables when we exit except blocks. This is a feature which touches\nlots of levels of PyPy, starting from astcompiler, down to the bytecode\ninterpreter. It tooks two days of headache, but at the end we made it :-).Additionally, Amaury did a lot of improvements to cpyext, which had been\nbroken since forever on this branch.As for the next plans, now that things are starting to work and PyPy's own\ntests mostly pass, we can finally start to run the compiled PyPy against\nCPython's test suite. It is very likely that we will have tons of failures at\nthe beginning, but once we start to fix them one by one, a Py3k-compatible\nPyPy will be closer and closer.", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2012/06/py3k-status-update-4-4834053219477515637.html" + }, + { + "title": "STM update: back to threads?", + "text": "Hi again,\n\nHere is another update on the status of Software Transactional Memory on PyPy.\n\nThose of you who have been closely following this blog since last year know that, from the very first post about STM, I explored various design ideas about the API that we should get when programming in Python.\n\nI went a full circle, and now I am back to where I started (with, important difference, a very roughly working implementation of pypy-stm).\n\nWhat I realized is that the \"thread\" module is not that bad after all --- I mean, yes, it is a horribly low-level interface, but it is general enough to build various interesting things on top of it. 
What the \"stm-thread\" branch of PyPy contains is, basically, the regular \"thread\" module in which the GIL was replaced with STM. It gives multicore capabilities to any program based on multiple threads. (This is so far exactly the idea same than the one being investigated for Hardware Transactional Memory. It is roughly also what you would get if you managed to convince GCC 4.7 to compile CPython using STM.)\n\nNow while this might already be quite interesting to some people, here is how it relates to all I said previously: namely, threads are bad, and some new \"transaction\" module would be a better idea.\n\nThere is one new core functionality in the \"stm-thread\" branch: it is \"thread.atomic\", a context manager that can be used in a \"with\" statement (exact name subject to change). In terms of the GIL, it prevents the GIL from being released in the \"with\" block. In terms of STM, it prevents a \"transaction break\", which means that the whole \"with\" statement runs in one single transaction. (From the Python programmer's point of view, the net effect is the same.)\n\nSo far, no ground-breaking news. But what I missed previously is that this is enough to give multicore capabilities even to a program that is not using threads so far. It is possible to rewrite an equivalent of the old transaction module in a few pages of pure Python, using \"thread.atomic\". Something along the following lines: start N threads that each reads from a Queue.Queue() the next job to do, and does it in a \"with thread.atomic\" block. The STM version of PyPy is then able to run these atomic blocks concurrently. 
The key point is that the slightly delicate handling of threads should be nicely hidden inside the new \"transaction\" module, and from outside the observed behavior would be exactly as if the transactions that we schedule are run serially.\n\nThe point I kept missing was that, yes, this sounds like nonsense, because it seems that we create N threads just to serialize their work again in \"thread.atomic\" sections. In fact this would be nonsense in any model that would \"just\" remove the GIL to let multiple threads run concurrently without crashing. Indeed, you have multiple threads, but their atomic blocks would be again a sort of GIL: only one of them would run at a time. And this is indeed the simple model of execution that you get even with STM --- but not the model of performance. The performance with STM scales with the number of cores, as long as there is enough non-conflicting work to do.\n\nSo in summary the complete circle back to the starting point is that threads might be a good low-level model. It mends itself naturally to, say, a kind of program in which the main thread polls file descriptors using select() or the Linux epoll(), and the work received is split along N other threads --- which is the kind of program you would naturally write in other languages that don't have a GIL, say Java. The other threads can then use \"thread.atomic\" blocks to protect sections of their work. The traditional Transactional Memory point of view is that you use such blocks to guard the short sections of code that communicate with other threads or modify global state, but nothing prevents you from using much larger sections: you should be able to scale them up to the size of a native \"unit of work\", so that every unit is naturally atomic. 
And then it's only a matter of design: you can tweak an existing module that does the thread pooling to add one \"with thread.atomic\"; or do it yourself from scratch; or (if the design is compatible enough) just plug in the proposed pure-Python \"transaction\" module. Or if you feel like it you can even use threads directly (but keep in mind that using threads too explicitly is not a composable abstraction, whereas higher-level designs typically are).\n\nAt the end of the day, you can write or reuse programs whose global structure you are already familiar with, for example with a thread pool (that can be hidden in a library if you prefer), or any other structure with or without explicit threads. But you can do so without all the mess that comes with threads like locks and deadlocks. From that angle it is really similar to Garbage Collection: e.g. the Boehm GC (now used by GCC itself) lets you write C code like you are used to, but forgeting all you had to learn about careful explicit memory management.", + "tags": "stm", + "url": "https://www.pypy.org/posts/2012/05/stm-update-back-to-threads-6622746581767639355.html" + }, + { + "title": "STM update (and thanks everybody)", + "text": "A short update on the Software Transactional Memory (STM) side. Let me remind you that the work is to add STM internally into PyPy, with the goal of letting the user's programs run on multiple cores after a minor adaptation. (The goal is not to expose STM to the user's program.) I will soon write some official documentation that explains in more details exactly what you get. For now you can read the previous blog posts, and you can also find technical details in the call for donation itself; or directly look at how I adapted the examples linked to later in this post.I have now reached the point where the basics seem to work. 
There is no integration with the JIT so far; moreover the integration with the Garbage Collection subsystem is not finished right now, but at least it is \"not crashing in my simple tests and not leaking memory too quickly\". (It means that it is never calling __del__ so far, although it releases memory; and when entering transactional mode or when going to the next transaction, all live objects become immortal. This should still let most not-too-long-running programs work.)If you want to play with it, you can download this binary (you need to put it in a place with the paths lib-python and lib_pypy, for example inside the main directory from a regular nightly tarball or from a full checkout). This version was compiled for Linux x86 32-bit from the stm-gc branch on the 25th of April. It runs e.g. the modified version of richards. This branch could also be translated for Linux x86-64, but not for other OSes nor other CPUs for now.The resulting pypy-stm exposes the same interface as the pure Python transaction module, which is an emulator (running on CPython or any version of PyPy) which can be used to play around and prepare your programs. See the comments in there. A difference is that the real pypy-stm doesn't support epoll right now, so it cannot be used yet to play with a branch of Twisted that was already adapted (thanks Jean-Paul Calderone); but that's coming soon. For now you can use it to get multi-core usage on purely computational programs.I did for example adapt PyPy's own translate.py: see the tweak in rpython/rtyper.py. Lines 273-281 are all that I needed to add, and they are mostly a \"simplification and parallelization\" of the lines above. There are a few more places in the whole translate.py that could be similarly modified, but overall it is just that: a few places. 
I did not measure performance, but I checked that it is capable of using multiple cores in the RTyping step of translation, with --- as expected --- some still-reasonable number of conflicts, particularly at the beginning when shared data structures are still being built.On a few smaller, more regular examples like richards, I did measure the performance. It is not great, even taking into account that it has no JIT so far. Running pypy-stm with one thread is roughly 5 times slower than running a regular PyPy with no JIT (it used to be better in previous versions, but they didn't have any GC; nevertheless, I need to investigate). However, it does seem to scale. At least, it scales roughly as expected on my 2-real-cores, 4-hyperthreaded-cores laptop (i.e. for N between 1 and 4, the N-threaded pypy-stm performs similarly to N independent pypy-stm's running one thread each).And finally......a big thank you to everyone who contributed some money to support this! As you see on the PyPy site, we got more than 6700$ so far in only 5 or 6 weeks. Thanks to that, my contract started last Monday, and I am now paid a small salary via the Software Freedom Conservancy (thanks Bradley M. Kuhn for organizational support from the SFC). Again, thank you everybody!UPDATE: The performance regression was due to disabling an optimization, the method cache, which caused non-deterministic results --- the performance could vary from simple to double. Today, as a workaround, I made the method cache transaction-local for now; it is only effective for transactions that run for long enough (maybe 0.1ms or 1ms), but at least it is there in this situation. 
In the version of richards presented above, the transactions are too short to make a difference (around 0.015ms).", + "tags": "stm", + "url": "https://www.pypy.org/posts/2012/04/stm-update-and-thanks-everybody-6071745734932940294.html" + }, + { + "title": "NumPy on PyPy progress report", + "text": "Hello.\nA lot of things happened in March, like pycon. I was also busy doing other\nthings (pictured), so apologies for the late numpy status update.\nHowever, a lot of things have happened and numpy continues to be one of the\nmain points of entry for hacking on PyPy. Apologies to all the people whose\npatches I don't review in timely manner, but seriously, you do a lot of\nwork.\nThis list of changes is definitely not exhaustive, and I might be forgetting\nimportant contributions. In a loose order:\n\nMatti Picus made out parameter work for a lot of (but not all)\nfunctions.\n\nWe merged record dtypes support. The only missing dtypes left are complex\n(important), datetime (less important) and object (which will probably\nnever be implemented because it makes very little sense and is a mess with moving GCs).\n\nTaavi Burns and others implemented lots of details, including lots of ufuncs.\nOn the completely unscientific measure of \"implemented functions\" on\nnumpypy status page, we're close to 50% of numpy working. In reality\nit might be more or less, but after complex dtypes we're getting very close\nto running real programs.\n\nBool indexing of arrays of the same size should work, leaving only\narrays-of-ints indexing as the last missing element of fancy indexing.\n\nI did some very early experiments on SSE. This work is seriously\npreliminary - in fact the only implemented operation is addition of\nfloat single-dimension numpy arrays. 
However, results are encouraging,\ngiven that our assembler generator is far from ideal:\n\n\n\n\n\n\n\n\n\n\n\u00a0\nNumpy\n\nPyPy SSE\n\nPyPy\n\nGCC non-looped\n\nGCC looped\n\n\na+b\n\n0.6s\n\n0.3s\n\n0.4s\n\n0.3s\n\n0.25s\n\n\na+b+c\n\n1.9s\n\n0.35s\n\n0.5s\n\n0.7s\n\n0.32s\n\n\na+b+c+d+e\n\n3.2s\n\n0.36s\n\n0.8s\n\n1.7s\n\n0.51s\n\n\n\n\nThe benchmark repo is available. GCC was run with -O3, no further\noptions specified. PyPy was run with default options, the SSE branch is under\nbackend-vector-ops, but it's not working completely yet.\nOne might argue that C and Python is not the same code - indeed it is not.\nIt just shows some possible approach to writing numeric code.\n\n\nNext step would be to just continue implementing missing features such as\n\nspecialised arrays i.e. masked arrays and matrixes\ncore modules such as fft, linalg, random.\nnumpy's testing framework\n\nThe future is hard to predict, but we're not far off!\nCheers,fijal\n\nUPDATE:Indeed, string and unicode dtypes are not supported yet. They're as important as complex dtype", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2012/04/numpy-on-pypy-progress-report-6048076549081013253.html" + }, + { + "title": "PyCon 2012 wrap up", + "text": "So, PyCon happened. This was the biggest PyCon ever and probably the biggest\ngathering of Python hackers ever.\nFrom the PyPy perspective, a lot at PyCon was about PyPy. Listing things:\n\nDavid Beazley presented an excellent keynote describing his experience\ndiving head-first into PyPy and at least partly failing. He, however, did\nnot fail to explain bits and pieces about PyPy's architecture.\nVideo is available.\nWe gave tons of talks, including the tutorial, why pypy by example\nand pypy's JIT architecture\nWe had a giant influx of new commiters, easily doubling the amount of pull\nrequests ever created for PyPy. 
The main topics for newcomers were numpy and\npy3k, disproving what David said about PyPy being too hard to dive into ;)\nGuido argued in his keynote that Python is not too slow. In the meantime,\nwe're trying to prove him correct :-)\n\nWe would like to thank everyone who talked to us, shared ideas and especially\nthose who participated in sprints - we're always happy to welcome newcomers!\nI'm sure there are tons of things I forgot, but thank you all!\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2012/04/pycon-2012-wrap-up-559575896040055505.html" + }, + { + "title": "Py3k status update #3", + "text": "This is the third status update about my work on the py3k branch, which I can work on thanks to all of the people who donated to the py3k proposal.\n\nA lot of work has been done during the last month: as usual, the list of changes is too big to be reported in a detalied way, so this is just a summary of what happened.\n\nOne of the most active areas was killing old and deprecated features. In particular, we killed support for the __cmp__ special method and its counsins, the cmp builtin function and keyword argument for list.sort() and sorted(). Killing is easy, but then you have to fix all the places which breaks because of this, including all the types which relied on __cmp__ to be comparable,, fixing all the tests which tried to order objects which are no longer ordeable now, or implementing new behavior like forbidding calling hash() on objects which implement __eq__ but not __hash__.\n\nAmong the other features, we killed lots of now-gone functions in the operator module, the builtins apply(), reduce() and buffer, and the os.* functions to deal with temporary files, which has been deprecated in favour of the new tempfile module.\n\nThe other topic which can't miss in a py3k status update is, as usual, string-vs-unicode. 
At this round, we fixed bugs in string formatting (in particular to teach format() to always use unicode strings) and various corner cases about when calling the (possibly overridden) __str__ method on subclasses of str. Believe me, you don't want to know the precise rules :-).\n\nOther features which we worked on and fixed tests include, but are not limited to, marshal, hashlib, zipimport, _socket and itertools, plus the habitual endless lists of tests which fail for shallow reasons such as the syntactic differences, int vs long, range() vs list(range()) etc. As a result, the number of failing tests dropped from 650 to 235: we are beginning to see the light at the end of the tunnel :-)\n\nBenjamin finished implementing Python 3 syntax. Most of it was small cleanups and tweaks to be compatible with CPython such as making True and False keywords and preventing . . . (note spaces between dots) from being parsed as Ellipsis. Larger syntax additions included keyword only arguments and function annotations.\n\nFinally, we did some RPython fixes, so that it is possible again to translate PyPy in the py3k branch. However, the resuling binary is a strange beast which mixes python 2 and python 3 semantics, so it is unusable for anything but showing friends how cool it is.\n\nI would like to underline that I was not alone in doing all this work. In particular, a lot of people joined the PyPy sprint at Pycon and worked on the branch, as you can clearly see in this activity graph. I would like to thank all who helped!\n\ncheers,\nAntonio and Benjamin", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2012/04/py3k-status-update-3-6975588144646689872.html" + }, + { + "title": "PyPy sprint in Leipzig, Germany (June 22-27)", + "text": "The next PyPy sprint will be held --- for the first time in a while ---\nin a place where we haven't been so far: Leipzig, Germany, at the\nPython Academy's Teaching Center. 
It will take place from the 22nd\nto the 27th of June 2012, before EuroPython. Thanks to Mike M\u00fcller for\norganizing it!\nThis is a fully public sprint, everyone is welcome to join us. All days are\nfull sprint days, so it is recommended to arrive the 21st and leave the 28th.\nTopics and goals\nOpen. Here are some goals:\n\nnumpy: progress towards completing the numpypy module; try to\nuse it in real code\nstm: progress on Transactional Memory; try out the transaction module on real code.\njit optimizations: there are a number of optimizations we can still\ntry out or refactor.\nwork on various, more efficient data structures for Python language.\nA good example would be lazy string slicing/concatenation or more efficient\nobjects.\nany other PyPy-related topic is fine too.\n\nGrants\nFor students, we have the possibility to support some costs via PyPy\nfunds. Additionally, we can support you applying for grants from the\nPSF and other sources.\nRegistration\nIf you'd like to come, please sign up either by announcing yourself on\npypy-dev, or by directly adding yourself to the list of people.\n(We need to have a head count for the organization.) If you are new to\nthe project please drop a note about your interests and post any\nquestions.\nMore...\nFor more information, please see the sprint announcement.", + "tags": "", + "url": "https://www.pypy.org/posts/2012/04/pypy-sprint-in-leipzig-june-22-27-6450601012927549960.html" + }, + { + "title": "Call for donations for Software Transactional Memory", + "text": "Hi all,\n\nThe Software Transactional Memory\ncall for donations is up. From the proposal:\n\n\nPrevious attempts on Hardware Transactional Memory focused on parallelizing existing programs written using the thread or threading modules. However, as argued here, this may not be the most practical way to achieve real multithreading; it seems that better alternatives would offer good scalability too. 
Notably, Transactional Memory could benefit any event-based system that is written to dispatch events serially (Twisted-based, most GUI toolkits, Stackless, gevent, and so on).
Arguably, this approach is the reason for why you use Python in the first place :-)\n\nArmin", + "tags": "stm", + "url": "https://www.pypy.org/posts/2012/03/call-for-donations-for-software-8853699867109654713.html" + }, + { + "title": "Py3k status update #2", + "text": "This is the second status update about my work on the py3k branch, which I can work on thanks to all of the people who donated to the py3k proposal.Since my previous status update, things have improved a lot: first of all, I fixed the syntax of many more tests, which were failing on the branch because they used constructs which are no longer valid in Python 3, such as u'' strings, the print statement or the old except Exception, e syntax. I have to say that this work is tedious and not very rewarding, but it has to be done anyway, so that the real failures can stand up.Then, I spent most of the rest of the time by killing features which are present in Python 2 and are gone in Python 3.Some of them were easy and mechnical: for example, I removed all the function attributes such as func_code and func_closure, which has been renamed to __code__ and __closure__, and then I had to find and fix all the places which still expected the old ones.Some were trickier: I removed support for the cmp function and the __cmp__ special method, but this also meant that I had to fix a few types which relied on it to be comparable (for example, did you know that the cells contained in __closure__ are comparable?). 
At the same time, I also removed the old behavior which in Python 2 allows us to compare arbitrary objects with <, > & co.: in Python 3 the only comparisons allowed between incompatible types are == and !=.Speaking of old special methods, __hex__ and __oct__ are gone as well (and I didn't even know about their existence before removing them :-))But the most important breakthrough was the removal of the _file module, containing the implementation of the file type in Python 2, which is now gone since in Python 3 files are handled by the _io module. Killing the module was not straightforward, because some of the importing logic was tightly tied to the internal implementation of files, so it needed some refactoring. Finally, I had to fix the marshal module to correctly detect text files vs. byte files.Among these things, I fixed tons of smaller issues here and there. As a result, there are many fewer failing tests than a few weeks ago. Obviously the number itself does not mean much, because sometimes fixing a single test takes hours, and some other times by changing one line one fixes tens of tests. But at the end, seeing it dropping from 999 to 650 always is nice and rewarding :-).The road for having a pypy3k is still long, but everything is going fine so far. Stay tuned for more updates!cheers, Antonio", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2012/03/py3k-status-update-2-4018939509128176130.html" + }, + { + "title": "Py3k status update", + "text": "Thank to all the people who donated to the py3k proposal, we managed to collect enough money to start to work on the first step. This is a quick summary of what I did since I began working on this.\nFirst of all, many thanks to Amaury Forgeot d'Arc, who started the py3k branch months ago, and already implemented lots of features including e.g. 
switching to \"unicode everywhere\" and the int/long unification, making my job considerably easier :-)\nI started to work on the branch at the last Leysin sprint together with Romain Guillebert, where we worked on various syntactical changes such as extended tuple unpacking and keyword-only arguments. Working on such features is a good way to learn about a lot of the layers which the PyPy Python interpreter is composed of, because often you have to touch the tokenizer, the parser, the ast builder, the compiler and finally the interpreter.\nThen I worked on improving our test machinery in various way, e.g. by optimizing the initialization phase of the object space created by tests, which considerably speeds up small test runs, and adding the possibility to automatically run our tests against CPython 3, to ensure that what we are not trying to fix a test which is meant to fail :-). I also setup our buildbot to run the py3k tests nightly, so that we can have an up to date overview of what is left to do.\nFinally I started to look at all the tests in the interpreter/ directory, trying to unmangle the mess of failing tests. Lots of tests were failing because of simple syntax errors (e.g., by using the no longer valid except Exception, e syntax or the old print statement), others for slightly more complex reasons like unicode vs bytes or the now gone int/long distinction. Others were failing simply because they relied on new features, such as the new lexical exception handlers.\nTo give some numbers, at some point in january we had 1621 failing tests in the branch, while today we are under 1000 (to be exact: 999, and this is why I've waited until today to post the status update :-)).\nBefore ending this blog post, I would like to thank once again all the people who donated to PyPy, who let me to do this wonderful job. 
That's all for now, I'll post more updates soon.\ncheers, Antonio", + "tags": "pypy3", + "url": "https://www.pypy.org/posts/2012/02/py3k-status-update-8840622949715145821.html" + }, + { + "title": "A Larger Example for the Flow Graph Language", + "text": "Part 4 of Comparing Partial Evaluation to Tracing\nThis is the fourth and final blog post in a series about comparing partial evaluation and\ntracing. We've come a long way: In the first post of the series I showed an interpreter for a small flow-graph\nlanguage together with a partial evaluator for it. In the second post I showed how a tracer for\nthe same language works and how it relates to both execution and to partial\nevaluation. The third post described an optimizer for traces.\nIn this final post we can compare and contrast the two different approaches of\ntracing and partial evaluation by means of an example. The programs in the flow\nchart language seen so far have been rather small, so I want to give an example\nof a larger program: an interpreter for an extremely simple bytecode\ninstruction set. I will look at how the partial evaluator deals with that\ninterpreter, and\nwhat the tracer does with it. The code for\nthat, as well as all the code of the series can be found here: https://paste.pocoo.org/show/550282/ (some small\nadditions have been made, such as a nicer way to print traces).\nA Bytecode Interpreter\nWriting programs in the flow graph language is painful, but I still want to give\nan example that is a bit more interesting than the tiny ones that we've seen so\nfar. The example is an interpreter for the bytecode of a very trivial\nregister-based language. 
The language has four registers, one of which is an\naccumulator on which all the actual operations are performed.\nThe opcodes of the language are:\n\njump_if_a, jumps to a target address when the accumulator is non-zero\nmov_a_r0, mov_a_r1, mov_a_r2 move the value of the accumulator to\nthe respective register\nmov_r0_a, mov_r1_a, mov_r2_a move the value of a register to\nthe accumulator\nadd_r0_to_a, add_r1_to_a, add_r2_to_a add the value of the\nregister to the accumulator\ndecr_a decrement the accumulator\nreturn_a stop the program and print the accumulator\n\nThe interpreter has a main loop that reads the opcode at the current program\ncounter, does a (lengthy) dispatch to the right bytecode via a series of if\nstatements and then executes the right opcode. Afterwards the next opcode is\ntreated equivalently.\nHere is a part of the source code in the flow graph language. As pseudocode:\n\nbytecode_loop:\n opcode = bytecode[pc]\n pc = pc + 1\n c = opcode == 'jump_if_a'\n if c goto op_jump_if_a else goto not_jump_if_a\n\n# select the right bytecode via a long series of if statements\nnot_jump_if_a:\n c = opcode == 'mov_a_r0'\n if c goto op_mov_a_r0 else goto not_mov_a_r0\nnot_mov_a_r0:\n c = opcode == 'mov_a_r1'\n if c goto op_mov_a_r1 else goto not_mov_a_r1\n...\n\n# bytecode implementations\nop_mov_a_r0:\n r0 = a\n goto bytecode_loop\n\nop_jump_if_a:\n c = a == 0\n target = bytecode[pc]\n pc += 1\n if c goto bytecode_loop else goto op_jump_if_a_jump\n\nop_jump_if_a_jump:\n pc = target\n goto bytecode_loop\n...\n\nAnd actually working, as Prolog facts (the full implementation can be found at\nthe link above):\n% bytecode dispatch loop\nblock(bytecode_loop,\n op2(opcode, readlist, var(bytecode), var(pc),\n op2(pc, add, var(pc), const(1),\n op2(c, eq, var(opcode), const(jump_if_a),\n if(c, op_jump_if_a, not_jump_if_a))))).\n\n% select the right bytecode via a long series of if statements\nblock(not_jump_if_a,\n op2(c, eq, var(opcode), const(mov_a_r0),\n if(c, 
op_mov_a_r0, not_mov_a_r0))).\nblock(not_mov_a_r0,\n op2(c, eq, var(opcode), const(mov_a_r1),\n if(c, op_mov_a_r1, not_mov_a_r1))).\n...\n\n% bytecode implementations\nblock(op_jump_if_a,\n op2(c, eq, var(a), const(0),\n op2(target, readlist, var(bytecode), var(pc),\n op2(pc, add, var(pc), const(1),\n if(c, bytecode_loop, op_jump_if_a_jump))))).\nblock(op_jump_if_a_jump,\n op1(pc, same, var(target),\n promote(bytecode, bytecode_loop))).\nblock(op_mov_a_r0,\n op1(r0, same, var(a), jump(bytecode_loop))).\n...\n\nThe bytecode_loop block is the main dispatch loop. It reads an opcode out of the\nbytecode list at the program counter position, then has a long series of if\nstatements that compares the current opcode to the various existing opcodes.\nThe full code of the interpreter can be found under the link above.\nThe bytecodes of the interpreter don't really permit hugely complex\nprograms, but it can be used to write a program that computes the square of a\nnumber with the following program:\n\nmov_a_r0 # r0 = a\nmov_a_r1 # r1 = a\n# 2:\nmov_r0_a # r0--\ndecr_a\nmov_a_r0\nmov_r2_a # r2 += a\nadd_r1_to_a\nmov_a_r2\nmov_r0_a # if r0!=0: goto 2\njump_if_a 2\nmov_r2_a # return r2\nreturn_a\n\nPartially Evaluating the Bytecode Interpreter\nThe partial evaluator from the first blog post can be easily used to partially\nevaluate the bytecode interpreter. The static input is the bytecode for\ncomputing the square and the initial program counter value, as given above. The\ndynamic input are the content of the accumulator (the number to be squared).\nThis can be done as follows:\n?- bytecode_square(B),\nEnv = [bytecode/B, pc/0],\ndo_pe(bytecode_loop, Env, Label),\nREnv = [a/16, r0/0, r1/0, r2/0],\ninterp(jump(Label), REnv), listing(block).\n256\n:- dynamic block/2.\n\n\n\nThe code that is generated by the partial evaluation process is somewhat hard to\nread. 
It contains a lot of passages like this:\n...\nblock(op_return_a1, print_and_stop(var(a))).\nblock(not_decr_a1, jump(op_return_a1)).\nblock(not_add_r2_to_a2, jump(not_decr_a1)).\nblock(not_add_r1_to_a2, jump(not_add_r2_to_a2)).\nblock(not_add_r0_to_a3, jump(not_add_r1_to_a2)).\nblock(not_mov_r2_a3, jump(not_add_r0_to_a3)).\nblock(not_mov_r1_a5, jump(not_mov_r2_a3)).\nblock(not_mov_r0_a5, jump(not_mov_r1_a5)).\nblock(not_mov_a_r27, jump(not_mov_r0_a5)).\nblock(not_mov_a_r18, jump(not_mov_a_r27)).\nblock(not_mov_a_r09, jump(not_mov_a_r18)).\nblock(not_jump_if_a11, jump(not_mov_a_r09)).\nblock(bytecode_loop12, jump(not_jump_if_a11)).\nblock(op_mov_r2_a2, op1(a, same, var(r2), jump(bytecode_loop12))).\n...\n\nI.e. lots of blocks that do nothing but jump to another block, interspersed with\nsome blocks that contain an actual operation. I cleaned the output up manually\nand got something like the following (this sort of cleanup is something a good\npartial evaluation system would do itself, after partial evaluation has\noccurred):\nblock(bytecode_loop1,\n op1(r0, same, var(a),\n op1(r1, same, var(a),\n op1(a, same, var(r0),\n op2(a, sub, var(a), const(1),\n op1(r0, same, var(a),\n op1(a, same, var(r2),\n op2(a, add, var(a), var(r1),\n op1(r2, same, var(a),\n op1(a, same, var(r0),\n op2(c, eq, var(a), const(0),\n if(c, bytecode_loop11, op_jump_if_a_jump1)))))))))))).\n\nblock(bytecode_loop11,\n op1(a, same, var(r2),\n print_and_stop(var(a))).\n\nblock(op_jump_if_a_jump1,\n op1(a, same, var(r0),\n op2(a, sub, var(a), const(1),\n op1(r0, same, var(a),\n op1(a, same, var(r2),\n op2(a, add, var(a), var(r1),\n op1(r2, same, var(a),\n op1(a, same, var(r0),\n op2(c, eq, var(a), const(0),\n if(c, bytecode_loop11, op_jump_if_a_jump1)))))))))).\n\nWhat do we see here? The partial evaluator has generated a block bytecode_loop1,\nwhich corresponds to the initialization opcodes mov_a_r0 and mov_a_r1 together\nwith one iteration of the loop. 
Then it either jumps to a copy of the main loop\n(label op_jump_if_a_jump1) or to block bytecode_loop11, which prints the result\nand then stops. The residual code does exactly what the bytecode did: It\nsquares the accumulator then prints that. All the uses of the bytecode and\npc variable are gone.\nWhy did the partial evaluator produce two copies of the main loop that\nlook the same? The reason for that is that in the second copy, the additional\nstatic information target = 2 is known, where target is a variable in\nthe interpreter source that stores the jump target, for very brief periods of\ntime. This additional static information does not have any effect on the\nresidual code, so the same code is uselessly generated twice. This is an\nexample of overspecialization.\nTracing the Interpreter\nIn this section we will look at what happens if we try to trace the interpreter.\nThe naive way of doing that yields traces that are not very useful, because they\nabort after one iteration. We will look at a way of avoiding this problem. The\nproblems described in this section are at the core of the paper Tracing the\nmeta-level: PyPy's tracing JIT compiler (that paper uses a slightly more\nadvanced version of the bytecode interpreter as an example).\nTo trace the interpreter, it is useful to change the bytecode_loop block from above\nto always promote the bytecode and the pc variables, because without\nknowing them the trace produced is not really interesting. 
This is similar to\nmaking these variables static in the partial evaluation example above:\nblock(bytecode_loop,\n promote(bytecode, bytecode_loop_promote_bytecode)).\nblock(bytecode_loop_promote_bytecode,\n promote(pc, bytecode_loop_promote_pc)).\nblock(bytecode_loop_promote_pc,\n op2(opcode, readlist, var(bytecode), var(pc),\n op2(pc, add, var(pc), const(1),\n op2(c, eq, var(opcode), const(0),\n if(c, op_jump_if_a, not_jump_if_a))))).\n...\n\nThe rest of the interpreter stays unchanged.\nTo trace the interpreter we would start naively at the bytecode_loop label, because\nthat's the label in the interpreter that is jumped to most often (which a\nprofiler could establish easily). The following command can be used for that\n(this output prints traces in a slightly more readable way than in previous blog\nposts):\n?- bytecode_square(B),\n A = 16, Env = [bytecode/B, pc/2, a/A, r0/A, r1/A, r2/0],\n do_trace(bytecode_loop, Env).\ntrace\n guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)\n guard_value(pc,2,[],bytecode_loop_promote_pc)\n op2(opcode,readlist,var(bytecode),var(pc))\n op2(pc,add,var(pc),const(1))\n op2(c,eq,var(opcode),const(jump_if_a))\n guard_false(c,[],op_jump_if_a)\n op2(c,eq,var(opcode),const(mov_a_r0))\n guard_false(c,[],op_mov_a_r0)\n op2(c,eq,var(opcode),const(mov_a_r1))\n guard_false(c,[],op_mov_a_r1)\n op2(c,eq,var(opcode),const(mov_a_r2))\n guard_false(c,[],op_mov_a_r2)\n op2(c,eq,var(opcode),const(mov_r0_a))\n guard_true(c,[],not_mov_r0_a)\n op1(a,same,var(r0))\n loop\n\nopttrace\n guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)\n guard_value(pc,2,[bytecode/[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]],bytecode_loop_promote_pc)\n 
op1(a,same,var(r0))\n op1(bytecode,same,const([mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]))\n op1(pc,same,const(3))\n op1(opcode,same,const(mov_r0_a))\n op1(c,same,const(1))\n loop\n\n256\nB = [mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a, mov_a_r2, mov_r0_a|...],\nA = 16,\nEnv = [bytecode/[mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a|...], pc/2, a/16, r0/16, r1/16, r2/0]\n\nThese traces are very short. They start with promoting the bytecode and the\npc, followed by the execution of the opcode mov_r0_a, which is the\none at position 2 in the given bytecode. Then they increment the pc and\nloop back to the beginning. Looking at the optimized trace, it is clear that the\ntrace is essentially useless. It will run only for one iteration, because in the\nsecond iteration the pc is 3, thus the guard_value at the beginning\nwill fail.\nThis problem can be solved by tracing more than just one iteration of the\nbytecode dispatch loop, which is called meta-tracing. To get this behaviour, in\nthis simple example it is enough to start (and thus end) tracing at a different\nlabel, op_jump_if_a_jump. This label is hit when the interpreter executes a\njump_if_a bytecode and the jump is taken. In a loop on the level of the\nexecuted bytecode program there is one such jump. 
Thus tracing from this label,\na full loop in the bytecode program is traced, containing potentially many\niterations of the bytecode dispatch loop in the control flow graph language.\nDoing that yields the following:\n?- bytecode_square(B),\n A = 16, Env = [bytecode/B, pc/11, a/A, r0/A, r1/A, r2/0, target/2],\n do_trace(op_jump_if_a_jump, Env).\ntrace\n op1(pc,same,var(target))\n guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop)\n guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)\n guard_value(pc,2,[],bytecode_loop_promote_pc)\n op2(opcode,readlist,var(bytecode),var(pc))\n op2(pc,add,var(pc),const(1))\n op2(c,eq,var(opcode),const(jump_if_a))\n guard_false(c,[],op_jump_if_a)\n op2(c,eq,var(opcode),const(mov_a_r0))\n guard_false(c,[],op_mov_a_r0)\n op2(c,eq,var(opcode),const(mov_a_r1))\n guard_false(c,[],op_mov_a_r1)\n op2(c,eq,var(opcode),const(mov_a_r2))\n guard_false(c,[],op_mov_a_r2)\n op2(c,eq,var(opcode),const(mov_r0_a))\n guard_true(c,[],not_mov_r0_a)\n op1(a,same,var(r0))\n guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)\n guard_value(pc,3,[],bytecode_loop_promote_pc)\n op2(opcode,readlist,var(bytecode),var(pc))\n ...\n lots of operations ommitted\n ...\n guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)\n guard_value(pc,9,[],bytecode_loop_promote_pc)\n op2(opcode,readlist,var(bytecode),var(pc))\n op2(pc,add,var(pc),const(1))\n op2(c,eq,var(opcode),const(jump_if_a))\n guard_true(c,[],not_jump_if_a)\n op2(c,eq,var(a),const(0))\n op2(target,readlist,var(bytecode),var(pc))\n op2(pc,add,var(pc),const(1))\n 
guard_false(c,[],bytecode_loop)\n loop\n\nopttrace\n op1(pc,same,var(target))\n guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop)\n guard_value(pc,2,[bytecode/[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]],bytecode_loop_promote_pc)\n op1(a,same,var(r0))\n op2(a,sub,var(a),const(1))\n op1(r0,same,var(a))\n op1(a,same,var(r2))\n op2(a,add,var(a),var(r1))\n op1(r2,same,var(a))\n op1(a,same,var(r0))\n op2(c,eq,var(a),const(0))\n guard_false(c,[bytecode/[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],pc/11,opcode/jump_if_a,target/2],bytecode_loop)\n op1(bytecode,same,const([mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]))\n op1(pc,same,const(11))\n op1(opcode,same,const(jump_if_a))\n op1(target,same,const(2))\n op1(c,same,const(0))\n loop\n\n256\nB = [mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a, mov_a_r2, mov_r0_a|...],\nA = 16,\nEnv = [bytecode/[mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a|...], pc/11, a/16, r0/16, r1/16, r2/0, target/2] .\n\nThat looks better. The trace corresponds to the interpreter running all the\nbytecodes in the loop of the squaring function in the example bytecode above.\nThe optimized code starts with\ntwo guards (checking that the bytecode is still the one for the squaring\nfunction, checking that the pc is 2) and then only does the operations\nthat actually do the computation. No bytecode dispatching is performed, thus the\ninterpretation overhead is fully removed, apart from the two guard_value\noperations at the beginning.\nMany of the assignments in the trace are superfluous, e.g. all the copying back\nand forth between registers r0, r1, r2 and accumulator a. 
This\ncould be easily solved by an even more intelligent optimization utilizing SSA\nform.\nConclusion About the Interpreter\nBoth partial evaluation and meta-tracing can be used to transform the example\nbytecode computing a square into a form that shows the essential computation\nthat is going on, without the interpretation overhead. The naive partial evaluator\nproduces lots of extra blocks that just jump around, which could be solved with\na post-processing step. The tracer by itself produces uselessly short traces,\nbut with a simple trick of starting the trace at a different point the results\nbecome a lot better.\nIn a real meta-tracing system, the meta-tracer would need a way for the author\nof the interpreter\nto mark which bytecode corresponds to a backward jump. It would also need better\nintegration with the interpreter to start tracing automatically, as well as\ncache the traces. Additionally, it would have to deal better with guards that fail a\nlot, attaching new traces to the failing guards. However, all that is \"just\"\nengineering on top of the ideas presented in this series of blog posts.\nHigh-Level Conclusion\nSome concluding high-level thoughts about the similarities of tracing and\npartial evaluation: Tracing and partial evaluation try to tackle a similar\nproblem, that of automatically reducing the interpreter overhead, their\napproaches are slightly different though.\nTracing is very close to normal evaluation, only keeping some extra information\nin the process. But then, the optimizer that is used in a tracer\nis again very similar in structure to a partial evaluator. 
The task of the\noptimizer is much simpler though, because it does not need to deal with control\nflow at all, just a linear list of operations.\nSo in a sense tracing is taking those parts of partial evaluation that work (the\n\"just evaluate those things that you can, and leave the others\") and replacing\nthe parts that don't (controlling unfolding) by a much more pragmatic mechanism.\nThat mechanism observes actual execution runs of the program to choose control\nflow paths that are typical. At the same time, the tracer's focus is on loops,\nbecause they are where most programs spend significant amounts of time.\nAnother point of view of tracing is that it is a form of partial evaluation that\nreplaces the control components of a partial evaluator with an oracle (the\nactual execution runs) that provides the information about which paths to look at.\nAlready in the quite trivial interpreter here the effects of this are visible.\nThe simple partial evaluator over-specializes the loop and produces two\nidentical versions of it, that aren't different. The tracer doesn't, and it\nalso generates only code for the loop itself, not for the initialization\nopcodes.\nThat's it for this series. To those that made it, thanks for following along.\nAlso thanks to Samuele and Sven, who consistently gave me good feedback on the\nposts before I put them here.", + "tags": "", + "url": "https://www.pypy.org/posts/2012/02/larger-example-for-flow-graph-language-6139699450091061040.html" + }, + { + "title": "PyPy 1.8 - business as usual", + "text": "We're pleased to announce the 1.8 release of PyPy. As habitual this\nrelease brings a lot of bugfixes, together with performance and memory\nimprovements over the 1.7 release. The main highlight of the release\nis the introduction of list strategies which makes homogeneous lists\nmore efficient both in terms of performance and memory. This release\nalso upgrades us from Python 2.7.1 compatibility to 2.7.2. 
Otherwise\nit's \"business as usual\" in the sense that performance improved\nroughly 10% on average since the previous release.\nyou can download the PyPy 1.8 release here:\n\nhttps://pypy.org/download.html\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. It's fast (pypy 1.8 and cpython 2.7.1 performance comparison)\ndue to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64, Mac OS X 32/64 or\nWindows 32. Windows 64 work has been stalled, we would welcome a volunteer\nto handle that.\n\n\nHighlights\n\nList strategies. Now lists that contain only ints or only floats should\nbe as efficient as storing them in a binary-packed array. It also improves\nthe JIT performance in places that use such lists. There are also special\nstrategies for unicode and string lists.\n\nAs usual, numerous performance improvements. There are many examples\nof python constructs that now should be faster; too many to list them.\n\nBugfixes and compatibility fixes with CPython.\n\nWindows fixes.\n\nNumPy effort progress; for the exact list of things that have been done,\nconsult the numpy status page. A tentative list of things that has\nbeen done:\n\nmulti dimensional arrays\nvarious sizes of dtypes\na lot of ufuncs\na lot of other minor changes\n\nRight now the numpy module is available under both numpy and numpypy\nnames. However, because it's incomplete, you have to import numpypy first\nbefore doing any imports from numpy.\n\nNew JIT hooks that allow you to hook into the JIT process from your python\nprogram. There is a brief overview of what they offer.\n\nStandard library upgrade from 2.7.1 to 2.7.2.\n\n\n\n\nOngoing work\nAs usual, there is quite a bit of ongoing work that either didn't make it to\nthe release or is not ready yet. 
Highlights include:\n\nNon-x86 backends for the JIT: ARMv7 (almost ready) and PPC64 (in progress)\nSpecialized type instances - allocate instances as efficient as C structs,\nincluding type specialization\nMore numpy work\nSince the last release there was a significant breakthrough in PyPy's\nfundraising. We now have enough funds to work on first stages of numpypy\nand py3k. We would like to once again thank everyone who donated.\nIt's also probably worth noting, we're considering donations for the\nSoftware Transactional Memory project. You can read more about our plans.\n\nCheers,\nThe PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2012/02/pypy-18-business-as-usual-7266036404915945090.html" + }, + { + "title": "Introductory Article About RPython", + "text": "Laurence Tratt from King's College London has written a long and detailed introduction to the goals and significance of RPython over on his blog. Laurie has been implementing his Converge Language in RPython in the last months. He is one of the first people external to the PyPy team who have pushed a sizeable RPython-based VM quite far, adding and tuning JIT hints. The post describes some of that work and his impressions of RPython and PyPy.\n\n\n\"RPython, to my mind, is an astonishing project. It has, almost single-handedly, opened up an entirely new approach to VM implementation. As my experience shows, creating a decent RPython VM is not a huge amount of work (despite some frustrations). In short: never again do new languages need come with unusably slow VMs. 
That the PyPy / RPython team have shown that these ideas scale up to a fast implementation of a large, real-world language (Python) is another feather in their cap.\"", + "tags": "", + "url": "https://www.pypy.org/posts/2012/02/introductionary-article-about-rpython-5386281283454207551.html" + }, + { + "title": "Optimizing Traces of the Flow Graph Language", + "text": "Part 3 of Comparing Partial Evaluation to Tracing\nThis is the third blog post in a series about comparing partial evaluation and\ntracing. In the first post of the series I introduced a small flow-graph\nlanguage together with an interpreter for it. Then I showed a partial evaluator\nfor the language. In the second post of the series I showed how a tracer for\nthe same language works and how it relates to both execution and to partial\nevaluation. Then I added support for promotion to that tracer.\nIn this post I will show how to optimize the traces that are produced by the\ntracer and compare the structure of the optimizer to that of partial\nevaluation.\nThe code from this post can be found here: https://paste.pocoo.org/show/547304/\nOptimizing Traces\nIn the last post we saw how to produce a linear trace with guards by\ninterpreting a control flow graph program in a special mode. A trace always ends with\na loop statement, which jumps to the beginning. The tracer is just logging\nthe operations that are done while interpreting, so the trace can contain\nsuperfluous operations. On the other hand, the trace also contains some of the\nruntime values through promotions and some decisions made on them which can be\nexploited by optimization. An example of this is the trace produced by the\npromotion example from the last post:\nop2(c,ge,var(i),const(0),\nguard_true(c,[],l_done,\nguard_value(x,5,[],b2,\nop2(x2,mul,var(x),const(2),\nop2(x3,add,var(x2),const(1),\nop2(i,sub,var(i),var(x3),\nloop))))))\n\nAfter the guard_value(x, 5, ...) 
operation, x is known to be 5: If\nit isn't 5, execution falls back to the interpreter. Therefore, operations\non x after the guard can be constant-folded. To do that sort of\nconstant-folding,\nan extra optimization step is needed. That optimization step walks along the\ntrace, remembers which variables are constants and what their values are using a\npartial environment. The optimizer removes operations that have only constant\narguments and leaves the others in the trace. This process is actually\nremarkably similar to partial evaluation: Some variables are known to be\nconstants, operations on only constant arguments are optimized away, the rest\nremains.\nThe code for optimizing operations looks as follows:\noptimize(op1(ResultVar, Op, Arg, Rest), PEnv, NewOp) :-\n presolve(Arg, PEnv, RArg),\n (RArg = const(C) ->\n do_op(Op, C, Res),\n write_env(PEnv, ResultVar, Res, NEnv),\n NewOp = RestResidual\n ;\n remove_env(PEnv, ResultVar, NEnv),\n NewOp = op1(ResultVar, Op, RArg, RestResidual)\n ),\n optimize(Rest, NEnv, RestResidual).\n\noptimize(op2(ResultVar, Op, Arg1, Arg2, Rest), PEnv, NewOp) :-\n presolve(Arg1, PEnv, RArg1),\n presolve(Arg2, PEnv, RArg2),\n (RArg1 = const(C1), RArg2 = const(C2) ->\n do_op(Op, C1, C2, Res),\n write_env(PEnv, ResultVar, Res, NEnv),\n NewOp = RestResidual\n ;\n remove_env(PEnv, ResultVar, NEnv),\n NewOp = op2(ResultVar, Op, RArg1, RArg2, RestResidual)\n ),\n optimize(Rest, NEnv, RestResidual).\n\nJust like partial evaluation! It even reuses the helper function presolve\nfrom the partial evaluator and a partial environment PEnv. When the\narguments of the operation are known constants in the partial environment, the\noperation can be executed at optimization time and removed from the trace.\nOtherwise, the operation has to stay in the output trace. 
The result variable\n(as in the partial evaluator) needs to be removed from the partial environment,\nbecause it was just overwritten by an unknown result.\nNow we need to deal with guards in the trace.\noptimize(guard_true(V, [], L, Rest), PEnv, NewOp) :-\n plookup(V, PEnv, Val),\n (Val = const(C) ->\n NewOp = RestResidual\n ;\n NewOp = guard_true(V, PEnv, L, RestResidual)\n ),\n optimize(Rest, PEnv, RestResidual).\n\noptimize(guard_false(V, [], L, Rest), PEnv, NewOp) :-\n plookup(V, PEnv, Val),\n (Val = const(C) ->\n NewOp = RestResidual,\n NEnv = PEnv\n ;\n write_env(PEnv, V, 0, NEnv),\n NewOp = guard_false(V, PEnv, L, RestResidual)\n ),\n optimize(Rest, NEnv, RestResidual).\n\nWhen the variable that is being guarded is actually known to be a constant, we\ncan remove the guard. Note that it is not possible that the guard of that\nconstant fails: The tracer recorded the operation while running with real\nvalues, therefore the guards have to succeed for values the optimizer\ndiscovers to be constant.\nguard_false is slightly different from guard_true: after the former we\nknow that the argument is actually 0. After guard_true we only know that\nit is not equal to zero, but not which precise value it has.\nAnother point to note in the optimization of guards is that the second argument\nof the guard operation, which was so far always just an empty list, is now\nreplaced by the partial environment PEnv. 
I will discuss further down why\nthis is needed.\nOptimizing guard_value is very similar, except that it really gives precise\ninformation about the variable involved:\noptimize(guard_value(V, C, [], L, Rest), PEnv, NewOp) :-\n plookup(V, PEnv, Val),\n (Val = const(C1) ->\n NewOp = RestResidual,\n NEnv = PEnv\n ;\n write_env(PEnv, V, C, NEnv),\n NewOp = guard_value(V, C, PEnv, L, RestResidual)\n ),\n optimize(Rest, NEnv, RestResidual).\n\nThis operation is the main way in which the optimizer gains constant variables that\nit then exploits to do constant-folding on later operations. This is a chief\ndifference from partial evaluation: There the optimizer knows the value of some\nvariables from the start. When optimizing traces, at the beginning the value of\nno variable is known. Knowledge about some variables is only later gained\nthrough guards.\nNow we are missing what happens with the loop statement. In principle, it is\nturned into a loop statement again. However, at the loop statement a few\nadditional operations need to be emitted. The reason is that we optimized away\noperations and thus assignments when the result value of the variable was a\nconstant. That means the involved variable still potentially has some older\nvalue. The next iteration of the loop would continue with this older value,\nwhich is obviously wrong. Therefore we need to emit some assignments before the\nloop statement, one per entry in the partial environment:\noptimize(loop, PEnv, T) :-\n generate_assignments(PEnv, T).\n\ngenerate_assignments([], loop).\ngenerate_assignments([Var/Val | Tail], op1(Var, same, const(Val), T)) :-\n generate_assignments(Tail, T).\n\nAs an example of how generate_assignments works, let's look at\nthe following example. When the partial environment is [x/5, y/10], the\nfollowing assignments are generated:\n?- generate_assignments([x/5, y/10], Out).\nOut = op1(x, same, const(5), op1(y, same, const(10), loop)).\n\nThat's all the code of the optimizer. 
While the basic structure is quite similar to partial evaluation,\nit's a lot less complex as well. What made the partial evaluator hard was that\nit needs to deal with control flow statements and with making sure that code is\nreused if the same block is partially evaluated with the same constants. Here,\nall these complexities go away. The tracer has already removed all control flow\nand replaced it with guards and one loop operation at the end. Thus, the\noptimizer can simply do one pass over the operations, removing some (with some\nextra care around the loop statement).\nWith this machinery in place, we can optimize the trace from the promotion\nexample of the last post:\n?- optimize(\n guard_value(x,3,[],b2,\n op2(x2,mul,var(x),const(2),\n op2(x3,add,var(x2),const(1),\n op2(i,sub,var(i),var(x3),\n op2(c,ge,var(i),const(0),\n guard_true(c,[],l_done, loop)))))),\n [],\n LoopOut).\nLoopOut = guard_value(x, 3, [], b2, op2(i, sub, var(i), const(7), op2(c, ge, var(i), const(0), guard_true(c, [x/3, x2/6, x3/7], l_done, op1(x, same, const(3), op1(x2, same, const(6), op1(x3, same, const(7), loop)))))))\n\nMore readably, the optimized version is:\nguard_value(x, 3, [], b2,\nop2(i, sub, var(i), const(7),\nop2(c, ge, var(i), const(0),\nguard_true(c, [x/3, x2/6, x3/7], l_done,\nop1(x, same, const(3),\nop1(x2, same, const(6),\nop1(x3, same, const(7),\nloop)))))))\n\nAs intended, the operations on x after the guard_value have all been\nremoved. However, some additional assignments (to x, x2, x3) at the end have been generated as\nwell. The assignments look superfluous, but the optimizer does not have\nenough information to easily recognize this. That can be fixed, but only at the\ncost of additional complexity. (A real system would transform the trace into\nstatic single assignment form to answer such questions.)\nResuming to the Interpreter\nWhy does the code above need to add the partial environment to\nthe guards that cannot be optimized away? 
The reason is related to why we needed\nto generate assignments before the loop statement. The problem is that the optimizer\nremoves assignments to variables when it knows the values of these variables.\nThat means that when switching back from running the optimized trace to the\ninterpreter, a number of variables are not updated in the environment, making\nthe execution in the interpreter incorrect.\nIn the example above, this applies to the variables x2 and x3. When the\nsecond guard fails, they have not been assigned in the optimized case.\nTherefore, the guard lists them and their (always constant) values.\nWhen switching back these assignments need to be made. Thus we need to adapt the\nresume_interp function from the last blog post as follows:\nwrite_resumevars([], Env, Env).\nwrite_resumevars([Key / Value | Rest], Env, NEnv) :-\n write_env(Env, Key, Value, Env1),\n write_resumevars(Rest, Env1, NEnv).\n\nresume_interp(Env, ResumeVars, L) :-\n write_resumevars(ResumeVars, Env, NEnv),\n block(L, Block),\n interp(Block, NEnv).\n\nOn resuming, the ResumeVars (a former partial environment) are simply added\nback to the normal environment before going back to the interpreter.\nThe data attached to guards about what needs to be done to resume to the\ninterpreter when the guard fails is often a very complex part of a tracing\nsystem. The data can become big, yet most guards never fail. Therefore, most\nreal systems try hard to compress the attached data or try to share it between\nsubsequent guards.\nSummary\nIn this post we have shown how to optimize traces by applying a variant of the\npartial evaluation principle: Perform all the operations that have only constant\narguments, leave the others alone. However, optimizing traces is much simpler,\nbecause no control flow is involved. 
All the questions about control flow have\nalready been solved by the tracing component.\nIn the next and final post of the series I will show a larger example of how\ntracing and partial evaluation can be used to optimize a small bytecode\ninterpreter.", + "tags": "", + "url": "https://www.pypy.org/posts/2012/02/optimizing-traces-of-flow-graph-4169388883059419385.html" + }, + { + "title": "Almost There - PyPy's ARM Backend", + "text": "In this post I want to give an update on the status of the ARM backend for PyPy's JIT and describe some of the issues and details of the backend.\n\n\n\n\n\n\n\nCurrent Status\nIt has been a more than a year that I have been working on the ARM backend. Now it is in a shape, that we can measure meaningful numbers and also ask for some feedback.\u00a0Since the last post about the backend we have added support floating point operations as well as for PyPy's framework GC's. Another area of work was to keep up with the constant improvements done in the main development branch, such as out-of-line guards, labels, etc.\u00a0It has been possible for about a year to cross-translate the PyPy Python interpreter and other interpreters such as Pyrolog, with a JIT, to run benchmarks on ARM. Up until now there remained some hard to track bugs that would cause the interpreter to crash with a segmentation fault in certain cases when running with the JIT on ARM. Lately it was possible to run all benchmarks without problems, but when running the translation toolchain itself it would crash.\u00a0During the last PyPy sprint in Leysin Armin and I managed to fix several of these hard to track bugs in the ARM backend with the result that, it is now possible to run the PyPy translator on ARM itself (at least unless until it runs out of memory), which is a kind of litmus test for the backend itself and used to crash before. 
Just to point it out, we are not able to complete a PyPy translation on ARM, because on the hardware we have currently available there is not enough memory. But up to the point we run out of memory the JIT does not hit any issues.\n\n\n\n\n\n\n\nImplementation Details\nThe hardware requirements to run the JIT on ARM follow those for Ubuntu on ARM which targets ARMv7 with a VFP unit running in little endian mode. The JIT can be translated without floating point support, but there might be a few places that need to be fixed to fully work in this setting.\u00a0We are targeting the ARM instruction set, because at least at the time we decided to use it seemed to be the best choice in terms of speed while having some size overhead compared to the Thumb2 instruction set. It appears that the Thumb2 instruction set should give comparable speed with better code density but has a few restriction on the number of registers available and the use of conditional execution. Also the implementation is a bit easier using a fixed width instruction set and we can use the full set of registers in the generated code when using the ARM instruction set.\n\n\n\n\n\n\n\nThe calling convention on ARM\nThe calling convention on ARM uses 4 of the general purpose registers to pass arguments to functions, further arguments are passed on the stack. The presence of a floating point unit is not required for ARM cores, for this reason there are different ways of handling floats with relation to the calling convention. There is a so called soft-float calling convention that is independent of the presence of a floating point unit. For this calling convention floating point arguments to functions are stored in the general purpose registers and on the stack. Passing floats around this way works with software and hardware floating point implementations. 
But in presence of a floating point unit it produces some overhead, because floating point numbers need to be moved from the floating point unit to the core registers to do a call and moved back to the floating point registers by the callee. The alternative calling convention is the so-called hard-float calling convention which requires the presence of a floating point unit but has the advantage of getting rid of the overhead of moving floating point values around when performing a call. Although it would be better in the long term to support the hard-float calling convention, we need to be able to interoperate with external code compiled for the operating system we are running on. For this reason at the moment we only support the soft-float to interoperate with external code.\u00a0We implemented and tested the backend on a BeagleBoard-xM with a Cortex-A8 processor running Ubuntu 11.04 for ARM.\n\n\n\n\n\n\n\nTranslating for ARM\nThe toolchain used to translate PyPy currently is based on a Scratchbox2. Scratchbox2 is a cross-compiling environment. Development had stopped for a while, but it seems to have revived again. We run a 32-bit Python interpreter on the host system and perform all calls to the compiler using a Scratchbox2 based environment. A description on how to setup the cross translation toolchain can be found here.\n\n\n\n\n\n\n\nResults\nThe current results on ARM, as shown in the graph below, show that the JIT currently gives a speedup of about 3.5 times compared to CPython on ARM. The benchmarks were run on the before mentioned BeagleBoard-xM with a 1GHz ARM Cortex-A8 processor and 512MB of memory. The operating system on the board is Ubuntu 11.04 for ARM. We measured the PyPy interpreter with the JIT enabled and disabled comparing each to CPython Python 2.7.1+ (r271:86832) for ARM. The graph shows the speedup or slowdown of both PyPy versions for the different benchmarks from our benchmark suite normalized to the runtime of CPython. 
The data used for the graph can be seen below.\n\n\n\nThe speedup is less than the speedup of 5.2 times we currently get on x86 on our own benchmark suite (see https://speed.pypy.org for details). There are several possible reasons for this. Comparing the results for the interpreter without the JIT on ARM and x86 suggests that the interpreter generated by PyPy, without the JIT, has a worse performance when compared to CPython that it does on x86. Also it is quite possible that the code we are generating with the JIT is not yet optimal. Also there are some architectural constraints produce some overhead. One of these differences is the handling of constants, most ARM instructions only support 8 bit (that can be shifted) immediate values, larger constants need to be loaded into a register, something that is not necessary on x86.\n\n\nBenchmarkPyPy JITPyPy no JIT\nai0.4844397800473.72756749625\nchaos0.08072916919342.2908692212\ncrypto_pyaes0.07111148322453.30112318509\ndjango0.09777432455192.56779947601\nfannkuch0.2104237356982.49163632938\nfloat0.1542753346752.12053281495\ngo0.3304830342025.84628320479\nhtml5lib0.6292643898623.60333138526\nmeteor-contest0.9847474269122.93838610037\nnbody_modified0.2369695930821.40027234936\npyflate-fast0.3674471918072.72472422146\nraytrace-simple0.02905274614371.97270054339\nrichards0.0345755735533.29767342015\nslowspitfire0.7866425519083.7397367403\nspambayes0.6603243794563.29059863111\nspectral-norm0.0636107837314.01788986233\nspitfire0.436171311652.72050579076\nspitfire_cstringio0.2555387021341.7418593111\ntelco0.1029189304133.86388866047\ntwisted_iteration0.1227239868054.33632475491\ntwisted_names2.423677971352.99878698076\ntwisted_pb1.309918374314.48877805486\ntwisted_tcp0.9270333540552.8161624665\nwaf1.020598119321.03793427321\n\n\n\n\n\n\n\n\n\n\nThe next steps and call for help\nAlthough there probably still are some remaining issues which have not surfaced yet, the JIT backend for ARM is working. 
Before we can merge the backend into the main development line there are some things that we would like to do first, in particular we are looking for a way to run all the PyPy tests to verify that things work on ARM before we can merge.\u00a0Additionally\u00a0there are some other longterm ideas. To do this we are looking for people willing to help, either by contributing to implement the open features or by helping us with hardware to test.\n\nThe incomplete list of open topics:\n\nWe are looking for a better way to translate PyPy for ARM than the one described above. I am not sure if there currently is hardware with enough memory to directly translate PyPy on an ARM based system, this would require between 1.5 and 2 GB of memory. A fully QEMU based approach could also work, instead of Scratchbox2 that uses QEMU under the hood.\nTest the JIT on different hardware.\nExperiment with the JIT settings to find the optimal thresholds for ARM.\nContinuous integration: We are looking for a way to run the PyPy test suite to make sure everything works as expected on ARM, here QEMU also might provide an alternative.\nA long term plan would be to port the backend to ARMv5 ISA and improve the support for systems without a floating point unit. This would require implementing the ISA and creating different code paths and improving the instruction selection depending on the target architecture.\nReview of the generated machine code the JIT generates on ARM to see if the instruction selection makes sense for ARM.\nBuild a version that runs on Android.\nImprove the tools, i.e. 
integrate with jitviewer.\n\nSo if you are interested or willing to help in any way contact us.", + "tags": "arm,jit,pypy", + "url": "https://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.html" + }, + { + "title": "A Simple Tracer for the Flow Graph Language", + "text": "Part 2 of Comparing Partial Evaluation to Tracing\nThis is the second blog post in a series about comparing partial evaluation and\ntracing. In the first post of the series I introduced a small flow-graph\nlanguage together with an interpreter for it. Then I showed a partial evaluator\nfor the language. In this post I will show how a tracer for the same language\nworks and how it relates to both execution and to partial evaluation.\nThe code from this post can be found here: https://paste.pocoo.org/show/543542/\nTracing Execution\nThe idea of a tracer (for the described language and also in general) is to do completely normal\ninterpretation but at the same time keep a log of all the normal operations\n(i.e. non-control-flow operations) that were performed. This continues until the\ntracer executes the code block where it started at, in which case the trace\ncorresponds to a closed loop. Then tracing stops and the last operation is\nreplaced by a jump to the start. After tracing has ended, the trace can be\nexecuted, optionally optimizing it before that.\nTo write a tracer, we start from the rules of the interpreter, rename the\npredicate to trace and add some extra arguments. 
Thus, the following rules\nin the interpreter:\ninterp(op1(ResultVar, Op, Arg, Rest), Env) :-\n resolve(Arg, Env, RArg),\n do_op(Op, RArg, Res),\n write_env(Env, ResultVar, Res, NEnv),\n interp(Rest, NEnv).\n\ninterp(op2(ResultVar, Op, Arg1, Arg2, Rest), Env) :-\n resolve(Arg1, Env, RArg1),\n resolve(Arg2, Env, RArg2),\n do_op(Op, RArg1, RArg2, Res),\n write_env(Env, ResultVar, Res, NEnv),\n interp(Rest, NEnv).\n\nbecome the following rules in the tracer:\ntrace(op1(ResultVar, Op, Arg, Rest), Env, op1(ResultVar, Op, Arg, T), TraceAnchor) :-\n resolve(Arg, Env, RArg),\n do_op(Op, RArg, Res),\n write_env(Env, ResultVar, Res, NEnv),\n trace(Rest, NEnv, T, TraceAnchor).\n\ntrace(op2(ResultVar, Op, Arg1, Arg2, Rest), Env, op2(ResultVar, Op, Arg1, Arg2, T), TraceAnchor) :-\n resolve(Arg1, Env, RArg1),\n resolve(Arg2, Env, RArg2),\n do_op(Op, RArg1, RArg2, Res),\n write_env(Env, ResultVar, Res, NEnv),\n trace(Rest, NEnv, T, TraceAnchor).\n\nNote how the bodies of the trace rules correspond exactly to the bodies of\nthe interp rules, the only difference is the recursive call to trace.\nThe meaning of the arguments of trace is as follows: The first and second argument are\nthe operation currently executed and the environment,\nlike in the interpreter. The argument\nafter that is an output argument that collects the currently traced operation,\nin the example above it is exactly like the operation that was executed.\nTraceAnchor is additional information about the trace that is being built\nright now, most of the time it is just handed on to the recursive call of\ntrace. We will see later what it contains.\nThe rule for print_and_stop is very simple, as execution (and therefore also\ntracing) simply stops there:\ntrace(print_and_stop(V), Env, print_and_stop(V), _) :-\n resolve(V, Env, Val),\n print(Val), nl.\n\nLeft are the rules for the control operations jump and if. A trace\nlinearizes one execution path, it contains no jumps. 
However, when a jump to the\nstarting label is reached, tracing should stop. Therefore, the implementation of\njump contains two cases:\ntrace(jump(L), Env, T, TraceAnchor) :-\n (TraceAnchor = traceanchor(L, FullTrace) ->\n T = loop,\n write(trace), nl, write(FullTrace), nl,\n do_optimize(FullTrace, OptTrace),\n write(opttrace), nl, write(OptTrace), nl,\n runtrace(OptTrace, Env, OptTrace)\n ;\n block(L, Block),\n trace(Block, Env, T, TraceAnchor)\n ).\n\nLet's dissect this code in small increments. First, we see what TraceAnchor\nis. It is a term of the form\ntraceanchor(StartLabel, FullTrace). StartLabel is a label in the program\nwhere tracing started (and where it should end as well, when the loop is\nclosed). The argument FullTrace is an accumulator which contains the full\ntrace that is being built right now.\nThe condition at the start of the rule checks whether the target label L is\nthe same as the one stored in the trace anchor. If that is the case, we can stop\ntracing. The rest of the trace T is assigned the operation loop, which\njumps back to the beginning of the trace. Afterwards we print and optimize the\ntrace, then run it, using the FullTrace part of the traceanchor.\nIf the label we jump to is not the StartLabel we simply continue tracing\nwithout recording any operation. This part of the rule is again extremely\nsimilar to the interpretation of jump.\nFor now, we will not use any interesting optimizations, just return the\nunoptimized trace unchanged:\ndo_optimize(FullTrace, FullTrace).\n\nThe missing operation now is if. An if statement needs special treatment,\nbecause it is a way where control flow can diverge from the trace. The trace is\nlinear, therefore it can only record one of the two possible paths. When\nexecuting the trace it is possible for the other path to be taken. Therefore\nwe need to make sure that the same conditions that were true or false during\ntracing are still true or false during the execution of the trace. 
This is done\nwith a guard operation, which checks for this condition. The following rule\nimplements it:\ntrace(if(V, L1, L2), Env, T, TraceAnchor) :-\n lookup(V, Env, Val),\n (Val == 0 ->\n L = L2, T = guard_false(V, [], L1, NT)\n ;\n L = L1, T = guard_true(V, [], L2, NT)\n ),\n trace(jump(L), Env, NT, TraceAnchor).\n\nIt is very similar to the interp rule of if. The rule inserts a\nguard_true into the case, if the condition is true, and a guard_false if\nthe condition is false. The arguments of the guard are: The variable that is\nbeing guarded, an empty list (the reason for that will be explained in a later\npost), the label where execution needs to continue when the guard fails and the\nrest of the trace.\nLet's also add a small helper predicate that can be used to conveniently start\ntracing:\ndo_trace(L, Env) :-\n block(L, StartBlock),\n trace(StartBlock, Env, ProducedTrace, traceanchor(L, ProducedTrace)).\n\nThe predicate takes a label and an environment and executes the label with the\ngiven environment by first producing a trace, then executing the trace and\neventually jumping back to interpretation, if a guard fails. 
It does this by\nreading the code at label L with the block statement, and then calling\ntrace with an unbound variable ProducedTrace to hold the trace and a trace\nanchor that contains the label where tracing started and the produced trace\nvariable.\nWith that predicate and the trace so far we can already trace the power\nimplementation from the last blog post, just not execute the trace (which we\nwill do in the next section):\n?- do_trace(power_rec, [res/1, x/10, y/20]).\ntrace\nop2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))\nopttrace\nop2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))\n...\n\nThe computed trace is:\n\nop2(res,mul,var(res),var(x),\nop2(y,sub,var(y),const(1),\nguard_true(y,[],power_done,\nloop)))\n\nwhich is exactly the content of the loop from power_rec. Note how the if\nis turned into a guard_true which jumps to power_done if the guard\nfails.\nA real tracing system would need a way for the tracer to get started, e.g. by\ndoing profiling in an interpreter and starting the tracer for labels that are\njumped to often. Also, traces for the same label are usually cached in some way.\nThese details are left out in this simple model.\nExecuting Traces\nIn a real tracing system, the traces would be turned into machine code and\nexecuted by the CPU. In our small model, we will simply write another\ninterpreter for them. 
This interpreter is very simple and looks again very\nsimilar to interp.\nruntrace(op1(ResultVar, Op, Arg, Rest), Env, TraceFromStart) :-\n resolve(Arg, Env, RArg),\n do_op(Op, RArg, Res),\n write_env(Env, ResultVar, Res, NEnv),\n runtrace(Rest, NEnv, TraceFromStart).\n\nruntrace(op2(ResultVar, Op, Arg1, Arg2, Rest), Env, TraceFromStart) :-\n resolve(Arg1, Env, RArg1),\n resolve(Arg2, Env, RArg2),\n do_op(Op, RArg1, RArg2, Res),\n write_env(Env, ResultVar, Res, NEnv),\n runtrace(Rest, NEnv, TraceFromStart).\n\nThese rules are completely equivalent to the interp rules for op1 and\nop2. runtrace needs an extra argument, TraceFromStart, which is\nalways just handed over to the recursive call of runtrace.\nWhen the end of the trace is reached and the loop statement is encountered,\nwe simply start from the beginning:\nruntrace(loop, Env, TraceFromStart) :-\n runtrace(TraceFromStart, Env, TraceFromStart).\n\nThe remaining question is what to do when encountering guards. In that case the\nguard condition needs to be checked. If the guard succeeds, executing the trace can\ncontinue. Otherwise the trace is aborted and the interpreter resumes execution:\nruntrace(guard_true(V, ResumeVars, L, Rest), Env, TraceFromStart) :-\n lookup(V, Env, Val),\n (Val == 0 ->\n resume_interp(Env, ResumeVars, L)\n ;\n runtrace(Rest, Env, TraceFromStart)\n ).\n\nruntrace(guard_false(V, ResumeVars, L, Rest), Env, TraceFromStart) :-\n lookup(V, Env, Val),\n (Val == 0 ->\n runtrace(Rest, Env, TraceFromStart)\n ;\n resume_interp(Env, ResumeVars, L)\n ).\n\n\nresume_interp(Env, [], L) :-\n block(L, Block),\n interp(Block, Env).\n\nNote how the execution is handed over to the interpreter at the label that was\nencoded as the third argument in the guard operation.\nWhat the ResumeVars are for we will see in a later post. 
For now we assume\nthat it is always an empty list.\nWith this interpreter for traces we can now trace and then execute the example:\n:- do_trace(power_rec, [res/1, x/10, y/20]).\ntrace\nop2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))\nopttrace\nop2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))\n100000000000000000000\n\nOf course this example is not very exciting, because the trace looks more or less exactly\nlike the original code as well. There will be more exciting examples in a later\npost.\nExtension: Promotion\nAs it is, the tracer does not actually add much to the interpreter. It\nlinearizes control flow, but nothing deeply advanced happens. In this section I\nwill add a crucial but simple to implement extension to the control flow language that allows the tracer\nto do more interesting things. This extension is called promotion.\nPromotion is basically a hint that the programmer can add to her control flow\ngraph program. A promotion is an operation promote(V, L) that takes a\nvariable V and a label L. When the interpreter runs this statement, it\nsimply jumps to the label L and ignores the variable:\ninterp(promote(_, L), Env) :-\n interp(jump(L), Env).\n\nHowever, the tracer does something much more interesting. For the tracer, the\npromote statement is a hint that it would be very useful to know the value\nof V and that the rest of the trace should keep that value as a constant.\nTherefore, when the tracer encounters a promotion, it inserts a special kind of\nguard called guard_value:\ntrace(promote(V, L), Env, guard_value(V, Val, [], L, T), TraceAnchor) :-\n lookup(V, Env, Val),\n trace(jump(L), Env, T, TraceAnchor).\n\nThe guard_value is an interesting operation, because it freezes the current\nvalue FVal of variable V into the trace. When the trace is executed, the\nguard checks that the current value of the variable and the frozen value are the\nsame. 
If yes, execution continues, if not, the trace is aborted:\nruntrace(guard_value(V, FVal, ResumeVars, L, Rest), Env, TraceFromStart) :-\n lookup(V, Env, Val),\n (Val == FVal ->\n runtrace(Rest, Env, TraceFromStart)\n ;\n resume_interp(Env, ResumeVars, L)\n ).\n\nWhat can this operation be used for? It's a way to communicate to the tracer\nthat variable V is not changing very often and that it is therefore useful\nto freeze the current value into the trace. This can be done even without\nknowing the value of V in advance.\nLet's look at a (slightly contrived) example:\n\nl:\n c = i >= 0\n if c goto b else goto l_done\n\nl_done:\n print_and_stop(var(i))\n\nb:\n promote(x, b2)\n\nb2:\n x2 = x * 2\n x3 = x2 + 1\n i = i - x3\n goto l\n\nEncoded in Prolog syntax:\nblock(l, op2(c, ge, var(i), const(0),\n if(c, b, l_done))).\nblock(l_done, print_and_stop(var(i))).\n\nblock(b, promote(x, b2)).\nblock(b2, op2(x2, mul, var(x), const(2),\n op2(x3, add, var(x2), const(1),\n op2(i, sub, var(i), var(x3),\n jump(l))))).\n\nThis is a simple loop that counts down in steps of x * 2 + 1, whatever x\nmight be, until i >= 0 is no longer true. Assuming that x doesn't change\noften, it is worth to promote it to be able to constant-fold x * 2 + 1 to\nnot have to redo it every iteration. 
This is done with the promotion of x\n(of course optimizing this loop with loop invariant code motion would work as\nwell, because x doesn't actually change during the loop).\nTo trace this, we can run the following query:\n?- do_trace(b, [i/100, x/5]).\ntrace\nguard_value(x,5,[],b2,op2(x2,mul,var(x),const(2),op2(x3,add,var(x2),const(1),op2(i,sub,var(i),var(x3),op2(c,ge,var(i),const(0),guard_true(c,[],l_done,loop))))))\nopttrace\nguard_value(x,5,[],b2,op2(x2,mul,var(x),const(2),op2(x3,add,var(x2),const(1),op2(i,sub,var(i),var(x3),op2(c,ge,var(i),const(0),guard_true(c,[],l_done,loop))))))\n-10\n\nWriting the trace in a more readable way:\nguard_value(x,5,[],b2,\nop2(x2,mul,var(x),const(2),\nop2(x3,add,var(x2),const(1),\nop2(i,sub,var(i),var(x3),\nop2(c,ge,var(i),const(0),\nguard_true(c,[],l_done,\nloop))))))\n\nAfter the guard_value the operations performed on x could be\nconstant-folded away, because the guard ensures that x is 5 before\nexecution continues. To actually do the constant-folding we would need some\noptimization component that optimizes traces. This will be done in the next blog\npost.\nIn this section I mostly talked about how promotion is realized in the tracer,\nnot what to use it for or how to use it. Promotion is one of the most important\ningredients that's responsible for the success of PyPy's tracing approach. How\nthis works is discussed in detail in the paper \"Runtime feedback in a\nmeta-tracing JIT for efficient dynamic languages\".\nConclusion\nIn this blog post we have seen a very minimalistic tracer and an interpreter for\nthe produced traces. The tracer is very much like the original interpreter, it\njust also keeps track of which operations were executed, in addition to\nexecuting the program. Tracing stops when a loop is closed, then the trace can\nbe optimized and run. Running a trace continues until a failing guard is hit. 
At\nthat point, execution goes back to the normal interpreter (and stays there, in\nthis very simple implementation).\nI also presented an extension of tracing that makes it possible to add a hint\ncalled promote to the original program that tells the tracer to feed back a\nruntime value into the trace and freeze it there. This extension would be\nimpossible to do in the partial evaluator from the last post, because partial\nevaluation is done strictly before run time, so if a variable isn't already\nknown, its likely runtime value cannot be found out.\nIn the next post I will show how to optimize traces before executing them and\nhow the optimizer for traces is related to partial evaluation.", + "tags": "", + "url": "https://www.pypy.org/posts/2012/01/simple-tracer-for-flow-graph-language-6930951890987229484.html" + }, + { + "title": "NumPyPy status update", + "text": "Hello.\nThis is just a quick status update on the NumPy in PyPy project that very\nrecently became my day job. I should give my thanks once again to Getco,\nNate Lawson and other contributors who donated above $40000 towards the goal.\nRecently we (Alex Gaynor, Matti Picus and me) implemented a few interesting things\nthat a lot of people use:\n\nmore ufuncs\nmost ufuncs now accept the axis parameter (except all and any)\nfixed string representation of arrays, now it's identical to numpy (uses\npretty much the same code)\nndarray.flat should be working correctly\nndarray.flatten, ndarray.ravel, ndarray.take\nindexing arrays by boolean arrays of the same size\nand various bugfixes.\n\nWe would also like to introduce the nightly report of numpy status. This\nis an automated tool that does package introspection. While it gives some\nsort of idea how much of numpy is implemented, it's not by far the authority.\nYour tests should be the authority. 
It won't report whether functions\nsupport all kinds of parameters (for example masked arrays and out parameter\nare completely unsupported) or that functions work at all. We also\nreserve the right to incorporate jokes in that website, so don't treat it\nthat seriously overall :-)\nThanks, and stay tuned. We hope to post here regular updates on the\nprogress.\nCheers,\nfijal & the PyPy team", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2012/01/numpypy-status-update-6434340612277938795.html" + }, + { + "title": "Py3k and Numpy First Stage: Thanks to all who Gave", + "text": "Last year was quite successful for PyPy fundraising through the Software Freedom Conservancy, and Conservancy and PyPy are very excited to announce that enough was raised to begin the first stages on the Py3k and Numpy grant proposals.As of the end of 2011, 135 different individuals gave to the Py3k campaign, and 114 to the Numpy campaign. We thank each of you who donated to help make this work possible. Meanwhile, if you haven't given to support these projects, we do hope you'll give generously now to help fund their second stages later this year!We're also particularly excited that a few donors gave particularly large donations to support this work; those big donations really filled in the gap to help us get started!Specifically, we're pleased to announce that Google donated $35000 towards implementing Python 3 in PyPy. Google's general support of the Python community is well known, and their specific support of our grant proposal is much appreciated.Meanwhile, Numpy was supported in part by contributions from Nate Lawson, Cantab Capital Partners, and Getco, as well as more than a hundred other contributors.With these donations combined with many others, we're now starting work on both projects. 
This week, the Conservancy signed contracts with Antonio Cuni and Benjamin Peterson to work towards the Stage 1.1 goals in Py3k proposal (and is negotiating for another contractor as well), and with Maciej Fija\u0142kowski to work towards the Stage 1 goals in the Numpy proposal.In 2012, PyPy will continue regular sprint meetings, at which Py3K and Numpy efforts will certainly have a place. We have some limited funds to fund travels of contributors to those meetings.We're very thankful for all who donated so far to support these efforts, and we hope that now that work has begun, even more donors will come forward to help us finish the job. In the meantime, watch for the commits showing up from these developers and other contributors in the PyPy repositories!Cheers, The PyPy Team", + "tags": "numpy,pypy3", + "url": "https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html" + }, + { + "title": "Comparing Partial Evaluation and Tracing, Part 1", + "text": "As part of writing my PhD I am currently thinking about the relationship\nbetween PyPy's meta-tracing approach with various previous ideas to\nautomatically get a (JIT-)compiler from only an interpreter of a language. One\nof the most-researched ideas along these lines is that of partial evaluation.\nPartial evaluation has basically the same goals as PyPy when it comes to\ncompilers: Write an interpreter, and get a compiler for free. The methods for\nreaching that goal are a bit different. In this series of blog posts, I am\ntrying to explore the similarities and differences of partial evaluation and\nPyPy's meta-tracing.\nA Flowgraph Language\nTo be able to clearly understand what \"partial evaluation\" is and what\n\"meta-tracing\" is I will show an \"executable model\" of both. To that end, I am\ndefining a small imperative language and will then show what a partial evaluator\nand a tracer for that language look like. All this code will be\nimplemented in Prolog. 
(Any pattern-matching functional language would do, but I\nhappen to know Prolog best. Backtracking is not used, so you can read things\nsimply as functional programs.) In this post I will start with\nthe definition of the language, and a partial evaluator for it. The code\nwritten in this blog post can be found fully here: https://paste.pocoo.org/show/541004/\nThe language is conceptionally similar to PyPy's flow graphs, but a bit more\nrestricted. It does not have function calls, only labelled basic blocks\nthat consist of a series of linearly executed operations, followed by a\nconditional or an unconditional jump. Every operation is assigning a value to a\nvariable, which is computed by applying some operation to some arguments.\nA simple program to raise x to the yth power in that language looks like\nthis:\n\npower:\n res = 1\n if y goto power_rec else goto power_done\n\npower_rec:\n res = res * x\n y = y - 1\n if y goto power_rec else goto power_done\n\npower_done:\n print_and_stop(res)\n\nTo represent the same program as Prolog data structures, we use the\nfollowing Prolog code:\nblock(power, op1(res, same, const(1),\n if(y, power_rec, power_done))).\nblock(power_rec, op2(res, mul, var(res), var(x),\n op2(y, sub, var(y), const(1),\n if(y, power_rec, power_done)))).\nblock(power_done, print_and_stop(var(res))).\n\nEvery rule of block declares one block by first giving the label of the\nblock, followed by the code. Code is a series of op1 or op2 statements\nterminated by a jump, an if or a print_and_stop. op1 statements\nare operations with one argument of the form op1(res_variable,\noperation_name, argument, next_statement). Arguments can be either variables\nin the form var(name) or constants in the form const(value).\nTo run programs in this flowgraph language, we first need some helper\nfunctionality. 
The first few helper functions are concerned with the handling of\nenvironments, the data structures the interpreter uses to map variable\nnames occuring in the program to the variables' current values. In Python\ndictionaries would be used for this purpose, but in Prolog we have to emulate\nthese by lists of key/value pairs (not very efficient, but good enough):\nlookup(X, [], _) :- throw(key_not_found(X)).\nlookup(Key, [Key/Value | _], Value) :- !.\nlookup(Key, [_ | Rest], Value) :- lookup(Key, Rest, Value).\n\nwrite_env([], X, V, [X/V]).\nwrite_env([Key/_ | Rest], Key, Value, [Key/Value | Rest]) :- !.\nwrite_env([Pair | Rest], Key, Value, [Pair | NewRest]) :- write_env(Rest, Key, Value, NewRest).\n\nremove_env([], _, []).\nremove_env([Key/_ | Rest], Key, Rest) :- !.\nremove_env([Pair | Rest], Key, [Pair | NewRest]) :- remove_env(Rest, Key, NewRest).\n\nresolve(const(X), _, X).\nresolve(var(X), Env, Y) :- lookup(X, Env, Y).\n\nThe implementation of these functions is not too important. The lookup\nfunction finds a key in an environment list, the write_env function adds a\nnew key/value pair to an environment, remove_env removes a key. The\nresolve function is used to take either a constant or a variable and return\na value. If it's a constant, the value of that constant is returned, if it's a\nvariable it is looked up in the environment. Note how the last argument of\nlookup and resolve is actually a return value, which is the typical\napproach in Prolog.\nSo far we have not specified what the primitive operations that can occur in the\nprogram actually mean. 
For that we define a do_op function which\nexecutes primitive operations:\ndo_op(same, X, X).\ndo_op(mul, X, Y, Z) :- Z is X * Y.\ndo_op(add, X, Y, Z) :- Z is X + Y.\ndo_op(sub, X, Y, Z) :- Z is X - Y.\ndo_op(eq, X, Y, Z) :- X == Y -> Z = 1; Z = 0.\ndo_op(ge, X, Y, Z) :- X >= Y -> Z = 1; Z = 0.\ndo_op(readlist, L, I, X) :- nth0(I, L, X).\ndo_op(Op, _, _, _) :- throw(missing_op(Op)).\n\nAgain the last argument is an output variable.\nNow we can start executing simple operations. For that an interp predicate\nis defined. It takes as its first argument the current environment and as the\nsecond argument the operation to execute. E.g. to execute primitive operations\nwith one or two arguments:\ninterp(op1(ResultVar, Op, Arg, Rest), Env) :-\n resolve(Arg, Env, RArg),\n do_op(Op, RArg, Res),\n write_env(Env, ResultVar, Res, NEnv),\n interp(Rest, NEnv).\n\ninterp(op2(ResultVar, Op, Arg1, Arg2, Rest), Env) :-\n resolve(Arg1, Env, RArg1),\n resolve(Arg2, Env, RArg2),\n do_op(Op, RArg1, RArg2, Res),\n write_env(Env, ResultVar, Res, NEnv),\n interp(Rest, NEnv).\n\nFirst the arguments are resolved into values. Afterwards the operation is executed,\nand the result is written back into the environment. Then interp is called on\nthe rest of the program. Similarly easy are the unconditional jump and\nprint_and_stop:\ninterp(jump(L), Env) :-\n block(L, Block),\n interp(Block, Env).\n\n\ninterp(print_and_stop(Arg), Env) :-\n resolve(Arg, Env, Val),\n print(Val), nl.\n\nIn the unconditional jump we simply get the target block and continue executing\nthat. To execute print_and_stop we resolve the argument, print the value and\nthen are done.\nThe conditional jump is only slightly more difficult:\ninterp(if(V, L1, L2), Env) :-\n lookup(V, Env, Val),\n (Val == 0 ->\n block(L2, Block)\n ;\n block(L1, Block)\n ),\n interp(Block, Env).\n\nFirst the variable is looked up in the environment. 
If the variable is zero,\nexecution continues at the second block, otherwise it continues at the first\nblock.\nGiven this interpreter, we can execute the above example program like this, on a\nProlog console:\n$ swipl -s cfglang.pl\n?- block(power, Block), interp(Block, [x/10, y/10]).\n10000000000\n\nPartial Evaluation of the Flowgraph Language\nLet's look at what a partial evaluator for this simple flowgraph language would\nlook like. Partial evaluation (PE), also called specialization, is a program\nmanipulation technique. PE takes an input program and transforms it into a\n(hopefully) simpler and faster output program. It does this by assuming that\nsome variables in the input program are constants. All operations that act only\non such constants can be folded away. All other operations need to remain in the\noutput program (called residual program). Thus the partial evaluator proceeds\nmuch like an interpreter, just that it cannot actually execute some operations.\nAlso, its output is not just a value, but also a list of remaining operations that\ncould not be optimized away.\nThe partial evaluator cannot use normal environments, because unlike the\ninterpreter not all variables' values are known to it. It will therefore work on\npartial environments, which store just the known variables. For these partial\nenvironments, some new helper functions are needed:\nplookup(Key, [], var(Key)).\nplookup(Key, [Key/Value | _], const(Value)) :- !.\nplookup(Key, [_ | Rest], Value) :- plookup(Key, Rest, Value).\n\npresolve(const(X), _, const(X)).\npresolve(var(V), PEnv, X) :- plookup(V, PEnv, X).\n\nThe function plookup takes a variable and a partial environment and returns\neither const(Value) if the variable is found in the partial environment or\nvar(Key) if it is not. Equivalently, presolve is like resolve,\nexcept that it uses plookup instead of lookup.\nWith these helpers we can start writing a partial evaluator. 
The following two\nrules are where the main optimization in the form of constant folding happens.\nThe idea is that when the partial evaluator sees an operation that involves\nonly constant arguments, it can constant-fold the operation, otherwise it\ncan't:\npe(op1(ResultVar, Op, Arg, Rest), PEnv, NewOp) :-\n presolve(Arg, PEnv, RArg),\n (RArg = const(C) ->\n do_op(Op, C, Res),\n write_env(PEnv, ResultVar, Res, NEnv),\n RestResidual = NewOp\n ;\n remove_env(PEnv, ResultVar, NEnv),\n NewOp = op1(ResultVar, Op, RArg, RestResidual)\n ),\n pe(Rest, NEnv, RestResidual).\n\npe(op2(ResultVar, Op, Arg1, Arg2, Rest), PEnv, NewOp) :-\n presolve(Arg1, PEnv, RArg1),\n presolve(Arg2, PEnv, RArg2),\n (RArg1 = const(C1), RArg2 = const(C2) ->\n do_op(Op, C1, C2, Res),\n write_env(PEnv, ResultVar, Res, NEnv),\n RestResidual = NewOp\n\n ;\n remove_env(PEnv, ResultVar, NEnv),\n NewOp = op2(ResultVar, Op, RArg1, RArg2, RestResidual)\n ),\n pe(Rest, NEnv, RestResidual).\n\nThe pe predicate takes a partial environment, the current operations and\npotentially returns a new operation. To partially evaluate a simple operation, its arguments are\nlooked up in the partial environment. If all the arguments are constants, the\noperation can be executed, and no new operation is produced. Otherwise, we need\nto produce a new residual operation which is exactly like the one currently\nlooked at. Also, the result variable needs to be removed from the partial\nenvironment, because it was just overwritten by an unknown value.\nThe potentially generated residual operation is stored into the output argument\nNewOp. The output argument of the recursive call is the last argument of\nthe newly created residual operation, which will then be filled by the\nrecursive call. This is a typical approach in Prolog, but may look strange if\nyou are not familiar with it.\nNote how the first case of these two rules is just like interpretation. 
The\nsecond case doesn't really do anything, it just produces a residual operation.\nThis relationship between normal evaluation and partial evaluation is very\ntypical.\nThe unconditional jump and print_and_stop are not much more complex:\npe(jump(L), PEnv, jump(LR)) :-\n do_pe(L, PEnv, LR).\n\npe(print_and_stop(Arg), Env, print_and_stop(RArg)) :-\n presolve(Arg, Env, RArg).\n\nTo partially evaluate an unconditional jump we again produce a jump. The target\nlabel of that residual jump is computed by asking the partial evaluator to\nproduce residual code for the label L with the given partial environment.\nprint_and_stop is simply turned into a print_and_stop. We will see the\ncode for do_pe soon.\nConditional jumps are more interesting:\npe(if(V, L1, L2), PEnv, NewOp) :-\n plookup(V, PEnv, Val),\n (Val = const(C) ->\n (C = 0 ->\n L = L2\n ;\n L = L1\n ),\n do_pe(L, PEnv, LR),\n NewOp = jump(LR)\n ;\n do_pe(L1, PEnv, L1R),\n do_pe(L2, PEnv, L2R),\n NewOp = if(V, L1R, L2R)\n ).\n\nFirst we look up the value of the condition variable. If it is a constant, we\ncan produce better code, because we know statically that only one path is\nreachable. Thus we produce code for that path, and then emit an unconditional\njump there. If the condition variable is not known at partial evaluation time,\nwe need to partially evaluate both paths and produce a conditional jump in the\nresidual code.\nThis rule is the one that causes the partial evaluator to potentially do much\nmore work than the interpreter, because after an if sometimes both paths\nneed to be explored. In the worst case this process never stops, so a real\npartial evaluator would need to ensure somehow that it terminates. There are\nmany algorithms for doing that, but I will ignore this problem here.\nNow we need to understand what the do_pe predicate is doing. Its most\nimportant task is to make sure that we don't do the same work twice by\nmemoizing code that was already partially evaluated in the past. 
For that it\nkeeps a mapping of Label, Partial Environment to Label of the residual\ncode:\ndo_pe(L, PEnv, LR) :-\n (code_cache(L, PEnv, LR) ->\n true\n ;\n gensym(L, LR),\n assert(code_cache(L, PEnv, LR)),\n block(L, Code),\n pe(Code, PEnv, Residual),\n assert(block(LR, Residual))\n ).\n\nIf the code cache indicates that label L was already partially evaluated\nwith partial environment PEnv, then the previous residual code label\nLR\nis returned. Otherwise, a new label is generated with gensym, the code cache\nis informed of that new label with assert, then the block is partially\nevaluated and the residual code is added to the database.\nFor those who know partial evaluation terminology: This partial evaluator is a\npolyvariant online partial evaluator. \"Polyvariant\" means that for every label,\nseveral specialized versions of the block can be generated. \"Online\" means that\nno preprocessing is done before the partial evaluator runs.\n\nPartial Evaluation Example\nWith this code we can look at the classical example of partial evaluation (it's\nprobably the \"Hello World\" of partial evaluation). We\ncan ask the partial evaluator to compute a power function, where the exponent\ny is a fixed number, e.g. 5, and the base x is unknown:\n?- do_pe(power, [y/5], LR).\nLR = power1.\n\nTo find out which code was produced, we can use listing:\n?- listing(code_cache)\ncode_cache(power, [y/5], power1).\ncode_cache(power_rec, [y/5, res/1], power_rec1).\ncode_cache(power_rec, [y/4], power_rec2).\ncode_cache(power_rec, [y/3], power_rec3).\ncode_cache(power_rec, [y/2], power_rec4).\ncode_cache(power_rec, [y/1], power_rec5).\ncode_cache(power_done, [y/0], power_done1).\n\n?- listing(block)\n.... 
the block definition of the user program ....\nblock(power_done1, print_and_stop(var(res))).\nblock(power_rec5, op2(res, mul, var(res), var(x), jump(power_done1))).\nblock(power_rec4, op2(res, mul, var(res), var(x), jump(power_rec5))).\nblock(power_rec3, op2(res, mul, var(res), var(x), jump(power_rec4))).\nblock(power_rec2, op2(res, mul, var(res), var(x), jump(power_rec3))).\nblock(power_rec1, op2(res, mul, const(1), var(x), jump(power_rec2))).\nblock(power1, jump(power_rec1)).\n\nThe code_cache tells which residual labels correspond to which original\nlabels under which partial environments. Thus, power1 contains the code of\npower under the assumption that y is 5. Looking at the block listing,\nthe label power1 corresponds to code that simply multiplies res by x\nfive times without using the variable y at all. The loop that was present\nin the original program has been fully unrolled, the loop variable y has\ndisappeared. Hopefully this is faster than the original program.\n\nConclusion\nIn this blog post we saw an interpreter for a simple flow graph language in\nProlog, together with a partial evaluator for it. The partial evaluator\nessentially duplicates every rule of the interpreter. If all the arguments of\nthe current operation are known, it acts like the interpreter, otherwise it\nsimply copies the operation into the residual code.\nPartial evaluation can be used for a variety of applications, but the most\ncommonly cited one is that of applying it to an interpreter. To do that, the\nprogram that the interpreter runs is assumed to be constant by the partial\nevaluator. Thus a specialized version of the interpreter is produced that does\nnot use the input program at all. 
That residual code can be seen as a compiled\nversion of the input program.\nIn the next blog post in this series we will look at writing a simple tracer for\nthe same flowgraph language.", + "tags": "", + "url": "https://www.pypy.org/posts/2012/01/comparing-partial-evaluation-and-7255412724168990164.html" + }, + { + "title": "PyPy internship at NCAR", + "text": "Hello, everyone\nI would like to inform you that there is a very interesting opportunity\nfor doing an internship at NCAR in the lovely town of Boulder, situated\nin the foothills of the Rocky Mountains. Before you read on, make sure you:\n\nare a student of a US University, who is legally eligible to work in the US\nare at least finishing second year this year\napply before February 3rd.\n\nThe internship itself will focus on using PyPy (in some way) to provide\na high performance numeric kernel for an atmospheric model, and measuring how\nfast we can go. This is very much in line with what the current effort on\nNumPy in PyPy is about. The internship will be mentored by Davide del Vento\nand I hope to have some influence over where it goes myself :-)\nA few interesting links:\n\nprogram website\ninternship proposal - note that the actual roadmap is very flexible, as\nlong as it's a numeric kernel of an atmospheric model using PyPy.\n\nFeel free to contact Davide for details about the proposal and pypy-dev or\nme directly for details about PyPy.\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2012/01/pypy-internship-at-ncar-2244162842744077724.html" + }, + { + "title": "Transactional Memory (II)", + "text": "Here is an update about the previous blog post about the\nGlobal Interpreter Lock (GIL). In 5 months, the point of view\nchanged quite a bit.\nLet me remind you that the GIL is the technique used in both CPython and\nPyPy to safely run multi-threaded programs: it is a global lock that\nprevents multiple threads from actually running at the same time. 
The\nreason to do that is that it would have disastrous effects in the\ninterpreter if several threads access the same object concurrently --- to\nthe point that in CPython even just manipulating the object's reference\ncounter needs to be protected by the lock.\nSo far, the ultimate goal to enable true multi-CPU usage has been to\nremove the infamous GIL from the interpreter, so that multiple threads\ncould actually run in parallel. It's a lot of work, but this has been\ndone in Jython. The reason that it has not been done in CPython so far\nis that it's even more work: we would need to care not only about\ncarefully adding fine-grained locks everywhere, but also about reference\ncounting; and there are a lot more C extension modules that would need\ncare, too. And we don't have locking primitives as performant as\nJava's, which have been hand-tuned since ages (e.g. to use help from the\nJIT compiler).\nBut we think we have a plan to implement a different model for using\nmultiple cores. Believe it or not, this is better than just removing\nthe GIL from PyPy. You might get to use all your cores without ever\nwriting threads.\nYou would instead just use some event dispatcher, say from Twisted, from\nStackless, or from your favorite GUI; or just write your own. From\nthere, you (or someone else) would add some minimal extra code to the\nevent dispatcher's source code, to exploit the new transactional features\noffered by PyPy. Then you would run your program on a\nspecial version of PyPy, and voil\u00e0: you get some form of automatic parallelization.\nSounds magic, but the basic idea is simple: start handling multiple\nevents in parallel, giving each one its own transaction. More about\nit later.\n\nThreads or Events?\nFirst, why would this be better than \"just\" removing the GIL? Because\nusing threads can be a mess in any complex program. Some authors (e.g.\nLee) have argued that the reason is that threads are fundamentally\nnon-deterministic. 
This makes it very hard to reason about them.\nBasically the programmer needs to \"trim\" down the non-determinism (e.g.\nby adding locks, semaphores, etc.), and it's hard to be sure when he's\ngot a sufficiently deterministic result, if only because he can't write\nexhaustive tests for it.\nBy contrast, consider a Twisted program. It's not a multi-threaded\nprogram, which means that it handles the \"events\" one after the other.\nThe exact ordering of the events is not really deterministic, because\nthey often correspond to external events; but that's the only source of\nnon-determinism. The actual handling of each event occurs in a nicely\ndeterministic way, and most importantly, not in parallel with the\nhandling of other events. The same is true about other libraries like\nGUI toolkits, gevent, or Stackless.\n(Of course the Twisted and the Stackless models, to cite only these two,\nare quite different from each other; but they have in common the fact\nthat they are not multi-threaded, and based instead on \"events\" ---\nwhich in the Stackless case means running a tasklet from one switch()\npoint to the next one.)\nThese two models --- threads or events --- are the two main models we\nhave right now. The latter is more used in Python, because it is much\nsimpler to use than the former, and the former doesn't give any benefit\nbecause of the GIL. A third model, which is the only one that gives\nmulti-core benefits, is to use multiple processes, and do inter-process\ncommunication.\n\nThe problem\nConsider the case of a big program that has arbitrary complicated\ndependencies. Even assuming a GIL-less Python, this is likely enough to\nprevent the programmer from even starting a multi-threaded rewrite,\nbecause it would require a huge mess of locks. 
He could also consider\nusing multiple processes instead, but the result is annoying as well:\nthe complicated dependencies translate into a huge mess of inter-process\nsynchronization.\nThe problem can also be down-sized to very small programs, like the kind\nof hacks that you do and forget about. In this case, the dependencies\nmight be simpler, but you still have to learn and use subtle locking\npatterns or a complex inter-process library, which is overkill for the\npurpose.\n(This is similar to how explicit memory management is not very hard for\nsmall programs --- but still, nowadays a lot of people agree that\nautomatic memory management is easier for programs of all sizes. I\nthink the same will eventually be true for using multiple CPUs, but the\ncorrect solution will take time to mature, like garbage collectors did.\nThis post is a step in hopefully the right direction :-))\n\nEvents in Transactions\nLet me introduce the notion of independent events: two events are\nindependent if they don't touch the same set of objects. In a multi-threaded\nworld, it means that they can be executed in parallel without needing any lock\nto ensure correctness.\nEvents might also be mostly independent, i.e. they rarely access the same\nobject concurrently. Of course, in a multi-threaded world we would still need\nlocks to ensure correctness, but the point is that the locks are rarely causing\npauses: lock contention is low.\nConsider again the Twisted example I gave above. There are often several\nevents pending in the dispatch queue (assuming the program is using 100%\nof our single usable CPU, otherwise the whole discussion is moot). The case I am\ninterested in is the case in which these events are generally mostly\nindependent, i.e. we expect few conflicts between them. However\nthey don't have to be proved independent. In fact it is fine if\nthey have arbitrary complicated dependencies as described above. The\npoint is the expected common case. 
Imagine that you have a GIL-less\nPython and that you can, by a wave of your hand, have all the careful\nlocking mess magically done. Then what I mean here is the case in which\nsuch a theoretical program would run mostly in parallel on multiple\ncore, without waiting too often on the locks.\nIn this case, the solution I'm proposing is that with minimal tweaks\nin the event dispatch loop, we can\nhandle multiple events on multiple threads, each in its own transaction.\nA transaction is basically a tentative execution of the corresponding\npiece of code: if we detect conflicts with other concurrently executing\ntransactions, we abort the whole transaction and restart it from\nscratch.\nBy now, the fact that it can basically work should be clear: multiple\ntransactions will only get into conflict when modifying the same data\nstructures, which is the case where the magical wand above would have\nput locks. If the magical program could progress without too many\nlocks, then the transactional program can progress without too many\nconflicts. In a way, you get even more than what the magical program\ncan give you: each event is dispatched in its own transaction, which\nmeans that from each event's point of view, we have the illusion that\nnobody else is running concurrently. 
This is exactly what all existing\nTwisted-/Stackless-/etc.-based programs are assuming.\nNote that this solution, without transactions, already exists in some\nother languages: for example, Erlang is all about independent events.\nThis is the simple case where we can just run them on multiple cores,\nknowing by construction of the language that you can't get conflicts.\nOf course, it doesn't work for Python or for a lot of other languages.\nFrom that point of view, what I'm suggesting is merely that\ntransactional memory could be a good model to cope with the risks of\nconflicts that come from not having a special-made language.\n\nNot a perfect solution\nOf course, transactional memory\n(TM) is not a perfect solution either. Right now, the biggest issue is\nthe performance hit that comes from the software implementation (STM).\nIn time, hardware support (HTM) is likely to show up and help\nmitigate the problem; but I won't deny the fact that in some cases,\nbecause it's simple enough and/or because you really need the top\nperformance, TM is not the best solution.\nAlso, the explanations above are silent on what is a hard point for TM,\nnamely system calls. The basic general solution is to suspend other\ntransactions as soon as a transaction does its first system call, so\nthat we are sure that the transaction will succeed. Of course this\nsolution is far from optimal. Interestingly, it's possible to do better\non a case-by-case basis: for example, by adding in-process buffers, we\ncan improve the situation for sockets, by having recv() store in a\nbuffer what is received so that it can be re-recv()-ed later if the\ntransaction is aborted; similarly, send() or writes to log files can be\ndelayed until we are sure that the transaction will commit.\nFrom my point of view, the most important point is that the TM solution\ncomes from the correct side of the \"determinism\" scale. With threads,\nyou have to prune down non-determinism. 
With TM, you start from a\nmostly deterministic point, and if needed, you add non-determinism. The\nreason you would want to do so is to make the transactions shorter:\nshorter transactions have less risks of conflicts, and when there are\nconflicts, less things to redo. So making transactions shorter\nincreases the parallelism that your program can achieve, while at the\nsame time requiring more care.\nIn terms of an event-driven model, the equivalent would be to divide the\nresponse of a big processing event into several events that are handled\none after the other: for example, the first event sets things up and fires the second\nevent, which does the actual computation; and afterwards a third event\nwrites the results back. As a result, the second event's transaction\nhas little risks of getting aborted. On the other hand, the writing\nback needs to be aware of the fact that it's not in the same transaction\nas the original setting up, which means that other unrelated\ntransactions may have run in-between.\n\nOne step towards the future?\nThese, and others, are the problems of the TM approach. They are \"new\"\nproblems, too, in the sense that the existing ways of programming don't\nhave these problems.\nStill, as you have guessed, I think that it is overall a win, and\npossibly a big win --- a win that might be on the same scale for the age\nof multiple CPUs as automatic garbage collection was 20 years ago for\nthe age of RAM size explosion.\nStay tuned for more!\n--- Armin (and reviews by Antonio and Fijal)\n\n\nUPDATE: please look at the tiny transaction module I wrote as an example. The idea is to have the same interface as this module, but implemented differently. 
By making use of transactional memory internally, it should be possible to safely run on multiple CPUs while keeping the very same programmer interface.", + "tags": "stm", + "url": "https://www.pypy.org/posts/2012/01/transactional-memory-ii-7225309560970774590.html" + }, + { + "title": "NumPyPy progress report - running benchmarks", + "text": "Hello.\nWe're excited to let you know about some of the great progress we've made on\nNumPyPy: both completeness and performance. In this blog entry we mostly\nwill talk about performance and how much progress we have made so far.\nWord of warning: this\nwork is in progress -- we're maybe half way to where we want to be and there are\nmany trivial and not so trivial optimizations to be written. (For example, we\nhaven't even started to implement important optimizations, like vectorization.)\n\nBenchmark\nWe chose a laplace equation solver, based on SciPy's PerformancePython wiki.\nUnfortunately, the different implementations on the wiki page accidentally use\ntwo different algorithms, which have different convergences, and very different\nperformance characteristics on modern computers. As a result, we implemented\nour own versions in both C and Python (with and without NumPy). The full source\ncan be found in fijal's hack repo, all these benchmarks were performed at\nrevision 18502dbbcdb3.\nFirst, let me describe various algorithms used. Note that some of them contain\nPyPy-specific hacks to work around limitations in the current implementation.\nThese hacks will go away eventually and the performance will improve.\nNumerically the algorithms used are identical, however exact data layout in\nmemory differs between them.\nA note about all the benchmarks: they each were run once, but the\nperformance is very stable across runs.\nStarting with the C version, it implements a trivial laplace transform\nusing two loops and double-reference memory (array of int*). 
The double\nreference does not matter for performance and the two algorithms are\nimplemented in inline-laplace.c and laplace.c. They were both compiled\nwith gcc 4.4.5 at -O3. The inline version modifies array in-place while the non-inline version stores results in a copy. That makes them converge at different rate, hence different number of iterations\nA straightforward version of those in Python is implemented in laplace.py\nusing, respectively, inline_slow_time_step and slow_time_step.\nslow_2_time_step does the same thing, except it copies arrays in-place\ninstead of creating new copies. Table below compares running PyPy against C:\n\n\n\n\n\n\n\nbench\nnumber of iterations\ntime per iteration\n\nlaplace C\n219\n6.3ms\n\ninline-laplace C\n278\n20ms\n\nslow python\n219\n17ms\n\nslow 2 python\n219\n14ms\n\ninline_slow python\n278\n23.7ms\n\n\n\nAn important thing to notice is the data dependency of the inline\nversion causes a huge slowdown for the C versions. This is not a severe\ndisadvantage for us though -- the brain-dead Python version takes longer\nand PyPy is not able to take advantage of the knowledge that the data is\nindependent. The results are in the same ballpark as the C versions --\n15% - 170% slower, but the algorithm\none chooses matters more than the language. By comparison, the slow versions\ntake about 5.75s each on CPython 2.6 per iteration and, by estimation,\nare about 200x slower than the PyPy equivalent, if I had the patience to\nmeasure the full run.\nThe next step is to use NumPy expressions. The first problem we run into is\nthat computing the error requires walking the entire array a second time. This\nis fairly inefficient in terms of cache access, so I took the liberty of\ncomputing the errors every 15 steps. 
This results in the convergence being\nrounded to the nearest 15 iterations, but speeds things up considerably.\nnumeric_time_step takes the most braindead approach of replacing the array\nwith itself, like this:\n\nu[1:-1, 1:-1] = ((u[0:-2, 1:-1] + u[2:, 1:-1])*dy2 +\n (u[1:-1,0:-2] + u[1:-1, 2:])*dx2)*dnr_inv\n\nWe need 3 arrays here -- one is an intermediate (PyPy only needs one, for all of\nthose subexpressions), one is a copy for computing the error, and one is the\nresult. This works automatically because in NumPy + or * creates an\nintermediate, while NumPyPy avoids allocating the intermediate if possible.\nnumeric_2_time_step works in pretty much the same way:\n\nsrc = self.u\nself.u = src.copy()\nself.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +\n (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv\n\nexcept the copy is now explicit rather than implicit.\nnumeric_3_time_step does the same thing, but notice one doesn't have to copy\nthe entire array, it's enough to copy the border pieces and fill rest with\nzeros:\n\nsrc = self.u\nself.u = numpy.zeros((self.nx, self.ny), 'd')\nself.u[0] = src[0]\nself.u[-1] = src[-1]\nself.u[:, 0] = src[:, 0]\nself.u[:, -1] = src[:, -1]\nself.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +\n (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv\n\nnumeric_4_time_step is the one that tries hardest to resemble the C version.\nInstead of doing an array copy, it actually notices that one can alternate\nbetween two arrays. This is exactly what the C version does. 
The\nremove_invalidates call is a PyPy specific hack - we hope to remove this\ncall in the near future, but, in short, it promises \"I don't have any unbuilt\nintermediates that depend on the value of the argument\", which means one doesn't\nhave to compute sub-expressions one is not actually using:\n\nremove_invalidates(self.old_u)\nremove_invalidates(self.u)\nself.old_u[:,:] = self.u\nsrc = self.old_u\nself.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +\n (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv\n\nThis one is the most comparable to the C version.\nnumeric_5_time_step does the same thing, but notices one doesn't have to copy\nthe entire array, it's enough to just copy the edges. This is an optimization\nthat was not done in the C version:\n\nremove_invalidates(self.old_u)\nremove_invalidates(self.u)\nsrc = self.u\nself.old_u, self.u = self.u, self.old_u\nself.u[0] = src[0]\nself.u[-1] = src[-1]\nself.u[:, 0] = src[:, 0]\nself.u[:, -1] = src[:, -1]\nself.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +\n (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv\n\nLet's look at the table of runs. As before, gcc 4.4.5, compiled at -O3,\nand PyPy nightly 7bb8b38d8563, on an x86-64 machine. All of the numeric methods\nrun for 226 steps, slightly more than the 219, rounding to the next 15 when the\nerror is computed.\n\n\n\n\n\n\n\nbenchmark\nPyPy\nCPython\n\nnumeric\n21ms\n35ms\n\nnumeric 2\n14ms\n37ms\n\nnumeric 3\n13ms\n29ms\n\nnumeric 4\n11ms\n31ms\n\nnumeric 5\n9.3ms\n21ms\n\n\n\nWe think that these preliminary results are pretty good. They're not as fast as\nthe C version (or as fast as we'd like them to be), but we're already much\nfaster than NumPy on CPython -- almost always by more than 2x on this relatively\nreal-world example. This is not the end, though. In fact, it's hardly the\nbeginning! As we continue work, we hope to make even more use of the\nhigh level information that we have. 
Looking at the assembler generated by\ngcc for this example, it's pretty clear we can outperform it thanks to better\naliasing information and hence better possibilities for vectorization.\nStay tuned.\nEDIT: fixed the benchmark name\n\nEDIT2: added info that first table is about PyPy\nCheers,\nfijal", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2012/01/numpypy-progress-report-running-3336055571122066974.html" + }, + { + "title": "Leysin Winter Sprint", + "text": "PyPy Leysin Winter Sprint: 15-22nd January 2012\n\n\nThe next PyPy sprint will be in Leysin, Switzerland, for the\neighth time. This is a fully public sprint: newcomers and topics\nother than those proposed below are welcome.\n\nGoals and topics of the sprint\n\n\nPy3k: work towards supporting Python 3 in PyPy\n\nNumPyPy: work towards supporting the numpy module in PyPy\n\nJIT backends: integrate tests for ARM; look at the PowerPC 64;\n maybe try again to write an LLVM- or GCC-based one\n\nSTM and STM-related topics; or the Concurrent Mark-n-Sweep GC\n\nAnd as usual, the main side goal is to have fun in winter sports :-)\n We can take a day off for skiing.\n\n\nExact times\n\nThe work days should be 15-21 January 2012 (Sunday-Saturday). The\nofficial plans are for people to arrive on the 14th or the 15th, and to\nleave on the 22nd.\n\nInterested? Read more...", + "tags": "", + "url": "https://www.pypy.org/posts/2011/12/leysin-winter-sprint-6862532189897876336.html" + }, + { + "title": "Come see us at PyCon 2012", + "text": "PyCon 2012 is coming up in just a few short months, and PyPy will be well\nrepresented there. We'll be delivering a tutorial, two talks, plus we'll be\naround for the sprints. Here are the abstracts for the tutorials and talks: How to get the most out of your PyPy, by Maciej Fijalkowski, Alex Gaynor\nand Armin Rigo: For many applications PyPy can provide performance benefits\nright out of the box. However, little details can push your application to\nperform much better. 
In this tutorial we'll give you insights on how to push\nPyPy to its limits. We'll focus on understanding the performance\ncharacteristics of PyPy, and learning the analysis tools in order to maximize\nyour applications' performance. This is the tutorial.\nWhy PyPy by example, by Maciej Fijalkowski, Alex Gaynor and Armin Rigo:\nOne of the goals of PyPy is to make existing Python code faster; however an\neven broader goal was to make it possible to write things in Python that\npreviously would needed to be written in C or other low-level language. This\ntalk will show examples of this, and describe how they represent the\ntremendous progress PyPy has made, and what it means for people looking at\nusing PyPy.\nHow the PyPy JIT works, by Benjamin Peterson: The Python community is\nabuzz about the major speed gains PyPy can offer for pure Python code. But how\ndoes the PyPy JIT actually work? This talk will discuss how the PyPy JIT is\nimplemented. It will include descriptions of the tracing, optimization, and\nassembly generation phases. I will demonstrate each step with an example loop.\nIf you have any questions let us know! We look forward to seeing people at\nPyCon and chatting about PyPy and the entire Python ecosystem.See you there,\nMaciej Fijalkowski, Alex Gaynor, Benjamin Peterson, Armin Rigo, and the entire PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2011/12/come-see-us-at-pycon-2012-610420698450130659.html" + }, + { + "title": "Plotting using matplotlib from PyPy", + "text": "Big fat warning This is just a proof of concept. It barely works. There are\nmissing pieces left and right, which were replaced with hacks so I can get this\nto run and prove it's possible. Don't try this at home, especially your home.\nYou have been warned.\nThere has been a lot of talking about PyPy not integrating well with the\ncurrent scientific Python ecosystem, and numpypy (a NumPy reimplementation\non top of pypy) was dubbed \"a fancy array library\". 
I'm going to show that\nintegration with this ecosystem is possible with our design.\nFirst, the demo:\n\n#!/usr/bin/env pypy\n\n# numpy, pypy version\nimport numpypy as numpy\n# DRAGONS LIVE THERE (fortunately hidden)\nfrom embed.emb import import_mod\n\npylab = import_mod('matplotlib.pylab')\n\nif __name__ == '__main__':\n a = numpy.arange(100, dtype=int)\n b = numpy.sin(a)\n pylab.plot(a, b)\n pylab.show()\n\nAnd you get:\n\n\n\nNow, how to reproduce it:\n\nYou need a PyPy without cpyext, I did not find a linker that would support\noverriding symbols. Right now there are no nightlies like this, so you have\nto compile it yourself, like:\n\n./translate.py -Ojit targetpypystandalone.py --withoutmod-cpyext\n\nThat would give you a PyPy that's unable to load some libraries like PIL, but\nperfectly working otherwise.\n\nSpeaking of which, you need a reasonably recent PyPy.\n\nThe approach is generally portable, however the implementation has been\ntested only on 64bit linux. Few tweaks might be required.\n\nYou need to install python2.6, the python2.6 development headers, and have\nnumpy and matplotlib installed on that python.\n\nYou need a checkout of my hacks directory and put embedded on your\nPYTHONPATH, your pypy checkout also has to be on the PYTHONPATH.\n\n\n\nEr wait, what happened?\nWhat didn't happen is we did not reimplement matplotlib on top of PyPy. What\ndid happen is we embed CPython inside of PyPy using ctypes. We instantiate it.\nand follow the embedding tutorial for CPython. Since numpy arrays are not\nmovable, we're able to pass around an integer that's represents the memory\naddress of the array data and reconstruct it in the embedded interpreter. Hence\nwith a relatively little effort we managed to reuse the same array data on both\nsides to plot at array. Easy, no?\nThis approach can be extended to support anything that's not too tied with\npython objects. 
SciPy and matplotlib both fall into the same category\nbut probably the same strategy can be applied to anything, like GTK or QT.\nIt's just a matter of extending a hack into a working library.\nTo summarize, while we're busy making numpypy better and faster, it seems\nthat all external libraries on the C side can be done using an embedded Python\ninterpreter with relatively little effort. To get to that point, I spent\na day and a half to learn how to embed CPython, with very little prior\nexperience in the CPython APIs. Of course you should still keep as much as\npossible in PyPy to make it nice and fast :)\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2011/12/plotting-using-matplotlib-from-pypy-6389240123679375092.html" + }, + { + "title": "PyPy 1.7 on Win32", + "text": "Hi all,\n\nWe have fixed _continuation on Win32 (thanks Stakkars), and so we have now a Win32 version of PyPy 1.7.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/11/pypy-17-on-win32-4962523601794245248.html" + }, + { + "title": "PyPy 1.7 - widening the sweet spot", + "text": "We're pleased to announce the 1.7 release of PyPy. As became a habit, this\nrelease brings a lot of bugfixes and performance improvements over the 1.6\nrelease. However, unlike the previous releases, the focus has been on widening\nthe \"sweet spot\" of PyPy. That is, classes of Python code that PyPy can greatly\nspeed up should be vastly improved with this release. You can download the 1.7\nrelease here:\n\nhttps://pypy.org/download.html\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7. It's fast (pypy 1.7 and cpython 2.7.1 performance comparison)\ndue to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64, Mac OS X 32/64 or\nWindows 32. Windows 64 work is ongoing, but not yet natively supported.\nThe main topic of this release is widening the range of code which PyPy\ncan greatly speed up. 
On average on\nour benchmark suite, PyPy 1.7 is around 30% faster than PyPy 1.6 and up\nto 20 times faster on some benchmarks.\n\n\nHighlights\n\nNumerous performance improvements. There are too many examples which python\nconstructs now should behave faster to list them.\n\nBugfixes and compatibility fixes with CPython.\n\nWindows fixes.\n\nPyPy now comes with stackless features enabled by default. However,\nany loop using stackless features will interrupt the JIT for now, so no real\nperformance improvement for stackless-based programs. Contact pypy-dev for\ninfo how to help on removing this restriction.\n\nNumPy effort in PyPy was renamed numpypy. In order to try using it, simply\nwrite:\n\nimport numpypy as numpy\n\nat the beginning of your program. There is a huge progress on numpy in PyPy\nsince 1.6, the main feature being implementation of dtypes.\n\nJSON encoder (but not decoder) has been replaced with a new one. This one\nis written in pure Python, but is known to outperform CPython's C extension\nup to 2 times in some cases. It's about 20 times faster than\nthe one that we had in 1.6.\n\nThe memory footprint of some of our RPython modules has been drastically\nimproved. This should impact any applications using for example cryptography,\nlike tornado.\n\nThere was some progress in exposing even more CPython C API via cpyext.\n\n\n\n\nThings that didn't make it, expect in 1.8 soon\nThere is an ongoing work, which while didn't make it to the release, is\nprobably worth mentioning here. This is what you should probably expect in\n1.8 some time soon:\n\nSpecialized list implementation. There is a branch that implements lists of\nintegers/floats/strings as compactly as array.array. 
This should drastically\nimprove performance/memory impact of some applications\nNumPy effort is progressing forward, with multi-dimensional arrays coming\nsoon.\nThere are two brand new JIT assembler backends, notably for the PowerPC and\nARM processors.\n\n\n\nFundraising\nIt's maybe worth mentioning that we're running fundraising campaigns for\nNumPy effort in PyPy and for Python 3 in PyPy. In case you want to see any\nof those happen faster, we urge you to donate to numpy proposal or\npy3k proposal. In case you want PyPy to progress, but you trust us with\nthe general direction, you can always donate to the general pot.\n\nCheers, Maciej Fija\u0142kowski, Armin Rigo and the entire PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2011/11/pypy-17-widening-sweet-spot-4260962828394182017.html" + }, + { + "title": "Gothenburg sprint report", + "text": "In the past week, we have been busy hacking on PyPy at the Gothenburg sprint, the second of 2011. The sprint was held at Laura's and Jacob's place, and here is a brief report of what happened.\n\n\nIn the first day we welcomed Mark Pearse, who was new to PyPy and at his first sprint. Mark worked the whole sprint in the new SpecialisedTuple branch, whose aim is to have a special implementation for small 2-items and 3-items tuples of primitive types (e.g., ints or floats) to save memory. Mark paired with Antonio for a couple of days, then he continued alone and did an amazing job. He even learned how to properly do Test Driven Development :-).\nAntonio spent a couple of days investigating whether it is possible to use application checkpoint libraries such as BLCR and DMTCP to save the state of the PyPy interpreter between subsequent runs, thus saving also the JIT-compiled code to reduce the warmup time. 
The conclusion is that these are interesting technologies, but more work would be needed (either on the PyPy side or on the checkpoint library side) before it can have a practical usage for PyPy users.\nThen, Antonio spent most of the rest of the sprint working on his ffistruct branch, whose aim is to provide a very JIT-friendly way to interact with C structures, and eventually implement ctypes.Structure on top of that. The \"cool part\" of the branch is already done, and the JIT already can compile set/get of fields into a single fast assembly instruction, about 400 times faster than the corresponding ctypes code. What is still left to do is to add a nicer syntax (which is easy) and to implement all the ctypes peculiarities (which is tedious, at best :-)).\nAs usual, Armin did tons of different stuff, including fixing a JIT bug, improving the performance of file.readlines() and working on the STM branch (for Software Transactional Memory), which is now able to run RPython multithreaded programs using software transaction (as long as they don't fill up all the memory, because support for the GC is still missing :-)). Finally, he worked on improving the Windows version of PyPy. While doing so he discovered together with Anto a terrible bug which lead to a continuous leak of stack space because the JIT called some functions using the wrong calling convention.\nH\u00e5kan, with some help from Armin, worked on the jit-targets branch, whose goal is to heavily refactor the way the traces are internally represented by the JIT, so that in the end we can produce (even :-)) better code than what we do nowadays. More details in this mail.\nAndrew Dalke worked on a way to integrate PyPy with FORTRAN libraries, and in particular the ones which are wrapped by Numpy and Scipy: in doing so, he wrote f2pypy, which is similar to the existing f2py but instead of producing a CPython extension module it produces a pure python modules based on ctypes. 
More work is needed before it can be considered complete, but f2pypy is already able to produce a wrapper for BLAS which passes most of the tests under CPython, although there's still work left to get it working for PyPy.\n\n\nArmin and H\u00e5kan with Laura's \"5x faster\" cake\nChristian Tismer worked the whole sprint on the branch to make PyPy compatible with Windows 64 bit. This needs a lot of work because a lot of PyPy is written under the assumption that the long type in C has the same bit size than void*, which is not true on Win64. Christian says that in the past Genova-Pegli sprint he completed 90% of the work, and in this sprint he did the other 90% of the work. Obviously, what is left to complete the task is the third 90% :-). More seriously, he estimated a total of 2-4 person-weeks of work to finish it.\nBut, all in all, the best part of the sprint has been the cake that Laura baked to celebrate the \"5x faster than CPython\" achievement. Well, actually our speed page reports \"only\" 4.7x, but that's because in the meantime we switched from comparing against CPython 2.6 to comparing against CPython 2.7, which is slightly faster. We are confident that we will reach the 5x goal again, and that will be the perfect excuse to eat another cake :-)", + "tags": "", + "url": "https://www.pypy.org/posts/2011/11/gothenburg-sprint-report-8371395613874909242.html" + }, + { + "title": "Speeding up JSON encoding in PyPy", + "text": "Hi\nRecently I spent a bit of effort into speeding up JSON in PyPy. I started with\nwriting a benchmark, which is admittedly not a very good one, but it's\nbetter than nothing (suggestions on how to improve it are welcome!).\nFor this particular benchmark, the numbers are as follow. Note that CPython by\ndefault uses the optimized C extension, while PyPy uses the pure Python one.\nPyPy trunk contains another pure Python version which has been optimized\nspecifically for the PyPy JIT. 
Detailed optimizations are described later in\nthis post.\nThe number reported is the time taken for the third run, when things are\nwarmed up. Full session here.\n\n\n\n\n\n\nCPython 2.6\n22s\n\nCPython 2.7\n3.7s\n\nCPython 2.7 no C extension\n44s\n\nPyPy 1.5\n34s\n\nPyPy 1.6\n22s\n\nPyPy trunk\n3.3s\n\n\n\nLessons learned:\n\nExpectations are high\nA lot of performance critical stuff in Python world is already written in a hand\noptimized C. Writing C (especially when you interface with CPython C API) is\nugly and takes significant effort. This approach does not scale well when\nthere is a lot of code to be written or when there is a very tight coupling\nbetween the part to be rewritten and the rest of the code. Still, people would\nexpect PyPy to be better at \"tasks\" and not precisely at running equivalent\ncode, hence a comparison between the C extension and the pure python version\nis sound. Fortunately it's possible to outperform the C extension, but requires\na bit of effort on the programmer side as well.\n\n\nOften interface between the C and Python part is ugly\nThis is very clear if you look at json module as implemented in CPython's\nstandard library. Not everything is in C (it would probably be just too\nmuch effort) and the interface to what is in C is guided via profiling not\nby what kind of interface makes sense. This especially is evident comparing CPython 2.6 to 2.7.\nJust adapting the code to an interface with C made the Python version slower.\nRemoving this clutter improves the readability a lot and improves PyPy's version\na bit, although I don't have hard numbers.\n\n\nJitViewer is crucial\nIn case you're fighting with PyPy's performance, jitviewer is worth a shot.\nWhile it's not completely trivial to understand what's going on, it'll\ndefinitely show you what kind of loops got compiled and how.\n\n\nNo nice and fast way to build strings in Python\nPyPy has a custom thing called __pypy__.builders.StringBuilder. 
It has\na few features that make it much easier to optimize than other ways like\nstr.join() or cStringIO.\n\nYou can specify the start size, which helps a lot if you can even provide\na rough estimate on the size of the string (less copying)\nOnly append and build are allowed. While the string is being built you\ncan't seek or do anything else. After it's built you can never append any more.\nUnicode version available as well as __pypy__.builders.UnicodeBuilder.\n\n\n\nMethod calls are ok, immutable globals are ok\nPyPy's JIT seems to be good enough for at least the simple cases. Calling\nmethods for common infrastructure or loading globals (instead of rebinding as\nlocals) is fast enough and improves code readability.\n\n\nString copying is expensive\nEdit: see the comment at the end\nIf you use re.sub, the current implementation will always create a copy\nof the string even if there was no match to replace.\nIf you know your regexp is simple, first try to check if there is\nanything to replace. This is a pretty hard optimization to\ndo automatically -- simply matching the regular expression can be too costly\nfor it to make sense. In our particular example however, the regexp is really\nsimple, checking ranges of characters. It also seems that this is by far the\nfastest way to escape characters as of now.\n\n\nGenerators are slower than they should be\nI changed the entire thing to simply call builder.append instead of\nyielding to the main loop where it would be gathered. This is kind of a PyPy\nbug that using generators extensively is slower, but a bit hard to fix.\nEspecially in cases where there is relatively little data being passed around\n(few bytes), it makes sense to gather it first. 
If I were to implement an\nefficient version of iterencode, I would probably handle chunks of\npredetermined size, about 1000 bytes instead of yielding data every few bytes.\n\n\nI must admit I worked around PyPy's performance bug\nFor obscure (although eventually fixable) reasons, this:\n\nfor c in s: # s is string\n del c\n\nis faster than:\n\nfor c in s:\n pass\n\nThis is a PyPy performance bug and should be fixed, but on a different branch ;-)\n\n\nPyPy's JIT is good\nI was pretty surprised, but the JIT actually did make stuff work nicely.\nThe changes that were done were relatively minor and straightforward, once\nthe module was cleaned to the normal \"pythonic\" state.\nIt is worth noting that it's possible to write code in Python and make it\nrun really fast, but you have to be a bit careful. Again, jitviewer is your\nfriend when determining why things are slow. I hope we can write more tools\nin the future that would more automatically guide people through potential\nperformance pitfalls.\nCheers,\nfijal\nEdit: I was wrong about re.sub. It just seems to be that the JIT is figuring match better than sub, will be fixed soon", + "tags": "", + "url": "https://www.pypy.org/posts/2011/10/speeding-up-json-encoding-in-pypy-8937643890263223898.html" + }, + { + "title": "PyPy G\u00f6teborg Post-Hallowe'en Sprint Nov 2nd - Nov 9th", + "text": "The next PyPy sprint will be in Gothenburg, Sweden. It is a public sprint,\nsuitable for newcomers. We'll focus on making a public kickoff for\nboth the numpy/pypy integration project\nand the Py3k support project,\nas well as whatever interests the Sprint attendees. 
Since both of these\nprojects are very new, there will be plenty of work suitable for newcomers\nto PyPy.\nOther topics might include:\n\nHelping people get their code running with PyPy\nwork on a FSCons talk?\nstate of the STM Vinnova project (We most likely, but not for certain will\nknow whether or not we are approved by this date.)\n\n\nOther Useful dates\nGothPyCon - Saturday Oct 29.\nFSCONS Friday Nov 11 - Sunday Nov 12.\n\n\nLocation\nThe sprint will be held in the apartment of Laura Creighton and Jacob Hall\u00e9n\nwhich is at G\u00f6tabergsgatan 22 in Gothenburg, Sweden. Here is a map. This is\nin central Gothenburg. It is between the tram stops of Vasaplatsen and\nValand, (a distance of 4 blocks) where many lines call -- the 2, 3, 4, 5,\n7, 10 and 13.\nProbably cheapest and not too far away is to book accomodation at SGS\nVeckobostader. The Elite Park Avenyn Hotel is a luxury hotel just a\nfew blocks away. There are scores of hotels a short walk away from the\nsprint location, suitable for every budget, desire for luxury, and desire\nfor the unusual. You could, for instance, stay on a boat. Options are\ntoo numerous to go into here. Just ask in the mailing list or on the blog.\nHours will be\nfrom 10:00 until people have had enough. It's a good idea to arrive a\nday before the sprint starts and leave a day later. In the middle of\nthe sprint there usually is a break day and it's usually ok to take\nhalf-days off if you feel like it. Of course, many of you may be interested\nin sticking around for FSCons, held the weekend after the sprint.\n\n\nGood to Know\nSweden is not part of the Euro zone. One SEK (krona in singular, kronor\nin plural) is roughly 1/10th of a Euro (9.36 SEK to 1 Euro).\nThe venue is central in Gothenburg. There is a large selection of\nplaces to get food nearby, from edible-and-cheap to outstanding. 
We\noften cook meals together, so let us know if you have any food allergies,\ndislikes, or special requirements.\nSweden uses the same kind of plugs as Germany. 230V AC.\n\n\nGetting Here\nIf you are coming by train, you will arrive at the Central Station. It is\nabout 12 blocks to the site from there, or you can take a tram.\nThere are two airports which are local to G\u00f6teborg, Landvetter (the main\none) and Gothenburg City Airport (where some budget airlines fly).\nIf you arrive at Landvetter the airport bus stops right downtown at\nElite Park Avenyn Hotel which is the second stop, 4 blocks from the\nSprint site, as well as the end of the line, which is the Central Station.\nIf you arrive at Gothenburg City Airport take the bus to the end of the\nline. You will be at the Central Station.\nYou can also arrive by ferry, from either Kiel in Germany or Frederikshavn\nin Denmark.\n\n\nWho's Coming?\nIf you'd like to come, please let us know when you will be arriving and\nleaving, as well as letting us know your interests. We'll keep a list\nof people which we'll update (which you can do yourself if you\nhave bitbucket pypy commit rights).", + "tags": "", + "url": "https://www.pypy.org/posts/2011/10/pypy-goteborg-post-halloween-sprint-nov-7335004338996313725.html" + }, + { + "title": "Numpy funding and status update", + "text": "Hi everyone,\nIt's been a little while since we wrote about NumPy on PyPy, so we wanted to\ngive everyone an update on what we've been up to, and what's up next for us.\nWe would also like to note that we're launching a funding campaign\nfor NumPy support in PyPy. 
Details can be found on the donation page.\nSome of the things that have happened since last we wrote are:\n\nWe added dtype support, meaning you can now create arrays of a bunch of\ndifferent types, including bools, ints of a various sizes, and floats.\nMore array methods and ufuncs, including things like comparison methods\n(==, >, etc.)\nSupport for more and more argument types, for example you can index by a\ntuple now (only works with tuples of length one, since we only have\nsingle-dimension arrays thus far).\n\nSome of the things we're working on at the moment:\n\nMore dtypes, including complex values and user-defined dtypes.\nSubscripting arrays by other array as indices, and by bool arrays as masks.\nStarting to reuse Python code from the original numpy.\n\nSome of the things on the near horizon are:\n\nBetter support for scalar data, for example did you know that\nnumpy.array([True], dtype=bool)[0] doesn't return a bool object?\nInstead it returns a numpy.bool_.\nMulti-dimensional array support.\n\nIf you're interested in helping out, we always love more contributors,\nAlex, Maciej, Justin, and the whole PyPy team", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2011/10/numpy-funding-and-status-update-2380711174693638392.html" + }, + { + "title": "More Compact Lists with List Strategies", + "text": "Since we come closer to merging the list-strategy branch I want to try to explain this memory optimization today. Datatypes in PyPy are stored as W_Objects (e.g. W_StringObject to represent strings, W_IntObject to represent ints). This is necessary due to the dynamic nature of Python. So the actual value (e.g. string, integer) is stored inside that box, resulting in an indirection. When having a large amount of such boxed objects, for example in a list, the wasted memory can become quite large. If you have a closer look at such lists, you will see that in many of them only one type of data is stored and only few (and smaller) lists store mixed types. 
Another thing to observe is that those lists often won't change the types of the objects they contain at runtime very often. For instance a list of a million integers is very unlikely to suddenly get a string appended to it. List StrategiesThe goal of this work is to write an optimization that exploits this behaviour. Instead of wrapping all items in a list, we implement lists in a way that they are optimized for storing certain (primitive) datatypes. These implementations store the content of the list in unwrapped form, getting rid of the extra indirection and wrapper objects. One approach would be to add a level of indirection, making each W_ListObject instance point to another object that stores the actual content. For this other object, several implementations would exist, for every datatype we want to store without wrapping it (as well as a general one that deals with arbitrary content). The data layout would look something like this: This approach has the problem that we need two indirections to get to the data and that the implementation instances need memory themselves.What we would like to do is to make the W_ListObject point to an RPython list directly, that contains either wrapped or unwrapped data. This plan has the problem that storing different unwrapped data is not directly possible in RPython. To solve the problem, we use the rerased RPython library module. It allows us to erase the type of an object, in this case lists, and returns something similar to void-star in C, or Object in Java. This object is then stored on the W_ListObject in the field storage. 
If we want to work with the list, for example to append or delete items, we need to unerase the storage again.Example for rerase: storage = erase([1 ,2 ,3 ,4])\n# storage is an opaque object that you can do nothing with\n....\nl = unerase(storage)\nl.clear()\nNow that we know how to make the W_ListObject point directly to wrapped or unwrapped data, we need to find out how to actually do any operations on this data. This can be accomplished by adding another field to our W_ListObject. This field points to a ListStrategy object. The actual implementation of W_ListObject is now deferred to those ListStrategy classes. For instance, a W_ListObject which holds only integers will use the IntegerListStrategy.When the type of content is being changed, we need to change the used strategy as well as the storage in compatible ways. For example when we add a string to the list of integers we need to switch to the ObjectListStrategy and change the storage to be a list of wrapped objects. Thus the currently used strategy always knows what to do with what is currently in the storage. As you can see, we now save one level of indirections by storing some of the data unwrapped. Of course each operation on a list needs to go via the strategy, but since we save one indirection for each element stored in that list and the Strategy classes are singletons, the benefits outweigh the costs.Currently there are only strategies for integers and strings since many lists seem to have these datatypes. Other strategies i.e for floats and unicode strings are planned. We also implemented two special strategies for empty lists and range-lists. The EmptyListStrategy's storage is None. If objects are added to the list we just switch to the appropriate strategy (determined by the item's type). RangeListsStrategies do not store any items at all. Instead they only store values describing the range of the list, i.e. start, step and length. 
On any operations that changes the data of the list we switch to the IntegerStrategy.A nice side-effect of storing unwrapped datatypes is that we can implement optimized methods for certain cases. For instance, since comparison of unwrapped integers is now much faster than comparison between arbitrary objects, we can rewrite the sorting methods for lists containing integers.MicrobenchmarksFinally here is an early overview of the memory consumption of different Python implementations: CPython, PyPy and PyPy-list which uses list-strategies. To demonstrate how powerful list-strategies can be in the best case, we wrote benchmarks that create a list of integers, a list of strings and a range-list each with one million elements each and then reads out the heap size of the process as reported by the OS. The results are as follows: The savings on integers and strings in this ideal case are quite big.The benchmark for range-lists is a little unfair, since in CPython one could accomplish the same memory behaviour using xrange. However, in PyPy users won't notice that internally the list does not store all items, making it still possible to use all list methods, such as append or delete.ConclusionWe hope that list strategies bring memory savings for applications that use homogeneous lists of primitive types. Furthermore, operations on such lists tend to be somewhat faster as well. This also integrates well with the JIT. The list strategies optimizations will be merged to the PyPy's default branch at some point in the next months. 
An equivalent optimization for dictionaries has already been merged (and is part of PyPy 1.6), one for sets is coming in the future.Lukas Diekmann and Carl Friedrich Bolz", + "tags": "", + "url": "https://www.pypy.org/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html" + }, + { + "title": "Py3k for PyPy fundraiser", + "text": "Hi,We would like to announce a donation campaign for implementing Python 3 in PyPy.\nPlease read our detailed plan for all the details and donate using the\nbutton on that page!Thanks,\nThe PyPy Team", + "tags": "", + "url": "https://www.pypy.org/posts/2011/09/py3k-for-pypy-fundraiser-8139653689520709617.html" + }, + { + "title": "Wrapping C++ Libraries with Reflection \u2014 Status Report One Year Later", + "text": "Well over a year ago, work was started on the cppyy module which lives in the\nreflex-support branch.\nSince then, work has progressed at a varying pace and has included a recent\nsprint in D\u00fcsseldorf, last July.\nLet's first take a step back and recap why we're interested in doing this,\ngiven that it is perfectly possible to use C++ through generated bindings and\ncpyext.\ncppyy makes use of reflection information generated for the C++ classes of\ninterest, and has that reflection information available at run time.\nTherefore, it is able to open up complex C++ types to the JIT in a\nconceptually similar manner as simple types are open to it.\nThis means that it is possible to get rid of a lot of the marshalling layers\nwhen making cross-language calls, resulting in much lower call overhead than\nis possible when going through the CPython API, or other methods of wrapping.\nThere are two problems that need to be solved: C++ language constructs need to\nbe presented on the Python side in a natural way; and cross-language impedance\nmismatches need to be minimized, with some hints of the user if need be.\nFor the former, the list of mapped features has grown to a set that is\nsufficient to do real 
work.\nThere is now support for:\n\n\nbuiltin, pointer, and array types\nnamespaces, classes, and inner classes\nglobal functions, global data\nstatic/instance data members and methods\ndefault variables, object return by value\nsingle and multiple (virtual) inheritance\ntemplated classes\nbasic STL support and pythonizations\nbasic (non-global) operator mapping\n\n\nThe second problem is harder and will always be an on-going process.\nBut one of the more important issues has been solved at the recent D\u00fcsseldorf\nsprint, namely, that of reclaiming C++ objects instantiated from the Python\nside by the garbage collector.\nPerformance has also improved, especially that of the nicer \"pythonized\"\ninterface that the user actually sees, although it still misses out on\nabout a factor of 2.5 in comparison to the lower-level interface (which has\ngotten uglier, so you really don't want to use that).\nMost of this improvement is due to restructuring so that it plays nicer with\nthe JIT and libffi, both of which themselves have seen improvements.\nWork is currently concentrated on the back-ends: a CINT back-end is underway\nand a LLVM/CLang pre-compiled headers (PCH) back-end is planned.\nThe latter is needed for this code to be released in the wild, rather than\njust used in high energy physics (HEP), as that would be easier to support.\nAlso, within HEP, CLang's PCH are foreseen to be the future format of\nreflection information.\nAt the end of the D\u00fcsseldorf sprint, we tried a little code that did something\nactually \"useful,\" namely the filling of a histogram with some random values.\nWe did get it to work, but trying cppyy on a large class library showed\nthat a good warning system for such things like missing classes was sorely\nneeded.\nThat has been added since, and revisiting the histogram example later, here is\nan interesting note: the pypy-c run takes 1.5x the amount of time of that\nof the compiled, optimized, C++ code.\nThe run was timed start to 
finish, including the reflection library loading\nand JIT warm-up that is needed in the case of Python, but not for the compiled\nC++ code.\nHowever, in HEP, scientists run many short jobs while developing their\nanalysis codes, before submitting larger jobs on the GRID to run during lunch\ntime or overnight.\nThus, a more realistic comparison is to include the compilation time needed\nfor the C++ code and with that, the Python code needs only 55% of the time\nrequired by C++.\nThe choice of a programming language is often a personal one, and such\narguments like the idea that C++ is hard to use typically do not carry much\nweight with the in-crowd that studies quantum field dynamics for fun.\nHowever, getting the prompt with your analysis results back faster is a sure\nwinner. We hope that cppyy will soon have progressed far enough to make it\nuseful first to particle physicists and then other uses for wrapping C++\nlibraries.\n\nWim Lavrijsen, Carl Friedrich Bolz, Armin Rigo", + "tags": "", + "url": "https://www.pypy.org/posts/2011/08/wrapping-c-libraries-with-reflection-3916959558080483711.html" + }, + { + "title": "We need Software Transactional Memory", + "text": "Hi all. Here is (an extract of) a short summary paper about my current position on\nSoftware Transactional Memory as a general tool in the implementation\nof Python or Python-like languages. Thanks to people on IRC for discussion on making\nthis blog post better (lucian, Alex Gaynor, rguillebert, timonator, Da_Blitz).\nFor the purpose of the present discussion, we are comparing Java with Python\nwhen it comes to multi-threading.\n\nThe problem in complex high-level languages\nLike Java, the Python language gives guarantees: it is not acceptable\nfor the Python virtual machine to crash due to incorrect usage of\nthreads. 
A primitive operation in Java is something like reading or\nwriting a field of an object; the corresponding guarantees are along the\nlines of: if the program reads a field of an object, and another thread\nwrites to the same field of the same object, then the program will see\neither the old value, or the new value, but not something else entirely,\nand the virtual machine will not crash.\nHigher-level languages like Python differ from Java by the fact that a\n\"primitive operation\" is far more complex. It may for example involve\nlooking in several hash maps, perhaps doing updates. In general, it is\ncompletely impossible to map every operation that must be atomic to a\nsingle processor instruction.\n\nJython: fine-grained locking\nThis problem has been solved \"explicitly\" in the Jython interpreter that\nruns on top of Java. The solution is explicit in the following sense:\nthroughout the Jython interpreter, every single operation makes careful\nuse of Java-level locking mechanisms. This is an application of\n\"fine-grained locking\". For example, operations like attribute lookup,\nwhich need to perform look-ups in a number of hash maps, are protected\nby acquiring and releasing locks (in __getattribute__).\nA draw-back of this solution is the attention to detail required.\nIf even one place misses a lock, then there is either a\nbug --- and such bugs occur in cases that are increasingly rare and hard\nto debug as the previous bugs are fixed --- or we just file it under \"differences\nfrom CPython\". 
There is however the risk of\ndeadlock, if two threads attempt to lock the same objects in different\norder.\n\nIn practice, the situation is actually not as bad as\nI may paint it: the number of locks in Jython is reasonable, and allows for\nall the \"common cases\" to work as expected.\n(For the uncommon cases, see below.)\n\nPerformance-wise, the Java virtual machine itself comes with locks that\nhave been heavily optimized over a long period of time, so the\nperformance is acceptable. However if this solution were coded in C, it\nwould need a lot of extra work to optimize the locks manually (possibly\nintroducing more of the subtle bugs).\n\nCPython: coarse-grained locking\nCPython, the standard implementation of Python in C, took a different\nand simpler approach: it has a single global lock, called the Global\nInterpreter Lock (GIL). It uses \"coarse-grained locking\": the lock is\nacquired and released around the whole execution of one bytecode (or\nactually a small number of bytecodes, like 100). This solution is\nenough to ensure that no two operations can conflict with each other,\nbecause the two bytecodes that invoke them are themselves\nserialized by the GIL. It is a solution which avoids --- unlike Jython\n--- writing careful lock-acquiring code all over the interpreter. It\nalso offers even stronger guarantees: every bytecode runs entirely\natomically.\nNowadays, the draw-back of the GIL approach is obvious on multi-core\nmachines: by serializing the execution of bytecodes, starting multiple\nthreads does not actually let the interpreter make use of more than one core.\nPyPy, the Python implementation in Python, takes the same approach so\nfar.\n\nExisting usage\nAs we have seen, we have the following situation: the existing Python\nlanguage, as CPython implements it, offers very strong guarantees about\nmulti-threaded usage. 
It is important to emphasize that most existing\nmulti-threaded Python programs actually rely on such strong guarantees.\nThis can be seen for example in a program that takes a populated list\nand does in several threads:\n\nnext_item = global_list.pop()\n\nThis implicitly relies on the fact that pop() will perform atomic\nremoval from the list. If two threads try to pop() from the same list\nat the same time, then the two operations will occur in one order or the\nother; but they will not e.g. return the same object to both threads or\nmess up the internal state of the list object.\nWith such an example in mind, it should be clear that we do not want a\nsolution to the multi-core issue that involves dropping these strong\nguarantees. It is ok however to lower the barrier, as Jython does; but\nany Python implementation must offer some guarantees, or not offer\nmulti-threading at all. This includes the fact that a lot of methods on\nbuilt-in types are supposed to be atomic.\n\n(It should be noted that not offering multi-threading at all is actually\nalso a (partial) solution to the problem. Recently, several \"hacks\"\nhave appeared that give a programmer more-or-less transparent access to\nmultiple independent processes (e.g. multiprocessing). While these provide appropriate\nsolutions in some context, they are not as widely applicable as\nmulti-threading. As a typical example, they fail to apply when the\nmultiple cores need to process information that cannot be serialized at\nall --- a requirement for any data exchange between several processes.)\n\nHere is an example of how Jython's consistency is weaker than CPython's GIL.\nIt takes uncommon examples to show it, and the fact that it does not work\nlike a CPython programmer expects them to is generally considered as an\nimplementation detail. 
Consider:\nThread 1: set1.update(set2)\nThread 2: set2.update(set3)\nThread 3: set3.update(set1)\nEach operation is atomic in the case of CPython, but decomposed in two steps\n(which can each be considered atomic) in the case of Jython: reading from the\nargument, and then updating the target set. Suppose that initially\nset1 = {1}, set2 = {2}, set3 = {3}. On CPython, independently on\nthe order in which the threads run, we will end up with at least one of the\nsets being {1, 2, 3}. On Jython, it is possible that all\nthree sets end up as containing two items only. The example is a bit\nfar-fetched but should show that CPython's consistency is strictly stronger\nthan Jython's.\n\nPyPy\nPyPy is a Python interpreter much like CPython or Jython, but the way it\nis produced is particular. It is an interpreter written in RPython, a\nsubset of Python, which gets turned into a complete virtual machine (as\ngenerated C code) automatically by a step called the \"translation\". In\nthis context, the trade-offs are different from the ones in CPython and\nin Jython: it is possible in PyPy, and even easy, to apply arbitrary\nwhole-program transformations to the interpreter at \"translation-time\".\nWith this in mind, it is possible to imagine a whole-program\ntransformation that would add locking on every object manipulated in\nRPython by the interpreter. This would end up in a situation similar to\nJython. However, it would not automatically solve the issue of\ndeadlocks, which is avoided in the case of Jython by careful manual\nplacement of the locks. (In fact, being deadlock-free is a global\nprogram property that cannot be automatically ensured or verified; any\nchange to Jython can in theory break this property, and thus introduce\nsubtle deadlocks. 
The same applies to non-atomicity.)\nIn fact, we can easily check that if the interpreter accesses (for\nboth reading and writing)\nobjects A and B in a bytecode of thread 1, and objects B and A (in the\nopposite order) in a bytecode of thread 2 --- and moreover if you need to\nhave accessed the first object before you can decide that you will need\nto access the second object --- then there is no way (apart from the GIL) to avoid\na deadlock while keeping the strong guarantee of atomicity. Indeed, if\nboth threads have progressed to the middle of the execution of their\nbytecode, then A has already been mutated by thread 1 and similarly B\nhas already been mutated by thread 2. It is not possible to\nsuccessfully continue running the threads in that case.\n\nUsing Software Transactional Memory\nSoftware Transactional Memory (STM) is an approach that gives a solution\nto precisely the above problem. If a thread ended up in a situation\nwhere continuing to run it would be wrong, then we can abort and\nrollback. This is similar to the notion of transaction on databases.\nIn the above example, one or both threads would notice that they are\nabout to run into troubles and abort. 
This means more concretely that\nthey need to have a way to restart execution at the start of the\nbytecode, with all the side-effects of what they did so far being either\ncancelled or just not committed yet.\nWe think that this capacity to abort and rollback is the missing piece\nof the puzzle of multi-threaded implementations of Python.\nActually, according to the presentation of the problem given\nabove, it is unavoidable that any solution that wants to offer the\nsame level of consistency and atomicity as CPython would involve\nthe capacity of aborting and rolling back --- which means precisely\nthat STM cannot be avoided.\n\nOk, but why not settle down with Jython's\napproach and put careful locks left and right throughout the interpreter?\nBecause (1) we would have to consider every operation's atomicity and make decisions\n(or steal Jython's) and document them\nhere;\n(2) it would also be really a lot of work, to optimize these locks e.g. with the\nJIT as well as the JVM does; and (3) it is not the PyPy way to require manually\ntweaking your code everywhere for a feature that should be orthogonal. Point\n(3) is probably the most important here: you need to redo the work for every\nlanguage you implement in PyPy.\nIt also implies my own point (4): it is not fun :-)\n\nIn more details, the process would work as follows. (This gives an\noverview of one possible model; it is possible that a different model\nwill end up being better.) In every thread:\n\nAt the start of a bytecode, we start a \"transaction\". This means\nsetting up a thread-local data structure to record a log of what\noccurs in the transaction.\nWe record in the log all objects that are read, as well as the\nmodifications that we would like to make.\nDuring this time, we detect \"read\" inconsistencies, shown by the\nobject's \"last-modified\" timestamp being later than the start time\nof the current transaction, and abort. 
This prevents the rest of\nthe code from running with inconsistent values.\nIf we reach the end of the bytecode without a \"read\" inconsistency,\nthen we atomically check for \"write\" inconsistencies. These are\ninconsistencies which arise from concurrent updates to objects\nin the other threads --- either our \"write\" objects, or our \"read\"\nobjects.\nIf no inconsistency is found, we \"commit\" the transaction by copying\nthe delayed writes from the log into main memory.\n\n\nThe points at which a transaction starts or ends are exactly the\npoints at which, in CPython, the Global Interpreter Lock is\nrespectively acquired and released. If we ignore the fact that (purely for\nperformance) CPython acquires and releases the GIL only every N bytecodes,\nthen this means:\n\nBefore any bytecode we acquire the GIL (start a transaction), and after\nthe bytecode we release it (ends the transaction); and\nBefore doing an external call to the C library or the OS we release the GIL\n(ends the transaction) and afterwards re-acquire it (start the next transaction).\n\nSo in particular this model is well suited to the STM condition that we cannot\ndo anything in a transaction that cannot be rolled back, like --- precisely ---\nsystem calls. Indeed, by construction, these system calls occur outside a\ntransaction, because in CPython they occur with the GIL released.\n\nPerformance\nA large number of implementation details are still open for now.\nFrom a user's point of view (i.e. the programmer using Python),\nthe most relevant one is the overall performance impact. 
We\ncannot give precise numbers so far, and we expect the initial\nperformance to be abysmally bad (maybe 10x slower); however, with\nsuccessive improvements to the locking mechanism, to the global\nprogram transformation inserting the locks, to the garbage \ncollector (GC), and to the Just-in-Time (JIT) compiler, we\nbelieve that it should be possible to get a roughly reasonable\nperformance (up to maybe 2x slower). For example, the GC can\nmaintain flags on the objects to know that they did not escape\ntheir creation thread, and do not need any logging; and the JIT\ncompiler can aggregate several reads or writes to an object into\none. We believe that these are the kind of optimizations that\ncan give back a lot of the performance lost.\n\nThe state of STM\nTransactional Memory is itself a relatively old idea, originating\nfrom a 1986 paper by Tom Knight. At first based on hardware\nsupport, the idea of software-only transactional memory (STM) was\npopularized in 1995 and has recently been the focus of intense \nresearch.\nThe approach outlined above --- using STM to form the core of the\nimplementation of a language --- is new, as far as we know. So\nfar, most implementations provide STM as a library feature. It\nrequires explicit usage, often in the form of explicitly\ndeclaring which objects must be protected by STM (object-based\nSTMs). It is only recently that native STM support has started\nto appear, notably in the Clojure language.\nSTM is described on Wikipedia as an approach that \"greatly\nsimplifies conceptual understanding of multithreaded programs and\nhelps make programs more maintainable by working in harmony with\nexisting high-level abstractions such as objects and modules.\"\nWe actually think that these benefits are important enough to\nwarrant being exposed to the Python programmer as well, instead\nof being used only internally. This would give the Python\nprogrammer a very simple interface:\n\nwith atomic:\n \n\n(This is an old idea. 
Funny how back in 2003 people, including me, thought that this was a hack. Now I'm writing a blog post to say \"it was not a hack; it's explicitly using locks that is a hack.\" I'm buying the idea of composability.)\n\nFrom a practical point of view, I started looking seriously at\nthe University of Rochester STM (RSTM), a C++ library that has\nbeen a focus of --- and a collection of results from --- recent\nresearch. One particularly representative paper is\nA\nComprehensive Strategy for Contention Management in Software\nTransactional Memory by Michael F. Spear, Luke Dalessandro,\nVirendra J. Marathe and Michael L. Scott.\n\nConclusion\nTaking these ideas and applying them in the context of an\nimplementation of a complex high-level language like Python comes\nwith its own challenges. In this context, using PyPy makes sense\nas both an experimentation platform and as a platform that is\nrecently gaining attention for its performance. The alternatives\nare unattractive: doing it in CPython for example would mean\nglobally rewriting the interpreter. In PyPy instead, we write it\nas a transformation that is applied systematically at translation-time.\nAlso, PyPy is a general platform for generating fast interpreters\nfor dynamic languages; the STM implementation in PyPy would work\nout of the box for other language implementations as well, instead\nof just for Python.\n\nUpdate:\n\nThis is mostly me (Armin Rigo) ranting aloud and trying experiments;\nthis post should not be confused as meaning that the whole PyPy team\nwill now spend the next years working on it full-time.\nAs I said it is orthogonal to the actual Python interpreter, and it is in\nany case a feature that can be turned on or off during translation; I know\nthat in many or most use cases, people are more interested in getting a\nfast PyPy rather than one which is twice as slow but scales well.\nNothing I said is really new. 
For proof, see\nRiley and Zilles (2006)\nas well as Tabba (2010) who both experimented with Hardware Transactional Memory, turning CPython or PyPy interpreter's GIL into start/end transactions, as I describe here.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/08/we-need-software-transactional-memory-6513983438425039230.html" + }, + { + "title": "PyPy 1.6 - kickass panda", + "text": "We're pleased to announce the 1.6 release of PyPy. This release brings a lot\nof bugfixes and performance improvements over 1.5, and improves support for\nWindows 32bit and OS X 64bit. This version fully implements Python 2.7.1 and\nhas beta level support for loading CPython C extensions. You can download it\nhere:\n\nhttps://pypy.org/download.html\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7.1. It's fast (pypy 1.6 and cpython 2.6.2 performance comparison)\ndue to its integrated tracing JIT compiler.\nThis release supports x86 machines running Linux 32/64 or Mac OS X. Windows 32\nis beta (it roughly works but a lot of small issues have not been fixed so\nfar). Windows 64 is not yet supported.\nThe main topics of this release are speed and stability: on average on\nour benchmark suite, PyPy 1.6 is between 20% and 30% faster than PyPy 1.5,\nwhich was already much faster than CPython on our set of benchmarks.\nThe speed improvements have been made possible by optimizing many of the\nlayers which compose PyPy. 
In particular, we improved: the Garbage Collector,\nthe JIT warmup time, the optimizations performed by the JIT, the quality of\nthe generated machine code and the implementation of our Python interpreter.\n\n\nHighlights\n\nNumerous performance improvements, overall giving considerable speedups:\nbetter GC behavior when dealing with very large objects and arrays\nfast ctypes: now calls to ctypes functions are seen and optimized\nby the JIT, and they are up to 60 times faster than PyPy 1.5 and 10 times\nfaster than CPython\nimproved generators(1): simple generators now are inlined into the caller\nloop, making performance up to 3.5 times faster than PyPy 1.5.\nimproved generators(2): thanks to other optimizations, even generators\nthat are not inlined are between 10% and 20% faster than PyPy 1.5.\nfaster warmup time for the JIT\nJIT support for single floats (e.g., for array('f'))\noptimized dictionaries: the internal representation of dictionaries is now\ndynamically selected depending on the type of stored objects, resulting in\nfaster code and smaller memory footprint. For example, dictionaries whose\nkeys are all strings, or all integers. Other dictionaries are also smaller\ndue to bugfixes.\n\n\nJitViewer: this is the first official release which includes the JitViewer,\na web-based tool which helps you to see which parts of your Python code have\nbeen compiled by the JIT, down until the assembler. The jitviewer 0.1 has\nalready been released and works well with PyPy 1.6.\nThe CPython extension module API has been improved and now supports many\nmore extensions. For information on which ones are supported, please refer to\nour compatibility wiki.\nMultibyte encoding support: this was one of the last areas in which we were\nstill behind CPython, but now we fully support them.\nPreliminary support for NumPy: this release includes a preview of a very\nfast NumPy module integrated with the PyPy JIT. 
Unfortunately, this does\nnot mean that you can expect to take an existing NumPy program and run it on\nPyPy, because the module is still unfinished and supports only some of the\nnumpy API. However, barring some details, what works should be\nblazingly fast :-)\nBugfixes: since the 1.5 release we fixed 53 bugs in our bug tracker, not\ncounting the numerous bugs that were found and reported through other\nchannels than the bug tracker.\n\nCheers,\nHakan Ardo, Carl Friedrich Bolz, Laura Creighton, Antonio Cuni,\nMaciej Fijalkowski, Amaury Forgeot d'Arc, Alex Gaynor,\nArmin Rigo and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2011/08/pypy-16-kickass-panda-559424594592497545.html" + }, + { + "title": "Visualization of JITted code", + "text": "Hello.\nWe're proud to announce the first public release of the jitviewer. As of now,\njitviewer is a slightly internal tool that helps understanding how your Python\nsource code is compiled by the PyPy's JIT all the way down to machine code.\nTo install it, you need a very recent version of PyPy\n(newer than 9th of August), for example one of the nightly builds:\n\n\ninstall pip and distribute either by creating a PyPy virtualenv\nor by following the installation instructions.\nmake sure to have a source code checkout of PyPy and put it in your\nPYTHONPATH.\npip install jitviewer. Note that you need to run the pip\nexecutable which belongs to PyPy, not the globally installed one.\n\n\nHave a look at the README for how to start it, or try the online demo if\nyou just want to play with it.\nThe jitviewer is a web application written with flask and jinja2. If\nyou have experience with web development and you want to help PyPy, don't\nhesitate to contact us, there are plenty of things to improve in it :-).\n\nWhat does the jitviewer really do?\nAt the top of the page, you will see the list of pieces of code which has been\ncompiled by the JIT. 
You will see entries for both normal loops and for\n\"entry bridges\". This is not the right place to discuss the difference\nbetween those, but you most probably want to look at loops, because usually\nit's where most of the time is spent.\nNote that for each loop, you will see the name of the function which contains\nthe first instruction of the loop. However, thanks to the inlining done\nby the JIT, it will contain also the code for other functions.\nOnce you select a loop, the jitviewer shows how the JIT has compiled the\nPython source code into assembler in a hierarchical way. It displays four\nlevels:\n\nPython source code: only the lines shown in azure have been compiled for\nthis particular loop, the ones in gray have not.\n\nPython bytecode, the one you would get by doing:\n\ndef f(a, b):\n return a + b\n\nimport dis\ndis.dis(f)\n\nThe opcodes are e.g. LOAD_FAST, LOAD_GLOBAL etc. The opcodes\nwhich are not in bold have been completely optimized away by the JIT.\n\nIntermediate representation of jit code (IR). This is a combination of\noperations (like integer addition, reading fields out of structures) and\nguards (which check that the assumptions we made are actually true). Guards\nare in red. These operations are \"at the same level as C\": so, for example,\n+ takes two unboxed integers which can be stored into the register\nof the CPU.\n\nAssembler: you can see it by clicking on \"Show assembler\" in the menu on the\nright.\n\n\nSometimes you'll find that a guard fails often enough that a new piece of\nassembler is required to be compiled. This is an alternative path through the\ncode and it's called a bridge. You can see bridges in the jitviewer when\nthere is a link next to a guard. For more information about their purpose look up\nthe jit documentation.\n\n\nI'm still confused\nJitviewer is not perfect when it comes to explaining what's going on. 
Feel free\nto pop up on IRC or send us a mail to the mailing list, we'll try to explain\nand/or improve the situation. Consult the contact page for details.\nCheers,\nfijal & antocuni", + "tags": "", + "url": "https://www.pypy.org/posts/2011/08/visualization-of-jitted-code-6202490807361942120.html" + }, + { + "title": "PyPy is faster than C, again: string formatting", + "text": "String formatting is probably something you do just about every day in Python,\nand never think about. It's so easy, just \"%d %d\" % (i, i) and you're\ndone. No thinking about how to size your result buffer, whether your output\nhas an appropriate NULL byte at the end, or any other details. A C\nequivalent might be:\n\nchar x[44];\nsprintf(x, \"%d %d\", i, i);\n\nNote that we had to stop for a second and consider how big numbers might get\nand overestimate the size (44 = length of the biggest number on 64bit (20) +\n1 for the sign * 2 + 1 (for the space) + 1 (NUL byte)), it took the authors of\nthis post, fijal and alex, 3 tries to get the math right on this :-)\nThis is fine, except you can't even return x from this function, a more\nfair comparison might be:\n\nchar *x = malloc(44 * sizeof(char));\nsprintf(x, \"%d %d\", i, i);\n\nx is slightly overallocated in some situations, but that's fine.\nBut we're not here to just discuss the implementation of string\nformatting, we're here to discuss how blazing fast PyPy is at it, with\nthe new unroll-if-alt branch. Given the Python code:\n\ndef main():\n for i in xrange(10000000):\n \"%d %d\" % (i, i)\n\nmain()\n\nand the C code:\n\n#include <stdio.h>\n#include <stdlib.h>\n\n\nint main() {\n int i = 0;\n char x[44];\n for (i = 0; i < 10000000; i++) {\n sprintf(x, \"%d %d\", i, i);\n }\n}\n\nRun under PyPy, at the head of the unroll-if-alt branch, and\ncompiled with GCC 4.5.2 at -O4 (other optimization levels were tested,\nthis produced the best performance). It took 0.85 seconds to\nexecute under PyPy, and 1.63 seconds with the compiled binary. 
We\nthink this demonstrates the incredible potential of dynamic\ncompilation, GCC is unable to inline or unroll the sprintf call,\nbecause it sits inside of libc.\nBenchmarking the C code:\n\n#include <stdio.h>\n#include <stdlib.h>\n\n\nint main() {\n int i = 0;\n for (i = 0; i < 10000000; i++) {\n char *x = malloc(44 * sizeof(char));\n sprintf(x, \"%d %d\", i, i);\n free(x);\n }\n}\n\nWhich as discussed above, is more comparable to the Python, gives a\nresult of 1.96 seconds.\nSummary of performance:\n\n\n\n\n\n\n\n\n\nPlatform\nGCC (stack)\nGCC (malloc)\nCPython\nPyPy (unroll-if-alt)\n\nTime\n1.63s\n1.96s\n10.2s\n0.85s\n\nrelative to C\n1x\n0.83x\n0.16x\n1.9x\n\n\n\nOverall PyPy is almost 2x faster. This is clearly a win for dynamic\ncompilation over static - the sprintf function lives in libc and so\ncannot be specialized over the constant string, which has to be parsed\nevery time it's executed. In the case of PyPy, we specialize\nthe assembler if we detect the left hand string of the modulo operator\nto be constant.\nCheers,\nalex & fijal", + "tags": "", + "url": "https://www.pypy.org/posts/2011/08/pypy-is-faster-than-c-again-string-6756589731691762127.html" + }, + { + "title": "Realtime image processing in Python", + "text": "Image processing is notoriously a CPU intensive task. To do it in realtime,\nyou need to implement your algorithm in a fast language, hence trying to do it\nin Python is foolish: Python is clearly not fast enough for this task. Is it?\n:-)\nActually, it turns out that the PyPy JIT compiler produces code which is fast\nenough to do realtime video processing using two simple algorithms implemented\nby H\u00e5kan Ard\u00f6.\nsobel.py implements a classical way of locating edges in images, the\nSobel operator. It is an approximation of the magnitude of the image\ngradient. 
The processing time is spent on two convolutions between the\nimage and 3x3-kernels.\nmagnify.py implements a pixel coordinate transformation that rearranges\nthe pixels in the image to form a magnifying effect in the center.\nIt consists of a single loop over the pixels in the output image copying\npixels from the input image.\nYou can try by yourself by downloading the appropriate demo:\n\n\npypy-image-demo.tar.bz2: this archive contains only the source code,\nuse this if you have PyPy already installed\npypy-image-demo-full.tar.bz2: this archive contains both the source\ncode and prebuilt PyPy binaries for linux 32 and 64 bits\n\n\nTo run the demo, you need to have mplayer installed on your system. The\ndemo has been tested only on linux, it might (or not) work also on other\nsystems:\n$ pypy pypy-image-demo/sobel.py\n\n$ pypy pypy-image-demo/magnify.py\n\nBy default, the two demos use an example AVI file. To have more fun, you can\nuse your webcam by passing the appropriate mplayer parameters to the scripts,\ne.g:\n$ pypy demo/sobel.py tv://\n\nBy default magnify.py uses nearest-neighbor interpolation. By adding the\noption -b, bilinear interpolation will be used instead, which gives\nsmoother result:\n$ pypy demo/magnify.py -b\n\nThere is only a single implementation of the algorithm in\nmagnify.py. The two different interpolation methods are implemented by\nsubclassing the class used to represent images and embed the\ninterpolation within the pixel access method. PyPy is able to achieve good\nperformance with this kind of abstractions because it can inline\nthe pixel access method and specialize the implementation of the algorithm.\nIn C++ that kind of pixel access method would be virtual and you'll need to use\ntemplates to get the same effect without incurring in runtime overhead.\n\n\n\n\n\n\nThe video above shows PyPy and CPython running sobel.py side by\nside (PyPy taking input from the webcam, CPython from the test\nfile). 
Alternatively, to have a feeling on how much PyPy is faster than\nCPython, try to run the demo with the latter. These are the average fps\n(frames per second) that I get on my machine (Ubuntu 64 bit, Intel i7 920, 4GB\nRAM) when processing the default test.avi video and using the prebuilt\nPyPy binary found in the full tarball linked above. For sobel.py:\n\n\nPyPy: ~47.23 fps\nCPython: ~0.08 fps\n\n\nFor magnify.py:\n\n\nPyPy: ~26.92 fps\nCPython: ~1.78 fps\n\n\nThis means that on sobel.py, PyPy is 590 times faster. On\nmagnify.py the difference is much less evident and the speedup is \"only\"\n15x.\nIt must be noted that this is an extreme example of what PyPy can do. In\nparticular, you cannot expect (yet :-)) PyPy to be fast enough to run an\narbitrary video processing algorithm in real time, but the demo still proves\nthat PyPy has the potential to get there.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/07/realtime-image-processing-in-python-6985924592886873374.html" + }, + { + "title": "Global Interpreter Lock, or how to kill it", + "text": "People that listened to my (Armin Rigo) lightning talk at EuroPython know that\nsuddenly, we have a plan to remove the Global Interpreter Lock --- the\ninfamous GIL, the thing in CPython that prevents multiple threads from\nactually running in your Python code in parallel.\nThat's not actually new, because Jython has been doing it all along.\nJython works by very carefully adding locks to\nall the mutable built-in types, and by relying on the underlying Java\nplatform to be efficient about them (so that the result is faster than,\nsay, very carefully adding similar locks in CPython). By \"very\ncarefully\", I mean really really carefully; for example,\n'dict1.update(dict2)' needs to lock both dict1 and dict2, but if you do\nit naively, then a parallel 'dict2.update(dict1)' might cause a\ndeadlock.\nAll of PyPy, CPython and IronPython have a GIL. 
But for PyPy we are considering\na quite different approach than Jython's, based on Software\nTransactional Memory. This is a recent development in computer\nscience, and it gives a nicer solution than locking. Here is a short\nintroduction to it.\nSay you want to atomically pop an item from 'list1' and append it to\n'list2':\n\ndef f(list1, list2):\n x = list1.pop()\n list2.append(x)\n\nThis is not safe in multithreaded cases (even with the GIL). Say that\nyou call f(l1, l2) in thread 1 and f(l2, l1) in thread 2. What\nyou want is that it has no effect at all (x is moved from one list to\nthe other, then back). But what can occur is that instead the top of\nthe two lists are swapped, depending on timing issues.\nOne way to fix it is with a global lock:\n\ndef f(list1, list2):\n global_lock.acquire()\n x = list1.pop()\n list2.append(x)\n global_lock.release()\n\nA finer way to fix it is with locks that come with the lists:\n\ndef f(list1, list2):\n acquire_all_locks(list1.lock, list2.lock)\n x = list1.pop()\n list2.append(x)\n release_all_locks(list1.lock, list2.lock)\n\nThe second solution is a model for Jython's, while the first is a model\nfor CPython's. Indeed, in CPython's interpreter, we acquire the GIL,\nthen we do one bytecode (or actually a number of them, like 100), then\nwe release the GIL; and then we proceed to the next bunch of 100.\nSoftware Transactional Memory (STM) gives a third solution:\n\ndef f(list1, list2):\n while True:\n t = transaction()\n x = list1.pop(t)\n list2.append(t, x)\n if t.commit():\n break\n\nIn this solution, we make a transaction object and use it in all\nreads and writes we do to the lists. There are actually several\ndifferent models, but let's focus on one of them. During a transaction,\nwe don't actually change the global memory at all. Instead, we use the\nthread-local transaction object. We store in it which objects we\nread from, which objects we write to, and what values we write. 
It is\nonly when the transaction reaches its end that we attempt to \"commit\"\nit. Committing might fail if other commits have occurred in between,\ncreating inconsistencies; in that case, the transaction aborts and\nmust restart from the beginning.\nIn the same way as the previous two solutions are models for CPython and\nJython, the STM solution looks like it could be a model for PyPy in the\nfuture. In such a PyPy, the interpreter would start a transaction, do\none or several bytecodes, and then end the transaction; and repeat.\nThis is very similar to what is going on in CPython with the GIL. In\nparticular, it means that it gives programmers all the same guarantees\nas the GIL does. The only difference is that it can actually run\nmultiple threads in parallel, as long as their code does not interfere\nwith each other. (In particular, if you need not just the GIL but actual\nlocks in your existing multi-threaded program, then this will not\nmagically remove the need for them. You might get an additional built-in\nmodule that exposes STM to your Python programs, if you prefer it over\nlocks, but that's another question.)\nWhy not apply that idea to CPython? Because we would need to change\neverything everywhere. In the example above, you may have noted that I\nno longer call 'list1.pop()', but 'list1.pop(t)'; this is a way to tell\nthat the implementation of all the methods needs to be changed, in order\nto do their work \"transactionally\". This means that instead of really\nchanging the global memory in which the list is stored, it must instead\nrecord the change in the transaction object. If our interpreter is\nwritten in C, as CPython is, then we need to write it explicitly\neverywhere. If it is written instead in a higher-level language, as\nPyPy is, then we can add this behavior as a set of translation rules, and\napply them automatically wherever it is necessary. 
Moreover, it can be\na translation-time option: you can either get the current \"pypy\" with a\nGIL, or a version with STM, which would be slower due to the extra\nbookkeeping. (How much slower? I have no clue, but as a wild guess,\nmaybe between 2 and 5 times slower. That is fine if you have enough\ncores, as long as it scales nicely :-)\nA final note: as STM research is very recent (it started around 2003),\nthere are a number of variants around, and it's not clear yet which one\nis better in which cases. As far as I can tell, the approach described\nin \"A Comprehensive Strategy for Contention Management in Software\nTransactional Memory\" seems to be one possible state-of-the-art; it also\nseems to be \"good enough for all cases\".\nSo, when will it be done? I cannot say yet. It is still at the idea\nstage, but I think that it can work. How long would it take us to\nwrite it? Again no clue, but we are looking at many months rather\nthan many days. This is the sort of thing that I would\nlike to be able to work on full time after the Eurostars funding\nruns out on September 1. We are currently looking at ways to use\ncrowdfunding to raise money so that I can do exactly that. Expect\na blog post about that very soon. But this looks like a perfect\ncandidate for crowdfunding -- there are at least thousands of you who\nwould be willing to pay 10s of Euros to Kill the GIL. Now we only\nhave to make this happen.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/06/global-interpreter-lock-or-how-to-kill-8270246310848099963.html" + }, + { + "title": "Report back from our survey", + "text": "Hi all,\nI'm here to report back the results of our survey. First, we're very pleased to\nreport that a number of you guys are happily running PyPy in production! Most\n(97%) of the respondents using PyPy are using it because it's faster, but a\nfurther 26% (respondents could choose multiple answers) are using it because of\nlower memory usage. 
Of users who aren't using PyPy, the most common reason was\nC extensions, followed by \"Other\".\nFrom reading the extra comments section there are a few things we've learned:\n\nGoogle docs needs a better UI for this stuff\nA huge number of people want NumPy and SciPy, it was easily the most\nrequested C extension (25% of respondents said something about NumPy). We've\nalready blogged on the topic of our plans for NumPy.\nHaving packages in the various OS's repositories would be a big help in\ngetting users up and running.\n\nA huge thanks to everyone who responded! Finally, if you're using PyPy in\nproduction we'd love to get a testimonial from you, if you're willing to spare\na few minutes to give us a quote or two please get in contact with us via our\nmailing list.\nThanks,\nAlex", + "tags": "", + "url": "https://www.pypy.org/posts/2011/06/report-back-from-our-survey-2083371215707583264.html" + }, + { + "title": "PyPy Genova-Pegli Post-EuroPython Sprint June 27 - July 2 2011", + "text": "The next PyPy sprint will be in Genova-Pegli, Italy, the week after EuroPython\n(which is in Florence, about 3h away by train). This is a fully public sprint:\nnewcomers and topics other than those proposed below are welcome.\n\n\n\nGoals and topics of the sprint\n\n\nNow that we have released 1.5, the sprint itself is going to be mainly\nworking on fixing issues reported by various users. 
Possible topics\ninclude, but are not limited to:\n\n\nfixing issues in the bug tracker\nimprove cpyext, the C-API compatibility layer, to support more extension\nmodules\nfinish/improve/merge jitypes2, the branch which makes ctypes JIT friendly\ngeneral JIT improvements\nimprove our tools, like the jitviewer or the buildbot infrastructure\nmake your favorite module/application working on PyPy, if it doesn't yet\n\n\n\n\nOf course this does not prevent people from showing up with a more precise\ninterest in mind. If there are newcomers, we will gladly give introduction\ntalks.\n\n\nSince we are almost on the beach, we can take one day off for summer\nrelaxation and/or tourist visits nearby :-).\n\n\n\n\n\n\nExact times\nThe work days should be 27 June - 2 July 2011. People may arrive on\nthe 26th already and/or leave on the 3rd.\n\n\n\nLocation & Accommodation\nBoth the sprint venue and the lodging will be at Albergo Puppo in\nGenova-Pegli, Italy. Pegli is a nice and peaceful little quarter of Genova,\nand the hotel is directly on the beach, making it a perfect place for those\nwho want to enjoy the sea in the middle of the Italian summer, as a quick\nsearch on Google Images shows :-)\n\nThe place has a good ADSL Internet connexion with wireless installed. You can\nof course arrange your own lodging anywhere but I definitely recommend lodging\nthere too.\nPlease confirm that you are coming so that we can adjust the reservations as\nappropriate. 
The prices are as follows, and they include breakfast and a\nparking place for the car, in case you need it:\n\n\nsingle room: 70 \u20ac\ndouble room: 95 \u20ac\ntriple room: 105 \u20ac\n\n\nPlease register by hg:\n\nhttps://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/genova-pegli-2011/people.txt\nor on the pypy-dev mailing list if you do not yet have check-in rights:\n\nhttps://mail.python.org/mailman/listinfo/pypy-dev\nIn case you want to share a room with someone else but you don't know who,\nplease let us know (either by writing it directly in people.txt or by writing\non the mailing list) and we will try to arrange it.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/05/pypy-genova-pegli-post-europython-4004229800858530064.html" + }, + { + "title": "PyPy Usage Survey", + "text": "We've been working on PyPy for a long time. But readers of this blog will know\nthat in the past year something has changed: we think PyPy is production ready.\nAnd it's not just us, this week LWN.net wrote an article about how PyPy\nsped up one of their scripts by a factor of three, noting that, \"plans are to\nrun gitdm under PyPy from here on out\". All in all we think PyPy is pretty\ngreat, but not everyone is using it yet, and we want to know why. We want your\nfeedback on why PyPy isn't ready to be your only Python yet, and how we can\nimprove it to make that happen.\nTherefore, we've put together a quick survey, whether you're using PyPy or not\nif you could take a few minutes to fill it out and let us know how we're doing\nwe'd really appreciate it. You can find the form here.\nThanks,\nThe PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2011/05/pypy-usage-survey-1402303968715807009.html" + }, + { + "title": "Server migration in progress", + "text": "Hi all,\n\nWe are in the process of migrating the hosting machine for PyPy, moving away from codespeak.net and towards a mixture of custom servers (e.g. 
for buildbot.pypy.org) and wide-scale services (e.g. for the docs, now at readthedocs.org).\n\nWhen this is done, a proper announce will be posted here. In the meantime, we have already moved the mailing lists, now hosted on python.org. The subscribers' list have been copied, so if you didn't notice anything special for the past week, then everything works fine :-) This concerns pypy-dev, pypy-issue and pypy-commit. Two notes:\nSome settings have not been copied, notably if you used to disable mail delivery. Sorry about that; you have to re-enter such settings.\nFollowing the move, about 50 addresses have been dropped for being invalid. I'm unsure why they were not dropped earlier, but in case sending mail to you from python.org instead of codespeak.net fails, then you have been dropped from the mailing lists, and you need to subscribe again.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/05/server-migration-in-progress-2113491786141182920.html" + }, + { + "title": "Playing with Linear Programming on PyPy", + "text": "Fancy hi-level interfaces often come with a high runtime overhead\nmaking them slow. Here is an experiment with building such an\ninterface using constructions that PyPy should be good at\noptimizing. The idea is to allow the JIT in PyPy to remove the\noverhead introduced by using a fancy high-level python interface\non top of a low-level C interface. The application considered is\nLinear\nprogramming. It is a tool used to solve linear optimization\nproblems. It can for example be used to find the nonnegative values\nx, y and z that gives the maximum value of\n\n\n\n\n\nwithout violating the constraints\n\n\n\n\n\n\n\nThere exists general purpose solvers for these kind of problems that\nare very fast and can literally handle millions of variables. To use\nthem however the problem has to be transformed into some specific\nmatrix form, and the coefficients of all the matrices\nhas to be passed to the solver using some API. 
This transformation is\na tedious and error prone step that forces you to work with matrix\nindexes instead of readable variable names. Also it makes maintaining\nan implementation hard since any modification has to be transformed\ntoo.\n\n\nThe example above comes from the manual of\nthe glpk library. That\nmanual continues by describing how to convert this problem into the\nstandard form of glpk (which involves introducing three new variables)\nand then gives the c-code needed to call the\nlibrary. Relating that c-code to the problem above without the\nintermediate explanation of the manual is not easy. A common\nsolution here is to build a hi-level interface that allows a more\nnatural way of defining the matrices and/or allow the equations to be\nentered symbolically. Unfortunately, such interfaces often become\nslow. For the benchmark below for example, \ncvxopt\nrequires 20 minutes to setup a problem that takes 9.43 seconds to solve\n(this seems a bit extreme, am I doing something wrong?).\n\n\nThe high-level interface I constructed on top of the\nglpk library is \npplp and it allows\nthe equations to be entered symbolically. The above problem can be\nsolved using\n\n lp = LinearProgram()\n x, y, z = lp.IntVar(), lp.IntVar(), lp.IntVar()\n lp.objective = 10*x + 6*y + 4*z\n lp.add_constraint( x + y + z <= 100 )\n lp.add_constraint( 10*x + 4*y + 5*z <= 600 )\n lp.add_constraint( 2*x + 2*y + 6*z <= 300 )\n lp.add_constraint( x >= 0 )\n lp.add_constraint( y >= 0 )\n lp.add_constraint( z >= 0 )\n\n maxval = lp.maximize()\n print maxval\n print x.value, y.value, z.value\n\n\n\nTo benchmark the API I used it to solve a \nminimum-cost\n flow problem with 154072 nodes and 390334 arcs. The C library\n needs 9.43 s to solve this and the pplp interface adds another 5.89\n s under PyPy and 28.17 s under CPython. A large amount of time is\n still spend setting up the problem, but it's a significant\n improvement over the 20 minutes required on CPython by\n cvxopt. 
It is\n probably not designed to be fast on this kind of benchmark. I have\n not been able to get cvxopt to work under PyPy. The benchmark used is\n available here", + "tags": "", + "url": "https://www.pypy.org/posts/2011/05/playing-with-linear-programming-on-pypy-4040572987275633047.html" + }, + { + "title": "NumPy Follow up", + "text": "Hi everyone. Since yesterday's blog post we got a ton of feedback, so we want\nto clarify a few things, as well as share some of the progress we've made, in\nonly the 24 hours since the post.\nReusing the original NumPy\nFirst, a lot of people have asked why we cannot just reuse the original NumPy\nthrough cpyext, our CPython C-API compatibility layer. We believe this is\nnot the best approach, for a few reasons:\n\n\ncpyext is slow, and always will be slow. It has to emulate far too many\ndetails of the CPython object model that don't exist on PyPy (e.g.,\nreference counting). Since people are using NumPy primarily for speed this\nwould mean that even if we could have a working NumPy, no one would want to\nuse it. Also, as soon as the execution crosses the cpyext boundary, it\nbecomes invisible to the JIT, which means the JIT has to assume the worst\nand deoptimize stuff away.\nNumPy uses many obscure documented and undocumented details of the CPython\nC-API. Emulating these is often difficult or impossible (e.g. we can't fix\naccessing a struct field, as there's no function call for us to intercept).\nIt's not much fun. Frankly, working on cpyext, debugging the crashes,\nand everything else that goes with it is not terribly fun, especially when\nyou know that the end result will be slow. 
We've demonstrated we can build\na much faster NumPy, in a way that's more fun, and given that the people\nworking on this are volunteers, it's important to keep us motivated.\n\n\nFinally, we are not proposing to rewrite the entirety of NumPy or, god\nforbid, BLAST, or any of the low level stuff that operates on C-level arrays,\nonly the parts that interface with Python code directly.\nC bindings vs. CPython C-API\nThere are two issues on C code, one has a very nice story, and the other not so\nmuch. First is the case of arbitrary C-code that isn't Python related, things\nlike libsqlite, libbz2, or any random C shared library on your system.\nPyPy will quite happily call into these, and bindings can be developed either\nat the RPython level (using rffi) or in pure Python, using ctypes.\nWriting bindings with ctypes has the advantage that they can run on every\nalternative Python implementation, such as Jython and IronPython. Moreover,\nonce we merge the jittypes2 branch ctypes calls will even be smoking\nfast.\nOn the other hand there is the CPython C-extension API. This is a very specific\nAPI which CPython exposes, and PyPy tries to emulate. It will never be fast,\nbecause there is far too much overhead in all the emulation that needs to be\ndone.\nOne of the reasons people write C extensions is for speed. 
Often, with PyPy\nyou can just forget about C, write everything in pure python and let the JIT to\ndo its magic.\nIn case the PyPy JIT alone isn't fast enough, or you just want to\nuse existing C code then it might make sense to split\nyour C-extension into 2 parts, one which doesn't touch the CPython C-API and\nthus can be loaded with ctypes and called from PyPy, and another which does\nthe interfacing with Python for CPython (where it will be faster).\nThere are also libraries written in C to interface with existing C codebases,\nbut for whom performance is not the largest goal, for these the right solution\nis to try using CPyExt, and if it works that's great, but if it fails the\nsolution will be to rewrite using ctypes, where it will work on all Python\nVMs, not just CPython.\nAnd finally there are rare cases where rewriting in RPython makes more sense,\nNumPy is one of the few examples of these because we need to be able to give\nthe JIT hints on how to appropriately vectorize all of the operations on an\narray. In general writing in RPython is not necessary for almost any\nlibraries, NumPy is something of a special case because it is so ubiquitous\nthat every ounce of speed is valuable, and makes the way people use it leads to\ncode structure where the JIT benefits enormously from extra hints and the\nability to manipulate memory directly, which is not possible from Python.\nProgress\nOn a more positive note, after we published the last post, several new people\ncame and contributed improvements to the numpy-exp branch. 
We would like to\nthank all of them:\n\n\nnightless_night contributed: An implementation of __len__, fixed bounds\nchecks on __getitem__ and __setitem__.\nbrentp contributed: Subtraction and division on NumPy arrays.\nMostAwesomeDude contributed: Multiplication on NumPy arrays.\nhodgestar contributed: Binary operations between floats and NumPy arrays.\n\n\nThose last two were technically an outstanding branch we finally merged, but\nhopefully you get the picture. In addition there was some exciting work done by\nregular PyPy contributors. I hope it's clear that there's a place to jump in\nfor people with any level of PyPy familiarity. If you're interested in\ncontributing please stop by #pypy on irc.freenode.net, the pypy-dev mailing\nlist, or send us pull requests on bitbucket.\nAlex", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2011/05/numpy-follow-up-6928627691060102514.html" + }, + { + "title": "Numpy in PyPy - status and roadmap", + "text": "Hello.\nNumPy integration is one of the single most requested features for PyPy. This\npost tries to describe where we are, what we plan (or what we don't plan), and\nhow you can help.\nShort version for the impatient: we are doing experiments, which show that\nPyPy+numpy can be faster and better than CPython+numpy. We have a plan on how\nto move forward, but at the moment there is lack of dedicated people or money\nto tackle it.\n\nThe slightly longer version\nIntegrating numpy in PyPy has been my pet project on an on-and-off (mostly off)\nbasis over the past two years. There were some experiments, then a long\npause, and then some more experiments which are documented below.\nThe general idea is not to use the existing CPython module, but to\nreimplement numpy in RPython (i.e. the language PyPy is implemented in), thus\nletting our JIT achieve extra speedups. 
The really cool thing about this part\nis that numpy will automatically benefit from any general JIT improvements,\nwithout any need of extra tweaking.\nAt the moment, there is a branch called numpy-exp which contains a\ntranslatable version of a very minimal version of numpy in the module called\nmicronumpy. Example benchmarks show the following:\n\n\n\n\n\n\n\n\u00a0\nadd\niterate\n\nCPython 2.6.5 with numpy 1.3.0\n0.260s (1x)\n4.2 (1x)\n\nPyPy numpy-exp @ 3a9d77b789e1\n0.120s (2.2x)\n0.087 (48x)\n\n\n\nThe add benchmark spends most of the time inside the + operator on\narrays (doing a + a + a + a + a), which in CPython is implemented in C.\nAs you can see from the table above, the PyPy version is already ~2 times\nfaster. (Although numexpr is still faster than PyPy, but we're working on it).\nThe exact way array addition is implemented is worth another blog post, but in\nshort it lazily evaluates the expression and computes it at the end, avoiding\nintermediate results. This approach scales much better than numexpr\nand can lead to speeding up all the operations that you can perform on matrices.\nThe next obvious step to get even more speedups would be to extend the JIT to\nuse SSE operations on x86 CPUs, which should speed it up by about additional\n2x, as well as using multiple threads to do operations.\niterate is also interesting, but for entirely different reasons. On CPython\nit spends most of the time inside a Python loop; the PyPy version is ~48 times\nfaster, because the JIT can optimize across the python/numpy boundary, showing\nthe potential of this approach, users are not grossly penalized for writing\ntheir loops in Python.\nThe drawback of this approach is that we need to reimplement numpy in RPython,\nwhich takes time. 
A very rough estimate is that it would be possible to\nimplement an useful subset of it (for some definition of useful) in a period\nof time comprised between one and three man-months.\nIt also seems that the result will be faster for most cases and the same speed\nas original numpy for other cases. The only problem is finding the dedicated\npersons willing to spend quite some time on this and however, I am willing to\nboth mentor such a person and encourage him or her.\nThe good starting point for helping would be to look at what's already\nimplemented in micronumpy modules and try extending it. Adding a - operator\nor adding integers would be an interesting start. Drop by on #pypy on\nirc.freenode.net or get in contact with developers via some other channel (such\nas the pypy-dev mailing list) if you want to help.\nAnother option would be to sponsor NumPy development. In case you're\ninterested, please get in touch with us or leave your email in comments.\nCheers,\nfijal", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2011/05/numpy-in-pypy-status-and-roadmap-8332894230779779992.html" + }, + { + "title": "PyPy 1.5 Released: Catching Up", + "text": "We're pleased to announce the 1.5 release of PyPy. This release updates\nPyPy with the features of CPython 2.7.1, including the standard library. Thus\nall the features of CPython 2.6 and CPython 2.7 are now supported. It\nalso contains additional performance improvements. You can download it here:\n\nhttps://pypy.org/download.html\n\nWhat is PyPy?\nPyPy is a very compliant Python interpreter, almost a drop-in replacement for\nCPython 2.7.1. It's fast (pypy 1.5 and cpython 2.6.2 performance comparison)\ndue to its integrated tracing JIT compiler.\nThis release includes the features of CPython 2.6 and 2.7. It also includes a\nlarge number of small improvements to the tracing JIT compiler. It supports\nIntel machines running Linux 32/64 or Mac OS X. 
Windows is beta (it roughly\nworks but a lot of small issues have not been fixed so far). Windows 64 is\nnot yet supported.\nNumerous speed achievements are described on our blog. Normalized speed\ncharts comparing pypy 1.5 and pypy 1.4 as well as pypy 1.5 and cpython\n2.6.2 are available on our benchmark website. The speed improvement over 1.4\nseems to be around 25% on average.\n\n\nMore highlights\n\nThe largest change in PyPy's tracing JIT is adding support for loop invariant\ncode motion, which was mostly done by H\u00e5kan Ard\u00f6. This feature improves the\nperformance of tight loops doing numerical calculations.\nThe CPython extension module API has been improved and now supports many more\nextensions. For information on which one are supported, please refer to our\ncompatibility wiki.\nThese changes make it possible to support Tkinter and IDLE.\nThe cProfile profiler is now working with the JIT. However, it skews the\nperformance in unstudied ways. Therefore it is not yet usable to analyze\nsubtle performance problems (the same is true for CPython of course).\nThere is an external fork which includes an RPython version of the\npostgresql. 
However, there are no prebuilt binaries for this.\nOur developer documentation was moved to Sphinx and cleaned up.\nand many small things :-)\n\nCheers,\nCarl Friedrich Bolz, Laura Creighton, Antonio Cuni, Maciej Fijalkowski,\nAmaury Forgeot d'Arc, Alex Gaynor, Armin Rigo and the PyPy team", + "tags": "release", + "url": "https://www.pypy.org/posts/2011/04/pypy-15-released-catching-up-302997959079576809.html" + }, + { + "title": "Using Tkinter and IDLE with PyPy", + "text": "We are pleased to announce that Tkinter, the GUI library based on TCL/TK, now\nworks with PyPy.\nTkinter is composed of two parts:\n\n\n_tkinter, a module written in C which interfaces with the TCL world\nTkinter, a pure Python package which wraps _tkinter to expose the\npythonic API we are used to\n\n\n\n\n\n\nThe PyPy version of _tkinter reuses the C code of as found in CPython and\ncompile it through the PyPy C-API compatibility layer, cpyext. To make it\nwork with PyPy, we had to modify it slightly, in order to remove the\ndependency on some API functions which are not supported by PyPy. In particular, we\nremoved the dependency on the PyOS_InputHook variable, which allows a nice\nintegration of Tkinter and the Python interactive prompt: the result is that,\nunlike CPython, in PyPy Tk windows created at the interactive prompt are not\nshown until we manually call the mainloop method. Apart from this\ninconvenience, all the rest works fine.\nAt the moment, _tkinter is not distributed with PyPy because our build\nsystem does not support automatic compilation of C extension. 
Instead, it is\nnecessary to install it manually, either directly from source or by\neasy_installing/pip installing tkinter-pypy from PyPI.\nFor everything to work correctly, you need a recent build of PyPy: the\nfollowing is a step-by-step guide to install _tkinter in a PyPy nightly\nbuild for Linux 64 bit; for other architectures, look at the nightly build\npage:\n$ wget https://buildbot.pypy.org/nightly/trunk/pypy-c-jit-43485-1615dfd7d8f1-linux64.tar.bz2\n\n$ tar xfv pypy-c-jit-43485-1615dfd7d8f1-linux64.tar.bz2\n\n$ cd pypy-c-jit-43485-1615dfd7d8f1-linux64/\n\n$ wget https://peak.telecommunity.com/dist/ez_setup.py\n\n$ ./bin/pypy ez_setup.py # install setuptools\n\n$ ./bin/easy_install tkinter-pypy\n\nOnce you complete the steps above, you can start using Tkinter from your\npython programs. In particular, you can use IDLE, the IDE which is part of\nthe Python standard library. To start IDLE, type:\n$ ./bin/pypy -m idlelib.idle\n\nHave fun :-)", + "tags": "", + "url": "https://www.pypy.org/posts/2011/04/using-tkinter-and-idle-with-pypy-6156563216925585965.html" + }, + { + "title": "Tutorial Part 2: Adding a JIT", + "text": "This is the second part of a tutorial written by Andrew Brown. The first\npart described how to write an interpreter with PyPy.\n\nAdding JIT\nTranslating RPython to C is pretty cool, but one of the best features of PyPy\nis its ability to generate just-in-time compilers for your interpreter.\nThat's right, from just a couple hints on how your interpreter is structured,\nPyPy will generate and include a JIT compiler that will, at runtime, translate\nthe interpreted code of our BF language to machine code!\nSo what do we need to tell PyPy to make this happen? First it needs to know\nwhere the start of your bytecode evaluation loop is. This lets it keep track of\ninstructions being executed in the target language (BF).\nWe also need to let it know what defines a particular execution frame. 
Since\nour language doesn't really have stack frames, this boils down to what's\nconstant for the execution of a particular instruction, and what's not. These\nare called \"green\" and \"red\" variables, respectively.\nRefer back to example2.py for the following.\nIn our main loop, there are four variables used: pc, program, bracket_map, and\ntape. Of those, pc, program, and bracket_map are all green variables. They\ndefine the execution of a particular instruction. If the JIT routines see the\nsame combination of green variables as before, it knows it's skipped back and\nmust be executing a loop. The variable \"tape\" is our red variable, it's what's\nbeing manipulated by the execution.\nSo let's tell PyPy this info. Start by importing the JitDriver class and making\nan instance:\nfrom pypy.rlib.jit import JitDriver\njitdriver = JitDriver(greens=['pc', 'program', 'bracket_map'],\n reds=['tape'])\n\nAnd we add this line to the very top of the while loop in the mainloop\nfunction:\njitdriver.jit_merge_point(pc=pc, tape=tape, program=program,\n bracket_map=bracket_map)\n\nWe also need to define a JitPolicy. We're not doing anything fancy, so this is\nall we need somewhere in the file:\ndef jitpolicy(driver):\n from pypy.jit.codewriter.policy import JitPolicy\n return JitPolicy()\n\nSee this example at example3.py\nNow try translating again, but with the flag --opt=jit:\n\n$ python ./pypy/pypy/translator/goal/translate.py --opt=jit example3.py\n\nIt will take significantly longer to translate with JIT enabled, almost 8\nminutes on my machine, and the resulting binary will be much larger. When it's\ndone, try having it run the mandelbrot program again. A world of difference,\nfrom 12 seconds compared to 45 seconds before!\nInterestingly enough, you can see when the JIT compiler switches from\ninterpreted to machine code with the mandelbrot example. 
The first few lines of\noutput come out pretty fast, and then the program gets a boost of speed and\ngets even faster.\n\n\nA bit about Tracing JIT Compilers\nIt's worth it at this point to read up on how tracing JIT compilers work.\nHere's a brief explanation: The interpreter is usually running your interpreter\ncode as written. When it detects a loop of code in the target language (BF) is\nexecuted often, that loop is considered \"hot\" and marked to be traced. The next\ntime that loop is entered, the interpreter gets put in tracing mode where every\nexecuted instruction is logged.\nWhen the loop is finished, tracing stops. The trace of the loop is sent to an\noptimizer, and then to an assembler which outputs machine code. That machine\ncode is then used for subsequent loop iterations.\nThis machine code is often optimized for the most common case, and depends on\nseveral assumptions about the code. Therefore, the machine code will contain\nguards, to validate those assumptions. If a guard check fails, the runtime\nfalls back to regular interpreted mode.\nA good place to start for more information is\nhttps://en.wikipedia.org/wiki/Just-in-time_compilation\n\n\nDebugging and Trace Logs\nCan we do any better? How can we see what the JIT is doing? 
Let's do two\nthings.\nFirst, let's add a get_printable_location function, which is used during debug\ntrace logging:\ndef get_location(pc, program, bracket_map):\n return \"%s_%s_%s\" % (\n program[:pc], program[pc], program[pc+1:]\n )\njitdriver = JitDriver(greens=['pc', 'program', 'bracket_map'], reds=['tape'],\n get_printable_location=get_location)\n\nThis function is passed in the green variables, and should return a string.\nHere, we're printing out the BF code, surrounding the currently executing\ninstruction with underscores so we can see where it is.\nDownload this as example4.py and translate it the same as example3.py.\nNow let's run a test program (test.b, which just prints the letter \"A\" 15 or so\ntimes in a loop) with trace logging:\n\n$ PYPYLOG=jit-log-opt:logfile ./example4-c test.b\n\nNow take a look at the file \"logfile\". This file is quite hard to read, so\nhere's my best shot at explaining it.\nThe file contains a log of every trace that was performed, and is essentially a\nglimpse at what instructions it's compiling to machine code for you. 
It's\nuseful to see if there are unnecessary instructions or room for optimization.\nEach trace starts with a line that looks like this:\n\n[3c091099e7a4a7] {jit-log-opt-loop\n\nand ends with a line like this:\n\n[3c091099eae17d jit-log-opt-loop}\n\nThe next line tells you which loop number it is, and how many ops are in it.\nIn my case, the first trace looks like this:\n 1\n 2\n 3\n 4\n 5\n 6\n 7\n 8\n 9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29 [3c167c92b9118f] {jit-log-opt-loop\n # Loop 0 : loop with 26 ops\n [p0, p1, i2, i3]\n debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)\n debug_merge_point('+<[>[>_+_<-]>.[<+>-]<<-]++++++++++.', 0)\n i4 = getarrayitem_gc(p1, i2, descr=)\n i6 = int_add(i4, 1)\n setarrayitem_gc(p1, i2, i6, descr=)\n debug_merge_point('+<[>[>+_<_-]>.[<+>-]<<-]++++++++++.', 0)\n debug_merge_point('+<[>[>+<_-_]>.[<+>-]<<-]++++++++++.', 0)\n i7 = getarrayitem_gc(p1, i3, descr=)\n i9 = int_sub(i7, 1)\n setarrayitem_gc(p1, i3, i9, descr=)\n debug_merge_point('+<[>[>+<-_]_>.[<+>-]<<-]++++++++++.', 0)\n i10 = int_is_true(i9)\n guard_true(i10, descr=) [p0]\n i14 = call(ConstClass(ll_dict_lookup__dicttablePtr_Signed_Signed), ConstPtr(ptr12), 90, 90, descr=)\n guard_no_exception(, descr=) [i14, p0]\n i16 = int_and(i14, -9223372036854775808)\n i17 = int_is_true(i16)\n guard_false(i17, descr=) [i14, p0]\n i19 = call(ConstClass(ll_get_value__dicttablePtr_Signed), ConstPtr(ptr12), i14, descr=)\n guard_no_exception(, descr=) [i19, p0]\n i21 = int_add(i19, 1)\n i23 = int_lt(i21, 114)\n guard_true(i23, descr=) [i21, p0]\n guard_value(i21, 86, descr=) [i21, p0]\n debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)\n jump(p0, p1, i2, i3, descr=)\n [3c167c92bc6a15] jit-log-opt-loop}\n\nI've trimmed the debug_merge_point lines a bit, they were really long.\nSo let's see what this does. This trace takes 4 parameters: 2 object pointers\n(p0 and p1) and 2 integers (i2 and i3). 
Looking at the debug lines, it seems to\nbe tracing one iteration of this loop: \"[>+<-]\"\nIt starts executing the first operation on line 4, a \">\", but immediately\nstarts executing the next operation. The \">\" had no instructions, and looks\nlike it was optimized out completely. This loop must always act on the same\npart of the tape, the tape pointer is constant for this trace. An explicit\nadvance operation is unnecessary.\nLines 5 to 8 are the instructions for the \"+\" operation. First it gets the\narray item from the array in pointer p1 at index i2 (line 6), adds 1 to it and\nstores it in i6 (line 7), and stores it back in the array (line 8).\nLine 9 starts the \"<\" instruction, but it is another no-op. It seems that i2\nand i3 passed into this routine are the two tape pointers used in this loop\nalready calculated. Also deduced is that p1 is the tape array. It's not clear\nwhat p0 is.\nLines 10 through 13 perform the \"-\" operation: get the array value (line 11),\nsubtract (line 12) and set the array value (line 13).\nNext, on line 14, we come to the \"]\" operation. Lines 15 and 16 check whether\ni9 is true (non-zero). Looking up, i9 is the array value that we just\ndecremented and stored, now being checked as the loop condition, as expected\n(remember the definition of \"]\"). Line 16 is a guard, if the condition is not\nmet, execution jumps somewhere else, in this case to the routine called\n and is passed one parameter: p0.\nAssuming we pass the guard, lines 17 through 23 are doing the dictionary lookup\nto bracket_map to find where the program counter should jump to. I'm not too\nfamiliar with what the instructions are actually doing, but it looks like there\nare two external calls and 3 guards. This seems quite expensive, especially\nsince we know bracket_map will never change (PyPy doesn't know that). We'll\nsee below how to optimize this.\nLine 24 increments the newly acquired instruction pointer. 
Lines 25 and 26 make\nsure it's less than the program's length.\nAdditionally, line 27 guards that i21, the incremented instruction pointer, is\nexactly 86. This is because it's about to jump to the beginning (line 29) and\nthe instruction pointer being 86 is a precondition to this block.\nFinally, the loop closes up at line 28 so the JIT can jump to loop body \nto handle that case (line 29), which is the beginning of the loop again. It\npasses in parameters (p0, p1, i2, i3).\n\n\nOptimizing\nAs mentioned, every loop iteration does a dictionary lookup to find the\ncorresponding matching bracket for the final jump. This is terribly\ninefficient, the jump target is not going to change from one loop to the next.\nThis information is constant and should be compiled in as such.\nThe problem is that the lookups are coming from a dictionary, and PyPy is\ntreating it as opaque. It doesn't know the dictionary isn't being modified or\nisn't going to return something different on each query.\nWhat we need to do is provide another hint to the translation to say that the\ndictionary query is a pure function, that is, its output depends only on its\ninputs and the same inputs should always return the same output.\nTo do this, we use a provided function decorator pypy.rlib.jit.purefunction,\nand wrap the dictionary call in a decorated function:\n@purefunction\ndef get_matching_bracket(bracket_map, pc):\n return bracket_map[pc]\n\nThis version can be found at example5.py\nTranslate again with the JIT option and observe the speedup. Mandelbrot now\nonly takes 6 seconds! 
(from 12 seconds before this optimization)\nLet's take a look at the trace from the same function:\n[3c29fad7b792b0] {jit-log-opt-loop\n# Loop 0 : loop with 15 ops\n[p0, p1, i2, i3]\ndebug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)\ndebug_merge_point('+<[>[>_+_<-]>.[<+>-]<<-]++++++++++.', 0)\ni4 = getarrayitem_gc(p1, i2, descr=)\ni6 = int_add(i4, 1)\nsetarrayitem_gc(p1, i2, i6, descr=)\ndebug_merge_point('+<[>[>+_<_-]>.[<+>-]<<-]++++++++++.', 0)\ndebug_merge_point('+<[>[>+<_-_]>.[<+>-]<<-]++++++++++.', 0)\ni7 = getarrayitem_gc(p1, i3, descr=)\ni9 = int_sub(i7, 1)\nsetarrayitem_gc(p1, i3, i9, descr=)\ndebug_merge_point('+<[>[>+<-_]_>.[<+>-]<<-]++++++++++.', 0)\ni10 = int_is_true(i9)\nguard_true(i10, descr=) [p0]\ndebug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)\njump(p0, p1, i2, i3, descr=)\n[3c29fad7ba32ec] jit-log-opt-loop}\n\nMuch better! Each loop iteration is an add, a subtract, two array loads, two\narray stores, and a guard on the exit condition. That's it! This code doesn't\nrequire any program counter manipulation.\nI'm no expert on optimizations, this tip was suggested by Armin Rigo on the\npypy-dev list. Carl Friedrich has a series of posts on how to optimize your\ninterpreter that are also very useful: https://bit.ly/bundles/cfbolz/1\n\n\nFinal Words\nI hope this has shown some of you what PyPy is all about other than a faster\nimplementation of Python.\nFor those that would like to know more about how the process works, there are\nseveral academic papers explaining the process in detail that I recommend. 
In\nparticular: Tracing the Meta-Level: PyPy's Tracing JIT Compiler.\nSee https://readthedocs.org/docs/pypy/en/latest/extradoc.html", + "tags": "", + "url": "https://www.pypy.org/posts/2011/04/tutorial-part-2-adding-jit-8121732841568309472.html" + }, + { + "title": "Tutorial: Writing an Interpreter with PyPy, Part 1", + "text": "This is a guest blog post written by Andrew Brown, with help from the PyPy developers\non the pypy-dev mailing list.\nThis tutorial's master copy and supporting files live at\nhttps://bitbucket.org/brownan/pypy-tutorial/\n\nWhen I first learned about the PyPy project, it took me a while to figure out\nexactly what it was about. For those that don't already know, it's two things:\n\nA set of tools for implementing interpreters for interpreted languages\nAn implementation of Python using this toolchain\n\nThe second part is probably what most people think PyPy is, but this tutorial\nis not about their Python interpreter. It is about writing your own\ninterpreter for your own language.\nThis is the project I undertook to help myself better understand how PyPy works\nand what it's all about.\nThis tutorial assumes you know very little about PyPy, how it works, and even\nwhat it's all about. I'm starting from the very beginning here.\n\nWhat PyPy Does\nHere's a brief overview of what PyPy can do. Let's say you want to write an\ninterpreted language. This involves writing some kind of source code parser, a\nbytecode interpretation loop, and lots of standard library code.\nThat's quite a bit of work for moderately complicated languages, and there's a\nlot of low level work involved. Writing the parser and compiler code usually\nisn't fun, that's why there are tools out there to generate parsers and\ncompilers for you.\nEven then, you still must worry about memory management in your interpreter,\nand you're going to be re-implementing a lot if you want data types like\narbitrary precision integers, nice general hash tables, and such. 
It's enough\nto put someone off from implementing their idea for a language.\nWouldn't it be nice if you could write your language in an existing high level\nlanguage like, for example, Python? That sure would be ideal, you'd get all the\nadvantages of a high level language like automatic memory management and rich\ndata types at your disposal. Oh, but an interpreted language interpreting\nanother language would be slow, right? That's twice as much interpreting going\non.\nAs you may have guessed, PyPy solves this problem. PyPy is a sophisticated\ntoolchain for analyzing and translating your interpreter code to C code (or JVM\nor CLI). This process is called \"translation\", and it knows how to translate\nquite a lot of Python's syntax and standard libraries, but not everything. All\nyou have to do is write your interpreter in RPython, a subset of the Python\nlanguage carefully defined to allow this kind of analysis and translation, and\nPyPy will produce for you a very efficient interpreter.\nBecause efficient interpreters should not be hard to write.\n\n\nThe Language\nThe language I've chosen to implement is dead simple. The language runtime\nconsists of a tape of integers, all initialized to zero, and a single pointer\nto one of the tape's cells. The language has 8 commands, described here:\n\n>\nMoves the tape pointer one cell to the right\n\n\n<\nMoves the tape pointer one cell to the left\n+\nIncrements the value of the cell underneath the pointer\n-\nDecrements the value of the cell underneath the pointer\n\n\n[\nIf the cell under the current pointer is 0, skip to the instruction after\nthe matching ]\n\n\n]\nSkip back to the matching [ (evaluating its condition)\n\n\n.\nPrint out a single byte to stdout from the cell under the pointer\n\n\n,\nRead in a single byte from stdin to the cell under the pointer\n\nAny unrecognized bytes are ignored.\nSome of you may recognize this language. 
I will be referring to it as BF.\nOne thing to notice is that the language is its own bytecode; there is no\ntranslation from source code to bytecode. This means that the language can be\ninterpreted directly: the main eval loop of our interpreter will operate right\non the source code. This simplifies the implementation quite a bit.\n\n\nFirst Steps\nLet's start out by writing a BF interpreter in plain old Python. The first step\nis sketching out an eval loop:\ndef mainloop(program):\n tape = Tape()\n pc = 0\n while pc < len(program):\n code = program[pc]\n\n if code == \">\":\n tape.advance()\n elif code == \"<\":\n tape.devance()\n elif code == \"+\":\n tape.inc()\n elif code == \"-\":\n tape.dec()\n elif code == \".\":\n sys.stdout.write(chr(tape.get()))\n elif code == \",\":\n tape.set(ord(sys.stdin.read(1)))\n elif code == \"[\" and value() == 0:\n # Skip forward to the matching ]\n elif code == \"]\" and value() != 0:\n # Skip back to the matching [\n\n pc += 1\n\nAs you can see, a program counter (pc) holds the current instruction index. The\nfirst statement in the loop gets the instruction to execute, and then a\ncompound if statement decides how to execute that instruction.\nThe implementation of [ and ] are left out here, but they should change the\nprogram counter to the value of the matching bracket. 
(The pc then gets\nincremented, so the condition is evaluated once when entering a loop, and once\nat the end of each iteration)\nHere's the implementation of the Tape class, which holds the tape's values as\nwell as the tape pointer:\nclass Tape(object):\n def __init__(self):\n self.thetape = [0]\n self.position = 0\n\n def get(self):\n return self.thetape[self.position]\n def set(self, val):\n self.thetape[self.position] = val\n def inc(self):\n self.thetape[self.position] += 1\n def dec(self):\n self.thetape[self.position] -= 1\n def advance(self):\n self.position += 1\n if len(self.thetape) <= self.position:\n self.thetape.append(0)\n def devance(self):\n self.position -= 1\n\nAs you can see, the tape expands as needed to the right, indefinitely. We\nshould really add some error checking to make sure the pointer doesn't go\nnegative, but I'm not worrying about that now.\nExcept for the omission of the \"[\" and \"]\" implementation, this code will work\nfine. However, if the program has a lot of comments, it will have to skip over\nthem one byte at a time at runtime. So let's parse those out once and for all.\nAt the same time, we'll build a dictionary mapping between brackets, so that\nfinding a matching bracket is just a single dictionary lookup. 
Here's how:\ndef parse(program):\n parsed = []\n bracket_map = {}\n leftstack = []\n\n pc = 0\n for char in program:\n if char in ('[', ']', '<', '>', '+', '-', ',', '.'):\n parsed.append(char)\n\n if char == '[':\n leftstack.append(pc)\n elif char == ']':\n left = leftstack.pop()\n right = pc\n bracket_map[left] = right\n bracket_map[right] = left\n pc += 1\n\n return \"\".join(parsed), bracket_map\n\nThis returns a string with all invalid instructions removed, and a dictionary\nmapping bracket indexes to their matching bracket index.\nAll we need is some glue code and we have a working BF interpreter:\ndef run(input):\n program, map = parse(input.read())\n mainloop(program, map)\n\nif __name__ == \"__main__\":\n import sys\n run(open(sys.argv[1], 'r'))\n\nIf you're following along at home, you'll also need to change the signature of\nmainloop() and implement the bracket branches of the if statement. Here's the\ncomplete example: example1.py\nAt this point you can try it out to see that it works by running the\ninterpreter under python, but be warned, it will be very slow on the more\ncomplex examples:\n\n$ python example1.py 99bottles.b\n\nYou can find mandel.b and several other example programs (not written by me) in\nmy repository.\n\n\nPyPy Translation\nBut this is not about writing a BF interpreter, this is about PyPy. So what\ndoes it take to get PyPy to translate this into a super-fast executable?\nAs a side note, there are some simple examples in the pypy/translator/goal\ndirectory of the PyPy source tree that are helpful here. My starting point for\nlearning this was the example \"targetnopstandalone.py\", a simple hello world\nfor PyPy.\nFor our example, the module must define a name called \"target\" which returns the\nentry point. 
The translation process imports your module and looks for that\nname, calls it, and the function object returned is where it starts the\ntranslation.\ndef run(fp):\n program_contents = \"\"\n while True:\n read = os.read(fp, 4096)\n if len(read) == 0:\n break\n program_contents += read\n os.close(fp)\n program, bm = parse(program_contents)\n mainloop(program, bm)\n\ndef entry_point(argv):\n try:\n filename = argv[1]\n except IndexError:\n print \"You must supply a filename\"\n return 1\n\n run(os.open(filename, os.O_RDONLY, 0777))\n return 0\n\ndef target(*args):\n return entry_point, None\n\nif __name__ == \"__main__\":\n entry_point(sys.argv)\n\nThe entry_point function is passed the command line arguments when you run the\nresulting executable.\nA few other things have changed here too. See the next section...\n\n\nAbout RPython\nLet's talk a bit about RPython at this point. PyPy can't translate arbitrary\nPython code because Python is a bit too dynamic. There are restrictions on what\nstandard library functions and what syntax constructs one can use. I won't be\ngoing over all the restrictions, but for more information see\nhttps://readthedocs.org/docs/pypy/en/latest/coding-guide.html#restricted-python\nIn the example above, you'll see a few things have changed. I'm now using low\nlevel file descriptors with os.open and os.read instead of file objects.\nThe implementation of \".\" and \",\" are similarly tweaked (not shown above).\nThose are the only changes to make to this code, the rest is simple enough for\nPyPy to digest.\nThat wasn't so hard, was it? I still get to use dictionaries, expandable lists,\nand even classes and objects! 
And if low level file descriptors are too low for\nyou, there are some helpful abstractions in the rlib.streamio module included\nwith PyPy's \"RPython standard library.\"\nFor the example thus far, see example2.py\n\n\nTranslating\nIf you haven't already, check yourself out the latest version of PyPy from\ntheir bitbucket.org repository:\n\n$ hg clone https://bitbucket.org/pypy/pypy\n\n(A recent revision is necessary because of a bugfix that makes my example\npossible)\nThe script to run is in \"pypy/translator/goal/translate.py\". Run this script,\npassing in our example module as an argument.\n[A note added much later: this script has been moved to \"rpython/bin/rpython\".]\n\n$ python ./pypy/pypy/translator/goal/translate.py example2.py\n\n(You can use PyPy's python interpreter for extra speed, but it's not necessary)\nPyPy will churn for a bit, drawing some nice looking fractals to your console\nwhile it works. It takes around 20 seconds on my machine.\nThe result from this is an executable binary that interprets BF programs.\nIncluded in my repository are some example BF programs, including a mandelbrot\nfractal generator, which takes about 45 seconds to run on my computer. Try it\nout:\n\n$ ./example2-c mandel.b\n\nCompare this to running the interpreter un-translated on top of python:\n\n$ python example2.py mandel.b\n\nTakes forever, doesn't it?\nSo there you have it. We've successfully written our own interpreter in RPython\nand translated it with the PyPy toolchain.\n\n(more in the next blog post...)", + "tags": "", + "url": "https://www.pypy.org/posts/2011/04/tutorial-writing-interpreter-with-pypy-3785910476193156295.html" + }, + { + "title": "PyPy G\u00f6teborg Post-Easter Sprint April 25 - May 1 2011", + "text": "The next PyPy sprint will be in Gothenburg, Sweden. It is a public sprint,\nvery suitable for newcomers. 
We'll focus on making the 1.5 release (if\nit hasn't already happened) and whatever interests the Sprint attendees.\n\nTopics and goals\nThe main goal is to polish and release PyPy 1.5, supporting Python 2.7\nas well as the last few months' improvements in the JIT (provided that\nit hasn't already happened). Other topics:\n\nGoing over our documentation, and classifying our docs in terms of\nmouldiness. Deciding what needs writing, and maybe writing it.\nHelping people get their code running with PyPy\nmaybe work on EuroPython Training, and talks\nSummer of Code preparation\nspeed.pypy.org\nany other programming task is welcome too -- e.g. tweaking the\nPython or JavaScript interpreter, Stackless support, and so on.\n\n\n\nLocation\nThe sprint will be held in the apartment of Laura Creighton and Jacob Hall\u00e9n\nwhich is at G\u00f6tabergsgatan 22 in Gothenburg, Sweden. Here is a map. This is\nin central Gothenburg. It is between the tram stops of Vasaplatsen and\nValand, (a distance of 4 blocks) where many lines call -- the 2, 3, 4, 5,\n7, 10 and 13.\nProbably cheapest and not too far away is to book accommodation at SGS\nVeckobostader. The Elite Park Avenyn Hotel is a luxury hotel just a\nfew blocks away. There are scores of hotels a short walk away from the\nsprint location, suitable for every budget, desire for luxury, and desire\nfor the unusual. You could, for instance, stay on a boat. Options are\ntoo numerous to go into here. Just ask in the mailing list or on the blog.\nHours will be\nfrom 10:00 until people have had enough. It's a good idea to arrive a\nday before the sprint starts and leave a day later. In the middle of\nthe sprint there usually is a break day and it's usually ok to take\nhalf-days off if you feel like it.\n\n\nGood to Know\nSweden is not part of the Euro zone. One SEK (krona in singular, kronor\nin plural) is roughly 1/10th of a Euro (9.36 SEK to 1 Euro).\nThe venue is central in Gothenburg. 
There is a large selection of\nplaces to get food nearby, from edible-and-cheap to outstanding. We\noften cook meals together, so let us know if you have any food allergies,\ndislikes, or special requirements.\nSweden uses the same kind of plugs as Germany. 230V AC.\nThe Sprint will be held the week following Easter. This means, as always,\nthat Gothcon will be taking place the weekend before (Easter weekend).\nGothcon, now in its 35th year, is the largest European game players conference.\nSome of you may be interested in arriving early for the board games.\nThe conference site is only in Swedish, alas. You don't need to register\nin advance unless you are planning to host a tournament, (and it's too\nlate for that anyway).\n\n\nGetting Here\nIf you are coming by train, you will arrive at the Central Station. It is\nabout 12 blocks to the site from there, or you can take a tram.\nThere are two airports which are local to G\u00f6teborg, Landvetter (the main\none) and Gothenburg City Airport (where some budget airlines fly).\nIf you arrive at Landvetter the airport bus stops right downtown at\nElite Park Avenyn Hotel which is the second stop, 4 blocks from the\nSprint site, as well as the end of the line, which is the Central Station.\nIf you arrive at Gothenburg City Airport take the bus to the end of the\nline. 
You will be at the Central Station.\nYou can also arrive by ferry, from either Kiel in Germany or Frederikshavn\nin Denmark.\n\n\nWho's Coming?\nIf you'd like to come, please let us know when you will be arriving and\nleaving, as well as letting us know your interests. We'll keep a list\nof people which we'll update (which you can do so yourself if you\nhave bitbucket pypy commit rights).", + "tags": "", + "url": "https://www.pypy.org/posts/2011/04/pypy-goteborg-post-easter-sprint-april-16274563331982977.html" + }, + { + "title": "Controlling the Tracing of an Interpreter With Hints, Part 4: Benchmarks", + "text": "This is part 4 and the final part of the series on how to speed up an interpreter\nwritten with PyPy by adding JIT hints to the interpreter. Part 1 described how\nto control the extent of tracing. Part 2 described how to influence the\noptimizer with promotion and pure functions. Part 3 described a simple object\nmodel and how it can be optimized by doing small rewrites. In this (short) post\nI present some benchmarks.\n\nBenchmarks\nFor the benchmarks I ran a subset of the benchmarks on https://speed.pypy.org\nwith CPython and four different executables of PyPy's Python interpreter (all\nwith a JIT). The executables contain all combinations of enabling maps (which\nmake instance attributes fast) and type versions (which makes method lookup\nfast).\n\npypy-slow: contains neither maps nor type versions.\npypy-map: contains maps but not type versions.\npypy-version: contains type versions but not maps.\npypy-full: contains both maps and type versions\n\nThe results are as follows:\n\nThe graph shows the speedup over CPython's numbers. The results are quite\ninteresting. Maps by themselves do not speed up much over the bare JIT, whereas\ntype versions alone improve on the JIT baseline in many cases. However, maps\nare not useless. 
In combination with type versions they add a nice improvement\nover just type versions in a number of benchmarks (most notably\nraytrace-simple and richards but also in crypto-pyaes, django\nand go).\nIt's clear that type versions can be arbitrarily effective. A method lookup on a\nclass can be arbitrarily slow, if the inheritance hierarchy becomes deeper and\ndeeper. The full lookup is replaced by one promotion if type versions are\nenabled.\nMaps on the other hand always replace one dict lookup with one promotion. Since\ndict lookups are already very fast, this by itself does not lead to a gigantic\nimprovement. Only in combination with type versions do they show their full\npotential.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_26-3072929156700508140.html" + }, + { + "title": "A thank you to the PSF", + "text": "This year's PyCon was an incredible time; several members of the PyPy team were\nthere, and we'll be blogging more about our experiences in the coming days.\nHowever, we quickly wanted to extend a thank you to the Python Software\nFoundation (PSF).\nAs you may have heard, on Friday morning at PyCon Jesse Noller handed the PyPy\nteam a check for $10,000, on behalf of the PSF. 
This was in recognition of our\nsuccess over the past few years in bringing PyPy from a research project\nto a fast, compliant, production-ready Python implementation, and to allow us\nto continue our work on making it faster and more up-to-date with upstream\nversion changes.\nBeyond the large check, we're grateful for the endorsement this represents,\nnot only of our work on PyPy, but also of all alternative Python VMs.\nThe PSF has shifted its focus from representing just CPython to representing\nthe Python Language, regardless of its implementation, something we are very\nappreciative of.\n\nFrom left to right, PyPy people present at PyCon 2011: Maciej Fija\u0142kowski, Armin Rigo, Alex Gaynor, Laura Creighton and Jacob Hall\u00e9n\n\nThank you, PSF.", + "tags": "sponsors", + "url": "https://www.pypy.org/posts/2011/03/thank-you-to-psf-5934275567667314914.html" + }, + { + "title": "Controlling the Tracing of an Interpreter With Hints, Part 3: Putting it All Together", + "text": "This is part 3 of the series on how to speed up an interpreter written with\nPyPy by adding JIT hints to the interpreter. Part 1 described how to control\nthe extent of tracing. Part 2 described how to influence the optimizer with\npromotion and pure functions. In this post I describe a worked-out example of\na small object model for a dynamic language and how to make it efficient using\nthe hints described in the previous posts.\n\nA Simple Object Model\nTo implement a dynamic language efficiently, the operations on its objects need\nto be fast. Most dynamic languages have object models that are made by using\ndictionaries everywhere. Let's look at an example of how the JIT can be made to\noptimize such operations.\nFor the purpose of this blog post we will use a very simple and bare-bones\nobject model that just supports very simple classes and instances, without any\ninheritance or any fancy features. The model has classes, which contain methods.\nInstances have a class. 
Instances have their own attributes. When looking up an\nattribute on an instance, the instances attributes are searched. If the\nattribute is not found there, the class' attributes are searched.\nTo implement this object model, we could use the following RPython code as part\nof the interpreter source code:\nclass Class(object):\n def __init__(self, name):\n self.name = name\n self.methods = {}\n\n def instantiate(self):\n return Instance(self)\n\n def find_method(self, name):\n result = self.methods.get(name)\n if result is not None:\n return result\n raise AttributeError(name)\n\n def change_method(self, name, value):\n self.methods[name] = value\n\n\nclass Instance(object):\n def __init__(self, cls):\n self.cls = cls\n self.attributes = {}\n\n def getfield(self, name):\n result = self.attributes.get(name)\n if result is not None:\n return result\n raise AttributeError(name)\n\n def write_attribute(self, name, value):\n self.attributes[name] = value\n\n def getattr(self, name):\n try:\n return self.getfield(name)\n except AttributeError:\n return self.cls.find_method(name)\n\nIn this straightforward implementation the methods and attributes are just\nstored in dictionaries on the classes/instances. While this object model is very\nsimple it already contains all the hard parts of Python's object model. Both\ninstances and classes can have arbitrary fields, and they are changeable at\nany time. Moreover, instances can change their class after they have been\ncreated.\nWhen using this object model in\nan interpreter, a huge amount of time will be spent doing lookups in these\ndictionaries. 
To make the language efficient using a tracing JIT, we need to\nfind a way to get rid of these dictionary lookups somehow.\nLet's assume we trace through code that sums three attributes, such as:\ninst.getattr(\"a\") + inst.getattr(\"b\") + inst.getattr(\"c\")\n\nThe trace could look like this:\n# inst.getattr(\"a\")\nattributes1 = inst.attributes\nresult1 = dict.get(attributes1, \"a\")\nguard(result1 is not None)\n\n# inst.getattr(\"b\")\nattributes2 = inst.attributes\nv1 = dict.get(attributes2, \"b\")\nguard(v1 is None)\ncls1 = inst.cls\nmethods1 = cls.methods\nresult2 = dict.get(methods1, \"b\")\nguard(result2 is not None)\nv2 = result1 + result2\n\n# inst.getattr(\"c\")\nattributes3 = inst.attributes\nv3 = dict.get(attributes3, \"c\")\nguard(v3 is None)\ncls1 = inst.cls\nmethods2 = cls.methods\nresult3 = dict.get(methods2, \"c\")\nguard(result3 is not None)\n\nv4 = v2 + result3\nreturn(v4)\n\nIn this example, the attribute a is found on the instance, but the\nattributes b and c are found on the class. The trace indeed contains\nfive calls to dict.get, which is slow.\n\n\nMaking Instance Attributes Faster Using Maps\nThe first step in making getattr faster in our object model is to optimize\naway the dictionary lookups on the instances. The hints we have looked at in the\ntwo earlier blog posts don't seem to help with the current object model. There is\nno pure function to be seen, and the instance is not a candidate for promotion,\nbecause there tend to be many instances.\nThis is a common problem when trying to apply hints. Often, the interpreter\nneeds a small rewrite to expose the pure functions and nearly-constant objects\nthat are implicitly there. In the case of instance fields this rewrite is not\nentirely obvious. The basic idea is as follows. In theory instances can have\narbitrary fields. 
In practice however many instances share their layout (i.e.\ntheir set of keys) with many other instances.\nTherefore it makes sense to factor the layout information out of the instance\nimplementation into a shared object. This shared layout object is called a\nmap. Maps are an old idea that comes originally from the SELF language. They are\nalso used by many JavaScript implementations such as V8. I've written about maps\nbefore, so I won't explain them fully again.\nThe rewritten Instance class using maps looks like this:\nclass Map(object):\n def __init__(self):\n self.attribute_indexes = {}\n self.other_maps = {}\n\n @purefunction\n def getindex(self, name):\n return self.attribute_indexes.get(name, -1)\n\n @purefunction\n def new_map_with_additional_attribute(self, name):\n if name not in self.other_maps:\n newmap = Map()\n newmap.attribute_indexes.update(self.attribute_indexes)\n newmap.attribute_indexes[name] = len(self.attribute_indexes)\n self.other_maps[name] = newmap\n return self.other_maps[name]\n\n\nEMPTY_MAP = Map()\n\nclass Instance(object):\n def __init__(self, cls):\n self.cls = cls\n self.map = EMPTY_MAP\n self.storage = []\n\n def getfield(self, name):\n map = hint(self.map, promote=True)\n index = map.getindex(name)\n if index != -1:\n return self.storage[index]\n raise AttributeError(name)\n\n def write_attribute(self, name, value):\n map = hint(self.map, promote=True)\n index = map.getindex(name)\n if index != -1:\n self.storage[index] = value\n return\n self.map = map.new_map_with_additional_attribute(name)\n self.storage.append(value)\n\n def getattr(self, name):\n try:\n return self.getfield(name)\n except AttributeError:\n return self.cls.find_method(name)\n\nInstances no longer use dictionaries to store their fields. Instead, they have a\nreference to a map, which maps field names to indexes into a storage list. The\nstorage list contains the actual field values. The maps are shared between\nobjects with the same layout. 
Therefore they have to be immutable, which means\nthat their getindex method is a pure function. When a new attribute is added\nto an instance, a new map needs to be chosen, which is done with the\nnew_map_with_additional_attribute method on the previous map. Now that we have\nintroduced maps, it is safe to promote the map everywhere, because we assume\nthat the number of different instance layouts is small.\nWith this changed instance implementation, the trace we had above changes to the\nfollowing, where 0xb74af4a8 is the memory address of the Map instance that\nhas been promoted:\n# inst.getattr(\"a\")\nmap1 = inst.map\nguard(map1 == 0xb74af4a8)\nindex1 = Map.getindex(map1, \"a\")\nguard(index1 != -1)\nstorage1 = inst.storage\nresult1 = storage1[index1]\n\n# inst.getattr(\"b\")\nmap2 = inst.map\nguard(map2 == 0xb74af4a8)\nindex2 = Map.getindex(map2, \"b\")\nguard(index2 == -1)\ncls1 = inst.cls\nmethods1 = cls.methods\nresult2 = dict.get(methods1, \"b\")\nguard(result2 is not None)\nv2 = result1 + result2\n\n# inst.getattr(\"c\")\nmap3 = inst.map\nguard(map3 == 0xb74af4a8)\nindex3 = Map.getindex(map3, \"c\")\nguard(index3 == -1)\ncls1 = inst.cls\nmethods2 = cls.methods\nresult3 = dict.get(methods2, \"c\")\nguard(result3 is not None)\n\nv4 = v2 + result3\nreturn(v4)\n\nThe calls to Map.getindex can be optimized away, because they are calls to\na pure function and they have constant arguments. That means that index1/2/3\nare constant and the guards on them can be removed. All but the first guard on\nthe map will be optimized away too, because the map cannot have changed in\nbetween. 
The optimized trace looks like this:\n# inst.getattr(\"a\")\nmap1 = inst.map\nguard(map1 == 0xb74af4a8)\nstorage1 = inst.storage\nresult1 = storage1[0]\n\n# inst.getattr(\"b\")\ncls1 = inst.cls\nmethods1 = cls1.methods\nresult2 = dict.get(methods1, \"b\")\nguard(result2 is not None)\nv2 = result1 + result2\n\n# inst.getattr(\"c\")\ncls2 = inst.cls\nmethods2 = cls2.methods\nresult3 = dict.get(methods2, \"c\")\nguard(result3 is not None)\n\nv4 = v2 + result3\nreturn(v4)\n\nThe index 0 that is used to read out of the storage array is the result\nof the constant-folded getindex call. This trace is already much better than\nthe original one. Now we are down from five dictionary lookups to just two.\n\n\nVersioning of Classes\nInstances were optimized making the assumption that the total number of\nInstance layouts is small compared to the number of instances. For classes we\nwill make an even stronger assumption. We simply assume that it is rare for\nclasses to change at all. This is not totally reasonable (sometimes classes contain\ncounters or similar things) but for this simple example it is good enough.\nWhat we would really like is if the Class.find_method method were pure.\nBut it cannot be, because it is always possible to change the class itself.\nEvery time the class changes, find_method can potentially return a\nnew value.\nTherefore, we give every class a version number, which is increased every time a\nclass gets changed (i.e., the content of the methods dictionary changes).\nThis means that the result of methods.get() for a given (name,\nversion) pair will always be the same, i.e. it is a pure operation. To help\nthe JIT to detect this case, we factor it out in a helper method which is\nexplicitly marked as @purefunction. 
The refactored Class looks like\nthis:\nclass VersionTag(object):\n pass\n\nclass Class(object):\n def __init__(self, name):\n self.name = name\n self.methods = {}\n self.version = VersionTag()\n\n def find_method(self, name):\n self = hint(self, promote=True)\n version = hint(self.version, promote=True)\n result = self._find_method(name, version)\n if result is not None:\n return result\n raise AttributeError(name)\n\n @purefunction\n def _find_method(self, name, version):\n return self.methods.get(name)\n\n def change_method(self, name, value):\n self.methods[name] = value\n self.version = VersionTag()\n\nWhat is interesting here is that _find_method takes the version\nargument but it does not use it at all. Its only purpose is to make the call\npure (because when the version number changes, the result of the call might be\ndifferent than the previous one).\nThe trace with this new class implementation looks like this:\n# inst.getattr(\"a\")\nmap1 = inst.map\nguard(map1 == 0xb74af4a8)\nindex1 = Map.getindex(map1, \"a\")\nguard(index1 != -1)\nstorage1 = inst.storage\nresult1 = storage1[index1]\n\n# inst.getattr(\"b\")\nmap2 = inst.map\nguard(map2 == 0xb74af4a8)\nindex2 = Map.getindex(map2, \"b\")\nguard(index2 == -1)\ncls1 = inst.cls\nguard(cls1 == 0xb7aaaaf8)\nversion1 = cls1.version\nguard(version1 == 0xb7bbbb18)\nresult2 = Class._find_method(cls, \"b\", version1)\nguard(result2 is not None)\nv2 = result1 + result2\n\n# inst.getattr(\"c\")\nmap3 = inst.map\nguard(map3 == 0xb74af4a8)\nindex3 = Map.getindex(map3, \"c\")\nguard(index3 == -1)\ncls2 = inst.cls\nguard(cls2 == 0xb7aaaaf8)\nversion2 = cls2.version\nguard(version2 == 0xb7bbbb18)\nresult3 = Class._find_method(cls, \"c\", version2)\nguard(result3 is not None)\n\nv4 = v2 + result3\nreturn(v4)\n\nThe calls to Class._find_method can now be optimized away, also the\npromotion of the class and the version, except for the first one. 
The final\noptimized trace looks like this:\n# inst.getattr(\"a\")\nmap1 = inst.map\nguard(map1 == 0xb74af4a8)\nstorage1 = inst.storage\nresult1 = storage1[0]\n\n# inst.getattr(\"b\")\ncls1 = inst.cls\nguard(cls1 == 0xb7aaaaf8)\nversion1 = cls1.version\nguard(version1 == 0xb7bbbb18)\nv2 = result1 + 41\n\n# inst.getattr(\"c\")\nv4 = v2 + 17\nreturn(v4)\n\nThe constants 41 and 17 are the results of the folding of the\n_find_method calls. This final trace is now very good. It no longer performs any\ndictionary lookups. Instead it contains several guards. The first guard\nchecks that the map is still the same. This guard will fail if the same\ncode is executed with an instance that has another layout. The second guard\nchecks that the class of inst is still the same. It will fail if the trace is\nexecuted with an instance of another class. The third guard checks that the\nclass did not change since the trace was produced. It will fail if somebody\ncalls the change_method method on the class.\n\n\nReal-World Considerations\nThe techniques used above for the simple object model are used for the object\nmodel of PyPy's Python interpreter too. Since Python's object model is\nconsiderably more complex, some additional work needs to be done.\nThe first problem that needs to be solved is that Python supports (multiple)\ninheritance. Therefore looking up a method in a class needs to consider the\nwhole method resolution order. This makes the versioning of classes more\ncomplex. If a class is changed its version changes. At the same time, the\nversions of all the classes inheriting from it need to be changed as well,\nrecursively. This makes class changes expensive, but they should be rare. On the\nother hand, a method lookup in a complex class hierarchy is as optimized in the\ntrace as in our object model here.\nA downside of the versioning of classes that we haven't yet fixed in PyPy, is\nthat some classes do change a lot. 
An example would be a class that keeps a\ncounter of how many instances have been created so far. This is very slow right\nnow, but we have ideas about how to fix it in the future.\nAnother optimization is that in practice the shape of an instance is correlated\nwith its class. In our code above, we allow both to vary independently.\nIn PyPy's Python interpreter we act somewhat more cleverly. The class of\nan instance is not stored on the instance itself, but on the map. This means\nthat we get one fewer promotion (and thus one fewer guard) in the trace, because the class doesn't need to\nbe promoted after the map has been.\n\n\nMore General Patterns\nThe techniques we used above to make instance and class lookups faster are\napplicable in more general cases than the one we developed them for. A more\nabstract view of maps is that of splitting a data-structure into a part that\nchanges slowly, and a part that changes quickly. In the concrete example of maps\nwe split the original dictionary into the map (the slow-changing part) and the\nstorage array (the quick-changing part). All the computation on the\nslow-changing part can be constant-folded during tracing so that only the\nmanipulation of the quick-changing part remains.\nSimilarly, versions can be used to constant-fold arbitrary functions of large data\nstructures. The version needs to be updated carefully every time the result of\nthis function can change. Therefore this is useful only if the data structure is\nexpected to change slowly.\n\n\nConclusion\nIn this post I showed how to use purefunction and promote to make a\nsmall but still relevant dynamic object model no longer use any dictionary lookups\nafter tracing. Instead a number of guards are inserted into the\ntrace to check whether the assumptions about the objects are still true. This\nmakes operations on objects seriously faster. 
I plan to write another small post\nthat shows the speed benefits for PyPy's Python interpreter for exactly these\noperations.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_21-6524148550848694588.html" + }, + { + "title": "Controlling the Tracing of an Interpreter With Hints, Part 2: Controlling Optimization", + "text": "This is part 2 of a series on how to speed up an interpreter written with PyPy\nby adding JIT hints to the interpreter. Part 1 described how to control the\nextent of tracing. In this post I will describe how to add hints that\ninfluence the optimizer. If applied correctly these techniques can give\nreally big speedups by pre-computing parts of what happens at runtime. On the other\nhand, if applied incorrectly they might lead to code bloat, thus making the\nresulting program actually slower.\n\nBackground\nBefore sending the trace to the backend to produce actual machine code, it is\noptimized. The optimizer applies a number of techniques to remove or reduce\nthe number of operations: most of these are well known compiler optimization\ntechniques, with the difference that it is easier to apply them in a tracing\nJIT because it only has to deal with linear traces. Among the techniques:\n\nconstant folding\ncommon subexpression elimination\nallocation removal, as described in the paper that I recently presented at\nPEPM\nstore/load propagation\nloop invariant code motion\n\nIn some places it turns out that if the interpreter author rewrites some parts\nof the interpreter with these optimizations in mind the traces that are produced\nby the optimizer can be vastly improved.\nIn this post I will describe two hints that allow the interpreter author to\nincrease the optimization opportunities for constant folding. For constant\nfolding to work, two conditions need\nto be met:\n\nthe arguments of an operation actually need to all be constant,\ni.e. 
statically known by the optimizer\nthe operation needs to be pure, i.e. always yield the same result given\nthe same arguments.\n\nThe PyPy JIT generator automatically detects the majority of these conditions.\nHowever, for the cases in which the automatic detection does not work, the\ninterpreter author can apply hints to improve the optimization\nopportunities. There is one kind of hint for both of the conditions above.\nNote: These hints are written by an interpreter developer and applied to the\nRPython source of the interpreter. Normal Python users will never see them.\n\n\nWhere Do All the Constants Come From\nIt is worth clarifying what is a \"constant\" in this context. A variable of\nthe trace is said to be constant if its value is statically known by the\noptimizer.\nThe simplest example of constants are literal values. For example, if in the\nRPython source code we have a line like y = x + 1, the second operand will\nbe a constant in the trace.\nHowever, the optimizer can statically know the value of a variable even if it\nis not a constant in the original source code. For example, consider the\nfollowing fragment of RPython code:\nif x == 4:\n y = y + x\n\nIf the fragment is traced with x being 4, the following trace is\nproduced:\n\nguard(x == 4)\ny = y + x\n\nIn the trace above, the value of x is statically known thanks to the\nguard. Remember that a guard is a runtime check. The above trace will run to\ncompletion when x == 4. If the check fails, execution of the trace is\nstopped and the interpreter continues to run.\nThere are cases in which it is useful to turn an arbitrary variable\ninto a constant value. This process is called promotion and it is an old idea\nin partial evaluation (it's called \"the trick\" there). Promotion is also heavily\nused by Psyco and by all older versions of PyPy's JIT. 
Promotion is a technique\nthat only works well in JIT compilers, in\nstatic compilers it is significantly less applicable.\nPromotion is essentially a tool for trace specialization. In some places in the\ninterpreter it would be very useful if a variable were constant, even though it\ncould have different values in practice. In such a place, promotion is used. The\ntypical reason to do that is if there is\na lot of computation depending on the value of that variable.\nLet's make this more concrete. If we trace a call to the following function:\ndef f1(x, y):\n z = x * 2 + 1\n return z + y\n\nWe get a trace that looks like this:\n\nv1 = x * 2\nz = v1 + 1\nv2 = z + y\nreturn(v2)\n\nObserve how the first two operations could be constant-folded if the value of\nx were known. Let's assume that the value of x can vary, but does so\nrarely, i.e. only takes a few different values at runtime. If this is the\ncase, we can add a hint to promote x, like this:\ndef f2(x, y):\n x = hint(x, promote=True)\n z = x * 2 + 1\n return z + y\n\nThe meaning of this hint is that the tracer should pretend that x is a\nconstant\nin the code that follows. When just running the code, the function has no\neffect, as it simply returns its first argument. When tracing, some extra work\nis done. Let's assume that this changed function is traced with\nthe arguments 4 and 8. The trace will be the same, except for one\noperation at the beginning:\n\nguard(x == 4)\nv1 = x * 2\nz = v1 + 1\nv2 = z + y\nreturn(v2)\n\nThe promotion is turned into a guard operation in the trace. The guard\ncaptures the value of x as it was at runtime. From the point of view of the\noptimizer, this guard is not any different than the one produced by the if\nstatement in the example above. 
After the guard, the rest of the trace can\nassume that x is equal to 4, meaning that the optimizer will turn this\ntrace into:\n\nguard(x == 4)\nv2 = 9 + y\nreturn(v2)\n\nNotice how the first two arithmetic operations were constant folded. The hope is\nthat the guard is executed quicker than the multiplication and the addition that\nwere now optimized away.\nIf this trace is executed with values of x other than 4, the guard will\nfail, and execution will continue in the interpreter. If the guard fails often\nenough, a new trace will be started from the guard. This other trace will\ncapture a different value of x. If it is e.g. 2, then the optimized\ntrace looks like this:\n\nguard(x == 2)\nv2 = 5 + y\nreturn(v2)\n\nThis new trace will be attached to the guard instruction of the first trace. If\nx takes on even more values, a new trace will eventually be made for all of them,\nlinking them into a chain. This is clearly not desirable, so we should promote\nonly variables that don't vary much. However, adding a promotion hint will never produce wrong\nresults. It might just lead to too much assembler code.\nPromoting integers, as in the examples above, is not used that often.\nHowever, the internals of dynamic language interpreters often\nhave values that are variable but vary little in the context of parts of a user\nprogram. An example would be the types of variables in a user function. Even\nthough in principle the argument to a Python function could be any Python type,\nin practice the argument types tend to not vary much. Therefore it is possible to\npromote the types. In the next blog post I will give a complete example of how\nthis works.\n\n\nDeclaring New Pure Operations\nIn the last section we saw a way to turn arbitrary variables into constants. All\npure operations on these constants can be constant-folded. This works great for\nconstant folding of simple types, e.g. integers. 
Unfortunately, in the context of an\ninterpreter for a dynamic\nlanguage, most operations actually manipulate objects, not simple types. The\noperations on objects are often not pure and might even have side-effects. If\none reads a field out of a constant reference to an object this cannot\nnecessarily be folded away because the object can be mutated. Therefore, another\nhint is needed.\nAs an example, take the following class:\nclass A(object):\n def __init__(self, x, y):\n self.x = x\n self.y = y\n\n def f(self, val):\n self.y = self.compute() + val\n\n def compute(self):\n return self.x * 2 + 1\n\nTracing the call a.f(10) of some instance of A yields the following\ntrace (note how the call to compute is inlined):\n\nx = a.x\nv1 = x * 2\nv2 = v1 + 1\nv3 = v2 + val\na.y = v3\n\nIn this case, adding a promote of self in the f method to get rid of the\ncomputation of the first few operations does not help. Even if a is a\nconstant reference to an object, reading the x field does not necessarily\nalways yield the same value. To solve this problem, there is another annotation,\nwhich lets the interpreter author communicate invariants to the optimizer. In\nthis case, she could decide that the x field of instances of A is\nimmutable, and therefore compute\nis a pure function. To communicate this, there is a purefunction decorator.\nIf the code in compute should be constant-folded away, we would change the\nclass as follows:\nclass A(object):\n def __init__(self, x, y):\n self.x = x\n self.y = y\n\n def f(self, val):\n self = hint(self, promote=True)\n self.y = self.compute() + val\n\n @purefunction\n def compute(self):\n return self.x * 2 + 1\n\nNow the trace will look like this:\n\nguard(a == 0xb73984a8)\nv1 = compute(a)\nv2 = v1 + val\na.y = v2\n\nHere, 0xb73984a8 is the address of the instance of A that was used\nduring tracing. The call to compute is not inlined, so that the optimizer\nhas a chance to see it. 
Since the compute function is marked as pure, and its\nargument\nis a constant reference, the call will be removed by the optimizer. The final\ntrace looks like this:\n\nguard(a == 0xb73984a8)\nv2 = 9 + val\na.y = v2\n\n(assuming that the x field's value is 4).\nOn the one hand, the purefunction annotation is very powerful. It can be\nused to constant-fold arbitrary parts of the computation in the interpreter.\nHowever, the annotation also gives you ample opportunity to mess things up. If a\nfunction is annotated to be pure, but is not really, the optimizer can produce\nsubtly wrong code. Therefore, a lot of care has to be taken when using this\nannotation.\n\nObservably Pure Functions\nWhy can't we simply write an analysis to find out that the x field of the\nA instances is immutable and deduce that compute is a pure function,\nsince it only reads the x field and does not have side effects? This might\nbe possible in this particular case, but in practice the functions that are\nannotated with the purefunction decorator are usually more complex.\nThe easiest example for this is that of a function that uses memoization to\ncache its results. If you analyze this function, it looks like the function has\nside effects, because it changes the memoizing dictionary. However, because this side\neffect is not externally visible, the function from the outside is pure. This is\na property that is not easily detectable by analysis. Therefore, the purity\nof this function needs to be annotated.\n\n\nImmutable Fields\nOne of the most common cases of pure functions is reading immutable\nvalues out of objects. Since this is so common, we have special syntactic sugar\nfor it. An RPython class can have a class attribute _immutable_fields_ set to\na list of strings, listing the fields that cannot be changed. 
This is equivalent\nto using getters and annotating them with purefunction.\n\n\n\nConclusion\nIn this blog post I explained two more hints that can be used in the source code\nof the interpreter. They are used to influence what the optimizer does with the\ntrace. I realize the examples given here are a bit too small, in the next\ninstallment I will give a worked-out example that puts all the pieces together.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_15-3281215865169782921.html" + }, + { + "title": "Controlling the Tracing of an Interpreter With Hints, Part 1: Controlling the Extent of Tracing", + "text": "The question I was asked most often during my recent US trip was how exactly\nthe hints work that interpreter authors can use to improve the execution speed\nof the programs running on their interpreters. Since those hints are not really\ndocumented all that well, I decided to write blog posts about them. This is the\nfirst one.\n\nBackground\nFirst, let's recap some basics: PyPy's approach to implementing dynamic\nlanguages is to write an interpreter for\nthe language in RPython. This interpreter can be translated to C and then\nfurther to machine code. The interpreter consists of code in the form of a\nlarge number of generated C functions and some data. Similarly, the user\nprogram consists of functions in the language the interpreter executes.\nAs was explained in a blog post and a paper two years ago, PyPy's JIT is a\nmeta-tracer. Since we want to re-use our tracer for a variety of languages, we\ndon't trace the execution of the user program, but instead trace the execution\nof the interpreter that is running the program. This means that the traces\ndon't contain the bytecodes of the language in question, but RPython-level\noperations that the interpreter did to execute the program.\nOn the other hand, the loops that are traced by the tracer are the loops in the\nuser program. 
This means that the tracer stops tracing after one iteration of\nthe loop in the user function that is being considered. At this point, it can\nhave traced many iterations of the interpreter main loop.\nHere's a diagram of this process:\n\n\n\nOn the left you see the levels of execution. The CPU executes the binary of\nPyPy's Python interpreter, which consists of RPython functions that have been\ncompiled first to C, then to machine code. Some of these functions contain\nloops, others don't. The interpreter runs a Python program written by a\nprogrammer (the user). If the tracer is used, it traces operations on the level\nof the interpreter. However, the extent of the trace is determined by the loops\nin the user program.\n\n\nHow Far Should Tracing Go\nWhen the tracer encounters a function call at the interpreter level, e.g. the\ninterpreter main loop calling a helper function, it can do one of two things:\n\nit can trace into the helper function, effectively inlining it into the trace.\nit can not trace into the function and instead record a call to that function\nas an operation in the trace. Such a call operation in the trace is sometimes\ncalled residual call.\n\nAs a default, the tracer will try to trace into the helper because that will\ngive more information to the optimizer, allowing it to do a better job. This is\nparticularly important for the allocation removal optimization, because if a\nfreshly allocated object is passed as an argument to a residual call, its\nallocation cannot be optimized away.\nThere is a problem however if the helper function itself contains a loop. The\ntracer records the linear sequence of operations that are being executed. Thus\nwhen it encounters a loop on the interpreter level it records all the\noperations of every iteration of the loop itself, with the net effect of\nunrolling it. The only places where the tracer stops and tries to close the\ntrace is in the main loop of the interpreter. 
When the tracer encounters the\nmain loop, it also checks whether the original user loop has been closed, and\nthus whether it can stop tracing.\nFor most helper functions in the interpreter that contain loops, fully\nunrolling does not make sense. If a loop is unrolled, the trace is specific to\nthe number of iterations that was seen during tracing. If the trace is later\nexecuted with a different number of iterations, the trace will be left via a\nguard failure, which is inefficient. Therefore the default behaviour of the\ntracer is to never trace into a function on the interpreter level that contains\na loop, but to trace into all non-looping helper functions.\nThis default behaviour is essentially a heuristic, but one that usually makes\nsense. We want to produce just enough traces to make the resulting code\nefficient, but not more. Therefore we trace as much as possible (everything by\ndefault) except the functions with loops where tracing would produce code that\nis less general than it could be.\nAs an example for a helper with a loop, take string concatenation. It loops over\nthe characters of both arguments and copies them over into the result string. It\ndoes not make sense to unroll the loops in this function. If we do that,\nthe resulting trace can only be used for strings of the length that was seen\nduring tracing. In practice, the string lengths are usually different each run,\nmeaning that the trace with unrolling is not run to completion in most cases.\n\n\nInfluencing the Default Behaviour\nSometimes the default behaviour is not actually what is wanted. This is\nsomething the interpreter author has to decide, usually by looking at the traces\nthat are produced and deciding that they should be improved. 
There are two ways\nin which the default is wrong:\n\nfalse negatives: if a helper function that does contain a loop should\nbe traced into, unrolling the loop.\nfalse positives: if a helper function that does not contain a loop is\ninlined into the trace, but the interpreter author decides that this is not\nhelpful.\n\nIf the interpreter author finds false negatives or false positives, she can fix\nthat by applying a hint to the tracer. These hints take the form of function\ndecorators (which both live in the pypy.rlib.jit module). In the next two\nsubsections I will describe these two function decorators and their use.\n\nUnrolling Functions With Loops\nThe first decorator, used to fix false negatives, is the unroll_safe\ndecorator. It is used to tell the tracer to always trace into a function that\nhas a loop, effectively unrolling the loop. This decorator should be used only\nif the loop in the helper function is expected to always run for the same number\nof iterations. This sounds like a strong restriction, but in practice it is less\nsevere: the number of iterations only needs to be the same in the context where\nthe helper function is traced from.\nIt is easiest to understand this condition via an example. Let's look at the\nBUILD_TUPLE bytecode in Python. It takes one argument, the length n of\nthe tuple being built. The bytecode pops n arguments from the stack, turns\nthem into a tuple and pushes that tuple on the stack. Thus the function that\nimplements BUILD_TUPLE in PyPy's Python interpreter calls a helper\npopvalues which pops n values from the stack and returns them in a list.\nThis helper is implemented with a loop and would thus not be traced into by\ndefault. The loop in the helper can run for very different numbers of\niterations, because it is used in a variety of places. However, for every\nconcrete BUILD_TUPLE bytecode, the argument will be constant. 
Therefore it\nis safe (and even necessary) to annotate popvalues with the unroll_safe\ndecorator.\nA different example is the implementation of the isinstance builtin. It is\nused to check whether an object a is an instance of a class B like\nthis: isinstance(a, B). The second argument of the function can also be a\ntuple of classes to check whether an object is an instance of one of a number of\nclasses: isinstance(a, (A, B, C, D)). To implement this second case, the\nimplementation of isinstance contains a loop iterating over the elements of\nthe tuple. The number of loop iterations can vary, but is usually fixed for each\nindividual call site which typically just lists a few classes in the source\ncode. Therefore it is also safe to annotate the implementation of isinstance\nwith the unroll_safe decorator.\n\n\nPreventing the Tracing of Functions\nThe second decorator dont_look_inside is used to fix false positives. It\ntells the JIT to never trace into the decorated function and just always produce\na residual call instead. This decorator is in many ways less important than the\nunrolling one (except for a special situation that I will describe in a\nfollow-up post). It is used if tracing into a function is not expected to yield\nany speed benefits, because the optimizer will not be able to improve it much.\nThis is often the case if the called helper function does not contain any\n\"dynamic\" behaviour. In such a situation it is better to just leave the function\ncall in the trace, because that produces less code.\nAn example would be the import mechanism in Python. It's very unlikely that any\nperformance improvement can be had by turning part of it into assembler.\nTherefore we hide it from the tracer by annotating them with\ndont_look_inside.\n\n\n\nConclusion\nIn this post we discussed two hints that can be used to control precisely which\nparts of the interpreter should be meta-traced. 
If these hints are used\ncarefully, this can go a long way to making the interpreter produce traces that\ncontain exactly the interesting part of the execution, and will contain calls to\nthe functions that can not be optimized by tracing techniques.\nIn the next part of this series I will discuss a different set of hints that can\nbe used to strongly optimize traces.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with-871085470935630424.html" + }, + { + "title": "Bay Area 2011 Tour Summary", + "text": "We spent the week in the San Francisco Bay Area showing off PyPy.\nHere are notes and photos of the tour.\n\nDay 1: Google SF\nGoogle has offices in downtown San Francisco. They are at a beautiful\nplace and the views are spectacular. We thank Wesley Chun and Guido van\nRossum for organizing this meeting. Between 25 and 30 engineers showed\nup. Some of them were Python programmers, but others were C++\nprogrammers; and they all seem to have real problems that they want to\nsolve with PyPy. We didn't have prepared slides so far, so we mostly\nran demos and talked. As predicted, Google would love SWIG support.\nThey suggested that we rename the translation toolchain (as we vaguely\nthought too) to separate it more from PyPy's Python interpreter; up\nuntil today, many had no idea that they could use PyPy for other\nlanguages. All in all, it was very positive and people looked forward\nto meeting up at PyCon.\n\n\nDay 2: Stanford\n\n\n\n\nThis was the most academically-oriented talk. You can find the\nabstract, the slides (PgUp/PgDown to navigate) and the video here.\nThere were around 35 people in the audience, and maybe 1000 real-time\nvideo watchers (who didn't get to ask questions). The live audience\nseemed to be a mixture of students, professors, and people from the\nlocal industry. We thank David Allison and Andy Freeman for organizing\nit. 
It has been two or three years since they invited me (Armin) and I\nfinally managed to get here :-)\nThe slides are longer than the talk; we focused on the JIT because that\nwas what the audience was most interested in. They were really\nimpressed at the stability, the tests, and that we don't have lots of\nbugs reported in the JIT of our latest public release. We later found\nout that many who came to the talk believed that they were going to get\na talk about how we jitted a subset of python because real python is too\nhard -- impossible to do. They came to heckle with examples of how\npython was impossible. So they were amazed when the first slide of\nArmin's presentation was \"Python is complicated\", and the next slide\n\"Python is messy\". It was a positive outcome. We made new fans :-)\n\n\nDay 3: Yelp\n\n\n\n\n\nAs you can see in the image, tons of people showed up -- ~140. Thanks\nto Grace Law, who is the coordinator for the SF Python Meet-up, and to\nJimmy Retzlaff and Ashley King-Bishof from Yelp. Yelp is also located\nin downtown San Francisco. This looks like the place to be if you are a\nstart-up in California (and not in Silicon Valley): lots of enthusiastic\nyoung people are here, and they are hiring. Yelp has an enormous open\nspace, suitable for huge parties, and the coolest beer dispensers on the\nplanet, made as a hack-a-thon project by three Yelp engineers (pictured\nbelow):\n\n\n\n\n\n\n\n\n\nBy the way, their management structure seems to be flat. There are\nalmost no line managers, i.e. managers for the engineering staff;\ninstead they self-organize into teams. This is not what you expect\nfor the USA; things appear to have changed a lot.\nThe talk was in two sections, \"PyPy from the user's point of view\" and\n\"How the JIT works\". 
Good feedback; impressed that we support all of\nPython 2.7 (including all the modules that are in C in the stdlib), and\nimpressed that the Python 3.0 conversion is not considered a big deal by\nus, although we have no precise date yet. The plan is, of course, just\nto tweak the interpreter until it supports both (by adding the necessary\nconditions); the other aspects like GC and the JIT will not be affected\nat all.\n\n\nDay 4: Dropbox\n\n\n\n\n\n\n\nThis was another place full of excited, successful young people. The\nCTO looks like he turned 30 last week, and he's been CTO for 4 years\nnow. The three of us were quite obviously the oldest people there. We\nfelt old. They have another great big open barn complex. It's\nloud. Very loud. Loud refrigerators, loud street noise, loud machinery\nin the walls doing who knows what, loudly.\nThis was the first tech talk at dropbox. Thanks to Rian Hunter for\norganizing it. They have a big kitchen, and we held the talk in there.\nThere was a skylight, which made the room too bright, so harder to read\nthe slides than would otherwise be the case. They were jazzed about our\nvisit, and wanted copies of all the pictures Jacob took before he left.\nThey seemed familiar with Google V8, and thought that how long it took\nto build PyPy was a great incentive for us to make PyPy faster. They\nare very interested in fast ctypes, fast SWIG, fast Cython. They were\npleased and surprised that we don't have too much JIT bloat (typically\n~10% of the total RAM usage).\nThe mobile developers want a smaller Python more than a faster one.\nPython takes too much memory given the tiny amount available on a lot of\ncell phones. Not that we have an answer to this problem now.\nThey were pleased to learn that we will soon be able to JIT ctypes code.\nAnd the fact that Armin knows many ways to segfault CPython was a bit of\na shock. We talked for an hour after the presentation. 
Again, a very\npositive outcome.\n\n\nDays 5 and 6: Noisebridge sprint\n\n\n\nAbout six people showed up for the sprint. (Late. Californians really\ndo start the day at 11.) Noisebridge is a very eclectic place; people\nshow up to do pretty much everything from sewing to breaking apart\nequipment to making robots and beer. It's donation-driven. Thanks to\nJim Stockford for volunteering the space and arranging this and helping\nus set up for the sprint.\nDuring the sprint, we did a little bit of everything; there was no clear\npattern. Ademan worked on sqlite, Greg Price looked to see if his\nsoftware could run on PyPy, Will worked on the documentation, and a few\nof us fixed some more 2.7 tests. Alex Gaynor and Fijal joined us, too.\n\n\nDay 7: Google Mountain View and Mozilla\nWe gave two talks on the 7th day of our trip so we were already quite\nexhausted. Fortunately new people joined, so the talks were actually split\nbetween multiple people. We would like to thank Peter Norvig and Ben Bayer\nfor inviting us to Google and Andreas Gal, Brendan Eich and Dave Herman\nfor inviting us to Mozilla. Both talks should hopefully appear online\nat some point soon, but as of now we don't have a link.\nIt was pretty incredible to find ourselves at Mozilla talking with at\nleast 15 people who deeply understood the ideas of tracing JITs and\nalso understood why we undertook the decision to generate our JIT\ninstead of writing it. They suffered from having to write JavaScript\nJIT (even multiple ones) by hand, as Armin did with Psyco. He deeply\nsympathizes. The discussion afterwards was very successful and we're\nlooking forward to cooperating with them. 
Many exciting things were\ndiscussed as possibilities.\nNext day we went to Pycon, which is ongoing and a topic for yet another\nblog post.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/03/bay-area-2011-tour-summary-9117372109664978472.html" + }, + { + "title": "US Trip Report: POPL, Microsoft, IBM", + "text": "Some notes from my recent trip (from 23rd of January to 17th of February) to the\nUS where, I presented PyPy at various scientifically oriented places. In\nsummary, there seems to be quite a bit of interest in PyPy within the research\ncommunity, details below.\n\nPEPM/POPL/STOP\nFrom the 24th to the 29th of January I was in Austin, Texas at the POPL\nconference, where I gave a talk at one of the workshops, PEPM (Partial\nEvaluation and Program Manipulation). The title of our paper is\n\"Allocation Removal by Partial Evaluation in a Tracing JIT\", the abstract is:\n\nThe performance of many dynamic language implementations suffers from high\nallocation rates and runtime type checks. This makes dynamic languages less\napplicable to purely algorithmic problems, despite their growing\npopularity. In this paper we present a simple compiler optimization based\non online partial evaluation to remove object allocations and runtime type\nchecks in the context of a tracing JIT. We evaluate the optimization using\na Python VM and find that it gives good results for all our (real-life)\nbenchmarks.\nThe talk (slides) seemed to be well-received and there was\na good discussion afterwards. PEPM in general was a very enjoyable workshop\nwith many interesting talks on partial evaluation (which I am very interested\nin) and a great keynote by Olivier Danvy about \"A Walk in the Semantic Park\".\nPOPL itself was a bit outside of the area I am most knowledgeable in, most of\nthe talks being on formal topics. 
Some of the talks that stuck in my mind:\n\n\"The Design of Kodu: A Tiny Visual Programming Language for Children on the\nXbox 360\", the keynote by Matthew MacLaurin from Microsoft Research. I didn't\nknow about Kodu before, and was very impressed by it.\n\n\n\"Automating String Processing in Spreadsheets using Input-Output Examples\"\n(paper) by Sumit Gulwani (also from MS Research) describes a plugin to Excel\nthat can automate many common string processing tasks by giving a couple of\nexamples, which are then abstracted into a generic string manipulation. Very\ncool.\n\n\n\"Dynamic Inference of Static Types for Ruby\" (paper) by Michael Furr,\nJong-hoon (David) An, Jeffrey S. Foster and Michael Hicks describes an\napproach to type inference that works by observing the actual types seen\nduring unit-testing. Similar things have been done a few times before,\nhowever, the paper actually gives a correctness result.\n\n\n\"The Essence of Compiling with Traces\" (paper) by Shu-Yu Guo and Jens\nPalsberg describes a formalization of a simple imperative language and\nproves that executing it using trace compilation will do exactly the same\nthing as using an interpreter. It also looks at what conditions an\noptimization on traces must fulfill to still produce valid results.\n\nAfter the main conference, I took part in the STOP (Scripts to Programs)\nworkshop. It had a great keynote \"Scripting in a Concurrent World\" by John Field\nabout the Thorn language and a few interesting other talks.\n\n\nMicrosoft Research\nAfter POPL I went to Redmond to visit Microsoft Research for a week,\nspecifically the RiSE group. This is the group that did the SPUR project,\na meta-tracing JIT for C# applied to a JavaScript interpreter in C#. I compared\nPyPy to SPUR last year. 
I am very grateful for Microsoft for inviting me\nthere.\nAt Microsoft I gave a talk about \"PyPy's Approach to Implementing Dynamic\nLanguages Using a Tracing JIT Compiler\", the slides of which can be found\nhere. The talk was filmed and is online. People seemed to be impressed\nwith the \"product qualities\" of PyPy, e.g. the buildbot infrastructure and\nspeed tracking website.\nThe rest of the time I discussed with various researchers in the RiSE group,\nparticularly with Nikolai Tillmann. We talked a lot about similarities and\ndifferences between SPUR and PyPy and tried to understand our respective projects\nbetter. SPUR is a really great project and I learned a lot in the discussions,\nfor example about the optimizations and heuristics their trace compiler uses.\nAnother very cool project done by the RiSE group that I learned more about is\nPEX. PEX is a unit test generator for C# that tries to produce unit tests for\nso-far untested execution paths within methods. There is an online puzzle\nversion of it, if you want to get an impression of the technology (including a\nvery impressive C# IDE in the browser).\n\n\nIBM\nFor the last part of the trip I stayed in New York City for two weeks,\nmostly as a vacation. However, I also visited IBM Watson Research Center for\ntwo days, to which I had been invited by David Edelsohn.\nThe first day I gave the same presentation I had given at Microsoft (with some\nimprovements to the slides), again it was quite well received. The rest of\nthe time I spent in (very fruitful) discussions with various people and teams,\namong them the Liquid Metal team and the Thorn team.\nThe second day I met with members of the FIORANO group, who are working on\ndynamic compilation for dynamic languages and Java. 
They explored various ways\nto speed up Python, both by improving the CPython interpreter as well as with\nJIT compilation techniques.\nAnother of their projects is to add a trace compiler to IBM's J9 JVM, about\nwhich the paper \"A Trace-based Java JIT Compiler Retrofitted from a\nMethod-based Compiler\" is going to appear at CGO. I discussed tracing JITs with\nPeng Wu, one of the authors of that paper. Peng tries to systematically look at\nthe various heuristics found in the different VMs that use tracing JITs. This\nis a very different perspective from the one I usually have, focusing on how to\nimprove PyPy's specific heuristics. Therefore that discussion helped me thinking\nabout the issues more generally.\nAnother goal of the group is to try to find benchmarks that are representative\nfor typical Python workloads, which is something that has been done very\ncarefully for Java e.g. when developing the DaCapo benchmark suite. The\nbenchmarks that the Python community uses have not been selected in such a\ncareful and measured way, so I think that trying to be more systematic there is\na very worthwhile endeavour.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/03/us-trip-report-popl-microsoft-ibm-3874568000250679204.html" + }, + { + "title": "PyPy Winter Sprint Report", + "text": "A few weeks ago I had the great fortune to attend the PyPy winter sprint in Leysin Switzerland. I've wanted to contribute to PyPy for a long time and I thought diving into a sprint might be a good way to get familiar with some of the code. What I wasn't expecting was to be using RPython to implement new methods on built-in Python objects on the first day. The main thing I took away from the sprint was just how easy it is to get involved in developing PyPy (well, some bits of it at least and being surrounded by core developers helps). 
I wrote up a very short description of how to get started here, but I'll do a longer blog post with examples on my own blog soon(ish).\n\nThe sprint was kicked off by Armin merging the \"fast-forward\" branch of PyPy onto trunk. \"fast-forward\" brings PyPy from Python 2.5 compatibility to Python 2.7. Along with this it brought a large number of test failures, as the sterling work done by Benjamin Peterson and Amaury Forgeot d'Arc was not complete. This immediately set the primary sprint goal to reduce the number of test failures.\n\nWe made a great deal of progress on this front, and you can see how close PyPy is now from the buildbots.\n\nJacob Hall\u00e9n and I started working through the list of tests with failures alphabetically. We made short work of test_asyncore and moved onto test_bytes where I was stuck for the rest of the sprint. I spent much of the remaining days working with Laura Creighton on the pypy bytearray implementation to make it more compatible with Python 2.7. This meant adding new methods, changing some of the Python protocol method implementations and even changing the way that bytearray is constructed. All in all great fun and a great introduction to working with RPython.\n\nA big part of the compatibility with Python 2.7 work was done by Laura and Armin who basically rewrote the math module from scratch. This was needed to incorporate all the improvements made (mostly by Mark Dickinson) in CPython in 2.7. 
That involved a lot of head-scratching about such subtleties as whether -0.0 should be considered almost equal to 0.0 and other fun problems.\n\n\n\n\nThe first meal together, before everyone had arrived\n\nIf you add on top of this the wonderful people, the beautiful scenery, the Swiss cheese fondues, managing to not kill myself with a days skiing and traditional pypy card games, I can heartily recommend pypy sprints as a close approximation of geek nirvana.\n\n\n\nView of the mountains from the sprint\n\n\nWorking on 2.7 compatibility wasn't the only work that happened during the sprint. Other activities included:\n\nAntonio Cuni worked on the \"jittypes\" branch. This is a reimplementation of the core of the PyPy ctypes code to make it jittable. The goal is that for common cases the jit should be able to turn ctypes calls from Python into direct C level calls. This work was not completed but very close and is great for the future of integrating C libraries with PyPy. As ctypes is also available in CPython and IronPython, and hopefully will be available in Jython soon, integrating C code with Python through ctypes is the most \"implementation portable\" technique.\nDavid Schneider continued his work on the JIT backend for ARM. PyPy has been cross-compilable to ARM for a long time, but bringing the JIT to ARM will provide a *fast* PyPy for ARM, which includes platforms like Android. Again David didn't complete this work but did complete the float support.\nH\u00e5kan Ardo was present for two days and continued his crazy-clever work on JIT optimisations, some of which are described in the Loop invariant code motion blog entry.\nHolger Krekel worked on updating the PyPy test suite to the latest version of py.test and also worked with me on the interminable bytearray changes for part of the sprint.\nNo one was sure what \u00a0Maciej Fija\u0142kowski worked on but he seemed to be quite busy.\n\nI think that was most of the work done during the actual sprint. 
There was also a great deal of healthy discussion about the future of PyPy. Expect lots more interesting and exciting developments over the coming year.", + "tags": "sprint", + "url": "https://www.pypy.org/posts/2011/02/pypy-winter-sprint-report-4155886720346408516.html" + }, + { + "title": "The PyPy San Franciso Bay Area Tour 2011", + "text": "PyPy is coming to the San Francisco Bay Area in the beginning of March with\na series of talks and a mini sprint.\n\n\nWednesday March 2, 4:15 p.m. Armin Rigo gives\na\ntalk at Stanford. open to the public.\n\nThursday March 3, 6:00 p.m. General talk at Yelp, 706 Mission St 9th Floor,\n San Francisco CA 94103 open to the public.\n\nSaturday and Sunday March 5 and 6.\n PyPy mini sprint at noisebridge.\n 2169 Mission street between 17th and 18th in San Francisco. Open to the public.\n\nMonday March 7th, 11:30 a.m. Google Tech talk in Mountain View at the\n Googleplex. Not open to the public (but the video should be available\n later).\n\nMonday March 7th, 2:30 p.m. Talk at Mozilla in Mountain View. 
Not\n open to the public (but Mozilla developers can videoconference).\n\n\nFrom the PyPy project team we will have Armin Rigo, Maciej Fija\u0142kowski\n(from 6th March), Laura Creighton and Jacob Hall\u00e9n and possibly\nChristian Tismer attending.\n\nMost of the talks will focus on (some of) the highlights and the\nstatus of pypy:\n\n\nmost Python benchmarks run much faster than with CPython or Psyco\nthe real-world PyPy compiler toolchain itself (200 KLocs) runs twice as fast\nsupports x86 32 and 64bit and is in the process of supporting ARM\nfull compatibility with CPython (more than Jython/IronPython)\nfull (and JIT-ed) ctypes support to call C libraries from Python\nsupports Stackless Python (in-progress)\nnew \"cpyext\" layer which integrates existing CPython C extensions\nan experimental super-fast JIT-compilation of calls to C++ libraries\n\n\nAs is usual for us, there is vastly more material that is available for\nus to cover than time, especially when it comes to possible future\ndirections for PyPy. We want to reserve a certain amount of time at\neach talk purely to discuss things that are of interest to audience\nmembers. However, if you already know what you wish we would discuss,\nand are attending a talk (or even if you aren't), please let us know.\nYou can either reply to this blog post, or mail Laura directly at\nlac at openend.se .\n\nApart from getting more technical and project insight, our travel is\nalso a good possibility for companies in the SF area to talk to us\nregarding contracting. In September 2011 our current \"Eurostars\" research\nproject ends and some of us are looking for ways to continue working on\nPyPy through consulting, subcontracting or hiring. The two companies,\nOpen End and merlinux, have successfully done a number of such contracts\nand projects in the past. If you want to talk business or get together for\nlunch or dinner, let us know! If you would like us to come to your company\nand make a presentation, let us know! 
If you have any ideas about what\nwe should discuss in a presentation so that you could use it to convince\nthe powers-that-be at your place of employment that investing time and\nmoney in PyPy would be a good idea, let us know!\n\nOn Tuesday March 8th we will be heading for Atlanta for the Python VM\nand Language Summits before attending PyCon. Maciej Fija\u0142kowski and\nAlex Gaynor will be giving a talk entitled\nWhy is\nPython slow and how can PyPy help?\nMaciej will also be giving the talk\nRunning\nultra large telescopes in Python which is\npartially about his experiences using PyPy in the Square Kilometer Array\nproject in South Africa. There will be a PyPy Sprint March 14-17.\nAll are welcome.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/02/pypy-san-franciso-bay-area-tour-2011-6179180737090334330.html" + }, + { + "title": "PyPy faster than C on a carefully crafted example", + "text": "Good day everyone.\nRecent round of optimizations, especially loop invariant code motion\nhas been very good for small to medium examples. There is work ongoing to\nmake them scale to larger ones, however there are few examples worth showing\nhow well they perform. This one following example, besides getting benefits\nfrom loop invariants, also shows a difference between static and dynamic\ncompilation. In fact, after applying all the optimizations C does, only a\nJIT can use the extra bit of runtime information to run even faster.\nThe example is as follows. First Python. 
I create two files, x.py:\n\ndef add(a, b):\n return a + b\n\nAnd y.py:\n\nfrom x import add\n\ndef main():\n i = 0\n a = 0.0\n while i < 1000000000:\n a += 1.0\n add(a, a)\n i += 1\n\nmain()\n\nFor C, x.c:\n\ndouble add(double a, double b)\n{\n return a + b;\n}\n\nand y.c:\n\ndouble add(double a, double b);\n\nint main()\n{\n int i = 0;\n double a = 0;\n while (i < 1000000000) {\n a += 1.0;\n add(a, a);\n i++;\n }\n}\n\nResults?\n\n1.97s - PyPy\n3.07s - C\n\nCompilation options:\n\nPyPy trunk (386ed41eae0c), running pypy-c y.py\nC - gcc -O3 (GCC 4.4.5 shipped with Ubuntu Maverick)\n\nHence, PyPy 50% faster than C on this carefully crafted example. The reason\nis obvious - static compiler can't inline across file boundaries. In C,\nyou can somehow circumvent that, however, it wouldn't anyway work\nwith shared libraries. In Python however, even when the whole import system\nis completely dynamic, the JIT can dynamically find out what can be inlined.\nThat example would work equally well for Java and other decent JITs, it's\nhowever good to see we work in the same space :-)\nCheers,\nfijal\nEDIT: Updated GCC version", + "tags": "", + "url": "https://www.pypy.org/posts/2011/02/pypy-faster-than-c-on-carefully-crafted-5614784244310486765.html" + }, + { + "title": "A JIT Backend for ARM Processors", + "text": "In the past few months, I have been developing as a part of my master thesis\nthe ARM backend for the the PyPy JIT, in the arm-backend branch. Currently, it is still work in progress: all integer and object operations are working and\nthe support for floating point is also under development.\nARM processors are very widely used, beeing deployed in servers, some netbooks\nand mainly mobile devices such as phones and tablets. One of our goals is to be\nable to run PyPy on phones, specially on Android. 
Currently it is not yet possible\nto translate and compile PyPy for Android automatically, but there has been\nsome work on using Android's NDK to compile PyPy's generated C code.\nThe JIT Backend targets the application profile of the ARMv7 instruction set\narchitecture which is found for example in the Cortex-A8 processors used in many Android powered devices and in Apple's A4 processors built into the latest iOS devices. To develop and\ntest the backend we are using a BeagleBoard-xM which has a 1 GHz ARM\nCortex-A8 and 512 MB of RAM running the ARM port of Ubuntu 10.10.\nCurrently on Linux it is possible to translate and cross-compile PyPy's Python\ninterpreter as well as other interpreters with the ARM JIT backend enabled\nusing Scratchbox 2 to provide a build environment and the GNU ARM cross\ncompilation toolchain. So far the backend only supports the Boehm garbage\ncollector which does not produce the best results combined with the JIT, but we\nplan to add support for the other GCs in the future; doing so should increase\nthe performance of PyPy on ARM.\nWhile still debugging the last issues with the backend we already can run some\nsimple benchmarks on Pyrolog, a Prolog interpreter written in RPython.\nEven using Boehm as the GC the results look very promising. In the benchmarks\nwe compare Pyrolog to SWI-Prolog, a Prolog interpreter written in C, which\nis available from the package repositories for Ubuntu's ARM port.\nThe benchmarks can be found in the pyrolog-bench repository.\n\nBenchmark | SWI-Prolog in ms. | Pyrolog in ms. | Speedup\n\niterate | 60.0 | 6.0 | 10.0\niterate_assert | 130.0 | 6.0 | 21.67\niterate_call | 3310.0 | 5.0 | 662.0\niterate_cut | 60.0 | 359.0 | 0.16713\niterate_exception | 4950.0 | 346.0 | 14.306\niterate_failure | 400.0 | 127.0 | 3.1496\niterate_findall | 740.0 | No res. | \niterate_if | 140.0 | 6.0 | 23.333\n\nThe iterate_call benchmark, which constructs a predicate and calls it at\nruntime, with a speedup of 662 times over SWI-Prolog is an example where the\nJIT can show its strength. 
The Pyrolog interpreter and the JIT treat\ndynamically defined predicates as static ones and can generate optimized code\nin both cases. Whereas SWI only compiles statically defined rules and has to\nfall back to interpretation on dynamic ones.\nFor simple benchmarks running on PyPy's Python interpreter we see some speedups\nover CPython, but we still need to debug the backend a bit more before we can\nshow numbers on more complex benchmarks. So, stay tuned.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html" + }, + { + "title": "PyPy wants you!", + "text": "If you ever considered contributing to PyPy, but never did so far, this is a\ngood moment to start! :-)\nRecently, we merged the fast-forward branch which brings Python 2.7\ncompatibility, with the plan of releasing a new version of PyPy as soon as all\ntests pass.\nHowever, at the moment there are still quite a few failing tests because\nof new 2.7 features that have not been implemented yet: many of them are easy\nto fix, and doing it represents a good way to get confidence with the code\nbase, for those who are interested in it. Michael Foord wrote a little howto\nexplaining the workflow for running lib-python tests.\nThus, if you are willing to join us in the effort of having a PyPy compatible\nwith Python 2.7, probably the most sensible option is to come on the #PyPy IRC\nchannel on Freenode, so we can coordinate with each other not to fix the same test\ntwice.\nMoreover, if you are a student and are considering participating in the next\nGoogle Summer of Code this is a good time to get into pypy. You have the\nopportunity to get a good understanding of pypy for when you decide what you\nwould like to work on over the summer.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/01/pypy-wants-you-4543209863582915733.html" + }, + { + "title": "Loop invariant code motion", + "text": "Recently, the jit-unroll-loops branch was merged. 
It implements the\nidea described in \nUsing Escape Analysis Across Loop Boundaries for Specialization.\nThat post does only talk about virtuals, but the idea turned out\nto be more far reaching. After the metainterpreter produces a trace,\nseveral optimizations are applied to the trace before it is turned\ninto binary code. Removing allocations is only one of them. There are also\nfor instance\n\n Heap optimizations that remove memory accesses by reusing results\n previously read from or written to the same location.\n Reusing of the results of pure operations if the same pure\n operation is executed twice.\n Removal of redundant guards.\n ...\n\nA lot of these optimizations are in one way or another removing\noperations from the trace and/or reusing previous results. All of these\noptimizations could benefit from being able to operate across loop\nboundaries. Not only in the sense that operations operating on loop\ninvariants could be moved out of the loop entirely. But also that\nresults produced at the end of an iteration could be reused at the\nbeginning of the next even if there are no loop invariants involved.\n\n\n\nThis is achieved by unrolling the trace into two iterations, and\nletting the optimizer work on this two-iteration-trace.\nThe optimizer will now be able to optimize the second iteration more than the\nfirst since it can reuse results from the first iteration. The\noptimized version of the first iteration we call the preamble and the\noptimized version of the second iteration we call the loop. The\npreamble will end with a jump to the loop, while the loop will end\nwith a jump to itself. 
This means that the preamble will be executed\nonce for the first iteration, the loop will be executed for all following\niterations.\n \n\nSqrt example\nHere is an example of a Python implementation of sqrt using a fairly\nsimple algorithm\n\n\n\n \n\ndef sqrt(y, n=10000):\n x = y / 2\n while n > 0:\n n -= 1\n x = (x + y/x) / 2\n return x\n\n\n\nIf it is called with sqrt(1234.0), \na fairly long trace is produced. From this trace\nthe optimizer creates\nthe\nfollowing preamble (Loop 1) and loop (Loop 0) \n\n\n\n\n\n\n\nLooking at the preamble, it starts by making sure that it is not \ncurrently being profiled, the guard\non i5, and that the function object have not been changed\nsince the trace was made, the guard on p3. Somewhat\nintermixed with that, the\ninteger variable n is unboxed, by making sure p11\npoints to an integer object and reading out the integer value from\nthat object. \nThese operations are not needed in the\nloop (and have been removed from it) as emitting the same guards again\nwould be redundant and n becomes a virtual before the\nend of the preamble.\n\n guard_value(i5, 0, descr=) \n guard_nonnull_class(p11, ConstClass(W_IntObject), descr=) \n guard_value(p3, ConstPtr(ptr15), descr=) \n i16 = getfield_gc_pure(p11, descr=)\n\n\nNext comes a test and a guard implementing the while statement\nfollowed by the decrementing of n. These operation appear\nboth in the preamble and in the loop\n\n i18 = int_gt(i16, 0)\n guard_true(i18, descr=) \n i20 = int_sub(i16, 1)\n\n\nAfter that the two floating point variables x and y\nare unboxed. Again this is only needed in the preamble. Note how the\nunboxed value of y, called f23, is passed unchanged\nfrom the preamble to the loop in arguments of the jump \nto allow it to be reused. 
It will not become a virtual\nsince it is never changed within the loop.\n\n guard_nonnull_class(p12, 17652552, descr=) \n guard_nonnull_class(p10, 17652552, descr=) \n f23 = getfield_gc_pure(p10, descr=)\n f24 = getfield_gc_pure(p12, descr=)\n\n\nFollowing that are the actual calculations performed in the loop in\nthe form of floating point operations (since the function was called with\na float argument). These appear in both the loop\nand the preamble.\n\n i26 = float_eq(f24, 0.000000)\n guard_false(i26, descr=) \n f27 = float_truediv(f23, f24)\n f28 = float_add(f24, f27)\n f30 = float_truediv(f28, 2.000000)\n\n\nFinally there are some tests checking if a signal was received\n(such as when the user presses ctrl-C) and thus should execute some\nsignal handler or if we need to hand over to another thread. This is\nimplemented with a counter that is decreased once every iteration. It\nwill go below zero after some specific number of iterations, tunable by\nsys.setcheckinterval. The counter is read from and written to\nsome global location where it also can be made negative by a C-level\nsignal handler. \n\n i32 = getfield_raw(32479328, descr=)\n i34 = int_sub(i32, 2)\n setfield_raw(32479328, i34, descr=)\n i36 = int_lt(i34, 0)\n guard_false(i36, descr=) \n jump(p0, p1, p2, p4, p10, i20, f30, f23, descr=)\n\n\n\nBridges\n\nWhen a guard fails often enough, the meta-interpreter is started again\nto produce a new trace starting at the failing guard. The tracing is\ncontinued until a previously compiled loop is entered. This could\neither be the same loop that contains the failing guard\nor some completely different loop. If it is the same loop, executing\nthe preamble again may be unnecessary.\nIt is preferable to end the bridge with a jump directly to \nthe loop. To achieve this the optimizer tries to produce short\n preambles that are inlined at the end of bridges allowing\nthem to jump directly to the loop. 
Inlining is better than jumping to\na common preamble because most of the inlined short preamble can\ntypically be removed again by the optimizer.\nCreating such a short\npreamble is however not always possible. Bridges jumping to loops for which\nno short preamble can be generated have to end with a jump to the\nfull preamble instead.\n\n\n\nThe short preamble is created by comparing the operations in the\npreamble with the operations in the loop. The\noperations that are in the preamble but not in the loop \nare moved to the short preamble whenever it is safe to move them to\nthe front of the operations remaining. In other words, the full preamble\nis equivalent to the short preamble followed by one iteration of the\nloop. \n\n\n\nThis much has currently been implemented. To give the full picture\nhere, there are two more features that \nhopefully will be implemented in the near future.\nThe first is to replace the full preamble, used by the interpreter\nwhen it reaches a compiled loop, with the short preamble.\nThis is currently not done and is probably not as straight forward as\nit might first seem. The problem is where to resume interpreting on a\nguard failure. However, implementing that should save some\nmemory. Not only \nbecause the preamble will become smaller, but mainly because the\nguards will appear either in the loop or in the preamble, but not\nin both (as they do now). That means there will only be a single bridge and \nnot potentially two copies once the guards are traced.\n\n\n\nThe sqrt example above would with a short preamble result in a trace\nlike this\n\n\n\n\n\nIf it is executed long enough, the last guard will be traced to form a\nbridge. The trace will inherit the virtuals from its parent. This can\nbe used to optimize away the part of the inlined short preamble\nthat deals with virtuals. 
The resulting bridge should look\nsomething like\n\n\n [p0, p1, p2, p3, p4, f5, i6]\n i7 = force_token()\n setfield_gc(p1, i7, descr=)\n call_may_force(ConstClass(action_dispatcher), p0, p1, descr=)\n guard_not_forced(, descr=) \n guard_no_exception(, descr=) \n\n guard_nonnull_class(p4, 17674024, descr=) \n f52 = getfield_gc_pure(p4, descr=)\n jump(p1, p0, p2, p3, p4, i38, f53, f52, descr=)\n\n\nHere the first paragraph comes from the traced bridge and the second\nis what remains of the short preamble after optimization. The\nbox p4 is \nnot a virtual (it contains a pointer to y which is never\nchanged), and it is only virtuals \nthat the bridge inherit from it's parents. This is why the last two\noperations currently cannot be removed.\n\n\n\n\nEach time the short preamble is inlined, a new copy of each of the\nguards in it is generated. Typically the short preamble is inlined in\nseveral places and thus there will be several copies of each of those\nguards. \nIf they fail often enough bridges\nfrom them will be traced (as with all guards). But since there\ntypically are several copies of each guard the same bridge\nwill be generated in \nseveral places. To prevent this, mini-bridges from the inlined guards\nare produced already during the inlining. These mini-bridges contain\nnothing but a jump to the preamble.\n\n\nThe mini-bridges needs the arguments of the preamble to be able\nto jump to it. These arguments contain among other things, boxed\nversions of the \nvariables x and y. Those variables are virtuals in\nthe loop, and have to be allocated. Currently those allocations\nare placed in front of the inlined guard. Moving those allocations into\nthe mini-bridges is the second feature that \nhopefully will be implemented in the near future. 
\n\nAfter this feature is\nimplemented, the result should look something like\n\n\n\n\n\nMultiple specialized versions\n\nFloating point operations were generated in the trace above\nbecause sqrt was called with a float argument. If it is\ninstead called with an int argument, integer operations will be generated. The\nsomewhat more complex situation is when both ints and floats are\nused as arguments. Then the jit needs to generate multiple versions of\nthe same loop, specialized in different ways. The details, given\nbelow, on how this is achieved are somewhat involved. For the casual\nreader it would make perfect sense to skip to the next section here.\n\n\n\nConsider the case when sqrt is first called with a float\nargument (but with n small enough not to generate the\nbridge). Then the trace shown above will be\ngenerated. If sqrt is now called with an int argument, the\nguard in the preamble testing that the type of the input object is float\nwill fail:\n\n guard_nonnull_class(p12, 17652552, descr=) \n\nIt will fail every iteration, so soon enough a bridge will be\ngenerated from this guard in the preamble. This bridge will end with a\njump to the same loop, and the optimizer will try to inline\nthe short preamble at the end of it. This will however fail\nsince now there are two guards on p12. One that makes sure it\nis an int and one that makes sure it is a float. The optimizer\nwill detect that the second guard will always fail and mark the bridge\nas invalid. Invalid loops are not passed on to the backend for\ncompilation. \n\n\n\nIf a loop is detected to be invalid while inlining the short preamble,\nthe metainterpreter will continue to trace for yet another \niteration of the loop. This new trace can be compiled as above and\nwill produce a new loop with a new preamble that are now specialized\nfor int arguments instead of float arguments. The bridge that\npreviously became invalid will now be tried again. 
This time inlining\nthe short preamble of the new loop instead. This will produce a set of\ntraces connected like this\n\n\n\n\n(click for some hairy details)\n\n\nThe height of the boxes in this figure represents how many instructions\nthey contain (presuming the missing features from the previous section\nare implemented). Loop 0 is specialized for floats and its preamble has\nbeen split into two boxes at the failing guard. Loop 2 is specialized\nfor ints and is larger than Loop 0. This is mainly because the integer\ndivision in python does not map to the integer division of the\nmachine, but has to be implemented with several instructions (integer\ndivision in python truncates its result towards minus\ninfinity, while the machine integer division truncates towards\n0). Also the height of the bridge is about the same as the height of\nLoop 2. This is because it contains a full iteration of the loop.\n\n\n\nA More Advanced Example\n\nLet's conclude with an example that is a bit more advanced, where this unrolling\napproach actually outperforms the previous approach. Consider\nmaking a\nfixed-point\nimplementation of the square root using 16 bits of decimals. 
This can be\ndone using the same implementation\nof sqrt but calling it with an object of a class representing\nsuch fixed-point real numbers:\n\n\nclass Fix16(object):\n def __init__(self, val, scale=True):\n if isinstance(val, Fix16):\n self.val = val.val\n else:\n if scale:\n self.val = int(val * 2**16)\n else:\n self.val = val\n\n def __add__(self, other):\n return Fix16(self.val + Fix16(other).val, False)\n\n def __sub__(self, other):\n return Fix16(self.val - Fix16(other).val, False)\n\n def __mul__(self, other):\n return Fix16((self.val >> 8) * (Fix16(other).val >> 8), False)\n\n def __div__(self, other):\n return Fix16((self.val << 16) / Fix16(other).val, False)\n\n\n\n\nBelow is a table comparing the runtime of the sqrt function above with\ndifferent argument types on different python interpreters. Pypy 1.4.1\nwas released before the optimizations described in this post were in place\nwhile they are in place in the \nnightly\n build from January 5, \ndenoted pypy in the table. There are also the running time for the same\nalgorithms implemented in C and compiled with \"gcc -O3\n-march=native\". Tests were executed on a 2.53GHz Intel Core2\nprocessor with n=100000000 iterations.\nComparing the integer versions with C may be considered a\nbit unfair because of the more advanced integer division operator in\npython. The left part of this table shows runtimes of sqrt in\na program containing a single call to sqrt (i.e. only a single\nspecialized version of the loop is needed). 
The right part shows the\nruntime of sqrt when it has been called with a different\ntype of argument before.\n\n\n\n\n First callSecond call\n floatintFix16\u00a0\u00a0\n floatintFix16\n cpython\n 28.18 s\n 22.13 s\n 779.04 s\n \n 28.07 s\n 22.21 s\n 767.03 s \n \n pypy 1.4.1\n 1.20 s\n 6.49 s\n 11.31 s\n \n 1.20 s\n 6.54 s\n 11.23 s\n \n pypy\n 1.20 s\n 6.44 s\n 6.78 s\n \n 1.19 s\n 6.26 s\n 6.79 s\n \n gcc\n 1.15 s\n 1.82 s\n 1.89 s\n \n 1.15 s\n 1.82 s\n 1.89 s\n \n\n\n\n\nFor this to work in the last case, when Fix16 is the argument type in\nthe second type, \nthe trace_limit had to be increased from its default value to prevent\nthe metainterpreter from aborting while tracing the second version of\nthe loop. Also sys.setcheckinterval(1000000) were used to prevent the\nbridge from being generated. With the bridge the performance of the\nlast case is significantly worse. Maybe because the optimizer currently\nfails to generate a short preamble for it. But the slowdown\nseems too big for that to be the only explanation. Below are the runtimes\nnumbers with checkinterval set to its default value of 100:\n\n\n First callSecond call\n floatintFix16\u00a0\u00a0\n floatintFix16\n cpython\n 28.71 s\n 22.09 s\n 781.86 s\n \n 28.28 s\n 21.92 s\n 761.59 s\n \n pypy 1.4.1\n 1.21 s\n 6.48 s\n 11.22 s\n \n 1.72 s\n 7.58 s\n 12.18 s\n \n pypy\n 1.21 s\n 6.27 s\n 7.22 s\n \n 1.20 s\n 6.29 s\n 90.47 s\n \n\n\n\nConclusions\nEven though we are seeing speedups in a variety of different small\nbenchmarks, more complicated examples are not affected much by these\noptimizations. It might partly be because larger examples have longer\nand more complicated loops, and thus allowing optimizations to operate\nacross loop boundary will have a smaller relative effect. Another problem is\nthat with more complicated examples there will be more bridges, and bridges\nare currently not handled very well (most of the time all virtuals are\nforced at the end of the bridge as explained above). 
But moving those\nforcings into the mini bridges should fix that.", + "tags": "", + "url": "https://www.pypy.org/posts/2011/01/loop-invariant-code-motion-1998392217676829154.html" + }, + { + "title": "PyPy 1.4.1", + "text": "Here is PyPy 1.4.1 :-)\n\nUpdate: Win32 binaries available.\n\nEnjoy!\n\nRelease announcement\n\nWe're pleased to announce\nthe 1.4.1 release of PyPy.\nThis release consolidates all the bug fixes that occurred since the\nprevious release. To everyone that took the trouble to report\nthem, we want to say thank you.\n\nWhat is PyPy\n\nPyPy is a very compliant Python interpreter, almost a drop-in\nreplacement for CPython. Note that it still only emulates Python\n2.5 by default; the fast-forward branch with Python 2.7\nsupport is slowly getting ready but will only be integrated in\nthe next release.\n\nIn two words, the advantage of trying out PyPy instead of CPython\n(the default implementation of Python) is, for now, the\nperformance. Not all programs are faster in PyPy, but we are\nconfident that any CPU-intensive task will be much faster, at\nleast if it runs for long enough (the JIT has a slow warm-up\nphase, which can take several seconds or even one minute on the\nlargest programs).\n\nNote again that we do support compiling and using C extension\nmodules from CPython (pypy setup.py install). However, this\nis still an alpha feature, and the most complex modules typically\nfail for various reasons; others work (e.g. PIL) but take a\nserious performance hit. Also, for Mac OS X see below.\n\nPlease note also that PyPy's performance was optimized almost\nexclusively on Linux. It seems from some reports that on Windows\nas well as Mac OS X (probably for different reasons) the\nperformance might be lower. We did not investigate much so far.\n\nMore highlights\n\n\n\nWe migrated to Mercurial (thanks to Ronny Pfannschmidt and\n Antonio Cuni) for the effort) and moved to bitbucket. 
The new\n command to check out a copy of PyPy is:\n hg clone https://bitbucket.org/pypy/pypy\n\nIn long-running processes, the assembler generated by old\n JIT-compilations is now freed. There should be no more leak,\n however long the process runs.\n\nImprove a lot the performance of the binascii module, and\n of hashlib.md5 and hashlib.sha.\n\nMade sys.setrecursionlimit() a no-op. Instead, we rely purely\n on the built-in stack overflow detection mechanism, which also\n gives you a RuntimeError -- just not at some exact recursion\n level.\n\nFix argument processing (now e.g. pypy -OScpass works like\n it does on CPython --- if you have a clue what it does there\n :-) )\n\ncpyext on Mac OS X: it still does not seem to work. I get\n systematically a segfault in dlopen(). Contributions welcome.\n\nFix two corner cases in the GC (one in minimark, one in\n asmgcc+JIT). This notably prevented pypy translate.py -Ojit\n from working on Windows, leading to crashes.\n\nFixed a corner case in the JIT's optimizer, leading to Fatal\n RPython error: AssertionError.\n\nAdded some missing built-in functions into the 'os' module.\n\nFix ctypes (it was not propagating keepalive information from\n c_void_p).", + "tags": "", + "url": "https://www.pypy.org/posts/2010/12/pypy-141-7283625923182122073.html" + }, + { + "title": "PyPy migrates to Mercurial", + "text": "The assiduous readers of this blog surely remember that during the last\nD\u00fcsseldorf sprint in October, we started the process for migrating our main\ndevelopment repository from Subversion to Mercurial. Today, after more than\ntwo months, the process has finally been completed :-).\nThe new official PyPy repository is hosted on BitBucket.\nThe migration has been painful because the SVN history of PyPy was a mess and\nnone of the existing conversion tools could handle it correctly. 
This was\npartly because PyPy started when subversion was still at version 0.9 when some\nbest-practices were still to be established, and partly because we probably\nmanaged to invent all the possible ways to do branches (and even some of the\nimpossible ones: there is at least one commit which you cannot do with the\nplain SVN client but you have to speak to the server by yourself :-)).\nThe actual conversion was possible thanks to the enormous work done by Ronny\nPfannschmidt and his hackbeil tool. I would like to personally thank Ronny\nfor his patience to handle all the various requests we asked for.\nWe hope that PyPy development becomes even more approachable now, at least from\na version control point of view.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/12/pypy-migrates-to-mercurial-3308736161543832134.html" + }, + { + "title": "Oh, and btw: PyPy gets funding through \"Eurostars\"", + "text": "There is a supporting reason why we made so many advances in the last year:\nfunding through Eurostars, a European research funding program.\nThe title of our proposal (accepted in 2009) is: \"PYJIT - a fast\nand flexible toolkit for dynamic programming languages based on PyPy\".\nAnd the participants are Open End AB, the Heinrich-Heine-Universit\u00e4t\nD\u00fcsseldorf (HHU), and merlinux GmbH.\nIt's not hard to guess what PYJIT is actually about, is it?\nQuoting: \"The PYJIT project will deliver a fast and flexible\nJust-In-Time Compiler toolkit based on PyPy to the market of dynamic\nlanguages. 
Our main aim is to showcase our project's results for the\nOpen Source language Python, providing unprecedented levels of\nflexibility and with speed hitherto only available using statically\ntyped languages.\" (Details in German or in Swedish :-)\nA subgoal is to improve our development and testing infrastructure,\nmainly showcased by Holger's recent py.test releases, the testing tool\nused by PyPy for its 16K tests and the speed.pypy.org infrastructure\n(web app programmed by Miquel Torres on his own time).\nThe overall scope of this project is smaller than that of the previous EU project\nfrom 2004 to 2007. The persons that are (or were) getting money to work\non PyPy are Samuele Pedroni (at Open End), Maciej Fijalkowski (as a\nsubcontractor), Carl Friedrich Bolz, Armin Rigo, Antonio Cuni (all at\nHHU), and Holger Krekel (at merlinux) as well as Ronny Pfannschmidt (as\na subcontractor).\nThe Eurostars funding lasts until August 2011. What comes afterwards?\nWell, for one, many of the currently funded people have done work without\ngetting funding in previous years. This will probably continue.\nWe also have non-funded people in the core group right now and we'll\nhope to enlarge it further. But of course there are still large tasks\nahead which may greatly benefit from funding. We have setup a\ndonation infrastructure and maybe we can win one or more larger\norganisations to provide higher or regular sums of money to fund future\ndevelopment work. 
Another possibility for companies is to pay\nPyPy developers to help and improve PyPy for their particular use cases.\nAnd finally, your help, donations and suggestions are always\nwelcome and overall we hope to convince more and more people it's\nworthwhile to invest into PyPy's future.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/12/oh-and-btw-pypy-gets-funding-through-3568486750776147382.html" + }, + { + "title": "Leysin Winter sprint", + "text": "Hi all,\n\n\n\n\n\nThe next sprint will be in Leysin, Switzerland, during the week of the 16th-22nd of January 2011.\n\nNow that we have released 1.4, and plan to release 1.4.1 soon, the sprint is going to be mainly working on fixing issues reported by various users. Of course this does not prevent people from showing up with a more precise interest in mind.\n\nAs usual, the break day on the sprint will likely be a day of skiing :-)\n\nHoping to see you there.\n\n\n\n\n\n\n\n\nUpdate: there are actually a number of branches that we want to polish and merge into trunk: at least fast-forward, jit-unroll-loops, arm-backend and jitypes2. For more details, see the announcement.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/12/leysin-winter-sprint-8115212435349091722.html" + }, + { + "title": "PyPy 1.4 release aftermath", + "text": "A couple days have passed since the announcement of the 1.4 release, and this\nis a short summary of what happened afterwards. Let's start with\nnumbers:\n\n16k visits to the release announcement on our blog\nwe don't have download statistics unfortunately\n10k visits to speed center\nmost traffic comes from referring sites, reddit alone creating above a third\nof our traffic\n\nNot too bad for a project that doesn't have a well-established user base.\nLessons learned:\n\nReleases are very important. 
They're still the major way projects communicate\nwith community, even if we have nightly builds that are mostly stable.\nNo segfaults were reported, no incompatibilities between JIT and normal\ninterpretation. We think that proves (or at least provides a lot of\nexperimental evidence) that our write-once-and-then-transform method is\neffective.\nA lot of people complained about their favorite module in C not working, we\nshould have made it clearer that CPyExt is in alpha state. Indeed, we\nwould like to know which C extension modules do work :-).\nSome people reported massive speedups, other reported slowdowns compared\nto CPython. Most of those slowdowns relate to modules being inefficient\n(or doing happy nonsense), like ctypes. This is expected, given that\nnot all modules are even jitted (although having them jitted is usually\na matter of a couple of minutes).\nNobody complained about a lack of some stdlib module. We implemented the ones\nwhich are used more often, but this makes us wonder if less used stdlib modules\nhave any users at all.\n\nIn general feedback has been overwhelmingly positive and we would like to\nthank everyone trying (and especially those reporting problems)\nCheers,\nfijal", + "tags": "release", + "url": "https://www.pypy.org/posts/2010/12/pypy-14-release-aftermath-2979780282210978576.html" + }, + { + "title": "We are not heroes, just very patient", + "text": "Inspired by some of the comments to the release that said \"You are heroes\", I though a bit about the longish history of PyPy and hunted around for some of the mailing list posts that started the project. Then I put all this information together into the following timeline:\n\n timeline \n\nThere is also a larger version of the timeline. Try to click on some of the events, the links usually go to the sprint descriptions. I also tried to find pictures for the sprints but succeeded for only half of them, if anybody still has some, I would be interested. 
It's kind of fun to browse around in some of the old sprint descriptions to see how PyPy evolved. Some of the current ideas have been around for a long time, some are new. In the description of the releases I put estimates for the speed of the release.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/12/we-are-not-heroes-just-very-patient-7114408885070101720.html" + }, + { + "title": "PyPy 1.4: Ouroboros in practice", + "text": "We're pleased to announce the 1.4 release of PyPy. This is a major breakthrough\nin our long journey, as PyPy 1.4 is the first PyPy release that can translate\nitself faster than CPython. Starting today, we are using PyPy more for\nour every-day development. So may you :) You can download it here:\nhttps://pypy.org/download.html\n\nWhat is PyPy\nPyPy is a very compliant Python interpreter, almost a drop-in replacement\nfor CPython. It is fast (pypy 1.4 and cpython 2.6 comparison).\nNew Features\nAmong its new features, this release includes numerous performance improvements\n(which made fast self-hosting possible), a 64-bit JIT backend, as well\nas serious stabilization. As of now, we can consider the 32-bit and 64-bit\nlinux versions of PyPy stable enough to run in production.\nNumerous speed achievements are described on our blog. Normalized speed\ncharts comparing pypy 1.4 and pypy 1.3 as well as pypy 1.4 and cpython 2.6\nare available on the benchmark website. For the impatient: yes, we got a lot faster!\n\n\nMore highlights\n\nPyPy's built-in Just-in-Time compiler is fully transparent and\nautomatically generated; it now also has very reasonable memory\nrequirements. The total memory used by a very complex and\nlong-running process (translating PyPy itself) is within 1.5x to\nat most 2x the memory needed by CPython, for a speed-up of 2x.\nMore compact instances. All instances are as compact as if\nthey had __slots__. This can give programs a big gain in\nmemory. 
(In the example of translation above, we already have\ncarefully placed __slots__, so there is no extra win.)\nVirtualenv support: now PyPy is fully compatible with virtualenv: note that\nto use it, you need a recent version of virtualenv (>= 1.5).\nFaster (and JITted) regular expressions - huge boost in speeding up\nthe re module.\nOther speed improvements, like JITted calls to functions like map().\n\nCheers,\nCarl Friedrich Bolz, Antonio Cuni, Maciej Fijalkowski,\nAmaury Forgeot d'Arc, Armin Rigo and the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2010/11/pypy-14-ouroboros-in-practice-5437628000869417542.html" + }, + { + "title": "Improving Memory Behaviour to Make Self-Hosted PyPy Translations Practical", + "text": "In our previous blog post, we talked about how fast PyPy can translate\nitself compared to CPython. However, the price to pay for the 2x speedup was\nan huge amount of memory: actually, it was so huge that a standard -Ojit\ncompilation could not be completed on 32-bit because it required more than the\n4 GB of RAM that are addressable on that platform. On 64-bit, it consumed\n8.3 GB of RAM instead of the 2.3 GB needed by CPython.\nThis behavior was mainly caused by the JIT, because at the time we wrote the\nblog post the generated assembler was kept alive forever, together with some\nbig data structure needed to execute it.\nIn the past two weeks Anto and Armin attacked the issue in the jit-free\nbranch, which has been recently merged to trunk. The branch solves several\nissues. 
The main idea of the branch is that if a\nloop has not been executed for a certain amount of time (controlled by the new\nloop_longevity JIT parameter) we consider it \"old\" and no longer needed,\nthus we deallocate it.\n(In the process of doing this, we also discovered and fixed an\noversight in the implementation of generators, which led to generators being\nfreed only very slowly.)\nTo understand the freeing of loops some more, let's look at how many loops are\nactually created during a translation.\nThe purple line in the following graph shows how many loops (and bridges) are\nalive at any point in time with an infinite longevity, which is equivalent to\nthe situation we had before the jit-free branch. By contrast, the blue\nline shows the number of loops that you get in the current trunk: the\ndifference is evident, as now we never have more than 10000 loops alive, while\npreviously we got up to about 37000 ones. The time on the X axis is expressed\nin \"Giga Ticks\", where a tick is the value read out of the Time Stamp Counter\nof the CPU.\n\n\n\nThe grey vertical bars represent the beginning of each phase of the\ntranslation:\n\nannotate performs control flow graph construction and type inference.\nrtype lowers the abstraction level of the control flow graphs with types to that of C.\npyjitpl constructs the JIT.\nbackendopt optimizes the control flow graphs.\nstackcheckinsertion finds the places in the call graph that can overflow the C stack and inserts checks that raise an exception instead.\ndatabase_c produces a database of all the objects the C code will have to know about.\nsource_c produces the C source code.\ncompile_c calls the compiler to produce the executable.\n\nYou can nicely see, how the number of alive graphs drops shortly after the\nbeginning of a new phase.\nThose two fixes, freeing loops and generators, improve the memory usage greatly:\nnow, translating PyPy\non PyPy on 32-bit consumes 2 GB of RAM, while on CPython it consumes 1.1 
GB.\nThis result can even be improved somewhat, because we are not actually freeing\nthe assembler code itself, but\nonly the large data structures around it; we can consider it as a residual\nmemory leak of around 150 MB in this case. This will be fixed in the\njit-free-asm branch.\nThe following graph shows the memory usage in more detail:\n\n\nthe blue line (cpython-scaled) shows the total amount of RAM that the\nOS allocates for CPython. Note that the X axis (the time) has been\nscaled down so that it spans as much as the PyPy one, to ease the\ncomparison. Actually, CPython took more than twice as much time as PyPy to\ncomplete the translation\nthe red line (VmRss) shows the total amount of RAM that the\nOS allocates for PyPy: it includes both the memory directly handled by\nour GC and the \"raw memory\" that we need to allocate for other tasks, such\nas the assembly code generated by the JIT\nthe brown line (gc-before) shows how much memory is used by the GC\nbefore each major collection\nthe yellow line (gc-after) shows how much memory is used by the GC\nafter each major collection: this represents the amount of memory which is\nactually needed to hold our Python objects. The difference between\ngc-before and gc-after (the GC delta) is the amount of memory that the GC\nuses before triggering a new major collection\n\n\n\n\n\nBy comparing gc-after and cpython-scaled, we can see that PyPy\nuses mostly the same amount of memory as CPython for storing the application\nobjects (due to reference counting the memory usage in CPython is always very\nclose to the actually necessary memory). The extra memory\nused by PyPy is due to the GC delta, to the machine code generated by the JIT\nand probably to some other external effect (such as e.g. 
Memory\nFragmentation).\nNote that the GC delta can be set arbitrarly low (another recent addition --\nthe default value depends on the actual RAM on your computer; it probably\nworks to translate if your computer has precisely 2 GB, because in this\ncase the GC delta and thus the total memory usage will be somewhat\nlower than reported here), but the cost is to have more\nfrequent major collections and thus a higher run-time overhead. The same is\ntrue for the memory needed by the JIT, which can be reduced by telling the JIT\nto compile less often or to discard old loops more frequently. As often\nhappens in computer science, there is a trade-off between space and time, and\ncurrently for this particular example PyPy runs twice as fast as CPython by\ndoubling the memory usage. We hope to improve even more on this trade-off.\nOn 64-bit, things are even better as shown by the the following graph:\n\n\n\nThe general shape of the lines is similar to the 32-bit graph. However, the\nrelative difference to CPython is much better: we need about 3 GB of RAM, just\n24% more than the 2.4 GB needed by CPython. And we are still more than 2x\nfaster!\nThe memory saving is due (partly?) to the vtable ptr optimization, which is\nenabled by default on 64-bit because it has no speed penalty (see\nUnifying the vtable ptr with the GC header).\nThe net result of our work is that now translating PyPy on PyPy is practical\nand takes less than 30 minutes. It's impressive how quickly you get used to\ntranslation taking half the time -- now we cannot use CPython any more for that\nbecause it feels too slow :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2010/11/improving-memory-behaviour-to-make-self-856966667913962461.html" + }, + { + "title": "Running large radio telescope software on top of PyPy and twisted", + "text": "Hello.\nAs some of you already know, I've recently started working on a\nvery large radio telescope at SKA South Africa. 
This telescope's\noperating software runs almost exclusively on Python (several high throughput\npieces are in C or CUDA or directly executed by FPGAs). Some cool telescope pictures:\n\n\n\n\n\n\n\n(photos courtesy of SKA South Africa)\nMost of the operation software is using the KatCP protocol to talk between devices.\nThe currently used implementation is Open Source software with a custom home built\nserver and client. As part of the experiments, I've implemented a Twisted based\nversion and run it on top of CPython and PyPy for both the default\nimplementation and the one based on Twisted to see how those perform.\nThere are two testing scenarios: the first one is trying to saturate the connection\nby setting up multiple sensors that report state every 10ms, the second one\nis measuring a round-trip between sending a request and receiving the response.\nBoth numbers are measuring the number of requests per 0.2s, so the more the better. On the X axis is the number of simultaneously connected clients.\nAll benchmark code is available in the KatCP repository.\nThe results are as follows:\n\n\n\n\n\n\nAs you can see, in general Twisted has larger overhead for a single client\nand scales better as the number of clients increases. That's I think expected,\nsince Twisted has extra layers of indirection. 
The round trip degradation of\nTwisted has to be investigated, but for us scenario1 is by far more important.\nAll across the board PyPy performs much better than CPython for both\nTwisted and a home-made solution, which I think is a pretty good result.\nNote: we didn't roll this set up into production yet, but there are high\nchances for both twisted and PyPy to be used in some near future.\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2010/11/running-large-radio-telescope-software-7600337209616168504.html" + }, + { + "title": "Efficiently Implementing Python Objects With Maps", + "text": "As could be foreseen by my Call for Memory Benchmarks post a while ago, I am\ncurrently working on improving the memory behaviour of PyPy's Python\ninterpreter. In this blog post I want to describe the various data a Python\ninstance can store. Then I want to describe how a branch that I did and that was\nrecently merged implements the various features of instances in a very\nmemory-efficient way.\n\nPython's Object Model\nAll \"normal\" new-style Python instances (i.e. instances of subclasses of object\nwithout added declarations) store two (or possibly three) kinds of information.\n\nStoring the Class\nEvery instance knows which class it belongs to. This information is accessible\nvia the .__class__ attribute. It can also be changed to other (compatible\nenough) classes by writing to that attribute.\n\n\nInstance Variables\nEvery instance also has an arbitrary number of attributes stored (also called\ninstance variables). The instance variables used can vary per instance, which is\nnot the case in most other class-based languages: traditionally (e.g. in\nSmalltalk or Java) the class describes the shape of its instances,\nwhich means that the\nset of admissible instance variable names is the same for all instances of a\nclass.\nIn Python on the other hand, it is possible to add arbitrary attributes to an\ninstance at any point. 
The instance behaves like a dictionary mapping attribute\nnames (as strings) to the attribute values.\nThis is actually how CPython implements instances. Every instance has a\nreference to a dictionary that stores all the attributes of the instance. This\ndictionary can be reached via the .__dict__ attribute. To make things more\nfun, the dictionary can also be changed by writing to that attribute.\n\n\nExample\nAs an example, consider the following code:\nclass A(object):\n pass\n\ninstance1 = A()\ninstance1.x = 4\ninstance1.y = 6\ninstance1.z = -1\n\ninstance2 = A()\ninstance2.x = 1\ninstance2.y = 2\ninstance2.z = 3\n\nThese two instances would look something like this in memory:\n\n(The picture glosses over a number of details, but it still shows the essential\nissues.)\nThis way of storing things is simple, but unfortunately rather inefficient. Most\ninstances of the same class have the same shape, i.e. the same set of instance\nattribute names. That means that the key part of all the dictionaries is\nidentical (shown grey here). Therefore storing that part repeatedly in all\ninstances is a waste. In addition, dictionaries are themselves rather large.\nSince they are typically implemented as hashmaps, which must not be too full to\nbe efficient, a dictionary will use something like 6 words on average per key.\n\n\nSlots\nSince normal instances are rather large, CPython 2.2 introduced slots, to make\ninstances consume less memory. Slots are a way to fix the set of attributes an\ninstance can have. This is achieved by adding a declaration to a class, like\nthis:\nclass B(object):\n __slots__ = [\"x\", \"y\", \"z\"]\n\nNow the instances of B can only have x, y and z as attributes\nand don't have a dictionary at all. Instead, the instances of B get\nallocated with enough size to hold exactly the number of instance variables that\nthe class permits. This clearly saves a lot of memory over the dictionary\napproach, but has a number of disadvantages. 
It is obviously less flexible, as\nyou cannot add additional instance variables to an instance if you happen to\nneed to do that. It also introduces a set of rules and corner-cases that can\nbe surprising sometimes (e.g. instances of a subclass of a class with slots that\ndoesn't have a slots declaration will have a dict).\n\n\n\nUsing Maps for Memory-Efficient Instances\nAs we have seen in the diagram above, the dictionaries of instances of the same\nclass tend to look very similar and share all the keys. The central idea to use\nless memory is to \"factor out\" the common parts of the instance dictionaries\ninto a new object, called a \"map\" (because it is a guide to the landscape of the\nobject, or something). After that factoring out, the representation of the\ninstances above looks something like this:\n\nEvery instance now has a reference to its map, which describes what the instance\nlooks like. The actual instance variables are stored in an array (called\nstorage in the diagram). In the example here, the map describes that the\ninstances have three attributes x, y and z. The numbers after the\nattributes are indexes into the storage array.\nIf somebody adds a new attribute to one of the instances, the map for that\ninstance will be changed to another map that also contains the new attribute,\nand the storage will have to grow a field with the new attribute. The maps are\nimmutable, immortal and reused as much as possible. This means, that two\ninstances of the same class with the same set of attributes will have the same\nmap. This also means that the memory the map itself uses is not too important,\nbecause it will potentially be amortized over many instances.\nNote that using maps makes instances nearly as small as if the correct slots had\nbeen declared in the class. 
The only overhead needed is the indirection to the\nstorage array, because you can get new instance variables, but not new slots.\nThe concept of a \"map\" that describes instances is kind of old and comes from\nthe virtual machine for the Self programming language. The optimization was\nfirst described in 1989 in a paper by Chambers, Ungar and Lee with the title An\nEfficient Implementation of Self, a Dynamically-Typed Object-Oriented Language\nBased on Prototypes. A similar technique is used in Google's V8 JavaScript\nengine, where the maps are called hidden classes and in the Rhino\nJavaScript engine.\nThe rest of the post describes a number of further details that occur if\ninstances are implemented using maps.\n\nSupporting Dictionaries with Maps\nThe default instance representation with maps as shown above works without\nactually having a dictionary as part of each instance. If a dictionary is\nactually requested, by accessing the .__dict__ attribute, it needs to be\ncreated and cached. The dictionary is not a normal Python dictionary, but a thin\nwrapper around the object that forwards all operations to it. From the user's\npoint of view it behaves like a normal dictionary though (it even has the\ncorrect type).\nThe dictionary needs to be cached, because accessing .__dict__ several times\nshould always return the same dictionary. The caching happens by using a\ndifferent map that knows about the dictionary and putting the dictionary into\nthe storage array:\n\nThings become really complex if the fake dict is used in strange ways. As long\nas the keys are strings, everything is fine. If somebody adds other keys to the\ndict, they cannot be represented by the map any more (which supports only\nattributes, i.e. string keys in the __dict__). If that happens, all the\ninformation of the instance will move into the fake dictionary, like this:\n\nIn this picture, the key -1 was added to the instance's dictionary. 
Since\nusing the dictionary in arbitrary ways should be rare, we are fine with the\nadditional time and memory that the approach takes.\n\n\nSlots and Maps\nMaps work perfectly together with slots, because the slots can just be stored\ninto the storage array used by the maps as well (in practise there are some\nrefinements to that scheme). This means that putting a __slots__ on a\nclass has mostly no effect, because the instance only stores the values of the\nattributes (and not the names), which is equivalent to the way slots are stored\nin CPython.\n\n\n\nImplementation Details\nIn the diagrams above, I represented the maps as flat objects. In practise this\nis a bit more complex, because it needs to be efficient to go from one map to\nthe next when new attributes are added. Thus the maps are organized in a tree.\nThe instances with their maps from above look a bit more like this in practise:\n\nEvery map just describes one attribute of the object, with a name and an\nindex. Every map also has a back field, that points to another map\ndescribing what the rest of the object looks like. This chain ends with a\nterminator, which also stores the class of the object.\nThe maps also contain the information necessary for making a new object of\nclass A. Immediately after the new object has been created, its map is the\nterminator. If the x attribute is added, its map is changed to the\nsecond-lowest map, and so on. The blue arrows show the sequence of maps that\nthe new object goes through when the attributes x, y, z are added.\nThis representation of maps as chains of objects sounds very inefficient if an\nobject has many attributes. The whole chain has to be walked to find the index.\nThis is true to some extent. The problem goes away in the presence of the JIT,\nwhich knows that the chain of maps is an immutable structure, and will thus\noptimize away all the chain-walking. 
If the JIT is not used, there are a few\ncaches that try to speed up the walking of this chain (similar to the method\ncache in CPython and PyPy).\n\n\nResults\nIt's hard to compare the improvements of this optimization in a fair way, as\nthe trade-offs are just very different. Just to give an impression, a million\nobjects of the same class with three fields on a 32bit system takes:\nwithout slots:\n\n182 MiB memory in CPython\n177 MiB memory in PyPy without maps\n40 MiB memory in PyPy with maps\n\nwith slots:\n\n45 MiB memory in CPython\n50 MiB memory in PyPy without maps\n40 MiB memory in PyPy with maps\n\nNote how maps make the objects a bit more efficient like CPython using slots.\nAlso, using slots has no additional effect in PyPy.\n\n\nConclusion\nMaps are a powerful approach to shrinking the memory used by many similar\ninstances. I think they can be pushed even further (e.g. by adding information\nabout the types of the attributes) and plan to do so in the following months.\nDetails will be forthcoming.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/11/efficiently-implementing-python-objects-3838329944323946932.html" + }, + { + "title": "Speeding up PyPy by donations", + "text": "PyPy joins the Software Freedom Conservancy\n\nGood news. PyPy is now a member of the Software Freedom Conservancy (SFC),\nsee the SFC blog post. This allows us to manage non-profit monetary aspects of\nthe project independently from a company or particular persons. So we\ncan now officially receive donations both from people prefering right or\nleft sides, see the Donate buttons on our home page and our blog.\nAnd you can use PayPal or Google Checkout, Donations are tax-exempt in the\nUSA and hopefully soon in Europe as well.\nWhat's it going to get used for? For the immediate future we intend to use\nthe donations for funding travels of core contributors to PyPy sprints\nwho otherwise can't afford to come. 
So if you have no time but some\nmoney you can help to encourage coding contributors to care for PyPy.\nIf we end up with bigger sums we'll see and take suggestions. Money\nspending decisions will be done by core PyPy people according to\nnon-profit guidelines. And we'll post information from time to time\nabout how much we got and where the money went.\nIf you have any questions regarding the SFC membership or donations\nyou may send email to sfc at pypy.org which will be observed\nby Carl Friedrich Bolz, Jacob Hallen and Holger Krekel - the initial\nPyPy SFC representatives on behalf of the PyPy team. Many thanks go\nout to Bradley M. Kuhn for helping to implement the PyPy SFC membership.\ncheers,\nHolger & Carl Friedrich", + "tags": "", + "url": "https://www.pypy.org/posts/2010/11/speeding-up-pypy-by-donations-6035529829962326007.html" + }, + { + "title": "A snake which bites its tail: PyPy JITting itself", + "text": "We have to admit: even if we have been writing for years about the fantastic\nspeedups that the PyPy JIT gives, we, the PyPy developers, still don't use it\nfor our daily routine. Until today :-).\nReaders brave enough to run translate.py to translate PyPy by themselves\nsurely know that the process takes quite a long time to complete, about a hour\non super-fast hardware and even more on average computers. Unfortunately, it\nhappened that translate.py was a bad match for our JIT and thus ran much\nslower on PyPy than on CPython.\nOne of the main reasons is that the PyPy translation toolchain makes heavy use\nof custom metaclasses, and until few weeks ago metaclasses disabled some of\nthe central optimizations which make PyPy so fast. 
During the recent\nD\u00fcsseldorf sprint, Armin and Carl Friedrich fixed this problem and\nre-enabled all the optimizations even in presence of metaclasses.\nSo, today we decided that it was time to benchmark again PyPy against itself.\nFirst, we tried to translate PyPy using CPython as usual, with the following\ncommand line (on a machine with an \"Intel(R) Xeon(R) CPU W3580 @ 3.33GHz\" and\n12 GB of RAM, running a 32-bit Ubuntu):\n\n$ python ./translate.py -Ojit targetpypystandalone --no-allworkingmodules\n\n... lots of output, fractals included ...\n\n[Timer] Timings:\n[Timer] annotate --- 252.0 s\n[Timer] rtype_lltype --- 199.3 s\n[Timer] pyjitpl_lltype --- 565.2 s\n[Timer] backendopt_lltype --- 217.4 s\n[Timer] stackcheckinsertion_lltype --- 26.8 s\n[Timer] database_c --- 234.4 s\n[Timer] source_c --- 480.7 s\n[Timer] compile_c --- 258.4 s\n[Timer] ===========================================\n[Timer] Total: --- 2234.2 s\n\nThen, we tried the same command line with PyPy (SVN revision 78903, x86-32 JIT\nbackend, downloaded from the nightly build page):\n\n$ pypy-c-78903 ./translate.py -Ojit targetpypystandalone --no-allworkingmodules\n\n... 
lots of output, fractals included ...\n\n[Timer] Timings:\n[Timer] annotate --- 165.3 s\n[Timer] rtype_lltype --- 121.9 s\n[Timer] pyjitpl_lltype --- 224.0 s\n[Timer] backendopt_lltype --- 72.1 s\n[Timer] stackcheckinsertion_lltype --- 7.0 s\n[Timer] database_c --- 104.4 s\n[Timer] source_c --- 167.9 s\n[Timer] compile_c --- 320.3 s\n[Timer] ===========================================\n[Timer] Total: --- 1182.8 s\n\nYes, it's not a typo: PyPy is almost two times faster than CPython!\nMoreover, we can see that PyPy is faster in each of the individual steps apart from\ncompile_c, which consists in just a call to make to invoke gcc.\nThe slowdown comes from the fact that the Makefile also contains a lot of\ncalls to the trackgcroot.py script, which happens to perform badly on PyPy\nbut we did not investigate why yet.\nHowever, there is also a drawback: on this specific benchmark, PyPy consumes\nmuch more memory than CPython. The reason why the command line above contains\n--no-allworkingmodules is that if we include all the modules the\ntranslation crashes when it's complete at 99% because it consumes all the 4GB\nof memory which is addressable by a 32-bit process.\nA partial explanation is that so far the assembler generated by the PyPy JIT\nis immortal, and the memory allocated for it is never reclaimed. This is\nclearly bad for a program like translate.py which is divided into several\nindependent steps, and for which most of the code generated in each step could\nsafely be thrown away when it's completed.\nIf we switch to 64-bit we can address the whole 12 GB of RAM that we have, and\nthus translating with all working modules is no longer an issue. 
This is the\ntime taken with CPython (note that it does not make sense to compare with the\n32-bit CPython translation above, because that one does not include all the\nmodules):\n\n$ python ./translate.py -Ojit\n\n[Timer] Timings:\n[Timer] annotate --- 782.7 s\n[Timer] rtype_lltype --- 445.2 s\n[Timer] pyjitpl_lltype --- 955.8 s\n[Timer] backendopt_lltype --- 457.0 s\n[Timer] stackcheckinsertion_lltype --- 63.0 s\n[Timer] database_c --- 505.0 s\n[Timer] source_c --- 939.4 s\n[Timer] compile_c --- 465.1 s\n[Timer] ===========================================\n[Timer] Total: --- 4613.2 s\n\nAnd this is for PyPy:\n\n$ pypy-c-78924-64 ./translate.py -Ojit\n\n[Timer] Timings:\n[Timer] annotate --- 505.8 s\n[Timer] rtype_lltype --- 279.4 s\n[Timer] pyjitpl_lltype --- 338.2 s\n[Timer] backendopt_lltype --- 125.1 s\n[Timer] stackcheckinsertion_lltype --- 21.7 s\n[Timer] database_c --- 187.9 s\n[Timer] source_c --- 298.8 s\n[Timer] compile_c --- 650.7 s\n[Timer] ===========================================\n[Timer] Total: --- 2407.6 s\n\nThe results are comparable with the 32-bit case: PyPy is still almost 2 times\nfaster than CPython. And it also shows that our 64-bit JIT backend is as good\nas the 32-bit one. Again, the drawback is in the consumed memory: CPython\nused 2.3 GB while PyPy took 8.3 GB.\nOverall, the results are impressive: we knew that PyPy can be good at\noptimizing small benchmarks and even middle-sized programs, but as far as we\nknow this is the first example in which it heavily optimizes a huge, real world\napplication. 
And, believe us, the PyPy translation toolchain is complex\nenough to contain all kinds of dirty tricks and black magic that make Python\nlovable and hard to optimize :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2010/11/snake-which-bites-its-tail-pypy-jitting-5161284681004717142.html" + }, + { + "title": "D\u00fcsseldorf Sprint Report 2010", + "text": "This year's installment of the yearly PyPy D\u00fcsseldorf Sprint is drawing to a\nclose. As usual, we worked in the seminar room of the programming language\ngroup at the University of D\u00fcsseldorf. The sprint was different from previous\nones in that we had fewer people than usual and many actually live in\nD\u00fcsseldorf all the time.\nDavid spent the sprint working on the arm-backend branch, which is adding an\nARM backend to the JIT. With the help of Armin he added support for bridges in\nthe JIT and generally implemented missing operations, mostly for handling integers so far.\nRonny and Anto worked the whole week trying to come up with a scheme for\nimporting PyPy's SVN history into a mercurial repository without losing too\nmuch information. This is a non-trivial task, because PyPy's history is gnarly.\nWe are nearly at revision 79000 and when we started using it, Subversion was at\nversion 0.1. All possible and impossible ways to mangle and mistreat a\nSubversion repository have been applied to PyPy's repo, so most of the\nimporting tools just give up. Ronny and Anto came up with a new plan and new\nhelper scripts every day, only to then discover another corner case that they\nhadn't thought of. Now they might actually have a final plan (but they said\nthat every day, so who knows?).\nThe branch history of PyPy's repository (every box is a branch)\nCarl Friedrich and Lukas started working in earnest on memory benchmarks to\nunderstand the memory behaviour of Python code better. 
They have now\nimplemented a generic memory benchmark runner and a simple analysis that walks\nall objects and collects size information about them. They also added some\nbenchmarks that were proposed in the comments of the recent call for\nbenchmarks. As soon as some results from that work are there, we will post\nabout them.\nThere were also some minor tasks performed during the sprint. Armin implemented\nthe _bisect module and the dict.popitem method in RPython. Armin and\nCarl Friedrich made the new memory-saving mapdict implementation more suitable\nto use without the JIT (blog post should come about that too, at some point).\nThey also made classes with custom metaclasses a lot faster when the JIT is\nused.\nThe last three days of the sprint were spent working on H\u00e5kan's\njit-unroll-loops branch. The branch is meant to move loop invariants out of\nthe loop, using techniques very similar to what is described in the recent post\non escape analysis across loop boundaries (see? it will soon stop being\nscience-fiction). Some of the ideas of this approach also come from LuaJIT\nwhich also uses very aggressive loop invariant code motion in its optimizers.\nMoving loop invariants outside of the loop is very useful, because many of the\nlookups that Python programs do in loops are loop invariants. An example is if\nyou call a function in a loop: The global lookup can often be done only once.\nThis branch fundamentally changes some of the core assumptions of the JIT, so\nit is a huge amount of work to make it fit with all the other parts and to\nadapt all tests. That work is now nearly done, some failing tests remain. 
The\nnext steps are to fix them and then do additional tests with the translated\nexecutable and look at the benchmarks.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/10/dusseldorf-sprint-report-2010-371223200425847723.html" + }, + { + "title": "The peace of green", + "text": "No, we are not going to talk about the environment (i.e., the set of variables\nas printed by /usr/bin/env. What else? :-)).\nAfter months in which we had a couple of tests failing every day, we finally\nmanaged to turn (almost) everything green today, at least on Linux. Enjoy\nthis screenshot taken from the nightly build page:\n\n\n\n\nAs usual, the full buildbot results can be seen from the summary page.\ncheers,\nAnto", + "tags": "", + "url": "https://www.pypy.org/posts/2010/10/peace-of-green-4230271053903469504.html" + }, + { + "title": "PhD Thesis about PyPy's CLI JIT Backend", + "text": "Hi all,\na few months ago I finished the PhD studies and now my thesis is available,\njust in case someone does not have anything better to do than read it :-).\nThe title of the thesis is High performance implementation of Python for\nCLI/.NET with JIT compiler generation for dynamic languages, and it's mainly\nbased on my work on the CLI backend for the PyPy JIT (note that the CLI JIT\nbackend is currently broken on trunk, but it's still working in the cli-jit\nbranch).\nThe thesis might be useful also for people that are not directly interested in\nthe CLI JIT backend, as it also contains general information about the inner\nworkings of PyPy which are independent from the backend: in particular,\nchapters 5 and 6 explain how the JIT frontend works.\n\nHere is the summary of chapters:\n\nIntroduction\nThe problem\nEnter PyPy\nCharacterization of the target platform\nTracing JITs in a nutshell\nThe PyPy JIT compiler generator\nThe CLI JIT backend\nBenchmarks\nConclusion and Future Work\n\n\n\ncheers,\nAnto", + "tags": "", + "url": 
"https://www.pypy.org/posts/2010/10/phd-thesis-about-pypys-cli-jit-backend-969267841095296323.html" + }, + { + "title": "Next PyPy sprint", + "text": "Hi all,\n\nThe next PyPy sprint is scheduled for the end of the month, from the 25th to the 31st of October 2010. It will be done at the university of D\u00fcsseldorf, Germany, where three of us are working.\n\nPlease see this link for more information.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/10/next-pypy-sprint-4850394963147107623.html" + }, + { + "title": "PyPy in Google's Summer of Code 2010", + "text": "Hello.\nThis year we had a record of two and a half applications (one was on a cross\nsection of PyPy and numpy) accepted for the Google\nSoC program. Since it ended a couple of weeks ago, we wanted to present the results that\nwere achieved. All three projects were completed successfully, although the rate\nof success varied quite a bit.\nThe Numpy proposal progress significantly on making numpy compatible with\nPyPy's CPython's extension module support, but failed to bring PyPy's numpy\nimplementation into a usable shape (which is a somewhat ambitious goal, one\nmight argue). The experiments done during the projects are living on the\nmicronumpy branch.\nThe Fast ctypes proposal did some useful experiments on how to JIT external\ncalls from PyPy to C, however, the actual code as of now is not very\ninteresting and it's quite far from providing a full ctypes replacement (or\nequivalent).\nDefinitely the most successful proposal was a 64bit (x86_64) backend for PyPy's\nJIT. It not only includes working 64bit JIT (merged into PyPy trunk), but also\na working asmgcc for x86_64 linux platform, that makes it possible to run the JIT\non this architecture with our advanced garbage collectors. One can say that\nx64_64 is now no longer a second-class citizen for PyPy, although it definitely\ndidn't receive as much testing as the x86 platform. 
Expect this to be a major\nselling point for the next PyPy release :-)\nCheers,\nfijal & the PyPy team", + "tags": "", + "url": "https://www.pypy.org/posts/2010/09/pypy-in-googles-summer-of-code-2010-1267220161643618015.html" + }, + { + "title": "Using Escape Analysis Across Loop Boundaries for Specialization", + "text": "This blog post is a successor to the one about escape analysis in PyPy's\nJIT. The examples from there will be continued here. This post is a bit\nscience-fictiony. The algorithm that PyPy currently uses is significantly more\ncomplex and much harder than the one that is described here. The resulting\nbehaviour is very similar, however, so we will use the simpler version (and we\nmight switch to that at some point in the actual implementation).\nIn the last blog post we described how escape analysis can be used to remove\nmany of the allocations of short-lived objects and many of the type dispatches\nthat are present in a non-optimized trace. In this post we will improve the\noptimization to also handle more cases.\nTo understand some more what the optimization described in the last blog post\ncan achieve, look at the following figure:\n\n\n\nThe figure shows a trace before optimization, together with the lifetime of\nvarious kinds of objects created in the trace. It is executed from top to\nbottom. At the bottom, a jump is used to execute the same loop another time.\nFor clarity, the figure shows two iterations of the loop.\nThe loop is executed until one of the guards in the trace fails, and the\nexecution is aborted.\nSome of the operations within this trace are new operations, which each create a\nnew instance of some class. These instances are used for a while, e.g. by\ncalling methods on them, reading and writing their fields. 
Some of these\ninstances escape, which means that they are stored in some globally accessible\nplace or are passed into a function.\nTogether with the new operations, the figure shows the lifetimes of the\ncreated objects. Objects in category 1 live for a while, and are then just not\nused any more. The creation of these objects is removed by the\noptimization described in the last blog post.\nObjects in category 2 live for a while and then escape. The optimization of the\nlast post deals with them too: the new that creates them and\nthe field accesses are deferred, until the point where the object escapes.\nThe objects in category 3 and 4 are in principle like the objects in category 1\nand 2. They are created, live for a while, but are then passed as an argument\nto the jump operation. In the next iteration they can either die (category\n3) or escape (category 4).\nThe optimization of the last post considered the passing of an object along a\njump to be equivalent to escaping. It was thus treating objects in category 3\nand 4 like those in category 2.\nThe improved optimization described in this post will make it possible to deal\nbetter with objects in category 3 and 4. This will have two consequences: on\nthe one hand, more allocations are removed from the trace (which is clearly\ngood). As a side-effect of this, the traces will also be type-specialized.\n\nOptimizing Across the Jump\nLet's look at the final trace obtained in the last post for the example loop.\nThe final trace was much better than the original one, because many allocations\nwere removed from it. However, it also still contained allocations:\n\n\n\nThe two new BoxedIntegers stored in p15 and p10 are passed into\nthe next iteration of the loop. The next iteration will check that they are\nindeed BoxedIntegers, read their intval fields and then not use them\nany more. 
Thus those instances are in category 3.\nIn its current state the loop\nallocates two BoxedIntegers at the end of every iteration, that then die\nvery quickly in the next iteration. In addition, the type checks at the start\nof the loop are superfluous, at least after the first iteration.\nThe reason why we cannot optimize the remaining allocations away is because\ntheir lifetime crosses the jump. To improve the situation, a little trick is\nneeded. The trace above represents a loop, i.e. the jump at the end jumps to\nthe beginning. Where in the loop the jump occurs is arbitrary, since the loop\ncan only be left via failing guards anyway. Therefore it does not change the\nsemantics of the loop to put the jump at another point into the trace and we\ncan move the jump operation just above the allocation of the objects that\nappear in the current jump. This needs some care, because the arguments to\njump are all currently live variables, thus they need to be adapted.\nIf we do that for our example trace above, the trace looks like this:\n\n\n\nNow the lifetime of the remaining allocations no longer crosses the jump, and\nwe can run our escape analysis a second time, to get the following trace:\n\n\n\nThis result is now really good. The code performs the same operations as\nthe original code, but using direct CPU arithmetic and no boxing, as opposed to\nthe original version which used dynamic dispatching and boxing.\nLooking at the final trace it is also completely clear that specialization has\nhappened. The trace corresponds to the situation in which the trace was\noriginally recorded, which happened to be a loop where BoxedIntegers were\nused. The now resulting loop does not refer to the BoxedInteger class at\nall any more, but it still has the same behaviour. 
If the original loop had\nused BoxedFloats, the final loop would use float_* operations\neverywhere instead (or even be very different, if the object model had\nuser-defined classes).\n\n\nEntering the Loop\nThe approach of placing the jump at some other point in the loop leads to\none additional complication that we glossed over so far. The beginning of the\noriginal loop corresponds to a point in the original program, namely the\nwhile loop in the function f from the last post.\nNow recall that in a VM that uses a tracing JIT, all programs start by being\ninterpreted. This means that when f is executed by the interpreter, it is\neasy to go from the interpreter to the first version of the compiled loop.\nAfter the jump is moved and the escape analysis optimization is applied a\nsecond time, this is no longer easily possible. In particular, the new loop\nexpects two integers as input arguments, while the old one expected two\ninstances.\nTo make it possible to enter the loop directly from the interpreter, there\nneeds to be some additional code that enters the loop by taking as input\narguments what is available to the interpreter, i.e. two instances. This\nadditional code corresponds to one iteration of the loop, which is thus\npeeled off:\n\n\n\n\n\nSummary\nThe optimization described in this post can be used to optimize away\nallocations in category 3 and improve allocations in category 4, by deferring\nthem until they are no longer avoidable. A side-effect of these optimizations\nis also that the optimized loops are specialized for the types of the variables\nthat are used inside them.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/09/using-escape-analysis-across-loop-2887031293132023676.html" + }, + { + "title": "Escape Analysis in PyPy's JIT", + "text": "The goal of a just-in-time compiler for a dynamic language is obviously to\nimprove the speed of the language over an implementation of the language that\nuses interpretation. 
The first goal of a JIT is thus to remove the\ninterpretation overhead, i.e. the overhead of bytecode (or AST) dispatch and the\noverhead of the interpreter's data structures, such as operand stack etc. The\nsecond important problem that any JIT for a dynamic language needs to solve is\nhow to deal with the overhead of boxing of primitive types and of type\ndispatching. Those are problems that are usually not present in statically typed\nlanguages.\nBoxing of primitive types means that dynamic languages need to be able to handle\nall objects, even integers, floats, etc. in the same way as user-defined\ninstances. Thus those primitive types are usually boxed, i.e. a small\nheap-structure is allocated for them, that contains the actual value.\nType dispatching is the process of finding the concrete implementation that is\napplicable to the objects at hand when doing a generic operation. An\nexample would be the addition of two objects: The addition needs to check what\nthe concrete objects that should be added are, and choose the implementation\nthat is fitting for them.\nLast year, we wrote a blog post and a paper about how PyPy's meta-JIT\napproach works. These explain how the meta-tracing JIT can remove the overhead\nof bytecode dispatch. In this post (and probably a followup) we want to explain\nhow the traces that are produced by our meta-tracing JIT are then optimized to\nalso remove some of the overhead more closely associated to dynamic languages,\nsuch as boxing overhead and type dispatching. The most important technique to\nachieve this is a form of escape analysis that we call virtual objects.\nThis is best explained via an example.\n\nRunning Example\nFor the purpose of this blog post, we are going to use a very simple object\nmodel, that just supports an integer and a float type. 
The objects support only\ntwo operations, add, which adds two objects (promoting ints to floats in a\nmixed addition) and is_positive, which returns whether the number is greater\nthan zero. The implementation of add uses classical Smalltalk-like\ndouble-dispatching. These classes could be part of the implementation of a very\nsimple interpreter written in RPython.\nclass Base(object):\n def add(self, other):\n \"\"\" add self to other \"\"\"\n raise NotImplementedError(\"abstract base\")\n def add__int(self, intother):\n \"\"\" add intother to self, where intother is a Python integer \"\"\"\n raise NotImplementedError(\"abstract base\")\n def add__float(self, floatother):\n \"\"\" add floatother to self, where floatother is a Python float \"\"\"\n raise NotImplementedError(\"abstract base\")\n def is_positive(self):\n \"\"\" returns whether self is positive \"\"\"\n raise NotImplementedError(\"abstract base\")\n\nclass BoxedInteger(Base):\n def __init__(self, intval):\n self.intval = intval\n def add(self, other):\n return other.add__int(self.intval)\n def add__int(self, intother):\n return BoxedInteger(intother + self.intval)\n def add__float(self, floatother):\n return BoxedFloat(floatother + float(self.intval))\n def is_positive(self):\n return self.intval > 0\n\nclass BoxedFloat(Base):\n def __init__(self, floatval):\n self.floatval = floatval\n def add(self, other):\n return other.add__float(self.floatval)\n def add__int(self, intother):\n return BoxedFloat(float(intother) + self.floatval)\n def add__float(self, floatother):\n return BoxedFloat(floatother + self.floatval)\n def is_positive(self):\n return self.floatval > 0.0\n\nUsing these classes to implement arithmetic shows the basic problem that a\ndynamic language implementation has. All the numbers are instances of either\nBoxedInteger or BoxedFloat, thus they consume space on the heap. 
Performing many\narithmetic operations produces lots of garbage quickly, thus putting pressure on\nthe garbage collector. Using double dispatching to implement the numeric tower\nneeds two method calls per arithmetic operation, which is costly due to the\nmethod dispatch.\nTo understand the problems more directly, let us consider a simple function that\nuses the object model:\ndef f(y):\n res = BoxedInteger(0)\n while y.is_positive():\n res = res.add(y).add(BoxedInteger(-100))\n y = y.add(BoxedInteger(-1))\n return res\n\nThe loop iterates y times, and computes something in the process. To\nunderstand the reason why executing this function is slow, here is the trace\nthat is produced by the tracing JIT when executing the function with y\nbeing a BoxedInteger:\n\n# arguments to the trace: p0, p1\n# inside f: res.add(y)\nguard_class(p1, BoxedInteger)\n # inside BoxedInteger.add\n i2 = getfield_gc(p1, intval)\n guard_class(p0, BoxedInteger)\n # inside BoxedInteger.add__int\n i3 = getfield_gc(p0, intval)\n i4 = int_add(i2, i3)\n p5 = new(BoxedInteger)\n # inside BoxedInteger.__init__\n setfield_gc(p5, i4, intval)\n# inside f: BoxedInteger(-100)\np6 = new(BoxedInteger)\n # inside BoxedInteger.__init__\n setfield_gc(p6, -100, intval)\n\n# inside f: .add(BoxedInteger(-100))\nguard_class(p5, BoxedInteger)\n # inside BoxedInteger.add\n i7 = getfield_gc(p5, intval)\n guard_class(p6, BoxedInteger)\n # inside BoxedInteger.add__int\n i8 = getfield_gc(p6, intval)\n i9 = int_add(i7, i8)\n p10 = new(BoxedInteger)\n # inside BoxedInteger.__init__\n setfield_gc(p10, i9, intval)\n\n# inside f: BoxedInteger(-1)\np11 = new(BoxedInteger)\n # inside BoxedInteger.__init__\n setfield_gc(p11, -1, intval)\n\n# inside f: y.add(BoxedInteger(-1))\nguard_class(p0, BoxedInteger)\n # inside BoxedInteger.add\n i12 = getfield_gc(p0, intval)\n guard_class(p11, BoxedInteger)\n # inside BoxedInteger.add__int\n i13 = getfield_gc(p11, intval)\n i14 = int_add(i12, i13)\n p15 = new(BoxedInteger)\n # inside 
BoxedInteger.__init__\n setfield_gc(p15, i14, intval)\n\n# inside f: y.is_positive()\nguard_class(p15, BoxedInteger)\n # inside BoxedInteger.is_positive\n i16 = getfield_gc(p15, intval)\n i17 = int_gt(i16, 0)\n# inside f\nguard_true(i17)\njump(p15, p10)\n\n(indentation corresponds to the stack level of the traced functions).\nThe trace is inefficient for a couple of reasons. One problem is that it checks\nrepeatedly and redundantly for the class of the objects around, using a\nguard_class instruction. In addition, some new BoxedInteger instances are\nconstructed using the new operation, only to be used once and then forgotten\na bit later. In the next section, we will see how this can be improved upon,\nusing escape analysis.\n\n\nVirtual Objects\nThe main insight to improve the code shown in the last section is that some of\nthe objects created in the trace using a new operation don't survive very\nlong and are collected by the garbage collector soon after their allocation.\nMoreover, they are used only inside the loop, thus we can easily prove that\nnobody else in the program stores a reference to them. The\nidea for improving the code is thus to analyze which objects never escape the\nloop and may thus not be allocated at all.\nThis process is called escape analysis. The escape analysis of\nour tracing JIT works by using virtual objects: The trace is walked from\nbeginning to end and whenever a new operation is seen, the operation is\nremoved and a virtual object is constructed. The virtual object summarizes the\nshape of the object that is allocated at this position in the original trace,\nand is used by the escape analysis to improve the trace. The shape describes\nwhere the values that would be stored in the fields of the allocated objects\ncome from. 
Whenever the optimizer sees a setfield that writes into a virtual\nobject, that shape summary is thus updated and the operation can be removed.\nWhen the optimizer encounters a getfield from a virtual, the result is read\nfrom the virtual object, and the operation is also removed.\nIn the example from the last section, the following operations would produce two\nvirtual objects, and be completely removed from the optimized trace:\n\np5 = new(BoxedInteger)\nsetfield_gc(p5, i4, intval)\np6 = new(BoxedInteger)\nsetfield_gc(p6, -100, intval)\n\nThe virtual object stored in p5 would know that it is a BoxedInteger, and that\nthe intval field contains i4, the one stored in p6 would know that\nits intval field contains the constant -100.\nThe following operations, that use p5 and p6 could then be\noptimized using that knowledge:\n\nguard_class(p5, BoxedInteger)\ni7 = getfield_gc(p5, intval)\n# inside BoxedInteger.add\nguard_class(p6, BoxedInteger)\n# inside BoxedInteger.add__int\ni8 = getfield_gc(p6, intval)\ni9 = int_add(i7, i8)\n\nThe guard_class operations can be removed, because the classes of p5 and\np6 are known to be BoxedInteger. The getfield_gc operations can be removed\nand i7 and i8 are just replaced by i4 and -100. Thus the only\nremaining operation in the optimized trace would be:\n\ni9 = int_add(i4, -100)\n\nThe rest of the trace is optimized similarly.\nSo far we have only described what happens when virtual objects are used in\noperations that read and write their fields. When the virtual object is used in\nany other operation, it cannot stay virtual. For example, when a virtual object\nis stored in a globally accessible place, the object needs to actually be\nallocated, as it will live longer than one iteration of the loop.\nThis is what happens at the end of the trace above, when the jump operation\nis hit. The arguments of the jump are at this point virtual objects. Before the\njump is emitted, they are forced. 
This means that the optimizer produces code\nthat allocates a new object of the right type and sets its fields to the field\nvalues that the virtual object has. This means that instead of the jump, the\nfollowing operations are emitted:\n\np15 = new(BoxedInteger)\nsetfield_gc(p15, i14, intval)\np10 = new(BoxedInteger)\nsetfield_gc(p10, i9, intval)\njump(p15, p10)\n\nNote how the operations for creating these two instances have been moved down the\ntrace. It looks like for these operations we actually didn't win much, because\nthe objects are still allocated at the end. However, the optimization was still\nworthwhile even in this case, because some operations that have been performed\non the forced virtual objects have been removed (some getfield_gc operations\nand guard_class operations).\nThe final optimized trace of the example looks like this:\n\n# arguments to the trace: p0, p1\nguard_class(p1, BoxedInteger)\ni2 = getfield_gc(p1, intval)\nguard_class(p0, BoxedInteger)\ni3 = getfield_gc(p0, intval)\ni4 = int_add(i2, i3)\ni9 = int_add(i4, -100)\n\nguard_class(p0, BoxedInteger)\ni12 = getfield_gc(p0, intval)\ni14 = int_add(i12, -1)\n\ni17 = int_gt(i14, 0)\nguard_true(i17)\np15 = new(BoxedInteger)\nsetfield_gc(p15, i14, intval)\np10 = new(BoxedInteger)\nsetfield_gc(p10, i9, intval)\njump(p15, p10)\n\nThe optimized trace contains only two allocations, instead of the original five,\nand only three guard_class operations, from the original seven.\n\n\nSummary\nIn this blog post we described how simple escape analysis within the scope of\none loop works. This optimization reduces the allocation of many intermediate\ndata structures that become garbage quickly in an interpreter. It also removes a\nlot of the type dispatching overhead. 
In a later post, we will explain how this\noptimization can be improved further.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/09/escape-analysis-in-pypys-jit-1780048403046080197.html" + }, + { + "title": "EuroPython 2010 Videos available", + "text": "Hi all,\nthe videos of the talks from EuroPython 2010 are now available on\nblip.tv: in particular, there are the three videos of the PyPy talk.\nPart 1: What's news in PyPy 1.2 and 1.3 (by Antonio Cuni)\nPart 2: Just in Time compilation (by Armin Rigo)\nPart 3: cpyext (by Amaury Forgeot d'Arc)\nMoreover, here is Mark Shannon's talk which compares HotPy, Unladen Swallow\nand PyPy:", + "tags": "", + "url": "https://www.pypy.org/posts/2010/08/europython-2010-videos-available-8446190660370796142.html" + }, + { + "title": "Call for Benchmarks", + "text": "As you know, a lot of PyPy's recent development effort has gone into speeding up\nexecution of Python programs. However, an additional good property of PyPy's\nPython interpreter is that most objects are represented in a much more compact\nway than in CPython. We would like to investigate some more advanced techniques\nto reduce the memory usage of Python programs further.\nTo do this it is necessary to investigate the memory behaviour of real programs\nwith large heaps. For speed measurements there are standard benchmarks, but for\nmemory improvements there is nothing comparable, the memory behaviour of large\nprograms is not that well understood. 
Therefore we are looking for programs that we\ncan study and use as benchmarks.\nSpecifically we are looking for Python programs with the following properties:\n\nlarge heaps of about 10MB-1GB\nshould have non-trivial runtime as well (in the range of a few seconds), to\njudge the speed impact of optimizations\nideally pure-Python programs that don't use extension modules so that they run\nunder both CPython and PyPy (this is optional, but makes my life much easier).\n\nWe are also rather interested in programs that do a lot of string/unicode\nprocessing.\nWe would be grateful for all ideas. Telling us about a program also has the\nadvantage that we will work on optimizing PyPy for it :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2010/08/call-for-benchmarks-2605012131351543912.html" + }, + { + "title": "PyOhio", + "text": "This weekend I delivered a talk at PyOhio (an annual conference in Columbus, OH, USA) on PyPy and Unladen Swallow. The talk covered reasons that Python, the language, is hard to optimize, why CPython is slow, and a few optimizations that PyPy and Unladen Swallow have implemented. The slides from my talk are online, and the talk was recorded so a video will follow. I gave a similar talk to ChiPy (the Chicago Python user group), which was also recorded and the video is available. Both audiences were excited about the futures for PyPy and Unladen Swallow, and for the future of a faster Python.\nAlex", + "tags": "", + "url": "https://www.pypy.org/posts/2010/08/pyohio-2568618480482575546.html" + }, + { + "title": "Using virtualenv with PyPy", + "text": "Thanks to the work that was recently done on the sys-prefix branch, it is\nnow possible to use virtualenv with PyPy.\nTo try it, you need:\n\n\na recent version of PyPy: PyPy 1.3 does not contain the necessary logic to\nwork with virtualenv, so you need a more recent PyPy from subversion\ntrunk. 
You can either build it by yourself or download one of our\nprecompiled nightly builds\na copy of virtualenv-pypy: this is a fork of virtualenv that contains\nall the patches needed to work with PyPy, and hopefully will be merged\nback at some point. It should be totally compatible with the official\nversion of virtualenv, so it is safe to use it even to create non-PyPy\nenvironments. If you notice some weird behavior that does not happen with\nthe standard virtualenv, please let us know.\n\n\nThe directory layout has been redesigned in a way that it is possible to use\nvirtualenv to install a PyPy both from a precompiled tarball or from an svn\ncheckout:\n\n# from a tarball\n$ virtualenv -p /opt/pypy-c-jit-76426-linux/bin/pypy my-pypy-env\n\n# from the svn checkout\n$ virtualenv -p /path/to/pypy-trunk/pypy/translator/goal/pypy-c my-pypy-env\n\nOnce the environment has been created, you can enter it as usual. Note that\nbin/python is now a symlink to bin/pypy.\nEnjoy it :-)", + "tags": "", + "url": "https://www.pypy.org/posts/2010/08/using-virtualenv-with-pypy-7238942727709530503.html" + }, + { + "title": "A Play on Regular Expression", + "text": "The paper where the algorithms we described in the recent blog posts come from is now available. It is written as a play in three Acts with a cast of three and is very readable and funny. The Haskell code is at Sebastian Fischer's github pages.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/07/play-on-regular-expression-9014941705636345998.html" + }, + { + "title": "EuroPython 2010 report", + "text": "So, EuroPython 2010 is over, I am flying home and it's time to write a report\nabout the conference from the PyPy point of view.\nAs usual, the conference was very interesting and went very well. 
The quality\nof the talks I attended was high on average and most importantly I could\nmeet a lot of interesting people to discuss various things.\nOn the first day, Armin, Amaury and I presented the usual PyPy status talk\n(here are the slides):\nthe talk is an extended version of the one that I and Armin presented at\nPycon Italia in May and is divided in three parts: first I talked about the\ncurrent status of the project, what is the content of the recent 1.2 and 1.3\nreleases and showed a demo of a simple Django application that renders a\nMandelbrot fractal and is measurably faster on PyPy than on CPython. In the\nsecond part of the talk, Armin gave an introduction about the ideas that stand\nbehind the JIT. Finally, in the third part Amaury explained how the new\ncpyext module lets PyPy compile and load existing CPython extensions\nwritten in C.\nI think that the talk was well received: the only drawback is that there was\nno time to answer questions at the end of the presentation. However, we\nreceived a lot of \"offline\" questions after the talk finished and throughout the\nwhole conference: it is always great to see that people are interested in our\nwork, and I'd like to thank everybody for the feedback that they gave to us.\nPyPy was also mentioned in the interesting Mark Shannon's talk, where he\ncompared the optimization techniques used by PyPy, Unladen Swallow and\nHotPy, which is Mark's own PhD project. Moreover, Henrik Vendelbo\ngave a talk about how to tweak PyPy to produce a standalone\nexecutable which embeds a whole python application to make deployment easier,\nwhile Andrew Francis explained his implementation of the Go select\nstatement based on the stackless.py module implemented in PyPy. 
Personally,\nI am glad to see that people start to think of PyPy as a useful starting\npoint to experiment with new features and use cases that we did not think\nabout: after all, one of PyPy's explicit goals is to be \"flexible and easy to\nexperiment with\".\nAfter the conference there were the usual post EuroPython sprints: this\nyear we had not planned a PyPy sprint, but some people showed interest\nin it and since Armin and I happened to be still around the day after the\nconference, we decided to do a mini 1-day sprint, with 6 or 7 people\npresent. Since there were only two core developers it was impossible to use\nour usual pairing scheme, in which every newcomer pairs with someone who is\nexperienced with the source code to gain knowledge of it. However, I think it\nwas still a successful day of work, and we managed to fix a couple of bugs\nthat were standing in our issue tracker. Again, I'd like to thank all the\npeople that came and worked with us during the sprint.\nIn conclusion I really enjoyed the EuroPython 2010 experience: the fact that I\nmanaged to find a place in Birmingham where to eat a good Italian-style \"gelato\"\nhelped a lot :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2010/07/europython-2010-report-7803731360759120212.html" + }, + { + "title": "CERN Sprint Report \u2013 Wrapping C++ Libraries", + "text": "The last five days we have been sprinting in a meeting room in the Computing\nCenter at CERN in Gen\u00e8ve, Switzerland. Present are Armin Rigo, Antonio Cuni,\nCarl Friedrich Bolz and Wim Lavrijsen (LBL). The goal of the sprint was to use\nsome of the C++ technology developed at CERN to make it possible to use C++\nlibraries from PyPy's Python interpreter. For this we used the Reflex\nlibrary, which provides reflection information for C++ classes. We discussed\nusing Reflex in PyPy during the D\u00fcsseldorf sprint of 2008, please read\nthat blog post if you want some more details on how Reflex works. 
There is\nsupport for this sort of C++/Python integration also for CPython, using the\nPyROOT module.\nThe sprint was very successful. On Monday we had a few discussions about how\nReflex could best be integrated with PyPy. One of the goals of the sprint was to\nmake the approach JIT-friendly from the start, so that calls to C++ libraries\ncan be reasonably fast. After the discussion we started coding on the\nreflex-support branch. This branch adds a new cppyy builtin module to\nPyPy's Python interpreter (why we chose that name is left as an exercise to the\nreader). This module can be used to load C++ classes, construct instances and\ncall static and instance methods on them.\nThe work has just started, as of now, the argument and return types of the\nmethods are restricted to some simple C types, such as int, double and\nchar* and pointers to class instances. Most of the work necessary to\nproperly resolve overloaded methods is done, but default arguments are not.\nAs an example, suppose there is a C++ class like this:\nclass example01 {\nprivate:\n static int count;\n int somedata;\npublic:\n\n example01(int a) : somedata(a) {\n count++;\n }\n ~example01() {\n count--;\n }\n static int getCount() {\n return count;\n }\n\n int addDataToInt(int a) {\n return somedata + a;\n }\n};\nint example01::count = 0;\n\nYou can now use it from PyPy's Python interpreter in the following way, after\nyou have used Reflex to generate reflection information for the class:\nimport cppyy\ncppyy.load_lib(\"example01Dict.so\") # contains the Reflex information\nexample01_class = cppyy.gbl.example01\ninstance = example01_class(7)\nassert example01_class.getCount() == 1\nres = instance.addDataToInt(4)\nassert res == 11\nres = instance.addDataToInt(-4)\nassert res == 3\ninstance.destruct() # so far explicit destruction needed\nassert example01_class.getCount() == 0\n\nWe also did some very early JIT work and some early performance measurements.\nThe rough figures are that cppyy is two 
times faster at calling a simple C++\nmethod from Python than PyROOT. To get a feeling for how fast things could\ngo in the end, we also implemented a proof-of-concept for some more advanced JIT\ntechnology (which requires a patch for Reflex and uses a GCC extension). With\nthis, the speedup over PyROOT is a factor of 20. Of course, this is still a\nlot slower than a C++ to C++ method call (probably by at least an order of\nmagnitude).\nThe sprint was very productive because we managed to get the right people into\nthe same room working together. Wim has a lot of experience with C++ and Reflex,\nand is the author of PyROOT, and of course the others know a lot about PyPy\n(at the end of the sprint, Anto was very glad that he stopped using C++ a long\ntime ago). Also, working at CERN was very cool. The atmosphere is amazing, and\nwe got to visit the ATLAS control room. Extremely advanced technology, and\nalso research on a completely different scale than what we are used to.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/07/cern-sprint-report-wrapping-c-libraries-6547377950791793143.html" + }, + { + "title": "Comparing SPUR to PyPy", + "text": "Recently, I've become aware of the SPUR project of Microsoft Research and\nread some of their papers (the tech report \"SPUR: A Trace-Based JIT Compiler\nfor CIL\" is very cool). I found the project to be very interesting and since\ntheir approach is in many ways related to what PyPy is doing, I now want to\ncompare and contrast the two projects.\n\nA Tracing JIT for .NET\nSPUR consist of two parts: On the one hand it is a VM for CIL, the\nbytecode of the .NET VM. This VM uses a tracing JIT compiler to compile the\nprograms it is running to machine code. As opposed to most existing VMs that\nhave a tracing JIT it does not use an interpreter at all. Instead it\ncontains various variants of a JIT compiler that produce different versions of\neach method. 
Those are:\n\na profiling JIT which produces code that does lightweight profiling when\nrunning the compiled method\na tracing JIT which produces code that produces a trace when running the\ncompiled method\na transfer-tail JIT which is used to produce code which is run to get from a\nfailing guard back to the normal profiling version of a method\nan optimizing JIT that actually optimizes traces and turns them into machine code\n\n\nOptimizations Done by the Optimizing JIT\nSPUR's optimizing JIT does a number of powerful optimizations on the traces before it\nturns them into machine code. Among them are usual compiler optimizations such\nas register allocation, common subexpression elimination, loop invariant code\nmotion, etc.\nIt also performs some optimizations that are specific to the tracing context and\nare thus not commonly found in \"normal\" compilers:\n\nguard implication: if a guard is implied by an earlier guard, it is removed\nguard strengthening: if there is a sequence of guards that become stronger\nand stronger (i.e. each guard implies the previous one), the first guard in\nthe sequence is replaced by the last one, and all others are removed. This can\ngreatly reduce the number of guards and is generally safe. It can shift a\nguard failure to an earlier point in the trace, but the failure would have\noccurred at some point in the trace anyway.\nload/store optimizations: this is an optimization for memory reads/writes.\nIf several loads from the same memory location occur without writes in\nbetween, all but the first one are removed. Similarly, if a write to a memory\nlocation is performed, this write is delayed as much as possible. If there is\na write to the same location soon afterwards, the first write can be removed.\nescape analysis: for allocations that occur in a loop, the optimizer checks\nwhether the resulting object escapes the loop. 
If not, the allocation is moved\nbefore the loop, so that only one object needs to be allocated, instead of one\nevery loop iteration.\nuser-controlled loop unrolling: not exactly an optimization, but an\ninteresting feature anyway. It is possible to annotate a CIL method with a\nspecial decorator [TraceUnfold] and then the tracing JIT will fully unroll\nthe loops it contains. This can be useful for loops that are known to run a\nsmall and fixed number of iterations for each call-site.\nuser-controlled tracing: The user can also control tracing up to a point.\nMethods can be annotated with [NativeCall] to tell the tracer to never\ntrace their execution. Instead they appear as a direct call in the trace.\n\n\n\n\nA JavaScript Implementation\nIn addition to the tracing JIT I just described, SPUR also contains a JavaScript\nimplementation for .NET. The approach of this implementation is to translate\nJavaScript to CIL bytecode, doing some amount of type inference to detect\nvariables that have fixed types. All operations where no precise type could be\ndetermined are implemented with calls to a JavaScript runtime system, which does\nthe necessary type dispatching. The JavaScript runtime is implemented in C#.\nThe JavaScript implementation and the CLI VM with a tracing JIT sound quite\nunrelated at first, but together they amplify each other. The tracing JIT traces\nthe JavaScript functions that have been translated to CIL bytecode. Since the\nJavaScript runtime is in C#, it exists as CIL bytecode too. Thus it can be\ninlined into the JavaScript functions by the tracer. This is highly beneficial,\nsince it exposes the runtime type dispatching of the JavaScript operations to\nthe optimizations of the tracing JIT. Particularly the common subexpression\nelimination helps the JavaScript code. If a series of operations is performed on\nthe same object, the operations will all do the same type checks. 
All but the\ntype checks of the first operation can be removed by the optimizer.\n\nPerformance Results\nThe speed results of the combined JavaScript implementation and tracing JIT are\nquite impressive. It beats TraceMonkey for most benchmarks in SunSpider (apart\nfrom some string-heavy benchmarks that are quite slow) and can compete with V8\nin many of them. However, all this is steady-state performance and it seems\nSPUR's compile time is rather bad currently.\n\n\nFurther Possibilities\nA further (so far still hypothetical) advantage of SPUR is that the approach can\noptimize cases where execution crosses the border of two different systems. If\nsomebody wrote an HTML layout engine and a DOM in C# to get a web browser and\nintegrated it with the JavaScript implementation described above, the tracing\nJIT could optimize DOM manipulations performed by JavaScript code as well as\ncallbacks from the browser into JavaScript code.\nOf course the approach SPUR takes to implement JavaScript is completely\ngeneralizable. It should be possible to implement other dynamic languages in the\nsame way as JavaScript using SPUR. One would have to write a runtime system for\nthe language in C#, as well as a compiler from the language into CIL bytecode.\nGiven these two elements, SPUR's tracing JIT compiler would probably do a\nreasonable job at optimizing this other language (of course in practise, the\nlanguage implementation would need some tweaking and annotations to make it\nreally fast).\n\n\n\nComparison With PyPy\nThe goals of PyPy and SPUR are very similar. Both projects want to implement\ndynamic languages in an efficient way by using a tracing JIT. Both apply the\ntracing JIT \"one level down\", i.e. the runtime system of the dynamic language is\nvisible to the tracing JIT. This is the crucial point of the approach of both\nprojects. Since the runtime system of the dynamic language is visible to the\ntracing JIT, the JIT can optimize programs in that dynamic language. 
It does not\nitself need to know about the semantics of the dynamic language. This makes the\ntracing JIT usable for a variety of dynamic languages. It also means that the\ntwo halves can be implemented and debugged independently.\nIn SPUR, C# (or another language that is compilable to CIL) plays the role of\nRPython, and CIL is equivalent to the intermediate format that PyPy's\ntranslation toolchain uses. Both formats operate on a similar abstraction level,\nthey are quite close to C, but still have support for the object system of their\nrespective language and are garbage-collected.\nSPUR supports only a JavaScript implementation so far, which could maybe change in\nthe future. Thus JavaScript in SPUR corresponds to Python in PyPy, which was the\nfirst dynamic language implemented in PyPy (and is also the reason for PyPy's\nexistence).\nThere are obviously also differences between the two projects, although many of\nthem are only skin-deep. The largest difference is the reliance of SPUR on\ncompilers on all levels. PyPy takes the opposite approach of using interpreters\nalmost everywhere. The parts of PyPy that correspond to SPUR's compilers are (I\nwill use the Python implementation of PyPy as an example):\n\nthe JavaScript-to-CIL compiler corresponds to the Python interpreter of PyPy\nthe profiling JIT corresponds to a part of PyPy's translation toolchain\nwhich adds some profiling support in the process of turning RPython code into\nC code,\nthe tracing JIT corresponds to a special interpreter in the PyPy JIT which\nexecutes an RPython program and produces a trace of the execution\nthe transfer-tail JIT corresponds to PyPy's blackhole interpreter, also\ncalled fallback interpreter\nthe optimizing JIT corresponds to the optimizers and backends of PyPy's JIT\n\n\nPyPy's Optimizations\nComparing the optimizations that the two projects perform, the biggest\ndifference is that PyPy does \"trace stitching\" instead of fully supporting trace\ntrees. 
The difference between the two concerns what happens when a new trace\ngets added to an existing loop. The new trace starts from a guard in the\nexisting loop that was observed to fail often. Trace stitching means that the\nloop is just patched with a jump to the new trace. SPUR instead recompiles the\nwhole trace tree, which gives the optimizers more opportunities, but also makes\ncompilation a lot slower. Another difference is that PyPy does not perform\nloop-invariant code motion yet.\nMany of the remaining optimizations are very similar. PyPy supports guard\nimplication as well as guard strengthening. It has some load/store\noptimizations, but PyPy's alias analysis is quite rudimentary. On the other\nhand, PyPy's escape analysis is very powerful. PyPy also has support for the\nannotations that SPUR supports, using some decorators in the pypy.rlib.jit\nmodule. User-controlled loop unrolling is performed using the unroll_safe\ndecorator, tracing of a function can be disabled with the dont_look_inside\ndecorator.\nPyPy has a few more annotations that were not mentioned in the SPUR tech report.\nMost importantly, it is possible to declare a function as pure, using the\npurefunction decorator. PyPy's optimizers will remove calls to a function\ndecorated that way if the arguments to the call are all constant. In addition it\nis possible to declare instances of classes to be immutable, which means that\nfield accesses on constant instances can be folded away. Furthermore there is\nthe promote hint, which is spelled x = hint(x, promote=True). This will\nproduce a guard in the trace, to turn x into a constant after the guard.\n\n\n\nSummary\nGiven the similarity between the projects' goals, it is perhaps not so\nsurprising to see that PyPy and SPUR have co-evolved and reached many similar\ndesign decisions. 
It is still very good to see another project that does many\nthings in the same way as PyPy.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/07/comparing-spur-to-pypy-8835011873209414462.html" + }, + { + "title": "\"Blackhole\" interpreter", + "text": "Hi all,\n\nHere are a few words about the JIT's \"great speedup in compiling\ntime\" advertized on the PyPy 1.3 release (see the\n\nprevious blog post).\nThe exact meaning behind these words needs a fair bit of\nexplanation, so here it is in case you are interested.\n\nIf you download a version of PyPy 1.3 that includes a JIT\ncompiler, you get an executable that could be qualified as rather\nfat: it actually contains three interpreters. You have on the\none hand the regular Python interpreter. It is here because it's\nnot possible to JIT-compile every single piece of Python code you\ntry to run; only the most executed loops are JIT-compiled. They\nare JIT-compiled with a tracing interpreter that operates one\nlevel down. This is the second interpreter. This tracing step\nis quite slow, but it's all right because it's only invoked on\nthe most executed loops (on the order of 100 to 1000 times in\ntotal in a run of a Python script that takes anyway seconds or\nminutes to run).\n\nSo apart from the JIT compilation itself, we have two worlds in\nwhich the execution proceeds: either by regular interpretation,\nor by the execution of assembler code generated by the JIT\ncompiler. And of course, we need to be able to switch from one\nworld to the other quickly: during regular interpretation we have\nto detect if we already have generated assembler for this piece\nof code and if so, jump to it; and during execution of the\nassembler, when a \"guard\" fails, i.e. 
when we meet a path of\nexecution for which we did not produce assembler, then we need to\nswitch back to regular interpretation (or occasionally invoke the\nJIT compiler again).\n\nLet us consider the cost of switching from one world to another.\nDuring regular interpretation, if we detect that we already have\nassembler corresponding to this Python loop, then we just jump to\nit instead of interpreting the Python loop. This is fairly\ncheap, as it involves just one fast extra check per Python loop.\nThe reverse is harder because \"guard\" failures can occur at any\npoint in time: it is possible that the bit of assembler that we\nalready executed so far corresponds to running the first 4 Python\nopcodes of the loop and a half. The guard that failed just now\nis somewhere in the middle of interpreting that opcode -- say,\nmultiplying these two Python objects.\n\nIt's almost impossible to just \"jump\" at the right place in the\ncode of the regular interpreter -- how do you jump inside a\nregular function compiled in C, itself in a call chain, resuming\nexecution of the function from somewhere in the middle?\n\nSo here is the important new bit in PyPy 1.3. Previously, what\nwe would do is invoke the JIT compiler again in order to follow\nwhat needs to happen between the guard failure and the real end\nof the Python opcode. We would then throw away the trace\ngenerated, as the only purpose was to finish running the current\nopcode. We call this \"blackhole interpretation\". After the end\nof the Python opcode, we can jump to the regular interpreter\neasily.\n\nDoing so was straightforward, but slow, in case it needs to be\ndone very often (as in the case in some examples, but not all).\nIn PyPy 1.3, this blackhole interpretation step has been\nredesigned as a time-critical component, and that's where the\nthird interpreter comes from. 
It is an interpreter that works\nlike the JIT compiler, but without the overhead of tracing (e.g.\nit does not need to box all values). It was designed from the\nground up for the sole purpose of finishing the execution of the\ncurrent Python opcode. The bytecode format that it interprets is\nalso new, designed for that purpose, and the JIT compiler itself\n(the second interpreter) was adapted to it.\nThe old bytecode format in PyPy 1.2 is gone\n(it was more suited for the JIT compiler, but less for blackhole\ninterpretation).\n\nIn summary, it was a lot of changes in the most front-end-ish\nparts of the JIT compiler, even though it was mostly hidden\nchanges. I hope that this longish blog post helped bring it a\nbit more to the light :-)", + "tags": "", + "url": "https://www.pypy.org/posts/2010/06/blackhole-interpreter-2752965445510091289.html" + }, + { + "title": "PyPy 1.3 released", + "text": "Hello.\nWe're pleased to announce the release of PyPy 1.3. This release has two major\nimprovements. First of all, we stabilized the JIT compiler since 1.2 release,\nanswered user issues, fixed bugs, and generally improved speed.\nWe're also pleased to announce alpha support for loading CPython extension\nmodules written in C. While the main purpose of this release is increased\nstability, this feature is in alpha stage and it is not yet suited for\nproduction environments.\n\nHighlights of this release\n\nWe introduced support for CPython extension modules written in C. As of now,\nthis support is in alpha, and it's very unlikely unaltered C extensions will\nwork out of the box, due to missing functions or refcounting details. The\nsupport is disabled by default, so you have to do:\n\nimport cpyext\n\nbefore trying to import any .so file. Also, libraries are source-compatible\nand not binary-compatible. That means you need to recompile binaries, using\nfor example:\n\npypy setup.py build\n\nDetails may vary, depending on your build system. 
Make sure you include\nthe above line at the beginning of setup.py or put it in your PYTHONSTARTUP.\nThis is alpha feature. It'll likely segfault. You have been warned!\n\nJIT bugfixes. A lot of bugs reported for the JIT have been fixed, and its\nstability greatly improved since 1.2 release.\n\nVarious small improvements have been added to the JIT code, as well as a great\nspeedup of compiling time.\n\n\n\n\nCheers,\nMaciej Fijalkowski, Armin Rigo, Alex Gaynor, Amaury Forgeot d'Arc and the PyPy team\n\n\nUpdate:The correct command to build extension is \"pypy setup.py build\", not \"python setup.py build\" as it was stated before.", + "tags": "release", + "url": "https://www.pypy.org/posts/2010/06/pypy-13-released-8546085566902489304.html" + }, + { + "title": "A JIT for Regular Expression Matching", + "text": "This is part 2 of a series, see Part 1 for an introduction. In this post\nI want to describe how the JIT generator of the PyPy project can be used to turn\nthe elegant but not particularly fast regular expression matcher from the first\npart into a rather fast implementation. In addition, I will show some speed\nmeasurements against various regular expression implementations.\nAgain, note the disclaimer: This technology could not easily be used\nto implement Python's re-module.\n\nExample Expression and First Numbers\nThe regular expression I will use as an example in the rest of this paper is\nthe expression (a|b)*a(a|b){20}a(a|b)*. It matches all strings that have two\na with exactly 20 characters between them. This regular expression has\nthe property that the corresponding DFA needs 2**(n+1) different states. As\nan input string, we use a random string (of varying lengths) that does not\nmatch the regular expression. I will give all results as number of chars matched\nper second. 
While this is not a particularly typical regular expression, it\nshould still be possible to get some ballpark numbers for the speeds of various\nimplementations \u2013 as we will see, the differences between implementations are\nhuge anyway.\nAll the benchmarks were performed on my laptop, which has an Intel Core2 Duo\nP8400 processor with 2.26 GHz and 3072 KB of cache on a machine with 3GB RAM\nrunning Ubuntu Linux 10.04.\nTo get a feeling for the orders of magnitude involved, the CPython re module\n(which is implemented in C and quite optimized) can match 2'500'000 chars/s.\nGoogle's new re2 implementation still matches 550'000 chars/s. Google's\nimplementation is slower, but their algorithm gives complexity and space\nguarantees similar to our implementation in the last blog post.\nOn the other end of the performance scale is the pure-Python code from the last\nblog post running on CPython. It can match only 12'200 chars/s and is thus 200\ntimes slower than the re module.\n\n\nTranslating the Matcher\nThe code described in the last blog post is not only normal Python code, but\nalso perfectly valid RPython code. Nothing particularly dynamic is going on in\nthe code, thus it can be translated with PyPy's translation toolchain to C code.\nThe resulting binary is considerably faster and can match 720'000 chars/s, 60\ntimes faster than the untranslated version.\nAnother approach is to write equivalent versions of the algorithms in lower\nlevel languages. This has been done for C++ by Sebastian Fischer and for Java by\nBaltasar Tranc\u00f3n y Widemann. The algorithm is object-oriented enough to be\nmapped very closely to the respective languages. The C++ version is\na little bit faster than the RPython version translated to C, at 750'000 chars/s. That's\nnot very surprising, given their similarity. The Java version is more than twice\nas fast, with 1'920'000 chars/s. 
Apparently the Java JIT compiler is a lot\nbetter at optimizing the method calls in the algorithm or does some other\noptimizations. One reason for this could be that the Java JIT can assume that\nthe classes it sees are all there are (and it will invalidate the generated\nmachine code if more classes are loaded), whereas the C++ compiler needs to\ngenerate code that works even in the presence of more regular expression\nclasses.\n\n\nGenerating a JIT\nTo get even more performance out of the RPython code, it is possible to generate\na JIT for it with the help of the PyPy translation toolchain. To do this, the\nmatching code needs to be extended somewhat by some hints that tell PyPy's JIT\ngenerator how this is to be done. The JIT generator can automatically produce a\nJIT compiler from an RPython interpreter of the source language. In our case,\nwe view the regular expression matcher as an interpreter for regular\nexpressions. Then the match function corresponds to the\ndispatch loop of a traditional interpreter.\nOur regular expression matcher is a very peculiar interpreter. The matcher\nworks by running exactly one loop (the one in match) as many times as the\ninput string is long, irrespective of the \"program\", i.e. the particular\nregular expressions. In addition, within the loop there are no conditions (e.g.\nif statements) at all, it is just linear code. This makes it almost perfectly\nsuited\nto the JIT generator, which produces a tracing JIT. A tracing JIT compiles the\nhot loops of a program (i.e. regular expression) and has to do extra work if\nthere are conditions in the loop. In our case, there is exactly one loop per\nregular expression, without any condition.\n\nJIT Hints\nThe hints that are needed for the match function of the last blog post can\nbe seen here (the function is slightly rewritten, e.g. 
the JIT does only\nproperly support a while loop as the main dispatch loop):\njitdriver = jit.JitDriver(reds=[\"i\", \"result\", \"s\"], greens=[\"re\"])\n\ndef match(re, s):\n if not s:\n return re.empty\n # shift a mark in from the left\n result = re.shift(s[0], 1)\n i = 1\n while i < len(s):\n jitdriver.can_enter_jit(i=i, result=result, s=s, re=re)\n jitdriver.jit_merge_point(i=i, result=result, s=s, re=re)\n # shift the internal marks around\n result = re.shift(s[i], 0)\n i += 1\n re.reset()\n return result\n\nThe jitdriver is an instance describing the data of the interpreter we are\ndealing with. The arguments to the constructor need to list all local variables\nof the dispatch loop. The local variables are classified into two classes, red\nones and green ones. The green ones hold the objects that make up the program\nthat the interpreter currently runs and which position in the program is\ncurrently being executed. In a typical bytecode interpreter, the bytecode object\nand the program counter would be green. In our case, the regular expression is\nthe program, so it is green. The rest of the variables are red.\nThe green variables are treated specially by the JIT generator. At runtime, for\na given value of the green variables, one piece of machine code will be\ngenerated. This piece of machine code can therefore assume that the value of\nthe green variable is constant.\nThere are two additional hints, which are method calls on the\njitdriver instance. The jit_merge_point method marks the beginning of\nthe main interpreter loop. The can_enter_jit function marks the point where\na loop in the user program can be closed, which in our case is trivial, it's\njust at the end of the interpreter loop (for technical reasons it is put at the beginning, because nothing must happen between the can_enter_jit and jit_merge_point invocations).\nThose are the hints that the JIT generator needs to function at all. 
We added\nsome additional hints, that give the JIT generator more information to work\nwith. Those hints are immutability information, which means that certain\ninstance fields can not be changed after the object has been constructed. Apart\nfrom the marked field, none of the fields of any of the Regex subclasses\ncan change. For example for the Char class this is expressed in the\nfollowing way:\nclass Char(Regex):\n _immutable_fields_ = [\"c\"]\n def __init__(self, c):\n ...\n\nThese hints allow the generated JIT to constant-fold reads out of the immutable\nfields in some situations.\n\n\nAdaptions to the Original Code\nIn the introduction above I wrote that the code within the loop in match\nuses no conditions. It is indeed true that none of the _shift methods\nhave an if statement or similar. However, there are some hidden conditions\ndue to the fact that the and and or boolean operators are used, which\nare short-circuiting. Therefore the JIT-version of the code needs to be adapted\nto use the non-short-circuiting operators & and |.\n\n\nJIT Example\nTo get an impression of how the generated machine code looks like, consider the\nregular expression (a|b)*. As regular expression objects this would be\nRepetition(Alternative(Char('a'), Char('b'))). 
The machine code in its intermediate,\nmachine-independent form looks as follows (I have slightly cleaned it up and\nadded comments for clarity):\n# arguments of the loop\n# i0 is i in the match function\n# result0 is result in the match function\n# s0 is s in the match function\n[i0, result0, s0] # those are the arguments to the machine code\nchar = s0[i0] # read the character\n# read the current mark:\ni5 = ConstPtr(ptr_repetition).marked\ni7 = char == 'a' # is the character equal to 'a'\ni8 = i5 & i7\ni10 = char == 'b' # is the character equal to 'b'\ni11 = i5 & i10\n# write new mark\nConstPtr(ptr_chara).marked = i8\ni13 = i8 | i11\n# write new mark\nConstPtr(ptr_charb).marked = i11\n# write new mark\nConstPtr(ptr_alternative).marked = i13\n# increment the index\ni17 = i0 + 1\ni18 = len(s0)\n# write new mark\nConstPtr(ptr_repetition).marked = i13\n# check that index is smaller than the length of the string\ni19 = i17 < i18\nif not i19:\n go back to normally running match\njump(i17, i13, s0) # start from the top again\n\nThe various ConstPtr(ptr_*) denote constant addresses of parts of the regular\nexpression tree:\n\nptr_repetition is the Repetition\nptr_chara is Char('a')\nptr_charb is Char('b')\nptr_alternative is the Alternative\n\nEssentially the machine code reads the next char out of the string, the current\nmark out of the Repetition and then performs some boolean operations on\nthose, writing back the new marks. Note in particular how the generated\nmachine code does not need to do any method calls to shift and _shift and\nthat most field reads out of the regular expression classes have been optimized\naway, because the fields are immutable. 
Therefore the machine code does not\nneed to deconstruct the tree of regular expression objects at all, it just\nknows where in memory the various parts of it are, and encodes that directly\ninto the code.\n\n\nPerformance Results With JIT\nWith the regular expression matcher translated to C and with a generated JIT,\nthe regular expression performance increases significantly. Our running example\ncan match 16'500'000 chars/s, which is more than six times faster than the\nre module. This is not an entirely fair comparison, because the re\nmodule can give more information than just \"matches\" or \"doesn't match\", but\nit's still interesting to see. A more relevant comparison is that between the\nprogram with and without a JIT: Generating a JIT speeds the matcher up by more\nthan 20 times.\n\n\n\nConclusion\nSo, what have we actually won? We translated the relatively simple and very slow\nregular expression matching algorithm from the last post to C and were thus able\nto speed it up significantly. The real win is gained by also generating a JIT\nfor the matcher, which can be regarded as a simple interpreter. The resulting\nmatcher is rather fast.\nThe lesson from these posts is not that you can or should write a practical\nand general regular expression module in this way \u2013 indeed, enhancing the\nalgorithm to support more features of the re module would be a lot of work\nand it is also unclear what the speed results for more realistic regular\nexpressions would be. However, it makes for a great case study of the JIT\ngenerator. It was relatively straightforward to generate a JIT for the regex\nmatcher, and the speed results were great (Admittedly I know rather a lot about\nPyPy's JIT though). 
This approach is generalizable to many programs that are\nsufficiently \"interpreter-like\" (whatever that exactly means).\nAll the results that appeared at various points in this blog post can be seen\nhere:\n\n\n\n\n\n\n\nImplementation\nchars/s\nspeedup over pure Python\n\nPure Python code\n12'200\n1\n\nPython re module\n2'500'000\n205\n\nGoogle's re2 implementation\n550'000\n45\n\nRPython implementation translated to C\n720'000\n59\n\nC++ implementation\n750'000\n61\n\nJava implementation\n1'920'000\n157\n\nRPython implementation with JIT\n16'500'000\n1352\n\n\n\n\nSources\nAll the source code can be found in my Subversion user directory on Codespeak.\n\nEdit: Armin is right (see first comment). I fixed the problem.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/06/jit-for-regular-expression-matching-3877859053629057968.html" + }, + { + "title": "PyPy in Google's Summer of Code 2010", + "text": "Good news everyone.\nThis year, thanks to google generosity and PSF support, we got two and a\nhalf of students for PyPy's summer of code. We didn't cut any students, but one\nof the projects is a joint project of PyPy and numpy. Hereby I present\ndescriptions, in my own words with my own opinions and in arbitrary order. For\nmore details please follow links to particular blogs.\n\nJason Creighton: 64bit JIT backend for PyPy\nIntel 64bit (and I mean x86_64) compatibility for JIT has been one of the top\nrequested features (along with GIL removal). While GIL removal is not really an\neasy task, having our JIT emit 64bit assembler is sort of easy, thanks to our\nJIT backend abstraction. It will likely be faster, thanks to abundance of\nregisters.\n\n\nBartosz Skowron: Fast ctypes for PyPy\nHistorically weak point of PyPy was compatibility with extension modules. We\nhave progressed quite a bit in recent years, first introducing ctypes for\npypy then progressing towards CPython extension modules. 
However, ctypes is\nwell known to be slow (and it's even slower on PyPy) and writing CPython\nextension modules is ugly, and it's going to be only with compatibility layer\nthat'll keep this slow. What happens if we try to employ JIT technology to\nctypes? Maybe we can compile calls to C code from Python as direct calls in\ncompiled assembler? Why not?\nThis project will look at how the JIT technology can be employed to do some\nsort of FFI. There is no guarantee we'll get super-fast ctypes as a result,\nbut it's good to see progress in that area.\n\n\nDan Roberts: Numpy in PyPy\nThis is a joint project of numpy and PyPy. The main objective is to bring\nnumpy to PyPy, possibly fast. The official mentor for this project is\nStefan van der Walt from numpy community. During initial meeting it was\nagreed that probably the best way to go would be to support original numpy\nwith CPython extension compatibility and then provide a minimal native numpy\nframework for pypy. The former would retain full compatibility, while the\nlatter would have JIT integration, with line of our previous\nnumeric experiments. There would be an explicit interface from converting\none array to another for convenience.\n\nOverall, I'm very happy to see so much support for PyPy from SoC. I hope all\nthree proposals will be successful!\nCheers,\nfijal & pypy team.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/05/pypy-in-googles-summer-of-code-2010-5321939902318322352.html" + }, + { + "title": "An Efficient and Elegant Regular Expression Matcher in Python", + "text": "Two weeks ago, I was at the Workshop Programmiersprachen und Rechenkonzepte,\na yearly meeting of German programming language researchers. At the workshop,\nFrank Huch and Sebastian Fischer gave a really excellent talk about an\nelegant regular expression matcher written in Haskell. 
One design goal of the\nmatcher was to run in time linear to the length of the input string (i.e.\nwithout backtracking) and linear in the size of the regular expression. The\nmemory use should also only be linear in the regular expression.\nDuring the workshop, some of the Haskell people and me then implemented the\nalgorithm in (R)Python. Involved were Frank, Sebastian, Baltasar Tranc\u00f3n y\nWidemann, Bernd Bra\u00dfel and Fabian Reck.\nIn this blog post I want to describe this implementation and show the code of\nit, because it is quite simple. In a later post I will show what optimizations\nPyPy can perform on this matcher and also do some benchmarks.\nA Note on terminology: In the rest of the post \"regular expression\" is meant\nin the Computer Science sense, not in the POSIX sense. Most importantly, that\nmeans that back-references are not allowed.\nAnother note: This algorithm could not be used to implement PyPy's re\nmodule! So it won't help to speed up this currently rather slow implementation.\n\nImplementing Regular Expression Matchers\nThere are two typical approaches to implement regular expression. A naive one is\nto use a back-tracking implementation, which can lead to exponential matching\ntimes given a sufficiently evil regular expression.\nThe other, more complex one, is to transform the regular expression into a\nnon-deterministic finite automaton (NFA) and then transform the NFA into a\ndeterministic finite automaton (DFA). A DFA can be used to efficiently match\na string, the problem of this approach is that turning an NFA into a DFA can\nlead to exponentially large automatons.\nGiven this problem of potential memory explosion, a more sophisticated approach\nto matching is to not construct the DFA fully, but instead use the NFA for\nmatching. 
This requires some care, because it is necessary to keep track of\nwhich set of states the automaton is in (it is not just one state, because the\nautomaton is non-deterministic).\nThe algorithm described here is essentially equivalent to this approach, however\nit does not need an intermediate NFA and represents a state of a corresponding\nDFA as marked regular expression (represented as a tree of nodes). For many\ndetails about an alternative approach to implement regular expressions\nefficiently, see Russ Cox excellent article collection.\n\n\nThe Algorithm\nIn the algorithm the regular expression is represented as a tree of nodes. The\nleaves of the nodes can match exactly one character (or the epsilon node, which\nmatches the empty string). The inner nodes of the tree combine other nodes in\nvarious ways, like alternative, sequence or repetition. Every node in the tree\ncan potentially have a mark. The meaning of the mark is that a node is marked,\nif that sub-expression matches the string seen so far.\nThe basic approach of the algorithm is that for every character of the input\nstring the regular expression tree is walked and a number of the nodes in the\nregular expression are marked. At the end of the string, if the top-level node\nis marked, the string matches, otherwise it does not. At the beginning of the\nstring, one mark gets shifted into the regular expression from the top, and then\nthe marks that are in the regex already are shifted around for every additional\ncharacter.\nLet's start looking at some code, and an example to make this clearer. 
The base\nclass of all regular expression nodes is this:\nclass Regex(object):\n def __init__(self, empty):\n # empty denotes whether the regular expression\n # can match the empty string\n self.empty = empty\n # mark that is shifted through the regex\n self.marked = False\n\n def reset(self):\n \"\"\" reset all marks in the regular expression \"\"\"\n self.marked = False\n\n def shift(self, c, mark):\n \"\"\" shift the mark from left to right, matching character c.\"\"\"\n # _shift is implemented in the concrete classes\n marked = self._shift(c, mark)\n self.marked = marked\n return marked\n\nThe match function which checks whether a string matches a regex is:\ndef match(re, s):\n if not s:\n return re.empty\n # shift a mark in from the left\n result = re.shift(s[0], True)\n for c in s[1:]:\n # shift the internal marks around\n result = re.shift(c, False)\n re.reset()\n return result\n\nThe most important subclass of Regex is Char, which matches one\nconcrete character:\nclass Char(Regex):\n def __init__(self, c):\n Regex.__init__(self, False)\n self.c = c\n\n def _shift(self, c, mark):\n return mark and c == self.c\n\nShifting the mark through Char is easy: a Char instance retains a mark\nthat is shifted in when the current character is the same as that in the\ninstance.\nAnother easy case is that of the empty regular expression Epsilon:\nclass Epsilon(Regex):\n def __init__(self):\n Regex.__init__(self, empty=True)\n\n def _shift(self, c, mark):\n return False\n\nEpsilons never get a mark, but they can match the empty string.\n\nAlternative\nNow the more interesting cases remain. 
First we define an abstract base class\nBinary for the case of composite regular expressions with two children, and\nthen the first subclass Alternative which matches if either of two regular\nexpressions matches the string (usual regular expressions syntax a|b).\nclass Binary(Regex):\n def __init__(self, left, right, empty):\n Regex.__init__(self, empty)\n self.left = left\n self.right = right\n\n def reset(self):\n self.left.reset()\n self.right.reset()\n Regex.reset(self)\n\nclass Alternative(Binary):\n def __init__(self, left, right):\n empty = left.empty or right.empty\n Binary.__init__(self, left, right, empty)\n\n def _shift(self, c, mark):\n marked_left = self.left.shift(c, mark)\n marked_right = self.right.shift(c, mark)\n return marked_left or marked_right\n\nAn Alternative can match the empty string, if either of its children can.\nSimilarly, shifting a mark into an Alternative shifts it into both its\nchildren. If either of the children are marked afterwards, the Alternative\nis marked too.\nAs an example, consider the regular expression a|b|c, which would be\nrepresented by the objects Alternative(Alternative(Char('a'), Char('b')), Char('c')).\nMatching the string \"a\" would lead to the following marks in\nthe regular expression objects (green nodes are marked, white ones are\nunmarked):\n\n\nAt the start of the process, no node is marked. Then the first char is matched,\nwhich adds a mark to the Char('a') node, and the mark will propagate up the\ntwo Alternative nodes.\n\n\nRepetition\nThe two remaining classes are slightly trickier. Repetition is used to match\na regular expression any number of times (usual regular expressions syntax\na*):\nclass Repetition(Regex):\n def __init__(self, re):\n Regex.__init__(self, True)\n self.re = re\n\n def _shift(self, c, mark):\n return self.re.shift(c, mark or self.marked)\n\n def reset(self):\n self.re.reset()\n Regex.reset(self)\n\nA Repetition can always match the empty string. 
The mark is shifted into the\nchild, but if the Repetition is already marked, this will be shifted into\nthe child as well, because the Repetition could match a second time.\nAs an example, consider the regular expression (a|b|c)* matching the string\nabcbac:\n\nFor every character, one of the alternatives matches, thus the repetition matches\nas well.\n\n\nSequence\nThe only missing class is that for sequences of expressions, Sequence (usual\nregular expressions syntax ab):\nclass Sequence(Binary):\n def __init__(self, left, right):\n empty = left.empty and right.empty\n Binary.__init__(self, left, right, empty)\n\n def _shift(self, c, mark):\n old_marked_left = self.left.marked\n marked_left = self.left.shift(c, mark)\n marked_right = self.right.shift(\n c, old_marked_left or (mark and self.left.empty))\n return (marked_left and self.right.empty) or marked_right\n\nA Sequence can be empty only if both its children are empty. The mark\nhandling is a bit delicate. If a mark is shifted in, it will be shifted to the\nleft child regular expression. If that left child is already marked before the\nshift, that mark is shifted to the right child. If the left child can match the\nempty string, the right child gets the mark shifted in as well.\nThe whole sequence matches (i.e. is marked), if the left child is marked after\nthe shift and if the right child can match the empty string, or if the right\nchild is marked.\nConsider the regular expression abc matching the string abcd. 
For the\nfirst three characters, the marks wander from left to right, when the d is\nreached, the matching fails.\n\n\n\nMore Complex Example\nAs a more complex example, consider the expression ((abc)*|(abcd))(d|e)\nmatching the string abcabcabcd.\n\nNote how the two branches of the first alternative match the first abc in\nparallel, until it becomes clear that only the left alternative (abc)* can\nwork.\n\n\nComplexity\nThe match function above loops over the entire string without going back and\nforth. Each iteration goes over the whole tree every time. Thus the complexity\nof the algorithm is O(m*n) where m is the size of the regular expression\nand n is the length of the string.\n\n\n\nSummary & Outlook\nSo, what have we achieved now? The code shown here can match regular expressions\nwith the desired complexity. It is also not much code. By itself, the Python\ncode shown above is not terribly efficient. In the next post I will show how the\nJIT generator can be used to make the simple matcher shown above really fast.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/05/efficient-and-elegant-regular-2727904462179540436.html" + }, + { + "title": "Running wxPython on top of pypy", + "text": "Hello,\nThese last three weeks we have been busy working on the cpyext subsystem, which\nallows pypy to execute extension modules written with the Python C API.\nToday we hacked enough to have wxPython compile, and run its wonderful demo.\nThis:\n\ncannot be distinguished from the same run with a\nstandard python interpreter, but this:\n\nshows an exception that\nCPython never produces.\nwxPython is a big extension module: it has more than 500 classes and 7500\nfunctions, most of the code is automatically generated by swig. 
It uses\nadvanced techniques, like \"Original Object Return\" and cross-platform\npolymorphism, that effectively allows the developer to seamlessly subclass C++\nobjects in Python and write GUI applications efficiently.\nThe demo application runs reasonably fast, it feels slower than with CPython,\nbut I did not activate the JIT option of pypy. It still crashes in some places\n(the demo is very comprehensive and covers all the aspects of wxPython), and\nthreads are expected to not work at the moment.\nWe had to modify a little the code of wxPython, mainly because it often stores\nborrowed references into C++ objects. This does not work well in pypy, where\nall other counted references can disappear, and allows the address of the object\nto change. The solution is to use weak references instead. The patch is here,\nit will eventually be merged into the upstream wxPython version.\nThis first real test proves that CPython extensions can be migrated to pypy\nwithout much pain. It also points out some places which can be improved, like\nbetter diagnostics in crashes, better support of distutils...\nAmaury Forgeot d'Arc", + "tags": "", + "url": "https://www.pypy.org/posts/2010/05/running-wxpython-on-top-of-pypy-52246787415886751.html" + }, + { + "title": "Using CPython extension modules with PyPy natively, or: PyPy can load .pyd files with CPyExt!", + "text": "PyPy is now able to load\nand run CPython extension modules (i.e. .pyd and .so files) natively by using the new CPyExt\nsubsystem.\nUnlike the solution presented in another blog post (where extension modules like\nnumpy etc. were run on CPython and proxied through TCP), this solution does not require\na running CPython anymore. We do not achieve full binary compatibility\nyet (like Ironclad), but recompiling the extension is generally enough.\nThe only prerequisite is that the necessary functions of the C API of CPython are already\nimplemented in PyPy. 
If you are a user or an author of a module and miss certain functions\nin PyPy, we invite you to implement them. Up until now, a lot of people (including a lot of\nnew committers) have stepped up and implemented a few functions to get their favorite module\nrunning. See the end of this post for a list of names.\nRegarding speed, we tried the following: even though there is a bit of overhead when running\nthese modules, we could run the regular expression engine of CPython (_sre.so) and execute\nthe spambayes benchmark of the Unladen Swallow benchmark suite (cf. speed.pypy.org) and\nexperience a speedup:\nIt became two times faster on pypy-c than with the built-in regular\nexpression engine of PyPy. From Amdahl's Law it follows that the _sre.so must run several\ntimes faster than the built-in engine.\nCurrently pursued modules include PIL and others. Distutils support is nearly ready.\nIf you would like to participate or want information on how to use this new feature, come and join\nour IRC channel #pypy on freenode.\nAmaury Forgeot d'Arc and Alexander Schremmer\nFurther CPyExt Contributors:\nAlex Gaynor\nBenjamin Peterson\nJean-Paul Calderone\nMaciej Fijalkowski\nJan de Mooij\nLucian Branescu Mihaila\nAndreas St\u00fchrk\nZooko Wilcox-O Hearn", + "tags": "cpyext,CPython,extension modules,speed", + "url": "https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html" + }, + { + "title": "PyPy on google open source blog", + "text": "Hello\nBea D\u00fcring, from the PyPy team, wrote a post for google open source blog covering PyPy's 1.2 release. It's also the first public mention of the fact that google provided financial support for PyPy's 2.5 compatibility. 
Thanks!\nCheers\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2010/04/pypy-on-google-open-source-blog-1192495586835103069.html" + }, + { + "title": "Introducing nightly builds and ubuntu PPA", + "text": "Hello.\n\nWe're pleased to announce two things that we were constantly asked for: Nightly builds and Ubuntu PPA for 1.2 release made by Bartosz Skowron. There are no nightly build ubuntu packages (yet).\n\n\nNightly builds are what they are - pure pypy executables with JIT compiled in (for linux only now). They require either a pypy checkout or a release download. The main difference is that by default display more debugging information than release builds and that they contain recent bugfixes and improvements of course :-)\n\nCheers\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2010/03/introducing-nightly-builds-and-ubuntu-3346203966988761264.html" + }, + { + "title": "Blog coverage of speed.pypy.org", + "text": "If you want to read a detailed analysis about why speed.pypy.org is cool, head over to Saveen Reddy's blog at the MSDN.", + "tags": "", + "url": "https://www.pypy.org/posts/2010/03/blog-coverage-of-speedpypyorg-2291955489972824511.html" + }, + { + "title": "Heroes of the 1.2 Release", + "text": "Now that the release is done I wanted to list and to thank some people that\nwere essential in the process of getting it out of the door, particularly\nbecause the work of some of them is not very visible usually.\nArmin Rigo and Maciej Fija\u0142kowski tirelessly worked on most aspects of\nthe release, be it fixing the last known bugs and performance problems,\npackaging or general wizardry.\nAmaury Forgeot d'Arc made sure that PyPy 1.2 actually supports Windows as a\nplatform properly and compiled the Windows binaries.\nMiquel Torres designed and implemented our new speed overview page,\nhttps://speed.pypy.org which is a great tool for us to spot performance\nregressions and to showcase our improvements to the general public.\ntav 
designed the new user-oriented web page, https://pypy.org which is a lot\nnicer for people that only want to use PyPy as a Python implementation (and not\nbe confused by how PyPy is actually made).\nHolger Krekel fixed our main development server codespeak.net, even while\nbeing on vacation and not really having online connectivity. Without that, we\ncouldn't actually have released anything.\nBartosz Skowron worked a lot on making Ubuntu packages for PyPy, which is\nreally cool. Even though he didn't quite finish in time for the release, we will\nhopefully get them soon.\nThanks to all you guys!", + "tags": "release", + "url": "https://www.pypy.org/posts/2010/03/heroes-of-12-release-7211722984024027191.html" + }, + { + "title": "Introducing the PyPy 1.2 release", + "text": "We are pleased to announce PyPy's 1.2 release.\nThis version 1.2 is a major milestone and it is the first release to ship\na Just-in-Time compiler that is known to be faster than CPython\n(and unladen swallow) on some real-world applications (or the best benchmarks\nwe could get for them). The main theme for the 1.2 release is speed.\nThe JIT is stable and we don't observe crashes. Nevertheless we would\nrecommend you to treat it as beta software and as a way to try out the JIT\nto see how it works for you.\nHighlights:\n\nThe JIT compiler.\nVarious interpreter optimizations that improve performance as well as help\nsave memory. 
Read our various blog posts about achievements.\nIntroducing a new PyPy website at pypy.org made by tav and improved\nby the PyPy team.\nIntroducing speed.pypy.org made by Miquel Torres, a new service that monitors our performance\nnightly.\nThere will be ubuntu packages on PyPy's PPA made by Bartosz Skowron,\nhowever various troubles prevented us from having them as of now.\n\nKnown JIT problems (or why you should consider this beta software) are:\n\nThe only supported platform is 32bit x86 for now, we're looking for help with\nother platforms.\nIt is still memory-hungry. There is no limit on the amount of RAM that\nthe assembler can consume; it is thus possible (although unlikely) that\nthe assembler ends up using unreasonable amounts of memory.\n\nIf you want to try PyPy, go to the download page on our excellent new site\nand find the binary for your platform. If the binary does not work (e.g. on\nLinux, because of different versions of external .so dependencies), or if\nyour platform is not supported, you can try building from the source.\nThe PyPy release team,\nArmin Rigo, Maciej Fijalkowski and Amaury Forgeot d'Arc\nTogether with\nAntonio Cuni, Carl Friedrich Bolz, Holger Krekel, Samuele Pedroni and many others.", + "tags": "release", + "url": "https://www.pypy.org/posts/2010/03/introducing-pypy-12-release-2791388655442447862.html" + }, + { + "title": "State of PyPy talk from Pycon", + "text": "Hello.\n\nThe last PyPy video from pycon has been uploaded. It's a very short (less than 10 minutes) \"keynote\" talk about state of PyPy.\n\nEnjoy!\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2010/03/state-of-pypy-talk-from-pycon-6748503931490058986.html" + }, + { + "title": "Introducing speed.pypy.org", + "text": "Hello.\nSome time ago, we introduced our nightly performance graphs. This was a quick\nhack to allow us to see performance regressions. 
Thanks to Miquel Torres,\nwe can now introduce https://speed.pypy.org, which is a Django-powered web\napp sporting a more polished visualisation of our nightly performance runs.\nWhile this website is not finished yet, it's already far better than our previous\napproach :-)\nDetails about announcement on pypy-dev are found here.\nIf you're interested in having something similar for other benchmark runs, contact Miquel (tobami at gmail).\nQuoting Miquel: \"I would also like to note, that if other performance-oriented\nopensource projects are interested, I would be willing to see if we can set-up\nsuch a Speed Center for them. There are already people interested in\ncontributing to make it into a framework to be plugged into buildbots, software\nforges and the like. Stay tuned!\"", + "tags": "", + "url": "https://www.pypy.org/posts/2010/03/introducing-speedpypyorg-1822874891591164256.html" + }, + { + "title": "Benchmarking twisted", + "text": "Hello.\nI recently did some benchmarking of twisted on top of PyPy. For the very\nimpatient: PyPy is up to 285% faster than CPython. For more patient people,\nthere is a full explanation of what I did and how I performed measurements,\nso they can judge themselves.\nThe benchmarks are living in twisted-benchmarks and were mostly written\nby Jean Paul Calderone. Even though he called them \"initial exploratory\ninvestigation into a potential direction for future development resulting\nin performance oriented metrics guiding the process of optimization and\navoidance of complexity regressions\", they're still much much better than\naverage benchmarks found out there.\nThe methodology was to run each benchmark for\nquite some time (about 1 minute), measuring number of requests each 5s.\nThen I looked at dump of data and subtracted some time it took\nfor JIT-capable interpreters to warm up (up to 15s), averaging\neverything after that. 
Averages of requests per second are in the table below (the higher the better):\n\n\n\n\n\n\n\n\nbenchname\nCPython\nUnladen swallow\nPyPy\n\nnames\n10930\n11940 (9% faster)\n15429 (40% faster)\n\npb\n1705\n2280 (34% faster)\n3029 (78% faster)\n\niterations\n75569\n94554 (25% faster)\n291066 (285% faster)\n\naccept\n2176\n2166 (same speed)\n2290 (5% faster)\n\nweb\n879\n854 (3% slower)\n1040 (18% faster)\n\ntcp\n105M\n119M (7% faster)\n60M (46% slower)\n\n\n\nTo reproduce, run each benchmark with:\n\nbenchname.py -n 12 -d 5\nWARNING: running tcp-based benchmarks that open new connection for each\nrequest (web & accept) can exhaust number of some kernel structures,\nlimit n or wait until next run if you see drops in request per second.\nThe first obvious thing is that various benchmarks are more or less amenable\nto speedups by JIT compilation. Accept and tcp getting smallest speedups, if at\nall. This is understandable, since JIT is mostly about reducing interpretation\nand frame overhead, which is probably not large when it comes to accepting\nconnections. However, if you actually loop around, doing something, JIT\ncan give you a lot of speedup.\nThe other obvious thing is that PyPy is the fastest python interpreter\nhere, almost across-the board (Jython and IronPython won't run twisted),\nexcept for raw tcp throughput. However, speedups can vary and I expect\nthis to improve after the release, as there are points, where PyPy can\nbe improved. Regarding raw tcp throughput - this can be a problem for\nsome applications and we're looking forward to improve this particular\nbit.\nThe main reason to use twisted for this comparison is a lot of support from\ntwisted team and JP Calderone in particular, especially when it comes to\nproviding benchmarks. 
If some open source project wants to be looked at\nby PyPy team, please provide a reasonable set of benchmarks and infrastructure.\nIf, however, you're a closed source project fighting with performance problems\nof Python, we're providing contracting for investigating opportunities, how\nPyPy and not only PyPy, can speed up your project.\nCheers,\nfijal\n\nBenchmark descriptions:\n\nnames - simple DNS server\nweb - simple http hello world server\npb - perspective broker, RPC mechanism for twisted\niterations - empty twisted loop\naccept - number of tcp connections accepted per second\ntcp - raw socket transfer throughput\n\nUsed interpreters:\n\nCPython 2.6.2 - as packaged by ubuntu\nUnladen swallow svn trunk, revision 1109\nPyPy svn trunk, revision 71439\n\nTwisted version used: svn trunk, revision 28580\nMachine: unfortunately 32bit virtual-machine under qemu, running ubuntu karmic,\non top of Quad core intel Q9550 with 6M cache. Courtesy of Michael Schneider.", + "tags": "jit", + "url": "https://www.pypy.org/posts/2010/03/hello-5058108566628405592.html" + }, + { + "title": "Pycon 2010 report", + "text": "Hello.\nGreetings to everybody from Pycon 2010 Atlanta. Right now I'm sitting in\na sprint room with people sprinting on various projects, like CPython,\ntwisted etc. The conference was really great, and I've seen some good talks,\nalthough I've been too exhausted from my own talks to go to too many.\nProbably I should stay away from proposing that many talks to next pycon :-)\nThe highlight of sprints was that we got a common mercurial repository at python.org for python benchmarks. We might be able to come up with\n\"the python benchmark suite\" which will mostly consist \nof simple benchmarks using large python libraries, rather than microbenchmarks.\nThe repository was started by the Unladen Swallow people and we already\nhave common commit access among PyPy, CPython, Unladen Swallow, Jython\nand Iron Python. 
We don't have yet a common place to run benchmarks,\nbut we should be able to fix that soon.\nRegarding the talks, there are online videos for\nHow to write cross-interpreter python programs and Speed of PyPy talks,\namong other talks from Pycon.\nThere should be a video for my short keynote shortly.\nThe talks were well received as there is interest in PyPy's progress.\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2010/02/pycon-2010-report-6986911457623699520.html" + }, + { + "title": "Nightly graphs of PyPy's performance", + "text": "Hello.\nIn the past few months, we made tremendous progress on the JIT front.\nTo monitor the progress daily, we introduced recently some cool graphs\nthat plot revision vs performance. They are based on unladen swallow\nbenchmark runner and they're written entirely in JavaScript, using canvas\nvia the JQuery and Flot libraries.\nIt's amazing what you can do in JavaScript these days... They are also\ntested via the very good oejskit plugin, that integrates py.test\nwith JavaScript testing, driven by the command line.\nAs you can probably see, we're very good on some benchmarks and not that\ngreat on others. Some of the bad results come from the fact that while we\ndid a lot of JIT-related work, other PyPy parts did not see that much\nlove. Some of our algorithms on the builtin data types are inferior to those\nof CPython. This is going to be an ongoing focus for a while.\nWe want to first improve on the benchmarks for a couple\nof weeks before doing a release to gather further feedback.\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2010/01/nightly-graphs-of-pypys-performance-8360469412941669946.html" + }, + { + "title": "Accelerating PyPy development by funding", + "text": "PyPy has recently made some great speed and memory progress towards providing the most efficient Python interpreter out there. We also just announced\nour plans for the pypy-1.2 release. 
Much of this is driven by personal\ncommitment, by individuals and companies investing time and money.\nNow we'd appreciate some feedback and help regarding getting money\ninto the PyPy project to help its core members (between\n5 and 15 people depending how you count) to sustain themselves. We see\nseveral options:\n\nuse a foundation structure and ask for tax-exempt donations to the\nproject, its developers and infrastructure. We just got\na letter from the Software Freedom Conservancy that they view\nour application favourably so this option becomes practical hopefully\nsoon.\noffer to implement certain features like a 64bit JIT-backend,\nNumpy for PyPy or a streamlined installation in exchange for money,\ncontributed in small portions/donations. Do you imagine you or your\ncompany would sponsor PyPy on a small scale for efforts like this?\nAny other bits you'd like to see?\noffer to implement larger scale tasks by contracting PyPy related companies,\nnamely Open End and merlinux who have successfully done such\ncontracts in the past. Please don't hesitate to contact\nholger@merlinux.eu and bea@openend.se if you want to start a\nconversation on this.\napply for public/state funding - in fact we are likely to get some\nfunding through Eurostars, more on that separately. 
Such funding\nis usually only a 50-60% percentage of actual employment and\nproject costs, and is tied to research questions rather than\nto make PyPy a production-useable interpreter, though.\n\nAnything else we should look out for?\ncheers & thanks for any feedback,\nMaciej and Holger", + "tags": "", + "url": "https://www.pypy.org/posts/2009/12/accelerating-pypy-development-by-8973749020516679741.html" + }, + { + "title": "Planning a next release of PyPy", + "text": "The PyPy core team is planning to make a new release before the next Pycon US.\nThe main target of the 1.2 release is packaging the good results\nwe have achieved applying our current JIT compiler generator to our\nPython interpreter. Some of that progress has been chronicled in\nrecent posts on the status blog. By releasing them in a\nrelatively stable prototype we want to encourage people to try them with their\nown code and to gather feedback in this way. By construction the JIT compiler\nshould support all Python features, what may vary are the speedups\nachieved (in some cases the JIT may produce worse results than the PyPy\ninterpreter which we would like to know) and the extra memory required\nby it.\nFor the 1.2 release we will focus on the JIT stability first, less on\nimproving non-strictly JIT areas. The JIT should be good at many things\nas shown by previous blog postings. We want the JIT compiler in the\nrelease to work well on Intel 32 bits on Linux, with Mac OS X and\nWindows being secondary targets. Which compilation targets work will\ndepend a bit on contributions.\nIn order to finalize the release we intend to have a concentrated\neffort (\"virtual sprint\") from the 22nd to the 29th of\nJanuary. Coordination will happen as usual through the #pypy irc\nchannel on freenode. 
Samuele Pedroni will take the role of release\nmanager as he already did in the past.", + "tags": "release", + "url": "https://www.pypy.org/posts/2009/12/planning-next-release-of-pypy-4193252449406707091.html" + }, + { + "title": "Leysin Winter Sprint: reported", + "text": "Update: the sprint has been postponed to a later date.\n\nThe next PyPy sprint will probably still be in Leysin, Switzerland, for the\nseventh time.", + "tags": "", + "url": "https://www.pypy.org/posts/2009/12/leysin-winter-sprint-23-30th-january-7768876505015446348.html" + }, + { + "title": "Using CPython extension modules with PyPy, or: PyQt on PyPy", + "text": "If you have ever wanted to use CPython extension modules on PyPy,\nwe want to announce that there is a solution that should be compatible\nto quite a bit of the available modules. It is neither new nor written\nby us, but nevertheless works great with PyPy.\nThe trick is to use RPyC, a transparent, symmetric remote procedure\ncall library written in Python. The idea is to start a\nCPython process that hosts the PyQt libraries\nand connect to it via TCP to send RPC commands to it.\nI tried to run PyQt applications\nusing it on PyPy and could get quite a bit of the functionality of these\nworking. Remaining problems include regular segfaults of CPython\nbecause of PyQt-induced memory corruption and bugs because classes\nlike StandardButtons behave incorrectly when it comes to arithmetical operations.\nChanges to RPyC needed to be done to support remote unbound __init__ methods,\nshallow call by value for list and dict types (PyQt4 methods want real lists and dicts\nas parameters), and callbacks to methods (all remote method objects are wrapped into\nsmall lambda functions to ease the call for PyQt4).\nIf you want to try RPyC to run the PyQt application of your choice, you just\nneed to follow these steps. 
Please report your experience here in the blog\ncomments or on our mailing list.\n\n\nDownload RPyC from the RPyC download page.\nDownload this patch and apply it to RPyC by running\npatch -p1 < rpyc-3.0.7-pyqt4-compat.patch in the RPyC directory.\nInstall RPyc by running python setup.py install as root.\nRun the file rpyc/servers/classic_server.py using CPython.\nExecute your PyQt application on PyPy.\n\n\nPyPy will automatically connect to CPython and use its PyQt libraries.\nNote that this scheme works with nearly every extension library. Look\nat pypy/lib/sip.py on how to add new libraries (you need to create\nsuch a file for every proxied extension module).\nHave fun with PyQt\nAlexander Schremmer", + "tags": "CPython,extension modules,PyQt4,RPyC", + "url": "https://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html" + }, + { + "title": "Some benchmarking", + "text": "Hello.\n\nRecently, thanks to the surprisingly helpful Unhelpful, also known as Andrew Mahone,\nwe have a decent, if slightly arbitrary, set of performances graphs.\nIt contains a couple of benchmarks already\nseen on this blog as well as some taken from The Great Computer\nLanguage Benchmarks Game. These benchmarks don't even try to represent \"real applications\"\nas they're mostly small algorithmic benchmarks. Interpreters used:\n\n\n\nPyPy trunk, revision 69331 with --translation-backendopt-storesink, which is\nnow on by default\n\n\nUnladen swallow trunk, r900\n\nCPython 2.6.2 release\n\n\nHere are the graphs; the benchmarks and the runner script are available\n\n\n\n\nAnd zoomed in for all benchmarks except binary-trees and fannkuch.\n\n\n\nAs we can see, PyPy is generally somewhere between the same speed\nas CPython to 50x faster (f1int). 
The places where we're the same\nspeed as CPython are places where we know we have problems - for example generators are\nnot sped up by the JIT and they require some work (although not as much by far\nas generators & Psyco :-). The glaring inefficiency is in the regex-dna benchmark.\nThis one clearly demonstrates that our regular expression engine is really,\nreally, bad and urgently requires attention.\n\n\nThe cool thing here is, that although these benchmarks might not represent\ntypical python applications, they're not uninteresting. They show\nthat algorithmic code does not need to be far slower in Python than in C,\nso using PyPy one need not worry about algorithmic code being dramatically\nslow. As many readers would agree, that kills yet another usage of C in our\nlives :-)\n\nCheers,\nfijal", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/11/some-benchmarking-9211261260383281459.html" + }, + { + "title": "D\u00fcsseldorf Sprint Report", + "text": "While the D\u00fcsseldorf is dwindling off, we put our minds to the task of retelling\nour accomplishments. The sprint was mostly about improving the JIT and we \nmanaged to stick to that task (as much as we managed to stick to anything). The \nsprint was mostly filled with doing many small things. \n \nInlining \nCarl Friedrich and Samuele started the sprint trying to tame the JIT's inlining.\nUntil now, the JIT would try to inline everything in a loop (except other loops) \nwhich is what most tracing JITs actually do. This works great if the resulting \ntrace is of reasonable length, but if not it would result in excessive memory \nconsumption and code cache problems in the CPU. So far we just had a limit on \nthe trace size, and we would abort tracing when the limit was reached. This \nwould happen again and again for the same loop, which is not useful at all. 
The \nnew approach introduced is to be more clever when tracing is aborted by marking \nthe function with the largest contribution to the trace size as non-inlinable. The\nnext time this loop is traced, it usually then gives a reasonably sized trace.\nThis gives a problem because now some functions that don't contain loops are not\ninlined, which means they never get assembler code for them generated. To remedy \nthis problem we also make it possible to trace functions from their start (as \nopposed to just tracing loops). We do that only for functions that can not be \ninlinined (either because they contain loops or they were marked as \nnon-inlinable as described above). \nThe result of this is that the Python version telco decimal benchmark runs \nto completion without having to arbitrarily increase the trace length limit. \nIt's also about 40% faster than running it on CPython. This is one of the first \nnon-tiny programs that we speed up. \n \n \nReducing GC Pressure \nArmin and Anto used some GC instrumentation to find places in pypy-c-jit \nthat allocate a lot of memory. This is an endlessly surprising exercise, as \nusually we don't care too much about allocations of short-lived objects when \nwriting RPython, as our GCs usually deal well with those. They found a few \nplaces where they could remove allocations, most importantly by making one of \nthe classes that make up traces smaller. \n \n \nOptimizing Chains of Guards \nCarl Friedrich and Samuele started a simple optimization on the trace level that \nremoves superfluous guards. A common pattern in a trace is to have stronger \nand stronger guards about the same object. As an example, often there is first a \nguard that an object is not None, later followed by a guard that it is exactly \nof a given class and then even later that it is a precise instance of that \nclass. 
This is inefficient, as we can just check the most precise thing in the \nplace of the first guard, saving us guards (which take memory, as they need resume data). \nMaciek, Armin and Anto later improved on that by introducing a new guard that \nchecks for non-nullity and a specific class in one guard, which allows us to \ncollapse more chains. \n \n \nImproving JIT and Exceptions \nArmin and Maciek went on a multi-day quest to make the JIT and Python-level \nexceptions like each other more. So far, raising and catching exceptions would \nmake the JIT generate code that has a certain amusement value, but is not really \nfast in any way. To improve the situation, they had to dig into the exception \nsupport in the Python interpreter, where they found various inefficiencies. They \nalso had to rewrite the exceptions module to be in RPython (as opposed to \njust pure Python + an old hack). Another problems is that tracebacks give you \naccess to interpreter frames. This forces the JIT to deoptimize things, as \nthe JIT keeps some of the frame's content in CPU registers or on the CPU stack, \nwhich reflective access to frames prevents. \nCurrently we try to improve the simple cases where the traceback is never \nactually accessed. This work is not completely finished, but some cases are \nalready significantly faster. \n \n \nMoving PyPy to use py.test 1.1 \nHolger worked on porting PyPy to use the newly released py.test 1.1. PyPy \nstill uses some very old support code in its testing infrastructure, which makes \nthis task a bit annoying. He also gave the other PyPy developers a demo of some \nof the newer py.test features and we discussed which of them we want to start \nusing to improve our tests to make them shorter and clearer. One of the things \nwe want to do eventually is to have less skipped tests than now. \n \n \nUsing a Simple Effect Analysis for the JIT \nOne of the optimization the JIT does is caching fields that are read out of \nstructures on the heap. 
This cache needs to be invalidated at some points, for\nexample when such a field is written to (as we don't track aliasing much).\nAnother case is a call in the assembler, as the target function could\narbitrarily change the heap. This of course is imprecise, since most functions\ndon't actually change the whole heap, and we have an analysis that finds out\nwhich sorts of types of structs and arrays a function can mutate. During the\nsprint Carl Friedrich and Samuele integrated this analysis with the JIT, to help\nit invalidate caches less aggressively. Later Anto and Carl Friedrich also\nported this support to the CLI version of the JIT.\n\n\nMiscellaneous\nSamuele (with some assistance of Carl Friedrich) set up a buildbot slave on a\nMac Mini at the University. This should let us stabilize on Mac OS X. So far\nwe still have a number of failing tests, but now we are in a situation to\nsanely approach fixing them.\nAnto improved the CLI backend to support the infrastructure for producing the\nprofiling graphs Armin introduced.\nThe guinea-pigs that were put into Carl Friedrich's care have been fed (which\nwas the most important sprint task anyway).\n Samuele & Carl Friedrich", + "tags": "", + "url": "https://www.pypy.org/posts/2009/11/dusseldorf-sprint-report-2505348213879053352.html" + }, + { + "title": "D\u00fcsseldorf Sprint Started", + "text": "The D\u00fcsseldorf sprint starts today. Only Samuele and I are there so far, but that should change over the course of the day. We will mostly work on the JIT during this sprint, trying to make it a lot more practical. For that we need to decrease its memory requirements some more and to make it use less aggressive inlining. 
We will post more as the sprint progresses.", + "tags": "", + "url": "https://www.pypy.org/posts/2009/11/dusseldorf-sprint-started-7608527610228870250.html" + }, + { + "title": "PyPy on RuPy 2009", + "text": "Hello.\n\nIt's maybe a bit late to announce, but there will be PyPy talk\nat Rupy conference this weekend in\nPoznan. Precisely, I'll be talking mostly about PyPy's JIT and\nhow to use it. Unfortunately the talk is on Saturday, at 8:30 in the morning.\n\n\nEDIT: Talk is online, together with examples\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2009/11/pypy-on-rupy-2009-5675275348619189353.html" + }, + { + "title": "Logging and nice graphs", + "text": "Hi all,\n\nThis week I worked on improving the system we use for logging. Well, it was not really a \"system\" but rather a pile of hacks to measure in custom ways timings and counts and display them. So now, we have a system :-)\n\nThe system in question was integrated in the code for the GC and the JIT, which are two independent components as far as the source is concerned. However, we can now display a unified view. Here is for example pypy-c-jit running pystone for (only) 5000 iterations:\n\n\n\nThe top long bar represents time. The bottom shows two summaries of the total time taken by the various components, and also plays the role of a legend to understand the colors at the top. Shades of red are the GC, shades of green are the JIT.\n\nHere is another picture, this time on pypy-c-jit running 10 iterations of richards:\n\n\n\nWe have to look more closely at various examples, but a few things immediately show up. One thing is that the GC is put under large pressure by the jit-tracing, jit-optimize and (to a lesser extent) the jit-backend components. So large in fact that the GC takes at least 60-70% of the time there. We will have to do something about it at some point. 
The other thing is that on richards (and it's likely generally the case), the jit-blackhole component takes a lot of time. \"Blackholing\" is the operation of recovering from a guard failure in the generated assembler, and falling back to the interpreter. So this is also something we will need to improve.\n\nThat's it! The images were generated with the following commands:\n\nPYPYLOG=/tmp/log pypy-c-jit richards.py\npython pypy/tool/logparser.py draw-time /tmp/log --mainwidth=8000 --output=filename.png\n\nEDIT: nowadays the command-line has changed to:python rpython/tool/logparser.py draw-time /tmp/log --mainwidth=8000 filename.png", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/11/hi-all-this-week-i-worked-on-improving-6515977421244851229.html" + }, + { + "title": "GC improvements", + "text": "In the last week, I (Armin) have been taking some time off the\nJIT work to improve our GCs. More precisely, our GCs now take\none or two words less for every object. This further reduce the\nmemory usage of PyPy, as we will show at the end.\n\nBackground information: RPython object model\n\nWe first need to understand the RPython object model as\nimplemented by our GCs and our C backend. (Note that the\nobject model of the Python interpreter is built on top of\nthat, but is more complicated -- e.g. Python-level objects\nare much more flexible than RPython objects.)\n\nConsider these two RPython classes:\n \n\nclass A:\n def __init__(self, x):\n self.x = x\n def f(self):\n return self.x * 42\n\nclass B(A):\n def __init__(self, x, y):\n self.x = x\n self.y = y\n def f(self):\n return self.x + self.y\n\n\nThe instances of A and B look like this in memory (all cells\nare one word):\n\n\nGC header\nvtable ptr of A\nhash\nx\n\n\n\nGC header\nvtable ptr of B\nhash\nx\ny\n\n\nThe first word, the GC header, describes the layout. It\nencodes on half a word the shape of the object, including where it\ncontains further pointers, so that the GC can trace it. 
The\nother half contains GC flags (e.g. the mark bit of a\nmark-and-sweep GC).\n\nThe second word is used for method dispatch. It is similar to a\nC++ vtable pointer. It points to static data that is mostly a\ntable of methods (as function pointers), containing e.g. the method f\nof the example.\n\nThe hash field is not necessarily there; it is only present in classes\nwhose hash is ever taken in the RPython program (which includes being\nkeys in a dictionary). It is an \"identity hash\": it works like\nobject.__hash__() in Python, but it cannot just be the address of\nthe object in case of a GC that moves objects around.\n\nFinally, the x and y fields are, obviously, used to store the value\nof the fields. Note that instances of B can be used in places that\nexpect a pointer to an instance of A.\n\nUnifying the vtable ptr with the GC header\n\nThe first idea of saving a word in every object is the observation\nthat both the vtable ptr and the GC header store information about\nthe class of the object. Therefore it is natural to try to only have\none of them. The problem is that we still need bits for the GC flags,\nso the field that we have to remove is the vtable pointer.\n\nThis means that method dispatch needs to be more clever: it\ncannot directly read the vtable ptr, but needs to compute it\nfrom the half-word of the GC header. Fortunately, this can be\ndone with no extra instruction on the assembler level. Here is\nhow things will look like in the end, assuming a 32-bit x86\nmachine (but note that as usual we just generate portable C).\n\nThe trick for achieving efficiency is that we store all\nvtables together in memory, and make sure that they don't take\nmore than 256 KB in total (16 bits, plus 2 bits of alignment).\nHere is how the assembler code (produced by the normal C\ncompiler, e.g. gcc) for calling a method looks like. 
Before\nthe change:\n\n\nMOV EDX, [EAX + 4] # load the vtable ptr from object EAX\nMOV EDX, [EDX + method_offset] # load the function pointer from the vtable\nCALL EDX\n\n\nInstead, we now have:\n\n\nMOVZX EDX, [EAX] # load the 16-bit part of the GC header from EAX\nMOV EDX, [vtable_start + 4*EDX + method_offset]\nCALL EDX\n\n\nNote that the complex addressing scheme done by the second MOV\nis still just one instruction: the vtable_start and\nmethod_offset are constants, so they are combined. And as the\nvtables are anyway aligned at a word boundary, we can use\n4*EDX to address them, giving us 256 KB instead of just 64 KB\nof vtables.\n\nOptimizing the hash field\n\nIn PyPy's Python interpreter, all application-level objects\nare represented as an instance of some subclass of W_Root.\nSince all of these objects could potentially be stored in a\ndictionary by the application Python program, all these\nobjects need a hash field. Of course, in practice, only a\nfraction of all objects in a Python program end up having\ntheir hash ever taken. Thus this field of W_Root is wasted\nmemory most of the time.\n\n(Up to now, we had a hack in place to save the hash field\non a few classes like W_IntegerObject, but that meant that\nthe Python expression ``object.__hash__(42)'' would raise\na TypeError in PyPy.)\n\nThe solution we implemented now (done by some Java GCs, among\nothers) is to add a hash field to an object when the\n(identity) hash of that object is actually taken. This means\nthat we had to enhance our GCs to support this. When objects\nare allocated, we don't reserve any space for the hash:\n\nobject at 0x74B028\n\n...00...\nx\ny\n\n \nWhen the hash of an object is taken, we use its current memory\naddress, and set a flag in the GC header saying that this\nparticular object needs a hash:\n\nobject at 0x74B028\n\n...01...\nx\ny\n\n\nIf the GC needs to move the object to another memory location,\nit will make the new version of the object bigger, i.e. 
it\nwill also allocate space for the hash field:\n\nobject at 0x825F60\n\n...11...\nx\ny\n0x74B028\n\n\nThis hash field is immediately initialized with the old memory\naddress, which is the hash value that we gave so far for the\nobject. To not disturb the layout of the object, we always\nput the extra hash field at the end. Of course, once set,\nthe hash value does not change even if the object needs to\nmove again.\n\nResults\n\nRunning the following program on PyPy's Python interpreter\nwith n=4000000:\n\n\ndef make_linked_list(n):\n a = None\n i = 0\n while i < n:\n b = X()\n b.next = a\n a = b\n i += 1\n\n\nthe two optimizations together save 32 MB of RAM (i.e. 8 bytes\nper object). The version of PyPy we measured this with was built\nas follows:\n\n\n./translate.py --gcremovetypeptr targetpypystandalone --objspace-std-withsharingdict\n\n\nThe total amount of RAM used on a 32-bit Linux is 247 MB,\ncompleting in 10.3 seconds. On CPython, it consumes 684 MB\nand takes 89 seconds to complete... This nicely shows that\nour GCs are much faster at allocating objects, and that our\nobjects can be much smaller than CPython's.\n\nArmin Rigo & Carl Friedrich Bolz", + "tags": "", + "url": "https://www.pypy.org/posts/2009/10/gc-improvements-6174120095428192954.html" + }, + { + "title": "First pypy-cli-jit benchmarks", + "text": "As the readers of this blog already know, I've been working on porting the\nJIT to CLI/.NET for the last months. Now that it's finally possible to get a\nworking pypy-cli-jit, it's time to do some benchmarks.\nWarning: as usual, all of this has to be considered to be a alpha version:\ndon't be surprised if you get a crash when trying to run pypy-cli-jit. Of\ncourse, things are improving very quickly so it should become more and more\nstable as days pass.\nFor this time, I decided to run four benchmarks. 
Note that for all of them we\nrun the main function once in advance, to let the JIT recognize the hot\nloops and emit the corresponding code. Thus, the results reported do\nnot include the time spent by the JIT compiler itself, but give a good\nmeasure of how good is the code generated by the JIT. At this point in time,\nI know that the CLI JIT backend spends way too much time compiling stuff, but\nthis issue will be fixed soon.\n\n\nf1.py: this is the classic PyPy JIT benchmark. It is just a function\nthat does some computationally intensive work with integers.\nfloatdemo.py: this is the same benchmark involving floating point\nnumbers that have already been described in a previous blog post.\noodemo.py: this is just a microbenchmark doing object oriented stuff\nsuch as method calls and attribute access.\nrichards2.py: a modified version of the classic richards.py, with a\nwarmup call before starting the real benchmark.\n\n\nThe benchmarks were run on a Windows machine with an Intel Pentium Dual Core\nE5200 2.5GHz and 2GB RAM, both with .NET (CLR 2.0) and Mono 2.4.2.3.\nBecause of a known mono bug, if you use a version older than 2.1 you need\nto pass the option -O=-branch to mono when running pypy-cli-jit, else it\nwill just loop forever.\nFor comparison, we also run the same benchmarks with IronPython 2.0.1 and\nIronPython 2.6rc1. 
Note that IronPython 2.6rc1 does not work with mono.\nSo, here are the results (expressed in seconds) with Microsoft CLR:\n\n\n\n\n\n\n\n\n\n\n\nBenchmark\npypy-cli-jit\nipy 2.0.1\nipy 2.6\nipy2.01/ pypy\nipy2.6/ pypy\n\n\n\nf1\n0.028\n0.145\n0.136\n5.18x\n4.85x\n\nfloatdemo\n0.671\n0.765\n0.812\n1.14x\n1.21x\n\noodemo\n1.25\n4.278\n3.816\n3.42x\n3.05x\n\nrichards2\n1228\n442\n670\n0.36x\n0.54x\n\n\n\n\nAnd with Mono:\n\n\n\n\n\n\n\n\n\nBenchmark\npypy-cli-jit\nipy 2.0.1\nipy2.01/ pypy\n\n\n\nf1\n0.042\n0.695\n16.54x\n\nfloatdemo\n0.781\n1.218\n1.55x\n\noodemo\n1.703\n9.501\n5.31x\n\nrichards2\n720\n862\n1.20x\n\n\n\n\nThese results are very interesting: under the CLR, we are between 5x faster\nand 3x slower than IronPython 2.0.1, and between 4.8x faster and 1.8x slower\nthan IronPython 2.6. On the other hand, on mono we are consistently faster\nthan IronPython, up to 16x. Also, it is also interesting to note that\npypy-cli runs faster on CLR than mono for all benchmarks except richards2.\nI've not investigated yet, but I think that the culprit is the terrible\nbehaviour of tail calls on CLR: as I already wrote in another blog post,\ntail calls are ~10x slower than normal calls on CLR, while being only ~2x\nslower than normal calls on mono. richads2 is probably the benchmark that\nmakes most use of tail calls, thus explaining why we have a much better result\non mono than CLR.\nThe next step is probably to find an alternative implementation that does not\nuse tail calls: this probably will also improve the time spent by the JIT\ncompiler itself, which is not reported in the numbers above but that so far it\nis surely too high to be acceptable. 
Stay tuned.", + "tags": "cli,jit,pypy", + "url": "https://www.pypy.org/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.html" + }, + { + "title": "PyPy's JIT now supports floats", + "text": "Hello.\n\n\n\nWe've just merged a branch which adds float support to the x86 backend.\nThis means that floating point operations are now super fast\nin PyPy's JIT. Let's have a look at an example, provided by \nAlex Gaynor\nand stolen from the Factor blog.\n\n\n\nThe original version of the benchmark was definitely tuned for the performance needs of CPython.\n\nFor running this on PyPy, I changed to a bit simpler version of the program,\nand I'll explain a few changes that I did, which reflect the current\nlimitations of PyPy's JIT. They're not very deep and they might be\nalready gone while you're reading it:\n\n\n\nUsage of __slots__. This is a bit ridiculous, but we spend quite a bit\n of time to speed up normal instances of new-style classes which are\n very fast, yet ones with __slots__ are slower. To be fixed soon.\n\nUsage of reduce. This one is even more obscure, but reduce is not\n perceived as a thing producing loops in a program. Moving to\n a pure-Python version of reduce fixes the problem.\n\nUsing x ** 2 vs x * x. In PyPy, reading a local variable is a\n no-op when JITted (the same as reading a local variable in C). However\n multiplication is a simpler operation than the power operation.\n\n\n\nI also included the original Java benchmark. 
Please\nnote that the original Java version is similar to my modified one\n(not the one specifically tuned for CPython)\n\n\nThe performance figures below (for n = 1 000 000), average of 10 runs:\n\n\nCPython 2.6: 7.56s\nCPython & psyco 2.6: 4.44s\nPyPy: 1.63s\nJava (JVM 1.6, client mode): 0.77s\n\n\n\nand while JVM is much faster, it's very good that we can even compare :-)\n\n\nCheers\nfijal", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.html" + }, + { + "title": "First results of the JIT", + "text": "Hi all,\n\nJust a quick note to tell you that we are progressing on the\nJIT front. Here are the running times of the richards\nbenchmark on my laptop:\n\n8.18 seconds with CPython 2.5.2;\n\n2.61 seconds with pypy-c-jit (3x faster than CPython);\n\n1.04 seconds if you ignore the time spent making assembler (8x faster than CPython);\n\n1.59 seconds on Psyco, for reference (5x faster than CPython).\n\nYes, as this table shows, we are spending 1.57 seconds in the JIT\nsupport code. That's too much -- even ridiculously so -- for anything but a\nlong-running process. We are working on that :-)\n\nIf you want to build your own pypy-c-jit (for x86-32 only for now):\n\nyou need a Subversion checkout of trunk;\n\nrun pypy/translator/goal/translate.py with the -Ojit\n option;\n\nas usual, wait a long time (and be sure you have more than 1GB of RAM).\n\nFor now pypy-c-jit spews a lot of debugging output and\nthere are a few known\nexamples where it crashes. As we like to repeat, however, it's a complete JIT:\napart from the crashes (the bugs are probably in the JIT support code), it supports the whole Python language from the start -- in the sense of doing correct things. Future work includes\nPython-specific improvements by e.g. 
tweaking the data structures used to store Python objects so that they are more JIT-friendly.\n\nEDIT: Oh yes, fijal reminds me that CPython 2.6 is 30% faster than CPython 2.5 on this benchmark (which is mostly my \"fault\", as I extracted a small part of PyPy and submitted it as a patch to CPython that works particularly well for examples like richards). It does not fundamentally change the fact that we are way faster though.", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/09/first-results-of-jit-6674537807334018925.html" + }, + { + "title": "PyPy sprint in D\u00fcsseldorf, 6 Nov - 13 Nov", + "text": "The next PyPy sprint will be held in the Computer Science department of\nHeinrich-Heine Universit\u00e4t D\u00fcsseldorf from the 6th to the 13th of\nNovember 2009. This is a fully public sprint, everyone is welcome to\njoin us.\n\nTopics and goals\nAt the sprint we intend to work on the JIT generator in PyPy and on\napplying it to PyPy Python interpreter.\nThe precise work that will be done is not fixed, as we don't know in\nwhich state the JIT will be in November. However, possible areas of\nwork might include:\n\ntweaking the interpreter/objspace to be more JIT-friendly, e.g.\ninstance implementation code, call code\nif there is interest starting non x86-32 JIT backends\ntrying out existing software to find features where the optimizations\nof the JIT could be improved\nimproving our benchmarking infrastructure\n\nWe will give special priority to topics that \"non-core\" people find\ninteresting (as long as they are somehow JIT-related).\nFor an introduction of how our JIT-generation process works, please\nrefer to our blog:\nhttps://morepypy.blogspot.com/2009/03/jit-bit-of-look-inside.html\nThere is also a more dense academic paper about the subject:\nhttps://codespeak.net/svn/pypy/extradoc/talk/icooolps2009/bolz-tracing-jit-final.pdf\n\n\nLocation\nThe sprint will take place in a seminar room of the computer science\ndepartment. 
It is in the building 25.12 of the university campus. For\ntravel instructions see\n\nhttps://stups.cs.uni-duesseldorf.de/anreise/esbahn.php\n\n\nRegistration\nIf you'd like to come, please subscribe to the pypy-sprint mailing\nlist and drop a note about your interests and post any questions.\nMore organisational information will be sent to that list. We'll keep a\nlist of people which we'll update (which you can do yourself if\nyou have codespeak commit rights).", + "tags": "", + "url": "https://www.pypy.org/posts/2009/09/pypy-sprint-in-dusseldorf-6-nov-13-nov-8153983964308175836.html" + }, + { + "title": "PyPy gets a new compiler", + "text": "Today, I merged the parser-compiler branch, which I have been working on over the summer. It contained a total rewrite of both PyPy's Python parser and AST compiler. PyPy's old parser was (in)famous internally for being complicated and slow (with many algorithmic complexities greater than O(n)). The new parser is as simple as I could make it: an LL(1) parser like CPython's (though it doesn't share the hacks of CPython's parser).\n\nThe new compiler is based on the Abstract Syntax Trees (AST) that CPython 2.5 introduced instead of PyPy's old AST based on the compiler package's. This means that Python code running on PyPy will be able to use the same _ast interface as CPython. PyPy's _ast implementation supports AST features that CPython 2.6 added, including compiling modified AST to bytecode and executing it. In this rewrite, some more obscure compiler features were added, too. For example, jumps in bytecode can now be greater than 65535 bytes! (That's like an if statement with 7000 lines of code in the body.)\n\nWhile the PyPy translation toolchain still has many obscure details and hacks, this merge completes the process of making the actual Python interpreter very clean. 
Hopefully, this will make adding new features much easier and make PyPy less frustrating to maintain as well as providing application level code with an improved AST interface!", + "tags": "compiler,parser,speed", + "url": "https://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.html" + }, + { + "title": "Gothenburg JIT sprint report", + "text": "Finally, we managed to squeeze in some time to write a report about what\nhas been going on the mysterious JIT sprint in Gothenburg, Sweden.\nThe main goals of the sprint were to lay down the groundwork for getting\nmore JIT work going in the next months and get more of PyPy developers\nup to speed with the current state of the JIT. One of the elements was\nto get better stability of the JIT, moving it slowly from being a prototype to\nactually work nicely on larger programs.\n\nThe secret goal of the sprint was to seek more speed, which Anto and\nCarl Friedrich did even during the break day:\n\n\nWe spent the first two days improving test coverage of the x86 backend\nand the optimizer. Now we have 100% coverage with unittests\n(modulo figleaf bugs), which does not mean anything, but it's better\nthan before.\n\nThen we spent quite some time improving the optimizer passes, so\nnow we generate far less code than before the sprint, because a lot of\nit is optimized away. On the interpreter side, we marked more objects\n(like code objects) as immutable, so that reading fields from them\ncan be constant-folded.\nAnother important optimization that we did is to remove consecutive\nreading of the same fields from the same structure, if no code in between\ncan change it.\nOur JIT is a hybrid environment, where only hot loops of code are jitted\nand the rest stays being interpreted. We found out that the performance\nof the non-jitted part was suboptimal, because all accesses to python\nframes went through an extra layer of indirection. 
We removed this layer\nof indirection, in the case where the jit and the interpreter cannot\naccess the same frame (which is the common case).\nWe also spent some time improving the performance of our x86 backend,\nby making it use more registers and by doing more advanced variable\nrenaming at the end of loops. It seems that using more registers is not as\nmuch of a win as we hoped, because modern day processors are much\nsmarter than we thought.\nThe most mind bending part was finding why we lose performance by\nmaking the JIT see more of the interpreter. It took us two very frustrating\ndays and 36 gray hairs to find out that from the JIT we call a different malloc\nfunction in the Boehm GC, which is by far slower than the version that\nwe use from the interpreter. This meant that the more we jitted, the\nslower our code got, purely because of the mallocs.\nNow that this is fixed, the world makes much more sense again.\nA lot of the sprint's work is not directly measurable in the performance\nfigures, but we did a lot of work that is necessary for performance to\nimprove in the next weeks. After we have done a bit more work, we should\nbe able to provide some performance figures for programs that are\nmore realistic than just loops that count to ten million (which are\nvery fast already :).\nNow we're going to enjoy a couple of days off to recover from the sprint.\nB\u00e4sta h\u00e4lsningar,\nCarl Friedrich, fijal", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/08/gothenburg-jit-sprint-report-3309138497953458138.html" + }, + { + "title": "PyPy numeric experiments", + "text": "Because PyPy will be presenting at the upcoming euroscipy conference, I have been playing recently with the idea of NumPy and PyPy integration. My idea is to integrate PyPy's JIT with NumPy or at least a very basic subset of it. Time constraints make it impossible to hand write a JIT compiler that understands NumPy. 
But given PyPy's architecture we actually have a JIT generator, so we don't need to write one :-)\n\n\n\nOur JIT has shown that it can speed up small arithmetic examples significantly. What happens with something like NumPy?\n\n\nI wrote a very minimal subset of NumPy in RPython, called micronumpy (only single-dimension int arrays that can only get and set items), and a benchmark against it. The point of this benchmark is to compare the performance of a builtin function (numpy.minimum) against the equivalent hand-written function, written in pure Python and compiled by our JIT.\n\n\nThe goal is to prove that it is possible to write algorithms in Python instead of C without loss of efficiency. Sure, we can write some functions (like minimum in the following example), but there is a whole universe of other ufuncs which would be cool to have in Python instead, assuming this could be done without a huge loss in efficiency.\n\n\nHere are the results. This is comparing PyPy svn revision 66303 in the pyjitpl5 branch against python 2.6 with NumPy 1.2.1. The builtin numpy.minimum in PyPy is just a naive implementation in RPython, which is comparable to the speed of a naive implementation written in C (and thus a bit slower than the optimized\nversion in NumPy):\n\n\n\nNumPy (builtin function): 0.12s\nPyPy's micronumpy (builtin function): 0.28s\nCPython (pure Python): 11s\nPyPy with JIT (pure Python): 0.91s\n\n\nAs we can see, PyPy's JIT is slower than NumPy's optimized C version, but still much faster than CPython (12x).\n\n\nWhy is it slower? When you actually look at assembler, it's pretty obvious that it's atrocious. There's a lot of speedup to be gained out of just doing simple optimizations on resulting assembler. There are also pretty obvious limitations, like x86 backend not being able to emit opcodes for floats or x86_64 not being there. Those limitations are not fundamental in any sense and can be relatively straightforward to overcome. 
Therefore it seems we can get C-level speeds for pure Python implementations of numeric algorithms using NumPy arrays in PyPy. I think it's an interesting perspective that Python has the potential of becoming less of a glue language and more of a real implementation language in the scientific field.\n\nCheers,\nfijal", + "tags": "numpy", + "url": "https://www.pypy.org/posts/2009/07/pypy-numeric-experiments-2221073696038673235.html" + }, + { + "title": "ECOOP 2009", + "text": "Last week (from 6th to 10th of July) Anto, Armin and me (Carl Friedrich) were in\nthe magnificent city of Genova, Italy at the ECOOP conference. In this blog\npost I want to give a (necessarily personal) account of what we did there.\n\nWorkshop days: ICOOOLPS\nThe first two days of the conference were the workshop days. On Monday we\nattended the ICOOOLPS workshop, (see the programme of the workshop). We\nhad gotten two papers accepted at the workshop (one about layering PyPy's JIT\non top of the CLR and one about the basic idea of PyPy's tracing JIT) and\nthus gave two presentations at the workshop, one was given by Anto, the other\nby me. Both went reasonably well, we got some positive feedback.\nNearly all the other talks were rather interesting as well. I particularly liked\nthe one by Hans Schippers, who presented a machine model built on delegation\ncalled delMDSOC. The model is meant implement most features that a language\nwould need that makes it possible to separate cross-cutting concerns. In the\ntalk at ICOOOLPS he presented an extension to the model that adds concurrency\nsupport, using a combination of actors and coroutines. He then showed that the\nconcurrency mechanisms of Java, Salsa (and extension of Java adding actors) and\nIo can be mapped to this model.\nFurthermore there were two interesting invited talks, one by Andreas Gal\n(Mozilla), and one by Cliff Click (Azul Systems). Andreas explained how\nTraceMonkey works. 
This was very useful for me, because his talk was just before\nmine and I could thus kill most of my introduction about tracing JIT compilers\nand have more time for the really interesting stuff :-). Cliff talked about\nimplementing other languages on top of the JVM and some of the pitfalls in\ngetting them perform well.\nAll in all, ICOOOLPS was a very enjoyable workshop, also with many interesting\ndiscussions.\nOn Tuesday there were more workshops, but also the PyPy tutorial, so I only went\nto a few talks of the COP workshop and spent the rest of the morning\npreparing the tutorial (see next section).\n\n\nTutorial\nOn Tuesday afternoon we gave a PyPy Tutorial, as part of the ECOOP summer\nschool. The first lesson we learned was that (as opposed to a community\nconference) people don't necessarily want to actually take their laptop out and\ntry stuff. We gave a slow walk-through about the full life-cycle of development\nof a dynamic language interpreter using PyPy's tool-chain: Starting from writing\nyour interpreter in RPython, testing it on top of CPython to translating it to\nC, .NET or Java to actually adding hints to get a JIT inserted.\nThere were about seven people attending the tutorial, a couple of which were\nvery interested and were asking questions and discussing. Some of the\ndiscussions were even very technical, e.g. one about the details of our\ntype-inference algorithm for RPython and why we cannot do a bottom-up analysis\nbut have to use forward-propagation instead.\nJan Vitek of Purdue University told of some of the problems of the OVM\nproject, which is (among other things) a Java implementation in Java (OVM also\nwants to support implementing VMs for other languages with it, if I understood\ncorrectly). He said that the project has\nessentially gotten too large and complicated, which means that it is very hard\nfor new people to get into the project. While PyPy doesn't have some of the\nproblems of a full Java implementation (e.g. 
right now our concurrency support\nis minimal) I definitely think that some of these risks apply to PyPy as well\nand we should find ways to improve the situation in this regard. Channeling\nSamuele: Somewhere inside the large lumbering blob of PyPy there is an elegant\ncore trying to get out.\n\n\nMain Conference\nFrom Wednesday till Friday the main conference was happening. Many of the\ntalks were not all that interesting for me, being quite Java centric. One talk\nthat I liked a lot was \"Making Sense of Large Heaps\", which was presented by\nNick Mitchell (IBM). He presented a tool called \"Yeti\" that can be used to\nanalyze large heaps of Java programs. The tool uses some clever algorithms and\nheuristics to summarize the heap usage of data structures in intelligent ways to\nmake it easier to find possible memory-wasters in a program. Nick also gave Anto\nand me a demo of the tool, where we tried to apply it to pypy-jvm (we found\nout that a fifth of the static data in there belongs to the parser/compiler :-(\n).\nOn each of the days of the conference there was a keynote. I missed the one by\nSimon Peyton-Jones on Wednesday about type classes in Haskell. On Thursday,\nDavid Ungar was awarded the Dahl-Nygaard-Prize for his work on the Self\nprogramming language. Subsequently he gave a really inspiring keynote with the\ntitle \"Self and Self: Whys and Wherefores\" where he recollected Self's history,\nboth on a technical as well as on a social level. Parts of the talk were\nsnippets from the movies Self: The Movie and Alternate Reality Kit, both\nof which I highly recommend.\nThe keynote on Friday was by Cliff Click with the title \"Java on 1000 Cores:\nTales of Hardware/Software Co-design\". He described the custom CPU architecture\nthat Azul Systems has developed to run Java server applications on hundreds of\ncores. The talk mostly talked about the hardware, which I found very interesting\n(but some people didn't care for too much). 
Azul's CPU is essentially 54 in-order\nRISC cores in a single processor. The cores have a lot of extensions that make\nit easier to run Java on them, e.g. hardware read- and write-barriers,\nhardware-transactional-memory and hardware escape-detection (!).\nIn addition to the talks, there is of course always the hallway track (or coffee\ntrack) which is the track where you stand in the hallway and discuss with\npeople. As usual, this was the most interesting part of the conference. One of\nthose talks was Anto and me giving a PyPy demo to David Ungar. We had a very\ninteresting discussion about VM implementation in general and the sort of\ndebugging tools you need to write in particular. He liked PyPy a lot, which\nmakes me very happy. He also liked the fact that I have actually read most Self\npapers :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2009/07/ecoop-2009-8415055006373020774.html" + }, + { + "title": "EuroPython", + "text": "EuroPython is coming. We have two 30-minutes talks that we will present. In addition, the sprint takes place the 29th of June (there will be no-one from the team on the 28th of June), as well as on the 3rd and 4th of July.", + "tags": "", + "url": "https://www.pypy.org/posts/2009/06/europython-8318355560715932819.html" + }, + { + "title": "JIT progress", + "text": "In the last days I finally understood how to do virtualizables. Now the frame overhead is gone. This was done with the help of discussion with Samuele, porting ideas from PyPy's first JIT attempt.\n\n\nThis is of course work in progress, but it works in PyPy (modulo a few XXXs, but no bugs so far). The performance of the resulting code is quite good: even with Boehm (the GC that is easy to compile to but gives a slowish pypy-c), a long-running loop typically runs 50% faster than CPython. That's \"baseline\" speed, moreover: we will get better speed-ups by applying optimizations on the generated code. 
Doing so is in progress, but it suddenly became easier because that optimization phase no longer has to consider virtualizables -- they are now handled earlier.\n\nUpdate: Virtualizables is basically a way to avoid frame overhead. The frame object\nis allocated and has a pointer, but the JIT is free to unpack its fields (for example python\nlevel locals) and store them somewhere else (stack or registers). Each external (out of jit) access\nto a frame managed by the jit needs to go via special accessors that can ask the jit where those variables\nare.", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/06/jit-progress-7289127796450840053.html" + }, + { + "title": "News from the jit front", + "text": "As usual, progress is going slower than predicted,\nbut nevertheless, we're working hard to make some progress.\n\n\nWe recently managed to make our nice GCs cooperate with our JIT. This is\none point from our detailed plan. As of now, we have a JIT with GCs and\nno optimizations. It already speeds up some things, while slowing down\nothers. The main reason for this is that the JIT generates assembler which is kind\nof ok, but it does not do the same level of optimizations gcc would do.\n\n\nSo the current status of the JIT is that it can produce assembler out\nof executed python code (or any interpreter written in RPython actually),\nbut the results are not high quality enough since we're missing optimizations.\n\n\nThe current plan, as of now, looks as follows:\n\n\nImprove the handling of GCs in JIT with inlining of malloc-fast\n paths, that should speed up things by a constant, not too big factor.\n\n\nWrite a simplified python interpreter, which will be a base for experiments\n and to make sure that our JIT does correct things with regard to\n optimizations. 
That would work as a mid-level integration test.\n\n\nThink about ways to inline loop-less python functions into their parent's loop.\n\n\nGet rid of frame overhead (by virtualizables)\n\n\nMeasure, write benchmarks, publish\n\n\nProfit\n\n\n\nCheers,\nfijal", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/06/news-from-jit-front-367552118380842303.html" + }, + { + "title": "ICOOOLPS Submissions", + "text": "Both of the papers that people from the PyPy team submitted to ICOOOLPS have\nbeen accepted. They are:\n\n\n\"Faster than C#: efficient implementation of dynamic languages on .NET\"\n(pdf1) by Armin, Anto and Davide Ancona, who is Anto's Ph.D. advisor\n\"Tracing the Meta-Level: PyPy\u2019s Tracing JIT Compiler\" (pdf2) by Carl\nFriedrich, Armin, Anto and Maciek\n\n\n(the pdfs are obviously the submitted versions, not the final ones).\nThis year ICOOOLPS (Implementation, Compilation, Optimization of\nObject-Oriented Languages, Programs and Systems) is being held on July the 6th\nat ECOOP 2009 in Genova, Italy. Other than these two papers, Anto and Carl\nFriedrich will also present a PyPy tutorial, on July the 7th.", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/05/icooolps-submissions-6705901656116873587.html" + }, + { + "title": "4 weeks of GDB", + "text": "Hello.\n\nSo, according to our jit\nplan we're mostly done with point 1, that is to provide a JIT that compiles\npython code to assembler in the most horrible manner possible but doesn't\nbreak. That meant mostly 4 weeks of glaring at GDB and megabytes of assembler\ngenerated by C code generated from python code. The figure of 4 weeks proves\nthat our approach is by far superior to the one of psyco, since Armin says it's\n\"only 4 weeks\" :-)\n\n\nRight now, pypy compiled with JIT can run the whole CPython test suite\nwithout crashing, which means we're done with obvious bugs and the only\nones waiting for us are really horrible. 
(Or they really don't exist.\nAt least they should never be about obscure Python corner cases: they can\nonly be in the 10'000 lines of relatively clear code that is our JIT\ngenerator.)\n\n\nBut... the fun thing is that we can actually concentrate on optimizations!\nSo the next step is to provide a JIT that is correct *and* actually speeds\nup python. Stay tuned for more :-)\n\nCheers,\nfijal, armin & benjamin\n\nUPDATE: for those of you blessed with no knowledge of C, gdb stands for GNU debugger, a classic debugger for C. (It's also much more powerful than python debugger, pdb, which is kind of surprising).", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/04/4-weeks-of-gdb-522864241041643529.html" + }, + { + "title": "1.1 final released", + "text": "We just released PyPy 1.1 final. Not much changed since the beta, apart\nfrom some more fixed bugs. Have fun with it!", + "tags": "release", + "url": "https://www.pypy.org/posts/2009/04/11-final-released-225813777919757859.html" + }, + { + "title": "Roadmap for JIT", + "text": "Hello.\n\n\nFirst a disclaimer. This post is more about plans for future than current\nstatus. We usually try to write about things that we have done, because\nit's much much easier to promise things than to actually make it happen,\nbut I think it's important enough to have some sort of roadmap.\n\n\nIn recent months we came to the point where the 5th generation of\nJIT prototype was working as nice\nor even a bit nicer than 1st one back in 2007. Someone might ask \"so why\ndid you spend all this time without going forward?\". And indeed, we spend\na lot of time moving sideways, but as posted, we also spent a lot of time\ndoing some other things, which are important as well.\nThe main advantage of current JIT incarnation is much much simpler than\nthe first one. Even I can comprehend it, which is much of an improvement :-)\n\n\nSo, the prototype is working and gives very nice speedups in range of 20-30x\nover CPython. 
We're pretty confident this prototype will work and will\nproduce a fast python interpreter eventually. So we decided that now we'll\nwork towards changing the prototype into something stable and solid. This\nmight sound easy, but in fact it's not. Having a stable assembler backend\nand optimizations that keep semantics is not as easy as it might sound.\n\n\nThe current roadmap, as I see it, looks as follows:\n\n\n Provide a JIT that does not speed things up, but produces assembler without\n optimizations turned on, that is correct and able to run CPython's library\n tests on a nightly basis.\n\n\n Introduce simple optimizations, that should make above JIT a bit faster than\n CPython. With optimizations disabled JIT is producing incredibly dumb\n assembler, which is slower than corresponding C code, even with removal\n of interpretation overhead (which is not very surprising).\n\n\n Backport optimizations from JIT prototype, one by one, keeping an eye\n on how they perform and making sure they don't break anything.\n\n\n Create new optimizations, like speeding up attribute access.\n\n\n Profit.\n\n\n\nThis way, we can hopefully provide a working JIT, which gives a fast python\ninterpreter, which is a bit harder than just a nice prototype.\n\n\nTell us what you think about this plan.\n\nCheers,\nfijal & others.", + "tags": "jit,pypy,roadmap,speed", + "url": "https://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.html" + }, + { + "title": "Leysin Sprint Report", + "text": "The Leysin sprint is nearing its end, as usual here is an attempt at a summary\nof what we did.\nRelease Work\nLarge parts of the sprint were dedicated to fixing bugs. Since the easy bugs\nseem to have been fixed long ago, those were mostly very annoying and hard bugs.\nThis work was supported by our buildbots, which we tried to get free of\ntest-failures. This was worked on by nearly all participants of the sprint\n(Samuele, Armin, Anto, Niko, Anders, Christian, Carl Friedrich). 
One\nparticularly annoying bug was the differences in the tracing events that PyPy\nproduces (fixed by Anders, Samuele and Christian). Some details about larger\ntasks are in the sections below.\nThe work culminated in the beta released on Sunday.\n\nStackless\nA large number of problems came from our stackless features, which do some\nadvanced things and thus seem to contain advanced bugs. Samuele and Carl\nFriedrich spent some time fixing tasklet pickling and unpickling. This was\nachieved by supporting the (un)pickling of builtin code objects. In addition\nthey fixed some bugs in the finalization of tasklets. This needs some care\nbecause the __del__ of a tasklet cannot run at arbitrary points in time, but\nonly at safe points. This problem was a bit subtle to get right, and popped up\nnearly every morning of the sprint in form of a test failure.\nArmin and Niko added a way to restrict the stack depth of the RPython-level\nstack. This can useful when using stackless, because if this is not there it is\npossible that you fill your whole heap with stack frames in the case of an\ninfinite recursion. Then they went on to make stackless not segfault when\nthreads are used at the same time, or if a callback from C library code is in\nprogress. Instead you get a RuntimeError now, which is not good but better\nthan a segfault.\n\n\n\nKilling Features\nDuring the sprint we discussed the fate of the LLVM and the JS backends. Both\nhave not really been maintained for some time, and even partially untested\n(their tests were skipped). Also their usefulness appears to be limited. The JS\nbackend is cool in principle, but has some serious limitations due to the fact\nthat JavaScript is really a dynamic language, while RPython is rather static.\nThis made it hard to use some features of JS from RPython, e.g. RPython does not\nsupport closures of any kind.\nThe LLVM backend had its own set of problems. 
For\na long time it produced the fastest form of PyPy's Python interpreter, by first\nusing the LLVM backend, applying the LLVM optimizations to the result, then\nusing LLVM's C backend to produce C code, then apply GCC to the result :-).\nHowever, it is not clear that it is still useful to directly produce LLVM\nbitcode, since LLVM has rather good C frontends nowadays, with llvm-gcc and\nclang. It is likely that we will use LLVM in the future in our JIT (but that's\nanother story, based on different code).\nTherefore we decided to remove these two backends from SVN, which Samuele and\nCarl Friedrich did. They are not dead, only resting until somebody who is\ninterested in maintaining them steps up.\n\n\nWindows\nOne goal of the release is good Windows-support. Anders and Samuele set up a new\nwindows buildbot which revealed a number of failures. Those were attacked by\nAnders, Samuele and Christian as well as by Amaury (who was not at the sprint,\nbut thankfully did a lot of Windows work in the last months).\n\n\nOS X\nChristian with some help by Samuele tried to get translation working again under\nMac OS X. This was a large mess, because of different behaviours of some POSIX\nfunctionality in Leopard. It is still possible to get the old behaviour back,\nbut whether that was enabled or not depended on a number of factors such as\nwhich Python is used. Eventually they managed to successfully navigate that maze\nand produce something that almost works (there is still a problem remaining\nabout OpenSSL).\n\n\nDocumentation\nThe Friday of the sprint was declared to be a documentation day, where (nearly)\nno coding was allowed. This resulted in a newly structured and improved getting\nstarted document (done by Carl Friedrich, Samuele and some help of Niko) and\na new document describing differences to CPython (Armin, Carl Friedrich) as\nwell as various improvements to existing documents (everybody else). 
Armin\nundertook the Sisyphean task of listing all talks, paper and related stuff\nof the PyPy project.\n\n\n\nVarious Stuff\n\nJava Backend Work\nNiko and Anto worked on the JVM backend for a while. First they had to fix\ntranslation of the Python interpreter to Java. Then they tried to improve the\nperformance of the Python interpreter when translated to Java. Mostly they did a\nlot of profiling to find performance bottlenecks. They managed to improve\nperformance by 40% by overriding fillInStackTrace of the generated exception\nclasses. Apart from that they found no simple-to-fix performance problems.\n\n\nJIT Work\nArmin gave a presentation about the current state of the JIT to the sprinters as\nwell as Adrian Kuhn, Toon Verwaest and Camillo Bruni of the University of Bern\nwho came to visit for one day. There was a bit of work on the JIT going on too;\nArmin and Anto tried to get closer to having a working JIT on top of the CLI.", + "tags": "", + "url": "https://www.pypy.org/posts/2009/04/leysin-sprint-report-1416905818217912359.html" + }, + { + "title": "Beta for 1.1.0 released", + "text": "Today we are releasing a beta of the upcoming PyPy 1.1 release. There\nare some Windows and OS X issues left that we would like to address\nbetween now and the final release but apart from this things should be\nworking. We would appreciate feedback.\nThe PyPy development team.\n\nPyPy 1.1: Compatibility & Consolidation\nWelcome to the PyPy 1.1 release - the first release after the end of EU\nfunding. This release focuses on making PyPy's Python interpreter more\ncompatible with CPython (currently CPython 2.5) and on making the\ninterpreter more stable and bug-free.\nPyPy's Getting Started lives at:\n\nhttps://codespeak.net/pypy/dist/pypy/doc/getting-started.html\n\nHighlights of This Release\n\n\nMore of CPython's standard library extension modules are supported,\namong them ctypes, sqlite3, csv, and many more. 
Most of these extension modules\nare fully supported under Windows as well.\nhttps://codespeak.net/pypy/dist/pypy/doc/cpython_differences.html\nhttps://morepypy.blogspot.com/2008/06/pypy-improvements.html\n\nThrough a large number of tweaks, performance has been improved by\n10%-50% since the 1.0 release. The Python interpreter is now between\n0.8-2x (and in some corner case 3-4x) slower than CPython. A large\npart of these speed-ups come from our new generational garbage\ncollectors.\nhttps://codespeak.net/pypy/dist/pypy/doc/garbage_collection.html\n\nOur Python interpreter now supports distutils as well as\neasy_install for pure-Python modules.\n\nWe have tested PyPy with a number of third-party libraries. PyPy can\nrun now: Django, Pylons, BitTorrent, Twisted, SymPy, Pyglet, Nevow,\nPinax:\nhttps://morepypy.blogspot.com/2008/08/pypy-runs-unmodified-django-10-beta.html\nhttps://morepypy.blogspot.com/2008/07/pypys-python-runs-pinax-django.html\nhttps://morepypy.blogspot.com/2008/06/running-nevow-on-top-of-pypy.html\n\nA buildbot was set up to run the various tests that PyPy is using\nnightly on Windows and Linux machines:\nhttps://codespeak.net:8099/\n\nSandboxing support: It is possible to translate the Python\ninterpreter in a special way so that the result is fully sandboxed.\nhttps://codespeak.net/pypy/dist/pypy/doc/sandbox.html\nhttps://blog.sandbox.lt/en/WSGI%20and%20PyPy%20sandbox\n\n\n\n\n\nOther Changes\n\n\nThe clr module was greatly improved. This module is used to\ninterface with .NET libraries when translating the Python\ninterpreter to the CLI.\nhttps://codespeak.net/pypy/dist/pypy/doc/clr-module.html\nhttps://morepypy.blogspot.com/2008/01/pypynet-goes-windows-forms.html\nhttps://morepypy.blogspot.com/2008/01/improve-net-integration.html\n\nStackless improvements: PyPy's stackless module is now more\ncomplete. We added channel preferences which change details of the\nscheduling semantics. 
In addition, the pickling of tasklets has been\nimproved to work in more cases.\n\nClassic classes are enabled by default now. In addition, they have\nbeen greatly optimized and debugged:\nhttps://morepypy.blogspot.com/2007/12/faster-implementation-of-classic.html\n\nPyPy's Python interpreter can be translated to Java bytecode now to\nproduce a pypy-jvm. At the moment there is no integration with\nJava libraries yet, so this is not really useful.\n\nWe added cross-compilation machinery to our translation toolchain to\nmake it possible to cross-compile our Python interpreter to Nokia's\nMaemo platform:\nhttps://codespeak.net/pypy/dist/pypy/doc/maemo.html\n\nSome effort was spent to make the Python interpreter more\nmemory-efficient. This includes the implementation of a mark-compact\nGC which uses less memory than other GCs during collection.\nAdditionally there were various optimizations that make Python\nobjects smaller, e.g. class instances are often only 50% of the size\nof CPython.\nhttps://morepypy.blogspot.com/2008/10/dsseldorf-sprint-report-days-1-3.html\n\nThe support for the trace hook in the Python interpreter was\nimproved to be able to trace the execution of builtin functions and\nmethods. With this, we implemented the _lsprof module, which is\nthe core of the cProfile module.\n\nA number of rarely used features of PyPy were removed since the previous\nrelease because they were unmaintained and/or buggy. Those are: The\nLLVM and the JS backends, the aspect-oriented programming features,\nthe logic object space, the extension compiler and the first\nincarnation of the JIT generator. 
The new JIT generator is in active\ndevelopment, but not included in the release.\nhttps://codespeak.net/pipermail/pypy-dev/2009q2/005143.html\nhttps://morepypy.blogspot.com/2009/03/good-news-everyone.html\nhttps://morepypy.blogspot.com/2009/03/jit-bit-of-look-inside.html\n\n\n\n\n\nWhat is PyPy?\nTechnically, PyPy is both a Python interpreter implementation and an\nadvanced compiler, or more precisely a framework for implementing dynamic\nlanguages and generating virtual machines for them.\nThe framework allows for alternative frontends and for alternative\nbackends, currently C, Java and .NET. For our main target \"C\", we can\n\"mix in\" different garbage collectors and threading models,\nincluding micro-threads aka \"Stackless\". The inherent complexity that\narises from this ambitious approach is mostly kept away from the Python\ninterpreter implementation, our main frontend.\nSocially, PyPy is a collaborative effort of many individuals working\ntogether in a distributed and sprint-driven way since 2003. PyPy would\nnot have gotten as far as it has without the coding, feedback and\ngeneral support from numerous people.\nHave fun,\n\nthe PyPy release team, [in alphabetical order]\nAmaury Forgeot d'Arc, Anders Hammerquist, Antonio Cuni, Armin Rigo,\nCarl Friedrich Bolz, Christian Tismer, Holger Krekel,\nMaciek Fijalkowski, Samuele Pedroni\nand many others:\nhttps://codespeak.net/pypy/dist/pypy/doc/contributor.html", + "tags": "release", + "url": "https://www.pypy.org/posts/2009/04/beta-for-110-released-4604559533184706699.html" + }, + { + "title": "Leysin Sprint Started", + "text": "The Leysin Sprint started today. The weather is great and the view is wonderful, as usual. Technically we are working on the remaining test failures of the nightly test runs and are generally trying to fix various long-postponed bugs. 
I will try to give more detailed reports as the sprint progresses.", + "tags": "", + "url": "https://www.pypy.org/posts/2009/04/leysin-sprint-started-4551365436232104640.html" + }, + { + "title": "Pycon videos are online", + "text": "Hi.\n\nWe didn't yet write full pycon summary, but both of our talks are now online: PyPy status talk and python in a sandbox.\nUpdate:\nSlides are also available: PyPy status talk and Python in a sandbox.\n\n\nEnjoy!\nfijal & holger", + "tags": "", + "url": "https://www.pypy.org/posts/2009/04/pycon-videos-are-online-909873128878039557.html" + }, + { + "title": "VM summit: nice to see friendly competition", + "text": "So Google has launched the unladen swallow project\nwith this first goal: \n\n\n Produce a version of Python at least 5x faster than CPython.\n\n\nWe discussed some details with Collin Winter, Jeffrey Yasskin and Thomas Wouters\nduring the VM summit yesterday. We were a bit confused about usage\nof the term JIT, because as far as we understood, it's going to be upfront\ncompilation into LLVM. In the past we have looked into LLVM\n \u2013 at one point PyPy used it extensively but it\nwasn't clear how we could make good use of it. \nThey also consider changing to something else than LLVM. It's gonna be \ninteresting to see how this works out. \n\n\nIt's good to see friendly competition, and we think that we should take up\nthe challenge and see if we can produce faster pickling, run 2to3 and \nDjango faster than what they can come up with. We also talked \nto IronPython and Jython developers and all agreed that some\ncommon benchmarks would be good. And maybe do weekly\npress releases about small speed increases? :) \n\n\nThe idea of the VM summit here in Chicago was to bring together implementors\nof various virtual machine languages.
There were members of the communities of\nIronPython, CPython, GemStone's MagLev, Rubinius, Mozilla's TraceMonkey, Parrot, \nSun's Da Vinci Machine, Microsoft's DLR, Jython and JRuby.\nEverybody got to talk 5-10 minutes on their current status and \nchallenges. It is clear that you cannot begin to cover the \ncomplexities and architectures of the involved projects. \nBut that wasn't too much of a problem because the rest of\nthe day everybody freely and dynamically grouped on their\nissues of choice. We established some more personal contacts,\nwas great to chat with people like Andreas Gal from the University of \nCalifornia, Irvine, who have a very similar idea about the JIT\nthat we have. Actually, we could probably have mixed our\ntwo presentations and nobody would have actually noticed :-).\n\n\nAt the end of the presentation part, John Rose presented his\nslides. John is a Hotspot developer, and while not precisely a dynamic\nlanguage implementor, he has a lot of experience in virtual\nmachine implementation. It's very good to see the JVM being extended towards\nsupporting dynamic-language specific things, in order to be something\nmore than just a good platform for Java. We'll probably have \nsome extra meetup with him the next days. \n\ncheers, \nholger and fijal", + "tags": "", + "url": "https://www.pypy.org/posts/2009/03/vm-summit-nice-to-see-friendly-8755773725359396485.html" + }, + { + "title": "PyPy talk at OpenBossa 09", + "text": "Yesterday i gave my PyPy status/mobile perspectives at OpenBossa, Nokia's developer conference for embedded platforms in Brazil. Found it a bit of a tough task to do that in 50 minutes. I had some 50, later more developers attending the talk and was happy with the questions and the feedback. Guess it's a good sign if the number of people grows during a talk :) It was the first time i tried to work more with pictures and actually used some deviantART photos from Marikaz to mark section transitions.
I summarize/highlight some key points here in the post.\nAfter intro and 2.5 compatibility status, i talked about our measurements of PyPy's Python on Nokia's N810 internet tablet. The best bit is that for almost all Python data structures PyPy has smaller memory representations than CPython. Particularly good are class instances which often score at 50% of CPython's sizes. Startup time is also often better and can be improved. On the bad side, PyPy's quite large base interpreter size and its bytecode execution is often worse. In the talk i also outline ideas for \"perfect PYC files\" for minimizing module import times and maximizing sharing across interpreter processes. I also briefly discussed the PyPy situation with extension modules and regarding C++ libs. Most of these ideas arose from sprint discussions last year. In the morning i also had some good talk with Stefan Seefeld about Boost Python and the new QT4 bindings. Maybe to use Boost Python is also a good opportunity - but PyPy does not currently have a C-level or C++ level API.\nIn subsequent lunch discussions people agreed that PyPy has three main interesting areas currently:\n\nthe Python Just-In-Time Compiler\na virtualized, sandboxed Python interpreter\nan efficient Python interpreter for small devices\n\nI think our upcoming 1.1 release will be a good point in time for many people to look some more into PyPy. I hope we are crossing the chasm soon. It's been a while since the project started :) Getting some more sponsoring to sustain and increase our current efforts probably wouldn't hurt.\nNow i am off to spend my last day in Recife / Brazil, fly back to Germany in the evening and then spend time on preparing for Pycon 2009. And I guess i am going to enjoy some naturally cold air - at least my two jogging sessions at Brazillian beaches, at a sustained 30 degrees celsius, were tough. 
I guess i shouldn't complain, though :)\nWas great meeting all the Brazilian guys and the few women - just had breakfast with Kate Alhola, kernel hacker and working on the new \"Freemantle\" graphical platform. Many thanks go to Marcio Marcedo and the Python team at INDT who invited me here. Hope to come again next year and eventually talk more about the Zone VM :)\nIf you are interested in some more not so pypy-specific bits about the conference and what i experienced, you might head over to my tetamap blog.\nholger", + "tags": "", + "url": "https://www.pypy.org/posts/2009/03/pypy-talk-at-openbossa-09-5135830287297423499.html" + }, + { + "title": "Good news everyone!", + "text": "A quick update from the JIT front. As of yesterday, we're now able to translate\na highly-experimental Python interpreter that contains JIT. It mostly crashes\nimmediately, mostly due to some unsupported operations in the assembler backend,\nbut for a carefully crafted program, we're able to get massive speedups.\nFor something as complex as:\n\n\n i = 0\n while i < 10000000:\n i = i + 1\n\n\nour JIT is about 20x faster than CPython. That's still about 3x slower than\nPsyco, but looking at assembler code it's obvious that we can speed it up\na lot. This is very good news, since we don't encode python semantics at\nall in the JIT. The JIT is automatically generated from the Python interpreter\nsource code. This means we should be able to expand it to handle more complex\npython programs relatively quickly (interested assembler experts needed!).\n\n\nThis is actually the fifth incarnation of JIT that happened over the last\ntwo years. It's by far simpler and more promising than any of the previous\napproaches.
Expect more details soon!\n\nCheers,\nfijal", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/03/good-news-everyone-421421336094214242.html" + }, + { + "title": "JIT - a bit of look inside", + "text": "The previous post about our JIT explained a bit from the 1000 km\nperspective how the tracing JIT would approach a language like Python.\n\n\nI would like to step a bit inside and give a zoom to some of its features that\nare already working.\nWhile probably not the most innovative, I think it's very nice to look\nat the way we work with the JIT and what tools we use.\n\n\nThe main cool thing is that you can work on and try the JIT (including trying\nit on the Python interpreter!) without even generating a single bit of\nassembler. How? Let's start with something very simple. Let's take\na simple interpreter for language X.\n\n\nLanguage X has 3 opcodes: CO_INCREASE, CO_DECREASE and CO_JUMP_BACK_3.\nCO_INCREASE increase the accumulator by one, CO_DECREASE decrease\nit by one, CO_JUMP_BACK_3 jump 3 opcodes back, if the accumulator is smaller\nthan 100 (this is only to maintain some halting conditions possible).\nThe interpreter for language X looks like this::\n\n\n jitdriver = JitDriver(greens = ['i'], reds = ['res', 'a'])\n code = [CO_INCREASE, CO_INCREASE, CO_INCREASE,\n CO_JUMP_BACK_3, CO_INCREASE, CO_DECREASE]\n \n def add(res, a):\n return res + a\n\n def sub(res, a):\n return res - a\n\n def main_interpreter_loop(a):\n i = 0\n res = 0\n c = len(code)\n while i < c:\n jitdriver.jit_merge_point(res=res, i=i, a=a)\n elem = code[i]\n if elem == CO_INCREASE:\n res = add(res, a)\n elif elem == CO_DECREASE:\n res = sub(res, a)\n else:\n if res > 100:\n pass\n else:\n i = i - 3\n jitdriver.can_enter_jit(res=res, i=i, a=a)\n continue\n i = i + 1\n return res\n\n\nAll very simple code, expect the jitdriver hints, which instruct JIT how to\nbehave (they are the equivalent of the ``add_to_position_key`` of last the blog\npost).\n\n\nLet's look how this code is 
processed. This will also give a glance\nat how we work in this code. This particular piece can be found\non a branch in pypy/jit/metainterp/test/test_loop.py\nand can be run with ./test_all.py jit/metainterp/test/test_loop.py -k test_example -s --view from pypy directory. The -s option lets you see the debugging output, while\n--view will show you some graphs. So, let's look at graphs in order:\n\n\n\nAnd the same picture with a bit of zoom for the first block:\n\n\n\n\nThis is the call graph of an interpreter loop, nothing magic so far. This is an\nintermediate representation of translation toolchain input. If you look around\nyou can follow how the opcodes are dispatched (with a chain of ifs) and helpers\ncalled. Next graph is very boring, because it's a bit lower level representation\nof the same thing (you exit with q or escape btw :).\n\n\nWhen we exit the graph viewer, we can see the trace generated by interpreting\nthis graph with a given bytecode (variable code in paste above). It's something\nlike:\n\n\n\n [compiler] ENTER\n [runner:cpu] call__4 [(''), * GCREF hidden, 0] -> 0\n [runner:cpu] int_eq [0, 0] -> True\n [runner:cpu] int_add [9, 1] -> 10\n [runner:cpu] int_add [0, 1] -> 1\n [runner:cpu] int_lt [1, 6] -> True\n [runner:cpu] call__4 [(''), * GCREF hidden, 1] -> 0\n [runner:cpu] int_eq [0, 0] -> True\n [runner:cpu] int_add [10, 1] -> 11\n [runner:cpu] int_add [1, 1] -> 2\n [runner:cpu] int_lt [2, 6] -> True\n [runner:cpu] call__4 [(''), * GCREF hidden, 2] -> 0\n [runner:cpu] int_eq [0, 0] -> True\n [runner:cpu] int_add [11, 1] -> 12\n [runner:cpu] int_add [2, 1] -> 3\n [runner:cpu] int_lt [3, 6] -> True\n [runner:cpu] call__4 [(''), * GCREF hidden, 3] -> 1\n [runner:cpu] int_eq [1, 0] -> False\n [runner:cpu] int_eq [1, 2] -> False\n [runner:cpu] int_gt [12, 100] -> False\n [runner:cpu] int_sub [3, 3] -> 0\n [compiler] LEAVE\n\n\nIt's entering JIT, doing some primitive operations for bytecode dispatching\nand repeating the loop. 
Note that at the end of the interpreted loop\n(not to be confused with the interpreter loop), we see int_sub [3, 3]\nwhich resets the bytecode position to the beginning. At this time JIT\n(instructed by can_enter_jit hint) notices that all green variables\nare the same (here only i),\nhence we can compile the efficient loop from this point.\n\n\n\n\nThe loop contains 3 additions and a check (for i < 100), exactly\nthe same as our interpreted program would do, but completely without\ninterpretation overhead!\n\n\nAs you might have noticed, there is no assembler involved so far. All of this\ninstruction execution is done directly, in pure python. In fact, the\ncode for executing instructions is located in jit/backend/llgraph\nwhich directly interprets instructions. This is by far simpler (and easier\nto debug) than x86 assembler.\n\n\nAnd this is basically it: the very simple interpreter and a jit for it.\nOf course we actually can generate assembler for that. Also the missing\npiece is optimizing the generated graphs. While for this example,\nby removing the interpretation overhead, we're done, with more complex\nexamples it's important to further optimize traces. Hopefully this and\nhow we actually generate assembler will be topics for next blog posts.\n\nCheers,\nfijal", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/03/jit-bit-of-look-inside-7472130507462677287.html" + }, + { + "title": "PyPy on Mobiles, at OpenBossa", + "text": "Next week i am going to give a talk on PyPy at OpenBossa, a developer conference on embedded platforms. I've written up a bit more of my background and why i find it very interesting to go there on my blog. Probably will mostly follow up there or on twitter and not much here on the PyPy blog because it's not all about PyPy.
To summarize how i see it: i think there is great potential for Python and PyPy on mobiles and am thrilled to hear about what's going on currently and to discuss opportunities.\ncheers, holger", + "tags": "", + "url": "https://www.pypy.org/posts/2009/03/pypy-on-mobiles-at-openbossa-845760004725129519.html" + }, + { + "title": "Applying a Tracing JIT to an Interpreter", + "text": "After I had failed once more to explain to someone on IRC what the idea behind\nthe current JIT generator work of PyPy, I decided to just write a blog post to\nexplain it. Here it is :-). The post turned out to be a bit long, so please bear\nwith me.\nThe goal of the post is to give an understanding of how PyPy's JIT generator is\ngoing to work. To do this, I will look at what happens when you write an\ninterpreter in Java and apply a completely normal tracing JIT to it (for this\nreason all the code examples will be in some sort of pseudo-Java). The\nresulting generated machine code is bad, so I will explain a way to fix the\noccurring problem.\nThe techniques I describe here are conceptually similar to what we are doing in\nPyPy. The details (as usual) are different. The reasons why I am trying to\nexplain things in this way is that I can start from tracing JITs, which are a\nknown existing technique.\nTo understand the following, it is helpful to already know a bit how a normal\ntracing JIT works. I will give a reminder of how it is working, but there also\nexist a couple of more thorough introductions on the web already.\nI also will leave out a lot of details about the more detailed workings of\ntracing JITs and only explain the things that are relevant to what I am trying\nto get to here.\nTracing JITs\nTracing JITs are an idea explored by the Dynamo project in the context of\ndynamic optimization of machine code at runtime. The techniques were then\nsuccessfully applied to Java VMs and are now being used by Mozilla's\nTraceMonkey JavaScript VM. 
They are built on some basic assumptions:\n\n\nprograms spend most of their runtime in loops\nseveral iterations of the same loop are likely to take similar code paths\nthe best way to gain information about the behaviour of a program is to\nobserve it\n\n\nThe basic approach of a tracing JIT is to only generate machine code for\ncommonly executed loops and to interpret the rest of the program. The code for\nthose common loops however should be highly optimized, including aggressive\ninlining.\nThe generation of loops works as follows: At first, everything is interpreted.\nThe interpreter does a bit of lightweight profiling to figure out which loops\nare run often. When a common loop is identified, the interpreter enters a\nspecial mode (called tracing mode). When in tracing mode, the interpreter\nrecords a history (the trace) of all the operations it executes, in addition\nto actually performing the operations. During tracing, the trace is repeatedly\nchecked whether the interpreter is at a position in the program that it had seen\nearlier in the trace. If this happens, the trace recorded corresponds to a loop\nin the program that the tracing interpreter is running. At this point, this loop\nis turned into machine code by taking the trace and making machine code versions\nof all the operations in it.\nThis process assumes that the path through the loop that was traced is a\n\"typical\" example of possible paths (which is statistically likely). 
Of course\nit is possible that later another path through the loop is taken, therefore the\nmachine code will contain guards, which check that the path is still the same.\nIf during execution of the machine code a guard fails, the machine code is left\nand execution falls back to using interpretation (there are more complex\nmechanisms in place to still produce more code for the cases of guard failures,\nbut they are of no importance for this post).\nIt is important to understand when the tracer considers a loop in the trace to\nbe closed. This happens when the position key is the same as at an earlier\npoint. The position key describes the position of the execution of the program,\ne.g. usually contains things like the function currently being executed and the\nprogram counter position of the tracing interpreter.\nLet's look at a small example. Take the following code:\n\nint sum_1_to_n(int n) {\n int result = 0;\n while (n >= 0) {\n result += n;\n n -= 1;\n }\n return result;\n}\n\nThe tracing JIT will at one point trace the execution of the while loop in\nsum_1_to_n. The trace might look as follows:\n\nguard_true(n >= 0);\nresult += n;\nn -= 1;\n\n\nThis trace will then be turned into machine code. Note that the machine code\nloop is by itself infinite and can only be left via a guard failure.\nA slightly more complex example:\n\nint f(int a, int b) {\n if (b % 46 == 41)\n return a - b;\n else\n return a + b;\n}\n\nint strange_sum(int n) {\n int result = 0;\n while (n >= 0) {\n result = f(result, n);\n n -= 1;\n }\n return result;\n}\n\nThe trace of the loop in strange_sum would maybe look like this:\n\nguard_true(n >= 0);\na = result;\nb = n;\nguard_false(b % 46 == 41);\nresult = a + b;\nn -= 1;\n\n\nThis would then be turned into machine code. 
Note how f was inlined into the\nloop and how the common else case was turned into machine code, while the\nother one is implemented via a guard failure.\nApplying a Tracing JIT to an Interpreter\nIn the rest of the post we will explore what happens when the program that is\nbeing executed/compiled by the tracing JIT is itself a (bytecode) interpreter\nfor another language.\nA stylized bytecode interpreter for a simple programming language could look as\nfollows:\n\nW_Object interpret(String bytecode, ...) {\n Stack stack = new Stack();\n int pc = 0;\n while (true) { // bytecode dispatch loop\n char instruction = bytecode.charAt(pc);\n pc += 1;\n switch (instruction) {\n case ADD:\n W_Object arg2 = stack.pop();\n W_Object arg1 = stack.pop();\n stack.push(do_addition(arg1, arg2));\n break;\n case SUB:\n W_Object arg2 = stack.pop();\n W_Object arg1 = stack.pop();\n stack.push(do_substraction(arg1, arg2));\n break;\n case RETURN:\n return stack.pop();\n case JUMP_BACKWARD:\n pc -= (int)bytecode.charAt(pc);\n break;\n case LOAD_INTEGER:\n int value = (int)bytecode.charAt(pc);\n pc += 1;\n stack.push(new W_Integer(value));\n break;\n case PRINT:\n do_print(stack.pop());\n break;\n case DUP:\n stack.push(stack.peek());\n break;\n case JUMP_IF_TRUE:\n ...\n ...\n }\n }\n\nIf we apply a tracing JIT to this function, it will trace and compile the\nexecution of one bytecode, because after one bytecode the bytecode dispatch loop\nis closed. E.g. it might trace and produce machine code for the execution of a\nSUB. (Sidenote: this interpret function is an example where one of the\nassumptions of a tracing JIT break down: two iterations of the bytecode dispatch\nloop are rarely going to follow the same code path, because usually two\nconsecutive bytecodes encode different instructions).\nThe important bit to remember here is that the tracing JIT will produce a\nmachine code loop that corresponds to the bytecode dispatch loop in the\ninterpret function. 
Let's see how we can change that.\nImproving the Generated Code\nIf we want to make use of the fact that the program that is being jitted is\nitself an interpreter, we need to change the tracing JIT a bit. To be more\nprecise we add a way for the user of the tracing JIT to add information to the\nposition key that the tracing JIT uses to decide when a loop is closed. This is\ndone by a call to a magic function add_to_position_key. This allows the\nprogram writer to influence the tracing JIT's behaviour.\nThe semantics of add_to_position_key is as follows: The method itself does\nnot do anything. It has an effect only when it is seen during tracing. If it is\nseen during tracing, the tracer adds the argument of the call to the position\nkey that the tracer is using to find out whether a loop was closed or not.\nIn the example of the interpret function above, we would add a call to this\nfunction into the while loop as follows:\n\nW_Object interpret(String bytecode, ...) {\n Stack stack = new Stack();\n int pc = 0;\n while (true) { // bytecode dispatch loop\n add_to_position_key(pc);\n add_to_position_key(bytecode);\n char instruction = bytecode.charAt(pc);\n pc += 1;\n switch (instruction) {\n case ADD:\n ...\n\nWhen the modified tracing JIT traces now the interpret function executing a\nSUB, something interesting happens. When the bytecode loop is closed, the\nmodified tracing JIT does not consider the trace to be a loop, because the value of\npc has been increased by one, so the position key differs. Instead it\ncontinues to trace, effectively unrolling the bytecode dispatch loop of\ninterpret.\nThe only way for a loop to be considered closed is if the pc variable has\nthe same value a second time. This can only happen after a JUMP_BACKWARD\ninstruction has been executed. A JUMP_BACKWARD instruction will only be in\nthe bytecode when the bytecode represents a loop. 
This means that the modified\ntracing JIT will trace the interpret function and will only consider that\nthe trace represents a loop when the bytecode itself represents a loop! Thus, a\nmachine code loop will eventually be created that corresponds to the loop in the\nbytecode.\nLet's look at at example. If we have a bytecode that corresponds to the\nfollowing instructions:\n\npc | instruction\n---+---------------------\n0 | LOAD_INTEGER 0\n2 | DUP\n3 | PRINT\n4 | LOAD_INTEGER 1\n6 | ADD\n7 | JUMP_BACKWARD 6\n\nThis loop will print integers starting from 0 and going on from there. The\nmodified tracing JIT will unroll the bytecode dispatch until it sees the\nJUMP_BACKWARD bytecode. After that bytecode the pc will be 2 again. Thus\nthe earlier position key is repeated, which means that the loop will be closed.\nThe produced machine code will do the equivalent of the following Java code:\n\n...\nguard_true(pc == 2)\nguard_true(bytecode == \"... correct bytecode string ...\")\nwhile (true) {\n instruction = bytecode.charAt(pc);\n pc += 1;\n guard_true(instruction == DUP);\n stack.push(stack.peek());\n\n instruction = bytecode.charAt(pc);\n pc += 1;\n guard_true(instruction == PRINT);\n do_print(stack.pop());\n\n instruction = bytecode.charAt(pc);\n pc += 1;\n guard_true(instruction == LOAD_INTEGER)\n value = (int)bytecode.charAt(pc);\n pc += 1\n stack.push(W_Integer(value))\n\n instruction = bytecode.charAt(pc);\n pc += 1;\n guard_true(instruction == ADD)\n arg2 = stack.pop()\n arg1 = stack.pop()\n stack.push(do_addition(arg1, arg2))\n\n instruction = bytecode.charAt(pc);\n pc += 1;\n guard_true(instruction == JUMP_BACKWARD)\n pc -= (int)bytecode.charAt(pc);\n}\n\nThis is machine code that essentially does what the bytecode above did. Of\ncourse the code still remains some remnants of the interpreter (like the program\ncounter manipulations, the stack handling, etc), which would have to be removed\nby some clever enough optimization step. 
If this were done, result would look a\nlot more natural.\nSummary\nIf a tracing JIT is enhanced by a way to influence its loop-closing behaviour we\ncan significantly improve its performance when the jitted program is itself an\ninterpreter. The result is that in such a case the produced machine code\nwill correspond to the functions that are being interpreted, not to the code of\nthe interpreter itself.\nNow, what does all this have to do with PyPy? What we are working on since a\nwhile is a sort of tracing JIT for RPython which allows to be customized with a\nfunction very similar to the add_to_position_key described above. This will\nmake it possible to make the tracing JIT generate code that corresponds to the\ncode that the interpreter interprets. For example, we would add a call to\nadd_to_position_key to SPy, PyPy's Smalltalk VM. Then the tracing JIT will\nproduce machine code for Smalltalk-level loops, with all the usual benefits of a\ntracing JIT (like inlining of intermediate methods, constant-folding, ...).\nThis JIT differs from normal tracing JITs in that it also supports very powerful\nconstant-folding and allocation-removal optimizations. Those optimizations will\n(hopefully) be the content of a later blog post.\nThe basics of this process have been working fine since quite a while. What the\nwork currently focuses on is to improve the optimizers to remove not only the\nbytecode manipulation code, but also the stack handling, and a large number of\nother inefficiencies.", + "tags": "jit", + "url": "https://www.pypy.org/posts/2009/03/applying-tracing-jit-to-interpreter-3287844903778799266.html" + }, + { + "title": "The next Leysin Winter Sprint", + "text": "PyPy Leysin Winter Sprint (14-21th April 2009)\n\nThe next PyPy sprint will be in Leysin, Switzerland, for the\nsixth time. This sprint will take place immediately after\nEaster. 
This is a fully public sprint: newcomers and topics\nother than those proposed below are welcome.\n\n\n\n\n\n\n\nThe overall idea of the sprint is to continue working on making PyPy ready\nfor general use. There are a few tasks left in there. In parallel, we\nwill continue the work on the JIT, if there is general interest. And as\nusual, we are ready to add any other task -- please mention on the mailing\nlist what you would like to work on; the list of task is not really fixed.\nAnd as usual, the main side goal is to have fun in winter sports :-)\nWe can take a day off for ski until Sunday, the 19th; afterwards, the\ninstallations close. (There was quite a lot of snow this winter, so\nthere should be some left even though it's relatively late in the season.)\n\n\n\n\n\nFor more information see the announcement.", + "tags": "", + "url": "https://www.pypy.org/posts/2009/03/next-leysin-winter-sprint-1791506307881043273.html" + }, + { + "title": "Wroclaw 2009 sprint progress report", + "text": "Hello.\n\nWe have just finished probably the smallest sprint ever\nin PyPy history. For most of the time it was just me\nand Armin pairing together.\n\nWe also had a chance to work a bit with people from\nthe University, but there were definitely not enough\ncore developers to organize the work in a reasonable\nmanner. At some point we ended up having two pairs containing\nfour people each.\n\nJakub and Bartosz (who were our gentle hosts) worked\non getting PyPy's sandbox integrated with django.\nIt's still just an example what you can do (ie you\ncan do much more), but it's already interesting to look\nat. The code can be found in user dir. 
This server (not yet online anywhere, sorry)\nis able to run untrusted python code provided by user inside\na fully configurable sandbox.\n\nWe also implemented missing peepholer optimizations from\nCPython, finding out that some peepholer tests were failing,\njust because PyPy is optimizing better :-)\n\nThe main part of the sprint was work on JIT (most notable the fifth\ngeneration of the JIT), which was moved\nfrom the obscure directory in Carl's user in svn (which contains\nbranches these days!) into a PyPy branch. It's still very much\nwork in progress and a lot of pen and paper or handwaving was\ninvolved, but we were able to implement a lot of basics in record time.\n\nRight now we need a lot of rest after the exhaustive sprint,\nbut after that, stay tuned for more information about\nprogressing JIT!\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2009/02/wroclaw-2009-sprint-progress-report-2510073170049635489.html" + }, + { + "title": "Wroclaw 2009 PyPy sprint and talk", + "text": "The next PyPy sprint will be held in Wroc\u0142aw, Poland 7-14th February 2009. This is fully public\nsprint and all newcomers are welcomed. Preceeding the sprint there\nwill be a talk at University of Technology in Wroc\u0142aw held at 22nd of January.\n\nFor detailed info about the sprint, look here.\n\nThe talk will be a general, high-level overview about PyPy project. There is a very nice poster, made by Jakub Gustak and Bartosz Skowron (in polish):\n\n\n\nTalk details:\n\nLocation: Politechnika Wroc\u0142awska, budynek C-13, sala 0.31\nDate: 22nd January 2009, 19:00\nLanguage: very likely polish, although talk can be as well in english if some non-polish native would show up.\n\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2009/01/wroclaw-2009-pypy-sprint-and-talk-8240928228677982487.html" + }, + { + "title": "Pycon 2009", + "text": "Hello.\n\nBoth of our PyPy talks has been accepted for Pycon US 2009. 
Although both\nare somehow related to PyPy, they're vastly different in\ntopics, attitude and target audience.\n\nThe first one is a classic PyPy status talk - we'll mostly talk about\nour achievements from the last year (readers of this blog are aware of most,\nbut not all :) as well as some general introduction and plans for the future.\n\n\nThe second one is about PyPy's sandboxing features. This is in my opinion\na very underestimated feature, also by us, because it's not really well\nadvertised or documented. The main purpose of the talk is to present\nto the general public how this works and how to use it. Hopefully we will\nget to work and publish about this a bit more ahead of Pycon already. \nUnlike Zope's Restricted Python, it provides you with the full python\nlanguage, inside a fully\nvirtualized sandbox, controlled from an external process by a custom\nsecurity policy. Stay tuned for more :-)\n\n\nSee you at Pycon 2009!\n\n\nCheers,\nfijal and holger", + "tags": "", + "url": "https://www.pypy.org/posts/2008/12/pycon-2009-9090464449197911432.html" + }, + { + "title": "Porting the JIT to CLI (part 3)", + "text": "In my two previous posts, we talked about the PyPy JIT generator, seeing\nthat it can produce huge speedups and how its backend-independent frontend\nworks.\nIn this post, we will look closer at the internals of the CLI JIT backend; in\nparticular, we will see how we work around some serious limitations of the\nplatform, and why these workarounds didn't have any serious impact on the\nperformances of our toy virtual machine.\n\nGraphs, blocks, links\n\n\n\n\nOne of the core aspect of PyPy translator is the concept of flow graph: a\nflow graph is a data structure that represents the code we are operating on.\nIt is composed by a set of basic blocks, each block containing a sequence\nof operations; blocks are connected together by links, and each link can\ncarry a variable number of arguments whose value is passed to the target\nblock. 
In case a block contains more than one outgoing links, the one to\nfollow is selected by looking at the value of a designated variable (the\nexitswitch), thus making possible to implement conditional jumps. To have\na more complete description of the flow graphs model, check the documentation.\n\nAs we saw in the previous post, the generated JIT compiler makes heavy use of\nflexswitches to generate efficient code, continuously intermixing\nJIT-compile time and runtime.\nIn terms of graphs, we can think of a flexswitch as a special block whose\nlinks change over time. In particular, adding a new case to the flexswitch is\nequivalent to create a link whose target is a new block where the just\ngenerated code starts. Thus, the graphs grows over the time, as showed by\nthe following images:\n\n\n\n\n\n\nIn the images above, the block containing the flexswitch is colored in\ncyan. In the first picture, there is only one block connected to the\nflexswitch: this block contains the code to restart the JIT compilation. The\nsecond picture shows the graph after the first case has been added: you can\nclearly see that a new block has been created and attached to the flexswitch.\nFinally, the third picture shows the graph after a while, with a lot of new\nblocks attached.\n\n\nTranslate graphs to CLI\nConceptually, the goal of the CLI JIT backend is to express these graphs in\nterms of CLI bytecode.\nTranslating the single block is easy, as it is just a list of sequential\noperation, and it's straightforward to map each operation to the equivalent\nCLI opcode or to a call to a helper method. Moreover, we need a way to\nexpress links between the various basic blocks: if the links are known in\nadvance, render them is as easy as emitting a (potentially conditional) jump to\nthe target block. 
Thus, we won't discuss this part in detail, as it is quite\nstraightforward.\nThe hard part is how to implement flexswitches: at the time when we are\nemitting the code, some of the blocks of this growable graph don't even exist:\nhow can we make a jump to a non existent block of code? For backends that\nemit assembly code, it is rather easy: when they need to add a new case to the\nflexswitch, they can just patch the existing code to insert a jump to a\nnewly allocated area of the memory, where the new code is being generated in.\nFor CLI this approach is not feasible, as the VM will never allow us to modify\nexisting code. Thus, we need to think of a different approach.\n\n\nGraphs and methods\nIn .NET, the basic unit of compilation is the method: the only way to\nexecute some bytecode is to wrap it into a method. Moreover, it is not\npossible to execute a method until it has been completed, and after this point\nit is no longer possible to add new code.\nBecause of all these constraints we cannot simply map each graph to its own\nmethod, since we saw that our graphs can grow after they have already been\nexecuted few times.\nHence, we need to distinguish between the two concepts:\n\n\na graph is the logical unit of code as seen by the JIT compiler:\nconcretely, the CLI JIT backend renders it as one or more methods;\na method is a collection of basic blocks; each method has the so\ncalled parent graph, i.e. 
the graph its blocks logically belongs to.\n\n\nThe first method of a graph is called main method (which has\nnothing to do with the Main static methods found in .exe files); other\nmethods are called children methods.\nWhen we want to add a new case to the flexswitch, we create a method\ncontaining all the new code; then we wrap the method inside a delegate (the\n.NET equivalent of a function pointer) and pass it to the flexswitch, so that\nit can later invoke it.\n\n\nThe hard bit: non-local links\nUsing this approach, after a while the blocks of our original graph are\nscattered over a lot of different methods; however, there are no constraints\nabout how these blocks can be linked together, so it happens to have links\nbetween blocks which are not in the same method. In the following, we will\nrefer to them as non-local links.\nIf the non-local block we want to jump to happens to be at the beginning of\nits containing method, it is enough to invoke the method; but, what if we want\nto jump somewhere in the middle? What we really want is to produce a method\nwhich has multiple entry-points; again, doing it in assembly would be\ntrivial, but the virtual machine does not provide any support for it, so we\nneed a work around.\nEach method in a graph is assigned an unique 16 bit method id; each block in\na method is assigned a progressive 16 bit block number. From this two\nnumbers, we can compute the block id as an unsigned integer, by storing\nthe method id in the first 16 bits and the block number in the second 16 bits.\nBy construction, the block id is guaranteed to be unique in the graph.\nThe following picture shows a graph composed of three methods; the id of each\nmethod is shown in red, while the block ids are shown in red (for the method\nid part) and black (for the block number part). 
The graph contains three\nnon-local links; in particular, note the link between blocks 0x00020001\nand 0x00010001 which connects two blocks that reside in different methods.\n\n\n\nEvery method contains a special dispatch block, (not shown in the picture above) whose goal is to jump to\nthe specified block number inside the method itself. The first argument of a\nchild method is always a block id; when the method starts, it immediately\njumps to the dispatch block, and thus to the desired block. For example, suppose we have a method which contains 3 blocks numbered 0, 1,\n2; here is what its dispatch block looks like; for simplicity it is shown as\nC# code, but it is actually generated as IL bytecode:\n\n// dispatch block\nint methodid = (blockid & 0xFFFF0000) >> 16; // take the first 16 bits\nint blocknum = blockid & 0x0000FFFF; // take the second 16 bits\n\nif (methodid != MY_METHOD_ID) {\n// jump_to_unknown block\n...\n}\n\nswitch(blocknum) {\ncase 0:\ngoto block0;\ncase 1:\ngoto block1;\ncase 2:\ngoto block2;\ndefault:\nthrow new Exception(\"Invalid block id\");\n}\n\nWhenever we want to jump to a non-local block, it is enough to store the block\nid in the appropriate variable and jump to the dispatch block. 
If the block\nresides in a different method, the jump_to_unknown block is entered; this\nspecial block is implemented differently by the main method and the child\nmethods, as we will see soon.\nEach time a new method is added to the graph, we build a delegate\nfor it, and store it in a special array\ncalled method_map; since we assign the method id sequentially starting\nfrom 0, we are sure that to fetch the method whose id is n we can simply\nload the n-th element of the array.\nThe jump_to_unknown block of the main method uses this array to select the\nright method, and calls it (FlexSwitchCase is the type of delegates for\nall children methods):\n\n// jump_to_unknown block of the main method\nFlexSwitchCase meth = method_map[methodid];\nblockid = meth(blockid, ...); // execute the method\ngoto dispatch_block;\n\nEach child method returns a block id specifying the next block to jump to;\nafter its execution, we assign the return value to the blockid variable,\nand jump again to the dispatch block, which will jump again to the appropriate\nblock.\nKeeping this in mind, it is straightforward to implement the\njump_to_unknown block of children methods: it is enough to return the\ntarget block id to the caller, and let its dispatch loop do the right thing.\nIf the caller is also a child method, it will return it again, until we reach\nthe dispatch loop of the main method, which will finally do the jump. 
In\ntheory, we could implement things differently and jump directly from a\nchild method to another one, but in that case the call stack could grow\nindefinitely in case of a tight loop between two blocks residing in different\nmethods.\nTo implement the dispatch block we can exploit the switch opcode of the\nCLI; if the .NET JIT is smart enough, it can render it using an indirect jump;\noverall, jumping to a non-local block consists of an indirect function call\n(by invoking the delegate) plus an indirect jump (by executing the switch\nopcode); even if this is more costly than a simple direct jump, we will see in\nthe next section that this is not the main source of overhead when following a\nnon-local link.\nObviously, the slow dispatching logic is needed only when we want to jump to a\nnon-local block; if the target block happens to reside in the same method as\nthe current one, we can directly jump to it, completely removing the overhead.\nMoreover, the dispatch blocks are emitted only if needed, i.e. if the parent\ngraph contains at least one flexswitch; graphs without flexswitches are\nrendered in the obvious way, by making one method per graph.\n\n\nThe slow bit: passing arguments\nJumping to the correct block is not enough to follow a link: as we said\nbefore, each link carries a set of arguments to be passed from the source to\nthe target block. 
As usual, passing arguments across local links is easy, as\nwe can just use local variables to hold their values; on the other hand,\nnon-local links make things more complex.\nThe only way to jump to a block is to invoke its containing method, so the\nfirst solution that comes to mind is to specify its input arguments as\nparameter of the method; however, each block has potentially a different\nnumber (and different types) of input arguments than every other block, so we\nneed to think of something else.\nAn alternative solution could be to compute the union of the sets of input\narguments of all the blocks in the method, and use this set as a signature\nfor the method; this way, there would be enough space to specify the input\narguments for every block we might want to jump to, each block ignoring the\nexceeding unused parameters.\nUnfortunately, all the children methods must have the very same signature,\nas they are all called from the same calling site in the dispatch block of the\nmain method. Since the union of the set of input arguments (and hence the\ncomputed signature) varies from method to method, this solution cannot work.\nWe might think to determine the signature by computing the union of input\narguments of all blocks in the graph; this way, all the children methods\nwould have the same signature. But as we said above, the graph grows new\nblocks at runtime, so we cannot determine in advance which set of input\narguments we will need.\nTo solve the problem we need a way to pass a variable number of arguments\nwithout knowing in advance neither their number nor their types. 
Thus, we use\nan instance of this class:\n\npublic class InputArgs {\npublic int[] ints;\npublic float[] floats;\npublic object[] objs;\n...\n}\n\nSince the fields are arrays, they can grow as needed to contain any number of\narguments; arguments whose type is primitive are stored in the ints or\nfloats array, depending on their type; arguments whose type is a reference\ntype are stored in the objs array: it's up to each block to cast each\nargument back to the needed type.\nThis solution impose a huge overhead on both writing and reading arguments:\n\n\nwhen writing, we need to make sure that the arrays are big enough to\ncontains all the arguments we need; if not, we need to allocate a bigger\narray. Moreover, for each argument we store into the array the virtual\nmachine performs a bound-check, even if we know the index will never be\nout of bounds (because we checked the size of the array in advance);\nwhen reading, the same bound-check is performed for each argument read;\nmoreover, for each value read from the objs array we need to insert a\ndowncast.\n\n\nTo mitigate the performance drop, we avoid to allocate a new InputArgs\nobject each time we do a non-local jump; instead, we preallocate one at the\nbeginning of the main method, and reuse it all the time.\nOur benchmarks show that passing arguments in arrays is about 10 times slower\nthan passing them as real parameter of a method. Unfortunately, we couldn't\ncome up with anything better.\n\n\nImplement flexswitches\nNow, we can exploit all this machinery to implement flexswitches, as this is\nour ultimate goal. As described above, the point is to be able to add new\ncases at runtime, each case represented as a delegate. 
Here is an excerpt\nof the C# class that implements a flexswitch that switches over an integer\nvalue:\n\npublic class IntLowLevelFlexSwitch:\n{\npublic uint default_blockid = 0xFFFFFFFF;\npublic int numcases = 0;\npublic int[] values = new int[4];\npublic FlexSwitchCase[] cases = new FlexSwitchCase[4];\n\npublic void add_case(int value, FlexSwitchCase c)\n{\n...\n}\n\npublic uint execute(int value, InputArgs args)\n{\nfor(int i=0; i);\n }\n}\n\nIf you call mysum(41), the execution goes in the default branch of the\nswitch, thus calling continue_compilation: its job is to restart the JIT\ncompiler, which now can emit fast code because it knows the exact type of\na; then, it modifies the original mysum_compiled function, in\norder to make it executing the newly generated code the next time it\nencounters an integer at that point:\n\nPyObject mysum_compiled(PyObject a)\n{\n Type a_type = a.GetType();\n switch(a_type) {\n PyInteger: return new PyInteger(a.value+1); // fast path!\n default: continue_compilation(a_type, );\n }\n}\n\nFrom now on, every time we call mysum with an integer argument, the JIT\ncompiler is not called anymore and the fast path is directly executed; if we\nhappen to call mysum with a float arguments, the switch goes again in the\ndefault branch, and the JIT compiler is started once more to produce fast\ncode also for this case. What happens in practice is that compile-time and\nruntime are continuously intermixed, until the switches are stable enough and\nthe compiler is not needed anymore.\nIn PyPy jargon, this kind of \"growable switch\" is called flexswitch, and\nit's one of the most important concept of our JIT generator.\n\nPromotion\nHow can the JIT generator know which values are useful to know to generate\nefficient code and which aren't? 
Unfortunately it can't, or at least our JIT\ngenerator is not smart enough at the moment.\nTo get the best from it, the developers of the VM need to instruct it by\nannotating the variables on which we want the JIT to stop until it knows the\nactual values; this is done by using particular hints, called promote\nand promote_class; variables annotated with such hints are said to be\npromoted. If something is promoted, a flexswitch is used to gain\ninformation about it, as seen in the last section.\nFor an example, let's look at an excerpt from main dispatch loop of the tlc\nvirtual machine:\n\nelif opcode == ADD:\n a, b = stack.pop(), stack.pop()\n hint(a, promote_class=True)\n hint(b, promote_class=True)\n stack.append(b.add(a))\n\nThis the implementation of the ADD opcode: first, it pops two values from\nthe stack; then, it computes the result; finally, it push the result to the\nstack again. In between, both the classes of a and b have been\npromoted: this means that when the JIT emits the code for b.add(a), it\nknows exactly what is happening: if it sees that both are instances of the\nIntObj class, it inlines the method call and emits a fast integer addition\ninstead.\n\nVirtuals\nThe other important concept of the JIT is the presence of virtual\nstructures, virtual lists, and virtual dictionaries. Again, I'm not\ngoing to explain in depth how they work, but only why they are so important for\ngenerating highly efficient code.\nThe essence of virtuals is that you don't allocate objects until you really\nneed to do it, e.g. because they are being passed as an argument to some\nexternal function. 
Instead, we store all the informations we need as local\nvariables; e.g., in the case of a virtual structure, we create as many local\nvariables as the number of its fields: if the structure escapes the local\nscope, we force it to a real object, by allocating memory on the heap and\ninitializing it after the current value of the local variables.\nThis technique allows the JIT to avoid the allocation of many temporary\nobjects that hold intermediate results; consider for example the following\nPython loop:\n\nresult = 0\nfor i in range(N):\n result += i\nreturn result\n\nWithout the JIT, at each iteration, a new int object is created and bound\nto the result variable, while the previous one is discarded and not needed\nanymore. By combining virtuals and promotion, the JIT can emit code that does\nthe whole computation locally, and allocates a real object only at the end,\nwhen it escapes from the local scope because it is returned from the\nfunction.\n\nPutting it all together\nThis is, essentially, how PyPy's generated JITs work. To summarize, our JITs\nemit multiple versions of each chunk of code: each version is specialized\nand optimized for one particular case.\nThe cost of selecting the right specialization to use (through flexswitches)\nis almost always negligible compared to how much time you save by running the\nfast version instead of the more-general-but-slow one. 
Moreover, each\nspecialized version knows the exact shape of the objects it's dealing with, so\nthey can be virtualized to make the generated code even more efficient.\nAt the end, the actual code generation is done by one of the JIT backends:\nthe backends exploit all the knowledge gathered by the previous steps to\nproduce highly efficient code, but this will be the subject of the next blog\npost.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/11/porting-jit-to-cli-part-2-2456826431882963884.html" + }, + { + "title": "Porting the JIT to CLI (part 1)", + "text": "As the readers of this blog already know, I have been working on the CLI\nJIT backend for some months: last Friday, it reached an important milestone,\nas it is now able to produce huge speedups for a little dynamic language. To\nknow how huge the speedup is, read on :-).\nThe goal of PyPy JIT generator is to take an interpreter and, with the help of\nfew annotations, automatically generate a JIT compiler for it. In this post,\nwe will talk about the tlc virtual machine: while tlc it is just a toy\nlanguage, it contains some features that make it an interesting target for our\nJIT generator.\n\nThe tlc virtual machine\ntlc is executed by a stack based, dynamically typed virtual machine (for\nthose who knows a bit about the Python VM: does it sound familiar? 
:-)).\nThere are three types of objects: integers, nil, and cons cells (i.e.\nlisp-like pairs of objects).\nAs the VM is very simple, it provides only few opcodes:\n\n\nopcodes to manipulate the stack, like PUSH, POP, etc.\ninteger operations, like ADD, MUL, all the comparisons, etc.:\nthese operations can only be applied to integers;\nlist operations, like CONS, CAR, CDR: these operations can\nonly be applied to lists;\nother operations, including jumps and conditional jumps.\n\n\nThe VM is interesting for our purposes because it has a lot of similarities\nwith Python (though on a smaller scale, of course):\n\n\nit has to do type-checks at runtime before doing most of the operations;\nevery time you do an arithmetic operation, it has to unbox the operand,\ndo the computation, and the box the result again.\n\n\nThis means that even if you have a program which only uses integers, you are\npaying a lot of overhead.\nTo know more about this toy VM, look at its source code: the interesting\nbits are the classes used to represent objects, and the interp_eval\nfunction, which contains the main loop of the virtual machine. 
As you can\nsee, the implementation is quite straightforward; all the hint calls you\nsee are the special annotations needed by the JIT generator to produce better\ncode.\n\n\nLet's JIT it!\nSo, the whole point is to generate a JIT compiler from it, isn't it?\nFirst, check out a fresh copy of the oo-jit branch:\n\n$ svn co https://codespeak.net/svn/pypy/branch/oo-jit\n\nThen, go to the oo-jit/pypy/jit/tl directory, and compile the tlc VM\nwith the CLI backend and JIT enabled:\n\n$ cd oo-jit/pypy/jit/tl/\n$ ../../translator/goal/translate.py -b cli --jit --batch targettlc\n...\nlot of texts\n...\n\nIf everything went OK, you now have a targettlc-cli executable, which\naccepts two arguments: the name of the file containing the tlc program we\nwant to run, and an integer to be passed to it.\nLuckily, in the same directory we have a factorial.tlc file that contains\nthe bytecode for a function that -- guess? -- computes the factorial of a\ngiven integer; let's try it:\n\n$ ./targettlc-cli factorial.tlc 5\nNon jitted: 120 (0.009371 seconds)\nWarmup jitted: 120 (0.208954 seconds)\nWarmed jitted: 120 (0.000323999999999991 seconds)\n\nCool, it seems that the result was computed correctly :-). 
As you can see from\nthe output, we ran the program three times:\n\n\nby plain interpretation, without any jitting;\nwith the jit enabled: this run includes the time spent by doing the\ncompilation itself, plus the time spent by running the produced code;\nagain with the jit enabled, but this time the compilation has already\nbeen done, so we are actually measuring how good is the code we produced.\n\n\nSo, it's time to run a benchmark: let's try to compute the factorial of a very\nbig number; the result will be 0, because obviously after a while we overflow,\nbut after all we are interested in the time spent, not in the result:\n\n$ ./targettlc-cli factorial.tlc 5000000\nNon jitted: 0 (19.93247 seconds)\nWarmup jitted: 0 (0.293229999999998 seconds)\nWarmed jitted: 0 (0.0494239999999984 seconds)\n\n$ python -c 'print 19.93247/0.0494239999999984'\n403.295362577\n\nAnd no, I didn't make any mistake in copying&pasting: the jitted version is\nreally 400 times faster that the non jitted one!\nWarning: my laptop seems to be not very well suited for benchmarks, as the\nresults vary a lot from run to run; I've run the benchmarks a lot of times,\nand I got speedup factors up to 500 times, so your results may be different.\n\n\nMore benchmarks\nIt's also interesting to compare the result with a manual written C#\nversion of the factorial, to see how good is code we produced; to get\nreasonable results, we need to compute a larger factorial, to let to code to\nrun a bit more:\n\n$ ./targettlc-cli --onlyjit factorial.tlc 100000000\nWarmup jitted: 0 (0.980856 seconds)\nWarmed jitted: 0 (0.769716 seconds)\n\n$ mono factorial.exe 100000000\nC#: 0 (0.153777 seconds)\n\n$ python -c 'print 0.769716/0.153777'\n5.00540392907\n\nWe know that the generated code is far from being optimal, but probably the\nfactor of five is at least partially due to the fact that Mono's own JIT is optimized for\nC#-like code, and our code has a completely different shape.\nAll the benchmarks above were 
run under Linux, with Mono 1.9.1. Here are the\nresults for the same benchmarks, but run with Microsoft CLR (on a different\nmachine, so the absolute values are not comparable):\n\n$ ./targettlc-cli factorial.tlc 5000000\nNon jitted: 0 (15,640625 seconds)\nWarmup jitted: 0 (0,4375 seconds)\nWarmed jitted: 0 (0,03125 seconds)\n\n$ python -c 'print 15.640625/0.03125'\n500.5\n\n$ ./targettlc-cli --onlyjit factorial.tlc 100000000\nWarmup jitted: 0 (0,90625 seconds)\nWarmed jitted: 0 (0,515625 seconds)\n\n$ ./factorial.exe 100000000\nC#: 0 (0,34375 seconds)\n\n$ python -c 'print 0.515625/0.34375'\n1.5\n\nThe results are even better than before; this is probably thanks to CLR's JIT,\nthat does a better job than Mono when faced to something which is different\nthan the usual C#-like code.\n\n\nConclusions (for now)\nThis is a very important result, because it proves that PyPy's approach to JIT\ncompilers can be applied effectively also to OO virtual machines; the result\nis even better than what I expected, because when generating code for .NET we\nhave much less freedom than when generating assembly code, and I had to play\nsome tricks to work around some .NET limitations.\nMoreover, it worked at the first try :-). 
I tried to compile the tlc\nvirtual machine as soon as all the related JIT tests were passing, and\nsurprisingly everything worked just fine, even if it was the very first time I\nwas trying to apply some features of the JIT to something bigger than a test:\nI think this is yet another proof that Test Driven Development just works!\nEven if this is a major milestone, the CLI JIT backend is not yet completed:\nas a consequence it still can't be used for the full PyPy, but all the\nhardest problems should have been solved now.\nSince a lot of readers asked for more technical details, especially about the\nJIT, I will try to soon write a second blog post explaining how the CLI backend works\ninternally, with a brief look at the generated code to see what it looks like.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/11/porting-jit-to-cli-part-1-8712941279840156635.html" + }, + { + "title": "One year PyPy Blog", + "text": "Last Friday the PyPy Status Blog had its first anniversary. Yay! After not\nreally buying into any of this new-fangled \"blog\" stuff for a long time we just\nbit the bullet and got started. Totally surprisingly it even worked. We posted\n76 posts in the last year, more than one per week. By now we have more than 800\nsubscribers (according to feedburner), which is quite cool for a rather niche\nblog.\nTo make our blog even more interesting, I would like to ask for some feedback\nvia the comments:\n\n\nWhich posts did you like in particular?\nWhat sort of posts would you be interested in getting more of?\nAny other improvements we could make?", + "tags": "", + "url": "https://www.pypy.org/posts/2008/11/one-year-pypy-blog-3267056180369310162.html" + }, + { + "title": "Sprint Discussions: JIT Generator Planning", + "text": "Background\nFinally, the JIT post :-). First some background: Despite our plans at the end\nof the EU period, PyPy's Python interpreter didn't get a good and widely\napplicable JIT in the last year. 
The reason for that was that we discovered that\nalthough the basic idea to generate JIT compilers is good, the concrete\nprototype made during the EU period is basically flawed. It could have\nbeen pushed a bit farther, but would have run into deep troubles eventually. One\nof the problems would have been performance instability: change a seemingly\nunrelated bit in your source program, and the performance changes in unexpected\nways, which is clearly not desirable. Another problem with that old approach is\nthat too much assembler code is generated, leading to memory problems, and also\nthat the generated assembler is bad in various ways, e.g. it is hard in that\napproach to do proper register allocation.\nTherefore we decided that it would be worthless to pursue this direction much\nfurther. Instead we tried to research approaches to fixing the inherent\nproblems. This research was largely done in Prolog and I eventually wrote my\nMaster's thesis about it. From the Prolog work we got some good insights into\nwhat needs to be done and what sorts of techniques are needed. Also, it inspired\nArmin to do some more exploration on a small Python prototype which used the\nlessons learned from Prolog and also some additional ideas from tracing JITs. So\nfar, however, the prototype is neither in RPython, nor much integrated with\nPyPy.\nThis research is not the only thing happening in the JIT-area. During the last\nyear, Antonio Cuni was working on bringing the JIT to pypy-cli. This\nconsisted mostly of writing a .NET backend for the old JIT-generator. Some\nfurther work is being done since August by John Witulski, who is writing an\nAMD64 backend for the JIT-generator for his Bachelor's thesis.\n\nWhere to go from there\nDuring the sprint we discussed in which directions we should continue now. We\nplan to work quite a bit on the JIT in the coming months. 
Both Armin and Anto\nare in D\u00fcsseldorf for four months, and them and me plan to mostly work on the\nJIT (as well as giving a lecture on \"Dynamic Programming Languages\", trying to\nensnare some more students).\nThe first step will be to experiment a bit more with Armin's prototype. So far\nit looks rather promising, but there are some unsolved issues that we need to\nlook into first. The first issue is to think a bit about how to efficiently do\nprofiling to compile only important code paths. The other large issue are\nso-called \"virtualizables\". Roughly speaking, they are the frame objects of the\ninterpreter from which the JIT is generated. They need special treatment,\nbecause on the one hand it is important that they get optimized away to make the\ncode fast, since the frames are accessed all the time for the local variables;\non the other hand they should still be usable for introspection if code is\naround that is trying to look into them.\nWhen this is done, the prototype needs to be ported to RPython, which is a\nnon-trivial task, since it is rather dynamic so far (it is rather important that\nthe unresolved issues are done before the porting, because once the prototype is\nin RPython, experimentation will be harder). The porting has the potential to be\ntedious, but in a sense it is \"just work\", as opposed to unclear research.\nAt this point it will become important to think about the backend interface. The\ninterface that the old frontend used to produce assembler code won't be usable\nfor the new approach, so things need to be rearranged slightly. 
Afterwards the\nbackends will have more information and be invoked at a slightly higher level,\nwhich should allow them to produce better code.\nWhen all this is done, the JIT generator will be in a rather good state and it\nshould become possible (modulo a lot of details, of course), to use it on the\nPython interpreter.\nConclusion\nI am intentionally not attaching any time estimates to this blog post. So far\nour time estimates have not been very accurate when it comes to the JIT, which\nonly lead to disappointment when the JIT failed to materialize. We hope that we\nwill progress in interesting ways in the next four months, but who knows. Note\nthat we are really quite disappointed ourselves that it took so much longer than\nwe planned and hoped. The reason for this is mostly that this work really is\nresearch and sometimes it is just hard to predict what sort of problems turn up.\nPartial evaluation (the basis for our JIT generator) is a 30 years old technique\nthat was always just promising and never really successful, so the fact that we\nthink we can solve its problems in a few years is very much hubris anyway :-).\nOn the positive side, we think that we now know these problems much better than\never before and that we have a plan that has a chance to succeed.\nAlso we are still convinced that our approach has huge potential, despite the\ndifficulties. If we manage to pull it off, it should be significantly simpler to\nsupport new language features in the JIT and also to get speedups on some rather\ninteresting bits of the language. Some ideas we are having include generating a\nJIT for the regex engine or speed up ctypes-bindings to be nearly as fast as an\nextension module (or faster?). 
Also the JIT will be such that by construction\nthe JIT-generated code behaves identically to the original code, which isn't\nalways true for Psyco, for example.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/10/sprint-discussions-jit-generator-3301578822967655604.html" + }, + { + "title": "Sprint Discussions: C++ Library Bindings", + "text": "At the beginning of this year, PyPy grew ctypes support, thanks to generous\nsupport by Google. This made it possible to interface with C libraries from\nour Python interpreter, something that was possible but rather tedious before.\nWhat we are lacking so far is a way to interface to large C++ libraries (like\nGUI libraries). During the sprint we had a brainstorming session about possible\napproaches for fixing this shortcoming.\nFor CPython there are a number of approaches in common use:\n\n\nSIP, mainly used for PyQT\nSWIG\nBoost.Python\n\n\nThose all have the property that they produce some code that is then compiled\nwith a compiler to produce a CPython extension. The produced code also uses\nfunctions from CPython's C-API. This model is not simple to use for PyPy in its\ncurrent state. Since PyPy generates C code automatically, a fixed C-level API\ndoes not exist (it is not unlikely that at one point in the future we might have\nto provide one, but not yet). At the moment, PyPy very much has a \"Don't call\nus, we call you\"-approach.\nA very different approach is followed by the Reflex package, which is\ndeveloped at CERN (which has an incredible amount of C++ libraries). It is not\nmainly intended for writing Python bindings for C++ libraries but instead\nprovides reflection capabilities for C++. The idea is that for every C++ shared\nlibrary, an additional shared library is produced, which allows together with\nReflex to introspect properties of C++ classes, methods, etc. at runtime. 
These\nfacilities are then used for writing a small generic CPython extension module,\nthat allows CPython to use any C++ library for which this reflection information\nwas generated.\nThis approach is a bit similar to the ctypes module, apart from the fact\nthat ctypes does not use any reflection information, but the user has to\nspecify the data structures that occur in the C code herself. This makes it\nsometimes rather burdensome to write cross-platform library bindings.\nFor PyPy the approach seems rather fitting: We would need to implement only the\ngeneric extension module and could then use any number of C++ libraries. Of\ncourse some more evaluation is needed (e.g. to find out whether there are any\nrestrictions for the C++ code that the library can use and how bothersome it is\nto get this reflection information for a large library) but so far it seems\npromising.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/10/sprint-discussions-c-library-bindings-249141169883996521.html" + }, + { + "title": "Sprint Discussions: Release Planning", + "text": "One of the discussions that happened during the sprint was about how to approach\nthe next PyPy release. There hasn't been a release since the end of the EU\nperiod, which is not an optimal situation. Therefore we plan to make a 1.1\nrelease at the beginning of next year, ideally before Pycon US. We'd also like\nto move towards time-based releases. This will be greatly helped by the\nnew buildbot infrastructure, which allows us to decide when the\nstate of the codebase is stable enough to release.\nAnother goal of the release is to involve more people from the wider PyPy\ncommunity by having bugdays and generally asking for more support. 
This will be\nparticularly useful for bugs on platforms that no one of the core developers\ngroup is using.\nFeature-wise the release will mostly contain CPython 2.5 language support,\nincluding some new extension modules (like ctypes, expat, sqlite).\nIn addition we plan to make it easier to actually install and use the PyPy\nPython interpreter, which means some sort of proper installation procedure and\nsupporting distutils on top of PyPy. Another part of the release will be\nsupport for fully sand-boxing an interpreter.\nAdditionally there were also a large number of improvements on several levels\nsince the last release, like optimizations, faster oldstyle-classes, better\nGCs, correct finalization behaviour, lots and lots of bugfixes, better\nthreading support (still with the GIL), some work on improving memory\nbehaviour, ...\nIn contrast to our last release, we will focus mainly on PyPy's Python\nInterpreter and more particularly its C-version. There are also various\nexperimental interpreters that PyPy contains, like for Prolog, Smalltalk,\nJavaScript and Scheme. We also don't intend to put the LLVM and JavaScript\nbackends in the release, since they are essentially unmaintained and at least\npartially broken. If anybody is particularly interested in one of these\ncomponents, please feel free to step up and take responsibility for them.\nAnother thing that the release won't contain is a JIT. I plan to make another\nblog-post about this soon, stay tuned.", + "tags": "release", + "url": "https://www.pypy.org/posts/2008/10/sprint-discussions-release-planning-7097053444808236145.html" + }, + { + "title": "D\u00fcsseldorf Sprint Report Days 1-3", + "text": "The D\u00fcsseldorf sprint is currently in full progress and this post will try to\nsummarize what progress has been made in the last days. We are (again) sprinting\nat the STUPS group of the D\u00fcsseldorf University. 
You can find the sprint\nannouncement and the daily planning file.\nHolger and Samuele put quite some effort over several days into setting up and\nimproving PyPy's testing infrastructure. PyPy has a variety of tests. On the one\nhand, there are of course our own tests. But then we also have the CPython tests\nthat should be run on top of pypy-c. Up to now we used a custom-made pile of\nhacks, held together by lots of duct-tape. It consisted of a variety of\ndifferent machines running different things with different reporting solutions.\nSome of the old test-results can still be found on wyvern. Now we are moving\nto a buildbot based solution together with a custom reporter to have a view\nsimilar to the old one. Some details are not quite finished yet, but most of the\nthings are already working rather well (currently all the results displayed\nare from the 2.5-merge branch).\nAnother large (and ongoing) topic of work is the 2.5 branch. It contains the\nwork done by our Summer-of-Code student, Bruno Gola, of adding CPython 2.5\nfeatures to PyPy's Python interpreter. While Bruno implemented most language\nfeatures and imported the 2.5 stdlib into PyPy, a lot of details were still\nmissing. In the last days nearly everybody worked on fixing small issues and\nfailing stdlib tests. While doing that we tried to categorize some CPython tests\nas implementation dependant so that we can skip them when running on PyPy.\n\nMemory Improvements\nOne goal of the sprint is to measure and to reduce the memory behaviour of our\nPython interpreter. The idea is to make pypy-c a realistic option for use on\nembedded devices. By memory behaviour we mean both the\ndynamic memory usage (how much bytes does a dict or an instance take) as well as\nthe size of the executable and details of the GC strategy.\nAlexander, Carl Friedrich and Antonio did some work on analyzing the static data\nthat a pypy-c executable contains. 
Our executables have the tendency to be\nrather large, both due to a lot of code and due to a large amount of static\ndata. The analysis didn't give any really surprising results, the problem is\nmostly that we have a lot of static data originating from a bit everywhere in\nour program. Two big offenders are the unicodedata-module with about 750 KB\nof static data and the multimethod-tables with about 150 KB of data.\nArmin, Iko, Anto and Maciek worked on a new approach to malloc-removal. This is\n(for PyPy) a crucial optimization of the translation toolchain that performs\nescape analysis to find out which objects don't outlive the frame they were\nallocated in. Since RPython is garbage-collected we usually have a lot of\nallocations, so it is important to statically get rid of many of them. To\nsuccessfully do that, some inlining is needed to give the analysis more context.\nThis leads to the fact that we have rather aggressive inlining-settings to allow\nas much malloc-removal as possible. The new approach tries to inline functions\nonly if this actually leads to the successful removal of a malloc operation. The\ncode is not finished quite yet, so it remains to be seen how successful it will\nbe.\nBefore the sprint Maciek had started to work on a mark-compact GC for PyPy. The\nidea is that it is better for memory-constrained-environments because it does\nnot double the memory-requirements during collections. During the sprint Armin\nand Maciek worked on cleaning up the code a bit and then merging the branch.\nAn interesting property of the mark-compact GC is that after a collection all\nthe memory that is not currently used by the program is returned to the\noperating system. Right now the GC is not as fast as our more mature ones, but\nit probably will be the basis for future tweaking.\nA small thing that was done by Alexander and Carl Friedrich to make objects smaller is\nto enable shared instance dictionaries also for instances of old-style\nclasses. 
Before it worked only for instances of new-style classes. Shared\ninstance dictionaries are a way to reduce the memory-usage of instances. In the\noptimal case, it gives the same memory-savings that __slots__ are giving,\nbut without any behavioural changes. Conceptually it is very similar e.g. to\nthe notion of \"map\" in the Self project, or the hidden classes that Google Chrome's V8\nis using (click on the link, there are nice graphics). The\ndifference is that for them it is mostly a way to get faster attribute access,\nand PyPy is so far only using it for memory savings (but that might change in\nthe future).\nIn parallel to all the other work, John Witulski worked tirelessly on advancing\nthe AMD64-JIT-backend. John has the implementation of this backend as the topic\nof his Bachelor's thesis. He is progressing quite well (especially also\nconsidering that this is his first sizeable Python project ever), just sometimes\nbeing impaired by such annoyances as errors in the official Intel documentation.\nBy now the backend is supporting many integer operations and control flow.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/10/dsseldorf-sprint-report-days-1-3-5256639868851086032.html" + }, + { + "title": "Prolog-JIT Master's-Thesis Finished", + "text": "As we already blogged, in the last half-year or so, Michael Leuschel, Armin\nand I did a lot of JIT generator work on a Prolog prototype. The idea was to\nexperiment more quickly with some techniques than what would have been possible\nwith RPython. These experiments were quite successful in themselves. With very\nlittle code we managed to get a JIT that is not doing too badly when compared to\nexisting projects for Prolog.\nThis Prolog work was also the subject of my Master's thesis. I finished the\nthesis about two weeks ago (and since then have been mostly sleeping and then\nsprinting). 
The thesis should be self-contained when it comes to explaining the\nJIT concepts but needs knowledge of Prolog to be understandable.", + "tags": "jit", + "url": "https://www.pypy.org/posts/2008/10/prolog-jit-masters-thesis-finished-5462132148241449867.html" + }, + { + "title": "PyPy/Python at the Maemo summit", + "text": "Maciej and me visited the Maemo Summit in Berlin -\na community meetup around Nokia's Linux based\nmobile platform. We spontaneously did a lightning\ntalk about a first running pypy-c on Maemo\nand got nice feedback. \n\nWe also had a nice lunch with guys from the INDT in Brazil, including Marcio Marcedo and Marcelo Eduardo. It turns out that Python is used a lot on Maemo, for example the nice Canola UI is done with it. Will be interesting to see how this shapes up in relation to the iPhone and Android.\n\nA lot of Nokia engineers were around and they announced that from October on they are going for weekly new releases of their SDK for the new Fremantle (Maemo-5) debian-based platform until the SDK becomes final - if we got this right. \n\nFunnily enough, we met Marius Gedminas from the Programmers of Vilnius - he gave a lightning talk on his impressions as a community member. We think python programmers really should go much more to non-Python centric conferences.\n\nThe whole event took place at the C-Base - was a bit\ncrammed in some of the sessions with something like 200 people attending.\n\ncheers, Maciej and Holger", + "tags": "", + "url": "https://www.pypy.org/posts/2008/09/pypypython-at-maemo-summit-6115106472056714072.html" + }, + { + "title": "Pycon UK, Javascript and the GIL", + "text": "Just got back from Pycon UK 2008 - here are some impressions. \nBoth the keynote speakers Mark Shuttleworth (Canonical) and \nTed Leung (Sun Microsystems) expressed their concerns about\nJavascript becoming so fast and prominent that it could displace\nPython in the future. 
They also highlighted the fact that\nMulti-core systems get cheaper and more popular also on \ndesktop computers or notebooks. They challenged the community\nto advance Python implementations to exploit it. Question was up \nwhat PyPy can do here. As it stands, PyPy still uses the good old\nGlobal Interpreter Lock (GIL) but our approaches should indeed \nlend itself well to do experimentation with free threading. \n\nDuring the 2-day conference we met many interesting people, most \nnotably the guys from Resolver, among them William Reade who is working on\nIronClad -- which implements a fake python25.dll on top of\nIronPython. He presented some good results for Numpy in his\nlightning talk. This approach is surely something to follow\nclosely and potentially use for PyPy. \n\nWe also had lunch and a couple of chats with Jacob Kaplan-Moss from\nDjango fame - he is apparently up to try use PyPy's sandboxing features\nfor one of his projects, cool!\n\nConference itself was well organized for the 230 attending people - although\nthe venue might be a bit small for next year's EuroPython. Ah, and\nwe gave three well attended talks, find the slides here:\n\n\nPyPy status and 1.1 plans\nPyPy JIT\npy.test tutorial\n\ncheers,\nHolger, Maciej, Anto (associated through merlinux, btw)", + "tags": "", + "url": "https://www.pypy.org/posts/2008/09/pycon-uk-javascript-and-gil-8387247619202094916.html" + }, + { + "title": "D\u00fcsseldorf PyPy sprint 5-13th October, 2008", + "text": "The PyPy team is happy to announce the next sprint, which will take place in\nthe Computer Science Department of the University of D\u00fcsseldorf, Germany.\nSprinting will start on the 6th of October and go on till the 12th. Please\narrive on the day before if you want to come.\nTopics of the sprint will be aiming at a 1.1 release and to work on integrating PyPy better \nwith small devices. 
Other topics are also welcome!\nWe will try to find a hotel with group rates, so if you are interested, please\nsign up soon! See the announcement for more details.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/09/dsseldorf-pypy-sprint-5-13th-october-8919978872121664955.html" + }, + { + "title": "pylib/py.test 0.9.2 released", + "text": "PyPy and its 14638 automated tests use the py.test tool which is also used by many other projects. PyPy developers have actually driven and contributed a lot to its development. \n\nI just released version 0.9.2 of the py lib mainly fixing Windows issues and providing better packaging and integration with setuptools. It's usable completely independently from PyPy - \"easy_install py\" gives you the py.test command line. Of course you can run py.test on top of a translated PyPy version as well. Here is a quick summary of what the py lib provides besides py.test:\n\npy.execnet: ad-hoc code distribution to SSH, Socket and local sub processes\npy.magic.greenlet: micro-threads on standard CPython (\"stackless-light\") and PyPy\npy.path: path abstractions over local and subversion files\npy.code: dynamic code compile and traceback printing support\ntested against Linux, Win32, OSX, works on python 2.3-2.6\n\nGood general entry points for installation and documentation:\n\nPypi pages\nDownload/Install\nDocumentation/API\n\nhave fun, holger krekel", + "tags": "release", + "url": "https://www.pypy.org/posts/2008/08/pylibpytest-092-released-6233865913406513469.html" + }, + { + "title": "New translation option: --opt", + "text": "Hi all,\n\nA few command-line options for translate.py have changed.\nMost interesting is that optimization levels are selected with\nthe option --opt, or -O for short. This replaces --allopts,\nwhich was also called --faassen in reference to a person who\nis actually not involved in PyPy (so that was a bit of a\nstrange joke). 
Also, --allworkingmodules is the default\nnowadays, and can be cancelled with --no-allworkingmodules.\nThreads are also included in --allworkingmodules now.\n\nExamples:\n\ntranslate.py (reasonable default, corresponds to --opt=2)\n translate.py --opt=3 (best, maybe 10-20% faster)\n translate.py --opt=1 (translation is faster and less RAM-hungry)\n\n\nFor more information, see:\n \nGetting started\n List of optimization levels", + "tags": "", + "url": "https://www.pypy.org/posts/2008/08/new-translation-option-opt-7737733390438084418.html" + }, + { + "title": "PyPy runs unmodified django 1.0 beta", + "text": "This is just a quick update post to previous post - django folks commited all\noutstanding tickets and we are able to run unmodified django\non top of pypy-c. Instructions how to do it are well explained\non django wiki entry\n\nenjoy,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2008/08/pypy-runs-unmodified-django-10-beta-7105507436425430319.html" + }, + { + "title": "Europython 2008 PyPy talks and sprint sum up", + "text": "The EuroPython 2008 conference and sprints have finished - it certainly was \na very eventful and successful conference for PyPy. And many very interesting \nnon-PyPy talks as well. PyPy presentations are available online: PyPy status talk\nPyPy for the rest of us, PyPy behind the scenes. Armin and Maciej also did a well-attended \ntalk about PyPy's garbage collection, but that was quite interactive, no slides. \n\nThe talks were all well visited and we got good questions. However, we still \nneed to work on sorting out the \"PyPy technology cloud\" and how to present\nit to different audiences. Anyway, we are happy to hear feedback or questions\nabout the talks!\n\nAfter the conference there was a three-day PyPy sprint. Despite \nthe fact that most PyPy core developers were zombies, \nwe made good progress. Particularly our newcomers did very well. 
\nHere are some results: \n\n itertools rewritten in RPython for performance by Jakub\n Gustak and Andrew Durdin \n\n a new ctypes based dbm and hashlib module, both by Gasper Zejn \n with support from Henrik Vendelbo, they also got ctypes to nicely work on OSX. (sorry for lack of proper letters in names :)\n\n implement builtin function call profiling by Stephan Diehl, Antonio and Armin. \n\n running\n Pinax on top of pypy-c, by Henrik, Holger, Gasper. \n\n Jim Baker started a _rawffi.py for Jython using JNA aiming\n to provide support to run PyPy's ctypes on top of Jython. \n When Jython gets this to run, PyPy's JVM backend should be \n able to use it. Talk about Code Reuse :) \n\n oldstyle classes are now the default, this makes \n PyPy mimick very closely cpython's 2.5 object model. \n\n Andrew started a port of the Malbolge \n interpreter written in Python to RPython (obviously the only missing \n link for PyPy to take over the world). \n\n various cleanups (a new option \"--lonepycfiles\" helps with\n saner imports, remove int-float comparison shortcuts, ...) \n\nAt the end of the sprint we also discussed initial plans for a 1.1 release which we'd like to make happen this year. So we are generally looking forward to a busy rest of 2008 and luckily this starts by many of us taking a good vacation first :) \n\nCheers,\nfijal & holger", + "tags": "", + "url": "https://www.pypy.org/posts/2008/07/europython-2008-pypy-talks-and-sprint-2255727845041197411.html" + }, + { + "title": "Finding Bugs in PyPy with a Fuzzer", + "text": "Last week I played a bit with Fusil, which is a fuzzing framework. The idea is\nto feed the interpreter code that calls the functions of a module with random values\nof various types as arguments in the hope that one hits an unchecked case. This is\ndone until a problem is hit , the most common problem being a segfault. 
Victor Stinner,\nthe author of Fusil, is a regular in the PyPy IRC channel and thankfully helped me\nget started with Fusil. I used his project description for CPython as a starting\npoint and tweaked it a bit. The reason is that PyPy is harder to segfault and so\nI tweaked Fusil to also count uncaught RPython-level exceptions as such a problem.\n(RPython has full exception support, and if an RPython-exception escapes to the top\nlevel, the Python interpreter aborts. One should not be able to exploit this,\nbut for a user it is bad enough, because such exceptions cannot be caught from\nPython code.)\nUsing Fusil I found a number of cases where such exceptions happened (in some\npickle support-code, in the expat parser, in the os and in the termios\nmodule) and also one or two segfaults (in the parser module, of all places).\nI fixed all these problems so that by\nnow the fuzzer just runs for a very long time and only finds things that take\ntoo long (so they count as a way to do a DoS attack) like\npow(12355123123L, 12351512123121L) or round(1, 1000000000) (the latter\nshould probably be fixed). This probably just means that the fuzzer is not good\nenough, because there are certainly segfaults left in PyPy. However, the fact\nthat it is rather hard to find them validates our approach of using a\nhigh-level memory-managed language for our interpreter. Victor tells me that it\nis rather easy to find segfaults in CPython this way, he already found quite\nsome problems.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/07/finding-bugs-in-pypy-with-fuz-7503072572107631526.html" + }, + { + "title": "PyPy's Python runs Pinax / Django", + "text": "During the EP2008 sprint we got Pinax running on top of PyPy. At our play1 server we have it running on top of pypy-c. Not that you'll notice many differences to the original site but that's the point, isn't it? ... 
Well, in fact i am too lazy to customize our play1 version now - i rather spent a nice evening with the other sprint guys :) \n\nPinax integrates numerous reusable Django apps to take care of the things that many sites have in common. Many thanks particularly to Henrik Vendelbo who sorted out various Pinax and PyPy issues, and wrote up a nice DjangoAndPyPy wiki page describing the installation process.\n\ngreetings from Vilnius (Lithuania), Holger", + "tags": "", + "url": "https://www.pypy.org/posts/2008/07/pypys-python-runs-pinax-django-1265543049596913506.html" + }, + { + "title": "EP2008: PyPy meets Jython", + "text": "One of the great events at EuroPython 2008 were our chats and meetings with the Jython and Sun people. The Jython people recently are pushing into releasing Python version 2.5 and they currently pursue many interesting sub projects. Coincidentally, PyPy also has tons of interesting areas and results :) So we eventually got into brainstorming a number of possible technical collab ideas. Further below is a first list as i wrote it down from our 10 people PyPy / Jython 30 minute close up meeting yesterday.\n\nIt felt great to be able to talk to the Jython people this way - kudos to Sun for their clear commitments and open ways to go about things! I sense a genuine interest in fair collaboration with non-java developer communities. Seems like they are serious about not focusing on \"Java this\", \"Java that\" anymore but rather focus on the JVM platform. Good! And about language\nindependent interest in ambitious technology. Even Better! 
I am tensed to see how things go from here.\n\nSo here the list of technical collab ideas:\nctypes - try to create _rawffi module in Java for Jython, which will enable Jython to reuse our existing ctypes implementation (and have PyPy use the Jython-rawffi for its own for PyPy.JVM) generally see to share work / (continue) collaborate regarding extension modulesJython/PyPy (and eventually IronPython): document known differences to CPython, maybe in a PEPPython Interpreter for Jython (in order to run CPython's .pyc files): re-use pypy's bytecode evaluator, implement a \"Jython object space\". re-use rpython-extension modules for jython (e.g. SRE), by compiling them to Java and reusing as a native library.collaborate on testing framework / benchmarking, have a common site to show test resultsmake py.test compatible with jythoncome up with a set of \"pure Python language\" tests, which would gather and refactor tests from CPython, PyPy and Jython. look into using java types / jython approaches for implementing free threading.share knowledge regarding JIT / psyco\nIf you have any more ideas, comments or would like to join efforts, let us know!\n\nCheers and thanks to Ted Leung, Frank Wierzbiki, Jim Baker and Tobias Ivarsson from Sun and Jython fame respectively,\n\nHolger", + "tags": "ep2008,jython,pypy,sun", + "url": "https://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.html" + }, + { + "title": "PyPy at the EuroPython 2008", + "text": "Greetings from Vilnius, Lithuania. There were already\ntwo pypy talks, one performed by Jacob Hallen\nPyPy for the rest of us and second\nby Maciej Fijalkowski PyPy status talk. 
The thing that\nwe forgot to mention is that the PyPy sandboxing feature\ncan also easily limit CPU and RAM usage as well as\nany other possible resource (like network transfer).\nFor anyone who would like to join, there is a PyPy\nsprint after the conference.\n\nCheers,\narigo & fijal", + "tags": "", + "url": "https://www.pypy.org/posts/2008/07/pypy-at-europython-2008-1488914968455397674.html" + }, + { + "title": "JIT in Prolog", + "text": "Hi all,\n\nSome news from the JIT front. Progress on the JIT has been low-profile\nin the past few months. No big results to announce yet, but we have\nplayed with some new ideas, and they are now documented as a draft\nresearch paper: Towards Just-In-Time Compilation and Specialisation of Prolog.\n\nProlog? Yes. To understand this slightly unusual choice of programming\nlanguage, here is first some background about our JIT.\n\nPyPy contains not a JIT but a JIT generator, which means that we\nonly write an interpreter for a language (say, the complete Python\nlanguage), and we get a JIT \"for free\". More precisely, it's not for\nfree: we had to write the JIT generator, of course, as well as some\namount of subtle generic support code. The JIT generator preprocesses\nthe (complete Python) interpreter that we wrote and links the result\nwith the generic support code; the result is a (complete Python) JIT.\n\nThe way that this works so far gives us a generated JIT that is very\nsimilar to Psyco in the way\nit works.\nBut Psyco has issues (and so the current PyPy JITs have the same issues):\nit can sometimes produce too much machine code,\ne.g. 
by failing to notice that two versions of the machine code are\nclose enough that they should really be one; and it can also sometimes\nfail in the opposite way, by making a single sub-efficient version of\nthe machine code instead of several efficient specialized versions.\n\nA few months ago we have chosen to experiment with improving this\ninstead of finishing and polishing what we had so far. The choice was\nmostly because we were (and still are) busy finishing and polishing\neverything else in PyPy, so it was more fun to keep at least the JIT on\nthe experimental side. Besides, PyPy is now getting to a rather good\nand complete state, and it is quite usable without the JIT already.\n\nAnyway, enough excuses. Why is this about Prolog?\n\nIn PyPy, both the (complete Python) interpreter and the JIT support code\nare in RPython. Now RPython is not\nan extremely complicated language, but still, it is far from the top on a\nminimalism scale. In general, this is a good in practice (or at least I\nthink so): it gives\na reasonable balance because it is convenient to write interpreters\nin RPython, while not being so bloated that it makes our translation\ntoolchain horribly complicated (e.g. writing garbage collectors for\nRPython - or even JIT generators - is reasonable). Still, it is not the\nbest choice for early research-level experimentation.\n\nSo what we did instead recently is hand-write, in Prolog, a JIT that\nlooks similar to what we would like to achieve for RPython with our JIT\ngenerator. This gave much quicker turnaround times than we were used to\nwhen we played around directly with RPython. We wrote tiny example\ninterpreters in Prolog (of course not a complete Python interpreter).\nSelf-inspection is trivial in Prolog, and generating Prolog code at\nruntime is very easy too. 
Moreover, many other issues are also easier\nin Prolog: for example, all data structures are immutable \"terms\".\nOther languages than Prolog would have worked, too, but it happens to be\none that we (Carl Friedrich, Michael Leuschel and myself) are familiar\nwith -- not to mention that it's basically a nice small dynamic\nlanguage.\n\nOf course, all this is closely related to what we want to do in PyPy.\nThe fundamental issues are the same. Indeed, in PyPy, the major goals\nof the JIT are to remove, first, the overhead of allocating objects all\nthe time (e.g. integers), and second, the overhead of dynamic dispatch\n(e.g. finding out that it's integers we are adding). The equivalent\ngoals in Prolog are, first, to avoid creating short-lived terms, and\nsecond, to remove the overhead of dispatch (typically, the dispatching\nto multiple clauses). If you are familiar with Prolog you can find more\ndetails about this in the paper. So far we already played with many possible solutions\nin the Prolog JIT, and the paper describes the most mature one; we have\nmore experimentation in mind. The main point here is that these are\nmostly language-independent techniques (anything that works both in\nProlog and in RPython has to be language-independent, right? :-)\n\nIn summary, besides the nice goal of speeding up Prolog, we are trying\nto focus our Prolog JIT on the issues and goals that have equivalents in\nthe PyPy JIT generator. 
So in the end we are pretty convinced that it\nwill give us something that we can backport to PyPy -- good ideas about\nwhat works and what doesn't, as well as some concrete algorithms.", + "tags": "jit", + "url": "https://www.pypy.org/posts/2008/06/hi-all-some-news-from-jit-front-7534695765973581706.html" + }, + { + "title": "PyPy code swarm", + "text": "Following the great success of code_swarm, I recently produced a\nvideo that shows the commit history of the PyPy project.\nThe video shows the commits under the dist/ and branch/\ndirectories, which is where most of the development happens.\nIn the first part of the video, you can see clearly our sprint based\napproach: the video starts in February 2003, when the first PyPy\nsprint took place in Hildesheim: after a lot of initial activity, few\ncommits happened in the next two months, until the second PyPy sprint,\nwhich took place in Gothenburg in late May 2003; around the minute\n0:15, you can see the high commit rate due to the sprint.\nThe next two years follow more or less the same pattern: very high\nactivity during sprints, followed by long pauses between them; the\nmost interesting breaking point is located around the minute 01:55;\nit's January 2005, and when the EU project starts, the number of\ncommits just explodes, as well as the number of people involved.\nI also particularly appreciated minute 03:08 aka March 22, 2006: it's\nthe date of my first commit to dist/, and my nickname magically\nappears; but of course I'm biased :-).\nThe soundtrack is NIN - Ghosts IV - 34: thanks to xoraxax for\nhaving added the music and uploaded the video.\n PyPy Codeswarm from solse@trashymail.com on Vimeo.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/06/pypy-code-swarm-7038411918926116477.html" + }, + { + "title": "Funding of some recent progress by Google's Open Source Programs", + "text": "As readers of this blog already know, PyPy development has\nrecently focused on getting the code base to a more 
usable state. One\nof the most important parts of this work was creating an\nimplementation of the ctypes module for PyPy, which\nprovides a realistic way to interface with external libraries. The\nmodule is now fairly complete (if somewhat slow), and has generated a\ngreat deal of community interest. One of the main reasons this work\nprogressed so well was that we received funding from Google's Open\nSource Programs Office. This is\nreally fantastic for us, and we cannot thank Google and Guido enough for helping PyPy progress\nmore rapidly than we could have with volunteer-only time!\nThis funding opportunity arose from the PyPy US road trip at the end\nof last year, which included a visit to Google. You\ncan check out the video\nof the talk we gave during our visit. We wrapped up our day with\ndiscussions about the possibility of Google funding some PyPy work and\nsoon after we were at work on the proposal for improvements we'd\nsubmitted.\nOne nice side-effect of the funding is indeed that we can use some of\nthe money for funding travels of contributors to our sprint meetings.\nThe next scheduled Google funding proposal also aims at making our\nPython interpreter more usable and compliant with CPython. This will be done by trying to\nfully run Django on top of PyPy. 
With\nmore efforts like this one we're hoping that PyPy can start to be used\nas a CPython replacement before the end of 2008.\nMany thanks to the teams at merlinux and Open End for making this development possible, including\nCarl Friedrich Bolz, Antonio Cuni, Holger Krekel, Maciek Fijalkowski\nat merlinux, Samuele Pedroni and yours truly at Open End.\nWe always love to hear feedback from the community, and you can get\nthe latest word on our development and let us know your thoughts here in the comments.\nBea D\u00fcring, Open End AB\n\nPS: Thanks Carl Friedrich Bolz for drafting this post.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/06/pypy-improvements-5272963843122158791.html" + }, + { + "title": "Pdb++ and rlcompleter_ng", + "text": "When hacking on PyPy, I spend a lot of time inside pdb; thus, I tried\nto create a more comfortable environment where I can pass my nights\n:-).\nAs a result, I wrote two modules:\n\n\npdb.py, which extends the default behaviour of pdb, by adding\nsome commands and some fancy features such as syntax highlight and\npowerful tab completion; pdb.py is meant to be placed somewhere in\nyour PYTHONPATH, in order to override the default version of pdb.py\nshipped with the stdlib;\nrlcompleter_ng.py, whose most important feature is the ability\nto show coloured completions depending on the type of the objects.\n\n\nTo find more informations about those modules and how to install them,\nhave a look at their docstrings.\nIt's important to underline that these modules are not PyPy specific,\nand they work perfectly also on top of CPython.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/06/pdb-and-rlcompleterng-2414105295687348881.html" + }, + { + "title": "Running Nevow on top of PyPy", + "text": "Another episode of the \"Running Real Application of top of PyPy\" series:\n\nToday's topic: Divmod's Nevow. 
Nevow (pronounced as the French \"nouveau\", or \"noo-voh\") is a web application construction kit written in Python. Which means it's just another web framework, but this time built on top of Twisted.\nWhile, due to some small problems we're not yet able to pass full Twisted test suite on top of pypy-c, Nevow seems to be simple enough to work perfectly (959 out of 960 unit tests passing, with the last one recognized as pointless and about to be deleted). Also, thanks to\nexarkun, Nevow now no longer relies on ugly details like refcounting.\n\nAs usual, translate pypy using:\n\ntranslate.py --gc=hybrid --thread targetpypystandalone --faassen --allworkingmodules --oldstyle\n\nOf course, obligatory to the series, screenshot:\n\n\nThis is Nevow's own test suite.\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2008/06/running-nevow-on-top-of-pypy-58891137802412513.html" + }, + { + "title": "Next sprint: Vilnius/Post EuroPython, 10-12th of July", + "text": "As happened in the last years, there will be a PyPy sprint just after\nEuroPython. The sprint will take place in the same hotel as the\nconference, from 10th to 12th of July.\nThis is a fully public sprint: newcomers are welcome, and on the first\nday we will have a tutorial session for those new to PyPy development.\nSome of the topics we would like to work on:\n\n\ntry out Python programs and fix them or fix PyPy or fix performance bottlenecks\nsome JIT improvement work\nport the stackless transform to ootypesystem\n\n\nOf course, other topics are also welcome.\nFor more information, see the full announcement.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/06/next-sprint-vilniuspost-europython-10-3844544842675903586.html" + }, + { + "title": "German Introductory Podcast About Python and PyPy", + "text": "During the Berlin Sprint Holger was interviewed by Tim Pritlove for Tim's\nPodcast \"Chaosradio Express\". The whole thing is in German, so only\ninteresting to German-speakers. 
The PyPy episode can be found here. The\ninterview is touching on a lot of topics, starting with a fairly general intro\nabout what Python is and why it is interesting and then moving to explaining and\ndiscussing PyPy. The bit about PyPy starts after about 45 minutes. There is also\na comment page about the episode.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/06/german-introductory-podcast-about-3836017753197345761.html" + }, + { + "title": "Running Pylons on top of PyPy", + "text": "The next episode of the \"Running Real Applications on Top of PyPy\" series: \nYesterday, we spent some time with Philip Jenvey on tweaking Pylons and PyPy to cooperate with each other. While doing this we found some pretty obscure details, but in general things went well.\n\nAfter resolving some issues, we can now run all (72) Pylons tests on\ntop of pypy-c compiled with the following command:\n\n\ntranslate.py --gc=hybrid --thread targetpypystandalone --faassen --allworkingmodules --oldstyle\n\nand run some example application. Here is the obligatory screenshot (of course\nit might be fake, as usual with screenshots). Note: I broke application on purpose to showcase cool debugger, default screen is just boring:\n\nPlease note that we run example application without DB access, since\nwe need some more work to get SQLAlchemy run on top of pypy-c together with\npysqlite-ctypes. Just one example of an obscure detail that sqlalchemy is\nrelying on in the test suite:\n\n\n class A(object):\n \u00a0\u00a0locals()[42] = 98\n\n\nUpdate: This is only about new-style classes.\n\nThis works on CPython and doesn't on PyPy.\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2008/06/running-pylons-on-top-of-pypy-3234492105090025733.html" + }, + { + "title": "List comprehension implementation details", + "text": "List comprehensions are a nice feature in Python. They are, however, just\nsyntactic sugar for for loops. E.g. 
the following list comprehension:\n\ndef f(l):\n return [i ** 2 for i in l if i % 3 == 0]\n\nis sugar for the following for loop:\n\ndef f(l):\n result = []\n for i in l:\n if i % 3 == 0:\n result.append(i ** 2)\n return result\n\nThe interesting bit about this is that list comprehensions are actually\nimplemented in almost exactly this way. If one disassembles the two functions\nabove one gets sort of similar bytecode for both (apart from some details, like\nthe fact that the append in the list comprehension is done with a special\nLIST_APPEND bytecode).\nNow, when doing this sort of expansion there are some classical problems: what\nname should the intermediate list get that is being built? (I said classical\nbecause this is indeed one of the problems of many macro systems). What CPython\ndoes is give the list the name _[1] (and _[2]... with nested list\ncomprehensions). You can observe this behaviour with the following code:\n\n$ python\nPython 2.5.2 (r252:60911, Apr 21 2008, 11:12:42)\n[GCC 4.2.3 (Ubuntu 4.2.3-2ubuntu7)] on linux2\nType \"help\", \"copyright\", \"credits\" or \"license\" for more information.\n>>> [dir() for i in [0]][0]\n['_[1]', '__builtins__', '__doc__', '__name__', 'i']\n>>> [[dir() for i in [0]][0] for j in [0]][0]\n['_[1]', '_[2]', '__builtins__', '__doc__', '__name__', 'i', 'j']\n\nThat is a sort of nice decision, since you can not reach that name by any\n\"normal\" means. Of course you can confuse yourself in funny ways if you want:\n\n>>> [locals()['_[1]'].extend([i, i + 1]) for i in range(10)]\n[0, 1, None, 1, 2, None, 2, 3, None, 3, 4, None, 4, 5, None, 5, 6, None, 6, 7, None, 7, 8, None, 8, 9, None, 9, 10, None]\n\nNow to the real reason why I am writing this blog post. 
PyPy's Python\ninterpreter implements list comprehensions in more or less exactly the same way,\nwith one tiny difference: the name of the variable:\n\n$ pypy-c-53594-generation-allworking\nPython 2.4.1 (pypy 1.0.0 build 53594) on linux2\nType \"help\", \"copyright\", \"credits\" or \"license\" for more information.\n``the globe is our pony, the cosmos our real horse''\n>>>> [dir() for i in [0]][0]\n['$list0', '__builtins__', '__doc__', '__name__', 'i']\n\n\nNow, that shouldn't really matter for anybody, should it? Turns out it does. The\nfollowing way too clever code is apparently used a lot:\n\n__all__ = [__name for __name in locals().keys() if not __name.startswith('_') '\n or __name == '_']\n\nIn PyPy this will give you a \"$list0\" in __all__, which will prevent the\nimport of that module :-(. I guess I need to change the name to match CPython's.\nLesson learned: no detail is obscure enough to not have some code depending\non it. Mostly problems on this level of obscurity are the things we are fixing\nin PyPy at the moment.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/06/list-comprehension-implementation-5289956690288817225.html" + }, + { + "title": "Better Profiling Support for PyPy", + "text": "As PyPy is getting more and more usable, we need better tools to use to work on certain applications running on top of PyPy. Out of this interest, I spent some time implementing the _lsprof module, which is a part of the standard library since Python2.5. It is necessary for the cProfile module, which can profile Python programs with high accuracy and a lot less overhead than the older, pure-python profile module. 
Together with the excellent\nlsprofcalltree script, you can display this data using kcachegrind, which gives you great visualization possibilities for your profile data.\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2008/06/better-profiling-support-for-pypy-1848129914083462080.html" + }, + { + "title": "Threads and GCs", + "text": "Hi all,\n\nWe can now compile a pypy-c that includes both thread support\nand one of our semi-advanced garbage collectors. This means\nthat threaded Python programs can now run not only with a\nbetter performance, but without the annoyances of the Boehm\ngarbage collector. (For example, Boehm doesn't like too much\nseeing large numbers of __del__(), and our implementation of\nctypes uses them everywhere.)\n\nMagic translation command (example):\n\n translate.py --thread --gc=hybrid targetpypystandalone --faassen --allworkingmodules\n\nNote that multithreading in PyPy is based on a global\ninterpreter lock, as in CPython. I imagine that we will get\nrid of the global interpreter lock at some point in the future\n-- I can certainly see how this might be done in PyPy, unlike\nin CPython -- but it will be a lot of work nevertheless. 
Given\nour current priorities, it will probably not occur soon unless\nsomeone steps in.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/05/threads-and-gcs-1126087726480790112.html" + }, + { + "title": "Progresses on the CLI JIT backend front", + "text": "In the last months, I've actively worked on the CLI backend for PyPy's\nJIT generator, whose goal is to automatically generate JIT compilers\nthat produces .NET bytecode on the fly.\nThe CLI JIT backend is far from be completed and there is still a lot\nof work to be done before it can handle the full PyPy's Python\ninterpreter; nevertheless, yesterday I finally got the first .NET\nexecutable that contains a JIT for a very simple toy language called\ntlr, which implements an interpreter for a minimal register based\nvirtual machine with only 8 operations.\nTo compile the tlr VM, follow these steps:\n\n\nget a fresh checkout of the oo-jit branch, i.e. the branch\nwhere the CLI JIT development goes on:\n\n$ svn co https://codespeak.net/svn/pypy/branch/oo-jit\n\n\ngo to the oo-jit/pypy/jit/tl directory, and compile the tlr VM\nwith the CLI backend and JIT enabled:\n\n$ cd oo-jit/pypy/jit/tl/\n$ ../../translator/goal/translate.py -b cli --jit --batch targettlr\n\n\n\n\nThe goal of our test program is to compute the square of a given\nnumber; since the only operations supported by the VM are addition and\nnegation, we compute the result by doing repetitive additions; I won't\ndescribe the exact meaning of all the tlr bytecodes here, as they are\nquite self-documenting:\n\nALLOCATE, 3, # make space for three registers\nMOV_A_R, 0, # i = a\nMOV_A_R, 1, # copy of 'a'\n\nSET_A, 0,\nMOV_A_R, 2, # res = 0\n\n# 10:\nSET_A, 1,\nNEG_A,\nADD_R_TO_A, 0,\nMOV_A_R, 0, # i--\n\nMOV_R_A, 2,\nADD_R_TO_A, 1,\nMOV_A_R, 2, # res += a\n\nMOV_R_A, 0,\nJUMP_IF_A, 10, # if i!=0: goto 10\n\nMOV_R_A, 2,\nRETURN_A # return res\n\nYou can find the program also at the end of the tlr module; to get an\nassembled version of the bytecode, 
ready to be interpreted, run this\ncommand:\n\n$ python tlr.py assemble > square.tlr\n\nNow, we are ready to execute the code through the tlr VM; if you are\nusing Linux/Mono, you can simply execute the targettlr-cli script\nthat has been created for you; however, if you use Windows, you have\nto manually fish the executable inside the targettlr-cli-data\ndirectory:\n\n# Linux\n$ ./targettlr-cli square.tlr 16\n256\n\n# Windows\n> targettlr-cli-data\\main.exe square.tlr 16\n256\n\nCool, our program computed the result correctly! But, how can we be\nsure that it really JIT compiled our code instead of interpreting it?\nTo inspect the code that it's generated by our JIT compiler, we simply\nset the PYPYJITLOG environment variable to a filename, so that the\nJIT will create a .NET assembly containing all the code that has been\ngenerated by the JIT:\n\n$ PYPYJITLOG=generated.dll ./targettlr-cli square.tlr 16\n256\n$ file generated.dll\ngenerated.dll: MS-DOS executable PE for MS Windows (DLL) (console) Intel 80386 32-bit\n\nNow, we can inspect the DLL with any IL disassembler, such as\nilasm or monodis; here is an excerpt of the disassembled code,\nthat shows how our square.tlr bytecode has been compiled to .NET\nbytecode:\n\n.method public static hidebysig default int32 invoke (object[] A_0, int32 A_1) cil managed\n{\n .maxstack 3\n .locals init (int32 V_0, int32 V_1, int32 V_2, int32 V_3, int32 V_4, int32 V_5)\n\n ldc.i4 -1\n ldarg.1\n add\n stloc.1\n ldc.i4 0\n ldarg.1\n add\n stloc.2\n IL_0010: ldloc.1\n ldc.i4.0\n cgt.un\n stloc.3\n ldloc.3\n brfalse IL_003b\n\n ldc.i4 -1\n ldloc.1\n add\n stloc.s 4\n ldloc.2\n ldarg.1\n add\n stloc.s 5\n ldloc.s 5\n stloc.2\n ldloc.s 4\n stloc.1\n ldarg.1\n starg 1\n\n nop\n nop\n br IL_0010\n\n IL_003b: ldloc.2\n stloc.0\n br IL_0042\n\n ldloc.0\n ret\n}\n\nIf you know a bit IL, you can see that the code generated is not\noptimal, as there are some redundant operations like all those\nstloc/ldloc pairs; however, while not 
optimal, it is still quite good\ncode, not much different to what you would get by writing the square\nalgorithm directly in e.g. C#.\nAs I said before, all of this is still work in progress and there is\nstill much to be done. Stay tuned :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2008/05/progresses-on-cli-jit-backend-front-1021772190959551376.html" + }, + { + "title": "More windows support", + "text": "Recently, thanks to Amaury Forgeot d'Arc and Michael Schneider, Windows became more of a first-class platform for PyPy's Python interpreter. Most RPython extension modules are now considered working (apart from some POSIX specific modules). Even CTypes now works on windows!\n\n\nNext step would be to have better buildbot support for all supported platforms (Windows, Linux and OS X), so we can control and react to regressions quickly. (Buildbot is maintained by JP Calderone)\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2008/05/more-windows-support-1747028151130099034.html" + }, + { + "title": "S3-Workshop Potsdam 2008 Writeup", + "text": "Trying to give some notes about the S3 Workshop in Potsdam that several\nPyPyers and Spies (Armin, Carl Friedrich, Niko, Toon, Adrian) attended before\nthe Berlin sprint. We presented a paper about SPy there. Below are some mostly\nrandom note about my (Carl Friedrich's) impressions of the conference and some\ntalk notes. Before that I'd like to give thanks to the organizers who did a\ngreat job. The workshop was well organized, the social events were wonderful (a\nvery relaxing boat trip in the many lakes around Potsdam and a conference\ndinner).\nVideo recordings of all the talks can be found on the program page.\n\nInvited Talks\n\"Late-bound Object Lambda Architectures\" by Ian Piumarta was quite an inspiring\ntalk about VPRI's attempt at writing a flexible and understandable computing\nsystem in 20K lines of code. 
The talk was lacking a bit in technical details, so\nwhile it was inspiring I couldn't really say much about their implementation.\nApart from that, I disagree with some of their goals, but that's the topic of\nanother blog post.\n\"The Lively Kernel \u2013 A Self-supporting System on a Web Page\" by Dan Ingalls. Dan\nIngalls is one of the inventors of the original Smalltalk and of Squeak. He was\ntalking about his latest work, the attempts of bringing a Squeak-like system to\na web browser using JavaScript and SVG. To get some feel for what exactly The\nLively Kernel is, it is easiest to just try it out (only works in Safari\nand Firefox 3 above Beta 5 though). I guess in a sense the progress of the\nLively Kernel over Squeak is not that great but Dan seems to be having fun. Dan\nis an incredibly enthusiastic, friendly and positive person, it was really great\nmeeting him. He even seemed to like some of the ideas in SPy.\n\"On Sustaining Self\" by Richard P. Gabriel was a sort of deconstructivist\nmulti-media-show train wreck of a presentation that was a bit too weird for my\ntaste. There was a lot of music, there were sections in the presentation\nwhere Richard discussed with an alter ego, whose part he had recorded in advance\nand mangled with a sound editor. There was a large bit of a documentary\nabout Levittown. Even the introduction and the questions were weird, with Pascal\nConstanza staring down the audience, without saying a word (nobody dared to ask\nquestions). I am not sure I saw the point of the presentation, apart from\ngetting the audience to think, which probably worked. It seems that there are\npeople (e.g. Christian Neukirchen) that liked the presentation, though.\n\n\nResearch Papers\n\"SBCL - A Sanely Bootstrappable Common Lisp by Christophe Rhodes described the\nbootstrapping process of SBCL (Steel Bank Common Lisp). SBCL can be bootstrapped\nby a variety of Common Lisps, not just by itself. 
SBCL contains a complete\nblueprint of the initial image instead of always getting the new image by\ncarefully mutating the old one. This bootstrapping approach is sort of similar\nto that of PyPy.\n\"Reflection for the Masses\" by Charlotte Herzeel, Pascal Costanza, and Theo\nD'Hondt retraced some of the work of Brian Smith on reflection in Lisp. The\ntalk was not very good, it was way too long (40 min), quite hard to understand\nbecause Charlotte Herzeel was talking in a very low voice. The biggest mistake\nin her talk was in my opinion that she spent too much time explaining a more or\nless standard meta-circular interpreter for Lisp and then running out of time\nwhen she was trying to explain the modifications. I guess it would have been a\nfair assumptions that large parts of the audience know such interpreters, so\nglossing over the details would have been fine. A bit of a pity, since the paper\nseems interesting.\n\"Back to the Future in One Week - Implementing a Smalltalk VM in PyPy\"\nby Carl Friedrich Bolz, Adrian Kuhn, Adrian Lienhard, Nicholas D. Matsakis,\nOscar Nierstrasz, Lukas Renggli, Armin Rigo and Toon Verwaest, the paper with\nthe longest author list. We just made everybody an author who was at the sprint\nin Bern. Our paper had more authors than all the other papers together :-). I\ngave the presentation at the workshop, which went quite well, judging from the\nfeedback I got.\n\"Huemul - A Smalltalk Implementation\" by Guillermo Adri\u00e1n Molina. Huemul is a\nSmalltalk implementation that doesn't contain an interpreter but directly\ncompiles all methods to assembler (and also saves the assembler in the image).\nIn addition, as much functionality (such as threading, GUI) as possible is\ndelegated to libraries instead of reimplementing them in Smalltalk\n(as e.g. Squeak is doing). The approach seems to suffer from the usual problems\nof manually writing a JIT, e.g. the VM seems to segfault pretty often. 
Also I\ndon't agree with some of the design decisions of the threading scheme, there is\nno automatic locking of objects at all, instead the user code is responsible for\npreventing concurrent accesses from messing up things (which even seems to lead\nto segfaults in the default image).\n\"Are Bytecodes an Atavism?\" by Theo D'Hondt argued that using AST-based\ninterpreters can be as fast as bytecode-based interpreters which he proved by\nwriting two AST-interpreters, one for Pico and one for Scheme. Both of these\nimplementations seem to perform pretty well. Theo seems to have many similar\nviews as PyPy, for example that writing simple straightforward interpreters is\noften preferable than writing complex (JIT-)compilers.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/05/s3-workshop-potsdam-2008-writeup-6610637452403831794.html" + }, + { + "title": "Berlin Sprint Finished", + "text": "The Berlin sprint is finished, below some notes on what we worked on during\nthe last three days:\n\n\nCamillo worked tirelessly on the gameboy emulator with some occasional input\nby various people. He is making good progress, some test ROMs run now on the\ntranslated emulator. However, the graphics are still not completely working\nfor unclear reasons. Since PyBoy is already taken as a project name, we\nconsidered calling it PyGirl (another name proposition was \"BoyBoy\", but the\nimplementation is not circular enough for that).\n\n\n\n\nOn Monday Armin and Samuele fixed the problem with our multimethods so that\nthe builtin shortcut works again (the builtin shortcut is an optimization\nthat speeds up all operations on builtin non-subclassed types quite a bit).\nAntonio and Holger (who hasn't been on a sprint in a while, great to have you\nback!) worked on writing a conftest file (the plugin mechanism of py.test)\nthat would allow us to run Django tests using py.test, which seems to be not\ncompletely trivial. 
They also fixed some bugs in PyPy's Python interpreter,\ne.g. related to dictionary subclassing.\nKarl started adding sound support to the RPython SDL-bindings, which will be\nneeded both by the Gameboy emulator and eventually by the SPy VM.\nArmin and Maciek continued the work that Maciek had started a while ago of\nimproving the speed of PyPy's IO operation. In the past, doing IO usually\ninvolved copying lots of memory around, which should have improved now. Armin\nand Maciek improved and then merged the first of the two branches that\ncontained IO improvements, which speeds up IO on non-moving GCs (mostly the\nBoehm GC). Then they continued working on the hybrid-io branch which is\nsupposed to improve IO on the hybrid GC (which was partially designed exactly\nfor this).\nToon, Carl Friedrich finished cleaning up the SPy improvement branch and\nfixed all warnings that occur when you translate SPy there. An obscure bug in\nan optimization prevented them from getting working executables, which at\nthis moment blocks the merging of that branch.\n\n\nBy now everybody is home again (except for Anto, who booked his return flight\ntwo days too late, accidentally) and mostly resting. It was a good sprint, with\nsome interesting results and several new people joining. And it was definitely\nthe most unusual sprint location ever :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2008/05/berlin-sprint-finished-1597243123548564657.html" + }, + { + "title": "Berlin Sprint Day 1 + 2", + "text": "After having survived the S3-Workshop which took place in Potsdam on Thursday\nand Friday (a blog-post about this will follow later) we are now sitting in the\nc-base in Berlin, happily sprinting. Below are some notes on what progress we\nmade so far:\n\n\nThe Gameboy emulator in RPython that Camillo Bruni is working on for his\nBachelor project at Uni Bern does now translate. 
It took him (assisted by\nvarious people) a while to figure out the translation errors (essentially\nbecause he wrote nice Python code that passed bound methods around, which the\nRTyper doesn't completely like). Now that is fixed and the Gameboy emulator\ntranslates and runs a test ROM. You cannot really see anything yet, because\nthere is no graphics support in RPython.\nTo get graphics support in RPython Armin and Karl started writing SDL\nbindings for RPython, which both the Gameboy emulator and the SPy VM need.\nThey have basic stuff working, probably enough to support the Gameboy\nalready.\nAlexander, Armin, Maciek and Samuele discussed how to approach separate\ncompilation for RPython, which isn't easy because the RPython type analysis\nis a whole-program analysis.\nStephan, Peter and Adrian (at least in the beginning) worked on making PyPy's\nstackless module more complete. They added channel preferences which\nchange details of the scheduling semantics.\nToon, Carl Friedrich and Adrian (a tiny bit) worked on SPy. There is a branch\nthat Toon started a while ago which contains many improvements but is also\nquite unclear in many respects. There was some progress in cleaning that up.\nThis involved implementing the Smalltalk process scheduler (Smalltalk really\nis an OS). There is still quite some work left though. While doing so, we\ndiscovered many funny facts about Squeak's implementation details (most of\nwhich are exposed to the user) in the process. I guess we should collect them\nand blog about them eventually.\nSamuele and Maciek improved the ctypes version of pysqlite that Gerhard\nH\u00e4ring started.\nArmin, Samuele and Maciek found an obscure bug in the interaction between the\nbuiltin-type-shortcut that Armin recently implemented and our multimethod\nimplementation. 
It's not clear which of the two is to blame, however it\nseems rather unclear how to fix the problem: Armin and Samuele are stuck in a\ndiscussion about how to approach a solution for a while and are hard to\ntalk to.\nStijn Timbermont, a Ph.D. student at the Vrije Universiteit Brussel who is\nvisiting the sprint for two days was first looking at how our GCs are\nimplemented to figure out whether he can use PyPy for some experiments. The\nanswer to that seems to be no. Today he was hacking on a Pico interpreter\n(without knowing too much about Python) and is making some nice progress, it\nseems.\n\n\nWill try to blog more as the sprint progresses.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/05/berlin-sprint-day-1-2-8761821946764492267.html" + }, + { + "title": "General performance improvements", + "text": "Hi all,\n\nDuring the past two weeks we invested some more efforts on the\nbaseline performance of pypy-c. Some of the tweaks we did\nwere just new ideas, and others were based on actual\nprofiling. The net outcome is that we now expect PyPy to be\nin the worst case twice as slow as CPython on real\napplications. Here are some small-to-medium-size benchmark\nresults. The number is the execution time, normalized to 1.0\nfor CPython 2.4:\n\n1.90 on templess (a simple templating language)\n1.49 on gadfly (pure Python SQL database)\n1.49 on translate.py (pypy's own translation toolchain)\n1.44 on mako (another templating system)\n1.21 on pystone\n0.78 on richards\n\n\n(This is all without the JIT, as usual. 
The JIT is not ready yet.)\n\nYou can build yourself a pypy-c with this kind of speed with\nthe magic command line (gcrootfinder is only for a 32-bit\nLinux machine):\n\n pypy/translator/goal/translate.py --gc=hybrid --gcrootfinder=asmgcc targetpypystandalone --allworkingmodules --faassen\n\nThe main improvements come from:\n \nA general shortcut for any operation between built-in objects:\nfor example, a subtraction of two integers or floats now dispatches\ndirectly to the integer or float subtraction code, without looking up\nthe '__sub__' in the class.\nA shortcut for getting attributes out of instances of user classes\nwhen the '__getattribute__' special method is not overridden.\nThe so-called Hybrid Garbage Collector is now a\nthree-generations collector.\n\nMore about our GCs...\nSome profiling showed bad performance in our implementation of\nthe built-in id() -- a trivial function to write in CPython, but a lot\nmore fun when you have a moving GC and your object's real address can\nchange.\nThe bytecode compiler's parser had a very slow linear search\nalgorithm that we replaced with a dictionary lookup.\n\n\nThese benchmarks are doing CPU-intensive operations. You can expect\na similar blog post soon about the I/O performance, as the\nio-improvements branch gets closer to being merged\n:-) The branch could also improve the speed of\nstring operations, as used e.g. by the templating systems.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/05/general-performance-improvements-838741900863354293.html" + }, + { + "title": "Next Sprint: Berlin, May 17-22nd May", + "text": "Our next PyPy sprint will take place in the crashed c-base space station, Berlin, Germany, Earth, Solar System. This is a fully public sprint: newcomers (from all planets) are welcome. 
Suggestion of topics (other topics are welcome too):\n\n\nwork on PyPy's JIT generator: we are refactoring parts of the\n compiling logic, in ways that may also allow generating better\n machine code for loops (people or aliens with knowledge on\n compilers and SSA, welcome)\n\nwork on the SPy VM, PyPy's Squeak implementation, particularly the\n graphics capabilities \n\nwork on PyPy's GameBoy emulator, which also needs graphics support\n \ntrying some large pure-Python applications or libraries on PyPy and\n fixing the resulting bugs. Possibilities are Zope 3, Django and\n others.\n\n\nFor more information, see the full announcement.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/05/next-sprint-berlin-may-17-22nd-may-5362899847460267375.html" + }, + { + "title": "Google's Summer of Code", + "text": "PyPy got one proposal accepted for Google's Summer of Code under the Python\nSoftware Foundation's umbrella. We welcome Bruno Gola into the PyPy\ncommunity. He will work on supporting all Python 2.5 features in PyPy and will\nalso update PyPy's standard library to support the modules that were modified\nor new in Python 2.5.\nRight now PyPy supports only Python 2.4 fully (some Python 2.5 features have\nalready sneaked in, though).", + "tags": "", + "url": "https://www.pypy.org/posts/2008/04/googles-summer-of-code-4911168632727441622.html" + }, + { + "title": "Float operations for JIT", + "text": "Recently, we taught the JIT x86 backend how to produce code for the x87 floating point coprocessor. This means that JIT is able to nicely speed up float operations (this is not true for our Python interpreter yet - we did not integrate it yet). 
This is the first time we started going beyond what is feasible in psyco - it would take a lot of effort to make floats working on top of psyco, way more than it will take on PyPy.\n\nThis work is in very early stage and lives on a jit-hotpath branch, which includes all our recent experiments on JIT compiler generation, including tracing JIT experiments and huge JIT refactoring.\n\nBecause we don't encode the Python's semantics in our JIT (which is really a JIT generator), it is expected that our Python interpreter with a JIT will become fast \"suddenly\", when our JIT generator is good enough. If this point is reached, we would also get fast interpreters for Smalltalk or JavaScript with relatively low effort.\n\nStay tuned.\n\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2008/04/float-operations-for-jit-6499693696246367083.html" + }, + { + "title": "Wrapping pyrepl in the readline API", + "text": "If you translate a pypy-c with --allworkingmodules and start it, you will probably not notice anything strange about its prompt - except when typing multiline statements. You can move the cursor up and continue editing previous lines. And the history is multiline-statements-aware as well. Great experience! Ah, and completion using tab is nice too.\n\nTruth be told, there is nothing new here: it was all done by Michael Hudson's pyrepl many years ago. We had already included pyrepl in PyPy some time ago. What is new is a pure Python readline.py which exposes the most important parts of the API of the standard readline module by wrapping pyrepl under the hood, without needing the GNU readline library at all. 
The PyPy prompt is based on this, benefitting automagically from pyrepl's multiline editing capabilities, with minor tweaks so that the prompt looks much more like CPython's than a regular pyrepl prompt does.\n\nYou can also try and use this multiline prompt with CPython: check out pyrepl at https://codespeak.net/svn/pyrepl/trunk/pyrepl and run the new pythoni1 script.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/04/wrapping-pyrepl-in-readline-api-362730784820949868.html" + }, + { + "title": "Other April's Fools Ideas", + "text": "While discussing what to post as an April Fool's joke yesterday, we had a\ncouple of other ideas, listed below. Most of them were rejected because they are\ntoo incredible, others because they are too close to our wish list.\n\nquantum computer backend\nPerl6 interpreter in RPython\nRuby backend to allow run \"python on rails\"\nmandatory static typing at app-level, because it's the only way to increase\nperformances\nrewrite PyPy in Haskell, because we discovered that dynamic typing is just\nnot suitable for a project of this size\na C front-end, so that we can interpret the C source of Python C extensions\nand JIT it. 
This would work by writing an interpreter for LLVM bytecode in\nRPython.\nan elisp backend\na TeX backend (use PyPy for your advanced typesetting needs)\nan SQL JIT backend, pushing remote procedures into the DB engine", + "tags": "", + "url": "https://www.pypy.org/posts/2008/04/other-aprils-fools-ideas-955926452383759016.html" + }, + { + "title": "Trying to get PyPy to run on Python 3.0", + "text": "As you surely know, Python 3.0 is coming; recently, they released\nPython 3.0 alpha 3, and the final version is expected around\nSeptember.\nAs suggested by the migration guide (in the PEP 3000), we started by applying\n2to3 to our standard interpreter, which is written in RPython (though\nwe should call it RPython 2.4 now, as opposed to RPython 3.0 -- see\nbelow).\nConverting was not seamless, but most of the resulting bugs were due to the\nnew dict views, str/unicode changes and the missing \"reduce\" built-in.\nAfter forking and refactoring both our interpreter and the 2to3 script,\nthe Python interpreter runs on Python 3.0 alpha 3!\nNext step was to run 2to3 over the whole translation toolchain,\ni.e. the part of PyPy which takes care of analyzing the interpreter in\norder to produce efficient executables; after the good results we got\nwith the standard interpreter, we were confident that it would have\nbeen relatively easy to run 2to3 over it: unfortunately, it was not\n:-(.\nAfter letting 2to3 run for days and days uninterrupted, we decided to\nkill it: we assume that the toolchain is simply too complex to be\nconverted in a reasonable amount of time.\nSo, we needed to think something else; THE great idea we had was to\nturn everything upside-down: if we can't port PyPy to Py3k, we can\nalways port Py3k to PyPy!\nUnder the hood, the 2to3 conversion tool operates as a graph\ntransformer: it takes the graph of your program (in the form of Python\n2.x source file) and returns a transformed graph of the same program\n(in the form of Python 3.0 source file). 
Since the entire translation\ntoolchain of PyPy is based on graph transformations, we could reuse it\nto modify the behaviour of the 2to3 tool. We wrote a general\ngraph-inverter algorithm which, as the name suggests, takes a graph\ntransformation and build the inverse transformation; then, we applied\nthe graph inverter to 2to3, getting something that we called 3to2: it\nis important to underline that 3to2 was built by automatically\nanalysing 2to3 and reversing its operation with only the help of a few\nmanual hints. For this reason and because we are not keeping generated\nfiles under version control, we do not need to maintain this new tool in\nthe Subversion repository.\nOnce we built 3to2, it was relatively easy to pipe its result to our\ninterpreter, getting something that can run Python 3.0 programs.\nPerformance-wise, this approach has the problem of being slower at\nimport time, because it needs to run (automatically) 3to2 every time\nthe source is modified; in the future, we plan to apply our JIT\ntechniques also to this part of the interpreter, trying to mitigate the\nslowdown until it is not noticeable anymore to the final user.\nIn the next weeks, we will work on the transformation (and probably publish\nthe technique as a research paper, with a title like \"Automatic Program\nReversion on Intermediate Languages\").\nUPDATE: In case anybody didn't guess or didn't spot the acronym: The above\nwas an April Fool's joke. Nearly nothing of it is true.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/04/trying-to-get-pypy-to-run-on-python-30-5082015544752137606.html" + }, + { + "title": "Py-Lib 0.9.1 released", + "text": "The Py-Lib 0.9.1 release is out! 
The Py-Lib is a very important support\nlibrary that PyPy uses for a lot of things \u2013 most importantly it contains\npy.test, which PyPy uses for testing.\nThis is mostly a bugfix release, with a couple of new features sneaked in.\nMost important changes:\n\nsome new functionality (authentication, export, locking) in py.path's\nSubversion APIs\nnumerous small fixes in py.test's rsession (experimental pluggable session)\nand generative test features\nsome fixes in the py.test core\n\nDownload/Install: https://codespeak.net/py/0.9.1/download.html\nDocumentation/API: https://codespeak.net/py/0.9.1/index.html\nUPDATE: the py-lib is now easy-installable with:\n\neasy_install py", + "tags": "release", + "url": "https://www.pypy.org/posts/2008/03/py-lib-091-released-1654797401128918376.html" + }, + { + "title": "PyPy Summer of Code Participation", + "text": "As in the last years, PyPy will again participate in Google's Summer of Code\nprogram under the umbrella of the Python Software Foundation. Unfortunately we\nwere a bit disorganized this year, so that our project ideas are only put up\nnow. The list of project ideas of PyPy can be found here.\nAny interested student should mail to our mailing list or just come to the\n#pypy channel on irc.freenode.net to discuss things.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/03/pypy-summer-of-code-participation-3403842530060519982.html" + }, + { + "title": "ctypes configuration tool", + "text": "As a part of implementing ctypes, we decided to make coding using ctypes better on its own (irrelevant what python interpreter you use). The concrete problem we're trying to solve is to make ctypes code more platform-independent than it is. Say you want to create a ctypes type for size_t: ctypes itself provides no mechanism for doing that, so you need to use a concrete integer type (c_int, c_long, c_short etc.). 
Your code either becomes platform dependent if you pick one of them or is littered with conditionals for all sorts of platforms. We created a small library, called ctypes_configure (which is actually a variation of something we use somewhere in the PyPy source tree), which tries to solve some platform dependencies by compiling and running small chunks of C code through a C compiler. It's sort of like configure in the Linux world, except for Python using ctypes.\n\nTo install the library, you can just type easy_install ctypes_configure. The code is in an svn repository on codespeak and there is even some documentation and sample code. Also, even though the code lives in the pypy repository, it depends only on pylib, not on the whole of pypy.\n\nThe library is in its early infancy (but we think it is already rather useful). In the future we could add extra features, it might be possible to check whether the argtypes that are attached to the external functions are consistent with what is in the C headers), so that the following code wouldn't segfault but give a nice error\n\nlibc = ctypes.CDLL(\"libc.so\")\ntime = libc.time\ntime.argtypes = [ctypes.c_double, ctypes.c_double]\ntime(0.0, 0.0)\n\n\nAlso, we plan to add a way to install a package that uses ctypes_configure in such a way that the installed library doesn't need to call the C compiler any more later.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/03/ctypes-configuration-tool-7414864595600362988.html" + }, + { + "title": "Bittorrent on PyPy", + "text": "Hi all,\n\nBittorrent now runs on PyPy! I tried the no-GUI BitTornado version (btdownloadheadless.py). It behaves correctly and I fixed the last few obvious places which made noticeable pauses. (However we know that there are I/O performance issues left: we make too many internal copies of the data, e.g. 
in a file.read() or os.read().)\n\nWe are interested in people trying out other real-world applications that, like the GUI-less Bittorrent, don't have many external dependencies to C extension modules. Please report all the issues to us!\n\nThe current magic command line for creating a pypy-c executable with as many of CPython's modules as possible is:\n\n\n cd pypy/translator/goal\n ./translate.py --thread targetpypystandalone.py --allworkingmodules --withmod-_rawffi --faassen\n\n\n(This gives you a thread-aware pypy-c, which requires the Boehm gc library. The _rawffi module gives you ctypes support but is only tested for Linux at the moment.)", + "tags": "", + "url": "https://www.pypy.org/posts/2008/03/bittorrent-on-pypy-7984272143557948160.html" + }, + { + "title": "As fast as CPython (for carefully taken benchmarks)", + "text": "Good news everyone. A tuned PyPy compiled to C is nowadays as fast as CPython on the richards benchmark and slightly faster on the gcbench benchmark.\n\nIMPORTANT: These are very carefully taken benchmarks where we expect pypy to be fast! PyPy is still quite slower than CPython on other benchmarks and on real-world applications (but we're working on it). The point of this post is just that for the first time (not counting JIT experiments) we are faster than CPython on *one* example :-)\n\nThe exact times as measured on my notebook (which is a Core Duo machine) are here:\n\nCompiled pypy with options:\n\n\n./translate.py --gcrootfinder=asmgcc --gc=generation targetpypystandalone.py --allworkingmodules --withmod-_rawffi --faassen\n\n(allworkingmodules and withmod-_rawffi are very likely irrelevant to those benchmarks)\n\nCPython version 2.5.1, release.\n\n\nrichards 800ms pypy-c vs 809ms cpython (1% difference)\ngcbench 53700ms pypy-c vs 60215ms cpython (11% difference)\n\nPyPy shines on gcbench, which is mostly just about allocating and freeing many objects. 
Our gc is simply better than refcounting, even though we've got shortcomings in other places.\n\n\nAbout richards, there is a catch. We use a method cache optimization, and have an optimization which helps to avoid creating bound methods each time a method is called. This speeds up the benchmark for about 20%. Although method cache was even implemented for CPython, it didn't make its way to the core because some C modules directly modify the dictionary of new-style classes. In PyPy, the greater level of abstraction means that this operation is just illegal.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/03/as-fast-as-cpython-for-carefully-taken-1984440931984637179.html" + }, + { + "title": "Running Pyglet on PyPy", + "text": "As part of our efforts of making PyPy's Python interpreter usable we put quite some effort into interfacing with external libraries. We were able, in quite a short amount of time (I think beginning really from Leysin sprint, or slightly earlier) to provide a prototype of the ctypes library. It is written in completely normal Python, at applevel, based on a very thin wrapper around the libffi library. This makes development a lot easier, but it makes the resulting ctypes implementation rather slow. The implementation is not complete yet and it will still need quite some effort to make it feature-complete (ctypes has lots of details and special cases and\ndo-what-I-mean magic). Yet another point will be to make it faster, but that's for much later.\n\nThe implementation is good enough to run those parts of Pyglet that don't depend on PIL (which PyPy doesn't have). 
Here are a few pictures of running Pyglet demos on top of compiled pypy-c.\n\n\n\nTo compile a version of PyPy that supports ctypes, use this highly sophisticated command line\n\n\n./translate.py --gc=generation ./targetpypystandalone.py --allworkingmodules --withmod-_rawffi\n\nNote: this works on linux only right now.\n\nThe list of missing small ctypes features is quite extensive, but I consider the current implementation to be usable for most common cases. I would love to hear about libraries written in pure python (using ctypes), to run them on top of PyPy and use them as test cases. If someone knows such library, please provide a link.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/02/running-pyglet-on-pypy-3191536711417589549.html" + }, + { + "title": "Python Finalizers Semantics, Part 2: Resurrection", + "text": "Continuing the last blog post about GC semantics in Python.\nAnother consequence of reference counting is that resurrection is easy to\ndetect. A dead object can resurrect itself if its finalizer stores it into a\nglobally reachable position, like this:\n\nclass C(object):\n def __init__(self, num):\n self.num = num\n def __del__(self):\n global c\n if c is None:\n c = self\nc = C(1)\nwhile c is not None:\n c = None\n print \"again\"\n\nThis is an infinite loop in CPython: Every time c is set to None in the\nloop, the __del__ method resets it to the C instance again (note that\nthis is terribly bad programming style, of course. In case anybody was wondering\n:-)). CPython can detect resurrection by checking whether the reference count\nafter the call to __del__ has gotten bigger.\nThere exist even worse examples of perpetual resurrection in particular in\ncombination with the cycle GC. If you want to see a particularly horrible one,\nsee this discussion started by Armin Rigo. 
In the ensuing thread Tim Peters\nproposes to follow Java's example and call the finalizer of every object at most\nonce.\nIn PyPy the resurrection problem is slightly more complex, since we have GCs\nthat run collection from time to time and don't really get to know at which\nprecise time an object dies. If the GC discovers during a collection that an\nobject is dead, it will call the finalizer after the collection is finished. If\nthe object is then dead at the next collection, the GC does not know whether\nthe object was resurrected by the finalizer and then died in the meantime or\nwhether it was not resurrected. Therefore it seemed sanest to follow Tim's\nsolution and to never call the finalizer of an object a second time, which has\nmany other benefits as well.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/02/python-finalizers-semantics-part-2-2748812428675325525.html" + }, + { + "title": "Python Finalizers Semantics, Part 1", + "text": "Python's garbage collection semantics is very much historically grown and\nimplementation-driven. Samuele Pedroni therefore likes to call it the \"'there\nis no such thing as too much chocolate'-approach to GC semantics\" :-). In this\ntwo-part post series I am going to talk about the semantics of finalization\n(__del__ methods) in CPython and PyPy.\nThe current behaviour is mostly all a consequence of the fact that CPython uses\nreference counting for garbage collection. The first consequence is that if\nseveral objects die at the same time, their finalizers are called in a\nso-called topological order, which is a feature that some GCs have that\nCPython offers by chance. This ensures, that in a __del__ method, all the\nattributes of the object didn't get their __del__ called yet. 
A simple\nexample:\n\nclass B(object):\n def __init__(self, logfile):\n self.logfile = logfile\n def __del__(self):\n self.logfile.write(\"done doing stuff\")\nb = B(file(\"logfile.txt\", \"w\"))\n\nIf the instance of B dies now, both it and the logfile are dead. They will\nget their __del__``s called and it's important that the file's ``__del__\ngets called second, because otherwise the __del__ of B would try to\nwrite to a closed file.\nThe correct ordering happens completely automatically if you use reference\ncounting: Setting b to None will decref the old value of b. This reduces\nthe reference count of this instance to 0, so the finalizer will be called.\nAfter the __del__ has finished, this object will be freed and all the\nobjects it points to decrefed as well, which decreases the reference count of\nthe file to 0 and call its `` __del__`` as well, which closes the file.\nThe behaviour of PyPy's semispace and generational GCs wasn't very nice so far:\nit just called the finalizers in an essentially random order. Last week Armin\ncame up with a somewhat complicated algorithm that solves this by emulating\nCPython's finalization order, which we subsequently implemented. So PyPy does\nwhat you expect now! The Boehm GC does a topological ordering by default, so it\nwasn't a problem there.\nA small twist on the above is when\nthere is a cycle of objects involving finalizers:\nIn this case a topological ordering is not possible, so that CPython refuses to\nguess the finalization order and puts such cycles into gc.garbage. This\nwould be very hard for PyPy to do, since our GC implementation is essentially\nindependent from the Python interpreter. The same GCs work for our other\ninterpreters after all too. Therefore we decided to break such a cycle at an\narbitrary place, which doesn't sound too insane. The insane thing is for\na Python program to create a cycle of objects with finalizers and depend\non the order in which the finalizers are called. 
Don't do that :-) (After\nall, CPython wouldn't even call the finalizers in this case.)", + "tags": "", + "url": "https://www.pypy.org/posts/2008/02/python-finalizers-semantics-part-1-1196956834543115766.html" + }, + { + "title": "PyPy presence on various conferences in the near future", + "text": "Hello! I will have the pleasure of presenting PyPy on various conferences in the near future. They're (in chronological order):\n\n\nStudencki Festiwal Informatyczny in Krakow, POLAND 6-8 March 2008. I think this might be only interesting for polish people (website, in polish)\n\nPycon Chicago, IL, USA. 14-17 March 2008. There should be also a PyPy sprint afterwards, including newbie-friendly tutorial, everybody is welcome to join us! (Provided that I'll get the US visa, which seems to be non-trivial issue for a polish citizen)\n RuPy, Poznan, POLAND 13-14 April 2008 (website). This is small, but very friendly Ruby and Python conference. Last year was amazing, I can strongly recommend to go there (Poznan is only 2h by train from Berlin also has its own airport).\n\n\nHope to see you at those places!\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2008/02/pypy-presence-on-various-conferences-in-6584680808789191759.html" + }, + { + "title": "Buildbots and Better Platform Support", + "text": "In the last days we improved platform-support of PyPy's Python interpreter.\nJean-Paul Calderone has been tirelessly working for some time now on setting up a\nbuildbot for translating and testing PyPy. So far the basic mechanisms are\nworking and the buildbot is running on various machines, including some that\nMichael Schneider (bigdog) lets us use, one of them being a Windows machine,\nthe other one with a 64bit Linux (lots of thanks to those two, you are\nawesome!).\nWhat is still missing is a nice way to visualize the test results to quickly see\nwhich tests have started failing on which platforms. 
There is a prototype\nalready, which still needs some tweaking.\nThe availability of these machines has triggered some much-needed bug-fixing in\nPyPy to make our Python interpreter work better on Windows and on 64 bit Linux.\nMaciek and Michael Schneider worked on this quite a bit last week, with the\nresult that PyPy supports many more extension modules now on Windows and 64 bit\nLinux. Since we now have the buildbot the hope is that the support also won't\ndisappear soon :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2008/01/buildbots-and-better-platform-support-6965497451398110731.html" + }, + { + "title": "PyPy Keyboard Heatmap", + "text": "Today I saw the keyboard heatmap generator on the Blended Technologies\nblog. I threw all the PyPy code at it to see whether the heatmap looks any\ndifferent than normal Python code. It doesn't:\n\nSo now the excuse \"I can't contribute to PyPy because it needs all those special\nPyPy-keys\" isn't working anymore :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2008/01/pypy-keyboard-heatmap-4950995633665492453.html" + }, + { + "title": "RPython can be faster than C", + "text": "(yes, C as in language, not c as in speed of light). I looked recently at the great computer language shootout, for some benchmarks and to make some speed comparisons. I use this benchmark, modified it to be rpythonic-enough and compared speeds. The code is here (the only change from the Python version was to create a class instead of tuple, so actually this version is more OO). 
Also the benchmark is very likely flawed because it favours better GCs :).\nSo, here we go:\n\n\nLanguage:Time of run (for N=14):\nPython version running on Python 2.5.1, distribution25.5s\nPython version running on PyPy with generational GC45.5\nPython with psyco20s\nRPython translated to C using PyPy's generational GC0.42s\ncompiling the Haskell version with GHC 6.6.11.6s\ncompiling the C version with gcc 4.1.2 -O3 -fomit-frame-pointer0.6s\n\n\n\nAlso worth noticing is that when using psyco with the original version (with tuples) it is very fast (2s).\n\nSo, PyPy's Python interpreter is 80% slower than CPython on this (not too horrible), but RPython is 40% faster than gcc here. Cool. The result is mostly due to our GC, which also proves that manual memory-management can be slower than garbage collection in some situations. Please note that this result does not mean that RPython is meant for you. It requires a completely different mindset than the one used to program in Python. Don't say you weren't warned! :-)", + "tags": "", + "url": "https://www.pypy.org/posts/2008/01/rpython-can-be-faster-than-c-2559071147541131237.html" + }, + { + "title": "PyPy.NET goes Windows Forms", + "text": "After having spent the last few days on understanding PyPy's JIT,\ntoday I went back hacking the clr module. 
As a result, it is now\npossible to import and use external assemblies from pypy-cli,\nincluding Windows Forms.\nHere is a screenshot of the result you get by typing the following at\nthe pypy-cli interactive prompt:\n\n>>>> import clr\n>>>> clr.AddReferenceByPartialName(\"System.Windows.Forms\")\n>>>> clr.AddReferenceByPartialName(\"System.Drawing\")\n>>>> from System.Windows.Forms import Application, Form, Label\n>>>> from System.Drawing import Point\n>>>>\n>>>> frm = Form()\n>>>> frm.Text = \"The first pypy-cli Windows Forms app ever\"\n>>>> lbl = Label()\n>>>> lbl.Text = \"Hello World!\"\n>>>> lbl.AutoSize = True\n>>>> lbl.Location = Point(100, 100)\n>>>> frm.Controls.Add(lbl)\n>>>> Application.Run(frm)\n\nUnfortunately at the moment you can't do much more than this, because\nwe still miss support for delegates and so it's not possible to\nhandle events. Still, it's a step in the right direction :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2008/01/pypynet-goes-windows-forms-7031406830502864570.html" + }, + { + "title": "Improve .NET Integration", + "text": "A while ago Amit Regmi, a student from Canada, started working on the\nclr module improvements branch as a university project.\nDuring the sprint Carl Friedrich, Paul and I worked more on it and\nbrought it to a mergeable state.\nIt adds a lot of new features to the clr module, which is the\nmodule that allows integration between pypy-cli (aka PyPy.NET) and\nthe surrounding .NET environment:\n\n\nfull support for generic classes;\na new importer hook, allowing things like from System import\nMath and so on;\n.NET classes that implement IEnumerator are treated\nas Python iterators; e.g. 
it is possible to iterate over them\nwith a for loop.\n\n\nThis is an example of a pypy-cli session:\n\n>>>> from System import Math\n>>>> Math.Abs(-42)\n42\n>>>> from System.Collections.Generic import List\n>>>> mylist = List[int]()\n>>>> mylist.Add(42)\n>>>> mylist.Add(43)\n>>>> mylist.Add(\"foo\")\nTraceback (most recent call last):\n File \"\", line 1, in \nTypeError: No overloads for Add could match\n>>>> mylist[0]\n42\n>>>> for item in mylist: print item\n42\n43\n\nThis is still to be considered an alpha version; there are a few known\nbugs and probably a lot of unknown ones :-), so don't expect it to\nwork on every occasion. Still, it's a considerable step towards the real\nworld :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2008/01/improve-net-integration-2239651503641931440.html" + }, + { + "title": "Crashing Other People's Compilers", + "text": "Over the years PyPy has (ab?)used various external software for different\npurposes, and we've discovered bugs in nearly all of them, mostly by pushing them\nto their limits. For example, many compilers are not happy with 200MB of\nsource in one file. The Microsoft C compiler has a limit of 65536 lines of code\nper file and the CLI was raising \"System.InvalidProgramException: Method\npypy.runtime.Constants:.cctor () is too complex.\", where too complex probably\nmeans \"too long\". 
Just for fun, today we collected all projects we could think of\nin which we found bugs:\n\n\nCPython (lots)\nPyPy and the py-lib (surprise)\nctypes\nTCC (we gave up on it)\nBoehm\nGraphviz\nMono\nLLVM (lots)\nPython.net\nthe Microsoft IL assembler\nMicrosoft's C compiler\nJasmin\nJPype\nnucular\nTwisted\nthe JVM, maybe\npygame or SDL\n\n\nSo one could say that PyPy is really just the most expensive debugging tool\never :-).", + "tags": "", + "url": "https://www.pypy.org/posts/2008/01/crashing-other-peoples-compilers-4574453763254909150.html" + }, + { + "title": "Leysin Winter Sport Sprint Started", + "text": "The Leysin sprint started yesterday morning in the usual location. The view is spectacular (see photo) and the weather mostly sunny. The following people are sprinting:\nMaciej Fijalkowski, Armin Rigo, Toby Watson, Paul deGrandis, Antonio Cuni, Carl Friedrich Bolz. So it is a rather small sprint. We started working on various features and performance improvements for the high level backends (JVM and .NET) and on implementing ctypes for PyPy. Later this week we plan to spend a few days on the JIT, because Anto and I both need to get into it for our respective university projects.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/01/leysin-winter-sport-sprint-started-5478612778498579467.html" + }, + { + "title": "Finding GC roots: using LLVM or parsing assembler files from GCC", + "text": "PyPy contains a framework for writing custom Garbage Collectors, and a few simple GCs have been written in this framework. A common issue with all these GCs is how to find all the stack roots, i.e. all the pointers to live GC-managed objects currently stored in local variables, in all the callers of the current function. The current solution is to maintain a custom shadow stack of roots, where all functions push and pop copies of their local variables of type \"GC pointer\". Clearly this is an overhead. Can we remove it?\n\nLLVM has recently grown some support for this. 
By emitting markers in the LLVM source and with the help of a bit of custom C++ code, we can generate stack maps for the functions compiled by LLVM. Then, with 100% non-portable code in our framework GC's root finding algorithm, we can walk the machine stack and locate where in each stack frame LLVM stores the GC pointers. (Yes, I mean non-portable: LLVM offers no help for doing that. Maybe it will at some point, though I didn't manage to explain why this is an issue to people working on this in LLVM so far...). I've tried that approach in the llvmgcroot branch. Over the manually-managed shadow stack, this gives speed improvements which are, very roughly, on the order of 5%.\n\nNote that this prevents some optimizations in LLVM, because it forces it to allocate all local variables of type \"GC pointer\" in the stack; it cannot keep them in registers and it must assume that they can be changed more or less at any time (as moving GCs do). Can we do better?\n\nActually, yes. We can even do better in the C backend, using a GCC hack. GCC has this nice extension:\nasm(\"bla\", constrains);\nThis is meant to generate assembler instructions directly from C. Internally, GCC considers the whole asm() as a single regular instruction of its intermediate language; the constrains are expressed in the same way as the constrains for all the prebuilt intermediate language instructions. They express things like input and output operands of the instruction, whether they can live in memory or in registers, whether the whole instruction has side-effects, etc. The nice thing about asm() is that it doesn't kill any optimization whatsoever in GCC - it's your job to make sure that you use the correct constrains.\n\nSo what I've tried in the asmgcroot branch is to use asm() as markers. 
In this branch, the C backend produces code like this after each function call, for each local variable containing a live GC pointer:\n\nasm(\"/* GCROOT %0 */\" : \"=g\"(localvar) : \"0\"(localvar) : \"memory\");\n\nThis causes GCC to emit the following line in the assembler file it generates:\n\n/* GCROOT register-or-memory-containing-localvar */\n\nI won't go in the details of the asm() line above - the constrains are just enough to make sure that GCC doesn't optimize too much, but don't prevent most optimizations from occurring. For example, the localvar can be in a register.\n\nThe assembler will just ignore the line above; it is a comment. But what we can do is write our own tool parsing the assembler files. This tool locates the /* GCROOT */ comments and follows where the register or memory location in the comment comes from (to do this it must follow the control flow and data flow of the function). This allows it to build a stack map: for each call instruction it knows exactly which registers and frame stack locations contain a live GC pointer. The stack map is then emitted in an extra assembler file that we link with the rest. As with LLVM above, the stack map is then used at run-time by non-portable code written in our GC's stack root tracker.\n\nYes, that's rather insane. But at least, we don't need to modify the assembler file - just read it. If GCC is too clever in its optimizations, the custom parser will get lost and complain cleanly; but I think that it is relatively safe in the sense that GCC optimizations should not be able to make the custom parser produce wrong results.\n\nThe branch is not merged because it's probably too insane to merge (not to mention, it's probably not portable to non-GCC compilers, and it is completely platform-specific). 
Still, it gives good results, better than the pure LLVM approach - on the order of 10% to 25% speed-ups for pypy-c.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/01/finding-gc-roots-using-llvm-or-parsing-1980376164990001937.html" + }, + { + "title": "Visualizing a Python tokenizer", + "text": "Armin and I have been working on PyPy's parser and bytecode compiler for the Python language over the last few days. Armin implemented several bytecode optimizations that CPython has had for a while whereas I tried to refactor our tokenizer and parser (because our existing parser is rather slow and also not very nice code). Armin is mostly done whereas the new parser is not very far yet. What is done, however, is the Python tokenizer. It is implemented in the usual way, by using a set of regular expressions to generate a deterministic finite automaton (DFA). This automaton is then turned into a big function which does the actual tokenization. Of course the picture is not quite as simple for Python, because it is not possible to tokenize Python using only regular expressions. To generate the proper \"indent\" and \"dedent\" tokens it would be necessary to keep state (the previous indentation levels) which a DFA cannot do. This is solved by postprocessing the tokens that the tokenizer produces to turn whitespace tokens into the proper indent and dedent tokens.\nFor debugging purposes I implemented a visualization tool for DFAs using PyPy's pygame-based graph viewer. The graph viewer is able to visualize interactively any graph given in the graph-description language of Graphviz. Looking at the tokenizing DFA for Python is rather instructive, both for understanding how tokenizing works and (maybe) for understanding the Python language. 
To try it, download the dot file of the DFA and run from a pypy checkout:\n$ python pypy/bin/dotviewer.py tokenizer.dot\nThe following is a screenshot of the graphviewer:\n\nFor people who don't want to check out PyPy I generated a (rather big) png for the DFA.\nNext thing I would like to do (apart from actually finishing the parser, of course :-) ) is visualize the Python grammar itself using syntax diagrams or something similar. So far I couldn't really find a program to do that, though.", + "tags": "", + "url": "https://www.pypy.org/posts/2008/01/visualizing-python-tokenizer-5020282079473796926.html" + }, + { + "title": "PyPy Winter Sports Sprint from 12-19th of January in Leysin, Switzerland", + "text": "The next PyPy sprint will be held in Leysin, Switzerland, for\nthe fifth time. The overall idea of the sprint is to continue\nworking on making PyPy ready for general use.\nThe proposed topics are: ctypes, JIT, testing, LLVM. This is\na fully public sprint, so newcomers and other topics are\nwelcome. And like previous winters, the main side goal is to\nhave fun in winter sports :-) See the sprint announcement\nfor details.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/12/pypy-winter-sports-sprint-from-12-19th-5592383212609773292.html" + }, + { + "title": "(German) Slides of Talk at Python User Group Munich Available", + "text": "Georg Brandl has put up the slides of the PyPy talk he gave at the Python User Group Munich. The slides are in German.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/12/german-slides-of-talk-at-python-user-3715884461725333051.html" + }, + { + "title": "Various Performance Improvements", + "text": "A few days ago, Armin discovered Gnuplot. He wrote a script that turns the results of the nightly benchmark runs into plots (lower is always better, all the numbers of the microbenchmarks are \"times slower than CPython\"). The corresponding microbenchmarks can be found in the repository. 
Staring at the plots revealed a strange performance regression around revision 45000. After some investigation Armin found that a mostly unrelated change had disabled our method cache, which caused the regression. This was fixed.\n\nIn addition, Armin did a few other small tweaks in the interpreter main loop, making sure that small bytecodes are inlined into the main loop. This gave another few percent of performance increase. Together with the GC improvements two weeks ago this leads to the fastest non-JIT PyPy ever. Unfortunately \"fastest\" is not really very fast yet in absolute terms, with realistic apps being around 3-4 times slower than CPython. Especially calls (in all their variants) are quite slow, which is something we should look into.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/12/various-performance-improvements-7027210611565246190.html" + }, + { + "title": "Faster implementation of classic classes merged", + "text": "Old-style classes have so far been a bit neglected by PyPy's Python interpreter. By default, PyPy makes all classes new-style and you have to use a command-line switch (--oldstyle) at startup or at translation time to change that default. Then you would get a pure-Python implementation of classic classes. This implementation was extremely slow (around 20 times slower than classic classes in CPython). In the past we had hoped that we could get away with mostly only supporting new-style classes, however it seems that real-world software relies on old-style classes quite a bit, so we decided to offer a better migration path.\n\nA while ago I therefore started a re-implementation of classic classes in RPython to speed them up. This work is now finished, the branch I worked on got merged today. Speed for the old-style class benchmarks was improved greatly and I found quite a number of bugs in the old implementation too. 
New-style classes are still a bit faster than old-style in PyPy though, and this is unlikely to change.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/12/faster-implementation-of-classic-1021557618590043616.html" + }, + { + "title": "Profiling for fun with valgrind", + "text": "Recently I've been doing a lot of profiling on the PyPy executables to find speed bottlenecks. Valgrind (the original page seems to be down) is an extremely nice tool for doing this. It has several built-in tools that give you different types of profiles. The callgrind mode provides you with a lot of information including relative call costs. The cachegrind tool gives you less information, but what it gives you (e.g. cache misses) is much more accurate. The obvious choice would be to have a way to combine the results of two profiling runs to have both. Over the last few days I wrote a script that does this. It's available at my user's svn and has a pretty intuitive command line interface. The combining calculations are not perfect yet, total costs of functions can still be a bit bogus (they can sum up to whatever) but at least the relative figures are good. This means that we can stop looking at two different types of graphs now.\n\nAn awesome tool for analyzing the profile data is kcachegrind.\n\n\n\nWhich also proves that my 12'' display is too small at least for some things :-).\n\n\nUpdate: pygrind is available under the MIT license.", + "tags": "kcachegrind,profiling,valgrind", + "url": "https://www.pypy.org/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.html" + }, + { + "title": "PyPy Talk at the Python User Group Munich", + "text": "Tomorrow evening there will be an introductory talk about PyPy at the Python User Group Munich. 
The talk will be given by CPython and PyPy contributor Georg Brandl and will be in German.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/12/pypy-talk-at-python-user-group-munich-1952379593354367249.html" + }, + { + "title": "PyPy tasks in GHOP", + "text": "In the latest bunch of tasks that Titus released on Friday for the Google Highly Open Participation Contest there are several that are related to PyPy. Some of them are about presenting PyPy to a technical audience: Task 187, Task 188, Task 189, Task 190.\n\nThen there are some three about Ropes, which are all rather challenging:\nSolving the first three section of the last ICFP contest with PyPy's ropes implementation:Task 248.Implementing nice wrapper classes around PyPy's ropes algorithms to make their use convenient: Task 239.Implementing the Ropes algorithms in C as a CPython extension module: Task 218 (already taken).\nIn addition there is a task to use PyPy's sandboxing features to provide an interactive Python tutorial on a web page: Task 220.\n\nWe're really looking forward to working together with some bright students!", + "tags": "", + "url": "https://www.pypy.org/posts/2007/12/pypy-tasks-in-ghop-5130253260153218709.html" + }, + { + "title": "faster than c", + "text": "Of course being \"faster than c\" means being faster than light. What did you think it means? :-)", + "tags": "", + "url": "https://www.pypy.org/posts/2007/12/faster-than-c-8057790636822502084.html" + }, + { + "title": "Good news from the garbage collection front", + "text": "It seems that we can do better! Armin fixed a bug in our generational garbage collector, which caused variable sized objects (e.g. arrays) to be allocated outside of the nursery. This resulted in 50% speedup on synthetic benchmarks and about 10-20% on real world ones. 
Doing some preliminary measures, it seems that we spend roughly 10% of the time in garbage collection, which is good (and there is still some room for improvements!)", + "tags": "", + "url": "https://www.pypy.org/posts/2007/12/good-news-from-garbage-collection-front-2678138026363485439.html" + }, + { + "title": "PyPy Google Tech Talk", + "text": "The Google Tech Talk that Samuele, Armin, Jacob and Laura gave during the US trip is now on YouTube: https://www.youtube.com/watch?v=GnPmErtqPXk", + "tags": "", + "url": "https://www.pypy.org/posts/2007/11/pypy-google-tech-talk-9082134238390123890.html" + }, + { + "title": "Sprint Pictures", + "text": "The obligatory sprint picture post...\n\n\n\n\nAlexander Schremmer, Armin Rigo, Maciek Fijalkowski, Antonio Cuni\n\nAnders Chrigstr\u00f6m, Samuele Pedroni, Laura Creighton, Jacob Hall\u00e9n, Carl Friedrich Bolz, Richard Emslie, Maciek Fijalkowski, Armin Rigo\n\nHolger Krekel\n\nWhiteboard with \"real world goals\" dependencies.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/11/sprint-pictures-3151912856495869652.html" + }, + { + "title": "Sprint Discussions: Wrapping External Libraries", + "text": "A more technical discussion during the sprint was about the next steps for the external module problem (minutes). One of PyPy's biggest problems in becoming more generally useful are C extension modules, which can't work with PyPy's Python interpreter. We already reimplemented many of the more commonly used extension modules in CPython's standard library in Python or RPython. However, there are more missing and there is no way to implement all the extension modules that other people have written.\nWhiteboard after the discussion.\n\nTherefore we need a different approach to this problem. Extension modules are commonly written for two different reasons, one being speed, the other being wrapping non-Python libraries. 
At the moment we want mostly to approach a solution for the latter problem, because we hope that the JIT will eventually make it possible to not have to write extension modules for speed reasons any more. There are two rough ideas to approach this problem in the near future (there are other, more long-term ideas that I am not describing now): One of them is to add the ctypes module to PyPy's Python interpreter, which would mean re-implementing it since the existing implementation is written in C. The other way would be to work on the existing way to get extensions in that PyPy provides, which are \"mixed modules\". Mixed modules are written in a combination of RPython and normal Python code. To then wrap C libraries you would use rffi, which is the foreign function interface of RPython.The discussion round: Maciek Fijalkowski, Armin Rigo, Richard Emslie, Alexander Schremmer.Both approaches have problems: With ctypes you have no built-in way to query C header files for structure layouts and constants which requires you to hard-wire them, which is highly platform dependant. Mixed modules are not really fun to write, since they need to be RPython and we currently don't have a way to do separate compilation, so you always need to translate PyPy's whole Python interpreter to see whether your module is correct. In the meeting it was decided to first go for a ctypes replacement. The replacement would be written in pure Python, we already have a very thin wrapper around libffi which the new ctypes implementation would use. The goal to reach would be to get the pygame implementation in ctypes to run on PyPy. 
To make ctypes more useful in general to write this kind of wrappers, we will probably extract some code that we have already written for PyPy's own usage: it gives a way to write \"imprecise\" declarations (\"a structure with at least fields called x and y which are of some kind of integer type\") and turn them into exact ctypes declarations, internally using the C compiler to inspect the platform headers. After this is done we should approach separate compilation so that developing modules in RPython has a quicker turnaround time. This is somewhat involved to implement for technical reasons. There are ideas how to implement it quickly to make it usable for prototyping, but it's still a lot of work.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/11/sprint-discussions-wrapping-external-8731011170537270161.html" + }, + { + "title": "Sprint Discussions: Releases, Testing", + "text": "During the sprint we had various discussions about technical issues as well as planning discussions about how we want to go about things. One of them was about the stability of PyPy, how to ensure stability, how to handle releases and approaches to being more \"usable\". I will describe this discussion in this post (there are also minutes of the meeting).\n\n\n\nThe Meetings whiteboard\n\nTesting\n First we discussed the current situation in terms of testing. PyPy has been extremely testing-oriented from the start, it is being developed almost exclusively in test-driven-development style. 
To deal with the large number of tests we already have some infrastructure in place: we run all of PyPy's tests nightly on a Linux machine we translate a PyPy Python interpreter every night and use that to run the CPython compliance tests against it, also on a Linux machine we translate several Python interpreters every night and run benchmarks against them on a PowerPC running Mac OS X As you can see, we are lacking in the Windows testing area, which is an even worse problem because none of the currently active developers has Windows as his primary OS. We should improve this by finding a Windows machine where the tests are run nightly and where we can log in to try bug-fixes quickly. The latter bit is important, we had a nightly windows test run before (thanks to Scott Dial) but it didn't help, because even if you tried to fix a bug you would have to wait until the next night to see whether it worked. Another very serious problem is that of aggregation: we have these various test runs that all have a web interface to check for errors but there is no easy way to find out which tests failed. You have to go to each page and even some sub-pages to see what needs fixing, which is a tedious process. The idea for solving this is aggregate all the available information into some sort of testing-entry-point page that gives a quick overview of the regressions that happened during the night. It's not clear whether we can achieve that with existing tools (buildbots or whatever), but we will investigate that.\n Releases\nThe discussion about releases was more on a fundamental and less on a concrete level (especially when it comes to time-frames). We discussed what it means to make a release, because obviously it is more than just taking an SVN revision and putting a tarball of it onto the webpage. During the EU period we were required to make several releases, but those were not really meant to be more than technology previews for the brave adventurers to try. 
In the future we have the goal to release things that are more stable and hopefully more practically useful. The plan is to use medium-sized Python applications that have a chance to run on top of PyPy because they don't use too many extension modules (web apps being likely candidates) and that have good unit-tests themselves. The first step would be to find some applications that fit this description, fix the bugs that prevents PyPy from running them and from then on run them nightly on one of the testing machines to check for regressions. This would allow us to be more confident when stating that \"PyPy works\". Another thing to keep in mind for releases is the special features that our Python interpreter provides (e.g. the thunk and the taint object space, our stackless features, transparent proxies, sandboxing, special object implementations). Those features are neither tested by the CPython tests nor by any existing applications. Therefore we cannot really be confident that these features work and don't have too many bugs (in fact, the first time somebody really use the become feature of the thunk space in earnest he found a serious bug that is not fixed so far). To get around this problem, we plan to write small-to-medium sized example applications for each of these features (for stackless we can maybe use one of the existing stackless examples). This will hopefully find bugs and will also make it possible to evaluate whether the features make sense from a language design point of view. A minor thing to make releases easier is to be able to not only have the tests be run once a night but also be able to trigger them manually on the release branch before doing the release.Publishing Cool Things\n Since we decided that the releases we make should be stable and usable, we also discussed how we would go about making new \"cool things\" like features, experiments etc. better known. The consensus was that this blog is probably the best forum for doing this. 
In addition we discussed having a stabler snapshot of the trunk made to ensure that people wanting to play around with these features don't accidentally get\na broken version.Helping Out\nRight now we are still in cleanup mode (the cleanup sprint is nearly done, but we haven't finished all the cleanups yet), so we won't be able to start on the above things right now. However, they will have a strong focus soon. So if you are interested in trying out to run programs on top of PyPy or writing new ones that use the new features you are most welcome to do so and we will try to fix the bugs or help you doing it (of course some tolerance against frustration is needed when you do that, because the bugs that turn up tend to be obscure). We have not been perfect at this in the past, but this will have to change.", + "tags": "release", + "url": "https://www.pypy.org/posts/2007/11/sprint-discussions-releases-testing-1126468258904483211.html" + }, + { + "title": "Ropes branch merged", + "text": "This afternoon we merged the ropes branch that I have been working on on the side for a while (also to cut down the number of currently active branches a bit, since we are doing major cleanups right now). It contained a new (optional) implementation of the unicode type using the rope data structure. Ropes essentially use concatenation trees to represent strings. The leaves of the trees contain either byte arrays or arrays of unicode characters.\n\n\nOf course the fact that ropes are used is mostly completely transparent to the user (as usual in the pypy world :) ). Normal and unicode strings are implemented with them, but just from the behavior of these types the user has a hard time noticing. Of course there are significant changes in performance (in both directions).\n\nUsing ropes to implement strings has some interesting effects. The most obvious one is that string concatenation, slicing and repetition is really fast (I suspect that it is amortized O(1), but haven't proved it). 
This is probably not helping most existing Python programs because people tend to code in such a way that these operations are not done too often. However, with ropes it is possible to do something like this:\nPython 2.4.1 (pypy 1.0.0 build 48942) on linux2\nType \"help\", \"copyright\", \"credits\" or \"license\" for more information.\n>>>> import sys\n>>>> a = \"a\" * sys.maxint\n>>>> hash(a)\n-768146060\n\n\nSo somebody who is targeting a Python implementation that has ropes could write his code in such a way that this is taken into account. Another interesting feature is that ropes try to share as much data as possible with each other, so if you create a large slice of a large string, the slice is not going to take much additional memory.\n\nOne of the most interesting use-cases of ropes are together with unicode. The leaf nodes of a rope unicode string can be either a byte array or an array of unicode characters. This means that a unicode string that uses only characters that are latin-1 or ascii will use one byte of memory per character. If a unicode string contains mostly only unicode characters that are latin-1 and a few that are not, it will still use 1 byte for most of the latin-1 characters. This property also allows really fast encoding and decoding of unicode strings as long as they don't contain non-latin-1 characters (only with certain encodings of course):\n>>>> s = \"a\" * sys.maxint\n>>>> u = s.decode(\"ascii\")\n>>>> u = s.decode(\"latin-1\")\n>>>> u = s.decode(\"utf-8\")\nAgain, encoding and decoding strings that contain a few non-latin-1 characters is again efficient:\n>>>> u = \"a\" * 100000000 + u\"\\uffff\"\n>>>> s = u.encode(\"utf-8\")\n>>>> len(s)\n100000003\nI am not completely certain how useful this behaviour is for real-life applications, but it's kind of cool :-). 
It saves memory for european languages that contain few non-ascii characters.\n\nOf course there is at least one down-side to all of this, which is that string indexing is not O(1) any longer, because we have to walk down the tree to find the correct leaf where the character is actually in. I have not measured much, but I expect it to be quite fast in practice, because the trees are never deeper than 32 nodes.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/11/ropes-branch-merged-8782576892496878598.html" + }, + { + "title": "PyPy cleanup sprint startup", + "text": "The following week we will have a sprint in Gothenburg to clean up the PyPy codebase and make it ready for future developments. So far, only a few people are here, the others will arrive this afternoon.\n\nThe \u00c4lvsborgsbron in Gothenburg from the ferry I took to get there.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/11/pypy-cleanup-sprint-startup-4429006224971155209.html" + }, + { + "title": "Unicode support in RPython", + "text": "In the recent days we (Carl Friedrich, Anto and me) implemented native unicode support for RPython. This means that now you can write u'xxxx' directly in your RPython program, as well as unicode(some_string_variable) and most of the unicode methods should work as well. The things that don't work, are operations that require the unicode database (such as .upper() and friends) and encodings (unicode(x, encoding) for example). Right now our python interpreter does not use this at all, but that's the next step.\n\nCheers,\nfijal", + "tags": "", + "url": "https://www.pypy.org/posts/2007/11/unicode-support-in-rpython-in-recent-1444449848043047640.html" + }, + { + "title": "The PyPy Road Show (1): New York and IBM", + "text": "We're slowly getting adjusted to the jet-lag (except maybe Samuele). Time to blog... The past two days at IBM, in New York, have been quite interesting. The place is a research center. 
Feels University-like, but meetings rooms have no windows and climatization fixed on \"polar\" settings. The building is of course heated at this time of the year, and then the meeting rooms are climatized... I guess that just doesn't make sense to me. We gave a 1h30 talk to a general audience first. Then we had a compact schedule of meetings with various people or groups of people. In the early preparations for this trip we planned to stay only one day, but Martin Hirzel, our host, found too many people that wanted to talk with us :-) I think that both us and most of the people we talked with got interesting things out of the meetings. On our side, let me point a few highlights. We asked two people that worked on the GCs for the Jikes RVM if reusing them for RPython programs would make sense. They didn't scream \"you're mad!\", so I guess the answer is yes. Apparently, it has been done before, too. I'm still not sure I got this right, but it seems that Microsoft paid someone money to integrate them with Rotor... Then the real-time garbage-collection guys explained to us the things that we need to take care about when writing a VM: real-time GC needs not only write barriers and read barriers, but pointer-equality-comparison barriers... They have bad memories of trying to add a posteriori this kind of barrier into existing VMs, so it took us a bit of explaining to make them realize that adding new kinds of barriers is mostly trivial for us (I'm still not 100% sure they got it... bad memories can stick hard). Then we had discussions with JIT people. Mostly, this allowed us to confirm that Samuele has already got a good idea about what Java JITs like Hotspot can do, and in which kind of situation they work well. As expected, the most difficult bit for a PyPy-like JIT that would run on top of a JVM would be the promotion. 
We discussed approaches like first generating fall-back cases that include some instrumentation logic, and regenerating code with a few promoted values after some time if it seems like it will be a gain. Replacing a method with a new version is difficult to do in a way that is portable across Java VMs. There are still possible workarounds, but it also means that if we really want to explore this seriously, we should consider experimenting with specifics VMs - e.g. the Jikes RVM gives (or could be adapted to give) hooks to replace methods with new versions of them, which is something that the JVM's own JIT internally does all the time. We showed the taint object space and the sandboxed PyPy to several groups of security people. I won't say much about it here, beyond the fact that they were generally interested by the fact that the corresponding code is very short and easy to play with. They are doing a lot on security in Java and... PHP, for web sites. Someone could write a PHP interpreter (!) in PyPy to get the same kind of results. But as Laura and Samuele put it, there are things in life you do for fun, and things you do for money :-) We're in Vancouver today and tomorrow. More about this later... Armin Rigo", + "tags": "", + "url": "https://www.pypy.org/posts/2007/11/pypy-road-show-1-new-york-and-ibm-7837076523877011699.html" + }, + { + "title": "The PyPy Road Show", + "text": "Armin Rigo, Samuele Pedroni, Laura Creighton and Jacob Hall\u00e9n are on a two-week-trip through the USA and Canada, to present PyPy to various companies and institutions. 
The next few blog entries will cover our experiences and adventures.\n\nHere is a glimpse of our schedule (all November 2007):\n4th: Chigaco5th-6th: New York7th-8th: Vancouver9th-18th: San Francisco and the Bay Area\nNotably, we meet with IBM Research in New York and give a Google Talk in the Bay Area.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/11/pypy-road-show-5790414147905233059.html" + }, + { + "title": "First Post", + "text": "Welcome to the PyPy status blog. After we got a lot of positive feedback about the blog coverage of our Squeak/PyPy sprint in Bern we decided that having a general PyPy blog sounds like a good idea. We will try to periodically post about what is going on in the PyPy project, cover sprints and other events where PyPyers are present. If you have any wishes about things we should write about, feel free to leave a comment.", + "tags": "", + "url": "https://www.pypy.org/posts/2007/10/first-post-8150793557471983289.html" + }, + { + "title": "Search", + "text": "Search results appear here.", + "tags": "", + "url": "https://www.pypy.org/search.html" + } + ] +}; \ No newline at end of file diff --git a/assets/js/tipuesearch_set.js b/assets/js/tipuesearch_set.js new file mode 100644 index 000000000..8475b5c0d --- /dev/null +++ b/assets/js/tipuesearch_set.js @@ -0,0 +1,84 @@ + +/* +Tipue Search 7.1 +Copyright (c) 2019 Tipue +Tipue Search is released under the MIT License +http://www.tipue.com/search +*/ + + +/* +Stop words +Stop words list from http://www.ranks.nl/stopwords +*/ + +var tipuesearch_stop_words = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", 
"he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]; + + +// Word replace + +var tipuesearch_replace = {'words': [ + {'word': 'tipua', 'replace_with': 'tipue'}, + {'word': 'javscript', 'replace_with': 'javascript'}, + {'word': 'jqeury', 'replace_with': 'jquery'} +]}; + + +// Weighting + +var tipuesearch_weight = {'weight': [ + {'url': 'http://www.tipue.com', 'score': 60}, + {'url': 'http://www.tipue.com/search', 'score': 60}, + {'url': 'http://www.tipue.com/tipr', 'score': 30}, + {'url': 'http://www.tipue.com/support', 'score': 20} +]}; + + +// Illogical stemming + +var tipuesearch_stem = {'words': [ + {'word': 'e-mail', 'stem': 'email'}, + {'word': 'javascript', 'stem': 'jquery'}, + {'word': 'javascript', 'stem': 'js'} +]}; + + +// Related + +var tipuesearch_related = {'Related': [ + {'search': 'tipue', 'related': 'Search', 'include': 1}, + {'search': 'tipue', 'related': 'jQuery'}, + {'search': 'tipue', 'related': 'Blog'}, + {'search': 'tipue', 'related': 'Support'}, + {'search': 'tipue 
search', 'related': 'Demo', 'include': 1}, + {'search': 'tipue search', 'related': 'Support'} +]}; + + +// Internal strings + +var tipuesearch_string_1 = 'No title'; +var tipuesearch_string_2 = 'Showing results for'; +var tipuesearch_string_3 = 'Search instead for'; +var tipuesearch_string_4 = '1 result'; +var tipuesearch_string_5 = 'results'; +var tipuesearch_string_6 = 'Prev'; +var tipuesearch_string_7 = 'Next'; +var tipuesearch_string_8 = 'Nothing found'; +var tipuesearch_string_9 = 'Common words are largely ignored'; +var tipuesearch_string_10 = 'Related'; +var tipuesearch_string_11 = 'Search should be one character or more'; +var tipuesearch_string_12 = 'Search should be'; +var tipuesearch_string_13 = 'characters or more'; +var tipuesearch_string_14 = 'seconds'; +var tipuesearch_string_15 = 'Open Image'; +var tipuesearch_string_16 = 'Goto Page'; + + +// Internals + + +// Timer for showTime + +var startTimer = new Date().getTime(); + diff --git a/assets/xml/atom.xsl b/assets/xml/atom.xsl new file mode 100644 index 000000000..7b183442b --- /dev/null +++ b/assets/xml/atom.xsl @@ -0,0 +1,28 @@ + + + + + + + + +<xsl:value-of select="feed/title"/> (Atom feed) + + + +

(Atom feed)

+

This is an Atom feed. To subscribe to it, copy its address and paste it when your feed reader asks for it. It will be updated periodically in your reader. New to feeds? Learn more.

+

+ +urladdressfalse +

+

Preview of the feed’s current headlines:

+
    + +
  1. +
    +
+ + +
+
diff --git a/assets/xml/rss.xsl b/assets/xml/rss.xsl new file mode 100644 index 000000000..f34b3b173 --- /dev/null +++ b/assets/xml/rss.xsl @@ -0,0 +1,28 @@ + + + + + + + + +<xsl:value-of select="rss/channel/title"/> (RSS) + + + +

(RSS)

+

This is an RSS feed. To subscribe to it, copy its address and paste it when your feed reader asks for it. It will be updated periodically in your reader. New to feeds? Learn more.

+

+ +urladdressfalse +

+

Preview of the feed’s current headlines:

+
    + +
  1. +
    +
+ + +
+
diff --git a/authors/alex.html b/authors/alex.html new file mode 100644 index 000000000..bc3d8ba9b --- /dev/null +++ b/authors/alex.html @@ -0,0 +1,140 @@ + + + + + +Posts by Alex | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+

Posts by Alex

+ +
+
+ + \ No newline at end of file diff --git a/authors/alex.xml b/authors/alex.xml new file mode 100644 index 000000000..38100f0da --- /dev/null +++ b/authors/alex.xml @@ -0,0 +1,256 @@ + +PyPy (Posts by Alex)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:10 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPython Software Foundation Matching Donations this Monthhttps://www.pypy.org/posts/2014/09/python-software-foundation-matching-2230529993193139046.htmlAlex<p>We're extremely excited to announce that for the month of September, any amount<br> +you donate to PyPy will be matched (up to $10,000) by the <a class="reference external" href="https://pyfound.blogspot.com/2014/09/matching-donations-to-pypy-in-september.html">Python Software<br> +Foundation</a>.</p><p>This includes any of our ongoing fundraisers: NumPyPy, STM, Python3, or our<br> +general fundraising.</p><p>Here are some of the things your previous donations have helped accomplish:</p><ul class="simple"><li>Getting PyPy3 completed (currently 3.2, with 3.3 work underway)</li> +<li>New research and production engineering on STM for PyPy</li> +<li>Lots of progress on NumPy for PyPy</li> +<li>Significant performance improvements</li> +</ul><p>You can see a preview of what's coming in our next 2.4 release in the <a class="reference external" href="https://doc.pypy.org/en/latest/release-2.4.0.html#highlights">draft<br> +release notes</a>.</p><p>Thank you to all the individuals and companies which have donated so far.</p><p>So please, donate today: <a class="reference external" href="https://pypy.org/">https://pypy.org/</a></p><p>(Please be aware that the donation progress bars are not live updating, so<br> +don't be afraid if your donation doesn't show up immediately).</p>https://www.pypy.org/posts/2014/09/python-software-foundation-matching-2230529993193139046.htmlMon, 01 Sep 2014 17:49:00 GMTMaking coverage.py faster under 
PyPyhttps://www.pypy.org/posts/2013/10/making-coveragepy-faster-under-pypy-935409618297062344.htmlAlex<p>If you've ever tried to run your programs with <tt class="docutils literal">coverage.py</tt> under PyPy,<br> +you've probably experienced some incredible slowness. Take this simple<br> +program:</p><pre class="code python literal-block"><span class="keyword">def</span> <span class="name function">f</span><span class="punctuation">():</span> + <span class="keyword">return</span> <span class="literal number integer">1</span> + + +<span class="keyword">def</span> <span class="name function">main</span><span class="punctuation">():</span> + <span class="name">i</span> <span class="operator">=</span> <span class="literal number integer">10000000</span> + <span class="keyword">while</span> <span class="name">i</span><span class="punctuation">:</span> + <span class="name">i</span> <span class="operator">-=</span> <span class="name">f</span><span class="punctuation">()</span> + +<span class="name">main</span><span class="punctuation">()</span> +</pre><p>Running <tt class="docutils literal">time coverage.py run test.py</tt> five times, and looking at the best<br> +run, here's how PyPy 2.1 stacks up against CPython 2.7.5:</p><table border="1" class="docutils"><colgroup> +<col width="32%"> +<col width="19%"> +<col width="49%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Python</th> +<th class="head">Time</th> +<th class="head">Normalized to CPython</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>CPython 2.7.5</td> +<td>3.879s</td> +<td>1.0x</td> +</tr> +<tr><td>PyPy 2.1</td> +<td>53.330s</td> +<td>13.7x slower</td> +</tr> +</tbody> +</table><p>Totally ridiculous. I got turned onto this problem because on one of my<br> +projects CPython takes about 1.5 minutes to run our test suite on the build<br> +bot, but PyPy takes 8-10 minutes.</p><p>So I sat down to address it. 
And the results:</p><table border="1" class="docutils"><colgroup> +<col width="32%"> +<col width="19%"> +<col width="49%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Python</th> +<th class="head">Time</th> +<th class="head">Normalized to CPython</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>CPython 2.7.5</td> +<td>3.879s</td> +<td>1.0x</td> +</tr> +<tr><td>PyPy 2.1</td> +<td>53.330s</td> +<td>13.7x slower</td> +</tr> +<tr><td>PyPy head</td> +<td>1.433s</td> +<td>2.7x faster</td> +</tr> +</tbody> +</table><p>Not bad.</p><div class="section" id="technical-details"><h1>Technical details</h1><p>So how'd we do it? Previously, using <tt class="docutils literal">sys.settrace()</tt> (which <tt class="docutils literal">coverage.py</tt><br> +uses under the hood) disabled the JIT. Except it didn't just disable the JIT,<br> +it did it in a particularly insidious way — the JIT had no idea it was being<br> +disabled!</p><p>Instead, every time PyPy discovered that one of your functions was a hotspot,<br> +it would start tracing to observe what the program was doing, and right when it<br> +was about to finish, <tt class="docutils literal">coverage</tt> would run and cause the JIT to abort. Tracing<br> +is a slow process, it makes up for it by generating fast machine code at the<br> +end, but tracing is still incredibly slow. But we never actually got to the<br> +"generate fast machine code" stage. Instead we'd pay all the cost of tracing,<br> +but then we'd abort, and reap none of the benefits.</p><p>To fix this, we adjusted some of the heuristics in the JIT, to better show it<br> +how <tt class="docutils literal"><span class="pre">sys.settrace(&lt;tracefunc&gt;)</span></tt> works. Previously the JIT saw it as an opaque<br> +function which gets the frame object, and couldn't tell whether or not it<br> +messed with the frame object. 
Now we let the JIT look inside the<br> +<tt class="docutils literal">&lt;tracefunc&gt;</tt> function, so it's able to see that <tt class="docutils literal">coverage.py</tt> isn't<br> +messing with the frame in any weird ways, it's just reading the line number and<br> +file path out of it.</p><p>I asked several friends in the VM implementation and research field if they<br> +were aware of any other research into making VMs stay fast when debugging tools<br> +like <tt class="docutils literal">coverage.py</tt> are running. No one I spoke to was aware of any (but I<br> +didn't do a particularly exhaustive review of the literature, I just tweeted at<br> +a few people), so I'm pleased to say that PyPy is quite possibly the first VM<br> +to work on optimizing code in debugging mode! This is possible because of our<br> +years spent investing in meta-tracing research.</p></div><p>Happy testing,<br> +Alex</p>https://www.pypy.org/posts/2013/10/making-coveragepy-faster-under-pypy-935409618297062344.htmlSat, 26 Oct 2013 00:48:00 GMTPyPy San Francisco Sprint July 27th 2013https://www.pypy.org/posts/2013/07/pypy-san-francisco-sprint-july-27th-2012-3064530444396960172.htmlAlex<p>The next PyPy sprint will be in San Francisco, California. It is a public<br> +sprint, suitable for newcomers. 
It will run on Saturday July 27th.</p><p>Some possible things people will be hacking on the sprint:</p><ul class="simple"><li>running your software on PyPy</li> +<li>making your software fast on PyPy</li> +<li>improving PyPy's JIT</li> +<li>improving Twisted on PyPy</li> +<li>any exciting stuff you can think of</li> +</ul><p>If there are newcomers, we'll run an introduction to hacking on PyPy.</p><p>Location<br> +The sprint will be held at the Rackspace Office:</p><p>620 Folsom St, Ste 100</p><p>The doors will open at 10AM and run until 6PM.</p>https://www.pypy.org/posts/2013/07/pypy-san-francisco-sprint-july-27th-2012-3064530444396960172.htmlFri, 26 Jul 2013 01:17:00 GMTPyPy San Francisco Sprint Dec 1st - Dec 2nd 2012https://www.pypy.org/posts/2012/11/pypy-san-francisco-sprint-dec-1st-dec-5133109101989613355.htmlAlex<p>The next PyPy sprint will be in San Francisco, California. It is a<br> +public sprint, suitable for newcomers. It will run on Saturday December 1st and<br> +Sunday December 2nd. 
The goals for the sprint are continued work towards the<br> +2.0 release as well as code cleanup, we of course welcome any topic which<br> +contributors are interested in working on.</p><p>Some other possible topics are:</p><ul class="simple"><li>running your software on PyPy</li> +<li>work on PyPy's numpy (<a class="reference external" href="https://www.pypy.org/posts/2012/09/numpy-on-pypy-status-update-1605312600799448094.html">status</a>)</li> +<li>work on STM (<a class="reference external" href="https://mail.python.org/pipermail/pypy-dev/2012-September/010513.html">status</a>)</li> +<li>JIT improvements</li> +<li>any exciting stuff you can think of</li> +</ul><p>If there are newcomers, we'll run the usual introduction to hacking on<br> +PyPy.</p><br> +<h1>Location</h1><p>The sprint will be held at the Rackspace Office:</p><p>620 Folsom St, Ste 100<br> +San Francisco</p><p>The doors will open at 10AM both days, and run until 6PM both days.</p><p>Thanks to David Reid for helping get everything set up!</p>https://www.pypy.org/posts/2012/11/pypy-san-francisco-sprint-dec-1st-dec-5133109101989613355.htmlTue, 27 Nov 2012 19:29:00 GMTCome see us at PyCon 2012https://www.pypy.org/posts/2011/12/come-see-us-at-pycon-2012-610420698450130659.htmlAlex<p><a class="reference external" href="https://us.pycon.org/2012/">PyCon 2012</a> is coming up in just a few short months, and PyPy will be well<br> +represented there. We'll be delivering a tutorial, two talks, plus we'll be<br> +around for the sprints.</p><p>Here are the abstracts for the tutorials and talks:</p><ul class="simple"><li><strong>How to get the most out of your PyPy</strong>, by Maciej Fijalkowski, Alex Gaynor<br> +and Armin Rigo: For many applications PyPy can provide performance benefits<br> +right out of the box. However, little details can push your application to<br> +perform much better. In this tutorial we'll give you insights on how to push<br> +PyPy to its limits. 
We'll focus on understanding the performance<br> +characteristics of PyPy, and learning the analysis tools in order to maximize<br> +your applications' performance. <em>This is the tutorial.</em></li> +<li><strong>Why PyPy by example</strong>, by Maciej Fijalkowski, Alex Gaynor and Armin Rigo:<br> +One of the goals of PyPy is to make existing Python code faster; however an<br> +even broader goal was to make it possible to write things in Python that<br> +previously would have needed to be written in C or another low-level language. This<br> +talk will show examples of this, and describe how they represent the<br> +tremendous progress PyPy has made, and what it means for people looking at<br> +using PyPy.</li> +<li><strong>How the PyPy JIT works</strong>, by Benjamin Peterson: The Python community is<br> +abuzz about the major speed gains PyPy can offer for pure Python code. But how<br> +does the PyPy JIT actually work? This talk will discuss how the PyPy JIT is<br> +implemented. It will include descriptions of the tracing, optimization, and<br> +assembly generation phases. I will demonstrate each step with an example loop.</li> +</ul><p>If you have any questions let us know! 
We look forward to seeing people at<br> +PyCon and chatting about PyPy and the entire Python ecosystem.</p><p>See you there,<br> +Maciej Fijalkowski, Alex Gaynor, Benjamin Peterson, Armin Rigo, and the entire PyPy team</p>https://www.pypy.org/posts/2011/12/come-see-us-at-pycon-2012-610420698450130659.htmlThu, 22 Dec 2011 22:27:00 GMTPy3k for PyPy fundraiserhttps://www.pypy.org/posts/2011/09/py3k-for-pypy-fundraiser-8139653689520709617.htmlAlex<p>Hi,</p><p>We would like to announce a donation campaign for implementing Python 3 in PyPy.<br> +Please read our <a class="reference external" href="https://pypy.org/py3donate.html">detailed plan</a> for all the details and donate using the<br> +button on that page!</p><p>Thanks,<br> +The PyPy Team</p>https://www.pypy.org/posts/2011/09/py3k-for-pypy-fundraiser-8139653689520709617.htmlWed, 21 Sep 2011 17:44:00 GMTReport back from our surveyhttps://www.pypy.org/posts/2011/06/report-back-from-our-survey-2083371215707583264.htmlAlex<p>Hi all,</p> +<p>I'm here to report back the results of our survey. First, we're very pleased to +report that a number of you guys are happily running PyPy in production! Most +(97%) of the respondents using PyPy are using it because it's faster, but a +further 26% (respondents could choose multiple answers) are using it because of +lower memory usage. Of users who aren't using PyPy, the most common reason was +C extensions, followed by "Other".</p> +<p>From reading the extra comments section there are a few things we've learned:</p> +<ol class="loweralpha simple"> +<li>Google docs needs a better UI for this stuff</li> +<li>A huge number of people want NumPy and SciPy, it was easily the most +requested C extension (25% of respondents said something about NumPy). 
We've +already blogged on the topic of <a class="reference external" href="https://www.pypy.org/posts/2011/05/numpy-in-pypy-status-and-roadmap-8332894230779779992.html">our plans for NumPy</a>.</li> +<li>Having packages in the various OS's repositories would be a big help in +getting users up and running.</li> +</ol> +<p>A huge thanks to everyone who responded! Finally, if you're using PyPy in +production we'd love to get a testimonial from you, if you're willing to spare +a few minutes to give us a quote or two please get in contact with us via <a class="reference external" href="https://mail.python.org/mailman/listinfo/pypy-dev">our +mailing list</a>.</p> +<p>Thanks, +Alex</p>https://www.pypy.org/posts/2011/06/report-back-from-our-survey-2083371215707583264.htmlWed, 08 Jun 2011 06:18:00 GMTPyPy Usage Surveyhttps://www.pypy.org/posts/2011/05/pypy-usage-survey-1402303968715807009.htmlAlex<p>We've been working on PyPy for a long time. But readers of this blog will know +that in the past year something has changed: we think PyPy is production ready. +And it's not just us, this week <a class="reference external" href="https://lwn.net/">LWN.net</a> wrote an article about how <a class="reference external" href="https://lwn.net/SubscriberLink/442268/22f66371348bd7c5/">PyPy +sped up one of their scripts by a factor of three</a>, noting that, "plans are to +run gitdm under PyPy from here on out". All in all we think PyPy is pretty +great, but not everyone is using it yet, and we want to know why. We want your +feedback on why PyPy isn't ready to be your only Python yet, and how we can +improve it to make that happen.</p> +<p>Therefore, we've put together a quick survey, whether you're using PyPy or not +if you could take a few minutes to fill it out and let us know how we're doing +we'd really appreciate it. 
You can find the form <a class="reference external" href="https://spreadsheets.google.com/viewform?hl=en&amp;formkey=dF9NZlFpNldNS05fdFVKMnpKZVFzN0E6MQ#gid=0">here</a>.</p> +<p>Thanks, +The PyPy team</p>https://www.pypy.org/posts/2011/05/pypy-usage-survey-1402303968715807009.htmlMon, 16 May 2011 17:27:00 GMTNumPy Follow uphttps://www.pypy.org/posts/2011/05/numpy-follow-up-6928627691060102514.htmlAlex<p>Hi everyone. Since yesterday's blog post we got a ton of feedback, so we want +to clarify a few things, as well as share some of the progress we've made, in +only the 24 hours since the post.</p> +<h3>Reusing the original NumPy</h3> +<p>First, a lot of people have asked why we cannot just reuse the original NumPy +through <tt class="docutils literal">cpyext</tt>, our CPython C-API compatibility layer. We believe this is +not the best approach, for a few reasons:</p> +<blockquote> +<ol class="arabic simple"> +<li><tt class="docutils literal">cpyext</tt> is slow, and always will be slow. It has to emulate far too many +details of the CPython object model that don't exist on PyPy (e.g., +reference counting). Since people are using NumPy primarily for speed this +would mean that even if we could have a working NumPy, no one would want to +use it. Also, as soon as the execution crosses the <tt class="docutils literal">cpyext</tt> boundary, it +becomes invisible to the JIT, which means the JIT has to assume the worst +and deoptimize stuff away.</li> +<li>NumPy uses many obscure documented and undocumented details of the CPython +C-API. Emulating these is often difficult or impossible (e.g. we can't fix +accessing a struct field, as there's no function call for us to intercept).</li> +<li>It's not much fun. Frankly, working on <tt class="docutils literal">cpyext</tt>, debugging the crashes, +and everything else that goes with it is not terribly fun, especially when +you know that the end result will be slow. 
We've demonstrated we can build +a much faster NumPy, in a way that's more fun, and given that the people +working on this are volunteers, it's important to keep us motivated.</li> +</ol> +</blockquote> +<p>Finally, we are <strong>not</strong> proposing to rewrite the entirety of NumPy or, god +forbid, BLAST, or any of the low level stuff that operates on C-level arrays, +only the parts that interface with Python code directly.</p> +<h3>C bindings vs. CPython C-API</h3> +<p>There are two issues on C code, one has a very nice story, and the other not so +much. First is the case of arbitrary C-code that isn't Python related, things +like <tt class="docutils literal">libsqlite</tt>, <tt class="docutils literal">libbz2</tt>, or any random C shared library on your system. +PyPy will quite happily call into these, and bindings can be developed either +at the RPython level (using <tt class="docutils literal">rffi</tt>) or in pure Python, using <tt class="docutils literal">ctypes</tt>. +Writing bindings with <tt class="docutils literal">ctypes</tt> has the advantage that they can run on every +alternative Python implementation, such as Jython and IronPython. Moreover, +once we merge the <tt class="docutils literal">jittypes2</tt> branch <tt class="docutils literal">ctypes</tt> calls will even be smoking +fast.</p> +<p>On the other hand there is the CPython C-extension API. This is a very specific +API which CPython exposes, and PyPy tries to emulate. It will never be fast, +because there is far too much overhead in all the emulation that needs to be +done.</p> +<p>One of the reasons people write C extensions is for speed. 
Often, with PyPy +you can just forget about C, write everything in pure python and let the JIT to +do its magic.</p> +<p>In case the PyPy JIT alone isn't fast enough, or you just want to +use existing C code then it might make sense to split +your C-extension into 2 parts, one which doesn't touch the CPython C-API and +thus can be loaded with <tt class="docutils literal">ctypes</tt> and called from PyPy, and another which does +the interfacing with Python for CPython (where it will be faster).</p> +<p>There are also libraries written in C to interface with existing C codebases, +but for whom performance is not the largest goal, for these the right solution +is to try using CPyExt, and if it works that's great, but if it fails the +solution will be to rewrite using <cite>ctypes</cite>, where it will work on all Python +VMs, not just CPython.</p> +<p>And finally there are rare cases where rewriting in RPython makes more sense, +NumPy is one of the few examples of these because we need to be able to give +the JIT hints on how to appropriately vectorize all of the operations on an +array. In general writing in RPython is not necessary for almost any +libraries, NumPy is something of a special case because it is so ubiquitous +that every ounce of speed is valuable, and makes the way people use it leads to +code structure where the JIT benefits enormously from extra hints and the +ability to manipulate memory directly, which is not possible from Python.</p> +<h3>Progress</h3> +<p>On a more positive note, after we published the <a class="reference external" href="https://www.pypy.org/posts/2011/05/numpy-in-pypy-status-and-roadmap-8332894230779779992.html">last post</a>, several new people +came and contributed improvements to the <tt class="docutils literal"><span class="pre">numpy-exp</span></tt> branch. 
We would like to +thank all of them:</p> +<blockquote> +<ul class="simple"> +<li>nightless_night contributed: An implementation of <tt class="docutils literal">__len__</tt>, fixed bounds +checks on <tt class="docutils literal">__getitem__</tt> and <tt class="docutils literal">__setitem__</tt>.</li> +<li>brentp contributed: Subtraction and division on NumPy arrays.</li> +<li>MostAwesomeDude contributed: Multiplication on NumPy arrays.</li> +<li>hodgestar contributed: Binary operations between floats and NumPy arrays.</li> +</ul> +</blockquote> +<p>Those last two were technically an outstanding branch we finally merged, but +hopefully you get the picture. In addition there was some exciting work done by +regular PyPy contributors. I hope it's clear that there's a place to jump in +for people with any level of PyPy familiarity. If you're interested in +contributing please stop by #pypy on irc.freenode.net, the <a class="reference external" href="https://codespeak.net/mailman/listinfo/pypy-dev">pypy-dev</a> mailing +list, or send us pull requests on <a class="reference external" href="https://bitbucket.org/pypy/pypy">bitbucket</a>.</p> +<p>Alex</p>numpyhttps://www.pypy.org/posts/2011/05/numpy-follow-up-6928627691060102514.htmlThu, 05 May 2011 21:56:00 GMTPyOhiohttps://www.pypy.org/posts/2010/08/pyohio-2568618480482575546.htmlAlex<p>This weekend I delivered a talk at <a class="reference external" href="https://pyohio.org/">PyOhio</a> (an annual conference in Columbus, OH, USA) on PyPy and Unladen Swallow. The talk covered reasons that Python, the language, is hard to optimize, why CPython is slow, and a few optimizations that PyPy and Unladen Swallow have implemented. The slides from my talk are <a class="reference external" href="https://www.scribd.com/doc/35240506/Making-Python-Fast-PyPy-and-Unladen-Swallow">online</a>, and the talk was recorded so a video will follow. 
I gave a similar talk to <a class="reference external" href="https://chipy.org/">ChiPy</a> (the Chicago Python user group), which was also recorded and the video is <a class="reference external" href="https://carlfk.blip.tv/file/3866910">available</a>. Both audiences were excited about the futures for PyPy and Unladen Swallow, and for the future of a faster Python.</p> +<p>Alex</p>https://www.pypy.org/posts/2010/08/pyohio-2568618480482575546.htmlMon, 02 Aug 2010 21:33:00 GMT \ No newline at end of file diff --git a/authors/alexander-schremmer.html b/authors/alexander-schremmer.html new file mode 100644 index 000000000..c2dd7d278 --- /dev/null +++ b/authors/alexander-schremmer.html @@ -0,0 +1,116 @@ + + + + + +Posts by Alexander Schremmer | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/alexander-schremmer.xml b/authors/alexander-schremmer.xml new file mode 100644 index 000000000..8cd38031f --- /dev/null +++ b/authors/alexander-schremmer.xml @@ -0,0 +1,71 @@ + +PyPy (Posts by Alexander Schremmer)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssUsing CPython extension modules with PyPy natively, or: PyPy can load .pyd files with CPyExt!https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlAlexander Schremmer<p>PyPy is now able to load +and run CPython extension modules (i.e. .pyd and .so files) natively by using the new CPyExt +subsystem. +Unlike the solution presented in <a class="reference external" href="https://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html">another blog post</a> (where extension modules like +numpy etc. were run on CPython and proxied through TCP), this solution does not require +a running CPython anymore. We do not achieve full binary compatibility +yet (like Ironclad), but recompiling the extension is generally enough.</p> +<p>The only prerequisite is that the necessary functions of the C API of CPython are already +implemented in PyPy. If you are a user or an author of a module and miss certain functions +in PyPy, we invite you to implement them. Up until now, a lot of people (including a lot of +new committers) have stepped up and implemented a few functions to get their favorite module +running. See the end of this post for a list of names.</p> +<p>Regarding speed, we tried the following: even though there is a bit of overhead when running +these modules, we could run the regular expression engine of CPython (<tt class="docutils literal"><span class="pre">_sre.so</span></tt>) and execute +the spambayes benchmark of the Unladen Swallow benchmark suite (cf. 
<a class="reference external" href="https://speed.pypy.org/">speed.pypy.org</a>) and +experience a speedup: +It became <em>two times faster</em> on pypy-c than with the built-in regular +expression engine of PyPy. From <a href="https://en.wikipedia.org/wiki/Amdahl%27s_law">Amdahl's Law</a> it follows that the <tt class="docutils literal"><span class="pre">_sre.so</span></tt> must run several +times faster than the built-in engine.</p> +<p>Currently pursued modules include PIL and others. Distutils support is nearly ready. +If you would like to participate or want information on how to use this new feature, come and join +our IRC channel <tt class="docutils literal"><span class="pre">#pypy</span></tt> on <a class="reference external" href="irc://irc.freenode.net/">freenode</a>.</p> +<p>Amaury Forgeot d'Arc and Alexander Schremmer</p> +<p>Further CPyExt Contributors:</p> +<ul><li>Alex Gaynor +</li><li>Benjamin Peterson +</li><li>Jean-Paul Calderone +</li><li>Maciej Fijalkowski +</li><li>Jan de Mooij +</li><li>Lucian Branescu Mihaila +</li><li>Andreas Stührk +</li><li>Zooko Wilcox-O Hearn</li></ul>cpyextCPythonextension modulesspeedhttps://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlFri, 09 Apr 2010 22:56:00 GMTUsing CPython extension modules with PyPy, or: PyQt on PyPyhttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlAlexander Schremmer<div class="document" id="using-cpython-extension-modules-with-pypy-or-pyqt-on-pypy"> + +<p>If you have ever wanted to use CPython extension modules on PyPy, +we want to announce that there is a solution that should be compatible +to quite a bit of the available modules. It is neither new nor written +by us, but works nevertheless great with PyPy.</p> +<p>The trick is to use RPyC, a transparent, symmetric remote procedure +call library written in Python. 
The idea is to start a +CPython process that hosts the PyQt libraries +and connect to it via TCP to send RPC commands to it.</p> +<p>I tried to run PyQt applications +using it on PyPy and could get quite a bit of the functionality of these +working. Remaining problems include regular segfaults of CPython +because of PyQt-induced memory corruption and bugs because classes +like StandardButtons behave incorrectly when it comes to arithmetical operations.</p> +<p>Changes to RPyC needed to be done to support remote unbound <tt class="docutils literal"><span class="pre">__init__</span></tt> methods, +shallow call by value for list and dict types (PyQt4 methods want real lists and dicts +as parameters), and callbacks to methods (all remote method objects are wrapped into +small lambda functions to ease the call for PyQt4).</p> +<p>If you want to try RPyC to run the PyQt application of your choice, you just +need to follow these steps. Please report your experience here in the blog +comments or on our <a class="reference external" href="https://codespeak.net/mailman/listinfo/pypy-dev">mailing list</a>.</p> +<blockquote> +<ol class="arabic simple"> +<li>Download RPyC from the <a class="reference external" href="https://sourceforge.net/projects/rpyc/files/">RPyC download page</a>.</li> +<li>Download this <a class="reference external" href="https://codespeak.net/svn/user/xoraxax/rpyc-3.0.7-pyqt4-compat.patch">patch</a> and apply it to RPyC by running +<tt class="docutils literal"><span class="pre">patch</span> <span class="pre">-p1</span> <span class="pre">&lt;</span> <span class="pre">rpyc-3.0.7-pyqt4-compat.patch</span></tt> in the RPyC directory.</li> +<li>Install RPyc by running <tt class="docutils literal"><span class="pre">python</span> <span class="pre">setup.py</span> <span class="pre">install</span></tt> as root.</li> +<li>Run the file <tt class="docutils literal"><span class="pre">rpyc/servers/classic_server.py</span></tt> using CPython.</li> +<li>Execute your PyQt 
application on PyPy.</li> +</ol> +</blockquote> +<p>PyPy will automatically connect to CPython and use its PyQt libraries.</p> +<p>Note that this scheme works with nearly every extension library. Look +at <tt class="docutils literal"><span class="pre">pypy/lib/sip.py</span></tt> on how to add new libraries (you need to create +such a file for every proxied extension module).</p> +<p>Have fun with PyQt</p> +<p>Alexander Schremmer</p> +</div>CPythonextension modulesPyQt4RPyChttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlMon, 30 Nov 2009 11:19:00 GMT \ No newline at end of file diff --git a/authors/antonio-cuni.html b/authors/antonio-cuni.html new file mode 100644 index 000000000..36d09cdf9 --- /dev/null +++ b/authors/antonio-cuni.html @@ -0,0 +1,230 @@ + + + + + +Posts by Antonio Cuni | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+

Posts by Antonio Cuni

+ +
+
+ + \ No newline at end of file diff --git a/authors/antonio-cuni.xml b/authors/antonio-cuni.xml new file mode 100644 index 000000000..16d998085 --- /dev/null +++ b/authors/antonio-cuni.xml @@ -0,0 +1,1374 @@ + +PyPy (Posts by Antonio Cuni)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rss#pypy IRC moves to Libera.Chathttps://www.pypy.org/posts/2021/05/pypy-irc-moves-to-libera-chat.htmlAntonio Cuni<p>Following the example of many other FOSS projects, the PyPy team has +decided to move its official <code>#pypy</code> IRC channel from Freenode to +<a href="https://libera.chat/">Libera.Chat</a>: <a href="irc://irc.libera.chat/pypy">irc.libera.chat/pypy</a></p> +<p>The core devs will no longer be present on the Freenode channel, so we recommend +joining the new channel as soon as possible.</p> +<p>wikimedia.org has a +<a href="https://meta.wikimedia.org/wiki/IRC/Migrating_to_Libera_Chat">nice guide</a> on +how to set up your client to migrate from Freenode to Libera.Chat.</p> +<!--TEASER_END-->https://www.pypy.org/posts/2021/05/pypy-irc-moves-to-libera-chat.htmlMon, 31 May 2021 10:00:00 GMTNew HPy bloghttps://www.pypy.org/posts/2021/03/new-hpy-blog.htmlAntonio Cuni<p>Regular readers of this blog +<a href="https://www.pypy.org/posts/2019/12/hpy-kick-off-sprint-report-1840829336092490938.html">already know</a> +about <a href="https://hpyproject.org">HPy</a>, a project which aims to develop a new C +API for Python to make it easier/faster to support C extensions on alternative +Python implementations, including PyPy.</p> +<p>The HPy team just published the +<a href="https://hpyproject.org/blog/posts/2021/03/hello-hpy/">first post</a> of HPy's new +blog, so if you are interested in its development, make sure to check it out!</p>https://www.pypy.org/posts/2021/03/new-hpy-blog.htmlMon, 29 Mar 2021 14:00:00 GMTHPy kick-off sprint 
reporthttps://www.pypy.org/posts/2019/12/hpy-kick-off-sprint-report-1840829336092490938.htmlAntonio Cuni<p>Recently Antonio, Armin and Ronan had a small internal sprint in the beautiful +city of Gdańsk to kick-off the development of HPy. Here is a brief report of +what was accomplished during the sprint.</p> +<div class="section" id="what-is-hpy"> +<h2>What is HPy?</h2> +<p>The TL;DR answer is "a better way to write C extensions for Python".</p> +<p>The idea of HPy was born during EuroPython 2019 in Basel, where there was an +informal meeting which included core developers of PyPy, CPython (Victor +Stinner and Mark Shannon) and Cython (Stefan Behnel). The ideas were later also +discussed with Tim Felgentreff of <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a>, to make sure they would also be +applicable to this very different implementation, Windel Bouwman of <a class="reference external" href="https://github.com/RustPython/RustPython">RustPython</a> +is following the project as well.</p> +<p>All of us agreed that the current design of the CPython C API is problematic +for various reasons and, in particular, because it is too tied to the current +internal design of CPython. The end result is that:</p> + +<ul class="simple"> +<li>alternative implementations of Python (such as PyPy, but not only) have a +<a class="reference external" href="https://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.html">hard time</a> loading and executing existing C extensions;</li> +<li>CPython itself is unable to change some of its internal implementation +details without breaking the world. 
For example, as of today it would be +impossible to switch from using reference counting to using a real GC, +which in turn makes it hard for example to remove the GIL, as <a class="reference external" href="https://pythoncapi.readthedocs.io/gilectomy.html">gilectomy</a> +attempted.</li> +</ul> + +<p>HPy tries to address these issues by following two major design guidelines:</p> +<ol class="arabic simple"> +<li>objects are referenced and passed around using opaque handles, which are +similar to e.g., file descriptors in spirit. Multiple, different handles +can point to the same underlying object, handles can be duplicated and +each handle must be released independently of any other duplicate.</li> +<li>The internal data structures and C-level layout of objects are not +visible nor accessible using the API, so each implementation is free to +use what fits best.</li> +</ol> +<p>The other major design goal of HPy is to allow incremental transition and +porting, so existing modules can migrate their codebase one method at a time. +Moreover, Cython is considering to optionally generate HPy code, so extension +modules written in Cython would be able to benefit from HPy automatically.</p> +<p>More details can be found in the README of the official <a class="reference external" href="https://github.com/pyhandle/hpy">HPy repository</a>.</p> +</div> +<div class="section" id="target-abi"> +<h2>Target ABI</h2> +<p>When compiling an HPy extension you can choose one of two different target ABIs:</p> + +<ul class="simple"> +<li><strong>HPy/CPython ABI</strong>: in this case, <tt class="docutils literal">hpy.h</tt> contains a set of macros and +static inline functions. At compilation time this translates the HPy API +into the standard C-API. 
The compiled module will have no performance +penalty, and it will have a "standard" filename like +<tt class="docutils literal"><span class="pre">foo.cpython-37m-x86_64-linux-gnu.so</span></tt>.</li> +<li><strong>Universal HPy ABI</strong>: as the name implies, extension modules compiled +this way are "universal" and can be loaded unmodified by multiple Python +interpreters and versions. Moreover, it will be possible to dynamically +enable a special debug mode which will make it easy to find e.g., open +handles or memory leaks, <strong>without having to recompile the extension</strong>.</li> +</ul> + +<p>Universal modules can <strong>also</strong> be loaded on CPython, thanks to the +<tt class="docutils literal">hpy_universal</tt> module which is under development. An extra layer of +indirection enables loading extensions compiled with the universal ABI. Users +of <tt class="docutils literal">hpy_universal</tt> will face a small performance penalty compared to the ones +using the HPy/CPython ABI.</p> +<p>This setup gives several benefits:</p> + +<ul class="simple"> +<li>Extension developers can use the extra debug features given by the +Universal ABI with no need to use a special debug version of Python.</li> +<li>Projects which need the maximum level of performance can compile their +extension for each relevant version of CPython, as they are doing now.</li> +<li>Projects for which runtime speed is less important will have the choice of +distributing a single binary which will work on any version and +implementation of Python.</li> +</ul> + +</div> +<div class="section" id="a-simple-example"> +<h2>A simple example</h2> +<p>The HPy repo contains a <a class="reference external" href="https://github.com/pyhandle/hpy/blob/master/proof-of-concept/pof.c">proof of concept</a> module. 
Here is a simplified +version which illustrates what a HPy module looks like:</p> +<pre class="code C literal-block"> +<span class="comment preproc">#include</span> <span class="comment preprocfile">"hpy.h"</span><span class="comment preproc"> +</span> +<span class="name">HPy_DEF_METH_VARARGS</span><span class="punctuation">(</span><span class="name">add_ints</span><span class="punctuation">)</span> +<span class="keyword">static</span> <span class="name">HPy</span> <span class="name">add_ints_impl</span><span class="punctuation">(</span><span class="name">HPyContext</span> <span class="name">ctx</span><span class="punctuation">,</span> <span class="name">HPy</span> <span class="name">self</span><span class="punctuation">,</span> <span class="name">HPy</span> <span class="operator">*</span><span class="name">args</span><span class="punctuation">,</span> <span class="name">HPy_ssize_t</span> <span class="name">nargs</span><span class="punctuation">)</span> +<span class="punctuation">{</span> + <span class="keyword type">long</span> <span class="name">a</span><span class="punctuation">,</span> <span class="name">b</span><span class="punctuation">;</span> + <span class="keyword">if</span> <span class="punctuation">(</span><span class="operator">!</span><span class="name">HPyArg_Parse</span><span class="punctuation">(</span><span class="name">ctx</span><span class="punctuation">,</span> <span class="name">args</span><span class="punctuation">,</span> <span class="name">nargs</span><span class="punctuation">,</span> <span class="literal string">"ll"</span><span class="punctuation">,</span> <span class="operator">&amp;</span><span class="name">a</span><span class="punctuation">,</span> <span class="operator">&amp;</span><span class="name">b</span><span class="punctuation">))</span> + <span class="keyword">return</span> <span class="name">HPy_NULL</span><span class="punctuation">;</span> + <span class="keyword">return</span> <span class="name 
function">HPyLong_FromLong</span><span class="punctuation">(</span><span class="name">ctx</span><span class="punctuation">,</span> <span class="name">a</span><span class="operator">+</span><span class="name">b</span><span class="punctuation">);</span> +<span class="punctuation">}</span> + + +<span class="keyword">static</span> <span class="name">HPyMethodDef</span> <span class="name">PofMethods</span><span class="punctuation">[]</span> <span class="operator">=</span> <span class="punctuation">{</span> + <span class="punctuation">{</span><span class="literal string">"add_ints"</span><span class="punctuation">,</span> <span class="name">add_ints</span><span class="punctuation">,</span> <span class="name">HPy_METH_VARARGS</span><span class="punctuation">,</span> <span class="literal string">""</span><span class="punctuation">},</span> + <span class="punctuation">{</span><span class="name builtin">NULL</span><span class="punctuation">,</span> <span class="name builtin">NULL</span><span class="punctuation">,</span> <span class="literal number integer">0</span><span class="punctuation">,</span> <span class="name builtin">NULL</span><span class="punctuation">}</span> +<span class="punctuation">};</span> + +<span class="keyword">static</span> <span class="name">HPyModuleDef</span> <span class="name">moduledef</span> <span class="operator">=</span> <span class="punctuation">{</span> + <span class="name">HPyModuleDef_HEAD_INIT</span><span class="punctuation">,</span> + <span class="punctuation">.</span><span class="name">m_name</span> <span class="operator">=</span> <span class="literal string">"pof"</span><span class="punctuation">,</span> + <span class="punctuation">.</span><span class="name">m_doc</span> <span class="operator">=</span> <span class="literal string">"HPy Proof of Concept"</span><span class="punctuation">,</span> + <span class="punctuation">.</span><span class="name">m_size</span> <span class="operator">=</span> <span class="operator">-</span><span 
class="literal number integer">1</span><span class="punctuation">,</span> + <span class="punctuation">.</span><span class="name">m_methods</span> <span class="operator">=</span> <span class="name">PofMethods</span> +<span class="punctuation">};</span> + + +<span class="name">HPy_MODINIT</span><span class="punctuation">(</span><span class="name">pof</span><span class="punctuation">)</span> +<span class="keyword">static</span> <span class="name">HPy</span> <span class="name">init_pof_impl</span><span class="punctuation">(</span><span class="name">HPyContext</span> <span class="name">ctx</span><span class="punctuation">)</span> +<span class="punctuation">{</span> + <span class="name">HPy</span> <span class="name">m</span><span class="punctuation">;</span> + <span class="name">m</span> <span class="operator">=</span> <span class="name">HPyModule_Create</span><span class="punctuation">(</span><span class="name">ctx</span><span class="punctuation">,</span> <span class="operator">&amp;</span><span class="name">moduledef</span><span class="punctuation">);</span> + <span class="keyword">if</span> <span class="punctuation">(</span><span class="name">HPy_IsNull</span><span class="punctuation">(</span><span class="name">m</span><span class="punctuation">))</span> + <span class="keyword">return</span> <span class="name">HPy_NULL</span><span class="punctuation">;</span> + <span class="keyword">return</span> <span class="name">m</span><span class="punctuation">;</span> +<span class="punctuation">}</span> +</pre> +<p>People who are familiar with the current C-API will surely notice many +similarities. 
The biggest differences are:</p> + +<ul class="simple"> +<li>Instead of <tt class="docutils literal">PyObject *</tt>, objects have the type <tt class="docutils literal">HPy</tt>, which as +explained above represents a handle.</li> +<li>You need to explicitly pass an <tt class="docutils literal">HPyContext</tt> around: the intent is +primarily to be future-proof and make it easier to implement things like +sub-interpreters.</li> +<li><tt class="docutils literal">HPy_METH_VARARGS</tt> is implemented differently than CPython's +<tt class="docutils literal">METH_VARARGS</tt>: in particular, these methods receive an array of <tt class="docutils literal">HPy</tt> +and its length, instead of a fully constructed tuple: passing a tuple +makes sense on CPython where you have it anyway, but it might be an +unnecessary burden for alternate implementations. Note that this is +similar to the new <a class="reference external" href="https://www.python.org/dev/peps/pep-0580/">METH_FASTCALL</a> which was introduced in CPython.</li> +<li>HPy relies a lot on C macros, which most of the time are needed to support +the HPy/CPython ABI compilation mode. 
For example, <tt class="docutils literal">HPy_DEF_METH_VARARGS</tt> +expands into a trampoline which has the correct C signature that CPython +expects (i.e., <tt class="docutils literal">PyObject <span class="pre">*(*)(PyObject</span> *self, PyObject *args)</tt>) and +which calls <tt class="docutils literal">add_ints_impl</tt>.</li> +</ul> + +</div> +<div class="section" id="sprint-report-and-current-status"> +<h2>Sprint report and current status</h2> +<p>After this long preamble, here is a rough list of what we accomplished during +the week-long sprint and the days immediately after.</p> +<p>On the HPy side, we kicked off the code in the repo: at the moment of writing +the layout of the directories is a bit messy because we moved things around +several times, but we identified several main sections:</p> + +<ol class="arabic"> +<li><p class="first">A specification of the API which serves both as documentation and as an +input for parts of the projects which are automatically +generated. Currently, this lives in <a class="reference external" href="https://github.com/pyhandle/hpy/blob/9aa8a2738af3fd2eda69d4773b319d10a9a5373f/tools/public_api.h">public_api.h</a>.</p> +</li> +<li><p class="first">A set of header files which can be used to compile extension modules: +depending on whether the flag <tt class="docutils literal"><span class="pre">-DHPY_UNIVERSAL_ABI</span></tt> is passed to the +compiler, the extension can target the <a class="reference external" href="https://github.com/pyhandle/hpy/blob/9aa8a2738af3fd2eda69d4773b319d10a9a5373f/hpy-api/hpy_devel/include/cpython/hpy.h">HPy/CPython ABI</a> or the <a class="reference external" href="https://github.com/pyhandle/hpy/blob/9aa8a2738af3fd2eda69d4773b319d10a9a5373f/hpy-api/hpy_devel/include/universal/hpy.h">HPy +Universal ABI</a></p> +</li> +<li><p class="first">A <a class="reference external" href="https://github.com/pyhandle/hpy/tree/9aa8a2738af3fd2eda69d4773b319d10a9a5373f/cpython-universal/src">CPython extension 
module</a> called <tt class="docutils literal">hpy_universal</tt> which makes it +possible to import universal modules on CPython</p> +</li> +<li><p class="first">A set of <a class="reference external" href="https://github.com/pyhandle/hpy/tree/9aa8a2738af3fd2eda69d4773b319d10a9a5373f/test">tests</a> which are independent of the implementation and are meant +to be an "executable specification" of the semantics. Currently, these +tests are run against three different implementations of the HPy API:</p> + +<ul class="simple"> +<li>the headers which implement the "HPy/CPython ABI"</li> +<li>the <tt class="docutils literal">hpy_universal</tt> module for CPython</li> +<li>the <tt class="docutils literal">hpy_universal</tt> module for PyPy (these tests are run in the PyPy repo)</li> +</ul> + +</li> +</ol> + +<p>Moreover, we started a <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/hpy/pypy/module/hpy_universal/">PyPy branch</a> in which to implement the +<tt class="docutils literal">hpy_universal</tt> module: at the moment of writing PyPy can pass all the HPy +tests apart from the ones which allow conversion to and from <tt class="docutils literal">PyObject *</tt>. +Among other things, this means that it is already possible to load the +very same binary module in both CPython and PyPy, which is impressive on its +own :).</p> +<p>Finally, we wanted a real-life use case to show how to port a module to HPy +and to do benchmarks. 
After some searching, we chose <a class="reference external" href="https://github.com/esnme/ultrajson">ultrajson</a>, for the +following reasons:</p> + +<ul class="simple"> +<li>it is a real-world extension module which was written with performance in +mind</li> +<li>when parsing a JSON file it does a lot of calls to the Python API to +construct the various parts of the result message</li> +<li>it uses only a small subset of the Python API</li> +</ul> + +<p>This repo contains the <a class="reference external" href="https://github.com/pyhandle/ultrajson-hpy">HPy port of ultrajson</a>. This <a class="reference external" href="https://github.com/pyhandle/ultrajson-hpy/commit/efb35807afa8cf57db5df6a3dfd4b64c289fe907">commit</a> shows an example +of what the porting looks like.</p> +<p><tt class="docutils literal">ujson_hpy</tt> is also a very good example of incremental migration: so far +only <tt class="docutils literal">ujson.loads</tt> is implemented using the HPy API, while <tt class="docutils literal">ujson.dumps</tt> +is still implemented using the old C-API, and both can coexist nicely in the +same compiled module.</p> +</div> +<div class="section" id="benchmarks"> +<h2>Benchmarks</h2> +<p>Once we have a fully working <tt class="docutils literal">ujson_hpy</tt> module, we can finally run +benchmarks! We tested several different versions of the module:</p> + +<ul class="simple"> +<li><tt class="docutils literal">ujson</tt>: this is the vanilla implementation of ultrajson using the +C-API. On PyPy this is executed by the infamous <tt class="docutils literal">cpyext</tt> compatibility +layer, so we expect it to be much slower than on CPython</li> +<li><tt class="docutils literal">ujson_hpy</tt>: our HPy port compiled to target the HPy/CPython ABI. We +expect it to be as fast as <tt class="docutils literal">ujson</tt></li> +<li><tt class="docutils literal">ujson_hpy_universal</tt>: same as above but compiled to target the +Universal HPy ABI. 
We expect it to be slightly slower than <tt class="docutils literal">ujson</tt> on +CPython, and much faster on PyPy.</li> +</ul> + +<p>Finally, we also ran the benchmark using the builtin <tt class="docutils literal">json</tt> module. This is +not really relevant to HPy, but it might still be an interesting as a +reference data point.</p> +<p>The <a class="reference external" href="https://github.com/pyhandle/ultrajson-hpy/blob/hpy/benchmark/main.py">benchmark</a> is very simple and consists of parsing a <a class="reference external" href="https://github.com/pyhandle/ultrajson-hpy/blob/hpy/benchmark/download_data.sh">big JSON file</a> 100 +times. Here is the average time per iteration (in milliseconds) using the +various versions of the module, CPython 3.7 and the latest version of the hpy +PyPy branch:</p> +<table border="1" class="docutils"> +<colgroup> +<col width="55%"> +<col width="24%"> +<col width="21%"> +</colgroup> +<tbody valign="top"> +<tr><td> </td> +<td>CPython</td> +<td>PyPy</td> +</tr> +<tr><td>ujson</td> +<td>154.32</td> +<td>633.97</td> +</tr> +<tr><td>ujson_hpy</td> +<td>152.19</td> +<td> </td> +</tr> +<tr><td>ujson_hpy_universal</td> +<td>168.78</td> +<td>207.68</td> +</tr> +<tr><td>json</td> +<td>224.59</td> +<td>135.43</td> +</tr> +</tbody> +</table> +<p>As expected, the benchmark proves that when targeting the HPy/CPython ABI, HPy +doesn't impose any performance penalty on CPython. The universal version is +~10% slower on CPython, but gives an impressive 3x speedup on PyPy! 
It is +worth noting that the PyPy hpy module is not fully optimized yet, and we +expect to be able to reach the same performance as CPython for this particular +example (or even more, thanks to our better GC).</p> +<p>All in all, not a bad result for two weeks of intense hacking :)</p> +<p>It is also worth noting that PyPy's builtin <tt class="docutils literal">json</tt> module does <strong>really</strong> +well in this benchmark, thanks to the recent optimizations that were described +in an <a class="reference external" href="https://www.pypy.org/posts/2019/10/pypys-new-json-parser-492911724084305501.html">earlier blog post</a>.</p> +</div> +<div class="section" id="conclusion-and-future-directions"> +<h2>Conclusion and future directions</h2> +<p>We think we can be very satisfied about what we have got so far. The +development of HPy is quite new, but these early results seem to indicate that +we are on the right track to bring Python extensions into the future.</p> +<p>At the moment, we can anticipate some of the next steps in the development of +HPy:</p> + +<ul class="simple"> +<li>Think about a proper API design: what we have done so far has +been a "dumb" translation of the API we needed to run <tt class="docutils literal">ujson</tt>. However, +one of the declared goals of HPy is to improve the design of the API. There +will be a trade-off between the desire of having a clean, fresh new API +and the need to be not too different than the old one, to make porting +easier. 
Finding the sweet spot will not be easy!</li> +<li>Implement the "debug" mode, which will help developers to find +bugs such as leaking handles or using invalid handles.</li> +<li>Instruct Cython to emit HPy code on request.</li> +<li>Eventually, we will also want to try to port parts of <tt class="docutils literal">numpy</tt> to HPy to +finally solve the long-standing problem of sub-optimal <tt class="docutils literal">numpy</tt> +performance in PyPy.</li> +</ul> + +<p>Stay tuned!</p> + +</div>https://www.pypy.org/posts/2019/12/hpy-kick-off-sprint-report-1840829336092490938.htmlWed, 18 Dec 2019 13:38:00 GMTPyPy v7.0.0: triple release of 2.7, 3.5 and 3.6-alphahttps://www.pypy.org/posts/2019/02/pypy-v700-triple-release-of-27-35-and-606875333356156076.htmlAntonio Cuni<br> +<div class="document" id="pypy-v7-0-0-triple-release-of-2-7-3-5-and-3-6-alpha"> +The PyPy team is proud to release the version 7.0.0 of PyPy, which includes +three different interpreters:<br> +<blockquote> +<ul class="simple"> +<li>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7</li> +<li>PyPy3.5, which supports Python 3.5</li> +<li>PyPy3.6-alpha: this is the first official release of PyPy to support 3.6 +features, although it is still considered alpha quality.</li> +</ul> +</blockquote> +All the interpreters are based on much the same codebase, thus the triple +release.<br> +Until we can work with downstream providers to distribute builds with PyPy, we +have made packages for some common packages <a class="reference external" href="https://github.com/antocuni/pypy-wheels">available as wheels</a>.<br> +The <a class="reference external" href="https://doc.pypy.org/en/latest/gc_info.html#semi-manual-gc-management">GC hooks</a> , which can be used to gain more insights into its +performance, has been improved and it is now possible to manually manage the +GC by using a combination of <tt class="docutils literal">gc.disable</tt> and <tt class="docutils 
literal">gc.collect_step</tt>. See the +<a class="reference external" href="https://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.html">GC blog post</a>.<br> +We updated the <a class="reference external" href="https://cffi.readthedocs.io/">cffi</a> module included in PyPy to version 1.12, and the +<a class="reference external" href="https://cppyy.readthedocs.io/">cppyy</a> backend to 1.4. Please use these to wrap your C and C++ code, +respectively, for a JIT friendly experience.<br> +As always, this release is 100% compatible with the previous one and fixed +several issues and bugs raised by the growing community of PyPy users. +We strongly recommend updating.<br> +The PyPy3.6 release and the Windows PyPy3.5 release are still not production +quality so your mileage may vary. There are open issues with incomplete +compatibility and c-extension support.<br> +The utf8 branch that changes internal representation of unicode to utf8 did not +make it into the release, so there is still more goodness coming. +You can download the v7.0 releases here:<br> +<blockquote> +<a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></blockquote> +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.<br> +We would also like to thank our contributors and encourage new people to join +the project. 
PyPy has many layers and we need help with all of them: <a class="reference external" href="https://www.blogger.com/index.html">PyPy</a> +and <a class="reference external" href="https://rpython.readthedocs.org/">RPython</a> documentation improvements, tweaking popular modules to run +on pypy, or general <a class="reference external" href="https://www.blogger.com/project-ideas.html">help</a> with making RPython's JIT even better.<br> +<div class="section" id="what-is-pypy"> +<h1> +What is PyPy?</h1> +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.5 and 3.6. It's fast (<a class="reference external" href="https://speed.pypy.org/">PyPy and CPython 2.7.x</a> performance +comparison) due to its integrated tracing JIT compiler.<br> +We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.<br> +The PyPy release supports:<br> +<blockquote> +<ul class="simple"> +<li><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)</li> +<li>big- and little-endian variants of <strong>PPC64</strong> running Linux,</li> +<li><strong>s390x</strong> running Linux</li> +</ul> +</blockquote> +Unfortunately at the moment of writing our ARM buildbots are out of service, +so for now we are <strong>not</strong> releasing any binary for the ARM architecture.</div> +<div class="section" id="changelog"> +<h1> +What else is new?</h1> +PyPy 6.0 was released in April, 2018. 
+There are many incremental improvements to RPython and PyPy, the complete listing is <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.0.0.html">here</a>.<br> +<br> +Please update, and continue to help us make PyPy better.<br> +<br> +<br> +Cheers, The PyPy team +</div> +</div>releasehttps://www.pypy.org/posts/2019/02/pypy-v700-triple-release-of-27-35-and-606875333356156076.htmlMon, 11 Feb 2019 10:55:00 GMTPyPy for low-latency systemshttps://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.htmlAntonio Cuni<h1 class="title"> +PyPy for low-latency systems</h1> +Recently I have merged the gc-disable branch, introducing a couple of features +which are useful when you need to respond to certain events with the lowest +possible latency. This work has been kindly sponsored by <a class="reference external" href="https://www.gambitresearch.com/">Gambit Research</a> +(which, by the way, is a very cool and geeky place where to <a class="reference external" href="https://www.gambitresearch.com/jobs.html">work</a>, in case you +are interested). Note also that this is a very specialized use case, so these +features might not be useful for the average PyPy user, unless you have the +same problems as described here.<br> +<br> +The PyPy VM manages memory using a generational, moving Garbage Collector. +Periodically, the GC scans the whole heap to find unreachable objects and +frees the corresponding memory. Although at a first look this strategy might +sound expensive, in practice the total cost of memory management is far less +than e.g. on CPython, which is based on reference counting. While maybe +counter-intuitive, the main advantage of a non-refcount strategy is +that allocation is very fast (especially compared to malloc-based allocators), +and deallocation of objects which die young is basically for free. 
More +information about the PyPy GC is available <a class="reference external" href="https://pypy.readthedocs.io/en/latest/gc_info.html#incminimark">here</a>.<br> +<br> +As we said, the total cost of memory management is less on PyPy than on +CPython, and it's one of the reasons why PyPy is so fast. However, one big +disadvantage is that while on CPython the cost of memory management is spread +all over the execution of the program, on PyPy it is concentrated into GC +runs, causing observable pauses which interrupt the execution of the user +program.<br> +To avoid excessively long pauses, the PyPy GC has been using an <a class="reference external" href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">incremental +strategy</a> since 2013. The GC runs as a series of "steps", letting the user +program progress between each step.<br> +<br> +The following chart shows the behavior of a real-world, long-running process:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://3.bp.blogspot.com/-44yKwUVK3BE/XC4X9XL4BII/AAAAAAAABbE/XdTCIoyA-eYxvxIgJhFHaKnzxjhoWStHQCEwYBhgL/s1600/gc-timing.png" style="margin-right: 1em;"><img border="0" height="246" src="https://3.bp.blogspot.com/-44yKwUVK3BE/XC4X9XL4BII/AAAAAAAABbE/XdTCIoyA-eYxvxIgJhFHaKnzxjhoWStHQCEwYBhgL/s640/gc-timing.png" width="640"></a></div> +<br> +<br> +The orange line shows the total memory used by the program, which +increases linearly while the program progresses. 
Every ~5 minutes, the GC +kicks in and the memory usage drops from ~5.2GB to ~2.8GB (this ratio is controlled +by the <a class="reference external" href="https://pypy.readthedocs.io/en/latest/gc_info.html#environment-variables">PYPY_GC_MAJOR_COLLECT</a> env variable).<br> +The purple line shows aggregated data about the GC timing: the whole +collection takes ~1400 individual steps over the course of ~1 minute: each +point represents the <strong>maximum</strong> time a single step took during the past 10 +seconds. Most steps take ~10-20 ms, although we see a horrible peak of ~100 ms +towards the end. We have not investigated yet what it is caused by, but we +suspect it is related to the deallocation of raw objects.<br> +<br> +These multi-millisecond pauses are a problem for systems where it is important +to respond to certain events with a latency which is both low and consistent. +If the GC kicks in at the wrong time, it might cause unacceptable pauses during +the collection cycle.<br> +<br> +Let's look again at our real-world example. This is a system which +continuously monitors an external stream; when a certain event occurs, we want +to take an action. The following chart shows the maximum time it takes to +complete one of such actions, aggregated every minute:<br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-FO9uFHSqZzU/XC4YC8LZUpI/AAAAAAAABa8/B8ZOrEgbVJUHoO65wxvCMVpvciO_d_0TwCLcBGAs/s1600/normal-max.png" style="margin-right: 1em;"><img border="0" height="240" src="https://4.bp.blogspot.com/-FO9uFHSqZzU/XC4YC8LZUpI/AAAAAAAABa8/B8ZOrEgbVJUHoO65wxvCMVpvciO_d_0TwCLcBGAs/s640/normal-max.png" width="640"></a></div> +<br> +You can clearly see that the baseline response time is around ~20-30 +ms. However, we can also see periodic spikes around ~50-100 ms, with peaks up +to ~350-450 ms! 
After a bit of investigation, we concluded that most (although +not all) of the spikes were caused by the GC kicking in at the wrong time.<br> +<br> +The work I did in the <tt class="docutils literal"><span class="pre">gc-disable</span></tt> branch aims to fix this problem by +introducing <a class="reference external" href="https://pypy.readthedocs.io/en/latest/gc_info.html#semi-manual-gc-management">two new features</a> to the <tt class="docutils literal">gc</tt> module:<br> +<blockquote> +<ul class="simple"> +<li><tt class="docutils literal">gc.disable()</tt>, which previously only inhibited the execution of +finalizers without actually touching the GC, now disables the GC major +collections. After a call to it, you will see the memory usage grow +indefinitely.</li> +<li><tt class="docutils literal">gc.collect_step()</tt> is a new function which you can use to manually +execute a single incremental GC collection step.</li> +</ul> +</blockquote> +It is worth specifying that <tt class="docutils literal">gc.disable()</tt> disables <strong>only</strong> the major +collections, while minor collections still run. Moreover, thanks to the +JIT's virtuals, many objects with a short and predictable lifetime are not +allocated at all. The end result is that most objects with short lifetime are +still collected as usual, so the impact of <tt class="docutils literal">gc.disable()</tt> on memory growth +is not as bad as it could sound.<br> +<br> +Combining these two functions, it is possible to take control of the GC to +make sure it runs only when it is acceptable to do so. For an example of +usage, you can look at the implementation of a <a class="reference external" href="https://github.com/antocuni/pypytools/blob/master/pypytools/gc/custom.py">custom GC</a> inside <a class="reference external" href="https://pypi.org/project/pypytools/">pypytools</a>. 
+The peculiarity is that it also defines a "<tt class="docutils literal">with <span class="pre">nogc():"</span></tt> context manager +which you can use to mark performance-critical sections where the GC is not +allowed to run.<br> +<br> +The following chart compares the behavior of the default PyPy GC and the new +custom GC, after a careful placing of <tt class="docutils literal">nogc()</tt> sections:<br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-bGqs0WrOEBk/XC4YJN0uZfI/AAAAAAAABbA/4EXOASvy830IKBoTFtrnmY22Vyd_api-ACLcBGAs/s1600/nogc-max.png" style="margin-right: 1em;"><img border="0" height="242" src="https://1.bp.blogspot.com/-bGqs0WrOEBk/XC4YJN0uZfI/AAAAAAAABbA/4EXOASvy830IKBoTFtrnmY22Vyd_api-ACLcBGAs/s640/nogc-max.png" width="640"></a></div> +<br> +The yellow line is the same as before, while the purple line shows the new +system: almost all spikes have gone, and the baseline performance is about 10% +better. There is still one spike towards the end, but after some investigation +we concluded that it was <strong>not</strong> caused by the GC.<br> +<br> +Note that this does <strong>not</strong> mean that the whole program became magically +faster: we simply moved the GC pauses in some other place which is <strong>not</strong> +shown in the graph: in this specific use case this technique was useful +because it allowed us to shift the GC work in places where pauses are more +acceptable.<br> +<br> +All in all, a pretty big success, I think. 
These functionalities are already +available in the nightly builds of PyPy, and will be included in the next +release: take this as a New Year present :)<br> +<br> +Antonio Cuni and the PyPy teamgcsponsorshttps://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.htmlThu, 03 Jan 2019 14:21:00 GMTInside cpyext: Why emulating CPython C API is so Hardhttps://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.htmlAntonio Cuni<br> +<div class="document" id="inside-cpyext-why-emulating-cpython-c-api-is-so-hard"> +<tt class="docutils literal">cpyext</tt> is PyPy's subsystem which provides a compatibility +layer to compile and run CPython C extensions inside PyPy. Often people ask +why a particular C extension doesn't work or is very slow on PyPy. +Usually it is hard to answer without going into technical details. The goal of +this blog post is to explain some of these technical details, so that we can +simply link here instead of explaining again and again :).<br> +From a 10.000 foot view, <tt class="docutils literal">cpyext</tt> is PyPy's version of <tt class="docutils literal">"Python.h"</tt>. Every time +you compile an extension which uses that header file, you are using <tt class="docutils literal">cpyext</tt>. +This includes extension explicitly written in C (such as <tt class="docutils literal">numpy</tt>) and +extensions which are generated from other compilers/preprocessors +(e.g. <tt class="docutils literal">Cython</tt>).<br> +At the time of writing, the current status is that most C extensions "just +work". Generally speaking, you can simply <tt class="docutils literal">pip install</tt> them, +provided they use the public, <a class="reference external" href="https://docs.python.org/2/c-api/index.html">official C API</a> instead of poking at private +implementation details. However, the performance of cpyext is generally +poor. 
A Python program which makes heavy use of <tt class="docutils literal">cpyext</tt> extensions +is likely to be slower on PyPy than on CPython.<br> +Note: in this blog post we are talking about Python 2.7 because it is still +the default version of PyPy: however most of the implementation of <tt class="docutils literal">cpyext</tt> is +shared with PyPy3, so everything applies to that as well.<br> +<div class="section" id="c-api-overview"> +<h1> +C API Overview</h1> +In CPython, which is written in C, Python objects are represented as <tt class="docutils literal">PyObject*</tt>, +i.e. (mostly) opaque pointers to some common "base struct".<br> +CPython uses a very simple memory management scheme: when you create an +object, you allocate a block of memory of the appropriate size on the heap. +Depending on the details, you might end up calling different allocators, but +for the sake of simplicity, you can think that this ends up being a call to +<tt class="docutils literal">malloc()</tt>. The resulting block of memory is initialized and cast to +<tt class="docutils literal">PyObject*</tt>: this address never changes during the object lifetime, and the +C code can freely pass it around, store it inside containers, retrieve it +later, etc.<br> +Memory is managed using reference counting. When you create a new reference to +an object, or you discard a reference you own, you have to <a class="reference external" href="https://docs.python.org/2/c-api/refcounting.html#c.Py_INCREF">increment</a> or +<a class="reference external" href="https://docs.python.org/2/c-api/refcounting.html#c.Py_DECREF">decrement</a> the reference counter accordingly. When the reference counter goes to +0, it means that the object is no longer used and can safely be +destroyed. 
Again, we can simplify and say that this results in a call to +<tt class="docutils literal">free()</tt>, which finally releases the memory which was allocated by <tt class="docutils literal">malloc()</tt>.<br> +Generally speaking, the only way to operate on a <tt class="docutils literal">PyObject*</tt> is to call the +appropriate API functions. For example, to convert a given <tt class="docutils literal">PyObject*</tt> to a C +integer, you can use <a class="reference external" href="https://docs.python.org/2/c-api/int.html#c.PyInt_AsLong">PyInt_AsLong()</a>; to add two objects together, you can +call <a class="reference external" href="https://docs.python.org/2/c-api/number.html#c.PyNumber_Add">PyNumber_Add()</a>.<br> +Internally, PyPy uses a similar approach. All Python objects are subclasses of +the RPython <tt class="docutils literal">W_Root</tt> class, and they are operated by calling methods on the +<tt class="docutils literal">space</tt> singleton, which represents the interpreter.<br> +At first, it looks very easy to write a compatibility layer: just make +<tt class="docutils literal">PyObject*</tt> an alias for <tt class="docutils literal">W_Root</tt>, and write simple RPython functions +(which will be translated to C by the RPython compiler) which call the +<tt class="docutils literal">space</tt> accordingly:<br> +<pre class="code python literal-block"><span class="keyword">def</span> <span class="name function">PyInt_AsLong</span><span class="punctuation">(</span><span class="name">space</span><span class="punctuation">,</span> <span class="name">o</span><span class="punctuation">):</span> + <span class="keyword">return</span> <span class="name">space</span><span class="operator">.</span><span class="name">int_w</span><span class="punctuation">(</span><span class="name">o</span><span class="punctuation">)</span> + +<span class="keyword">def</span> <span class="name function">PyNumber_Add</span><span class="punctuation">(</span><span 
class="name">space</span><span class="punctuation">,</span> <span class="name">o1</span><span class="punctuation">,</span> <span class="name">o2</span><span class="punctuation">):</span> + <span class="keyword">return</span> <span class="name">space</span><span class="operator">.</span><span class="name">add</span><span class="punctuation">(</span><span class="name">o1</span><span class="punctuation">,</span> <span class="name">o2</span><span class="punctuation">)</span> +</pre> +Actually, the code above is not too far from the real +implementation. However, there are tons of gory details which make it much +harder than it looks, and much slower unless you pay a lot of attention +to performance.</div> +<div class="section" id="the-pypy-gc"> +<h1> +The PyPy GC</h1> +To understand some of <tt class="docutils literal">cpyext</tt> challenges, you need to have at least a rough +idea of how the PyPy GC works.<br> +Contrarily to the popular belief, the "Garbage Collector" is not only about +collecting garbage: instead, it is generally responsible for all memory +management, including allocation and deallocation.<br> +Whereas CPython uses a combination of malloc/free/refcounting to manage +memory, the PyPy GC uses a completely different approach. It is designed +assuming that a dynamic language like Python behaves the following way:<br> +<blockquote> +<ul class="simple"> +<li>You create, either directly or indirectly, lots of objects.</li> +<li>Most of these objects are temporary and very short-lived. Think e.g. 
of +doing <tt class="docutils literal">a + b + c</tt>: you need to allocate an object to hold the temporary +result of <tt class="docutils literal">a + b</tt>, then it dies very quickly because you no longer need it +when you do the final <tt class="docutils literal">+ c</tt> part.</li> +<li>Only a small fraction of the objects survive and stay around for a while.</li> +</ul> +</blockquote> +So, the strategy is: make allocation as fast as possible; make deallocation of +short-lived objects as fast as possible; find a way to handle the remaining +small set of objects which actually survive long enough to be important.<br> +This is done using a <strong>Generational GC</strong>: the basic idea is the following:<br> +<blockquote> +<ol class="arabic simple"> +<li>We have a nursery, where we allocate "young objects" very quickly.</li> +<li>When the nursery is full, we start what we call a "minor collection".<ul> +<li>We do a quick scan to determine the small set of objects which survived so +far</li> +<li>We <strong>move</strong> these objects out of the nursery, and we place them in the +area of memory which contains the "old objects". Since the address of the +objects changes, we fix all the references to them accordingly.</li> +</ul> +</li> +</ol> +<ol class="arabic simple" start="3"> +<li>now the nursery contains only objects which "died young". We can +discard all of them very quickly, reset the nursery, and use the same area +of memory to allocate new objects from now on.</li> +</ol> +</blockquote> +In practice, this scheme works very well and it is one of the reasons why PyPy +is much faster than CPython. However, careful readers have surely noticed +that this is a problem for <tt class="docutils literal">cpyext</tt>. On one hand, we have PyPy objects which +can potentially move and change their underlying memory address; on the other +hand, we need a way to represent them as fixed-address <tt class="docutils literal">PyObject*</tt> when we +pass them to C extensions. 
We surely need a way to handle that.</div> +<div class="section" id="pyobject-in-pypy"> +<h1> +<tt class="docutils literal">PyObject*</tt> in PyPy</h1> +Another challenge is that sometimes, <tt class="docutils literal">PyObject*</tt> structs are not completely +opaque: there are parts of the public API which expose to the user specific +fields of some concrete C struct. For example the definition of <a class="reference external" href="https://docs.python.org/2/c-api/typeobj.html">PyTypeObject</a> +which exposes many of the <tt class="docutils literal">tp_*</tt> slots to the user. +Since the low-level layout of PyPy <tt class="docutils literal">W_Root</tt> objects is completely different +than the one used by CPython, we cannot simply pass RPython objects to C; we +need a way to handle the difference.<br> +So, we have two issues so far: objects can move, and incompatible +low-level layouts. <tt class="docutils literal">cpyext</tt> solves both by decoupling the RPython and the C +representations. We have two "views" of the same entity, depending on whether +we are in the PyPy world (the movable <tt class="docutils literal">W_Root</tt> subclass) or in the C world +(the non-movable <tt class="docutils literal">PyObject*</tt>).<br> +<tt class="docutils literal">PyObject*</tt> are created lazily, only when they are actually needed. The +vast majority of PyPy objects are never passed to any C extension, so we don't +pay any penalty in that case. However, the first time we pass a <tt class="docutils literal">W_Root</tt> to +C, we allocate and initialize its <tt class="docutils literal">PyObject*</tt> counterpart.<br> +The same idea applies also to objects which are created in C, e.g. by calling +<a class="reference external" href="https://docs.python.org/2/c-api/allocation.html#c.PyObject_New">PyObject_New()</a>. At first, only the <tt class="docutils literal">PyObject*</tt> exists and it is +exclusively managed by reference counting. 
As soon as we pass it to the PyPy +world (e.g. as a return value of a function call), we create its <tt class="docutils literal">W_Root</tt> +counterpart, which is managed by the GC as usual.<br> +Here we start to see why calling cpyext modules is more costly in PyPy than in +CPython. We need to pay some penalty for all the conversions between +<tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt>.<br> +Moreover, the first time we pass a <tt class="docutils literal">W_Root</tt> to C we also need to allocate +the memory for the <tt class="docutils literal">PyObject*</tt> using a slowish "CPython-style" memory +allocator. In practice, for all the objects which are passed to C we pay more +or less the same costs as CPython, thus effectively "undoing" the speedup +guaranteed by PyPy's Generational GC under normal circumstances.</div> +<div class="section" id="maintaining-the-link-between-w-root-and-pyobject"> +<h1> +Maintaining the link between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt></h1> +We now need a way to convert between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt> and +vice-versa; also, we need to ensure that the lifetimes of the two entities +are in sync. 
In particular:<br> +<blockquote> +<ol class="arabic simple"> +<li>as long as the <tt class="docutils literal">W_Root</tt> is kept alive by the GC, we want the +<tt class="docutils literal">PyObject*</tt> to live even if its refcount drops to 0;</li> +<li>as long as the <tt class="docutils literal">PyObject*</tt> has a refcount greater than 0, we want to +make sure that the GC does not collect the <tt class="docutils literal">W_Root</tt>.</li> +</ol> +</blockquote> +The <tt class="docutils literal">PyObject*</tt> ⇨ <tt class="docutils literal">W_Root</tt> link is maintained by the special field +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/parse/cpyext_object.h#lines-5">ob_pypy_link</a> which is added to all <tt class="docutils literal">PyObject*</tt>. On a 64 bit machine this +means that all <tt class="docutils literal">PyObject*</tt> have 8 bytes of overhead, but then the +conversion is very quick, just reading the field.<br> +For the other direction, we generally don't want to do the same: the +assumption is that the vast majority of <tt class="docutils literal">W_Root</tt> objects will never be +passed to C, and adding an overhead of 8 bytes to all of them is a +waste. Instead, in the general case the link is maintained by using a +dictionary, where <tt class="docutils literal">W_Root</tt> are the keys and <tt class="docutils literal">PyObject*</tt> the values.<br> +However, for a <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/pyobject.py#lines-66">few selected</a> <tt class="docutils literal">W_Root</tt> subclasses we <strong>do</strong> maintain a +direct link using the special <tt class="docutils literal">_cpy_ref</tt> field to improve performance. 
In +particular, we use it for <tt class="docutils literal">W_TypeObject</tt> (which is big anyway, so a 8 bytes +overhead is negligible) and <tt class="docutils literal">W_NoneObject</tt>. <tt class="docutils literal">None</tt> is passed around very +often, so we want to ensure that the conversion to <tt class="docutils literal">PyObject*</tt> is very +fast. Moreover it's a singleton, so the 8 bytes overhead is negligible as +well.<br> +This means that in theory, passing an arbitrary Python object to C is +potentially costly, because it involves doing a dictionary lookup. We assume +that this cost will eventually show up in the profiler: however, at the time +of writing there are other parts of <tt class="docutils literal">cpyext</tt> which are even more costly (as we +will show later), so the cost of the dict lookup is never evident in the +profiler.</div> +<div class="section" id="crossing-the-border-between-rpython-and-c"> +<h1> +Crossing the border between RPython and C</h1> +There are two other things we need to care about whenever we cross the border +between RPython and C, and vice-versa: exception handling and the GIL.<br> +In the C API, exceptions are raised by calling <a class="reference external" href="https://docs.python.org/2/c-api/exceptions.html#c.PyErr_SetString">PyErr_SetString()</a> (or one of +<a class="reference external" href="https://docs.python.org/2/c-api/exceptions.html#exception-handling">many other functions</a> which have a similar effect), which basically works by +creating an exception value and storing it in some global variable. 
The +function then signals that an exception has occurred by returning an error value, +usually <tt class="docutils literal">NULL</tt>.<br> +On the other hand, in the PyPy interpreter, exceptions are propagated by raising the +RPython-level <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/interpreter/error.py#lines-20">OperationError</a> exception, which wraps the actual app-level +exception values. To harmonize the two worlds, whenever we return from C to +RPython, we need to check whether a C API exception was raised and if so turn it +into an <tt class="docutils literal">OperationError</tt>.<br> +We won't dig into details of <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-205">how the GIL is handled in cpyext</a>. +For the purpose of this post, it is enough to know that whenever we enter +C land, we store the current thread id into a global variable which is +accessible also from C; conversely, whenever we go back from C to RPython, we +restore this value to 0.<br> +Similarly, we need to do the inverse operations whenever you need to cross the +border between C and RPython, e.g. by calling a Python callback from C code.<br> +All this complexity is automatically handled by the RPython function +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-1757">generic_cpy_call</a>. 
If you look at the code you see that it takes care of 4 +things:<br> +<blockquote> +<ol class="arabic simple"> +<li>Handling the GIL as explained above.</li> +<li>Handling exceptions, if they are raised.</li> +<li>Converting arguments from <tt class="docutils literal">W_Root</tt> to <tt class="docutils literal">PyObject*</tt>.</li> +<li>Converting the return value from <tt class="docutils literal">PyObject*</tt> to <tt class="docutils literal">W_Root</tt>.</li> +</ol> +</blockquote> +So, we can see that calling C from RPython introduces some overhead. +Can we measure it?<br> +Assuming that the conversion between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt> has a +reasonable cost (as explained by the previous section), the overhead +introduced by a single border-cross is still acceptable, especially if the +callee is doing some non-negligible amount of work.<br> +However this is not always the case. There are basically three problems that +make (or used to make) <tt class="docutils literal">cpyext</tt> super slow:<br> +<blockquote> +<ol class="arabic simple"> +<li>Paying the border-crossing cost for trivial operations which are called +very often, such as <tt class="docutils literal">Py_INCREF</tt>.</li> +<li>Crossing the border back and forth many times, even if it's not strictly +needed.</li> +<li>Paying an excessive cost for argument and return value conversions.</li> +</ol> +</blockquote> +The next sections explain in more detail each of these problems.</div> +<div class="section" id="avoiding-unnecessary-roundtrips"> +<h1> +Avoiding unnecessary roundtrips</h1> +Prior to the <a class="reference external" href="https://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html">2017 Cape Town Sprint</a>, <tt class="docutils literal">cpyext</tt> was horribly slow, and we were +well aware of it: the main reason was that we never really paid too much +attention to performance. 
As explained in the blog post, emulating all the +CPython quirks is basically a nightmare, so better to concentrate on +correctness first.<br> +However, we didn't really know <strong>why</strong> it was so slow. We had theories and +assumptions, usually pointing at the cost of conversions between <tt class="docutils literal">W_Root</tt> +and <tt class="docutils literal">PyObject*</tt>, but we never actually measured it.<br> +So, we decided to write a set of <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">cpyext microbenchmarks</a> to measure the +performance of various operations. The result was somewhat surprising: the +theory suggests that when you do a cpyext C call, you should pay the +border-crossing costs only once, but what the profiler told us was that we +were paying the cost of <tt class="docutils literal">generic_cpy_call</tt> several times more than what we expected.<br> +After a bit of investigation, we discovered this was ultimately caused by our +"correctness-first" approach. For simplicity of development and testing, when +we started <tt class="docutils literal">cpyext</tt> we wrote everything in RPython: thus, every single API call +made from C (like the omnipresent <a class="reference external" href="https://docs.python.org/2/c-api/arg.html#c.PyArg_ParseTuple">PyArg_ParseTuple()</a>, <a class="reference external" href="https://docs.python.org/2/c-api/int.html#c.PyInt_AsLong">PyInt_AsLong()</a>, etc.) +had to cross back the C-to-RPython border. This was especially daunting for +very simple and frequent operations like <tt class="docutils literal">Py_INCREF</tt> and <tt class="docutils literal">Py_DECREF</tt>, +which CPython implements as a single assembly instruction!<br> +Another source of slow down was the implementation of <tt class="docutils literal">PyTypeObject</tt> slots. +At the C level, these are function pointers which the interpreter calls to do +certain operations, e.g. 
<a class="reference external" href="https://docs.python.org/2/c-api/typeobj.html#c.PyTypeObject.tp_new">tp_new</a> to allocate a new instance of that type.<br> +As usual, we have some magic to implement slots in RPython; in particular, +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-362">_make_wrapper</a> does the opposite of <tt class="docutils literal">generic_cpy_call</tt>: it takes a +RPython function and wraps it into a C function which can be safely called +from C, handling the GIL, exceptions and argument conversions automatically.<br> +This was very handy during the development of cpyext, but it might result in +some bad nonsense; consider what happens when you call the following C +function:<br> +<pre class="code C literal-block"><span class="keyword">static</span> <span class="name">PyObject</span><span class="operator">*</span> <span class="name function">foo</span><span class="punctuation">(</span><span class="name">PyObject</span><span class="operator">*</span> <span class="name">self</span><span class="punctuation">,</span> <span class="name">PyObject</span><span class="operator">*</span> <span class="name">args</span><span class="punctuation">)</span> +<span class="punctuation">{</span> + <span class="name">PyObject</span><span class="operator">*</span> <span class="name">result</span> <span class="operator">=</span> <span class="name">PyInt_FromLong</span><span class="punctuation">(</span><span class="literal number integer">1234</span><span class="punctuation">);</span> + <span class="keyword">return</span> <span class="name">result</span><span class="punctuation">;</span> +<span class="punctuation">}</span> +</pre> +<ol class="arabic simple"> +<li>you are in RPython and do a cpyext call to <tt class="docutils literal">foo</tt>: <strong>RPython-to-C</strong>;</li> +<li><tt class="docutils literal">foo</tt> calls <tt class="docutils literal">PyInt_FromLong(1234)</tt>, which 
is implemented in RPython: +<strong>C-to-RPython</strong>;</li> +<li>the implementation of <tt class="docutils literal">PyInt_FromLong</tt> indirectly calls +<tt class="docutils literal">PyIntType.tp_new</tt>, which is a C function pointer: <strong>RPython-to-C</strong>;</li> +<li>however, <tt class="docutils literal">tp_new</tt> is just a wrapper around an RPython function, created +by <tt class="docutils literal">_make_wrapper</tt>: <strong>C-to-RPython</strong>;</li> +<li>finally, we create our RPython <tt class="docutils literal">W_IntObject(1234)</tt>; at some point +during the <strong>RPython-to-C</strong> crossing, its <tt class="docutils literal">PyObject*</tt> equivalent is +created;</li> +<li>after many layers of wrappers, we are again in <tt class="docutils literal">foo</tt>: after we do +<tt class="docutils literal">return result</tt>, during the <strong>C-to-RPython</strong> step we convert it from +<tt class="docutils literal">PyObject*</tt> to <tt class="docutils literal">W_IntObject(1234)</tt>.</li> +</ol> +Phew! After we realized this, it was not so surprising that <tt class="docutils literal">cpyext</tt> was very +slow :). And this was a simplified example, since we are not passing a +<tt class="docutils literal">PyObject*</tt> to the API call. When we do, we need to convert it back and +forth at every step. Actually, I am not even sure that what I described was +the exact sequence of steps which used to happen, but you get the general +idea.<br> +The solution is simple: rewrite as much as we can in C instead of RPython, +to avoid unnecessary roundtrips. 
This was the topic of most of the Cape Town +sprint and resulted in the <tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch, which was +eventually <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/cpyext_avoid-roundtrip">merged</a>.<br> +Of course, it is not possible to move <strong>everything</strong> to C: there are still +operations which need to be implemented in RPython. For example, think of +<tt class="docutils literal">PyList_Append</tt>: the logic to append an item to a list is complex and +involves list strategies, so we cannot replicate it in C. However, we +discovered that a large subset of the C API can benefit from this.<br> +Moreover, the C API is <strong>huge</strong>. While we invented this new way of writing +<tt class="docutils literal">cpyext</tt> code, we still need to +convert many of the functions to the new paradigm. Sometimes the rewrite is +not automatic +or straightforward. <tt class="docutils literal">cpyext</tt> is a delicate piece of software, so it happens often +that we make a mistake and end up staring at a segfault in gdb.<br> +However, the most important takeaway is that the performance improvements we got +from this optimization are impressive, as we will detail later.</div> +<div class="section" id="conversion-costs"> +<h1> +Conversion costs</h1> +The other potential big source of slowdown is the conversion of arguments +between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt>.<br> +As explained earlier, the first time you pass a <tt class="docutils literal">W_Root</tt> to C, you need to +allocate its <tt class="docutils literal">PyObject*</tt> counterpart. 
Suppose you have a <tt class="docutils literal">foo</tt> function +defined in C, which takes a single int argument:<br> +<pre class="code python literal-block"><span class="keyword">for</span> <span class="name">i</span> <span class="operator word">in</span> <span class="name builtin">range</span><span class="punctuation">(</span><span class="name">N</span><span class="punctuation">):</span> + <span class="name">foo</span><span class="punctuation">(</span><span class="name">i</span><span class="punctuation">)</span> +</pre> +To run this code, you need to create a different <tt class="docutils literal">PyObject*</tt> for each value +of <tt class="docutils literal">i</tt>: if implemented naively, it means calling <tt class="docutils literal">N</tt> times <tt class="docutils literal">malloc()</tt> +and <tt class="docutils literal">free()</tt>, which kills performance.<br> +CPython has the very same problem, which is solved by using a <a class="reference external" href="https://en.wikipedia.org/wiki/Free_list">free list</a> to +<a class="reference external" href="https://github.com/python/cpython/blob/2.7/Objects/intobject.c#L16">allocate ints</a>. So, what we did was to simply <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/commit/d8754ab9ba6371c83eaeb80cdf8cc13a37ee0c89">steal the code</a> from CPython +and do the exact same thing. This was also done in the +<tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch, and the benchmarks show that it worked +perfectly.<br> +Every type which is converted often to <tt class="docutils literal">PyObject*</tt> must have a very fast +allocator. 
At the moment of writing, PyPy uses free lists only for ints and +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/commit/35e2fb9903f2483940d7970bd83ce8c65aa1c1a3">tuples</a>: one of the next steps on our TODO list is certainly to use this +technique with more types, like <tt class="docutils literal">float</tt>.<br> +Conversely, we also need to optimize the conversion from <tt class="docutils literal">PyObject*</tt> to +<tt class="docutils literal">W_Root</tt>: this happens when an object is originally allocated in C and +returned to Python. Consider for example the following code:<br> +<pre class="code python literal-block"><span class="keyword namespace">import</span> <span class="name namespace">numpy</span> <span class="keyword namespace">as</span> <span class="name namespace">np</span> +<span class="name">myarray</span> <span class="operator">=</span> <span class="name">np</span><span class="operator">.</span><span class="name">random</span><span class="operator">.</span><span class="name">random</span><span class="punctuation">(</span><span class="name">N</span><span class="punctuation">)</span> +<span class="keyword">for</span> <span class="name">i</span> <span class="operator word">in</span> <span class="name builtin">range</span><span class="punctuation">(</span><span class="name builtin">len</span><span class="punctuation">(</span><span class="name">myarray</span><span class="punctuation">)):</span> + <span class="name">myarray</span><span class="punctuation">[</span><span class="name">i</span><span class="punctuation">]</span> +</pre> +At every iteration, we get an item out of the array: the return type is an +instance of <tt class="docutils literal">numpy.float64</tt> (a numpy scalar), i.e. a <tt class="docutils literal">PyObject*</tt>: this is +something which is implemented by numpy entirely in C, so completely +opaque to <tt class="docutils literal">cpyext</tt>. 
We don't have any control over how it is allocated, +managed, etc., and we can assume that allocation costs are the same as on +CPython.<br> +As soon as we return these <tt class="docutils literal">PyObject*</tt> to Python, we need to allocate +their <tt class="docutils literal">W_Root</tt> equivalent. If you do it in a small loop like in the example +above, you end up allocating all these <tt class="docutils literal">W_Root</tt> inside the nursery, which is +a good thing since allocation is super fast (see the section above about the +PyPy GC).<br> +However, we also need to keep track of the <tt class="docutils literal">W_Root</tt> to <tt class="docutils literal">PyObject*</tt> link. +Currently, we do this by putting all of them in a dictionary, but it is very +inefficient, especially because most of these objects die young and thus it +is wasted work to do that for them. Currently, this is one of the biggest +unresolved problems in <tt class="docutils literal">cpyext</tt>, and it is what causes the two microbenchmarks +<tt class="docutils literal">allocate_int</tt> and <tt class="docutils literal">allocate_tuple</tt> to be very slow.<br> +We are well aware of the problem, and we have a plan for how to fix it. The +explanation is too technical for the scope of this blog post as it requires a +deep knowledge of the GC internals to be understood, but the details are +<a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/planning/cpyext.txt#L27">here</a>.</div> +<div class="section" id="c-api-quirks"> +<h1> +C API quirks</h1> +Finally, there is another source of slowdown which is beyond our control. Some +parts of the CPython C API are badly designed and expose some of the +implementation details of CPython.<br> +The major example is reference counting. 
The <tt class="docutils literal">Py_INCREF</tt> / <tt class="docutils literal">Py_DECREF</tt> API +is designed in such a way that it forces other implementations to emulate +refcounting even in the presence of other GC management schemes, as explained +above.<br> +Another example is borrowed references. There are API functions which <strong>do +not</strong> incref an object before returning it, e.g. <a class="reference external" href="https://docs.python.org/2/c-api/list.html#c.PyList_GetItem">PyList_GetItem()</a>. This is +done for performance reasons because we can avoid a whole incref/decref pair, +if the caller needs to handle the returned item only temporarily: the item is +kept alive because it is in the list anyway.<br> +For PyPy, this is a challenge: thanks to <a class="reference external" href="https://www.pypy.org/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html">list strategies</a>, lists are often +represented in a compact way. For example, a list containing only integers is +stored as a C array of <tt class="docutils literal">long</tt>. How to implement <tt class="docutils literal">PyList_GetItem</tt>? We +cannot simply create a <tt class="docutils literal">PyObject*</tt> on the fly, because the caller will never +decref it and it will result in a memory leak.<br> +The current solution is very inefficient. The first time we do a +<tt class="docutils literal">PyList_GetItem</tt>, we <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/listobject.py#lines-28">convert</a> the <strong>whole</strong> list to a list of +<tt class="docutils literal">PyObject*</tt>. This is bad in two ways: the first is that we potentially pay a +lot of unneeded conversion cost in case we will never access the other items +of the list. 
The second is that by doing that we lose all the performance +benefit granted by the original list strategy, making it slower for the +rest of the pure-python code which will manipulate the list later.<br> +<tt class="docutils literal">PyList_GetItem</tt> is an example of a bad API because it assumes that the list +is implemented as an array of <tt class="docutils literal">PyObject*</tt>: after all, in order to return a +borrowed reference, we need a reference to borrow, don't we?<br> +Fortunately, (some) CPython developers are aware of these problems, and there +is an ongoing project to <a class="reference external" href="https://pythoncapi.readthedocs.io/">design a better C API</a> which aims to fix exactly +this kind of problem.<br> +Nonetheless, in the meantime we still need to implement the current +half-broken APIs. There is no easy solution for that, and it is likely that +we will always need to pay some performance penalty in order to implement them +correctly.<br> +However, what we could potentially do is to provide alternative functions +which do the same job but are more PyPy friendly: for example, we could think +of implementing <tt class="docutils literal">PyList_GetItemNonBorrowed</tt> or something like that: then, C +extensions could choose to use it (possibly hidden inside some macro and +<tt class="docutils literal">#ifdef</tt>) if they want to be fast on PyPy.</div> +<div class="section" id="current-performance"> +<h1> +Current performance</h1> +During the whole blog post we claimed <tt class="docutils literal">cpyext</tt> is slow. How +slow is it, exactly?<br> +We decided to concentrate on <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> for now. 
It should be evident +by now there are simply too many issues which can slow down a <tt class="docutils literal">cpyext</tt> +program, and microbenchmarks help us to concentrate on one (or a few) at a +time.<br> +The microbenchmarks measure very simple things, like calling functions and +methods with the various calling conventions (no arguments, one argument, +multiple arguments); passing various types as arguments (to measure conversion +costs); allocating objects from C, and so on.<br> +Here are the results from the old PyPy 5.8 relative and normalized to CPython +2.7, the lower the better:<br> +<br> + + +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-5QV9jBfeXfo/W6UOCRA9YqI/AAAAAAAABX4/H2zgbv_XFQEHD4Lb2lj5Ve4Ob_YMuSXLwCLcBGAs/s1600/pypy58.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="480" src="https://4.bp.blogspot.com/-5QV9jBfeXfo/W6UOCRA9YqI/AAAAAAAABX4/H2zgbv_XFQEHD4Lb2lj5Ve4Ob_YMuSXLwCLcBGAs/s640/pypy58.png" width="640"></a></div> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://www.blogger.com/blogger.g?blogID=3971202189709462152" style="margin-left: 1em; margin-right: 1em;"></a></div> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://www.blogger.com/blogger.g?blogID=3971202189709462152" style="margin-left: 1em; margin-right: 1em;"></a></div> +<br> +PyPy was horribly slow everywhere, ranging from 2.5x to 10x slower. 
It is +particularly interesting to compare <tt class="docutils literal">simple.noargs</tt>, which measures the cost +of calling an empty function with no arguments, and <tt class="docutils literal">simple.onearg(i)</tt>, +which measures the cost of calling an empty function passing an integer argument: +the latter is ~2x slower than the former, indicating that the conversion cost +of integers is huge.<br> +PyPy 5.8 was the last release before the famous Cape Town sprint, when we +started to look at cpyext performance seriously. Here are the performance data for +PyPy 6.0, the latest release at the time of writing:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-MRkRoxtCeOE/W6UOL5txl1I/AAAAAAAABX8/i0ZiOyS2MOgiSyxFAyMOkKcB6xqjSihBACLcBGAs/s1600/pypy60.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="480" src="https://1.bp.blogspot.com/-MRkRoxtCeOE/W6UOL5txl1I/AAAAAAAABX8/i0ZiOyS2MOgiSyxFAyMOkKcB6xqjSihBACLcBGAs/s640/pypy60.png" width="640"></a></div> +<br> +<br> +The results are amazing! PyPy is now massively faster than before, and for +most benchmarks it is even faster than CPython: yes, you read it correctly: +PyPy is faster than CPython at doing CPython's job, even considering all the +extra work it has to do to emulate the C API. 
This happens thanks to the JIT, +which produces speedups high enough to counterbalance the slowdown caused by +cpyext.<br> +There are two microbenchmarks which are still slower though: <tt class="docutils literal">allocate_int</tt> +and <tt class="docutils literal">allocate_tuple</tt>, for the reasons explained in the section about +<a class="reference internal" href="https://www.blogger.com/blogger.g?blogID=3971202189709462152#conversion-costs">Conversion costs</a>.</div> +<div class="section" id="next-steps"> +<h1> +Next steps</h1> +Despite the spectacular results we got so far, <tt class="docutils literal">cpyext</tt> is still slow enough to +kill performance in most real-world code which uses C extensions extensively +(e.g., the omnipresent numpy).<br> +Our current approach is something along these lines:<br> +<blockquote> +<ol class="arabic simple"> +<li>run a real-world small benchmark which exercises cpyext</li> +<li>measure and find the major bottleneck</li> +<li>write a corresponding microbenchmark</li> +<li>optimize it</li> +<li>repeat</li> +</ol> +</blockquote> +On one hand, this is a daunting task because the C API is huge and we need to +tackle functions one by one. On the other hand, not all the functions are +equally important, and it is enough to optimize a relatively small subset to +improve many different use cases.<br> +Where a year ago we announced we have a working answer to run C extensions in +PyPy, we now have a clear picture of what the performance bottlenecks are, and +we have developed some technical solutions to fix them. It is "only" a matter +of tackling them, one by one. It is worth noting that most of the work was +done during two sprints, for a total of 2-3 person-months of work.<br> +We think this work is important for the Python ecosystem. PyPy has established +a baseline for performance in pure python code, providing an answer for the +"Python is slow" detractors. 
The techniques used to make <tt class="docutils literal">cpyext</tt> performant +will let PyPy become an alternative for people who mix C extensions with +Python, which, it turns out, is just about everyone, in particular those using +the various scientific libraries. Today, many developers are forced to seek +performance by converting code from Python to a lower language. We feel there +is no reason to do this, but in order to prove it we must be able to run both +their python and their C extensions performantly, then we can begin to educate +them how to write JIT-friendly code in the first place.<br> +We envision a future in which you can run arbitrary Python programs on PyPy, +with the JIT speeding up the pure Python parts and the C parts running as fast +as today: the best of both worlds!</div> +</div>cpyextprofilingspeedhttps://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.htmlFri, 21 Sep 2018 16:32:00 GMTHow to ignore the annoying Cython warnings in PyPy 6.0https://www.pypy.org/posts/2018/04/how-to-ignore-annoying-cython-warnings-1007636731207810779.htmlAntonio Cuni<div> +</div> +<div> +<br class="Apple-interchange-newline"> +If you install any Cython-based module in PyPy 6.0.0, it is very likely that you get a warning like this:</div> +<pre><code>&gt;&gt;&gt;&gt; import numpy +/data/extra/pypy/6.0.0/site-packages/numpy/random/__init__.py:99: UserWarning: __builtin__.type size changed, may indicate binary incompatibility. 
Expected 888, got 408 + from .mtrand import * +</code></pre> +<div> +The TL;DR version is: the warning is a false alarm, and you can hide it by doing:</div> +<pre><code>$ pypy -m pip install pypy-fix-cython-warning +</code></pre> +<div> +The package does not contain any module, only a <code>.pth</code> file which installs a warning filter at startup.</div> +<h2> +Technical details</h2> +<div> +This happens because whenever Cython compiles a pyx file, it generates C code which does a sanity check on the C size of <code>PyType_Type</code>. PyPy versions up to 5.10 are buggy and report the incorrect size, so Cython includes a workaround to compare it with the incorrect value, when on PyPy.</div> +<div> +PyPy 6 fixed the bug and now <code>PyType_Type</code> reports the correct size; however, Cython still tries to compare it with the old, buggy value, so it (wrongly) emits the warning.</div> +<div> +Cython 0.28.2 includes a fix for it, so that C files generated by it no longer emit the warning. However, most packages are distributed with pre-cythonized C files. For example, <code>numpy-1.14.2.zip</code> includes C files which were generated by Cython 0.26.1: if you compile it you still get the warning, even if you locally installed a newer version of Cython.<br> +<span style="color: #24292e;"><br></span> +<span style="color: #24292e;">There is not much that we can do on the PyPy side, apart from waiting for all the Cython-based packages to do a new release which includes C files generated by a newer Cython.  
In the meantime, installing this module will silence the </span><span style="color: #24292e;">warning.</span></div> +<div> +<div style="color: #24292e; font-size: 16px;"> +<br></div> +</div>https://www.pypy.org/posts/2018/04/how-to-ignore-annoying-cython-warnings-1007636731207810779.htmlFri, 27 Apr 2018 14:10:00 GMTHow to make your code 80 times fasterhttps://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.htmlAntonio Cuni<div class="document" id="how-to-make-your-code-80-times-faster"> +I often hear people who are happy because PyPy makes their code 2 times faster +or so. Here is a short personal story which shows PyPy can go well beyond +that.<br> +<br> +<strong>DISCLAIMER</strong>: this is not a silver bullet or a general recipe: it worked in +this particular case, it might not work so well in other cases. But I think it +is still an interesting technique. Moreover, the various steps and +implementations are shown in the same order as I tried them during the +development, so it is a real-life example of how to proceed when optimizing +for PyPy.<br> +<br> +Some months ago I <a class="reference external" href="https://github.com/antocuni/evolvingcopter">played a bit</a> with evolutionary algorithms: the ambitious +plan was to automatically evolve a logic which could control a (simulated) +quadcopter, i.e. a <a class="reference external" href="https://en.wikipedia.org/wiki/PID_controller">PID controller</a> (<strong>spoiler</strong>: it doesn't fly).<br> +<br> +The idea is to have an initial population of random creatures: at each +generation, the ones with the best fitness survive and reproduce with small, +random variations.<br> +<br> +However, for the scope of this post, the actual task at hand is not so +important, so let's jump straight to the code. 
To drive the quadcopter, a +<tt class="docutils literal">Creature</tt> has a <tt class="docutils literal">run_step</tt> method which runs at each <tt class="docutils literal">delta_t</tt> (<a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/creature.py">full +code</a>):<br> +<pre class="code python literal-block"><span class="keyword">class</span> <span class="name class">Creature</span><span class="punctuation">(</span><span class="name builtin">object</span><span class="punctuation">):</span> + <span class="name">INPUTS</span> <span class="operator">=</span> <span class="literal number integer">2</span> <span class="comment single"># z_setpoint, current z position</span> + <span class="name">OUTPUTS</span> <span class="operator">=</span> <span class="literal number integer">1</span> <span class="comment single"># PWM for all 4 motors</span> + <span class="name">STATE_VARS</span> <span class="operator">=</span> <span class="literal number integer">1</span> + <span class="operator">...</span> + + <span class="keyword">def</span> <span class="name function">run_step</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="name">inputs</span><span class="punctuation">):</span> + <span class="comment single"># state: [state_vars ... inputs]</span> + <span class="comment single"># out_values: [state_vars, ... 
outputs]</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">[</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">:]</span> <span class="operator">=</span> <span class="name">inputs</span> + <span class="name">out_values</span> <span class="operator">=</span> <span class="name">np</span><span class="operator">.</span><span class="name">dot</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="punctuation">,</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">)</span> <span class="operator">+</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">constant</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">[:</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">]</span> <span class="operator">=</span> <span class="name">out_values</span><span class="punctuation">[:</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">]</span> + <span class="name">outputs</span> <span class="operator">=</span> <span class="name">out_values</span><span class="punctuation">[</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">:]</span> + <span class="keyword">return</span> <span class="name">outputs</span> +</pre> +<ul class="simple"> +<li><tt class="docutils literal">inputs</tt> is a numpy array containing the 
desired setpoint and the current +position on the Z axis;</li> +<li><tt class="docutils literal">outputs</tt> is a numpy array containing the thrust to give to the motors. To +start easy, all the 4 motors are constrained to have the same thrust, so +that the quadcopter only travels up and down the Z axis;</li> +<li><tt class="docutils literal">self.state</tt> contains arbitrary values of unknown size which are passed from +one step to the next;</li> +<li><tt class="docutils literal">self.matrix</tt> and <tt class="docutils literal">self.constant</tt> contain the actual logic. By putting +the "right" values there, in theory we could get a perfectly tuned PID +controller. These are randomly mutated between generations.</li> +</ul> +<tt class="docutils literal">run_step</tt> is called at 100Hz (in the virtual time frame of the simulation). At each +generation, we test 500 creatures for a total of 12 virtual seconds each. So, +we have a total of 600,000 executions of <tt class="docutils literal">run_step</tt> at each generation.<br> +<br> +At first, I simply tried to run this code on CPython; here is the result:<br> +<pre class="code literal-block">$ python -m ev.main +Generation 1: ... [population = 500] [12.06 secs] +Generation 2: ... [population = 500] [6.13 secs] +Generation 3: ... [population = 500] [6.11 secs] +Generation 4: ... [population = 500] [6.09 secs] +Generation 5: ... [population = 500] [6.18 secs] +Generation 6: ... [population = 500] [6.26 secs] +</pre> +Which means ~6.15 seconds/generation, excluding the first.<br> +<br> +Then I tried with PyPy 5.9:<br> +<pre class="code literal-block">$ pypy -m ev.main +Generation 1: ... [population = 500] [63.90 secs] +Generation 2: ... [population = 500] [33.92 secs] +Generation 3: ... [population = 500] [34.21 secs] +Generation 4: ... [population = 500] [33.75 secs] +</pre> +Ouch! We are ~5.5x slower than CPython. This was kind of expected: numpy is +based on cpyext, which is infamously slow. 
(Actually, <a class="reference external" href="https://pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html">we are working on +that</a> and on the <tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch we are already faster than +CPython, but this will be the subject of another blog post.)<br> +<br> +So, let's try to avoid cpyext. The first obvious step is to use <a class="reference external" href="https://doc.pypy.org/en/latest/faq.html#what-about-numpy-numpypy-micronumpy">numpypy</a> +instead of numpy (actually, there is a <a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/pypycompat.py">hack</a> to use just the micronumpy +part). Let's see if the speed improves:<br> +<pre class="code literal-block">$ pypy -m ev.main # using numpypy +Generation 1: ... [population = 500] [5.60 secs] +Generation 2: ... [population = 500] [2.90 secs] +Generation 3: ... [population = 500] [2.78 secs] +Generation 4: ... [population = 500] [2.69 secs] +Generation 5: ... [population = 500] [2.72 secs] +Generation 6: ... [population = 500] [2.73 secs] +</pre> +So, ~2.7 seconds on average: this is 12x faster than PyPy+numpy, and more than +2x faster than the original CPython. At this point, most people would be happy +and go tweeting how PyPy is great.<br> +<br> +In general, when talking of CPython vs PyPy, I am rarely satisfied with a 2x +speedup: I know that PyPy can do much better than this, especially if you +write code which is specifically optimized for the JIT. For a real-life +example, have a look at <a class="reference external" href="https://capnpy.readthedocs.io/en/latest/benchmarks.html">capnpy benchmarks</a>, in which the PyPy version is +~15x faster than the heavily optimized CPython+Cython version (both have been +written by me, and I tried hard to write the fastest code for both +implementations).<br> +<br> +So, let's try to do better. 
As usual, the first thing to do is to profile and +see where we spend most of the time. Here is the <a class="reference external" href="https://vmprof.com/#/449ca8ee-3ab2-49d4-b6f0-9099987e9000">vmprof profile</a>. We spend a +lot of time inside the internals of numpypy, and allocating tons of temporary +arrays to store the results of the various operations.<br> +<br> +Also, let's look at the <a class="reference external" href="https://vmprof.com/#/28fd6e8f-f103-4bf4-a76a-4b65dbd637f4/traces">jit traces</a> and search for the function <tt class="docutils literal">run</tt>: +this is the loop in which we spend most of the time, and it is composed of 1796 +operations. The operations emitted for the line <tt class="docutils literal"><span class="pre">np.dot(...)</span> + +self.constant</tt> are listed between lines 1217 and 1456. Here is the excerpt +which calls <tt class="docutils literal"><span class="pre">np.dot(...)</span></tt>; most of the ops are cheap, but at line 1232 we +see a call to the RPython function <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/blob/release-pypy3.5-v5.10.0/pypy/module/micronumpy/ndarray.py#L1160">descr_dot</a>; by looking at the +implementation we see that it creates a new <tt class="docutils literal">W_NDimArray</tt> to store the +result, which means it has to do a <tt class="docutils literal">malloc()</tt>:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-_h6BuLTtEO8/Wfb6BXDg93I/AAAAAAAABNY/BY2XBg4ZtwokB9f1mWSmzI9gn_qanb81QCLcBGAs/s1600/2017-10-trace1.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="450" src="https://4.bp.blogspot.com/-_h6BuLTtEO8/Wfb6BXDg93I/AAAAAAAABNY/BY2XBg4ZtwokB9f1mWSmzI9gn_qanb81QCLcBGAs/s640/2017-10-trace1.png" width="640"></a></div> +<br> +The implementation of the <tt class="docutils literal">+ self.constant</tt> part is also interesting: +contrary to the former, the call to <tt class="docutils 
literal">W_NDimArray.descr_add</tt> has been inlined by +the JIT, so we have a better picture of what's happening; in particular, we +can see the call to <tt class="docutils literal">__0_alloc_with_del____</tt> which allocates the +<tt class="docutils literal">W_NDimArray</tt> for the result, and the <tt class="docutils literal">raw_malloc</tt> which allocates the +actual array. Then we have a long list of 149 simple operations which set the +fields of the resulting array, construct an iterator, and finally do a +<tt class="docutils literal">call_assembler</tt>: this is the actual logic to do the addition, which was +JITted independently; <tt class="docutils literal">call_assembler</tt> is one of the operations to do +JIT-to-JIT calls:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-vmo0pWharIU/Wfb3VfwHjxI/AAAAAAAABNE/a6Em09qZizwGiWJeTbGzKfHQH70dB7RKgCEwYBhgL/s1600/2017-10-trace2.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="640" src="https://1.bp.blogspot.com/-vmo0pWharIU/Wfb3VfwHjxI/AAAAAAAABNE/a6Em09qZizwGiWJeTbGzKfHQH70dB7RKgCEwYBhgL/s640/2017-10-trace2.png" width="625"></a></div> +<br> +All of this is very suboptimal: in this particular case, we know that the +shape of <tt class="docutils literal">self.matrix</tt> is always <tt class="docutils literal">(2, 3)</tt>: so, we are doing an incredible +amount of work, including calling <tt class="docutils literal">malloc()</tt> twice for the temporary arrays, just to +call two functions which ultimately do a total of 6 multiplications +and 6 additions. Note also that this is not a fault of the JIT: CPython+numpy +has to do the same amount of work, just hidden inside C calls.<br> +<br> +One possible solution to this nonsense is a well known compiler optimization: +loop unrolling. 
From the compiler point of view, unrolling the loop is always +risky because if the matrix is too big you might end up emitting a huge blob +of code, possibly useless if the shape of the matrices changes frequently: this +is the main reason why the PyPy JIT does not even try to do it in this case.<br> +<br> +However, we <strong>know</strong> that the matrix is small, and always of the same +shape. So, let's unroll the loop manually:<br> +<pre class="code python literal-block"><span class="keyword">class</span> <span class="name class">SpecializedCreature</span><span class="punctuation">(</span><span class="name">Creature</span><span class="punctuation">):</span> + + <span class="keyword">def</span> <span class="name function magic">__init__</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="operator">*</span><span class="name">args</span><span class="punctuation">,</span> <span class="operator">**</span><span class="name">kwargs</span><span class="punctuation">):</span> + <span class="name">Creature</span><span class="operator">.</span><span class="name function magic">__init__</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="operator">*</span><span class="name">args</span><span class="punctuation">,</span> <span class="operator">**</span><span class="name">kwargs</span><span class="punctuation">)</span> + <span class="comment single"># store the data in a plain Python list</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span> <span class="operator">=</span> <span class="name builtin">list</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="operator">.</span><span class="name">ravel</span><span class="punctuation">())</span> <span 
class="operator">+</span> <span class="name builtin">list</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">constant</span><span class="punctuation">)</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span> <span class="operator">=</span> <span class="punctuation">[</span><span class="literal number float">0.0</span><span class="punctuation">]</span> + <span class="keyword">assert</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="operator">.</span><span class="name">shape</span> <span class="operator">==</span> <span class="punctuation">(</span><span class="literal number integer">2</span><span class="punctuation">,</span> <span class="literal number integer">3</span><span class="punctuation">)</span> + <span class="keyword">assert</span> <span class="name builtin">len</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span><span class="punctuation">)</span> <span class="operator">==</span> <span class="literal number integer">8</span> + + <span class="keyword">def</span> <span class="name function">run_step</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="name">inputs</span><span class="punctuation">):</span> + <span class="comment single"># state: [state_vars ... inputs]</span> + <span class="comment single"># out_values: [state_vars, ... 
outputs]</span> + <span class="name">k0</span><span class="punctuation">,</span> <span class="name">k1</span><span class="punctuation">,</span> <span class="name">k2</span><span class="punctuation">,</span> <span class="name">q0</span><span class="punctuation">,</span> <span class="name">q1</span><span class="punctuation">,</span> <span class="name">q2</span><span class="punctuation">,</span> <span class="name">c0</span><span class="punctuation">,</span> <span class="name">c1</span> <span class="operator">=</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span> + <span class="name">s0</span> <span class="operator">=</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span><span class="punctuation">[</span><span class="literal number integer">0</span><span class="punctuation">]</span> + <span class="name">z_sp</span><span class="punctuation">,</span> <span class="name">z</span> <span class="operator">=</span> <span class="name">inputs</span> + <span class="comment single">#</span> + <span class="comment single"># compute the output</span> + <span class="name">out0</span> <span class="operator">=</span> <span class="name">s0</span><span class="operator">*</span><span class="name">k0</span> <span class="operator">+</span> <span class="name">z_sp</span><span class="operator">*</span><span class="name">k1</span> <span class="operator">+</span> <span class="name">z</span><span class="operator">*</span><span class="name">k2</span> <span class="operator">+</span> <span class="name">c0</span> + <span class="name">out1</span> <span class="operator">=</span> <span class="name">s0</span><span class="operator">*</span><span class="name">q0</span> <span class="operator">+</span> <span class="name">z_sp</span><span class="operator">*</span><span class="name">q1</span> <span class="operator">+</span> <span class="name">z</span><span 
class="operator">*</span><span class="name">q2</span> <span class="operator">+</span> <span class="name">c1</span> + <span class="comment single">#</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span><span class="punctuation">[</span><span class="literal number integer">0</span><span class="punctuation">]</span> <span class="operator">=</span> <span class="name">out0</span> + <span class="name">outputs</span> <span class="operator">=</span> <span class="punctuation">[</span><span class="name">out1</span><span class="punctuation">]</span> + <span class="keyword">return</span> <span class="name">outputs</span> +</pre> +In the <a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/creature.py#L100">actual code</a> there is also a sanity check which asserts that the +computed output is the very same as the one returned by <tt class="docutils literal">Creature.run_step</tt>.<br> +<br> +So, let's try to see how it performs. First, with CPython:<br> +<pre class="code literal-block">$ python -m ev.main +Generation 1: ... [population = 500] [7.61 secs] +Generation 2: ... [population = 500] [3.96 secs] +Generation 3: ... [population = 500] [3.79 secs] +Generation 4: ... [population = 500] [3.74 secs] +Generation 5: ... [population = 500] [3.84 secs] +Generation 6: ... [population = 500] [3.69 secs] +</pre> +This looks good: 60% faster than the original CPython+numpy +implementation. Let's try on PyPy:<br> +<pre class="code literal-block">Generation 1: ... [population = 500] [0.39 secs] +Generation 2: ... [population = 500] [0.10 secs] +Generation 3: ... [population = 500] [0.11 secs] +Generation 4: ... [population = 500] [0.09 secs] +Generation 5: ... [population = 500] [0.08 secs] +Generation 6: ... [population = 500] [0.12 secs] +Generation 7: ... [population = 500] [0.09 secs] +Generation 8: ... [population = 500] [0.08 secs] +Generation 9: ... 
[population = 500] [0.08 secs] +Generation 10: ... [population = 500] [0.08 secs] +Generation 11: ... [population = 500] [0.08 secs] +Generation 12: ... [population = 500] [0.07 secs] +Generation 13: ... [population = 500] [0.07 secs] +Generation 14: ... [population = 500] [0.08 secs] +Generation 15: ... [population = 500] [0.07 secs] +</pre> +Yes, it's not an error. After a couple of generations, it stabilizes at around +~0.07-0.08 seconds per generation. This is around <strong>80 (eighty) times faster</strong> +than the original CPython+numpy implementation, and around 35-40x faster than +the naive PyPy+numpypy one.<br> +<br> +Let's look at the <a class="reference external" href="https://vmprof.com/#/402af746-2966-4403-a61d-93015abac033/traces">trace</a> again: it no longer contains expensive calls, and +certainly no more temporary <tt class="docutils literal">malloc()</tt> s. The core of the logic is between +lines 386-416, where we can see that it does fast C-level multiplications and +additions: <tt class="docutils literal">float_mul</tt> and <tt class="docutils literal">float_add</tt> are translated straight into +<tt class="docutils literal">mulsd</tt> and <tt class="docutils literal">addsd</tt> x86 instructions.<br> +<br> +As I said before, this is a very particular example, and the techniques +described here do not always apply: it is not realistic to expect an 80x +speedup on arbitrary code, unfortunately. However, it clearly shows the potential of PyPy when +it comes to high-speed computing. 
And most importantly, it's not a toy +benchmark which was designed specifically to have good performance on PyPy: +it's a real world example, albeit small.<br> +<br> +You might be also interested in the talk I gave at last EuroPython, in which I +talk about a similar topic: "The Joy of PyPy JIT: abstractions for free" +(<a class="reference external" href="https://ep2017.europython.eu/conference/talks/the-joy-of-pypy-jit-abstractions-for-free">abstract</a>, <a class="reference external" href="https://speakerdeck.com/antocuni/the-joy-of-pypy-jit-abstractions-for-free">slides</a> and <a class="reference external" href="https://www.youtube.com/watch?v=NQfpHQII2cU">video</a>).<br> +<br> +<div class="section" id="how-to-reproduce-the-results"> +<h3> +How to reproduce the results</h3> +<pre class="code literal-block">$ git clone https://github.com/antocuni/evolvingcopter +$ cd evolvingcopter +$ {python,pypy} -m ev.main --no-specialized --no-numpypy +$ {python,pypy} -m ev.main --no-specialized +$ {python,pypy} -m ev.main +</pre> +</div> +</div>jitprofilingspeedhttps://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.htmlMon, 30 Oct 2017 10:15:00 GMT(Cape of) Good Hope for PyPyhttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlAntonio Cuni<div> +<br></div> +Hello from the other side of the world (for most of you)!<br> +<br> +With the excuse of coming to <a class="reference external" href="https://za.pycon.org/">PyCon ZA</a> during the last two weeks Armin, +Ronan, Antonio and sometimes Maciek had a very nice and productive sprint in +Cape Town, as pictures show :). 
We would like to say a big thank you to +Kiwi.com, which sponsored part of the travel costs via its awesome <a class="reference external" href="https://www.kiwi.com/sourcelift/">Sourcelift</a> +program to help Open Source projects.<br> +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: right; margin-left: 1em; text-align: right;"><tbody> +<tr><td style="text-align: center;"><a href="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s1600/DSC_2418.JPG" style="margin-left: auto; margin-right: auto;"><img border="0" height="225" src="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s400/DSC_2418.JPG" width="400"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">Armin, Anto and Ronan at Cape Point</td></tr> +</tbody></table> +<br> +Armin, Ronan and Anto spent most of the time hacking at cpyext, our CPython +C-API compatibility layer: during the last years, the focus was to make it +working and compatible with CPython, in order to run existing libraries such +as numpy and pandas. However, we never paid too much attention to performance, +so the net result is that with the latest released version of PyPy, C +extensions generally work but their speed ranges from "slow" to "horribly +slow".<br> +<br> +For example, these very simple <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> measure the speed of +calling (empty) C functions, i.e. the time you spend to "cross the border" +between RPython and C. 
<i>(Note: this includes the time spent doing the loop in regular Python code.)</i> These are the results on CPython, on PyPy 5.8, and on +our newest in-progress version:<br> +<br> +<pre class="literal-block">$ python bench.py # CPython +noargs : 0.41 secs +onearg(None): 0.44 secs +onearg(i) : 0.44 secs +varargs : 0.58 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy-5.8 bench.py # PyPy 5.8 +noargs : 1.01 secs +onearg(None): 1.31 secs +onearg(i) : 2.57 secs +varargs : 2.79 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy bench.py # cpyext-refactor-methodobject branch +noargs : 0.17 secs +onearg(None): 0.21 secs +onearg(i) : 0.22 secs +varargs : 0.47 secs +</pre> +<div> +<br></div> +<pre class="literal-block"></pre> +<pre class="literal-block"></pre> +So yes: before the sprint, we were ~2-6x slower than CPython. Now, we are +<strong>faster</strong> than it! +To reach this result, we did various improvements, such as: +<br> +<blockquote> +<ol class="arabic simple"> +<li>teach the JIT how to look (a bit) inside the cpyext module;</li> +<li>write specialized code for calling <tt class="docutils literal">METH_NOARGS</tt>, <tt class="docutils literal">METH_O</tt> and +<tt class="docutils literal">METH_VARARGS</tt> functions; previously, we always used a very general and +slow logic;</li> +<li>implement freelists to allocate the cpyext versions of <tt class="docutils literal">int</tt> and +<tt class="docutils literal">tuple</tt> objects, as CPython does;</li> +<li>the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/merge_requests/573">cpyext-avoid-roundtrip</a> branch: crossing the RPython/C border is +slowish, but the real problem was (and still is for many cases) we often +cross it many times for no good reason. So, depending on the actual API +call, you might end up in the C land, which calls back into the RPython +land, which goes to C, etc. etc. 
(ad libitum).</li> +</ol> +</blockquote> +The branch tries to fix such nonsense: so far, we fixed only some cases, which +are enough to speed up the benchmarks shown above. But most importantly, we +now have a clear path and an actual plan to improve cpyext more and +more. Ideally, we would like to reach a point in which cpyext-intensive +programs run at worst at the same speed as CPython.<br> +<br> +The other big topic of the sprint was Armin and Maciej doing a lot of work on the +<a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/branch/unicode-utf8">unicode-utf8</a> branch: the goal of the branch is to always use UTF-8 as the +internal representation of unicode strings. The advantages are various: +<br> +<blockquote> +<ul class="simple"> +<li>decoding a UTF-8 stream is super fast, as you just need to check that the +stream is valid;</li> +<li>encoding to UTF-8 is almost a no-op;</li> +<li>UTF-8 is always a more compact representation than the currently +used UCS-4. It's also almost always more compact than the CPython 3.5 latin1/UCS2/UCS4 combo;</li> +<li>smaller representation means everything becomes quite a bit faster due to lower cache pressure.</li> +</ul> +</blockquote> +Before you ask: yes, this branch contains special logic to ensure that random +access of single unicode chars is still O(1), as it is on both CPython and the +current PyPy.<br> +We also plan to improve the speed of decoding even more by using modern processor features, like SSE and AVX. Preliminary results show that decoding can be done 100x faster than the current setup. +<br> +<br> +In summary, this was a long and profitable sprint, in which we achieved lots +of interesting results. 
However, what we liked even more was the privilege of +doing <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/a4307fb5912e">commits</a> from awesome places such as the top of Table Mountain:<br> +<br> +<blockquote class="twitter-tweet"> +<div dir="ltr" lang="en"> +Our sprint venue today <a href="https://twitter.com/hashtag/pypy?src=hash&amp;ref_src=twsrc%5Etfw">#pypy</a> <a href="https://t.co/o38IfTYmAV">pic.twitter.com/o38IfTYmAV</a></div> +— Ronan Lamy (@ronanlamy) <a href="https://twitter.com/ronanlamy/status/915575026107240449?ref_src=twsrc%5Etfw">4 ottobre 2017</a></blockquote> + + +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: left; margin-right: 1em; text-align: left;"><tbody> +<tr><td style="text-align: center;"><a href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" style="margin-left: auto; margin-right: auto;"><img border="0" height="360" src="https://bytebucket.org/pypy/extradoc/raw/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" width="640"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">The panorama we looked at instead of staring at cpyext code</td></tr> +</tbody></table>cpyextprofilingspeedsprintunicodehttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlWed, 18 Oct 2017 13:31:00 GMTBinary wheels for PyPyhttps://www.pypy.org/posts/2017/07/binary-wheels-for-pypy-8718353804433344916.htmlAntonio Cuni<p>Hi,<br> +<br> +this is a short blog post, just to announce the existence of this <a href="https://github.com/antocuni/pypy-wheels" target="_blank">Github repository</a>, which contains binary PyPy wheels for some selected packages. 
The availability of binary wheels means that you can install the packages much more quickly, without having to wait for compilation.<br> +</p><div> +<br></div> +At the moment of writing, these packages are available:<br> +<br> +<ul> +<li>numpy</li> +<li>scipy</li> +<li>pandas</li> +<li>psutil</li> +<li>netifaces</li> +</ul> +<br> +For now, we provide only wheels built on Ubuntu, compiled for PyPy 5.8.<br> +In particular, it is worth noting that they are <b>not</b> <span>manylinux1</span> wheels, which means they might not work on other Linux distributions. For more information, see the explanation in the README of the above repo.<br> +<br> +Moreover, the existence of the wheels does not guarantee that they work correctly 100% of the time. They still depend on <span>cpyext</span>, our C-API emulation layer, which is still work-in-progress, although it has become better and better during the last months. Again, the wheels are there only to save compilation time.<br> +<br> +To install a package from the wheel repository, you can invoke <span>pip</span> like this:<br> +<br> +<span>$ pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy</span><br> +<div> +<br></div> +<div> +Happy installing!</div>https://www.pypy.org/posts/2017/07/binary-wheels-for-pypy-8718353804433344916.htmlWed, 26 Jul 2017 16:53:00 GMT \ No newline at end of file diff --git a/authors/armin-rigo.html b/authors/armin-rigo.html new file mode 100644 index 000000000..4facf84ea --- /dev/null +++ b/authors/armin-rigo.html @@ -0,0 +1,323 @@ + + + + + +Posts by Armin Rigo | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+

Posts by Armin Rigo

+ +
+
+ + \ No newline at end of file diff --git a/authors/armin-rigo.xml b/authors/armin-rigo.xml new file mode 100644 index 000000000..238031def --- /dev/null +++ b/authors/armin-rigo.xml @@ -0,0 +1,780 @@ + +PyPy (Posts by Armin Rigo)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssMac meets Arm64https://www.pypy.org/posts/2020/12/mac-meets-arm64-940822335619099039.htmlArmin Rigo<b>Looking for sponsorship</b> + +<p>Apple now ships Macs which are running on an arm64 variant machine with the +latest version of MacOS, Big Sur M1. We are getting requests for PyPy to +support this new architecture. Here is our position on this topic (or at least +mine, Armin Rigo's), and how you can help.</p> + +<p>Porting PyPy is harder than just re-running the compiler, because PyPy contains +a few big architecture-dependent "details", like the JIT compiler and the +foreign function interfaces (CFFI and ctypes).</p> + +<p>Fixing the JIT compiler should not be too much work: we already support arm64, +just the Linux one. But Apple made various details different (like the calling +conventions). A few other parts need to be fixed too, notably CFFI and ctypes, +again because of the calling conventions.</p> + +<p>Fixing that would be a reasonable amount of work. I would do it myself for a +small amount of money. However, the story doesn't finish here. Obviously, the +<b>start</b> of the story would be to get ssh access to a Big Sur M1 machine. (If at +this point you're thinking "sure, I can give you ssh access for three months", +then please read on.) The <b>next</b> part of the story is that we need a machine +available long term. 
It can be either a machine provided and maintained by a +third party, or alternatively a pot of money big enough to support the +acquisition of a machine and ongoing work of one of us.</p> + +<p>If we go with the provided-machine solution: What we need isn't a lot of +resources. Our CI requires maybe 10 GB of disk space, and a few hours of CPU +per run. It should fit into 8 GB of RAM. We normally do a run every night but +we can certainly lower the frequency a bit if that would help. However, we'd +ideally like some kind of assurance that you are invested in maintaining the +machine for the next 3-5 years (I guess, see below). We had far too many +machines that disappeared after a few months.</p> + +<p>If we go with the money-supported solution: it's likely that after 3-5 years +the whole Mac base will have switched to arm64, we'll drop x86-64 support for +Mac, and we'll be back to the situation of the past where there was only one +kind of Mac machine to care about. In the meantime, we are looking at 3-5 +years of lightweight extra maintenance. We have someone that has said he would +do it, but not for free.</p> + +<p>If either of these two solutions occurs, we'll still have, I quote, "probably +some changes in distutils-type stuff to make python happy", and then some +packaging/deployment changes to support the "universal2" architecture, i.e. +including both versions inside a single executable (which will <b>not</b> be just an +extra switch to clang, because the two versions need a different JIT backend +and so must be translated separately).</p> + +<p>So, now all the factors are on the table. We won't do the minimal "just the +JIT compiler fixes" if we don't have a plan that goes farther. Either we get +sufficient money, and maybe support, and then we can do it quickly; or PyPy +will just remain not natively available on M1 hardware for the next 3-5 years. 
+We are looking forward to supporting M1, and view resources contributed by +the community as a vote of confidence in assuring the future of PyPy on this +hardware. Contact us: <a href="mailto:pypy-dev@python.org">pypy-dev@python.org</a>, or our private mailing +list <a href="mailto:pypy-z@python.org">pypy-z@python.org</a>.</p> + +<p>Thanks for reading!</p> + +<p>Armin Rigo</p>https://www.pypy.org/posts/2020/12/mac-meets-arm64-940822335619099039.htmlThu, 31 Dec 2020 09:53:00 GMTPyPy is on Open Collectivehttps://www.pypy.org/posts/2020/08/pypy-is-on-open-collective-5673322428814364737.htmlArmin Rigo<p>Hi all,</p> + +<p>PyPy is now a <a href="https://opencollective.com/pypy">member of Open Collective</a>, a fiscal host. We have been thinking about switching to this organization for a couple of years; we like it for various reasons, like the budget transparency and the lightweight touch. We can now officially announce our membership!</p> + +<p>With this, we are now again free to use PyPy for all financial issues, like receiving funds professionally, paying parts of sprint budgets as we like, and so on. We will shortly be reintroducing buttons that link to Open Collective from the PyPy web site.</p> + +<p>Although the old donation buttons were removed last year, we believe that there are still a few people that send regularly money to the SFC, the not-for-profit charity we were affiliated with. If you do, please stop doing it now (and, if you like to do so, please set up an equivalent donation to <a href="https://opencollective.com/pypy">PyPy on Open Collective</a>).</p> + +<p>And by the way, sorry for all of you who were getting mixed feelings from the previous blog post (co-written with the SFC). <b>PyPy is committed to continue being Open Source just like before.</b> This was never in question. 
What these two blog posts mean is only that we switched to a different organization for our internal finances.</p> + +<p>We're looking forward to how this new relationship will go!</p> + +<p>Armin Rigo, for the PyPy team</p>https://www.pypy.org/posts/2020/08/pypy-is-on-open-collective-5673322428814364737.htmlSat, 29 Aug 2020 11:53:00 GMTLeysin Winter sprint 2020: Feb 29 - March 8thhttps://www.pypy.org/posts/2020/01/leysin-winter-sprint-2020-feb-28-march-6349761524797409012.htmlArmin Rigo<a href="https://q-cf.bstatic.com/images/hotel/max1280x900/321/32136520.jpg" style="clear: right; float: right; margin-bottom: 1em; margin-left: 1em;"><img border="0" height="240" src="https://q-cf.bstatic.com/images/hotel/max1280x900/321/32136520.jpg" width="320"></a>The next PyPy sprint will be in Leysin, Switzerland, for the fourteenth +time. This is a fully public sprint: newcomers and topics other than +those proposed below are welcome.<br> +<br> +<br> +<br> +<h3> +Goals and topics of the sprint</h3> +The list of topics is open.  For reference, we would like to work at least partially on the following topics:<br> +<ul> +<li><a href="https://github.com/pyhandle/hpy">HPy</a> </li> +<li>Python 3.7 support (<a href="https://buildbot.pypy.org/summary?branch=py3.7">buildbot status</a>)</li> +</ul> +As usual, the main side goal is to have fun in winter sports :-) +We can take a day off (for ski or anything else).<br> +<br> +<h3> +Times and accomodation</h3> +The sprint will occur for one week starting on Saturday, the 29th of February, to Sunday, the 8th of March 2020 <b>(dates were pushed back one day!)</b>  It will occur in <a href="https://www.booking.com/hotel/ch/les-airelles.html">Les Airelles</a>, a different bed-and-breakfast place from the traditional one in <span class="il">Leysin</span>.  It is a nice old house at the top of the village.<br> +<br> +<strike>We have a 4- or 5-people room as well as up to three double-rooms.  Please register early!  
These rooms are not booked for the sprint in advance, and might be already taken if you end up announcing yourself late.</strike>  We have a big room for up to 7 people with nice view, which might be split in two or three sub-rooms; plus possibly separately-booked double rooms if needed. (But it is of course always possible to book at a different place in Leysin.)<br> +<br> +For more information, see our <a href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2020/">repository</a> or write to me directly at armin.rigo@gmail.com.https://www.pypy.org/posts/2020/01/leysin-winter-sprint-2020-feb-28-march-6349761524797409012.htmlFri, 17 Jan 2020 10:36:00 GMTA second life for the Sandboxhttps://www.pypy.org/posts/2019/08/a-second-life-for-sandbox-6848726729476245390.htmlArmin Rigo<p>Hi all,<br> +<br> +<a href="https://anvil.works/" target="_blank">Anvil</a> is a UK-based company sponsoring one month of work to revive PyPy's +"sandbox" mode and upgrade it to PyPy3. Thanks to them, sandboxing will be +given a second life!<br> +<br> +The <a href="https://doc.pypy.org/en/latest/sandbox.html">sandboxed PyPy</a> is a special version of PyPy that runs +fully isolated. It gives a safe way to execute arbitrary Python +programs (<i>whole</i> programs, not small bits of code inside your larger Python +program). Such scripts can be fully untrusted, and they can try to do +anything—there are no syntax-based restrictions, for example—but whatever +they do, any communication with the external world is not actually done but +delegated to the parent process. This is similar but much more flexible than +Linux's Seccomp approach, and it is more lightweight than setting up a full +virtual machine. 
It also works without operating system support.<br> +<br> +However, during the course of the years the sandbox mode of PyPy has been +mostly unmaintained and unsupported by the core developers, mostly because of +a lack of interest by users and because it took too much effort to maintain +it.<br> +<br> +Now we have found that we have an actual user, <a href="https://anvil.works/" target="_blank">Anvil</a>. As far as I can tell +they are still using a very old version of PyPy, the last one that supported +sandboxing. This is where this contract comes from: the goal is to modernize sandboxing and port it to PyPy3.<br> +<br> +Part of my motivation for accepting this work is that I may have found a way to +tweak the protocol on the pipe between the sandboxed PyPy and the parent +controller process. This should make the sandboxed PyPy more resilient against +future developments and easier to maintain; at most, in the future some tweaks will be needed in the +controller process but hopefully not deep inside the guts of the sandboxed +PyPy. Among the advantages, such a more robust solution should mean that we +can actually get a working sandboxed PyPy—or sandboxed PyPy3 or sandboxed +version of <a href="https://rpython.readthedocs.io/en/latest/examples.html">any other interpreter written in RPython</a>—with just an extra +argument when calling <span>rpython</span> to translate this interpreter. 
If everything +works as planned, sandboxing may be given a second life.<br> +<br> +Armin Rigo</p>https://www.pypy.org/posts/2019/08/a-second-life-for-sandbox-6848726729476245390.htmlWed, 07 Aug 2019 18:31:00 GMTLeysin Winter Sprint 2018: reviewhttps://www.pypy.org/posts/2018/03/leysin-winter-sprint-2018-review-3988364248531980164.htmlArmin Rigo<p>Like every year, the PyPy developers and a couple of newcomers + gathered in Leysin, Switzerland, to share their thoughts and + contribute to the development of PyPy.</p> + <p>As always, we had interesting discussions about how we could + improve PyPy, to make it the first choice for even more + developers. We also made some progress with current issues, like + compatibility with Python 3.6 and improving the performance of + CPython extension modules, where we fixed a lot of bugs and gained + new insights about where and how we could tweak PyPy.<br> + </p> + <p> We were very happy about the number of new people who joined us + for the first time, and hope they enjoyed it as much as everyone + else. <br> + </p> + <h3>Topics</h3> + We worked on the following topics (and more!):<br> + <ul> + <li>Introductions for newcomers</li> + <li>Python 3.5 and 3.6 improvements</li> + <li>CPyExt performance improvements and GC implementation<br> + </li> + <li>JIT: guard-compatible implementation<br> + </li> + <li>Pygame performance improvements</li> + <li>Unicode/UTF8 implementation<br> + </li> + <li>CFFI tutorial/overview rewrite + </li> + <li>py3 test runners refactoring</li> + <li>RevDB improvements<br> + </li> + </ul> + The weather was really fine for most of the week, with only + occasional snow and fog. We started our days with a short (and + sometimes not so short) planning session and enjoyed our dinners in + the great restaurants in the area. Some of us even started earlier + and continued till late night. It was a relaxed, but also very + productive atmosphere. 
On our break day on Wednesday, we enjoyed the + great conditions and went skiing and hiking. + <h3>Attendees</h3> + <ul> + <li>Arianna</li> + <li>Jean-Daniel<br> + </li> + <li>Stefan Beyer</li> + <li>Floris Bruynooghe<br> + </li> + <li>Antonio Cuni</li> + <li>René Dudfield</li> + <li>Manuel Jacob</li> + <li>Ronan Lamy</li> + <li>Remi Meier</li> + <li>Matti Picus<br> + </li> + <li>Armin Rigo</li> + <li>Alexander Schremmer<br> + </li> + </ul> + Leysin is easily reachable by Geneva Airport, so feel free to join + us next time!<br> + <br> + <br> + <p>Cheers,<br> + Stefan<br> + </p>https://www.pypy.org/posts/2018/03/leysin-winter-sprint-2018-review-3988364248531980164.htmlTue, 27 Mar 2018 07:54:00 GMTLeysin Winter sprint: 17-24 March 2018https://www.pypy.org/posts/2018/01/leysin-winter-sprint-17-24-march-2018-7141092581585849418.htmlArmin Rigo<table border="0"> +<tr> +<td> + +<p>The next PyPy sprint will be in Leysin, Switzerland, for the thirteenth +time. This is a fully public sprint: newcomers and topics other than +those proposed below are welcome.</p> + +<p>(Note: this sprint is independent from the suggested April-May sprint in +Poland.)</p> + +<h3>Goals and topics of the sprint</h3> + +<p>The list of topics is open, but here is our current list:</p> + +</td> +<td> +<div class="separator" style="clear: both; text-align: center;"><a href="https://4.bp.blogspot.com/-HQ8S2DcTato/WIc2XXEQP9I/AAAAAAAAAQw/7BNVgaMg-jcaMzVaAc2JD5sMgu-Xn5DeACLcB/s1600/chalet1_004.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="240" src="https://4.bp.blogspot.com/-HQ8S2DcTato/WIc2XXEQP9I/AAAAAAAAAQw/7BNVgaMg-jcaMzVaAc2JD5sMgu-Xn5DeACLcB/s320/chalet1_004.jpg" width="320"></a></div> +</td> +</tr> +</table> + +<ul> +<li> cffi tutorial/overview rewrite +</li><li> py3 test runners are too complicated +</li><li> make win32 builds green +</li><li> make packaging more like cpython/portable builds +</li><li> get CI builders for PyPy into mainstream projects (Numpy, Scipy, 
lxml, uwsgi) +</li><li> get more of scientific stack working (tensorflow?) +</li><li> cpyext performance improvements +</li><li> General 3.5 and 3.6 improvements +</li><li> JIT topics: guard-compatible, and the subsequent research project to save and reuse traces across processes +</li><li> finish unicode-utf8 +</li><li> update www.pypy.org, speed.pypy.org (web devs needed) +</li></ul> + +<p>As usual, the main side goal is to have fun in winter sports :-) +We can take a day off (for ski or anything else).</p> + +<h3>Exact times</h3> + +<p>Work days: starting March 18th (~noon), ending March 24th (~noon).</p> + +<p>Please see <a href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2018/announcement.txt">announcement.txt</a> for more information.</p>https://www.pypy.org/posts/2018/01/leysin-winter-sprint-17-24-march-2018-7141092581585849418.htmlMon, 08 Jan 2018 10:33:00 GMTLeysin Winter Sprint: 25/26th Feb. - 4th March 2017https://www.pypy.org/posts/2017/01/leysin-winter-sprint-2526th-feb-4th-3831779797804484935.htmlArmin Rigo<p>The next PyPy sprint will be in Leysin, Switzerland, for the twelfth time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.</p> +<div class="section" id="goals-and-topics-of-the-sprint"> +<h3>Goals and topics of the sprint</h3> +<p>The list of topics is very open.</p> +<ul class="simple"> +<li>The main topic is Python 3.5 support in PyPy, as most py3.5 +contributors should be present. 
It is also a good topic if you have +no or limited experience with PyPy contribution: we can easily find +something semi-independent that is not done in py3.5 so far, and +do pair-programming with you.</li> +<li>Any other topic is fine too: JIT compiler optimizations, CFFI, +the RevDB reverse debugger, improving the speed of your program on +PyPy, etc.</li> +<li>And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off (for ski or anything else).</li> +</ul> +</div> +<div class="section" id="exact-times"> +<h3>Exact times</h3> +<p>Work days: starting 26th Feb (~noon), ending March 4th (~noon).</p> +<p>I have pre-booked the week from Saturday Feb 25th to Saturday March 4th. +If it is possible for you to arrive Sunday before mid-afternoon, then +you should get a booking from Sunday only. The break day should be +around Wednesday.</p> +<p>It is fine to stay a few more days on either side, or conversely to book +for a part of that time only.</p> +</div> +<div class="section" id="location-accomodation"> +<h3>Location &amp; Accommodation</h3> + +<p>Leysin, Switzerland, "same place as before".</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://4.bp.blogspot.com/-HQ8S2DcTato/WIc2XXEQP9I/AAAAAAAAAQw/7BNVgaMg-jcaMzVaAc2JD5sMgu-Xn5DeACLcB/s1600/chalet1_004.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="240" src="https://4.bp.blogspot.com/-HQ8S2DcTato/WIc2XXEQP9I/AAAAAAAAAQw/7BNVgaMg-jcaMzVaAc2JD5sMgu-Xn5DeACLcB/s320/chalet1_004.jpg" width="320"></a></div> + +<p>Let me refresh your +memory: both the sprint venue and the lodging will be in a +pair of chalets built specifically for bed &amp; breakfast: +<a class="reference external" href="https://www.ermina.ch/">https://www.ermina.ch/</a>. The place has a good ADSL Internet connection +with wireless installed. 
You can also arrange your own lodging +elsewhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue).</p> +<p>Please <em>confirm</em> that you are coming so that we can adjust the +reservations as appropriate.</p> +<p>The options of rooms are a bit more limited than on previous years +because the place for bed-and-breakfast is shrinking; but we should +still have enough room for us. The price is around 60 CHF, breakfast +included, in shared rooms (3 or 4 people). If there are people that +would prefer a double or single room, please contact me and we'll see +what choices you have. There is also a choice of hotels in Leysin.</p> +<p>Please register by Mercurial:</p> +<blockquote> +<a href="https://bitbucket.org/pypy/extradoc/">https://bitbucket.org/pypy/extradoc/</a> +<a href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2017/">https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2017/</a></blockquote> +<p>or on the pypy-dev mailing list if you do not yet have check-in rights:</p> +<blockquote> +<a class="reference external" href="https://mail.python.org/mailman/listinfo/pypy-dev">https://mail.python.org/mailman/listinfo/pypy-dev</a></blockquote> +<p>You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around, and at least one EU-format power strip.</p> +</div>https://www.pypy.org/posts/2017/01/leysin-winter-sprint-2526th-feb-4th-3831779797804484935.htmlTue, 24 Jan 2017 11:01:00 GMTRevDB released, v5.4.1https://www.pypy.org/posts/2016/09/revdb-released-v541-6719768292347391304.htmlArmin Rigo<p>Hi all,</p> + +<p> +The first beta version of <a href="https://bitbucket.org/pypy/revdb/">RevDB</a> is out! <a href="https://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html">Remember</a> that RevDB is a reverse debugger for Python. 
The idea is that it is a debugger that can run forward and backward in time, letting you more easily understand your subtle bug in your big Python program.</p> + +<p> +RevDB should work on almost any Python program. Even if you are normally only using CPython, trying to reproduce the bug with RevDB is similar to trying to run the program on a regular PyPy---usually it just works, <a href="https://pypy.org/compat.html">even if not quite always</a>. + +</p><p> +News from the alpha version in the <a href="https://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html">previous blog post</a> include notably support for: +</p><ul> +<li>Threads. +</li><li>CPyExt, the compatibility layer of PyPy that can run CPython C extension modules. +</li></ul> +as well as many other improvements. + +<p> +You need to build it yourself for now. It is tested on 64-bit Linux. 32-bit Linux, OS/X, and other POSIX platforms should all either work out of the box or be just a few fixes away (contributions welcome). Win32 support is a lot more involved but not impossible.</p> + +<p> +See <a href="https://bitbucket.org/pypy/revdb/">https://bitbucket.org/pypy/revdb/</a> for more information!</p> + +<p>Armin</p>releaserevdbhttps://www.pypy.org/posts/2016/09/revdb-released-v541-6719768292347391304.htmlSat, 10 Sep 2016 09:30:00 GMTPyPy gets funding from Mozilla for Python 3.5 supporthttps://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.htmlArmin Rigo<p>"Python 2.x versus Python 3.x": this is by now an old question. In the eyes of some people Python 2 is here to stay, and in the eyes of others Python has long been 3 only.</p> + +<p>PyPy's own position is that PyPy will support Python 2.7 forever---the RPython language in which PyPy is written is a subset of 2.7, and we have no plan to upgrade that. But at the same time, we want to support 3.x. 
This is particularly true now: a relatively recent development is that Python 3.5 seems to attract more and more people. The "switch" to Python 3.x might be starting to happen.</p> + +<p>Correspondingly, PyPy has been searching for a while for a way to support a larger-scale development effort. The goal is to support not just any old version of Python 3.x, but Python 3.5, as this seems to be the version that people are switching to. PyPy is close to supporting all of Python 3.3 now; but the list of what is new in Python <a href="https://docs.python.org/3/whatsnew/3.4.html">3.4</a> and <a href="https://docs.python.org/3/whatsnew/3.5.html">3.5</a> is far, far longer than anyone imagines. The long-term goal is also to get a version of "PyPy3" that is as good as "PyPy2" is, including its performance and its cpyext layer (CPython C API interoperability), for example.</p> + +<p>So, the end result: <a href="https://blog.mozilla.org/blog/2016/08/04/mozilla-awards-585000-to-nine-open-source-projects-in-q2-2016/">Mozilla recently decided to award $200,000</a> to <a href="https://baroquesoftware.com/">Baroque Software</a> to work on PyPy as part of its Mozilla Open Source Support (MOSS) initiative. This money will be used to implement the Python 3.5 features in PyPy. Within the next year, we plan to use the money to pay four core PyPy developers half-time to work on the missing features and on some of the big performance and cpyext issues. This should speed up the progress of catching up with Python 3.x significantly. 
We are extremely thankful to Mozilla for supporting us in this way, and will keep you updated on the progress via this blog.</p>sponsorshttps://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.htmlTue, 09 Aug 2016 16:38:00 GMTReverse debugging for Pythonhttps://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.htmlArmin Rigo<div class="section" id="revpdb"> +<h3>RevPDB</h3> +<p>A "reverse debugger" is a debugger where you can go forward and +backward in time. It is an uncommon feature, at least in the open +source world, but I have no idea why. I have used <a class="reference external" href="https://undo.io/">undodb-gdb</a> and +<a class="reference external" href="https://rr-project.org/">rr</a>, which are reverse debuggers for C code, and I can only say that +they saved me many, many days of poking around blindly in gdb.</p> +<p>The PyPy team is pleased to give you "RevPDB", a reverse-debugger +similar to <tt class="docutils literal">rr</tt> but for Python.</p> +<p>An example is worth a thousand words. Let's say your big Python +program has a bug that shows up inconsistently. You have nailed it +down to something like:</p> +<ul class="simple"> +<li>start <tt class="docutils literal">x.py</tt>, which does stuff (maybe involving processing files, +answering some web requests that you simulate from another terminal, +etc.);</li> +<li>sometimes, after a few minutes, your program's state becomes +inconsistent and you get a failing assert or another exception.</li> +</ul> +<p>This is the case where RevPDB is useful.</p> +<p>RevPDB is available only on 64-bit Linux and OS/X right now, but should +not be too hard to port to other OSes. It is very much <em>alpha-level!</em> +(It is a debugger full of bugs. Sorry about that.) 
I believe it is +still useful---it helped me in one <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/bd220c268bc9">real use case</a> already.</p> +</div> +<div class="section" id="how-to-get-revpdb"> +<h3>How to get RevPDB</h3> +<p>The following demo was done with an alpha version for 64-bit Linux, +compiled for Arch Linux. I won't provide the binary; it should be +easy enough to retranslate (much faster than a regular PyPy because it +contains neither a JIT nor a custom GC). Grab the <a class="reference external" href="https://pypy.org/download.html#building-from-source">PyPy sources</a> from +Mercurial, and then:</p> +<pre class="literal-block"> +hg update reverse-debugger +# or "hg update ff376ccacb36" for exactly this demo +cd pypy/goal +../../rpython/bin/rpython -O2 --revdb targetpypystandalone.py \ + --withoutmod-cpyext --withoutmod-micronumpy +</pre> +<p>and possibly rename the final <tt class="docutils literal"><span class="pre">pypy-c</span></tt> to <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> to avoid +confusion.</p> +<p>Other platforms than 64-bit Linux and OS/X need some fixes before they work.</p> +</div> +<div class="section" id="demo"> +<h3>Demo</h3> +<p>For this demo, we're going to use this <tt class="docutils literal">x.py</tt> as the "big program":</p> +<pre class="literal-block"> +import os + +class Foo(object): + value = 5 + +lst1 = [Foo() for i in range(100)] +lst1[50].value += 1 +for x in lst1: + x.value += 1 + +for x in lst1: + if x.value != 6: + print 'oops!' + os._exit(1) +</pre> +<p>Of course, it is clear what occurs in this small example: the check +fails on item 50. For this demo, the check has been written with +<tt class="docutils literal">os._exit(1)</tt>, because this exits immediately the program. 
If it +was written with an <tt class="docutils literal">assert</tt>, then its failure would execute things +in the <tt class="docutils literal">traceback</tt> module afterwards, to print the traceback; it +would be a minor mess just to find the exact point of the failing +<tt class="docutils literal">assert</tt>. (This and other issues are supposed to be fixed in the +future, but for now it is alpha-level.)</p> +<p>Anyway, with a regular <tt class="docutils literal">assert</tt> and a regular post-mortem <tt class="docutils literal">pdb</tt>, +we could observe that <tt class="docutils literal">x.value</tt> is indeed 7 instead of 6 when the +assert fails. Imagine that the program is much bigger: how would we +find the exact chain of events that caused this value 7 to show up on +this particular <tt class="docutils literal">Foo</tt> object? This is what RevPDB is for.</p> +<p><strike>First, we need for now to disable Address Space Layout Randomization +(ASLR), otherwise replaying will not work. This is done once with the +following command line, which changes the state until the next +reboot:</strike></p> +<pre class="literal-block"> +echo 0 | sudo tee /proc/sys/kernel/randomize_va_space +</pre> +<p><strong>UPDATE:</strong> the above is no longer necessary from revision ff376ccacb36.</p> +<p>Run <tt class="docutils literal">x.py</tt> with RevPDB's version of PyPy instead of the regular +interpreter (CPython or PyPy):</p> +<pre class="literal-block"> +PYPYRDB=log.rdb ./pypy-revdb x.py +</pre> +<p>This <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> executable is like a slow PyPy executable, running +(for now) without a JIT. This produces a file <tt class="docutils literal">log.rdb</tt> which +contains a complete log of this execution. (If the bug we are +tracking occurs rarely, we need to re-run it several times until we +get the failure. 
But once we got the failure, then we're done with +this step.)</p> +<p>Start:</p> +<pre class="literal-block"> +rpython/translator/revdb/revdb.py log.rdb +</pre> +<p>We get a pdb-style debugger. This <tt class="docutils literal">revdb.py</tt> is a normal Python +program, which you run with an unmodified Python; internally, it looks +inside the log for the path to <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> and run it as needed (as +one forking subprocess, in a special mode).</p> +<p>Initially, we are at the start of the program---not at the end, like +we'd get in a regular debugger:</p> +<pre class="literal-block"> +File "&lt;builtin&gt;/app_main.py", line 787 in setup_bootstrap_path: +(1)$ +</pre> +<p>The list of commands is available with <tt class="docutils literal">help</tt>.</p> +<p>Go to the end with <tt class="docutils literal">continue</tt> (or <tt class="docutils literal">c</tt>):</p> +<pre class="literal-block"> +(1)$ continue +File "/tmp/x.py", line 14 in &lt;module&gt;: +... + lst1 = [Foo() for i in range(100)] + lst1[50].value += 1 + for x in lst1: + x.value += 1 + + for x in lst1: + if x.value != 6: + print 'oops!' +&gt; os._exit(1) +(19727)$ +</pre> +<p>We are now at the beginning of the last executed line. The number +19727 is the "time", measured in number of lines executed. We can go +backward with the <tt class="docutils literal">bstep</tt> command (backward step, or <tt class="docutils literal">bs</tt>), line +by line, and forward again with the <tt class="docutils literal">step</tt> command. There are also +commands <tt class="docutils literal">bnext</tt>, <tt class="docutils literal">bcontinue</tt> and <tt class="docutils literal">bfinish</tt> and their forward +equivalents. There is also "<tt class="docutils literal">go TIME</tt>" to jump directly to the specified +time. 
(Right now the debugger only stops at "line start" +events, not at function entry or exit, which makes some cases a bit +surprising: for example, a <tt class="docutils literal">step</tt> from the return statement of +function <tt class="docutils literal">foo()</tt> will jump directly to the caller's caller, if the +caller's current line was <tt class="docutils literal">return foo() + 2</tt>, because no "line +start" event occurs in the caller after <tt class="docutils literal">foo()</tt> returns to it.)</p> +<p>We can print Python expressions and statements using the <tt class="docutils literal">p</tt> +command:</p> +<pre class="literal-block"> +(19727)$ p x +$0 = &lt;__main__.Foo object at 0xfffffffffffeab3e&gt; +(19727)$ p x.value +$1 = 7 +(19727)$ p x.value + 1 +8 +</pre> +<p>The "<tt class="docutils literal">$NUM =</tt>" prefix is only shown when we print an object that +really exists in the debugged program; that's why the last line does +not contain it. Once a <tt class="docutils literal">$NUM</tt> has been printed, then we can use +it in further expressions---even at a different point time. It +becomes an anchor that always refers to the same object:</p> +<pre class="literal-block"> +(19727)$ bstep + +File "/tmp/x.py", line 13 in &lt;module&gt;: +... + + lst1 = [Foo() for i in range(100)] + lst1[50].value += 1 + for x in lst1: + x.value += 1 + + for x in lst1: + if x.value != 6: +&gt; print 'oops!' + os._exit(1) +(19726)$ p $0.value +$1 = 7 +</pre> +<p>In this case, we want to know when this value 7 was put in this +attribute. This is the job of a watchpoint:</p> +<pre class="literal-block"> +(19726)$ watch $0.value +Watchpoint 1 added +updating watchpoint value: $0.value =&gt; 7 +</pre> +<p>This watchpoint means that <tt class="docutils literal">$0.value</tt> will be evaluated at each line. 
+When the <tt class="docutils literal">repr()</tt> of this expression changes, the watchpoint activates +and execution stops:</p> +<pre class="literal-block"> +(19726)$ bcontinue +[searching 19629..19726] +[searching 19338..19629] + +updating watchpoint value: $0.value =&gt; 6 +Reverse-hit watchpoint 1: $0.value +File "/tmp/x.py", line 9 in &lt;module&gt;: + import os + + class Foo(object): + value = 5 + + lst1 = [Foo() for i in range(100)] + lst1[50].value += 1 + for x in lst1: +&gt; x.value += 1 + + for x in lst1: + if x.value != 6: + print 'oops!' + os._exit(1) +(19524)$ +</pre> +<p>Note that using the <tt class="docutils literal">$NUM</tt> syntax is essential in watchpoints. You +can't say "<tt class="docutils literal">watch x.value</tt>", because the variable <tt class="docutils literal">x</tt> will go out +of scope very soon when we move forward or backward in time. In fact +the watchpoint expression is always evaluated inside an environment +that contains the builtins but not the current locals and globals. +But it also contains all the <tt class="docutils literal">$NUM</tt>, which can be used to refer to +known objects. It is thus common to watch <tt class="docutils literal">$0.attribute</tt> if <tt class="docutils literal">$0</tt> +is an object, or to watch <tt class="docutils literal"><span class="pre">len($1)</span></tt> if <tt class="docutils literal">$1</tt> is some list. 
The +watch expression can also be a simple boolean: for example, "<tt class="docutils literal">watch +$2 in $3</tt>" where <tt class="docutils literal">$3</tt> is some dict and <tt class="docutils literal">$2</tt> is some object that +you find now in the dict; you would use this to find out the time when +<tt class="docutils literal">$2</tt> was put inside <tt class="docutils literal">$3</tt>, or removed from it.</p> +<p>Use "<tt class="docutils literal">info watchpoints</tt>" and "<tt class="docutils literal">delete &lt;watchpointnum&gt;</tt>" to manage +watchpoints.</p> +<p>There are also regular breakpoints, which you set with "<tt class="docutils literal">b +FUNCNAME</tt>". It breaks whenever there is a call to a function that +happens to have the given name. (It might be annoying to use for a +function like <tt class="docutils literal">__init__()</tt> which has many homonyms. There is no +support for breaking on a fully-qualified name or at a given line +number for now.)</p> +<p>In our demo, we stop at the line <tt class="docutils literal">x.value += 1</tt>, which is where the +value was changed from 6 to 7. Use <tt class="docutils literal">bcontinue</tt> again to stop at the +line <tt class="docutils literal"><span class="pre">lst1[50].value</span> += 1</tt>, which is where the value was changed from +5 to 6. Now we know how this <tt class="docutils literal">value</tt> attribute ends up being 7.</p> +<pre class="literal-block"> +(19524)$ bcontinue +[searching 19427..19524] +[searching 19136..19427] + +updating watchpoint value: $0.value =&gt; 5 +Reverse-hit watchpoint 1: $0.value +File "/tmp/x.py", line 7 in &lt;module&gt;: + import os + + class Foo(object): + value = 5 + + lst1 = [Foo() for i in range(100)] +&gt; lst1[50].value += 1 + for x in lst1: + x.value += 1 + + for x in lst1: + if x.value != 6: +... +(19422)$ +</pre> +<p>Try to use <tt class="docutils literal">bcontinue</tt> yet another time. 
It will stop now just before +<tt class="docutils literal">$0</tt> is created. At that point in time, <tt class="docutils literal">$0</tt> refers to +an object that does not exist yet, so the watchpoint now evaluates to +an error message (but it continues to work as before, with that error +message as the string it currently evaluates to).</p> +<pre class="literal-block"> +(19422)$ bcontinue +[searching 19325..19422] + +updating watchpoint value: $0.value =&gt; RuntimeError: + '$0' refers to an object created later in time +Reverse-hit watchpoint 1: $0.value +File "/tmp/x.py", line 6 in &lt;module&gt;: + import os + + class Foo(object): + value = 5 + +&gt; lst1 = [Foo() for i in range(100)] + lst1[50].value += 1 + for x in lst1: + x.value += 1 + + for x in lst1: +... +(19371)$ +</pre> +<p>In big programs, the workflow is similar, just more complex. Usually +it works this way: we find interesting points in time with some +combination of watchpoints and some direct commands to move around. +We write down on a piece of (real or virtual) paper these points in +history, including most importantly their time, so that we can +construct an ordered understanding of what is going on.</p> +<p>The current <tt class="docutils literal">revdb</tt> can be annoying and sometimes even crash; but +the history you reconstruct can be kept. All the times and +expressions printed are still valid when you restart <tt class="docutils literal">revdb</tt>. The +only thing "lost" is the <tt class="docutils literal">$NUM</tt> objects, which you need to print +again. (Maybe instead of <tt class="docutils literal">$0</tt>, <tt class="docutils literal">$1</tt>, ... we should use <tt class="docutils literal">$&lt;big +number&gt;</tt>, where the big number identifies uniquely the object by its +creation time. These numbers would continue to be valid even after +<tt class="docutils literal">revdb</tt> is restarted. 
They are more annoying to use than just +<tt class="docutils literal">$0</tt> though.)</p> +<p><b>Screencast:</b> Here's a (slightly typo-y) screencast of cfbolz using the reverse debugger: +</p> +</div> +<div class="section" id="current-issues"> +<h3>Current issues</h3> +<p>General issues:</p> +<ul class="simple"> +<li>If you are using <tt class="docutils literal">revdb</tt> on a log that took more than a few +minutes to record, then it can be painfully slow. This is because +<tt class="docutils literal">revdb</tt> needs to replay again big parts of the log for some +operations.</li> +<li>The <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> is currently missing the following modules:<ul> +<li><tt class="docutils literal">thread</tt> (implementing multithreading is possible, but not done +yet);</li> +<li><tt class="docutils literal">cpyext</tt> (the CPython C API compatibility layer);</li> +<li><tt class="docutils literal">micronumpy</tt> (minor issue only);</li> +<li><tt class="docutils literal">_continuation</tt> (for greenlets).</li> +</ul> +</li> +<li>Does not contain a JIT, and does not use our fast garbage +collectors. You can expect <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> to be maybe 3 times +slower than CPython.</li> +<li>Only works on Linux and OS/X. There is no fundamental reason for +this restriction, but it is some work to fix.</li> +<li>Replaying a program uses a <em>lot</em> more memory; maybe 15x as much as +during the recording. This is because it creates many forks. 
If +you have a program that consumes 10% of your RAM or more, you will +need to reduce <tt class="docutils literal">MAX_SUBPROCESSES</tt> in <tt class="docutils literal">process.py</tt>.</li> +</ul> +<p>Replaying also comes with a bunch of user interface issues:</p> +<ul class="simple"> +<li><tt class="docutils literal">Attempted to do I/O or access raw memory</tt>: we get this whenever +trying to <tt class="docutils literal">print</tt> some expression that cannot be evaluated with +only the GC memory---or which can, but then the <tt class="docutils literal">__repr__()</tt> +method of the result cannot. We need to reset the state with +<tt class="docutils literal">bstep</tt> + <tt class="docutils literal">step</tt> before we can print anything else. However, +if only the <tt class="docutils literal">__repr__()</tt> crashes, you still see the <tt class="docutils literal">$NUM =</tt> +prefix, and you can use that <tt class="docutils literal">$NUM</tt> afterwards.</li> +<li><tt class="docutils literal">id()</tt> is globally unique, returning a reproducible 64-bit number, +so sometimes using <tt class="docutils literal">id(x)</tt> is a workaround for when using <tt class="docutils literal">x</tt> +doesn't work because of <tt class="docutils literal">Attempted to do I/O</tt> issues (e.g. <tt class="docutils literal">p +[id(x) for x in somelist]</tt>).</li> +<li>as explained in the demo, next/bnext/finish/bfinish might jump +around a bit non-predictably.</li> +<li>similarly, breaks on watchpoints can stop at apparently unexpected +places (when going backward, try to do "step" once). The issue is +that it can only stop at the beginning of every line. In the +extreme example, if a line is <tt class="docutils literal"><span class="pre">foo(somelist.pop(getindex()))</span></tt>, +then <tt class="docutils literal">somelist</tt> is modified in the middle. 
Immediately before +this modification occurs, we are in <tt class="docutils literal">getindex()</tt>, and +immediately afterwards we are in <tt class="docutils literal">foo()</tt>. The watchpoint will +stop the program at the end of <tt class="docutils literal">getindex()</tt> if running backward, +and at the start of <tt class="docutils literal">foo()</tt> if running forward, but never +actually on the line doing the change.</li> +<li>watchpoint expressions <em>must not</em> have any side-effect at all. If +they do, the replaying will get out of sync and <tt class="docutils literal">revdb.py</tt> will +complain about that. Regular <tt class="docutils literal">p</tt> expressions and statements can +have side-effects; these effects are discarded as soon as you move +in time again.</li> +<li>sometimes even "<tt class="docutils literal">p import foo</tt>" will fail with <tt class="docutils literal">Attempted to do +I/O</tt>. Use instead "<tt class="docutils literal">p import sys; foo = <span class="pre">sys.modules['foo']</span></tt>".</li> +<li>use <tt class="docutils literal">help</tt> to see all commands. <tt class="docutils literal">backtrace</tt> can be useful. +There is no <tt class="docutils literal">up</tt> command; you have to move in time instead, +e.g. using <tt class="docutils literal">bfinish</tt> to go back to the point where the current +function was called.</li> +</ul> +</div> +<div class="section" id="how-revpdb-is-done"> +<h3>How RevPDB is done</h3> +<p>If I had to pick the main advantage of PyPy over CPython, it is that +we have got with the RPython translation toolchain a real place for +experimentation. Every now and then, we build inside RPython some +feature that gives us an optionally tweaked version of the PyPy +interpreter---tweaked in a way that would be hard to do with CPython, +because it would require systematic changes everywhere. The most +obvious and successful examples are the GC and the JIT. 
But there +have been many other experiments along the same lines, from the +so-called <a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/tip/eu-report/D07.1_Massive_Parallelism_and_Translation_Aspects-2007-02-28.pdf">stackless transformation</a> in the early days, to the STM +version of PyPy.</p> +<p>RevPDB works in a similar way. It is a version of PyPy in which some +operations are systematically replaced with other operations.</p> +<p>To keep the log file at a reasonable size, we duplicate the content of +all GC objects during replaying---by repeating the same actions on +them, without writing anything in the log file. So that means that in +the <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> binary, the operations that do arithmetic or +read/write GC-managed memory are not modified. Most operations are +like that. However, the other operations, the ones that involve +either non-GC memory or calls to external C functions, are tweaked. +Each of these operations is replaced with code that works in two +modes, based on a global flag:</p> +<ul class="simple"> +<li>in "recording" mode, we log the result of the operation (but not the +arguments);</li> +<li>in "replaying" mode, we don't really do the operation at all, but +instead just fetch the result from the log.</li> +</ul> +<p>Hopefully, all remaining unmodified operations (arithmetic and GC +load/store) are completely deterministic. So during replaying, every +integer or non-GC pointer variable will have exactly the same value as +it had during recording. Interestingly, it means that if the +recording process had a big array in non-GC memory, then in the +replaying process, the array is not allocated at all; it is just +represented by the same address, but there is nothing there. When we +record "read item 123 from the array", we record the result of the +read (but not the "123"). 
When we replay, we're seeing again the same +"read item 123 from the array" operation. At that point, we don't +read anything; we just return the result from the log. Similarly, +when recording a "write" to the array, we record nothing (this write +operation has no result); so that when replaying, we redo nothing.</p> +<p>Note how that differs from anything managed by GC memory: GC objects +(including GC arrays) are really allocated, writes really occur, and +reads are redone. We don't touch the log in this case.</p> +</div> +<div class="section" id="other-reverse-debuggers-for-python"> +<h3>Other reverse debuggers for Python</h3> +<p>There are already some Python experiments about <a class="reference external" href="https://en.wikipedia.org/wiki/Debugger#Reverse_debugging">reverse debugging</a>. +This is also known as "omniscient debugging". However, I claim that +the result they get to is not very useful (for the purpose presented +here). How they work is typically by recording changes to some +objects, like lists and dictionaries, in addition to recording the +history of where your program passed through. However, the problem of +Python is that lists and dictionaries are not the end of the story. +There are many, many, many types of objects written in C which are +mutable---in fact, the immutable ones are the exception. You can try +to systematically record all changes, but it is a huge task and easy +to forget a detail.</p> +<p>In other words it is a typical use case for tweaking the RPython +translation toolchain, rather than tweaking the CPython (or PyPy) +interpreter directly. 
The result that we get here with RevPDB is more +similar to <a class="reference external" href="https://rr-project.org/">rr</a> anyway, in that only a relatively small number of +external events are recorded---not every single change to every single +list and dictionary.</p> +<p>Some links:</p> +<ul class="simple"> +<li>epdb: <a class="reference external" href="https://github.com/native-human/epdb">https://github.com/native-human/epdb</a></li> +<li>pode: <a class="reference external" href="https://github.com/rodsenra/pode">https://github.com/rodsenra/pode</a></li> +</ul> +<p>For C:</p> +<ul class="simple"> +<li>rr: <a class="reference external" href="https://rr-project.org/">https://rr-project.org/</a></li> +<li>undodb-gdb: <a class="reference external" href="https://undo.io/">https://undo.io/</a></li> +</ul> +</div> +<div class="section" id="future-work"> +<h3>Future work</h3> +<p>As mentioned above, it is alpha-level, and only works on Linux and OS/X. +So the plans for the immediate future are to fix the various +issues described above, and port to more operating systems. The core of the system +is in the C file and headers in <tt class="docutils literal"><span class="pre">rpython/translator/revdb/src-revdb</span></tt>.</p> +<p>For interested people, there is also the <a class="reference external" href="https://bitbucket.org/pypy/duhton/">Duhton</a> interpreter and its +<tt class="docutils literal"><span class="pre">reverse-debugger</span></tt> branch, which is where I prototyped the RPython +concept before moving to PyPy. The basics should work for any +interpreter written in RPython, but they require some specific code to +interface with the language; in the case of PyPy, it is in +<tt class="docutils literal">pypy/interpreter/reverse_debugging.py</tt>.</p> +<p>In parallel, there are various user interface improvements that people +could be interested in, like a more "pdb++" experience. 
(And the script +at <tt class="docutils literal">rpython/translator/revdb/revdb.py</tt> should be moved out into some +more "official" place, and the <tt class="docutils literal"><span class="pre">reverse-debugger</span></tt> branch should be +merged back to default.)</p> +<p>I would certainly welcome any help!</p> +<p>-+- Armin</p> +</div>revdbhttps://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.htmlFri, 08 Jul 2016 11:39:00 GMT \ No newline at end of file diff --git a/authors/bea-during.html b/authors/bea-during.html new file mode 100644 index 000000000..b54dad058 --- /dev/null +++ b/authors/bea-during.html @@ -0,0 +1,113 @@ + + + + + +Posts by Bea Düring | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/bea-during.xml b/authors/bea-during.xml new file mode 100644 index 000000000..09fecbcb4 --- /dev/null +++ b/authors/bea-during.xml @@ -0,0 +1,34 @@ + +PyPy (Posts by Bea Düring)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssFunding of some recent progress by Google's Open Source Programshttps://www.pypy.org/posts/2008/06/pypy-improvements-5272963843122158791.htmlBea Düring<p>As readers of this blog already know, <a class="reference external" href="https://codespeak.net/pypy/dist/pypy/doc/home.html">PyPy</a> development has +recently focused on getting the code base to a more usable state. One +of the most important parts of this work was creating an +implementation of the <a class="reference external" href="https://docs.python.org/lib/module-ctypes.html">ctypes module</a> for PyPy, which +provides a realistic way to interface with external libraries. The +module is now fairly complete (if somewhat slow), and has generated a +great deal of community interest. One of the main reasons this work +progressed so well was that we received funding from <a class="reference external" href="https://code.google.com/opensource/">Google's Open +Source Programs Office</a>. This is +really fantastic for us, and we cannot thank Google and <a class="reference external" href="https://www.python.org/%7Eguido/">Guido</a> enough for helping PyPy progress +more rapidly than we could have with volunteer-only time!</p> +<p>This funding opportunity arose from the <a class="reference external" href="https://morepypy.blogspot.com/2007/11/pypy-road-show.html">PyPy US road trip</a> at the end +of last year, which included a <a class="reference external" href="https://www.pypy.org/posts/2007/11/pypy-google-tech-talk-9082134238390123890.html">visit to Google</a>. 
You +can check out <a class="reference external" href="https://www.youtube.com/watch?v=GnPmErtqPXk">the video</a> +of the talk we gave during our visit. We wrapped up our day with +discussions about the possibility of Google funding some PyPy work and +soon after we were at work on the proposal for improvements we'd +submitted.</p> +<p>One nice side-effect of the funding is indeed that we can use some of +the money for funding travels of contributors to our sprint meetings. +The next scheduled Google funding proposal also aims at making our +Python interpreter more usable and compliant with <a class="reference external" href="https://wiki.python.org/moin/CPython">CPython</a>. This will be done by trying to +fully run <a class="reference external" href="https://www.djangoproject.org/">Django</a> on top of PyPy. With +more efforts like this one we're hoping that PyPy can start to be used +as a CPython replacement before the end of 2008.</p> +<p>Many thanks to the teams at <a class="reference external" href="https://merlinux.de/">merlinux</a> and <a class="reference external" href="https://www.openend.se/">Open End</a> for making this development possible, including +Carl Friedrich Bolz, Antonio Cuni, Holger Krekel, Maciek Fijalkowski +at merlinux, Samuele Pedroni and yours truly at Open End.</p> +<p>We always love to hear feedback from the community, and you can get +the latest word on our development and let us know your thoughts <a class="reference external" href="https://morepypy.blogspot.com/">here in the comments</a>.</p> +<p>Bea Düring, Open End AB</p> + +<p>PS: Thanks Carl Friedrich Bolz for drafting this post.</p>https://www.pypy.org/posts/2008/06/pypy-improvements-5272963843122158791.htmlThu, 26 Jun 2008 11:41:00 GMT \ No newline at end of file diff --git a/authors/benjamin-peterson.html b/authors/benjamin-peterson.html new file mode 100644 index 000000000..c1707c890 --- /dev/null +++ b/authors/benjamin-peterson.html @@ -0,0 +1,113 @@ + + + + + +Posts by 
Benjamin Peterson | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/benjamin-peterson.xml b/authors/benjamin-peterson.xml new file mode 100644 index 000000000..1b222a541 --- /dev/null +++ b/authors/benjamin-peterson.xml @@ -0,0 +1,6 @@ + +PyPy (Posts by Benjamin Peterson)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy gets a new compilerhttps://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.htmlBenjamin Peterson<p>Today, I merged the parser-compiler branch, which I have been working on over the summer. It contained a total rewrite of both PyPy's Python parser and AST compiler. PyPy's old parser was (in)famous internally for being complicated and slow (with many algorithmic complexities greater than O(n)). The new parser is a simple as <a href="https://codespeak.net/viewvc/pypy/trunk/pypy/interpreter/pyparser/parser.py?view=markup">I could make it</a> LL(1) parser like CPython (though it doesn't share the hacks of CPython's parser).</p> + +<p>The new compiler is based on the <a href="https://doc.python.org/3.1/library/ast">Abstract Syntax Trees (AST) that CPython 2.5 introduced</a> instead of PyPy's old AST based on the <a href="https://doc.python.org/library/compiler">compiler package's</a>. This means that Python code running on PyPy will be able to use the same _ast interface as CPython. PyPy's _ast implementation supports AST features that CPython 2.6 added, including <a href="https://pythonic.pocoo.org/2008/3/29/ast-compilation-from-python">compiling modified AST to bytecode and executing it</a>. In this rewrite, some more obscure compiler features were added, too. For example, jumps in bytecode can now be greater than 65535 bytes! 
(That's like an if statement with 7000 lines of code in the body.)</p> + +<p>While the PyPy translation toolchain still has many obscure details and hacks, this merge completes the process of making the actual Python interpreter very clean. Hopefully, this will make adding new features much easier and make PyPy less frustrating to maintain as well as providing application level code with an improved AST interface!</p>compilerparserspeedhttps://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.htmlTue, 25 Aug 2009 16:05:00 GMT \ No newline at end of file diff --git a/authors/brian-kearns.html b/authors/brian-kearns.html new file mode 100644 index 000000000..6cf7d0b25 --- /dev/null +++ b/authors/brian-kearns.html @@ -0,0 +1,122 @@ + + + + + +Posts by Brian Kearns | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/brian-kearns.xml b/authors/brian-kearns.xml new file mode 100644 index 000000000..ad85794e7 --- /dev/null +++ b/authors/brian-kearns.xml @@ -0,0 +1,37 @@ + +PyPy (Posts by Brian Kearns)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:10 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssNumPy on PyPy - Status Updatehttps://www.pypy.org/posts/2014/04/numpy-on-pypy-status-update-1103134247318103282.htmlBrian Kearns<p>Work on NumPy on PyPy continued in March, though at a lighter pace than the previous few months. Progress was made on both compatibility and speed fronts. Several behavioral issues reported to the bug tracker were resolved. The most significant of these was probably the correction of casting to built-in Python types. Previously, int/long conversions of numpy scalars such as inf/nan/1e100 would return bogus results. Now, they raise or return values, as appropriate.<br> +<br> +On the speed front, enhancements to the PyPy JIT were made to support virtualizing the raw_store/raw_load memory operations used in numpy arrays. Further work remains here in virtualizing the alloc_raw_storage when possible. This will allow scalars to have storages but still be virtualized when possible in loops.<br> +<br> +Aside from continued work on compatibility/speed of existing code, we also hope to begin implementing the C-level components of other numpy modules such as mtrand, nditer, linalg, and so on. Several approaches could be taken to get C-level code in these modules working, ranging from reimplementing in RPython to interfacing with existing code with CFFI, if possible. The appropriate approach depends on many factors and will probably vary from module to module.<br><br>To try out PyPy + NumPy, grab a <a href="https://buildbot.pypy.org/nightly/trunk/">nightly PyPy</a> and install our <a href="https://bitbucket.org/pypy/numpy">NumPy fork</a>. 
Feel free to report comments/issues to IRC, our mailing list, or bug tracker. Thanks to the contributors to the <a href="https://pypy.org/numpydonate.html">NumPy on PyPy</a> proposal for supporting this work.</p>numpyhttps://www.pypy.org/posts/2014/04/numpy-on-pypy-status-update-1103134247318103282.htmlTue, 15 Apr 2014 20:08:00 GMTNumPy on PyPy - Progress in Februaryhttps://www.pypy.org/posts/2014/03/numpy-status-update-february-1245769841736493525.htmlBrian Kearns<p>More progress was made on the NumPy front in the past month. On the compatibility front, we now pass ~130 more tests from NumPy's suite since the end of January. Currently, we pass 2336 tests out of 3265 tests run, with many of the failures representing portions of NumPy that we don't plan to implement in the near future (object dtypes, unicode, etc). There are still some failures that do represent issues, such as special indexing cases and failures to respect subclassed ndarrays in return values, which we do plan to resolve. There are also some unimplemented components and ufuncs remaining which we hope to implement, such as nditer and mtrand. Overall, the most common array functionality should be working.<br> +<br> +Additionally, I began to take a look at some of the loops generated by our code. One widely used loop is dot, and we were running about 5x slower than NumPy's C version. I was able to optimize the dot loop and also the general array iterator to get us to ~1.5x NumPy C time on dot operations of various sizes. Further progress in this area could be made by using CFFI to tie into BLAS libraries, when available. Also, work remains in examining traces generated for our other loops and checking for potential optimizations.<br> +<br> +To try out PyPy + NumPy, grab a <a href="https://buildbot.pypy.org/nightly/trunk/">nightly PyPy</a> and install our <a href="https://bitbucket.org/pypy/numpy">NumPy fork</a>. Feel free to report comments/issues to IRC, our mailing list, or bug tracker. 
Thanks to the contributors to the <a href="https://pypy.org/numpydonate.html">NumPy on PyPy</a> proposal for supporting this work.<br> +<br> +Cheers,<br> +Brian</p>numpyhttps://www.pypy.org/posts/2014/03/numpy-status-update-february-1245769841736493525.htmlFri, 07 Mar 2014 05:05:00 GMTNumPy Status Update - December/Januaryhttps://www.pypy.org/posts/2014/02/numpy-status-update-decemberjanuary-4292961614234099787.htmlBrian Kearns<p>Work continued on the NumPy + PyPy front steadily in December and more lightly in January. The continued focus was compatibility, targeting incorrect or unimplemented features that appeared in multiple NumPy test suite failures. We now pass ~2/3 of the NumPy test suite. The biggest improvements were made in these areas:<br> +<br> +- Bugs in conversions of arrays/scalars to/from native types<br> +- Fix cases where we would choose incorrect dtypes when initializing or computing results<br> +- Improve handling of subclasses of ndarray through computations<br> +- Support some optional arguments for array methods that are used in the pure-python part of NumPy<br> +- Support additional attributes in arrays, array.flags, and dtypes<br> +- Fix some indexing corner cases that arise in NumPy testing<br> +- Implemented part of numpy.fft (cffti and cfftf)<br> +<br> +Looking forward, we plan to continue improving the correctness of the existing implemented NumPy functionality, while also beginning to look at performance. The initial focus for performance will be to look at areas where we are significantly worse than CPython+NumPy. Those interested in trying these improvements out will need a <a href="https://buildbot.pypy.org/nightly/trunk/">PyPy nightly</a>, and an install of the <a href="https://bitbucket.org/pypy/numpy">PyPy NumPy fork</a>. 
Thanks again to the <a href="https://pypy.org/numpydonate.html">NumPy on PyPy</a> donors for funding this work.</p>numpyhttps://www.pypy.org/posts/2014/02/numpy-status-update-decemberjanuary-4292961614234099787.htmlThu, 06 Feb 2014 19:06:00 GMTNumPy Status Update - Novemberhttps://www.pypy.org/posts/2013/12/numpy-status-update-november-364321959153372759.htmlBrian Kearns<p>Since the PyPy 2.2 release last month, more progress has been made on the NumPy compatibility front. Initial work has been directed by running the NumPy test suite and targeting failures that appear most frequently, along with fixing the few bugs reported on the bug tracker.<br> +<br> +Improvements were made in these areas:<br> +- Many missing/broken scalar functionalities were added/fixed. The scalar API should match up more closely with arrays now.<br> +- Some missing dtype functionality was added (newbyteorder, hasobject, descr, etc)<br> +- Support for optional arguments (axis, order) was added to some ndarray functions<br> +- Fixed some corner cases for string/record types<br> +<br> +Most of these improvements went onto trunk after 2.2 was split, so if you're interested in trying them out or running into problems on 2.2, try the +<a href="https://buildbot.pypy.org/nightly/trunk/">nightly</a>.<br> +<br> +Thanks again to the <a href="https://pypy.org/numpydonate.html">NumPy on PyPy</a> donors who make this continued progress possible.<br> +<br> +Cheers,<br> +Brian</p>numpyhttps://www.pypy.org/posts/2013/12/numpy-status-update-november-364321959153372759.htmlMon, 09 Dec 2013 23:05:00 GMT \ No newline at end of file diff --git a/authors/carl-friedrich-bolz-tereick.html b/authors/carl-friedrich-bolz-tereick.html new file mode 100644 index 000000000..70577b7a6 --- /dev/null +++ b/authors/carl-friedrich-bolz-tereick.html @@ -0,0 +1,428 @@ + + + + + +Posts by Carl Friedrich Bolz-Tereick | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+

Posts by Carl Friedrich Bolz-Tereick

+ +
+
+ + \ No newline at end of file diff --git a/authors/carl-friedrich-bolz-tereick.xml b/authors/carl-friedrich-bolz-tereick.xml new file mode 100644 index 000000000..86fe83910 --- /dev/null +++ b/authors/carl-friedrich-bolz-tereick.xml @@ -0,0 +1,3350 @@ + +PyPy (Posts by Carl Friedrich Bolz-Tereick)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssFixing a Bug in PyPy's Incremental GChttps://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.htmlCarl Friedrich Bolz-Tereick<h2 id="introduction">Introduction</h2> +<p>Since last summer, I've been looking on and off into a weird and hard to +reproduce <a href="https://github.com/pypy/pypy/issues/3959">crash bug in PyPy</a>. It was +manifesting only on CI, and it seemed to always happen in the AST rewriting +phase of <a href="https://pytest.org">pytest</a>, the symptoms being that PyPy would crash +with a segfault. All my attempts to reproduce it locally failed, and my +attempts to try to understand the problem by dumping the involved ASTs led +nowhere.</p> +<p>A few weeks ago, we got <a href="https://github.com/PyO3/pyo3/issues/3766">two more</a> +<a href="https://github.com/orgs/pypy/discussions/4923">bug reports</a>, the last one by +the authors of the <a href="https://nanobind.readthedocs.io/">nanobind</a> binding +generator, with the same symptoms: crash in AST rewriting, only on CI. I +decided to make a more serious push to try to find the bug this time. +Ultimately the problem turned out to be several bugs in PyPy's garbage +collector (GC) that had been there since its inception in +<a href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">2013</a>. +Understanding the +situation turned out to be quite involved, additionally complicated by this +being the first time that I was working on this particular aspect of PyPy's GC. 
+Since the bug was so much work to find, I thought I'd write a blog post about +it.</p> +<p>The blog post consists of three parts: first a chronological description of +what I did to find the bug, a technical explanation of what goes wrong, some +reflections on the bug (and then a bonus bug I also found in the process).</p> +<h2 id="finding-the-bug">Finding the Bug</h2> +<p>I started from the failing <a href="https://github.com/wjakob/nanobind/actions/runs/8234561874/job/22516568891">nanobind CI +runs</a> +that ended with a segfault of the PyPy interpreter. This was only an +intermittent problem, not every run was failing. When I tried to just run the +test suite locally, I couldn't get it to fail. Therefore at first I tried to +learn more about what was happening by looking on the CI runners.</p> +<h3 id="running-on-ci">Running on CI</h3> +<p>I forked the nanobind repo and hacked the CI script in order to get it to use a +PyPy build with <a href="https://doc.pypy.org/en/latest/build.html#making-a-debug-build-of-pypy">full debug information and more assertions turned on</a>. In order +to increase the probability of seeing the crash I added an otherwise unused +<a href="https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs">matrix</a> +variable to the CI script that just contained 32 parameters. This means every +build is done 32 times (sorry Github for wasting your CPUs 😕). 
With that +amount of repetition, I got at least one job of every build that was crashing.</p> +<p>Then I added the <code>-Xfaulthandler</code> option to the PyPy command which will use the +<a href="https://docs.python.org/3.11/library/faulthandler.html">faulthandler</a> module +to try to print a Python stacktrace if the VM segfaults to confirm that PyPy was +indeed crashing in the <a href="https://docs.python.org/3/library/ast.html">AST</a> +<a href="https://github.com/pytest-dev/pytest/blob/main/src/_pytest/assertion/rewrite.py">rewriting +phase</a> +of pytest, which pytest uses for <a href="https://docs.pytest.org/en/7.1.x/how-to/assert.html#asserting-with-the-assert-statement">nicer +assertions</a>. +I experimented with hacking our faulthandler implementation to also give me a +C-level callstack, but that didn't work as well as I hoped.</p> +<p>Then I tried to run <a href="https://sourceware.org/gdb/">gdb</a> on CI to try to get it +to print a C callstack at the crash point. You can get gdb to execute commands +as if typed at the prompt with the <code>-ex</code> commandline option, I used something +like this:</p> +<div class="code"><pre class="code literal-block">gdb -ex "set confirm off" -ex "set pagination off" -ex \ + "set debuginfod enabled off" -ex run -ex where -ex quit \ + --args &lt;command&gt; &lt;arguments&gt; +</pre></div> + +<p>But unfortunately the crash never occurred when running in gdb.</p> +<p>Afterwards I tried the next best thing, which was configuring the CI runner to +<a href="https://github.com/itamarst/gha-upload-cores">dump a core file and upload it as a build +artifact</a>, which worked. Looking +at the cores locally only sort of worked, because I am running a different +version of Ubuntu than the CI runners. So I used +<a href="https://mxschmitt.github.io/action-tmate/">tmate</a> to be able to log into the +CI runner after a crash and interactively used gdb there. 
Unfortunately what I +learned from that was that the bug was some kind of <strong>memory corruption</strong>, +which is always incredibly unpleasant to debug. Basically the header word of a +Python object had been corrupted somehow at the point of the crash, which means +that its <a href="https://en.wikipedia.org/wiki/Virtual_method_table">vtable</a> wasn't +usable any more.</p> +<p>(Sidenote: <a href="https://www.pypy.org/posts/2009/10/gc-improvements-6174120095428192954.html#unifying-the-vtable-ptr-with-the-gc-header">PyPy doesn't really use a vtable +pointer</a>, +instead it uses half a word in the header for the vtable, and the other half +for flags that the GC needs to keep track of the state of the object. +Corrupting all this is still bad.)</p> +<h3 id="reproducing-locally">Reproducing Locally</h3> +<p>At that point it was clear that I had to push to reproduce the problem on my +laptop, to allow me to work on the problem more directly and not to always have +to go via the CI runner. Memory corruption bugs often have a lot of randomness +(depending on which part of memory gets modified, things might crash or more +likely just happily keep running). Therefore I decided to try to brute-force +reproducing the crash by simply running the tests many many times. Since the +crash happened in the AST rewriting phase of pytest, and that happens only if +no <a href="https://stackoverflow.com/questions/2998215/if-python-is-interpreted-what-are-pyc-files">pyc +files</a> +of the bytecode-compiled rewritten ASTs exist, I made sure to delete them +before every test run.</p> +<p>To repeat the test runs I used +<a href="https://tratt.net/laurie/src/multitime/">multitime</a>, which is a simple program +that runs a command repeatedly. It's meant for lightweight benchmarking +purposes, but it also halts the execution of the command if that command exits +with an error (and it sleeps a small random time between runs, which might help +with randomizing the situation, maybe). 
Here's a demo:</p> +<script src="https://asciinema.org/a/648877.js" id="asciicast-648877" async="true"></script> + +<p>(<a href="https://bernsteinbear.com/">Max</a> pointed out +<a href="https://github.com/silentbicycle/autoclave">autoclave</a> to me when reviewing +this post, which is a more dedicated tool for this job.)</p> +<p>Thankfully, running the tests repeatedly eventually led to a crash, solving my +"only happens on CI" problem. I then tried various variants to exclude possible +sources of errors. The first source of errors to exclude in PyPy bugs is the +just-in-time compiler, so I reran the tests with <code>--jit off</code> to see whether I +could still get it to crash, and thankfully I eventually could (JIT bugs are +often very annoying).</p> +<p>Next source of bugs to exclude were C-extensions. Since those were the tests +of nanobind, a framework for creating C-extension modules I was a bit worried +that the bug might be in our emulation of CPython's C-API. But running PyPy +with the <code>-v</code> option (which will print all the imports as they happen) +confirmed that at the point of crash no C-extension had been imported yet.</p> +<h3 id="using-rr">Using <code>rr</code></h3> +<p>I still couldn't get the bug to happen in GDB, so the tool I tried next was +<a href="https://rr-project.org/">rr, the "reverse debugger"</a>. rr can record the execution of a program and +later replay it arbitrarily often. This gives you a time-traveling debugger +that allows you to execute the program backwards in addition to forwards. +Eventually I managed to get the crash to happen when running the tests with +<code>rr record --chaos</code> (<code>--chaos</code> randomizes some decisions that rr takes, to try to +increase the chance of reproducing bugs).</p> +<p>Using rr well is quite hard, and I'm not very good at it. 
The main approach I +use with rr to debug memory corruption is to replay the crash, then set a +<a href="https://sourceware.org/gdb/current/onlinedocs/gdb.html/Set-Watchpoints.html">watchpoint</a> +for the corrupted memory location, then use the command <code>reverse-continue</code> to +find the place in the code that mutated the memory location. <code>reverse-continue</code> +is like <code>continue</code>, except that it will execute the program backwards from the +current point. Here's a little demo of this:</p> +<script src="https://asciinema.org/a/648814.js" id="asciicast-648814" async="true"></script> + +<p>Doing this for my bug revealed that the object that was being corrupted was +erroneously collected by the garbage collector. For some reason the GC had +wrongly decided that the object was no longer reachable and therefore put the +object into a freelist by writing a pointer to the next entry in the freelist +into the first word of the object, overwriting the object's header. The next +time the object was used things crashed.</p> +<h3 id="side-quest-wrong-gc-assertions">Side-quest: wrong GC assertions</h3> +<p>At this point in the process, I got massively side-tracked. PyPy's GC has a +number of debug modes that you can optionally turn on. Those slow down the +program execution a lot, but they should in theory help to understand why the +GC goes wrong. When I turned them on, I was getting a failing assertion really +early in the test execution, complaining about an invariant violation in the GC +logic. At first this made me very happy. I thought that this would help me fix +the bug more quickly.</p> +<p>Extremely frustratingly, after two days of work I concluded that the assertion +logic itself was wrong. 
I have fixed that in the meantime too, the details +of that are in the bonus section at the end of the post.</p> +<h3 id="using-gdb-scripting-to-find-the-real-bug">Using GDB scripting to find the real bug</h3> +<p>After that disaster I went back to the earlier rr recording without GC assertions +and tried to understand in more detail why the GC decided to free an object +that was still being referenced. To be able to do that I used the <a href="https://sourceware.org/gdb/current/onlinedocs/gdb.html/Python-API.html">GDB Python +scripting +API</a> to +write some helper commands to understand the state of the GC heap (rr is an +extension of GDB, so the GDB scripting API works in rr too).</p> +<p>The first (small) helper command I wrote with the GDB scripting API was a way +to pretty-print the currently active GC flags of a random PyPy object, starting +just from the pointer. The more complex command I wrote was an object tracer, +which follows pointers to GC objects starting from a root object to explore the +object graph. The object tracer isn't complete, it doesn't deal with all the +complexities of PyPy's GC. 
But it was good enough to help me with my problem, I +found out that the corrupted object was stored in an array.</p> +<p>As an example, here's a function that uses the GDB API to walk one of the +helper data structures of the GC, a stack of pointers:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">walk_addr_stack</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" walk an instance of the AddressStack class (which is a linked list of</span> +<span class="sd"> arrays of 1019 pointers).</span> + +<span class="sd"> the first of the arrays is only partially filled with used_in_last_chunk</span> +<span class="sd"> items, all the other chunks are full."""</span> + <span class="k">if</span> <span class="n">obj</span><span class="o">.</span><span class="n">type</span><span class="o">.</span><span class="n">code</span> <span class="o">==</span> <span class="n">gdb</span><span class="o">.</span><span class="n">TYPE_CODE_PTR</span><span class="p">:</span> + <span class="n">obj</span> <span class="o">=</span> <span class="n">obj</span><span class="o">.</span><span class="n">dereference</span><span class="p">()</span> + <span class="n">used_in_last_chunk</span> <span class="o">=</span> <span class="n">lookup</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="s2">"used_in_last_chunk"</span><span class="p">)</span> + <span class="n">chunk</span> <span class="o">=</span> <span class="n">lookup</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="s2">"inst_chunk"</span><span class="p">)</span><span class="o">.</span><span class="n">dereference</span><span class="p">()</span> + <span class="k">while</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">items</span> <span class="o">=</span> <span class="n">lookup</span><span class="p">(</span><span 
class="n">chunk</span><span class="p">,</span> <span class="s2">"items"</span><span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">used_in_last_chunk</span><span class="p">):</span> + <span class="k">yield</span> <span class="n">items</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> + <span class="n">chunk</span> <span class="o">=</span> <span class="n">lookup</span><span class="p">(</span><span class="n">chunk</span><span class="p">,</span> <span class="s2">"next"</span><span class="p">)</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">chunk</span><span class="p">:</span> + <span class="k">break</span> + <span class="n">chunk</span> <span class="o">=</span> <span class="n">chunk</span><span class="o">.</span><span class="n">dereference</span><span class="p">()</span> + <span class="n">used_in_last_chunk</span> <span class="o">=</span> <span class="mi">1019</span> +</pre></div> + +<p>The full file of supporting code I wrote can be found in <a href="https://gist.github.com/cfbolz/13cadcbbef321d93fc9790dff6f60a6a">this +gist</a>. This is +pretty rough throw-away code, however.</p> +<p>In the following recording I show a staged debugging session with some of the +extra commands I wrote with the Python API. The details aren't important, I +just wanted to give a bit of a flavor of what inspecting objects looks like:</p> +<script src="https://asciinema.org/a/648889.js" id="asciicast-648889" async="true"></script> + +<p>The next step was to understand why the array content wasn't being correctly +traced by the GC, which I eventually managed with some <a href="https://www.fayewilliams.com/2011/07/13/gdb-conditional-breakpoints/">conditional +breakpoints</a>, +more watchpoints, and using <code>reverse-continue</code>. 
It turned out to be a bug that +occurs when the content of one array is memcopied into another array. The +technical details of why the array wasn't traced correctly are described in +detail in the section on the technical details of the bug below.</p> +<h3 id="writing-a-unit-test">Writing a unit test</h3> +<p>To try to make sure I really understood the bug correctly I then wrote a GC +unit test that shows the problem. Like most of PyPy, our GC is written in +RPython, a (somewhat strange) subset/dialect of Python2, which can be compiled +to C code. However, since it is also valid Python2 code, it can be <a href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html">unit-tested +on top of a Python2 +implementation</a> +(which is one of the reasons why we keep maintaining PyPy2).</p> +<p>In the GC unit tests you have a lot of control over what order things happen +in, e.g. how objects are allocated, when garbage collection phases happen, etc. +After some trying I managed to write a test that crashes with the same kind of +memory corruption that my original crash exhibited: an object that is still +reachable via an array is collected by the GC. 
To give you a flavor of what +this kind of test looks like, here's an (edited for clarity) version of the +test I eventually managed to write</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_incrementality_bug_arraycopy</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="n">source</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">malloc</span><span class="p">(</span><span class="n">VAR</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span> <span class="c1"># first array</span> + <span class="c1"># the stackroots list emulates the C stack</span> + <span class="bp">self</span><span class="o">.</span><span class="n">stackroots</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">source</span><span class="p">)</span> + <span class="n">target</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">malloc</span><span class="p">(</span><span class="n">VAR</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span> <span class="c1"># second array</span> + <span class="bp">self</span><span class="o">.</span><span class="n">stackroots</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">target</span><span class="p">)</span> + <span class="n">node</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">malloc</span><span class="p">(</span><span class="n">S</span><span class="p">)</span> <span class="c1"># unrelated object, will be collected</span> + <span class="n">node</span><span class="o">.</span><span class="n">x</span> <span class="o">=</span> <span class="mi">5</span> + <span class="c1"># store reference into source array, calling the write barrier</span> + <span class="bp">self</span><span 
class="o">.</span><span class="n">writearray</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">node</span><span class="p">)</span> + <span class="n">val</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">collect_step</span><span class="p">()</span> + <span class="n">source</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">stackroots</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># reload arrays, they might have moved</span> + <span class="n">target</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">stackroots</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> + <span class="c1"># this GC step traces target</span> + <span class="n">val</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">collect_step</span><span class="p">()</span> + + <span class="c1"># emulate what a memcopy of arrays does</span> + <span class="n">res</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">writebarrier_before_copy</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">target</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">res</span> + <span class="n">target</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">=</span> <span class="n">source</span><span 
class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># copy two elements of the arrays</span> + <span class="n">target</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="n">source</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> + <span class="c1"># now overwrite the reference to node in source</span> + <span class="bp">self</span><span class="o">.</span><span class="n">writearray</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">lltype</span><span class="o">.</span><span class="n">nullptr</span><span class="p">(</span><span class="n">S</span><span class="p">))</span> + <span class="c1"># this GC step traces source</span> + <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">collect_step</span><span class="p">()</span> + <span class="c1"># some more collection steps, crucially target isn't traced again</span> + <span class="c1"># but node is deleted</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">collect_step</span><span class="p">()</span> + <span class="c1"># used to crash, node got collected</span> + <span class="k">assert</span> <span class="n">target</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">x</span> <span class="o">==</span> <span class="mi">5</span> +</pre></div> + +<p>One of the good properties of testing our GC that way is that all the memory is +emulated. 
The crash in the last line of the test isn't a segfault at all, +instead you get a nice exception saying that you tried to access a freed chunk +of memory and you can then debug this with a python2 debugger.</p> +<h3 id="fixing-the-bug">Fixing the Bug</h3> +<p>With the unit test in hand, fixing the bug was relatively straightforward (the +diff in its simplest form is anyway only a <a href="https://github.com/pypy/pypy/commit/78bbeb93471b5f38438004e971f4b4f84ab17a84">single line +change</a>). +After this first version of my fix, I +<a href="https://github.com/pypy/pypy/issues/4925#issuecomment-2014459454">talked to Armin +Rigo</a> who +helped me find a different case that was still wrong, in the same area of the +code.</p> +<p>I also got help from the developers at <a href="https://portaone.com/">PortaOne</a> +who are using PyPy on their servers and had seen some <a href="https://github.com/pypy/pypy/issues/4900">mysterious PyPy +crashes</a> +recently, that looked related to the GC. They did test deployments of my fixes +in their various stages to their servers to try to see whether stability +improved for them. Unfortunately in the end it turned out that their crashes +are an unrelated GC bug related to object pinning, which we haven't resolved +yet.</p> +<h3 id="writing-a-gc-fuzzerproperty-based-test">Writing a GC fuzzer/property based test</h3> +<p>Finding bugs in the GC is always extremely disconcerting, particularly since +this one managed to hide for so long (more than ten years!). Therefore I wanted +to use these bugs as motivation to try to find more problems in PyPy's GC. Given +the ridiculous effectiveness of fuzzing, I used +<a href="https://hypothesis.readthedocs.io/en/latest/">hypothesis</a> to write a +property-based test. 
Every test performs a sequence of randomly chosen steps +from the following list:</p> +<ul> +<li>allocate an object</li> +<li>read a random field from a random object</li> +<li>write a random reference into a random object</li> +<li>drop a random stack reference</li> +<li>perform one GC step</li> +<li>allocate an array</li> +<li>read a random index from a random array</li> +<li>write to an array</li> +<li>memcopy between two arrays</li> +</ul> +<p>This approach of doing a sequence of steps is pretty close to the <a href="https://hypothesis.readthedocs.io/en/latest/stateful.html">stateful +testing</a> approach of +hypothesis, but I just implemented it manually with the <a href="https://hypothesis.readthedocs.io/en/latest/data.html#drawing-interactively-in-tests">data +strategy</a>.</p> +<p>Every one of those steps is always performed on both the tested GC, and on some +regular Python objects. The Python objects provide the "ground truth" of what +the heap should look like, so we can compare the state of the GC objects +with the state of the Python objects to find out whether the GC made a mistake.</p> +<p>In order to check whether the test is actually useful, I reverted my bug fixes +and made sure that the test re-finds both the spurious GC assertion error and the +problems with memcopying an array.</p> +<p>In addition, the test also found corner cases in my fix. There was a situation +that I hadn't accounted for, which the test eventually found. 
+I also plan on adding a bunch of other GC features as steps in the +test to stress them too (for example weakrefs, identity hashes, pinning, maybe +finalization).</p> +<p>At the point of publishing this post, the fixes got merged to the 2.7/3.9/3.10 +branches of PyPy, and will be part of the next release (v7.3.16).</p> +<h2 id="the-technical-details-of-the-bug">The technical details of the bug</h2> +<p>In order to understand the technical details of the bug, I need to give some +background explanations about PyPy's GC.</p> +<h3 id="pypys-incremental-gc">PyPy's incremental GC</h3> +<p>PyPy uses an incremental generational mark-sweep GC. It's +<a href="https://en.wikipedia.org/wiki/Tracing_garbage_collection#Generational_GC_(ephemeral_GC)">generational</a> +and therefore has minor collections (where only young objects get collected) +and major collections (collecting long-lived objects eventually, using a +<a href="https://en.wikipedia.org/wiki/Tracing_garbage_collection#Na%C3%AFve_mark-and-sweep">mark-and-sweep</a> +algorithm). Young objects are allocated in a nursery using a +bump-pointer allocator, which makes allocation quite efficient. They are moved +out of the nursery by minor collections. In order to find references from old +to young objects the GC uses a write barrier to detect writes into old objects.</p> +<p>The GC is also +<a href="https://en.wikipedia.org/wiki/Tracing_garbage_collection#Stop-the-world_vs._incremental_vs._concurrent">incremental</a>, +which means that its major collections aren't done all at once (which would +lead to long pauses). 
Instead, major collections are sliced up into small +steps, which are done directly after a minor collection (the GC isn't +<em>concurrent</em> though, which would mean that the GC does work in a separate +thread).</p> +<p>The incremental GC uses <a href="https://en.wikipedia.org/wiki/Tracing_garbage_collection#Tri-color_marking">tri-color +marking</a> +to reason about the reachable part of the heap during the marking phase, where +every old object can be:</p> +<ul> +<li>black: already marked, reachable, definitely survives the collection</li> +<li>grey: will survive, but still needs to be marked</li> +<li>white: potentially dead</li> +</ul> +<p>The color of every object is encoded by setting flags +in the object header.</p> +<p>The GC maintains the <strong>invariant</strong> that black objects must never point to white +objects. At the start of a major collection cycle the stack roots are turned +gray. During the mark phase of a major collection cycle, the GC will trace gray +objects, until +none are left. To trace a gray object, all the objects it references have to be +marked grey if they are white so far. After a grey object is traced, it can be +marked black (because all the referenced objects are now either black or gray). +Eventually, there are no gray objects left. At that point (because no white +object can be reached from a black one) all the white objects are known to be +unreachable and can therefore be freed.</p> +<p>The GC is incremental because every collection step will only trace a limited +number of gray objects, before giving control back to the program. This leads to +a problem: if an already traced (black) object is changed between two marking +steps of the GC, the program can mutate that object and write a new reference +into one of its fields. This could lead to an invariant violation, if the +referenced object is white. 
Therefore, the GC uses the write barrier (which it +needs anyway to find references from old to young objects) to mark all black +objects that are modified gray, and then trace them again at one of the +later collection steps.</p> +<h3 id="the-special-write-barrier-of-memcopy">The special write barrier of memcopy</h3> +<p>Arrays use a different kind of write barrier than normal objects. Since they +can be arbitrarily large, tracing them can take a long time. Therefore it's +potentially wasteful to trace them fully at a minor collection. To fix this, +the array write barrier keeps more granular information about which parts of +the array have been modified since the last collection step. Then only the +modified parts of the array need to be traced, not the whole array.</p> +<p>In addition, there is another optimization for arrays, which is that memcopy is +treated specially by the GC. If memcopy is implemented by simply writing a loop +that copies the content of one array to the other, that will invoke the write +barrier every single loop iteration for the write of every array element, +costing a lot of overhead. 
Here's some pseudo-code:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">arraycopy</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">dest</span><span class="p">,</span> <span class="n">source_start</span><span class="p">,</span> <span class="n">dest_start</span><span class="p">,</span> <span class="n">length</span><span class="p">):</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">length</span><span class="p">):</span> + <span class="n">value</span> <span class="o">=</span> <span class="n">source</span><span class="p">[</span><span class="n">source_start</span> <span class="o">+</span> <span class="n">i</span><span class="p">]</span> + <span class="n">dest</span><span class="p">[</span><span class="n">dest_start</span> <span class="o">+</span> <span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span> <span class="c1"># &lt;- write barrier inserted here</span> +</pre></div> + +<p>Therefore the GC has a special memcopy-specific +write barrier that will perform the GC logic once before the memcopy loop, and +then use a regular (typically SIMD-optimized) memcopy implementation from +<code>libc</code>. 
Roughly like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">arraycopy</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">dest</span><span class="p">,</span> <span class="n">source_start</span><span class="p">,</span> <span class="n">dest_start</span><span class="p">,</span> <span class="n">length</span><span class="p">):</span> + <span class="n">gc_writebarrier_before_array_copy</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">dest</span><span class="p">,</span> <span class="n">source_start</span><span class="p">,</span> <span class="n">dest_start</span><span class="p">,</span> <span class="n">length</span><span class="p">)</span> + <span class="n">raw_memcopy</span><span class="p">(</span><span class="n">cast_to_voidp</span><span class="p">(</span><span class="n">source</span><span class="p">)</span> <span class="o">+</span> <span class="n">source_start</span><span class="p">,</span> + <span class="n">cast_to_voidp</span><span class="p">(</span><span class="n">dest</span><span class="p">)</span> <span class="o">+</span> <span class="n">dest_start</span><span class="p">,</span> + <span class="n">sizeof</span><span class="p">(</span><span class="n">itemtype</span><span class="p">(</span><span class="n">source</span><span class="p">))</span> <span class="o">*</span> <span class="n">length</span><span class="p">)</span> +</pre></div> + +<p>(this is really a rough sketch. The <a href="https://github.com/pypy/pypy/blob/789f964fff59c722b0872abcdc56d2b1373a9f3b/rpython/rlib/rgc.py#L365">real +code</a> +is much more complicated.)</p> +<h3 id="the-bug">The bug</h3> +<p>The bugs turned out to be precisely in this memcopy write barrier. When we +implemented the current GC, we adapted our previous GC, which was a +generational mark-sweep GC but <em>not</em> incremental. 
We started with most of the +previous GC's code, including the write barriers. The regular write barriers +were adapted to the new incremental assumptions, in particular the need for the +write barrier to also turn black objects back to gray when they are modified +during a marking phase. This was simply not done at all for the memcopy write +barrier, at least in two of the code paths. Fixing this problem fixes the unit +tests and stops the crashes.</p> +<h2 id="reflections">Reflections</h2> +<p>The way the bug was introduced is really typical. A piece of code (the memcopy +write barrier) was written under a set of assumptions. Then those assumptions +changed later. Not all the code pieces that relied on these assumptions to be +correct were updated. It's pretty hard to prevent this in all situations.</p> +<p>I still think we could have done more to prevent the bug occurring. Writing a +property-based test for the GC would have been a good idea given the complexity +of the GC, and definitely something we did in other parts of our code at the +time (just using the <code>random</code> module mostly, we started using hypothesis +later).</p> +<p>It's a bit of a mystery to me why this bug managed to be undetected for so +long. Memcopy happens in a lot of pretty core operations of e.g. lists in +Python (<code>list.extend</code>, to name just one example). 
To speculate, I would suspect +that all the other preconditions for the bug occurring made it pretty rare:</p> +<ul> +<li>the content of an old list that is not yet marked needs to be copied into + another old list that is marked already</li> +<li>the source of the copy needs to also store an object that has no other + references</li> +<li>the source of the copy then needs to be overwritten with other data</li> +<li>then the next collection steps need to be happening at the right points</li> +<li>...</li> +</ul> +<p>Given the complexity of the GC logic I also wonder whether some lightweight +formal methods would have been a good idea. Formalizing some of the core +invariants in <a href="https://en.wikipedia.org/wiki/B-Method">B</a> or +<a href="https://en.wikipedia.org/wiki/TLA%2B">TLA+</a> and then <a href="https://en.wikipedia.org/wiki/Model_checking">model +checking</a> them up to some number +of +objects would have found this problem pretty quickly. There are also correctness +proofs for GC algorithms in some research papers, but I don't have a good +overview of the literature to point to any that are particularly good or bad. +Going such a more formal route might have fixed this and probably a whole bunch +of other bugs, but of course it's a pretty expensive (and tedious) approach.</p> +<p>While it was super annoying to track this down, it was definitely good to learn +a bit more about how to use rr and the GDB scripting interface.</p> +<h2 id="bonus-section-the-wrong-assertion">Bonus Section: The Wrong Assertion</h2> +<p>Some more technical information about the wrong assertion is in this section.</p> +<h3 id="background-pre-built-objects">Background: pre-built objects</h3> +<p>PyPy's VM-building bootstrapping process can "freeze" a bunch of heap objects +into the final binary. 
This allows the VM to start up quickly, because those +frozen objects are loaded by the OS as part of the binary.</p> +<p>Those frozen pre-built objects are parts of the 'roots' of the garbage +collector and need to be traced. However, tracing all the pre-built objects at +every collection would be very expensive, because there are a lot of them +(about 150,000 in a PyPy 3.10 binary). Tracing them all is also not necessary, +because most of them are never modified. Unmodified pre-built objects can only reference +other pre-built objects, which can never be deallocated anyway. Therefore we +have an optimization that uses the write barrier (which we need anyway to find +old-to-young pointers) to notice when a pre-built object gets modified for the +very first time. If that happens, it gets added to the set of pre-built objects +that gets counted as a root, and is traced as a root at collections +from then on.</p> +<h3 id="the-wrong-assertion">The wrong assertion</h3> +<p>The assertion that triggered when I turned on the GC debug mode was saying that +the GC found a reference from a black to a white object, violating its +invariant. Unmodified pre-built objects count as black, and they aren't roots, +because they can only ever reference other pre-built objects. However, when a +pre-built object gets modified for the first time, it becomes part of the root +set and will be marked gray. This logic works fine.</p> +<p>The wrong assertion triggers if a pre-built object is mutated for the very +first time in the middle of an incremental marking phase. While the pre-built +object gets added to the root set just fine, and will get traced before the +marking phase ends, this is encoded slightly differently for pre-built objects, +compared to "regular" old objects. 
Therefore, the invariant checking code +wrongly reported a black-&gt;white pointer in this situation.</p> +<p>To fix it I also wrote a unit test checking the problem, made sure that the GC +hypothesis test also found the bug, and then fixed the wrong assertion to take +the color encoding of pre-built objects into account.</p> +<p>The bug managed to be invisible because we don't tend to turn on the GC +assertions very often. We only do that when we find a GC bug, which is of +course also when we need it the most to be correct.</p> +<h2 id="acknowledgements">Acknowledgements</h2> +<p>Thanks to Matti Picus, Max Bernstein, Wouter van Heyst for giving me feedback on drafts of the +post. Thanks to Armin Rigo for reviewing the code and pointing out holes in my +thinking. Thanks to the original reporters of the various forms of the bug, +including Lily Foote, David Hewitt, Wenzel Jakob.</p>https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.htmlTue, 26 Mar 2024 19:14:09 GMTRPython-based emulator speeds up RISC-V simulation over 15xhttps://www.pypy.org/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.htmlCarl Friedrich Bolz-Tereick<p>In cooperation with <a class="reference external" href="https://riscv.org/">RISC-V International</a>, who funded a part of this project, +we recently created a workflow to +use RPython to take a <a class="reference external" href="https://github.com/riscv/sail-riscv#riscv-sail-model">Sail RISC-V</a> model and automatically create a RISC-V ISA +emulator from it, which we call <a class="reference external" href="https://docs.pydrofoil.org">Pydrofoil</a>. The simulator sped up booting a +linux emulator from 35 minutes (using the standard Sail-generated emulator in +C) to 2 minutes, a speedup of 17.5x. 
More details about the process are in the +<a class="reference external" href="https://riscv.org/blog/2023/05/how-to-speed-up-the-emulating-process-with-pydrofoil-carl-friedrich/">RISC-V blog post</a>.</p> +<p>A few take-aways from the project:</p> +<ul class="simple"> +<li><p>While PyPy has shown it can speed up generic python code <a class="reference external" href="https://speed.pypy.org">about 4x</a>, the +technology behind PyPy can really shine in other areas.</p></li> +<li><p>RPython is malleable and can be molded to many tasks, the RPython meta-JIT is +very flexible.</p></li> +<li><p>A JIT is well-suited for the problem of emulation, because it can +perform dynamic binary translation.</p></li> +</ul> +<p>PyPy can solve real world performance problems, even somewhat unusual ones. +Please <a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">get in touch</a> and let us know how we can help you solve yours!</p>casestudyperformancehttps://www.pypy.org/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.htmlTue, 16 May 2023 11:22:35 GMTRepeated string concatenation is quadratic in PyPy (and CPython)https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.htmlCarl Friedrich Bolz-Tereick<p>This is a super brief blog post responding to an <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3885">issue</a> that we got on the PyPy +issue tracker. I am moving my response to the blog (with permission of the +submitter) to have a post to point to, since it's a problem that comes up with +some regularity. It's also documented on our page of <a class="reference external" href="https://doc.pypy.org/en/latest/cpython_differences.html?highlight=join#performance-differences">differences between PyPy +and CPython</a> but I thought an additional blog post might be good.</p> +<p>The issue pointed out that a small program that operates on strings is much +slower on PyPy compared to CPython. 
The program is a solution for 2016's +Advent of Code <a class="reference external" href="https://adventofcode.com/2016/day/16">Day 16</a> and looks like this:</p> +<div class="code"><pre class="code python"><a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-1" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-1" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-1"></a><span class="k">def</span> <span class="nf">dragon</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-2" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-2" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-2"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">a</span><span class="p">[::</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'0'</span><span class="p">,</span><span class="s1">'r'</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'1'</span><span class="p">,</span><span class="s1">'0'</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'r'</span><span class="p">,</span><span class="s1">'1'</span><span class="p">)</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-3" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-3" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-3"></a> <span class="k">return</span> <span class="n">a</span><span class="o">+</span><span class="s1">'0'</span><span class="o">+</span><span class="n">b</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-4" 
name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-4" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-4"></a> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-5" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-5" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-5"></a><span class="k">def</span> <span class="nf">diffstr</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-6" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-6" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-6"></a> <span class="n">b</span> <span class="o">=</span> <span class="s2">""</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-7" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-7" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-7"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="nb">len</span><span class="p">(</span><span class="n">a</span><span class="p">),</span><span class="mi">2</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-8" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-8" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-8"></a> <span class="n">b</span> <span class="o">+=</span> <span class="p">[</span><span class="s1">'0'</span><span class="p">,</span><span class="s1">'1'</span><span class="p">][</span><span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">a</span><span 
class="p">[</span><span class="n">i</span><span class="o">+</span><span class="mi">1</span><span class="p">]]</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-9" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-9" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-9"></a> <span class="k">return</span> <span class="n">b</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-10" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-10" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-10"></a> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-11" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-11" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-11"></a><span class="k">def</span> <span class="nf">iterdiff</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-12" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-12" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-12"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">a</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-13" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-13" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-13"></a> <span class="k">while</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">b</span><span class="p">)</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-14" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-14" 
href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-14"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">diffstr</span><span class="p">(</span><span class="n">b</span><span class="p">)</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-15" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-15" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-15"></a> <span class="k">return</span> <span class="n">b</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-16" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-16" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-16"></a> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-17" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-17" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-17"></a><span class="n">size</span> <span class="o">=</span> <span class="mi">35651584</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-18" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-18" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-18"></a><span class="n">initstate</span> <span class="o">=</span> <span class="s1">'10010000000110000'</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-19" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-19" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-19"></a><span class="k">while</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">initstate</span><span class="p">)</span> <span class="o">&lt;</span> <span class="n">size</span><span class="p">):</span> +<a 
id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-20" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-20" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-20"></a> <span class="n">initstate</span> <span class="o">=</span> <span class="n">dragon</span><span class="p">(</span><span class="n">initstate</span><span class="p">)</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-21" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-21" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-21"></a><span class="n">initstate</span> <span class="o">=</span> <span class="n">initstate</span><span class="p">[:</span><span class="n">size</span><span class="p">]</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-22" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-22" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-22"></a><span class="nb">print</span><span class="p">(</span><span class="n">iterdiff</span><span class="p">(</span><span class="n">initstate</span><span class="p">))</span> +</pre></div> +<p>The submitter pointed out, that the program is fast on CPython (~8s on my +laptop) and slow (didn't finish) on PyPy.</p> +<p>The reason for the performance difference is that <code class="docutils literal">+=</code> on strings in a loop +has quadratic complexity in PyPy, which is what <code class="docutils literal">diffstr</code> does. To see the +quadraticness, consider that to add a character at the end of the string, the +beginning of the string needs to be copied into a new chunk of memory. If the +loop runs <code class="docutils literal">n</code> times, that means there are</p> +<p><code class="docutils literal">1 + 2 + 3 + ... 
+ n = n * (n + 1) // 2</code></p> +<p>character copies.</p> +<p>Repeated string concatenations are in principle also quadratic in CPython, but +CPython has an <a class="reference external" href="https://docs.python.org/2/whatsnew/2.4.html#optimizations">optimization</a> that makes them sometimes not quadratic, which is +what makes this program not too slow in CPython.</p> +<p>In order to fix the problem on PyPy it's best to use a list for the string +parts, which has the right amortized O(1) complexity for <code class="docutils literal">.append</code> calls, and +then use <code class="docutils literal">str.join</code> after the loop:</p> +<div class="code"><pre class="code python"><a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-1" name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-1" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-1"></a><span class="k">def</span> <span class="nf">diffstr</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-2" name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-2" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-2"></a> <span class="n">b</span> <span class="o">=</span> <span class="p">[]</span> +<a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-3" name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-3" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-3"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="nb">len</span><span class="p">(</span><span class="n">a</span><span class="p">),</span><span class="mi">2</span><span class="p">):</span> +<a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-4" 
name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-4" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-4"></a> <span class="n">b</span><span class="o">.</span><span class="n">append</span><span class="p">([</span><span class="s1">'0'</span><span class="p">,</span><span class="s1">'1'</span><span class="p">][</span><span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="o">+</span><span class="mi">1</span><span class="p">]])</span> +<a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-5" name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-5" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-5"></a> <span class="k">return</span> <span class="s2">""</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">b</span><span class="p">)</span> +</pre></div> +<p>With this change the program becomes a little bit faster on CPython for me, and +on PyPy it stops being quadratic and runs in ~3.5s.</p> +<p>In general, it's best not to rely on the presence of this optimization in +CPython either. Sometimes, a small innocent-looking change will break CPython's +optimization. E.g. 
this useless change makes CPython also take ages:</p> +<div class="code"><pre class="code python"><a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-1" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-1" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-1"></a><span class="k">def</span> <span class="nf">diffstr</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-2" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-2" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-2"></a> <span class="n">b</span> <span class="o">=</span> <span class="s2">""</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-3" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-3" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-3"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="nb">len</span><span class="p">(</span><span class="n">a</span><span class="p">),</span><span class="mi">2</span><span class="p">):</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-4" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-4" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-4"></a> <span class="n">b</span> <span class="o">+=</span> <span class="p">[</span><span class="s1">'0'</span><span class="p">,</span><span class="s1">'1'</span><span class="p">][</span><span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="o">+</span><span 
class="mi">1</span><span class="p">]]</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-5" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-5" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-5"></a> <span class="n">c</span> <span class="o">=</span> <span class="n">b</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-6" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-6" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-6"></a> <span class="k">return</span> <span class="n">b</span> +</pre></div> +<p>The reason why this change breaks the optimization in CPython is that it only +triggers if the reference count of <code class="docutils literal">b</code> is 1, in which case it uses <code class="docutils literal">realloc</code> +on the string. The change is unrealistic of course, but you could imagine a +related change that keeps an extra reference to <code class="docutils literal">b</code> for a sensible reason.</p> +<p>Another situation in which the optimization doesn't work is discussed in this +<a class="reference external" href="https://stackoverflow.com/a/44487738">StackOverflow question</a> with an answer by Tim Peters.</p> +<p>It's unlikely that PyPy will fix this. We had a prototype of how to do it, but it +seems very little "production" code uses <code class="docutils literal">+=</code> on strings in a loop, and the fix +makes the strings implementation quite a bit more complex.</p> +<p>So, in summary, don't use repeated concatenations in a loop!</p>performancehttps://www.pypy.org/posts/2023/01/string-concatenation-quadratic.htmlWed, 04 Jan 2023 09:00:00 GMTFinding JIT Optimizer Bugs using SMT Solvers and Fuzzinghttps://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.htmlCarl Friedrich Bolz-Tereick<p>In this blog post I want to describe a recent bug finding technique that I've +added to the PyPy JIT testing infrastructure. 
This technique uses the Z3 +theorem prover to find bugs in the optimizer of PyPy's JIT, in particular its +integer operation optimizations. The approach is +based on things I have learned from <a class="reference external" href="https://www.cs.utah.edu/~regehr/">John Regehr's</a> <a class="reference external" href="https://blog.regehr.org/">blog</a> (<a class="reference external" href="https://blog.regehr.org/archives/1122">this post</a> is a +good first one to read), <a class="reference external" href="https://twitter.com/johnregehr/">Twitter</a>, and on +his (et al) paper <a class="reference external" href="https://www.cs.utah.edu/~regehr/alive2-pldi21.pdf">Alive2: Bounded Translation Validation for LLVM</a>. The work +was triggered by a recent miscompilation bug my current bachelor student Nico +Rittinghaus found.</p> +<section id="background-python-integers-in-the-pypy-jit"> +<h2>Background: Python Integers in the PyPy JIT</h2> +<p>The optimizer of PyPy's JITs operates on traces, which are linear sequences of +instructions with guards. The instructions in the traces operate on different +machine-level data types, machine integers, doubles, pointers, bools, etc. In +this post we'll be mostly concerned with machine integers.</p> +<p>To give some wider context I'll explain a bit how Python ints in the user code +relate to the types that are used in traces when the PyPy Python implementation +is used. +When PyPy turns a regular Python 3 function into a trace, there is a lot of work +happening in the JIT frontend to try to observe and infer the types that the +Python function concretely uses at runtime. The traces are generated under these +typing assumptions. Therefore, code that uses <code class="docutils literal">ints</code> in the Python code can +typically be translated into traces that operate on machine integers. 
In order +to make sure that the Python integer semantics are upheld, many of the +operations in the traces need to check that the integer results of some +operations still fit into a machine integer. If that is not the case (a rare +situation for most programs), the trace is left via a guard, execution falls +back to the interpreter, and there a big integer representation is chosen for +the too big value (the big integer representation is done via a pointer and +some storage on the heap).</p> +<p>All of this machinery is not going to be too relevant for the rest of the +post. For the post it's important to know that trace instructions operate on +machine integers and other low-level types, and some of the operations can +optionally check whether the +results still fit into a machine integer. These trace operations are improved by +the optimizer, which tries to transform the trace into one that behaves the +same, but is less costly to execute.</p> +</section> +<section id="background-bounds-analysis-in-pypy-s-jit"> +<h2>Background: Bounds Analysis in PyPy's JIT</h2> +<p>The optimizer of PyPy's JIT has an analysis based on <a class="reference external" href="https://en.wikipedia.org/wiki/Abstract_interpretation">abstract interpretation</a> +that tries to find out whether the integer values stored in a variable are +actually not using the full 64 bit (or 32 bit) range, but instead fit into some +smaller range. This means that for every integer variable <code class="docutils literal">x</code> in a trace, the +JIT compiler tracks upper and lower bounds of the runtime value of that +variable: a range <code class="docutils literal">[a, b]</code> such that for every concrete runtime value <code class="docutils literal">v</code> +that gets stored in variable <code class="docutils literal">x</code>, <code class="docutils literal">a &lt;= v &lt;= b</code> must be true. 
+<code class="docutils literal">a</code> and <code class="docutils literal">b</code> start out +as the most general <code class="docutils literal">MININT</code> and <code class="docutils literal">MAXINT</code>, but sometimes there is extra +information that makes it possible to improve these known bounds, and that is +often useful to optimize the code.</p> +<p>A typical example is that the JIT knows that the length of a string is +non-negative, so for this kind of code: <code class="docutils literal">x = len(s)</code> where <code class="docutils literal">s</code> is a string, +<code class="docutils literal">x</code> gets a range <code class="docutils literal">[0, MAXINT]</code> assigned. With this information we could for +example remove a check <code class="docutils literal">x + 10 &lt; 0</code> completely, because it can never be true.</p> +<p>The bounds information is useful for optimization, but the analysis of the +bounds is also a source of bugs in the JIT, because the reasoning is often +subtle and easy to get wrong in corner cases. We already use a number of testing +techniques to try to make sure that it is correct. A simple one is +<a class="reference external" href="https://hypothesis.works/articles/what-is-property-based-testing/">property-based testing</a> using <a class="reference external" href="https://github.com/HypothesisWorks/hypothesis">Hypothesis</a> on the operations on bounds. Even +though Hypothesis is fantastic, it unfortunately does not catch +absolutely all the bugs even if we'd like it to, as we'll see in the next +section.</p> +</section> +<section id="motivation-a-jit-miscompilation"> +<h2>Motivation: A JIT Miscompilation</h2> +<p>I am currently supervising a Bachelor thesis by Nico Rittinghaus, who is +extending the integer analysis in the JIT. He'll probably write a separate blog +post about that soon. 
In the process of his work, the current bounds analysis +code got a lot of scrutiny, and we found out that one of the unit tests of the +bounds analysis was actually incorrect, and the example code in that unit test +was optimized incorrectly. This case of incorrect optimization is not a big deal +for regular Python code, because it involved a "wrapping integer addition +operation", i.e. one where overflowing results just wrap around to negative +values. All the additions and other arithmetic operations that the PyPy Python +frontend generates actually have +overflow checks (to be able to switch to a big integer representation if +needed). +However, it's still possible to trigger the problem with the +<code class="docutils literal">__pypy__.intop.int_add</code> API which is a function that exposes wraparound +arithmetic on Python ints.</p> +<p><a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3832">Here's the miscompilation</a>. The JIT optimizes the following function:</p> +<div class="code"><pre class="code python"><a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-1" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-2" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-2"></a> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-3" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a 
id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-4" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-5" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">15</span><span class="p">:</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-6" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-6"></a> <span class="k">if</span> <span class="n">x</span> <span class="o">&lt;</span> <span class="mi">6</span><span class="p">:</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-7" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-7"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-8" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-8"></a> <span class="k">return</span> <span class="mi">1</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-9" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-9" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-9"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>Into the following code:</p> +<div class="code"><pre class="code python"><a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-1" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-2" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-2"></a> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-3" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-4" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-5" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-5"></a> <span class="k">if</span> <span class="n">a</span> <span 
class="o">&lt;</span> <span class="mi">15</span><span class="p">:</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-6" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-6"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-7" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-7"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>Basically the faulty reasoning of the JIT looks like this: if <code class="docutils literal">int_add(x, 10) &lt; 15</code> +then it must follow that <code class="docutils literal">x &lt; 5</code>, which is stronger than <code class="docutils literal">x &lt; 6</code>, so the +second <code class="docutils literal">if</code> is always true. This sounds good, but is actually wrong +if the addition <code class="docutils literal">+ 10</code> wrapped around. So if <code class="docutils literal">x == MAXINT</code>, then +<code class="docutils literal">int_add(x, 10) == MININT + 9 &lt; 15</code>. But <code class="docutils literal">MAXINT &lt; 5</code> is not +correct.</p> +<p>Note how the same reasoning with overflow-checking addition is correct! If <code class="docutils literal">x + +10 &lt; 15</code> and the <code class="docutils literal">+</code> didn't overflow, then indeed <code class="docutils literal">x &lt; 6</code>. 
And if your +mind bends starting to think about all this, you understand some of the +difficulty of getting the JIT correct in this area.</p> +</section> +<section id="how-could-we-have-avoided-this-bug"> +<h2>How could we have avoided this bug?</h2> +<p>One <a class="reference external" href="https://twitter.com/cfbolz/status/1482649144099586051">exercise I try to do after finding bugs</a> is to reflect on ways that the +bug could have been avoided. I think this is particularly important in the JIT, +where bugs are potentially really annoying to find and can cause very strange +behaviour in basically arbitrary Python code.</p> +<p>It's easy to always answer this question with "try to think more carefully +when working", but that approach cannot be relied on in complicated situations, +because humans don't concentrate perfectly for long stretches of time.</p> +<p>A situation-specific problem I identified was the bad design of the range analysis API. +A range is not just represented by two numbers, instead it's two numbers +and two bools that are supposed to represent that some operation did or did not +underflow/overflow. The meaning of these bools was quite hard to grasp and easy +to get wrong, so probably they should never have been introduced in the first +place (and my bugfix indeed removed them).</p> +<p>But in the rest of this blog post I want to talk about another, systematic +approach that can be applied to the problem of mis-optimizations of integer +operations, and that is done by applying an SMT solver to the problem.</p> +<p>An SMT solver (<a class="reference external" href="https://en.wikipedia.org/wiki/Satisfiability_modulo_theories">Satisfiability Modulo Theories</a>) is a tool that can be used to +find out whether mathematical formulas are "satisfiable", i.e. whether +some chosen set of inputs exists that will make the formulas evaluate to true. 
SMT solvers are +commonly used in a wide range of CS applications including program correctness +proofs, program synthesis, etc. The most widely known one is probably <a class="reference external" href="https://github.com/Z3Prover">Z3</a> by +Microsoft Research which has the nice advantage of coming with an easy-to-use +Python binding.</p> +<p>Going into this I basically knew next to nothing about SMT solvers (despite +having been embedded in a formal methods research group for years!) so it was an +interesting new world to learn about.</p> +<p>As briefly mentioned in the introduction, the approach I took followed a similar +(but <em>much</em> more properly executed) one applied to LLVM operations, called +<a class="reference external" href="https://github.com/AliveToolkit/alive2/">Alive2</a>. Krister Walfridsson has done <a class="reference external" href="https://kristerw.github.io/2022/09/13/translation-validation/">similar work for GCC recently</a>, +described on his blog.</p> +</section> +<section id="z3-proof-of-concept"> +<h2>Z3 Proof of Concept</h2> +<p>The first thing I did was to try to get Z3 to find the above bug, by encoding the +input program into an SMT formula by hand and trying to get Z3 to prove the condition +that the JIT thinks is always true. 
The Z3 code for this looks as follows:</p> +<div class="code"><pre class="code python"><a id="rest_code_2fe5dd23f4ec46749496562618a462eb-1" name="rest_code_2fe5dd23f4ec46749496562618a462eb-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-1"></a><span class="kn">from</span> <span class="nn">z3</span> <span class="kn">import</span> <span class="n">BitVec</span><span class="p">,</span> <span class="n">Implies</span><span class="p">,</span> <span class="n">prove</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-2" name="rest_code_2fe5dd23f4ec46749496562618a462eb-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-2"></a><span class="n">x</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'x'</span><span class="p">,</span> <span class="mi">64</span><span class="p">)</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-3" name="rest_code_2fe5dd23f4ec46749496562618a462eb-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-3"></a><span class="n">a</span> <span class="o">=</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">10</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-4" name="rest_code_2fe5dd23f4ec46749496562618a462eb-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-4"></a><span class="n">cond1</span> <span class="o">=</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">15</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-5" name="rest_code_2fe5dd23f4ec46749496562618a462eb-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-5"></a><span class="n">cond2</span> <span class="o">=</span> <span 
class="n">x</span> <span class="o">&lt;</span> <span class="mi">6</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-6" name="rest_code_2fe5dd23f4ec46749496562618a462eb-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-6"></a><span class="n">prove</span><span class="p">(</span><span class="n">Implies</span><span class="p">(</span><span class="n">cond1</span><span class="p">,</span> <span class="n">cond2</span><span class="p">))</span> +</pre></div> +<p>Here, <code class="docutils literal">x</code> is defined to be a bit vector variable of width 64, which is a +datatype that can be used to represent bounded machine integers. Addition on +bit vectors performs wraparound arithmetic, like the <code class="docutils literal">__pypy__.intop.int_add</code> +call in the original code. The JIT optimized the second condition away, so +essentially it was convinced that the first condition implies the second one. +The above snippet tries to get Z3 to confirm this.</p> +<p>When run, the above program prints:</p> +<pre class="literal-block">counterexample +[x = 9223372036854775803]</pre> +<p>Which shows the bug. As a small side-note, I thought it was cool that the +process of "proving" something in Z3 basically means trying to find an example +for the negation of the formula. If no counterexample can be found for the +negation, the original formula is true. If the original formula turns out to be +false (like here) we get a nice example that shows the problem to go with it.</p> +<p>It's not realistic to hand-translate all the hundreds of +unit-tests into Z3 formulas and then ask Z3 to prove the optimizations. 
Instead, +we want to have a program that does this for us.</p> +</section> +<section id="smt-checking-of-the-jit-optimizer"> +<h2>SMT Checking of the JIT Optimizer</h2> +<p>What we want from this program is the following: given an unoptimized trace and +its optimized version, we want to use Z3 to check whether the optimized trace +behaves identically to the unoptimized one. One question is what "behaves +identically" means. What we care about is the outputs of the trace being the +same values, no matter how they are computed. Also, for every guard we want to +make sure that it fails in identical ways in the optimized and unoptimized +versions. A guard is only allowed to be optimized away if it can never fail. +The code that comes after a guard can assume that the guard has not failed, +because otherwise execution would have left the trace. All of this should be +true regardless of the values of the input variables of the trace.</p> +<p>So in order to check that the two traces are behaving identically, we do the +following:</p> +<ul class="simple"> +<li><p>We create Z3 variables for every input variable. We use the same input +variables for both the unoptimized and the optimized trace.</p></li> +<li><p>We align the two traces at the corresponding guards. Thankfully the optimizer +keeps track of which optimized guard corresponds to which unoptimized input +guard.</p></li> +<li><p>All the operations before a guard are translated into Z3 formulas, for both +versions of the trace.</p></li> +<li><p>For two corresponding guards, we ask Z3 to prove that the guard conditions are +identical.</p></li> +<li><p>For a guard that was optimized away we ask Z3 to prove that the condition is +always true.</p></li> +<li><p>After a guard, we tell Z3 that from now on it can assume that the guard +condition is true.</p></li> +<li><p>We repeat this, guard for guard, until we reach the end of the trace. 
There, +we ask Z3 to prove that the output variables in the unoptimized trace and the +optimized trace are identical (every trace can return one or many values).</p></li> +</ul> +<p>I implemented this, it's <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/blob/branch/default/rpython/jit/metainterp/optimizeopt/test/test_z3checktests.py">not a lot of code</a>, basically a couple of hundred lines +of (somewhat hacky) Python code. So far I only support integer +operations. Here are some parts of the code to give you a flavor of what this +looks like.</p> +<p>This is the code that translates operations into Z3 formulas:</p> +<div class="code"><pre class="code python"><a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-1" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-1"></a><span class="k">def</span> <span class="nf">add_to_solver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ops</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-2" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-2"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">ops</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-3" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-3"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">type</span> <span class="o">!=</span> <span class="s1">'v'</span><span class="p">:</span> <span class="c1"># is it an operation with a 
result</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-4" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-4"></a> <span class="n">res</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">newvar</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-5" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-5"></a> <span class="k">else</span><span class="p">:</span> <span class="c1"># or does it return void</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-6" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-6"></a> <span class="n">res</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-7" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-7"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-8" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-8"></a> <span class="c1"># ...</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-9" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-9"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-10" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-10" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-10"></a> <span class="c1"># convert arguments</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-11" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-11"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">numargs</span><span class="p">()</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-12" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-12" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-12"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-13" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-13" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-13"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">numargs</span><span class="p">()</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-14" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-14" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-14"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> 
<span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-15" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-15" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-15"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-16" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-16" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-16"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-17" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-17" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-17"></a> <span class="c1"># compute results</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-18" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-18" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-18"></a> <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-19" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-19" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-19"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-20" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-20" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-20"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-21" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-21" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-21"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-22" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-22" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-22"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-23" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-23" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-23"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-24" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-24" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-24"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-25" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-25" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-25"></a> <span 
class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&amp;</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-26" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-26" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-26"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_or"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-27" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-27" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-27"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">|</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-28" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-28" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-28"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_xor"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-29" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-29" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-29"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">^</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-30" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-30" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-30"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-31" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-31" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-31"></a> <span class="c1"># ... more operations, some shown below</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-32" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-32" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-32"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-33" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-33" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-33"></a> <span class="bp">self</span><span class="o">.</span><span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">res</span> <span class="o">==</span> <span class="n">expr</span><span class="p">)</span> +</pre></div> +<p>New Z3 variables are defined by the helper function <code class="docutils literal">newvar</code>, which adds the +operation to a dictionary <code class="docutils literal">box_to_z3</code> mapping boxes (=variables) to Z3 +variables. 
Due to the <a class="reference external" href="https://en.wikipedia.org/wiki/Static_single-assignment_form">SSA</a> property that traces have, a variable must be defined +before its first use.</p> +<p>Here's what <code class="docutils literal">newvar</code> looks like (<code class="docutils literal">LONG_BIT</code> is a constant that is either +<code class="docutils literal">64</code> or <code class="docutils literal">32</code>, depending on the target architecture):</p> +<div class="code"><pre class="code python"><a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-1" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-1"></a><span class="k">def</span> <span class="nf">newvar</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">box</span><span class="p">,</span> <span class="nb">repr</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-2" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-2"></a> <span class="c1"># ... 
some logic around making the string representation</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-3" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-3"></a> <span class="c1"># somewhat nicer omitted</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-4" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-4"></a> <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="nb">repr</span><span class="p">,</span> <span class="n">LONG_BIT</span><span class="p">)</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-5" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-5"></a> <span class="bp">self</span><span class="o">.</span><span class="n">box_to_z3</span><span class="p">[</span><span class="n">box</span><span class="p">]</span> <span class="o">=</span> <span class="n">result</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-6" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-6"></a> <span class="k">return</span> <span class="n">result</span> +</pre></div> +<p>The <code class="docutils literal">convert</code> method turns an operation argument (either a constant or a +variable) into a Z3 formula (either a constant bit vector or an already defined +Z3 variable). 
<code class="docutils literal">convertarg</code> is a helper function that takes an operation, reads +its nth argument and converts it.</p> +<div class="code"><pre class="code python"><a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-1" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-1"></a><span class="k">def</span> <span class="nf">convert</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">box</span><span class="p">):</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-2" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-2"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">box</span><span class="p">,</span> <span class="n">ConstInt</span><span class="p">):</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-3" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-3"></a> <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="n">box</span><span class="o">.</span><span class="n">getint</span><span class="p">(),</span> <span class="n">LONG_BIT</span><span class="p">)</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-4" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-4"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">box_to_z3</span><span class="p">[</span><span class="n">box</span><span class="p">]</span> +<a 
id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-5" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-5"></a> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-6" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-6"></a><span class="k">def</span> <span class="nf">convertarg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">box</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-7" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-7"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">box</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span> +</pre></div> +<p>The lookup of variables in <code class="docutils literal">box_to_z3</code> that <code class="docutils literal">convert</code> does cannot fail, +because the variable must have been defined before use.</p> +<p>Comparisons return the bit vector 0 or bit vector 1, we use a helper function +<code class="docutils literal">cond</code> to turn the Z3 truth value of the comparison into a bit vector:</p> +<div class="code"><pre class="code python"><a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-1" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-1"></a><span class="k">def</span> <span class="nf">cond</span><span class="p">(</span><span 
class="bp">self</span><span class="p">,</span> <span class="n">z3expr</span><span class="p">):</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-2" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-2"></a> <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">TRUEBV</span><span class="p">,</span> <span class="n">FALSEBV</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-3" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-3"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-4" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-4"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-5" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-5"></a><span class="k">def</span> <span class="nf">add_to_solver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ops</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-6" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-6"></a> <span class="c1"># ... 
start as above</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-7" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-7"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-8" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-8"></a> <span class="c1"># more cases</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-9" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-9"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_eq"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-10" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-10"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-11" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-11"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ne"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-12" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-12" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-12"></a> 
<span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-13" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-13" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-13"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-14" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-14" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-14"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-15" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-15" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-15"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_le"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-16" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-16" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-16"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;=</span> <span class="n">arg1</span><span 
class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-17" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-17" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-17"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_gt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-18" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-18" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-18"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-19" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-19" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-19"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ge"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-20" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-20" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-20"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;=</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-21" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-21" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-21"></a> 
<span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_true"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-22" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-22" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-22"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">FALSEBV</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-23" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-23" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-23"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_lt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-24" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-24" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-24"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-25" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-25" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-25"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_le"</span><span class="p">:</span> +<a 
id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-26" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-26" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-26"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-27" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-27" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-27"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_gt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-28" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-28" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-28"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-29" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-29" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-29"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_ge"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-30" 
name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-30" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-30"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-31" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-31" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-31"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_zero"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-32" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-32" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-32"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">FALSEBV</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-33" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-33" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-33"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-34" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-34" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-34"></a> <span class="c1"># ... 
rest as above</span> +</pre></div> +<p>So basically for every trace operation that operates on integers I had to give a +translation into Z3 formulas, which is mostly straightforward.</p> +<p>Guard operations get converted into a Z3 boolean by their own helper function, +which looks like this:</p> +<div class="code"><pre class="code python"><a id="rest_code_3de914924f164344a1267234ae4925f2-1" name="rest_code_3de914924f164344a1267234ae4925f2-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-1"></a><span class="k">def</span> <span class="nf">guard_to_condition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">guard</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-2" name="rest_code_3de914924f164344a1267234ae4925f2-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-2"></a> <span class="n">opname</span> <span class="o">=</span> <span class="n">guard</span><span class="o">.</span><span class="n">getopname</span><span class="p">()</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-3" name="rest_code_3de914924f164344a1267234ae4925f2-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-3"></a> <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_true"</span><span class="p">:</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-4" name="rest_code_3de914924f164344a1267234ae4925f2-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-4"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">guard</span><span 
class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="n">TRUEBV</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-5" name="rest_code_3de914924f164344a1267234ae4925f2-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-5"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_false"</span><span class="p">:</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-6" name="rest_code_3de914924f164344a1267234ae4925f2-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-6"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">guard</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="n">FALSEBV</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-7" name="rest_code_3de914924f164344a1267234ae4925f2-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-7"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_value"</span><span class="p">:</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-8" name="rest_code_3de914924f164344a1267234ae4925f2-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-8"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">guard</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span 
class="p">(</span><span class="n">guard</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-9" name="rest_code_3de914924f164344a1267234ae4925f2-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-9"></a> +<a id="rest_code_3de914924f164344a1267234ae4925f2-10" name="rest_code_3de914924f164344a1267234ae4925f2-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-10"></a> <span class="c1"># ... some more exist, shown below</span> +</pre></div> +<p>Some operations are a bit trickier. An important example in the context of +this blog post are integer operations that check for overflow. The overflow +operations return a result, but also a boolean whether the operation overflowed +or not.</p> +<div class="code"><pre class="code python"><a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-1" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-1"></a><span class="k">def</span> <span class="nf">add_to_solver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ops</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-2" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-2"></a> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-3" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-3"></a> <span class="c1"># ... 
more cases</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-4" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-4"></a> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-5" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-5"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add_ovf"</span><span class="p">:</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-6" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-6"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-7" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-7"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> <span class="o">+</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-8" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-8"></a> <span class="n">state</span><span 
class="o">.</span><span class="n">no_ovf</span> <span class="o">=</span> <span class="n">m</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">expr</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-9" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-9"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub_ovf"</span><span class="p">:</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-10" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-10"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-11" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-11"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> <span class="o">-</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-12" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-12" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-12"></a> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="o">=</span> <span class="n">m</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">expr</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-13" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-13" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-13"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul_ovf"</span><span class="p">:</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-14" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-14" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-14"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-15" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-15" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-15"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> <span class="o">*</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> +<a 
id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-16" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-16" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-16"></a> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="o">=</span> <span class="n">m</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">expr</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-17" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-17" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-17"></a> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-18" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-18" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-18"></a> <span class="c1"># ...</span> +</pre></div> +<p>The boolean is computed by comparing the result of the bit vector operation with +the result of performing the same operation on the inputs sign-extended to bit +vectors of twice the width, where it cannot overflow. Let's go through the +addition case step by step; the other cases work analogously.</p> +<p>The addition in the first <code class="docutils literal">elif</code> that computes <code class="docutils literal">expr</code> is an addition on bit +vectors, therefore it is performing wraparound arithmetic. +<code class="docutils literal">z3.SignExt(LONG_BIT, arg0)</code> sign-extends <code class="docutils literal">arg0</code> from a bit vector of +<code class="docutils literal">LONG_BIT</code> bits to a bit vector of <code class="docutils literal">2 * LONG_BIT</code> bits. 
The addition in +the second line is therefore an addition between bit vectors that are twice as wide, so it will +never overflow and just computes the mathematically correct result.</p> +<p>The condition to check for overflow is now: if the results of the two different +ways to do the addition are the same, then overflow did not occur. So in order +to compute <code class="docutils literal">state.no_ovf</code> in the addition case the +code sign-extends the result of the bit vector wraparound addition to +the wider bit vector type (using <code class="docutils literal">SignExt</code> again), and then compares that to the wide +result.</p> +<p>This boolean can then be checked by the guard operations <code class="docutils literal">guard_no_overflow</code> +and <code class="docutils literal">guard_overflow</code>.</p> +<div class="code"><pre class="code python"><a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-1" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-1"></a><span class="k">def</span> <span class="nf">guard_to_condition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">guard</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-2" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-2"></a> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-3" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-3"></a> <span class="c1"># ... 
more cases</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-4" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-4"></a> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-5" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-5"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_no_overflow"</span><span class="p">:</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-6" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-6"></a> <span class="k">assert</span> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-7" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-7"></a> <span class="k">return</span> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-8" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-8"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_overflow"</span><span class="p">:</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-9" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-9" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-9"></a> <span class="k">assert</span> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-10" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-10"></a> <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span><span class="p">)</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-11" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-11"></a> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-12" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-12" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-12"></a> <span class="c1"># ... more cases</span> +</pre></div> +</section> +<section id="finding-the-bug-again"> +<h2>Finding the Bug, Again</h2> +<p>Let's actually make all of this more concrete by applying it to the trace of our +original bug. 
The input trace and the incorrectly optimized trace for that look +like this (differences highlighted):</p> +<div class="code"><pre class="code python"><a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-1" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-1"></a><span class="c1"># input # optimized</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-2" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-2"></a><span class="p">[</span><span class="n">i0</span><span class="p">]</span> <span class="p">[</span><span class="n">i0</span><span class="p">]</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-3" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-3"></a><span class="n">i1</span> <span class="o">=</span> <span class="n">int_add</span><span class="p">(</span><span class="n">i0</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> <span class="n">i1</span> <span class="o">=</span> <span class="n">int_add</span><span class="p">(</span><span class="n">i0</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-4" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-4"></a><span class="n">i2</span> <span class="o">=</span> <span class="n">int_lt</span><span class="p">(</span><span class="n">i1</span><span class="p">,</span> <span class="mi">15</span><span class="p">)</span> <span class="n">i2</span> <span class="o">=</span> <span class="n">int_lt</span><span class="p">(</span><span 
class="n">i1</span><span class="p">,</span> <span class="mi">15</span><span class="p">)</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-5" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-5"></a><span class="n">guard_true</span><span class="p">(</span><span class="n">i2</span><span class="p">)</span> <span class="n">guard_true</span><span class="p">(</span><span class="n">i2</span><span class="p">)</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-6" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-6"></a><span class="hll"><span class="n">i3</span> <span class="o">=</span> <span class="n">int_lt</span><span class="p">(</span><span class="n">i0</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span> <span class="n">jump</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +</span><a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-7" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-7"></a><span class="hll"><span class="n">guard_true</span><span class="p">(</span><span class="n">i3</span><span class="p">)</span> +</span><a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-8" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-8"></a><span class="hll"><span class="n">jump</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +</span></pre></div> +<p>Note that the trace represents just one of the paths through the control flow +graph of the original function, which is typical for tracing JITs (the other +paths 
could incrementally get added later).</p> +<p>The first guards in both these traces correspond to each other, so the first +chunks to check are the first three operations (lines 1-4). Those operations +don't get changed by the optimizer at all.</p> +<p>These two identical traces get translated to the following Z3 formulas:</p> +<div class="code"><pre class="code text"><a id="rest_code_25c448b34dd145d1837209987991ae86-1" name="rest_code_25c448b34dd145d1837209987991ae86-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-1"></a>i1unoptimized == input_i0 + 10 +<a id="rest_code_25c448b34dd145d1837209987991ae86-2" name="rest_code_25c448b34dd145d1837209987991ae86-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-2"></a>i2unoptimized == If(i1unoptimized &lt; 15, 1, 0) +<a id="rest_code_25c448b34dd145d1837209987991ae86-3" name="rest_code_25c448b34dd145d1837209987991ae86-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-3"></a>i1optimized == input_i0 + 10 +<a id="rest_code_25c448b34dd145d1837209987991ae86-4" name="rest_code_25c448b34dd145d1837209987991ae86-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-4"></a>i2optimized == If(i1optimized &lt; 15, 1, 0) +</pre></div> +<p>To check that the two corresponding guards are the same, the solver is asked to +prove that <code class="docutils literal">(i2unoptimized == 1) == (i2optimized == 1)</code>. This is +correct, because the formulas for <code class="docutils literal">i2unoptimized</code> and <code class="docutils literal">i2optimized</code> are +completely identical.</p> +<p>After checking that the guards behave the same, we add the knowledge to the +solver that the guards passed. 
So the Z3 formulas become:</p> +<div class="code"><pre class="code text"><a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-1" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-1"></a>i1unoptimized == input_i0 + 10 +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-2" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-2"></a>i2unoptimized == If(i1unoptimized &lt; 15, 1, 0) +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-3" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-3"></a>i1optimized == input_i0 + 10 +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-4" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-4"></a>i2optimized == If(i1optimized &lt; 15, 1, 0) +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-5" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-5"></a>i2unoptimized == 1 +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-6" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-6"></a>i2optimized == 1 +</pre></div> +<p>Now we continue with the remaining operations of the two traces (lines 6-8).</p> +<p>We start by adding the <code class="docutils literal">int_lt</code> operation in the unoptimized trace to the Z3 +formulas:</p> +<div class="code"><pre class="code text"><a id="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-1" 
name="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_572cd48587b84ad4aea4ab9fb60d80fd-1"></a>... +<a id="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-2" name="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_572cd48587b84ad4aea4ab9fb60d80fd-2"></a>i3unoptimized == If(input_i0 &lt; 6, 1, 0) +</pre></div> +<p>Because the second guard was optimized away, we need to ask Z3 to prove that +<code class="docutils literal">i3unoptimized == 1</code> is always true, which fails and gives the following +counterexample:</p> +<div class="code"><pre class="code text"><a id="rest_code_dad63ba423ac4e599c421529bf5361a0-1" name="rest_code_dad63ba423ac4e599c421529bf5361a0-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-1"></a>input_i0 = 9223372036854775800 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-2" name="rest_code_dad63ba423ac4e599c421529bf5361a0-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-2"></a>i1unoptimized = 9223372036854775810 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-3" name="rest_code_dad63ba423ac4e599c421529bf5361a0-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-3"></a>i2unoptimized = 1 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-4" name="rest_code_dad63ba423ac4e599c421529bf5361a0-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-4"></a>i1optimized = 9223372036854775810 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-5" name="rest_code_dad63ba423ac4e599c421529bf5361a0-5" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-5"></a>i2optimized = 1 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-6" name="rest_code_dad63ba423ac4e599c421529bf5361a0-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-6"></a>i3unoptimized = 0 +</pre></div> +<p>This counterexample demonstrates the bug. The fact that the Z3-based equivalence check also +managed to find the original motivating bug without manually translating it to +a formula is a good confirmation that the approach works.</p> +</section> +<section id="second-bug"> +<h2>Second bug</h2> +<p>So with this code I applied the Z3-based equivalence check to all our optimizer +unit tests. In addition to the bug we've been discussing throughout this post, it also +found another buggy test! I had also found it by hand while staring at all the tests +in the process of writing all the Z3 infrastructure, but it was still a good +confirmation that the process worked. This bug was in the range analysis for +<code class="docutils literal">int_neg</code>, integer negation. 
It failed to account for the fact that <code class="docutils literal"><span class="pre">-MININT</span> == MININT</code> +and therefore did a mis-optimization along the following lines:</p> +<div class="code"><pre class="code python"><a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-1" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-2" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-2"></a> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-3" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-4" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-5" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">0</span><span 
class="p">:</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-6" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-6"></a> <span class="k">if</span> <span class="n">x</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-7" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-7"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-8" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-8"></a> <span class="k">return</span> <span class="mi">1</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-9" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-9"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>Which was wrongly optimized into:</p> +<div class="code"><pre class="code python"><a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-1" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-2" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-2"></a> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-3" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-3" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-4" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-5" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">:</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-6" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-6"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-7" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-7"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>This is wrong precisely for <code class="docutils literal">x == MININT</code>.</p> +</section> +<section id="generating-random-traces"> +<h2>Generating Random Traces</h2> +<p>These two bugs were the only two that the Z3 checker found for existing unit 
+tests. To try to find some more bugs I combined PyPy's existing random trace +generator with the Z3 optimization checker. The random trace generator has so +far been mostly used to find bugs in the machine code backends, particularly +also in the register allocator. So far we haven't used it with our optimizer, +but my experiments show that we should have!</p> +<p>I'm going to describe a little bit how the random trace generator works. It's +actually not that complicated, but there's one neat trick to it.</p> +<p>The basic idea is straightforward, it starts out with an empty trace with a +random number of input variables. Then it adds some number of operations to the +trace, either regular operations or guards. Every operation takes already +existing variables as input.</p> +<p>The neat trick is that our random trace generator keeps a concrete random +example value for every one of the input variables, and an example result for +every operation. In this way, it is possible to generate guards that are +consistent with the example values to ensure that running the trace to its end +is possible with at least one set of values.</p> +<p>Here's an example random trace that is generated, together with the random +example inputs and the results of every operation at the end of every line:</p> +<pre class="literal-block">[i0, i1, i2, i3, i4, i5] # example values: 9, 11, -8, -95, 46, 57 +i6 = int_add_ovf(i3, i0) # -86 +guard_no_overflow() +i7 = int_sub(i2, -35/ci) # 27 +i8 = uint_ge(i3, i5) # 1 +guard_true(i8) +i9 = int_lt(i7, i8) # 0 +i10 = int_mul_ovf(34/ci, i7) # 918 +guard_no_overflow() +i11 = int_and(i10, 63/ci) # 22 +i12 = int_rshift(i3, i11) # -1 +i13 = int_is_zero(i7) # 0 +i14 = int_is_true(i13) # 0 +guard_false(i13) +i15 = int_lt(i8, i4) # 1 +i16 = int_and(i6, i0) # 8 +i17 = uint_ge(i6, -6/ci) # 0 +finish()</pre> +<p>Note how every guard generated is true for the example values.</p> +<p>I have been running this combination of random trace generation and Z3 checking 
+for many nights and it has found some bugs, which I'll describe in the next +section. It should probably be run for a lot longer, but it has already been a +useful exercise.</p> +<p>In this mode, I'm giving every Z3 call a time limit to make sure that the random +tests don't just take arbitrarily long. This means that asking Z3 to prove +something can have three outcomes: either it's proved, or Z3 finds a +counterexample, or Z3 times out.</p> +</section> +<section id="bugs-found"> +<h2>Bugs Found</h2> +<p>In addition to the two bugs I've already described, I'll briefly list the +additional bugs that were found by optimizing random traces and then trying to +prove the equivalence with Z3.</p> +<p>Most of the bugs were actually identified by optimizing random traces alone, not +by the Z3 component. They manifested as assert failures in the JIT compiler.</p> +<ul class="simple"> +<li><p>The JIT concluded after <code class="docutils literal">12 == int_mul(x, 12)</code> that <code class="docutils literal">x == 1</code>, which is +incorrect if overflow occurred (a counterexample is <code class="docutils literal">0x8000000000000001</code>).</p></li> +<li><p>An amusing bug, where from <code class="docutils literal">0 == int_lshift(0x1000000000000000, x)</code> with +<code class="docutils literal">0 &lt;= x &lt;= 15</code>, the JIT concluded that <code class="docutils literal">0x1000000000000000 == 0</code>, +triggering an assert. This wrong conclusion was again caused by not taking the +possibility of overflow into account.</p></li> +<li><p>A corner case in an optimization for chained integer additions with a +constant, where in complex enough expressions, the wrong IR API was used +(which works correctly in simple cases).
Again, this triggered an assert.</p></li> +</ul> +<p>This shows that we should have been fuzzing our JIT optimizer already (not a +surprising observation in hindsight, fuzz all the things!).</p> +<p>Thankfully, there was also one further bug that really failed in the Z3 +verifier. It's a bug in common subexpression elimination / arithmetic +simplification, which again does not take overflow correctly into account.</p> +<p>The buggy trace looks like this (unfortunately it's not easily possible to show +this bug in Python code).</p> +<div class="code"><pre class="code text"><a id="rest_code_40493479399f42558ecf3121b6abb0ca-1" name="rest_code_40493479399f42558ecf3121b6abb0ca-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-1"></a>[a, b] +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-2" name="rest_code_40493479399f42558ecf3121b6abb0ca-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-2"></a>c = int_add(a, b) +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-3" name="rest_code_40493479399f42558ecf3121b6abb0ca-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-3"></a>r = int_sub_ovf(c, b) +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-4" name="rest_code_40493479399f42558ecf3121b6abb0ca-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-4"></a>guard_no_ovf() +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-5" name="rest_code_40493479399f42558ecf3121b6abb0ca-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-5"></a>finish(r) +</pre></div> +<p>This was optimized to:</p> +<div class="code"><pre class="code text"><a id="rest_code_30cdbc23b541425f891edc9180ced3c0-1" name="rest_code_30cdbc23b541425f891edc9180ced3c0-1" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_30cdbc23b541425f891edc9180ced3c0-1"></a>[a, b] +<a id="rest_code_30cdbc23b541425f891edc9180ced3c0-2" name="rest_code_30cdbc23b541425f891edc9180ced3c0-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_30cdbc23b541425f891edc9180ced3c0-2"></a>finish(a) +</pre></div> +<p>Which is incorrect, because the guard can fail given the right inputs. +But the optimizer concluded that the subtraction is safe, because it's the +inverse of an earlier addition, not taking into account that this earlier +addition may have overflowed.</p> +<p>Note that a related optimization is actually correct. Given this code:</p> +<div class="code"><pre class="code text"><a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-1" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-1"></a>[a, b] +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-2" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-2"></a>c = int_add_ovf(a, b) +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-3" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-3"></a>guard_no_ovf() +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-4" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-4"></a>r = int_sub(c, b) +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-5" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-5"></a>finish(r)
+</pre></div> +<p>It can be optimized to:</p> +<div class="code"><pre class="code text"><a id="rest_code_231f1b026f874575959e48a29de9a78d-1" name="rest_code_231f1b026f874575959e48a29de9a78d-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-1"></a>[a, b] +<a id="rest_code_231f1b026f874575959e48a29de9a78d-2" name="rest_code_231f1b026f874575959e48a29de9a78d-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-2"></a>c = int_add_ovf(a, b) +<a id="rest_code_231f1b026f874575959e48a29de9a78d-3" name="rest_code_231f1b026f874575959e48a29de9a78d-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-3"></a>guard_no_ovf() +<a id="rest_code_231f1b026f874575959e48a29de9a78d-4" name="rest_code_231f1b026f874575959e48a29de9a78d-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-4"></a>finish(a) +</pre></div> +</section> +<section id="future-work-and-conclusion"> +<h2>Future Work and Conclusion</h2> +<p>In the current form the Z3 checker is only a start, even though it has already +been concretely useful. There are various directions into which we could extend +it. 
In addition to generating random tests completely from scratch, we could also +start from the existing manually written unit-tests and randomly mutate those.</p> +<p>I also want to extend the Z3 checker with support for more operations, heap +operations in particular (but it's not quite clear to me how to model garbage +collection).</p> +<p>I also want to try to switch the code away from the Z3 API and use the more +general <a class="reference external" href="https://smtlib.cs.uiowa.edu/">smtlib</a> interface directly, in order to be able to use other SMT +checkers than Z3, e.g. <a class="reference external" href="https://cvc4.github.io/">CVC4</a>.</p> +<p>But all in all this was a fun and not too hard way to find a bunch of bugs in +our optimizer! And the infrastructure is now in place, which means that we run +some random test cases every time we execute our tests. This is going to be +particularly useful when we do further work on the integer reasoning of the JIT +(like Nico is doing, for example).
At the time of writing this post, all the +bugs mentioned have been fixed and the Z3 code has landed on the default branch +and runs as part of PyPy's CI infrastructure.</p> +</section> +<section id="acknowledgements"> +<h2>Acknowledgements</h2> +<p>Thanks to <a class="reference external" href="http://saambarati.org/">Saam Barati</a>, <a class="reference external" href="https://bernsteinbear.com">Max Bernstein</a>, <a class="reference external" href="https://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/unser-team/team/schmidt">Joshua Schmidt</a> and <a class="reference external" href="https://martinfriedrichberger.net/">Martin +Berger</a>, for great feedback on drafts of this post!</p> +</section>jittestinghttps://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.htmlSun, 11 Dec 2022 18:00:00 GMTThe PyPy Blog Turns 15 Yearshttps://www.pypy.org/posts/2022/10/blog-15-years.htmlCarl Friedrich Bolz-Tereick<p>Exactly 15 years ago today we wrote the <a class="reference external" href="https://www.pypy.org/posts/2007/10/first-post-8150793557471983289.html">first blog post on the PyPy blog</a>! +Over the years, we have written 423 posts, from the <a class="reference external" href="https://www.pypy.org/posts/2007/12/faster-than-c-8057790636822502084.html">shortest</a> to the +<a class="reference external" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html">longest</a>.
In 2021 we <a class="reference external" href="https://www.pypy.org/posts/2021/03/pypys-blog-has-moved.html">moved</a> from <a class="reference external" href="https://morepypy.blogspot.com">blogger</a> to our own domain.</p> +<p>The topics over the years varied widely, we published <a class="reference external" href="https://www.pypy.org/posts/2013/05/pypy-20-einstein-sandwich-635158782365435530.html">release</a> <a class="reference external" href="https://www.pypy.org/posts/2017/06/pypy-v58-released-739876359584854017.html">announcements</a>; +<a class="reference external" href="https://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.html">roadmaps</a>; <a class="reference external" href="https://www.pypy.org/posts/2010/06/blackhole-interpreter-2752965445510091289.html">JIT</a>, <a class="reference external" href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">GC</a> and <a class="reference external" href="https://www.pypy.org/posts/2013/10/update-on-stm-7145890443443707910.html">STM</a> <a class="reference external" href="https://www.pypy.org/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.html">updates</a>; <a class="reference external" href="https://www.pypy.org/posts/2018/06/repeating-matrix-multiplication-8641748742577945875.html">benchmarks</a>; <a class="reference external" href="https://www.pypy.org/posts/2008/10/dsseldorf-sprint-report-days-1-3-5256639868851086032.html">sprint</a>, <a class="reference external" href="https://www.pypy.org/posts/2007/11/pypy-road-show-1-new-york-and-ibm-7837076523877011699.html">trip</a> and +<a class="reference external" href="https://www.pypy.org/posts/2009/07/ecoop-2009-8415055006373020774.html">conference</a> <a class="reference external" href="https://www.pypy.org/posts/2012/04/pycon-2012-wrap-up-559575896040055505.html">reports</a>; <a class="reference external" 
href="https://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html">technical</a> <a class="reference external" href="https://www.pypy.org/posts/2010/11/efficiently-implementing-python-objects-3838329944323946932.html">deep</a> <a class="reference external" href="https://www.pypy.org/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.html">dives</a>; <a class="reference external" href="https://www.pypy.org/posts/2022/02/nlp-icelandic-case-study.html">case studies</a>; <a class="reference external" href="https://www.pypy.org/posts/2008/04/trying-to-get-pypy-to-run-on-python-30-5082015544752137606.html">april</a> <a class="reference external" href="https://www.pypy.org/posts/2008/04/other-aprils-fools-ideas-955926452383759016.html">fool's +jokes</a>; <a class="reference external" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html">research</a> projects; <a class="reference external" href="https://www.pypy.org/posts/2013/02/announcing-topaz-rpython-powered-ruby-6662407703061538341.html">other</a> <a class="reference external" href="https://www.pypy.org/posts/2012/07/hello-everyone-6869934374873967346.html">languages</a> <a class="reference external" href="https://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.html">using</a> RPython; finished <a class="reference external" href="https://www.pypy.org/posts/2010/10/phd-thesis-about-pypys-cli-jit-backend-969267841095296323.html">PhD</a> +<a class="reference external" href="https://www.pypy.org/posts/2019/04/an-rpython-jit-for-lpegs-4779548053359386284.html">Bachelor</a> and <a class="reference external" href="https://www.pypy.org/posts/2008/10/prolog-jit-masters-thesis-finished-5462132148241449867.html">Master</a>, theses; pictures:</p> +<a class="reference external image-reference" href="https://www.pypy.org/images/2022-pypy-pictures-collage.jpg"> +<img alt="a collage of photos 
taken at PyPy sprints" src="https://www.pypy.org/images/2022-pypy-pictures-collage-small.jpg"> +</a> +<p>and diagrams:</p> +<a class="reference external image-reference" href="https://www.pypy.org/images/2022-pypy-diagrams-collage.png"> +<img alt="a collage of diagrams from previous blog posts" src="https://www.pypy.org/images/2022-pypy-diagrams-collage-small.png"> +</a> +<p>Quite a number of blog posts were very early iterations of papers that we +published later, here are a few that I can remember:</p> +<ul class="simple"> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2009/03/applying-tracing-jit-to-interpreter-3287844903778799266.html">Applying a Tracing JIT to an Interpreter</a> became <a class="reference external" href="https://dl.acm.org/doi/10.1145/1565824.1565827">Tracing the meta-level: +PyPy's tracing JIT compiler</a> at ICOOOLPS 2009, by far our most successful +paper.</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2010/09/escape-analysis-in-pypys-jit-1780048403046080197.html">Escape Analysis in PyPy's JIT</a> became <a class="reference external" href="https://dl.acm.org/doi/10.1145/1929501.1929508">Allocation removal by partial +evaluation in a tracing JIT</a> at PEPM 2010.</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_21-6524148550848694588.html">Controlling the Tracing of an Interpreter With Hints</a> was a draft of the +paper <a class="reference external" href="https://dl.acm.org/doi/10.1145/2069172.2069181">Runtime feedback in a meta-tracing JIT for efficient dynamic +languages</a> at ICOOOLPS 2011</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2010/09/using-escape-analysis-across-loop-2887031293132023676.html">Using Escape Analysis Across Loop Boundaries for Specialization</a> was the +nucleus of <a class="reference external" 
href="https://dl.acm.org/doi/10.1145/2384577.2384586">Loop-aware optimizations in PyPy's tracing JIT</a> at DLS 2012.</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html">List Strategies</a> was eventually turned into the paper <a class="reference external" href="https://dl.acm.org/doi/10.1145/2509136.2509531">Storage strategies +for collections in dynamically typed languages</a> at OOPSLA 2013.</p></li> +</ul> +<section id="greatest-hits"> +<h2>Greatest Hits</h2> +<p>In terms of visitors, the top five posts on the old blog were – on the new blog +we simply don't have stats (yet?):</p> +<ol class="arabic simple"> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.html">Let's remove the global interpreter lock</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2011/04/tutorial-writing-interpreter-with-pypy-3785910476193156295.html">Tutorial: Writing an Interpreter with PyPy, Part 1</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2019/10/pypys-new-json-parser-492911724084305501.html">PyPy's new JSON parser</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.html">PyPy gets funding from Mozilla for Python 3.5 support</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.html">How to make your code 80 times faster</a></p></li> +</ol> +<p>The number of posts per year developed like this:</p> +<img alt="/images/2022-pypy-posts-per-year.svg" src="https://www.pypy.org/images/2022-pypy-posts-per-year.svg"> +<p>The most prolific authors are:</p> +<ol class="arabic simple"> +<li><p><a class="reference external" 
href="https://www.pypy.org/authors/maciej-fijalkowski.html">Maciej Fijałkowski</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/authors/carl-friedrich-bolz-tereick.html">Carl Friedrich Bolz-Tereick</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/authors/armin-rigo.html">Armin Rigo</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/authors/antonio-cuni.html">Antonio Cuni</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/authors/mattip.html">Matti Picus</a></p></li> +</ol> +<p>Several blog posts have made it to the Hacker News front page, three of them to +number 1:</p> +<ul class="simple"> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html">PyPy-STM: first “interesting” release</a> (<a class="reference external" href="https://news.ycombinator.com/item?id=7991404">discussion</a>)</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.html">Let's Remove the Global Interpreter Lock</a> (<a class="reference external" href="https://news.ycombinator.com/item?id=15008636">discussion</a>)</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.html">Inside cpyext: Why emulating CPython C API is so Hard</a> (<a class="reference external" href="https://news.ycombinator.com/item?id=18040664">discussion</a>)</p></li> +</ul> +</section> +<section id="personal-favourites"> +<h2>Personal Favourites</h2> +<p>While looking through the posts, there were a few that stood out to me in some +way, so here's a subjective list of ones that I had fun looking at again:</p> +<ul class="simple"> +<li><p>2008: <a class="reference external" 
href="https://www.pypy.org/posts/2008/10/sprint-discussions-jit-generator-3301578822967655604.html">Sprint Discussions: JIT Generator Planning</a></p></li> +<li><p>2009: <a class="reference external" href="https://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.html">PyPy gets a new compiler</a></p></li> +<li><p>2010: <a class="reference external" href="https://www.pypy.org/posts/2010/12/oh-and-btw-pypy-gets-funding-through-3568486750776147382.html">Oh, and btw: PyPy gets funding through "Eurostars"</a></p></li> +<li><p>2011: <a class="reference external" href="https://www.pypy.org/posts/2011/07/realtime-image-processing-in-python-6985924592886873374.html">Realtime image processing in Python</a></p></li> +<li><p>2012: <a class="reference external" href="https://www.pypy.org/posts/2012/06/architecture-of-cppyy-9077100041707701102.html">Architecture of Cppyy</a></p></li> +<li><p>2013: <a class="reference external" href="https://www.pypy.org/posts/2013/02/10-years-of-pypy-634401291726575821.html">10 years of PyPy</a></p></li> +<li><p>2014: <a class="reference external" href="https://www.pypy.org/posts/2014/11/pypy-io-improvements-1042070332447047674.html">PyPy IO Improvements</a></p></li> +<li><p>2015: <a class="reference external" href="https://www.pypy.org/posts/2015/10/automatic-simd-vectorization-support-in-639063580401330508.html">Automatic SIMD vectorization support in PyPy</a></p></li> +<li><p>2016: <a class="reference external" href="https://www.pypy.org/posts/2016/04/pypy-enterprise-edition-3688275697656890948.html">PyPy Enterprise Edition</a></p></li> +<li><p>2017: <a class="reference external" href="https://www.pypy.org/posts/2017/03/async-http-benchmarks-on-pypy3-1092124994927894138.html">Async HTTP benchmarks on PyPy3</a></p></li> +<li><p>2018: <a class="reference external" href="https://www.pypy.org/posts/2018/04/improving-syntaxerror-in-pypy-5733639208090522433.html">Improving SyntaxError in PyPy</a></p></li> +<li><p>2018: <a 
class="reference external" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#incentives-of-oss-compared-to-academia">The First 15 Years of PyPy — a Personal Retrospective</a></p></li> +<li><p>2019: <a class="reference external" href="https://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.html">PyPy for low-latency systems</a></p></li> +<li><p>2020: <a class="reference external" href="https://www.pypy.org/posts/2020/02/pypy-and-cffi-have-moved-to-heptapod-5791595152472747032.html">PyPy and CFFI have moved to Heptapod</a></p></li> +<li><p>2021: <a class="reference external" href="https://www.pypy.org/posts/2021/04/ways-pypy-graphviz.html">Some Ways that PyPy uses Graphviz</a></p></li> +</ul> +<p>We'd like to thank our authors, guest authors, commenters, users and readers who +have stuck with us through one and a half decades! If there are any particular +topics you would like to read something about, or any guest posts you'd like to +write, let us know!</p> +</section>metahttps://www.pypy.org/posts/2022/10/blog-15-years.htmlSun, 30 Oct 2022 12:00:00 GMTAllocation Removal in the Toy Optimizerhttps://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.htmlCarl Friedrich Bolz-Tereick<p>One of the workhorse optimizations of RPython's tracing JIT is <a class="reference external" href="https://dl.acm.org/doi/10.1145/1929501.1929508">allocation +removal</a>, which removes short-lived object allocations from traces. Many Python +programs create a lot of objects that only live for a short time, and whose +lifespan is fully predictable (common examples are integer and float boxes, but +also tuples, frames, intermediate string results, etc). Allocation removal will +try (and very often succeed) to remove these allocations from traces.
In +this blog post I want to show a toy version of how allocation removal is +implemented.</p> +<p>In the <a class="reference external" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html">previous</a> blog post of this series I showed the complete code for +writing a toy one-pass optimizer that does constant folding, common +subexpression elimination and strength reduction. In this +second post, I want to use allocation removal as a more advanced optimization +pass. The basic optimization framework is the same: we will use the same +data structures for intermediate representation and also keep using the same +union-find data structure to store equivalences between IR operations. Here's +the infrastructure code from the last post:</p> +<div class="code"><pre class="code python"><a id="rest_code_1adc0460707d4986a0ff9334f2124306-1" name="rest_code_1adc0460707d4986a0ff9334f2124306-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-1"></a><span class="kn">import</span> <span class="nn">pytest</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-2" name="rest_code_1adc0460707d4986a0ff9334f2124306-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-2"></a><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Any</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-3" name="rest_code_1adc0460707d4986a0ff9334f2124306-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-3"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-4" name="rest_code_1adc0460707d4986a0ff9334f2124306-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-4"></a> +<a
id="rest_code_1adc0460707d4986a0ff9334f2124306-5" name="rest_code_1adc0460707d4986a0ff9334f2124306-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-5"></a><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-6" name="rest_code_1adc0460707d4986a0ff9334f2124306-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-6"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-7" name="rest_code_1adc0460707d4986a0ff9334f2124306-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-7"></a> <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-8" name="rest_code_1adc0460707d4986a0ff9334f2124306-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-8"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-9" name="rest_code_1adc0460707d4986a0ff9334f2124306-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-9"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-10" name="rest_code_1adc0460707d4986a0ff9334f2124306-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-10"></a> <span 
class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-11" name="rest_code_1adc0460707d4986a0ff9334f2124306-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-11"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-12" name="rest_code_1adc0460707d4986a0ff9334f2124306-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-12"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-13" name="rest_code_1adc0460707d4986a0ff9334f2124306-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-13"></a><span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-14" name="rest_code_1adc0460707d4986a0ff9334f2124306-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-14"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-15" name="rest_code_1adc0460707d4986a0ff9334f2124306-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-15"></a> <span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">args</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-16" name="rest_code_1adc0460707d4986a0ff9334f2124306-16" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-16"></a> <span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-17" name="rest_code_1adc0460707d4986a0ff9334f2124306-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-17"></a> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-18" name="rest_code_1adc0460707d4986a0ff9334f2124306-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-18"></a> <span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">args</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-19" name="rest_code_1adc0460707d4986a0ff9334f2124306-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-19"></a> <span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-20" name="rest_code_1adc0460707d4986a0ff9334f2124306-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-20"></a> <span class="bp">self</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-21" name="rest_code_1adc0460707d4986a0ff9334f2124306-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-21"></a><span class="hll"> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-22" 
name="rest_code_1adc0460707d4986a0ff9334f2124306-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-22"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-23" name="rest_code_1adc0460707d4986a0ff9334f2124306-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-23"></a> <span class="k">return</span> <span class="p">(</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-24" name="rest_code_1adc0460707d4986a0ff9334f2124306-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-24"></a> <span class="sa">f</span><span class="s2">"Operation(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">, "</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-25" name="rest_code_1adc0460707d4986a0ff9334f2124306-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-25"></a> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span><span class="si">}</span><span class="s2">, "</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-26" name="rest_code_1adc0460707d4986a0ff9334f2124306-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-26"></a> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span 
class="bp">self</span><span class="o">.</span><span class="n">info</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-27" name="rest_code_1adc0460707d4986a0ff9334f2124306-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-27"></a> <span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-28" name="rest_code_1adc0460707d4986a0ff9334f2124306-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-28"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-29" name="rest_code_1adc0460707d4986a0ff9334f2124306-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-29"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Value</span><span class="p">:</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-30" name="rest_code_1adc0460707d4986a0ff9334f2124306-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-30"></a> <span class="n">op</span> <span class="o">=</span> <span class="bp">self</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-31" name="rest_code_1adc0460707d4986a0ff9334f2124306-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-31"></a> <span class="k">while</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-32" name="rest_code_1adc0460707d4986a0ff9334f2124306-32" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-32"></a> <span class="nb">next</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">forwarded</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-33" name="rest_code_1adc0460707d4986a0ff9334f2124306-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-33"></a> <span class="k">if</span> <span class="nb">next</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-34" name="rest_code_1adc0460707d4986a0ff9334f2124306-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-34"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-35" name="rest_code_1adc0460707d4986a0ff9334f2124306-35" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-35"></a> <span class="n">op</span> <span class="o">=</span> <span class="nb">next</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-36" name="rest_code_1adc0460707d4986a0ff9334f2124306-36" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-36"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-37" name="rest_code_1adc0460707d4986a0ff9334f2124306-37" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-37"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-38" name="rest_code_1adc0460707d4986a0ff9334f2124306-38" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-38"></a> <span class="k">def</span> <span class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-39" name="rest_code_1adc0460707d4986a0ff9334f2124306-39" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-39"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-40" name="rest_code_1adc0460707d4986a0ff9334f2124306-40" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-40"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-41" name="rest_code_1adc0460707d4986a0ff9334f2124306-41" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-41"></a> <span class="k">def</span> <span class="nf">make_equal_to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-42" name="rest_code_1adc0460707d4986a0ff9334f2124306-42" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-42"></a> <span class="bp">self</span><span class="o">.</span><span class="n">find</span><span class="p">()</span><span class="o">.</span><span class="n">_set_forwarded</span><span class="p">(</span><span 
class="n">value</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-43" name="rest_code_1adc0460707d4986a0ff9334f2124306-43" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-43"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-44" name="rest_code_1adc0460707d4986a0ff9334f2124306-44" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-44"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-45" name="rest_code_1adc0460707d4986a0ff9334f2124306-45" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-45"></a> <span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-46" name="rest_code_1adc0460707d4986a0ff9334f2124306-46" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-46"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-47" name="rest_code_1adc0460707d4986a0ff9334f2124306-47" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-47"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-48" name="rest_code_1adc0460707d4986a0ff9334f2124306-48" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-48"></a><span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span 
class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-49" name="rest_code_1adc0460707d4986a0ff9334f2124306-49" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-49"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-50" name="rest_code_1adc0460707d4986a0ff9334f2124306-50" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-50"></a> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-51" name="rest_code_1adc0460707d4986a0ff9334f2124306-51" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-51"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-52" name="rest_code_1adc0460707d4986a0ff9334f2124306-52" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-52"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-53" name="rest_code_1adc0460707d4986a0ff9334f2124306-53" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-53"></a> <span class="k">return</span> <span class="sa">f</span><span class="s2">"Constant(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">)"</span> +<a 
id="rest_code_1adc0460707d4986a0ff9334f2124306-54" name="rest_code_1adc0460707d4986a0ff9334f2124306-54" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-54"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-55" name="rest_code_1adc0460707d4986a0ff9334f2124306-55" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-55"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-56" name="rest_code_1adc0460707d4986a0ff9334f2124306-56" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-56"></a> <span class="k">return</span> <span class="bp">self</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-57" name="rest_code_1adc0460707d4986a0ff9334f2124306-57" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-57"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-58" name="rest_code_1adc0460707d4986a0ff9334f2124306-58" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-58"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-59" name="rest_code_1adc0460707d4986a0ff9334f2124306-59" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-59"></a> <span class="k">assert</span> <span class="p">(</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-60" 
name="rest_code_1adc0460707d4986a0ff9334f2124306-60" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-60"></a> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-61" name="rest_code_1adc0460707d4986a0ff9334f2124306-61" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-61"></a> <span class="ow">and</span> <span class="n">value</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-62" name="rest_code_1adc0460707d4986a0ff9334f2124306-62" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-62"></a> <span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-63" name="rest_code_1adc0460707d4986a0ff9334f2124306-63" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-63"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-64" name="rest_code_1adc0460707d4986a0ff9334f2124306-64" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-64"></a><span class="k">class</span> <span class="nc">Block</span><span class="p">(</span><span class="nb">list</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-65" name="rest_code_1adc0460707d4986a0ff9334f2124306-65" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-65"></a> <span class="k">def</span> <span class="nf">opbuilder</span><span 
class="p">(</span><span class="n">opname</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-66" name="rest_code_1adc0460707d4986a0ff9334f2124306-66" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-66"></a> <span class="k">def</span> <span class="nf">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-67" name="rest_code_1adc0460707d4986a0ff9334f2124306-67" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-67"></a> <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-68" name="rest_code_1adc0460707d4986a0ff9334f2124306-68" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-68"></a> <span class="n">arg</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-69" name="rest_code_1adc0460707d4986a0ff9334f2124306-69" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-69"></a> <span class="k">return</span> <span class="n">arg</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-70" name="rest_code_1adc0460707d4986a0ff9334f2124306-70" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-70"></a> <span class="k">def</span> <span class="nf">build</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span 
class="n">args</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-71" name="rest_code_1adc0460707d4986a0ff9334f2124306-71" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-71"></a> <span class="c1"># construct an Operation, wrap the</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-72" name="rest_code_1adc0460707d4986a0ff9334f2124306-72" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-72"></a> <span class="c1"># arguments in Constants if necessary</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-73" name="rest_code_1adc0460707d4986a0ff9334f2124306-73" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-73"></a> <span class="n">op</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-74" name="rest_code_1adc0460707d4986a0ff9334f2124306-74" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-74"></a> <span class="p">[</span><span class="n">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-75" name="rest_code_1adc0460707d4986a0ff9334f2124306-75" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-75"></a> <span class="c1"># add it to self, the basic block</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-76" name="rest_code_1adc0460707d4986a0ff9334f2124306-76" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-76"></a> <span class="bp">self</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-77" name="rest_code_1adc0460707d4986a0ff9334f2124306-77" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-77"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-78" name="rest_code_1adc0460707d4986a0ff9334f2124306-78" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-78"></a> <span class="k">return</span> <span class="n">build</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-79" name="rest_code_1adc0460707d4986a0ff9334f2124306-79" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-79"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-80" name="rest_code_1adc0460707d4986a0ff9334f2124306-80" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-80"></a> <span class="c1"># a bunch of operations we support</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-81" name="rest_code_1adc0460707d4986a0ff9334f2124306-81" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-81"></a> <span class="n">add</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"add"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-82" name="rest_code_1adc0460707d4986a0ff9334f2124306-82" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-82"></a> <span class="n">mul</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"mul"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-83" name="rest_code_1adc0460707d4986a0ff9334f2124306-83" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-83"></a> <span class="n">getarg</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"getarg"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-84" name="rest_code_1adc0460707d4986a0ff9334f2124306-84" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-84"></a> <span class="n">dummy</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"dummy"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-85" name="rest_code_1adc0460707d4986a0ff9334f2124306-85" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-85"></a> <span class="n">lshift</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"lshift"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-86" name="rest_code_1adc0460707d4986a0ff9334f2124306-86" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-86"></a> <span class="c1"># some new one for this post</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-87" name="rest_code_1adc0460707d4986a0ff9334f2124306-87" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-87"></a> <span class="n">alloc</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"alloc"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-88" name="rest_code_1adc0460707d4986a0ff9334f2124306-88" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-88"></a><span class="hll"> <span class="n">load</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"load"</span><span class="p">)</span> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-89" name="rest_code_1adc0460707d4986a0ff9334f2124306-89" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-89"></a><span class="hll"> <span class="n">store</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"store"</span><span class="p">)</span> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-90" name="rest_code_1adc0460707d4986a0ff9334f2124306-90" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-90"></a><span class="hll"> <span class="nb">print</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"print"</span><span class="p">)</span> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-91" name="rest_code_1adc0460707d4986a0ff9334f2124306-91" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-91"></a><span class="hll"> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-92" name="rest_code_1adc0460707d4986a0ff9334f2124306-92" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-92"></a><span class="hll"><span class="k">def</span> <span class="nf">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="n">varprefix</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"var"</span><span class="p">):</span> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-93" name="rest_code_1adc0460707d4986a0ff9334f2124306-93" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-93"></a> <span class="k">def</span> <span class="nf">arg_to_str</span><span class="p">(</span><span class="n">arg</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-94" name="rest_code_1adc0460707d4986a0ff9334f2124306-94" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-94"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-95" name="rest_code_1adc0460707d4986a0ff9334f2124306-95" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-95"></a> <span class="k">return</span> <span class="nb">str</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-96" name="rest_code_1adc0460707d4986a0ff9334f2124306-96" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-96"></a> <span class="k">else</span><span class="p">:</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-97" name="rest_code_1adc0460707d4986a0ff9334f2124306-97" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-97"></a> <span class="k">return</span> <span class="n">varnames</span><span class="p">[</span><span class="n">arg</span><span class="p">]</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-98" name="rest_code_1adc0460707d4986a0ff9334f2124306-98" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-98"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-99" name="rest_code_1adc0460707d4986a0ff9334f2124306-99" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-99"></a> <span class="n">varnames</span> <span class="o">=</span> <span class="p">{}</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-100" name="rest_code_1adc0460707d4986a0ff9334f2124306-100" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-100"></a> <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-101" name="rest_code_1adc0460707d4986a0ff9334f2124306-101" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-101"></a> <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-102" 
name="rest_code_1adc0460707d4986a0ff9334f2124306-102" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-102"></a> <span class="n">var</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">varprefix</span><span class="si">}{</span><span class="n">index</span><span class="si">}</span><span class="s2">"</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-103" name="rest_code_1adc0460707d4986a0ff9334f2124306-103" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-103"></a> <span class="n">varnames</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">var</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-104" name="rest_code_1adc0460707d4986a0ff9334f2124306-104" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-104"></a> <span class="n">arguments</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-105" name="rest_code_1adc0460707d4986a0ff9334f2124306-105" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-105"></a> <span class="n">arg_to_str</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-106" name="rest_code_1adc0460707d4986a0ff9334f2124306-106" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-106"></a> <span class="k">for</span> <span class="n">i</span> 
<span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">))</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-107" name="rest_code_1adc0460707d4986a0ff9334f2124306-107" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-107"></a> <span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-108" name="rest_code_1adc0460707d4986a0ff9334f2124306-108" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-108"></a> <span class="n">strop</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">var</span><span class="si">}</span><span class="s2"> = </span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">arguments</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-109" name="rest_code_1adc0460707d4986a0ff9334f2124306-109" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-109"></a> <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">strop</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-110" name="rest_code_1adc0460707d4986a0ff9334f2124306-110" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-110"></a> <span class="k">return</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span 
class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +</pre></div> +<p>There are two changes to the code from the last post: <code class="docutils literal">Operation</code> instances +have a new <code class="docutils literal">.info</code> field, which is set to <code class="docutils literal">None</code> by default. We will learn +how the info field is used a bit further down. Also, we define some new +operations.</p> +<section id="interpreter"> +<h2>Interpreter</h2> +<p>In this post we will mainly concern ourselves with optimizing +programs that allocate memory. We assume that our language is garbage collected +and memory safe. The new operations that we will optimize are <code class="docutils literal">alloc</code> +(allocates some new object), <code class="docutils literal">store</code> (stores a value into a fixed field of an +object), <code class="docutils literal">load</code> (loads the value from a field in the object).</p> +<p>We are leaving out a lot of details of a "real" system here, usually an +<code class="docutils literal">alloc</code> operation would get some extra information, for example the type of +the freshly allocated object or at least its size. 
<code class="docutils literal">load</code> and <code class="docutils literal">store</code> would +typically have some kind of field offset and maybe some information about the +field's type.</p> +<p>Here's a simple program that uses these operations:</p> +<pre class="literal-block">var0 = getarg(0) +obj0 = alloc() +store(obj0, 0, var0) +var1 = load(obj0, 0) +print(var1)</pre> +<p>The code allocates a new object <code class="docutils literal">obj0</code>, stores <code class="docutils literal">var0</code> into field <code class="docutils literal">0</code> of +the object, then loads the same field and prints the result of the load.</p> +<p>Before we get started in writing the optimizer for these operations, let's try +to understand the semantics of the new operations a bit better. To do this, we +can sketch a small interpreter for basic blocks, supporting only <code class="docutils literal">getarg</code>, +<code class="docutils literal">alloc</code>, <code class="docutils literal">store</code>, <code class="docutils literal">load</code>, <code class="docutils literal">print</code>:</p> +<div class="code"><pre class="code python"><a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-1" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-1"></a><span class="k">def</span> <span class="nf">test_interpret</span><span class="p">():</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-2" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-3" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-3" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-4" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-4"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-5" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-5"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-6" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-6"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-7" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-7"></a> <span 
class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var1</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-8" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-8"></a> <span class="k">assert</span> <span class="n">interpret</span><span class="p">(</span><span class="n">bb</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> <span class="o">==</span> <span class="mi">17</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-9" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-9"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-10" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-10"></a><span class="k">class</span> <span class="nc">Object</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-11" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-11"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-12" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-12"></a> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">int</span><span 
class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-13" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-13"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-14" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-14"></a> <span class="k">def</span> <span class="nf">store</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">idx</span> <span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="n">Any</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-15" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-15"></a> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-16" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-16"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-17" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-17"></a> <span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span class="bp">self</span><span 
class="p">,</span> <span class="n">idx</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-18" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-18"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-19" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-19"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-20" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-20"></a><span class="k">def</span> <span class="nf">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-21" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-21"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">index</span><span class="p">),</span> <span class="n">Constant</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-22" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-22" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-22"></a> <span class="k">return</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">index</span><span class="p">)</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-23" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-23"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-24" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-24"></a><span class="k">def</span> <span class="nf">interpret</span><span class="p">(</span><span class="n">bb</span> <span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span> <span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="n">Any</span><span class="p">]):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-25" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-25"></a> <span class="k">def</span> <span class="nf">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">i</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-26" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-26"></a> <span class="n">arg</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span 
class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-27" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-27"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-28" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-28"></a> <span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-29" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-29"></a> <span class="k">else</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-30" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-30"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-31" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-31"></a> <span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-32" 
name="rest_code_ec539155f5ee4081b1310b37de07b1b6-32" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-32"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-33" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-33"></a> <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-34" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-34"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"getarg"</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-35" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-35" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-35"></a> <span class="n">res</span> <span class="o">=</span> <span class="n">args</span><span class="p">[</span><span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)]</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-36" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-36" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-36"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span 
class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-37" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-37" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-37"></a> <span class="n">res</span> <span class="o">=</span> <span class="n">Object</span><span class="p">()</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-38" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-38" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-38"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-39" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-39" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-39"></a> <span class="n">fieldnum</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-40" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-40" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-40"></a> <span class="n">res</span> <span class="o">=</span> <span class="n">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">fieldnum</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-41" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-41" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-41"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-42" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-42" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-42"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-43" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-43" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-43"></a> <span class="n">fieldnum</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-44" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-44" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-44"></a> <span class="n">fieldvalue</span> <span class="o">=</span> <span class="n">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-45" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-45" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-45"></a> <span class="n">obj</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">fieldnum</span><span 
class="p">,</span> <span class="n">fieldvalue</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-46" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-46" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-46"></a> <span class="c1"># no result, only side effect</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-47" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-47" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-47"></a> <span class="k">continue</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-48" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-48" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-48"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"print"</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-49" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-49" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-49"></a> <span class="n">res</span> <span class="o">=</span> <span class="n">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-50" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-50" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-50"></a> <span class="nb">print</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-51" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-51" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-51"></a> <span class="k">return</span> <span class="n">res</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-52" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-52" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-52"></a> <span class="k">else</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-53" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-53" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-53"></a> <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-54" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-54" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-54"></a> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2"> not supported"</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-55" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-55" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-55"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">res</span> +</pre></div> +<p>The interpreter walks the operations of a block, executing each one in turn. It +uses the <code class="docutils literal">info</code> field to store the result of each already executed +<code class="docutils literal">Operation</code>. 
In this interpreter sketch we stop at the first <code class="docutils literal">print</code> that +we execute and return its argument for the simple but bad reason that it makes +<code class="docutils literal">test_interpret</code> easier to write.</p> +<p>Objects in the interpreter are represented using a class <code class="docutils literal">Object</code>, which +stores the object's fields in a Python dictionary. As written above, this is a +simplification: in a real system the <code class="docutils literal">alloc</code> operation might for example take +some kind of type as an argument, that describes which kinds of fields an +object has and how they are laid out in memory, which would allow more +efficient storage of the content. But we don't want to care about this level of +detail in the post, so using a dict in the interpreter is good enough.</p> +</section> +<section id="version-1-naive-attempt"> +<h2>Version 1: Naive Attempt</h2> +<p>In many programs, some allocated objects don't live for very long and have a +completely predictable lifetime. They get allocated, used for a while, and then +there is no way to reference them any more, so the garbage collector will +reclaim them. The very first example block had such an allocation:</p> +<pre class="literal-block">var0 = getarg(0) +obj0 = alloc() +store(obj0, 0, var0) +var1 = load(obj0, 0) +print(var1)</pre> +<p>Here <code class="docutils literal">obj0</code> is written to, then read from, and then it's no longer used. We +want to optimize such programs to remove this <code class="docutils literal">alloc</code> operation. The optimized +version of this program would look like this:</p> +<pre class="literal-block">var0 = getarg(0) +print(var0)</pre> +<p>The <code class="docutils literal">alloc</code>, <code class="docutils literal">store</code> and <code class="docutils literal">load</code> operations have been completely removed. 
+This is a pretty important optimization for PyPy's JIT: Allocations, memory +reads and writes are quite costly and occur <em>a lot</em> in Python, so getting rid +of as many of them as possible is instrumental for performance.</p> +<p>Implementing the optimization is not a lot of code! However, understanding all +the corner cases of the +optimization and making sure that the resulting program behaves correctly is not +completely trivial. Therefore we will develop the optimization step by step, in +a test-driven fashion: I will start each section with a new test that shows a +bug in the version of the optimization that we have so far.</p> +<p>Let's start in a really naive way. Here's the first test we would like to +pass, using the example program above:</p> +<div class="code"><pre class="code python"><a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-1" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-1"></a><span class="k">def</span> <span class="nf">test_remove_unused_allocation</span><span class="p">():</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-2" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-3" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-4" 
name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-4"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-5" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-5"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-6" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-6"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-7" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-7"></a> <span class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var1</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-8" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-8"></a> <span 
class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-9" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-9"></a> <span class="c1"># the virtual object looks like this:</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-10" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-10"></a> <span class="c1"># obj</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-11" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-11"></a> <span class="c1"># ┌──────────┐</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-12" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-12"></a> <span class="c1"># │ 0: var0 │</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-13" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-13"></a> <span class="c1"># └──────────┘</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-14" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-14"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span 
class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-15" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-15"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-16" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-16"></a><span class="s2">optvar1 = print(optvar0)"""</span> +</pre></div> +<p>We will define a class <code class="docutils literal">VirtualObject</code> that is basically identical to +<code class="docutils literal">Object</code> above. But it will not be used by the interpreter, instead we will +use it during optimization.</p> +<div class="code"><pre class="code python"><a id="rest_code_ac03f32ad0a0449495b514e97e81c430-1" name="rest_code_ac03f32ad0a0449495b514e97e81c430-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-1"></a><span class="k">class</span> <span class="nc">VirtualObject</span><span class="p">:</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-2" name="rest_code_ac03f32ad0a0449495b514e97e81c430-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-2"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-3" name="rest_code_ac03f32ad0a0449495b514e97e81c430-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-3"></a> <span class="bp">self</span><span class="o">.</span><span 
class="n">contents</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Value</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-4" name="rest_code_ac03f32ad0a0449495b514e97e81c430-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-4"></a> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-5" name="rest_code_ac03f32ad0a0449495b514e97e81c430-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-5"></a> <span class="k">def</span> <span class="nf">store</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-6" name="rest_code_ac03f32ad0a0449495b514e97e81c430-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-6"></a> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-7" name="rest_code_ac03f32ad0a0449495b514e97e81c430-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-7"></a> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-8" name="rest_code_ac03f32ad0a0449495b514e97e81c430-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-8"></a> <span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span 
class="bp">self</span><span class="p">,</span> <span class="n">idx</span><span class="p">):</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-9" name="rest_code_ac03f32ad0a0449495b514e97e81c430-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-9"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> +</pre></div> +<p>The structure of the optimizer is going to be like that of the optimizers in the first blog post. +The optimizer makes a single pass over all operations. It removes some and +emits others.</p> +<p>This first version of the allocation removal optimizer is going to be extremely +optimistic. It simply assumes that <em>all</em> the allocations in the program can be +optimized away. That is not realistic in practice. We will have to +refine this approach later, but it's a good way to start. Being this optimistic means that whenever +the optimizer sees an <code class="docutils literal">alloc</code> operation, it removes it and creates a +<code class="docutils literal">VirtualObject</code> object which stores the information that is known during +optimization about the result of the <code class="docutils literal">alloc</code>. Like in the interpreter, the +<code class="docutils literal">VirtualObject</code> is stored in the <code class="docutils literal">.info</code> field of the <code class="docutils literal">Operation</code> instance +that represents the <code class="docutils literal">alloc</code>.</p> +<p>When the optimizer sees a <code class="docutils literal">store</code> operation, it will also remove it and +instead execute the store by calling the <code class="docutils literal">VirtualObject.store</code> method. 
+Here is one important difference between the interpreter and the optimizer: In +the interpreter, the values that were stored into an <code class="docutils literal">Object</code> (and thus +put into the object's <code class="docutils literal">.contents</code> dictionary) were runtime values, for +example integers or other objects. In the optimizer however, the +fields of the <code class="docutils literal">VirtualObject</code> store <code class="docutils literal">Value</code> instances, either <code class="docutils literal">Constant</code> +instances or <code class="docutils literal">Operation</code> instances.</p> +<p>When the optimizer sees a <code class="docutils literal">load</code> operation, it <em>also</em> removes it, and replaces +the <code class="docutils literal">load</code> with the <code class="docutils literal">Operation</code> (or <code class="docutils literal">Constant</code>) that is stored in the +<code class="docutils literal">VirtualObject</code> at that point:</p> +<div class="code"><pre class="code python"><a id="rest_code_e1d77e0a46db40298289e70f374a23cf-1" name="rest_code_e1d77e0a46db40298289e70f374a23cf-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-1"></a><span class="k">def</span> <span class="nf">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-2" name="rest_code_e1d77e0a46db40298289e70f374a23cf-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-3" name="rest_code_e1d77e0a46db40298289e70f374a23cf-3" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-3"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-4" name="rest_code_e1d77e0a46db40298289e70f374a23cf-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-4"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span class="p">:</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-5" name="rest_code_e1d77e0a46db40298289e70f374a23cf-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-5"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">VirtualObject</span><span class="p">()</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-6" name="rest_code_e1d77e0a46db40298289e70f374a23cf-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-6"></a> <span class="k">continue</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-7" name="rest_code_e1d77e0a46db40298289e70f374a23cf-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-8" name="rest_code_e1d77e0a46db40298289e70f374a23cf-8" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-8"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-9" name="rest_code_e1d77e0a46db40298289e70f374a23cf-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-9"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-10" name="rest_code_e1d77e0a46db40298289e70f374a23cf-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-10"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">field</span><span class="p">))</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-11" name="rest_code_e1d77e0a46db40298289e70f374a23cf-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-11"></a> <span class="k">continue</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-12" name="rest_code_e1d77e0a46db40298289e70f374a23cf-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-12"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a 
id="rest_code_e1d77e0a46db40298289e70f374a23cf-13" name="rest_code_e1d77e0a46db40298289e70f374a23cf-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-13"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-14" name="rest_code_e1d77e0a46db40298289e70f374a23cf-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-14"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-15" name="rest_code_e1d77e0a46db40298289e70f374a23cf-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-15"></a> <span class="n">info</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-16" name="rest_code_e1d77e0a46db40298289e70f374a23cf-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-16"></a> <span class="k">continue</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-17" name="rest_code_e1d77e0a46db40298289e70f374a23cf-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-17"></a> <span class="n">opt_bb</span><span class="o">.</span><span 
class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-18" name="rest_code_e1d77e0a46db40298289e70f374a23cf-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-18"></a> <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> +<p>This is the first version of the optimization. It doesn't handle all kinds of +difficult cases, and we'll have to do something about its optimism. +But, already in this minimalistic form, we can write a slightly more complicated +test with two allocations, one object pointing to the other. It works correctly +too, both allocations are removed:</p> +<div class="code"><pre class="code python"><a id="rest_code_c4a730568f38466fa02866676b4b8737-1" name="rest_code_c4a730568f38466fa02866676b4b8737-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-1"></a><span class="k">def</span> <span class="nf">test_remove_two_allocations</span><span class="p">():</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-2" name="rest_code_c4a730568f38466fa02866676b4b8737-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-3" name="rest_code_c4a730568f38466fa02866676b4b8737-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-4" 
name="rest_code_c4a730568f38466fa02866676b4b8737-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-4"></a> <span class="n">obj0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-5" name="rest_code_c4a730568f38466fa02866676b4b8737-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-5"></a> <span class="n">sto1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-6" name="rest_code_c4a730568f38466fa02866676b4b8737-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-6"></a> <span class="n">obj1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-7" name="rest_code_c4a730568f38466fa02866676b4b8737-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-7"></a> <span class="n">sto2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-8" name="rest_code_c4a730568f38466fa02866676b4b8737-8" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-8"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">obj1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-9" name="rest_code_c4a730568f38466fa02866676b4b8737-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-9"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-10" name="rest_code_c4a730568f38466fa02866676b4b8737-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-10"></a> <span class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-11" name="rest_code_c4a730568f38466fa02866676b4b8737-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-11"></a> <span class="c1"># the virtual objects look like this:</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-12" name="rest_code_c4a730568f38466fa02866676b4b8737-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-12"></a> <span class="c1"># obj1</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-13" name="rest_code_c4a730568f38466fa02866676b4b8737-13" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-13"></a> <span class="c1"># ┌──────┐</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-14" name="rest_code_c4a730568f38466fa02866676b4b8737-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-14"></a> <span class="c1"># │ 0: ╷ │</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-15" name="rest_code_c4a730568f38466fa02866676b4b8737-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-15"></a> <span class="c1"># └────┼─┘</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-16" name="rest_code_c4a730568f38466fa02866676b4b8737-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-16"></a> <span class="c1"># │</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-17" name="rest_code_c4a730568f38466fa02866676b4b8737-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-17"></a> <span class="c1"># ▼</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-18" name="rest_code_c4a730568f38466fa02866676b4b8737-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-18"></a> <span class="c1"># obj0</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-19" name="rest_code_c4a730568f38466fa02866676b4b8737-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-19"></a> <span class="c1"># ┌─────────┐</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-20" name="rest_code_c4a730568f38466fa02866676b4b8737-20" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-20"></a> <span class="c1"># │ 0: var0 │</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-21" name="rest_code_c4a730568f38466fa02866676b4b8737-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-21"></a> <span class="c1"># └─────────┘</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-22" name="rest_code_c4a730568f38466fa02866676b4b8737-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-22"></a> <span class="c1"># therefore</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-23" name="rest_code_c4a730568f38466fa02866676b4b8737-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-23"></a> <span class="c1"># var1 is the same as obj0</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-24" name="rest_code_c4a730568f38466fa02866676b4b8737-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-24"></a> <span class="c1"># var2 is the same as var0</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-25" name="rest_code_c4a730568f38466fa02866676b4b8737-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-25"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-26" name="rest_code_c4a730568f38466fa02866676b4b8737-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-26"></a> <span class="k">assert</span> <span 
class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-27" name="rest_code_c4a730568f38466fa02866676b4b8737-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-27"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-28" name="rest_code_c4a730568f38466fa02866676b4b8737-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-28"></a><span class="s2">optvar1 = print(optvar0)"""</span> +</pre></div> +</section> +<section id="version-2-re-materializing-allocations"> +<h2>Version 2: Re-Materializing Allocations</h2> +<p>To make it easier to talk about how the optimizer operates, let's introduce +some terminology. As already suggested by the choice +of the class name <code class="docutils literal">VirtualObject</code>, we will call an object <strong>virtual</strong> if the +optimizer has optimized away the <code class="docutils literal">alloc</code> operation that creates the object. +Other objects are correspondingly <strong>not virtual</strong>, for example those that have +existed before we enter the current code block.</p> +<p>The first problem that we need to fix is the assumption that every +allocation can be removed. So far we only looked at small programs where every +allocation could be removed, or equivalently, where every object is virtual. +Such a program creates virtual objects, stores into and loads from them, and +then forgets the objects. In this simple case removing the allocations is fine. 
+As we saw in the previous section, it's also fine to have a virtual object +reference another virtual object: both allocations can be removed.</p> +<p>What are the cases where we <em>can't</em> remove an allocation? +The first version of the optimizer simply assumed that every allocation can be +removed. This can't work. We will replace this assumption with the following +simple heuristic:</p> +<p>If a reference to a virtual object <code class="docutils literal">a</code> is stored into an object <code class="docutils literal">b</code> +that is not virtual, then <code class="docutils literal">a</code> will also stop being virtual. If an object <code class="docutils literal">a</code> +that was virtual stops being virtual, we say that it <strong>escapes</strong>. <a class="reference internal" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#target-4">¹</a></p> +<p>The simplest test case for this happening looks like this:</p> +<div class="code"><pre class="code python"><a id="rest_code_0c257544406048c9853180429a9d35a8-1" name="rest_code_0c257544406048c9853180429a9d35a8-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-1"></a><span class="k">def</span> <span class="nf">test_materialize</span><span class="p">():</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-2" name="rest_code_0c257544406048c9853180429a9d35a8-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-3" name="rest_code_0c257544406048c9853180429a9d35a8-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span 
class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-4" name="rest_code_0c257544406048c9853180429a9d35a8-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-4"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-5" name="rest_code_0c257544406048c9853180429a9d35a8-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-5"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-6" name="rest_code_0c257544406048c9853180429a9d35a8-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-6"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-7" name="rest_code_0c257544406048c9853180429a9d35a8-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-7"></a> <span class="c1"># obj is virtual, without any fields</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-8" name="rest_code_0c257544406048c9853180429a9d35a8-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-8"></a> 
<span class="c1"># ┌───────┐</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-9" name="rest_code_0c257544406048c9853180429a9d35a8-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-9"></a> <span class="c1"># │ empty │</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-10" name="rest_code_0c257544406048c9853180429a9d35a8-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-10"></a> <span class="c1"># └───────┘</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-11" name="rest_code_0c257544406048c9853180429a9d35a8-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-11"></a> <span class="c1"># then we store a reference to obj into</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-12" name="rest_code_0c257544406048c9853180429a9d35a8-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-12"></a> <span class="c1"># field 0 of var0. 
Since var0 is not virtual,</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-13" name="rest_code_0c257544406048c9853180429a9d35a8-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-13"></a> <span class="c1"># obj escapes, so we have to put it back</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-14" name="rest_code_0c257544406048c9853180429a9d35a8-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-14"></a> <span class="c1"># into the optimized basic block</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-15" name="rest_code_0c257544406048c9853180429a9d35a8-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-15"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-16" name="rest_code_0c257544406048c9853180429a9d35a8-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-16"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-17" name="rest_code_0c257544406048c9853180429a9d35a8-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-17"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-18" name="rest_code_0c257544406048c9853180429a9d35a8-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-18"></a><span class="s2">optvar2 = store(optvar0, 0, 
optvar1)"""</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-19" name="rest_code_0c257544406048c9853180429a9d35a8-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-19"></a> <span class="c1"># so far, fails like this:</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-20" name="rest_code_0c257544406048c9853180429a9d35a8-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-20"></a> <span class="c1"># the line:</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-21" name="rest_code_0c257544406048c9853180429a9d35a8-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-21"></a> <span class="c1"># info.store(field, op.arg(2))</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-22" name="rest_code_0c257544406048c9853180429a9d35a8-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-22"></a> <span class="c1"># produces an AttributeError because info</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-23" name="rest_code_0c257544406048c9853180429a9d35a8-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-23"></a> <span class="c1"># is None</span> +</pre></div> +<p>If the optimizer reaches a point where a virtual object escapes (like the +<code class="docutils literal">store</code> operation in the test), the optimizer has already removed the <code class="docutils literal">alloc</code> +operation that created the virtual object. If the object escapes, we don't want +to go back in the operations list and re-insert the <code class="docutils literal">alloc</code> operation, that +sounds potentially very complicated. 
Instead, we re-insert the <code class="docutils literal">alloc</code> +operation that will recreate the virtual object at the point of escape using a +helper function <code class="docutils literal">materialize</code>.</p> +<div class="code"><pre class="code python"><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-1" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-1"></a><span class="hll"><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-2" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-2"></a><span class="hll"> <span class="k">assert</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-3" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-3"></a><span class="hll"> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-4" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-4" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-4"></a><span class="hll"> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-5" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-5"></a><span class="hll"> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">info</span><span class="p">,</span> <span class="n">VirtualObject</span><span class="p">)</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-6" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-6"></a><span class="hll"> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-7" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-7"></a><span class="hll"> <span class="c1"># put the alloc operation back into the trace</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-8" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-8"></a><span class="hll"> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +</span></pre></div> +<p>I've added a number of fairly strong assertions to 
<code class="docutils literal">materialize</code> to encode our +current assumptions about the situations in which it expects to be called. We +will remove some of them later as we generalize the code.</p> +<p>Now that we have <code class="docutils literal">materialize</code> we need to change <code class="docutils literal">optimize_alloc_removal</code> to +recognize the case of storing a virtual object into a non-virtual one. We can +recognize <code class="docutils literal">Operation</code> instances that produced a virtual object by looking at +their <code class="docutils literal">.info</code> field. If it is <code class="docutils literal">None</code>, the object is not virtual, otherwise +it is. If we store something into a virtual object, we leave the code as above. +If we store a virtual object into an object that is not virtual, we will first +materialize the virtual object, and then emit the store.</p> +<div class="code"><pre class="code python"><a id="rest_code_5fe65fac17ce4da58592318a00455537-1" name="rest_code_5fe65fac17ce4da58592318a00455537-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-1"></a><span class="k">def</span> <span class="nf">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-2" name="rest_code_5fe65fac17ce4da58592318a00455537-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-3" name="rest_code_5fe65fac17ce4da58592318a00455537-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-3"></a> <span class="k">for</span> <span class="n">op</span> <span 
class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-4" name="rest_code_5fe65fac17ce4da58592318a00455537-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-4"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span class="p">:</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-5" name="rest_code_5fe65fac17ce4da58592318a00455537-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-5"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">VirtualObject</span><span class="p">()</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-6" name="rest_code_5fe65fac17ce4da58592318a00455537-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-6"></a> <span class="k">continue</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-7" name="rest_code_5fe65fac17ce4da58592318a00455537-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-8" name="rest_code_5fe65fac17ce4da58592318a00455537-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-8"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span 
class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-9" name="rest_code_5fe65fac17ce4da58592318a00455537-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-9"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-10" name="rest_code_5fe65fac17ce4da58592318a00455537-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-10"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">field</span><span class="p">))</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-11" name="rest_code_5fe65fac17ce4da58592318a00455537-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-11"></a> <span class="k">continue</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-12" name="rest_code_5fe65fac17ce4da58592318a00455537-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-12"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-13" name="rest_code_5fe65fac17ce4da58592318a00455537-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-13"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span 
class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-14" name="rest_code_5fe65fac17ce4da58592318a00455537-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-14"></a><span class="hll"> <span class="k">if</span> <span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-15" name="rest_code_5fe65fac17ce4da58592318a00455537-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-15"></a><span class="hll"> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-16" name="rest_code_5fe65fac17ce4da58592318a00455537-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-16"></a><span class="hll"> <span class="n">info</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-17" name="rest_code_5fe65fac17ce4da58592318a00455537-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-17"></a><span class="hll"> <span class="k">continue</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-18" name="rest_code_5fe65fac17ce4da58592318a00455537-18" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-18"></a><span class="hll"> <span class="k">else</span><span class="p">:</span> <span class="c1"># not virtual</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-19" name="rest_code_5fe65fac17ce4da58592318a00455537-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-19"></a><span class="hll"> <span class="c1"># first materialize the</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-20" name="rest_code_5fe65fac17ce4da58592318a00455537-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-20"></a><span class="hll"> <span class="c1"># right hand side</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-21" name="rest_code_5fe65fac17ce4da58592318a00455537-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-21"></a><span class="hll"> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-22" name="rest_code_5fe65fac17ce4da58592318a00455537-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-22"></a><span class="hll"> <span class="c1"># then emit the store via</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-23" name="rest_code_5fe65fac17ce4da58592318a00455537-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-23"></a><span class="hll"> <span class="c1"># the general path 
below</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-24" name="rest_code_5fe65fac17ce4da58592318a00455537-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-24"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-25" name="rest_code_5fe65fac17ce4da58592318a00455537-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-25"></a> <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> +<p>This is the general idea, and it is enough to pass <code class="docutils literal">test_materialize</code>. But of +course there are still a number of further problems that we now need to solve.</p> +</section> +<section id="version-3-don-t-materialize-twice"> +<h2>Version 3: Don't Materialize Twice</h2> +<p>The first problem is the fact that after we materialize a virtual object, it is +no longer virtual. So if it escapes a second time, it should <em>not</em> be +materialized a second time. 
A test for that case could simply repeat the +<code class="docutils literal">store</code> operation:</p> +<div class="code"><pre class="code python"><a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-1" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-1"></a><span class="k">def</span> <span class="nf">test_dont_materialize_twice</span><span class="p">():</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-2" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-2"></a> <span class="c1"># obj is again an empty virtual object,</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-3" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-3"></a> <span class="c1"># and we store it into var0 *twice*.</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-4" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-4"></a> <span class="c1"># this should only materialize it once</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-5" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-5"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-6" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-6"></a> <span class="n">var0</span> <span 
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-7" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-7"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-8" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-8"></a> <span class="n">sto0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-9" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-9"></a> <span class="n">sto1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-10" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-10"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span 
class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-11" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-11"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-12" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-12"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-13" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-13"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-14" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-14"></a><span class="s2">optvar2 = store(optvar0, 0, optvar1)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-15" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-15"></a><span class="s2">optvar3 = store(optvar0, 0, optvar1)"""</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-16" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-16"></a> <span 
class="c1"># fails so far: the operations that we get</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-17" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-17"></a> <span class="c1"># at the moment are:</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-18" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-18"></a> <span class="c1"># optvar0 = getarg(0)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-19" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-19"></a> <span class="c1"># optvar1 = alloc()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-20" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-20"></a> <span class="c1"># optvar2 = store(optvar0, 0, optvar1)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-21" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-21"></a> <span class="c1"># optvar3 = alloc()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-22" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-22"></a> <span class="c1"># optvar4 = store(optvar0, 0, optvar3)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-23" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-23" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-23"></a> <span class="c1"># ie the object is materialized twice,</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-24" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-24"></a> <span class="c1"># which is incorrect</span> +</pre></div> +<p>We solve the problem by setting the <code class="docutils literal">.info</code> field of an object that we +materialize to <code class="docutils literal">None</code> to mark it as no longer being virtual.</p> +<div class="code"><pre class="code python"><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-1" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-2" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-2"></a> <span class="k">assert</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-3" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-3" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-3"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-4" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-4"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-5" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-5"></a><span class="hll"> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-6" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-6"></a><span class="hll"> <span class="k">return</span> <span class="c1"># already materialized</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-7" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-7"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-8" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-8" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-8"></a><span class="hll"> <span class="c1"># put the alloc operation back into the trace</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-9" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-9"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-10" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-10"></a><span class="hll"> <span class="c1"># but only once</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-11" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-11"></a><span class="hll"> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-12" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-12"></a> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-13" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-13"></a><span class="c1"># optimize_alloc_removal unchanged</span> +</pre></div> +<p>This fixes the problem, only one <code class="docutils literal">alloc</code> is created. 
This fix also allows +another test case to pass, one where we store a non-virtual into another +non-virtual, code which we cannot optimize at all:</p> +<div class="code"><pre class="code python"><a id="rest_code_205bbe2ab59241609c95782a0781cd2c-1" name="rest_code_205bbe2ab59241609c95782a0781cd2c-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-1"></a><span class="k">def</span> <span class="nf">test_materialize_non_virtuals</span><span class="p">():</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-2" name="rest_code_205bbe2ab59241609c95782a0781cd2c-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-2"></a> <span class="c1"># in this example we store a non-virtual var1</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-3" name="rest_code_205bbe2ab59241609c95782a0781cd2c-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-3"></a> <span class="c1"># into another non-virtual var0</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-4" name="rest_code_205bbe2ab59241609c95782a0781cd2c-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-4"></a> <span class="c1"># this should just lead to no optimization at</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-5" name="rest_code_205bbe2ab59241609c95782a0781cd2c-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-5"></a> <span class="c1"># all</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-6" name="rest_code_205bbe2ab59241609c95782a0781cd2c-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-6"></a> <span class="n">bb</span> <span class="o">=</span> <span 
class="n">Block</span><span class="p">()</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-7" name="rest_code_205bbe2ab59241609c95782a0781cd2c-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-7"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-8" name="rest_code_205bbe2ab59241609c95782a0781cd2c-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-8"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-9" name="rest_code_205bbe2ab59241609c95782a0781cd2c-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-9"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-10" name="rest_code_205bbe2ab59241609c95782a0781cd2c-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-10"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-11" name="rest_code_205bbe2ab59241609c95782a0781cd2c-11" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-11"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-12" name="rest_code_205bbe2ab59241609c95782a0781cd2c-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-12"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-13" name="rest_code_205bbe2ab59241609c95782a0781cd2c-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-13"></a><span class="s2">optvar1 = getarg(1)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-14" name="rest_code_205bbe2ab59241609c95782a0781cd2c-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-14"></a><span class="s2">optvar2 = store(optvar0, 0, optvar1)"""</span> +</pre></div> +</section> +<section id="version-4-materialization-of-constants"> +<h2>Version 4: Materialization of Constants</h2> +<p>Another straightforward extension is to support materializing constants. 
A +constant is never virtual, so materializing it should do nothing.</p> +<div class="code"><pre class="code python"><a id="rest_code_b709144ffac344d1ba11ab5b097883f0-1" name="rest_code_b709144ffac344d1ba11ab5b097883f0-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-1"></a><span class="k">def</span> <span class="nf">test_materialization_constants</span><span class="p">():</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-2" name="rest_code_b709144ffac344d1ba11ab5b097883f0-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-2"></a> <span class="c1"># in this example we store the constant 17</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-3" name="rest_code_b709144ffac344d1ba11ab5b097883f0-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-3"></a> <span class="c1"># into the non-virtual var0</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-4" name="rest_code_b709144ffac344d1ba11ab5b097883f0-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-4"></a> <span class="c1"># again, this will not be optimized</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-5" name="rest_code_b709144ffac344d1ba11ab5b097883f0-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-5"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-6" name="rest_code_b709144ffac344d1ba11ab5b097883f0-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-6"></a> <span class="n">var0</span> <span class="o">=</span> <span 
class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-7" name="rest_code_b709144ffac344d1ba11ab5b097883f0-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-7"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-8" name="rest_code_b709144ffac344d1ba11ab5b097883f0-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-8"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-9" name="rest_code_b709144ffac344d1ba11ab5b097883f0-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-9"></a> <span class="c1"># the previous line fails so far, triggering</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-10" name="rest_code_b709144ffac344d1ba11ab5b097883f0-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-10"></a> <span class="c1"># the assert:</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-11" name="rest_code_b709144ffac344d1ba11ab5b097883f0-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-11"></a> <span class="c1"># assert not isinstance(value, Constant)</span> +<a 
id="rest_code_b709144ffac344d1ba11ab5b097883f0-12" name="rest_code_b709144ffac344d1ba11ab5b097883f0-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-12"></a> <span class="c1"># in materialize</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-13" name="rest_code_b709144ffac344d1ba11ab5b097883f0-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-13"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-14" name="rest_code_b709144ffac344d1ba11ab5b097883f0-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-14"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-15" name="rest_code_b709144ffac344d1ba11ab5b097883f0-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-15"></a><span class="s2">optvar1 = store(optvar0, 0, 17)"""</span> +</pre></div> +<p>To implement that case, we check for <code class="docutils literal">value</code> being a constant and return +early:</p> +<div class="code"><pre class="code python"><a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-1" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span 
class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-2" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-2"></a><span class="hll"> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +</span><a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-3" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-3"></a><span class="hll"> <span class="k">return</span> +</span><a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-4" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-4"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-5" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-5"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-6" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-6"></a> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> 
<span class="kc">None</span><span class="p">:</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-7" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-7"></a> <span class="k">return</span> <span class="c1"># already materialized</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-8" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-8"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-9" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-9"></a> <span class="c1"># put the alloc operation back into the trace</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-10" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-10"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-11" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-11"></a> <span class="c1"># but only once</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-12" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-12"></a> <span 
class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-13" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-13"></a> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-14" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-14"></a><span class="c1"># optimize_alloc_removal unchanged</span> +</pre></div> +</section> +<section id="version-5-materializing-fields"> +<h2>Version 5: Materializing Fields</h2> +<p>Now we need to solve a more difficult problem. So far, the virtual objects that +we have materialized have all been empty, meaning they didn't have any fields +written to at the point of materialization. Let's write a test for this:</p> +<div class="code"><pre class="code python"><a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-1" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-1"></a><span class="k">def</span> <span class="nf">test_materialize_fields</span><span class="p">():</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-2" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-3" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-3"></a> <span 
class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-4" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-4"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-5" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-5"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-6" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-6"></a> <span class="n">contents0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-7" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-7"></a> <span class="n">contents1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span 
class="n">obj</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-8" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-8"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-9" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-9"></a> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-10" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-10"></a> <span class="c1"># the virtual obj looks like this</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-11" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-11"></a> <span class="c1"># obj</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-12" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-12"></a> <span class="c1"># ┌──────┬──────────┐</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-13" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-13" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-13"></a> <span class="c1"># │ 0: 8 │ 1: var1 │</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-14" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-14"></a> <span class="c1"># └──────┴──────────┘</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-15" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-15"></a> <span class="c1"># then it needs to be materialized</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-16" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-16"></a> <span class="c1"># this is the first example where a virtual</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-17" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-17"></a> <span class="c1"># object that we want to materialize has any</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-18" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-18"></a> <span class="c1"># content and is not just an empty object</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-19" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-19"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span 
class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-20" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-20"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-21" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-21"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-22" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-22"></a><span class="s2">optvar1 = getarg(1)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-23" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-23"></a><span class="s2">optvar2 = alloc()</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-24" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-24"></a><span class="s2">optvar3 = store(optvar2, 0, 8)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-25" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-25" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-25"></a><span class="s2">optvar4 = store(optvar2, 1, optvar1)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-26" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-26"></a><span class="s2">optvar5 = store(optvar0, 0, optvar2)"""</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-27" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-27"></a> <span class="c1"># fails so far! the operations we get</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-28" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-28"></a> <span class="c1"># at the moment are:</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-29" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-29"></a> <span class="c1"># optvar0 = getarg(0)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-30" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-30"></a> <span class="c1"># optvar1 = getarg(1)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-31" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-31"></a> <span class="c1"># optvar2 = alloc()</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-32" 
name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-32" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-32"></a> <span class="c1"># optvar3 = store(optvar0, 0, optvar2)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-33" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-33"></a> <span class="c1"># which is wrong, because the store operations</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-34" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-34"></a> <span class="c1"># into optvar2 got lost</span> +</pre></div> +<p>To fix this problem, we need to re-create a <code class="docutils literal">store</code> operation for every +element of the <code class="docutils literal">.contents</code> dictionary of the virtual object we are +materializing. 
<a class="reference internal" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#target-5">²</a></p> +<div class="code"><pre class="code python"><a id="rest_code_3903547c0616440380ad3221ad822e36-1" name="rest_code_3903547c0616440380ad3221ad822e36-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-2" name="rest_code_3903547c0616440380ad3221ad822e36-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-2"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-3" name="rest_code_3903547c0616440380ad3221ad822e36-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-3"></a> <span class="k">return</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-4" name="rest_code_3903547c0616440380ad3221ad822e36-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-4"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-5" name="rest_code_3903547c0616440380ad3221ad822e36-5" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-5"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-6" name="rest_code_3903547c0616440380ad3221ad822e36-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-6"></a> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-7" name="rest_code_3903547c0616440380ad3221ad822e36-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-7"></a> <span class="k">return</span> <span class="c1"># already materialized</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-8" name="rest_code_3903547c0616440380ad3221ad822e36-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-8"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-9" name="rest_code_3903547c0616440380ad3221ad822e36-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-9"></a> <span class="c1"># put the alloc operation back into the trace</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-10" name="rest_code_3903547c0616440380ad3221ad822e36-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-10"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span 
class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-11" name="rest_code_3903547c0616440380ad3221ad822e36-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-11"></a><span class="hll"> <span class="c1"># put the content back</span> +</span><a id="rest_code_3903547c0616440380ad3221ad822e36-12" name="rest_code_3903547c0616440380ad3221ad822e36-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-12"></a><span class="hll"> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="n">info</span><span class="o">.</span><span class="n">contents</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> +</span><a id="rest_code_3903547c0616440380ad3221ad822e36-13" name="rest_code_3903547c0616440380ad3221ad822e36-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-13"></a><span class="hll"> <span class="c1"># re-create store operation</span> +</span><a id="rest_code_3903547c0616440380ad3221ad822e36-14" name="rest_code_3903547c0616440380ad3221ad822e36-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-14"></a><span class="hll"> <span class="n">opt_bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +</span><a id="rest_code_3903547c0616440380ad3221ad822e36-15" name="rest_code_3903547c0616440380ad3221ad822e36-15" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-15"></a> <span class="c1"># only materialize once</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-16" name="rest_code_3903547c0616440380ad3221ad822e36-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-16"></a> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-17" name="rest_code_3903547c0616440380ad3221ad822e36-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-17"></a> +<a id="rest_code_3903547c0616440380ad3221ad822e36-18" name="rest_code_3903547c0616440380ad3221ad822e36-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-18"></a><span class="c1"># optimize_alloc_removal unchanged</span> +</pre></div> +<p>This is enough to pass the test.</p> +</section> +<section id="version-6-recursive-materialization"> +<h2>Version 6: Recursive Materialization</h2> +<p>In the above example, the fields of the virtual objects contained +only constants or non-virtual objects. However, we could have a situation where +a whole tree of virtual objects is built, and then the root of the tree escapes. +This makes it necessary to escape the whole tree. 
Let's write a test for a small +tree of two virtual objects:</p> +<div class="code"><pre class="code python"><a id="rest_code_f4090cb2279842e1b178f2d23ac47659-1" name="rest_code_f4090cb2279842e1b178f2d23ac47659-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-1"></a><span class="k">def</span> <span class="nf">test_materialize_chained_objects</span><span class="p">():</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-2" name="rest_code_f4090cb2279842e1b178f2d23ac47659-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-3" name="rest_code_f4090cb2279842e1b178f2d23ac47659-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-4" name="rest_code_f4090cb2279842e1b178f2d23ac47659-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-4"></a> <span class="n">obj0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-5" name="rest_code_f4090cb2279842e1b178f2d23ac47659-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-5"></a> <span class="n">obj1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span 
class="p">()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-6" name="rest_code_f4090cb2279842e1b178f2d23ac47659-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-6"></a> <span class="n">contents</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj1</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-7" name="rest_code_f4090cb2279842e1b178f2d23ac47659-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-7"></a> <span class="n">const</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1337</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-8" name="rest_code_f4090cb2279842e1b178f2d23ac47659-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-8"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj0</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-9" name="rest_code_f4090cb2279842e1b178f2d23ac47659-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-9"></a> <span class="c1"># obj0</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-10" 
name="rest_code_f4090cb2279842e1b178f2d23ac47659-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-10"></a> <span class="c1"># ┌──────┐</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-11" name="rest_code_f4090cb2279842e1b178f2d23ac47659-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-11"></a> <span class="c1"># │ 0: ╷ │</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-12" name="rest_code_f4090cb2279842e1b178f2d23ac47659-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-12"></a> <span class="c1"># └────┼─┘</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-13" name="rest_code_f4090cb2279842e1b178f2d23ac47659-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-13"></a> <span class="c1"># │</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-14" name="rest_code_f4090cb2279842e1b178f2d23ac47659-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-14"></a> <span class="c1"># ▼</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-15" name="rest_code_f4090cb2279842e1b178f2d23ac47659-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-15"></a> <span class="c1"># obj1</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-16" name="rest_code_f4090cb2279842e1b178f2d23ac47659-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-16"></a> <span class="c1"># ┌─────────┐</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-17" name="rest_code_f4090cb2279842e1b178f2d23ac47659-17" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-17"></a> <span class="c1"># │ 0: 1337 │</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-18" name="rest_code_f4090cb2279842e1b178f2d23ac47659-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-18"></a> <span class="c1"># └─────────┘</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-19" name="rest_code_f4090cb2279842e1b178f2d23ac47659-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-19"></a> <span class="c1"># now obj0 escapes</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-20" name="rest_code_f4090cb2279842e1b178f2d23ac47659-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-20"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-21" name="rest_code_f4090cb2279842e1b178f2d23ac47659-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-21"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-22" name="rest_code_f4090cb2279842e1b178f2d23ac47659-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-22"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-23" 
name="rest_code_f4090cb2279842e1b178f2d23ac47659-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-23"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-24" name="rest_code_f4090cb2279842e1b178f2d23ac47659-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-24"></a><span class="s2">optvar2 = alloc()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-25" name="rest_code_f4090cb2279842e1b178f2d23ac47659-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-25"></a><span class="s2">optvar3 = store(optvar2, 0, 1337)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-26" name="rest_code_f4090cb2279842e1b178f2d23ac47659-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-26"></a><span class="s2">optvar4 = store(optvar1, 0, optvar2)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-27" name="rest_code_f4090cb2279842e1b178f2d23ac47659-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-27"></a><span class="s2">optvar5 = store(optvar0, 0, optvar1)"""</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-28" name="rest_code_f4090cb2279842e1b178f2d23ac47659-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-28"></a> <span class="c1"># fails in an annoying way! 
the resulting</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-29" name="rest_code_f4090cb2279842e1b178f2d23ac47659-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-29"></a> <span class="c1"># basic block is not in proper SSA form</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-30" name="rest_code_f4090cb2279842e1b178f2d23ac47659-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-30"></a> <span class="c1"># so printing it fails. The optimized</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-31" name="rest_code_f4090cb2279842e1b178f2d23ac47659-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-31"></a> <span class="c1"># block would look like this:</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-32" name="rest_code_f4090cb2279842e1b178f2d23ac47659-32" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-32"></a> <span class="c1"># optvar0 = getarg(0)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-33" name="rest_code_f4090cb2279842e1b178f2d23ac47659-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-33"></a> <span class="c1"># optvar1 = alloc()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-34" name="rest_code_f4090cb2279842e1b178f2d23ac47659-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-34"></a> <span class="c1"># optvar3 = store(optvar1, 0, optvar2)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-35" name="rest_code_f4090cb2279842e1b178f2d23ac47659-35" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-35"></a> <span class="c1"># optvar4 = store(optvar0, 0, optvar1)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-36" name="rest_code_f4090cb2279842e1b178f2d23ac47659-36" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-36"></a> <span class="c1"># where optvar2 is an ``alloc`` Operation</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-37" name="rest_code_f4090cb2279842e1b178f2d23ac47659-37" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-37"></a> <span class="c1"># that is not itself in the output block</span> +</pre></div> +<p>To fix it, <code class="docutils literal">materialize</code> needs to call itself recursively for all the field +values of the virtual object:</p> +<div class="code"><pre class="code python"><a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-1" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-2" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-2"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span 
class="p">):</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-3" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-3"></a> <span class="k">return</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-4" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-4"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-5" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-5"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-6" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-6"></a> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-7" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-7"></a> <span class="k">return</span> <span class="c1"># already materialized</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-8" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-8" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-8"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-9" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-9"></a> <span class="c1"># put the alloc operation back into the trace</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-10" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-10"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-11" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-11"></a> <span class="c1"># put the content back</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-12" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-12"></a> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">contents</span><span class="o">.</span><span class="n">items</span><span class="p">()):</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-13" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-13" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-13"></a><span class="hll"> <span class="c1"># materialize recursively</span> +</span><a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-14" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-14"></a><span class="hll"> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +</span><a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-15" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-15"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-16" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-16"></a> <span class="c1"># only materialize once</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-17" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-17"></a> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-18" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-18" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-18"></a> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-19" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-19"></a><span class="c1"># optimize_alloc_removal unchanged</span> +</pre></div> +<p>Getting there, the materialization logic is almost done. We need to fix a +subtle remaining problem though.</p> +</section> +<section id="version-7-dealing-with-object-cycles"> +<h2>Version 7: Dealing with Object Cycles</h2> +<p>The bug we need to fix in this section is a bit tricky, and does not immediately +occur in a lot of programs. In +fact, in PyPy a variant of it was hiding out in our optimizer +until we found it much later (despite us being aware of the general problem and +correctly dealing with it in other cases).</p> +<p>The problem is this: a virtual object can (directly or indirectly) point to +itself, and we must carefully deal with that case to avoid infinite recursion in +<code class="docutils literal">materialize</code>. 
Here's the simplest test:</p> +<div class="code"><pre class="code python"><a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-1" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-1"></a><span class="k">def</span> <span class="nf">test_object_graph_cycles</span><span class="p">():</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-2" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-3" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-4" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-4"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-5" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-5"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span 
class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-6" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-6"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-7" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-7"></a> <span class="c1"># ┌────────┐</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-8" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-8"></a> <span class="c1"># ▼ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-9" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-9"></a> <span class="c1"># obj0 │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-10" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-10"></a> <span class="c1"># ┌──────┐ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-11" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-11" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-11"></a> <span class="c1"># │ 0: ╷ │ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-12" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-12"></a> <span class="c1"># └────┼─┘ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-13" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-13"></a> <span class="c1"># │ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-14" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-14"></a> <span class="c1"># └─────┘</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-15" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-15"></a> <span class="c1"># obj0 points to itself, and then it is</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-16" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-16"></a> <span class="c1"># escaped</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-17" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-17"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a 
id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-18" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-18"></a> <span class="c1"># the previous line fails with an</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-19" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-19"></a> <span class="c1"># RecursionError</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-20" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-20"></a> <span class="c1"># materialize calls itself, infinitely</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-21" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-21"></a> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-22" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-22"></a> <span class="c1"># what we want is instead this output:</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-23" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-23"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-24" 
name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-24"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-25" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-25"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-26" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-26"></a><span class="s2">optvar2 = store(optvar1, 0, optvar1)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-27" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-27"></a><span class="s2">optvar3 = store(optvar0, 1, optvar1)"""</span> +</pre></div> +<p>The fix is not a big change, but a little bit subtle nevertheless. +We have to change the +order in which things are done in <code class="docutils literal">materialize</code>. Right after emitting the +<code class="docutils literal">alloc</code>, we set the <code class="docutils literal">.info</code> to <code class="docutils literal">None</code>, to mark the object as not virtual. +Only <em>afterwards</em> do we re-create the stores and call <code class="docutils literal">materialize</code> recursively. 
+If a recursive call reaches the same object, it's already marked as non-virtual, +so <code class="docutils literal">materialize</code> won't recurse further:</p> +<div class="code"><pre class="code python"><a id="rest_code_7ed667c4854348719d115ecce0edcf63-1" name="rest_code_7ed667c4854348719d115ecce0edcf63-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-2" name="rest_code_7ed667c4854348719d115ecce0edcf63-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-2"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-3" name="rest_code_7ed667c4854348719d115ecce0edcf63-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-3"></a> <span class="k">return</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-4" name="rest_code_7ed667c4854348719d115ecce0edcf63-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-4"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-5" 
name="rest_code_7ed667c4854348719d115ecce0edcf63-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-5"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-6" name="rest_code_7ed667c4854348719d115ecce0edcf63-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-6"></a> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-7" name="rest_code_7ed667c4854348719d115ecce0edcf63-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-7"></a> <span class="k">return</span> <span class="c1"># already materialized</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-8" name="rest_code_7ed667c4854348719d115ecce0edcf63-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-8"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-9" name="rest_code_7ed667c4854348719d115ecce0edcf63-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-9"></a> <span class="c1"># put the alloc operation back into the trace</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-10" name="rest_code_7ed667c4854348719d115ecce0edcf63-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-10"></a> <span class="n">opt_bb</span><span 
class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-11" name="rest_code_7ed667c4854348719d115ecce0edcf63-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-11"></a><span class="hll"> <span class="c1"># only materialize once</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-12" name="rest_code_7ed667c4854348719d115ecce0edcf63-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-12"></a><span class="hll"> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-13" name="rest_code_7ed667c4854348719d115ecce0edcf63-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-13"></a><span class="hll"> <span class="c1"># put the content back</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-14" name="rest_code_7ed667c4854348719d115ecce0edcf63-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-14"></a><span class="hll"> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">contents</span><span class="o">.</span><span class="n">items</span><span class="p">()):</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-15" name="rest_code_7ed667c4854348719d115ecce0edcf63-15" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-15"></a><span class="hll"> <span class="c1"># materialize recursively</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-16" name="rest_code_7ed667c4854348719d115ecce0edcf63-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-16"></a><span class="hll"> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-17" name="rest_code_7ed667c4854348719d115ecce0edcf63-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-17"></a><span class="hll"> <span class="n">opt_bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +</span></pre></div> +</section> +<section id="version-8-loading-from-non-virtual-objects"> +<h2>Version 8: Loading from non-virtual objects</h2> +<p>Now materialize is done. We need to go back to <code class="docutils literal">optimize_alloc_removal</code> and +improve it further. The last time we changed it, we added a case analysis to the +code dealing with <code class="docutils literal">store</code>, distinguishing between storing to a virtual and to +a non-virtual object. 
We need to add an equivalent distinction to the <code class="docutils literal">load</code> +case, because right now loading from a non-virtual crashes.</p> +<div class="code"><pre class="code python"><a id="rest_code_3b225716a45245fb930b2e4ec0343836-1" name="rest_code_3b225716a45245fb930b2e4ec0343836-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-1"></a><span class="k">def</span> <span class="nf">test_load_non_virtual</span><span class="p">():</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-2" name="rest_code_3b225716a45245fb930b2e4ec0343836-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-3" name="rest_code_3b225716a45245fb930b2e4ec0343836-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-4" name="rest_code_3b225716a45245fb930b2e4ec0343836-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-4"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-5" name="rest_code_3b225716a45245fb930b2e4ec0343836-5" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-5"></a> <span class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var1</span><span class="p">)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-6" name="rest_code_3b225716a45245fb930b2e4ec0343836-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-6"></a> <span class="c1"># the next line fails in the line</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-7" name="rest_code_3b225716a45245fb930b2e4ec0343836-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-7"></a> <span class="c1"># op.make_equal_to(info.load(field))</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-8" name="rest_code_3b225716a45245fb930b2e4ec0343836-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-8"></a> <span class="c1"># because info is None</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-9" name="rest_code_3b225716a45245fb930b2e4ec0343836-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-9"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-10" name="rest_code_3b225716a45245fb930b2e4ec0343836-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-10"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span 
class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-11" name="rest_code_3b225716a45245fb930b2e4ec0343836-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-11"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-12" name="rest_code_3b225716a45245fb930b2e4ec0343836-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-12"></a><span class="s2">optvar1 = load(optvar0, 0)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-13" name="rest_code_3b225716a45245fb930b2e4ec0343836-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-13"></a><span class="s2">optvar2 = print(optvar1)"""</span> +</pre></div> +<p>To fix it, we split the <code class="docutils literal">load</code> code into two cases, leaving the virtual path +as before, and letting the <code class="docutils literal">load</code> from a non-virtual fall through to the +general code at the end of the function.</p> +<div class="code"><pre class="code python"><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-1" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-1"></a><span class="k">def</span> <span class="nf">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-2" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span 
class="p">()</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-3" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-3"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-4" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-4"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span class="p">:</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-5" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-5"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">VirtualObject</span><span class="p">()</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-6" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-6"></a> <span class="k">continue</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-7" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-8" 
name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-8"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-9" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-9"></a><span class="hll"> <span class="k">if</span> <span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-10" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-10"></a><span class="hll"> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-11" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-11"></a><span class="hll"> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">field</span><span class="p">))</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-12" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-12" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-12"></a><span class="hll"> <span class="k">continue</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-13" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-13"></a><span class="hll"> <span class="c1"># otherwise not virtual, use the</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-14" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-14"></a><span class="hll"> <span class="c1"># general path below</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-15" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-15"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-16" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-16"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-17" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-17"></a> <span class="k">if</span> 
<span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-18" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-18"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-19" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-19"></a> <span class="n">info</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-20" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-20"></a> <span class="k">continue</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-21" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-21"></a> <span class="k">else</span><span class="p">:</span> <span class="c1"># not virtual</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-22" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-22"></a> <span class="c1"># first materialize the</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-23" 
name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-23"></a> <span class="c1"># right hand side</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-24" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-24"></a> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-25" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-25"></a> <span class="c1"># then emit the store via</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-26" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-26"></a> <span class="c1"># the general path below</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-27" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-27"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-28" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-28"></a> <span class="k">return</span> <span class="n">opt_bb</span> 
+</pre></div> +</section> +<section id="version-9-final-materialize-on-other-operations"> +<h2>Version 9 (Final): Materialize on Other Operations</h2> +<p>We're almost at the end now. There's one final generalization left to do. We +started with the heuristic that storing a virtual into a non-virtual would +escape it. This should be generalized. Passing a virtual into any +operation, other than as the first argument of a <code class="docutils literal">load</code> or a <code class="docutils literal">store</code>, +should also escape it (imagine passing the virtual to some function call). +Let's test this as usual with our <code class="docutils literal">print</code> operation:</p> +<div class="code"><pre class="code python"><a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-1" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-1"></a><span class="k">def</span> <span class="nf">test_materialize_on_other_ops</span><span class="p">():</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-2" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-2"></a> <span class="c1"># materialize not just on store</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-3" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-3"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-4" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-4"></a> <span class="n">var0</span> <span
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-5" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-5"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-6" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-6"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var1</span><span class="p">)</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-7" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-7"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-8" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-8"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-9" 
name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-9"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-10" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-10"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-11" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-11"></a><span class="s2">optvar2 = print(optvar1)"""</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-12" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-12"></a> <span class="c1"># again, the resulting basic block is not in</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-13" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-13"></a> <span class="c1"># valid SSA form</span> +</pre></div> +<p>To fix this, we will take the call to <code class="docutils literal">materialize</code> out of the <code class="docutils literal">store</code> code +path and instead put it into the generic code path at the end of the <code class="docutils literal">for</code> +loop:</p> +<div class="code"><pre class="code python"><a id="rest_code_1265bdac21584a538123beb08104bbb3-1" name="rest_code_1265bdac21584a538123beb08104bbb3-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-1"></a><span class="c1"># materialize is unchanged</span>
+<a id="rest_code_1265bdac21584a538123beb08104bbb3-2" name="rest_code_1265bdac21584a538123beb08104bbb3-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-2"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-3" name="rest_code_1265bdac21584a538123beb08104bbb3-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-3"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-4" name="rest_code_1265bdac21584a538123beb08104bbb3-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-4"></a> <span class="k">return</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-5" name="rest_code_1265bdac21584a538123beb08104bbb3-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-5"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-6" name="rest_code_1265bdac21584a538123beb08104bbb3-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-6"></a> <span class="n">info</span> <span class="o">=</span> <span 
class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-7" name="rest_code_1265bdac21584a538123beb08104bbb3-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-7"></a> <span class="k">if</span> <span class="ow">not</span> <span class="n">info</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-8" name="rest_code_1265bdac21584a538123beb08104bbb3-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-8"></a> <span class="c1"># Already materialized</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-9" name="rest_code_1265bdac21584a538123beb08104bbb3-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-9"></a> <span class="k">return</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-10" name="rest_code_1265bdac21584a538123beb08104bbb3-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-10"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-11" name="rest_code_1265bdac21584a538123beb08104bbb3-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-11"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-12" name="rest_code_1265bdac21584a538123beb08104bbb3-12" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-12"></a> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-13" name="rest_code_1265bdac21584a538123beb08104bbb3-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-13"></a> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">contents</span><span class="o">.</span><span class="n">items</span><span class="p">()):</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-14" name="rest_code_1265bdac21584a538123beb08104bbb3-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-14"></a> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-15" name="rest_code_1265bdac21584a538123beb08104bbb3-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-15"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-16" name="rest_code_1265bdac21584a538123beb08104bbb3-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-16"></a> +<a 
id="rest_code_1265bdac21584a538123beb08104bbb3-17" name="rest_code_1265bdac21584a538123beb08104bbb3-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-17"></a><span class="k">def</span> <span class="nf">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-18" name="rest_code_1265bdac21584a538123beb08104bbb3-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-18"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-19" name="rest_code_1265bdac21584a538123beb08104bbb3-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-19"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-20" name="rest_code_1265bdac21584a538123beb08104bbb3-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-20"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-21" name="rest_code_1265bdac21584a538123beb08104bbb3-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-21"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">VirtualObject</span><span class="p">()</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-22" 
name="rest_code_1265bdac21584a538123beb08104bbb3-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-22"></a> <span class="k">continue</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-23" name="rest_code_1265bdac21584a538123beb08104bbb3-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-23"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-24" name="rest_code_1265bdac21584a538123beb08104bbb3-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-24"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-25" name="rest_code_1265bdac21584a538123beb08104bbb3-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-25"></a> <span class="k">if</span> <span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-26" name="rest_code_1265bdac21584a538123beb08104bbb3-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-26"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-27" name="rest_code_1265bdac21584a538123beb08104bbb3-27" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-27"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">field</span><span class="p">))</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-28" name="rest_code_1265bdac21584a538123beb08104bbb3-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-28"></a> <span class="k">continue</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-29" name="rest_code_1265bdac21584a538123beb08104bbb3-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-29"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-30" name="rest_code_1265bdac21584a538123beb08104bbb3-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-30"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-31" name="rest_code_1265bdac21584a538123beb08104bbb3-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-31"></a> <span class="k">if</span> <span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-32" 
name="rest_code_1265bdac21584a538123beb08104bbb3-32" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-32"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-33" name="rest_code_1265bdac21584a538123beb08104bbb3-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-33"></a> <span class="n">info</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-34" name="rest_code_1265bdac21584a538123beb08104bbb3-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-34"></a> <span class="k">continue</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-35" name="rest_code_1265bdac21584a538123beb08104bbb3-35" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-35"></a><span class="hll"> <span class="c1"># materialize all the arguments of</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-36" name="rest_code_1265bdac21584a538123beb08104bbb3-36" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-36"></a><span class="hll"> <span class="c1"># operations that are put into the</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-37" name="rest_code_1265bdac21584a538123beb08104bbb3-37" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-37"></a><span class="hll"> <span class="c1"># output basic block</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-38" name="rest_code_1265bdac21584a538123beb08104bbb3-38" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-38"></a><span class="hll"> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">:</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-39" name="rest_code_1265bdac21584a538123beb08104bbb3-39" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-39"></a><span class="hll"> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-40" name="rest_code_1265bdac21584a538123beb08104bbb3-40" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-40"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-41" name="rest_code_1265bdac21584a538123beb08104bbb3-41" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-41"></a> <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> +<p>That's it, we're done. It's not a lot of code, but actually quite a powerful +optimization. 
In addition to removing allocations for objects that are only used +briefly and in predictable ways, it also has another effect. If an object is +allocated, used in a number of operations and then escapes further down in the +block, the operations in between can often be optimized away. This is +demonstrated by the next test (which already passes):</p> +<div class="code"><pre class="code python"><a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-1" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-1"></a><span class="k">def</span> <span class="nf">test_sink_allocations</span><span class="p">():</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-2" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-3" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-4" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-4"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-5" 
name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-5"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">123</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-6" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-6"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">456</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-7" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-7"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-8" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-8"></a> <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">1</span><span 
class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-9" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-9"></a> <span class="n">var6</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var4</span><span class="p">,</span> <span class="n">var5</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-10" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-10"></a> <span class="n">var7</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var6</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-11" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-11"></a> <span class="n">var8</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-12" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-12"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span 
class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-13" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-13"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-14" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-14"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-15" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-15"></a><span class="s2">optvar1 = add(123, 456)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-16" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-16"></a><span class="s2">optvar2 = alloc()</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-17" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-17"></a><span class="s2">optvar3 = store(optvar2, 0, optvar1)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-18" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-18"></a><span class="s2">optvar4 = store(optvar2, 1, 456)</span> +<a 
id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-19" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-19"></a><span class="s2">optvar5 = store(optvar0, 1, optvar2)"""</span> +</pre></div> +<p>Note that the addition is not optimized away, because the code from this blog +post does not contain constant folding and the other optimizations from +the last one. Combining them would not be too hard though.</p> +</section> +<section id="conclusion"> +<h2>Conclusion</h2> +<p>That's it! The core idea of PyPy's allocation removal optimization in one or +two screens of code. The real implementation has a number of refinements, +but the core ideas are all here.</p> +<p>I'm not going to show any benchmark numbers or anything like that here; if you +are interested in numbers, you could look at Section 6, +"Implementation and Evaluation", of the <a class="reference external" href="https://www3.hhu.de/stups/downloads/pdf/BoCuFiLePeRi2011.pdf">paper</a> that describes the work.</p> +<p>There's a complementary optimization that improves <code class="docutils literal">load</code> and <code class="docutils literal">store</code> +operations for objects that are <em>not</em> virtual. 
I'll probably not write that +down as another post, but <a class="reference external" href="https://bernsteinbear.com/">Max Bernstein</a> and I developed that together on the +<a class="reference external" href="https://twitch.tv/pypyproject">PyPy Twitch channel</a> a few weeks ago; here's the recording:</p> +<iframe width="560" height="315" src="https://www.youtube-nocookie.com/embed/w-UHg0yOPSE" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></section> +<section id="footnotes"> +<h2>Footnotes</h2> +<p id="target-4">¹ This is how PyPy uses the terminology, not really used consistently by other +projects. The term "escape" is fairly standard throughout the <a class="reference external" href="https://en.wikipedia.org/wiki/Escape_analysis">escape +analysis</a> literature. The term "virtual" was used originally in <a class="reference external" href="https://dl.acm.org/doi/abs/10.1145/1014007.1014010">Armin Rigo's +Psyco</a> but is e.g. also used by the paper <a class="reference external" href="https://www.ssw.uni-linz.ac.at/Research/Papers/Stadler14/Stadler2014-CGO-PEA.pdf">Partial Escape Analysis and Scalar +Replacement for Java</a>.</p> +<p id="target-5">² The order in which we put the <cite>store</cite> operations back is relying on +dictionary iteration order, which is insertion order. 
That's not a bad +ordering, we could also be explicit and sort the fields in some order (ideally +the order in which the object lays them out in memory).</p> +</section>toy-optimizerhttps://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.htmlTue, 25 Oct 2022 07:55:00 GMTDüsseldorf HPy/PyPy/GraalPy sprint September 19-23rd 2022https://www.pypy.org/posts/2022/07/ddorf-sprint-sep-2022.htmlCarl Friedrich Bolz-Tereick<p>The programming language group of the Computer Science department of +Heinrich-Heine Universität Düsseldorf is happy to invite everybody to another +sprint in Düsseldorf, from the 19th to the 23rd of September 2022. This is a +fully public sprint, everyone and particularly newcomers are welcome to join +us! The goal is to bring together people from the <a class="reference external" href="https://hpyproject.org/">HPy</a>, PyPy, <a class="reference external" href="https://www.graalvm.org/python/">GraalPy</a> and +CPython communities.</p> +<section id="topics-and-goals"> +<h2>Topics and goals</h2> +<ul class="simple"> +<li><p>work on HPy APIs, discussions around next steps for the project</p></li> +<li><p>continuing new and ongoing ports to HPy, including Cython, NumPy, Pillow, Matplotlib</p></li> +<li><p>3.10 support on PyPy and GraalPy</p></li> +<li><p>preparing the next PyPy release</p></li> +<li><p>discussions around ways to improve collaboration between the different Python +implementations</p></li> +</ul> +</section> +<section id="what-is-a-sprint"> +<h2>What is a sprint?</h2> +<p>The experience of the PyPy project has shown the benefits of regular +sprints. They are focussed one week physical meetings where people pair-program +on new features and discuss future plans. Coming to one is a great way to get +started with a project!</p> +</section> +<section id="location"> +<h2>Location</h2> +<p>The sprint will take place in a seminar room of the computer science +department. 
It is in the building 25.12, room 02.50 (second floor) of the +university campus. For travel instructions see</p> +<blockquote> +<p><a class="reference external" href="https://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/kontakt/service/lage-und-anreise">https://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/kontakt/service/lage-und-anreise</a></p> +</blockquote> +<p>We ask participants to wear masks during the indoor working hours.</p> +<figure> +<a class="reference external image-reference" href="https://commons.wikimedia.org/wiki/File:Universitaets-_und_Landesbibliothek_Duesseldorf_in_Duesseldorf-Bilk,_von_Nordwesten.jpg"> +<img alt="Photograph of the statue of Heinrich Heine in front of the University library on the campus in Düsseldorf" src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/00/Universitaets-_und_Landesbibliothek_Duesseldorf_in_Duesseldorf-Bilk%2C_von_Nordwesten.jpg/640px-Universitaets-_und_Landesbibliothek_Duesseldorf_in_Duesseldorf-Bilk%2C_von_Nordwesten.jpg"> +</a> +<figcaption> +<p>Wiegels, CC BY 3.0, via Wikimedia Commons</p> +</figcaption> +</figure> +</section> +<section id="exact-times"> +<h2>Exact times</h2> +<p>Work days: starting September 19th (~morning), ending September 23rd (~afternoon). 
+We will do a to-be-planned social activity on Wednesday afternoon.</p> +</section> +<section id="registration"> +<h2>Registration</h2> +<p>Please register by editing this file or by opening a <a class="reference external" href="https://doc.pypy.org/en/latest/coding-guide.html">pull request</a>:</p> +<blockquote> +<p><a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/ddorf2022/people.txt">https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/ddorf2022/people.txt</a></p> +</blockquote> +<p>or by sending a quick mail to the pypy-dev mailing list:</p> +<blockquote> +<p><a class="reference external" href="http://mail.python.org/mailman/listinfo/pypy-dev">http://mail.python.org/mailman/listinfo/pypy-dev</a></p> +</blockquote> +</section>sprintshttps://www.pypy.org/posts/2022/07/ddorf-sprint-sep-2022.htmlFri, 29 Jul 2022 12:00:00 GMTImplementing a Toy Optimizerhttps://www.pypy.org/posts/2022/07/toy-optimizer.htmlCarl Friedrich Bolz-Tereick<p>In this blog post I want to show the complete code (in Python3) of how a very +simple optimizer for sequences of operations can work. These algorithms could +be part of a (really simple) compiler, or a JIT. The architecture of the code in +this blog post is very similar to that of the trace optimizer of the PyPy JIT: +After a trace is produced, it is optimized before being sent to the machine code +backend that produces binary instructions for the CPU architecture that PyPy is +running on.</p> +<p>To get started, the first thing we need to do is define how our operations are +stored. The +format that a compiler uses to store the program while it is being optimized +is usually called its <a class="reference external" href="https://en.wikipedia.org/wiki/Intermediate_representation">intermediate representation</a> (IR). 
Many production +compilers use IRs that are in the <a class="reference external" href="https://en.wikipedia.org/wiki/Static_single-assignment_form">Static Single-Assignment Form</a> (SSA), and +we will also use that. SSA form has the property that every variable is +assigned to exactly once, and every variable is defined before it is used. This +simplifies many things.</p> +<p>Let's make this concrete. If our input program is a complex expression, such +as <code class="docutils literal">a * (b + 17) + (b + 17)</code>, the intermediate representation of that (or at +least its text representation) would maybe be something like:</p> +<pre class="literal-block">var1 = add(b, 17) +var2 = mul(a, var1) +var3 = add(b, 17) +var4 = add(var2, var3)</pre> +<p>This sequence of instructions is inefficient. The operation <code class="docutils literal">add(b, 17)</code> is +computed twice and we can save time by removing the second one and only +computing it once. In this post I want to show an optimizer that can do this +(and some related) optimizations.</p> +<p>Looking at the IR we notice that the input expression has been linearized +into a sequence of operations, and all the intermediate results have been given +unique variable names. The value that every variable is assigned is computed +by the right hand side, which is some operation consisting of an operator and an +arbitrary number of arguments. The arguments of an operation are either +themselves variables or constants.</p> +<p>I will not at all talk about the process of translating the input program +into the IR. Instead, I will assume we have some component that does this +translation already. The tests in this blog post will construct small +snippets of IR by hand. 
I also won't talk about what happens after the +optimization (usually the optimized IR is translated into machine code).</p> +<section id="implementing-the-intermediate-representation"> +<h2>Implementing the Intermediate Representation</h2> +<p>Let's start modelling the intermediate representation with Python classes. +First we define a base class of all values that can be used as arguments in +operations, and let's also add a class that represents constants:</p> +<div class="code"><pre class="code python"><a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-1" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-1"></a><span class="kn">import</span> <span class="nn">pytest</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-2" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-2"></a><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Any</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-3" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-3"></a> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-4" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-4"></a><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-5" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-5"></a> <span class="k">pass</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-6" 
name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-6"></a> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-7" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-7"></a><span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-8" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-8"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-9" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-9"></a> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-10" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-10"></a> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-11" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-11"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-12" 
name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-12"></a> <span class="k">return</span> <span class="sa">f</span><span class="s2">"Constant(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">)"</span> +</pre></div> +<p>One consequence of the fact that every variable is assigned to only once is +that variables are in a one-to-one correspondence with the right-hand-side of +their unique assignments. That means that we don't need a class that represents +variables at all. Instead, it's sufficient to have a class that represents an +operation (the right-hand side), and that by definition is the same as the variable (left-hand side) that it defines:</p> +<div class="code"><pre class="code python"><a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-1" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-1"></a><span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-2" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-2"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">args</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]):</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-3" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-3" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-3"></a> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-4" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-4"></a> <span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">args</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-5" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-5"></a> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-6" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-6"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-7" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-7"></a> <span class="k">return</span> <span class="sa">f</span><span class="s2">"Operation(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-8" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-8"></a> 
+<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-9" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-9"></a> <span class="k">def</span> <span class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-10" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-10"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span> +</pre></div> +<p>Now we can instantiate these two classes to represent the example sequence of +operations above:</p> +<div class="code"><pre class="code python"><a id="rest_code_31b4664131db44af997a1af90a539c87-1" name="rest_code_31b4664131db44af997a1af90a539c87-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-1"></a><span class="k">def</span> <span class="nf">test_construct_example</span><span class="p">():</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-2" name="rest_code_31b4664131db44af997a1af90a539c87-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-2"></a> <span class="c1"># first we need something to represent</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-3" name="rest_code_31b4664131db44af997a1af90a539c87-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-3"></a> <span class="c1"># "a" and "b". 
In our limited view, we don't</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-4" name="rest_code_31b4664131db44af997a1af90a539c87-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-4"></a> <span class="c1"># know where they come from, so we will define</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-5" name="rest_code_31b4664131db44af997a1af90a539c87-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-5"></a> <span class="c1"># them with a pseudo-operation called "getarg"</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-6" name="rest_code_31b4664131db44af997a1af90a539c87-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-6"></a> <span class="c1"># which takes a number n as an argument and</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-7" name="rest_code_31b4664131db44af997a1af90a539c87-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-7"></a> <span class="c1"># returns the n-th input argument. 
The proper</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-8" name="rest_code_31b4664131db44af997a1af90a539c87-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-8"></a> <span class="c1"># SSA way to do this would be phi-nodes.</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-9" name="rest_code_31b4664131db44af997a1af90a539c87-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-9"></a> +<a id="rest_code_31b4664131db44af997a1af90a539c87-10" name="rest_code_31b4664131db44af997a1af90a539c87-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-10"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"getarg"</span><span class="p">,</span> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-11" name="rest_code_31b4664131db44af997a1af90a539c87-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-11"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"getarg"</span><span class="p">,</span> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-12" name="rest_code_31b4664131db44af997a1af90a539c87-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-12"></a> <span class="c1"># var1 = add(b, 17)</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-13" name="rest_code_31b4664131db44af997a1af90a539c87-13" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-13"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"add"</span><span class="p">,</span> <span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-14" name="rest_code_31b4664131db44af997a1af90a539c87-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-14"></a> <span class="c1"># var2 = mul(a, var1)</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-15" name="rest_code_31b4664131db44af997a1af90a539c87-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-15"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"mul"</span><span class="p">,</span> <span class="p">[</span><span class="n">a</span><span class="p">,</span> <span class="n">var1</span><span class="p">])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-16" name="rest_code_31b4664131db44af997a1af90a539c87-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-16"></a> <span class="c1"># var3 = add(b, 17)</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-17" name="rest_code_31b4664131db44af997a1af90a539c87-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-17"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"add"</span><span class="p">,</span> <span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">Constant</span><span 
class="p">(</span><span class="mi">17</span><span class="p">)])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-18" name="rest_code_31b4664131db44af997a1af90a539c87-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-18"></a> <span class="c1"># var4 = add(var2, var3)</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-19" name="rest_code_31b4664131db44af997a1af90a539c87-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-19"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"add"</span><span class="p">,</span> <span class="p">[</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-20" name="rest_code_31b4664131db44af997a1af90a539c87-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-20"></a> +<a id="rest_code_31b4664131db44af997a1af90a539c87-21" name="rest_code_31b4664131db44af997a1af90a539c87-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-21"></a> <span class="n">sequence</span> <span class="o">=</span> <span class="p">[</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">var1</span><span class="p">,</span> <span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">,</span> <span class="n">var4</span><span class="p">]</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-22" name="rest_code_31b4664131db44af997a1af90a539c87-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-22"></a> <span class="c1"># nothing to test really, it shouldn't crash</span> +</pre></div> +<p>Usually, 
complicated programs are represented as a <a class="reference external" href="https://en.wikipedia.org/wiki/Control-flow_graph">control flow graph</a> in a +compiler, which represents all the possible paths that control can take while +executing the program. Every node in the control flow graph is a <a class="reference external" href="https://en.wikipedia.org/wiki/Basic_block">basic +block</a>. A basic block is a linear sequence of operations with no control flow +inside of it.</p> +<p>When optimizing a program, a compiler usually looks at the whole control flow +graph of a function. However, that is still too complicated! So let's +simplify further and look only at optimizations we can do when looking at +a single basic block and its sequence of instructions (they are called local +optimizations).</p> +<p>Let's define a class representing basic blocks and let's also add some +convenience functions for constructing sequences of operations, because the +code in <code class="docutils literal">test_construct_example</code> is a bit annoying.</p> +<div class="code"><pre class="code python"><a id="rest_code_cadeff25d2194d8a8f26c581650641c7-1" name="rest_code_cadeff25d2194d8a8f26c581650641c7-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-1"></a><span class="k">class</span> <span class="nc">Block</span><span class="p">(</span><span class="nb">list</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-2" name="rest_code_cadeff25d2194d8a8f26c581650641c7-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-2"></a> <span class="k">def</span> <span class="nf">opbuilder</span><span class="p">(</span><span class="n">opname</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-3" name="rest_code_cadeff25d2194d8a8f26c581650641c7-3" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-3"></a> <span class="k">def</span> <span class="nf">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-4" name="rest_code_cadeff25d2194d8a8f26c581650641c7-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-4"></a> <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-5" name="rest_code_cadeff25d2194d8a8f26c581650641c7-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-5"></a> <span class="n">arg</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-6" name="rest_code_cadeff25d2194d8a8f26c581650641c7-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-6"></a> <span class="k">return</span> <span class="n">arg</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-7" name="rest_code_cadeff25d2194d8a8f26c581650641c7-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-7"></a> <span class="k">def</span> <span class="nf">build</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-8" name="rest_code_cadeff25d2194d8a8f26c581650641c7-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-8"></a> <span class="c1"># 
construct an Operation, wrap the</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-9" name="rest_code_cadeff25d2194d8a8f26c581650641c7-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-9"></a> <span class="c1"># arguments in Constants if necessary</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-10" name="rest_code_cadeff25d2194d8a8f26c581650641c7-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-10"></a> <span class="n">op</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-11" name="rest_code_cadeff25d2194d8a8f26c581650641c7-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-11"></a> <span class="p">[</span><span class="n">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-12" name="rest_code_cadeff25d2194d8a8f26c581650641c7-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-12"></a> <span class="c1"># add it to self, the basic block</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-13" name="rest_code_cadeff25d2194d8a8f26c581650641c7-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-13"></a> <span class="bp">self</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-14" name="rest_code_cadeff25d2194d8a8f26c581650641c7-14" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-14"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-15" name="rest_code_cadeff25d2194d8a8f26c581650641c7-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-15"></a> <span class="k">return</span> <span class="n">build</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-16" name="rest_code_cadeff25d2194d8a8f26c581650641c7-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-16"></a> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-17" name="rest_code_cadeff25d2194d8a8f26c581650641c7-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-17"></a> <span class="c1"># a bunch of operations we support</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-18" name="rest_code_cadeff25d2194d8a8f26c581650641c7-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-18"></a> <span class="n">add</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"add"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-19" name="rest_code_cadeff25d2194d8a8f26c581650641c7-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-19"></a> <span class="n">mul</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"mul"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-20" name="rest_code_cadeff25d2194d8a8f26c581650641c7-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-20"></a> <span class="n">getarg</span> <span class="o">=</span> <span 
class="n">opbuilder</span><span class="p">(</span><span class="s2">"getarg"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-21" name="rest_code_cadeff25d2194d8a8f26c581650641c7-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-21"></a> <span class="n">dummy</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"dummy"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-22" name="rest_code_cadeff25d2194d8a8f26c581650641c7-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-22"></a> <span class="n">lshift</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"lshift"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-23" name="rest_code_cadeff25d2194d8a8f26c581650641c7-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-23"></a> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-24" name="rest_code_cadeff25d2194d8a8f26c581650641c7-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-24"></a><span class="k">def</span> <span class="nf">test_convencience_block_construction</span><span class="p">():</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-25" name="rest_code_cadeff25d2194d8a8f26c581650641c7-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-25"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-26" name="rest_code_cadeff25d2194d8a8f26c581650641c7-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-26"></a> <span class="c1"># a 
again with getarg, the following line</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-27" name="rest_code_cadeff25d2194d8a8f26c581650641c7-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-27"></a> <span class="c1"># defines the Operation instance and</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-28" name="rest_code_cadeff25d2194d8a8f26c581650641c7-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-28"></a> <span class="c1"># immediately adds it to the basic block bb</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-29" name="rest_code_cadeff25d2194d8a8f26c581650641c7-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-29"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-30" name="rest_code_cadeff25d2194d8a8f26c581650641c7-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-30"></a> <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-31" name="rest_code_cadeff25d2194d8a8f26c581650641c7-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-31"></a> <span class="k">assert</span> <span class="n">bb</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"getarg"</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-32" 
name="rest_code_cadeff25d2194d8a8f26c581650641c7-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-32"></a> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-33" name="rest_code_cadeff25d2194d8a8f26c581650641c7-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-33"></a> <span class="c1"># it's a Constant</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-34" name="rest_code_cadeff25d2194d8a8f26c581650641c7-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-34"></a> <span class="k">assert</span> <span class="n">bb</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="mi">0</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-35" name="rest_code_cadeff25d2194d8a8f26c581650641c7-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-35"></a> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-36" name="rest_code_cadeff25d2194d8a8f26c581650641c7-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-36"></a> <span class="c1"># b with getarg</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-37" name="rest_code_cadeff25d2194d8a8f26c581650641c7-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-37"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-38" 
name="rest_code_cadeff25d2194d8a8f26c581650641c7-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-38"></a> <span class="c1"># var1 = add(b, 17)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-39" name="rest_code_cadeff25d2194d8a8f26c581650641c7-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-39"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-40" name="rest_code_cadeff25d2194d8a8f26c581650641c7-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-40"></a> <span class="c1"># var2 = mul(a, var1)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-41" name="rest_code_cadeff25d2194d8a8f26c581650641c7-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-41"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">mul</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-42" name="rest_code_cadeff25d2194d8a8f26c581650641c7-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-42"></a> <span class="c1"># var3 = add(b, 17)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-43" name="rest_code_cadeff25d2194d8a8f26c581650641c7-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-43"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span 
class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-44" name="rest_code_cadeff25d2194d8a8f26c581650641c7-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-44"></a> <span class="c1"># var4 = add(var2, var3)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-45" name="rest_code_cadeff25d2194d8a8f26c581650641c7-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-45"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-46" name="rest_code_cadeff25d2194d8a8f26c581650641c7-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-46"></a> <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> <span class="o">==</span> <span class="mi">6</span> +</pre></div> +<p>That's a good bit of infrastructure to make the tests easy to write. One +thing we are lacking though is a way to print the basic blocks into a nicely +readable textual representation. 
In its current form, the <code class="docutils literal">repr</code> of a +Block is very annoying. The output of pretty-printing <code class="docutils literal">bb</code> in the test above +looks like this:</p> +<div class="code"><pre class="code python"><a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-1" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-1"></a><span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-2" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-2"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-3" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-3"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-4" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-4"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-5" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-5" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-5"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-6" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-6"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-7" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-7"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'mul'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-8" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-8"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-9" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-9"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-10" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-10"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-11" 
name="rest_code_3b3ea7bc40a549f49465cd29e353728b-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-11"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-12" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-12"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-13" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-13"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)])]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-14" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-14"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-15" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-15"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-16" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-16"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span 
class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-17" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-17"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-18" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-18"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-19" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-19"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'mul'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-20" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-20"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-21" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-21"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-22" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-22"></a> <span 
class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-23" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-23"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-24" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-24"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-25" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-25"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)])]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-26" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-26"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-27" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-27"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-28" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-28" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-28"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-29" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-29"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)])])]</span> +</pre></div> +<p>It's impossible to see what is going on here, because the <code class="docutils literal">Operations</code> in the +basic block appear several times, once as elements of the list but then also as +arguments to operations further down in the list. So we need some code that +turns things back into a readable textual representation, so we have a chance +to debug.</p> +<div class="code"><pre class="code python"><a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-1" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-1"></a><span class="k">def</span> <span class="nf">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="n">varprefix</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"var"</span><span class="p">):</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-2" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-2"></a> <span class="c1"># the implementation is not too important,</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-3" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-3" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-3"></a> <span class="c1"># look at the test below to see what the</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-4" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-4"></a> <span class="c1"># result looks like</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-5" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-5"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-6" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-6"></a> <span class="k">def</span> <span class="nf">arg_to_str</span><span class="p">(</span><span class="n">arg</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-7" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-7"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-8" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-8"></a> <span class="k">return</span> <span class="nb">str</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-9" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-9" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-9"></a> <span class="k">else</span><span class="p">:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-10" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-10"></a> <span class="c1"># the key must exist, otherwise it's</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-11" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-11"></a> <span class="c1"># not a valid SSA basic block:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-12" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-12"></a> <span class="c1"># the variable must be defined before</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-13" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-13"></a> <span class="c1"># its first use</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-14" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-14"></a> <span class="k">return</span> <span class="n">varnames</span><span class="p">[</span><span class="n">arg</span><span class="p">]</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-15" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-15"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-16" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-16" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-16"></a> <span class="n">varnames</span> <span class="o">=</span> <span class="p">{}</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-17" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-17"></a> <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-18" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-18"></a> <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-19" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-19"></a> <span class="c1"># give the operation a name used while</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-20" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-20"></a> <span class="c1"># printing:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-21" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-21"></a> <span class="n">var</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">varprefix</span><span class="si">}{</span><span class="n">index</span><span class="si">}</span><span class="s2">"</span> +<a 
id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-22" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-22"></a> <span class="n">varnames</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">var</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-23" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-23"></a> <span class="n">arguments</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-24" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-24"></a> <span class="n">arg_to_str</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-25" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-25"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">))</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-26" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-26"></a> <span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-27" 
name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-27"></a> <span class="n">strop</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">var</span><span class="si">}</span><span class="s2"> = </span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">arguments</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-28" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-28"></a> <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">strop</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-29" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-29"></a> <span class="k">return</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-30" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-30"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-31" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-31"></a><span class="k">def</span> <span class="nf">test_basicblock_to_str</span><span class="p">():</span> 
+<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-32" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-32"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-33" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-33"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-34" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-34"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-35" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-35"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-36" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-36"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-37" 
name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-37"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-38" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-38"></a><span class="s2">var0 = getarg(0)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-39" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-39"></a><span class="s2">var1 = add(5, 4)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-40" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-40"></a><span class="s2">var2 = add(var1, var0)"""</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-41" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-41"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-42" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-42"></a> <span class="c1"># with a different prefix for the invented</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-43" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-43"></a> <span class="c1"># variable names:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-44" 
name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-44"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">,</span> <span class="s2">"x"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-45" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-45"></a><span class="s2">x0 = getarg(0)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-46" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-46"></a><span class="s2">x1 = add(5, 4)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-47" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-47"></a><span class="s2">x2 = add(x1, x0)"""</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-48" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-48"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-49" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-49"></a> <span class="c1"># and our running example:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-50" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-50"></a> <span class="n">bb</span> <span class="o">=</span> <span 
class="n">Block</span><span class="p">()</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-51" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-51"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-52" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-52" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-52"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-53" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-53"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-54" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-54"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">mul</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-55" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-55" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-55"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-56" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-56" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-56"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-57" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-57" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-57"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-58" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-58" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-58"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">,</span> <span class="s2">"v"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-59" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-59" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-59"></a><span class="s2">v0 = getarg(0)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-60" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-60" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-60"></a><span class="s2">v1 = getarg(1)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-61" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-61" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-61"></a><span class="s2">v2 = add(v1, 17)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-62" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-62" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-62"></a><span class="s2">v3 = mul(v0, v2)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-63" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-63" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-63"></a><span class="s2">v4 = add(v1, 17)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-64" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-64" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-64"></a><span class="s2">v5 = add(v3, v4)"""</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-65" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-65" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-65"></a> <span class="c1"># Note the re-numbering of the variables! 
We</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-66" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-66" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-66"></a> <span class="c1"># don't attach names to Operations at all, so</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-67" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-67" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-67"></a> <span class="c1"># the printing will just number them in</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-68" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-68" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-68"></a> <span class="c1"># sequence, can sometimes be a source of</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-69" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-69" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-69"></a> <span class="c1"># confusion.</span> +</pre></div> +<p>This is much better. Now that we're done with the basic infrastructure, we can +define sequences of operations and print them in a readable way. Next we need a +central data structure that is used when actually optimizing basic blocks.</p> +</section> +<section id="storing-equivalences-between-operations-using-a-union-find-data-structure"> +<h2>Storing Equivalences between Operations Using a Union-Find Data Structure</h2> +<p>When optimizing a sequence of operations, we want to make it less costly to +execute. For that we typically want to remove operations (and sometimes +replace operations with less expensive ones). We can remove operations if +they do redundant computation, like in the case of the duplicate <code class="docutils literal">add(v1, 17)</code> in +the example. 
So what we want to do is to turn the running input sequence:</p> +<pre class="literal-block">v0 = getarg(0) +v1 = getarg(1) +v2 = add(v1, 17) +v3 = mul(v0, v2) +v4 = add(v1, 17) +v5 = add(v3, v4)</pre> +<p>Into the following optimized output sequence:</p> +<pre class="literal-block">optvar0 = getarg(0) +optvar1 = getarg(1) +optvar2 = add(optvar1, 17) +optvar3 = mul(optvar0, optvar2) +optvar4 = add(optvar3, optvar2)</pre> +<p>We left out the second <code class="docutils literal">add</code> (which defines <code class="docutils literal">v4</code>), and then replaced the +usage of <code class="docutils literal">v4</code> with <code class="docutils literal">v2</code> in the final operation that defines <code class="docutils literal">v5</code>.</p> +<p>What we effectively did was discover that <code class="docutils literal">v2</code> and <code class="docutils literal">v4</code> are equivalent and then +replace <code class="docutils literal">v4</code> with <code class="docutils literal">v2</code>. In general, we might discover more such equivalences, +and we need a data structure to store them. A good data structure to store +these equivalences is <a class="reference external" href="https://en.wikipedia.org/wiki/Disjoint-set_data_structure">Union Find</a> (also called Disjoint-set data structure), +which stores a collection of disjoint sets. Disjoint means that no operation +can appear in more than one set. The sets in our concrete case are the sets of +operations that compute the same result.</p> +<p>When we start out, every operation is in its own singleton set, with no other +member. As we discover more equivalences, we will unify sets into larger sets +of operations that all compute the same result. 
So one operation the data +structure supports is <code class="docutils literal">union</code>, to unify two sets; we'll call that +<code class="docutils literal">make_equal_to</code> in the code below.</p> +<p>The other operation the data structure supports is <code class="docutils literal">find</code>, which takes an +operation and returns a "representative" of the set of all equivalent +operations. Two operations are in the same set if the representative that +find returns for them is the same.</p> +<p>The exact details of how the data structure works are only sort of important +(even though it's very cool, I promise!). It's OK to skip over the +implementation. We will add the data structure right into our <code class="docutils literal">Value</code>, +<code class="docutils literal">Constant</code> and <code class="docutils literal">Operation</code> classes:</p> +<div class="code"><pre class="code python"><a id="rest_code_a15a9155215648a298765668d60a43a2-1" name="rest_code_a15a9155215648a298765668d60a43a2-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-1"></a><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-2" name="rest_code_a15a9155215648a298765668d60a43a2-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-2"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-3" name="rest_code_a15a9155215648a298765668d60a43a2-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-3"></a> <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-4" 
name="rest_code_a15a9155215648a298765668d60a43a2-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-4"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-5" name="rest_code_a15a9155215648a298765668d60a43a2-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-5"></a> <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-6" name="rest_code_a15a9155215648a298765668d60a43a2-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-6"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-7" name="rest_code_a15a9155215648a298765668d60a43a2-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-7"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-8" name="rest_code_a15a9155215648a298765668d60a43a2-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-8"></a><span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-9" name="rest_code_a15a9155215648a298765668d60a43a2-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-9"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span 
class="n">args</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-10" name="rest_code_a15a9155215648a298765668d60a43a2-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-10"></a> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-11" name="rest_code_a15a9155215648a298765668d60a43a2-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-11"></a> <span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">args</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-12" name="rest_code_a15a9155215648a298765668d60a43a2-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-12"></a> <span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-13" name="rest_code_a15a9155215648a298765668d60a43a2-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-13"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-14" name="rest_code_a15a9155215648a298765668d60a43a2-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-14"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-15" name="rest_code_a15a9155215648a298765668d60a43a2-15" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-15"></a> <span class="k">return</span> <span class="p">(</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-16" name="rest_code_a15a9155215648a298765668d60a43a2-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-16"></a> <span class="sa">f</span><span class="s2">"Operation(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">,"</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-17" name="rest_code_a15a9155215648a298765668d60a43a2-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-17"></a> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-18" name="rest_code_a15a9155215648a298765668d60a43a2-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-18"></a> <span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-19" name="rest_code_a15a9155215648a298765668d60a43a2-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-19"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-20" name="rest_code_a15a9155215648a298765668d60a43a2-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-20"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span 
class="p">)</span> <span class="o">-&gt;</span> <span class="n">Value</span><span class="p">:</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-21" name="rest_code_a15a9155215648a298765668d60a43a2-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-21"></a> <span class="c1"># returns the "representative" value of</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-22" name="rest_code_a15a9155215648a298765668d60a43a2-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-22"></a> <span class="c1"># self, in the union-find sense</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-23" name="rest_code_a15a9155215648a298765668d60a43a2-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-23"></a> <span class="n">op</span> <span class="o">=</span> <span class="bp">self</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-24" name="rest_code_a15a9155215648a298765668d60a43a2-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-24"></a> <span class="k">while</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-25" name="rest_code_a15a9155215648a298765668d60a43a2-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-25"></a> <span class="c1"># could do path compression here too</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-26" name="rest_code_a15a9155215648a298765668d60a43a2-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-26"></a> <span class="c1"># but not essential</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-27" 
name="rest_code_a15a9155215648a298765668d60a43a2-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-27"></a> <span class="nb">next</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">forwarded</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-28" name="rest_code_a15a9155215648a298765668d60a43a2-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-28"></a> <span class="k">if</span> <span class="nb">next</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-29" name="rest_code_a15a9155215648a298765668d60a43a2-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-29"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-30" name="rest_code_a15a9155215648a298765668d60a43a2-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-30"></a> <span class="n">op</span> <span class="o">=</span> <span class="nb">next</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-31" name="rest_code_a15a9155215648a298765668d60a43a2-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-31"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-32" name="rest_code_a15a9155215648a298765668d60a43a2-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-32"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-33" name="rest_code_a15a9155215648a298765668d60a43a2-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-33"></a> <span class="k">def</span> <span 
class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-34" name="rest_code_a15a9155215648a298765668d60a43a2-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-34"></a> <span class="c1"># change to above: return the</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-35" name="rest_code_a15a9155215648a298765668d60a43a2-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-35"></a> <span class="c1"># representative of argument 'index'</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-36" name="rest_code_a15a9155215648a298765668d60a43a2-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-36"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-37" name="rest_code_a15a9155215648a298765668d60a43a2-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-37"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-38" name="rest_code_a15a9155215648a298765668d60a43a2-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-38"></a> <span class="k">def</span> <span class="nf">make_equal_to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-39" name="rest_code_a15a9155215648a298765668d60a43a2-39" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-39"></a> <span class="c1"># this is "union" in the union-find sense,</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-40" name="rest_code_a15a9155215648a298765668d60a43a2-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-40"></a> <span class="c1"># but the direction is important! The</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-41" name="rest_code_a15a9155215648a298765668d60a43a2-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-41"></a> <span class="c1"># representative of the union of Operations</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-42" name="rest_code_a15a9155215648a298765668d60a43a2-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-42"></a> <span class="c1"># must be either a Constant or an operation</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-43" name="rest_code_a15a9155215648a298765668d60a43a2-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-43"></a> <span class="c1"># that we know for sure is not optimized</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-44" name="rest_code_a15a9155215648a298765668d60a43a2-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-44"></a> <span class="c1"># away.</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-45" name="rest_code_a15a9155215648a298765668d60a43a2-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-45"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-46" name="rest_code_a15a9155215648a298765668d60a43a2-46" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-46"></a> <span class="bp">self</span><span class="o">.</span><span class="n">find</span><span class="p">()</span><span class="o">.</span><span class="n">_set_forwarded</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-47" name="rest_code_a15a9155215648a298765668d60a43a2-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-47"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-48" name="rest_code_a15a9155215648a298765668d60a43a2-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-48"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-49" name="rest_code_a15a9155215648a298765668d60a43a2-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-49"></a> <span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-50" name="rest_code_a15a9155215648a298765668d60a43a2-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-50"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-51" name="rest_code_a15a9155215648a298765668d60a43a2-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-51"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-52" name="rest_code_a15a9155215648a298765668d60a43a2-52" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-52"></a><span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-53" name="rest_code_a15a9155215648a298765668d60a43a2-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-53"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-54" name="rest_code_a15a9155215648a298765668d60a43a2-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-54"></a> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-55" name="rest_code_a15a9155215648a298765668d60a43a2-55" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-55"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-56" name="rest_code_a15a9155215648a298765668d60a43a2-56" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-56"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-57" name="rest_code_a15a9155215648a298765668d60a43a2-57" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-57"></a> <span class="k">return</span> <span class="sa">f</span><span class="s2">"Constant(</span><span class="si">{</span><span 
class="bp">self</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-58" name="rest_code_a15a9155215648a298765668d60a43a2-58" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-58"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-59" name="rest_code_a15a9155215648a298765668d60a43a2-59" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-59"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-60" name="rest_code_a15a9155215648a298765668d60a43a2-60" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-60"></a> <span class="k">return</span> <span class="bp">self</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-61" name="rest_code_a15a9155215648a298765668d60a43a2-61" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-61"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-62" name="rest_code_a15a9155215648a298765668d60a43a2-62" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-62"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-63" name="rest_code_a15a9155215648a298765668d60a43a2-63" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-63"></a> <span class="c1"># if we found out that an Operation is</span> +<a 
id="rest_code_a15a9155215648a298765668d60a43a2-64" name="rest_code_a15a9155215648a298765668d60a43a2-64" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-64"></a> <span class="c1"># equal to a constant, it's a compiler bug</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-65" name="rest_code_a15a9155215648a298765668d60a43a2-65" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-65"></a> <span class="c1"># to find out that it's equal to another</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-66" name="rest_code_a15a9155215648a298765668d60a43a2-66" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-66"></a> <span class="c1"># constant</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-67" name="rest_code_a15a9155215648a298765668d60a43a2-67" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-67"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_a15a9155215648a298765668d60a43a2-68" name="rest_code_a15a9155215648a298765668d60a43a2-68" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-68"></a> <span class="n">value</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-69" name="rest_code_a15a9155215648a298765668d60a43a2-69" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-69"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-70" 
name="rest_code_a15a9155215648a298765668d60a43a2-70" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-70"></a><span class="k">def</span> <span class="nf">test_union_find</span><span class="p">():</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-71" name="rest_code_a15a9155215648a298765668d60a43a2-71" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-71"></a> <span class="c1"># construct three operations, and unify them</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-72" name="rest_code_a15a9155215648a298765668d60a43a2-72" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-72"></a> <span class="c1"># step by step</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-73" name="rest_code_a15a9155215648a298765668d60a43a2-73" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-73"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-74" name="rest_code_a15a9155215648a298765668d60a43a2-74" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-74"></a> <span class="n">a1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-75" name="rest_code_a15a9155215648a298765668d60a43a2-75" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-75"></a> <span class="n">a2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> +<a
id="rest_code_a15a9155215648a298765668d60a43a2-76" name="rest_code_a15a9155215648a298765668d60a43a2-76" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-76"></a> <span class="n">a3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-77" name="rest_code_a15a9155215648a298765668d60a43a2-77" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-77"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-78" name="rest_code_a15a9155215648a298765668d60a43a2-78" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-78"></a> <span class="c1"># at the beginning, every op is its own</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-79" name="rest_code_a15a9155215648a298765668d60a43a2-79" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-79"></a> <span class="c1"># representative, that means every</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-80" name="rest_code_a15a9155215648a298765668d60a43a2-80" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-80"></a> <span class="c1"># operation is in a singleton set</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-81" name="rest_code_a15a9155215648a298765668d60a43a2-81" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-81"></a> <span class="c1"># {a1} {a2} {a3}</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-82" name="rest_code_a15a9155215648a298765668d60a43a2-82" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-82"></a> <span class="k">assert</span> <span 
class="n">a1</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-83" name="rest_code_a15a9155215648a298765668d60a43a2-83" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-83"></a> <span class="k">assert</span> <span class="n">a2</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a2</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-84" name="rest_code_a15a9155215648a298765668d60a43a2-84" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-84"></a> <span class="k">assert</span> <span class="n">a3</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a3</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-85" name="rest_code_a15a9155215648a298765668d60a43a2-85" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-85"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-86" name="rest_code_a15a9155215648a298765668d60a43a2-86" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-86"></a> <span class="c1"># now we unify a2 and a1, then the sets are</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-87" name="rest_code_a15a9155215648a298765668d60a43a2-87" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-87"></a> <span class="c1"># {a1, a2} {a3}</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-88" name="rest_code_a15a9155215648a298765668d60a43a2-88" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-88"></a> <span class="n">a2</span><span 
class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">a1</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-89" name="rest_code_a15a9155215648a298765668d60a43a2-89" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-89"></a> <span class="c1"># they both return a1 as the representative</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-90" name="rest_code_a15a9155215648a298765668d60a43a2-90" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-90"></a> <span class="k">assert</span> <span class="n">a1</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-91" name="rest_code_a15a9155215648a298765668d60a43a2-91" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-91"></a> <span class="k">assert</span> <span class="n">a2</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-92" name="rest_code_a15a9155215648a298765668d60a43a2-92" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-92"></a> <span class="c1"># a3 is still different</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-93" name="rest_code_a15a9155215648a298765668d60a43a2-93" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-93"></a> <span class="k">assert</span> <span class="n">a3</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a3</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-94" 
name="rest_code_a15a9155215648a298765668d60a43a2-94" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-94"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-95" name="rest_code_a15a9155215648a298765668d60a43a2-95" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-95"></a> <span class="c1"># now they are all in the same set {a1, a2, a3}</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-96" name="rest_code_a15a9155215648a298765668d60a43a2-96" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-96"></a> <span class="n">a3</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">a2</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-97" name="rest_code_a15a9155215648a298765668d60a43a2-97" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-97"></a> <span class="k">assert</span> <span class="n">a1</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-98" name="rest_code_a15a9155215648a298765668d60a43a2-98" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-98"></a> <span class="k">assert</span> <span class="n">a2</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-99" name="rest_code_a15a9155215648a298765668d60a43a2-99" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-99"></a> <span class="k">assert</span> <span class="n">a3</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> 
<span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-100" name="rest_code_a15a9155215648a298765668d60a43a2-100" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-100"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-101" name="rest_code_a15a9155215648a298765668d60a43a2-101" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-101"></a> <span class="c1"># now they are still all the same, and we</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-102" name="rest_code_a15a9155215648a298765668d60a43a2-102" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-102"></a> <span class="c1"># also learned that they are the same as the</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-103" name="rest_code_a15a9155215648a298765668d60a43a2-103" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-103"></a> <span class="c1"># constant 6</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-104" name="rest_code_a15a9155215648a298765668d60a43a2-104" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-104"></a> <span class="c1"># the single remaining set then is</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-105" name="rest_code_a15a9155215648a298765668d60a43a2-105" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-105"></a> <span class="c1"># {6, a1, a2, a3}</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-106" name="rest_code_a15a9155215648a298765668d60a43a2-106" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-106"></a> <span class="n">c</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span 
class="mi">6</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-107" name="rest_code_a15a9155215648a298765668d60a43a2-107" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-107"></a> <span class="n">a2</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-108" name="rest_code_a15a9155215648a298765668d60a43a2-108" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-108"></a> <span class="k">assert</span> <span class="n">a1</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">c</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-109" name="rest_code_a15a9155215648a298765668d60a43a2-109" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-109"></a> <span class="k">assert</span> <span class="n">a2</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">c</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-110" name="rest_code_a15a9155215648a298765668d60a43a2-110" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-110"></a> <span class="k">assert</span> <span class="n">a3</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">c</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-111" name="rest_code_a15a9155215648a298765668d60a43a2-111" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-111"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-112" name="rest_code_a15a9155215648a298765668d60a43a2-112" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-112"></a> <span class="c1"># union with the same constant again is fine</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-113" name="rest_code_a15a9155215648a298765668d60a43a2-113" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-113"></a> <span class="n">a2</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> +</pre></div> +</section> +<section id="constant-folding"> +<h2>Constant Folding</h2> +<p>Now comes the first actual optimization, a simple <a class="reference external" href="https://en.wikipedia.org/wiki/Constant_folding">constant folding</a> pass. It +will remove operations where all the arguments are constants and replace them +with the constant result.</p> +<p>Every pass has the same structure: we go over all operations in the basic +block in order and decide for each operation whether it can be removed. For the +constant folding pass, we can remove all the operations with constant +arguments (but we'll implement only the <code class="docutils literal">add</code> case here).</p> +<p>I will show a buggy version of the <a class="reference external" href="https://en.wikipedia.org/wiki/Constant_folding">constant folding</a> pass first. It has a +problem that is related to why we need the union-find data structure. 
We will +fix it a bit further down.</p> +<div class="code"><pre class="code python"><a id="rest_code_daa85120c4a44379affe0c40e571c659-1" name="rest_code_daa85120c4a44379affe0c40e571c659-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-1"></a><span class="k">def</span> <span class="nf">constfold_buggy</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-2" name="rest_code_daa85120c4a44379affe0c40e571c659-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-3" name="rest_code_daa85120c4a44379affe0c40e571c659-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-3"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-4" name="rest_code_daa85120c4a44379affe0c40e571c659-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-4"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-5" name="rest_code_daa85120c4a44379affe0c40e571c659-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-5"></a> <span class="c1"># basic idea: go over the list and do</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-6" name="rest_code_daa85120c4a44379affe0c40e571c659-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-6"></a> <span class="c1"># constant folding of 
add where possible</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-7" name="rest_code_daa85120c4a44379affe0c40e571c659-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-8" name="rest_code_daa85120c4a44379affe0c40e571c659-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-8"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-9" name="rest_code_daa85120c4a44379affe0c40e571c659-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-9"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-10" name="rest_code_daa85120c4a44379affe0c40e571c659-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-10"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_daa85120c4a44379affe0c40e571c659-11" name="rest_code_daa85120c4a44379affe0c40e571c659-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-11"></a> <span class="nb">isinstance</span><span class="p">(</span><span 
class="n">arg1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-12" name="rest_code_daa85120c4a44379affe0c40e571c659-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-12"></a> <span class="c1"># can constant-fold! that means we</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-13" name="rest_code_daa85120c4a44379affe0c40e571c659-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-13"></a> <span class="c1"># learned a new equality, namely</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-14" name="rest_code_daa85120c4a44379affe0c40e571c659-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-14"></a> <span class="c1"># that op is equal to a specific</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-15" name="rest_code_daa85120c4a44379affe0c40e571c659-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-15"></a> <span class="c1"># constant</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-16" name="rest_code_daa85120c4a44379affe0c40e571c659-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-16"></a> <span class="n">value</span> <span class="o">=</span> <span class="n">arg0</span><span class="o">.</span><span class="n">value</span> <span class="o">+</span> <span class="n">arg1</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-17" name="rest_code_daa85120c4a44379affe0c40e571c659-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-17"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span 
class="n">Constant</span><span class="p">(</span><span class="n">value</span><span class="p">))</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-18" name="rest_code_daa85120c4a44379affe0c40e571c659-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-18"></a> <span class="c1"># don't need to have the operation</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-19" name="rest_code_daa85120c4a44379affe0c40e571c659-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-19"></a> <span class="c1"># in the optimized basic block</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-20" name="rest_code_daa85120c4a44379affe0c40e571c659-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-20"></a> <span class="k">continue</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-21" name="rest_code_daa85120c4a44379affe0c40e571c659-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-21"></a> <span class="c1"># otherwise the operation is not</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-22" name="rest_code_daa85120c4a44379affe0c40e571c659-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-22"></a> <span class="c1"># constant-foldable and we put into the</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-23" name="rest_code_daa85120c4a44379affe0c40e571c659-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-23"></a> <span class="c1"># output list</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-24" name="rest_code_daa85120c4a44379affe0c40e571c659-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-24"></a> <span class="n">opt_bb</span><span 
class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-25" name="rest_code_daa85120c4a44379affe0c40e571c659-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-25"></a> <span class="k">return</span> <span class="n">opt_bb</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-26" name="rest_code_daa85120c4a44379affe0c40e571c659-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-26"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-27" name="rest_code_daa85120c4a44379affe0c40e571c659-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-27"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-28" name="rest_code_daa85120c4a44379affe0c40e571c659-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-28"></a><span class="k">def</span> <span class="nf">test_constfold_simple</span><span class="p">():</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-29" name="rest_code_daa85120c4a44379affe0c40e571c659-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-29"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-30" name="rest_code_daa85120c4a44379affe0c40e571c659-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-30"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-31" 
name="rest_code_daa85120c4a44379affe0c40e571c659-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-31"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-32" name="rest_code_daa85120c4a44379affe0c40e571c659-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-32"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-33" name="rest_code_daa85120c4a44379affe0c40e571c659-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-33"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-34" name="rest_code_daa85120c4a44379affe0c40e571c659-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-34"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">constfold_buggy</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-35" name="rest_code_daa85120c4a44379affe0c40e571c659-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-35"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a 
id="rest_code_daa85120c4a44379affe0c40e571c659-36" name="rest_code_daa85120c4a44379affe0c40e571c659-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-36"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-37" name="rest_code_daa85120c4a44379affe0c40e571c659-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-37"></a><span class="s2">optvar1 = add(9, optvar0)"""</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-38" name="rest_code_daa85120c4a44379affe0c40e571c659-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-38"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-39" name="rest_code_daa85120c4a44379affe0c40e571c659-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-39"></a><span class="nd">@pytest</span><span class="o">.</span><span class="n">mark</span><span class="o">.</span><span class="n">xfail</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-40" name="rest_code_daa85120c4a44379affe0c40e571c659-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-40"></a><span class="k">def</span> <span class="nf">test_constfold_buggy_limitation</span><span class="p">():</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-41" name="rest_code_daa85120c4a44379affe0c40e571c659-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-41"></a> <span class="c1"># this test fails! 
it shows the problem with</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-42" name="rest_code_daa85120c4a44379affe0c40e571c659-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-42"></a> <span class="c1"># the above simple constfold_buggy pass</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-43" name="rest_code_daa85120c4a44379affe0c40e571c659-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-43"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-44" name="rest_code_daa85120c4a44379affe0c40e571c659-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-44"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-45" name="rest_code_daa85120c4a44379affe0c40e571c659-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-45"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-46" name="rest_code_daa85120c4a44379affe0c40e571c659-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-46"></a> <span class="c1"># this is folded</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-47" name="rest_code_daa85120c4a44379affe0c40e571c659-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-47"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span 
class="mi">4</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-48" name="rest_code_daa85120c4a44379affe0c40e571c659-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-48"></a> <span class="c1"># we want this folded too, but it doesn't work</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-49" name="rest_code_daa85120c4a44379affe0c40e571c659-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-49"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-50" name="rest_code_daa85120c4a44379affe0c40e571c659-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-50"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-51" name="rest_code_daa85120c4a44379affe0c40e571c659-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-51"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-52" name="rest_code_daa85120c4a44379affe0c40e571c659-52" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-52"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">constfold_buggy</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-53" 
name="rest_code_daa85120c4a44379affe0c40e571c659-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-53"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-54" name="rest_code_daa85120c4a44379affe0c40e571c659-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-54"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-55" name="rest_code_daa85120c4a44379affe0c40e571c659-55" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-55"></a><span class="s2">optvar1 = add(19, optvar0)"""</span> +</pre></div> +<p>Why does the test fail? The <code class="docutils literal">opt_bb</code> printed output looks like this:</p> +<pre class="literal-block">optvar0 = getarg(0) +optvar1 = add(9, 10) +optvar2 = add(optvar1, optvar0)</pre> +<p>The problem is that when we optimize the second addition in <code class="docutils literal">constfold_buggy</code>, +the argument of that operation is an <em>Operation</em>, not a <code class="docutils literal">Constant</code>, so +constant-folding is not applied to the second add. However, we have already +learned that the argument <code class="docutils literal">var1</code> to the operation <code class="docutils literal">var2</code> is equal to +<code class="docutils literal">Constant(9)</code>. This information is stored in the union-find data structure. 
+So what we are missing are suitable find calls in the constant folding pass, to +make use of the previously learned equalities.</p> +<p>Here's the fixed version:</p> +<div class="code"><pre class="code python"><a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-1" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-1"></a><span class="k">def</span> <span class="nf">constfold</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-2" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-3" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-3"></a> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-4" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-4"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-5" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-5"></a> <span class="c1"># basic idea: go over the list and do</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-6" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-6" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-6"></a> <span class="c1"># constant folding of add where possible</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-7" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-8" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-8"></a> <span class="c1"># &gt;&gt;&gt; changed</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-9" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-9"></a><span class="hll"> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="c1"># uses .find()</span> +</span><a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-10" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-10"></a><span class="hll"> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> <span class="c1"># uses .find()</span> +</span><a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-11" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-11" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-11"></a> <span class="c1"># &lt;&lt;&lt; end changes</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-12" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-12"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-13" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-13"></a> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-14" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-14"></a> <span class="c1"># can constant-fold! 
that means we</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-15" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-15"></a> <span class="c1"># learned a new equality, namely</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-16" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-16"></a> <span class="c1"># that op is equal to a specific</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-17" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-17"></a> <span class="c1"># constant</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-18" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-18"></a> <span class="n">value</span> <span class="o">=</span> <span class="n">arg0</span><span class="o">.</span><span class="n">value</span> <span class="o">+</span> <span class="n">arg1</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-19" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-19"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">value</span><span class="p">))</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-20" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-20"></a> <span class="c1"># don't need to have the 
operation</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-21" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-21"></a> <span class="c1"># in the optimized basic block</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-22" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-22"></a> <span class="k">continue</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-23" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-23"></a> <span class="c1"># otherwise the operation is not</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-24" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-24"></a> <span class="c1"># constant-foldable and we put it into the</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-25" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-25"></a> <span class="c1"># output list</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-26" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-26"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-27" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-27"></a> <span class="k">return</span> <span 
class="n">opt_bb</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-28" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-28"></a> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-29" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-29"></a> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-30" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-30"></a><span class="k">def</span> <span class="nf">test_constfold_two_ops</span><span class="p">():</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-31" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-31"></a> <span class="c1"># now it works!</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-32" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-32"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-33" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-33"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-34" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-34" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-34"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-35" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-35"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-36" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-36"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-37" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-37"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">constfold</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-38" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-38"></a> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-39" 
name="rest_code_b07e310d695e4fd184cd77ed64be36f5-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-39"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-40" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-40"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-41" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-41"></a><span class="s2">optvar1 = add(19, optvar0)"""</span> +</pre></div> +</section> +<section id="common-subexpression-elimination"> +<h2>Common Subexpression Elimination</h2> +<p>The <code class="docutils literal">constfold</code> pass only discovers equalities between <code class="docutils literal">Operations</code> and +<code class="docutils literal">Constants</code>. 
Let's do a second pass that also discovers equalities between +<code class="docutils literal">Operations</code> and other <code class="docutils literal">Operations</code>.</p> +<p>A simple optimization that has this property is <a class="reference external" href="https://en.wikipedia.org/wiki/Common_subexpression_elimination">common subexpression +elimination</a> (CSE), which will finally optimize away the problem in the +introductory example code that we had above.</p> +<div class="code"><pre class="code python"><a id="rest_code_5a167cf0fa6e448499556f57339456ca-1" name="rest_code_5a167cf0fa6e448499556f57339456ca-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-1"></a><span class="k">def</span> <span class="nf">cse</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-2" name="rest_code_5a167cf0fa6e448499556f57339456ca-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-2"></a> <span class="c1"># structure is the same, loop over the input,</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-3" name="rest_code_5a167cf0fa6e448499556f57339456ca-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-3"></a> <span class="c1"># add some but not all operations to the</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-4" name="rest_code_5a167cf0fa6e448499556f57339456ca-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-4"></a> <span class="c1"># output</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-5" name="rest_code_5a167cf0fa6e448499556f57339456ca-5" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-5"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-6" name="rest_code_5a167cf0fa6e448499556f57339456ca-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-6"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-7" name="rest_code_5a167cf0fa6e448499556f57339456ca-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-7"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-8" name="rest_code_5a167cf0fa6e448499556f57339456ca-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-8"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-9" name="rest_code_5a167cf0fa6e448499556f57339456ca-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-9"></a> <span class="c1"># only do CSE for add here, but it</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-10" name="rest_code_5a167cf0fa6e448499556f57339456ca-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-10"></a> <span class="c1"># generalizes</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-11" name="rest_code_5a167cf0fa6e448499556f57339456ca-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-11"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-12" 
name="rest_code_5a167cf0fa6e448499556f57339456ca-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-12"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-13" name="rest_code_5a167cf0fa6e448499556f57339456ca-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-13"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-14" name="rest_code_5a167cf0fa6e448499556f57339456ca-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-14"></a> <span class="c1"># Check whether we have emitted the</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-15" name="rest_code_5a167cf0fa6e448499556f57339456ca-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-15"></a> <span class="c1"># same operation already</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-16" name="rest_code_5a167cf0fa6e448499556f57339456ca-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-16"></a> <span class="n">prev_op</span> <span class="o">=</span> <span class="n">find_prev_add_op</span><span class="p">(</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-17" name="rest_code_5a167cf0fa6e448499556f57339456ca-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-17"></a> <span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span 
class="p">,</span> <span class="n">opt_bb</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-18" name="rest_code_5a167cf0fa6e448499556f57339456ca-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-18"></a> <span class="k">if</span> <span class="n">prev_op</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-19" name="rest_code_5a167cf0fa6e448499556f57339456ca-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-19"></a> <span class="c1"># if yes, we can optimize op away</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-20" name="rest_code_5a167cf0fa6e448499556f57339456ca-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-20"></a> <span class="c1"># and replace it with the earlier</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-21" name="rest_code_5a167cf0fa6e448499556f57339456ca-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-21"></a> <span class="c1"># result, which is an Operation</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-22" name="rest_code_5a167cf0fa6e448499556f57339456ca-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-22"></a> <span class="c1"># that was already emitted to</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-23" name="rest_code_5a167cf0fa6e448499556f57339456ca-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-23"></a> <span class="c1"># opt_bb</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-24" name="rest_code_5a167cf0fa6e448499556f57339456ca-24" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-24"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">prev_op</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-25" name="rest_code_5a167cf0fa6e448499556f57339456ca-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-25"></a> <span class="k">continue</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-26" name="rest_code_5a167cf0fa6e448499556f57339456ca-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-26"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-27" name="rest_code_5a167cf0fa6e448499556f57339456ca-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-27"></a> <span class="k">return</span> <span class="n">opt_bb</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-28" name="rest_code_5a167cf0fa6e448499556f57339456ca-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-28"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-29" name="rest_code_5a167cf0fa6e448499556f57339456ca-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-29"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-30" name="rest_code_5a167cf0fa6e448499556f57339456ca-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-30"></a><span class="k">def</span> <span class="nf">eq_value</span><span class="p">(</span><span class="n">val0</span><span class="p">,</span> <span 
class="n">val1</span><span class="p">):</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-31" name="rest_code_5a167cf0fa6e448499556f57339456ca-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-31"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">val0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-32" name="rest_code_5a167cf0fa6e448499556f57339456ca-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-32"></a> <span class="nb">isinstance</span><span class="p">(</span><span class="n">val1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-33" name="rest_code_5a167cf0fa6e448499556f57339456ca-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-33"></a> <span class="c1"># constants compare by their value</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-34" name="rest_code_5a167cf0fa6e448499556f57339456ca-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-34"></a> <span class="k">return</span> <span class="n">val0</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="n">val1</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-35" name="rest_code_5a167cf0fa6e448499556f57339456ca-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-35"></a> <span class="c1"># everything else by identity</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-36" name="rest_code_5a167cf0fa6e448499556f57339456ca-36" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-36"></a> <span class="k">return</span> <span class="n">val0</span> <span class="ow">is</span> <span class="n">val1</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-37" name="rest_code_5a167cf0fa6e448499556f57339456ca-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-37"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-38" name="rest_code_5a167cf0fa6e448499556f57339456ca-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-38"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-39" name="rest_code_5a167cf0fa6e448499556f57339456ca-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-39"></a><span class="k">def</span> <span class="nf">find_prev_add_op</span><span class="p">(</span><span class="n">arg0</span><span class="p">:</span> <span class="n">Value</span><span class="p">,</span> <span class="n">arg1</span><span class="p">:</span> <span class="n">Value</span><span class="p">,</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-40" name="rest_code_5a167cf0fa6e448499556f57339456ca-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-40"></a> <span class="n">opt_bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Operation</span><span class="p">]:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-41" name="rest_code_5a167cf0fa6e448499556f57339456ca-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-41"></a> <span class="c1"># Really naive and quadratic implementation.</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-42" 
name="rest_code_5a167cf0fa6e448499556f57339456ca-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-42"></a> <span class="c1"># What we do is walk over the already emitted</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-43" name="rest_code_5a167cf0fa6e448499556f57339456ca-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-43"></a> <span class="c1"># operations and see whether we emitted an add</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-44" name="rest_code_5a167cf0fa6e448499556f57339456ca-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-44"></a> <span class="c1"># with the current arguments already. A real</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-45" name="rest_code_5a167cf0fa6e448499556f57339456ca-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-45"></a> <span class="c1"># implementation might use a hashmap of some</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-46" name="rest_code_5a167cf0fa6e448499556f57339456ca-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-46"></a> <span class="c1"># kind, or at least only look at a limited</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-47" name="rest_code_5a167cf0fa6e448499556f57339456ca-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-47"></a> <span class="c1"># window of instructions.</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-48" name="rest_code_5a167cf0fa6e448499556f57339456ca-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-48"></a> <span class="k">for</span> <span class="n">opt_op</span> <span class="ow">in</span> <span class="n">opt_bb</span><span 
class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-49" name="rest_code_5a167cf0fa6e448499556f57339456ca-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-49"></a> <span class="k">if</span> <span class="n">opt_op</span><span class="o">.</span><span class="n">name</span> <span class="o">!=</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-50" name="rest_code_5a167cf0fa6e448499556f57339456ca-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-50"></a> <span class="k">continue</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-51" name="rest_code_5a167cf0fa6e448499556f57339456ca-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-51"></a> <span class="c1"># It's important to call arg here,</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-52" name="rest_code_5a167cf0fa6e448499556f57339456ca-52" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-52"></a> <span class="c1"># for the same reason why we</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-53" name="rest_code_5a167cf0fa6e448499556f57339456ca-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-53"></a> <span class="c1"># needed it in constfold: we need to</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-54" name="rest_code_5a167cf0fa6e448499556f57339456ca-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-54"></a> <span class="c1"># make sure .find() is called</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-55" name="rest_code_5a167cf0fa6e448499556f57339456ca-55" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-55"></a> <span class="k">if</span> <span class="n">eq_value</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">opt_op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="ow">and</span> \ +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-56" name="rest_code_5a167cf0fa6e448499556f57339456ca-56" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-56"></a> <span class="n">eq_value</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">opt_op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)):</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-57" name="rest_code_5a167cf0fa6e448499556f57339456ca-57" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-57"></a> <span class="k">return</span> <span class="n">opt_op</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-58" name="rest_code_5a167cf0fa6e448499556f57339456ca-58" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-58"></a> <span class="k">return</span> <span class="kc">None</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-59" name="rest_code_5a167cf0fa6e448499556f57339456ca-59" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-59"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-60" name="rest_code_5a167cf0fa6e448499556f57339456ca-60" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-60"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-61" 
name="rest_code_5a167cf0fa6e448499556f57339456ca-61" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-61"></a><span class="k">def</span> <span class="nf">test_cse</span><span class="p">():</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-62" name="rest_code_5a167cf0fa6e448499556f57339456ca-62" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-62"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-63" name="rest_code_5a167cf0fa6e448499556f57339456ca-63" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-63"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-64" name="rest_code_5a167cf0fa6e448499556f57339456ca-64" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-64"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-65" name="rest_code_5a167cf0fa6e448499556f57339456ca-65" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-65"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-66" name="rest_code_5a167cf0fa6e448499556f57339456ca-66" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-66"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">mul</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-67" name="rest_code_5a167cf0fa6e448499556f57339456ca-67" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-67"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-68" name="rest_code_5a167cf0fa6e448499556f57339456ca-68" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-68"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-69" name="rest_code_5a167cf0fa6e448499556f57339456ca-69" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-69"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-70" name="rest_code_5a167cf0fa6e448499556f57339456ca-70" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-70"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">cse</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-71" 
name="rest_code_5a167cf0fa6e448499556f57339456ca-71" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-71"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-72" name="rest_code_5a167cf0fa6e448499556f57339456ca-72" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-72"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-73" name="rest_code_5a167cf0fa6e448499556f57339456ca-73" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-73"></a><span class="s2">optvar1 = getarg(1)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-74" name="rest_code_5a167cf0fa6e448499556f57339456ca-74" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-74"></a><span class="s2">optvar2 = add(optvar1, 17)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-75" name="rest_code_5a167cf0fa6e448499556f57339456ca-75" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-75"></a><span class="s2">optvar3 = mul(optvar0, optvar2)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-76" name="rest_code_5a167cf0fa6e448499556f57339456ca-76" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-76"></a><span class="s2">optvar4 = add(optvar3, optvar2)"""</span> +</pre></div> +</section> +<section id="strength-reduction"> +<h2>Strength Reduction</h2> +<p>Now we have one pass that replaces <code class="docutils literal">Operations</code> with <code class="docutils 
literal">Constants</code> and one that +replaces <code class="docutils literal">Operations</code> with previously existing <code class="docutils literal">Operations</code>. Let's now do one +final pass that replaces <code class="docutils literal">Operations</code> by newly invented <code class="docutils literal">Operations</code>, a simple +<a class="reference external" href="https://en.wikipedia.org/wiki/Strength_reduction">strength reduction</a>. This one will be simple.</p> +<div class="code"><pre class="code python"><a id="rest_code_0f38ef580c61466493f9ead527062ee0-1" name="rest_code_0f38ef580c61466493f9ead527062ee0-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-1"></a><span class="k">def</span> <span class="nf">strength_reduce</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-2" name="rest_code_0f38ef580c61466493f9ead527062ee0-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-3" name="rest_code_0f38ef580c61466493f9ead527062ee0-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-3"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-4" name="rest_code_0f38ef580c61466493f9ead527062ee0-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-4"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span 
class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-5" name="rest_code_0f38ef580c61466493f9ead527062ee0-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-5"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-6" name="rest_code_0f38ef580c61466493f9ead527062ee0-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-6"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-7" name="rest_code_0f38ef580c61466493f9ead527062ee0-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-7"></a> <span class="k">if</span> <span class="n">arg0</span> <span class="ow">is</span> <span class="n">arg1</span><span class="p">:</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-8" name="rest_code_0f38ef580c61466493f9ead527062ee0-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-8"></a> <span class="c1"># x + x turns into x &lt;&lt; 1</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-9" name="rest_code_0f38ef580c61466493f9ead527062ee0-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-9"></a> <span class="n">newop</span> <span class="o">=</span> <span class="n">opt_bb</span><span class="o">.</span><span class="n">lshift</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span 
class="mi">1</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-10" name="rest_code_0f38ef580c61466493f9ead527062ee0-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-10"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">newop</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-11" name="rest_code_0f38ef580c61466493f9ead527062ee0-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-11"></a> <span class="k">continue</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-12" name="rest_code_0f38ef580c61466493f9ead527062ee0-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-12"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-13" name="rest_code_0f38ef580c61466493f9ead527062ee0-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-13"></a> <span class="k">return</span> <span class="n">opt_bb</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-14" name="rest_code_0f38ef580c61466493f9ead527062ee0-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-14"></a> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-15" name="rest_code_0f38ef580c61466493f9ead527062ee0-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-15"></a><span class="k">def</span> <span class="nf">test_strength_reduce</span><span class="p">():</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-16" name="rest_code_0f38ef580c61466493f9ead527062ee0-16" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-16"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-17" name="rest_code_0f38ef580c61466493f9ead527062ee0-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-17"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-18" name="rest_code_0f38ef580c61466493f9ead527062ee0-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-18"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-19" name="rest_code_0f38ef580c61466493f9ead527062ee0-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-19"></a> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-20" name="rest_code_0f38ef580c61466493f9ead527062ee0-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-20"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">strength_reduce</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-21" name="rest_code_0f38ef580c61466493f9ead527062ee0-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-21"></a> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-22" 
name="rest_code_0f38ef580c61466493f9ead527062ee0-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-22"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-23" name="rest_code_0f38ef580c61466493f9ead527062ee0-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-23"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-24" name="rest_code_0f38ef580c61466493f9ead527062ee0-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-24"></a><span class="s2">optvar1 = lshift(optvar0, 1)"""</span> +</pre></div> +</section> +<section id="putting-things-together"> +<h2>Putting Things Together</h2> +<p>Let's combine the passes into one single pass, so that we go over all +the operations exactly once, instead of having to look at every operation +once for each of the different passes.</p> +<div class="code"><pre class="code python"><a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-1" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-1"></a><span class="k">def</span> <span class="nf">optimize</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-2" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-2" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-3" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-3"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-4" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-4"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-5" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-5"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-6" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-6"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-7" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-7"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span 
class="mi">1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-8" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-8"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-9" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-9"></a> <span class="c1"># constant folding</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-10" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-10"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-11" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-11"></a> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-12" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-12"></a> <span class="n">value</span> <span class="o">=</span> <span class="n">arg0</span><span class="o">.</span><span class="n">value</span> <span class="o">+</span> <span class="n">arg1</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-13" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-13" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-13"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">value</span><span class="p">))</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-14" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-14"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-15" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-15"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-16" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-16"></a> <span class="c1"># cse</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-17" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-17"></a> <span class="n">prev_op</span> <span class="o">=</span> <span class="n">find_prev_add_op</span><span class="p">(</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-18" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-18"></a> <span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">,</span> <span class="n">opt_bb</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-19" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-19"></a> <span 
class="k">if</span> <span class="n">prev_op</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-20" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-20"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">prev_op</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-21" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-21"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-22" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-22"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-23" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-23"></a> <span class="c1"># strength reduce:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-24" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-24"></a> <span class="c1"># x + x turns into x &lt;&lt; 1</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-25" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-25"></a> <span class="k">if</span> <span class="n">arg0</span> <span class="ow">is</span> <span class="n">arg1</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-26" 
name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-26"></a> <span class="n">newop</span> <span class="o">=</span> <span class="n">opt_bb</span><span class="o">.</span><span class="n">lshift</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-27" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-27"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">newop</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-28" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-28"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-29" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-29"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-30" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-30"></a> <span class="c1"># and while we are at it, let's do some</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-31" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-31"></a> <span class="c1"># arithmetic simplification:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-32" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-32" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-32"></a> <span class="c1"># a + 0 =&gt; a</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-33" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-33"></a> <span class="k">if</span> <span class="n">eq_value</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)):</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-34" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-34"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-35" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-35"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-36" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-36"></a> <span class="k">if</span> <span class="n">eq_value</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)):</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-37" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-37"></a> <span class="n">op</span><span class="o">.</span><span 
class="n">make_equal_to</span><span class="p">(</span><span class="n">arg0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-38" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-38"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-39" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-39"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-40" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-40"></a> <span class="k">return</span> <span class="n">opt_bb</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-41" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-41"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-42" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-42"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-43" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-43"></a><span class="k">def</span> <span class="nf">test_single_pass</span><span class="p">():</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-44" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-44"></a> <span 
class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-45" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-45"></a> <span class="c1"># constant folding</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-46" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-46"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-47" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-47"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-48" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-48"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-49" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-49"></a> <span class="n">var3</span> <span 
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-50" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-50"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-51" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-51"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-52" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-52" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-52"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-53" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-53"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-54" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-54"></a><span class="s2">optvar1 = add(19, optvar0)"""</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-55" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-55" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-55"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-56" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-56" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-56"></a> <span class="c1"># cse + strength reduction</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-57" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-57" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-57"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-58" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-58" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-58"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-59" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-59" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-59"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-60" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-60" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-60"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span 
class="n">var1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-61" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-61" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-61"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> <span class="c1"># the same as var2</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-62" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-62" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-62"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-63" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-63" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-63"></a> <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span> <span class="c1"># the same as var4</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-64" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-64" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-64"></a> <span class="n">var6</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var4</span><span class="p">,</span> <span 
class="n">var5</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-65" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-65" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-65"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-66" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-66" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-66"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-67" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-67" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-67"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-68" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-68" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-68"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-69" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-69" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-69"></a><span class="s2">optvar1 = getarg(1)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-70" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-70" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-70"></a><span class="s2">optvar2 = add(optvar0, optvar1)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-71" 
name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-71" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-71"></a><span class="s2">optvar3 = add(optvar2, 2)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-72" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-72" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-72"></a><span class="s2">optvar4 = lshift(optvar3, 1)"""</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-73" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-73" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-73"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-74" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-74" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-74"></a> <span class="c1"># removing + 0</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-75" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-75" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-75"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-76" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-76" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-76"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-77" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-77" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-77"></a> <span class="n">var1</span> <span class="o">=</span> <span 
class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">16</span><span class="p">,</span> <span class="o">-</span><span class="mi">16</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-78" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-78" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-78"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-79" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-79" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-79"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">var2</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-80" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-80" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-80"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-81" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-81" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-81"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-82" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-82" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-82"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-83" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-83" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-83"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-84" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-84" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-84"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-85" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-85" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-85"></a><span class="s2">optvar1 = lshift(optvar0, 1)"""</span> +</pre></div> +</section> +<section id="conclusion"> +<h2>Conclusion</h2> +<p>That's it for now. Why is this architecture cool? From a software engineering +point of view, sticking everything into a single function like in <code class="docutils literal">optimize</code> +above is obviously not great, and if you wanted to do this for real you would +try to split the cases into different functions that are individually +digestible, or even use a DSL that makes the pattern matching much more +readable. 
But the advantage of the architecture is that it's quite efficient, +it makes it possible to pack a lot of good optimizations into a single pass +over a basic block.</p> +<p>Of course this works even better if you are in a tracing context, where +everything is put into a trace, which is basically one incredibly long basic +block. In a JIT context it's also quite important that the +optimizer itself runs quickly.</p> +<p>Various other optimizations are possible in this model. There is a +<a class="reference external" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html">follow-up post</a> that shows how to implement what is arguably PyPy's <a class="reference external" href="https://www.pypy.org/posts/2010/09/escape-analysis-in-pypys-jit-1780048403046080197.html">most +important optimization</a>.</p> +</section> +<section id="some-further-pointers"> +<h2>Some Further Pointers</h2> +<p>This post is only a short introduction and is taking some shortcuts, I wanted to +also give some (non-exhaustive) pointers to more general literature about the +touched topics.</p> +<p>The approach to CSE described here can usually be seen as <a class="reference external" href="https://en.wikipedia.org/wiki/Value_numbering">value +numbering</a>, it's normally really implemented with a hashmap though. Here's a +<a class="reference external" href="https://www.cs.tufts.edu/~nr/cs257/archive/keith-cooper/value-numbering.pdf">paper</a> that describes various styles of implementing that, even beyond a +single basic block. The paper also partly takes the perspective of discovering +equivalence classes of operations that compute the same result.</p> +<p>A technique that leans even more fully into finding equivalences between +operations is using e-graphs and then applying <a class="reference external" href="https://en.wikipedia.org/wiki/E-graph#Equality_saturation">equality saturation</a> (this is +significantly more advanced than what I described here though). 
A cool modern +project that applies this technique is <a class="reference external" href="https://egraphs-good.github.io/">egg</a>.</p> +<p>If you squint a bit, you can generally view a constant folding pass as a very +simple form of <a class="reference external" href="https://en.wikipedia.org/wiki/Partial_evaluation">Partial Evaluation</a>: every operation that has constant +arguments is constant-folded away, and the remaining ones are "residualized", +i.e. put into the output program. This point of view is not super important for +the current post, but will become important in the next one.</p> +<p><strong>Acknowledgements:</strong> Thanks to <a class="reference external" href="https://thorstenball.com/">Thorsten Ball</a> for <a class="reference external" href="https://twitter.com/cfbolz/status/1547231548017106944">getting me</a> to write +this and for his enthusiastic feedback. I also got great feedback from <a class="reference external" href="https://bernsteinbear.com/">Max +Bernstein</a>, Matti Picus and Per Vognsen. A conversation with <a class="reference external" href="https://pengwu.substack.com/">Peng Wu</a> that +we had many many years ago and that stuck with me made me keep thinking about +various ways to view compiler optimizations.</p> +</section>toy-optimizerhttps://www.pypy.org/posts/2022/07/toy-optimizer.htmlTue, 19 Jul 2022 12:00:00 GMTHow is PyPy Tested?https://www.pypy.org/posts/2022/04/how-is-pypy-tested.htmlCarl Friedrich Bolz-Tereick<section id="how-is-pypy-tested"> +<h2>How is PyPy Tested?</h2> +<p>In this post I want to give an overview of how the PyPy project does and thinks +about testing. PyPy takes testing quite seriously and has done so from the 
Here I want to present the different styles of +tests that PyPy has, when we use them and how I think about them.</p> +<section id="background"> +<h3>Background</h3> +<p>To make the blog post self-contained, I am going to start with a small overview +about PyPy's architecture. If you already know what PyPy is and how it works, +you can skip this section.</p> +<p>PyPy means "Python in Python". It is an alternative implementation of the Python +language. Usually, when we speak of "Python", we can mean two different things. +On the one hand it means "Python as an abstract programming language". On the +other hand, the main implementation of that language is also often called +"Python". To more clearly distinguish the two, the implementation is often also +called "CPython", because it is an interpreter implemented in C code.</p> +<p>Now we can make the statement "PyPy is Python in Python" more precise: PyPy is +an interpreter for Python 3.9, implemented in RPython. RPython ("Restricted +Python") is a subset of Python 2, which is statically typed (using type +inference, not type annotations) and can be compiled +to C code. That means we can take our Python 3.9 interpreter, and compile it +into a C binary that can run Python 3.9 code. The final binary behaves pretty +similarly to CPython.</p> +<p>The main thing that makes PyPy interesting is that during the translation of our +interpreter to C, a number of components are automatically inserted into the +final binary. One component is a reasonably good garbage collector.</p> +<p>The more exciting component that is inserted into the binary is a just-in-time +compiler. The insertion of this component is not fully automatic, instead it is +guided by a small number of annotations in the source code of the interpreter. +The effect of inserting this JIT compiler into the binary is that the resulting +binary can run Python code significantly faster than CPython, in many cases. 
+How this works is not important for the rest of the post, if you want to see an +example of concretely doing that to a small interpreter you can look at this +<a class="reference external" href="https://www.youtube.com/watch?v=fZj3uljJl_k">video</a>.</p> +</section> +<section id="pypy-testing-history"> +<h3>PyPy Testing History</h3> +<p>A few historical notes on the PyPy project and its relationship to testing: The +PyPy project <a class="reference external" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html">was started in 2004</a>. At the time when the project was started, +Extreme Programming and Agile Software Development were up and coming. On the +methodology side, PyPy was heavily influenced by these, and started using +Test-Driven Development and pair programming right from the start.</p> +<p>Also technologically, PyPy has been influential on testing in the Python world. +Originally, PyPy had used the <code class="docutils literal">unittest</code> testing framework, but pretty soon +the developers got frustrated with it. <a class="reference external" href="https://holgerkrekel.net/">Holger Krekel</a>, one of the original +developers who started PyPy, started the <a class="reference external" href="https://pytest.org/">pytest</a> testing framework soon +afterwards.</p> +</section> +<section id="interpreter-level-tests"> +<h3>Interpreter-Level Tests</h3> +<p>So, how are tests for PyPy written, concretely? The tests for the interpreter +are split into two different kinds, which we call "interpreter level tests" and +"application level tests". The former are tests that can be used to test the +objects and functions that are used in the implementation of the Python +interpreter. Since the interpreter is written in Python 2, those tests are also +written in Python 2, using pytest. They tend to be more on the unit test side of +things. 
They are in files with the pattern <code class="docutils literal"><span class="pre">test_*.py</span></code>.</p> +<p>Here is an example that tests the implementation of integers (very slightly +simplified):</p> +<div class="code"><pre class="code python"><a id="rest_code_223464e271d942c6b06c319feee613b9-1" name="rest_code_223464e271d942c6b06c319feee613b9-1" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_223464e271d942c6b06c319feee613b9-1"></a><span class="k">class</span> <span class="nc">TestW_IntObject</span><span class="p">:</span> +<a id="rest_code_223464e271d942c6b06c319feee613b9-2" name="rest_code_223464e271d942c6b06c319feee613b9-2" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_223464e271d942c6b06c319feee613b9-2"></a> <span class="o">...</span> +<a id="rest_code_223464e271d942c6b06c319feee613b9-3" name="rest_code_223464e271d942c6b06c319feee613b9-3" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_223464e271d942c6b06c319feee613b9-3"></a> +<a id="rest_code_223464e271d942c6b06c319feee613b9-4" name="rest_code_223464e271d942c6b06c319feee613b9-4" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_223464e271d942c6b06c319feee613b9-4"></a> <span class="k">def</span> <span class="nf">test_hash</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_223464e271d942c6b06c319feee613b9-5" name="rest_code_223464e271d942c6b06c319feee613b9-5" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_223464e271d942c6b06c319feee613b9-5"></a> <span class="n">w_x</span> <span class="o">=</span> <span class="n">W_IntObject</span><span class="p">(</span><span class="mi">42</span><span class="p">)</span> +<a id="rest_code_223464e271d942c6b06c319feee613b9-6" name="rest_code_223464e271d942c6b06c319feee613b9-6" 
href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_223464e271d942c6b06c319feee613b9-6"></a> <span class="n">w_result</span> <span class="o">=</span> <span class="n">w_x</span><span class="o">.</span><span class="n">descr_hash</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">space</span><span class="p">)</span> +<a id="rest_code_223464e271d942c6b06c319feee613b9-7" name="rest_code_223464e271d942c6b06c319feee613b9-7" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_223464e271d942c6b06c319feee613b9-7"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">w_result</span><span class="p">,</span> <span class="n">W_IntObject</span><span class="p">)</span> +<a id="rest_code_223464e271d942c6b06c319feee613b9-8" name="rest_code_223464e271d942c6b06c319feee613b9-8" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_223464e271d942c6b06c319feee613b9-8"></a> <span class="k">assert</span> <span class="n">w_result</span><span class="o">.</span><span class="n">intval</span> <span class="o">==</span> <span class="mi">42</span> +</pre></div> +<p>This test checks that if you take an object that represents integers in the +Python language (using the class <code class="docutils literal">W_IntObject</code>, a "wrapped integer object") +with the value 42, computing the hash of that object returns another instance of +the same class, also with the value 42.</p> +<p>These tests can be run on top of any Python 2 implementation, either CPython or +PyPy. We can then test and debug the internals of the PyPy interpreter using +familiar tools like indeed pytest and the Python debuggers. 
They can be run, +because all the involved code like the tests and the class <code class="docutils literal">W_IntObject</code> are +just completely regular Python 2 classes that behave in the regular way when +run on top of a Python interpreter.</p> +<p>In CPython, these tests don't really have an equivalent. They would correspond +to tests that are written in C and that can test the logic of all the C +functions of CPython that execute certain functionality, accessing the internals +of C structs in the process. <a class="reference internal" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#target-1">¹</a></p> +</section> +<section id="application-level-tests"> +<h3>Application-Level Tests</h3> +<p>There is also a second class of tests for the interpreter. Those are tests that +don't run on the level of the implementation. Instead, they are executed <em>by</em> +the PyPy Python interpreter, thus running on the level of the applications run +by PyPy. Since the interpreter is running Python 3, the tests are also written +in Python 3. They are stored in files with the pattern <code class="docutils literal"><span class="pre">apptest_*.py</span></code> and +look like "regular" Python 3 tests. 
<a class="reference internal" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#target-2">²</a></p> +<p>Here's an example of how you could write a test equivalent to the one above:</p> +<div class="code"><pre class="code python"><a id="rest_code_3641cdf6102042d3b390de7c88f30047-1" name="rest_code_3641cdf6102042d3b390de7c88f30047-1" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3641cdf6102042d3b390de7c88f30047-1"></a><span class="k">def</span> <span class="nf">test_hash</span><span class="p">():</span> +<a id="rest_code_3641cdf6102042d3b390de7c88f30047-2" name="rest_code_3641cdf6102042d3b390de7c88f30047-2" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3641cdf6102042d3b390de7c88f30047-2"></a> <span class="k">assert</span> <span class="nb">hash</span><span class="p">(</span><span class="mi">42</span><span class="p">)</span> <span class="o">==</span> <span class="mi">42</span> +</pre></div> +<p>This style of test looks more "natural" and is the preferred one in cases where +the test does not need to access the internals of the logic or the objects of +the interpreter.</p> +<p>Application level tests can be run in two different ways. On the one hand, we +can simply run them on CPython 3. This is very useful! Since we want PyPy to +behave like CPython, running the tests that we write on CPython is useful to +make sure that the tests themselves aren't wrong.</p> +<p>On the other hand, the main way to run these tests is on top of PyPy, itself +running on top of a Python 2 implementation. This makes it possible to run the +test without first bootstrapping PyPy to C. Since bootstrapping to C is a +relatively slow operation (can take up to an hour) it is crucially important to +be able to run tests without bootstrapping first. It also again makes it +possible to debug crashes in the interpreter using the regular Python 2 +debugger. 
Of course running tests in this way is unfortunately itself not super +fast, given that they run on a stack of two different interpreters.</p> +<p>Application-level tests correspond quite closely to CPython's test suite (which +is using the unittest framework). Of course in CPython it is not possible to run +the test suite without building the CPython binary using a C compiler. <a class="reference internal" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#target-3">³</a></p> +<p>So when do we write application-level tests, and when interpreter-level tests? +Interpreter-level tests are necessary to test internal data structures that +touch data and logic that is not directly exposed to the Python language. If +that is not necessary, we try to write application-level tests. App-level tests +are however by their nature always more on the integration test side of things. +To be able to run the <code class="docutils literal">test_hash</code> function above, many parts of PyPy need to +work correctly, the parser, the bytecode compiler, the bytecode interpreter, the +<code class="docutils literal">hash</code> builtin, calling the <code class="docutils literal">__hash__</code> special method, etc, etc.</p> +<p>This observation is also true for CPython! One could argue that CPython has no +unit tests at all, because in order to be able to even run the tests, most of +Python needs to be in working order already, so all the tests are really +implicitly integration tests.</p> +</section> +<section id="the-cpython-test-suite"> +<h3>The CPython Test Suite</h3> +<p>We also use the CPython Test suite as a final check to see whether our +interpreter correctly implements all the features of the Python language. In +that sense it acts as some kind of compliance test suite that checks whether we +implement the language correctly. The test suite is not perfect for this. 
+Since it is written for CPython's purposes during its development, a +lot of the tests check really specific CPython implementation details. Examples +of these are tests that check that <code class="docutils literal">__del__</code> is called immediately after +objects go out of scope (which only happens if you use reference counting as a +garbage collection strategy, PyPy uses a <a class="reference external" href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">different approach to garbage +collection</a>). Other examples are checking +for exception error messages very explicitly. However, the CPython test suite +has gotten a lot better in these regards over time, by adding +<code class="docutils literal">support.gc_collect()</code> calls to fix the former problem, and by marking some +very specific tests with the <code class="docutils literal">@impl_detail</code> decorator. Thanks to all the +CPython developers who have worked on this!</p> +<p>In the process of re-implementing CPython's functionality and running CPython's +test suite, PyPy can often also be a good way to find bugs in CPython. While we +think about the corner cases of some Python feature we occasionally find +situations where CPython didn't get everything completely correct either, which +we then report back.</p> +</section> +<section id="testing-for-performance-regressions"> +<h3>Testing for Performance Regressions</h3> +<p>All the tests we described so far are checking <em>behaviour</em>. But one of PyPy's +important goals is to be a <em>fast</em> implementation not "just" a correct one. Some +aspects of performance can be tested by regular unit tests, either application- +or interpreter-level. In order to check whether some performance shortcut is +taken in the interpreter, we sometimes can write tests that monkeypatch the slow +default implementation to always error. 
Then, if the fast path is taken +properly, that slow default implementation is never reached.</p> +<p>But we also have additional tests that test the correct interaction with the JIT +explicitly. For that, we have a special style of test that checks that the JIT +will produce the correct machine code for a small snippet of Python code. To +make this kind of test somewhat more robust, we don't check the machine code +directly, but instead the architecture independent <a class="reference external" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html">intermediate +representation</a> that the JIT uses to produce machine code from.</p> +<p>As an example, here is a small test that loading the attribute of a constant +global instance can be completely constant folded away:</p> +<div class="code"><pre class="code python"><a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-1" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-1" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-1"></a><span class="k">def</span> <span class="nf">test_load_attr</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-2" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-2" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-2"></a> <span class="n">src</span> <span class="o">=</span> <span class="s1">'''</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-3" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-3" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-3"></a><span class="s1"> class A(object):</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-4" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-4" 
href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-4"></a><span class="s1"> pass</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-5" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-5" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-5"></a><span class="s1"> a = A()</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-6" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-6" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-6"></a><span class="s1"> a.x = 1</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-7" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-7" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-7"></a><span class="s1"> def main(n):</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-8" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-8" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-8"></a><span class="s1"> i = 0</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-9" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-9" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-9"></a><span class="s1"> while i &lt; n:</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-10" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-10" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-10"></a><span class="s1"> i = i + a.x</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-11" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-11" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-11"></a><span class="s1"> return i</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-12" 
name="rest_code_3aa93b3bfac04019acf125a5791c07ec-12" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-12"></a><span class="s1"> '''</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-13" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-13" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-13"></a> <span class="n">log</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">src</span><span class="p">,</span> <span class="p">[</span><span class="mi">1000</span><span class="p">])</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-14" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-14" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-14"></a> <span class="k">assert</span> <span class="n">log</span><span class="o">.</span><span class="n">result</span> <span class="o">==</span> <span class="mi">1000</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-15" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-15" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-15"></a> <span class="n">loop</span><span class="p">,</span> <span class="o">=</span> <span class="n">log</span><span class="o">.</span><span class="n">loops_by_filename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">filepath</span><span class="p">)</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-16" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-16" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-16"></a> <span class="k">assert</span> <span class="n">loop</span><span class="o">.</span><span class="n">match</span><span class="p">(</span><span 
class="s2">"""</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-17" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-17" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-17"></a><span class="s2"> i9 = int_lt(i5, i6)</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-18" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-18" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-18"></a><span class="s2"> guard_true(i9, descr=...)</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-19" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-19" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-19"></a><span class="s2"> guard_not_invalidated(descr=...)</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-20" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-20" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-20"></a><span class="s2"> i10 = int_add(i5, 1)</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-21" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-21" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-21"></a><span class="s2"> --TICK--</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-22" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-22" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-22"></a><span class="s2"> jump(..., descr=...)</span> +<a id="rest_code_3aa93b3bfac04019acf125a5791c07ec-23" name="rest_code_3aa93b3bfac04019acf125a5791c07ec-23" href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html#rest_code_3aa93b3bfac04019acf125a5791c07ec-23"></a><span class="s2"> """</span><span class="p">)</span> +</pre></div> +<p>The string passed to the <code class="docutils 
literal">loop.match</code> function is a string representation of +the intermediate representation code that is generated for the <code class="docutils literal">while</code> loop in +the <code class="docutils literal">main</code> function given in the source. The important part of that +intermediate representation is that the <code class="docutils literal">i = i + a.x</code> addition is optimized +into an <code class="docutils literal">int_add(x, 1)</code> operation. The second argument for the addition is the +constant <code class="docutils literal">1</code>, because the JIT noted that the global <code class="docutils literal">a</code> is a constant, and +the attribute <code class="docutils literal">x</code> of that instance is always <code class="docutils literal">1</code>. The test thus checks that +this optimization still works.</p> +<p>Those tests are again more on the unit test side of things (and can thus +unfortunately be a bit brittle sometimes and break). The integration test +equivalent for performance is the <a class="reference external" href="https://speed.pypy.org/">PyPy Speed Center</a> which tracks the +performance of micro- and macro-benchmarks over time and lets us see when big +performance regressions are happening. The speed center is not really an +automatic test and does not produce pass/fail outcomes. Instead, it requires +human judgement and intervention in order to interpret the performance changes. 
+Having a real pass/fail mechanism is something that would be <a class="reference external" href="https://twitter.com/glyph/status/1495122754286198790">great to have</a> +but is probably <a class="reference external" href="https://arxiv.org/abs/1602.00602">quite tricky in practice</a>.</p> +</section> +<section id="conclusion"> +<h3>Conclusion</h3> +<p>This concludes my overview of some of the different styles of tests that we use +to develop the PyPy Python interpreter.</p> +<p>There is a whole other set of tests for the development of the RPython language, +the garbage collectors it provides as well as the code that does the automatic +JIT insertion, maybe I'll cover these in a future post.</p> +<section id="footnotes"> +<h4>Footnotes</h4> +<p id="target-1">¹ CPython has the <cite>_testcapimodule.c</cite> and related modules, that are used to +unit-test the C-API. However, these are still driven from Python tests using +the <code class="docutils literal">unittest</code> framework and wouldn't run without the Python interpreter +already working.</p> +<p id="target-2">² There is also a deprecated different way to write these tests, by putting +them in the <code class="docutils literal"><span class="pre">test_*.py</span></code> files that interpreter level tests are using and +then having a test class with the pattern <code class="docutils literal">class AppTest*</code>. 
We haven't +converted all of them to the new style yet, even though the old style is +quite weird: since the <code class="docutils literal"><span class="pre">test_*.py</span></code> files are themselves parsed by +Python 2, the test methods in <code class="docutils literal">AppTest*</code> classes need to be written in the +subset of Python 3 syntax that is also valid Python 2 syntax, leading to a lot +of confusion.</p> +<p id="target-3">³ Nit-picky side-note: <a class="reference external" href="https://root.cern.ch/root/html534/guides/users-guide/CINT.html">C interpreters</a> <a class="reference external" href="https://www.youtube.com/watch?v=yyDD_KRdQQU">are a thing</a>! But not that +widely used in practice, or only in very specific situations.</p> +</section> +</section> +</section>https://www.pypy.org/posts/2022/04/how-is-pypy-tested.htmlSat, 02 Apr 2022 15:00:00 GMTError Message Style Guides of Various Languageshttps://www.pypy.org/posts/2021/12/error-message-style-guides.htmlCarl Friedrich Bolz-Tereick<section id="error-message-style-guides-of-various-languages"> +<h2>Error Message Style Guides of Various Languages</h2> +<p>PyPy has been trying to produce good <a class="reference external" href="https://www.pypy.org/posts/2018/04/improving-syntaxerror-in-pypy-5733639208090522433.html">SyntaxErrors</a> and <a class="reference external" href="https://twitter.com/cfbolz/status/783313503230844929/photo/1">other</a> <a class="reference external" href="https://twitter.com/pypyproject/status/999930324481081344">errors</a> for +a long time. CPython has also made an enormous push to <a class="reference external" href="https://docs.python.org/3/whatsnew/3.10.html#better-error-messages">improve its +SyntaxErrors in the last few releases</a>. These improvements are great, but the process +feels somewhat arbitrary sometimes. 
To see what other languages are doing, I +<a class="reference external" href="https://twitter.com/cfbolz/status/1466033151315173384">asked people on Twitter</a> whether they know of error message style guides for +other programming languages.</p> +<p>Wonderfully, people answered me with lots of helpful links (<a class="reference internal" href="https://www.pypy.org/posts/2021/12/error-message-style-guides.html#full-list">full list</a> at the +end of the post), thank you everybody! All those sources are very interesting +and contain many great points, I recommend reading them directly! In this +post, I'll try to summarize some common themes or topics that I thought were +particularly interesting.</p> +<section id="language-use"> +<h3>Language Use</h3> +<p>Almost all guides stress the need for plain and simple English, as well as +conciseness and clarity [Flix, Racket, Rust, Flow]. Flow suggests to put coding +effort into making the grammar correct, for example in the case of plurals or +to distinguish between "a" and "an".</p> +<p>The suggested tone should be friendly and neutral, the messages should not +blame the Programmer [Flow]. Rust and Flix suggest to not use the term +'illegal' and use something like 'invalid' instead.</p> +<p>Flow suggests to avoid "compiler speak". For example terms like 'token' and +'identifier' should be avoided and terms that are more familiar to programmers +be used (eg "name" is better). The Racket guide goes further and has a list of +allowed technical terms and some prohibited terms.</p> +</section> +<section id="structure"> +<h3>Structure</h3> +<p>Several guides (such as Flix and Flow) point out a 80/20 rule: 80% of the times an error message is +read, the developer knows that message well and knows exactly what to do. For +this use case it's important that the message is short. 
On the other hand, 20% +of the times this same message will have to be understood by a developer who +has never seen it before and is confused, and so the message needs to contain +enough information +to allow them to find out what is going on. So the error message needs to strike +a balance between brevity and clarity.</p> +<p>The Racket guide proposes to use the following general structure for errors: +'State the constraint that was violated ("expected a"), followed by what was +found instead.'</p> +<p>The Rust guide says to avoid "Did you mean?" and questions in general, and +wants the compiler to instead be explicit about why something was suggested. The +example the Rust guide gives is: 'Compare "did you mean: Foo" vs. "there is a +struct with a similar name: Foo".' Racket goes further and forbids +suggestions altogether because "Students will follow well‐meaning‐but‐wrong +advice uncritically, if only because they have no reason to doubt the +authoritative voice of the tool."</p> +</section> +<section id="formatting-and-source-positions"> +<h3>Formatting and Source Positions</h3> +<p>The Rust guide suggests to put all identifiers into backticks (like in +Markdown), Flow formats the error messages using full Markdown.</p> +<p>The Clang, Flow and Rust guides point out the importance of using precise +source code spans to point to errors, which is especially important if the +compiler information is used in the context of an IDE to show a red squiggly +underline or some other highlighting. The spans should be as small as possible to point out the source of +the error [Flow].</p> +</section> +<section id="conclusion"> +<h3>Conclusion</h3> +<p>I am quite impressed how advanced and well-thought out the approaches are. 
I wonder whether it would make sense for +Python to adopt a (probably minimal, to get started) subset of these ideas as guidelines for its own errors.</p> +</section> +<section id="sources"> +<span id="full-list"></span><h3>Sources</h3> +<ul class="simple"> +<li><p>Rust: <a class="reference external" href="https://rustc-dev-guide.rust-lang.org/diagnostics.html">https://rustc-dev-guide.rust-lang.org/diagnostics.html</a></p></li> +<li><p>Clang: <a class="reference external" href="https://clang.llvm.org/diagnostics.html">https://clang.llvm.org/diagnostics.html</a></p></li> +<li><p>Flix: <a class="reference external" href="https://flix.dev/principles/">https://flix.dev/principles/</a></p></li> +<li><p>Racket: <a class="reference external" href="https://cs.brown.edu/~kfisler/Misc/error-msg-guidelines-racket-studlangs.pdf">https://cs.brown.edu/~kfisler/Misc/error-msg-guidelines-racket-studlangs.pdf</a></p></li> +<li><p>More about the research that led to the Racket guidelines (including the referenced papers): <a class="reference external" href="https://twitter.com/ShriramKMurthi/status/1451688982761381892">https://twitter.com/ShriramKMurthi/status/1451688982761381892</a></p></li> +<li><p>Flow: <a class="reference external" href="https://calebmer.com/2019/07/01/writing-good-compiler-error-messages.html">https://calebmer.com/2019/07/01/writing-good-compiler-error-messages.html</a></p></li> +<li><p>Elm: <a class="reference external" href="https://elm-lang.org/news/compiler-errors-for-humans">https://elm-lang.org/news/compiler-errors-for-humans</a></p></li> +<li><p>Elm's error message catalog: <a class="reference external" href="https://github.com/elm/error-message-catalog">https://github.com/elm/error-message-catalog</a></p></li> +<li><p>Reason: <a class="reference external" href="https://reasonml.github.io/blog/2017/08/25/way-nicer-error-messages.html">https://reasonml.github.io/blog/2017/08/25/way-nicer-error-messages.html</a></p></li> +</ul> +</section> 
+</section>https://www.pypy.org/posts/2021/12/error-message-style-guides.htmlSun, 05 Dec 2021 14:00:00 GMT \ No newline at end of file diff --git a/authors/cf-bolz-tereick.html b/authors/cf-bolz-tereick.html new file mode 100644 index 000000000..b1f4d8491 --- /dev/null +++ b/authors/cf-bolz-tereick.html @@ -0,0 +1,119 @@ + + + + + +Posts by CF Bolz-Tereick | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/cf-bolz-tereick.xml b/authors/cf-bolz-tereick.xml new file mode 100644 index 000000000..074c23c29 --- /dev/null +++ b/authors/cf-bolz-tereick.xml @@ -0,0 +1,2344 @@ + +PyPy (Posts by CF Bolz-Tereick)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssA Knownbits Abstract Domain for the Toy Optimizer, Correctlyhttps://www.pypy.org/posts/2024/08/toy-knownbits.htmlCF Bolz-Tereick<p>After <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/">Max' introduction to abstract interpretation for the toy optimizer</a> in the +last post, I want to present a more complicated abstract domain in this post. +This abstract domain reasons about the individual bits of a variable in a trace. +Every bit can be either "known zero", "known one" or "unknown". The abstract +domain is useful for optimizing integer operations, particularly the bitwise operations. +The abstract domain follows quite closely the <a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">tristate abstract domain of the +eBPF verifier in the Linux +Kernel</a>, as +described by the paper +<a href="https://arxiv.org/abs/2105.05398">Sound, Precise, and Fast Abstract Interpretation with Tristate +Numbers</a> by Harishankar Vishwanathan, Matan +Shachnai, Srinivas Narayana, and Santosh Nagarakatte.</p> +<p>The presentation in this post will still be in the context of the +<a href="https://www.pypy.org/categories/toy-optimizer">toy optimizer</a>. We'll spend a significant part of +the post convincing ourselves that the abstract domain transfer functions that +we're writing are really correct, using both property-based testing and +automated proofs (again using Z3).</p> +<p>PyPy has implemented and merged a more complicated version of the same abstract +domain for the "real" PyPy JIT. 
A more thorough explanation of that real world +implementation will follow.</p> +<p>I'd like to thank Max Bernstein and Armin Rigo for lots of great feedback on +drafts of this post. The PyPy implementation was mainly done by Nico +Rittinghaus and me.</p> +<p><strong>Contents:</strong></p> +<div class="toc"> +<ul> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#motivation">Motivation</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#the-knownbits-abstract-domain">The Knownbits Abstract Domain</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#transfer-functions">Transfer Functions</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#property-based-tests-with-hypothesis">Property-based Tests with Hypothesis</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#when-are-transfer-functions-correct-how-do-we-test-them">When are Transfer Functions Correct? How do we test them?</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#implementing-binary-transfer-functions">Implementing Binary Transfer Functions</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#addition-and-subtraction">Addition and Subtraction</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#proving-correctness-of-the-transfer-functions-with-z3">Proving correctness of the transfer functions with Z3</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#cases-where-this-style-of-z3-proof-doesnt-work">Cases where this style of Z3 proof doesn't work</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#making-statements-about-precision">Making Statements about Precision</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#using-the-abstract-domain-in-the-toy-optimizer-for-generalized-constant-folding">Using the Abstract Domain in the Toy Optimizer 
for Generalized Constant Folding</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#using-the-knownbits-domain-for-conditional-peephole-rewrites">Using the KnownBits Domain for Conditional Peephole Rewrites</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#conclusion">Conclusion</a></li> +</ul> +</div> +<h3 id="motivation">Motivation</h3> +<p>In many programs that do bit-manipulation of integers, some of the bits of the +integer variables of the program can be statically known. Here's a simple +example:</p> +<div class="code"><pre class="code literal-block"><span class="nv">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nv">a</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="mi">1</span> +... +<span class="k">if</span><span class="w"> </span><span class="nv">x</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mi">1</span>: +<span class="w"> </span>... +<span class="k">else</span>: +<span class="w"> </span>... +</pre></div> + +<p>After the assignment <code>x = a | 1</code>, we know that the lowest bit of <code>x</code> must be <code>1</code> +(the other bits are unknown) and an optimizer could remove the condition <code>x &amp; 1</code> by +constant-folding it to <code>1</code>.</p> +<p>Another (more complicated) example is:</p> +<div class="code"><pre class="code literal-block">assert i &amp; 0b111 == 0 # check that i is a multiple of 8 +j = i + 16 +assert j &amp; 0b111 == 0 +</pre></div> + +<p>This kind of code could e.g. happen in a <a href="https://docs.pydrofoil.org/en/latest/">CPU +emulator</a>, where <code>i</code> and <code>j</code> are +integers that represent emulated pointers, and the <code>assert</code>s are alignment +checks. The first assert implies that the lowest three bits of i must be <code>0</code>. 
+Adding 16 to such a number produces a result where the lowest three bits are +again all <code>0</code>, therefore the second assert is always true. So we would like a +compiler to remove the second assert.</p> +<p>Both of these optimizations are doable with the help of the knownbits +abstract domain that we'll discuss in the rest of the post.</p> +<h3 id="the-knownbits-abstract-domain">The Knownbits Abstract Domain</h3> +<p>An abstract value of the knownbits domain needs to be able to store, for every +bit of an integer variable in a program, whether it is known 0, known 1, or +unknown. To represent +three different states, we need 2 bits, which we will call <code>one</code> and <code>unknown</code>. +Here's the encoding:</p> +<table> +<thead> +<tr> +<th>one</th> +<th>unknown</th> +<th align="right">knownbit</th> +</tr> +</thead> +<tbody> +<tr> +<td>0</td> +<td>0</td> +<td align="right">0</td> +</tr> +<tr> +<td>1</td> +<td>0</td> +<td align="right">1</td> +</tr> +<tr> +<td>0</td> +<td>1</td> +<td align="right">?</td> +</tr> +<tr> +<td>1</td> +<td>1</td> +<td align="right">illegal</td> +</tr> +</tbody> +</table> +<p>The <code>unknown</code> bit is set if we don't know the value of the bit ("?"), the <code>one</code> +bit is set if the bit is known to be a <code>1</code>. Since two bits are enough to encode +four different states, but we only need three, the combination of a set <code>one</code> +bit and a set <code>unknown</code> is not allowed.</p> +<p>We don't just want to encode a single bit, however. Instead, we want to do this +for all the bits of an integer variable. 
Therefore the instances of the abstract +domain get two integer fields <code>ones</code> and <code>unknowns</code>, where each pair of +corresponding bits encodes the knowledge about the corresponding bit of the +integer variable in the program.</p> +<p>We can start implementing a Python class that works like this:</p> +<div class="code"><pre class="code literal-block"><span class="kn">from</span> <span class="nn">dataclasses</span> <span class="kn">import</span> <span class="n">dataclass</span> + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="n">ones</span> <span class="p">:</span> <span class="nb">int</span> + <span class="n">unknowns</span> <span class="p">:</span> <span class="nb">int</span> + + <span class="k">def</span> <span class="nf">__post_init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> + <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_well_formed</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">is_well_formed</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="c1"># a bit cannot be both 1 and unknown</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="mi">0</span> + + <span class="nd">@staticmethod</span> + <span 
class="k">def</span> <span class="nf">from_constant</span><span class="p">(</span><span class="n">const</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Construct a KnownBits corresponding to a constant, where all bits</span> +<span class="sd"> are known."""</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">const</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">is_constant</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Check if the KnownBits instance represents a constant. """</span> + <span class="c1"># it's a constant if there are no unknowns</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="mi">0</span> +</pre></div> + +<p>We can also add some convenience properties. Sometimes it is easier to work +with an integer where all the <em>known</em> bits are set, or one where the positions +of all the known zeros have a set bit:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">knowns</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" return an integer where the known bits are set. 
"""</span> + <span class="c1"># the knowns are just the unknowns, inverted</span> + <span class="k">return</span> <span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">zeros</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" return an integer where the places that are known zeros have a bit</span> +<span class="sd"> set. """</span> + <span class="c1"># it's a 0 if it is known, but not 1</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span> +</pre></div> + +<p>Also, for debugging and for writing tests we want a way to print the known bits +in a human-readable form, and also to have a way to construct a <code>KnownBits</code> +instance from a string. 
It's not important to understand the details of +<code>__str__</code> or <code>from_str</code> for the rest of the post, so I'm putting them into a fold:</p> +<details> +<summary><code>KnownBits</code> from and to string conversions</summary> + + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="k">return</span> <span class="sa">f</span><span class="s2">"KnownBits.from_constant(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="si">}</span><span class="s2">)"</span> + <span class="k">return</span> <span class="sa">f</span><span class="s2">"KnownBits(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="si">}</span><span class="s2">)"</span> + + <span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> + <span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> + <span class="c1"># construct the string representation right to left</span> + <span class="k">while</span> <span 
class="mi">1</span><span class="p">:</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">ones</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">unknowns</span><span class="p">:</span> + <span class="k">break</span> <span class="c1"># we leave off the leading known 0s</span> + <span class="k">if</span> <span class="n">ones</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">unknowns</span><span class="p">:</span> + <span class="c1"># -1 has all bits set in two's complement, so the leading</span> + <span class="c1"># bits are all 1</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'1'</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"..."</span><span class="p">)</span> + <span class="k">break</span> + <span class="k">if</span> <span class="n">unknowns</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span><span class="p">:</span> + <span class="c1"># -1 has all bits set in two's complement, so the leading bits</span> + <span class="c1"># are all ?</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">ones</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"?"</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"..."</span><span class="p">)</span> + <span class="k">break</span> + <span class="k">if</span> <span class="n">unknowns</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span 
class="p">(</span><span class="s1">'?'</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">ones</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'1'</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'0'</span><span class="p">)</span> + <span class="n">ones</span> <span class="o">&gt;&gt;=</span> <span class="mi">1</span> + <span class="n">unknowns</span> <span class="o">&gt;&gt;=</span> <span class="mi">1</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">res</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'0'</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">reverse</span><span class="p">()</span> + <span class="k">return</span> <span class="s2">""</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> + + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">from_str</span><span class="p">(</span><span class="n">s</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Construct a KnownBits instance that from a string. String can start</span> +<span class="sd"> with ...1 to mean that all higher bits are 1, or ...? to mean that all</span> +<span class="sd"> higher bits are unknown. Otherwise it is assumed that the higher bits</span> +<span class="sd"> are all 0. 
"""</span> + <span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">s</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"...?"</span><span class="p">):</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">4</span> + <span class="k">elif</span> <span class="n">s</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"...1"</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">4</span> + <span class="k">for</span> <span class="n">index</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">startindex</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">s</span><span class="p">)):</span> + <span class="n">ones</span> <span class="o">&lt;&lt;=</span> <span class="mi">1</span> + <span class="n">unknowns</span> <span class="o">&lt;&lt;=</span> <span class="mi">1</span> + <span class="n">c</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">index</span><span class="p">]</span> + <span class="k">if</span> <span class="n">c</span> <span class="o">==</span> <span class="s1">'1'</span><span class="p">:</span> + <span class="n">ones</span> <span class="o">|=</span> <span class="mi">1</span> + <span class="k">elif</span> <span class="n">c</span> <span class="o">==</span> <span 
class="s1">'?'</span><span class="p">:</span> + <span class="n">unknowns</span> <span class="o">|=</span> <span class="mi">1</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">all_unknown</span><span class="p">():</span> +<span class="w"> </span><span class="sd">""" convenience constructor for the "all bits unknown" abstract value</span> +<span class="sd"> """</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s2">"...?"</span><span class="p">)</span> +</pre></div> + + + +</details> + +<p>And here's a <a href="https://pytest.org">pytest</a>-style unit test for <code>str</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_str</span><span class="p">():</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'0'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">5</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'101'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mb">0b10</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'1?1'</span> + <span 
class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="o">~</span><span class="mb">0b1111</span><span class="p">,</span> <span class="mb">0b10</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'...100?0'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">~</span><span class="mb">0b1</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'...?1'</span> +</pre></div> + +<p>An instance of <code>KnownBits</code> represents a set of integers, namely those that match +the known bits stored in the instance. We can write a method <code>contains</code> that +takes a concrete <code>int</code> value and returns <code>True</code> if the value matches the +pattern of the known bits:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">contains</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Check whether the KnownBits instance contains the concrete integer</span> +<span class="sd"> `value`. """</span> + <span class="c1"># check whether value matches the bit pattern. 
in the places where we</span> + <span class="c1"># know the bits, the value must agree with ones.</span> + <span class="k">return</span> <span class="n">value</span> <span class="o">&amp;</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> +</pre></div> + +<p>and a test:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_contains</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'1?1'</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b111</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b101</span><span class="p">)</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b110</span><span class="p">)</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b011</span><span class="p">)</span> + + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?1'</span><span class="p">)</span> <span class="c1"># all odd numbers</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span 
class="o">-</span><span class="mi">101</span><span class="p">,</span> <span class="mi">100</span><span class="p">):</span> + <span class="k">assert</span> <span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="o">==</span> <span class="p">(</span><span class="n">i</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">)</span> +</pre></div> + +<h3 id="transfer-functions">Transfer Functions</h3> +<p>Now that we have implemented the basics of the <code>KnownBits</code> class, we need to +start implementing the transfer functions. They are for computing what we know +about the <em>results</em> of an operation, given the knowledge we have about the bits +of the arguments.</p> +<p>We'll start with a simple unary operation, <code>invert(x)</code> (which is <code>~x</code> in Python +and C syntax), which flips all the bits of an integer. If we know some bits of +the argument, we can compute the corresponding bits of the result. 
The unknown +bits remain unknown.</p> +<p>Here's the code:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_invert</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="c1"># self.zeros has bits set where the known 0s are in self</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">zeros</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> +</pre></div> + +<p>And a unit-test:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_invert</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="o">==</span> <span class="s1">'...10?10?10?'</span> + + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="k">assert</span> <span class="nb">str</span><span 
class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="o">==</span> <span class="s1">'...?'</span> +</pre></div> + +<p>Before we continue with further transfer functions, we'll think about +correctness of the transfer functions and build up some test infrastructure. To +test transfer functions, it's quite important to move beyond simple example-style +unit tests. The state-space for more complicated binary transfer functions is +extremely large and it's too easy to do something wrong in a corner case. +Therefore we'll look at property-based tests for <code>KnownBits</code> next.</p> +<h3 id="property-based-tests-with-hypothesis">Property-based Tests with Hypothesis</h3> +<p>We want to do property-based tests of <code>KnownBits</code>, to try to +make it less likely that we'll get a corner-case in the implementation wrong. +We'll use <a href="https://hypothesis.readthedocs.io/en/latest/">Hypothesis</a> for that.</p> +<p>I can't give a decent introduction to Hypothesis here, but want to give a few +hints about the API. Hypothesis is a way to run unit tests with randomly +generated input. It provides <em>strategies</em> to describe the data that the test +function expects. Hypothesis provides primitive strategies (for things like +integers, strings, floats, etc) and ways to build composite strategies out of +the primitive ones.</p> +<p>To be able to write the tests, we need to generate random <code>KnownBits</code> instances, +and we also want an <code>int</code> instance that is a member of the <code>KnownBits</code> instance. +We generate tuples of <code>(KnownBits, int)</code> together, to ensure this property. +We'll ask Hypothesis to generate us a random concrete <code>int</code> as the concrete +value, and then we'll also generate a second random <code>int</code> to use as the +<code>unknowns</code> mask (i.e. which bits of the concrete int we don't know in the +<code>KnownBits</code> instance). 
Here's a function that takes two such ints and builds the +tuple:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">build_knownbits_and_contained_number</span><span class="p">(</span><span class="n">concrete_value</span> <span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">unknowns</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> + <span class="c1"># to construct a valid KnownBits instance, we need to mask off the unknown</span> + <span class="c1"># bits</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">concrete_value</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">),</span> <span class="n">concrete_value</span> +</pre></div> + +<p>We can turn this function into a hypothesis strategy to generate input data +using the <code>strategies.builds</code> function:</p> +<div class="code"><pre class="code literal-block"><span class="kn">from</span> <span class="nn">hypothesis</span> <span class="kn">import</span> <span class="n">strategies</span><span class="p">,</span> <span class="n">given</span><span class="p">,</span> <span class="n">settings</span> + +<span class="n">ints</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">integers</span><span class="p">()</span> + +<span class="n">random_knownbits_and_contained_number</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">builds</span><span class="p">(</span> + <span class="n">build_knownbits_and_contained_number</span><span class="p">,</span> + <span class="n">ints</span><span class="p">,</span> <span class="n">ints</span> +<span 
class="p">)</span> +</pre></div> + +<p>One important special case of <code>KnownBits</code> are the constants, which contain only +a single concrete value. We'll also generate some of those specifically, and +then combine the <code>random_knownbits_and_contained_number</code> strategy with it:</p> +<div class="code"><pre class="code literal-block"><span class="n">constant_knownbits</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">builds</span><span class="p">(</span> + <span class="k">lambda</span> <span class="n">value</span><span class="p">:</span> <span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="n">value</span><span class="p">),</span> <span class="n">value</span><span class="p">),</span> + <span class="n">ints</span> +<span class="p">)</span> + +<span class="n">knownbits_and_contained_number</span> <span class="o">=</span> <span class="n">constant_knownbits</span> <span class="o">|</span> <span class="n">random_knownbits_and_contained_number</span> +</pre></div> + +<p>Now we can write the first property-based test, for the <code>KnownBits.contains</code> +method:</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_contains</span><span class="p">(</span><span class="n">t</span><span class="p">):</span> + <span class="n">k</span><span class="p">,</span> <span class="n">n</span> <span class="o">=</span> <span class="n">t</span> + <span class="k">assert</span> <span class="n">k</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> +</pre></div> + +<p>The <code>@given</code> decorator is used to tell Hypothesis which strategy to use to +generate 
random data for the test function. Hypothesis will run the test with a +number of random examples (100 by default). If it finds an error, it will try to +minimize the example needed that demonstrates the problem, to try to make it +easier to understand what is going wrong. It also saves all failing cases into +an example database and tries them again on subsequent runs.</p> +<p>This test is as much a check for whether we got the strategies right as it is +for the logic in <code>KnownBits.contains</code>. Here's an example output of random +concrete and abstract values that we are getting here:</p> +<div class="code"><pre class="code literal-block"><span class="mf">110000011001101</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">???</span><span class="mf">1</span> +<span class="mf">...1011011</span><span class="w"> </span><span class="mf">...1011011</span> +<span class="mf">...1001101110101000010010011111011</span><span class="w"> </span><span class="mf">...1001101110101000010010011111011</span> +<span class="mf">...1001101110101000010010011111011</span><span class="w"> </span><span class="mf">...100110111010100001</span><span class="err">?</span><span class="mf">010</span><span class="err">?</span><span class="mf">1</span><span class="err">??</span><span class="mf">1</span><span class="err">??</span><span class="mf">11</span> +<span class="mf">1000001101111101001011010011111101000011000111011001011111101</span><span class="w"> </span><span class="mf">1000001101111101001011010011111101000011000111011001011111101</span> +<span class="mf">1000001101111101001011010011111101000011000111011001011111101</span><span class="w"> </span><span class="mf">1000001101111101001011010011111101000011000111</span><span class="err">????</span><span class="mf">01</span><span class="err">?</span><span class="mf">11</span><span class="err">?????</span><span class="mf">1</span> +<span 
class="mf">1111100000010</span><span class="w"> </span><span class="mf">1111100000010</span> +<span class="mf">1111100000010</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">11111</span><span class="err">?</span><span class="mf">00000</span><span class="err">??</span> +<span class="mf">110110</span><span class="w"> </span><span class="mf">110110</span> +<span class="mf">110110</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">00</span><span class="err">????</span><span class="mf">11</span><span class="err">??</span><span class="mf">10</span> +<span class="mf">110110</span><span class="w"> </span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0</span> +<span class="mf">...100010111011111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">100</span><span class="err">?</span><span class="mf">10111</span><span class="err">??</span><span class="mf">111</span><span class="err">?</span> +<span class="mf">...1000100000110001</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">00000</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span> +<span class="mf">110000001110</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0000000</span><span class="err">?</span><span class="mf">00</span><span class="err">???</span><span class="mf">0000</span><span class="err">?????</span><span 
class="mf">00</span><span class="err">???</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">01</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span><span class="mf">1</span><span class="err">??</span> +<span class="mf">110000001110</span><span class="w"> </span><span class="err">??</span><span class="mf">000000</span><span class="err">???</span><span class="mf">0</span> +<span class="mf">1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000</span><span class="w"> </span><span class="mf">1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000</span> +<span class="mf">...1011010010010100</span><span class="w"> </span><span class="mf">...1011010010010100</span> +<span class="mf">...1011111110110011</span><span class="w"> </span><span class="mf">...1011111110110011</span> +<span class="mf">101000011110110</span><span class="w"> </span><span class="mf">101000011</span><span class="err">?</span><span class="mf">10</span><span class="err">?</span><span class="mf">1</span><span class="err">?</span> +<span class="mf">100101</span><span class="w"> </span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span> +</pre></div> + +<p>That looks suitably random, but we might want to bias our random numbers a +little bit towards common error values like small constants, powers of two, etc. 
+Like this:</p> +<div class="code"><pre class="code literal-block"><span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> +<span class="c1"># some small integers</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">))</span> +<span class="c1"># powers of two</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">2</span><span class="p">))</span> +<span class="c1"># powers of two - 1</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">((</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="n">i</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">2</span><span class="p">))</span> +<span class="c1"># negative versions of what we have so far</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="o">-</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">ints_special</span><span 
class="p">)</span> +<span class="c1"># bit-flipped versions of what we have so far</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="o">~</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">ints_special</span><span class="p">)</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">ints_special</span><span class="p">)</span> +<span class="c1"># sort them (because hypothesis simplifies towards earlier elements in the list)</span> +<span class="n">ints_special</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">element</span><span class="p">:</span> <span class="p">(</span><span class="nb">abs</span><span class="p">(</span><span class="n">element</span><span class="p">),</span> <span class="n">element</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">))</span> + +<span class="n">ints</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">sampled_from</span><span class="p">(</span><span class="n">ints_special</span><span class="p">)</span> <span class="o">|</span> <span class="n">strategies</span><span class="o">.</span><span class="n">integers</span><span class="p">()</span> +</pre></div> + +<p>Now we get data like this:</p> +<div class="code"><pre class="code literal-block"><span class="mf">1110</span><span class="w"> </span><span class="mf">1110</span> +<span class="mf">...10000000000000000001</span><span class="w"> </span><span class="mf">...10000</span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0000</span><span 
class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">1</span> +<span class="mf">1</span><span class="w"> </span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0000</span><span class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">1</span> +<span class="mf">1</span><span class="w"> </span><span class="err">?</span> +<span class="mf">...10101100</span><span class="w"> </span><span class="mf">...10101100</span> +<span class="mf">110000000011001010111011111111111111011110010001001100110001011</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">101</span><span class="err">?</span> +<span class="mf">110000000011001010111011111111111111011110010001001100110001011</span><span class="w"> </span><span class="err">??</span><span class="mf">00000000</span><span class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0</span><span class="err">???</span><span class="mf">0</span><span class="err">??????????????</span><span class="mf">0</span><span class="err">????</span><span class="mf">00</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">00</span><span class="err">??</span><span class="mf">00</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span> +<span class="mf">...1011111111111111111111111111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">11</span><span class="err">?</span><span class="mf">11</span><span class="err">??</span> +<span class="mf">...1011111111111111111111111111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span 
class="err">??????????????????????????</span> +<span class="mf">0</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">??????????????????????????</span> +<span class="mf">101101</span><span class="w"> </span><span class="mf">101101</span> +<span class="mf">111111111111111111111111111111111111111111111</span><span class="w"> </span><span class="mf">111111111111111111111111111111111111111111111</span> +<span class="mf">10111</span><span class="w"> </span><span class="mf">10111</span> +<span class="mf">...101100</span><span class="w"> </span><span class="mf">...1</span><span class="err">?</span><span class="mf">111011</span><span class="err">?</span><span class="mf">0</span> +<span class="mf">101000</span><span class="w"> </span><span class="err">?</span><span class="mf">001010</span><span class="err">?</span><span class="mf">0</span> +<span class="mf">101000</span><span class="w"> </span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">000</span> +<span class="mf">110010</span><span class="w"> </span><span class="mf">110010</span> +<span class="mf">...100111</span><span class="w"> </span><span class="mf">...100111</span> +<span class="mf">1111011010010</span><span class="w"> </span><span class="mf">1111011010010</span> +<span class="mf">...1000000000000000000000000000000000000</span><span class="w"> </span><span class="mf">...1000000000000000000000000000000000000</span> +</pre></div> + +<p>We can also write a test that checks that the somewhat tricky logic in +<code>__str__</code> and <code>from_str</code> is correct, by making sure that the two functions +round-trip (ie converting a <code>KnownBits</code> to a string and then back to a +<code>KnownBits</code> instance produces the same abstract value).</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span 
class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_str_roundtrips</span><span class="p">(</span><span class="n">t1</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">s</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="n">k2</span><span class="o">.</span><span class="n">ones</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span> +</pre></div> + +<p>Now let's actually apply this infrastructure to test <code>abstract_invert</code>.</p> +<h3 id="when-are-transfer-functions-correct-how-do-we-test-them">When are Transfer Functions Correct? How do we test them?</h3> +<p>Abstract values, i.e. instances of <code>KnownBits</code> represent <em>sets</em> of concrete +values. We want the transfer functions to compute <em>overapproximations</em> of the +concrete values. So if we have an arbitrary abstract value <code>k</code>, with a concrete +number <code>n</code> that is a member of the abstract values (i.e. +<code>k.contains(n) == True</code>) then the result of the concrete operation <code>op(n)</code> +<strong>must</strong> be a member of the result of the abstract operation <code>k.abstract_op()</code> +(i.e. 
<code>k.abstract_op().contains(op(n)) == True</code>).</p> +<p>Checking the correctness/overapproximation property is a good match for +hypothesis. Here's what the test for <code>abstract_invert</code> looks like:</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_invert</span><span class="p">(</span><span class="n">t1</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">n2</span> <span class="o">=</span> <span class="o">~</span><span class="n">n1</span> <span class="c1"># compute the real result</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> <span class="c1"># compute the abstract result</span> + <span class="k">assert</span> <span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span> <span class="c1"># the abstract result must contain the real result</span> +</pre></div> + +<p>This is the <em>only</em> condition needed for <code>abstract_invert</code> to be correct. If +<code>abstract_invert</code> fulfils this property for every combination of abstract and +concrete value then <code>abstract_invert</code> is correct. Note however, that this test +does not actually check whether <code>abstract_invert</code> gives us precise results. A +correct (but imprecise) implementation of <code>abstract_invert</code> would simply return +a completely unknown result, regardless of what is known about the input +<code>KnownBits</code>.</p> +<p>The "proper" CS term for this notion of correctness is <em>soundness</em>. 
The +correctness condition on the transfer functions is called a <em>Galois +connection</em>. I won't go into any mathematical/technical details here, but +wanted to at least mention the terms. I found <a href="https://web.njit.edu/~mjk76/">Martin +Kellogg</a>'s +<a href="https://web.njit.edu/~mjk76/teaching/cs684-sp24/assets/lecture-12.pdf#34">slides</a> +to be quite an approachable introduction to the Galois connection and how to +show soundness.</p> +<h3 id="implementing-binary-transfer-functions">Implementing Binary Transfer Functions</h3> +<p>Now we have infrastructure in place for testing transfer functions with random +inputs. With that we can start thinking about the more complicated case, that of +binary operations. Let's start with the simpler ones, <code>and</code> and <code>or</code>. For <code>and</code>, +we can know a <code>0</code> bit in the result if either of the input bits is known <code>0</code>; +or we can know a <code>1</code> bit in the result if both input bits are known <code>1</code>. +Otherwise the resulting bit is unknown. Let's look at all the combinations:</p> +<div class="code"><pre class="code literal-block">and +input1: 000111??? +input2: 01?01?01? +result: 00001?0?? 
+</pre></div> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_and</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="c1"># known ones</span> + <span class="n">knowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">ones</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="o">~</span><span class="n">knowns</span><span class="p">)</span> +</pre></div> + +<p>Here's an example unit-test and a property-based test for <code>and</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_and</span><span class="p">():</span> + <span class="c1"># test all combinations of 0, 1, ? 
in one example</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'000111???'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="c1"># should be: 0...00001?0??</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"1?0??"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_and</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span 
class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>To implement <code>or</code> is pretty similar. The result is known <code>1</code> where either of the +inputs is <code>1</code>. The result is known <code>0</code> where both inputs are known <code>0</code>, and <code>?</code> +otherwise.</p> +<div class="code"><pre class="code literal-block">or +input1: 000111??? +input2: 01?01?01? +result: 01?111?1? +</pre></div> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_or</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">zeros</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">zeros</span> + <span class="n">knowns</span> <span class="o">=</span> <span class="n">ones</span> <span class="o">|</span> <span class="n">zeros</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="o">~</span><span class="n">knowns</span><span class="p">)</span> +</pre></div> + +<p>Here's an example unit-test and a property-based test for <code>or</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_or</span><span class="p">():</span> + <span class="n">k1</span> <span 
class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'000111???'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="c1"># should be: 0...01?111?1?</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"1?111?1?"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_or</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">|</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span 
class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>Implementing support for <code>abstract_xor</code> is relatively simple, and left as an +exercise :-).</p> +<h3 id="addition-and-subtraction">Addition and Subtraction</h3> +<p><code>invert</code>, <code>and</code>, and <code>or</code> are relatively simple transfer functions to write, +because they compose over the individual bits of the integers. The arithmetic +functions <code>add</code> and <code>sub</code> are significantly harder, because of carries and +borrows. Coming up with the formulas for them and gaining an intuitive +understanding is quite tricky and involves carefully going through a few +examples with pen and paper. When implementing this in PyPy, Nico and I didn't +come up with the implementation ourselves, but instead took them from the +<a href="https://arxiv.org/abs/2105.05398">Tristate Numbers</a> paper. Here's the code, +with example tests and hypothesis tests:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">sum_ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">+</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">sum_unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">+</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span> + <span class="n">all_carries</span> <span class="o">=</span> <span class="n">sum_ones</span> <span class="o">+</span> <span 
class="n">sum_unknowns</span> + <span class="n">ones_carries</span> <span class="o">=</span> <span class="n">all_carries</span> <span class="o">^</span> <span class="n">sum_ones</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">ones_carries</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">sum_ones</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">abstract_sub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">diff_ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">-</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">val_borrows</span> <span class="o">=</span> <span class="p">(</span><span class="n">diff_ones</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> <span class="o">^</span> <span class="p">(</span><span class="n">diff_ones</span> <span class="o">-</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">other</span><span 
class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">val_borrows</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">diff_ones</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + +<span class="k">def</span> <span class="nf">test_add</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0???111000'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"?????01?10"</span> + +<span class="k">def</span> <span class="nf">test_sub</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0???111000'</span><span class="p">)</span> + <span class="n">res</span> 
<span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"...?11?10"</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span> <span class="s1">'...1?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...10000???111000'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"111?????11?10"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_add</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span 
class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_sub</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">-</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>Now we are in a pretty good situation, and have implemented abstract versions +for a bunch of important arithmetic and binary functions. What's also surprising +is that the implementation of all of the transfer functions is quite efficient. 
+We didn't have to write loops over the individual bits at all; instead, we found +closed-form expressions using primitive operations on the underlying integers +<code>ones</code> and <code>unknowns</code>. This means that computing the results of abstract +operations is quite efficient, which is important when using the abstract domain +in the context of a JIT compiler.</p> +<h3 id="proving-correctness-of-the-transfer-functions-with-z3">Proving correctness of the transfer functions with Z3</h3> +<p>As one can probably tell from my recent posts, I've been thinking about +compiler correctness a lot. Getting the transfer functions absolutely +correct is really crucial, because a bug in them would lead to miscompilation of +Python code when the abstract domain is added to the JIT. While the randomized +tests are great, it's still entirely possible for them to miss bugs. The state +space for the arguments of a binary transfer function is <code>3**64 * 3**64</code>, and if +only a small part of that contains wrong behaviour it would be really unlikely +for us to find it with random tests by chance. Therefore, for a long time I was +reluctant to merge the PyPy branch that contained the new abstract domain.</p> +<p>To increase our confidence in the correctness of the transfer functions further, +we can use Z3 to <em>prove</em> their correctness, which gives us much stronger +guarantees (not 100%, obviously). 
In this subsection I will show how to do that.</p> +<p>Here's an attempt to do this manually in the Python repl:</p> +<div class="code"><pre class="code literal-block"><span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="kn">import</span><span class="w"> </span><span class="nn">z3</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">solver</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># like last blog post, proof by failing to find counterexamples</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="k">def</span><span class="w"> </span><span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">):</span><span class="w"> </span><span class="k">assert</span><span class="w"> </span><span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">unsat</span> +<span class="o">&gt;&gt;&gt;&gt;</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's set up a z3 bitvector variable for an arbitrary concrete value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'concrete_value'</span><span class="p">,</span><span class="w"> </span><span 
class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n1</span> +<span class="n">concrete_value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># due to operator overloading we can manipulate z3 formulas</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">~</span><span class="n">n1</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n2</span> +<span class="o">~</span><span class="n">concrete_value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># now z3 bitvector variables for the ones and zeros fields</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">ones</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'abstract_ones'</span><span class="p">,</span><span class="w"> </span><span class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">unknowns</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'abstract_unknowns'</span><span class="p">,</span><span class="w"> </span><span class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># we construct a KnownBits instance with the z3 variables</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k1</span><span class="w"> </span><span 
class="o">=</span><span class="w"> </span><span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span><span class="w"> </span><span class="n">unknowns</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># due to operator overloading we can call the methods on k1:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">ones</span> +<span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_ones</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span> +<span class="n">abstract_unknowns</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># here's the correctness condition that we want to prove:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span> +<span class="o">~</span><span class="n">concrete_value</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">==</span> +<span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_ones</span> 
+<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's try</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">prove</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">))</span> +<span class="n">Traceback</span><span class="w"> </span><span class="p">(</span><span class="n">most</span><span class="w"> </span><span class="n">recent</span><span class="w"> </span><span class="n">call</span><span class="w"> </span><span class="n">last</span><span class="p">):</span> +<span class="w"> </span><span class="n">File</span><span class="w"> </span><span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="ow">in</span><span class="w"> </span><span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span> +<span class="w"> </span><span class="n">File</span><span class="w"> </span><span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="ow">in</span><span class="w"> </span><span class="n">prove</span> +<span class="n">AssertionError</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># it doesn't work! 
let's look at the counterexample to see why:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> +<span class="p">[</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span> +<span class="w"> </span><span class="n">abstract_ones</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span> +<span class="w"> </span><span class="n">concrete_value</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="p">]</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># we can build a KnownBits instance with the values in the</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># counterexample:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="o">~</span><span class="mi">1</span><span class="w"> </span><span class="c1"># concrete result</span> +<span class="o">-</span><span class="mi">2</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k1</span> +<span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span><span 
class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">counter_example_k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span> +<span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's check the failing condition</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="o">~</span><span class="mi">1</span><span class="p">)</span> +<span class="kc">False</span> +</pre></div> + +<p>What is the problem here? We didn't tell Z3 that <code>n1</code> was supposed to be a +member of <code>k1</code>. We can add this as a precondition to the solver, and then the +proof works:</p> +<div class="code"><pre class="code literal-block">&gt;&gt;&gt;&gt; solver.add(k1.contains(n1)) +&gt;&gt;&gt;&gt; prove(k2.contains(n2)) # works! +</pre></div> + +<p>This is super cool! It's really a proof about the actual implementation, because +we call the implementation methods directly, and due to the operator overloading +that Z3 does, we can be sure that we are actually checking a formula that +corresponds to the Python code. This eliminates one source of errors in formal +methods.</p> +<p>Doing the proof manually on the Python REPL is kind of annoying though, and we +also would like to make sure that the proofs are re-done when we change the +code. What we would really like to do is to write the proofs as unit tests that +we can run while developing and in CI. 
Doing this is possible, and the unit +tests that really perform proofs look pleasingly similar to the +Hypothesis-based ones.</p> +<p>First we need to set up a bit of infrastructure:</p> +<div class="code"><pre class="code literal-block"><span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> + +<span class="k">def</span> <span class="nf">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">BitVecVal</span><span class="p">(</span><span class="n">val</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">z3_setup_variables</span><span class="p">():</span> + <span class="c1"># instantiate a solver</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + + <span class="c1"># a Z3 variable for the first concrete value</span> + <span class="n">n1</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n1"</span><span class="p">)</span> + <span class="c1"># a KnownBits instances that uses Z3 variables as its ones and unknowns,</span> + <span class="c1"># representing the first abstract value</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">BitVec</span><span class="p">(</span><span 
class="s2">"n1_ones"</span><span class="p">),</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n1_unkowns"</span><span class="p">))</span> + <span class="c1"># add the precondition to the solver that the concrete value n1 must be a</span> + <span class="c1"># member of the abstract value k1</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n1</span><span class="p">))</span> + + <span class="c1"># a Z3 variable for the second concrete value</span> + <span class="n">n2</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2"</span><span class="p">)</span> + <span class="c1"># a KnownBits instances for the second abstract value</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2_ones"</span><span class="p">),</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2_unkowns"</span><span class="p">))</span> + <span class="c1"># add the precondition linking n2 and k2 to the solver</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">))</span> + <span class="k">return</span> <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> + +<span class="k">def</span> <span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span 
class="n">solver</span><span class="p">):</span> + <span class="n">z3res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> + <span class="k">if</span> <span class="n">z3res</span> <span class="o">!=</span> <span class="n">z3</span><span class="o">.</span><span class="n">unsat</span><span class="p">:</span> + <span class="k">assert</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span> <span class="c1"># can't be timeout, we set no timeout</span> + <span class="c1"># make the model with the counterexample global, to make inspecting the</span> + <span class="c1"># bug easier when running pytest --pdb</span> + <span class="k">global</span> <span class="n">model</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"n1=</span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">n1</span><span class="p">)</span><span class="si">}</span><span class="s2">, n2=</span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="n">counter_example_k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span 
class="n">k1</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> + <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="n">counter_example_k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> + <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"k1=</span><span class="si">{</span><span class="n">counter_example_k1</span><span class="si">}</span><span class="s2">, k2=</span><span class="si">{</span><span class="n">counter_example_k2</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"but </span><span class="si">{</span><span class="n">cond</span><span class="si">=}</span><span class="s2"> evaluates to </span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span><span class="si">}</span><span 
class="s2">"</span><span class="p">)</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">())</span> +</pre></div> + +<p>And then we can write proof-unit-tests like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_z3_abstract_invert</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="n">n2</span> <span class="o">=</span> <span class="o">~</span><span class="n">n1</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_and</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span 
class="o">=</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_or</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">|</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_add</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span 
class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_sub</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">-</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>It's possible to write a bit more Python-metaprogramming-magic and unify the +Hypothesis and Z3 tests into the same test definition.<sup id="fnref:proof_bitwidths"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fn:proof_bitwidths">1</a></sup></p> +<h3 id="cases-where-this-style-of-z3-proof-doesnt-work">Cases where this style of Z3 proof doesn't work</h3> +<p>Unfortunately the approach described in the previous section only works for a +very small number of cases. 
It breaks down as soon as the <code>KnownBits</code> methods +that we're calling contain any <code>if</code> conditions (including hidden ones like +the short-circuiting <code>and</code> and <code>or</code> in Python). Let's look at an example and +implement <code>abstract_eq</code>. <code>eq</code> is supposed to be an operation that compares two +integers and returns <code>0</code> or <code>1</code> if they are different or equal, respectively. +Implementing this in knownbits looks like this (with example and hypothesis +tests):</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_eq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># the result is a 0, 1, or ?</span> + + <span class="c1"># if they are both the same constant, they must be equal</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()</span> <span class="ow">and</span> <span class="n">other</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span><span class="p">:</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="c1"># check whether we have known disagreeing bits, then we know the result</span> + <span class="c1"># is 0</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span 
class="n">_disagrees</span><span class="p">(</span><span class="n">other</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># an unknown boolean</span> + + <span class="k">def</span> <span class="nf">_disagrees</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># check whether the bits disagree in any place where both are known</span> + <span class="n">both_known</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">knowns</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">both_known</span> <span class="o">!=</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">both_known</span> + +<span class="k">def</span> <span class="nf">test_eq</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="k">assert</span> <span 
class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'?'</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'1'</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">20</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'0'</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_eq</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span 
class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">n1</span> <span class="o">==</span> <span class="n">n2</span><span class="p">))</span> +</pre></div> + +<p>Trying to do the proof in the same style as before breaks:</p> +<div class="code"><pre class="code literal-block"><span class="o">&gt;&gt;&gt;&gt;</span> <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> +<span class="n">Traceback</span> <span class="p">(</span><span class="n">most</span> <span class="n">recent</span> <span class="n">call</span> <span class="n">last</span><span class="p">):</span> + <span class="n">File</span> <span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">1</span><span class="p">,</span> <span class="ow">in</span> <span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span> + <span class="n">File</span> <span class="s2">"knownbits.py"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">246</span><span class="p">,</span> <span class="ow">in</span> <span class="n">abstract_eq</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_disagrees</span><span class="p">(</span><span class="n">other</span><span class="p">):</span> + <span class="n">File</span> <span 
class="s2">"venv/site-packages/z3/z3.py"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">381</span><span class="p">,</span> <span class="ow">in</span> <span class="fm">__bool__</span> + <span class="k">raise</span> <span class="n">Z3Exception</span><span class="p">(</span><span class="s2">"Symbolic expressions cannot be cast to concrete Boolean values."</span><span class="p">)</span> +<span class="n">z3</span><span class="o">.</span><span class="n">z3types</span><span class="o">.</span><span class="n">Z3Exception</span><span class="p">:</span> <span class="n">Symbolic</span> <span class="n">expressions</span> <span class="n">cannot</span> <span class="n">be</span> <span class="n">cast</span> <span class="n">to</span> <span class="n">concrete</span> <span class="n">Boolean</span> <span class="n">values</span><span class="o">.</span> +</pre></div> + +<p>We cannot call <code>abstract_eq</code> on a <code>KnownBits</code> with Z3 variables as fields, +because once we hit an <code>if</code> statement, the whole approach of relying on the +operator overloading breaks down. 
Z3 doesn't actually parse the Python code or +anything advanced like that, we rather build an expression only by running the +code and letting the Z3 formulas build up.</p> +<p>To still prove the correctness of <code>abstract_eq</code> we need to manually transform +the control flow logic of the function into a Z3 formula that uses the <code>z3.If</code> +expression, using a small helper function:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">z3_cond</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="n">trueval</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">falseval</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="n">BitVecVal</span><span class="p">(</span><span class="n">trueval</span><span class="p">),</span> <span class="n">BitVecVal</span><span class="p">(</span><span class="n">falseval</span><span class="p">))</span> + +<span class="k">def</span> <span class="nf">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">):</span> + <span class="c1"># follow the *logic* of abstract_eq, we can't call it due to the ifs in it</span> + <span class="n">case1cond</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k1</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span 
class="n">k2</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span> + <span class="n">case2cond</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">_disagrees</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + + <span class="c1"># ones is 1 in the first case, 0 otherwise</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">case1cond</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> + + <span class="c1"># in the first two cases, unknowns is 0, 1 otherwise</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Or</span><span class="p">(</span><span class="n">case1cond</span><span class="p">,</span> <span class="n">case2cond</span><span class="p">),</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_eq_logic</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">n1</span> <span class="o">==</span> <span class="n">n2</span><span class="p">)</span> <span 
class="c1"># concrete result</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>This proof works. It is a lot less satisfying than the previous ones though, +because we could have made an error in the manual transcription from Python code +to Z3 formulas (there are possibly more heavy-handed approaches where we do +this transformation more automatically using e.g. the <code>ast</code> module to analyze +the source code, but that's a much more complicated researchy project). To +lessen this problem somewhat we can factor out the parts of the logic that don't +have any conditions into small helper methods (like <code>_disagrees</code> in this +example) and use them in the manual conversion of the code to Z3 formulas.<sup id="fnref:tests_vs_proofs"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fn:tests_vs_proofs">2</a></sup></p> +<p>The final condition that Z3 checks, btw, is this one:</p> +<div class="code"><pre class="code literal-block">If(n1 == n2, 1, 0) &amp; +~If(Or(And(n1_unkowns == 0, + n2_unkowns == 0, + n1_ones == n2_ones), + n1_ones &amp; ~n1_unkowns &amp; ~n2_unkowns != + n2_ones &amp; ~n1_unkowns &amp; ~n2_unkowns), + 0, 1) == +If(And(n1_unkowns == 0, n2_unkowns == 0, n1_ones == n2_ones), + 1, 0) +</pre></div> + +<h3 id="making-statements-about-precision">Making Statements about Precision</h3> +<p>So far we have only used Z3 to prove statements about correctness, i.e. that +our abstract operations overapproximate what can happen with concrete values. 
+While proving this property is essential if we want to avoid miscompilation, +correctness alone is not a very strong constraint on the implementation of our +abstract transfer functions. We could simply return <code>KnownBits.unknowns()</code> for +every <code>abstract_*</code> method and the resulting overapproximation would be correct, +but useless in practice.</p> +<p>It's much harder to make statements about whether the transfer functions are +maximally precise. There are two aspects of precision I want to discuss in this +section, however.</p> +<p>The first aspect is that we would really like it if the transfer functions +compute the maximally precise results for singleton sets. If all abstract +arguments of an operation are constants, i.e. contain only a single concrete +element, then we know that the resulting set also has only a single element. We +can prove that all our transfer functions have this property:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_z3_prove_constant_folding</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span 
class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span 
class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>Proving with Z3 that the transfer functions are maximally precise for +non-constant arguments seems to be relatively hard. I tried a few completely +rigorous approaches and failed. 
The paper <a href="https://arxiv.org/pdf/2105.05398">Sound, Precise, and Fast Abstract +Interpretation with Tristate Numbers</a> +contains an optimality proof for the transfer functions of addition and +subtraction, so we can be certain that they are as precise as is +possible.</p> +<p>I still want to show an approach for trying to find concrete examples of +abstract values that are less precise than they could be, using a combination +of Hypothesis and Z3. The idea is to use Hypothesis to pick random abstract +values. Then we compute the abstract result using our transfer function. +Afterwards we can ask Z3 to find us an abstract result that is better than the +one our transfer function produced. If Z3 finds a better abstract result, we +have a concrete example of imprecision for our transfer function. Those tests +aren't strict proofs, because they rely on generating random abstract values, +but they can still be valuable (not for the transfer functions in this blog +post, which are all optimal).</p> +<p>Here is what the code looks like (this is a little bit of bonus content; I'll not +explain the details and can only hope that the comments are somewhat helpful):</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">random_knownbits_and_contained_number</span><span class="p">,</span> <span class="n">random_knownbits_and_contained_number</span><span class="p">)</span> +<span class="nd">@settings</span><span class="p">(</span><span class="n">deadline</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_check_precision</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span 
class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="c1"># apply transfer function</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">example_res</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + + <span class="c1"># try to find a better version of k3 with Z3</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">solver</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">,</span> <span class="mi">8000</span><span class="p">)</span> + + <span class="n">var1</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'v1'</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'v2'</span><span class="p">)</span> + + <span class="n">ones</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'ones'</span><span class="p">)</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'unknowns'</span><span class="p">)</span> + <span class="n">better_k3</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + <span class="nb">print</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span 
class="n">k3</span><span class="p">)</span> + + <span class="c1"># we're trying to find an example for a better k3, so we use check, without</span> + <span class="c1"># negation:</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span> + <span class="c1"># better_k3 should be a valid knownbits instance</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">is_well_formed</span><span class="p">(),</span> + <span class="c1"># it should be better than k3, ie there are known bits in better_k3</span> + <span class="c1"># that we don't have in k3</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">k3</span><span class="o">.</span><span class="n">knowns</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">,</span> + <span class="c1"># now encode the correctness condition for better_k3 with a ForAll:</span> + <span class="c1"># for all concrete values var1 and var2, it must hold that if</span> + <span class="c1"># var1 is in k1 and var2 is in k2 it follows that var1 + var2 is in</span> + <span class="c1"># better_k3</span> + <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">var1</span><span class="p">,</span> <span class="n">var2</span><span class="p">],</span> + <span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span> + <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var1</span><span class="p">),</span> 
<span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var2</span><span class="p">)),</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var1</span> <span class="o">+</span> <span class="n">var2</span><span class="p">)))))</span> + <span class="c1"># if this query is satisfiable, we have found a better result for the</span> + <span class="c1"># abstract_add</span> + <span class="k">if</span> <span class="n">res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">rk3</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"better"</span><span class="p">,</span> <span class="n">rk3</span><span class="p">)</span> + <span class="k">assert</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unknown</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">)</span> +</pre></div> + 
+<p>It does not actually fail for <code>abstract_add</code> (nor the other abstract +functions). To see the test failing we can add some imprecision to the +implementation of <code>abstract_add</code> to see Hypothesis and Z3 find examples of +values that are not optimally precise (for example by setting some bits +of <code>unknowns</code> in the implementation of <code>abstract_add</code> unconditionally).</p> +<h3 id="using-the-abstract-domain-in-the-toy-optimizer-for-generalized-constant-folding">Using the Abstract Domain in the Toy Optimizer for Generalized Constant Folding</h3> +<p>Now after all this work we can finally actually use the knownbits abstract +domain in the toy optimizer. The code for this follows <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/">Max' intro post about +abstract interpretation</a> +quite closely.</p> +<p>For completeness' sake, in the fold there are the basic infrastructure classes +that make up the IR again (they are identical or at least extremely close to +the previous toy posts).</p> +<details> +<summary>toy infrastructure</summary> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> + + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> + <span class="n">name</span> <span class="p">:</span> <span class="nb">str</span> + <span class="n">args</span> <span class="p">:</span> <span 
class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> + + <span class="n">forwarded</span> <span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> + + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Value</span><span class="p">:</span> + <span class="n">op</span> <span class="o">=</span> <span class="bp">self</span> + <span class="k">while</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">):</span> + <span class="nb">next</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">forwarded</span> + <span class="k">if</span> <span class="nb">next</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> + <span class="k">return</span> <span class="n">op</span> + <span class="n">op</span> <span class="o">=</span> <span class="nb">next</span> + <span class="k">return</span> <span class="n">op</span> + + <span class="k">def</span> <span class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">make_equal_to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> 
+ <span class="bp">self</span><span class="o">.</span><span class="n">find</span><span class="p">()</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="n">value</span> + + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> + <span class="n">value</span> <span class="p">:</span> <span class="nb">object</span> + + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span> + + +<span class="k">class</span> <span class="nc">Block</span><span class="p">(</span><span class="nb">list</span><span class="p">):</span> + <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">opname</span><span class="p">):</span> + <span class="k">def</span> <span class="nf">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">):</span> + <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Value</span><span class="p">):</span> + <span class="n">arg</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> + <span class="k">return</span> <span class="n">arg</span> + <span class="k">def</span> <span class="nf">make_op</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">):</span> + <span class="n">op</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span 
class="n">opname</span><span class="p">,</span> + <span class="p">[</span><span class="n">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span> + <span class="bp">self</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">op</span> + <span class="k">return</span> <span class="n">make_op</span> + + +<span class="k">def</span> <span class="nf">bb_to_str</span><span class="p">(</span><span class="n">l</span> <span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="n">varprefix</span> <span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"var"</span><span class="p">):</span> + <span class="k">def</span> <span class="nf">arg_to_str</span><span class="p">(</span><span class="n">arg</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="nb">str</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> <span class="n">varnames</span><span class="p">[</span><span class="n">arg</span><span class="p">]</span> + + <span class="n">varnames</span> <span class="o">=</span> <span class="p">{}</span> + <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> + <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span 
class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">l</span><span class="p">):</span> + <span class="c1"># give the operation a name used while</span> + <span class="c1"># printing:</span> + <span class="n">var</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">varprefix</span><span class="si">}{</span><span class="n">index</span><span class="si">}</span><span class="s2">"</span> + <span class="n">varnames</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">var</span> + <span class="n">arguments</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> + <span class="n">arg_to_str</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">))</span> + <span class="p">)</span> + <span class="n">strop</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">var</span><span class="si">}</span><span class="s2"> = </span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">arguments</span><span class="si">}</span><span class="s2">)"</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">strop</span><span class="p">)</span> + <span 
class="k">return</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +</pre></div> + + + +</details> + +<p>Now we can write some first tests, the first one simply checking constant +folding:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_constfold_two_ops</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span 
class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_add(19, optvar0)"""</span> +</pre></div> + +<p>Calling the transfer functions on constant <code>KnownBits</code> produces constant +results, as we have seen. Therefore "regular" constant folding should hopefully +be achieved by optimizing with the <code>KnownBits</code> abstract domain too.</p> +<p>The next two tests are slightly more complicated and can't be optimized by +regular constant-folding. They follow the motivating examples from the start of +this blog post, a hundred years ago:</p> +<div class="code"><pre class="code literal-block"><span class="n">def</span><span class="w"> </span><span class="n">test_constfold_via_knownbits</span><span class="p">():</span> +<span class="w"> </span><span class="n">bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Block</span><span class="p">()</span> +<span class="w"> </span><span class="n">var0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="n">var1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_or</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span> +<span class="w"> </span><span class="n">var2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span> +<span class="w"> </span><span 
class="n">var3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> + +<span class="w"> </span><span class="n">opt_bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<span class="w"> </span><span class="nb">assert</span><span class="w"> </span><span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span><span class="w"> </span><span class="s2">"optvar"</span><span class="p">)</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_or(optvar0, 1)</span> +<span class="s2">optvar2 = dummy(1)"""</span> + +<span class="n">def</span><span class="w"> </span><span class="n">test_constfold_alignment_check</span><span class="p">():</span> +<span class="w"> </span><span class="n">bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Block</span><span class="p">()</span> +<span class="w"> </span><span class="n">var0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="n">var1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="mi">0</span><span class="n">b111</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># mask off the lowest three bits, thus var2 is 
aligned</span> +<span class="w"> </span><span class="n">var2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span><span class="w"> </span><span class="n">var1</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># add 16 to aligned quantity</span> +<span class="w"> </span><span class="n">var3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span><span class="w"> </span><span class="mi">16</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># check alignment of result</span> +<span class="w"> </span><span class="n">var4</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var3</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="n">b111</span><span class="p">)</span> +<span class="w"> </span><span class="n">var5</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_eq</span><span class="p">(</span><span class="n">var4</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># var5 should be const-folded to 1</span> +<span class="w"> </span><span class="n">var6</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var5</span><span class="p">)</span> + +<span class="w"> 
</span><span class="n">opt_bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<span class="w"> </span><span class="nb">assert</span><span class="w"> </span><span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span><span class="w"> </span><span class="s2">"optvar"</span><span class="p">)</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_and(optvar0, -8)</span> +<span class="s2">optvar2 = int_add(optvar1, 16)</span> +<span class="s2">optvar3 = dummy(1)"""</span> +</pre></div> + +<p>Here is <code>simplify</code> to make these tests pass:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">unknown_transfer_functions</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">all_unknown</span><span class="p">()</span> + + +<span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">abstract_values</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># dict mapping Operation to KnownBits</span> + + <span class="k">def</span> <span class="nf">knownbits_of</span><span class="p">(</span><span class="n">val</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span 
class="n">val</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="n">val</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">val</span><span class="p">]</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> + <span class="c1"># apply the transfer function on the abstract arguments</span> + <span class="n">name_without_prefix</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="o">.</span><span class="n">removeprefix</span><span class="p">(</span><span class="s2">"int_"</span><span class="p">)</span> + <span class="n">method_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"abstract_</span><span class="si">{</span><span class="n">name_without_prefix</span><span class="si">}</span><span class="s2">"</span> + <span class="n">transfer_function</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">,</span> <span class="n">method_name</span><span class="p">,</span> <span class="n">unknown_transfer_functions</span><span class="p">)</span> + <span class="n">abstract_args</span> <span class="o">=</span> <span class="p">[</span><span class="n">knownbits_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span 
class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">abstract_res</span> <span class="o">=</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer_function</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">)</span> + <span class="c1"># if the result is a constant, we optimize the operation away and make</span> + <span class="c1"># it equal to the constant result</span> + <span class="k">if</span> <span class="n">abstract_res</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">abstract_res</span><span class="o">.</span><span class="n">ones</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># otherwise emit the op</span> + <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> + +<p>The code follows the approach from the previous blog post very closely. The +only difference is that we apply the transfer function <em>first</em>, to be able to +detect whether the abstract domain can tell us that the result has to always be +a constant. This code makes all three tests pass.</p> +<h3 id="using-the-knownbits-domain-for-conditional-peephole-rewrites">Using the <code>KnownBits</code> Domain for Conditional Peephole Rewrites</h3> +<p>So far we are only using the <code>KnownBits</code> domain to find out that certain +operations have to produce a constant. 
We can also use the <code>KnownBits</code> domain +to check whether certain operation rewrites are correct. Let's use one of the +examples from the <a href="https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html">Mining JIT traces for missing optimizations with +Z3</a> +post, where Z3 found the inefficiency <code>(x &lt;&lt; 4) &amp; -0xf == x &lt;&lt; 4</code> in PyPy JIT +traces. We don't have shift operations, but we want to generalize this optimization +anyway. The general form of this rewrite is that under some circumstances <code>x &amp; +y == x</code>, and we can use the <code>KnownBits</code> domain to detect situations where this +must be true.</p> +<p>To understand <em>when</em> <code>x &amp; y == x</code> is true, we can think about individual pairs of +bits <code>a</code> and <code>b</code>. If <code>a == 0</code>, then <code>a &amp; b == 0 &amp; b == 0 == a</code>. If <code>b == 1</code> +then <code>a &amp; b == a &amp; 1 == a</code>. So if either <code>a == 0</code> or <code>b == 1</code> is true, +<code>a &amp; b == a</code> follows. 
And if either of these conditions is true for <em>all</em> the +bits of <code>x</code> and <code>y</code>, we can know that <code>x &amp; y == x</code>.</p> +<p>We can write a method on <code>KnownBits</code> to check for this condition:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">is_and_identity</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Return True if n1 &amp; n2 == n1 for any n1 in self and n2 in other.</span> +<span class="sd"> (or, equivalently, return True if n1 | n2 == n2)"""</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span> +</pre></div> + +<p>Since my reasoning about this feels ripe for errors, let's check that our +understanding is correct with Z3:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_prove_is_and_identity</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_and_identity</span><span class="p">(</span><span 
class="n">k2</span><span class="p">),</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> <span class="o">==</span> <span class="n">n1</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>Now let's use this in the toy optimizer. Here are two tests for this rewrite:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_remove_redundant_and</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># mask off the lowest four bits</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> + <span class="c1"># applying the same mask is now redundant</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var3</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span 
class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_and(optvar0, -16)</span> +<span class="s2">optvar2 = dummy(optvar1)"""</span> + +<span class="k">def</span> <span class="nf">test_remove_redundant_and_more_complex</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="c1"># var2 has bit pattern ????</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># var3 has bit pattern ...?1111</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_or</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># var4 is just var2</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span 
class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> + <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var4</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = getarg(1)</span> +<span class="s2">optvar2 = int_and(optvar0, 15)</span> +<span class="s2">optvar3 = int_or(optvar1, 15)</span> +<span class="s2">optvar4 = dummy(optvar2)"""</span> +</pre></div> + +<p>The first test could also be made to pass by implementing a reassociation +optimization that turns <code>(x &amp; c1) &amp; c2</code> into <code>x &amp; (c1 &amp; c2)</code> and then constant-folds the second <code>and</code>. But here we want to +use <code>KnownBits</code> and conditionally rewrite <code>int_and</code> to its first argument. 
So to make the tests pass, +we can change <code>simplify</code> like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">abstract_values</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># dict mapping Operation to KnownBits</span> + + <span class="k">def</span> <span class="nf">knownbits_of</span><span class="p">(</span><span class="n">val</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="o">...</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> + <span class="c1"># apply the transfer function on the abstract arguments</span> + <span class="n">name_without_prefix</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="o">.</span><span class="n">removeprefix</span><span class="p">(</span><span class="s2">"int_"</span><span class="p">)</span> + <span class="n">method_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"abstract_</span><span class="si">{</span><span class="n">name_without_prefix</span><span class="si">}</span><span class="s2">"</span> + <span class="n">transfer_function</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">,</span> <span class="n">method_name</span><span class="p">,</span> <span class="n">unknown_transfer_functions</span><span class="p">)</span> + <span class="n">abstract_args</span> <span 
class="o">=</span> <span class="p">[</span><span class="n">knownbits_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">abstract_res</span> <span class="o">=</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer_function</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">)</span> + <span class="c1"># if the result is a constant, we optimize the operation away and make</span> + <span class="c1"># it equal to the constant result</span> + <span class="k">if</span> <span class="n">abstract_res</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">abstract_res</span><span class="o">.</span><span class="n">ones</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># &lt;&lt;&lt;&lt; new code</span> + <span class="c1"># conditionally rewrite int_and(x, y) to x</span> + <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">k2</span> <span class="o">=</span> <span class="n">abstract_args</span> + <span class="k">if</span> <span class="n">k1</span><span class="o">.</span><span class="n">is_and_identity</span><span class="p">(</span><span class="n">k2</span><span class="p">):</span> + 
<span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># &gt;&gt;&gt;&gt; end changes</span> + <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> + +<p>And with that, the new tests pass as well. A real implementation would also +check the other argument order, but we leave that out for the sake of brevity.</p> +<p>This rewrite also generalizes the <a href="https://pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html">rewrites</a> <code>int_and(0, x) -&gt; 0</code> and +<code>int_and(-1, x) -&gt; x</code>, let's add a test for those:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_remove_and_simple</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> <span class="c1"># == 0</span> + <span class="n">var3</span> <span 
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> <span class="c1"># == -1</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> <span class="c1"># == var1</span> + <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var4</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = getarg(1)</span> +<span class="s2">optvar2 = dummy(optvar1)"""</span> +</pre></div> + +<p>This test just passes. 
And that's it for this post!</p> +<h3 id="conclusion">Conclusion</h3> +<p>In this post we've seen the implementation, testing and proofs about a 'known +bits' abstract domain, as well as its use in the toy optimizer to generalize +constant folding, and to implement conditional peephole rewrites.</p> +<p>In the next posts I'll write about the real implementation of a knownbits +domain in PyPy's JIT, its combination with the existing interval abstract +domain, how to deal with gaining information from conditions in the program, +and some loose ends.</p> +<p>Sources:</p> +<ul> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/Support/KnownBits.cpp">Known bits in LLVM</a></li> +<li><a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">Tristate numbers for known bits in Linux eBPF</a></li> +<li><a href="https://arxiv.org/abs/2105.05398">Sound, Precise, and Fast Abstract Interpretation with Tristate Numbers</a></li> +<li><a href="https://people.cs.rutgers.edu/~sn349/papers/agni-cav2023.pdf">Verifying the Verifier: eBPF Range Analysis Verification</a></li> +<li><a href="https://dougallj.wordpress.com/2020/01/13/bit-twiddling-addition-with-unknown-bits/">Bit-Twiddling: Addition with Unknown + Bits</a> + is a super readable blog post by Dougall J. I've taken the <code>ones</code> and + <code>unknowns</code> naming from this post, which I find significantly clearer than + <code>value</code> and <code>mask</code>, which the Linux kernel uses.</li> +<li><a href="https://bitmath.blogspot.com/">Bits, Math and Performance(?)</a>, a fantastic + blog by <a href="https://mastodon.gamedev.place/@harold">Harold Aptroot</a>. There are a + lot of relevant posts about known bits, range analysis etc. 
Harold is also + the author of <a href="http://haroldbot.nl/">Haroldbot</a>, a website that can be used + for bitvector calculations, and also checks bitvector identities.</li> +<li><a href="https://cea.hal.science/cea-01795779/document">Sharpening Constraint Programming approaches for Bit-Vector Theory</a></li> +<li><a href="https://users.cs.utah.edu/~regehr/papers/lctes06_2/fp019-regehr.pdf">Deriving Abstract Transfer Functions for Analyzing Embedded Software</a></li> +<li><a href="https://arxiv.org/abs/2105.00493">Synthesizing Abstract Transformers</a></li> +</ul> +<div class="footnote"> +<hr> +<ol> +<li id="fn:proof_bitwidths"> +<p>There's a subtlety about the Z3 proofs that I'm sort of +glossing over here. Python integers are of arbitrary width, and the +<code>KnownBits</code> code is actually carefully written to work for integers of any +size. This property is tested by the Hypothesis tests, which don't limit +the sizes of the generated random integers. However, the Z3 proofs only +check bitvectors of a fixed bitwidth of 64. There are various ways to deal +with this situation. For most "real" compilers, the bitwidth of integers +would be fixed anyway. Then the components <code>ones</code> and <code>unknowns</code> of the +<code>KnownBits</code> class would use the number of bits the corresponding integer +variable has, and the Z3 proofs would use the same width. This is what we +do in the PyPy JIT. <a class="footnote-backref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fnref:proof_bitwidths" title="Jump back to footnote 1 in the text">↩</a></p> +</li> +<li id="fn:tests_vs_proofs"> +<p>The less close connection between implementation and proof +for <code>abstract_eq</code> is one of the reasons why it makes sense to do +unit-testing <em>in addition</em> to proofs. 
For a more detailed explanation of +why both tests and proofs are good to +have, see <a href="https://siek.blogspot.com/2024/06/data-structures-and-algorithms-correctly.html#correct-software-via-write-test-and-prove:~:text=We%20recognize%20that%20once%20step,detect%20most%20of%20the%20bugs">Jeremy Siek's blog +post</a>, +as well as the <a href="https://www-cs-faculty.stanford.edu/~knuth/faq.html#:~:text=What's%20the%20exact%20citation%20of%20your%20oft%2Dcited%20comment%20about%20bugs?">Knuth +quote</a>. <a class="footnote-backref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fnref:tests_vs_proofs" title="Jump back to footnote 2 in the text">↩</a></p> +</li> +</ol> +</div>toy-optimizerz3https://www.pypy.org/posts/2024/08/toy-knownbits.htmlSat, 03 Aug 2024 14:00:00 GMTMining JIT traces for missing optimizations with Z3https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.htmlCF Bolz-Tereick<p>In my last post I've described <a href="https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html">how to use Z3 to find simple local peephole +optimization patterns</a> +for the integer operations in PyPy's JIT. An example is <code>int_and(x, 0) -&gt; +0</code>. In this post I want to scale up the problem of identifying possible +optimizations to much bigger instruction sequences, also using Z3. For that, I +am starting with the JIT traces of <strong>real benchmarks</strong>, after they have been +optimized by the optimizer of PyPy's JIT. Then we can ask Z3 to find +inefficient integer operations in those traces.</p> +<p>Starting from the optimized traces of real programs has some big +advantages over the "classical" superoptimization approach of generating and +then trying all possible sequences of instructions. It avoids the +combinatorial explosion that happens with the latter approach. 
Also, starting +from the traces of benchmarks or (even better) actual programs makes sure that +we actually care about the missing optimizations +that are found in this way. And because the traces are analyzed after they have +been optimized by PyPy's optimizer, we only get reports for <em>missing</em> +optimizations, that the JIT isn't able to do (yet).</p> +<p>The techniques and experiments I describe in this post are again the result of +a bunch of discussions with John Regehr at a conference a few weeks ago, as +well as reading his blog posts and papers. Thanks John! Also thanks to <a href="https://bernsteinbear.com/">Max +Bernstein</a> for super helpful feedback on the drafts +of this blog post (and for poking me to write things in general).</p> +<h3 id="high-level-approach">High-Level Approach</h3> +<p>The approach that I took works as follows:</p> +<ul> +<li>Run benchmarks or other interesting programs and then dump the IR of the JIT + traces into a file. The traces have at that point been already optimized by + the PyPy JIT's optimizer.</li> +<li>For every trace, ignore all the operations on non-integer variables.</li> +<li>Translate every integer operation into a Z3 formula.</li> +<li>For every operation, use Z3 to find out whether the operation is redundant + (how that is done is described below).</li> +<li>If the operation is redundant, the trace is less efficient than it could have + been, because the optimizer could also have removed the operation. Report the + inefficiency.</li> +<li>Minimize the inefficient programs by removing as many operations as possible + to make the problem easier to understand.</li> +</ul> +<p>In the post I will describe the details and show some pseudocode of the +approach. 
I'll also make the proper code public eventually (but it needs a +healthy dose of cleanups first).</p> +<h3 id="dumping-pypy-traces">Dumping PyPy Traces</h3> +<p>PyPy will write its JIT traces into the file <code>out</code> if the environment variable +<a href="https://doc.pypy.org/en/latest/man/pypy.1.html"><code>PYPYLOG</code></a> is set as follows:</p> +<div class="code"><pre class="code literal-block">PYPYLOG=jit-log-opt:out pypy &lt;program.py&gt; +</pre></div> + +<p>This environment variable works for PyPy, but also for other virtual machines +built with RPython.</p> +<p>(This is really a side point for the rest of the blog post, but since the +question came up I wanted to clarify it: Operations on integers in the Python +program that the JIT is running don't all correspond 1-to-1 with the <code>int_...</code> +operations in the traces. The <code>int_...</code> trace operations always operate on +machine words. The Python <code>int</code> type supports arbitrarily large integers. PyPy +will optimistically try to lower the operations on Python integers into machine +word operations, but adds the necessary guards into the trace to make sure that +overflow outside of the range of machine words is caught. In case one of these +guards fails the interpreter switches to a big integer heap-allocated +representation.)</p> +<h3 id="encoding-traces-as-z3-formulas">Encoding Traces as Z3 formulas</h3> +<p>The last blog post already contained the code to encode the results of +individual trace operations into Z3 formulas, so we don't need to repeat that +here. 
To encode traces of operations we introduce a Z3 variable for every +operation in the trace and then call the <code>z3_expression</code> function for every +single one of the operations in the trace.</p> +<p>For example, for the following trace:</p> +<div class="code"><pre class="code literal-block"><span class="k">[i1]</span> +<span class="na">i2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">uint_rshift(i1, 32)</span> +<span class="na">i3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_and(i2, 65535)</span> +<span class="na">i4</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">uint_rshift(i1, 48)</span> +<span class="na">i5</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_lshift(i4, 16)</span> +<span class="na">i6</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_or(i5, i3)</span> +<span class="na">jump(i6, i2) # equal</span> +</pre></div> + +<p>We would get the Z3 formula:</p> +<div class="code"><pre class="code literal-block">z3.And(i2 == LShR(i1, 32), + i3 == i2 &amp; 65535, + i4 == LShR(i1, 48), + i5 == i4 &lt;&lt; 16) +</pre></div> + +<p>Usually we won't ask for the formula of the whole trace at once. Instead we go +through the trace operation by operation and try to find inefficiencies in the +current one we are looking at. 
Roughly like this (pseudo-)code:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">newvar</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">var_to_z3var</span> <span class="o">=</span> <span class="p">{}</span> + <span class="k">for</span> <span class="n">input_argument</span> <span class="ow">in</span> <span class="n">trace</span><span class="o">.</span><span class="n">inputargs</span><span class="p">:</span> + <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">input_argument</span><span class="p">]</span> <span class="o">=</span> <span class="n">newz3var</span><span class="p">(</span><span class="n">input_argument</span><span class="p">)</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">z3resultvar</span> <span class="o">=</span> <span class="n">newz3var</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">resultvarname</span><span class="p">)</span> + <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span 
class="mi">0</span><span class="p">]</span> + <span class="n">z3arg0</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">arg0</span><span class="p">]</span> + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span> + <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> + <span class="n">z3arg1</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">arg1</span><span class="p">]</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">z3arg1</span> <span class="o">=</span> <span class="kc">None</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># checking for inefficiencies, see the next sections</span> + <span class="o">...</span> + <span class="k">if</span> <span class="o">...</span><span class="p">:</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span> + + <span class="c1"># not inefficient, assert op into the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span 
class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="identifying-constant-booleans-with-z3">Identifying constant booleans with Z3</h3> +<p>To get started finding inefficiencies in a trace, we can +first focus on boolean variables. For every operation in the trace that +returns a bool we can ask Z3 to prove that this variable must be always True or +always False. Most of the time, neither of these proofs will succeed. But if Z3 +manages to prove one of them, we now have found an inefficiency: instead of +computing the boolean result (e.g. by executing a comparison) the JIT's optimizer +could have replaced the operation with the corresponding boolean constant.</p> +<p>Here's an example of an inefficiency found that way: if <code>x &lt; y</code> and <code>y &lt; z</code> are +both true, PyPy's JIT could conclude that <code>x &lt; z</code> must also +be true. However, currently the JIT cannot make that conclusion because it +only reasons about the concrete ranges (lower and upper bounds) for every +integer variable, but it has no way to remember anything about relationships +between different variables. This kind of reasoning would quite often be useful +to remove list/string bounds checks. Here's a <a href="https://www.youtube.com/watch?app=desktop&amp;v=1hm5ZVmBEvo">talk about how LLVM does +this</a> (but it might be +too heavyweight for a JIT setting).</p> +<p>Here are some more examples found that way:</p> +<ul> +<li><code>x - 1 == x</code> is always False</li> +<li><code>x - (x == -1) == -1</code> is always False. 
The pattern <code>x - (x == -1)</code> happens a + lot in PyPy's hash computations: To be compatible with the CPython hashes we + need to make sure that no object's hash is -1 (CPython uses -1 as an error + value on the C level).</li> +</ul> +<p>Here's pseudo-code for how to implement checking boolean operations for +inefficiencies:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">has_boolean_result</span><span class="p">():</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">res</span> <span class="o">==</span> <span class="mi">0</span><span class="p">):</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">res</span> <span class="o">==</span> <span class="mi">1</span><span class="p">):</span> + <span class="k">return</span> 
<span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="mi">1</span> + <span class="c1"># checking for other inefficiencies, see the next sections</span> + <span class="o">...</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="identifying-redundant-operations">Identifying redundant operations</h3> +<p>A more interesting class of redundancy is to try to find two operations in a +trace that compute the same result. We can do that by asking Z3 to prove for +each pair of different operations in the trace that the result is +always the same. If a previous operation returns the same result, the JIT could +have re-used that result instead of re-computing it, saving time. Doing this +search for equivalent operations with Z3 is quadratic in the number of +operations, but since traces have a maximum length it is not too bad in +practice.</p> +<p>This is the real workhorse of my script so far, it's what finds most of the +inefficiencies. Here are a few examples:</p> +<ul> +<li>The very first and super useful example the script found is <code>int_eq(b, 1) == + b</code> if <code>b</code> is known to be a boolean (i.e. an integer 0 or 1). I have already + implemented this optimization in the JIT.</li> +<li>Similarly, <code>int_and(b, 1) == b</code> for booleans.</li> +<li><code>(x &lt;&lt; 4) &amp; -0xf == x &lt;&lt; 4</code></li> +<li><code>(((x &gt;&gt; 63) &lt;&lt; 1) &lt;&lt; 2) &gt;&gt; 3 == x &gt;&gt; 63</code>. 
In general the JIT is quite bad at + optimizing repeated shifts (the infrastructure for doing better with that is + already in place, so this will be a relatively easy fix).</li> +<li><code>(x &amp; 0xffffffff) | ((x &gt;&gt; 32) &lt;&lt; 32) == x</code>. Having the JIT optimize this + would maybe require first recognizing that <code>(x &gt;&gt; 32) &lt;&lt; 32</code> can be expressed + as a mask: <code>(x &amp; 0xffffffff00000000)</code>, and then using <code>(x &amp; c1) | (x &amp; c2) == + x &amp; (c1 | c2)</code></li> +<li>A commonly occurring pattern is variations of this one: + <code>((x &amp; 1345) ^ 2048) - 2048 == x &amp; 1345</code> (with different constants, of + course). xor is add without carry, and <code>x &amp; 1345</code> does not have the bit + <code>2048</code> set. Therefore the <code>^ 2048</code> is equivalent to <code>+ 2048</code>, which the <code>- + 2048</code> cancels. More generally, if <code>a &amp; b == 0</code>, then <code>a + b == a | b == a ^ b</code>. + I don't understand at all why this appears so often in the traces, but I + see variations of it a lot. 
LLVM can optimize this, but <a href="https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115829">GCC + can't</a>, thanks to + <a href="https://hachyderm.io/@pinskia/112752641328799157">Andrew Pinski for filing the + bug</a>!</li> +</ul> +<p>And here's some implementation pseudo-code again:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="o">...</span> + <span class="c1"># searching for redundant operations</span> + <span class="k">for</span> <span class="n">previous_op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="k">if</span> <span class="n">previous_op</span> <span class="ow">is</span> <span class="n">op</span><span class="p">:</span> + <span class="k">break</span> <span class="c1"># done, reached the current op</span> + <span class="n">previous_op_z3var</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">previous_op</span><span class="p">]</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">previous_op_z3var</span> <span class="o">==</span> <span 
class="n">res</span><span class="p">):</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">previous_op</span> + <span class="o">...</span> + <span class="c1"># more code here later</span> + <span class="o">...</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="synthesizing-more-complicated-constants-with-exists-forall">Synthesizing more complicated constants with exists-forall</h3> +<p>To find out whether some integer operations always return a constant result, we +can't simply use the same trick as for those operations that return boolean +results, because enumerating 2⁶⁴ possible constants and checking them all +would take too long. Like in the last post, we can use <code>z3.ForAll</code> to find out +whether Z3 can synthesize a constant for the result of an operation for us. +If such a constant exists, the JIT could have removed the operation, +and replaced it with the constant that Z3 provides.</p> +<p>Here are a few examples of inefficiencies found this way:</p> +<ul> +<li><code>(x ^ 1) ^ x == 1</code> (or, more generally: <code>(x ^ y) ^ x == y</code>)</li> +<li>if <code>x | y == 0</code>, it follows that <code>x == 0</code> and <code>y == 0</code></li> +<li>if <code>x != MAXINT</code>, then <code>x + 1 &gt; x</code></li> +</ul> +<p>Implementing this is actually slightly annoying. The <code>solver.add</code> calls for +non-inefficient ops add assertions to the solver, which are now confusing the +<code>z3.ForAll</code> query. 
We could remove all assertions from the solver, then do the +<code>ForAll</code> query, then add the assertions back. What I ended up doing instead was +instantiating a second solver object that I'm using for the <code>ForAll</code> queries, +that remains empty the whole time.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">empty_solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">var_to_z3var</span> <span class="o">=</span> <span class="p">{}</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="o">...</span> + <span class="c1"># searching for redundant operations</span> + <span class="o">...</span> + <span class="c1"># checking for constant results</span> + <span class="n">constvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'find_const'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span 
class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="n">var_to_z3var</span><span class="o">.</span><span class="n">values</span><span class="p">(),</span> + <span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span> + <span class="o">*</span><span class="n">solver</span><span class="o">.</span><span class="n">assertions</span><span class="p">(),</span> + <span class="n">expr</span> <span class="o">==</span> <span class="n">constvar</span> + <span class="p">)</span> + <span class="p">)</span> + <span class="k">if</span> <span class="n">empty_solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">empty_solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">const</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> 
+</pre></div> + +<h3 id="minimization">Minimization</h3> +<p>Analyzing an inefficiency by hand in the context of a larger trace is quite +tedious. Therefore I've implemented a (super inefficient) script to try to make +the examples smaller. Here's how that works:</p> +<ul> +<li>First throw out all the operations that occur <em>after</em> the inefficient operation + in the trace.</li> +<li>Then we remove all "dead" operations, ie operations that don't have their + results used (all the operations that we can analyze with Z3 are without side + effects).</li> +<li>Now we try to remove every guard in the trace one by one and check + afterwards, whether the resulting trace still has an inefficiency.</li> +<li>We also try to replace every single operation with a new argument to the + trace, to see whether the inefficiency is still present.</li> +</ul> +<p>The minimization process is sort of inefficient and I should probably be using + <a href="https://github.com/DRMacIver/shrinkray">shrinkray</a> or + <a href="https://github.com/csmith-project/creduce">C-Reduce</a> instead. However, it + seems to work well in practice and the runtime isn't too bad.</p> +<h3 id="results">Results</h3> +<p>So far I am using the JIT traces of three programs: 1) Booting Linux on the +<a href="https://docs.pydrofoil.org">Pydrofoil</a> RISC-V emulator, 2) booting Linux on the Pydrofoil ARM emulator, and 3) +running the PyPy bootstrap process on top of PyPy.</p> +<p>I picked these programs because most Python programs don't contain interesting +amounts of integer operations, and the traces of the emulators +contain a lot of them. I also used the bootstrap process because I still wanted +to try a big Python program and personally care about the runtime of this +program a lot.</p> +<p>The script identifies 94 +inefficiencies in the traces, a lot of them come from repeating +patterns. 
My next steps will be to manually inspect them all, categorize them, and +implement easy optimizations identified that way. I also want a way to sort the +examples by execution count in the benchmarks, to get a feeling for which of +them are most important.</p> +<p>I didn't investigate the full set of <a href="https://speed.pypy.org">Python +benchmarks</a> that PyPy uses yet, because I don't expect +them to contain interesting amounts of integer operations, but maybe I am wrong +about that? Will have to try eventually.</p> +<h3 id="conclusion">Conclusion</h3> +<p>This was again much easier to do than I would have expected! Given that I had +the translation of trace ops to Z3 already in place, it was a matter of about a +day of programming to use this infrastructure to find the first problems and +minimize them.</p> +<p>Reusing the results of existing operations or replacing operations by constants +can be seen as "zero-instruction superoptimization". I'll probably be rather +busy for a while to add the missing optimizations identified by my simple +script. 
But later extensions to actually synthesize one or several operations +in the attempt to optimize the traces more and find more opportunities should +be possible.</p> +<p>Finding inefficiencies in traces with Z3 is significantly less +annoying and also less error-prone than just manually inspecting traces and +trying to spot optimization opportunities.</p> +<h3 id="random-notes-and-sources">Random Notes and Sources</h3> +<p>Again, John's blog posts:</p> +<ul> +<li><a href="https://blog.regehr.org/archives/1109">Let’s Work on an LLVM Superoptimizer</a></li> +<li><a href="https://blog.regehr.org/archives/1146">Early Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1252">A Few Synthesizing Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1636">Synthesizing Constants</a></li> +</ul> +<p>and papers:</p> +<ul> +<li><a href="https://arxiv.org/pdf/1711.04422">A Synthesizing Superoptimizer</a></li> +<li><a href="https://dl.acm.org/doi/pdf/10.1145/3649837">Hydra: Generalizing Peephole Optimizations with Program Synthesis</a></li> +</ul> +<p>I remembered recently that I had seen the approach of optimizing the traces of +a tracing JIT with Z3 a long time ago, as part of the (now long dead, I think) +<a href="https://web.archive.org/web/20160304055149/http://research.microsoft.com/en-us/projects/spur/">SPUR +project</a>. +There's a <a href="https://web.archive.org/web/20161029162737/http://csl.stanford.edu/~christos/pldi2010.fit/tillmann.provers4jit.pdf">workshop +paper</a> +from 2010 about this. SPUR was trying to use Z3 built into the actual JIT (as +opposed to using Z3 only to find places where the regular optimizers could be +improved). In addition to bitvectors, SPUR also used the Z3 support for arrays +to model the C# heap and remove redundant stores. 
This is still another future +extension for all the Z3 work I've been doing in the context of the PyPy JIT.</p>jitz3https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.htmlFri, 19 Jul 2024 17:01:09 GMTFinding Simple Rewrite Rules for the JIT with Z3https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.htmlCF Bolz-Tereick<p>In June I was at the <a href="https://pldi24.sigplan.org/">PLDI conference</a> in +Copenhagen to present a <a href="https://dl.acm.org/doi/10.1145/3652588.3663316">paper</a> +I co-authored with <a href="https://bernsteinbear.com/">Max Bernstein</a>. I also finally +met <a href="https://blog.regehr.org/">John Regehr</a>, who I'd been talking to on social +media for ages but had never met. John has been working on compiler correctness +and better techniques for building compilers and optimizers for a very long +time. The blog post <a href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html">Finding JIT Optimizer Bugs using SMT Solvers and +Fuzzing</a> +was heavily inspired by this work. We talked a lot about his and his group's +work on using Z3 for +<a href="https://en.wikipedia.org/wiki/Superoptimization">superoptimization</a> and for +finding missing optimizations. I have applied some of the things John told me +about to the traces of PyPy's JIT, and wanted to blog about that. However, my +draft felt quite hard to understand. Therefore I have now written this current +post, to at least try to provide a somewhat gentler on-ramp to the topic.</p> +<p>In <em>this</em> post we will use the Python-API to Z3 to find local peephole rewrite +rules for the operations in the intermediate representation of PyPy's tracing +JIT. The code for this is simple enough that we can go through all of it.</p> +<p>The PyPy JIT produces traces of machine level instructions, which are optimized +and then turned into machine code. The optimizer uses a number of approaches to +make the traces more efficient. 
For integer operations it applies a number of +arithmetic simplification rules, for example <code>int_add(x, 0) -&gt; x</code>. When +implementing these rules in the JIT there are <strong>two problems</strong>: How do we know +that the rules are correct? And how do we know that we haven't forgotten any +rules? We'll try to answer both of these, but the first one in particular.</p> +<p>We'll be using Z3, a satisfiability modulo theories (SMT) solver which has good +bitvector support and most importantly an excellent Python API. We can use the +solver to reason about bitvectors, which are how we will model machine +integers.</p> +<p>To find rewrite rules, we will consider the binary operations (i.e. those +taking two arguments) in PyPy traces that take and produce integers. The +completely general form <code>op(x, y)</code> is not simplifiable on its own. But if +either <code>x == y</code> +or if one of the arguments is a constant, we can potentially simplify the +operation into a simpler form. The results are either the variable <code>x</code>, or a +(potentially different) constant. We'll ignore constant-folding where both +arguments of the binary operation are constants. The possible results for a +simplifiable binary operation are the variable <code>x</code> or another constant. 
This +leaves the following patterns as possibilities:</p> +<ul> +<li><code>op(x, x) == x</code></li> +<li><code>op(x, x) == c1</code></li> +<li><code>op(x, c1) == x</code></li> +<li><code>op(c1, x) == x</code></li> +<li><code>op(x, c1) == c2</code></li> +<li><code>op(c1, x) == c2</code></li> +</ul> +<p>Our approach will be to take every single supported binary integer operation, +instantiate all of these patterns, and try to ask Z3 whether the resulting +simplification is valid for all values of <code>x</code>.</p> +<h3 id="quick-intro-to-the-z3-python-api">Quick intro to the Z3 Python-API</h3> +<p>Here's a terminal session showing the use of the Z3 Python API:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; import z3</span> +<span class="go">&gt;&gt;&gt;&gt; # construct a Z3 bitvector variable of width 8, with name x:</span> +<span class="go">&gt;&gt;&gt;&gt; x = z3.BitVec('x', 8)</span> +<span class="go">&gt;&gt;&gt;&gt; # construct a more complicated formula by using operator overloading:</span> +<span class="go">&gt;&gt;&gt;&gt; x + x</span> +<span class="go">x + x</span> +<span class="go">&gt;&gt;&gt;&gt; x + 1</span> +<span class="go">x + 1</span> +</pre></div> + +<p>Z3 checks the "satisfiability" of a formula. This means that it tries to find +an example set of concrete values for the variables that occur in a formula, +such that the formula becomes true. 
Examples:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver = z3.Solver()</span> +<span class="go">&gt;&gt;&gt;&gt; solver.check(x * x == 3)</span> +<span class="go">unsat</span> +<span class="go">&gt;&gt;&gt;&gt; # meaning no x fulfils this property</span> +<span class="go">&gt;&gt;&gt;&gt;</span> +<span class="go">&gt;&gt;&gt;&gt; solver.check(x * x == 9)</span> +<span class="go">sat</span> +<span class="go">&gt;&gt;&gt;&gt; model = solver.model()</span> +<span class="go">&gt;&gt;&gt;&gt; model</span> +<span class="go">[x = 253]</span> +<span class="go">&gt;&gt;&gt;&gt; model[x].as_signed_long()</span> +<span class="go">-3</span> +<span class="go">&gt;&gt;&gt;&gt; # 253 is the same as -3 in two's complement arithmetic with 8 bits</span> +</pre></div> + +<p>In order to use Z3 to prove something, we can ask Z3 to find counterexamples +for the statement, meaning concrete values that would make the negation of the +statement true:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.check(z3.Not(x ^ -1 == ~x))</span> +<span class="go">unsat</span> +</pre></div> + +<p>The result <code>unsat</code> means that we just proved that <code>x ^ -1 == ~x</code> is true for +all <code>x</code>, because there is no value for <code>x</code> that makes <code>not (x ^ -1 == ~x)</code> +true (this works because -1 has all the bits set).</p> +<p>If we try to prove something incorrect in this way, the following happens:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.check(z3.Not(x ^ -1 == x))</span> +<span class="go">sat</span> +</pre></div> + +<p><code>sat</code> shows that <code>x ^ -1 == x</code> is (unsurprisingly) not always true, and we can +ask for a counterexample:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.model()</span> +<span class="go">[x = 0]</span> +</pre></div> + +<p>This way of 
proving this works because the <code>check</code> calls try to solve an +(implicit) "exists" quantifier, over all the Z3 variables used in the formula. +<code>check</code> will either return <code>z3.unsat</code>, which means that no concrete values make +the formula true; or <code>z3.sat</code>, which means that you can get some concrete +values that make the formula true by calling <code>solver.model()</code>.</p> +<p>In math terms we prove things using <code>check</code> by de-Morgan's rules for quantifiers:</p> +<p>$$ \lnot \exists x: \lnot f(x) \implies \forall x: f(x) $$</p> +<p>Now that we've seen the basics of using the Z3 API on a few small examples, +we'll use it in a bigger program.</p> +<h3 id="encoding-the-integer-operations-of-rpythons-jit-into-z3-formulas">Encoding the integer operations of RPython's JIT into Z3 formulas</h3> +<p>Now we'll use the API to reason about the integer operations of the PyPy JIT +intermediate representation (IR). The binary integer operations are:</p> +<div class="code"><pre class="code literal-block"><span class="n">opnames2</span> <span class="o">=</span> <span class="p">[</span> +<span class="s2">"int_add"</span><span class="p">,</span> +<span class="s2">"int_sub"</span><span class="p">,</span> +<span class="s2">"int_mul"</span><span class="p">,</span> +<span class="s2">"int_and"</span><span class="p">,</span> +<span class="s2">"int_or"</span><span class="p">,</span> +<span class="s2">"int_xor"</span><span class="p">,</span> +<span class="s2">"int_eq"</span><span class="p">,</span> +<span class="s2">"int_ne"</span><span class="p">,</span> +<span class="s2">"int_lt"</span><span class="p">,</span> +<span class="s2">"int_le"</span><span class="p">,</span> +<span class="s2">"int_gt"</span><span class="p">,</span> +<span class="s2">"int_ge"</span><span class="p">,</span> +<span class="s2">"uint_lt"</span><span class="p">,</span> +<span class="s2">"uint_le"</span><span class="p">,</span> +<span class="s2">"uint_gt"</span><span 
class="p">,</span> +<span class="s2">"uint_ge"</span><span class="p">,</span> +<span class="s2">"int_lshift"</span><span class="p">,</span> +<span class="s2">"int_rshift"</span><span class="p">,</span> +<span class="s2">"uint_rshift"</span><span class="p">,</span> +<span class="s2">"uint_mul_high"</span><span class="p">,</span> +<span class="s2">"int_pydiv"</span><span class="p">,</span> +<span class="s2">"int_pymod"</span><span class="p">,</span> +<span class="p">]</span> +</pre></div> + +<p>There's not much special about the integer operations. Like in LLVM, most of +them are signedness-independent: <code>int_add</code>, <code>int_sub</code>, <code>int_mul</code>, ... work +correctly for unsigned integers but also for +<a href="https://en.wikipedia.org/wiki/Two%27s_complement">two's-complement</a> signed +integers. Exceptions for that are order comparisons like <code>int_lt</code> etc. for +which we have unsigned variants <code>uint_lt</code> etc. All operations that produce a +boolean result return a full-width integer <code>0</code> or <code>1</code> (the PyPy JIT supports +only word-sized integers in its intermediate representation)</p> +<p>In order to reason about the IR operations, some ground work:</p> +<div class="code"><pre class="code literal-block"><span class="kn">import</span> <span class="nn">z3</span> + +<span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> +<span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> +<span class="n">solver</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">,</span> <span class="mi">10000</span><span class="p">)</span> <span class="c1"># milliseconds, ie 10s</span> +<span class="n">xvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span 
class="p">(</span><span class="s1">'x'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">constvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'const'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">constvar2</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'const2'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">TRUEBV</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">FALSEBV</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +</pre></div> + +<p>And here's a function to turn an integer IR operation of PyPy's JIT into Z3 +formulas:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" computes a tuple of (result, valid_if) of Z3 formulas. `result` is the</span> +<span class="sd"> formula representing the result of the operation, given argument formulas</span> +<span class="sd"> arg0 and arg1. 
`valid_if` is a pre-condition that must be true for the</span> +<span class="sd"> result to be meaningful. """</span> + <span class="n">result</span> <span class="o">=</span> <span class="kc">None</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="kc">True</span> <span class="c1"># the precondition is mostly True, with few exceptions</span> + <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&amp;</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_or"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">|</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_xor"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span 
class="n">arg0</span> <span class="o">^</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_eq"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ne"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_le"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_gt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ge"</span><span 
class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_lt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_le"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_gt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_ge"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGE</span><span class="p">(</span><span class="n">arg0</span><span 
class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&lt;&lt;</span> <span class="n">arg1</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_rshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&lt;&lt;</span> <span class="n">arg1</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_rshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">LShR</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span 
class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_mul_high"</span><span class="p">:</span> + <span class="c1"># zero-extend args to 2*INTEGER_WIDTH bit, then multiply and extract</span> + <span class="c1"># highest INTEGER_WIDTH bits</span> + <span class="n">zarg0</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ZeroExt</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> + <span class="n">zarg1</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ZeroExt</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Extract</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">*</span> <span class="mi">2</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">zarg0</span> <span class="o">*</span> <span class="n">zarg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_pydiv"</span><span class="p">:</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">arg1</span> <span class="o">!=</span> <span class="mi">0</span> + <span class="n">r</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">/</span> <span class="n">arg1</span> + <span 
class="n">psubx</span> <span class="o">=</span> <span class="n">r</span> <span class="o">*</span> <span class="n">arg1</span> <span class="o">-</span> <span class="n">arg0</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">r</span> <span class="o">+</span> <span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="n">psubx</span><span class="p">,</span> <span class="o">-</span><span class="n">psubx</span><span class="p">)</span> <span class="o">&gt;&gt;</span> <span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_pymod"</span><span class="p">:</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">arg1</span> <span class="o">!=</span> <span class="mi">0</span> + <span class="n">r</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">%</span> <span class="n">arg1</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">r</span> <span class="o">+</span> <span class="p">(</span><span class="n">arg1</span> <span class="o">&amp;</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="n">r</span><span class="p">,</span> <span class="n">r</span><span class="p">)</span> <span class="o">&gt;&gt;</span> <span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> 
<span class="s2">"int_is_true"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">FALSEBV</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_zero"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">FALSEBV</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_neg"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="o">-</span><span class="n">arg0</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_invert"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="o">~</span><span class="n">arg0</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">assert</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"unknown operation "</span> <span class="o">+</span> <span class="n">opname</span> + <span class="k">return</span> <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> + +<span class="k">def</span> <span class="nf">cond</span><span class="p">(</span><span class="n">z3expr</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" helper function to turn a Z3 boolean result z3expr into a 1 or 0</span> +<span class="sd"> bitvector, using z3.If """</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> 
<span class="n">TRUEBV</span><span class="p">,</span> <span class="n">FALSEBV</span><span class="p">)</span> +</pre></div> + +<p>We map the semantics of a PyPy JIT operation to Z3 with the <code>z3_expression</code> +function. It turns the name of a JIT operation and its two (or one) arguments +into a pair of Z3 formulas, <code>result</code> and <code>valid_if</code>. The resulting formulas are +constructed with the operator overloading of Z3 variables/formulas.</p> +<p>The first element of the pair returned by <code>z3_expression</code>, <code>result</code>, represents the result +of performing the operation. <code>valid_if</code> is a bool that represents a condition that +needs to be <code>True</code> in order for the result of the operation to be defined. E.g. +<code>int_pydiv(a, b)</code> is only valid if <code>b != 0</code>. Most operations are always valid, +so they return <code>True</code> as that condition (we'll ignore <code>valid_if</code> for a bit, but it +will become more relevant further down in the post).</p> +<p>We can define a helper function to prove things by finding counterexamples:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Try to prove a condition cond by searching for counterexamples of its negation. 
"""</span> + <span class="n">z3res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> + <span class="k">if</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unsat</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">True</span> + <span class="k">elif</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unknown</span><span class="p">:</span> <span class="c1"># eg on timeout</span> + <span class="k">return</span> <span class="kc">False</span> + <span class="k">elif</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">False</span> + <span class="k">assert</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"should be unreachable"</span> +</pre></div> + +<h3 id="finding-rewrite-rules">Finding rewrite rules</h3> +<p>Now we can start finding our first rewrite rules, following the first pattern +<code>op(x, x) -&gt; x</code>. 
We do this by iterating over all the supported binary +operation names, getting the z3 expression for <code>op(x, x)</code> and then asking Z3 to +prove <code>op(x, x) == x</code>.</p> +<div class="code"><pre class="code literal-block"><span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; x, </span><span class="si">{</span><span class="n">result</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +</pre></div> + +<p>This yields the simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<h3 id="synthesizing-constants">Synthesizing constants</h3> +<p>Supporting 
the next patterns is harder: <code>op(x, x) == c1</code>, <code>op(x, c1) == x</code>, and +<code>op(c1, x) == x</code>. We don't know which constants to pick to try to get Z3 to +prove the equality. We could iterate over common constants like <code>0</code>, <code>1</code>, +<code>MAXINT</code>, etc., or even over all the 256 values for a bitvector of length 8. +However, we will instead ask Z3 to find the constants for us too.</p> +<p>This can be done by using quantifiers, in this case <code>z3.ForAll</code>. The query we +pose to Z3 is "does there exist a constant <code>c1</code> such that for all <code>x</code> the +following is true: <code>op(x, c1) == x</code>?" Note that the constant <code>c1</code> is not +necessarily unique, there could be many of them. We generate several matching +constants, and for the second and further queries we add to the condition that +they must differ from the constants already found.</p> +<p>We can express this in a helper function:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_constant</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">number_of_results</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">xvar</span><span class="p">],</span> + <span class="n">z3expr</span> + <span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">number_of_results</span><span class="p">):</span> + <span class="n">checkres</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> + <span 
class="k">if</span> <span class="n">checkres</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="c1"># if a solver check succeeds, we can ask for a model, which is</span> + <span class="c1"># concrete values for the variables constvar</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">yield</span> <span class="n">const</span> + <span class="c1"># make sure we don't generate the same constant again on the</span> + <span class="c1"># next call</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">constvar</span> <span class="o">!=</span> <span class="n">const</span><span class="p">,</span> <span class="n">condition</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="c1"># no (more) constants found</span> + <span class="k">break</span> +</pre></div> + +<p>We can use this new function for the three mentioned patterns:</p> +<div class="code"><pre class="code literal-block"><span class="c1"># try to find constants for op(x, x) == c</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span 
class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">constvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +<span class="c1"># try to find constants for op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; x"</span><span class="p">)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span 
class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; x"</span><span class="p">)</span> +<span class="c1"># this code is not quite correct, we'll correct it later</span> +</pre></div> + +<p>Together this yields the following new simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="cp"># careful, these are not all correct!</span> +<span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ne</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> 
</span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span 
class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span 
class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<p>Most of these look good at first glance, but the last one reveals a problem: +we've been ignoring the <code>valid_if</code> expression up to now. 
We can stop doing that by +changing the code like this, which adds <code>z3.And(valid_if, ...)</code> to the argument of +the calls to <code>find_constant</code>:</p> +<div class="code"><pre class="code literal-block"><span class="c1"># try to find constants for op(x, x) == c, op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +<span class="c1"># try to find constants for op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span 
class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">valid_if</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; x"</span><span class="p">)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">valid_if</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; x"</span><span class="p">)</span> +</pre></div> + 
+<p>And we get this list instead:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ne</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> 
</span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span 
class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span 
class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<h3 id="synthesizing-two-constants">Synthesizing two constants</h3> +<p>For the patterns <code>op(x, c1) == c2</code> and <code>op(c1, x) == c2</code> we need to synthesize +two constants. 
We can again write a helper method for that:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_2consts</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">number_of_results</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">xvar</span><span class="p">],</span> + <span class="n">z3expr</span> + <span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">number_of_results</span><span class="p">):</span> + <span class="n">checkres</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> + <span class="k">if</span> <span class="n">checkres</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="n">const2</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar2</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">yield</span> <span class="n">const</span><span 
class="p">,</span> <span class="n">const2</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Or</span><span class="p">(</span><span class="n">constvar</span> <span class="o">!=</span> <span class="n">const</span><span class="p">,</span> <span class="n">constvar2</span> <span class="o">!=</span> <span class="n">const2</span><span class="p">),</span> <span class="n">condition</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> +</pre></div> + +<p>And then use it like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="c1"># try to find constants c1, c2 such that op(c1, x) -&gt; c2</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_2consts</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar2</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span 
class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; </span><span class="si">{</span><span class="n">const2</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="c1"># try to find constants c1, c2 such that op(x, c1) -&gt; c2</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_2consts</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar2</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2">(x, </span><span class="si">%s</span><span class="s2">) -&gt; </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span><span class="p">))</span> +</pre></div> + +<p>Which yields some straightforward simplifications:</p> +<div class="code"><pre class="code literal-block"><span 
class="n">int_mul</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_and</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +</pre></div> + +<p>A few require a bit more thinking:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_or</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span> +</pre></div> + +<p>These are true because in two's complement, <code>-1</code> has all bits set.</p> +<p>The following ones require recognizing that <code>-9223372036854775808 == -2**63</code> is +the most negative signed 64-bit integer, and <code>9223372036854775807 == 2 ** 63 - +1</code> is the most positive one:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_lt</span><span class="p">(</span><span 
class="mi">9223372036854775807</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">9223372036854775807</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">9223372036854775807</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="mi">9223372036854775807</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> 
</span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +</pre></div> + +<p>The following ones are true because the bitpattern for <code>-1</code> is the largest +unsigned number:</p> +<div class="code"><pre class="code literal-block"><span class="n">uint_lt</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +</pre></div> + +<h3 id="strength-reductions">Strength Reductions</h3> +<p>All the patterns so far only had a variable or a constant on the target of the +rewrite. 
We can also use the machinery to do strength reductions where we +generate a single-argument operation <code>op1(x)</code> for input operations <code>op(x, c1)</code> +or <code>op(c1, x)</code>. To achieve this, we try all combinations of binary and unary +operations. (We won't consider strength reductions where a binary operation +gets turned into a "cheaper" other binary operation here.)</p> +<div class="code"><pre class="code literal-block"><span class="n">opnames1</span> <span class="o">=</span> <span class="p">[</span> +<span class="s2">"int_is_true"</span><span class="p">,</span> +<span class="s2">"int_is_zero"</span><span class="p">,</span> +<span class="s2">"int_neg"</span><span class="p">,</span> +<span class="s2">"int_invert"</span><span class="p">,</span> +<span class="p">]</span> + +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="k">for</span> <span class="n">opname1</span> <span class="ow">in</span> <span class="n">opnames1</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="c1"># try to find a constant op(x, c) == g(x)</span> + <span class="n">result1</span><span class="p">,</span> <span class="n">valid_if1</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname1</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span 
class="n">valid_if</span><span class="p">,</span> <span class="n">valid_if1</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">result1</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; </span><span class="si">{</span><span class="n">opname1</span><span class="si">}</span><span class="s2">(x)"</span><span class="p">)</span> + + <span class="c1"># try to find a constant op(c, x) == g(x)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">result1</span><span class="p">,</span> <span class="n">valid_if1</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname1</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">valid_if1</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">result1</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span> 
<span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; </span><span class="si">{</span><span class="n">opname1</span><span class="si">}</span><span class="s2">(x)"</span><span class="p">)</span> +</pre></div> + +<p>Which yields the following new simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_sub</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_mul</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> 
</span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_xor</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_eq</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_ne</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> 
+<span class="n">int_ne</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_le</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span 
class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +</pre></div> + +<h3 id="conclusions">Conclusions</h3> +<p>With not very much code we managed to generate a whole lot of local +simplifications for integer operations in the IR of PyPy's JIT. The rules +discovered that way are "simple", in the sense that they only require looking +at a single instruction, and not where the arguments of that instruction came +from. 
They also don't require any knowledge about the properties of the +arguments of the instructions (e.g. that they are positive).</p> +<p>The rewrites in this post have mostly been in PyPy's JIT already. But now we +mechanically confirmed that they are correct. I've also added the remaining +useful looking ones, in particular <code>int_eq(x, 0) -&gt; int_is_zero(x)</code> etc.</p> +<p>If we wanted to scale this approach up, we would have to work much harder! +There are a bunch of problems that come with generalizing the approach to +looking at sequences of instructions:</p> +<ul> +<li> +<p>Combinatorial explosion: if we look at sequences of instructions, we very + quickly get a combinatorial explosion and it becomes intractable to try all + combinations.</p> +</li> +<li> +<p>Finding non-minimal patterns: Some complicated simplifications can be + instances of simpler ones. For example, because <code>int_add(x, 0) -&gt; x</code>, it's + also true that <code>int_add(int_sub(x, y), 0) -&gt; int_sub(x, y)</code>. If we simply + generate all possible sequences, we will find the latter simplification rule, + which we would usually not care about.</p> +</li> +<li> +<p>Unclear usefulness: if we simply generate all rewrites up to a certain number + of instructions, we will get a lot of patterns that are useless in the sense + that they typically aren't found in realistic programs. It would be much + better to somehow focus on the patterns that real benchmarks are using.</p> +</li> +</ul> +<p>In the <a href="https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html">next blog post</a> I'll discuss an alternative approach to simply generating +all possible sequences of instructions, that tries to address these problems. 
+This works by analyzing the real traces of benchmarks and mining those for +inefficiencies, which only shows problems that occur in actual programs.</p> +<h3 id="sources">Sources</h3> +<p>I've been re-reading a lot of blog posts from John's blog:</p> +<ul> +<li><a href="https://blog.regehr.org/archives/1109">Let’s Work on an LLVM Superoptimizer</a></li> +<li><a href="https://blog.regehr.org/archives/1146">Early Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1252">A Few Synthesizing Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1636">Synthesizing Constants</a></li> +</ul> +<p>but also papers:</p> +<ul> +<li><a href="https://arxiv.org/pdf/1711.04422">A Synthesizing Superoptimizer</a></li> +<li><a href="https://dl.acm.org/doi/pdf/10.1145/3649837">Hydra: Generalizing Peephole Optimizations with Program Synthesis</a></li> +</ul> +<p>Another of my favorite blogs has been <a href="https://www.philipzucker.com/">Philipp Zucker's +blog</a> in the last year or two, lots of excellent +posts about/using Z3 on there.</p>jitz3https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.htmlFri, 12 Jul 2024 19:14:09 GMT \ No newline at end of file diff --git a/authors/christoph-jung.html b/authors/christoph-jung.html new file mode 100644 index 000000000..2f4f93ba1 --- /dev/null +++ b/authors/christoph-jung.html @@ -0,0 +1,113 @@ + + + + + +Posts by Christoph Jung | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/christoph-jung.xml b/authors/christoph-jung.xml new file mode 100644 index 000000000..5d33f2a33 --- /dev/null +++ b/authors/christoph-jung.xml @@ -0,0 +1,77 @@ + +PyPy (Posts by Christoph Jung)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssProfiling PyPy using the Firefox profiler user interfacehttps://www.pypy.org/posts/2024/05/vmprof-firefox-converter.htmlChristoph Jung<h3 id="introduction">Introduction</h3> +<p>If you ever wanted to profile your Python code on PyPy, you probably came across <a href="https://vmprof.readthedocs.io/en/latest/vmprof.html">VMProf</a> — a statistical profiler for PyPy.</p> +<p>VMProf's console output can already give some insights into where your code spends time, +but it is far from showing all the information captured while profiling.</p> +<p>There have been some tools around to visualize VMProf's output. +Unfortunately the vmprof.com user interface is no longer available and vmprof-server is not as easy to use, you may want to take a look at a local viewer or converter. +Those so far could give you some general visualizations of your profile, but do not show any PyPy related context like PyPy's log output (<a href="https://rpython.readthedocs.io/en/latest/logging.html">PyPyLog</a>, which is output when using the PYPYLOG environment variable to log JIT actions).</p> +<p>To bring all of those features together in one tool, you may take a look at the vmprof-firefox-converter.</p> +<p>Created in the context of my bachelor's thesis, the vmprof-firefox-converter is a tool for analyzing VMProf profiles with the <a href="https://profiler.firefox.com/">Firefox profiler</a> user interface. +Instead of building a new user interface from scratch, this allows us to reuse the user interface work Mozilla put into the Firefox profiler. 
+The Firefox profiler offers a timeline where you can zoom into profiles and work with different visualizations like a flame graph or a stack chart. +To understand why there is time spent inside a function, you can revisit the source code and even dive into the intermediate representation of functions executed by PyPy's just-in-time compiler. +Additionally, there is a visualization for PyPy's log output, to keep track whether PyPy spent time inside the interpreter, JIT or GC throughout the profiling time.</p> +<h3 id="profiling-word-count">Profiling word count</h3> +<p>In this blog post, I want to show an example of how to use the vmprof-firefox-converter for a simple Python program. +Based on Ben Hoyt's blog <a href="https://benhoyt.com/writings/count-words/">Performance comparison: counting words in Python, Go, C++, C, AWK, Forth, and Rust</a> we will profile two python versions of a word counter running on PyPy. One being a bit more optimized. For this, VMProf will be used, but instead of just going with the console output, we will use the Firefox profiler user interface.</p> +<p>At first, we are going to look at a simple way of counting words with <code>Collections.Counter</code>. +This will read one line from the standard input at a time and count the words with <code>counter.update()</code></p> +<div class="code"><pre class="code literal-block">counts = collections.Counter() +for line in sys.stdin: + words = line.lower().split() + counts.update(words) + +for word, count in counts.most_common(): + print(word, count) +</pre></div> + +<p>To start profiling, simply execute: +<code>pypy -m vmprofconvert -run simple.py &lt;kjvbible_x10.txt</code></p> +<p>This will run the above code with vmprof, automatically capture and convert the results and finally open the Firefox profiler. 
</p> +<p>The input file is the king James version of the bible concatenated ten times.</p> +<p>To get started, we take a look at the call stack.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/simple_call_stack_crp.png?raw=true"> +Here we see that most of the time is spent in native code (marked as blue) e.g., the <code>counter.update()</code> or <code>split()</code> C implementation.</p> +<p>Now let's proceed with the more optimized version. +This time we read 64 Kb of data from the standard input and count the words with <code>counter.update()</code>.</p> +<div class="code"><pre class="code literal-block">counts = collections.Counter() +remaining = '' +while True: + chunk = remaining + sys.stdin.read(64*1024) + if not chunk: + break + last_lf = chunk.rfind('\n') # process to last LF character + if last_lf == -1: + remaining = '' + else: + remaining = chunk[last_lf+1:] + chunk = chunk[:last_lf] + counts.update(chunk.lower().split()) + +for word, count in counts.most_common(): + print(word, count) +</pre></div> + +<p>As we did before, we are going to take a peek at the call stack.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/optimized_call_stack_crp.png?raw=true"> </p> +<p>Now there is more time spent in native code, caused by larger chunks of text passed to <code>counter.update()</code>.</p> +<p>This becomes even more clear by comparing the stack charts.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/simple_stack_chart.png?raw=true"></p> +<p>Here, in the unoptimized case, we only read in one line at each loop iteration. +This results in small "spikes" in the stack chart. 
</p> +<p>But let's take an even closer look.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/simple_stack_chart_zoom.png?raw=true"></p> +<p>Zoomed in, we see the call stack alternating between <code>_count_elements()</code> and (unfortunately unsymbolized) native calls coming from reading and splitting the input text (e.g., <code>decode()</code>).</p> +<p>Let us now take a look at the optimized case.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/optimized_stack_chart.png?raw=true"></p> +<p>And if we look closer at the same interval as before, we see some spikes, but slightly different.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/optimized_stack_chart_zoom.png?raw=true"></p> +<p>Even though we do not want to compare the (amount of) milliseconds directly, we clearly see that the spikes are wider, i.e. the time spent in those function calls is longer. +You may already know where this comes from. +We read a 64 KiB chunk of data from the standard input and pass that to <code>counter.update()</code>, so both these tasks do more work and take longer. +Bigger chunks mean there is less alternating between reading and counting, so there is more time spent doing work than "doing" loop iterations.</p> +<h3 id="getting-started">Getting started</h3> +<p>You can get the converter from <a href="https://github.com/Cskorpion/vmprof-firefox-converter">GitHub</a>.</p> +<p>Both VMProf and the vmprof-firefox-converter were created for profiling PyPy, but you can also use them with CPython. 
</p> +<p>This project is still somewhat experimental, so if you want to try it out, please let us know whether it worked for you.</p>https://www.pypy.org/posts/2024/05/vmprof-firefox-converter.htmlFri, 26 Apr 2024 14:38:00 GMT \ No newline at end of file diff --git a/authors/david-schneider.html b/authors/david-schneider.html new file mode 100644 index 000000000..409184363 --- /dev/null +++ b/authors/david-schneider.html @@ -0,0 +1,125 @@ + + + + + +Posts by David Schneider | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/david-schneider.xml b/authors/david-schneider.xml new file mode 100644 index 000000000..1a7be8bdb --- /dev/null +++ b/authors/david-schneider.xml @@ -0,0 +1,245 @@ + +PyPy (Posts by David Schneider)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:10 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy 2.1 - Considered ARMfulhttps://www.pypy.org/posts/2013/08/pypy-21-considered-armful-7177475722033479233.htmlDavid Schneider<p>We're pleased to announce PyPy 2.1, which targets version 2.7.3 of the Python<br> +language. This is the first release with official support for ARM processors in the JIT.<br> +This release also contains several bugfixes and performance improvements.</p><p>You can download the PyPy 2.1 release here:</p><blockquote><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></blockquote><p>We would like to thank the <a class="reference external" href="https://www.raspberrypi.org">Raspberry Pi Foundation</a> for supporting the work<br> +to finish PyPy's ARM support.</p><p>The first beta of PyPy3 2.1, targeting version 3 of the Python language, was<br> +just released, more details can be found <a class="reference external" href="https://www.pypy.org/posts/2013/07/pypy3-21-beta-1-8647445024868663902.html">here</a>.</p><div class="section" id="what-is-pypy"><br> +<h3>What is PyPy?</h3><p>PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (<a class="reference external" href="https://speed.pypy.org">pypy 2.1 and cpython 2.7.2</a> performance comparison) +due to its integrated tracing JIT compiler.</p><p>This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows +32. 
This release also supports ARM machines running Linux 32bit - anything with +<tt class="docutils literal">ARMv6</tt> (like the Raspberry Pi) or <tt class="docutils literal">ARMv7</tt> (like the Beagleboard, +Chromebook, Cubieboard, etc.) that supports <tt class="docutils literal">VFPv3</tt> should work. Both +hard-float <tt class="docutils literal">armhf/gnueabihf</tt> and soft-float <tt class="docutils literal">armel/gnueabi</tt> builds are +provided. The <tt class="docutils literal">armhf</tt> builds for Raspbian are created using the Raspberry Pi +<a class="reference external" href="https://github.com/raspberrypi">custom cross-compilation toolchain</a> +based on <tt class="docutils literal"><span class="pre">gcc-arm-linux-gnueabihf</span></tt> and should work on <tt class="docutils literal">ARMv6</tt> and +<tt class="docutils literal">ARMv7</tt> devices running Debian or Raspbian. The <tt class="docutils literal">armel</tt> builds are built +using the <tt class="docutils literal"><span class="pre">gcc-arm-linux-gnuebi</span></tt> toolchain provided by Ubuntu and +currently target <tt class="docutils literal">ARMv7</tt>.</p><p>Windows 64 work is still stalling, we would welcome a volunteer +to handle that.</p></div><div class="section" id="highlights"><h3>Highlights</h3><ul class="simple"><li>JIT support for ARM, architecture versions 6 and 7, hard- and soft-float ABI</li> +<li>Stacklet support for ARM</li> +<li>Support for os.statvfs and os.fstatvfs on unix systems</li> +<li>Improved logging performance</li> +<li>Faster sets for objects</li> +<li>Interpreter improvements</li> +<li>During packaging, compile the CFFI based TK extension</li> +<li>Pickling of numpy arrays and dtypes</li> +<li>Subarrays for numpy</li> +<li>Bugfixes to numpy</li> +<li>Bugfixes to cffi and ctypes</li> +<li>Bugfixes to the x86 stacklet support</li> +<li>Fixed issue <a class="reference external" href="https://bugs.pypy.org/issue1533">1533</a>: fix an RPython-level OverflowError for 
space.float_w(w_big_long_number).</li> +<li>Fixed issue <a class="reference external" href="https://bugs.pypy.org/issue1552">1552</a>: GreenletExit should inherit from BaseException.</li> +<li>Fixed issue <a class="reference external" href="https://bugs.pypy.org/issue1537">1537</a>: numpypy __array_interface__</li> +<li>Fixed issue <a class="reference external" href="https://bugs.pypy.org/issue1238">1238</a>: Writing to an SSL socket in PyPy sometimes failed with a "bad write retry" message.</li> +</ul><p>Cheers,</p><p>David Schneider for the PyPy team.</p></div>https://www.pypy.org/posts/2013/08/pypy-21-considered-armful-7177475722033479233.htmlThu, 01 Aug 2013 14:38:00 GMTPyPy 2.1 beta 2https://www.pypy.org/posts/2013/07/pypy-21-beta-2-264349571160808803.htmlDavid Schneider<p>We're pleased to announce the second beta of the upcoming 2.1 release of PyPy.<br> +This beta adds one new feature to the 2.1 release and contains several bugfixes listed below.</p><p>You can download the PyPy 2.1 beta 2 release here:</p><blockquote><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></blockquote><div class="section" id="highlights"><h3>Highlights</h3><ul class="simple"><li>Support for os.statvfs and os.fstatvfs on unix systems.</li> +<li>Fixed issue <a class="reference external" href="https://bugs.pypy.org/issue1533">1533</a>: fix an RPython-level OverflowError for space.float_w(w_big_long_number).</li> +<li>Fixed issue <a class="reference external" href="https://bugs.pypy.org/issue1552">1552</a>: GreenletExit should inherit from BaseException.</li> +<li>Fixed issue <a class="reference external" href="https://bugs.pypy.org/issue1537">1537</a>: numpypy __array_interface__</li> +<li>Fixed issue <a class="reference external" href="https://bugs.pypy.org/issue1238">1238</a>: Writing to an SSL socket in pypy sometimes failed with a "bad write retry" message.</li> +<li><a class="reference external" 
href="https://bitbucket.org/pypy/pypy/src/0c6eeae0316c11146f47fcf83e21e24f11378be1/?at=distutils-cppldflags">distutils</a>: copy CPython's implementation of customize_compiler, dont call<br> +split on environment variables, honour CFLAGS, CPPFLAGS, LDSHARED and<br> +LDFLAGS.</li> +<li>During packaging, compile the CFFI tk extension.</li> +</ul></div><div class="section" id="what-is-pypy"><h3>What is PyPy?</h3><p>PyPy is a very compliant Python interpreter, almost a drop-in replacement for<br> +CPython 2.7.3. It's fast due to its integrated tracing JIT compiler.</p><p>This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows<br> +32. Also this release supports ARM machines running Linux 32bit - anything with<br> +<tt class="docutils literal">ARMv6</tt> (like the Raspberry Pi) or <tt class="docutils literal">ARMv7</tt> (like Beagleboard,<br> +Chromebook, Cubieboard, etc.) that supports <tt class="docutils literal">VFPv3</tt> should work.</p><p>Windows 64 work is still stalling, we would welcome a volunteer<br> +to handle that.</p></div><div class="section" id="how-to-use-pypy"><h3>How to use PyPy?</h3><p>We suggest using PyPy from a <a class="reference external" href="https://www.virtualenv.org/en/latest/">virtualenv</a>. Once you have a virtualenv<br> +installed, you can follow instructions from <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-using-virtualenv">pypy documentation</a> on how<br> +to proceed. This document also covers other <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-pypy">installation schemes</a>.</p><p>Cheers,<br> +The PyPy Team.</p></div>https://www.pypy.org/posts/2013/07/pypy-21-beta-2-264349571160808803.htmlFri, 26 Jul 2013 10:33:00 GMTPyPy 2.1 betahttps://www.pypy.org/posts/2013/07/pypy-21-beta-1351105697755187196.htmlDavid Schneider<p>We're pleased to announce the first beta of the upcoming 2.1 release of PyPy. 
This beta contains many bugfixes and improvements, including numerous improvements to the numpy in pypy effort. The main feature is that the ARM processor support is no longer considered alpha level.<br> +<br> +We would like to thank the <a class="reference external" href="https://www.raspberrypi.org/">Raspberry Pi Foundation</a> for supporting the work to finish PyPy's ARM support.<br> +<br> +<br> +You can download the PyPy 2.1 beta release here:<br> +</p><blockquote> +<a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></blockquote> +<div class="section" id="highlights"> +<h3> +<br></h3> +<h3> +Highlights</h3> +<ul class="simple"> +<li>Bugfixes to the ARM JIT backend, so that ARM is now an officially<br> +supported processor architecture</li> +<li>Stacklet support on ARM</li> +<li>Interpreter improvements</li> +<li>Various numpy improvements</li> +<li>Bugfixes to cffi and ctypes</li> +<li>Bugfixes to the stacklet support</li> +<li>Improved logging performance</li> +<li>Faster sets for objects</li> +</ul> +</div> +<div class="section" id="what-is-pypy"> +<h3> +<br></h3> +<h3> +What is PyPy?</h3> +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7.3. It's fast due to its integrated tracing JIT compiler. This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows 32. Also this release supports ARM machines running Linux 32bit - anything with <tt class="docutils literal">ARMv6</tt> (like the Raspberry Pi) or <tt class="docutils literal">ARMv7</tt> (like Beagleboard, Chromebook, Cubieboard, etc.) that supports <tt class="docutils literal">VFPv3</tt> should work. Both hard-float <tt class="docutils literal">armhf/gnueabihf</tt> and soft-float <tt class="docutils literal">armel/gnueabi</tt> builds are provided. 
<tt class="docutils literal">armhf</tt> builds for Raspbian are created using the Raspberry Pi<br> +<a class="reference external" href="https://github.com/raspberrypi">custom cross-compilation toolchain</a> based on <tt class="docutils literal"><span class="pre">gcc-arm-linux-gnueabihf</span></tt> and should work on <tt class="docutils literal">ARMv6 </tt>and <tt class="docutils literal">ARMv7</tt> devices running Debian or Raspbian. <tt class="docutils literal">armel</tt> builds are built using the <tt class="docutils literal"><span class="pre">gcc-arm-linux-gnuebi</span></tt> toolchain provided by Ubuntu and currently target <tt class="docutils literal">ARMv7</tt>.<br> +<br> +Windows 64 work is still stalling, we would welcome a volunteer to handle that.</div> +<div class="section" id="how-to-use-pypy"> +<h3> +<br></h3> +<h3> +How to use PyPy?</h3> +We suggest using PyPy from a <a class="reference external" href="https://www.virtualenv.org/en/latest/">virtualenv</a>. Once you have a virtualenv installed, you can follow instructions from <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-using-virtualenv">pypy documentation</a> on how to proceed. 
This document also covers other <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-pypy">installation schemes</a>.<br> +<br> +Cheers,<br> +<br> +the PyPy team.</div>https://www.pypy.org/posts/2013/07/pypy-21-beta-1351105697755187196.htmlThu, 11 Jul 2013 10:36:00 GMTAlmost There - PyPy's ARM Backendhttps://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.htmlDavid Schneider<div style="text-align: left;"> +In this post I want to give an update on the status of the ARM backend for PyPy's JIT and describe some of the issues and details of the backend.</div> +<div class="section" id="current-status"> +<br> +<h2> + + + + +Current Status</h2> +It has been more than a year that I have been working on the ARM backend. It is now in a shape where we can measure meaningful numbers and also ask for some feedback. Since the <a class="reference external" href="https://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html">last post about the backend</a> we have added support for floating point operations as well as for PyPy's framework GCs. Another area of work was to keep up with the constant improvements done in the main development branch, such as out-of-line guards, labels, etc. It has been possible for about a year to cross-translate the PyPy Python interpreter and other interpreters such as <a class="reference external" href="https://bitbucket.org/cfbolz/pyrolog/">Pyrolog</a>, with a JIT, to run benchmarks on ARM. Up until now there remained some hard to track bugs that would cause the interpreter to crash with a segmentation fault in certain cases when running with the JIT on ARM. Lately it was possible to run all benchmarks without problems, but when running the translation toolchain itself it would crash. 
During the last PyPy sprint in <a class="reference external" href="https://www.pypy.org/posts/2011/12/leysin-winter-sprint-6862532189897876336.html">Leysin</a> Armin and I managed to fix several of these hard to track bugs in the ARM backend with the result that it is now possible to run the PyPy translator on ARM itself (at least until it runs out of memory), which is a kind of litmus test for the backend itself and used to crash before. Just to point it out, we are not able to complete a PyPy translation on ARM, because on the hardware we have currently available there is not enough memory. But up to the point we run out of memory the JIT does not hit any issues.<br> +<br></div> +<div class="section" id="implementation-details"> +<h2> + + + + +Implementation Details</h2> +The hardware requirements to run the JIT on ARM follow those for Ubuntu on ARM which targets ARMv7 with a VFP unit running in little endian mode. The JIT can be translated without floating point support, but there might be a few places that need to be fixed to fully work in this setting. We are targeting the ARM instruction set, because at least at the time we decided to use it, it seemed to be the best choice in terms of speed while having some size overhead compared to the Thumb2 instruction set. It appears that the Thumb2 instruction set should give comparable speed with better code density but has a few restrictions on the number of registers available and the use of conditional execution. Also the implementation is a bit easier using a fixed width instruction set and we can use the full set of registers in the generated code when using the ARM instruction set.<br> +<br></div> +<div class="section" id="the-calling-convention-on-arm"> +<h2> + + + + +The calling convention on ARM</h2> +The calling convention on ARM uses 4 of the general purpose registers to pass arguments to functions, further arguments are passed on the stack. 
The presence of a floating point unit is not required for ARM cores, for this reason there are different ways of handling floats with relation to the calling convention. There is a so called soft-float calling convention that is independent of the presence of a floating point unit. For this calling convention floating point arguments to functions are stored in the general purpose registers and on the stack. Passing floats around this way works with software and hardware floating point implementations. But in presence of a floating point unit it produces some overhead, because floating point numbers need to be moved from the floating point unit to the core registers to do a call and moved back to the floating point registers by the callee. The alternative calling convention is the so-called hard-float calling convention which requires the presence of a floating point unit but has the advantage of getting rid of the overhead of moving floating point values around when performing a call. Although it would be better in the long term to support the hard-float calling convention, we need to be able to interoperate with external code compiled for the operating system we are running on. For this reason at the moment we only support the soft-float to interoperate with external code. We implemented and tested the backend on a <a class="reference external" href="https://beagleboard.org/hardware-xM/">BeagleBoard-xM</a> with a <a class="reference external" href="https://www.arm.com/products/processors/cortex-a/cortex-a8.php">Cortex-A8</a> processor running <a class="reference external" href="https://wiki.ubuntu.com/ARM">Ubuntu 11.04 for ARM</a>.<br> +<br></div> +<div class="section" id="translating-for-arm"> +<h2> + + + + +Translating for ARM</h2> +The toolchain used to translate PyPy currently is based on a <a class="reference external" href="https://maemo.gitorious.org/scratchbox2/pages/Home">Scratchbox2</a>. Scratchbox2 is a cross-compiling environment. 
Development had stopped for a while, but it seems to have revived again. We run a 32-bit Python interpreter on the host system and perform all calls to the compiler using a Scratchbox2 based environment. A description on how to set up the cross translation toolchain can be found <a class="reference external" href="https://bitbucket.org/pypy/pypy/src/1f07ea8076c9/pypy/doc/arm.rst">here</a>.<br> +<br></div> +<div class="section" id="results"> +<h2> + + + + +Results</h2> +The current results on ARM, as shown in the graph below, show that the JIT currently gives a speedup of about 3.5 times compared to CPython on ARM. The benchmarks were run on the aforementioned BeagleBoard-xM with a 1GHz ARM Cortex-A8 processor and 512MB of memory. The operating system on the board is Ubuntu 11.04 for ARM. We measured the PyPy interpreter with the JIT enabled and disabled comparing each to CPython Python 2.7.1+ (r271:86832) for ARM. The graph shows the speedup or slowdown of both PyPy versions for the different benchmarks from our benchmark suite normalized to the runtime of CPython. The data used for the graph can be seen below.<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://2.bp.blogspot.com/-uckc9tOWgnM/TykHMuuGT9I/AAAAAAAAAKg/J8_fC6RS-QA/s1600/graph.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="258" src="https://2.bp.blogspot.com/-uckc9tOWgnM/TykHMuuGT9I/AAAAAAAAAKg/J8_fC6RS-QA/s400/graph.png" width="400"></a></div> +<br> +The speedup is less than the speedup of 5.2 times we currently get on x86 on our own benchmark suite (see <a class="reference external" href="https://speed.pypy.org/">https://speed.pypy.org</a> for details). There are several possible reasons for this. Comparing the results for the interpreter without the JIT on ARM and x86 suggests that the interpreter generated by PyPy, without the JIT, has a worse performance when compared to CPython than it does on x86. 
Also it is quite possible that the code we are generating with the JIT is not yet optimal. Also there are some architectural constraints produce some overhead. One of these differences is the handling of constants, most ARM instructions only support 8 bit (that can be shifted) immediate values, larger constants need to be loaded into a register, something that is not necessary on x86.<br> +<br> +<table border="1" class="docutils"><colgroup></colgroup><colgroup><col width="40%"></colgroup><colgroup><col width="32%"></colgroup><colgroup><col width="28%"></colgroup><tbody valign="top"> +<tr><td>Benchmark</td><td>PyPy JIT</td><td>PyPy no JIT</td></tr> +<tr><td>ai</td><td>0.484439780047</td><td>3.72756749625</td></tr> +<tr><td>chaos</td><td>0.0807291691934</td><td>2.2908692212</td></tr> +<tr><td>crypto_pyaes</td><td>0.0711114832245</td><td>3.30112318509</td></tr> +<tr><td>django</td><td>0.0977743245519</td><td>2.56779947601</td></tr> +<tr><td>fannkuch</td><td>0.210423735698</td><td>2.49163632938</td></tr> +<tr><td>float</td><td>0.154275334675</td><td>2.12053281495</td></tr> +<tr><td>go</td><td>0.330483034202</td><td>5.84628320479</td></tr> +<tr><td>html5lib</td><td>0.629264389862</td><td>3.60333138526</td></tr> +<tr><td>meteor-contest</td><td>0.984747426912</td><td>2.93838610037</td></tr> +<tr><td>nbody_modified</td><td>0.236969593082</td><td>1.40027234936</td></tr> +<tr><td>pyflate-fast</td><td>0.367447191807</td><td>2.72472422146</td></tr> +<tr><td>raytrace-simple</td><td>0.0290527461437</td><td>1.97270054339</td></tr> +<tr><td>richards</td><td>0.034575573553</td><td>3.29767342015</td></tr> +<tr><td>slowspitfire</td><td>0.786642551908</td><td>3.7397367403</td></tr> +<tr><td>spambayes</td><td>0.660324379456</td><td>3.29059863111</td></tr> +<tr><td>spectral-norm</td><td>0.063610783731</td><td>4.01788986233</td></tr> +<tr><td>spitfire</td><td>0.43617131165</td><td>2.72050579076</td></tr> +<tr><td>spitfire_cstringio</td><td>0.255538702134</td><td>1.7418593111</td></tr> 
+<tr><td>telco</td><td>0.102918930413</td><td>3.86388866047</td></tr> +<tr><td>twisted_iteration</td><td>0.122723986805</td><td>4.33632475491</td></tr> +<tr><td>twisted_names</td><td>2.42367797135</td><td>2.99878698076</td></tr> +<tr><td>twisted_pb</td><td>1.30991837431</td><td>4.48877805486</td></tr> +<tr><td>twisted_tcp</td><td>0.927033354055</td><td>2.8161624665</td></tr> +<tr><td>waf</td><td>1.02059811932</td><td>1.03793427321</td></tr> +</tbody></table> +</div> +<br> +<br> +<div class="section" id="the-next-steps-and-call-for-help"> +<h2> + + + + +The next steps and call for help</h2> +Although there probably still are some remaining issues which have not surfaced yet, the JIT backend for ARM is working. Before we can merge the backend into the main development line there are some things that we would like to do first, in particular we are looking for a way to run all the PyPy tests to verify that things work on ARM before we can merge. Additionally there are some other longterm ideas. To do this we are looking for people willing to help, either by contributing to implement the open features or that can help us with hardware to test.<br> +<br> +The incomplete list of open topics:<br> +<ul class="simple"> +<li>We are looking for a better way to translate PyPy for ARM, than the one described above. I am not sure if there currently is hardware with enough memory to directly translate PyPy on an ARM based system, this would require between 1.5 and 2 Gig of memory. 
A fully <a class="reference external" href="https://wiki.qemu.org/Main_Page">QEMU</a> based approach could also work, instead of Scratchbox2 that uses QEMU under the hood.</li> +<li>Test the JIT on different hardware.</li> +<li>Experiment with the JIT settings to find the optimal thresholds for ARM.</li> +<li>Continuous integration: We are looking for a way to run the PyPy test suite to make sure everything works as expected on ARM, here QEMU also might provide an alternative.</li> +<li>A long term plan would be to port the backend to ARMv5 ISA and improve the support for systems without a floating point unit. This would require implementing the ISA, creating different code paths and improving the instruction selection depending on the target architecture.</li> +<li>Review of the generated machine code the JIT generates on ARM to see if the instruction selection makes sense for ARM.</li> +<li>Build a version that runs on Android.</li> +<li>Improve the tools, i.e. integrate with <a class="reference external" href="https://bitbucket.org/pypy/jitviewer">jitviewer</a>.</li> +</ul> +So if you are interested or willing to help in any way contact us.</div>armjitpypyhttps://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.htmlWed, 01 Feb 2012 09:43:00 GMTA JIT Backend for ARM Processorshttps://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.htmlDavid Schneider<div class="document" id="a-jit-backend-for-arm-processors"> +In the past few months, I have been developing as a part of my master's thesis +the ARM backend for the PyPy JIT, in the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/arm-backend-2">arm-backend</a> branch. 
Currently, it is still a work in progress: all integer and object operations are working and +the support for <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/arm-backed-float">floating point</a> is also under development.<br> +ARM processors are very widely used, being deployed in servers, some netbooks +and mainly mobile devices such as phones and tablets. One of our goals is to be +able to run PyPy on phones, especially on Android. Currently it is not yet possible +to translate and compile PyPy for Android automatically, but there has been +some <a class="reference external" href="https://pyppet.blogspot.com/2011/01/android-and-rpython.html">work</a> on using Android's NDK to compile PyPy's generated C code.<br> +The JIT Backend targets the application profile of the ARMv7 instruction set +architecture which is found for example in the Cortex-A8 processors used in many Android powered devices and in Apple's <a class="reference external" href="https://en.wikipedia.org/wiki/Apple_A4">A4 processors</a> built into the latest iOS devices. To develop and +test the backend we are using a <a class="reference external" href="https://beagleboard.org/hardware-xM">BeagleBoard-xM</a> which has a 1 GHz ARM +Cortex-A8 and 512 MB of RAM running the <a class="reference external" href="https://wiki.ubuntu.com/ARM">ARM port</a> of Ubuntu 10.10.<br> +Currently on Linux it is possible to translate and cross-compile PyPy's Python +interpreter as well as other interpreters with the ARM JIT backend enabled +using Scratchbox 2 to provide a build environment and the GNU ARM cross +compilation toolchain. 
So far the backend only supports the <a class="reference external" href="https://www.hpl.hp.com/personal/Hans_Boehm/gc/">Boehm</a> garbage +collector which does not produce the best results combined with the JIT, but we +plan to add support for the other GCs in the future, doing so should increase +the performance of PyPy on ARM.<br> +While still debugging the last issues with the backend we already can run some +simple benchmarks on <a class="reference external" href="https://bitbucket.org/cfbolz/pyrolog">Pyrolog</a>, a prolog interpreter written in RPython. +Even using Boehm as the GC the results look very promising. In the benchmarks +we compare Pyrolog to <a class="reference external" href="https://www.swi-prolog.org/">SWI-Prolog</a>, a prolog interpreter written in C, which +is available from the package repositories for Ubuntu's ARM port.<br> +The benchmarks can be found in the <a class="reference external" href="https://bitbucket.org/cfbolz/pyrolog-benchmark">pyrolog-bench</a> repository.<br> +<table border="1" class="docutils"><colgroup></colgroup><colgroup><col width="32%"></colgroup><colgroup><col width="30%"></colgroup><colgroup><col width="25%"></colgroup><colgroup><col width="13%"></colgroup><thead valign="bottom"> +<tr><th class="head">Benchmark</th><th class="head">SWI-Prolog in ms.</th><th class="head">Pyrolog in ms.</th><th class="head">Speedup</th></tr> +</thead><tbody valign="top"> +<tr><td>iterate</td><td>60.0</td><td>6.0</td><td>10.0</td></tr> +<tr><td>iterate_assert</td><td>130.0</td><td>6.0</td><td>21.67</td></tr> +<tr><td>iterate_call</td><td>3310.0</td><td>5.0</td><td>662.0</td></tr> +<tr><td>iterate_cut</td><td>60.0</td><td>359.0</td><td>0.16713</td></tr> +<tr><td>iterate_exception</td><td>4950.0</td><td>346.0</td><td>14.306</td></tr> +<tr><td>iterate_failure</td><td>400.0</td><td>127.0</td><td>3.1496</td></tr> +<tr><td>iterate_findall</td><td>740.0</td><td>No res.</td><td></td></tr> 
+<tr><td>iterate_if</td><td>140.0</td><td>6.0</td><td>23.333</td></tr> +</tbody></table> +The iterate_call benchmark, which constructs a predicate and calls it at +runtime, with a speedup of 662 times over SWI-Prolog is an example where the +JIT can show its strength. The Pyrolog interpreter and the JIT treat +dynamically defined predicates as static ones and can generate optimized code +in both cases, whereas SWI only compiles statically defined rules and has to +fall back to interpretation on dynamic ones.<br> +For simple benchmarks running on PyPy's Python interpreter we see some speedups +over CPython, but we still need to debug the backend a bit more before we can +show numbers on more complex benchmarks. So, stay tuned.</div>https://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.htmlSat, 29 Jan 2011 14:19:00 GMT \ No newline at end of file diff --git a/authors/hakan-ardo.html b/authors/hakan-ardo.html new file mode 100644 index 000000000..02dfae882 --- /dev/null +++ b/authors/hakan-ardo.html @@ -0,0 +1,116 @@ + + + + + +Posts by Hakan Ardo | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/hakan-ardo.xml b/authors/hakan-ardo.xml new file mode 100644 index 000000000..443773867 --- /dev/null +++ b/authors/hakan-ardo.xml @@ -0,0 +1,565 @@ + +PyPy (Posts by Hakan Ardo)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPlaying with Linear Programming on PyPyhttps://www.pypy.org/posts/2011/05/playing-with-linear-programming-on-pypy-4040572987275633047.htmlHakan Ardo<p>Fancy hi-level interfaces often come with a high runtime overhead +making them slow. Here is an experiment with building such an +interface using constructions that PyPy should be good at +optimizing. The idea is to allow the JIT in PyPy to remove the +overhead introduced by using a fancy high-level python interface +on top of a low-level C interface. The application considered is +<a href="https://en.wikipedia.org/wiki/Linear_programming">Linear +programming</a>. It is a tool used to solve linear optimization +problems. 
It can for example be used to find the nonnegative values +x, y and z that gives the maximum value of + +</p><center> +<img src="https://4.bp.blogspot.com/-WZq2bkIyCu8/Tct9px6L9vI/AAAAAAAAAEc/1cAAIqo6Lbk/s320/eqsource1.png"> +</center> + +without violating the constraints + +<center> +<img alt="" border="0" id="BLOGGER_PHOTO_ID_5603193483237811810" src="https://2.bp.blogspot.com/-Sz2pbOB-2jI/TcKKyTgsZmI/AAAAAAAAAEE/_B5_wHCXbxE/s320/eqsource4.png" style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; cursor: hand; width: 143px; height: 15px;"> +<img alt="" border="0" id="BLOGGER_PHOTO_ID_5603193442782682514" src="https://4.bp.blogspot.com/-Bh4OdLbZN_0/TcKKv8zcHZI/AAAAAAAAAD8/CXBq2l48HV4/s320/eqsource3.png" style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; cursor: hand; width: 145px; height: 16px;"> +<img alt="" border="0" id="BLOGGER_PHOTO_ID_5603193369540695138" src="https://3.bp.blogspot.com/-6mQW30hs9vE/TcKKrr9MFGI/AAAAAAAAAD0/_x8dND-knN4/s320/eqsource2.png" style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; cursor: hand; width: 114px; height: 15px;"> +</center> + +There exists general purpose solvers for these kind of problems that +are very fast and can literally handle millions of variables. To use +them however the problem has to be transformed into some specific +matrix form, and the coefficients of all the matrices +has to be passed to the solver using some API. This transformation is +a tedious and error prone step that forces you to work with matrix +indexes instead of readable variable names. Also it makes maintaining +an implementation hard since any modification has to be transformed +too. + +<p></p> +The example above comes from the manual of +the <a href="ftp://ftp.gnu.org/pub/gnu/glpk/">glpk</a> library. 
That +manual continues by describing how to convert this problem into the +standard form of glpk (which involves introducing three new variables) +and then gives the <a href="https://bitbucket.org/hakanardo/pplp/src/825cdbc5cae6/blog/sample.c">c-code</a> needed to call the +library. Relating that c-code to the problem above without the +intermediate explanation of the manual is not easy. A common +solution here is to build a hi-level interface that allows a more +natural way of defining the matrices and/or allow the equations to be +entered symbolically. Unfortunately, such interfaces often become +slow. For the benchmark below for example, +<a href="https://abel.ee.ucla.edu/cvxopt">cvxopt</a> +requires 20 minutes to setup a problem that takes 9.43 seconds to solve +(this seems a bit extreme, am I doing something wrong?). + +<p></p> +The high-level interface I constructed on top of the +<a href="ftp://ftp.gnu.org/pub/gnu/glpk/">glpk</a> library is +<a href="https://bitbucket.org/hakanardo/pplp">pplp</a> and it allows +the equations to be entered symbolically. The above problem can be +solved using +<pre> + lp = LinearProgram() + x, y, z = lp.IntVar(), lp.IntVar(), lp.IntVar() + lp.objective = 10*x + 6*y + 4*z + lp.add_constraint( x + y + z &lt;= 100 ) + lp.add_constraint( 10*x + 4*y + 5*z &lt;= 600 ) + lp.add_constraint( 2*x + 2*y + 6*z &lt;= 300 ) + lp.add_constraint( x &gt;= 0 ) + lp.add_constraint( y &gt;= 0 ) + lp.add_constraint( z &gt;= 0 ) + + maxval = lp.maximize() + print maxval + print x.value, y.value, z.value +</pre> + +<p></p> +To benchmark the API I used it to solve a +<a href="https://en.wikipedia.org/wiki/Minimum-cost_flow_problem">minimum-cost + flow problem</a> with 154072 nodes and 390334 arcs. The C library + needs 9.43 s to solve this and the pplp interface adds another 5.89 + s under PyPy and 28.17 s under CPython. 
A large amount of time is + still spent setting up the problem, but it's a significant + improvement over the 20 minutes required on CPython by + <a href="https://abel.ee.ucla.edu/cvxopt">cvxopt</a>. It is + probably not designed to be fast on this kind of benchmark. I have + not been able to get cvxopt to work under PyPy. The benchmark used is + available <a href="https://bitbucket.org/hakanardo/pplp/src/default/benchmark/">here</a>https://www.pypy.org/posts/2011/05/playing-with-linear-programming-on-pypy-4040572987275633047.htmlWed, 11 May 2011 12:27:00 GMTLoop invariant code motionhttps://www.pypy.org/posts/2011/01/loop-invariant-code-motion-1998392217676829154.htmlHakan Ardo<p>Recently, the jit-unroll-loops branch was merged. It implements the +idea described in +<a href="https://www.pypy.org/posts/2010/09/using-escape-analysis-across-loop-2887031293132023676.html">Using Escape Analysis Across Loop Boundaries for Specialization</a>. +That post only talks about virtuals, but the idea turned out +to be more far-reaching. After the metainterpreter produces a trace, +several optimizations are applied to the trace before it is turned +into binary code. Removing allocations is only one of them. There are also +for instance +</p><ul> +<li> Heap optimizations that remove memory accesses by reusing results + previously read from or written to the same location. +</li><li> Reusing of the results of pure operations if the same pure + operation is executed twice. +</li><li> Removal of redundant guards. +</li><li> ... +</li></ul> +A lot of these optimizations are in one way or another removing +operations from the trace and/or reusing previous results. All of these +optimizations could benefit from being able to operate across loop +boundaries. Not only in the sense that operations operating on loop +invariants could be moved out of the loop entirely. 
But also that +results produced at the end of an iteration could be reused at the +beginning of the next even if there are no loop invariants involved. + +<p> + +This is achieved by unrolling the trace into two iterations, and +letting the optimizer work on this two-iteration-trace. +The optimizer will now be able to optimize the second iteration more than the +first since it can reuse results from the first iteration. The +optimized version of the first iteration we call the <em>preamble</em> and the +optimized version of the second iteration we call the <em>loop</em>. The +preamble will end with a jump to the loop, while the loop will end +with a jump to itself. This means that the preamble will be executed +once for the first iteration, the loop will be executed for all following +iterations. + +</p><p> +</p><h2>Sqrt example</h2> +Here is an example of a Python implementation of sqrt using a fairly +simple algorithm + +<p> +<!-- pygmentize -f html -O full -o t.html t.py --> + + +</p><div class="highlight"><pre><span class="k">def</span> <span class="nf">sqrt</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="mi">10000</span><span class="p">):</span> + <span class="n">x</span> <span class="o">=</span> <span class="n">y</span> <span class="o">/</span> <span class="mi">2</span> + <span class="k">while</span> <span class="n">n</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span> + <span class="n">n</span> <span class="o">-=</span> <span class="mi">1</span> + <span class="n">x</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span> <span class="o">+</span> <span class="n">y</span><span class="o">/</span><span class="n">x</span><span class="p">)</span> <span class="o">/</span> <span class="mi">2</span> + <span class="k">return</span> <span class="n">x</span> +</pre></div> +<p> + +If it is called with <tt>sqrt(1234.0)</tt>, 
+<a href="https://bitbucket.org/hakanardo/hakanardo/src/edbd398575ad/sqrt/noopt.txt">a fairly long trace</a> is produced. From this trace +the optimizer creates +the +following preamble (Loop 1) and loop (Loop 0) + + +</p><p> +<!-- +<img src="trace1.png"> +<a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="https://2.bp.blogspot.com/_Ti798w4YGGI/TSyurBi3pLI/AAAAAAAAADA/IJQ0Ki4Pu0E/s1600/trace1.png"><img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 225px; height: 320px;" src="https://2.bp.blogspot.com/_Ti798w4YGGI/TSyurBi3pLI/AAAAAAAAADA/IJQ0Ki4Pu0E/s320/trace1.png" border="0" alt=""id="BLOGGER_PHOTO_ID_5561011694068016306" /></a> +--> +<img src="https://2.bp.blogspot.com/_Ti798w4YGGI/TSyurBi3pLI/AAAAAAAAADA/IJQ0Ki4Pu0E/s1600/trace1.png"> +</p><p> + +Looking at the preamble, it starts by making sure that it is not +currently being profiled, the guard +on <tt>i5</tt>, and that the function object have not been changed +since the trace was made, the guard on <tt>p3</tt>. Somewhat +intermixed with that, the +integer variable <tt>n</tt> is unboxed, by making sure <tt>p11</tt> +points to an integer object and reading out the integer value from +that object. +These operations are not needed in the +loop (and have been removed from it) as emitting the same guards again +would be redundant and <tt>n</tt> becomes a virtual before the +end of the preamble. +</p><pre> + guard_value(i5, 0, descr=&lt;Guard6&gt;) + guard_nonnull_class(p11, ConstClass(W_IntObject), descr=&lt;Guard7&gt;) + guard_value(p3, ConstPtr(ptr15), descr=&lt;Guard8&gt;) + i16 = getfield_gc_pure(p11, descr=&lt;W_IntObject.inst_intval&gt;) +</pre> + +Next comes a test and a guard implementing the while statement +followed by the decrementing of <tt>n</tt>. 
These operation appear +both in the preamble and in the loop +<pre> + i18 = int_gt(i16, 0) + guard_true(i18, descr=&lt;Guard9&gt;) + i20 = int_sub(i16, 1) +</pre> + +After that the two floating point variables <tt>x</tt> and <tt>y</tt> +are unboxed. Again this is only needed in the preamble. Note how the +unboxed value of <tt>y</tt>, called <tt>f23</tt>, is passed unchanged +from the preamble to the loop in arguments of the jump +to allow it to be reused. It will not become a virtual +since it is never changed within the loop. +<pre> + guard_nonnull_class(p12, 17652552, descr=&lt;Guard10&gt;) + guard_nonnull_class(p10, 17652552, descr=&lt;Guard11&gt;) + f23 = getfield_gc_pure(p10, descr=&lt;W_FloatObject.inst_floatval&gt;) + f24 = getfield_gc_pure(p12, descr=&lt;W_FloatObject.inst_floatval&gt;) +</pre> + +Following that is the actual calculations performed in the loop in +form of floating point operations (since the function was called with +a float argument). These appear in both the loop +and the preamble. +<pre> + i26 = float_eq(f24, 0.000000) + guard_false(i26, descr=&lt;Guard12&gt;) + f27 = float_truediv(f23, f24) + f28 = float_add(f24, f27) + f30 = float_truediv(f28, 2.000000) +</pre> + +Finally there are some tests checking if a signal was received +(such as when the user presses ctrl-C) and thus should execute some +signal handler or if we need to hand over to another thread. This is +implemented with a counter that is decreased once every iteration. It +will go below zero after some specific number of iterations, tunable by +<tt>sys.setcheckinterval</tt>. The counter is read from and written to +some global location where it also can be made negative by a C-level +signal handler. 
+<pre> + i32 = getfield_raw(32479328, descr=&lt;pypysig_long_struct.c_value&gt;) + i34 = int_sub(i32, 2) + setfield_raw(32479328, i34, descr=&lt;pypysig_long_struct.c_value&gt;) + i36 = int_lt(i34, 0) + guard_false(i36, descr=&lt;Guard13&gt;) + jump(p0, p1, p2, p4, p10, i20, f30, f23, descr=&lt;Loop0&gt;) +</pre> + +<p> +</p><h2>Bridges</h2> + +When a guard fails often enough, the meta-interpreter is started again +to produce a new trace starting at the failing guard. The tracing is +continued until a previously compiled loop is entered. This could +either be the same loop that contains the failing guard +or some completely different loop. If it is the same loop, executing +the preamble again may be unnecessary. +It is preferable to end the bridge with a jump directly to +the loop. To achieve this the optimizer tries to produce <i>short + preambles</i> that are inlined at the end of bridges allowing +them to jump directly to the loop. Inlining is better than jumping to +a common preamble because most of the inlined short preamble can +typically be removed again by the optimizer. +Creating such a short +preamble is however not always possible. Bridges jumping to loops for which +no short preamble can be generated have to end with a jump to the +full preamble instead. + +<p> + +The short preamble is created by comparing the operations in the +preamble with the operations in the loop. The +operations that are in the preamble but not in the loop +are moved to the short preamble whenever it is safe to move them to +the front of the operations remaining. In other words, the full preamble +is equivalent to the short preamble followed by one iteration of the +loop. + +</p><p> + +This much has currently been implemented. To give the full picture +here, there are two more features that +hopefully will be implemented in the near future. +The first is to replace the full preamble, used by the interpreter +when it reaches a compiled loop, with the short preamble. 
+This is currently not done and is probably not as straight forward as +it might first seem. The problem is where to resume interpreting on a +guard failure. However, implementing that should save some +memory. Not only +because the preamble will become smaller, but mainly because the +guards will appear either in the loop or in the preamble, but not +in both (as they do now). That means there will only be a single bridge and +not potentially two copies once the guards are traced. + +</p><p> + +The sqrt example above would with a short preamble result in a trace +like this + +</p><p> +<!-- +<img src="trace2.png"><p> +<a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="https://4.bp.blogspot.com/_Ti798w4YGGI/TS4S_CifokI/AAAAAAAAADI/uPpoIngmD-I/s1600/trace2.png"><img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 306px; height: 320px;" src="https://4.bp.blogspot.com/_Ti798w4YGGI/TS4S_CifokI/AAAAAAAAADI/uPpoIngmD-I/s320/trace2.png" border="0" alt=""id="BLOGGER_PHOTO_ID_5561403464071750210" /></a> +--> +<img src="https://4.bp.blogspot.com/_Ti798w4YGGI/TS4S_CifokI/AAAAAAAAADI/uPpoIngmD-I/s1600/trace2.png"> +</p> +If it is executed long enough, the last guard will be traced to form a +bridge. The trace will inherit the virtuals from its parent. This can +be used to optimize away the part of the inlined short preamble +that deals with virtuals. 
The resulting bridge should look +something like + +<pre> + [p0, p1, p2, p3, p4, f5, i6] + i7 = force_token() + setfield_gc(p1, i7, descr=&lt;PyFrame.vable_token&gt;) + call_may_force(ConstClass(action_dispatcher), p0, p1, descr=&lt;VoidCallDescr&gt;) + guard_not_forced(, descr=&lt;Guard19&gt;) + guard_no_exception(, descr=&lt;Guard20&gt;) + + guard_nonnull_class(p4, 17674024, descr=&lt;Guard21&gt;) + f52 = getfield_gc_pure(p4, descr=&lt;W_FloatObject.inst_floatval&gt;) + jump(p1, p0, p2, p3, p4, i38, f53, f52, descr=&lt;Loop0&gt;) +</pre> + +Here the first paragraph comes from the traced bridge and the second +is what remains of the short preamble after optimization. The +box <tt>p4</tt> is +not a virtual (it contains a pointer to <tt>y</tt> which is never +changed), and it is only virtuals +that the bridge inherit from it's parents. This is why the last two +operations currently cannot be removed. + + +<p> + +Each time the short preamble is inlined, a new copy of each of the +guards in it is generated. Typically the short preamble is inlined in +several places and thus there will be several copies of each of those +guards. +If they fail often enough bridges +from them will be traced (as with all guards). But since there +typically are several copies of each guard the same bridge +will be generated in +several places. To prevent this, mini-bridges from the inlined guards +are produced already during the inlining. These mini-bridges contain +nothing but a jump to the preamble. + +</p><p> +The mini-bridges needs the arguments of the preamble to be able +to jump to it. These arguments contain among other things, boxed +versions of the +variables <tt>x</tt> and <tt>y</tt>. Those variables are virtuals in +the loop, and have to be allocated. Currently those allocations +are placed in front of the inlined guard. Moving those allocations into +the mini-bridges is the second feature that +hopefully will be implemented in the near future. 
+<!-- +The current approach actually kills the entire benefit of the inlining in most +real world cases as typically all the virtuals are forced. +--> +After this feature is +implemented, the result should look something like +</p><p> +<!-- +<img src="trace3.png"><p> +<a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="https://4.bp.blogspot.com/_Ti798w4YGGI/TS4TlHWn2GI/AAAAAAAAADg/xqc4-B4HH3M/s1600/trace3.png"><img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 320px; height: 243px;" src="https://4.bp.blogspot.com/_Ti798w4YGGI/TS4TlHWn2GI/AAAAAAAAADg/xqc4-B4HH3M/s320/trace3.png" border="0" alt=""id="BLOGGER_PHOTO_ID_5561404118199162978" /></a> +--> +<a href="https://4.bp.blogspot.com/_Ti798w4YGGI/TS4TlHWn2GI/AAAAAAAAADg/xqc4-B4HH3M/s1600/trace3.png"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5561404118199162978" src="https://4.bp.blogspot.com/_Ti798w4YGGI/TS4TlHWn2GI/AAAAAAAAADg/xqc4-B4HH3M/s1600/trace3.png" style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; cursor: hand; width: 597px; height: 454px;"></a> + +</p><p> +</p><h2>Multiple specialized versions</h2> + +Floating point operations were generated in the trace above +because <tt>sqrt</tt> was called with a float argument. If it is +instead called with an int argument, integer operations will be generated. The +somewhat more complex situations is when both int's and float's are +used as arguments. Then the jit need to generate multiple versions of +the same loop, specialized in different ways. The details, given +below, on how this is achieved is somewhat involved. For the casual +reader it would make perfect sense to skip to the next section here. + +<p> + +Consider the case when <tt>sqrt</tt> is first called with a float +argument (but with <tt>n</tt> small enough not to generate the +bridge). Then the trace shown above will be +generated. 
If <tt>sqrt</tt> is now called with an int argument, the +guard in the preamble testing that the type of the input object is float +will fail: +</p><pre> + guard_nonnull_class(p12, 17652552, descr=&lt;Guard10&gt;) +</pre> +It will fail every iteration, so soon enough a bridge will be +generated from this guard in the preamble. This guard will end with a +jump to the same loop, and the optimizer will try to inline +the short preamble at the end of it. This will however fail +since now there are two guards on <tt>p12</tt>. One that makes sure it +is an int and and one that makes sure it is a float. The optimizer +will detect that the second guard will always fail and mark the bridge +as invalid. Invalid loops are not passed on to the backend for +compilation. + +<p> + +If a loop is detected to be invalid while inlining the short preamble, +the metainterpreter will continue to trace for yet another +iteration of the loop. This new trace can be compiled as above and +will produce a new loop with a new preamble that are now specialized +for int arguments instead of float arguments. The bridge that +previously became invalid will now be tried again. This time inlining +the short preamble of the new loop instead. This will produce a set of +traces connected like this + +</p><p> + +<a href="https://2.bp.blogspot.com/_Ti798w4YGGI/TS4TdW6kYmI/AAAAAAAAADY/Py2Eh0sNaOk/s1600/trace4mag.png"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5561403875692575442" src="https://2.bp.blogspot.com/_Ti798w4YGGI/TS4TW_8i-tI/AAAAAAAAADQ/Ssq63lSRDIU/s1600/trace4.png" style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; cursor: hand; width: 630px; height: 603px;"></a> +(click for some hairy details) +</p><p> + +The height of the boxes is this figure represents how many instructions +they contain (presuming the missing features from the previous section +are implemented). 
Loop 0 is specialized for floats and it's preamble have +been split into two boxes at the failing guard. Loop 2 is specialized +for ints and is larger than Loop 0. This is mainly because the integer +division in python does not map to the integer division of the +machine, but have to be implemented with several instructions (integer +division in python truncates its result towards minus +infinity, while the the machine integer division truncates towards +0). Also the height of the bridge is about the same as the height of +Loop 2. This is because it contains a full iteration of the loop. + +</p><p> + +</p><h2>A More Advanced Example</h2> + +Let's conclude with an example that is a bit more advanced, where this unrolling +approach actually outperforms the previous approach. Consider +making a +<a href="https://en.wikipedia.org/wiki/Fixed-point_arithmetic">fixed-point</a> +implementation of the square root using 16 bit's of decimals. This can be +done using the same implementation +of <tt>sqrt</tt> but calling it with an object of a class representing +such fixed-point real numbers: + +<p> +</p><div class="highlight"><pre><span class="k">class</span> <span class="nc">Fix16</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> + <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">val</span><span class="p">,</span> <span class="n">scale</span><span class="o">=</span><span class="bp">True</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="n">Fix16</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">=</span> <span class="n">val</span><span class="o">.</span><span class="n">val</span> + <span class="k">else</span><span class="p">:</span> 
+ <span class="k">if</span> <span class="n">scale</span><span class="p">:</span> + <span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">val</span> <span class="o">*</span> <span class="mi">2</span><span class="o">**</span><span class="mi">16</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">=</span> <span class="n">val</span> + + <span class="k">def</span> <span class="nf">__add__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Fix16</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">+</span> <span class="n">Fix16</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="n">val</span><span class="p">,</span> <span class="bp">False</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">__sub__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Fix16</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">-</span> <span class="n">Fix16</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="n">val</span><span class="p">,</span> <span class="bp">False</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">__mul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> 
+ <span class="k">return</span> <span class="n">Fix16</span><span class="p">((</span><span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">&gt;&gt;</span> <span class="mi">8</span><span class="p">)</span> <span class="o">*</span> <span class="p">(</span><span class="n">Fix16</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="n">val</span> <span class="o">&gt;&gt;</span> <span class="mi">8</span><span class="p">),</span> <span class="bp">False</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">__div__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Fix16</span><span class="p">((</span><span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">&lt;&lt;</span> <span class="mi">16</span><span class="p">)</span> <span class="o">/</span> <span class="n">Fix16</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="n">val</span><span class="p">,</span> <span class="bp">False</span><span class="p">)</span> +</pre></div> + +<p> + +Below is a table comparing the runtime of the sqrt function above with +different argument types on different python interpreters. Pypy 1.4.1 +was released before the optimizations described in this post were in place +while they are in place in the +<a href="https://buildbot.pypy.org/nightly/trunk/pypy-c-jit-40390-e1ab35394b0f-linux64.tar.bz2">nightly + build from January 5</a>, +denoted pypy in the table. There are also the running time for the same +algorithms implemented in C and compiled with "gcc -O3 +-march=native". Tests were executed on a 2.53GHz Intel Core2 +processor with <tt>n=100000000</tt> iterations. 
+Comparing the integer versions with C may be considered a +bit unfair because of the more advanced integer division operator in +python. The left part of this table shows runtimes of <tt>sqrt</tt> in +a program containing a single call to sqrt (i.e. only a single +specialized version of the loop is needed). The right part shows the +runtime of <tt>sqrt</tt> when it has been called with a different +type of argument before. + +</p><p> + +</p><table> + <tr><th></th><th colspan="3">First call</th><th></th><th colspan="3">Second call</th></tr> + <tr><th></th><th>float</th><th>int</th><th>Fix16</th><th>  </th> + <th>float</th><th>int</th><th>Fix16</th></tr> + <tr align="right"><th align="left">cpython</th> + <td> 28.18 s</td> + <td> 22.13 s</td> + <td> 779.04 s</td> + <td></td> + <td> 28.07 s</td> + <td> 22.21 s</td> + <td> 767.03 s</td> + </tr> + <tr align="right"><th align="left">pypy 1.4.1</th> + <td> 1.20 s</td> + <td> 6.49 s</td> + <td> 11.31 s</td> + <td></td> + <td> 1.20 s</td> + <td> 6.54 s</td> + <td> 11.23 s</td> + </tr> + <tr align="right"><th align="left">pypy</th> + <td> 1.20 s</td> + <td> 6.44 s</td> + <td> 6.78 s</td> + <td></td> + <td> 1.19 s</td> + <td> 6.26 s</td> + <td> 6.79 s</td> + </tr> + <tr align="right"><th align="left">gcc</th> + <td> 1.15 s</td> + <td> 1.82 s</td> + <td> 1.89 s</td> + <td></td> + <td> 1.15 s</td> + <td> 1.82 s</td> + <td> 1.89 s</td> + </tr> +</table> + +<p> + +For this to work in the last case, when Fix16 is the argument type in +the second type, +the trace_limit had to be increased from its default value to prevent +the metainterpreter from aborting while tracing the second version of +the loop. Also sys.setcheckinterval(1000000) were used to prevent the +bridge from being generated. With the bridge the performance of the +last case is significantly worse. Maybe because the optimizer currently +fails to generate a short preamble for it. But the slowdown +seems too big for that to be the only explanation. 
Below are the runtimes +numbers with checkinterval set to its default value of 100: + +</p><table> + <tr><th></th><th colspan="3">First call</th><th></th><th colspan="3">Second call</th></tr> + <tr><th></th><th>float</th><th>int</th><th>Fix16</th><th>  </th> + <th>float</th><th>int</th><th>Fix16</th></tr> + <tr align="right"><th align="left">cpython</th> + <td> 28.71 s</td> + <td> 22.09 s</td> + <td> 781.86 s</td> + <td></td> + <td> 28.28 s</td> + <td> 21.92 s</td> + <td> 761.59 s</td> + </tr> + <tr align="right"><th align="left">pypy 1.4.1</th> + <td> 1.21 s</td> + <td> 6.48 s</td> + <td> 11.22 s</td> + <td></td> + <td> 1.72 s</td> + <td> 7.58 s</td> + <td> 12.18 s</td> + </tr> + <tr align="right"><th align="left">pypy</th> + <td> 1.21 s</td> + <td> 6.27 s</td> + <td> 7.22 s</td> + <td></td> + <td> 1.20 s</td> + <td> 6.29 s</td> + <td> 90.47 s</td> + </tr> +</table> + +<p> +</p><h2>Conclusions</h2> +Even though we are seeing speedups in a variety of different small +benchmarks, more complicated examples are not affected much by these +optimizations. It might partly be because larger examples have longer +and more complicated loops, and thus allowing optimizations to operate +across loop boundary will have a smaller relative effect. Another problem is +that with more complicated examples there will be more bridges, and bridges +are currently not handled very well (most of the time all virtuals are +forced at the end of the bridge as explained above). But moving those +forcings into the mini bridges should fix that.https://www.pypy.org/posts/2011/01/loop-invariant-code-motion-1998392217676829154.htmlTue, 11 Jan 2011 19:22:00 GMT \ No newline at end of file diff --git a/authors/hodgestar.html b/authors/hodgestar.html new file mode 100644 index 000000000..5fbcd44f7 --- /dev/null +++ b/authors/hodgestar.html @@ -0,0 +1,116 @@ + + + + + +Posts by hodgestar | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/hodgestar.xml b/authors/hodgestar.xml new file mode 100644 index 000000000..d5ccadd36 --- /dev/null +++ b/authors/hodgestar.xml @@ -0,0 +1,129 @@ + +PyPy (Posts by hodgestar)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssA new chapter for PyPyhttps://www.pypy.org/posts/2020/08/a-new-chapter-for-pypy-8388322709667328389.htmlhodgestar<p><i>PyPy winds down its membership in the Software Freedom Conservancy</i></p> + +<h1>Conservancy and PyPy's great work together</h1> + +<p><a href="https://pypy.org/">PyPy</a> joined <a href="https://sfconservancy.org/">Conservancy</a> in +the <a href="https://sfconservancy.org/blog/2011/jan/02/oct-dec-2010/">second half of 2010</a>, shortly after the release of +PyPy 1.2, the first version to contain a fully functional JIT. <a href="https://lwn.net/Articles/550427/">In 2013</a>, PyPy +started supporting ARM, bringing its just-in-time speediness to many more devices and began working toward supporting NumPy to help +scientists crunch their numbers faster. Together, PyPy and Conservancy ran successful fundraising drives and facilitated payment +and oversight for <a href="https://sfconservancy.org/blog/2016/dec/01/pypy-2016/">contractors and code sprints</a>.</p> + +<p>Conservancy supported PyPy's impressive growth as it expanded support for +different hardware platforms, greatly improved the performance of C extensions, +and added support for Python 3 as the language itself evolved.</p> + +<h1>The road ahead</h1> + +<p>Conservancy provides a fiscal and organizational home for projects that find the +freedoms and guardrails that come along with a charitable home advantageous for +their community goals. 
While this framework was a great fit for the early PyPy +community, times change and all good things must come to an end.</p> + +<p>PyPy will remain a free and open source project, but the community's structure +and organizational underpinnings will be changing and the PyPy community will be +exploring options outside of the charitable realm for its next phase of growth +("charitable" in the legal sense -- PyPy will remain a community project).</p> + +<p>During the last year PyPy and Conservancy have worked together to properly +utilise the generous donations made by stalwart PyPy enthusiasts over the years +and to wrap up PyPy's remaining charitable obligations. PyPy is grateful for +the Conservancy's help in shepherding the project toward its next chapter.</p> + +<h1>Thank yous</h1><p>From Conservancy: <br></p><p style="text-align: left;"></p><blockquote>"We are happy that Conservancy was able to help PyPy bring important software +for the public good during a critical time in its history. We wish the +community well and look forward to seeing it develop and succeed in new ways." <br></blockquote><blockquote>— Karen Sandler, Conservancy's Executive Director</blockquote><p></p><p>From PyPy:</p><p></p><div style="text-align: left;"><div style="text-align: left;"><blockquote><p>"PyPy would like to thank Conservancy for their decade long support in +building the community and wishes Conservancy continued success in their +journey promoting, improving, developing and defending free and open source +software." <br></p></blockquote><blockquote><p style="text-align: left;">— Simon Cross &amp; Carl Friedrich Bolz-Tereick, on behalf of PyPy.</p></blockquote></div></div><p></p><blockquote> +</blockquote> + +<h1>About</h1> + +<p><a class="reference external" href="https://pypy.org/">PyPy</a> is a multi-layer python interpreter with a built-in JIT compiler that runs +Python quickly across different computing environments. 
+<a class="reference external" href="https://sfconservancy.org/">Software Freedom Conservancy</a> (Conservancy) is a charity that provides a home +to over forty free and open source software projects.</p>pypyhttps://www.pypy.org/posts/2020/08/a-new-chapter-for-pypy-8388322709667328389.htmlWed, 12 Aug 2020 19:00:00 GMTLeysin 2020 Sprint Reporthttps://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlhodgestar<p>At the end of February ten of us gathered in Leysin, Switzerland to work on<br> +a variety of topics including <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3.7">PyPy Python 3.7</a> support and the PyPy<br> +migration to <a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>.<br> +<br> +</p><div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s1600/2020_leysin_sprint_attendees.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="180" src="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s320/2020_leysin_sprint_attendees.jpg" width="320"></a></div> +<br> +We had a fun and productive week. The snow was beautiful. There was skiing<br> +and lunch at the top of <a class="reference external" href="https://en.wikipedia.org/wiki/Berneuse">Berneuse</a>, cooking together, some late nights at<br> +the pub next door, some even later nights coding, and of course the<br> +obligatory cheese fondue outing.<br> +<br> +There were a few of us participating in a PyPy sprint for the first time<br> +and a few familiar faces who had attended many sprints. 
Many different<br> +projects were represented including PyPy, <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a>,<br> +<a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>, and <a class="reference external" href="https://github.com/dgrunwald/rust-cpython">rust-cpython</a>. The atmosphere was relaxed and welcoming, so if<br> +you're thinking of attending the next one -- please do!<br> +<br> +Topics worked on:<br> +<br> +<h2> +HPy</h2> +HPy is a new project to design and implement a better API for extending<br> +Python in C. If you're unfamiliar with it you can read more about it at<br> +<a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>.<br> +<br> +A lot of attention was devoted to the Big HPy Design Discussion which<br> +took up two full mornings. So much was decided that this will likely<br> +get its own detailed write-up, but bigger topics included:<br> +<ul class="simple"> +<li>the HPy GetAttr, SetAttr, GetItem and SetItem methods,</li> +<li>HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions<br> +that pass void* pointers to callbacks,</li> +<li>avoiding having va_args as part of the ABI,</li> +<li>exception handling,</li> +<li>support for creating custom types.</li> +</ul> +Quite a few things got worked on too:<br> +<ul class="simple"> +<li>implemented support for writing methods that take keyword arguments with<br> +HPy_METH_KEYWORDS,</li> +<li>implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,</li> +<li>started implementing support for adding custom types,</li> +<li>started implementing dumping JSON objects in ultrajson-hpy,</li> +<li>refactored the PyPy GIL to improve the interaction between HPy and<br> +PyPy's cpyext,</li> +<li>experimented with adding HPy support to rust-cpython.</li> +</ul> +And there was some discussion of the next steps of the HPy 
initiative<br> +including writing documentation, setting up websites and funding, and<br> +possibly organising another HPy gathering later in the year.<br> +<br> +<h2> +PyPy</h2> +<ul class="simple"> +<li>Georges gave a presentation on the Heptapod topic and branch workflows<br> +and showed everyone how to use hg-evolve.</li> +<li>Work was done on improving the PyPy CI buildbot post the move to<br> +heptapod, including a light-weight pre-merge CI and restricting<br> +when the full CI is run to only branch commits.</li> +<li>A lot of work was done improving the -D tests. </li> +</ul> +<br> +<h2> +Miscellaneous</h2> +<ul class="simple"> +<li>Armin demoed VRSketch and NaN Industries in VR, including an implementation<br> +of the Game of Life within NaN Industries!</li> +<li>Skiing!</li> +</ul> +<br> +<h2> +Aftermath</h2> +Immediately after the sprint large parts of Europe and the world were<br> +hit by the COVID-19 epidemic. It was good to spend time together before<br> +travelling ceased to be a sensible idea and many gatherings were cancelled.<br> +<br> +Keep safe out there everyone.<br> +<br> +The HPy &amp; PyPy Team &amp; Friends<br> +<br> +<i>In joke for those who attended the sprint: Please don't replace this blog post<br> +with its Swedish translation (or indeed a translation to any other language :).</i>cpyextCPythonGraalPythonHeptapodhpypypypypy3https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlTue, 17 Mar 2020 21:57:00 GMT \ No newline at end of file diff --git a/authors/holger-krekel.html b/authors/holger-krekel.html new file mode 100644 index 000000000..39c5cf054 --- /dev/null +++ b/authors/holger-krekel.html @@ -0,0 +1,131 @@ + + + + + +Posts by holger krekel | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/holger-krekel.xml b/authors/holger-krekel.xml new file mode 100644 index 000000000..c1773ed9f --- /dev/null +++ b/authors/holger-krekel.xml @@ -0,0 +1,86 @@ + +PyPy (Posts by holger krekel)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssSpeeding up PyPy by donationshttps://www.pypy.org/posts/2010/11/speeding-up-pypy-by-donations-6035529829962326007.htmlholger krekel<div class="document" id="pypy-joins-the-software-freedom-conservancy"> +<h3 class="title">PyPy joins the Software Freedom Conservancy</h3> + +<p>Good news. PyPy is now a member of the Software Freedom Conservancy (SFC), +see <a class="reference external" href="https://sfconservancy.org/news/2010/nov/10/pypy-joins/">the SFC blog post</a>. This allows us to manage non-profit monetary aspects of +the project independently from a company or particular persons. So we +can now officially receive donations both from people prefering right or +left sides, see the <tt class="docutils literal">Donate</tt> buttons on our <a class="reference external" href="https://pypy.org/">home page</a> and our <a class="reference external" href="https://morepypy.blogspot.com/">blog</a>. +And you can use PayPal or Google Checkout, Donations are tax-exempt in the +USA and hopefully soon in Europe as well.</p> +<p>What's it going to get used for? For the immediate future we intend to use +the donations for funding travels of core contributors to PyPy sprints +who otherwise can't afford to come. So if you have no time but some +money you can help to encourage coding contributors to care for PyPy. +If we end up with bigger sums we'll see and take suggestions. Money +spending decisions will be done by core PyPy people according to +non-profit guidelines. 
And we'll post information from time to time +about how much we got and where the money went.</p> +<p>If you have any questions regarding the SFC membership or donations +you may send email to sfc at pypy.org which will be observed +by Carl Friedrich Bolz, Jacob Hallen and Holger Krekel - the initial +PyPy SFC representatives on behalf of the PyPy team. Many thanks go +out to Bradley M. Kuhn for helping to implement the PyPy SFC membership.</p> +<p>cheers,</p> +<p>Holger &amp; Carl Friedrich</p> +</div>https://www.pypy.org/posts/2010/11/speeding-up-pypy-by-donations-6035529829962326007.htmlWed, 10 Nov 2010 15:40:00 GMTPyPy talk at OpenBossa 09https://www.pypy.org/posts/2009/03/pypy-talk-at-openbossa-09-5135830287297423499.htmlholger krekel<p>Yesterday i gave my <a class="reference external" href="https://merlinux.eu/~hpk/pypy-openbossa09.pdf">PyPy status/mobile perspectives</a> at OpenBossa, Nokia's developer conference for embedded platforms in Brazil. Found it a bit of a tough task to do that in 50 minutes. I had some 50, later more developers attending the talk and was happy with the questions and the feedback. Guess it's a good sign if the number of people grows during a talk :) It was the first time i tried to work more with pictures and actually used some deviantart photos from <a class="reference external" href="https://marikaz.deviantart.com/">Marikaz</a> to mark section transitions. I summarize/highlight some key points here in the post.</p> +<p>After intro and 2.5 compatibility status, i talked about our measurements of PyPy's Python on Nokia's N810 internet tablet. The best bit is that for almost all Python data structures PyPy has smaller memory representations than CPython. Particularly good are class instances which often score at 50% of CPython's sizes. Startup time is also often better and can be improved. On the bad side, PyPy's quite large base interpreter size and its bytecode execution is often worse. 
In the talk i also outline ideas for "perfect PYC files" for minimizing module import times and maximizing sharing across interpreter processes. I also briefly discussed the PyPy situation with extension modules and regarding C++ libs. Most of these ideas arose from sprint discussions last year. In the morning i also had some good talk with Stefan Seefeld about Boost Python and the new QT4 bindings. Maybe to use Boost Python is also a good opportunity - but PyPy does not currently have a C-level or C++ level API.</p> +<p>In subsequent lunch discussions people agreed that PyPy has three main interesting areas currently:</p> +<ul class="simple"> +<li>the Python Just-In-Time Compiler</li> +<li>a virtualized, sandboxed Python interpreter</li> +<li>an efficient Python interpreter for small devices</li> +</ul> +<p>I think our upcoming 1.1 release will be a good point in time for many people to look some more into PyPy. I hope we are crossing the chasm soon. It's been a while since the project started :) Getting some more sponsoring to sustain and increase our current efforts probably wouldn't hurt.</p> +<p>Now i am off to spend my last day in Recife / Brazil, fly back to Germany in the evening and then spend time on preparing for Pycon 2009. And I guess i am going to enjoy some naturally cold air - at least my two jogging sessions at Brazilian beaches, at a sustained 30 degrees Celsius, were tough. I guess i shouldn't complain, though :)</p> +<p>Was great meeting all the Brazilian guys and the few women - just had breakfast with Kate Alhola, kernel hacker and working on the new "Fremantle" graphical platform. Many thanks go to Marcio Marcedo and the Python team at <a class="reference external" href="https://www.indt.org/institutional/index.php">INDT</a> who invited me here. 
Hope to come again next year and eventually talk more about the Zone VM :)</p> +<p>If you are interested in some more not so pypy-specific bits about the conference and what i experienced, you might head over to my <a class="reference external" href="https://tetamap.wordpress.com">tetamap</a> blog.</p> +<p>holger</p>https://www.pypy.org/posts/2009/03/pypy-talk-at-openbossa-09-5135830287297423499.htmlThu, 12 Mar 2009 14:00:00 GMTPyPy on Mobiles, at OpenBossahttps://www.pypy.org/posts/2009/03/pypy-on-mobiles-at-openbossa-845760004725129519.htmlholger krekel<p>Next week i am going to give a talk on PyPy at <a class="reference external" href="https://www.bossaconference.indt.org/">OpenBossa</a>, a developer conference on embedded platforms. I've written up a bit more of my background and why i find it very interesting to go there <a class="reference external" href="https://tetamap.wordpress.com">on my blog</a>. Probably will mostly follow up there or on twitter and not much here on the PyPy blog because it's not all about PyPy. To summarize how i see it: i think there is great potential for Python and PyPy on mobiles and am thrilled to hear about what's going on currently and to discuss opportunities.</p> +cheers, holgerhttps://www.pypy.org/posts/2009/03/pypy-on-mobiles-at-openbossa-845760004725129519.htmlWed, 04 Mar 2009 20:36:00 GMTPyPy/Python at the Maemo summithttps://www.pypy.org/posts/2008/09/pypypython-at-maemo-summit-6115106472056714072.htmlholger krekel<p>Maciej and me visited the <a class="reference" href="https://wiki.maemo.org/Maemo_Summit_2008">Maemo Summit</a> in Berlin - +a community meetup around Nokia's Linux based +mobile platform. We spontaneously did a lightning +talk about a first running pypy-c on Maemo +and got nice feedback. 
+ +</p><p>We also had a nice lunch with guys from the <a href="https://www.indt.org.br/institutional/index.php">INDT</a> in Brazil, including Marcio Marcedo and <a href="https://www.marceloeduardo.com/blog/">Marcelo Eduardo</a>. It turns out that Python is used a lot on Maemo, for example the nice <a>Canola</a> UI is done with it. Will be interesting to see how this shapes up in relation to the iPhone and Android. + +</p><p>A lot of Nokia engineers were around and they announced that from October on they are going for weekly new releases of their SDK for the new Fremantle (Maemo-5) debian-based platform until the SDK becomes final - if we got this right. + +</p><p>Funnily enough, we met <a href="https://mg.pov.lt/blog">Marius Gedminas</a> from the Programmers of Vilnius - he gave a lightning talk on his impressions as a community member. We think python programmers really should go much more to non-Python centric conferences. + +</p><p>The whole event took place at the <a href="https://www.c-base.org">C-Base</a> - was a bit +crammed in some of the <a href="https://wiki.maemo.org/Maemo_Summit_2008">sessions</a> with something like 200 people attending. +<br> +cheers, Maciej and Holger</p>https://www.pypy.org/posts/2008/09/pypypython-at-maemo-summit-6115106472056714072.htmlSat, 20 Sep 2008 13:58:00 GMTpylib/py.test 0.9.2 releasedhttps://www.pypy.org/posts/2008/08/pylibpytest-092-released-6233865913406513469.htmlholger krekel<p>PyPy and <a href="https://wyvern.cs.uni-duesseldorf.de/pypytest/summary.html">its 14638 automated tests</a> use the py.test tool which is also used by many other projects. PyPy developers have actually driven and contributed a lot to its development. + +I just released version <a href="https://codespeak.net/py/dist/release-0.9.2.html">0.9.2 of the py lib</a> mainly fixing Windows issues and providing better packaging and integration with setuptools. 
It's usable completely independently from PyPy - "easy_install py" gives you the py.test command line. Of course you can run py.test on top of a translated PyPy version as well. Here is a quick summary of what the py lib provides besides py.test: +</p><ul class="simple"> +<li><a class="reference" href="https://pylib.org/execnet.html">py.execnet</a>: ad-hoc code distribution to SSH, Socket and local sub processes</li> +<li><a class="reference" href="https://pylib.org/greenlet.html">py.magic.greenlet</a>: micro-threads on standard CPython ("stackless-light") and PyPy</li> +<li><a class="reference" href="https://pylib.org/path.html">py.path</a>: path abstractions over local and subversion files</li> +<li><a class="reference" href="https://pylib.org/code.html">py.code</a>: dynamic code compile and traceback printing support</li> +<li>tested against Linux, Win32, OSX, works on python 2.3-2.6</li> +</ul> +Good general entry points for installation and documentation: +<ul> +<li><a href="https://pypi.python.org/pypi/py/">Pypi pages</a> +</li><li><a href="https://codespeak.net/py/0.9.2/download.html">Download/Install</a> +</li><li><a href="https://codespeak.net/py/0.9.2/index.html">Documentation/API</a> +</li></ul> +have fun, holger krekelreleasehttps://www.pypy.org/posts/2008/08/pylibpytest-092-released-6233865913406513469.htmlFri, 22 Aug 2008 12:32:00 GMTPyPy's Python runs Pinax / Djangohttps://www.pypy.org/posts/2008/07/pypys-python-runs-pinax-django-1265543049596913506.htmlholger krekel<p>During the EP2008 sprint we got <a href="https://pinax.hotcluboffrance.com/">Pinax</a> running on top of PyPy. At <a href="https://play1.pypy.org:7788">our play1 server we have it running on top of pypy-c</a>. Not that you'll notice many differences to the original site but that's the point, isn't it? ... 
Well, in fact i am too lazy to customize our play1 version now - i rather spent a nice evening with the other sprint guys :) + +Pinax integrates numerous reusable <a href="https://djangoproject.com">Django</a> apps to take care of the things that many sites have in common. Many thanks particularly to Henrik Vendelbo who sorted out various Pinax and PyPy issues, and wrote up a nice <a href="https://code.djangoproject.com/wiki/DjangoAndPyPy">DjangoAndPyPy wiki page</a> describing the installation process. + +greetings from Vilnius (Lithuania), Holger</p>https://www.pypy.org/posts/2008/07/pypys-python-runs-pinax-django-1265543049596913506.htmlSat, 12 Jul 2008 18:10:00 GMTEP2008: PyPy meets Jythonhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlholger krekel<p>One of the great events at EuroPython 2008 were our chats and meetings with the Jython and Sun people. The Jython people recently are pushing into releasing Python version 2.5 and they currently pursue many interesting sub projects. Coincidentally, PyPy also has tons of interesting areas and results :) So we eventually got into brainstorming a number of possible technical collab ideas. Further below is a first list as i wrote it down from our 10 people PyPy / Jython 30 minute close up meeting yesterday. + +It felt great to be able to talk to the Jython people this way - kudos to Sun for their clear commitments and open ways to go about things! I sense a genuine interest in fair collaboration with non-java developer communities. Seems like they are serious about not focusing on "Java this", "Java that" anymore but rather focus on the JVM platform. Good! And about language +independent interest in ambitious technology. Even Better! I am eager to see how things go from here. 
+ +So here the list of technical collab ideas: +</p><ul><li>ctypes - try to create _rawffi module in Java for Jython, which will enable Jython to reuse our existing ctypes implementation (and have PyPy use the Jython-rawffi for its own for PyPy.JVM)</li><li> generally see to share work / (continue) collaborate regarding extension modules</li><li>Jython/PyPy (and eventually IronPython): document known differences to CPython, maybe in a PEP</li><li>Python Interpreter for Jython (in order to run CPython's .pyc files): re-use pypy's bytecode evaluator, implement a "Jython object space". </li><li>re-use rpython-extension modules for jython (e.g. SRE), by compiling them to Java and reusing as a native library.</li><li>collaborate on testing framework / benchmarking, have a common site to show test results</li><li>make py.test compatible with jython</li><li>come up with a set of "pure Python language" tests, which would gather and refactor tests from CPython, PyPy and Jython. </li><li>look into using java types / jython approaches for implementing free threading.</li><li>share knowledge regarding JIT / psyco +</li></ul>If you have any more ideas, comments or would like to join efforts, let us know! 
+ +Cheers and thanks to <a href="https://www.sauria.com/blog/">Ted Leung</a>, <a href="https://fwierzbicki.blogspot.com/">Frank Wierzbicki</a>, <a href="https://www.zyasoft.com/pythoneering/">Jim Baker</a> and Tobias Ivarsson from Sun and Jython fame respectively, + +Holgerep2008jythonpypysunhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlThu, 10 Jul 2008 08:29:00 GMT \ No newline at end of file diff --git a/authors/index.html b/authors/index.html new file mode 100644 index 000000000..61af5f417 --- /dev/null +++ b/authors/index.html @@ -0,0 +1,132 @@ + + + + + + +Authors | PyPy + + + + + + + + + + + + + + + + Skip to main content + + + \ No newline at end of file diff --git a/authors/maciej-fijalkowski.html b/authors/maciej-fijalkowski.html new file mode 100644 index 000000000..eaebdbcee --- /dev/null +++ b/authors/maciej-fijalkowski.html @@ -0,0 +1,419 @@ + + + + + +Posts by Maciej Fijalkowski | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+

Posts by Maciej Fijalkowski

+ +
+
+ + \ No newline at end of file diff --git a/authors/maciej-fijalkowski.xml b/authors/maciej-fijalkowski.xml new file mode 100644 index 000000000..c06b70f5b --- /dev/null +++ b/authors/maciej-fijalkowski.xml @@ -0,0 +1,1146 @@ + +PyPy (Posts by Maciej Fijalkowski)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy JIT for Aarch64https://www.pypy.org/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> + +<p>Hello everyone.</p> +<p>We are pleased to announce the availability of the new PyPy for AArch64. This +port brings PyPy's high-performance just-in-time compiler to the AArch64 +platform, also known as 64-bit ARM. With the addition of AArch64, PyPy now +supports a total of 6 architectures: x86 (32 &amp; 64bit), ARM (32 &amp; 64bit), PPC64, +and s390x. The AArch64 work was funded by ARM Holdings Ltd. and Crossbar.io.</p> +<p>PyPy has a good record of boosting the performance of Python programs on the +existing platforms. To show how well the new PyPy port performs, we compare the +performance of PyPy against CPython on a set of benchmarks. As a point of +comparison, we include the results of PyPy on x86_64.</p> +<p>Note, however, that the results presented here were measured on a Graviton A1 +machine from AWS, which comes with a very serious word of warning: Graviton A1's +are virtual machines, and, as such, they are not suitable for benchmarking. If +someone has access to a beefy enough (16G) ARM64 server and is willing to give +us access to it, we are happy to redo the benchmarks on a real machine. One +major concern is that while a virtual CPU is 1-to-1 with a real CPU, it is not +clear to us how CPU caches are shared across virtual CPUs. Also, note that by no +means is this benchmark suite representative enough to average the results. 
Read +the numbers individually per benchmark.</p> +<p>The following graph shows the speedups on AArch64 of PyPy (hg id 2417f925ce94) compared to +CPython (2.7.15), as well as the speedups on a x86_64 Linux laptop +comparing the most recent release, PyPy 7.1.1, to CPython 2.7.16.</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://1.bp.blogspot.com/-zC5JsKK5msM/XTmxQdJawEI/AAAAAAAAJgY/mDR_IbpJOAEImVSkGtVb2V5snEtqZcdnQCLcBGAs/s1600/2019-07-arm64-speedups.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="231" src="https://1.bp.blogspot.com/-zC5JsKK5msM/XTmxQdJawEI/AAAAAAAAJgY/mDR_IbpJOAEImVSkGtVb2V5snEtqZcdnQCLcBGAs/s400/2019-07-arm64-speedups.png" width="400"></a></div> + +<p>In the majority of benchmarks, the speedups achieved on AArch64 match those +achieved on the x86_64 laptop. Over CPython, PyPy on AArch64 achieves speedups +between 0.6x to 44.9x. These speedups are comparable to x86_64, where the +numbers are between 0.6x and 58.9x.</p> +<p>The next graph compares between the speedups achieved on AArch64 to the speedups +achieved on x86_64, i.e., how great the speedup is on AArch64 vs. the same +benchmark on x86_64. 
This comparison should give a rough idea about the +quality of the generated code for the new platform.</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://4.bp.blogspot.com/-29YGxYG1SLU/XTmxbjoz9nI/AAAAAAAAJgc/efNeh3P4guwHtgqKXjyMgfwfUbMFl3eDACLcBGAs/s1600/2019-07-arm64-relative.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="133" src="https://4.bp.blogspot.com/-29YGxYG1SLU/XTmxbjoz9nI/AAAAAAAAJgc/efNeh3P4guwHtgqKXjyMgfwfUbMFl3eDACLcBGAs/s400/2019-07-arm64-relative.png" width="400"></a></div> + +<p>Note that we see a large variance: There are generally three groups of +benchmarks - those that run at more or less the same speed, those that +run at 2x the speed, and those that run at 0.5x the speed of x86_64.</p> +<p>The variance and disparity are likely related to a variety of issues, mostly due +to differences in architecture. What <em>is</em> however interesting is that, compared +to measurements performed on older ARM boards, the branch predictor on the +Graviton A1 machine appears to have improved. As a result, the speedups achieved +by PyPy over CPython are smaller than on older ARM boards: sufficiently branchy +code, like CPython itself, simply runs a lot faster. Hence, the advantage +of the non-branchy code generated by PyPy's just-in-time compiler is smaller.</p> +<p>One takeaway here is that many possible improvements for PyPy have yet to be +implemented. This is true for both of the above platforms, but probably more so +for AArch64, which comes with a large number of CPU registers. The PyPy backend +was written with x86 (the 32-bit variant) in mind, which has a really low number +of registers. We think that we can improve in the area of emitting more modern +machine code, which may have a higher impact on AArch64 than on x86_64. There is +also a number of missing features in the AArch64 backend. 
These features are +currently implemented as expensive function calls instead of inlined native +instructions, something we intend to improve.</p> +<p>Best,</p> +<p>Maciej Fijalkowski, Armin Rigo and the PyPy team</p> + +<br></div>https://www.pypy.org/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.htmlThu, 25 Jul 2019 14:41:00 GMTFunding for 64-bit Armv8-a support in PyPyhttps://www.pypy.org/posts/2018/11/hello-everyone-at-pypy-we-are-trying-to-5336557946798583063.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> + +<p>Hello everyone</p> + +<p>At PyPy we are trying to support a relatively wide range of platforms. We have PyPy working on OS X, Windows and various flavors of linux (and unofficially various flavors of BSD) on the software side, with hardware side having x86, x86_64, PPC, 32-bit Arm (v7) and even zarch. This is harder than for other projects, since PyPy emits assembler on the fly from the just in time compiler and it requires significant amount of work to port it to a new platform.</p> + +<p>We are pleased to inform that <a href="https://www.arm.com/">Arm Limited</a>, together with <a href="https://crossbario.com/">Crossbar.io GmbH</a>, are sponsoring the development of 64-bit Armv8-a architecture support through <a href="https://baroquesoftware.com">Baroque Software OU</a>, which would allow PyPy to run on a new variety of low-power, high-density servers with that architecture. 
We believe this will be beneficial for the funders, for the PyPy project as well as to the wider community.</p> + +<p>The work will commence soon and will be done some time early next year with expected speedups either comparable to x86 speedups or, if our <a href="https://www.pypy.org/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.html">current experience with ARM holds</a>, more significant than x86 speedups.</p> + +<p>Best,<br> +Maciej Fijalkowski and the PyPy team</p> +<br></div>https://www.pypy.org/posts/2018/11/hello-everyone-at-pypy-we-are-trying-to-5336557946798583063.htmlThu, 29 Nov 2018 12:09:00 GMTPyPy2.7 and PyPy3.5 v5.10 dual releasehttps://www.pypy.org/posts/2017/12/pypy27-and-pypy35-v510-dual-release-3223396318213306071.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> +<p>The PyPy team is proud to release both PyPy2.7 v5.10 (an interpreter supporting +Python 2.7 syntax), and a final PyPy3.5 v5.10 (an interpreter for Python +3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release.</p> +<p>This release is an incremental release with very few new features, the main +feature being the final PyPy3.5 release that works on linux and OS X with beta +windows support. It also includes fixes for <a class="reference external" href="https://vmprof.readthedocs.io">vmprof</a> cooperation with greenlets.</p> +<p>Compared to 5.9, the 5.10 release contains mostly bugfixes and small improvements. +We have in the pipeline big new features coming for PyPy 6.0 that did not make +the release cut and should be available within the next couple months.</p> +<p>As always, this release is 100% compatible with the previous one and fixed +several issues and bugs raised by the growing community of PyPy users. +As always, we strongly recommend updating.</p> +<p>There are quite a few important changes that are in the pipeline that did not +make it into the 5.10 release. 
Most important are speed improvements to cpyext +(which will make numpy and pandas a bit faster) and utf8 branch that changes +internal representation of unicode to utf8, which should help especially the +Python 3.5 version of PyPy.</p> +<p>This release concludes the Mozilla Open Source <a class="reference external" href="https://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.html">grant</a> for having a compatible +PyPy 3.5 release and we're very grateful for that. Of course, we will continue +to improve PyPy 3.5 and probably move to 3.6 during the course of 2018.</p> +<p>You can download the v5.10 releases here:</p> +<blockquote> +<a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project.</p> +<p>We would also like to thank our contributors and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: <a class="reference external" href="https://www.pypy.org/posts/2017/12/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation +improvements, tweaking popular <a class="reference external" href="https://www.pypy.org/posts/2017/12/project-ideas.html#make-more-python-modules-pypy-friendly">modules</a> to run on pypy, or general <a class="reference external" href="https://www.pypy.org/posts/2017/12/project-ideas.html">help</a> +with making RPython's JIT even better.</p> +<div class="section" id="what-is-pypy"> +<h1>What is PyPy?</h1> +<p>PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7 and CPython 3.5. 
It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 2.7.x</a> performance comparison) +due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>The PyPy release supports:</p> +<blockquote> +<ul class="simple"> +<li><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)</li> +<li>newer <strong>ARM</strong> hardware (ARMv6 or ARMv7, with VFPv3) running Linux,</li> +<li>big- and little-endian variants of <strong>PPC64</strong> running Linux,</li> +<li><strong>s390x</strong> running Linux</li> +</ul> +</blockquote> +</div> +<div class="section" id="changelog"> +<h1>Changelog</h1> +<ul class="simple"> +<li>improve ssl handling on windows for pypy3 (makes pip work)</li> +<li>improve unicode handling in various error reporters</li> +<li>fix vmprof cooperation with greenlets</li> +<li>fix some things in cpyext</li> +<li>test and document the cmp(nan, nan) == 0 behaviour</li> +<li>don't crash when calling sleep with inf or nan</li> +<li>fix bugs in _io module</li> +<li>inspect.isbuiltin() now returns True for functions implemented in C</li> +<li>allow the sequences future-import, docstring, future-import for CPython bug compatibility</li> +<li>Issue #2699: non-ascii messages in warnings</li> +<li>posix.lockf</li> +<li>fixes for FreeBSD platform</li> +<li>add .debug files, so builds contain debugging info, instead of being stripped</li> +<li>improvements to cppyy</li> +<li>issue #2677 copy pure c PyBuffer_{From,To}Contiguous from cpython</li> +<li>issue #2682, split firstword on any whitespace in sqlite3</li> +<li>ctypes: allow ptr[0] = foo when ptr is a pointer to struct</li> +<li>matplotlib will work with tkagg backend once <a class="reference external" 
href="https://github.com/matplotlib/matplotlib/pull/9356">matplotlib pr #9356</a> is merged</li> +<li>improvements to utf32 surrogate handling</li> +<li>cffi version bump to 1.11.2</li> +</ul> +Maciej Fijalkowski, Matti Picus and the whole PyPy team +</div> +<br></div>releasehttps://www.pypy.org/posts/2017/12/pypy27-and-pypy35-v510-dual-release-3223396318213306071.htmlMon, 25 Dec 2017 18:51:00 GMTLet's remove the Global Interpreter Lockhttps://www.pypy.org/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> +<p>Hello everyone</p> +<p>The Python community has been discussing removing the Global Interpreter Lock for +a long time. +There have been various attempts at removing it: +Jython or IronPython successfully removed it with the help of the underlying +platform, and some have yet to bear fruit, like <a class="reference external" href="https://github.com/larryhastings/gilectomy">gilectomy</a>. Since our <a class="reference external" href="https://www.pypy.org/posts/2017/03/leysin-winter-sprint-summary-4587213628578490701.html">February sprint</a> in Leysin, +we have experimented with the topic of GIL removal in the PyPy project. +We believe that the work done in IronPython or Jython can be reproduced with +only a bit more effort in PyPy. Compared to that, removing the GIL in CPython is a much +harder topic, since it also requires tackling the problem of multi-threaded reference +counting. See the section below for further details.</p> +<p>As we announced at EuroPython, what we have so far is a GIL-less PyPy +which can run <strong>very simple</strong> multi-threaded, nicely parallelized, programs. +At the moment, more complicated programs probably segfault. 
The +remaining 90% (and another 90%) of work is with putting locks in strategic +places so PyPy does not segfault during concurrent accesses to +data structures.</p> +<p>Since such work would complicate the PyPy code base and our day-to-day work, +we would like to judge the interest of the community and the commercial +partners to make it happen (we are not looking for individual +donations at this point). We estimate a total cost of $50k, +out of which we already have backing for about 1/3 (with a possible 1/3 +extra from the STM money, see below). This would give us a good +shot at delivering a good proof-of-concept working PyPy with no GIL. If we can get a $100k +contract, we will deliver a fully working PyPy interpreter with no GIL as a release, +possibly separate from the default PyPy release.</p> +<p>People asked several questions, so I'll try to answer the technical parts +here.</p> +<h3>What would the plan entail?</h3> +<p>We've already done the work on the Garbage Collector to allow doing multi- +threaded programs in RPython. "All" that is left is adding locks on mutable +data structures everywhere in the PyPy codebase. Since it would significantly complicate +our workflow, we require real interest in that topic, backed up by +commercial contracts in order to justify the added maintenance burden.</p> +<h3>Why did the STM effort not work out?</h3> +<p>STM was a research project that proved that the idea is possible. However, +the amount of user effort that is required to make programs run in a +parallelizable way is significant, and we never managed to develop tools +that would help in doing so. At the moment we're not sure if more work +spent on tooling would improve the situation or if the whole idea is really doomed. +The approach also ended up adding significant overhead on single threaded programs, +so in the end it is very easy to make your programs slower. 
(We have some money +left in the donation pot for STM which we are not using; according to the rules, we +could declare the STM attempt failed and channel that money towards the present +GIL removal proposal.)</p> +<h3>Wouldn't subinterpreters be a better idea?</h3> +<p>Python is a very mutable language - there is a ton of mutable state and +basic objects (classes, functions,...) that are compile-time in other +languages but runtime and fully mutable in Python. In the end, sharing +things between subinterpreters would be restricted to basic immutable +data structures, which defeats the point. Subinterpreters suffer from the same problems as +multiprocessing with no additional benefits. +We believe that reducing mutability to implement subinterpreters is not viable without seriously impacting the +semantics of the language (a conclusion which applies to many other +approaches too).</p> +<h3>Why is it easier to do in PyPy than CPython?</h3> +<p>Removing the GIL in CPython has two problems:</p> +<ul class="simple"> +<li>how do we guard access to mutable data structures with locks and</li> +<li>what to do with reference counting that needs to be guarded.</li> +</ul> +<p>PyPy only has the former problem; the latter doesn't exist, +due to a different garbage collector approach. Of course the first problem +is a mess too, but at least we are already half-way there. Compared to Jython +or IronPython, PyPy lacks some data structures that are provided by the JVM or .NET, +which we would need to implement, hence the problem is a little harder +than on an existing multithreaded platform. 
However, there is good research +and we know how that problem can be solved.</p> +<p>Best regards,<br> +Maciej Fijalkowski</p> +<br></div>https://www.pypy.org/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.htmlMon, 14 Aug 2017 14:34:00 GMTWarmup improvements: more efficient trace representationhttps://www.pypy.org/posts/2016/04/warmup-improvements-more-efficient-7082900097299909512.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> +<p>Hello everyone.</p> +<p>I'm pleased to inform that we've finished another round of +improvements to the warmup performance of PyPy. Before I go +into details, I'll recap the achievements that we've done since we've started +working on the warmup performance. I picked a random PyPy from November 2014 +(which is definitely before we started the warmup work) and compared it with +a recent one, after 5.0. The exact revisions are respectively <tt class="docutils literal">ffce4c795283</tt> +and <tt class="docutils literal">cfbb442ae368</tt>. First let's compare <a class="reference external" href="https://bitbucket.org/pypy/benchmarks/src/59290b59a24e54057d4c694fa4f47e7879a347a0/warmup/?at=default">pure warmup benchmarks</a> that +can be found in our benchmarking suite. Out of those, +<tt class="docutils literal"><span class="pre">pypy-graph-alloc-removal</span></tt> numbers should be taken with a grain of salt, +since other work could have influenced the results. +The rest of the benchmarks mentioned is bottlenecked purely by warmup times.</p> +<p>You can see how much your program spends in warmup running +<tt class="docutils literal"><span class="pre">PYPYLOG=jit-summary:-</span> pypy <span class="pre">your-program.py</span></tt> under "tracing" and "backend" +fields (in the first three lines). 
An example looks like that:</p> +<pre class="literal-block"> +[e00c145a41] {jit-summary +Tracing: 71 0.053645 &lt;- time spent tracing &amp; optimizing +Backend: 71 0.028659 &lt;- time spent compiling to assembler +TOTAL: 0.252217 &lt;- total run time of the program +</pre> +<p>The results of the benchmarks</p> +<table border="1" class="docutils"> +<colgroup> +<col width="29%"> +<col width="13%"> +<col width="13%"> +<col width="10%"> +<col width="17%"> +<col width="17%"> +</colgroup> +<tbody valign="top"> +<tr><td>benchmark</td> +<td>time - old</td> +<td>time - new</td> +<td>speedup</td> +<td>JIT time - old</td> +<td>JIT time - new</td> +</tr> +<tr><td>function_call</td> +<td>1.86</td> +<td>1.42</td> +<td>1.3x</td> +<td>1.12s</td> +<td>0.57s</td> +</tr> +<tr><td>function_call2</td> +<td>5.17s</td> +<td>2.73s</td> +<td>1.9x</td> +<td>4.2s</td> +<td>1.6s</td> +</tr> +<tr><td>bridges</td> +<td>2.77s</td> +<td>2.07s</td> +<td>1.3x</td> +<td>1.5s</td> +<td>0.8s</td> +</tr> +<tr><td>pypy-graph-alloc-removal</td> +<td>2.06s</td> +<td>1.65s</td> +<td>1.25x</td> +<td>1.25s</td> +<td>0.79s</td> +</tr> +</tbody> +</table> +<p>As we can see, the overall warmup benchmarks got up to <strong>90% faster</strong> with +JIT time dropping by up to <strong>2.5x</strong>. We have more optimizations in the pipeline, +with an idea how to transfer some of the JIT gains into more of a total program +runtime by jitting earlier and more eagerly.</p> +<div class="section" id="details-of-the-last-round-of-optimizations"> +<h1>Details of the last round of optimizations</h1> +<p>Now the nitty gritty details - what did we actually do? 
I covered a lot of +warmup improvements in the <a class="reference external" href="https://www.pypy.org/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.html">past</a> <a class="reference external" href="https://www.pypy.org/posts/2015/09/pypy-warmup-improvements-8349465374608676233.html">blog</a> posts so I'm going to focus on +the last change, the jit-leaner-frontend branch. This last change is simple: instead of using +pointers to store the "operations" objects created during tracing, we use a compact list of +16-bit integers (with 16bit pointers in between). On a 64bit machine the memory wins are +tremendous - using 16bit pointers makes the new representation 4x more compact than using full 64bit pointers. +Additionally, the smaller representation has much better cache behavior and much less +pointer chasing in memory. It also has a better defined lifespan, so we don't need to +bother tracking it with the GC, which also saves quite a bit of time.</p> +<p>The change sounds simple, but the details in the underlying data mean that +everything in the JIT had to be changed which took quite a bit of effort :-)</p> +<p>Going into the future on the JIT front, we have an exciting set of optimizations, +ranging from faster loops through faster warmup to using better code generation +techniques and broadening the kind of program that PyPy speeds up. Stay tuned +for the updates.</p> +<p>We would like to thank our commercial partners for making all of this possible. +The work has been performed by <a class="reference external" href="https://baroquesoftware.com">baroquesoftware</a> and would not be possible +without support from people using PyPy in production. 
If your company uses +PyPy and want it to do more or does not use PyPy but has performance problems +with the Python installation, feel free to get in touch with me, trust me using +PyPy ends up being a lot cheaper than rewriting everything in go :-)</p> +<p>Best regards,<br> +Maciej Fijalkowski</p> +</div> +<br></div>https://www.pypy.org/posts/2016/04/warmup-improvements-more-efficient-7082900097299909512.htmlThu, 07 Apr 2016 09:56:00 GMTPyPy memory and warmup improvements (2) - Sharing of Guardshttps://www.pypy.org/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> +<p>Hello everyone!</p> +<p>This is the second part of the series of improvements in warmup time and +memory consumption in the PyPy JIT. This post covers recent work on sharing guard +resume data that was recently merged to trunk. It will be a part +of the next official PyPy release. To understand what it does, let's +start with a loop for a simple example:</p> +<pre class="literal-block"> +class A(object): + def __init__(self, x, y): + self.x = x + self.y = y + + def call_method(self, z): + return self.x + self.y + z + +def f(): + s = 0 + for i in range(100000): + a = A(i, 1 + i) + s += a.call_method(i) +</pre> +<p>At the entrance of the loop, we have the following set of operations:</p> +<pre class="literal-block"> +<div style="color: red;">guard(i5 == 4)</div> +<div style="color: red;">guard(p3 is null)</div> +p27 = p2.co_cellvars +p28 = p2.co_freevars +<div style="color: red;">guard_class(p17, 4316866008, descr=&lt;Guard0x104295e08&gt;)</div> +p30 = p17.w_seq +<div style="color: red;">guard_nonnull(p30, descr=&lt;Guard0x104295db0&gt;)</div> +i31 = p17.index +p32 = p30.strategy +<div style="color: red;">guard_class(p32, 4317041344, descr=&lt;Guard0x104295d58&gt;)</div> +p34 = p30.lstorage +i35 = p34..item0 +</pre> +<p>The above operations gets executed at the entrance, so each time we call <tt class="docutils 
literal">f()</tt>. They ensure +all the optimizations done below stay valid. Now, as long as nothing +out of the ordinary happens, they only ensure that the world around us never changed. However, if e.g. someone puts new +methods on class <tt class="docutils literal">A</tt>, any of the above guards might fail. Despite the fact that it's a very unlikely +case, PyPy needs to track how to recover from such a situation. Each of those points needs to keep the full +state of the optimizations performed, so we can safely deoptimize them and reenter the interpreter. +This is vastly wasteful since most of those guards never fail, hence some <a href="https://www.stups.uni-duesseldorf.de/mediawiki/images/c/c4/Pub-schneider_efficient_2012.pdf">sharing between guards</a> +has been performed.</p> +<p>We went a step further - when two guards are next to each other or the +operations in between them don't have side effects, we can safely redo the operations or to simply +put, resume in the previous guard. That means every now and again we execute a few +operations extra, but not storing extra info saves quite a bit of time and memory. This is similar to the approach that LuaJIT takes, which is called <a href="https://lua-users.org/lists/lua-l/2009-11/msg00089.html">sparse snapshots</a>.</p> + +<p> +I've done some measurements on annotating &amp; rtyping translation of pypy, which +is a pretty memory hungry program that compiles a fair bit. I measured, respectively:</p> +<ul class="simple"> +<li>total time the translation step took (annotating or rtyping)</li> +<li>time it took for tracing (that excludes backend time for the total JIT time) at +the end of rtyping.</li> +<li>memory the GC feels responsible for after the step. 
The real amount of memory +consumed will always be larger and the coefficient of savings is in 1.5-2x mark</li> +</ul> +<p>Here is the table:</p> +<table border="1" class="docutils"> +<colgroup> +<col width="10%"> +<col width="19%"> +<col width="16%"> +<col width="21%"> +<col width="18%"> +<col width="16%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">branch</th> +<th class="head">time annotation</th> +<th class="head">time rtyping</th> +<th class="head">memory annotation</th> +<th class="head">memory rtyping</th> +<th class="head">tracing time</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>default</td> +<td>317s</td> +<td>454s</td> +<td>707M</td> +<td>1349M</td> +<td>60s</td> +</tr> +<tr><td>sharing</td> +<td>302s</td> +<td>430s</td> +<td>595M</td> +<td>1070M</td> +<td>51s</td> +</tr> +<tr><td>win</td> +<td>4.8%</td> +<td>5.5%</td> +<td>19%</td> +<td>26%</td> +<td>17%</td> +</tr> +</tbody> +</table> +<p>Obviously pypy translation is an extreme example - the vast majority of the code out there +does not have that many lines of code to be jitted. However, it's at the very least +a good win for us :-)</p> +<p>We will continue to improve the warmup performance and keep you posted!</p> +<p>Cheers,<br> +fijal</p> +</div> +<br>https://www.pypy.org/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.htmlMon, 05 Oct 2015 10:31:00 GMTPyPy warmup improvementshttps://www.pypy.org/posts/2015/09/pypy-warmup-improvements-8349465374608676233.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> + +<p>Hello everyone!</p> +<p>I'm very pleased to announce that we've just managed to merge +the optresult branch. 
+Under this cryptic name is the biggest JIT refactoring we've done in a couple +years, mostly focused on the warmup time and memory impact of PyPy.</p> +<p>To understand why we did that, let's look back in time - back when we +got the first working JIT prototype in 2009 we were focused exclusively +on achieving peak performance with some consideration towards memory usage, but +without serious consideration towards warmup time. This means we accumulated +quite a bit of technical debt over time that we're trying, with difficulty, +to address right now. This branch mostly does not affect the peak performance +- it should however help you with short-lived scripts, like test runs.</p> +<p>We identified warmup time to be one of the major pain points for pypy users, +along with memory impact and compatibility issues with the CPython C extension +world. While we can't address all the issues at once, we're trying to address +the first two in the work contributing to this blog post. I will write +a separate article on the last item.</p> +<p>To see how much of a problem warmup is for your program, you can run your +program with the <tt class="docutils literal"><span class="pre">PYPYLOG=jit-summary:-</span></tt> environment variable set. 
+This should show you something like this:</p> +<pre class="literal-block"> +(pypy-optresult)fijal@hermann:~/src/botbot-web$ PYPYLOG=jit-summary:- python orm.py 1500 +[d195a2fcecc] {jit-summary +Tracing: 781 2.924965 +Backend: 737 0.722710 +TOTAL: 35.912011 +ops: 1860596 +recorded ops: 493138 + calls: 81022 +guards: 131238 +opt ops: 137263 +opt guards: 35166 +forcings: 4196 +abort: trace too long: 22 +abort: compiling: 0 +abort: vable escape: 22 +abort: bad loop: 0 +abort: force quasi-immut: 0 +nvirtuals: 183672 +nvholes: 25797 +nvreused: 116131 +Total # of loops: 193 +Total # of bridges: 575 +Freed # of loops: 6 +Freed # of bridges: 75 +[d195a48de18] jit-summary} +</pre> +<p>This means that the total (wall clock) time was 35.9s, out of which we spent +2.9s tracing 781 loops and 0.72s compiling them. The remaining couple were +aborted (trace too long is normal, vable escape means someone called +<tt class="docutils literal">sys._getframe()</tt> or equivalent). You can do the following things:</p> +<ul class="simple"> +<li>compare the numbers with <tt class="docutils literal">pypy <span class="pre">--jit</span> off</tt> and see at which number of +iterations <tt class="docutils literal">pypy</tt> jit kicks in</li> +<li>play with the thresholds: +<tt class="docutils literal">pypy <span class="pre">--jit</span> threshold=500,function_threshold=400,trace_eagerness=50</tt> was +much better in this example. What this does is to lower the threshold +for tracing loops from default of 1039 to 400, threshold for tracing +functions from the start from 1619 to 500 and threshold for tracing bridges +from 200 to 50. Bridges are "alternative paths" that JIT did not take that +are being additionally traced. 
We believe in sane defaults, so we'll try +to improve upon those numbers, but generally speaking there is no +one-size-fits-all setting here.</li> +<li>if the tracing/backend time stays high, come and complain to us with +benchmarks, we'll try to look at them</li> +</ul> +<p>Warmup, as a number, is notoriously hard to measure. It's a combination of:</p> +<ul class="simple"> +<li>pypy running the interpreter before jitting</li> +<li>pypy needing time to JIT the traces</li> +<li>additional memory allocations needed during tracing to accommodate bookkeeping +data</li> +<li>exiting and entering assembler until there is enough coverage of assembler</li> +</ul> +<p>We're working hard on making a better assessment of this number, stay tuned :-)</p> +<div class="section" id="speedups"> +<h1>Speedups</h1> +<p>Overall we measured about a 50% speed improvement in the optimizer, which reduces +the overall warmup time by between 10% and 30%. The very +<a class="reference external" href="https://bitbucket.org/pypy/benchmarks/src/fe2e89c0ae6846e3a8d4142106a4857e95f17da7/warmup/function_call2.py?at=default">obvious warmup benchmark</a> got a speedup from 4.5s to 3.5s, almost +30% improvement. Obviously the speedups on benchmarks would vastly +depend on how much warmup time there is in those benchmarks. We observed +annotation of pypy decreasing by about 30% and the overall translation +time by about 7%, so your mileage may vary.</p> +<p>Of course, as usual with the large refactoring of a crucial piece of PyPy, +there are expected to be bugs. We are going to wait for the default branch +to stabilize so you should see warmup improvements in the next release. 
+If you're not afraid to try, <a class="reference external" href="https://buildbot.pypy.org/nightly/trunk">nightlies</a> will already have them.</p> +<p>We're hoping to continue improving upon warmup time and memory impact in the +future, stay tuned for improvements.</p> +</div> +<div class="section" id="technical-details"> +<h1>Technical details</h1> +<p>The branch does "one" thing - it changes the underlying model of how operations +are represented during tracing and optimizations. Let's consider a simple +loop like:</p> +<pre class="literal-block"> +[i0, i1] +i2 = int_add(i0, i1) +i3 = int_add(i2, 1) +i4 = int_is_true(i3) +guard_true(i4) +jump(i3, i2) +</pre> +<p>The original representation would allocate a <tt class="docutils literal">Box</tt> for each of <tt class="docutils literal">i0</tt> - <tt class="docutils literal">i4</tt> +and then store those boxes in instances of <tt class="docutils literal">ResOperation</tt>. The list of such +operations would then go to the optimizer. Those lists are big - we usually +remove <tt class="docutils literal">90%</tt> of them during optimizations, but they can be a couple thousand +elements. Overall, allocating those big lists takes a toll on warmup time, +especially due to the GC pressure. The branch removes the existence of <tt class="docutils literal">Box</tt> +completely, instead using a link to <tt class="docutils literal">ResOperation</tt> itself. 
So say in the above +example, <tt class="docutils literal">i2</tt> would refer to its producer - <tt class="docutils literal">i2 = int_add(i0, i1)</tt> with +arguments getting special treatment.</p> +<p>That alone reduces the GC pressure slightly, but a reduced number +of instances also lets us store references on them directly instead +of going through expensive dictionaries, which were used to store optimizing +information about the boxes.</p> +<p>Cheers!<br> +fijal &amp; arigo</p> +</div> + +<br></div>https://www.pypy.org/posts/2015/09/pypy-warmup-improvements-8349465374608676233.htmlWed, 09 Sep 2015 15:52:00 GMTPydgin: Using RPython to Generate Fast Instruction-Set Simulatorshttps://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> + +<p><strong>Note:</strong> This is a guest blog post by Derek Lockhart and Berkin Ilbeyi from +Computer Systems Laboratory of Cornell University.</p> +<p>In this blog post I'd like to describe some recent work on using the RPython +translation toolchain to generate fast instruction set simulators. +Our open-source framework, Pydgin <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#a" id="id1">[a]</a>, provides a domain-specific +language (DSL) embedded in Python for concisely describing instruction set +architectures <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#b" id="id2">[b]</a> and then uses these descriptions to generate fast, +JIT-enabled simulators. +Pydgin will be presented at the <em>IEEE International Symposium on Performance +Analysis of Systems and Software (ISPASS)</em> and in this post we provide a +preview of that work. 
+In addition, we discuss some additional progress updates that occurred after +the publishing deadline and will not appear in the final paper <a class="footnote-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id16" id="id3">[1]</a>.</p> +<p>Our area of research expertise is computer architecture, which is perhaps an +unfamiliar topic for some readers of the PyPy blog. +Below we provide some brief background on hardware simulation in the field of +computer architecture, as well as some context as to why instruction set +simulators in particular are such an important tool.</p> +<div class="section" id="simulators-designing-hardware-with-software"> +<h3>Simulators: Designing Hardware with Software</h3> +<p>For computer architects in both academia and industry, a key step in designing +new computational hardware (e.g., CPUs, GPUs, and mobile system-on-chips) is +simulation <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#c" id="id4">[c]</a> of the target system. +While numerous models for simulation exist, three classes are particularly +important in hardware design.</p> +<p><strong>Functional Level</strong> models simulate the <em>behavior</em> of the target system. +These models are useful for creating a "golden" reference which can serve as an +executable specification or alternatively as an emulation platform for software +development.</p> +<p><strong>Cycle Level</strong> models aim to simulate both the <em>behavior</em> and the approximate +<em>timing</em> of a hardware component. 
+These models help computer architects explore design tradeoffs and quickly +determine things like how big caches should be, how many functional units are +needed to meet throughput targets, and how the addition of a custom accelerator +block may impact total system performance.</p> +<p><strong>Register-Transfer Level</strong> (RTL) models specify the <em>behavior</em>, <em>timing</em>, and +<em>resources</em> (e.g., registers, wires, logic gates) of a hardware component. +RTL models are bit-accurate hardware specifications typically written in a +hardware description language (HDL) such as Verilog or VHDL. +Once verified through extensive simulation, HDL specifications can be passed +into synthesis and place-and-route tools to estimate area/energy/timing or to +create FPGA or ASIC prototypes.</p> +<p>An <em>instruction set simulator</em> (ISS) is a special kind of +<em>functional-level</em> model that simulates the behavior of a processor or +system-on-chip (SOC). ISSs serve an important role in hardware design +because they model the instruction set architecture (ISA) interface: the +contractual boundary between hardware designers and software developers. +ISSs allow hardware designers to quickly experiment with adding new processor +instructions while also allowing software developers to build new compilers, +libraries, and applications long before physical silicon is available.</p> +</div> +<div class="section" id="instruction-set-simulators-must-be-fast-and-productive"> +<h3>Instruction-Set Simulators Must be Fast and Productive</h3> +<p>Instruction-set simulators are more important than ever because the ISA +boundary has become increasingly fluid. 
+While <a class="reference external" href="https://en.wikipedia.org/wiki/Moore%27s_law">Moore's law</a> has continued to deliver larger numbers of transistors +which computer architects can use to build increasingly complex chips, limits +in <a class="reference external" href="https://en.wikipedia.org/wiki/Dennard_scaling#Recent_breakdown_of_Dennard_scaling">Dennard scaling</a> have restricted how these transistors can be used <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#d" id="id5">[d]</a>. +In more simple terms, thermal constraints (and energy constraints in mobile +devices) have resulted in a growing interest in pervasive <em>specialization</em>: +using custom accelerators to more efficiently perform compute intensive tasks. +This is already a reality for designers of mobile SOCs who continually add new +accelerator blocks and custom processor instructions in order to achieve higher +performance with less energy consumption. +ISSs are indispensable tools in this SOC design process for both hardware +architects building the silicon and software engineers developing the software +stack on top of it.</p> +<p>An instruction set simulator has two primary responsibilities: 1) accurately +emulating the external execution behavior of the target, and 2) providing +observability by accurately reproducing the target's internal state (e.g., +register values, program counter, status flags) at each time step. +However, other qualities critical to an effective ISS are <strong>simulation +performance</strong> and <strong>designer productivity</strong>. +Simulation performance is important because shorter simulation times allow +developers to more quickly execute and verify large software applications. 
+Designer productivity is important because it allows hardware architects to +easily experiment with adding new instructions and estimate their impact on +application performance.</p> +<p>To improve simulation performance, high-performance ISSs use dynamic binary +translation (DBT) as a mechanism to translate frequently visited blocks of +target instructions into optimized sequences of host instructions. +To improve designer productivity, many design toolchains automatically generate +ISSs from an architectural description language (ADL): a special +domain-specific language for succinctly specifying instruction encodings and +instruction semantics of an ISA. +Very few existing systems have managed to encapsulate the design complexity of +DBT engines such that high-performance, DBT-accelerated ISSs could be +automatically generated from ADLs <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#e" id="id6">[e]</a>. +Unfortunately, tools which have done so are either proprietary software or +leave much to be desired in terms of performance or productivity.</p> +</div> +<div class="section" id="why-rpython"> +<h3>Why RPython?</h3> +<p>Our research group learned of the RPython translation toolchain through our +experiences with PyPy, which we had used in conjunction with our Python +hardware modeling framework to achieve significant improvements in simulation +performance <a class="footnote-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id17" id="id7">[2]</a>. +We realized that the RPython translation toolchain could potentially be adapted +to create fast instruction set simulators since the process of interpreting +executables comprised of binary instructions shared many similarities with the +process of interpreting bytecodes in a dynamic-language VM. 
+In addition, we were inspired by PyPy's meta-tracing approach to JIT-optimizing +VM design which effectively separates the process of specifying a language +interpreter from the optimization machinery needed to achieve good performance.</p> +<p>Existing ADL-driven ISS generators have tended to use domain-specific +languages that require custom parsers or verbose C-based syntax that +distracts from the instruction specification. +Creating an embedded-ADL within Python provides several benefits over these +existing approaches including a gentler learning curve for new users, access to +better debugging tools, and easier maintenance and extension by avoiding a +custom parser. +Additionally, we have found that the ability to directly execute Pydgin +ISA descriptions in a standard Python interpreter such as CPython or PyPy +significantly helps debugging and testing during initial ISA exploration. +Python's concise, pseudocode-like syntax also manages to map quite closely to +the pseudocode specifications provided by many ISA manuals <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#f" id="id8">[f]</a>.</p> +</div> +<div class="section" id="the-pydgin-embedded-adl"> +<h3>The Pydgin embedded-ADL</h3> +<p>Defining a new ISA in the Pydgin embedded-ADL requires four primary pieces of +information: the architectural state (e.g. register file, program counter, +control registers), the bit encodings of each instruction, the instruction +fields, and the semantic definitions for each instruction. Pydgin aims to make +this process as painless as possible by providing helper classes and functions +where possible.</p> +<p>For example, below we provide a truncated example of the ARMv5 instruction +encoding table. Pydgin maintains encodings of all instructions in a centralized +<tt class="docutils literal">encodings</tt> data structure for easy maintenance and quick lookup. 
The +user-provided instruction names and bit encodings are used to automatically +generate decoders for the simulator. Unlike many ADLs, Pydgin does not require +that the user explicitly specify instruction types or mask bits for field +matching because the Pydgin decoder generator can automatically infer decoder +fields from the encoding table.</p> +<pre class="code python literal-block"> +<span class="name">encodings</span> <span class="operator">=</span> <span class="punctuation">[</span> + <span class="punctuation">[</span><span class="literal string">'adc'</span><span class="punctuation">,</span> <span class="literal string">'xxxx00x0101xxxxxxxxxxxxxxxxxxxxx'</span><span class="punctuation">],</span> + <span class="punctuation">[</span><span class="literal string">'add'</span><span class="punctuation">,</span> <span class="literal string">'xxxx00x0100xxxxxxxxxxxxxxxxxxxxx'</span><span class="punctuation">],</span> + <span class="punctuation">[</span><span class="literal string">'and'</span><span class="punctuation">,</span> <span class="literal string">'xxxx00x0000xxxxxxxxxxxxxxxxxxxxx'</span><span class="punctuation">],</span> + <span class="punctuation">[</span><span class="literal string">'b'</span><span class="punctuation">,</span> <span class="literal string">'xxxx1010xxxxxxxxxxxxxxxxxxxxxxxx'</span><span class="punctuation">],</span> + <span class="punctuation">[</span><span class="literal string">'bl'</span><span class="punctuation">,</span> <span class="literal string">'xxxx1011xxxxxxxxxxxxxxxxxxxxxxxx'</span><span class="punctuation">],</span> + <span class="punctuation">[</span><span class="literal string">'bic'</span><span class="punctuation">,</span> <span class="literal string">'xxxx00x1110xxxxxxxxxxxxxxxxxxxxx'</span><span class="punctuation">],</span> + <span class="punctuation">[</span><span class="literal string">'bkpt'</span><span class="punctuation">,</span> <span class="literal string">'111000010010xxxxxxxxxxxx0111xxxx'</span><span 
class="punctuation">],</span> + <span class="punctuation">[</span><span class="literal string">'blx1'</span><span class="punctuation">,</span> <span class="literal string">'1111101xxxxxxxxxxxxxxxxxxxxxxxxx'</span><span class="punctuation">],</span> + <span class="punctuation">[</span><span class="literal string">'blx2'</span><span class="punctuation">,</span> <span class="literal string">'xxxx00010010xxxxxxxxxxxx0011xxxx'</span><span class="punctuation">],</span> + <span class="comment"># ...</span> + <span class="punctuation">[</span><span class="literal string">'teq'</span><span class="punctuation">,</span> <span class="literal string">'xxxx00x10011xxxxxxxxxxxxxxxxxxxx'</span><span class="punctuation">],</span> + <span class="punctuation">[</span><span class="literal string">'tst'</span><span class="punctuation">,</span> <span class="literal string">'xxxx00x10001xxxxxxxxxxxxxxxxxxxx'</span><span class="punctuation">],</span> +<span class="punctuation">]</span> +</pre> +<p>A major goal of Pydgin was ensuring instruction semantic definitions map to ISA +manual specifications as much as possible. The code below shows one such +definition for the ARMv5 <tt class="docutils literal">add</tt> instruction. +A user-defined <tt class="docutils literal">Instruction</tt> class (not shown) specifies field names that can +be used to conveniently access bit positions within an instruction (e.g. +<tt class="docutils literal">rd</tt>, <tt class="docutils literal">rn</tt>, <tt class="docutils literal">S</tt>). 
+Additionally, users can choose to define their own helper functions, such as +the <tt class="docutils literal">condition_passed</tt> function, to create more concise syntax that better +matches the ISA manual.</p> +<pre class="code python literal-block"> +<span class="keyword">def</span> <span class="name function">execute_add</span><span class="punctuation">(</span> <span class="name">s</span><span class="punctuation">,</span> <span class="name">inst</span> <span class="punctuation">):</span> + <span class="keyword">if</span> <span class="name">condition_passed</span><span class="punctuation">(</span> <span class="name">s</span><span class="punctuation">,</span> <span class="name">inst</span><span class="operator">.</span><span class="name">cond</span><span class="punctuation">()</span> <span class="punctuation">):</span> + <span class="name">a</span><span class="punctuation">,</span> <span class="operator">=</span> <span class="name">s</span><span class="operator">.</span><span class="name">rf</span><span class="punctuation">[</span> <span class="name">inst</span><span class="operator">.</span><span class="name">rn</span><span class="punctuation">()</span> <span class="punctuation">]</span> + <span class="name">b</span><span class="punctuation">,</span> <span class="name">_</span> <span class="operator">=</span> <span class="name">shifter_operand</span><span class="punctuation">(</span> <span class="name">s</span><span class="punctuation">,</span> <span class="name">inst</span> <span class="punctuation">)</span> + <span class="name">result</span> <span class="operator">=</span> <span class="name">a</span> <span class="operator">+</span> <span class="name">b</span> + <span class="name">s</span><span class="operator">.</span><span class="name">rf</span><span class="punctuation">[</span> <span class="name">inst</span><span class="operator">.</span><span class="name">rd</span><span class="punctuation">()</span> <span class="punctuation">]</span> <span 
class="operator">=</span> <span class="name">trim_32</span><span class="punctuation">(</span> <span class="name">result</span> <span class="punctuation">)</span> + + <span class="keyword">if</span> <span class="name">inst</span><span class="operator">.</span><span class="name">S</span><span class="punctuation">():</span> + <span class="keyword">if</span> <span class="name">inst</span><span class="operator">.</span><span class="name">rd</span><span class="punctuation">()</span> <span class="operator">==</span> <span class="literal number integer">15</span><span class="punctuation">:</span> + <span class="keyword">raise</span> <span class="name">FatalError</span><span class="punctuation">(</span><span class="literal string">'Writing SPSR not implemented!'</span><span class="punctuation">)</span> + <span class="name">s</span><span class="operator">.</span><span class="name">N</span> <span class="operator">=</span> <span class="punctuation">(</span><span class="name">result</span> <span class="operator">&gt;&gt;</span> <span class="literal number integer">31</span><span class="punctuation">)</span><span class="operator">&amp;</span><span class="literal number integer">1</span> + <span class="name">s</span><span class="operator">.</span><span class="name">Z</span> <span class="operator">=</span> <span class="name">trim_32</span><span class="punctuation">(</span> <span class="name">result</span> <span class="punctuation">)</span> <span class="operator">==</span> <span class="literal number integer">0</span> + <span class="name">s</span><span class="operator">.</span><span class="name">C</span> <span class="operator">=</span> <span class="name">carry_from</span><span class="punctuation">(</span> <span class="name">result</span> <span class="punctuation">)</span> + <span class="name">s</span><span class="operator">.</span><span class="name">V</span> <span class="operator">=</span> <span class="name">overflow_from_add</span><span class="punctuation">(</span> <span 
class="name">a</span><span class="punctuation">,</span> <span class="name">b</span><span class="punctuation">,</span> <span class="name">result</span> <span class="punctuation">)</span> + + <span class="keyword">if</span> <span class="name">inst</span><span class="operator">.</span><span class="name">rd</span><span class="punctuation">()</span> <span class="operator">==</span> <span class="literal number integer">15</span><span class="punctuation">:</span> + <span class="keyword">return</span> + + <span class="name">s</span><span class="operator">.</span><span class="name">rf</span><span class="punctuation">[</span><span class="name">PC</span><span class="punctuation">]</span> <span class="operator">=</span> <span class="name">s</span><span class="operator">.</span><span class="name">fetch_pc</span><span class="punctuation">()</span> <span class="operator">+</span> <span class="literal number integer">4</span> +</pre> +<p>Compared to the ARM ISA Reference manual shown below, the Pydgin instruction +definition is a fairly close match. Pydgin's definitions could certainly be +made more concise by using a custom DSL, however, this would lose many of the +debugging benefits afforded to a well-supported language such as Python and +additionally require using a custom parser that would likely need modification +for each new ISA.</p> +<pre class="code literal-block"> +if ConditionPassed(cond) then + Rd = Rn + shifter_operand + if S == 1 and Rd == R15 then + if CurrentModeHasSPSR() then CPSR = SPSR + else UNPREDICTABLE else if S == 1 then + N Flag = Rd[31] + Z Flag = if Rd == 0 then 1 else 0 + C Flag = CarryFrom(Rn + shifter_operand) + V Flag = OverflowFrom(Rn + shifter_operand) +</pre> +<p>Creating an ISS that can run real applications is a rather complex task, even +for a bare metal simulator with no operating system such as Pydgin. 
+Each system call in the C library must be properly implemented, and +bootstrapping code must be provided to set up the program stack and +architectural state. +This is a very tedious and error prone process which Pydgin tries to +encapsulate so that it remains as transparent to the end user as possible. +In future versions of Pydgin we hope to make bootstrapping more painless and +support a wider variety of C libraries.</p> +<!-- Architectural state... leave out for now. --> +<!-- :: + +class State( object ): + _virtualizable_ = ['pc', 'ncycles'] + def __init__( self, memory, debug, reset_addr=0x400 ): + self.pc = reset_addr + self.rf = ArmRegisterFile( self, num_regs=16 ) + self.mem = memory + + self.rf[ 15 ] = reset_addr + + # current program status register (CPSR) + self.N = 0b0 # Negative condition + self.Z = 0b0 # Zero condition + self.C = 0b0 # Carry condition + self.V = 0b0 # Overflow condition + + # other registers + self.status = 0 + self.ncycles = 0 + + def fetch_pc( self ): + return self.pc --> +</div> +<div class="section" id="pydgin-performance"> +<h3>Pydgin Performance</h3> +<p>In order to achieve good simulation performance from Pydgin ISSs, significant +work went into adding appropriate JIT annotations to the Pydgin library +components. +These optimization hints, which allow the JIT generated by the RPython +translation toolchain to produce more efficient code, have been specifically +selected for the unique properties of ISSs. +For the sake of brevity, we do not talk about the exact optimizations here but +a detailed discussion can be found in the ISPASS paper <a class="footnote-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id16" id="id9">[1]</a>. 
+In the paper we evaluate two ISSs, one for a simplified MIPS ISA and another +for the ARMv5 ISA, whereas below we only discuss results for the ARMv5 ISS.</p> +<p>The performance of Pydgin-generated ARMv5 ISSs was compared against +several reference ISSs: the <a class="reference external" href="https://www.gem5.org/">gem5</a> ARM atomic simulator (<em>gem5</em>), +interpretive and JIT-enabled versions of <a class="reference external" href="https://simit-arm.sourceforge.net/">SimIt-ARM</a> (<em>simit-nojit</em> and +<em>simit-jit</em>), and <a class="reference external" href="https://wiki.qemu.org/">QEMU</a>. +Atomic models from the gem5 simulator were chosen for comparison due to their wide +usage amongst computer architects <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#g" id="id10">[g]</a>. +SimIt-ARM was selected because it is currently the highest performance +ADL-generated DBT-ISS publicly available. +QEMU has long been held as the gold-standard for DBT simulators due to its +extremely high performance, however, QEMU is generally intended for usage as an +emulator rather than a simulator <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#c" id="id11">[c]</a> and therefore achieves its excellent +performance at the cost of observability. +Unlike QEMU, all other simulators in our study faithfully track architectural +state at an instruction level rather than block level. 
+Pydgin ISSs were generated with and without JITs using the RPython translation +toolchain in order to help quantify the performance benefit of the meta-tracing +JIT.</p> +<p>The figure below shows the performance of each ISS executing applications from +the SPEC CINT2006 benchmark suite <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#h" id="id12">[h]</a>. +Benchmarks were run to completion on the high-performance DBT-ISSs +(<em>simit-jit</em>, <em>pydgin-jit</em>, and QEMU), but were terminated after only +10 billion simulated instructions for the non-JITed interpretive ISSs +(these would require many hours, in some cases days, to run to completion). +Simulation performance is measured in MIPS <a class="citation-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#i" id="id13">[i]</a> and plotted on a <strong>log +scale</strong> due to the wide variance in performance. 
+The <em>WHMEAN</em> group summarizes each ISS's performance across all benchmarks +using the weighted harmonic mean.</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://4.bp.blogspot.com/-fsfrUJOQKZg/VQKqZzgcQsI/AAAAAAAACAA/20NoWKRzmvU/s1600/arm-bar-plot.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://4.bp.blogspot.com/-fsfrUJOQKZg/VQKqZzgcQsI/AAAAAAAACAA/20NoWKRzmvU/s640/arm-bar-plot.png"></a></div> + +<p>A few points to take away from these results:</p> +<ul class="simple"> +<li>ISSs without JITs (<em>gem5</em>, <em>simit-nojit</em>, and <em>pydgin-nojit</em>) demonstrate +relatively consistent performance across applications, whereas ISSs with JITs +(<em>simit-jit</em>, <em>pydgin-jit</em>, and QEMU) demonstrate much greater +performance variability from application-to-application.</li> +<li>The <em>gem5</em> atomic model demonstrates particularly miserable performance, only +2-3 MIPS!</li> +<li>QEMU lives up to its reputation as a gold-standard for simulator performance, +leading the pack on nearly every benchmark and reaching speeds of 240-1120 +MIPS.</li> +<li><em>pydgin-jit</em> is able to outperform <em>simit-jit</em> on four of the +applications, including considerable performance improvements of 1.44–1.52× +for the applications <em>456.hmmer</em>, <em>462.libquantum</em>, and <em>471.omnetpp</em> +(managing to even outperform QEMU on <em>471.omnetpp</em>).</li> +<li><em>simit-jit</em> is able to obtain much more consistent performance (230-459 +MIPS across all applications) than <em>pydgin-jit</em> (9.6-659 MIPS). This is +due to <em>simit-jit</em>'s page-based approach to JIT optimization compared to +<em>pydgin-jit</em>'s tracing-based approach.</li> +<li><em>464.h264ref</em> displays particularly bad pathological behavior in Pydgin’s +tracing JIT and is the only application to perform worse on <em>pydgin-jit</em> +than <em>pydgin-nojit</em> (9.6 MIPS vs. 
21 MIPS).</li> +</ul> +<p>The pathological behavior demonstrated by <em>464.h264ref</em> was of particular +concern because it caused <em>pydgin-jit</em> to perform even worse than having no +JIT at all. RPython JIT logs indicated that the reason for this performance +degradation was a large number of tracing aborts due to JIT traces growing too +long. However, time limitations before the publication deadline prevented us +from investigating this issue thoroughly.</p> +<p>Since the deadline we've applied some minor bug fixes and made some small +improvements in the memory representation. +More importantly, we've addressed the performance degradation in <em>464.h264ref</em> +by increasing trace lengths for the JIT. +Below we show how the performance of <em>464.h264ref</em> changes as the +<strong>trace_limit</strong> parameter exposed by the RPython JIT is varied from the default +size of 6000 operations.</p> + + +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-rOklyrr1tzY/VQKqg3GJu9I/AAAAAAAACAI/jfoHvpJbMF8/s1600/trace-length-plot.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://2.bp.blogspot.com/-rOklyrr1tzY/VQKqg3GJu9I/AAAAAAAACAI/jfoHvpJbMF8/s640/trace-length-plot.png"></a></div> + +<p>By quadrupling the trace limit we achieve an 11x performance improvement in +<em>464.h264ref</em>. +The larger trace limit allows the JIT to optimize long code paths that were +previously triggering trace aborts, greatly helping amortize the costs of +tracing. +Note that arbitrarily increasing this limit can potentially hurt performance if +longer traces are not able to detect optimizable code sequences.</p> +<p>After performing similar experiments across the applications in the SPEC +CINT2006 benchmark suite, we settled on a trace limit of 400,000 operations. 
+In the figure below we show how the updated Pydgin ISS (<em>pydgin-400K</em>) improves +performance across all benchmarks and fixes the performance degradation +previously seen in <em>464.h264ref</em>. Note that the non-JITted simulators have been +removed for clarity, and simulation performance is now plotted on a +<strong>linear scale</strong> to more clearly distinguish the performance gap between +each ISS.</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://1.bp.blogspot.com/-DSAtuNZ7fnQ/VQKqm0HPBfI/AAAAAAAACAQ/8hYCDeZujq8/s1600/new-bar-plot.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://1.bp.blogspot.com/-DSAtuNZ7fnQ/VQKqm0HPBfI/AAAAAAAACAQ/8hYCDeZujq8/s640/new-bar-plot.png"></a></div> + +<p>With these improvements, we are now able to beat <em>simit-jit</em> on all but two +benchmarks. In future work we hope to further close the gap with QEMU as well.</p> +</div> +<div class="section" id="conclusions-and-future-work"> +<h3>Conclusions and Future Work</h3> +<p>Pydgin demonstrates that the impressive work put into the RPython translation +toolchain, designed to simplify the process of building fast dynamic-language +VMs, can also be leveraged to build fast instruction set simulators. +Our prototype ARMv5 ISS shows that Pydgin can generate ISSs with performance +competitive to SimIt-ARM while also providing a more productive development +experience: RPython allowed us to develop Pydgin with only four person-months +of work. +Another significant benefit of the Pydgin approach is that any performance +improvements applied to the RPython translation toolchain immediately benefit +Pydgin ISSs after a simple software download and retranslation. +This allows Pydgin to track the continual advances in JIT technology introduced +by the PyPy development team.</p> +<p>Pydgin is very much a work in progress. 
There are many features we would like +to add, including:</p> +<ul class="simple"> +<li>more concise syntax for accessing arbitrary instruction bits</li> +<li>support for other C libraries such as glibc, uClibc, and musl +(we currently only support binaries compiled with newlib)</li> +<li>support for self-modifying code</li> +<li>features for more productive debugging of target applications</li> +<li>ISS descriptions for other ISAs such as RISC-V, ARMv8, and x86</li> +<li>automatic generation of compilers and toolchains from Pydgin descriptions</li> +</ul> +<p>In addition, we think there are opportunities for even greater performance +improvements with more advanced techniques such as:</p> +<ul class="simple"> +<li>automatic generation of optimized instruction decoders</li> +<li>optimizations for floating-point intensive applications</li> +<li>multiple tracing-JITs for parallel simulation of multicore SOCs</li> +<li>a parallel JIT compilation engine as proposed by Böhm et al. <a class="footnote-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id18" id="id14">[3]</a></li> +</ul> +<p>We hope that Pydgin can be of use to others, so if you try it out please let us +know what you think. Feel free to contact us if you find any of the above +development projects interesting, or simply fork the project on GitHub and hack +away!</p> +<p>-- Derek Lockhart and Berkin Ilbeyi</p> +</div> +<div class="section" id="acknowledgements"> +<h3>Acknowledgements</h3> +<p> We would like to sincerely thank Carl Friedrich Bolz and Maciej Fijalkowski for their feedback on the Pydgin publication and their guidance on improving the JIT performance of our simulators. We would also like to thank the whole PyPy team for their incredible work on PyPy and the RPython translation toolchain. Finally, thank you to our research advisor, Prof. 
Christopher Batten, and the sponsors of this work which include the National Science Foundation, the Defense Advanced Research Projects Agency, and Intel Corporation.</p> +</div> +<div class="section" id="footnotes"> +<h3>Footnotes</h3> +<table class="docutils citation" frame="void" id="a" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id1">[a]</a></td><td>Pydgin loosely stands for [Py]thon [D]SL for [G]enerating +[In]struction set simulators and is pronounced the same as “pigeon”. The +name is inspired by the word “pidgin” which is a grammatically simplified +form of language and captures the intent of the Pydgin embedded-ADL. +<a class="reference external" href="https://github.com/cornell-brg/pydgin">https://github.com/cornell-brg/pydgin</a></td></tr> +</tbody> +</table> +<table class="docutils citation" frame="void" id="b" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id2">[b]</a></td><td>Popular instruction set architectures (ISAs) include MIPS, ARM, +x86, and more recently RISC-V</td></tr> +</tbody> +</table> +<table class="docutils citation" frame="void" id="c" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label">[c]</td><td><em>(<a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id4">1</a>, <a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id11">2</a>)</em> For a good discussion of simulators vs. 
emulators, please see the +following post on StackOverflow: +<a class="reference external" href="https://stackoverflow.com/questions/1584617/simulator-or-emulator-what-is-the-difference">https://stackoverflow.com/questions/1584617/simulator-or-emulator-what-is-the-difference</a></td></tr> +</tbody> +</table> +<table class="docutils citation" frame="void" id="d" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id5">[d]</a></td><td><a class="reference external" href="https://en.wikipedia.org/wiki/Dark_silicon">https://en.wikipedia.org/wiki/Dark_silicon</a></td></tr> +</tbody> +</table> +<table class="docutils citation" frame="void" id="e" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id6">[e]</a></td><td>Please see the Pydgin paper for a more detailed discussion of prior work.</td></tr> +</tbody> +</table> +<table class="docutils citation" frame="void" id="f" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id8">[f]</a></td><td><p class="first">For more examples of Pydgin ISA specifications, please see the ISPASS +paper <a class="footnote-reference" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id16" id="id15">[1]</a> or the Pydgin source code on GitHub.</p> +<p>Pydgin instruction definitions for a simple MIPS-inspired ISA can be +found here:</p> +<ul class="simple"> +<li><a class="reference external" 
href="https://github.com/cornell-brg/pydgin/blob/master/parc/isa.py">https://github.com/cornell-brg/pydgin/blob/master/parc/isa.py</a></li> +</ul> +<p>Pydgin instruction definitions for a simplified ARMv5 ISA can be found +here:</p> +<ul class="last simple"> +<li><a class="reference external" href="https://github.com/cornell-brg/pydgin/blob/master/arm/isa.py">https://github.com/cornell-brg/pydgin/blob/master/arm/isa.py</a></li> +</ul> +</td></tr> +</tbody> +</table> +<table class="docutils citation" frame="void" id="g" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id10">[g]</a></td><td><p class="first">gem5 is a cycle-level simulation framework that contains both +functional-level (atomic) and cycle-level processor models. Although +primarily used for detailed, cycle-approximate processor simulation, +gem5's atomic model is a popular tool for many ISS tasks.</p> +<ul class="last simple"> +<li><a class="reference external" href="https://www.m5sim.org/SimpleCPU">https://www.m5sim.org/SimpleCPU</a></li> +</ul> +</td></tr> +</tbody> +</table> +<table class="docutils citation" frame="void" id="h" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id12">[h]</a></td><td>All performance measurements were taken on an unloaded server-class +machine.</td></tr> +</tbody> +</table> +<table class="docutils citation" frame="void" id="i" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id13">[i]</a></td><td>Millions of instructions per 
second.</td></tr> +</tbody> +</table> +</div> +<div class="section" id="references"> +<h3>References</h3> +<table class="docutils footnote" frame="void" id="id16" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label">[1]</td><td><em>(<a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id3">1</a>, <a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id9">2</a>, <a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id15">3</a>)</em> <p>Derek Lockhart, Berkin Ilbeyi, and Christopher Batten. "Pydgin: +Generating Fast Instruction Set Simulators from Simple Architecture +Descriptions with Meta-Tracing JIT Compilers." IEEE Int'l Symp. on +Performance Analysis of Systems and Software (ISPASS), Mar. 2015.</p> +<ul class="last simple"> +<li><a class="reference external" href="https://csl.cornell.edu/~cbatten/pdfs/lockhart-pydgin-ispass2015.pdf">https://csl.cornell.edu/~cbatten/pdfs/lockhart-pydgin-ispass2015.pdf</a></li> +<li><a class="reference external" href="https://github.com/cornell-brg/pydgin">https://github.com/cornell-brg/pydgin</a></li> +</ul> +</td></tr> +</tbody> +</table> +<table class="docutils footnote" frame="void" id="id17" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id7">[2]</a></td><td><p class="first">Derek Lockhart, Gary Zibrat, and Christopher Batten. "PyMTL: A Unified +Framework for Vertically Integrated Computer Architecture Research." 47th +ACM/IEEE Int'l Symp. on Microarchitecture (MICRO-47), Dec. 
2014.</p> +<ul class="last simple"> +<li><a class="reference external" href="https://csl.cornell.edu/~cbatten/pdfs/lockhart-pymtl-micro2014.pdf">https://csl.cornell.edu/~cbatten/pdfs/lockhart-pymtl-micro2014.pdf</a></li> +<li><a class="reference external" href="https://github.com/cornell-brg/pymtl">https://github.com/cornell-brg/pymtl</a></li> +</ul> +</td></tr> +</tbody> +</table> +<table class="docutils footnote" frame="void" id="id18" rules="none"> +<colgroup><col class="label"><col></colgroup> +<tbody valign="top"> +<tr><td class="label"><a class="fn-backref" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html#id14">[3]</a></td><td>I. Böhm, B. Franke, and N. Topham. Generalized Just-In-Time Trace +Compilation Using a Parallel Task Farm in a Dynamic Binary Translator. +ACM SIGPLAN Conference on Programming Language Design and Implementation +(PLDI), Jun 2011.</td></tr> +</tbody> +</table> +</div> + +<br></div>https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.htmlFri, 13 Mar 2015 09:31:00 GMTFaster, more memory efficient and more ordered dictionaries on PyPyhttps://www.pypy.org/posts/2015/01/faster-more-memory-efficient-and-more-4096950404745375390.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> +<p>Hello everyone!</p> +<p>As of today, we merged the latest branch that brings better dictionaries to PyPy by default. The work is based on an idea by Raymond Hettinger on <a class="reference external" href="https://mail.python.org/pipermail/python-dev/2012-December/123028.html">python-dev</a>, with prior work done notably in Java.  It was done by Maciej Fijałkowski and Armin Rigo, with Laurence Tratt recently prodding us to finish it.  (Earlier work going in a similar direction include Alex Gaynor's work on ordered dicts in Topaz, which was also used in the Hippy VM.  
Each of these pieces of work is itself based on the original dict implementation in RPython, whose origins fade in the Subversion prehistory of PyPy.)  Coincidentally, a very similar idea has been implemented in Zend PHP very recently. <a class="reference external" href="https://nikic.github.io/2014/12/22/PHPs-new-hashtable-implementation.html">Zend implementation description</a>.</p> +<p>This post covers the basics of design and implementation as well as some basic benchmarks.</p> +</div> +<div class="section" id="dictionaries-are-now-ordered"> +<h3>Dictionaries are now ordered!</h3> +<p>One surprising part is that the new design, besides being more +memory efficient, is ordered by design: it preserves the +insertion order.  This is not forbidden by the Python language, which allows any order.  It makes the <tt class="docutils literal">collections.OrderedDict</tt> subclass much faster than before: it is now a thin subclass of <tt class="docutils literal">dict</tt>.  Obviously, we recommend that any portable Python program continues to use <tt class="docutils literal">OrderedDict</tt> when ordering is important.  Note that a non-portable program might rely on more: for example, a <tt class="docutils literal">**keywords</tt> argument now receives the keywords in the same order as the one in which they were given in the call.  (Whether such a thing might be called a language design change or not is a bit borderline.)  The point is that Python programs that work on CPython or previous versions of PyPy should continue to work on PyPy.</p> +<p>There is one exception, though.  The iterators of the <tt class="docutils literal">OrderedDict</tt> subclass are now working just like the ones of the <tt class="docutils literal">dict</tt> builtin: they will raise <tt class="docutils literal">RuntimeError</tt> when iterating if the dictionary was modified.  
In the CPython design, the class <tt class="docutils literal">OrderedDict</tt> explicitly doesn't worry about that, and instead you get some result that might range from correct to incorrect to crashes (i.e. random Python exceptions).</p> +</div> +<div class="section" id="original-pypy-dictionary-design"> +<h3>Original PyPy dictionary design</h3> +<p>Originally, PyPy dictionaries, as well as CPython dictionaries +are implemented as follows (simplified view):</p> +<pre class="literal-block"> +struct dict { + long num_items; + dict_entry* items;   /* pointer to array */ +} + +struct dict_entry { + long hash; + PyObject* key; + PyObject* value; +} +</pre> +<p>Where items is a sparse array, with 1/3 to 1/2 of the items being NULL. +The average space occupied by a dictionary is <tt class="docutils literal">3 * WORD * 12/7</tt> plus some small constant (the smallest dict has 8 entries, which is +<tt class="docutils literal">8 * 3 * WORD + 2 * WORD = 26 WORDs</tt>).</p> +</div> +<div class="section" id="new-pypy-dictionary-design"> +<h3>New PyPy dictionary design</h3> +<p>The new PyPy dictionary is split in two arrays:</p> +<pre class="literal-block"> +struct dict { + long num_items; + variable_int *sparse_array; + dict_entry* compact_array; +} + +struct dict_entry { + long hash; + PyObject *key; + PyObject *value; +} +</pre> +<p>Here, <tt class="docutils literal">compact_array</tt> stores all the items in order of insertion, while <tt class="docutils literal">sparse_array</tt> is a 1/2 to 2/3 full array of integers. The integers themselves are of the smallest size necessary for indexing the <tt class="docutils literal">compact_array</tt>. So if <tt class="docutils literal">compact_array</tt> has less than 256 items, then <tt class="docutils literal">sparse_array</tt> will be made of bytes; if less than 2^16, it'll be two-byte integers; and so on.</p> +<p>This design saves quite a bit of memory. 
For example, on 64bit systems we can, but almost never, use indexing of more than 4 billion elements; and for small dicts, the extra <tt class="docutils literal">sparse_array</tt> takes very little space.  For example a 100 element dict, would be on average for the original design on 64bit: 100 * 12/7 * WORD * 3 =~ 4100 bytes, while on new design it's 100 * 12/7 + 3 * WORD * 100 =~ 2600 bytes, quite a significant saving.</p> +</div> +<div class="section" id="gc-friendliness"> +<h3>GC friendliness</h3> +<p>The obvious benefit of having more compact dictionaries is an increased cache friendliness. In modern CPUs cache misses are much more costly than doing additional simple work, like having an additional level of (in-cache) indirection. Additionally, there is a GC benefit coming from it. When doing a minor collection, the GC has to visit all the GC fields in old objects that can point to young objects. In the case of large arrays, this can prove problematic since the array grows and with each minor collection we need to visit more and more GC pointers. In order to avoid it, large arrays in PyPy employ a technique called "card marking" where the GC only visits "cards" or subsets of arrays that were modified between collections. The problem with dictionaries was that by design modifications in a dictionary occur randomly, hence a lot of cards used to get invalidated. In the new design, however, new items are typically appended to the <tt class="docutils literal">compact_array</tt>, hence invalidate much fewer cards --- which improves GC performance.  (The new <tt class="docutils literal">sparse_array</tt> is an array of integers, so it does not suffer from the same problems.)</p> +</div> +<div class="section" id="deletion"> +<h3>Deletion</h3> +<p>Deleting entries from dictionaries is not very common, but important in a few use cases.  To preserve order, when we delete an entry, we mark the entry as removed but don't otherwise shuffle the remaining entries.  
If we repeat this operation often enough, there will be a lot of removed entries in the (originally compact) array.  At this point, we need to do a "packing" operation, which moves all live entries to the start of the array (and then reindexes the sparse array, as the positions changed).  This works well, but there are use cases where previously no reindexing was ever needed, so it makes these cases a bit slower (for example when repeatedly adding and removing keys in equal number).</p> +</div> +<div class="section" id="benchmarks"> +<h3>Benchmarks</h3> +<p>The PyPy speed benchmarks show mostly small effect, <a class="reference external" href="https://speed.pypy.org/changes/?tre=10&amp;rev=75419%3Ac52fc1774518&amp;exe=1&amp;env=1">see changes</a>. The microbenchmarks that we did show large improvements on large and very large dictionaries (particularly, building dictionaries of at least a couple 100s of items is now twice faster) and break-even on small ones (between 20% slower and 20% faster depending very much on the usage patterns and sizes of dictionaries). The new dictionaries enable various optimization possibilities which we're going to explore in the near future.</p> +<p>Cheers,<br> +fijal, arigo and the PyPy team</p> +</div> +<br>https://www.pypy.org/posts/2015/01/faster-more-memory-efficient-and-more-4096950404745375390.htmlThu, 22 Jan 2015 11:31:00 GMTSeptember donations and thank you to the Python Software Foundation!https://www.pypy.org/posts/2014/11/september-donations-and-thank-you-to-4531550307707104017.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> + +<p>Hello everyone!</p> +<p>We would like to show you a short update on the PyPy funding. +We gathered a total of $15,986 in the month of September and as per +<a class="reference external" href="https://www.pypy.org/posts/2014/09/python-software-foundation-matching-2230529993193139046.html">earlier agreement</a>, the Python Software Foundation donated $10,000 +to PyPy. 
We would like to thank everyone participating and the PSF in +particular for supporting the PyPy project and making our work possible!</p> +<p>We've been working hard on the goals outlined in the funding proposals.</p> +<ul class="simple"> +<li><a class="reference external" href="https://www.pypy.org/posts/2014/10/pypy3-240-released-5007750685927360190.html">PyPy Python 3</a> support has been in beta for a while and it's already +being used by many people, as seen per the number of reported bugs. +We're currently supporting 3.2, planning on moving towards 3.4 in the +future.</li> +<li>Software Transactional Memory has been a successful research project, +with <a class="reference external" href="https://www.pypy.org/posts/2014/11/tornado-without-gil-on-pypy-stm-7284102716557557428.html">first real world</a> results shown during the Warsaw sprint.</li> +<li>More detailed update on numpy will be published soon. A little spoiler is +that we're planning on addressing matplotlib, scipy and the larger ecosystem +to some extent. Stay tuned!</li> +</ul> +<p>Again, thanks to everyone who donated and happy Thanksgiving to everyone +on that side of the world!</p> +<p>Cheers,<br> +fijal and the entire PyPy team</p> + +<br></div>https://www.pypy.org/posts/2014/11/september-donations-and-thank-you-to-4531550307707104017.htmlFri, 28 Nov 2014 12:49:00 GMT \ No newline at end of file diff --git a/authors/mattip.html b/authors/mattip.html new file mode 100644 index 000000000..3eb24be9e --- /dev/null +++ b/authors/mattip.html @@ -0,0 +1,245 @@ + + + + + +Posts by mattip | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+

Posts by mattip

+ +
+
+ + \ No newline at end of file diff --git a/authors/mattip.xml b/authors/mattip.xml new file mode 100644 index 000000000..86dc86ee5 --- /dev/null +++ b/authors/mattip.xml @@ -0,0 +1,739 @@ + +PyPy (Posts by mattip)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy v7.3.17 releasehttps://www.pypy.org/posts/2024/08/pypy-v7317-release.htmlmattip<section id="pypy-v7-3-17-release-of-python-2-7-and-3-10"> +<h2>PyPy v7.3.17: release of python 2.7 and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.17 of PyPy.</p> +<p>This release includes a new <a class="reference internal" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#risc-v-jit-backend">RISC-V JIT backend</a>, an <a class="reference internal" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#improved-repl">improved REPL</a> based on +work by the CPython team, and <a class="reference internal" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#better-jit-optimizations">better JIT optimizations</a> of integer +operations. 
Special shout-outs to <a class="reference external" href="https://github.com/loganchien">Logan Chien</a> for the <a class="reference external" href="https://github.com/pypy/pypy/pull/5002">RISC-V backend +work</a>, to <a class="reference external" href="https://github.com/nirit100">Nico Rittinghaus</a> for better integer optimization in the JIT, and +the CPython team that has worked on the repl.</p> +<p>The release includes two different interpreters:</p> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.</p></li> +</ul> +<p>The interpreters are based on much the same codebase, thus the dual +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.16 release on April 23, 2024.</p> +<p>We recommend updating. You can find links to download the releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. 
PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2024/08/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2024/08/project-ideas.html">help</a> with +making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="risc-v-backend-for-the-jit"> +<span id="risc-v-jit-backend"></span><h3>RISC-V backend for the JIT</h3> +<p>PyPy's JIT has added support for generating 64-bit RISC-V machine code at +runtime (RV64-IMAD, specifically). So far we are not releasing binaries for any +RISC-V platforms, but there are <a class="reference external" href="https://rpython.readthedocs.io/en/latest/riscv.html">instructions</a> on how to cross-compile binaries.</p> +</section> +<section id="repl-improvements"> +<span id="improved-repl"></span><h3>REPL Improvements</h3> +<p>The biggest user-visible change of the release is new features in the repl of +PyPy3.10. CPython 3.13 has adopted and extended PyPy's pure-Python repl, adding +a number of features and fixing a number of bugs in the process. 
We have +backported and added the following features:</p> +<ul class="simple"> +<li><p>Prompts and tracebacks use terminal colors, as well as <a class="reference external" href="https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda">terminal hyperlinks</a> +for file names.</p></li> +<li><p><a class="reference external" href="https://en.wikipedia.org/wiki/Bracketed-paste">Bracketed paste</a> enables pasting several lines of input into the terminal +without auto-indentation getting in the way.</p></li> +<li><p>A special interactive help browser (F1), history browser (F2), and explicit paste +mode (F3).</p></li> +<li><p>Support for Ctrl-&lt;left/right&gt; to jump over whole words at a time.</p></li> +</ul> +<p>See the <a class="reference external" href="https://docs.python.org/3.13/whatsnew/3.13.html#a-better-interactive-interpreter">CPython documentation for further details</a>. Thanks to Łukasz Langa, +Pablo Galindo Salgado and the other CPython devs involved in this work.</p> +</section> +<section id="better-jit-optimizations-of-integer-operations"> +<span id="better-jit-optimizations"></span><h3>Better JIT optimizations of integer operations</h3> +<p>The optimizers of PyPy's JIT have become much better at reasoning about and +optimizing integer operations. This is done with a new <a class="reference external" href="https://pypy.org/posts/2024/08/toy-knownbits.html">"knownbits" abstract +domain</a>. In many programs that do bit-manipulation of integers, some of the +bits of the integer variables of the program can be statically known. 
Here's a +simple example:</p> +<div class="code"><pre class="code python"><a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-1" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-1" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-1"></a><span class="n">x</span> <span class="o">=</span> <span class="n">a</span> <span class="o">|</span> <span class="mi">1</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-2" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-2" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-2"></a><span class="o">...</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-3" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-3" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-3"></a><span class="k">if</span> <span class="n">x</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-4" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-4" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-4"></a> <span class="o">...</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-5" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-5" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-5"></a><span class="k">else</span><span class="p">:</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-6" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-6" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-6"></a> <span class="o">...</span> +</pre></div> +<p>With the new abstract domain, the JIT can optimize the <code class="docutils literal">if</code>-condition to +<code class="docutils literal">True</code>, because it already knows that the 
lowest bit of <code class="docutils literal">x</code> must be set. +This optimization applies to all Python-integers that fit into a machine word +(PyPy optimistically picks between two different representations for <code class="docutils literal">int</code>, +depending on the size of the value). Unfortunately there is very little impact +of this change on almost all Python code, because intensive bit-manipulation is +rare in Python. However, the change leads to significant performance +improvements in <a class="reference external" href="https://docs.pydrofoil.org/en/latest/">Pydrofoil</a> (the RPython-based RISC-V/ARM emulators that are +automatically generated from high-level <a class="reference external" href="https://github.com/rems-project/sail/">Sail</a> specifications of the respective +ISAs, and that use the RPython JIT to improve performance).</p> +</section> +<section id="pypy-versions-and-speed-pypy-org"> +<h3>PyPy versions and speed.pypy.org</h3> +<p>The keen-eyed will have noticed no mention of Python version 3.9 in the +releases above. Typically we will maintain only one version of Python3, but due +to PyPy3.9 support on conda-forge we maintained multiple versions from the +first release of PyPy3.10 in PyPy v7.3.12 (Dec 2022). Conda-forge is +<a class="reference external" href="https://pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.html">sunsetting its PyPy support</a>, which means we can drop PyPy3.9. Since that was +the major driver of benchmarks at <a class="reference external" href="https://speed.pypy.org">https://speed.pypy.org</a>, we revamped the site +to showcase PyPy3.9, PyPy3.10, and various versions of cpython on the home +page. 
For historical reasons, the "baseline" for comparison is still CPython +3.7.19.</p> +<p>We will keep the buildbots building PyPy3.9 until the end of August; these +builds will still be available on the <a class="reference external" href="https://buildbot.pypy.org/nightly/">nightly builds</a> tab of the buildbot.</p> +</section> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>) and macos (<code class="docutils literal">macos_arm64</code>).</p></li> +</ul> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, Linux ARM +32 bit, RISC-V RV64IMAFD Linux, and s390x Linux but does not release binaries. +Please reach out to us if you wish to sponsor binary releases for those +platforms. 
Downstream packagers provide binary builds for debian, Fedora, +conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.17 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.17.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2024/08/pypy-v7317-release.htmlWed, 28 Aug 2024 12:22:08 GMTConda-forge proposes sunsetting support for PyPyhttps://www.pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.htmlmattip<p>Conda-forge has kindly been providing support for PyPy since 2019. The +conda-forge team has been very patient and generous with resources, but it +seems the uptake of PyPy has not justified the effort. Major packages still +are not <a href="https://conda-forge.org/status/migration/?name=pypy38">available on PyPy</a>, +others find it hard to <a href="https://github.com/conda-forge/numpy-feedstock/pull/310">update +versions</a>. We don't +get much feedback at all about people using PyPy, and even less about PyPy on +conda-forge. The conda-forge team has proposed <a href="https://github.com/conda-forge/conda-forge.github.io/pull/2259">sunsetting +PyPy</a> going +forward, which means current packages would remain but no new packages would be +built. If you have an opinion, you can comment on that PR, or on this blog post.</p> +<p>Since conda-forge supports PyPy3.9 but not PyPy3.10, we have continued +releasing PyPy3.9 even though we typically support only one version of PyPy3. +With the sunsetting proposal, we will not release any more updates to PyPy3.9. +I opened a <a href="https://github.com/orgs/pypy/discussions/4998">poll</a> about the +intention to drop PyPy3.9. 
If you have an opinion, please chime in.</p>conda-forgehttps://www.pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.htmlFri, 09 Aug 2024 06:27:41 GMTPyPy v7.3.16 releasehttps://www.pypy.org/posts/2024/04/pypy-v7316-release.htmlmattip<section id="pypy-v7-3-16-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.16: release of python 2.7, 3.9, and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.16 of PyPy.</p> +<p>This release includes security fixes from upstream CPython, and bugfixes to the +garbage collector, described in a <a class="reference external" href="https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.html">gc bug-hunt blog post</a>.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.19.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.15 release on Jan 15, 2024</p> +<p>We recommend updating. You can find links to download the v7.3.16 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. 
If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2024/04/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2024/04/project-ideas.html">help</a> with +making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.16 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.16.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2024/04/pypy-v7316-release.htmlTue, 23 Apr 2024 12:22:08 GMTPyPy v7.3.15 releasehttps://www.pypy.org/posts/2024/01/pypy-v7315-release.htmlmattip<section id="pypy-v7-3-15-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.15: release of python 2.7, 3.9, and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.15 of PyPy.</p> +<p>This is primarily a bug-fix release, and includes work done to migrate PyPy to +Git and Github.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.14 release on Dec 25, 2023</p> +<p>We recommend updating. 
You can find links to download the v7.3.15 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2024/01/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2024/01/project-ideas.html">help</a> with +making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.15 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.15.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2024/01/pypy-v7315-release.htmlMon, 15 Jan 2024 12:22:08 GMTPyPy has moved to Git, GitHubhttps://www.pypy.org/posts/2023/12/pypy-moved-to-git-github.htmlmattip<p>PyPy has moved its canonical repo and issue tracker from +<a href="https://foss.heptapod.net/pypy/pypy">https://foss.heptapod.net/pypy/pypy</a> to <a href="https://github.com/pypy/pypy">https://github.com/pypy/pypy</a>. Obviously, +this means development will now be tracked in Git rather than Mercurial.</p> +<h3 id="motivation">Motivation</h3> +<p>We still feel Mercurial is a better version control system. The named branch +model and user interface are superior. But</p> +<ul> +<li> +<p>foss.heptapod.net is not well indexed in google/bing/duckduckgo + search, so people find it harder to search for issues in the project.</p> +</li> +<li> +<p>Since Heptapod has tightened its spam control, we get reports that + users create issues only to have them flagged as spam.</p> +</li> +<li> +<p>Open Source has become synonymous with GitHub, and we are too small to + change that.</p> +</li> +<li> +<p>Much of the current development comes as a reaction to fixing issues. + Tracking interlocking issues is easier if all the code is on the same + platform.</p> +</li> +<li> +<p>The <a href="https://doc.pypy.org/en/latest/faq.html#why-doesn-t-pypy-use-git-and-move-to-github">FAQ</a> + presents two arguments against the move. 
<a href="https://git-scm.com/docs/git-notes">Git notes</a> + solve much of point (1): the difficulty of discovering provenance of + commits, although not entirely. But the main problem is point (2), it turns + out that <strong>not</strong> moving to GitHub is an impediment to contribution and issue + reporting.</p> +</li> +<li> +<p>People who wish to continue to use Mercurial can use the same method below to + push to GitHub.</p> +</li> +<li> +<p>GitHub is more resource rich than foss.heptapod.net. We could add CI + jobs to replace some of our aging <a href="https://buildbot.pypy.org">buildbot + infrastructure</a>.</p> +</li> +</ul> +<h3 id="method">Method</h3> +<p>The migration required two parts: migrating the code and then migrating the +issues and merge requests.</p> +<h4 id="code-migration-1-code-and-notes">Code migration 1: code and notes</h4> +<p>I used a <a href="https://github.com/mnauw/git-remote-hg">fork of git-remote-hg</a> to +create a local Git repo with all the changesets. Then I wanted to add a Git +note to each commit with the branch it came from. So I prepared a file with two +columns: the Git commit hash, and the corresponding branch from Mercurial. +Mercurial can describe each commit in two ways: either the commit hash or by a +number index. I used <code>hg log</code> to convert an index <code>i</code> to the Mercurial hash, +and then <code>git-hg-helper</code> from <code>git-remote-hg</code> to convert the Mercurial hash to +a Git hash:</p> +<div class="code"><pre class="code literal-block">$(cd pypy-git; git-hg-helper git-rev $(cd ../pypy-hg; hg log -r $i -T"{node}\n")) +</pre></div> + +<p>Then I used <code>hg log</code> again to print the Mercurial branch for the index <code>i</code>:</p> +<div class="code"><pre class="code literal-block">$(cd pypy-hg; hg log -r $i -T'{branch}\n') +</pre></div> + +<p>Putting these two together, I could loop over all the commits by their +numerical index to prepare the file. 
Then I iterated over each line in the +file, and added the Git note. Since the <code>git note add</code> command works on the +current HEAD, I needed to checkout each commit in turn and then add the note:</p> +<div class="code"><pre class="code literal-block">git checkout -q &lt;hash&gt; &amp;&amp; git notes --ref refs/notes/branch add -m branch:&lt;branch&gt; +</pre></div> + +<p>I could then use <code>git push --all</code> to push to GitHub.</p> +<h4 id="code-migration-2-prepare-the-branches">Code migration 2: prepare the branches</h4> +<p>PyPy has almost 500 open branches. The code migration created all the branch +HEADs, but <code>git push --all</code> did not push them. I needed to check them out and +push each one. So I created a file with all the branch names</p> +<div class="code"><pre class="code literal-block">cd pypy-hg; hg branches | cut -f1 -d" " &gt; branches.txt +</pre></div> + +<p>and then push each one to the GitHub repo</p> +<div class="code"><pre class="code literal-block"><span class="k">while</span><span class="w"> </span><span class="nv">read</span><span class="w"> </span><span class="nv">branch</span><span class="c1">; do git checkout branches/$branch &amp;&amp; git push origin branches/$branch; done &lt; branches.txt</span> +</pre></div> + +<p>Note that the branches were named <code>branches/XXX</code> by the migration, not <code>branch/XXX</code>. This confuses the merge request migration, more about that later.</p> +<h4 id="issue-and-merge-request-migration">Issue and merge request migration</h4> +<p>I used the solution from +<a href="https://github.com/piceaTech/node-gitlab-2-github">node-gitlab-2-github</a> which +worked almost perfectly. It is important to do the conversion on a <strong>private +repo</strong> otherwise every mention of a successfully mapped user name notifies +the user about the transfer. This can be quite annoying for a repo the size of +PyPy with 600 merge requests and over 4000 issues. 
Issues transferred without a +problem: the script properly retained the issue numbers. However the script +does not convert the Mercurial hashes to Git hashes, so the bare hashes in +comments show up without a link to the commit. Merge requests are more of a problem:</p> +<ul> +<li>The Mercurial named branch "disappears" once it is merged, so a merge request + to a merged branch does not find the target branch name in Git. The + conversion creates an issue instead with the label <code>gitlab merge request</code>.</li> +<li>For some reason, the branches created by <code>git-remote-hg</code> are called + <code>branches/XXX</code> and not <code>branch/XXX</code> as expected by GitLab. This messes up the + merge request/PR conversion. For some of the branches (open PRs and main + target branches) I manually created additional branches without the <code>es</code>. The + net result is that open merge requests became open PRs, merged merge requests + became issues, and closed-not-merged merge requests were not migrated.</li> +</ul> +<h4 id="layered-conversions">Layered conversions</h4> +<p>PyPy already migrated once from Bitbucket to Heptapod. Many of the issues +reflect the multiple transitions: they have lines like "Created originally on +Bitbucket by XXX" from the first transition, and an additional line "In +Heptapod" from this transition.</p> +<h3 id="credits">Credits</h3> +<p>We would like to express our gratitude to the <a href="https://octobus.net/">Octobus</a> +team who support Heptapod. The transition from Bitbucket was quite an effort, +and they have generously hosted our development since then. 
We wish them all +the best, and still believe that Mercurial should have "won".</p> +<h3 id="next-steps">Next steps</h3> +<p>While the repo at GitHub is live, there are still a few more things we need to +do:</p> +<ul> +<li>Documentation needs an update for the new repo and the build automation from + readthedocs must be adjusted.</li> +<li>The wiki should be copied from Heptapod.</li> +<li>buildbot.pypy.org should also look at the new repo. I hope the code is up to + the task of interacting with a Git repo.</li> +<li>speed.pypy.org tracks changes, it too needs to reference the new location</li> +<li>To keep tracking branches with Git notes on new commits, I activated a + <a href="https://github.com/Julian/named-branch-action">github action</a> by Julian to + add a Git branch note to each commit. Please see the README there for + directions on using Git notes.</li> +<li>Some of the merge requests were not migrated. If someone wants to, they could + migrate those once they figure out the branch naming problems.</li> +</ul> +<p>Additionally, now is the time for all of you to prove the move is worthwhile:</p> +<ul> +<li>Star the repo, let others know how to find it,</li> +<li>Help fix some of the open issues or file new ones,</li> +<li>Take advantage of the more familiar workflow to get involved in the project,</li> +<li>Suggest ways to improve the migration: are there things I missed or could + have done better?</li> +</ul> +<h3 id="how-will-development-change">How will development change?</h3> +<p>Heptapod did not allow personal forks, so we were generous with a commit bit to +the main repo. Additionally, we (well, me) have been using a +commit-directly-to-main workflow. We will now be adopting a more structured +workflow. Please fork the repo and submit a pull request for any changes. We +can now add some pre-merge CI to check that the PR at least passes the first +stage of translation. 
The live and active branches will be:</p> +<ul> +<li><code>main</code>: what was "default" in Mercurial, it is the Python2.7 interpreter and + the base of the RPython interpreter,</li> +<li><code>py3.9</code>: the Python3.9 interpreter, which also includes all RPython changes + from <code>main</code>. This is exactly like on Mercurial, and</li> +<li><code>py3.10</code>: the Python3.10 interpreter, which also includes all RPython changes + from <code>main</code> and all bugfixes from <code>py3.9</code>. This is exactly like on Mercurial.</li> +</ul> +<h4 id="working-between-the-repos">Working between the repos</h4> +<h5 id="finding-commits">Finding commits</h5> +<p>If you want to figure out how a Mercurial commit relates to a Git commit, you +can use <code>git-hg-helper</code>. You run it in the Git repo. It takes the full long +hash from one repo and gives you the corresponding hash of the other repo:</p> +<div class="code"><pre class="code literal-block">$<span class="w"> </span>git-hg-helper<span class="w"> </span>git-rev<span class="w"> </span>d64027c4c2b903403ceeef2c301f5132454491df +4527e62ad94b0e940a5b0f9f20d29428672f93f7 +$<span class="w"> </span>git-hg-helper<span class="w"> </span>hg-rev<span class="w"> </span>4527e62ad94b0e940a5b0f9f20d29428672f93f7 +d64027c4c2b903403ceeef2c301f5132454491df +</pre></div> + +<h5 id="finding-branches">Finding branches</h5> +<p>Branches migrated from Mercurial will have a <code>branches</code> prefix, not <code>branch</code>. +While GitLab uses <code>branch</code> for its prefix, the <code>git-remote-hg</code> script uses +<code>branches</code>. New work should be in a PR targeting <code>main</code>, <code>py3.9</code> or <code>py3.10</code>.</p> +<p>Thanks for helping to make PyPy better.</p> +<p>Matti</p> +<h2 id="update">Update</h2> +<p>In the meantime we found out that unfortunately something went wrong in the +migration of the issues. 
The old <a href="https://foss.heptapod.net/pypy/pypy/-/issues/3655">issue +3655</a> got lost in the +migration. This means that after number 3655 the numbers are different between +github and heptapod, with heptapod being one larger. E.g. <a href="https://foss.heptapod.net/pypy/pypy/-/issues/3700">issue 3700 on +heptapod</a> is <a href="https://github.com/pypy/pypy/issues/3699">issue 3699 on +github</a>. We are <a href="https://github.com/pypy/pypy/issues/4979">investigating +options</a>. </p>https://www.pypy.org/posts/2023/12/pypy-moved-to-git-github.htmlFri, 29 Dec 2023 14:19:55 GMTPyPy v7.3.14 releasehttps://www.pypy.org/posts/2023/12/pypy-v7314-release.htmlmattip<section id="pypy-v7-3-14-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.14: release of python 2.7, 3.9, and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.14 of PyPy.</p> +<p>Highlights of this release are compatibility with <a class="reference external" href="https://hpyproject.org/blog/posts/2023/10/hpy-0.9.0-fourth-public-release/">HPy-0.9</a>, cffi 1.16, +additional C-API interfaces, and more python3.10 fixes.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.13 release on Sept 29, 2023.</p> +<p>We recommend updating. 
You can find links to download the v7.3.14 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. Since the last release we have contributions from three new +contributors. PyPy has many layers and we need help with all of them: bug +fixes, <a class="reference external" href="https://www.pypy.org/posts/2023/12/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2023/12/project-ideas.html">help</a> +with making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.14 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.14.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2023/12/pypy-v7314-release.htmlMon, 25 Dec 2023 04:22:08 GMTPyPy v7.3.13 releasehttps://www.pypy.org/posts/2023/09/pypy-v7313-release.htmlmattip<section id="pypy-v7-3-13-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.13: release of python 2.7, 3.9, and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.13 of PyPy. +This is primarily a security/bug-fix release. CPython released security +patches, and this release also improves the ability to use type +specifications via <code class="docutils literal">PyType_FromSpec</code> and friends. There are also some +small speed-ups.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13. Note it requires at +least cython 0.29.35 or cython 3.0.0b3.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. 
It follows after 7.3.12 release on June 16, 2023.</p> +<p>We recommend updating. You can find links to download the v7.3.13 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2023/09/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2023/09/project-ideas.html">help</a> with making +RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.13 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.13.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2023/09/pypy-v7313-release.htmlFri, 29 Sep 2023 04:22:08 GMTPyPy v7.3.12 releasehttps://www.pypy.org/posts/2023/06/pypy-v7312-release.htmlmattip<section id="pypy-v7-3-12-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.12: release of python 2.7, 3.9, and 3.10.</h2> +<p>The PyPy team is proud to release version 7.3.12 of PyPy. +This release includes a new string-to-int algorithm (also appearing in CPython +3.12) that is faster than the older one; support for symlinks in Windows; and +our first Python3.10 version.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.17.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.12. This is our first +release of 3.10, but based on past experience we are quite confident in +its compatibility with upstream. Of course, we recommend testing your code +with this new version before putting it into production. 
Note it does +require at least cython 0.29.35 or cython 3.0.0b3</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.11 release on Dec 29, 2022</p> +<p>We recommend updating. You can find links to download the v7.3.12 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2023/06/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2023/06/project-ideas.html">help</a> with making +RPython's JIT even better. 
Since the previous release, we have accepted +contributions from one new contributor, thanks for pitching in, and welcome +to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.9 and +3.10. It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. 
Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.12 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.12.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2023/06/pypy-v7312-release.htmlFri, 16 Jun 2023 04:22:08 GMTPyPy and conda-forgehttps://www.pypy.org/posts/2022/11/pypy-and-conda-forge.htmlmattip<p>You can use PyPy as your python interpreter in a conda environment. The +conda-forge team has graciously provided this service.</p> +<p>The conda-forge <a href="https://conda-forge.org/docs/user/tipsandtricks.html#using-pypy-as-an-interpreter">tips-and-tricks</a> +page says:</p> +<blockquote> +<p>The conda-forge channel supports creating and installing packages into +environments using the PyPy interpreter. Many packages are already available. +You need to enable the conda-forge channel and use the pypy identifier when +creating your environment:</p> +</blockquote> +<div class="code"><pre class="code literal-block"> $ conda create -c conda-forge -n my-pypy-env pypy python=3.8 + $ conda activate my-pypy-env +</pre></div> + +<blockquote> +<p>Currently supported python versions are 3.8 and 3.9. Support for pypy3.7 has +been dropped. 
While you can still create a python 3.7 environment, you +will not be getting updates as new package versions are released (including +pypy itself).</p>
The original +content has been migrated to the newer site, including comments.</p> +<!-- TEASER_END --> + +<p>Among the motivations for the move were:</p> +<h4 id="one-site-to-rule-them-all">One site to rule them all</h4> +<p>Adding the blog posts here seems like a natural extension of the web site +rather than outsourcing it to a third-party. Since the site is generated using +the static site generator <a href="https://getnikola.com/">nikola</a> from the github repo +<a href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a>, we also +have good source control for the content.</p> +<h4 id="ci-previews-and-github">CI previews, and github</h4> +<p>Those of you who follow PyPy may note something new in the URL for the repo: +until now PyPy has been using <a href="https://mercurial-scm.org">mercurial</a> as hosted +on <a href="https://foss.heptapod.net">https://foss.heptapod.net</a>. While +<a href="https://heptapod.net/">heptapod</a> (a community driven effort to bring mercurial +support to GitLab™) does provide a GitLab CI runner for the open source +offering, on github it is easier to integrate <a href="https://netlify.com">netlify</a> +for previews. Hopefully the move to the more popular github platform will +encourage new contributors to publish their success stories around using PyPy +and the RPython toolchain.</p> +<h4 id="comments">Comments</h4> +<p>Comments to blog posts are generated via the <a href="https://utteranc.es/">utterances</a> +javascript plugin. The comments appear as issues in the repo. +When viewing the site, a query is made to fetch the comments to the issue with +that name. To comment, users must authorize the utterances app to post on their +behalf using the <a href="https://developer.github.com/v3/oauth/#web-application-flow">GitHub +OAuth</a> flow. +Alternatively, users can comment on the GitHub issue directly. 
The interaction +with github for authentication and moderation seems more natural than the +manual moderation required on blogspot.</p> +<h4 id="please-prove-to-us-that-the-move-is-worth-it">Please prove to us that the move is worth it</h4> +<p>Help us with guest blog posts, and PRs to improve the styling of the site. One +already <a href="https://github.com/pypy/pypy.org/issues/5">open issue</a> is that the +navbar needlessly uses javascript, help to keep the responsive style in pure +CSS is welcome. The theme could also use tweaking.</p> +<p>But more importantly, we want to hear from you. Guest blog posts about +PyPy are welcome. Just follow the directions in the repo's README to create a +PR with your favorite PyPy story.</p> +<p>The PyPy Team</p>https://www.pypy.org/posts/2021/03/pypys-blog-has-moved.htmlTue, 09 Mar 2021 11:03:09 GMT \ No newline at end of file diff --git a/authors/max-bernstein.html b/authors/max-bernstein.html new file mode 100644 index 000000000..44dfa04a5 --- /dev/null +++ b/authors/max-bernstein.html @@ -0,0 +1,113 @@ + + + + + +Posts by Max Bernstein | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/max-bernstein.xml b/authors/max-bernstein.xml new file mode 100644 index 000000000..6dcbc886b --- /dev/null +++ b/authors/max-bernstein.xml @@ -0,0 +1,508 @@ + +PyPy (Posts by Max Bernstein)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssAbstract interpretation in the Toy Optimizerhttps://www.pypy.org/posts/2024/07/toy-abstract-interpretation.htmlMax Bernstein<p>This is a <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/" rel="canonical">cross-post</a> +from Max Bernstein from his excellent blog where he writes about programming +languages, compilers, optimizations, virtual machines. He's looking for a +(dynamic language runtime or compiler related) job too.</p> +<hr> +<p>CF Bolz-Tereick wrote some excellent posts in which they <a href="https://pypy.org/posts/2022/07/toy-optimizer.html">introduce a small IR +and optimizer</a> and <a href="https://pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html">extend it with allocation +removal</a>. We also did a live stream together in which +we did <a href="https://www.youtube.com/watch?v=w-UHg0yOPSE">some more heap optimizations</a>.</p> +<p>In this blog post, I'm going to write a small abstract interpreter for the Toy +IR and then show how we can use it to do some simple optimizations. It assumes +that you are familiar with the little IR, which I have reproduced unchanged in +<a href="https://gist.github.com/tekknolagi/4425b28d5267e7bae8b0d7ef8fb4a671">a GitHub Gist</a>.</p> +<p>Abstract interpretation is a general framework for efficiently computing +properties that must be true for all possible executions of a program. It's a +widely used approach both in compiler optimizations as well as offline static +analysis for finding bugs. 
I'm writing this post to pave the way for CF's next +post on proving abstract interpreters correct for range analysis and known bits +analysis inside PyPy.</p> +<p>Before we begin, I want to note a couple of things:</p> +<ul> +<li>The Toy IR is in SSA form, which means that every variable is defined exactly + once. This means that abstract properties of each variable are easy to track.</li> +<li>The Toy IR represents a linear trace without control flow, meaning we won't + talk about meet/join or fixpoints. They only make sense if the IR has a + notion of conditional branches or back edges (loops).</li> +</ul> +<p>Alright, let's get started.</p> +<h3 id="welcome-to-abstract-interpretation">Welcome to abstract interpretation</h3> +<p>Abstract interpretation means a couple different things to different people. +There's rigorous mathematical formalism thanks to Patrick and Radhia Cousot, +our favorite power couple, and there's also sketchy hand-wavy stuff like what +will follow in this post. In the end, all people are trying to do is reason +about program behavior without running it.</p> +<p>In particular, abstract interpretation is an <em>over-approximation</em> of the +behavior of a program. Correctly implemented abstract interpreters never lie, +but they might be a little bit pessimistic. This is because instead of using +real values and running the program---which would produce a concrete result and +some real-world behavior---we "run" the program with a parallel universe of +<em>abstract</em> values. This abstract run gives us information about all possible +runs of the program.<sup id="fnref:logozzo"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fn:logozzo">1</a></sup></p> +<p>Abstract values always represent sets of concrete values. 
Instead of literally +storing a set (in the world of integers, for example, it could get pretty +big...there are a lot of integers), we group them into a finite number of named +subsets.<sup id="fnref:lattices"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fn:lattices">2</a></sup></p> +<p>Let's learn a little about abstract interpretation with an example program and +example abstract domain. Here's the example program:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span> <span class="o">=</span> <span class="mi">2</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +</pre></div> + +<p>And our abstract domain is "is the number positive" (where "positive" means +nonnegative, but I wanted to keep the words distinct):</p> +<div class="code"><pre class="code literal-block"> top + / \ +positive negative + \ / + bottom +</pre></div> + +<p>The special <em>top</em> value means "I don't know" and the special <em>bottom</em> value +means "empty set" or "unreachable". 
The <em>positive</em> and <em>negative</em> values +represent the sets of all positive and negative numbers, respectively.</p> +<p>We initialize all the variables <code>v0</code>, <code>v1</code>, and <code>v2</code> to <em>bottom</em> and then walk +our IR, updating our knowledge as we go.</p> +<div class="code"><pre class="code literal-block"><span class="c1"># here</span> +<span class="n">v0</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="mi">2</span> +<span class="n">v2</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +</pre></div> + +<p>In order to do that, we have to have <em>transfer functions</em> for each operation. +For constants, the transfer function is easy: determine if the constant is +positive or negative. For other operations, we have to define a function that +takes the abstract values of the operands and returns the abstract value of the +result.</p> +<p>In order to be correct, transfer functions for operations have to be compatible +with the behavior of their corresponding concrete implementations. 
You can +think of them having an implicit universal quantifier <em>forall</em> in front of +them.</p> +<p>Let's step through the constants at least:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">2</span> +<span class="c1"># here</span> +<span class="n">v2</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +</pre></div> + +<p>Now we need to figure out the transfer function for <code>add</code>. It's kind of tricky +right now because we haven't specified our abstract domain very well. I keep +saying "numbers", but what kinds of numbers? Integers? Real numbers? Floating +point? Some kind of fixed-width bit vector (<code>int8</code>, <code>uint32</code>, ...) like an +actual machine "integer"?</p> +<p>For this post, I am going to use the mathematical definition of integer, which +means that the values are not bounded in size and therefore do not overflow. 
+Actual hardware memory constraints aside, this is kind of like a Python <code>int</code>.</p> +<p>So let's look at what happens when we add two abstract numbers:</p> +<table> +<thead> +<tr> +<th></th> +<th>top</th> +<th>positive</th> +<th>negative</th> +<th>bottom</th> +</tr> +</thead> +<tbody> +<tr> +<td><strong>top</strong></td> +<td>top</td> +<td>top</td> +<td>top</td> +<td>bottom</td> +</tr> +<tr> +<td><strong>positive</strong></td> +<td>top</td> +<td>positive</td> +<td>top</td> +<td>bottom</td> +</tr> +<tr> +<td><strong>negative</strong></td> +<td>top</td> +<td>top</td> +<td>negative</td> +<td>bottom</td> +</tr> +<tr> +<td><strong>bottom</strong></td> +<td>bottom</td> +<td>bottom</td> +<td>bottom</td> +<td>bottom</td> +</tr> +</tbody> +</table> +<p>As an example, let's try to add two numbers <code>a</code> and <code>b</code>, where <code>a</code> is positive +and <code>b</code> is negative. We don't know anything about their values other than their +signs. They could be <code>5</code> and <code>-3</code>, where the result is <code>2</code>, or they could be +<code>1</code> and <code>-100</code>, where the result is <code>-99</code>. This is why we can't say anything +about the result of this operation and have to return <em>top</em>.</p> +<p>The short of this table is that we only really know the result of an addition +if both operands are positive or both operands are negative. Thankfully, in +this example, both operands are known positive. 
So we can learn something about +<code>v2</code>:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">2</span> +<span class="n">v2</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +<span class="c1"># here</span> +</pre></div> + +<p>This may not seem useful in isolation, but analyzing more complex programs even +with this simple domain may be able to remove checks such as <code>if (v2 &lt; 0) { ... }</code>.</p> +<p>Let's take a look at another example using an sample <code>absval</code> (absolute value) +IR operation:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v0</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v4</span><span 
class="p">)</span> +</pre></div> + +<p>Even though we have no constant/concrete values, we can still learn something +about the states of values throughout the program. Since we know that <code>absval</code> +always returns a positive number, we learn that <code>v2</code>, <code>v3</code>, and <code>v4</code> are all +positive. This means that we can optimize out the <code>absval</code> operation on <code>v5</code>:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v0</span><span class="p">)</span> +<span class="n">v3</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v1</span><span class="p">)</span> +<span class="n">v4</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">v4</span> +</pre></div> + +<p>Other interesting lattices include:</p> +<ul> +<li>Constants (where the middle row is pretty wide)</li> +<li>Range analysis (bounds on min and max of a number)</li> +<li>Known bits (using a bitvector representation of a number, which bits are + always 0 or 1)</li> +</ul> +<p>For the 
rest of this blog post, we are going to do a very limited version of +"known bits", called <em>parity</em>. This analysis only tracks the least significant +bit of a number, which indicates if it is even or odd.</p> +<h3 id="parity">Parity</h3> +<p>The lattice is pretty similar to the positive/negative lattice:</p> +<div class="code"><pre class="code literal-block"> top + / \ +even odd + \ / + bottom +</pre></div> + +<p>Let's define a data structure to represent this in Python code:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> + + <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> +</pre></div> + +<p>And instantiate the members of the lattice:</p> +<div class="code"><pre class="code literal-block"><span class="n">TOP</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"top"</span><span class="p">)</span> +<span class="n">EVEN</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"even"</span><span class="p">)</span> +<span class="n">ODD</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"odd"</span><span class="p">)</span> +<span class="n">BOTTOM</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"bottom"</span><span class="p">)</span> +</pre></div> + +<p>Now 
let's write a forward flow analysis of a basic block using this lattice. +We'll do that by assuming that a method on <code>Parity</code> is defined for each IR +operation. For example, <code>Parity.add</code>, <code>Parity.lshift</code>, etc.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">analyze</span><span class="p">(</span><span class="n">block</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> + <span class="n">parity</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">BOTTOM</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">block</span><span class="p">}</span> + + <span class="k">def</span> <span class="nf">parity_of</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Parity</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">parity</span><span class="p">[</span><span class="n">value</span><span class="p">]</span> + + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">block</span><span class="p">:</span> + <span class="n">transfer</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">Parity</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> + <span class="n">args</span> <span class="o">=</span> <span 
class="p">[</span><span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">parity</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> +</pre></div> + +<p>For every operation, we compute the abstract value---the parity---of the +arguments and then call the corresponding method on <code>Parity</code> to get the +abstract result.</p> +<!-- TODO maybe learn more about different IRs and how they do constants. +apparently pypy/llvm are free-floating; cinder is not --> +<p>We need to special case <code>Constant</code>s due to a quirk of how the Toy IR is +constructed: the constants don't appear in the instruction stream and instead +are free-floating.</p> +<p>Let's start by looking at the abstraction function for concrete +values---constants:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="c1"># ...</span> + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">const</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="n">value</span><span class="o">.</span><span class="n">value</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> <span class="n">ODD</span> 
+</pre></div> + +<p>Seems reasonable enough. Let's pause on operations for a moment and consider an +example program:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="n">v4</span><span class="p">)</span> +</pre></div> + +<p>This function (which is admittedly a little contrived) takes two inputs, shifts +them left by one bit, adds the result, and then checks the least significant +bit of the addition result. It then passes that result into a <code>dummy</code> function, +which you can think of as "return" or "escape".</p> +<p>To do some abstract interpretation on this program, we'll need to implement the +transfer functions for <code>lshift</code> and <code>add</code> (<code>dummy</code> will just always return +<code>TOP</code>). We'll start with <code>add</code>. 
Remember that adding two even numbers returns +an even number, adding two odd numbers returns an even number, and mixing even +and odd returns an odd number.</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="c1"># ...</span> + <span class="k">def</span> <span class="nf">add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">BOTTOM</span> <span class="ow">or</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">BOTTOM</span><span class="p">:</span> + <span class="k">return</span> <span class="n">BOTTOM</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">TOP</span> <span class="ow">or</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">TOP</span><span class="p">:</span> + <span class="k">return</span> <span class="n">TOP</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">EVEN</span> <span class="ow">and</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">EVEN</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">ODD</span> <span class="ow">and</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">ODD</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">return</span> <span class="n">ODD</span> +</pre></div> + +<p>We also need to fill in the other cases where the operands are <em>top</em> or +<em>bottom</em>. 
In this case, they are both "contagious"; if either operand is +bottom, the result is as well. If neither is bottom but either operand is top, +the result is as well.</p> +<p>Now let's look at <code>lshift</code>. Shifting any number left by a non-zero number of +bits will always result in an even number, but we need to be careful about the +zero case! Shifting by zero doesn't change the number at all. Unfortunately, +since our lattice has no notion of zero, we have to over-approximate here:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="c1"># ...</span> + <span class="k">def</span> <span class="nf">lshift</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># self &lt;&lt; other</span> + <span class="k">if</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">ODD</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">return</span> <span class="n">TOP</span> +</pre></div> + +<p>This means that we will miss some opportunities to optimize, but it's a +tradeoff that's just part of the game. (We could also add more elements to our +lattice, but that's a topic for another day.)</p> +<p>Now, if we run our abstract interpretation, we'll collect some interesting +properties about the program. 
If we temporarily hack on the internals of +<code>bb_to_str</code>, we can print out parity information alongside the IR operations:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span><span class="p">:</span><span class="n">even</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span><span class="p">:</span><span class="n">even</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span><span class="p">:</span><span class="n">even</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="n">v4</span><span class="p">)</span> +</pre></div> + +<p>This is pretty awesome, because we can see that <code>v4</code>, the result of the +addition, is <em>always</em> even. Maybe we can do something with that information.</p> +<h3 id="optimization">Optimization</h3> +<p>One way that a program might check if a number is odd is by checking the least +significant bit. This is a common pattern in C code, where you might see code +like <code>y = x &amp; 1</code>. 
Let's introduce a <code>bitand</code> IR operation that acts like the +<code>&amp;</code> operator in C/Python. Here is an example of use of it in our program:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span> <span class="o">=</span> <span class="n">bitand</span><span class="p">(</span><span class="n">v4</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># new!</span> +<span class="n">v6</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="n">v5</span><span class="p">)</span> +</pre></div> + +<p>We'll hold off on implementing the transfer function for it---that's left as an +exercise for the reader---and instead do something different.</p> +<p>Instead, we'll see if we can optimize operations of the form <code>bitand(X, 1)</code>. 
If +we statically know the parity as a result of abstract interpretation, we can +replace the <code>bitand</code> with a constant <code>0</code> or <code>1</code>.</p> +<p>We'll first modify the <code>analyze</code> function (and rename it) to return a new +<code>Block</code> containing optimized instructions:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">block</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">parity</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">BOTTOM</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">block</span><span class="p">}</span> + + <span class="k">def</span> <span class="nf">parity_of</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Parity</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">parity</span><span class="p">[</span><span class="n">value</span><span class="p">]</span> + + <span class="n">result</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">block</span><span class="p">:</span> + <span class="c1"># TODO: Optimize op</span> + <span class="c1"># Emit</span> + <span class="n">result</span><span class="o">.</span><span 
class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="c1"># Analyze</span> + <span class="n">transfer</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">Parity</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> + <span class="n">args</span> <span class="o">=</span> <span class="p">[</span><span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">parity</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> + <span class="k">return</span> <span class="n">result</span> +</pre></div> + +<p>We're approaching this the way that PyPy does things under the hood, which is +all in roughly a single pass. It tries to optimize an instruction away, and if +it can't, it copies it into the new block.</p> +<p>Now let's add in the <code>bitand</code> optimization. It's mostly some gross-looking +pattern matching that checks if the right hand side of a bitwise <code>and</code> +operation is <code>1</code> (TODO: the left hand side, too). 
CF had some neat ideas on how +to make this more ergonomic, which I might save for later.<sup id="fnref:match-args"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fn:match-args">3</a></sup></p> +<p>Then, if we know the parity, optimize the <code>bitand</code> into a constant.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">block</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">parity</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">BOTTOM</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">block</span><span class="p">}</span> + + <span class="k">def</span> <span class="nf">parity_of</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Parity</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">parity</span><span class="p">[</span><span class="n">value</span><span class="p">]</span> + + <span class="n">result</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">block</span><span class="p">:</span> + <span class="c1"># Try to simplify</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span 
class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> <span class="ow">and</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"bitand"</span><span class="p">:</span> + <span class="n">arg</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">mask</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">mask</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> <span class="n">mask</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> + <span class="k">if</span> <span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="ow">is</span> <span class="n">EVEN</span><span class="p">:</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> + <span class="k">continue</span> + <span class="k">elif</span> <span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="ow">is</span> <span class="n">ODD</span><span class="p">:</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span> + <span 
class="k">continue</span> + <span class="c1"># Emit</span> + <span class="n">result</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="c1"># Analyze</span> + <span class="n">transfer</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">Parity</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> + <span class="n">args</span> <span class="o">=</span> <span class="p">[</span><span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">parity</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> + <span class="k">return</span> <span class="n">result</span> +</pre></div> + +<p>Remember: because we use union-find to rewrite instructions in the optimizer +(<code>make_equal_to</code>), later uses of the same instruction get the new +optimized version "for free" (<code>find</code>).</p> +<p>Let's see how it works on our IR:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">lshift</span><span 
class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v6</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +</pre></div> + +<p>Hey, neat! <code>bitand</code> disappeared and the argument to <code>dummy</code> is now the constant +<code>0</code> because we know the lowest bit.</p> +<h3 id="wrapping-up">Wrapping up</h3> +<p>Hopefully you have gained a little bit of an intuitive understanding of +abstract interpretation. Last year, being able to write some code made me more +comfortable with the math. Now being more comfortable with the math is helping +me write the code. It's a nice upward spiral.</p> +<p>The two abstract domains we used in this post are simple and not very useful in +practice but it's possible to get very far using slightly more complicated +abstract domains. Common domains include: constant propagation, type inference, +range analysis, effect inference, liveness, etc. 
For example, here is a +sample lattice for constant propagation:</p> +<figure style="display: block; margin: 0 auto;"> +<!-- +digraph G { + rankdir="BT"; + top [shape=Msquare]; + bottom [shape=Msquare]; + + bottom -> "-inf"; + bottom -> "-2"; + bottom -> "-1"; + bottom -> 0; + bottom -> 1; + bottom -> 2; + bottom -> "+inf"; + + "-inf" -> negative; + "-2" -> negative; + "-1" -> negative; + 0 -> top; + 1 -> nonnegative; + 2 -> nonnegative; + "+inf" -> nonnegative; + + negative -> nonzero; + nonnegative -> nonzero; + nonzero->top; + + {rank=same; "-inf"; "-2"; "-1"; 0; 1; 2; "+inf"} + {rank=same; nonnegative; negative;} +} +--> + <object class="svg" type="image/svg+xml" data="https://www.pypy.org/images/2024-complex-lattice.svg"> + </object> +</figure> + +<p>It has multiple levels to indicate more and less precision. For example, you +might learn that a variable is either <code>1</code> or <code>2</code> and be able to encode that as +<code>nonnegative</code> instead of just going straight to <code>top</code>.</p> +<p>Check out some real-world abstract interpretation in open source projects:</p> +<ul> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/Support/KnownBits.cpp">Known bits in LLVM</a></li> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/IR/ConstantRange.cpp">Constant range in LLVM</a></li> +<li>But I am told that the ranges don't form a lattice (see <a href="https://dl.acm.org/doi/10.1145/2651360">Interval Analysis and Machine Arithmetic: Why Signedness Ignorance Is Bliss</a>)</li> +<li><a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">Tristate numbers for known bits in Linux eBPF</a></li> +<li><a href="https://github.com/torvalds/linux/blob/28bbe4ea686a023929d907cc168430b61094811c/kernel/bpf/verifier.c#L13335">Range analysis in Linux eBPF</a></li> +<li><a href="https://github.com/bminor/binutils-gdb/blob/master/gdb/prologue-value.c">GDB prologue analysis</a> + of assembly to understand the 
stack and find frame pointers without using + DWARF (<a href="https://sourceware.org/gdb/wiki/Internals/Prologue%20Analysis">some + docs</a>)</li> +</ul> +<p>If you have some readable examples, please share them so I can add them.</p> +<h3 id="acknowledgements">Acknowledgements</h3> +<p>Thank you to <a href="https://cfbolz.de/">CF Bolz-Tereick</a> for the toy optimizer and +helping edit this post!</p> +<div class="footnote"> +<hr> +<ol> +<li id="fn:logozzo"> +<p>In the words of abstract interpretation researchers Vincent Laviron +and Francesco Logozzo in their paper <em>Refining Abstract +Interpretation-based Static Analyses with Hints</em> (APLAS 2009):</p> +<blockquote> +<p>The three main elements of an abstract interpretation are: (i) the +abstract elements ("which properties am I interested in?"); (ii) the +abstract transfer functions ("which is the abstract semantics of basic +statements?"); and (iii) the abstract operations ("how do I combine the +abstract elements?").</p> +</blockquote> +<p>We don't have any of these "abstract operations" in this post because +there's no control flow but you can read about them elsewhere! 
<a class="footnote-backref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fnref:logozzo" title="Jump back to footnote 1 in the text">↩</a></p> +</li> +<li id="fn:lattices"> +<p>These abstract values are arranged in a <em>lattice</em>, which is a +mathematical structure with some properties but the most important ones are +that it has a top, a bottom, a partial order, a meet operation, and values +can only move in one direction on the lattice.</p> +<p>Using abstract values from a lattice promises two things:</p> +<ul> +<li>The analysis will terminate</li> +<li>The analysis will be correct for <em>any</em> run of the program, not just one + sample run</li> +</ul> +<p><a class="footnote-backref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fnref:lattices" title="Jump back to footnote 2 in the text">↩</a></p> +</li> +<li id="fn:match-args"> +<p>Something about <code>__match_args__</code> and <code>@property</code>... <a class="footnote-backref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fnref:match-args" title="Jump back to footnote 3 in the text">↩</a></p> +</li> +</ol> +</div>toy-optimizerhttps://www.pypy.org/posts/2024/07/toy-abstract-interpretation.htmlWed, 24 Jul 2024 14:48:00 GMT \ No newline at end of file diff --git a/authors/michael-foord.html b/authors/michael-foord.html new file mode 100644 index 000000000..75abc46b0 --- /dev/null +++ b/authors/michael-foord.html @@ -0,0 +1,113 @@ + + + + + +Posts by Michael Foord | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/michael-foord.xml b/authors/michael-foord.xml new file mode 100644 index 000000000..6b872c9db --- /dev/null +++ b/authors/michael-foord.xml @@ -0,0 +1,33 @@ + +PyPy (Posts by Michael Foord)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy Winter Sprint Reporthttps://www.pypy.org/posts/2011/02/pypy-winter-sprint-report-4155886720346408516.htmlMichael Foord<p>A few weeks ago I had the great fortune to attend the PyPy winter sprint in Leysin Switzerland. I've wanted to contribute to PyPy for a long time and I thought diving into a sprint might be a good way to get familiar with some of the code. What I wasn't expecting was to be using RPython to implement new methods on built-in Python objects on the first day. The main thing I took away from the sprint was just how easy it is to get involved in developing PyPy (well, some bits of it at least and being surrounded by core developers helps). I wrote up a very short description of how to get started <a href="https://bitbucket.org/pypy/pypy/wiki/How%20to%20run%20lib-python%20tests">here</a>, but I'll do a longer blog post with examples on <a href="https://www.voidspace.org.uk/python/weblog/">my own blog</a> soon(ish).<br> +<br> +The sprint was kicked off by Armin merging the "fast-forward" branch of PyPy onto trunk. "fast-forward" brings PyPy from Python 2.5 compatibility to Python 2.7. Along with this it brought a large number of test failures, as the sterling work done by Benjamin Peterson and Amaury Forgeot d'Arc was not complete. 
This immediately set the primary sprint goal to reduce the number of test failures.<br> +<br> +We made a great deal of progress on this front, and you can see how close PyPy is now from the <a href="https://buildbot.pypy.org/summary?branch=%3Ctrunk%3E">buildbots</a>.<br> +<br> +Jacob Hallén and I started working through the list of tests with failures alphabetically. We made short work of test_asyncore and moved onto test_bytes where I was stuck for the rest of the sprint. I spent much of the remaining days working with Laura Creighton on the pypy bytearray implementation to make it more compatible with Python 2.7. This meant adding new methods, changing some of the Python protocol method implementations and even changing the way that bytearray is constructed. All in all great fun and a great introduction to working with RPython.<br> +<br> +A big part of the compatibility with Python 2.7 work was done by Laura and Armin who basically rewrote the math module from scratch. This was needed to incorporate all the improvements made (mostly by Mark Dickinson) in CPython in 2.7. 
That involved a lot of head-scratching about such subtleties as whether -0.0 should be considered almost equal to 0.0 and other fun problems.<br> +<span id="goog_788025148"></span><span id="goog_788025149"></span><br> +<br> +</p><table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"><span class="Apple-style-span" style="margin-left: auto; margin-right: auto;"><img border="0" height="239" src="https://4.bp.blogspot.com/-mtUgzR-TwUA/TVkXkIjqmXI/AAAAAAAAAVc/bbynq2Dwmg8/s320/first-meal.jpg" width="320"></span></td></tr> +<tr><td class="tr-caption" style="text-align: center;"><a href="https://www.flickr.com/photos/mfoord/sets/72157625889973066/">The first meal together, before everyone had arrived</a></td></tr> +</tbody></table> +If you add on top of this the wonderful people, the beautiful scenery, the Swiss cheese fondues, managing to not kill myself with a day's skiing and traditional pypy card games, I can heartily recommend pypy sprints as a close approximation of geek nirvana.<br> +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"><span class="Apple-style-span" style="margin-left: auto; margin-right: auto;"><img border="0" height="239" src="https://4.bp.blogspot.com/-qP95S6g9X9k/TVkYJKNYTQI/AAAAAAAAAVg/Pm3q36yMiLY/s320/mountains.jpg" width="320"></span></td></tr> +<tr><td class="tr-caption" style="text-align: center;"><a href="https://www.flickr.com/photos/mfoord/sets/72157625889973066/">View of the mountains from the sprint</a></td></tr> +</tbody></table> +<br> +Working on 2.7 compatibility wasn't the only work that happened during the sprint. Other activities included:<br> +<ul> +<li>Antonio Cuni worked on the "jittypes" branch. 
This is a reimplementation of the core of the PyPy ctypes code to make it jittable. The goal is that for common cases the jit should be able to turn ctypes calls from Python into direct C level calls. This work was not completed but very close and is great for the future of integrating C libraries with PyPy. As ctypes is also available in CPython and IronPython, and hopefully will be available in Jython soon, integrating C code with Python through ctypes is the most "implementation portable" technique.</li> +<li>David Schneider continued his work on the <a href="https://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html">JIT backend for ARM</a>. PyPy has been cross-compilable to ARM for a long time, but bringing the JIT to ARM will provide a *fast* PyPy for ARM, which includes platforms like Android. Again David didn't complete this work but did complete the float support.</li> +<li>Håkan Ardo was present for two days and continued his crazy-clever work on JIT optimisations, some of which are described in the <a href="https://www.pypy.org/posts/2011/01/loop-invariant-code-motion-1998392217676829154.html">Loop invariant code motion</a> blog entry.</li> +<li>Holger Krekel worked on updating the PyPy test suite to the latest version of py.test and also worked with me on the interminable bytearray changes for part of the sprint.</li> +<li>No one was sure what  Maciej Fijałkowski worked on but he seemed to be quite busy.</li> +</ul> +I think that was most of the work done during the actual sprint. There was also a great deal of healthy discussion about the future of PyPy. 
Expect lots more interesting and exciting developments over the coming year.<br> +<br>sprinthttps://www.pypy.org/posts/2011/02/pypy-winter-sprint-report-4155886720346408516.htmlMon, 14 Feb 2011 12:05:00 GMT \ No newline at end of file diff --git a/authors/philip-jenvey.html b/authors/philip-jenvey.html new file mode 100644 index 000000000..89d4619e5 --- /dev/null +++ b/authors/philip-jenvey.html @@ -0,0 +1,143 @@ + + + + + +Posts by Philip Jenvey | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+

Posts by Philip Jenvey

+ +
+
+ + \ No newline at end of file diff --git a/authors/philip-jenvey.xml b/authors/philip-jenvey.xml new file mode 100644 index 000000000..458322978 --- /dev/null +++ b/authors/philip-jenvey.xml @@ -0,0 +1,239 @@ + +PyPy (Posts by Philip Jenvey)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:10 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy3.3 v5.2 alpha 1 releasedhttps://www.pypy.org/posts/2016/05/pypy33-v52-alpha-1-released-1725927506363370346.htmlPhilip Jenvey<p>We're pleased to announce the first alpha release of PyPy3.3 v5.2. This is the<br> +first release of PyPy which targets Python 3.3 (3.3.5) compatibility.</p><p>We would like to thank all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a><br> +for supporting the work that went into this and future releases.</p><p>You can download the PyPy3.3 v5.2 alpha 1 release here:</p><blockquote><a class="reference external" href="https://pypy.org/download.html#python-3-3-5-compatible-pypy3-3-v5-2">https://pypy.org/download.html#python-3-3-5-compatible-pypy3-3-v5-2</a></blockquote><div class="section" id="highlights"><h1>Highlights</h1><ul class="simple"><li>Python 3.3.5 support!<ul><li>Being an early alpha release, there are some <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues?status=new&amp;status=open&amp;component=PyPy3+%28running+Python+3.x%29&amp;kind=enhancement">missing features</a> such as a<br> +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/2309/optimized-unicode-representation">PEP 393-like space efficient string representation</a> and <a class="reference external" 
href="https://foss.heptapod.net/pypy/pypy/-/issues?status=new&amp;status=open&amp;component=PyPy3%20%28running%20Python%203.x%29">known issues</a><br> +including performance issues (e.g. issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/2305">#2305</a>). The focus for this<br> +release has been updating to 3.3 compatibility. Windows is also not yet<br> +supported.</li> +</ul></li> +<li><a class="reference external" href="https://docs.python.org/3/library/ensurepip.html#module-ensurepip">ensurepip</a> is also included (it's only included in CPython 3 &gt;= 3.4).</li> +</ul></div><div class="section" id="what-is-pypy"><h1>What is PyPy?</h1><p>PyPy is a very compliant Python interpreter, almost a drop-in replacement for<br> +CPython 2.7.10 and one day 3.3.5. It's fast due to its integrated tracing JIT<br> +compiler.</p><p>We also welcome developers of other <a class="reference external" href="https://pypyjs.org">dynamic languages</a> to see what RPython<br> +can do for them.</p><p>This release supports:</p><blockquote><ul class="simple"><li><strong>x86</strong> machines on most common operating systems except Windows<br> +(Linux 32/64, Mac OS X 64, OpenBSD, FreeBSD),</li> +<li>newer <strong>ARM</strong> hardware (ARMv6 or ARMv7, with VFPv3) running Linux,</li> +<li>big- and little-endian variants of <strong>PPC64</strong> running Linux,</li> +<li><strong>s390x</strong> running Linux</li> +</ul></blockquote><p>Please try it out and let us know what you think. 
We welcome feedback, we know<br> +you are using PyPy, please tell us about it!</p><p>We'd especially like to thank these people for their contributions to this<br> +release:</p><p>Manuel Jacob, Ronan Lamy, Mark Young, Amaury Forgeot d'Arc, Philip Jenvey,<br> +Martin Matusiak, Vasily Kuznetsov, Matti Picus, Armin Rigo and many others.</p><p>Cheers</p><p>The PyPy Team</p></div>releasehttps://www.pypy.org/posts/2016/05/pypy33-v52-alpha-1-released-1725927506363370346.htmlMon, 30 May 2016 22:53:00 GMTPyPy3 2.4.0 releasedhttps://www.pypy.org/posts/2014/10/pypy3-240-released-5007750685927360190.htmlPhilip Jenvey<div dir="ltr" style="text-align: left;">We're pleased to announce the availability of PyPy3 2.4.0!<br> +<br> +This release contains several bugfixes and enhancements. Among the user-facing improvements specific to PyPy3:<br> +<ul><li>Better Windows compatibility, e.g. the nt module functions _getfinalpathname &amp; _getfileinformation are now supported (the former is required for the popular pathlib library for example)</li> +<li>Various fsencode PEP 383 related fixes to the posix module (readlink, uname, ttyname and ctermid) and improved locale handling</li> +<li>Switched the default binary name on POSIX distributions from 'pypy' to 'pypy3' (which symlinks to 'pypy3.2')</li> +<li>Fixed a couple different crashes related to parsing Python 3 source code</li> +</ul><br> +And improvements shared with the recent PyPy 2.4.0 release:<br> +<ul style="text-align: left;"><li>internal refactoring in string and GIL handling which led to significant speedups</li> +<li>improved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. 
In simpler terms - we closed what looked like a memory leak</li> +<li>Windows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i</li> +<li>Many issues were <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues?status=resolved">resolved</a> since the 2.3.1 release in June</li> +</ul><br> +You can download PyPy3 2.4.0 here <a href="https://pypy.org/download.html">https://pypy.org/download.html.</a><br> +<br> +<a href="https://www.pypy.org/">PyPy</a> is a very compliant Python interpreter, almost a drop-in replacement for <a href="https://www.python.org/">CPython</a> 2.7 and 3.2.5. It's fast (<a href="https://speed.pypy.org/">pypy 2.4 and cpython 2.7.x performance comparison</a>) due to its integrated tracing JIT compiler.<br> +<br> +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.  <br> +We would like to thank our donors for the continued support of the PyPy project.<br> +<br> +The complete release notice is <a href="https://doc.pypy.org/en/latest/release-pypy3-2.4.0.html">here.</a><br> +<br> +Please try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!<br> +<br> +Cheers, The PyPy Team<br> +<br> +</div>releasehttps://www.pypy.org/posts/2014/10/pypy3-240-released-5007750685927360190.htmlTue, 21 Oct 2014 18:02:00 GMTPyPy3 2.3.1 - Fulcrumhttps://www.pypy.org/posts/2014/06/pypy3-231-fulcrum-3765964217640322884.htmlPhilip Jenvey<p>We're pleased to announce the first stable release of PyPy3. 
PyPy3<br> +targets Python 3 (3.2.5) compatibility.</p><p>We would like to thank all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a><br> +for supporting the work that went into this.</p><p>You can download the PyPy3 2.3.1 release here:</p><blockquote><a class="reference external" href="https://pypy.org/download.html#pypy3-2-3-1">https://pypy.org/download.html#pypy3-2-3-1</a></blockquote><div class="section" id="highlights"><h1>Highlights</h1><ul class="simple"><li>The first stable release of PyPy3: support for Python 3!</li> +<li>The stdlib has been updated to Python 3.2.5</li> +<li>Additional support for the u'unicode' syntax (<a class="reference external" href="https://legacy.python.org/dev/peps/pep-0414/">PEP 414</a>) from Python 3.3</li> +<li>Updates from the default branch, such as incremental GC and various JIT<br> +improvements</li> +<li>Resolved some notable JIT performance regressions from PyPy2:</li> +</ul><blockquote><ul class="simple"><li>Re-enabled the previously disabled collection (list/dict/set) strategies</li> +<li>Resolved performance of iteration over range objects</li> +<li>Resolved handling of Python 3's exception __context__ unnecessarily forcing<br> +frame object overhead</li> +</ul></blockquote></div><div class="section" id="what-is-pypy"><h1>What is PyPy?</h1><p>PyPy is a very compliant Python interpreter, almost a drop-in replacement for<br> +CPython 2.7.6 or 3.2.5. 
It's fast due to its integrated tracing JIT compiler.</p><p>This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows,<br> +and OpenBSD,<br> +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.</p><p>While we support 32 bit python on Windows, work on the native Windows 64<br> +bit python is still stalling, we would welcome a volunteer<br> +to <a class="reference external" href="https://doc.pypy.org/en/latest/windows.html#what-is-missing-for-a-full-64-bit-translation">handle that</a>.</p></div><div class="section" id="how-to-use-pypy"><h1>How to use PyPy?</h1><p>We suggest using PyPy from a <a class="reference external" href="https://www.virtualenv.org/en/latest/">virtualenv</a>. Once you have a virtualenv<br> +installed, you can follow instructions from <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-using-virtualenv">pypy documentation</a> on how<br> +to proceed. This document also covers other <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-pypy">installation schemes</a>.</p><p>Cheers,<br> +the PyPy team</p></div>pypy3https://www.pypy.org/posts/2014/06/pypy3-231-fulcrum-3765964217640322884.htmlFri, 20 Jun 2014 21:31:00 GMTPy3k status update #13https://www.pypy.org/posts/2014/02/py3k-status-update-13-4630607029125647100.htmlPhilip Jenvey<p>This is the 13th status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which we<br> +can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a>.</p><p>We're just finishing up a cleanup of int/long types. 
This work helps the py3k<br> +branch unify these types into the Python 3 int and restore <a class="reference external" href="https://www.pypy.org/posts/2013/11/py3k-status-update-12-5307085693947812769.html">JIT compilation of<br> +machine sized integers</a>.</p><p>This cleanup also removes <a class="reference external" href="https://doc.pypy.org/en/latest/objspace.html#multimethods">multimethods</a> from these types. PyPy has<br> +historically used a clever implementation of multimethod dispatch for declaring<br> +methods of the __builtin__ types in RPython.</p><p>This multimethod scheme provides some convenient features for doing this,<br> +however we've come to the conclusion that it may be more trouble than it's<br> +worth. A major problem of multimethods is that they generate a large amount of<br> +stub methods which burden the already lengthy and memory hungry RPython<br> +translation process. Also, their implementation and behavior can be somewhat<br> +complicated/obscure.</p><p>The alternative to multimethods involves doing the work of the type checking<br> +and dispatching rules in a more verbose, manual way. It's a little more work in<br> +the end but less magical.</p><p>Recently, Manuel Jacob finished a large cleanup effort of the<br> +unicode/string/bytearray types that also removed their multimethods. This work<br> +also benefits the py3k branch: it'll help with future <a class="reference external" href="https://www.python.org/dev/peps/pep-0393/">PEP 393</a> (or <a class="reference external" href="https://lucumr.pocoo.org/2014/1/9/ucs-vs-utf8/">PEP 393<br> +alternative</a>) work. 
This effort was partly sponsored by Google's Summer of<br> +Code: thanks Manuel and Google!</p><p>Now there's only a couple major pieces left in the multimethod removal (the<br> +float/complex types and special marshaling code) and a few minor pieces that<br> +should be relatively easy.</p><p>In conclusion, there's been some good progress made on py3k and multimethod<br> +removal this winter, albeit a bit slower than we would have liked.</p><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2014/02/py3k-status-update-13-4630607029125647100.htmlTue, 18 Feb 2014 02:33:00 GMTPy3k status update #12https://www.pypy.org/posts/2013/11/py3k-status-update-12-5307085693947812769.htmlPhilip Jenvey<p>This is the 12th status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which we<br> +can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a>.</p><p>Here's an update on the recent progress:</p><ul class="simple"><li>Thank you to everyone who has provided initial feedback on the PyPy3 2.1 beta<br> +1 release. We've gotten a number of bug reports, most of which have been<br> +fixed.</li> +<li>As usual, we're continually keeping up with changes from the default<br> +branch. 
Oftentimes these merges come at a cost (conflicts and or<br> +reintegration of py3k changes) but occasionally we get goodies for free, such<br> +as the <a class="reference external" href="https://www.pypy.org/posts/2013/10/making-coveragepy-faster-under-pypy-935409618297062344.html">recent JIT optimizations</a> and <a class="reference external" href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">incremental garbage collection</a>.</li> +<li>We've been focusing on re-optimizing Python 2 int sized (machine sized)<br> +integers:</li> +</ul><p>We have a couple of known, notable speed regressions in the PyPy3 beta release<br> +vs regular PyPy. The major one being with Python 2.x int sized (or machine<br> +sized) integers.</p><p>Python 3 drops the distinction between int and long types. CPython 3.x<br> +accomplishes this by removing the old int type entirely and renaming the long<br> +type to int. Initially, we've done the same for PyPy3 for the sake of<br> +simplicity and getting everything working.</p><p>However PyPy's JIT is capable of heavily optimizing these machine sized integer<br> +operations, so this came with a regression in performance in this area.</p><p>We're now in the process of solving this. Part of this work also involves some<br> +house cleaning on these numeric types which also benefits the default branch.</p><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2013/11/py3k-status-update-12-5307085693947812769.htmlTue, 12 Nov 2013 23:16:00 GMTPyPy3 2.1 beta 1https://www.pypy.org/posts/2013/07/pypy3-21-beta-1-8647445024868663902.htmlPhilip Jenvey<p>We're pleased to announce the first beta of the upcoming 2.1 release of<br> +PyPy3. 
This is the first release of PyPy which targets Python 3 (3.2.3)<br> +compatibility.</p><p>We would like to thank all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a><br> +for supporting the work that went into this and future releases.</p><p>You can download the PyPy3 2.1 beta 1 release here:</p><blockquote><a class="reference external" href="https://pypy.org/download.html#pypy3-2-1-beta-1">https://pypy.org/download.html#pypy3-2-1-beta-1</a></blockquote><div class="section" id="highlights"><h1>Highlights</h1><ul class="simple"><li>The first release of PyPy3: support for Python 3, targeting CPython 3.2.3!<ul><li>There are some <a class="reference external" href="https://bugs.pypy.org/issue?%40search_text=&amp;title=py3k&amp;%40columns=title&amp;keyword=&amp;id=&amp;%40columns=id&amp;creation=&amp;creator=&amp;release=&amp;activity=&amp;%40columns=activity&amp;%40sort=activity&amp;actor=&amp;priority=&amp;%40group=priority&amp;status=-1%2C1%2C2%2C3%2C4%2C5%2C6&amp;%40columns=status&amp;assignedto=&amp;%40columns=assignedto&amp;%40pagesize=50&amp;%40startwith=0&amp;%40queryname=&amp;%40old-queryname=&amp;%40action=search">known issues</a> including performance regressions (issues<br> +<a class="reference external" href="https://bugs.pypy.org/issue1540">#1540</a> &amp; <a class="reference external" href="https://bugs.pypy.org/issue1541">#1541</a>) slated to be resolved before the final release.</li> +</ul></li> +</ul></div><div class="section" id="what-is-pypy"><h1>What is PyPy?</h1><p>PyPy is a very compliant Python interpreter, almost a drop-in replacement for<br> +CPython 2.7.3 or 3.2.3. It's fast due to its integrated tracing JIT compiler.</p><p>This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows<br> +32. 
Also this release supports ARM machines running Linux 32bit - anything with<br> +<tt class="docutils literal">ARMv6</tt> (like the Raspberry Pi) or <tt class="docutils literal">ARMv7</tt> (like Beagleboard,<br> +Chromebook, Cubieboard, etc.) that supports <tt class="docutils literal">VFPv3</tt> should work.</p><p>Windows 64 work is still stalling and we would welcome a volunteer to handle<br> +that.</p></div><div class="section" id="how-to-use-pypy"><h1>How to use PyPy?</h1><p>We suggest using PyPy from a <a class="reference external" href="https://www.virtualenv.org/en/latest/">virtualenv</a>. Once you have a virtualenv<br> +installed, you can follow instructions from <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-using-virtualenv">pypy documentation</a> on how<br> +to proceed. This document also covers other <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-pypy">installation schemes</a>.</p><p>Cheers,<br> +the PyPy team</p></div>https://www.pypy.org/posts/2013/07/pypy3-21-beta-1-8647445024868663902.htmlTue, 30 Jul 2013 21:35:00 GMTPy3k status update #11https://www.pypy.org/posts/2013/06/py3k-status-update-11-133025715908408072.htmlPhilip Jenvey<p>This is the 11th status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which we<br> +can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a>.</p><p>Here's some highlights of the progress made since the previous update:</p><ul class="simple"><li>PyPy py3k now matches CPython 3's hash code for<br> +int/float/complex/Decimal/Fraction</li> +<li>Various outstanding unicode identifier related 
issues were<br> +resolved. E.g. test_importlib/pep263/ucn/unicode all now fully pass. Various<br> +usages of identifiers (in particular type and module names) have been fixed to<br> +handle non-ascii names -- mostly around display of reprs and exception<br> +messages.</li> +<li>The unicodedata database has been upgraded to 6.0.0.</li> +<li>Windows support has greatly improved, though it could still use some more<br> +help (but so does the default branch to a certain degree).</li> +<li>Probably the last of the parsing related bugs/features have been taken care<br> +of.</li> +<li>Of course various other smaller miscellaneous fixes</li> +</ul><p>This leaves the branch w/ only about 5 outstanding failures of the stdlib test<br> +suite:</p><ul><li><p class="first">test_float</p><p>1 failing test about containment of floats in collections.</p></li> +<li><p class="first">test_memoryview</p><p>Various failures: requires some bytes/str changes among other things (Manuel<br> +Jacob has made some progress on this on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/compare/py3k-memoryview..py3k">py3k-memoryview branch</a>)</p></li> +<li><p class="first">test_multiprocessing</p><p>1 or more tests deadlock on some platforms</p></li> +<li><p class="first">test_sys and test_threading</p><p>2 failing tests for the New GIL's new API</p></li> +</ul><p>Probably the biggest feature left to tackle is the New GIL.</p><p>We're now pretty close to pushing an initial release. 
We had planned for one<br> +around PyCon, but having missed that we've put some more effort into the branch<br> +to provide a more fully-fledged initial release.</p><p>Thanks to the following for their contributions: Manuel Jacob, Amaury Forgeot<br> +d'Arc, Karl Ramm, Jason Chu and Christian Hudon.</p><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2013/06/py3k-status-update-11-133025715908408072.htmlWed, 12 Jun 2013 19:17:00 GMTPy3k status update #10https://www.pypy.org/posts/2013/03/py3k-status-update-10-6681398990092286007.htmlPhilip Jenvey<p>This is the tenth status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which we<br> +can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a>.</p><p>There's been significant progress since the last update: the <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3k">linux x86-32<br> +buildbot</a> now passes 289 out of approximately 354 modules (with 39 skips) of<br> +CPython's regression test suite.</p><p>That means there's only 26 test module failures left! 
The list of major items<br> +remaining for 3.2 compatibility are now short enough to list here, with their<br> +related tests:</p><ul class="simple"><li>Tokenizer support for non-ascii identifiers</li> +</ul><blockquote><ul class="simple"><li>test_importlib</li> +<li>test_pep263</li> +</ul></blockquote><ul class="simple"><li>memoryview (Manuel Jacob's tackling this on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/compare/py3k-memoryview..py3k">py3k-memoryview branch</a>)</li> +</ul><blockquote><ul class="simple"><li>test_memoryview</li> +</ul></blockquote><ul class="simple"><li>multiprocessing module currently deadlocks</li> +</ul><blockquote><ul class="simple"><li>test_multiprocessing</li> +</ul></blockquote><ul class="simple"><li>Buggy handling of the new extended unpacking syntax by the compiler:</li> +</ul><blockquote><ul class="simple"><li>test_unpack_ex</li> +</ul></blockquote><ul class="simple"><li>The new Global Interpreter Lock and new thread signal handling</li> +</ul><blockquote><ul class="simple"><li>test_threading</li> +<li>test_threadsignals</li> +<li>test_sys</li> +</ul></blockquote><ul class="simple"><li>Upgrade unicodedata to 6.0.0 (requires updates to the actual unicodedata<br> +generation script)</li> +</ul><blockquote><ul class="simple"><li>test_ucn</li> +<li>test_unicode</li> +<li>test_unicodedata</li> +</ul></blockquote><ul class="simple"><li><a class="reference external" href="https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html">CPyExt</a></li> +</ul><blockquote><ul class="simple"><li>test_capi (currently crashes)</li> +</ul></blockquote><ul class="simple"><li>Update int's hash code to match to CPython (float's is already updated on the<br> +<a class="reference external" href="https://bitbucket.org/pypy/pypy/compare/py3k-newhash..py3k">py3k-newhash branch</a>. 
note that PyPy 2.x doesn't even totally match<br> +CPython's hashing)</li> +</ul><blockquote><ul class="simple"><li>test_decimal</li> +<li>test_fractions</li> +<li>test_numeric_tower</li> +</ul></blockquote><ul class="simple"><li>Miscellaneous:</li> +</ul><blockquote><ul class="simple"><li>test_complex</li> +<li>test_float</li> +<li>test_peepholer</li> +<li>test_range</li> +<li>test_sqlite (a new cffi based version seems to be coming)</li> +<li>test_ssl</li> +<li>test_struct</li> +<li>test_subprocess</li> +<li>test_sys_settrace</li> +<li>test_time</li> +</ul></blockquote><p>Additionally there are still a number of failures in PyPy's internal test<br> +suite. These tests are usually run against untranslated versions of PyPy during<br> +development. However we've now begun running them against a fully translated<br> +version of PyPy on the buildbot too (thanks to Amaury for setting this<br> +up). This further ensures that our tests and implementation are sane.</p><p>We're getting closer to producing an initial alpha release. Before that happens<br> +we'd like to see:</p><ul class="simple"><li>further test fixes</li> +<li>the results of test runs on other major platforms (e.g. 
linux x86-64 and osx<br> +seem to have some additional failures as of now)</li> +<li>some basic real world testing</li> +</ul><p>Finally I'd like to thank Manuel Jacob for his various contributions over the<br> +past month, including fixing the array and ctypes modules among other things,<br> +and also Amaury Forgeot d'Arc for his ongoing excellent contributions.</p><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2013/03/py3k-status-update-10-6681398990092286007.htmlTue, 05 Mar 2013 20:00:00 GMTPy3k status update #9https://www.pypy.org/posts/2013/01/py3k-status-update-9-98332471264591773.htmlPhilip Jenvey<p>This is the ninth status update about our work on the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3k">py3k branch</a>, which<br> +we can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k<br> +proposal</a>.</p><p>Just a very short update on December's work: we're now passing about 223 of<br> +approximately 355 modules of CPython's regression test suite, up from passing<br> +194 last month.</p><p>Some brief highlights:</p><ul class="simple"><li>More encoding related issues were addressed. e.g. 
now most if not all the<br> +multibytecodec test modules pass.</li> +<li>Fixed some path handling issues (<tt class="docutils literal">test_os</tt>, <tt class="docutils literal">test_ntpath</tt> and<br> +<tt class="docutils literal">test_posixpath</tt> now pass)</li> +<li>We now pass <tt class="docutils literal">test_class</tt>, <tt class="docutils literal">test_descr</tt> and almost <tt class="docutils literal">test_builtin</tt> (among<br> +other things): these are notable as they are fairly extensive test suites of<br> +core aspects of the language.</li> +<li>Amaury Forgeot d'Arc continued making progress on <a class="reference external" href="https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html">CPyExt</a> (thanks again!)</li> +</ul><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2013/01/py3k-status-update-9-98332471264591773.htmlThu, 10 Jan 2013 06:04:00 GMTPy3k status update #8https://www.pypy.org/posts/2012/12/py3k-status-update-8-3932232806458251730.htmlPhilip Jenvey<p>This is the eighth status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which<br> +we can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k<br> +proposal</a>.</p><p>Just a short update on November's work: we're now passing about 194 of<br> +approximately 355 modules of CPython's regression test suite, up from passing<br> +160 last month. 
Many test modules only fail a small number of individual tests<br> +now.</p><p>We'd like to thank Amaury Forgeot d'Arc for his contributions, in particular he<br> +has made significant progress on updating <a class="reference external" href="https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html">CPyExt</a> for Python 3 this month.</p><p>Some other highlights:</p><ul class="simple"><li><tt class="docutils literal">test_marshal</tt> now passes, and there's been significant progress on<br> +pickling (thanks <a class="reference external" href="https://twitter.com/Joushou">Kenny Levinsen</a> and Amaury for implementing<br> +<tt class="docutils literal"><span class="pre">int.{to,from}_bytes</span></tt>)</li> +<li>We now have a <tt class="docutils literal">_posixsubprocess</tt> module</li> +<li>More encoding related fixes, which affect many failing tests</li> +<li><tt class="docutils literal">_sre</tt> was updated and now <tt class="docutils literal">test_re</tt> almost passes</li> +<li>Exception behavior is almost complete per the Python 3 specs, what's mostly<br> +missing now are the new <tt class="docutils literal">__context__</tt> and <tt class="docutils literal">__traceback__</tt> attributes (<a class="reference external" href="https://www.python.org/dev/peps/pep-3134/">PEP<br> +3134</a>)</li> +<li>Fixed some crashes and deadlocks occurring during the regression tests</li> +<li>We merged the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22unicode-strategies%22%29">unicode-strategies</a> branch both to default and to py3k: now we<br> +have versions of lists, dictionaries and sets specialized for unicode<br> +elements, as we already had for strings.</li> +<li>However, string-specialized containers are still faster in some cases<br> +because there are shortcuts which have not been implemented for unicode yet<br> +(e.g., constructing a set of strings from a list of strings). 
The plan is to<br> +completely kill the shortcuts and improve the JIT to produce the fast<br> +version automatically for both the string and unicode versions, to have a<br> +more maintainable codebase without sacrificing the speed. The <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22autoreds%22%29">autoreds</a><br> +branch (already merged) was a first step in this direction.</li> +</ul><p>cheers,<br> +Philip&amp;Antonio</p>pypy3https://www.pypy.org/posts/2012/12/py3k-status-update-8-3932232806458251730.htmlTue, 04 Dec 2012 22:30:00 GMT \ No newline at end of file diff --git a/authors/richard-plangger.html b/authors/richard-plangger.html new file mode 100644 index 000000000..faec0a433 --- /dev/null +++ b/authors/richard-plangger.html @@ -0,0 +1,125 @@ + + + + + +Posts by Richard Plangger | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/richard-plangger.xml b/authors/richard-plangger.xml new file mode 100644 index 000000000..068d54252 --- /dev/null +++ b/authors/richard-plangger.xml @@ -0,0 +1,404 @@ + +PyPy (Posts by Richard Plangger)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssNative profiling in VMProfhttps://www.pypy.org/posts/2017/04/native-profiling-in-vmprof-6949065546884243105.htmlRichard Plangger<p>We are happy to announce a new release for the PyPI package <span>vmprof</span>.<br> +It is now able to capture native stack frames on Linux and Mac OS X to show you bottle necks in compiled code (such as CFFI modules, Cython or C Python extensions). It supports PyPy, CPython versions 2.7, 3.4, 3.5 and 3.6. Special thanks to Jetbrains for funding the native profiling support.<br> +<br> +</p><div class="separator" style="clear: both; text-align: center;"> +<a href="https://3.bp.blogspot.com/-94RAR1lkAP8/WNmQn-kpLhI/AAAAAAAAAqE/RXg6T4hptnQtH-8fdi87yh_BI37eN6COQCLcB/s1600/vmprof-logo.png" style="margin-left: 1em; margin-right: 1em;"><img alt="vmprof logo" border="0" src="https://3.bp.blogspot.com/-94RAR1lkAP8/WNmQn-kpLhI/AAAAAAAAAqE/RXg6T4hptnQtH-8fdi87yh_BI37eN6COQCLcB/s1600/vmprof-logo.png" title="vmprof logo"></a></div> +<div class="separator" style="clear: both; text-align: center;"> +</div> +<br> +<span style="font-size: large;">What is vmprof?</span><br> +<br> +<span style="font-size: large;"><span style="font-size: small;">If you have already worked with vmprof you can skip the next two section. If not, here is a short introduction:</span></span><br> +<br> +<span style="font-size: large;"><span style="font-size: small;">The goal of vmprof package is to give you more insight into your program. It is a statistical profiler. Another prominent profiler you might already have worked with is cProfile. 
It is bundled with the Python standard library.</span></span><br> +<br> +<span style="font-size: large;"><span style="font-size: small;">vmprof's distinct feature (from most other profilers) is that it does not significantly slow down your program execution. The employed strategy is statistical, rather than deterministic. Not every function call is intercepted, but it samples stack traces and memory usage at a configured sample rate (usually around 100 Hz). You can imagine that this creates a lot less contention than doing work before and after each function call.</span></span><br> +<br> +<span style="font-size: large;"><span style="font-size: small;">As mentioned earlier cProfile gives you a complete profile, but it needs to intercept every function call (it is a deterministic profiler). Usually this means that you have to capture and record every function call, but this takes a significant amount of time.</span></span><br> +<span style="font-size: large;"><span style="font-size: small;"><br> +</span></span> <span style="font-size: large;"><span style="font-size: small;">The overhead vmprof consumes is roughly 3-4% of your total program runtime or even less if you reduce the sampling frequency. Indeed it lets you sample and inspect much larger programs. If you failed to profile a large application with cProfile, please give vmprof a shot.</span></span><br> +<br> +<span style="font-size: large;"><span style="font-size: small;"><span style="font-size: large;">vmprof.com or PyCharm</span></span></span><br> +<br> +<div> +<div> +There are two major alternatives to the command-line tools shipped with vmprof:</div> +<ul> +<li>A web service on <a href="https://vmprof.com/">vmprof.com</a></li> +<li>PyCharm Professional Edition </li> +</ul> +<div> +While the command line tool is only good for quick inspections, <a href="https://vmprof.com/">vmprof.com</a> + and PyCharm complement each other providing deeper insight into your +program. 
With PyCharm you can view the per-line profiling results inside + the editor. With the <a href="https://vmprof.com/">vmprof.com</a> you get a handy visualization of the profiling results as a flame chart and memory usage graph.</div> +</div> +<div> +<br></div> +<div> +<div> +Since the PyPy Team runs and maintains the service on <a href="https://vmprof.com/">vmprof.com</a> (which is by the way free and open-source), I’ll explain some more details here. On <a href="https://vmprof.com/">vmprof.com</a> you can inspect the generated profile interactively instead of looking at console output. What is sent to <a href="https://vmprof.com/">vmprof.com</a>? You can find details <a href="https://vmprof.readthedocs.io/en/latest/data.html" target="_blank">here</a>.</div> +</div> +<br><span style="font-size: large;"><span style="font-size: small;"><b>Flamegraph</b>: </span></span>Accumulates and displays the most frequent codepaths. It allows you to quickly and accurately identify hot spots in your code. 
The flame graph below is a very short run of richards.py (Thus it shows a lot of time spent in PyPy's JIT compiler).<br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-n5LoH2hf7qI/WNvtNvIAbsI/AAAAAAAAAqc/zn0AXv8fkzIMQXWUwMLtLFpjochspz5MwCLcB/s1600/flamegraph.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="231" src="https://4.bp.blogspot.com/-n5LoH2hf7qI/WNvtNvIAbsI/AAAAAAAAAqc/zn0AXv8fkzIMQXWUwMLtLFpjochspz5MwCLcB/s400/flamegraph.png" width="400"></a></div> +<br> +<br> +<b>List all functions (optionally sorted)</b>: the equivalent of the vmprof command line output in the web.<br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://3.bp.blogspot.com/-zzAmBuf-3KM/WNvtNze_sZI/AAAAAAAAAqg/9u4Kxv_OzMsTV7KgRx9PvXGHOAPdfXYUgCLcB/s1600/list-of-functions.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="215" src="https://3.bp.blogspot.com/-zzAmBuf-3KM/WNvtNze_sZI/AAAAAAAAAqg/9u4Kxv_OzMsTV7KgRx9PvXGHOAPdfXYUgCLcB/s400/list-of-functions.png" width="400"></a></div> +<br> + <b>Memory curve</b>: A line plot that shows how how many MBytes have been consumed over the lifetime of your program (see more info in the section below).<br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://cloud.githubusercontent.com/assets/175722/17400119/70d43a84-5a46-11e6-974b-913cfa22a531.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="187" src="https://cloud.githubusercontent.com/assets/175722/17400119/70d43a84-5a46-11e6-974b-913cfa22a531.png" width="400"></a></div> +<span style="font-size: large;"><span style="font-size: small;"><span style="font-size: large;">Native programs</span></span></span><br> +<span style="font-size: large;"><span style="font-size: small;"></span></span><br> +<span style="font-size: large;"><span style="font-size: small;">The new feature introduced in vmprof 
0.4.x allows you to look beyond the Python level. As you might know, Python maintains a stack of frames to save the execution. Up to now the vmprof profiles only contained that level of information. But what if your program jumps to native code (such as calling gzip compression on a large file)? Up to now you would not see that information.</span></span><br> +<span style="font-size: large;"><span style="font-size: small;"><br></span></span> +<span style="font-size: large;"><span style="font-size: small;">Many packages make use of the CPython C API (which we discourage; please look up <a href="https://cffi.readthedocs.org/" target="_blank">cffi</a> for a better way to call C). Have you ever had the issue that you know that your performance problems reach down to native code, but you could not profile it properly?<b> Now you can!</b></span></span><br> +<span style="font-size: large;"><span style="font-size: small;"><br> +</span></span> <span style="font-size: large;"><span style="font-size: small;">Let's inspect a very simple Python program to find out why a program is significantly slower on Linux than on Mac:</span></span><br> +<br> +<span style="font-size: large;"><span style="font-size: small;"><span>import numpy as np<br> +n = 1000<br> +a = np.random.random((n, n))<br> +b = np.random.random((n, n))<br> +c = np.dot(np.abs(a), b)</span><br> +</span></span><br> +<br> +Take two NxN random matrix objects and create a dot product. 
The first argument to the dot product provides the absolute value of the random matrix.<br> +<br> +<table border="1" style="border: 1px solid silver;"><tbody> +<tr><td>Run</td><td>Python</td><td>NumPy</td><td>OS</td><td>n=...</td> <td>Took</td> </tr> +<tr> <td>[1]</td><td>CPython 3.5.2</td><td>NumPy 1.12.1</td><td>Mac OS X, 10.12.3</td><td>n=5000</td><td>~9 sec</td></tr> +<tr> <td>[2]</td><td>CPython 3.6.0</td><td>NumPy 1.12.1</td><td>Linux 64, Kernel 4.9.14</td><td>n=1000</td><td>~26 sec</td></tr> +</tbody></table> +<br> +Note that the Linux machine operates on a 5 times smaller matrix, still it takes much longer. What is wrong? Is Linux slow? CPython 3.6.0? Well no, let's inspect <a href="https://vmprof.com/#/567aa150-5927-4867-b22d-dbb67ac824ac" target="_blank">[1]</a> and <a href="https://vmprof.com/#/097fded2-b350-4d68-ae93-7956cd10150c" target="_blank">[2]</a> (shown below in that order).<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://3.bp.blogspot.com/-WF-JpMQhJaI/WNvx8CPNpTI/AAAAAAAAAqw/ixZpWng6TDc4kIlEHu9zhqrNX4tx0S4rgCLcB/s1600/macosx-profile-blog.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="105" src="https://3.bp.blogspot.com/-WF-JpMQhJaI/WNvx8CPNpTI/AAAAAAAAAqw/ixZpWng6TDc4kIlEHu9zhqrNX4tx0S4rgCLcB/s400/macosx-profile-blog.png" width="400"></a></div> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-gjM2uj5Ko_E/WNvx73qcXEI/AAAAAAAAAqs/cMvDfcHQ2eAti4BRU0ldwGQ5M-1_TQ2FACEw/s1600/linux-blog.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="113" src="https://1.bp.blogspot.com/-gjM2uj5Ko_E/WNvx73qcXEI/AAAAAAAAAqs/cMvDfcHQ2eAti4BRU0ldwGQ5M-1_TQ2FACEw/s400/linux-blog.png" width="400"></a></div> +<br> +<a href="https://vmprof.com/#/097fded2-b350-4d68-ae93-7956cd10150c" target="_blank">[2]</a> runs on Linux, spends nearly all of the time in PyArray_MatrixProduct2, if you compare to <a 
href="https://vmprof.com/#/567aa150-5927-4867-b22d-dbb67ac824ac" target="_blank">[1]</a> on Mac OS X, you'll see that a lot of time is spent in generating the random numbers and the rest in cblas_matrixproduct.<br> +<br> +Blas has a very efficient implementation so you can achieve the same on Linux if you install a blas implementation (such as openblas).<br> +<br> +Usually you can spot potential program source locations that take a lot of time and might be the first starting point to resolve performance issues.<br> +<br> +<span style="font-size: large;">Beyond Python programs </span><br> +<br> +It is not unthinkable that the strategy can be reused for native programs. Indeed this can already be done by creating a small cffi wrapper around an entry point of a compiled C program. It would even work for programs compiled from other languages (e.g. C++ or Fortran). The resulting function names are the full symbol name embedded into either the executable symboltable or extracted from the dwarf debugging information. Most of those will be compiler specific and contain some cryptic information.<br> +<br> +<span style="font-size: large;">Memory profiling</span><br> +We thankfully received a code contribution from the company Blue Yonder. They have built a memory profiler (for Linux and Mac OS X) on top of vmprof.com that displays the memory consumption for the runtime of your process.<br> +<br> +You can run it the following way:<br> +<br> +<span>$ python -m vmprof --mem --web script.py</span><br> +<br> +By adding --mem, vmprof will capture memory information and display it in the dedicated view on vmprof.com. 
You can open that view by clicking the 'Memory' switch in the flamegraph view.<br> +<br> +<span style="font-size: large;">There is more</span><br> +<br> +Some more minor highlights contained in 0.4.x:<br> +<ul> +<li>VMProf support for Windows 64 bit (No native profiling)</li> +<li>VMProf can read profiles generated by another host system</li> +<li>VMProf is now bundled in several binary wheels for fast and easy installation (Mac OS X, Linux 32/64 for CPython 2.7, 3.4, 3.5, 3.6)</li> +</ul> +<span style="font-size: large;">Future plans - Profile Streaming</span><br> +<br> +vmprof has not reached the end of development. There are many features we could implement. But there is one feature that could be a great asset to many Python developers.<br> +<br> +Continuous delivery of your statistical profile, or in short, profile streaming. One of the great strengths of vmprof is that it incurs very little overhead. It is not a crazy idea to run this in production.<br> +<br> +It would require a smart way to stream the profile in the background to vmprof.com and new visualizations to look at much more data your Python service produces.<br> +<br> +If that sounds like a solid vmprof improvement, don't hesitate to get in touch with us (e.g. IRC #pypy, mailing list pypy-dev, or comment below)<br> +<br> +<span style="font-size: large;">You can help! </span><br> +<br> +There are some immediate things other people could help with. Either by donating time or money (yes we have occasional contributors which is great)!<br> +<ul> +<li>We gladly received code contribution for the memory profiler. But there was not enough time to finish the migration completely. Sadly it is a bit brittle right now.</li> +<li>We would like to spend more time on other visualizations. This should include to give a much better user experience on vmprof.com (like a tutorial that explains the visualization that we already have). 
</li> +<li>Build Windows 32/64 bit wheels (for all CPython versions we currently support)</li> +</ul> +We are also happy to accept google summer of code projects on vmprof for new visualizations and other improvements. If you qualify and are interested, don't hesitate to ask!<br> +<br> +Richard Plangger (plan_rich) and the PyPy Team<br> +<br> +[1] Mac OS X <a href="https://vmprof.com/#/567aa150-5927-4867-b22d-dbb67ac824ac">https://vmprof.com/#/567aa150-5927-4867-b22d-dbb67ac824ac</a><br> +[2] Linux64 <a href="https://vmprof.com/#/097fded2-b350-4d68-ae93-7956cd10150c">https://vmprof.com/#/097fded2-b350-4d68-ae93-7956cd10150c</a>https://www.pypy.org/posts/2017/04/native-profiling-in-vmprof-6949065546884243105.htmlSat, 01 Apr 2017 14:17:00 GMTLeysin Winter Sprint Summaryhttps://www.pypy.org/posts/2017/03/leysin-winter-sprint-summary-4587213628578490701.htmlRichard Plangger<span class="">Today + is the last day of our yearly sprint event in Leysin. We had lots of +ideas on how to enhance the current state of PyPy, we went skiing and +had interesting discussions around virtual machines, the Python +ecosystem, and other real world problems.</span><div class="ace-line" id="magicdomid662"> +<span class=""> </span></div> +<div class="ace-line" id="magicdomid664"> +<h2> +<span class="">Why don't you join us next time?</span></h2> +</div> +<div class="ace-line" id="magicdomid667"> +<span class="">A usual PyPy sprints day goes through the following stages:</span></div> +<div class="ace-line" id="magicdomid668"> +<br></div> +<div class="ace-line" id="magicdomid669"> +<ol> +<li><span class=""> <b>Planning Session:</b> Tasks from previous days that have seen progress or +are completed are noted in a shared document. Everyone adds new tasks +and then assigns themselves to one or more tasks (usually in pairs). 
As +soon as everybody is happy with their task and has a partner to work +with, the planning session is concluded and the work can start.</span></li> +<li><b>Discussions</b><span class=""><b>:</b> A sprint is a good occasion to discuss difficult +and important topics in person. We usually sit down in a separate area +in the sprint room and discuss until a) nobody wants to discuss anymore +or b) we found a solution to the problem. The good thing is that usually +the outcome is b).</span></li> +<li><b><span class=""></span></b><span class=""><b>Lunch:</b> For lunch we prepare sandwiches and other finger food.</span></li> +<li><span class=""></span><span class=""><b>Continue working</b> until dinner, which we eat at a random restaurant in Leysin.</span></li> +<li><span class=""></span><b>Goto 1 the next day, </b>if the sprint has not ended.</li> +</ol> +</div> +<div class="ace-line" id="magicdomid677"> +<span class="">Sprints + are open to everybody and help newcomers to get started with PyPy (we usually + pair you with a developer familiar with PyPy). They are perfect to +discuss and find solutions to problems we currently face. 
If you are +eager to join next year, please don't hesitate to register next year +around January.</span></div> +<div class="ace-line" id="magicdomid677"> +<span class=""> </span></div> +<div class="ace-line" id="magicdomid679"> +<h2> +<span class="">Sprint Summary  </span><span class=""> </span></h2> +<span class="">Sprint goals included to work on the following topics: </span></div> +<div class="ace-line" id="magicdomid682"> +<ul> +<li><span class="">Work towards releasing PyPy 3.5 (it will be released soon)</span></li> +<li><span class=""></span><span class="">CPython Exten</span><span class="author-g-s5b0glqpe67hakm8">s</span><span class="">ion (CPyExt) modules on PyPy</span></li> +<li><span class="">Have fun in winter sports (a side goal)</span></li> +</ul> +</div> +<div class="ace-line" id="magicdomid1290"> +<h3> +<span class="">Highlights</span></h3> +<h3> +</h3> +<h3> +<span class=""></span></h3> +</div> +<div class="ace-line" id="magicdomid7007"> +<ul> +<li><span class=""></span><span class="">We have spent lots of time debugging and fixing memory issues on CPyExt.</span><span class="author-g-fhxuve7d2vlo71s1"> + In particular, we fixed a serious memory leak where taking a memoryview + would prevent numpy arrays from ever being freed. More work is still required to ensure that our GC always releases arrays in a timely +manner.</span></li> +<li><span class=""></span><span class="">Fruitful discussions and progress </span><span class="author-g-s5b0glqpe67hakm8">about </span><span class="">how </span><span class="author-g-fhxuve7d2vlo71s1">to</span><span class=""> flesh out some details about</span><span class="author-g-s5b0glqpe67hakm8"> the</span><span class=""> unicode representation in PyPy. 
Our current goal is to use utf-8 as</span><span class="author-g-s5b0glqpe67hakm8"> the</span><span class=""> unicode representation internally and have fast vectorized </span><span class="author-g-s5b0glqpe67hakm8">operations</span><span class=""> (indexing, check if valid, ...).</span></li> +<li><span class=""></span><span class="">PyPy will participate in GSoC 2017 and we will try to allocate more resources to that than last year.</span></li> +<li><span class=""></span><span class="">Profile and think about some details how to reduce the starting size of the interpreter. The starting point would be to </span><span class="author-g-s5b0glqpe67hakm8">look at</span><span class=""> the parser and reduce the amount of strings to ke</span><span class="author-g-fhxuve7d2vlo71s1">ep</span><span class=""> alive.</span></li> +<li><span class=""><span class="">Found a topic for a student's master thesis</span><span class="author-g-fhxuve7d2vlo71s1">:</span><span class=""> correctly free</span><span class="author-g-fhxuve7d2vlo71s1">ing</span><span class=""> cpyext reference cycles.</span></span></li> +<li><span class=""><span class=""><span class="">Run lots of Python3 code on top of PyPy3 and resolve issues we found along the way.</span></span></span></li> +<li><span class=""><span class=""><span class=""></span></span></span><span class="">Initial work on making RPython thread-safe without a GIL.</span></li> +</ul> +</div> +<div class="ace-line" id="magicdomid7022"> +<h3> +<span class="">List of attendees</span></h3> +</div> +<div class="ace-line" id="magicdomid7025"> +<span class="">- Stefan Beyer</span></div> +<div class="ace-line" id="magicdomid7026"> +<span class="">- Antonio Cuni</span></div> +<div class="ace-line" id="magicdomid7027"> +<span class="">- Maciej Fijalkowski</span></div> +<div class="ace-line" id="magicdomid7028"> +<span class="">- Manuel Jacob</span></div> +<div class="ace-line" id="magicdomid7029"> +<span class="">- Ronan Lamy</span></div> +<div 
class="ace-line" id="magicdomid7030"> +<span class="">- Remi Meier</span></div> +<div class="ace-line" id="magicdomid7031"> +<span class="">- Richard Plangger</span></div> +<div class="ace-line" id="magicdomid7032"> +<span class="">- Armin Rigo</span></div> +<div class="ace-line" id="magicdomid7033"> +<span class="">- Robert Zaremba</span></div> +<div class="ace-line" id="magicdomid7033"> +<span class=""> </span></div> +<div class="ace-line" id="magicdomid7033"> +<span class=""><a href="https://1.bp.blogspot.com/-DXQBN6Crkkg/WLqLRCyUQoI/AAAAAAAAAoI/YJkwrYbCX1Y3fS97pcrzx1DLAAbEWSK5wCK4B/s1600/C5r2PnqXQAIHFew.jpg"><img border="0" height="221" src="https://1.bp.blogspot.com/-DXQBN6Crkkg/WLqLRCyUQoI/AAAAAAAAAoI/YJkwrYbCX1Y3fS97pcrzx1DLAAbEWSK5wCK4B/s400/C5r2PnqXQAIHFew.jpg" width="400"></a>  </span></div> +<div class="ace-line" id="magicdomid7033"> +<span class=""><br></span></div> +<div class="ace-line" id="magicdomid7033"> +<div class="ace-line" id="magicdomid7035"> +<a href="https://2.bp.blogspot.com/-GW3_ikXNLnM/WLqLfyj2F1I/AAAAAAAAAoY/rhzM8oMWHCYyIJoZK30GcDCOg8XqIZXagCK4B/s1600/C52T_j3XQAACB0f.jpg%253Alarge.jpg"><img border="0" height="150" src="https://2.bp.blogspot.com/-GW3_ikXNLnM/WLqLfyj2F1I/AAAAAAAAAoY/rhzM8oMWHCYyIJoZK30GcDCOg8XqIZXagCK4B/s200/C52T_j3XQAACB0f.jpg%253Alarge.jpg" width="200"></a><a href="https://2.bp.blogspot.com/-pr6jpOXCwOA/WLqLig_veDI/AAAAAAAAAog/bDmzrAyn4k8xserwrxB8TpGt8WK49LxDACK4B/s1600/C6AjnJHWAAUogZ-.jpg"><img border="0" height="150" src="https://2.bp.blogspot.com/-pr6jpOXCwOA/WLqLig_veDI/AAAAAAAAAog/bDmzrAyn4k8xserwrxB8TpGt8WK49LxDACK4B/s200/C6AjnJHWAAUogZ-.jpg" width="200"></a><a href="https://3.bp.blogspot.com/-Wy9pnXLtepg/WLqOnQR9gcI/AAAAAAAAApA/KaFuiYY1oPAZtqAOK10-lqdS8BNk9v7NgCK4B/s1600/IMG_20170302_173339.jpg"><img border="0" height="150" src="https://3.bp.blogspot.com/-Wy9pnXLtepg/WLqOnQR9gcI/AAAAAAAAApA/KaFuiYY1oPAZtqAOK10-lqdS8BNk9v7NgCK4B/s200/IMG_20170302_173339.jpg" width="200"></a><span class=""></span><span class=""><a 
href="https://4.bp.blogspot.com/-XsqEquoT_CE/WLqPlTTeP0I/AAAAAAAAApc/U40smIsLpw4I5HqF5-xWfEPTzq_HsSr2QCK4B/s1600/C5wDgCdXQAAedGq.jpg%253Alarge.jpg"><img border="0" height="133" src="https://4.bp.blogspot.com/-XsqEquoT_CE/WLqPlTTeP0I/AAAAAAAAApc/U40smIsLpw4I5HqF5-xWfEPTzq_HsSr2QCK4B/s200/C5wDgCdXQAAedGq.jpg%253Alarge.jpg" width="200"></a></span></div> +<div class="ace-line" id="magicdomid7035"> +<span class=""><br></span></div> +<div class="ace-line" id="magicdomid7035"> +<span class="">We + would like to thank our donors for the continued support of the PyPy +project and we are looking forward to next year's sprint in Leysin.</span></div> +</div> +<div class="ace-line" id="magicdomid7036"> +<br></div> +<div class="ace-line" id="magicdomid7037"> +<span class="">The PyPy Team</span></div> +<div class="" id="magicdomid210"> +<br></div> +<br> +<br> +<br> +<br>https://www.pypy.org/posts/2017/03/leysin-winter-sprint-summary-4587213628578490701.htmlSat, 04 Mar 2017 09:59:00 GMTVectorization extended. PowerPC and s390xhttps://www.pypy.org/posts/2016/11/vectorization-extended-powerpc-and-s390x-4042433015460084057.htmlRichard Plangger<div dir="ltr" style="text-align: left;"> +We are happy to announce that JIT support in both the PowerPC backend and the<br> +s390x backend has been enhanced. Both can now vectorize loops via SIMD<br> +instructions. Special thanks to IBM for funding this work.<br> +<br> +If you are not familiar with this topic you can read more details <a href="https://pypyvecopt.blogspot.co.at/">here</a>.<br> +<div> +<br> +There are many more enhancements under the hood. Most notably, all pure operations are now delayed until the latest possible point. In some cases indices have been calculated more than once or they needed an additional register, because the old value is still used. 
Additionally it is now possible to load quadword-aligned memory in both PPC and s390x (x86 currently cannot do that).<br> +<h3> +<span style="font-size: large;">NumPy &amp; CPyExt</span></h3> +The community and core developers have been moving CPyExt towards a complete, but emulated, layer for CPython C extensions. This is great, because the one restriction preventing the wider deployment of PyPy in several scenarios will hopefully be removed. However, we advocate not to use CPyExt, but rather to not write C code at all (let PyPy speed up your Python code) or use <a href="https://cffi.readthedocs.io/en/latest/">cffi</a>.<br> +<br> +The work done here to support vectorization helps<i> micronumpy </i>(NumPyPy) to speed up operations for PPC and s390x. So why is PyPy supporting both NumPyPy and NumPy, do we actually need both? Yes, there are places where gcc can beat the JIT, and places where the tight integration between NumPyPy and PyPy is more performant. We do have plans to integrate both, hijacking the C-extension method calls to use NumPyPy where we know NumPyPy can be faster.<br> +<br> +Just to give you an idea why this is a benefit:<br> +<br> +NumPy arrays can carry custom dtypes and apply user defined python functions on the arrays. How could one optimize this kind of scenario? In a traditional setup, you cannot. But as soon as NumPyPy is turned on, you can suddenly JIT compile this code and vectorize it.<br> +<br> +Another example is element access that occurs frequently, or any other calls that cross between Python and the C level frequently.<br> +<h3> +<span style="font-size: large;">Benchmarks</span></h3> +Let's have a look at some benchmarks reusing <a href="https://bitbucket.org/mikefc/numpy-benchmark/src" target="_blank">mikefc's numpy benchmark suite</a> (find the forked version <a href="https://bitbucket.org/plan_rich/numpy-benchmark" target="_blank">here</a>). 
<span style="white-space: pre-wrap;">I only ran a subset of microbenchmarks, showing that the core functionality is</span><br> +<span style="white-space: pre-wrap;">functioning properly. </span>Additionally it has been rewritten to use <a href="https://perf.readthedocs.io/en/latest/"><i>perf</i></a> instead of the <i>timeit</i> stdlib module.<br> +<br> +<h2> +<span style="font-size: small;">Setup</span></h2> +x86 runs on an Intel i7-2600 clocked at 3.40GHz using 4 cores. PowerPC runs on the Power 8 clocked at 3.425GHz providing 160 cores. Last but not least the mainframe machine is clocked at up to 4 GHz, but fully virtualized (as it is common for such machines). Note that PowerPC is a non private remote machine. It is used by many users and it is crowded with processes. It is hard to extract a stable benchmark there.<br> +<br> +x86 ran on Fedora 24 (kernel version of 4.8.4), PPC ran on Fedora 21 (kernel version 3.17.4) and s390x ran on Redhat Linux 7.2 (kernel version 3.10.0). Respectively, numpy on cpython had openblas available on x86, no blas implementation was present on s390x and PPC provided blas and lapack.<br> +<br> +As you can see all machines run very different configurations. 
It does not make sense to compare across platforms, but rather implementations on the same platform.<br> +<br> +<br> +<a href="https://4.bp.blogspot.com/-SLgW88U0Bek/WBhPjeuohdI/AAAAAAAAAkY/c5VgHxLjVaoWHIqf6zj65eBQeAefv5HPgCK4B/s1600/vecopt-x86-sse4.png"><img border="0" height="388" src="https://4.bp.blogspot.com/-SLgW88U0Bek/WBhPjeuohdI/AAAAAAAAAkY/c5VgHxLjVaoWHIqf6zj65eBQeAefv5HPgCK4B/s640/vecopt-x86-sse4.png" width="640"></a><br> +<br> +<a href="https://1.bp.blogspot.com/-z8V9bUPw_BY/WBhPoQvdZ2I/AAAAAAAAAkg/n5IoXwIRnIwaNvOcb8S4S7-Iw455_dFGgCK4B/s1600/vecopt-ppc64le.png"></a><a href="https://4.bp.blogspot.com/-b8xDL8pO4q4/WBhPqZW0wRI/AAAAAAAAAko/ZRyZpD4GP9IF6fbT4ngUfWmEJcQ536uZQCK4B/s1600/vecopt-s390x.png"><img border="0" height="390" src="https://4.bp.blogspot.com/-b8xDL8pO4q4/WBhPqZW0wRI/AAAAAAAAAko/ZRyZpD4GP9IF6fbT4ngUfWmEJcQ536uZQCK4B/s640/vecopt-s390x.png" width="640"></a><img border="0" height="396" src="https://1.bp.blogspot.com/-z8V9bUPw_BY/WBhPoQvdZ2I/AAAAAAAAAkg/n5IoXwIRnIwaNvOcb8S4S7-Iw455_dFGgCK4B/s640/vecopt-ppc64le.png" width="640"><br> +<br> +<br> +Blue shows CPython 2.7.10+ available on that platform using the latest NumPy (1.11). Micro NumPy is used for PyPy. PyPy+ indicates that the vectorization optimization is turned on.<br> +All bar charts show the median value of all runs (5 samples, 100 loops, 10 inner loops, for the operations on vectors (not matrices) the loops are set to 1000). PyPy additionally gets 3 extra executions to warmup the JIT.<br> +<br> +The comparison is really comparing speed of machine code. It compares the PyPy's JIT output vs GCC's output. It has little to do with the speed of the interpreter.<br> +<br> +Both new SIMD backends speedup the numeric kernels. Some times it is near to the speed of CPython, some times it is faster. The maximum parallelism very much depends on the extension emitted by the compiler. All three SIMD backends have the same vector register size (which is 128 bit). 
This means that all three behave similarly but ppc and s390x gain more because they can load 128bit of memory from quadword aligned memory.<br> +<br> +<h2> +Future directions</h2> +Python is achieving rapid adoption in data science. This is currently a trend emerging in Europe, and Python is already heavily used for data science in the USA and many other places around the world.<br> +<br> +<br> +PyPy can make a valuable contribution for data scientists, helping them to rapidly write scientific programs in Python and run them at near native speed. If you happen to be in that situation, we are eager to hear your feedback or resolve your issues and also work together to improve the performance of your<br> +code. Just get in touch!<br> +<br> +<br> +Richard Plangger (plan_rich) and the PyPy team</div> +</div>https://www.pypy.org/posts/2016/11/vectorization-extended-powerpc-and-s390x-4042433015460084057.htmlThu, 03 Nov 2016 08:35:00 GMTPyPy Enterprise Editionhttps://www.pypy.org/posts/2016/04/pypy-enterprise-edition-3688275697656890948.htmlRichard Plangger<p>With the latest additions, PyPy's JIT now supports the Z architecture on Linux. The newest architecture revision (also known as s390x, or colloquially referred to as "<a href="https://en.wikipedia.org/wiki/Big_iron" title="Big iron">big iron</a>") is the 64-bit extension for IBM mainframes. Currently only Linux 64 bit is supported (not z/OS nor TPF).<br> +This is the fourth assembler backend supported by PyPy in addition to x86 (32 and 64), ARM (32-bit only) and PPC64 (both little- and big-endian). It might seem that we kind of get a hang of new architectures. Thanks to IBM for funding this work!<br> +<br> +</p><h2> +<span style="font-size: large;">History</span> </h2> +When I went to university one lecture covered the prediction of Thomas Watson in 1943. His famous quote "I think there is a world market for maybe five computers ...", turned out not to be true. 
<br> +<br> +However, even 70 years later, mainframes are used more often than you think. They back critical tasks requiring a high level of stability/security and offer high hardware and computational utilization rates by virtualization.<br> +<br> +With the new PyPy JIT backend we are happy to present a fast Python virtual machine for mainframes and contribute more free software running on s390x.<br> +<br> +<h2> +<span style="font-size: large;">Meta tracing</span></h2> +Even though the JIT backend has been tested on PyPy, it is not restricted to  the Python programming language. Do you have a great idea for a DSL, or another language that should run on mainframes? Go ahead and just implement your interpreter using RPython.<br> +<br> +<h2> +<span style="font-size: large;">How do I get a copy?</span></h2> +PyPy can be built using the usual instructions found <a href="https://pypy.org/download.html#building-from-source" target="_blank">here</a>. As soon as the next PyPy version has been released we will provide binaries. Until then you can just grab a nightly <a href="https://buildbot.pypy.org/nightly/" target="_blank">here</a>.We are currently busy to get the next version of PyPy ready, so an official release will be rolled out soon.<br> +<br> +<h2> +<span style="font-size: large;"><b>Comparing s390x to x86</b></span></h2> +The goal of this comparison is not to scientifically evaluate the benefits/disadvantages on s390x, but rather to see that PyPy's architecture delivers the same benefits as it does on other platforms. Similar to the comparison done for PPC I ran the benchmarks using the same setup. The first column is the speedup of the PyPy JIT VM compared to the speedup of a pure PyPy interpreter<span style="font-size: xx-small;"> 1)</span>. 
Note that the s390x's OS was virtualized.<br> +<span><br></span> +<span>  Label               x86     s390x      s390x (run 2)</span><br> +<br> +<span style="font-size: small;"><span>  ai                 13.7      12.4       11.9<br>  bm_chameleon        8.5       6.3        6.8<br>  bm_dulwich_log      5.1       5.0        5.1<br>  bm_krakatau         5.5       2.0        2.0<br>  bm_mako             8.4       5.8        5.9<br>  bm_mdp              2.0       3.8        3.8<br>  chaos              56.9      52.6       53.4<br>  crypto_pyaes       62.5      64.2       64.2<br>  deltablue           3.3       3.9        3.6<br>  django             28.8      22.6       21.7<br>  eparse              2.3       2.5        2.6<br>  fannkuch            9.1       9.9       10.1<br>  float              13.8      12.8       13.8<br>  genshi_text        16.4      10.5       10.9<br>  genshi_xml          8.2       7.9        8.2<br>  go                  6.7       6.2       11.2<br>  hexiom2            24.3      23.8       23.5<br>  html5lib            5.4       5.8        5.7<br>  json_bench         28.8      27.8       28.1<br>  meteor-contest      5.1       4.2        4.4<br>  nbody_modified     20.6      19.3       19.4<br>  pidigits            1.0      -1.1       -1.0<br>  pyflate-fast        9.0       8.7        8.5<br>  pypy_interp         3.3       4.2        4.4<br>  raytrace-simple    69.0     100.9       93.4<br>  richards           94.1      96.6       84.3<br>  rietveld            3.2       2.5        2.7<br>  slowspitfire        2.8       3.3        4.2<br>  spambayes           5.0       4.8        4.8<br>  spectral-norm      41.9      39.8       42.6<br>  spitfire            3.8       3.9        4.3<br>  spitfire_cstringio  7.6       7.9        8.2<br>  sympy_expand        2.9       1.8        1.8<br>  sympy_integrate     4.3       3.9        4.0<br>  sympy_str           1.5       1.3        1.3<br>  sympy_sum           6.2       5.8        5.9<br>  telco          
    61.2      48.5       54.8<br>  twisted_iteration  55.5      41.9       43.8<br>  twisted_names       8.2       9.3        9.7<br>  twisted_pb         12.1      10.4       10.2<br>  twisted_tcp         4.9       4.8        5.2</span></span><br> +<span style="font-size: small;"><span><br><b>  Geometric mean:    9.31      9.10       9.43</b></span></span><br> +<br> +As you can see the benefits are comparable on both platforms.<br> +Of course this is scientifically not good enough, but it shows a tendency. s390x can achieve the same results as you can get on x86. <br> +<br> +Are you running your business application on a mainframe? We would love to get some feedback. Join us in IRC tell us if PyPy made your application faster! <br> +<br> +plan_rich &amp; the PyPy Team<br> +<br> +<span style="font-size: xx-small;">1) PyPy revision for the benchmarks: 4b386bcfee54</span>https://www.pypy.org/posts/2016/04/pypy-enterprise-edition-3688275697656890948.htmlMon, 18 Apr 2016 10:13:00 GMTAutomatic SIMD vectorization support in PyPyhttps://www.pypy.org/posts/2015/10/automatic-simd-vectorization-support-in-639063580401330508.htmlRichard Plangger<div dir="ltr" style="text-align: left;"> +Hi everyone,<br> +<br> +it took some time to catch up with the JIT refacrtorings merged in this summer. <span style="font-size: small;">But, (drums) we are happy to announce that:</span><br> +<br> +<h2 style="text-align: center;"> +<span style="font-size: large;">The next release of PyPy,  "PyPy 4.0.0", will ship the new auto vectorizer</span></h2> +<span style="font-size: small;">The goal of this project was to increase the speed of numerical applications in both the NumPyPy library and for arbitrary Python programs. In PyPy we have focused a lot on improvements in the 'typical python workload', which usually involves object and string manipulations, mostly for web development. 
We're hoping with this work that we'll continue improving the other very important Python use case - numerics.</span><br> +<br> +<h2> +<span style="font-size: small;"><span style="font-size: large;">What it can do!</span> </span></h2> +<span style="font-size: small;">It targets numerics only. It +will not execute object manipulations faster, but it is capable of +enhancing common vector and matrix operations.</span><br> +Good news is that it is not specifically targeted for the NumPy library and the PyPy +virtual machine. Any interpreter (written in RPython) is able to make use +of the vectorization. For more information about that take a look <a href="https://pypyvecopt.blogspot.co.at/">here</a>, or consult the documentation. For the time being it is not turned on by default, so be sure to enable it by specifying <span>--jit vec=1<span style="font-family: inherit;"> </span></span>before running your program.<br> +<br> +If your language (written in RPython) contains many array/matrix operations, you can easily integrate the optimization by adding the parameter 'vec=1' to the JitDriver.<br> +<br> +<h2> +<span style="font-size: large;">NumPyPy Improvements</span></h2> +<span style="font-size: small;"></span> +<span style="font-size: small;">Let's take a look at the core functions of the NumPyPy library (*). </span><br> +<span style="font-size: small;">The following tests show the speedup of the core functions commonly used in Python code interfacing with NumPy, on CPython with NumPy, on the PyPy 2.6.1 released several weeks ago, and on PyPy 15.11 to be released soon. Timeit was used to test the time needed to run the operation in the plot title on various vector (lower case) and square matrix (upper case) sizes displayed on the X axis. The Y axis shows the speedup compared to CPython 2.7.10. <b>This means that higher is better</b>. 
</span><br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://3.bp.blogspot.com/-aqC2wMdVRaU/ViUZJYlUNoI/AAAAAAAAAXQ/FGa9DfdDZ-4/s1600/matrix-vector.png" style="clear: left; float: left; margin-bottom: 1em; margin-right: 1em;"><img border="0" height="353" src="https://3.bp.blogspot.com/-aqC2wMdVRaU/ViUZJYlUNoI/AAAAAAAAAXQ/FGa9DfdDZ-4/s640/matrix-vector.png" width="640"></a></div> +<br> +<div class="separator" style="clear: both; text-align: center;"> +</div> +<div class="separator" style="clear: both; text-align: center;"> +</div> +<span style="font-size: small;">In comparison to PyPy 2.6.1, the speedup </span><span style="font-size: small;"><span style="font-size: small;">greatly</span> improved. The hardware support really strips down the runtime of the vector and matrix operations. There is another operation we would like to highlight: the dot product.</span><br> +<span style="font-size: small;">It is a very common operation in numerics and PyPy now (given a moderate sized matrix and vector) decreases the time spent in that operation. See for yourself:</span><br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://3.bp.blogspot.com/-TMuz6OUEOXU/ViUZWEng4AI/AAAAAAAAAXY/dZOYp1LO1G0/s1600/dotproduct.png" style="clear: left; float: left; margin-bottom: 1em; margin-right: 1em;"><img border="0" height="353" src="https://3.bp.blogspot.com/-TMuz6OUEOXU/ViUZWEng4AI/AAAAAAAAAXY/dZOYp1LO1G0/s640/dotproduct.png" width="640"></a></div> +<div class="separator" style="clear: both; text-align: center;"> +</div> +These are nice improvements in the NumPyPy library and we got to a competitive level only making use of SSE4.1.<br> +<br> +<h2> +<span style="font-size: large;">Future work   </span></h2> +<br> +<span style="font-size: small;">This is not the end of the road. The GSoC project showed that it is possible to implement this optimization in PyPy. 
There might be other improvements we can make to carry this further:</span><br> +<ul> +<li><span style="font-size: small;">Check alignment at runtime to increase the memory throughput of the CPU</span></li> +<li><span style="font-size: small;">Support the AVX vector extension which (at least) doubles the size of the vector register</span></li> +<li><span style="font-size: small;">Handle each and every corner case in Python traces to enable it  globally</span></li> +<li><span style="font-size: small;">Do not rely only on loading operations to trigger the analysis, there might be cases where combination of floating point values could be done in parallel </span></li> +</ul> +Cheers,<br> +The PyPy Team<br> +<h4> +<span style="font-size: x-small;">(*) The benchmark code can be found <a href="https://bitbucket.org/plan_rich/numpy-benchmark">here</a> it was run using this configuration: i7-2600 CPU @ 3.40GHz (4 cores). </span></h4> +</div>https://www.pypy.org/posts/2015/10/automatic-simd-vectorization-support-in-639063580401330508.htmlTue, 20 Oct 2015 14:38:00 GMT \ No newline at end of file diff --git a/authors/romain-guillebert.html b/authors/romain-guillebert.html new file mode 100644 index 000000000..1c089afa5 --- /dev/null +++ b/authors/romain-guillebert.html @@ -0,0 +1,128 @@ + + + + + +Posts by Romain Guillebert | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/romain-guillebert.xml b/authors/romain-guillebert.xml new file mode 100644 index 000000000..ce3c102f0 --- /dev/null +++ b/authors/romain-guillebert.xml @@ -0,0 +1,96 @@ + +PyPy (Posts by Romain Guillebert)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:10 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssNumPyPy status - January 2015https://www.pypy.org/posts/2015/02/numpypy-status-january-2015-5092986229783279944.htmlRomain Guillebert<p>Hi Everyone<br> +<br> +Here is what has been done in January thanks to the <a href="https://pypy.org/numpydonate.html" target="_blank">funding of NumPyPy</a>, I would like to thank all the donors and tell you that you can still donate :<br> +</p><ul> +<li>I have focused on implementing the object dtype this month, it is now possible to store objects inside ndarrays using the object dtype</li> +<li>It is also possible to add an object ndarray to any other ndarray (implementing other operators is trivial)</li> +</ul> +<div> +The next things I plan on working on next are :</div> +<div> +<ul> +<li>Implementing the missing operations for object arrays</li> +<li>Implementing garbage collection support for object arrays (currently, storing an object inside an ndarray doesn't keep the object alive)</li> +<li>Packaging NumPyPy on PyPI</li> +</ul> +<div> +Cheers</div> +</div> +<div> +Romain</div>https://www.pypy.org/posts/2015/02/numpypy-status-january-2015-5092986229783279944.htmlWed, 11 Feb 2015 14:46:00 GMTNumPy status updatehttps://www.pypy.org/posts/2013/11/numpy-status-update-1609808546418002632.htmlRomain Guillebert<span style="font-family: inherit;">Here is what has been happening with NumPy in PyPy in October thanks to the people who donated to the </span><a href="https://pypy.org/numpydonate.html" style="font-family: inherit;" target="_blank">NumPyPy proposal</a><span style="font-family: 
inherit;">:</span><br> +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">The biggest change is that we shifted to using an <a href="https://bitbucket.org/pypy/numpy" target="_blank">external fork of numpy</a> rather than a minimal numpypy module. The idea is that we will be able to </span>reuse<span style="font-family: inherit;"> most of the upstream pure-python numpy components, replacing the C modules with appropriate RPython micronumpy pieces at the correct places in the module namespace.</span><br> +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">The numpy fork should work just as well as the old numpypy for functionality that existed previously, and also include much new functionality from the pure-python numpy pieces that simply hadn't been imported yet in numpypy. However, this new functionality will not have been "hand picked" to only include pieces that work, so you may run into functionality that relies on unimplemented components (which should fail with user-level exceptions).</span><br> +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">This setup also allows us to run the entire numpy test suite, which will help in directing future compatibility development. The recent PyPy release includes these changes, so download it and let us know how it works! 
And if you want to live on the edge, the nightly includes even more numpy progress made in November.</span><br> +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">To install the fork, download the latest release, and then install numpy eith</span>er separately with a virtualenv: <tt class="docutils literal">pip install git+https://bitbucket.org/pypy/numpy.git</tt>; or directly: <tt class="docutils literal">git clone https://bitbucket.org/pypy/numpy.git; cd numpy; pypy setup.py install</tt>.<br> + +<br><i><b>EDIT:</b> if you install numpy as root, you may need to also import it once as root before it works: <tt class="docutils literal">sudo pypy -c 'import numpy'</tt></i><br> + +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">Along with this change, progress was made in fixing internal micronumpy bugs and increasing compatibility:</span><br> +<ul><li><span style="font-family: inherit;"><span style="font-family: inherit;">Fixed a bug with strings in record dtypes</span></span></li> +<li><span style="font-family: inherit;"><span style="font-family: inherit;">Fixed a bug wh</span><span style="background-color: white; font-family: inherit;">ere the multiplication of an ndarray with a Python int or float resulted in loss of the array's dtype</span></span></li> +<li><span style="font-family: inherit;"><span style="background-color: white; font-family: inherit;">Fixed several segfaults encountered in the numpy test suite (suite should run now without segfaulting)</span></span></li> +</ul><span style="font-family: inherit;"><span style="background-color: white;"><br> +</span></span> <span style="font-family: inherit;"><span style="background-color: white;">We also began working on __array_prepare__ and __array_wrap__, which are necessary pieces for a working matplotlib module.</span></span><br> +<span style="font-family: inherit;"><span style="background-color: white;"><br> +</span></span> Cheers,<br> 
+Romain and Briannumpyhttps://www.pypy.org/posts/2013/11/numpy-status-update-1609808546418002632.htmlFri, 15 Nov 2013 19:30:00 GMTNumpy Status Updatehttps://www.pypy.org/posts/2013/09/numpy-status-update-5160363918470470887.htmlRomain Guillebert<p>Hi everyone<br> +<br> +Thanks to the people who donated money to the <a href="https://pypy.org/numpydonate.html" target="_blank">numpy proposal</a>, here is what I've been working on recently :<br> +<br> +- Fixed conversion from a numpy complex number to a python complex number<br> +- Implement the rint ufunc<br> +- Make numpy.character usable as a dtype<br> +- Fix ndarray(dtype=str).fill()<br> +- Various fixes on boolean and fancy indexing<br> +<br> +Cheers<br> +Romain</p>numpyhttps://www.pypy.org/posts/2013/09/numpy-status-update-5160363918470470887.htmlWed, 25 Sep 2013 17:49:00 GMTNumPyPy Status Updatehttps://www.pypy.org/posts/2013/08/numpypy-status-update-3401163348519734658.htmlRomain Guillebert<p>Hello everyone<br> +<br> +As expected, nditer is a lot of work. 
I'm going to pause my work on it for now and focus on simpler and more important things, here is a list of what I implemented :<br> +</p><ul> +<li>Fixed a bug on 32 bit that made int32(123).dtype == dtype("int32") fail</li> +<li>Fixed a bug on the pickling of array slices</li> +<li>The external loop flag is implemented on the nditer class</li> +<li>The c_index, f_index and multi_index flags are also implemented</li> +<li>Add dtype("double") and dtype("str")</li> +<li>C-style iteration is available for nditer</li> +</ul> +Cheers<br> +Romain Guillebertnumpyhttps://www.pypy.org/posts/2013/08/numpypy-status-update-3401163348519734658.htmlThu, 08 Aug 2013 19:01:00 GMTNumPyPy status updatehttps://www.pypy.org/posts/2013/06/numpypy-status-update-3846626188716521472.htmlRomain Guillebert<p>Hello everyone,<br> +<br> +May was the first month I was paid to work on NumPyPy (thanks to all who donated!), here is what I worked on during this period :<br> +<br> +</p><ul> +<li>It is now possible to use subarrays.</li> +<li>It is now possible to pickle ndarrays (including those using subarrays), dtypes and scalars, the pickling protocol is the same as numpy's.</li> +</ul> +<div> +<br></div> +<div> +For June, I plan to work on the nditer class, it seems that there's enough work for an entire month.</div> +<br> +Cheers<br> +Romain Guillebertnumpyhttps://www.pypy.org/posts/2013/06/numpypy-status-update-3846626188716521472.htmlMon, 03 Jun 2013 14:09:00 GMTNumpy Status Updatehttps://www.pypy.org/posts/2013/05/numpy-status-update-4176018422530420763.htmlRomain Guillebert<p>Hello Everyone,<br> +<br> +I've started to work on NumPyPy since the end of April and here is a short update :<br> +<br> +</p><ul> +<li>I implemented pickling support on ndarrays and dtypes, it will be compatible with numpy's pickling protocol when the "numpypy" module will be renamed to "numpy".</li> +<li>I am now working on subarrays.</li> +</ul> +<div> +<br></div> +<div> +I would also like to thank everyone who 
donated and allowed me to work on this.</div> +<div> +<br></div> +<div> +Cheers,</div> +<div> +Romain Guillebert</div>numpyhttps://www.pypy.org/posts/2013/05/numpy-status-update-4176018422530420763.htmlSat, 11 May 2013 17:19:00 GMT \ No newline at end of file diff --git a/authors/samuele-pedroni.html b/authors/samuele-pedroni.html new file mode 100644 index 000000000..12f6c2dc1 --- /dev/null +++ b/authors/samuele-pedroni.html @@ -0,0 +1,113 @@ + + + + + +Posts by Samuele Pedroni | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/samuele-pedroni.xml b/authors/samuele-pedroni.xml new file mode 100644 index 000000000..1ab4fa04d --- /dev/null +++ b/authors/samuele-pedroni.xml @@ -0,0 +1,23 @@ + +PyPy (Posts by Samuele Pedroni)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPlanning a next release of PyPyhttps://www.pypy.org/posts/2009/12/planning-next-release-of-pypy-4193252449406707091.htmlSamuele Pedroni<p>The PyPy core team is planning to make a new release before the next Pycon US.</p> +<p>The main target of the 1.2 release is packaging the good results +we have achieved applying our current JIT compiler generator to our +Python interpreter. Some of that progress <a class="reference external" href="https://www.pypy.org/posts/2009/10/gc-improvements-6174120095428192954.html">has been</a> <a class="reference external" href="https://www.pypy.org/posts/2009/11/dusseldorf-sprint-report-2505348213879053352.html">chronicled in</a> +<a class="reference external" href="https://www.pypy.org/posts/2009/11/some-benchmarking-9211261260383281459.html">recent posts</a> on the status blog. By releasing them in a +relatively stable prototype we want to encourage people to try them with their +own code and to gather feedback in this way. By construction the JIT compiler +should support all Python features, what may vary are the speedups +achieved (in some cases the JIT may produce worse results than the PyPy +interpreter which we would like to know) and the extra memory required +by it.</p> +<p>For the 1.2 release we will focus on the JIT stability first, less on +improving non-strictly JIT areas. The JIT should be good at many things +as shown by previous blog postings. We want the JIT compiler in the +release to work well on <strong>Intel 32 bits</strong> on Linux, with Mac OS X and +Windows being secondary targets. 
Which compilation targets work will +depend a bit on contributions.</p> +<p>In order to finalize the release we intend to have a concentrated +effort ("virtual sprint") from the 22nd to the 29th of +January. Coordination will happen as usual through the #pypy irc +channel on freenode. Samuele Pedroni will take the role of release +manager as he already did in the past.</p>releasehttps://www.pypy.org/posts/2009/12/planning-next-release-of-pypy-4193252449406707091.htmlThu, 17 Dec 2009 14:55:00 GMT \ No newline at end of file diff --git a/authors/the-pypy-team.html b/authors/the-pypy-team.html new file mode 100644 index 000000000..9514570c0 --- /dev/null +++ b/authors/the-pypy-team.html @@ -0,0 +1,140 @@ + + + + + +Posts by The PyPy Team | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+

Posts by The PyPy Team

+ +
+
+ + \ No newline at end of file diff --git a/authors/the-pypy-team.xml b/authors/the-pypy-team.xml new file mode 100644 index 000000000..378bc8f10 --- /dev/null +++ b/authors/the-pypy-team.xml @@ -0,0 +1,797 @@ + +PyPy (Posts by The PyPy Team)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssGuest Post: How PortaOne uses PyPy for high-performance processing, connecting over 1B of phone calls every monthhttps://www.pypy.org/posts/2024/08/portaone.htmlThe PyPy Team<p>The PyPy project is always happy to hear about industrial use and deployments +of PyPy. For the <a href="https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.html">GC bug +finding</a> +task earlier this year, we collaborated with PortaOne and we're super happy +that Serhii Titov, head of the QA department at PortaOne, was up to writing +this guest post to describe their use and experience with the project.</p> +<hr> +<h3 id="what-does-portaone-do">What does PortaOne do?</h3> +<p>We at <a href="https://www.portaone.com/">PortaOne Inc.</a> allow telecom operators to +launch new services (or provide existing services more efficiently) using our +VoIP platform (PortaSIP) and our real-time charging system (PortaBilling), +which provides additional features for cloud PBX, such as call transfer, +queues, interactive voice response (IVR) and more. At this moment our support +team manages several thousand servers with our software installed in 100 +countries, through which over 500 telecommunication service providers connect +millions of end users every day. The unique thing about PortaOne is that we +supply the source code of our product to our customers - something unheard of +in the telecom world! 
Thus we attract "telco innovators", who use our APIs to +build around the system and the source code to create unique tweaks of +functionality, which produces amazing products.</p> +<p>At the core of PortaSIP is the middle-ware component (the proper name for it is +"B2BUA", but that probably does not say much to anyone outside of experts in +VoIP), which implements the actual handling of SIP calls, messages, etc. and +all added features (for instance, trying to send a call via telco operators +through which the cost per minute is lower). It has to be fast (since even a +small delay in establishing a call is noticed by a customer), reliable +(everyone hates when a call drops or cannot be completed) and yet easily +expandable with new functionality. This is why we decided to use Python as +opposed to C/C++ or similar programming languages, which are often used in +telecom equipment.</p> +<p>The B2BUA component is a batch of similar Python processes that are looped +inside a +<a href="https://docs.python.org/3.10/library/asyncore.html"><code>asyncore.dispatcher</code></a> +wrapper. The load balancing between these Python processes is done by our +stateless SIP proxy server written in C++. All our sockets are served by this +B2BUA. We have our custom client-wrappers around <code>pymysql</code>, <code>redis</code>, +<code>cassandra-driver</code> and <code>requests</code> to communicate with external services. Some +of the Python processes use <a href="https://cffi.readthedocs.io/en/stable/"><code>cffi</code></a> +wrappers around C-code to improve their performance (examples: an Oracle DB +driver, a client to a radius server, a custom C logger).</p> +<p>The I/O operations that block the main thread of the Python processes are +processed in sub-threads. We have custom wrappers around <code>threading.Thread</code> +and also <code>asyncore.dispatcher</code>. 
The results of such operations are returned to +the main thread.</p> +<h3 id="improving-our-performance-with-pypy">Improving our performance with PyPy</h3> +<p>We started with CPython and then in 2014 switched to PyPy because it was +faster. Here's an exact quote from our first testing notes: "PyPy gives +significant performance boost, ~50%". Nowadays, after years of changes in all +the software involved, PyPy still gives us +50% boost compared to CPython.</p> +<p>Taking care of real time traffic for so many people around the globe is +something we're really proud of. I hope the PyPy team can be proud of it as +well, as the PyPy product is a part of this solution.</p> +<h3 id="finding-a-garbage-collector-bug-stage-1-the-gc-hooks">Finding a garbage collector bug: stage 1, the GC hooks</h3> +<p>However our path with PyPy wasn't perfectly smooth. There were very rare cases +of crashes on PyPy that we weren't able to catch. That's because to make +coredump useful we needed to switch to PyPy with debug, but we cannot let it +run in that mode on a production system for an extended period of time, and we +did not have any STR (steps-to-reproduce) to make PyPy crash again in our lab. +That's why we kept (and still keep) both interpreters installed just in case, +and we would switch to CPython if we noticed it happening.</p> +<p>At the time of updating PyPy from 3.5 to 3.6 our QA started noticing those +crashes more often, but we still had no luck with STR or collecting proper +coredumps with debug symbols. Then it became even worse after our development +played with the <a href="https://doc.pypy.org/en/latest/gc_info.html">Garbage Collector's +options</a> to increase performance +of our middleware component. The crashes started to affect our regular +performance testing (controlled by QA manager Yevhenii Bovda). 
At that point it +was decided that we can no longer live like that and so we started an intense +investigation.</p> +<p>During the first stage of our investigation (following the best practice of +troubleshooting) we narrowed down the issue as much as we could. So, it was not +our code, it was definitely somewhere in PyPy. Eventually our SIP software +engineer <a href="https://github.com/Yevhenii-Yatchenko">Yevhenii Yatchenko</a> found out +that this bug is connected with the use of our <a href="https://doc.pypy.org/en/latest/gc_info.html#gc-hooks">custom hooks in the +GC</a>. Yevhenii created +ticket <a href="https://github.com/pypy/pypy/issues/4899">#4899</a> and within 2-3 days we +got a fix from a <a href="https://github.com/cfbolz">member of the PyPy team</a>, in true open-source fashion.</p> +<h3 id="finding-a-garbage-collector-bug-stage-2-the-real-bug">Finding a garbage collector bug: stage 2, the real bug</h3> +<p>Then came stage 2. In parallel with the previous ticket, Yevhenii created +<a href="https://github.com/pypy/pypy/issues/4900">#4900</a> that we still see failing +with coredumps quite often, and they are not connected to GC custom hooks. In a +nutshell, it took us dozens of back and forward emails, three Zoom sessions and +four versions of a patch to solve the issue. During the last iteration we got a +new set of options to try and a new version of the patch. Surprisingly, that +helped! What a relief! So, the next logical step was to remove all debug +options and run PyPy only with the patch. Unfortunately, it started to fail +again and we came to the obvious conclusion that what will help us is not a +patch, but one of options we were testing out. At that point we found out that +<a href="https://doc.pypy.org/en/latest/gc_info.html#environment-variables"><code>PYPY_GC_MAX_PINNED=0</code></a> +is a necessary and sufficient condition to solve our issue. 
This points to +another bug in the garbage collector, somehow related to object pinning.</p> +<p>Here's our current state: we have to add <code>PYPY_GC_MAX_PINNED=0</code>, but we do not +face the crashes anymore.</p> +<h3 id="conclusion-and-next-steps">Conclusion and next steps</h3> +<p>Gratitude is extended to Carl for his invaluable assistance in resolving the +nasty bugs, because it seems we're the only ones who suffered from the last +one and we really did not want to fall back to CPython due to its performance +disadvantage.</p> +<p>Serhii Titov, head of the QA department at PortaOne Inc.</p> +<p>P.S. If you are a perfectionist and at this point you have mixed feelings and +you are still bothered by the question "But there might still be a bug in the +GC, what about that?" - Carl has some ideas about it and he will sort it out +(we will help with the testing/verification part).</p>casestudyguestposthttps://www.pypy.org/posts/2024/08/portaone.htmlThu, 29 Aug 2024 09:00:00 GMTPyPy v7.3.11 releasehttps://www.pypy.org/posts/2022/12/pypy-v7311-release.htmlThe PyPy Team<section id="pypy-v7-3-11-release-of-python-2-7-3-8-and-3-9"> +<h2>PyPy v7.3.11: release of python 2.7, 3.8, and 3.9</h2> +<p>The PyPy team is proud to release version 7.3.11 of PyPy. As could be expected, +the first release of macOS arm64 impacted the macOS x86-64 build, so this is +a bug release to restore the ability of macOS users to run PyPy on +<code class="docutils literal">macOS &lt; 11.0</code>. It also incorporates the latest CPython stdlib updates +released the day after 7.3.10 went out, and a few more bug fixes. 
The release +includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.16. Note we intend to drop +support for this version in an upcoming release as soon as we release +Python 3.10.</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.16.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases and follows quickly on the heels of the 7.3.10 release on Dec 6.</p> +<p>We recommend updating. You can find links to download the v7.3.11 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. 
PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2022/12/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2022/12/project-ideas.html">help</a> with making +RPython's JIT even better. Since the previous release, we have accepted +contributions from one new contributor, thanks for pitching in, and welcome +to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. +In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and +3.9. 
It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.11 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.11.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2022/12/pypy-v7311-release.htmlThu, 29 Dec 2022 13:22:08 GMTPyPy v7.3.10 releasehttps://www.pypy.org/posts/2022/12/pypy-v7310-release.htmlThe PyPy Team<section id="pypy-v7-3-10-release-of-python-2-7-3-8-and-3-9"> +<h2>PyPy v7.3.10: release of python 2.7, 3.8, and 3.9</h2> +<p>The PyPy team is proud to release version 7.3.10 of PyPy. We have some nice +speedups and bugfixes we wish to share. 
The release includes three different +interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.15.</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.15. We have gained +confidence in the stability of this version, and are removing the "beta" +label.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.9 in March 2022 +include:</p> +<blockquote> +<ul class="simple"> +<li><p>A release of Apple Silicon M1 arm64 versions. This work <a class="reference external" href="https://www.pypy.org/posts/2022/07/m1-support-for-pypy.html">was sponsored</a> by +an anonymous donor and is tested on our buildbots.</p></li> +<li><p>Many improvements to the basic interpreter to make it 15-20% faster</p></li> +<li><p>The conda-forge community <a class="reference external" href="https://www.pypy.org/posts/2022/11/pypy-and-conda-forge.html">has built</a> over 1000 packages for PyPy3.8 and 3.9, +making it easier than ever to use PyPy.</p></li> +<li><p>Update the packaged OpenSSL to 1.1.1s, sqlite3 to 3.39.4, and apply +applicable security fixes from CPython 3.9.15 to PyPy2.7</p></li> +<li><p>Update the <a class="reference external" href="https://hpyproject.org/">HPy</a> backend in PyPy3.8 and PyPy3.9 to 0.0.4</p></li> +</ul> +</blockquote> +<p>We recommend updating. 
You can find links to download the v7.3.10 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2022/12/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2022/12/project-ideas.html">help</a> with making +RPython's JIT even better. Since the previous release, we have accepted +contributions from five new contributors, thanks for pitching in, and welcome +to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
+In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and +3.9. It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.10 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.10.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2022/12/pypy-v7310-release.htmlTue, 06 Dec 2022 13:22:08 GMTM1 support for PyPyhttps://www.pypy.org/posts/2022/07/m1-support-for-pypy.htmlThe PyPy Team<p>The PyPy team is happy to announce that we can now target the macOS ARM64 +platform. Much of the work was executed by Maciej Fijałkowski (fijal) and +funded via a generous contribution to our <a class="reference external" href="https://opencollective.com/pypy">OpenCollective</a>. The work is based +on our existing <a class="reference external" href="https://www.pypy.org/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.html">support for aarch64</a> (arm64 on linux) with some twists +to support the differences between the CPUs and the operating system. There +are nightly builds for <a class="reference external" href="https://buildbot.pypy.org/nightly/py3.8/">pypy3.8</a> and <a class="reference external" href="https://buildbot.pypy.org/nightly/py3.9/">pypy3.9</a> (look for <code class="docutils literal">macos_arm64</code>), and +the architecture will be part of our next release.</p> +<p>Please try it out and let us know how it is useful for you or how we could +improve.</p> +<p>We still need help improving our macOS support. We have an <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3697">open issue</a> to +help our packaging story. 
Help is welcome.</p> +<p>The PyPy team.</p>https://www.pypy.org/posts/2022/07/m1-support-for-pypy.htmlThu, 21 Jul 2022 18:27:14 GMTPyPy v7.3.9 security releasehttps://www.pypy.org/posts/2022/03/pypy-v738-release.htmlThe PyPy Team<section id="pypy-v7-3-9-security-release"> +<h2>PyPy v7.3.9 security release</h2> +<p>The PyPy team is proud to release version 7.3.9 of PyPy. This is a security +release to match the recent <a class="reference external" href="https://discuss.python.org/t/py-day-is-coming-a-joint-security-release-spree-for-python-3-7-3-8-3-9-and-3-10-on-march-14th">CPython release</a> and updates the portable pypy +tarballs with <code class="docutils literal">bzip2 1.0.8</code>, <code class="docutils literal">openssl1.1.1n</code>, and <code class="docutils literal">libexpat 2.4.7</code>. Along +the way this release fixes some issues discovered after the 7.3.8 release and +updates <code class="docutils literal">sqlite3</code> to 3.38.2. It includes:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.13. This will be the last +release of PyPy3.7.</p></li> +<li><p>PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.13.</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.12. We relate to this as +"beta" quality. We welcome testing of this version, if you discover +incompatibilities, please report them so we can gain confidence in the version.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. 
This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.8 in February 2022, +include:</p> +<blockquote> +<ul class="simple"> +<li><p>Fixed some failing stdlib tests on PyPy3.9</p></li> +<li><p>Update the bundled libexpat to 2.4.6 and sqlite3 to 3.38.2</p></li> +</ul> +</blockquote> +<p>We recommend updating. You can find links to download the v7.3.9 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: <a class="reference external" href="https://www.pypy.org/posts/2022/03/index.html">PyPy</a> +and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, tweaking popular modules to run +on PyPy, or general <a class="reference external" href="https://www.pypy.org/posts/2022/03/project-ideas.html">help</a> with making RPython's JIT even better. 
Since the +7.3.7 release, we have accepted contributions from 6 new contributors, +thanks for pitching in, and welcome to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. +In any case both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and +3.9. It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>This PyPy release supports:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux. A shoutout to Huawei for sponsoring +the VM running the tests.</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +<li><p>big- and little-endian variants of <strong>PPC64</strong> running Linux,</p></li> +</ul> +</blockquote> +<p>PyPy support Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but +does not release binaries. 
Please reach out to us if you wish to sponsor +releases for those platforms.</p> +</section> +<section id="known-issues-with-pypy3-9"> +<h3>Known Issues with PyPy3.9</h3> +<ul class="simple"> +<li><p>We slightly modified the concurrent future's <code class="docutils literal">ProcessPoolExecutor</code> to +start all the worker processes when the first task is received (like on +Python3.8) to avoid an apparent race condition when using <code class="docutils literal">fork</code> and +threads (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3650">3650</a>).</p></li> +</ul> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.9 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.9.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make PyPy better.</p> +<p>Cheers, +The PyPy team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2022/03/pypy-v738-release.htmlWed, 30 Mar 2022 05:53:45 GMTPyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9https://www.pypy.org/posts/2022/02/pypy-v738-release.htmlThe PyPy Team<section id="pypy-v7-3-8-release-of-python-2-7-3-7-3-8-and-3-9-beta"> +<h2>PyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9-beta</h2> +<p>The PyPy team is proud to release version 7.3.8 of PyPy. It has been only a few +months since our last release, but we have some nice speedups and bugfixes we +wish to share. The release includes four different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.12. 
This will be the last +release of PyPy3.7.</p></li> +<li><p>PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.12. This is our third +release of this interpreter, and we are removing the "beta" tag.</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.10. As this is our first +release of this interpreter, we relate to this as "beta" quality. We +welcome testing of this version, if you discover incompatibilities, please +report them so we can gain confidence in the version.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.7 in late October 2021, +include:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy3.9 uses an RPython version of the PEG parser which brought with it a +cleanup of the lexer and parser in general</p></li> +<li><p>Fixed a regression in PyPy3.8 when JITting empty list comprehensions</p></li> +<li><p>Tweaked some issues around changing the file layout after packaging to make +the on-disk layout of PyPy3.8 more compatible with CPython. This requires +<code class="docutils literal"><span class="pre">setuptools&gt;=58.1.0</span></code></p></li> +<li><p>RPython now allows the target executable to have a <code class="docutils literal">.</code> in its name, so +PyPy3.9 will produce a <code class="docutils literal"><span class="pre">pypy3.9-c</span></code> and <code class="docutils literal"><span class="pre">libpypy3.9-c.so</span></code>. 
Changing the +name of the shared object to be version-specific (it used to be +<code class="docutils literal"><span class="pre">libpypy3-c.so</span></code>) will allow it to live alongside other versions.</p></li> +<li><p>Building PyPy3.9+ accepts a <code class="docutils literal"><span class="pre">--platlibdir</span></code> argument like CPython.</p></li> +<li><p>Improvement in ssl's use of CFFI buffers to speed up <code class="docutils literal">recv</code> and <code class="docutils literal">recvinto</code></p></li> +<li><p>Update the packaged OpenSSL to 1.1.1m</p></li> +</ul> +</blockquote> +<p>We recommend updating. You can find links to download the v7.3.8 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: <a class="reference external" href="https://www.pypy.org/posts/2022/02/index.html">PyPy</a> +and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, tweaking popular modules to run +on PyPy, or general <a class="reference external" href="https://www.pypy.org/posts/2022/02/project-ideas.html">help</a> with making RPython's JIT even better. 
Since the +previous release, we have accepted contributions from 6 new contributors, +thanks for pitching in, and welcome to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. +In any case both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and +3.9. It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>This PyPy release supports:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux. A shoutout to Huawei for sponsoring +the VM running the tests.</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +<li><p>big- and little-endian variants of <strong>PPC64</strong> running Linux,</p></li> +</ul> +</blockquote> +<p>PyPy support Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but +does not release binaries. 
Please reach out to us if you wish to sponsor +releases for those platforms.</p> +</section> +<section id="known-issues-with-pypy3-9"> +<h3>Known Issues with PyPy3.9</h3> +<ul class="simple"> +<li><p>There is still a known <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3649">speed regression</a> around <code class="docutils literal">**kwargs</code> handling</p></li> +<li><p>We slightly modified the concurrent future's <code class="docutils literal">ProcessPoolExecutor</code> to +start all the worker processes when the first task is received (like on +Python3.8) to avoid an apparent race condition when using <code class="docutils literal">fork</code> and +threads (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3650">3650</a>).</p></li> +</ul> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.8 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.8.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make PyPy better.</p> +<p>Cheers, +The PyPy team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2022/02/pypy-v738-release.htmlSun, 20 Feb 2022 05:53:45 GMTPyPy v7.3.7: bugfix release of python 3.7 and 3.8https://www.pypy.org/posts/2021/10/pypy-v737-release.htmlThe PyPy Team<section id="pypy-v7-3-7-bug-fix-release-of-3-7-3-8"> +<h2>PyPy v7.3.7: bug-fix release of 3.7, 3.8</h2> +<p>We are releasing a PyPy 7.3.7 to fix the recent 7.3.6 release's binary +incompatibility with the previous 7.3.x releases. We mistakenly added fields +to <code class="docutils literal">PyFrameObject</code> and <code class="docutils literal">PyDateTime_CAPI</code> that broke the promise of binary +compatibility, which means that c-extension wheels compiled for 7.3.5 will not +work with 7.3.6 and vice versa. 
Please do not use 7.3.6.</p> +<p>We have added a cursory test for binary API breakage to the +<a class="reference external" href="https://github.com/pypy/binary-testing">https://github.com/pypy/binary-testing</a> repo which hopefully will prevent such +mistakes in the future.</p> +<p>Additionally, a few smaller bugs were fixed:</p> +<ul class="simple"> +<li><p>Use <code class="docutils literal">uint</code> for the <code class="docutils literal">request</code> argument of <code class="docutils literal">fcntl.ioctl</code> (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3568">3568</a>)</p></li> +<li><p>Fix incorrect tracing of <code class="docutils literal">while True</code> body in 3.8 (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3577">3577</a>)</p></li> +<li><p>Properly close resources when using a <code class="docutils literal">concurrent.futures.ProcessPool</code> +(issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3317">3317</a>)</p></li> +<li><p>Fix the value of <code class="docutils literal">LIBDIR</code> in <code class="docutils literal">_sysconfigdata</code> in 3.8 (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3582">3582</a>)</p></li> +</ul> +<p>You can find links to download the v7.3.7 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. 
If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog site</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: <a class="reference external" href="https://www.pypy.org/posts/2021/10/index.html">PyPy</a> +and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, tweaking popular modules to run +on PyPy, or general <a class="reference external" href="https://www.pypy.org/posts/2021/10/project-ideas.html">help</a> with making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant on PyPy. +In any case both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +3.8. 
It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>This PyPy release supports:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux.</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy does support ARM 32 bit and PPC64 processors, but does not release binaries.</p> +</section> +</section>releasehttps://www.pypy.org/posts/2021/10/pypy-v737-release.htmlMon, 25 Oct 2021 05:53:45 GMTPyPy v7.3.6: release of python 2.7, 3.7, and 3.8https://www.pypy.org/posts/2021/10/pypy-v736-release.htmlThe PyPy Team<section id="pypy-v7-3-6-release-of-python-2-7-3-7-and-3-8-beta"> +<h2>PyPy v7.3.6: release of python 2.7, 3.7, and 3.8-beta</h2> +<p>The PyPy team is proud to release version 7.3.6 of PyPy, which includes +three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.12.</p></li> +<li><p>PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.12. Since this is our +first release of the interpreter, we relate to this as "beta" quality. 
We +welcome testing of this version, if you discover incompatibilities, please +report them so we can gain confidence in the version.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.5 in May 2021, +include:</p> +<blockquote> +<ul class="simple"> +<li><p>We have merged a backend for <a class="reference external" href="https://hpyproject.org/">HPy</a>, the better C-API interface. The backend +implements HPy version 0.0.3.</p></li> +<li><p>Translation of PyPy into a binary, known to be slow, is now about 40% +faster. On a modern machine, PyPy3.8 can translate in about 20 minutes.</p></li> +<li><p>PyPy Windows 64 is now available on <a class="reference external" href="https://conda-forge.org/blog//2020/03/10/pypy">conda-forge</a>, along with nearly 700 +commonly used binary packages. This new offering joins the more than 1000 +conda packages for PyPy on Linux and macOS. Many thanks to the conda-forge +maintainers for pushing this forward over the past 18 months.</p></li> +<li><p>Speed improvements were made to <code class="docutils literal">io</code>, <code class="docutils literal">sum</code>, <code class="docutils literal">_ssl</code> and more. These +were done in response to user feedback.</p></li> +<li><p>The 3.8 version of the release contains a beta-quality improvement to the +JIT to better support <a class="reference external" href="https://www.pypy.org/posts/2021/09/jit-auto-generated-code.html">compiling huge Python functions</a> by breaking them +up into smaller pieces.</p></li> +<li><p>The release of Python3.8 required a concerted effort. 
We were greatly +helped by @isidentical (Batuhan Taskaya) and other new contributors.</p></li> +<li><p>The 3.8 package now uses the same layout as CPython, and many of the +PyPy-specific changes to <code class="docutils literal">sysconfig</code>, <code class="docutils literal">distutils.sysconfig</code>, and +<code class="docutils literal">distutils.commands.install.py</code> have been removed. The <code class="docutils literal">stdlib</code> now +is located in <code class="docutils literal"><span class="pre">&lt;base&gt;/lib/pypy3.8</span></code> on <code class="docutils literal">posix</code> systems, and in +<code class="docutils literal"><span class="pre">&lt;base&gt;/Lib</span></code> on Windows. The include files on windows remain the same. +On <code class="docutils literal">posix</code> they are in <code class="docutils literal"><span class="pre">&lt;base&gt;/include/pypy3.8</span></code>. Note we still use the +<code class="docutils literal">pypy</code> prefix to prevent mixing the files with CPython (which uses +<code class="docutils literal">python</code>).</p></li> +</ul> +</blockquote> +<p>We recommend updating. You can find links to download the v7.3.6 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. 
PyPy has many layers and we need help with all of them: <a class="reference external" href="https://pypy.org">PyPy</a> +and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, tweaking popular modules to run +on PyPy, or general <a class="reference external" href="https://doc.pypy.org/en/latest/project-ideas.html">help</a> with making RPython's JIT even better. Since the +previous release, we have accepted contributions from 7 new contributors, +thanks for pitching in, and welcome to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant on PyPy. +In any case both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +soon 3.8. 
It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>This PyPy release supports:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)</p></li> +<li><p>big- and little-endian variants of <strong>PPC64</strong> running Linux,</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux.</p></li> +</ul> +</blockquote> +<p>PyPy does support Windows 32-bit and ARM 32 bit processors, but does not +release binaries. Please reach out to us if you wish to sponsor releases for +those platforms.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.6 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.6.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make PyPy better.</p> +<p>Cheers, +The PyPy team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2021/10/pypy-v736-release.htmlSun, 17 Oct 2021 05:53:45 GMTPyPy v7.3.5: bugfix release of python 2.7 and 3.7https://www.pypy.org/posts/2021/05/pypy-v735-release.htmlThe PyPy Team<section id="pypy-v7-3-5-release-of-2-7-and-3-7"> +<h2>PyPy v7.3.5: release of 2.7 and 3.7</h2> +<p>We are releasing a PyPy 7.3.5 with bugfixes for PyPy 7.3.4, released April 4. +PyPy 7.3.4 was the first release that runs on windows 64-bit, so that support +is still "beta". 
We are releasing it in the hopes that we can garner momentum +for its continued support, but are already aware of some problems, for instance +it errors in the NumPy test suite (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3462">3462</a>). Please help out with testing +the release and reporting successes and failures, financially supporting our +ongoing work, and helping us find the source of these problems.</p> +<ul class="simple"> +<li><p>The new windows 64-bit builds improperly named c-extension modules +with the same extension as the 32-bit build (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3443">3443</a>)</p></li> +<li><p>Use the windows-specific <code class="docutils literal">PC/pyconfig.h</code> rather than the posix one</p></li> +<li><p>Fix the return type for <code class="docutils literal">_Py_HashDouble</code> which impacts 64-bit windows</p></li> +<li><p>A change to the python 3.7 <code class="docutils literal"><span class="pre">sysconfig.get_config_var('LIBDIR')</span></code> was wrong, +leading to problems finding <cite>libpypy3-c.so</cite> for embedded PyPy (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3442">3442</a>).</p></li> +<li><p>Instantiate <code class="docutils literal">distutils.command.install</code> schema for PyPy-specific +<code class="docutils literal">implementation_lower</code></p></li> +<li><p>Delay thread-checking logic in greenlets until the thread is actually started +(continuation of issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3441">3441</a>)</p></li> +<li><p>Four upstream (CPython) security patches were applied:</p> +<ul> +<li><p><a class="reference external" href="https://bugs.python.org/issue42988">BPO 42988</a> to remove <code class="docutils literal">pydoc.getfile</code></p></li> +<li><p><a class="reference external" 
href="https://bugs.python.org/issue43285">BPO 43285</a> to not trust the <code class="docutils literal">PASV</code> response in <code class="docutils literal">ftplib</code>.</p></li> +<li><p><a class="reference external" href="https://bugs.python.org/issue43075">BPO 43075</a> to remove a possible ReDoS in <code class="docutils literal">urllib</code> <code class="docutils literal">AbstractBasicAuthHandler</code></p></li> +<li><p><a class="reference external" href="https://bugs.python.org/issue43882">BPO 43882</a> to sanitize urls containing ASCII newline and tabs in +<code class="docutils literal">urllib.parse</code></p></li> +</ul> +</li> +<li><p>Fix for json-specialized dicts (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3460">3460</a>)</p></li> +<li><p>Specialize <code class="docutils literal">ByteBuffer.setslice</code> which speeds up binary file reading by a +factor of 3</p></li> +<li><p>When assigning the full slice of a list, evaluate the rhs before clearing the +list (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3440">3440</a>)</p></li> +<li><p>On Python2, <code class="docutils literal">PyUnicode_Contains</code> accepts bytes as well as unicode.</p></li> +<li><p>Finish fixing <code class="docutils literal">_sqlite3</code> - untested <code class="docutils literal">_reset()</code> was missing an argument +(issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3432">3432</a>)</p></li> +<li><p>Update the packaged sqlite3 to 3.35.5 on windows. While not a bugfix, this +seems like an easy win.</p></li> +</ul> +<p>We recommend updating. 
These fixes are the direct result of end-user bug +reports, so please continue reporting issues as they crop up.</p> +<p>You can find links to download the v7.3.5 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">renovated blog site</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: <a class="reference external" href="https://www.pypy.org/posts/2021/05/index.html">PyPy</a> +and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, tweaking popular modules to run +on PyPy, or general <a class="reference external" href="https://www.pypy.org/posts/2021/05/project-ideas.html">help</a> with making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant on PyPy. 
+In any case both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +soon 3.8. It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>This PyPy release supports:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32/64 bits, OpenBSD, FreeBSD)</p></li> +<li><p>big- and little-endian variants of <strong>PPC64</strong> running Linux,</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux.</p></li> +</ul> +</blockquote> +<p>PyPy does support ARM 32 bit processors, but does not release binaries.</p> +</section> +</section>releasehttps://www.pypy.org/posts/2021/05/pypy-v735-release.htmlSun, 23 May 2021 05:53:45 GMTPyPy v7.3.4: release of python 2.7 and 3.7https://www.pypy.org/posts/2021/04/pypy-v734-release-of-python-27-and-37.htmlThe PyPy Team<section id="pypy-v7-3-4-release-of-python-2-7-and-3-7"> +<h2>PyPy v7.3.4: release of python 2.7 and 3.7</h2> +<p>The PyPy team is proud to release the version 7.3.4 of PyPy, which includes +two different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils 
literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.10. We no longer refer to +this as beta-quality as the last incompatibilities with CPython (in the +<code class="docutils literal">re</code> module) have been fixed.</p></li> +</ul> +</blockquote> +<p>We are no longer releasing a Python3.6 version, as we focus on updating to +Python 3.8. We have begun streaming the advances towards this goal on Saturday +evenings European time on <a class="reference external" href="https://www.twitch.tv/pypyproject">https://www.twitch.tv/pypyproject</a>. If Python3.6 is +important to you, please reach out as we could offer sponsored longer term +support.</p> +<p>The two interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release include binary <strong>Windows 64</strong> support, +faster numerical instance fields, and a preliminary HPy backend.</p> +<p>A new contributor (Ondrej Baranovič - thanks!) took us up on the challenge to get +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/2073#note_141389">windows 64-bit</a> support. The work has been merged and for the first time we +are releasing a 64-bit Windows binary package.</p> +<p>The release contains the biggest change to <a class="reference external" href="https://www.pypy.org/posts/2010/11/efficiently-implementing-python-objects-3838329944323946932.html">PyPy's implementation of the +instances of user-defined classes</a> in many years. The optimization was +motivated by the report of performance problems running a <a class="reference external" href="https://github.com/paugier/nbabel">numerical particle +emulation</a>. 
We implemented an optimization that stores <code class="docutils literal">int</code> and <code class="docutils literal">float</code> +instance fields in an unboxed way, as long as these fields are type-stable +(meaning that the same field always stores the same type, using the principle +of <a class="reference external" href="https://www.csl.cornell.edu/~cbatten/pdfs/cheng-type-freezing-cgo2020.pdf">type freezing</a>). This gives significant performance improvements on +numerical pure-Python code, and other code where instances store many integers +or floating point numbers.</p> +<p>There were also a number of optimizations for methods around strings and bytes, +following user reported performance problems. If you are unhappy with PyPy's +performance on some code of yours, please report <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/">an issue</a>!</p> +<p>A major new feature is preliminary support for the Universal mode of HPy: a +new way of writing c-extension modules to totally encapsulate <code class="docutils literal">PyObject*</code>. +The goal, as laid out in the <a class="reference external" href="https://hpy.readthedocs.io/en/latest/">HPy documentation</a> and recent <a class="reference external" href="https://hpyproject.org/blog/posts/2021/03/hello-hpy/">HPy blog post</a>, +is to enable a migration path +for c-extension authors who wish their code to be performant on alternative +interpreters like <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a> (written on top of the Java virtual machine), +<a class="reference external" href="https://github.com/RustPython/RustPython">RustPython</a>, and PyPy. 
Thanks to Oracle and IBM for sponsoring work on HPy.</p> +<p>Support for the <a class="reference external" href="https://vmprof.readthedocs.io/en/latest/">vmprof</a> statistical profiler has been extended to ARM64 via a +built-in backend.</p> +<p>Several issues exposed in the 7.3.3 release were fixed. Many of them came from the +great work ongoing to ship PyPy-compatible binary packages in <a class="reference external" href="https://conda-forge.org/blog//2020/03/10/pypy">conda-forge</a>. +A big shout out to them for taking this on.</p> +<p>Development of PyPy takes place on <a class="reference external" href="https://foss.heptapod.net/pypy/pypy">https://foss.heptapod.net/pypy/pypy</a>. +We have seen an increase in the number of drive-by contributors who are able to +use gitlab + mercurial to create merge requests.</p> +<p>The <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> backend has been updated to version 1.14.5 and the <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> backend +to 1.14.2. We recommend using CFFI rather than C-extensions to interact with C, +and using cppyy for performant wrapping of C++ code for Python.</p> +<p>As always, we strongly recommend updating to the latest versions. Many fixes +are the direct result of end-user bug reports, so please continue reporting +issues as they crop up.</p> +<p>You can find links to download the v7.3.4 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. 
If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">renovated blog site</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: <a class="reference external" href="https://www.pypy.org/posts/2021/04/index.html">PyPy</a> +and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, tweaking popular modules to run +on PyPy, or general <a class="reference external" href="https://www.pypy.org/posts/2021/04/project-ideas.html">help</a> with making RPython's JIT even better. Since the +previous release, we have accepted contributions from 10 new contributors, +thanks for pitching in, and welcome to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a cffi / cppyy version of your library that would be performant on PyPy. +In any case both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +soon 3.8. 
It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>This PyPy release supports:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32/64 bits, OpenBSD, FreeBSD)</p></li> +<li><p>big- and little-endian variants of <strong>PPC64</strong> running Linux,</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux.</p></li> +</ul> +</blockquote> +<p>PyPy does support ARM 32 bit processors, but does not release binaries.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.4 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.4.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make PyPy better.</p> +<p>Cheers, +The PyPy team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2021/04/pypy-v734-release-of-python-27-and-37.htmlThu, 08 Apr 2021 05:53:45 GMT \ No newline at end of file diff --git a/authors/unknown.html b/authors/unknown.html new file mode 100644 index 000000000..1d69614d0 --- /dev/null +++ b/authors/unknown.html @@ -0,0 +1,119 @@ + + + + + +Posts by Unknown | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/unknown.xml b/authors/unknown.xml new file mode 100644 index 000000000..d95265395 --- /dev/null +++ b/authors/unknown.xml @@ -0,0 +1,203 @@ + +PyPy (Posts by Unknown)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:10 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssAsync HTTP benchmarks on PyPy3https://www.pypy.org/posts/2017/03/async-http-benchmarks-on-pypy3-1092124994927894138.htmlUnknown<div class="" id="magicdomid3"> +<span class="author-g-1lpsz122z2ma8y7sqx7l">Hello everyone,</span></div> +<div class="" id="magicdomid4"> +<br></div> +<div class="" id="magicdomid5"> +<span class="author-g-1lpsz122z2ma8y7sqx7l">Since </span><a href="https://blog.mozilla.org/blog/2016/08/04/mozilla-awards-585000-to-nine-open-source-projects-in-q2-2016/" target="_blank">Mozilla announced funding</a><span class="author-g-1d7t1l2jbyeccm49">,</span><span class="author-g-1lpsz122z2ma8y7sqx7l"> we've been working quite hard on delivering you a working </span><span class="author-g-1d7t1l2jbyeccm49">P</span><span class="author-g-1lpsz122z2ma8y7sqx7l">ython 3.5.</span></div> +<div class="" id="magicdomid5"> +<span class="author-g-1lpsz122z2ma8y7sqx7l"> </span></div> +<div class="" id="magicdomid5"> +<span class="author-g-1lpsz122z2ma8y7sqx7l">We are almost ready to release an alpha version of PyPy 3.5. Our goal is to release it shortly after the sprint. Many modules have already been ported and  it can probably run many Python 3 programs already. We are happy to receive any feedback after the next release.  </span></div> +<div class="" id="magicdomid6"> +<br></div> +<div class="" id="magicdomid7"> +<span class="author-g-1lpsz122z2ma8y7sqx7l">To show that the heart (asyncio) of Python 3 is already working we have prepared some benchmarks. 
They are done by </span><span class="author-g-d5i2rz122z7s6cn7iauy">Paweł Piotr Przeradowski @squeaky_pl</span><span class="author-g-1lpsz122z2ma8y7sqx7l"> for </span><span class="author-g-d5i2rz122z7s6cn7iauy">an HTTP</span><span class="author-g-1lpsz122z2ma8y7sqx7l"> workload on several</span><span class="author-g-1d7t1l2jbyeccm49"></span><span class="author-g-1lpsz122z2ma8y7sqx7l"> </span><span class="author-g-d5i2rz122z7s6cn7iauy">asynchronous IO</span><span class="author-g-1lpsz122z2ma8y7sqx7l"> libraries</span><span class="author-g-d5i2rz122z7s6cn7iauy">, namely </span><span class="author-g-1d7t1l2jbyeccm49">the </span><span class="author-g-d5i2rz122z7s6cn7iauy">relatively new </span><span class="author-g-d5i2rz122z7s6cn7iauy i"><i>asyncio</i></span><span class="author-g-1d7t1l2jbyeccm49 i"><i> and</i></span><span class="author-g-d5i2rz122z7s6cn7iauy"> </span><span class="author-g-d5i2rz122z7s6cn7iauy i"><i>curio</i></span><span class="author-g-1d7t1l2jbyeccm49 i"><i> libraries</i></span><span class="author-g-d5i2rz122z7s6cn7iauy"> and</span><span class="author-g-1d7t1l2jbyeccm49"> the</span><span class="author-g-d5i2rz122z7s6cn7iauy"> battle-tested </span><span class="author-g-d5i2rz122z7s6cn7iauy i"><i>tornado</i></span><span class="author-g-d5i2rz122z7s6cn7iauy">, </span><span class="author-g-d5i2rz122z7s6cn7iauy i"><i>gevent and Twisted</i></span><span class="author-g-1d7t1l2jbyeccm49 i"><i> libraries</i></span><span class="author-g-1lpsz122z2ma8y7sqx7l">. To see the benchmarks check out </span><span class="author-g-d5i2rz122z7s6cn7iauy url"><a href="https://github.com/squeaky-pl/zenchmarks">https://github.com/squeaky-pl/zenchmarks</a></span><span class="author-g-1lpsz122z2ma8y7sqx7l"> and the instructions </span><span class="author-g-d5i2rz122z7s6cn7iauy">for reproducing can be found inside README.md in the repository</span><span class="author-g-1lpsz122z2ma8y7sqx7l">. 
Raw results </span><span class="author-g-d5i2rz122z7s6cn7iauy">can be obtained from </span><span class="author-g-d5i2rz122z7s6cn7iauy url"><a href="https://github.com/squeaky-pl/zenchmarks/blob/master/results.csv">https://github.com/squeaky-pl/zenchmarks/blob/master/results.csv</a></span><span class="author-g-1lpsz122z2ma8y7sqx7l">.</span></div> +<div class="" id="magicdomid8"> +<br></div> +<div class="" id="magicdomid9"> +<span class="author-g-d5i2rz122z7s6cn7iauy">The + purpose of the presented benchmarks is showing that the upcoming PyPy release +is already working with unmodified code that runs on CPython 3.5. PyPy +also manages to make them run significantly faster.</span></div> +<div class="" id="magicdomid10"> +<br></div> +<div class="" id="magicdomid11"> +<span class="author-g-d5i2rz122z7s6cn7iauy">The + benchmarks consist of HTTP servers implemented on the top of the mentioned +libraries. All the servers are single-threaded relying on underlying +event loops to provide concurrency. Access logging was disabled to +exclude terminal I/O from the results. The view code consists of a +lookup in a dictionary mapping ASCII letters to verses from the famous +Zen of Python. If a verse is found the view returns it, otherwise a 404 +Not Found response is served. The 400 Bad Request and 500 Internal +Server Error cases are also handled.</span></div> +<div class="" id="magicdomid12"> +<br></div> +<div class="" id="magicdomid13"> +<span class="author-g-d5i2rz122z7s6cn7iauy">The workload was generated with the </span><a href="https://github.com/wg/wrk" target="_blank"><span class="author-g-d5i2rz122z7s6cn7iauy i"><i>wrk</i></span></a> <span class="author-g-d5i2rz122z7s6cn7iauy">HTTP benchmarking tool. It is run with one thread opening up to 100 +concurrent connections for 2 seconds and repeated 1010 times to get +consecutive measures. 
There is a <a href="https://github.com/squeaky-pl/zenchmarks/blob/master/zenhttp.lua" target="_blank">Lua script </a>provided</span><span class="author-g-d5i2rz122z7s6cn7iauy"> + that instructs <a href="https://github.com/wg/wrk" target="_blank">wrk</a> to continuously send 24 different requests that hit +different execution paths (200, 404, 400) in the view code. Also it is +worth noting that </span><a href="https://github.com/wg/wrk" target="_blank"><span class="author-g-d5i2rz122z7s6cn7iauy i"><i>wrk</i></span></a><span class="author-g-d5i2rz122z7s6cn7iauy"> will only count 200 responses as successful so the actual request per second throughput is higher.</span></div> +<div class="" id="magicdomid14"> +<br></div> +<div class="" id="magicdomid15"> +<span class="author-g-d5i2rz122z7s6cn7iauy">For your convenience all the used libraries versions are <a href="https://github.com/squeaky-pl/zenchmarks/tree/master/vendor" target="_blank">vendored</a> </span><span class="author-g-d5i2rz122z7s6cn7iauy">into the benchmark repository. There is also a precompiled portable version of </span><span class="author-g-d5i2rz122z7s6cn7iauy i"><i>wrk </i></span><span class="author-g-d5i2rz122z7s6cn7iauy">provided + that should run on any reasonably recent (10 year old or newer) Linux +x86_64 distribution. The benchmark was performed on a public cloud </span><a href="https://www.scaleway.com/" target="_blank"><span class="author-g-d5i2rz122z7s6cn7iauy i"><i>scaleway</i></span></a><span class="author-g-d5i2rz122z7s6cn7iauy"> x86_64 server launched in a Paris data center. The server was running +Ubuntu 16.04.01 LTS and reported Intel(R) Xeon(R) CPU D-1531 @ 2.20GHz +CPU. 
CPython 3.5.2 (shipped by default in Ubuntu) was benchmarked +against a <a href="https://buildbot.pypy.org/nightly/py3.5/pypy-c-jit-90326-88ef793308eb-linux64.tar.bz2" target="_blank">pypy-c-jit-90326-88ef793308eb-linux64</a></span><span class="author-g-d5i2rz122z7s6cn7iauy"> snapshot of the 3.5 compatibility branch of PyPy.</span></div> +<div class="" id="magicdomid16"> +<br></div> +<div class="" id="magicdomid17"> +<span class="author-g-1lpsz122z2ma8y7sqx7l"><a href="https://1.bp.blogspot.com/-cjlKx06ZBaY/WLb_S3TBWuI/AAAAAAAAAmI/s2fsZ-SaJiwS2B-nAmyTheJfMQrKFHuQACK4B/s1600/graphs.png"><img border="0" height="540" src="https://1.bp.blogspot.com/-cjlKx06ZBaY/WLb_S3TBWuI/AAAAAAAAAmI/s2fsZ-SaJiwS2B-nAmyTheJfMQrKFHuQACK4B/s640/graphs.png" width="640"></a></span></div> +<div class="" id="magicdomid19"> +<span class="author-g-1lpsz122z2ma8y7sqx7l"> </span></div> +<div class="" id="magicdomid19"> +<span class="author-g-1lpsz122z2ma8y7sqx7l"> </span></div> +<div class="" id="magicdomid19"> +<span class="author-g-1lpsz122z2ma8y7sqx7l"><a href="https://4.bp.blogspot.com/-Qn9iiR_-ZKA/WLb_pXFG9mI/AAAAAAAAAmQ/rvEYKM1KYbIzFmTeu9utt9oNALlc9mTNwCK4B/s1600/table.png"><img border="0" height="306" src="https://4.bp.blogspot.com/-Qn9iiR_-ZKA/WLb_pXFG9mI/AAAAAAAAAmQ/rvEYKM1KYbIzFmTeu9utt9oNALlc9mTNwCK4B/s640/table.png" width="640"></a> </span></div> +<div class="" id="magicdomid19"> +<span class="author-g-1lpsz122z2ma8y7sqx7l"> </span></div> +<div class="" id="magicdomid19"> +<span class="author-g-1lpsz122z2ma8y7sqx7l">We want to thank Mozilla for supporting our work!</span></div> +<div class="" id="magicdomid19"> +<span class="author-g-1lpsz122z2ma8y7sqx7l"><br></span></div> +<div class="" id="magicdomid19"> +<span class="author-g-1lpsz122z2ma8y7sqx7l">Cheers,</span></div> +<div class="" id="magicdomid19"> +<span class="author-g-1lpsz122z2ma8y7sqx7l">fijal, </span><span class="author-g-d5i2rz122z7s6cn7iauy">squeaky_pl and the PyPy Team</span></div> +<div class="" id="magicdomid20"> 
+<br></div>https://www.pypy.org/posts/2017/03/async-http-benchmarks-on-pypy3-1092124994927894138.htmlWed, 01 Mar 2017 17:28:00 GMTPyPy3 5.5.0 releasedhttps://www.pypy.org/posts/2016/10/pypy3-550-released-8069558680221199646.htmlUnknown<p>We're pleased to announce the release of PyPy3 v5.5.0. Coming four months after PyPy3.3 v5.2, it improves compatibility with Python 3.3 (3.3.5). We strongly recommend updating from previous PyPy3 versions.<br> +<br> +We would like to thank all of the people who donated to the py3k proposal for supporting the work that went into this release.<br> +<br> +You can download the PyPy3.3 v5.5.0 release here: <a href="https://pypy.org/download.html#python-3-3-5-compatible-pypy3-3-v5-5">https://pypy.org/download.html</a><br> +</p><ul> +<li>Improved Python 3.3.5 support.</li> +<ul> +<li>os.get_terminal_size(), time.monotonic(), str.casefold() </li> +<li>faulthandler module</li> +<li>There are still some missing features such as a PEP 393-like space efficient string representation and including performance regressions (e.g. issue #2305). The focus for this release has been updating to 3.3 compatibility. Windows is also not yet supported.</li> +</ul> +<li><i>ensurepip</i> is also included (it's only included in CPython 3 &gt;= 3.4).</li> +<li>Buffer interface improvements (numpy on top of cpyext)</li> +<li>Several JIT improvements (force-virtual-state, residual calls)</li> +<li>Search path for libpypy-c.so has changed (helps with cffi embedding on linux distributions)</li> +<li>Improve the error message when the user forgot the "self" argument of a method</li> +<li>Many more small improvements, please head over to our documentation for more information</li> +</ul> +<h3> +Towards Python 3.5</h3> +<div> +<div> +We have started to work on Python 3.5, which is a version used by many software projects. It seems to get wide adoption. 
We are happy to be part of the Mozilla Open Source Support (MOSS) initiative.</div> +<div> +<br></div> +<div> +Nevertheless we want to give our users the chance to use PyPy in their Python 3 projects, thus we have prepared this release.</div> +</div> +<h3> +What is PyPy?</h3> +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7.10 and 3.3.5. It's fast due to its integrated tracing JIT compiler.<br> +<span style="white-space: pre-wrap;"><span><br></span></span> We also welcome developers of other dynamic languages to see what RPython can do for them.<br> +<br> +This release supports:<br> +<ul> +<li>x86 machines on most common operating systems except Windows </li> +<li>newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux </li> +<li>big- and little-endian variants of PPC64 running Linux </li> +<li>s390x running Linux</li> +</ul> +Please try it out and let us know what you think. We welcome feedback, we know<br> +you are using PyPy, please tell us about it!<br> +<br> +Cheers<br> +<br> +The PyPy Teamreleasehttps://www.pypy.org/posts/2016/10/pypy3-550-released-8069558680221199646.htmlWed, 12 Oct 2016 09:37:00 GMTPyPy Tooling Upgrade: JitViewer and VMProfhttps://www.pypy.org/posts/2016/08/pypy-tooling-upgrade-jitviewer-and-5107430577468391432.htmlUnknown<p>We are happy to announce a major JitViewer (JV) update.<br> +JV allows you to inspect RPython's internal compiler representation (the language in which PyPy is implemented) including the generated machine code of your program. 
It can graphically show you details of the JIT compiled code and helps you pinpoint issues in your program.<br> +<br> +VMProf is a statistical CPU profiler for python imposing very little overhead at runtime.<br> +<br> +Both VMProf and JitViewer share a common goal: Present useful information for your python program.<br> +The combination of both can reveal more information than either alone.<br> +That is the reason why they are now both packaged together.<br> +We also updated <a href="https://vmprof.com/" target="_blank">vmprof.com</a> with various bug fixes and changes including an all new interface to JV.<br> +<br> +This work was done with the goal of improving tooling and libraries around the Python/PyPy/RPython ecosystem.<br> +Some of the tools we have developed:<br> +<br> +</p><ul> +<li><b>CFFI</b> - Foreign Function Interface that avoids CPyExt (<a href="https://cffi.readthedocs.io/en/latest/" target="_blank">CFFI docs</a>)</li> +<li><b>RevDB </b>- A reverse debugger for python (<a href="https://morepypy.blogspot.co.at/2016/07/reverse-debugging-for-python.html" target="_blank">RevDB blog post</a>)</li> +</ul> +<br> +and of course the tools we discuss here:<br> +<br> +<ul> +<li><b>VMProf</b> - A statistical CPU profiler (<a href="https://vmprof.readthedocs.io/en/latest/" target="_blank">VMProf docs</a>)</li> +<li><b>JitViewer</b> - Visualization of the log file produced by RPython (<a href="https://vmprof.readthedocs.io/en/latest/">JitLog docs</a>)</li> +</ul> +<br> +<h3> +A "brand new" JitViewer</h3> +<br> +JitViewer has two pieces: you create a log file when running your program, and then use a graphic tool to view what happened.<br> +<br> +The old logging format was a hard-to-maintain, plain-text-logging facility. 
Frequent changes often broke internal tools.<br> +Additionally, the logging output of a long running program required a lot of disk space.<br> +<br> +Our new binary format encodes data densely, makes use of some compression (gzip), and tries to remove repetition where possible.<br> +It also supports versioning for future proofing and can be extended easily.<br> +<br> +And *drumroll* you no longer need to install a tool to view the log yourself<br> +anymore! The whole system moved to vmprof.com and you can use it any time.<br> +<br> +Sounds great. But what can you do with it? Here are two examples for a PyPy user:<br> +<h3> +<br>PyPy crashed? Did you discover a bug?</h3> +<br> +For some hard to find bugs it is often necessary to look at the compiled code. The old<br> +procedure often required you to upload a plain text file which was hard to parse and to look through.<br> +<br> +A better way to share a crash report is to install the ``vmprof`` module from PyPI and execute either of the two commands:<br> +<span><br></span> +<span># this program does not crash, but has some weird behaviour</span><br> +<span>$ pypy -m jitlog --web &lt;your program args&gt;</span><br> +<span>...</span><br> +<span>PyPy Jitlog: https://vmprof.com/#/&lt;hash&gt;/traces</span><br> +<span># this program segfaults</span><br> +<span>$ pypy -m jitlog -o /tmp/log &lt;your program args&gt;</span><br> +<span>...</span><br> +<span>&lt;Segfault&gt;</span><br> +<span>$ pypy -m jitlog --upload /tmp/log</span><br> +<span>PyPy Jitlog: https://vmprof.com/#/&lt;hash&gt;/traces</span><br> +<br> +<br> +Providing the link in the bug report allows PyPy developers to browse and identify potential issues.<br> +<br> +<h3> +Speed issues</h3> +<br> +VMProf is a great tool to find hot spots that consume a lot of time in your program. As soon as you have identified code that runs slowly, you can switch to jitlog and maybe pinpoint certain aspects that do not behave as expected. 
You will find an overview, and are able to browse the generated code. If you cannot make sense of all that, you can just share the link with us and we can have a look too.<br> +<h3> +<br>Future direction</h3> +<br> +We hope that the new release will help both PyPy developers and PyPy users resolve potential issues and easily point them out.<br> +<br> +Here are a few ideas what might come in the next few releases:<br> +<br> +<br> +<ul> +<li> Combination of CPU profiles and the JITLOG (sadly did not make it into the current release).</li> +<li>Extend vmprof.com to be able to query vmprof/jitlog. <br>An example query for vmprof: 'methods.callsites() &gt; 5' and<br>for the jitlog would be 'traces.contains('call_assembler').hasbridge('*my_func_name*')'.</li> +<li>Extend the jitlog to capture the information of the optimization stage.</li> +</ul> +<br> +<br> +Richard Plangger (plan_rich) and the PyPy team<br> +<div> +<br></div>https://www.pypy.org/posts/2016/08/pypy-tooling-upgrade-jitviewer-and-5107430577468391432.htmlThu, 11 Aug 2016 11:52:00 GMT \ No newline at end of file diff --git a/authors/vilhjalmur-thorsteinsson.html b/authors/vilhjalmur-thorsteinsson.html new file mode 100644 index 000000000..48aee578d --- /dev/null +++ b/authors/vilhjalmur-thorsteinsson.html @@ -0,0 +1,113 @@ + + + + + +Posts by Vilhjálmur Þorsteinsson | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/vilhjalmur-thorsteinsson.xml b/authors/vilhjalmur-thorsteinsson.xml new file mode 100644 index 000000000..0b898c749 --- /dev/null +++ b/authors/vilhjalmur-thorsteinsson.xml @@ -0,0 +1,145 @@ + +PyPy (Posts by Vilhjálmur Þorsteinsson)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssNatural Language Processing for Icelandic with PyPy: A Case Studyhttps://www.pypy.org/posts/2022/02/nlp-icelandic-case-study.htmlVilhjálmur Þorsteinsson<section id="natural-language-processing-for-icelandic-with-pypy-a-case-study"> +<h2>Natural Language Processing for Icelandic with PyPy: A Case Study</h2> +<p><a class="reference external" href="https://en.wikipedia.org/wiki/Icelandic_language">Icelandic</a> is one +of the smallest languages of the world, with about 370.000 speakers. It +is a language in the Germanic family, most similar to Norwegian, Danish +and Swedish, but closer to the original <a class="reference external" href="https://en.wikipedia.org/wiki/Old_Norse">Old +Norse</a> spoken throughout +Scandinavia until about the 14th century CE.</p> +<p>As with other small languages, there are <a class="reference external" href="https://www.theguardian.com/world/2018/feb/26/icelandic-language-battles-threat-of-digital-extinction">worries that the language may +not +survive</a> +in a digital world, where all kinds of fancy applications are developed +first - and perhaps only - for the major languages. 
Voice assistants, +chatbots, spelling and grammar checking utilities, machine translation, +etc., are increasingly becoming staples of our personal and professional +lives, but if they don’t exist for Icelandic, Icelanders will gravitate +towards English or other languages where such tools are readily +available.</p> +<p>Iceland is a technology-savvy country, with <a class="reference external" href="https://ourworldindata.org/grapher/share-of-individuals-using-the-internet?tab=table">world-leading adoption +rates of the +Internet</a>, +PCs and smart devices, and a thriving software industry. So the +government figured that it would be worthwhile to fund a <a class="reference external" href="https://aclanthology.org/2020.lrec-1.418.pdf">5-year +plan</a> to build natural +language processing (NLP) resources and other infrastructure for the +Icelandic language. The project focuses on collecting data and +developing open source software for a range of core applications, such +as tokenization, vocabulary lookup, n-gram statistics, part-of-speech +tagging, named entity recognition, spelling and grammar checking, neural +language models and speech processing.</p> +<hr class="docutils"> +<p>My name is Vilhjálmur Þorsteinsson, and I’m the founder and CEO of a +software startup <a class="reference external" href="https://mideind.is/english.html">Miðeind</a> in Reykjavík, +Iceland, that employs 10 software engineers and linguists and focuses on +NLP and AI for the Icelandic language. 
The company participates in the +government’s language technology program, and has contributed +significantly to the program’s core tools (e.g., a tokenizer and a +parser), spelling and grammar checking modules, and a neural machine +translation stack.</p> +<p>When it came to a choice of programming languages and development tools +for the government program, the requirements were for a major, well +supported, vendor-and-OS-agnostic FOSS platform with a large and diverse +community, including in the NLP space. The decision to select Python as +a foundational language for the project was a relatively easy one. That +said, there was a bit of trepidation around the well known fact that +CPython can be slow for inner-core tasks, such as tokenization and +parsing, that can see heavy workloads in production.</p> +<p>I first became aware of PyPy in early 2016 when I was developing a +crossword game <a class="reference external" href="https://github.com/mideind/Netskrafl">Netskrafl</a> in Python 2.7 +for Google App Engine. I had a utility program that compressed a +dictionary into a Directed Acyclic Word Graph and was taking 160 +seconds  to run on CPython 2.7, so I tried PyPy and to my amazement saw +a 4x speedup (down to 38 seconds), with literally no effort besides +downloading the PyPy runtime.</p> +<p>This led me to select PyPy as the default Python interpreter for my +company’s Python development efforts as well as for our production +websites and API servers, a role in which it remains to this day. We +have followed PyPy’s upgrades along the way, being just about to migrate +our minimally required language version from 3.6 to 3.7.</p> +<p>In NLP, speed and memory requirements can be quite important for +software usability. On the other hand, NLP logic and algorithms are +often complex and challenging to program, so programmer productivity and +code clarity are also critical success factors. 
A pragmatic approach +balances these factors, avoids premature optimization and seeks a +careful compromise between maximal run-time efficiency and minimal +programming and maintenance effort.</p> +<p>Turning to our use cases, our Icelandic text +tokenizer <a class="reference external" href="https://github.com/mideind/Tokenizer">"Tokenizer"</a> is fairly light, +runs tight loops and performs a large number of small, repetitive +operations. It runs very well on PyPy’s JIT and has not required further +optimization.</p> +<p>Our Icelandic parser <a class="reference external" href="https://github.com/mideind/GreynirPackage">Greynir</a> +(known on PyPI as <a class="reference external" href="https://pypi.org/project/reynir/">reynir</a>) is, +if I may say so myself, a piece of work. It <a class="reference external" href="https://aclanthology.org/R19-1160.pdf">parses natural language +text</a> according to a +<a class="reference external" href="https://github.com/mideind/GreynirPackage/blob/master/src/reynir/Greynir.grammar">hand-written context-free +grammar</a>, +using an <a class="reference external" href="https://en.wikipedia.org/wiki/Earley_parser">Earley-type +algorithm</a> as <a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S0167642309000951">enhanced +by Scott and +Johnstone</a>. +The CFG contains almost 7,000 nonterminals and 6,000 terminals, and the +parser handles ambiguity as well as left, right and middle recursion. It +returns a packed parse forest for each input sentence, which is then +pruned by a scoring heuristic down to a single best result tree.</p> +<p>This parser was originally coded in pure Python and turned out to be +unusably slow when run on CPython - but usable on PyPy, where it was +3-4x faster. However, when we started applying it to heavier production +workloads, it  became apparent that it needed to be faster still. 
We +then proceeded to convert the innermost Earley parsing loop from Python +to <a class="reference external" href="https://github.com/mideind/GreynirPackage/blob/master/src/reynir/eparser.cpp">tight +C++</a> +and to call it from PyPy via +<a class="reference external" href="https://cffi.readthedocs.io/en/latest/">CFFI</a>, with callbacks for +token-terminal matching functions (“business logic”) that remained on +the Python side. This made the parser much faster (on the order of 100x +faster than the original on CPython) and quick enough for our production +use cases. Even after moving much of the heavy processing to C++ and using CFFI, PyPy still gives a significant speed boost over CPython.</p> +<p>Connecting C++ code with PyPy proved to be quite painless using CFFI, +although we had to figure out a few <a class="reference external" href="https://github.com/mideind/GreynirPackage/blob/master/src/reynir/eparser_build.py">magic incantations in our build +module</a> +to make it compile smoothly during setup from source on Windows and +MacOS in addition to Linux. Of course, we build binary PyPy and CPython +wheels for the most common targets so most users don’t have to worry +about setup requirements.</p> +<p>With the positive experience from the parser project, we proceeded to +take a similar approach for two other core NLP packages: our compressed +vocabulary package <a class="reference external" href="https://github.com/mideind/BinPackage">BinPackage</a> +(known on PyPI as <a class="reference external" href="https://pypi.org/project/islenska/">islenska</a>) and our +trigrams database package <a class="reference external" href="https://github.com/mideind/Icegrams">Icegrams</a>. +These packages both take large text input (3.1 million word forms with +inflection data in the vocabulary case; 100 million tokens in the +trigrams case) and compress it into packed binary structures. 
These +structures are then memory-mapped at run-time using +<a class="reference external" href="https://docs.python.org/3/library/mmap.html">mmap</a> and queried via +Python functions with a lookup time in the microseconds range. The +low-level data structure navigation is <a class="reference external" href="https://github.com/mideind/Icegrams/blob/master/src/icegrams/trie.cpp">done in +C++</a>, +called from Python via CFFI. The ex-ante preparation, packing, +bit-fiddling and data structure generation is fast enough with PyPy, so +we haven’t seen a need to optimize that part further.</p> +<p>To showcase our tools, we host public (and open source) websites such as +<a class="reference external" href="https://greynir.is/">greynir.is</a> for our parsing, named entity +recognition and query stack and +<a class="reference external" href="https://yfirlestur.is/">yfirlestur.is</a> for our spell and grammar +checking stack. The server code on these sites is all Python running on +PyPy using <a class="reference external" href="https://flask.palletsprojects.com/en/2.0.x/">Flask</a>, +wrapped in <a class="reference external" href="https://gunicorn.org/">gunicorn</a> and hosted on +<a class="reference external" href="https://www.nginx.com/">nginx</a>. The underlying database is +<a class="reference external" href="https://www.postgresql.org/">PostgreSQL</a> accessed via +<a class="reference external" href="https://www.sqlalchemy.org/">SQLAlchemy</a> and +<a class="reference external" href="https://pypi.org/project/psycopg2cffi/">psycopg2cffi</a>. This setup +has served us well for 6 years and counting, being fast, reliable and +having helpful and supporting communities.</p> +<p>As can be inferred from the above, we are avid fans of PyPy and +commensurately thankful for the great work by the PyPy team over the +years. 
PyPy has enabled us to use Python for a larger part of our +toolset than CPython alone would have supported, and its smooth +integration with C/C++ through CFFI has helped us attain a better +tradeoff between performance and programmer productivity in our +projects. We wish for PyPy a great and bright future and also look +forward to exciting related developments on the horizon, such as +<a class="reference external" href="https://hpyproject.org/">HPy</a>.</p> +</section>casestudyhttps://www.pypy.org/posts/2022/02/nlp-icelandic-case-study.htmlSun, 06 Feb 2022 15:00:00 GMT \ No newline at end of file diff --git a/authors/wim-lavrijsen.html b/authors/wim-lavrijsen.html new file mode 100644 index 000000000..68bb0911e --- /dev/null +++ b/authors/wim-lavrijsen.html @@ -0,0 +1,116 @@ + + + + + +Posts by Wim Lavrijsen | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+ + \ No newline at end of file diff --git a/authors/wim-lavrijsen.xml b/authors/wim-lavrijsen.xml new file mode 100644 index 000000000..06a2b837f --- /dev/null +++ b/authors/wim-lavrijsen.xml @@ -0,0 +1,578 @@ + +PyPy (Posts by Wim Lavrijsen)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:10 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rsscppyy status updatehttps://www.pypy.org/posts/2013/02/cppyy-status-update-808802896237239604.htmlWim Lavrijsen<p>The <a href="https://doc.pypy.org/en/latest/cppyy.html">cppyy module</a> +provides C++ bindings for PyPy by using the reflection information extracted +from C++ header files by means of the +<a href="https://root.cern.ch/drupal/content/reflex">Reflex package</a>. +In order to support C++11, the goal is to move away from Reflex and instead use +<a href="https://root.cern.ch/drupal/content/cling">cling</a>, an interactive +C++ interpreter, as the backend. +Cling is based on <a href="https://llvm.org/">llvm</a>'s +<a href="https://clang.llvm.org/">clang</a>. + +The use of a real compiler under the hood has the advantage that it is now +possible to cover every conceivable corner case. +The disadvantage, however, is that every corner case actually has to be +covered. +Life is somewhat easier when calls come in from the python interpreter, as +those calls have already been vetted for syntax errors and all lookups are +well scoped. +Furthermore, the real hard work of getting sane responses from and for C++ +in an interactive environment is done in cling, not in the bindings. +Nevertheless, it is proving a long road (but for that matter clang does not +support all of C++11 yet), so here's a quick status update showing that good +progress is being made. 
+ +</p><p>The following example is on CPython, not PyPy, but moving a third +(after Reflex and +<a href="https://root.cern.ch/root/Cint.html">CINT</a>) backend into place +underneath cppyy is straightforward compared to developing the backend +in the first place. + +Take this snippet of C++11 code +(<tt class="docutils literal"><span class="pre">cpp11.C</span></tt>): + +</p><p></p><pre> constexpr int data_size() { return 5; } + + auto N = data_size(); + + template&lt;class L, class R&gt; + struct MyMath { + static auto add(L l, R r) -&gt; decltype(l+r) { return l + r; } + }; + + template class MyMath&lt;int, int&gt;;</pre> + +<p>As a practical matter, most usage of new C++11 features will live in +implementations, not in declarations, and are thus never seen by the bindings. +The above example is therefore somewhat contrived, but it will serve to show +that these new declarations actually work. +The new features used here are +<tt class="docutils literal"><span class="pre">constexpr</span></tt>, +<tt class="docutils literal"><span class="pre">auto</span></tt>, and +<tt class="docutils literal"><span class="pre">decltype</span></tt>. +Here is how you could use these from CPython, using the +<a href="https://root.cern.ch/viewvc/trunk/bindings/pyroot/">PyROOT</a> +package, which has more than a passing resemblance to cppyy, as one is based +on the other: + +</p><p></p><pre> import ROOT as gbl + gbl.gROOT.LoadMacro('cpp11.C') + + print 'N =', gbl.N + print '1+1 =', gbl.MyMath(int, int).add(1,1)</pre> + +which, when entered into a file +(<tt class="docutils literal"><span class="pre">cpp11.py</span></tt>) and executed, +prints the expected results: + +<p></p><pre> $ python cpp11.py + N = 5 + 1+1 = 2</pre> + +In the example, the C++ code is compiled on-the-fly, rather than first generating +a dictionary as is needed with Reflex. 
+A deployment model that utilizes stored pre-compiled information is foreseen +to work with larger projects, which may have to pull in headers from many places. + +<p>Work is going to continue first on C++03 on cling with CPython (about 85% of +unit tests currently pass), with a bit of work on C++11 support on the side. +Once fully in place, it can be brought into a new backend for cppyy, after +which the remaining parts of C++11 can be fleshed out for both interpreters. + +</p><p>Cheers,<br> +Wim Lavrijsen</p>https://www.pypy.org/posts/2013/02/cppyy-status-update-808802896237239604.htmlThu, 28 Feb 2013 00:01:00 GMTC++ objects in cppyy, part 1: Data Membershttps://www.pypy.org/posts/2012/08/c-objects-in-cppyy-part-1-data-members-1105848719513737614.htmlWim Lavrijsen<p>The cppyy module makes it possible to call into C++ from PyPy through the +<a href="https://root.cern.ch/drupal/content/reflex">Reflex package</a>. +Documentation and setup instructions are +<a href="https://doc.pypy.org/en/latest/cppyy.html">available here</a>. +Recent work has focused on STL, low-level buffers, and code quality, but also +a lot on pythonizations for the +<a href="https://root.cern.ch/drupal/content/cint">CINT backend</a>, which is +mostly for High Energy Physics (HEP) use only. +A +<a href="https://www.pypy.org/posts/2012/06/architecture-of-cppyy-9077100041707701102.html">previous posting</a> walked +through the high-level structure and organization of the module, where it was +argued why it is necessary to write cppyy in RPython and generate bindings at +run-time for the best performance. +This posting details how access to C++ data structures is provided and is part +of a series of 3 postings on C++ object representation in Python: the second +posting will be about method dispatching, the third will tie up several odds +and ends by showing how the choices presented here and in part 2 work together +to make features such as auto-casting possible. 
+ + +</p><h3>Wrapping Choices</h3> + +<p>Say we have a plain old data type (POD), which is the simplest possible +data structure in C++. +Like for example: + +</p><pre> struct A { + int m_i; + double m_d; + };</pre> + +<p>What should such a POD look like when represented in Python? +Let's start by looking at a Python data structure that is functionally +similar, in that it also carries two public data members of the desired +types. +Something like this: + +</p><pre> class A(object): + def __init__(self): + self.m_i = 0 + self.m_d = 0.</pre> + +<p>Alright, now how to go about connecting this Python class with the former +C++ POD? +Or rather, how to connect instances of either. +The exact memory layout of a Python +<tt class="docutils literal"><span class="pre">A</span></tt> +instance is up to Python, and likewise the layout of a C++ +<tt class="docutils literal"><span class="pre">A</span></tt> instance is up +to C++. +Both layouts are implementation details of the underlying language, language +implementation, language version, and the platform used. +It should be no surprise then, that for example an +<tt class="docutils literal"><span class="pre">int</span></tt> in C++ looks +nothing like a +<tt class="docutils literal"><span class="pre">PyIntObject</span></tt>, even +though it is perfectly possible, in both cases, to point out in memory where +the integer value is. +The two representations can thus not make use of the same block of memory +internally. +However, the requirement is that the access to C++ from Python looks and feels +natural in its use, not that the mapping is exact. +Another requirement is that we want access to the actual object from both +Python and C++. +In practice, it is easier to provide natural access to C++ from Python than +the other way around, because the choices of memory layout in C++ are far more +restrictive: the memory layout defines the access, as the actual class +definition is gone at run-time. 
+The best choice then, is that the Python object will act as a proxy to the C++ +object, with the actual data always being in C++. + +</p><p>From here it follows that if the +<tt class="docutils literal"><span class="pre">m_i</span></tt> data member +lives in C++, then Python needs some kind of helper to access it. +Conveniently, since version 2.2, Python has a +<tt class="docutils literal"><span class="pre">property</span></tt> construct +that can take a getter and setter function that are called when the property +is used in Python code, and present it to the programmer as if it were a data +member. +So we arrive at this (note how the +<tt class="docutils literal"><span class="pre">property</span></tt> instance +is a variable at the class level): + +</p><pre> class A(object): + def __init__(self): + self._cppthis = construct_new_A() + m_i = property(get_m_i, set_m_i) + m_d = property(get_m_d, set_m_d)</pre> + +<p>The +<tt class="docutils literal"><span class="pre">construct_new_A</span></tt> +helper is not very interesting (the reflection layer can provide for it +directly), and methods are a subject for part 2 of this posting, so focus on +<tt class="docutils literal"><span class="pre">get_m_i</span></tt> +and <tt class="docutils literal"><span class="pre">set_m_i</span></tt>. +In order for the getter to work, the method needs to have access to the C++ +instance for which the Python object is a proxy. +On access, Python will call the getter function with the proxy instance for +which it is called. +The proxy has a +<tt class="docutils literal"><span class="pre">_cppthis</span></tt> data +member from which the C++ instance can be accessed (think of it as a pointer) +and all is good, at least for +<tt class="docutils literal"><span class="pre">m_i</span></tt>. 
+The second data member +<tt class="docutils literal"><span class="pre">m_d</span></tt>, however, +requires some more work: it is located at some offset into +<tt class="docutils literal"><span class="pre">_cppthis</span></tt>. +This offset can be obtained from the reflection information, which lets the +C++ compiler calculate it, so details such as +<a href="https://en.wikipedia.org/wiki/Byte_padding#Data_structure_padding">byte padding</a> +are fully accounted for. +Since the setter also needs the offset, and since both share some more details +such as the containing class and type information of the data member, it is +natural to create a custom property class. +The getter and setter methods then become bound methods of an instance of that +custom property, +<tt class="docutils literal"><span class="pre">CPPDataMember</span></tt>, and +there is one such instance per data member. +Think of something along these lines: + +</p><pre> def make_datamember(cppclass, name): + cppdm = cppyy.CPPDataMember(cppclass, name) + return property(cppdm.get, cppdm.set)</pre> + +where the +<tt class="docutils literal"><span class="pre">make_datamember</span></tt> +function replaces the call to +<tt class="docutils literal"><span class="pre">property</span></tt> in the +class definition above. + +<p>Now hold on a minute! +Before it was argued that Python and C++ can not share the same underlying +memory structure, because of choices internal to the language. +But if on the Python side choices are being made by the developer of the +language bindings, that is no longer a limitation. +In other words, why not go through e.g. the Python extension API, and do +this: + +</p><pre> struct A_pyproxy { + PyObject_HEAD + int m_i; + double m_d; + };</pre> + +<p>Doing so would save on +<a href="https://en.wikipedia.org/wiki/Malloc">malloc overhead</a> and remove +a pointer indirection. 
+There are some technical issues specific to PyPy for such a choice: there is +no such thing as +<tt class="docutils literal"><span class="pre">PyPyObject_HEAD</span></tt> +and the layout of objects is not a given as that is decided only at +translation time. +But assume that those issues can be solved, and also accept that there is no +problem in creating structure definitions like this at run-time, since the +reflection layer can provide both the required size and access to the +placement +<tt class="docutils literal"><span class="pre">new operator</span></tt> +(compare e.g. CPython's +<a href="https://docs.python.org/library/struct.html">struct module</a>). +There is then still a more fundamental problem: it must be possible to take +over ownership in Python from instances created in C++ and vice-versa. +With a proxy scheme, that is trivial: just pass the pointer and do the +necessary bookkeeping. +With an embedded object, however, not every use case can be implemented: e.g. +if an object is created in Python, passed to C++, and deleted in C++, it +must have been allocated independently. +The proxy approach is therefore still the best choice, although embedding +objects may provide for optimizations in some use cases. 
+ + +</p><h3>Inheritance</h3> + +<p>The next step, is to take a more complicated C++ class, one with inheritance +(I'm leaving out details such as constructors etc., for brevity): + +</p><pre> class A { + public: + virtual ~A() {} + int m_i; + double m_d; + }; + + class B : public A { + public: + virtual ~B() {} + int m_j; + };</pre> + +<p>From the previous discussion, it should already be clear what this will look +like in Python: + +</p><pre> class A(object): + def __init__(self): + self._cppthis = construct_new_A() + m_i = make_datamember('A', 'm_i') + m_d = make_datamember('A', 'm_d') + + class B(A): + def __init__(self): + self._cppthis = construct_new_B() + m_j = make_datamember('B', 'm_j')</pre> + +<p>There are some minor adjustments needed, however. +For one, the offset of the +<tt class="docutils literal"><span class="pre">m_i</span></tt> data member +may be no longer zero: it is possible that a virtual function dispatch table +(<a href="https://en.wikipedia.org/wiki/Virtual_method_table">vtable</a>) +pointer is added at the beginning of +<tt class="docutils literal"><span class="pre">A</span></tt> (an alternative +is to have the vtable pointer at the end of the object). +But if +<tt class="docutils literal"><span class="pre">m_i</span></tt> is handled the +same way as +<tt class="docutils literal"><span class="pre">m_d</span></tt>, with the +offset provided by the compiler, then the compiler will add the bits, if any, +for the vtable pointer and all is still fine. +A real problem could come in however, with a call of the +<tt class="docutils literal"><span class="pre">m_i</span></tt> property on +an instance of +<tt class="docutils literal"><span class="pre">B</span></tt>: in that case, +the <tt class="docutils literal"><span class="pre">_cppthis</span></tt> +points to a <tt class="docutils literal"><span class="pre">B</span></tt> +instance, whereas the getter/setter pair expect an +<tt class="docutils literal"><span class="pre">A</span></tt> instance. 
+In practice, this is usually not a problem: compilers will align +<tt class="docutils literal"><span class="pre">A</span></tt> and +<tt class="docutils literal"><span class="pre">B</span></tt> and calculate +an offset for +<tt class="docutils literal"><span class="pre">m_j</span></tt> from the start +of <tt class="docutils literal"><span class="pre">A</span></tt>. +Still, that is an implementation detail (even though it is one that can be +determined at run-time and thus taken advantage of by the JIT), so it can not +be relied upon. +The <tt class="docutils literal"><span class="pre">m_i</span></tt> getter +thus needs to take into account that it can be called with a derived type, +and so it needs to add an additional offset. +With that modification, the code looks something like this (as you would have +guessed, this is getting more and more into pseudo-code territory, although it +is conceptually close to the actual implementation in cppyy): + +</p><pre> def get_m_i(self): + return int(self._cppthis + offset(A, m_i) + offset(self.__class__, A))</pre> + +<p>Which is a shame, really, because the offset between +<tt class="docutils literal"><span class="pre">B</span></tt> and +<tt class="docutils literal"><span class="pre">A</span></tt> is going +to be zero most of the time in practice, and the JIT can not completely +<a href="https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_15-3281215865169782921.html">elide</a> +the offset calculation (as we will see later; it is easy enough to elide if +<tt class="docutils literal"><span class="pre">self.__class__</span></tt> is +<tt class="docutils literal"><span class="pre">A</span></tt>, though). +One possible solution is to repeat the properties for each derived class, i.e. +to have a +<tt class="docutils literal"><span class="pre">get_B_m_i</span></tt> etc., but +that looks ugly on the Python side and anyway +does not work in all cases: e.g. 
with multiple inheritance where there are +data members with the same name in both bases, or if +<tt class="docutils literal"><span class="pre">B</span></tt> itself has a +public data member called +<tt class="docutils literal"><span class="pre">m_i</span></tt> that shadows +the one from <tt class="docutils literal"><span class="pre">A</span></tt>. +The optimization then, is achieved by making +<tt class="docutils literal"><span class="pre">B</span></tt> in charge of the +offset calculations, by making +<tt class="docutils literal"><span class="pre">offset</span></tt> a method of +<tt class="docutils literal"><span class="pre">B</span></tt>, like so: + +</p><pre> def get_m_i(self): + return int(self._cppthis + offset(A, m_i) + self.offset(A))</pre> + +<p>The insight is that by scanning the inheritance hierarchy of a derived +class like <tt class="docutils literal"><span class="pre">B</span></tt>, you +can know statically whether it may sometimes need offsets, or whether the +offsets are always going to be zero. +Hence, if the offsets are always zero, the method +<tt class="docutils literal"><span class="pre">offset</span></tt> on +<tt class="docutils literal"><span class="pre">B</span></tt> will +simply return the literal +<tt class="docutils literal"><span class="pre">0</span></tt> as its +implementation, with the JIT taking care of the rest through inlining and +constant folding. +If the offset could be non-zero, then the method will perform an actual +calculation, and it will let the JIT elide the call only if possible. + + +</p><h3>Multiple Virtual Inheritance</h3> + +<p>Next up would be multiple inheritance, but that is not very interesting: we +already have the offset calculation between the actual and base class, which +is all that is needed to resolve any multiple inheritance hierarchy. +So, skip that and move on to multiple <i>virtual</i> inheritance. 
+That that is going to be a tad more complicated will be clear if you show the +following code snippet to any old C++ hand and see how they respond. +Most likely you will be told: "Don't ever do that." +But if code can be written, it will be written, and so for the sake of the +argument, what would this look like in Python: + +</p><pre> class A { + public: + virtual ~A() {} + int m_a; + }; + + class B : public virtual A { + public: + virtual ~B() {} + int m_b; + }; + + class C : public virtual A { + public: + virtual ~C() {} + int m_c; + }; + + class D : public virtual B, public virtual C { + public: + virtual ~D() {} + int m_d; + };</pre> + +<p>Actually, nothing changes from what we have seen so far: the scheme as laid +out above is fully sufficient. +For example, <tt class="docutils literal"><span class="pre">D</span></tt> +would simply look like: + +</p><pre> class D(B, C): + def __init__(self): + self._cppthis = construct_new_D() + m_d = make_datamember('D', 'm_d')</pre> + +<p>Point being, the only complication added by the multiple virtual +inheritance, is that navigation of the C++ instance happens with pointers +internal to the instance rather than with offsets. +However, it is still a fixed offset from any location to any other location +within the instance as its parts are laid out consecutively in memory (this is +not a requirement, but it is the most efficient, so it is what is used in +practice). +But what you can not do, is determine the offset statically: you need a live +(i.e. constructed) object for any offset calculations. +In Python, everything is always done dynamically, so that is of itself not a +limitation. +Furthermore, +<tt class="docutils literal"><span class="pre">self</span></tt> is already +passed to the offset calculation (remember that this was done to put the +calculation in the derived class, to optimize the common case of zero +offset), thus a live C++ instance is there precisely when it is needed. 
+The call to the offset calculation is hard to elide, since the instance will +be passed to a C++ helper and so the most the JIT can do is guard on the +instance's memory address, which is likely to change between traces. +Instead, explicit caching is needed on the base and derived types, allowing +the JIT to elide the lookup in the explicit cache. + + +</p><h3>Static Data Members and Global Variables</h3> + +<p>That, so far, covers all access to instance data members. +Next up are static data members and global variables. +A complication here is that a Python +<tt class="docutils literal"><span class="pre">property</span></tt> needs to +live on the class in order to work its magic. +Otherwise, if you get the property, it will simply return the getter function, +and if you set it, it will disappear. +The logical conclusion then, is that a +<tt class="docutils literal"><span class="pre">property</span></tt> +representing a static or global variable, needs to live on the class of the +class, or the metaclass. +If done directly though, that would mean that every static data member is +available from every class, since all Python classes have the same metaclass, +which is class +<tt class="docutils literal"><span class="pre">type</span></tt> (and which is +its own metaclass). +To prevent that from happening and because +<tt class="docutils literal"><span class="pre">type</span></tt> is actually +immutable, each proxy class needs to have its own custom metaclass. +Furthermore, since static data can also be accessed on the instance, the +class, too, gets a +<tt class="docutils literal"><span class="pre">property</span></tt> object +for each static data member.
+Expressed in code, for a basic C++ class, this looks as follows: + +</p><pre> class A { + public: + static int s_i; + };</pre> + +<p>Paired with some Python code such as this, needed to expose the static +variable both on the class and the instance level: + +</p><pre> meta_A = type(CppClassMeta, 'meta_A', [CPPMetaBase], {}) + meta_A.s_i = make_datamember('A', 's_i') + + class A(object): + __metaclass__ = meta_A + s_i = make_datamember('A', 's_i')</pre> + +<p>Inheritance adds no complications for the access of static data per se, but +there is the issue that the metaclasses must follow the same hierarchy as the +proxy classes, for the Python method resolution order (MRO) to work. +In other words, there are two complete, parallel class hierarchies that map +one-to-one: a hierarchy for the proxy classes and one for their metaclasses. + +</p><p>A parallel class hierarchy is used also in other highly dynamic, +object-oriented environments, such as for example +<a href="https://en.wikipedia.org/wiki/Metaclass#In_Smalltalk-80">Smalltalk</a>. +In Smalltalk as well, class-level constructs, such as class methods and data +members, are defined for the class in the metaclass. +A metaclass hierarchy has further uses, such as lazy loading of nested +classes and member templates (this would be coded up in the base class of all +metaclasses: +<tt class="docutils literal"><span class="pre">CPPMetaBase</span></tt>), and +makes it possible to distribute these over different reflection libraries. +With this in place, you can write Python codes like so: + +</p><pre> &gt;&gt;&gt;&gt; from cppyy.gbl import A + &gt;&gt;&gt;&gt; a = A() + &gt;&gt;&gt;&gt; a.s_i = 42 + &gt;&gt;&gt;&gt; print A.s_i == a.s_i + True + &gt;&gt;&gt;&gt; # etc.</pre> + +<p>The implementation of the getter for +<tt class="docutils literal"><span class="pre">s_i</span></tt> is a lot +easier than for instance data: the static data lives at a fixed, global, +address, so no offset calculations are needed. 
+The same is done for global data or global data living in namespaces: +namespaces are represented as Python classes, and global data are implemented +as properties on them. +The need for a metaclass is one of the reasons why it is easier for namespaces +to be classes: module objects are too restrictive. +And even though namespaces are not modules, you still can, with +some limitations, +<tt class="docutils literal"><span class="pre">import</span></tt> from +them anyway. + +</p><p>It is common that global objects themselves are pointers, and therefore it +is allowed that the stored +<tt class="docutils literal"><span class="pre">_cppthis</span></tt> is not a +pointer to a C++ object, but rather a pointer to a pointer to a C++ object. +A double pointer, as it were. +This way, if the C++ code updates the global pointer, it will automatically +reflect on the Python side in the proxy. +Likewise, if on the Python side the pointer gets set to a different variable, +it is the pointer that gets updated, and this will be visible on the C++ side. +In general, however, the same caveat as for normal Python code applies: in +order to set a global object, it needs to be set within the scope of that +global object. 
+As an example, consider the following code for a C++ namespace +<tt class="docutils literal"><span class="pre">NS</span></tt> with +global variable +<tt class="docutils literal"><span class="pre">g_a</span></tt>, which behaves +the same as Python code for what concerns the visibility of changes to the +global variable: + +</p><pre> &gt;&gt;&gt;&gt; from cppyy.gbl import NS, A + &gt;&gt;&gt;&gt; from NS import g_a + &gt;&gt;&gt;&gt; g_a = A(42) # does NOT update C++ side + &gt;&gt;&gt;&gt; print NS.g_a.m_i + 13 # the old value happens to be 13 + &gt;&gt;&gt;&gt; NS.g_a = A(42) # does update C++ side + &gt;&gt;&gt;&gt; print NS.g_a.m_i + 42 + &gt;&gt;&gt;&gt; # etc.</pre> + + +<h3>Conclusion</h3> + +<p>That covers all there is to know about data member access of C++ classes in +Python through a reflection layer! +A few final notes: RPython does not support metaclasses, and so the +construction of proxy classes (code like +<tt class="docutils literal"><span class="pre">make_datamember</span></tt> +above) happens in Python code instead. +There is an overhead penalty of about 2x over pure RPython code associated +with that, due to extra guards that get inserted by the JIT. +A factor of 2 sounds like a lot, but the overhead is tiny to begin with, and +2x of tiny is still tiny and it's not easy to measure. +The class definition of the custom property, +<tt class="docutils literal"><span class="pre">CPPDataMember</span></tt>, is +in RPython code, to be transparent to the JIT. +The actual offset calculations are in the reflection layer. +Having the proxy class creation in Python, with structural code in RPython, +complicates matters if proxy classes need to be constructed on-demand. +For example, if an instance of an as-of-yet unseen type is returned by a +method. +Explaining how that is solved is a topic of part 2, method calls, so stay +tuned. 
+ +</p><p>This posting laid out the reasoning behind the object representation of C++ +objects in Python by cppyy for the purpose of data member access. +It explained how the chosen representation of offsets gives rise to a very +pythonic representation, which allows Python introspection tools to work as +expected. +It also explained some of the optimizations done for the benefit of the JIT. +Next up are method calls, which will be described in part 2.</p>https://www.pypy.org/posts/2012/08/c-objects-in-cppyy-part-1-data-members-1105848719513737614.htmlMon, 13 Aug 2012 09:26:00 GMT \ No newline at end of file diff --git a/blog/index-1.html b/blog/index-1.html new file mode 100644 index 000000000..233838641 --- /dev/null +++ b/blog/index-1.html @@ -0,0 +1,703 @@ + + + + + + +PyPy (old posts, page 1) | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+
+
+
+ + Eric wrote on 2007-11-30 09:27: +
+
+

Absolutely fascinating! I have to admit that there were a few (ok, a lot) of times where I couldn't quite follow along, but you guys are doing some absolutely amazing work.

+
+
+
+ +

Sprint Pictures

+ +
+

The obligatory sprint picture post... + + + + +Alexander Schremmer, Armin Rigo, Maciek Fijalkowski, Antonio Cuni + +Anders Chrigström, Samuele Pedroni, Laura Creighton, Jacob Hallén, Carl Friedrich Bolz, Richard Emslie, Maciek Fijalkowski, Armin Rigo + +Holger Krekel + +Whiteboard with "real world goals" dependencies.

+
+

Sprint Discussions: Wrapping External Libraries

+ +
+

A more technical discussion during the sprint was about the next steps for the external module problem (minutes). One of PyPy's biggest problems in becoming more generally useful are C extension modules, which can't work with PyPy's Python interpreter. We already reimplemented many of the more commonly used extension modules in CPython's standard library in Python or RPython. However, there are more missing and there is no way to implement all the extension modules that other people have written.

+

+

Whiteboard after the discussion. +

+

+

+

Therefore we need a different approach to this problem. Extension modules are commonly written for two different reasons, one being speed, the other being wrapping non-Python libraries. At the moment we want mostly to approach a solution for the latter problem, because we hope that the JIT will eventually make it possible to not have to write extension modules for speed reasons any more.

There are two rough ideas to approach this problem in the near future (there are other, more long-term ideas that I am not describing now): One of them is to add the ctypes module to PyPy's Python interpreter, which would mean re-implementing it since the existing implementation is written in C.

The other way would be to work on the existing way to get extensions in that PyPy provides, which are "mixed modules". Mixed modules are written in a combination of RPython and normal Python code. To then wrap C libraries you would use rffi, which is the foreign function interface of RPython.

+

The discussion round: Maciek Fijalkowski, Armin Rigo, Richard Emslie, Alexander Schremmer.

Both approaches have problems: With ctypes you have no built-in way to query C header files for structure layouts and constants, so you have to hard-wire them, which is highly platform dependent. Mixed modules are not really fun to write, since they need to be RPython and we currently don't have a way to do separate compilation, so you always need to translate PyPy's whole Python interpreter to see whether your module is correct.

In the meeting it was decided to first go for a ctypes replacement. The replacement would be written in pure Python, we already have a very thin wrapper around libffi which the new ctypes implementation would use. The goal to reach would be to get the pygame implementation in ctypes to run on PyPy.

To make ctypes more useful in general to write this kind of wrappers, we will probably extract some code that we have already written for PyPy's own usage: it gives a way to write "imprecise" declarations ("a structure with at least fields called x and y which are of some kind of integer type") and turn them into exact ctypes declarations, internally using the C compiler to inspect the platform headers.

After this is done we should approach separate compilation so that developing modules in RPython has a quicker turnaround time. This is somewhat involved to implement for technical reasons. There are ideas how to implement it quickly to make it usable for prototyping, but it's still a lot of work.

+
+
+
+
+ + Simon Burton wrote on 2007-11-25 18:37: +
+
+

Is it not possibe to test rpython extension modules for pypy on top of cpython ? (ie. without compilation)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2007-11-25 21:39: +
+
+

Yop, sure it is. PyPy extension modules runs through ctypes on top of CPython.

+
+
+
+
+ + Anonymous wrote on 2007-11-26 13:04: +
+
+

I guess that Simon meant to ask why easier module testing requires separate compilation. The fact is that even if a module runs fine on top of CPython, there will be some RPython issues that are only visible when you try to translate it.

Armin

+
+
+
+ +

Sprint Discussions: Releases, Testing

+ +
+

During the sprint we had various discussions about technical issues as well as planning discussions about how we want to go about things. One of them was about the stability of PyPy, how to ensure stability, how to handle releases and approaches to being more "usable". I will describe this discussion in this post (there are also minutes of the meeting). + + + +The Meetings whiteboard + +

+

+

Testing +

First we discussed the current situation in terms of testing. PyPy has been extremely testing-oriented from the start, it is being developed almost exclusively in test-driven-development style. To deal with the large number of tests we already have some infrastructure in place:

+

As you can see, we are lacking in the Windows testing area, which is an even worse problem because none of the currently active developers has Windows as his primary OS. We should improve this by finding a Windows machine where the tests are run nightly and where we can log in to try bug-fixes quickly. The latter bit is important, we had a nightly windows test run before (thanks to Scott Dial) but it didn't help, because even if you tried to fix a bug you would have to wait until the next night to see whether it worked.

Another very serious problem is that of aggregation: we have these various test runs that all have a web interface to check for errors but there is no easy way to find out which tests failed. You have to go to each page and even some sub-pages to see what needs fixing, which is a tedious process. The idea for solving this is to aggregate all the available information into some sort of testing-entry-point page that gives a quick overview of the regressions that happened during the night. It's not clear whether we can achieve that with existing tools (buildbots or whatever), but we will investigate that. +

Releases +

+

The discussion about releases was more on a fundamental and less on a concrete level (especially when it comes to time-frames). We discussed what it means to make a release, because obviously it is more than just taking an SVN revision and putting a tarball of it onto the webpage. During the EU period we were required to make several releases, but those were not really meant to be more than technology previews for the brave adventurers to try. In the future we have the goal to release things that are more stable and hopefully more practically useful. The plan is to use medium-sized Python applications that have a chance to run on top of PyPy because they don't use too many extension modules (web apps being likely candidates) and that have good unit-tests themselves. The first step would be to find some applications that fit this description, fix the bugs that prevent PyPy from running them and from then on run them nightly on one of the testing machines to check for regressions. This would allow us to be more confident when stating that "PyPy works".

Another thing to keep in mind for releases is the special features that our Python interpreter provides (e.g. the thunk and the taint object space, our stackless features, transparent proxies, sandboxing, special object implementations). Those features are neither tested by the CPython tests nor by any existing applications. Therefore we cannot really be confident that these features work and don't have too many bugs (in fact, the first time somebody really used the become feature of the thunk space in earnest he found a serious bug that is not fixed so far). To get around this problem, we plan to write small-to-medium sized example applications for each of these features (for stackless we can maybe use one of the existing stackless examples). This will hopefully find bugs and will also make it possible to evaluate whether the features make sense from a language design point of view.

A minor thing to make releases easier is to be able to not only have the tests be run once a night but also be able to trigger them manually on the release branch before doing the release.

+

Publishing Cool Things +

Since we decided that the releases we make should be stable and usable, we also discussed how we would go about making new "cool things" like features, experiments etc. better known. The consensus was that this blog is probably the best forum for doing this. In addition we discussed having a stabler snapshot of the trunk made to ensure that people wanting to play around with these features don't accidentally get +a broken version.

+

Helping Out +

+

Right now we are still in cleanup mode (the cleanup sprint is nearly done, but we haven't finished all the cleanups yet), so we won't be able to start on the above things right now. However, they will have a strong focus soon. So if you are interested in trying to run programs on top of PyPy or in writing new ones that use the new features you are most welcome to do so and we will try to fix the bugs or help you do it (of course some tolerance for frustration is needed when you do that, because the bugs that turn up tend to be obscure). We have not been perfect at this in the past, but this will have to change.

+
+
+
+
+ + Bill Mill wrote on 2007-11-25 14:00: +
+
+

Please do publish more about the cool things in pypy! I find that, for most languages, I get the right information level from blog announcements. Reading the mailing list is like drinking from a fire hose when I only want to stay informed of where you guys are at.

(I post a lot on reddit too, and it's nicer to post blog articles than mailing list postings)

+
+
+
+ +

Ropes branch merged

+ +
+

This afternoon we merged the ropes branch that I have been working on on the side for a while (also to cut down the number of currently active branches a bit, since we are doing major cleanups right now). It contained a new (optional) implementation of the unicode type using the rope data structure. Ropes essentially use concatenation trees to represent strings. The leaves of the trees contain either byte arrays or arrays of unicode characters. + + +Of course the fact that ropes are used is mostly completely transparent to the user (as usual in the pypy world :) ). Normal and unicode strings are implemented with them, but just from the behavior of these types the user has a hard time noticing. Of course there are significant changes in performance (in both directions). + +Using ropes to implement strings has some interesting effects. The most obvious one is that string concatenation, slicing and repetition are really fast (I suspect that they are amortized O(1), but haven't proved it). This is probably not helping most existing Python programs because people tend to code in such a way that these operations are not done too often. However, with ropes it is possible to do something like this: +

+
Python 2.4.1 (pypy 1.0.0 build 48942) on linux2
+Type "help", "copyright", "credits" or "license" for more information.
+>>>> import sys
+>>>> a = "a" * sys.maxint
+>>>> hash(a)
+-768146060
+
+ +So somebody who is targeting a Python implementation that has ropes could write his code in such a way that this is taken into account. Another interesting feature is that ropes try to share as much data as possible with each other, so if you create a large slice of a large string, the slice is not going to take much additional memory. + +One of the most interesting use-cases of ropes is in combination with unicode. The leaf nodes of a rope unicode string can be either a byte array or an array of unicode characters. This means that a unicode string that uses only characters that are latin-1 or ascii will use one byte of memory per character. If a unicode string contains mostly characters that are latin-1 and a few that are not, it will still use 1 byte for most of the latin-1 characters. This property also allows really fast encoding and decoding of unicode strings as long as they don't contain non-latin-1 characters (only with certain encodings of course): +
>>>> s = "a" * sys.maxint
+>>>> u = s.decode("ascii")
+>>>> u = s.decode("latin-1")
+>>>> u = s.decode("utf-8")
+Again, encoding and decoding strings that contain a few non-latin-1 characters is still efficient: +
>>>> u = "a" * 100000000 + u"\uffff"
+>>>> s = u.encode("utf-8")
+>>>> len(s)
+100000003
+I am not completely certain how useful this behaviour is for real-life applications, but it's kind of cool :-). It saves memory for European languages that contain few non-ascii characters. + +Of course there is at least one down-side to all of this, which is that string indexing is not O(1) any longer, because we have to walk down the tree to find the leaf that actually contains the character. I have not measured much, but I expect it to be quite fast in practice, because the trees are never deeper than 32 nodes. +
+
+
+
+ + Unknown wrote on 2007-11-23 18:54: +
+
+

awesome.

and what about pattern matching? for substring and regexps?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2007-11-23 22:59: +
+
+

Substring matching should not be too slow, but there was no specific work on that. I think it only makes sense to optimize this once someone has a concrete application for that, because otherwise you don't know what you are optimizing for. So if anyone has ideas, I am interested to hear them.

+
+
+
+
+ + Bruce Hoult wrote on 2007-11-24 02:53: +
+
+

Go and try this year's ICFP programming contest task (just the initial virtual machine part) using this.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2007-11-24 20:38: +
+
+

Ah, nice idea. Somebody should try this.

+
+
+
+
+ + Titus Brown wrote on 2007-12-03 11:03: +
+
+

I'm writing up a GHOP task related to this; let me know if anyone is interested in mentoring it.

(I'm asking for a CPython implementation, and separately the ICFP implementation)

--titus
titus@idyll.org

+
+
+
+ +

Unicode support in RPython

+ +
+

In recent days we (Carl Friedrich, Anto and I) implemented native unicode support for RPython. This means that you can now write u'xxxx' directly in your RPython program, as well as unicode(some_string_variable), and most of the unicode methods should work too. The things that don't work are operations that require the unicode database (such as .upper() and friends) and encodings (unicode(x, encoding) for example). Right now our python interpreter does not use this at all, but that's the next step. +

+Cheers,
+fijal

+
+
+
+
+ + Miguel Filipe wrote on 2007-11-13 15:29: +
+
+

Hi there,
It would be nice for the pypy site to mention this blog, or update the news section.
I stumbled here from reading the ML.

BTW: for when a new release?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2007-11-13 15:55: +
+
+

Hi Miguel,

the blog is still somewhat unofficial so I don't want to give it completely official status by linking it from the PyPy page. But I guess a news item makes sense.

There are no release-plans, we will discuss it next week on the sprint.

Cheers,

Carl Friedrich

+
+
+
+
+ + Anonymous wrote on 2008-01-16 18:52: +
+
+

The blog is now official and posted
on the PyPy website. The plan is to
use it as the main channel for
updates on what is happening.

Jacob

+
+
+
+ +

The PyPy Road Show (1): New York and IBM

+ +
+

We're slowly getting adjusted to the jet-lag (except maybe Samuele). Time to blog...

The past two days at IBM, in New York, have been quite interesting. The place is a research center. Feels University-like, but meeting rooms have no windows and the air conditioning is fixed on "polar" settings. The building is of course heated at this time of the year, and then the meeting rooms are air-conditioned... I guess that just doesn't make sense to me.

We gave a 1h30 talk to a general audience first. Then we had a compact schedule of meetings with various people or groups of people. In the early preparations for this trip we planned to stay only one day, but Martin Hirzel, our host, found too many people that wanted to talk with us :-)

I think that both we and most of the people we talked with got interesting things out of the meetings. On our side, let me point out a few highlights.

We asked two people that worked on the GCs for the Jikes RVM if reusing them for RPython programs would make sense. They didn't scream "you're mad!", so I guess the answer is yes. Apparently, it has been done before, too. I'm still not sure I got this right, but it seems that Microsoft paid someone money to integrate them with Rotor... Then the real-time garbage-collection guys explained to us the things that we need to take care about when writing a VM: real-time GC needs not only write barriers and read barriers, but pointer-equality-comparison barriers... They have bad memories of trying to add a posteriori this kind of barrier into existing VMs, so it took us a bit of explaining to make them realize that adding new kinds of barriers is mostly trivial for us (I'm still not 100% sure they got it... bad memories can stick hard).

Then we had discussions with JIT people. Mostly, this allowed us to confirm that Samuele has already got a good idea about what Java JITs like Hotspot can do, and in which kind of situation they work well. As expected, the most difficult bit for a PyPy-like JIT that would run on top of a JVM would be the promotion. We discussed approaches like first generating fall-back cases that include some instrumentation logic, and regenerating code with a few promoted values after some time if it seems like it will be a gain. Replacing a method with a new version is difficult to do in a way that is portable across Java VMs. There are still possible workarounds, but it also means that if we really want to explore this seriously, we should consider experimenting with specific VMs - e.g. the Jikes RVM gives (or could be adapted to give) hooks to replace methods with new versions of them, which is something that the JVM's own JIT internally does all the time.

We showed the taint object space and the sandboxed PyPy to several groups of security people. I won't say much about it here, beyond noting that they were generally interested in the fact that the corresponding code is very short and easy to play with. They are doing a lot on security in Java and... PHP, for web sites. Someone could write a PHP interpreter (!) in PyPy to get the same kind of results. But as Laura and Samuele put it, there are things in life you do for fun, and things you do for money :-)

We're in Vancouver today and tomorrow. More about this later...

Armin Rigo

+
+
+
+
+ + Miguel Filipe wrote on 2007-11-13 15:12: +
+
+

Thast's amazing news.
I always thought that the forest of groups wouring on VM technologies should work more closely.

I sure am happy to know that PyPy is having input and talking to a bunch of ibm'ers who have worked or work on VM, JIT, GC technologies.

Best regards,

+
+
+
+ +

The PyPy Road Show

+ +
+

Armin Rigo, Samuele Pedroni, Laura Creighton and Jacob Hallén are on a two-week-trip through the USA and Canada, to present PyPy to various companies and institutions. The next few blog entries will cover our experiences and adventures. + +Here is a glimpse of our schedule (all November 2007): +

+
    +
  • 4th: Chicago
  • +
  • 5th-6th: New York
  • +
  • 7th-8th: Vancouver
  • +
  • 9th-18th: San Francisco and the Bay Area +
  • +
Notably, we meet with IBM Research in New York and give a Google Talk in the Bay Area. +
+
+
+
+ + Arnar Birgisson wrote on 2007-11-14 11:08: +
+
+

Hey there,

Will they by any chance be stopping over in Iceland on their way back?

cheers,
Arnar

+
+
+
+
+ + Anonymous wrote on 2007-11-14 14:40: +
+
+

Alas, we fly directly from SFO
to Frankfurt, and then to
Göteborg where we will immediately
have a PyPy sprint. But we
could come visit another day.
Are you connected with CCP games?
Or are there other people in Iceland who are interested in PyPy? I'd love to come to Iceland. I'll bet the PyPy team has other people who feel the same way. But let us take this off-line, ok?

Laura

+
+
+
+ +

First Post

+ +
+

Welcome to the PyPy status blog. After we got a lot of positive feedback about the blog coverage of our Squeak/PyPy sprint in Bern we decided that having a general PyPy blog sounds like a good idea. We will try to periodically post about what is going on in the PyPy project, cover sprints and other events where PyPyers are present. If you have any wishes about things we should write about, feel free to leave a comment.

+
+
+
+
+ + Martijn Faassen wrote on 2007-10-31 15:53: +
+
+

You should write about PyPy's upcoming "Grand US" tour!

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2007-10-31 17:16: +
+
+

Hi Martijn!

I think that is the plan, yes. But let's see whether they will have time to write blog posts :-).

+
+
+
+
+ + Steven Kryskalla wrote on 2007-11-03 02:40: +
+
+

Good to see you guys are getting more involved in promoting and showing off Pypy. I check the mailing list from time to time for interesting developments, but a blog is much easier to keep track of!

As far as ideas for posts, maybe something like the old python-dev summaries? (posts every week or two summarizing the new mailing list posts)

Release announcements, sprint announcements / reports, technical information, tutorials, etc. would all be good too.

+
+
+
+
+ + Sarah Kerrigan wrote on 2007-11-20 18:54: +
+
+

Even though there is a lot of work down the road, I am genuinely interested in the progress of this project. I'm taking a compilers class at UCR as a CS student so I'm furthering my appreciation of well written compilers.

We had a guest speaker the other day, Jens Palsberg, who created a subset of Java, miniJava, (the language we are writing our compilers for), talk about the future of compilers. He said that the future is in the ability to generate code suitable for multi-threading. With hardware slowing down and resorting to increasing the amount of cores on a die instead of making them faster, this makes sense. I also asked questions about just-in-time compilers and about the possibilities to improve performance beyond current compilers using runtime information.

To see you guys work on attacking those problems using a high-level language like python shows to me that we are getting closer to reaching those goals.

Keep up the good work. This blog is a great idea. I can't wait to use PyPy to speed up all my python based applications in an expedient and robust fashion.

+
+
+
+
+ + Sarah Kerrigan wrote on 2007-11-20 18:56: +
+
+

Also you should let your comments be displayed on the page without linking.

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-10.html b/blog/index-10.html new file mode 100644 index 000000000..205efe279 --- /dev/null +++ b/blog/index-10.html @@ -0,0 +1,1490 @@ + + + + + + +PyPy (old posts, page 10) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

JIT progress

+ +
+

In the last days I finally understood how to do virtualizables. Now the frame overhead is gone. This was done with the help of discussion with Samuele, porting ideas from PyPy's first JIT attempt. +

+

+This is of course work in progress, but it works in PyPy (modulo a few XXXs, but no bugs so far). The performance of the resulting code is quite good: even with Boehm (the GC that is easy to compile to but gives a slowish pypy-c), a long-running loop typically runs 50% faster than CPython. That's "baseline" speed, moreover: we will get better speed-ups by applying optimizations on the generated code. Doing so is in progress, but it suddenly became easier because that optimization phase no longer has to consider virtualizables -- they are now handled earlier. +

+

Update: Virtualizables are basically a way to avoid frame overhead. The frame object +is allocated and has a pointer, but the JIT is free to unpack its fields (for example python +level locals) and store them somewhere else (stack or registers). Each external (out of jit) access +to a frame managed by the jit needs to go via special accessors that can ask the jit where those variables +are.

+
+
+
+
+ + Luis wrote on 2009-06-23 22:06: +
+
+

I have no clue of what you're talking about, bit it sounds great! Keep it up!!

+
+
+
+
+ + Anonymous wrote on 2009-06-23 23:51: +
+
+

What are virtualizables?

+
+
+
+
+ + Leonardo Santagada wrote on 2009-06-24 00:06: +
+
+

From what I understand virtualizables are objects that you use to represent objects that are expensive to construct. For example frame objects in python are very expensive so they are virtualizables and if a function is executed and it doesn't try to access its frame object it is never created.

Probably armin can give a more precise answer.

What I want to know, couldn't CPython have virtualizables for frame objects? I guess the answer is that it could but would involve a lot of C code.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-06-24 00:09: +
+
+

Ok, I updated the post with quick explanation of what actually virtualizables are. Leonardo: you need compiler in the first place for that :-) Psyco has some kind of virtualizables (but psyco frames are read only).

Cheers,
fijal

+
+
+
+
+ + Unknown wrote on 2009-06-24 10:12: +
+
+

Could you use virtualizables to avoid constructing the frame at all, and then only allocate it if it is accessed?

+
+
+
+
+ + Anonymous wrote on 2009-06-24 14:22: +
+
+

@Leonardo:

I'm guessing that yes, CPython COULD have virtualizables. However, the people who built CPython a) didn't know about them, b) didn't know how to code that in "C", or c) didn't consider it a priority item.

Either way, these are the types of advantages I would imagine coding python using python would expose. Optimize what you need to, and then start to see the real ROI of PyPy!

+
+
+
+
+ + Antonio Cuni wrote on 2009-06-24 14:50: +
+
+

@Ben: no. In the current incarnation, the JITs generated by PyPy optimize only hot loops, when they are executed more than N times. At that point, the frame object has already been allocated.

The real advantage of virtualizables is that they allows to:

1) produce very fast code, as if the frame weren't allocated at all (e.g. by storing local variables on the stack or in the registers)

2) they don't compromise the compatibility with CPython; in particular, sys._getframe() & co. still works fine, because the JIT knows how and when to synchronize the virtualizable (i.e., the frame) with the values that are on the stack.


@gregturn: I don't see how you can implement something similar to virtualizables without writing a compiler, and CPython is not such a thing :-)

+
+
+
+ +

News from the jit front

+ +
+

+As usual, progress is going slower than predicted, +but nevertheless, we're working hard to make some progress. +

+

+We recently managed to make our nice GCs cooperate with our JIT. This is +one point from our detailed plan. As of now, we have a JIT with GCs and +no optimizations. It already speeds up some things, while slowing down +others. The main reason for this is that the JIT generates assembler which is kind +of ok, but it does not do the same level of optimizations gcc would do. +

+

+So the current status of the JIT is that it can produce assembler out +of executed python code (or any interpreter written in RPython actually), +but the results are not high quality enough since we're missing optimizations. +

+

+The current plan, as of now, looks as follows: +

+
    +
  • +Improve the handling of GCs in JIT with inlining of malloc-fast + paths, that should speed up things by a constant, not too big factor. +
  • +
  • +Write a simplified python interpreter, which will be a base for experiments + and to make sure that our JIT does correct things with regard to + optimizations. That would work as mid-level integration test. +
  • +
  • +Think about ways to inline loop-less python functions into their parent's loop. +
  • +
  • +Get rid of frame overhead (by virtualizables) +
  • +
  • +Measure, write benchmarks, publish +
  • +
  • +Profit +
  • +
+ +Cheers,
+fijal +
+
+
+
+ + Anonymous wrote on 2009-06-16 08:03: +
+
+

nice to see the progresses on pypy jit!!

+
+
+
+
+ + Anonymous wrote on 2009-06-16 09:22: +
+
+

Do you expect to produce jit faster, then Unladen-Swallow's LLVM based ?

+
+
+
+
+ + Anonymous wrote on 2009-06-16 13:20: +
+
+

Thanks for all the hard work, guys. Keep it up!

+
+
+
+
+ + Anonymous wrote on 2009-06-16 13:46: +
+
+

ah, this jit business is so exciting!

+
+
+
+
+ + Anonymous wrote on 2009-06-16 17:00: +
+
+

I am not really shure how this plan relates to the roadmap that was presented in April.

+
+
+
+
+ + Armin Rigo wrote on 2009-06-16 18:15: +
+
+

How this plan relates: it does not. Fijal's style is to give the current idea of the plans. Don't believe him too much :-) This and April's plan need somehow to be added to each other, or something :-)

+
+
+
+
+ + Armin Rigo wrote on 2009-06-16 18:22: +
+
+

Unladen-Swallow's LLVM JIT is a very different beast: it compiles each Python function as a unit. You can only get a uniform bit of speedup this way (maybe 2-3x). By contrast, what we are doing gives a non-uniform speedup: like Psyco, we will probably obtain speedups between 2x and 100x depending on the use case.

(Of course the plan is to be faster than Psyco in the common case :-)

+
+
+
+
+ + Luis wrote on 2009-06-17 00:11: +
+
+

Armin: regarding Unladen-Swallow, does this approach prevent coming up later with a tracing jit? Or it could be done on top of it?

+
+
+
+
+ + Nighteh3 wrote on 2009-06-17 05:45: +
+
+

Sweet !! Good luck guys :)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-06-17 05:55: +
+
+

No no no no, trust me :-)

The thing is that I'm trying to present "current plan"
as live as it can be. Which means we might change
our mind completely. But otherwise, the whole blog
would be mostly empty and boring...

Cheers,
fijal

+
+
+
+
+ + tobami wrote on 2009-06-17 11:22: +
+
+

Could you please, elaborate on the second point about a simplified python interpreter?

+
+
+
+
+ + tobami wrote on 2009-06-17 11:26: +
+
+

Also, wouldn't it be better to refactor the plan as follows?:

- Improve the handling of GCs in JIT with inlining of malloc-fast paths, that should speed up things by a constant, not too big factor.
- Measure, write benchmarks
- Write a simplified python interpreter, which will be a base for experiments and to make sure that our JIT does correct things with regard to optimizations. That would work as mid-level integration test.
- Think about ways to inline loop-less python functions into their parent's loop.
- Measure, publish benchmarks, RELEASE 1.2
- Get rid of frame overhead (by virtualizables)
- Measure, publish benchmarks
- Iterate...

+
+
+
+
+ + Anonymous wrote on 2009-06-17 14:01: +
+
+

Concerning current ideas vs April's roadmap: I understand that plans change and that's ok of course. But as April's roadmap isn't mentioned at all, I have no idea how the current ideas relate to the previous roadmap (like the current ideas replace the old road map or parts of it / they are additional ideas and the old roadmap is postponed / they are a detailing of (parts of) April's roadmap). Maybe that's obvious to people with better pypy-knowledge than me. I understand Armin's comment that they are additional ideas.

Keep up the good work!

Branko

+
+
+
+
+ + Anonymous wrote on 2009-06-18 14:40: +
+
+

What about threading? Will we have a GIL-less interpreter in the end (assuming the GCs support that)?

+
+
+
+ +

ICOOOLPS Submissions

+ +
+

Both of the papers that people from the PyPy team submitted to ICOOOLPS have +been accepted. They are:

+
+
    +
  • "Faster than C#: efficient implementation of dynamic languages on .NET" +(pdf1) by Armin, Anto and Davide Ancona, who is Anto's Ph.D. advisor
  • +
  • "Tracing the Meta-Level: PyPy’s Tracing JIT Compiler" (pdf2) by Carl +Friedrich, Armin, Anto and Maciek
  • +
+
+

(the pdfs are obviously the submitted versions, not the final ones).

+

This year ICOOOLPS (Implementation, Compilation, Optimization of +Object-Oriented Languages, Programs and Systems) is being held on July the 6th +at ECOOP 2009 in Genova, Italy. Other than these two papers, Anto and Carl +Friedrich will also present a PyPy tutorial, on July the 7th.

+
+
+
+
+ + Unknown wrote on 2009-05-16 11:22: +
+
+

It does seem like an odd idea to trace the bytecode of an interpreter of the bytecode of a language, rather than just tracing the bytecode for a language. For example, it requires that you annotate the interpreter to retain information that you would otherwise naturally have, and it requires that you trace lots of extra bookkeeping code in the interpreter.

Given that you're writing a JIT that traces the execution of some bytecode, what advantages does tracing the outer bytecode have over tracing the inner bytecode? Is it that the outer bytecode is simpler than the inner bytecode; if so, is there no way to (inefficiently) compile the inner bytecode to the outer bytecode?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2009-05-16 12:08: +
+
+

John: The main reason for writing a JIT that traces the bytecode of the "outer" interpreter (which we call language interpreter in the paper) is that then we need to write only one tracing JIT in PyPy, and can use it for a variety of languages.

The tracing of the extra bookkeeping code is not such a large problem, as the paper shows. None of these opcodes are actually part of the final trace.

If you want to discuss this more, I would suggest that we move this discussion to pypy-dev@codespeak.net which is the project mailing list. Not everybody is reading comments here :).

+
+
+
+ +

4 weeks of GDB

+ +
+

Hello.

+

+So, according to our jit +plan we're mostly done with point 1, that is to provide a JIT that compiles +python code to assembler in the most horrible manner possible but doesn't +break. That meant mostly 4 weeks of glaring at GDB and megabytes of assembler +generated by C code generated from python code. The figure of 4 weeks proves +that our approach is by far superior to the one of psyco, since Armin says it's +"only 4 weeks" :-) +

+

+Right now, pypy compiled with JIT can run the whole CPython test suite +without crashing, which means we're done with obvious bugs and the only +ones waiting for us are really horrible. (Or they really don't exist. +At least they should never be about obscure Python corner cases: they can +only be in the 10'000 lines of relatively clear code that is our JIT +generator.) +

+

+But... the fun thing is that we can actually concentrate on optimizations! +So the next step is to provide a JIT that is correct *and* actually speeds +up python. Stay tuned for more :-) +

+Cheers,
+fijal, armin & benjamin +

+UPDATE: for those of you blessed with no knowledge of C, gdb stands for GNU debugger, a classic debugger for C. (It's also much more powerful than python debugger, pdb, which is kind of surprising).

+
+
+
+
+ + Alexander Kellett wrote on 2009-04-30 23:15: +
+
+

*bow*

+
+
+
+
+ + Luis wrote on 2009-05-01 00:00: +
+
+

I love this kind of posts. Keep'em coming!

+
+
+
+
+ + Unknown wrote on 2009-05-01 01:06: +
+
+

This is probably the most exciting thing I've heard since I started tracking PyPy. Can't wait to see how fast JIT Python flies. :-)

+
+
+
+
+ + René Dudfield wrote on 2009-05-01 01:56: +
+
+

nice one! Really looking forward to it.

Is this for just i386? Or is this for amd64/ppc etc?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-05-01 02:11: +
+
+

amd64 and ppc are only available in enterprise version :-)

We cannot really solve all problems at once, it's one-by-one approach.

+
+
+
+
+ + Armin Rigo wrote on 2009-05-01 09:47: +
+
+

illume: if you are comparing with Psyco, then it's definitely "any platform provided someone writes a backend for it". Writing a backend is really much easier than porting the whole of Psyco...

Our vague plans include an AMD64 backend and an LLVM-JIT one, the latter being able to target any platform that LLVM targets.

+
+
+
+
+ + DSM wrote on 2009-05-01 10:33: +
+
+

Nice!

I assume that it would be (relatively, as these things go) straightforward for those us interested to turn the x86 assembly backend into a C backend?

I know that even mentioning number-crunching applications gets certain members of the pypy team beating their heads against the wall (lurkers can read the grumbling on irc too!). But with a delegate-to-C backend, those of us who have unimplemented architectures and are in the happy regime where we don't care about compilation overhead can get the benefits of icc's excellent optimizations without having to do any of the work. We'd just need to make sure that the generated C is code that icc can handle. (There are unfortunately idioms that gcc and icc don't do very well with.)

To be clear, I'm not suggesting that the pypy team itself go this route: at the moment it feels like the rest of us should stay out of your way.. laissez les bon temps roulez! :^)

I'm asking instead if there are any obvious gotchas involved in doing so.

+
+
+
+
+ + Tim Parkin wrote on 2009-05-01 11:28: +
+
+

Congrats for stage one... exciting times for python..

+
+
+
+
+ + proteusguy wrote on 2009-05-02 11:48: +
+
+

Nice job guys! Once you announce that PyPy is pretty much of comparable (90% or better) speed to that of CPython then we will be happy to start running capacity tests of our web services environment on top of it and report back our results.

Given the growing number of python implementations has there ever been a discussion of PyPy replacing CPython as the canonical implementation of python once it consistently breaks performance & reliability issues? I don't know enough of the details to advocate such a position - just curious if there's been any official thought to the possibility.

+
+
+
+
+ + Armin Rigo wrote on 2009-05-04 13:55: +
+
+

DSM, Proteusguy: I'd be happy to answer your questions on the pypy-dev mailing list. I think that there is no answer short enough to fit a blog post comment.

+
+
+
+
+ + Jacob Hallén wrote on 2009-05-04 14:45: +
+
+

proteusguy: It is our hope that PyPy can one day replace CPython as the reference implementation, but this depends on many factors. Most of them are way out of our control. It will depend very much on the level of PyPy uptake in the community, but this is just a first step. With enough adoption, the Python developers (the people actually making new versions of CPython) need to be convinced that working from PyPy as a base to develop the language makes sense. If they are convinced, Guido may decide that it is a good idea and make the switch, but not before then.

+
+
+
+
+ + Anonymous wrote on 2009-05-08 15:36: +
+
+

See https://moderator.appspot.com/#9/e=c9&t=pypy for Guido's opinion about PyPy.

+
+
+
+
+ + Anonymous wrote on 2009-05-10 11:19: +
+
+

Surely gdb is more powerful than pdb because many more people are forced to used gdb. c code is much harder to debug than python code, and needs debugging more often than python code.

+
+
+
+
+ + Cacas Macas wrote on 2009-05-14 08:31: +
+
+

Good day.
I use Python for about 3 years and i am following your blog almost every day to see news.
I am very excited to see more Pypy, though i don't understand how to use it (?!?!) and i never managed to install it!
Wikipedia says "PyPy is a followup to the Psyco project" and i use Psyco, so Pypy must be a very good thing. I use Psyco very intense, in all my applications, but it's very easy to use.
I have Windows and Windows document is incomplete "https://codespeak.net/pypy/dist/pypy/doc/windows.html". I have MinGW compiler.
Pypy is not very friendly with users. I think more help documents would be very useful. When i will understand how to install Pypy, i will use it.
Keep up the good work!

+
+
+
+
+ + stracin wrote on 2009-06-13 14:56: +
+
+

"""Rumors have it that the secret goal is being faster-than-C which is nonsense, isn't it?"""

what does this statement from the pypy homepage mean?

that c-pypy will be faster than cpython?

or that code run in c-pypy will be faster than compiled C code? :o

because of the "nonsense" i think you mean the latter? but isn't it nonsense? :) would be awesome though.

+
+
+
+ +
+
+
+ + Anonymous wrote on 2009-04-28 17:09: +
+
+

Congratulations on the new release!

+
+
+
+
+ + Anonymous wrote on 2009-04-28 19:09: +
+
+

Congrats! This is a great project :)

+
+
+
+
+ + Anonymous wrote on 2009-04-29 11:33: +
+
+

Any chance of prebuilt binaries? I tried to compile but had to give up after 2 hours (I guess my laptop is not up to the task).

By the way, you should put the release note somewhere on the main page of the PyPy site. Currently this page gives no indication that a release of PyPy exists at all.

+
+
+
+
+ + Armin Rigo wrote on 2009-04-30 11:08: +
+
+

Thanks, added a link from the main page to release-1.1.0.html.

About binaries: there are just too many possible combinations, not only of platforms but of kinds of pypy-c. I suppose that we can list other people's pages with some of them, if they mention them to us.

+
+
+
+ +

Roadmap for JIT

+ +
+

Hello. +

+

+First a disclaimer. This post is more about plans for future than current +status. We usually try to write about things that we have done, because +it's much much easier to promise things than to actually make it happen, +but I think it's important enough to have some sort of roadmap. +

+

+In recent months we came to the point where the 5th generation of +JIT prototype was working as nice +or even a bit nicer than 1st one back in 2007. Someone might ask "so why +did you spend all this time without going forward?". And indeed, we spent +a lot of time moving sideways, but as posted, we also spent a lot of time +doing some other things, which are important as well. +The main advantage of the current JIT incarnation is that it is much much simpler than +the first one. Even I can comprehend it, which is much of an improvement :-) +

+

+So, the prototype is working and gives very nice speedups in range of 20-30x +over CPython. We're pretty confident this prototype will work and will +produce fast python interpreter eventually. So we decided that now we'll +work towards changing prototype into something stable and solid. This +might sound easy, but in fact it's not. Having stable assembler backend +and optimizations that keep semantics is not as easy as it might sound. +

+

+The current roadmap, as I see it, looks as follows: +

+
    +
  • Provide a JIT that does not speedup things, but produce assembler without + optimizations turned on, that is correct and able to run CPython's library + tests on a nightly basis. +
  • +
  • + Introduce simple optimizations, that should make above JIT a bit faster than + CPython. With optimizations disabled JIT is producing incredibly dumb + assembler, which is slower than corresponding C code, even with removal + of interpretation overhead (which is not very surprising). +
  • +
  • + Backport optimizations from JIT prototype, one by one, keeping an eye + on how they perform and making sure they don't break anything. +
  • +
  • + Create new optimizations, like speeding up attribute access. +
  • +
  • + Profit. +
  • +
+

+This way, we can hopefully provide a working JIT, which gives fast python +interpreter, which is a bit harder than just a nice prototype. +

+

+Tell us what you think about this plan. +

+Cheers,
+fijal & others. +
+
+
+
+ + Anonymous wrote on 2009-04-21 20:58: +
+
+

I think it's a great idea. If the test suite succeeds on the basic JIT, it's much easier to spot regressions when you start adding the cool stuff. It also gives you a solid foundation to build on.

Good luck, this project is amazing :)

+
+
+
+
+ + rjw wrote on 2009-04-21 21:54: +
+
+

Its not obvious from this post what would actually be the difference between the prototype and the final jit with all the prototypes optimisations. So ... it sounds like a lot of work for zero gain. I'm sure there is missing information, like what is actually missing from or wrong with the prototype ( is it in a different language? Prolog?) Without this information its impossible to judge this plan.

+
+
+
+
+ + Michael Foord wrote on 2009-04-21 22:54: +
+
+

This sounds like a very pragmatic approach and is very encouraging. Nice work guys - very much looking forward to what the future has to offer.

+
+
+
+
+ + Tim Parkin wrote on 2009-04-21 23:06: +
+
+

I'm extremely excited about seeing this happen. It is an unfortunate fact that the majority of people won't get PyPy until they see a 'big win'. Once they've noticed the big win they will start to see the 'hidden genius'. I'm glad that you are taking such a professional approach to this next phase and look forward to the day when people will start to look give PyPy the attention it deserves (if not for quite the right reason).

+
+
+
+
+ + Alex wrote on 2009-04-22 00:34: +
+
+

I agree with Michael, one of the hallmarks of Python philosophy has always been "make it right, and then make it fast", sounds like you guys have taken this to heart.

+
+
+
+
+ + Leonardo Santagada wrote on 2009-04-22 02:52: +
+
+

Great guys, the plan seems very solid and reasonable!

responding to rjw: I think the problem was that the prototype was really incomplete, putting all the complexity needed for the rest of the language could be done without removing the optimizations but would make bug finding way harder.

I hope that this could be the only new feature for the next pypy release. Focusing on the JIT might be the best way to attract many more eyes and hands to the project.

+
+
+
+
+ + Michael Hudson-Doyle wrote on 2009-04-22 04:12: +
+
+

This sounds like a very sane plan. Good luck with it!

+
+
+
+
+ + Anonymous wrote on 2009-04-22 07:59: +
+
+

I like how for once step 2 isn't "???", but a well thought out plan =).

+
+
+
+
+ + Zemantic dreams wrote on 2009-04-22 10:20: +
+
+

guys, you rock! I can't wait to see the results!

bye
Andraz Tori, Zemanta

+
+
+
+
+ + Anonymous wrote on 2009-04-22 13:21: +
+
+

Very sensible plan! Good luck guys. Here's to pypy taking over the world (-:

+
+
+
+
+ + herse wrote on 2009-04-22 19:36: +
+
+

"It's super easy to provide 95% of python in a reasonable speed, just the last 5% gets tricky."

i often come across this statement.

wouldn't it make sense then to offer a pypy compile option for producing an interpreter which leaves away those 5% in favor of speed for people who don't need those 5%?

or isn't this feasible or wanted for some reason?

i am just curious... :) pypy is an awesome project and i am looking forward to the jit!

+
+
+
+
+ + Anonymous wrote on 2009-04-24 09:34: +
+
+

The roadmap is okay. The only thing I miss is a rough timeline.

+
+
+
+
+ + Anonymous wrote on 2009-04-24 22:18: +
+
+

Tenretn hör eviece ne Pypy tan cafretn anretx. Lbisi programma o oitcenno ih ecafretn cabpöo, anretn 'retupmo ih nis secorpbut pypy eka LD oitcenno huob raa rawtfo laweri anosre Python code?

+
+
+
+
+ + René Leonhardt wrote on 2009-04-24 23:26: +
+
+

Congratulations, the LLVM backend for JIT has been accepted, I am eager to see the results :)

+
+
+
+
+ + Armin Rigo wrote on 2009-04-28 20:18: +
+
+

herse: that's an approach which is often mentioned, but which does not make sense in PyPy. The JIT is generated from the language spec; whether this spec covers 95% or 100% of Python doesn't change anything. The 95%-versus-100% debate only makes sense at another level, e.g. if we wanted to make PyPy faster without a JIT at all.

+
+
+
+
+ + Richard Emslie wrote on 2009-04-29 23:47: +
+
+

Awesome work thus far & congratulations guys. Sounds like a good strategy to having something that works. Best of luck and I'm looking forward to see how things pan out. :-)

+
+
+
+
+ + herse wrote on 2009-04-30 05:12: +
+
+

"""The JIT is generated from the language spec; whether this spec covers 95% or 100% of Python doesn't change anything."""

i see. the whole pypy idea really sounds awesome to me.

i have another question. your python interpeter is written in rpython so it is supposed to be simpler to work with than the c implementation. but i could imagine that it is incredibly hard to debug problems in pypy-c? doesn't this counterbalance the advantage again?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-04-30 05:58: +
+
+

We're usually not debugging problems in pypy-c. It turns out that 99% of the problems you can debug by running on top of CPython, so you can test things really deeply, without compilation.

+
+
+
+
+ + Collin Winter wrote on 2009-06-08 23:21: +
+
+

This looks like a good plan. I look forward to sharing ideas with you in the future :)

When you say, "So, the prototype is working and gives very nice speedups in range of 20-30x over CPython", what benchmarks is that on? Can you be more specific?

+
+
+
+ +

Leysin Sprint Report

+ +
+

The Leysin sprint is nearing its end, as usual here is an attempt at a summary +

+

of what we did.

+Beautiful Leysin Landscape
+

Release Work

+

Large parts of the sprint were dedicated to fixing bugs. Since the easy bugs +seem to have been fixed long ago, those were mostly very annoying and hard bugs. +This work was supported by our buildbots, which we tried to get free of +test-failures. This was worked on by nearly all participants of the sprint +(Samuele, Armin, Anto, Niko, Anders, Christian, Carl Friedrich). One +particularly annoying bug was the differences in the tracing events that PyPy +produces (fixed by Anders, Samuele and Christian). Some details about larger +tasks are in the sections below.

+

The work culminated in the beta released on Sunday.

+
+

Stackless

+

A large number of problems came from our stackless features, which do some +advanced things and thus seem to contain advanced bugs. Samuele and Carl +Friedrich spent some time fixing tasklet pickling and unpickling. This was +achieved by supporting the (un)pickling of builtin code objects. In addition +they fixed some bugs in the finalization of tasklets. This needs some care +because the __del__ of a tasklet cannot run at arbitrary points in time, but +only at safe points. This problem was a bit subtle to get right, and popped up +nearly every morning of the sprint in form of a test failure.

+

Armin and Niko added a way to restrict the stack depth of the RPython-level +stack. This can be useful when using stackless, because if this is not there it is +possible that you fill your whole heap with stack frames in the case of an +infinite recursion. Then they went on to make stackless not segfault when +threads are used at the same time, or if a callback from C library code is in +progress. Instead you get a RuntimeError now, which is not good but better +than a segfault.

+
+Anto and Armin working on the JIT + +
+

Killing Features

+

During the sprint we discussed the fate of the LLVM and the JS backends. Both +have not really been maintained for some time, and even partially untested +(their tests were skipped). Also their usefulness appears to be limited. The JS +backend is cool in principle, but has some serious limitations due to the fact +that JavaScript is really a dynamic language, while RPython is rather static. +This made it hard to use some features of JS from RPython, e.g. RPython does not +support closures of any kind.

+

The LLVM backend had its own set of problems. For +a long time it produced the fastest form of PyPy's Python interpreter, by first +using the LLVM backend, applying the LLVM optimizations to the result, then +using LLVM's C backend to produce C code, then applying GCC to the result :-). +However, it is not clear that it is still useful to directly produce LLVM +bitcode, since LLVM has rather good C frontends nowadays, with llvm-gcc and +clang. It is likely that we will use LLVM in the future in our JIT (but that's +another story, based on different code).

+

Therefore we decided to remove these two backends from SVN, which Samuele and +Carl Friedrich did. They are not dead, only resting until somebody who is +interested in maintaining them steps up.

+
+
+

Windows

+

One goal of the release is good Windows-support. Anders and Samuele set up a new +windows buildbot which revealed a number of failures. Those were attacked by +Anders, Samuele and Christian as well as by Amaury (who was not at the sprint, +but thankfully did a lot of Windows work in the last months).

+
+
+

OS X

+

Christian with some help by Samuele tried to get translation working again under +Mac OS X. This was a large mess, because of different behaviours of some POSIX +functionality in Leopard. It is still possible to get the old behaviour back, +but whether that was enabled or not depended on a number of factors such as +which Python is used. Eventually they managed to successfully navigate that maze +and produce something that almost works (there is still a problem remaining +about OpenSSL).

+
+Samuele and Carl Friedrich pretending to work on something +
+

Documentation

+

The Friday of the sprint was declared to be a documentation day, where (nearly) +no coding was allowed. This resulted in a newly structured and improved getting +started document (done by Carl Friedrich, Samuele and some help of Niko) and +a new document describing differences to CPython (Armin, Carl Friedrich) as +well as various improvements to existing documents (everybody else). Armin +undertook the Sisyphean task of listing all talks, papers and related stuff +of the PyPy project.

+
+
+
+

Various Stuff

+
+

Java Backend Work

+

Niko and Anto worked on the JVM backend for a while. First they had to fix +translation of the Python interpreter to Java. Then they tried to improve the +performance of the Python interpreter when translated to Java. Mostly they did a +lot of profiling to find performance bottlenecks. They managed to improve +performance by 40% by overriding fillInStackTrace of the generated exception +classes. Apart from that they found no simple-to-fix performance problems.

+
+
+

JIT Work

+

Armin gave a presentation about the current state of the JIT to the sprinters as +well as Adrian Kuhn, Toon Verwaest and Camillo Bruni of the University of Bern +who came to visit for one day. There was a bit of work on the JIT going on too; +Armin and Anto tried to get closer to having a working JIT on top of the CLI.

+
+
+
+
+
+
+ + Unknown wrote on 2009-04-22 07:46: +
+
+

Guys, are you going to make a new release with the things done during the sprint? Thanks.

(pypy is a great work; Keep it up!)

+
+
+
+
+ + vak wrote on 2009-11-03 12:30: +
+
+

hi,
could you please make a new blog-post and tell us about news regarding LLVM and PyPy, please?

thanks in advance!

+
+
+
+ +

Beta for 1.1.0 released

+ +
+

Today we are releasing a beta of the upcoming PyPy 1.1 release. There +are some Windows and OS X issues left that we would like to address +between now and the final release but apart from this things should be +working. We would appreciate feedback.

+

The PyPy development team.

+
+

PyPy 1.1: Compatibility & Consolidation

+

Welcome to the PyPy 1.1 release - the first release after the end of EU +funding. This release focuses on making PyPy's Python interpreter more +compatible with CPython (currently CPython 2.5) and on making the +interpreter more stable and bug-free.

+

PyPy's Getting Started lives at:

+
+https://codespeak.net/pypy/dist/pypy/doc/getting-started.html +
+
+

Highlights of This Release

+
+ +
+
+
+

Other Changes

+
+ +
+
+
+

What is PyPy?

+

Technically, PyPy is both a Python interpreter implementation and an +advanced compiler, or more precisely a framework for implementing dynamic +languages and generating virtual machines for them.

+

The framework allows for alternative frontends and for alternative +backends, currently C, Java and .NET. For our main target "C", we can +"mix in" different garbage collectors and threading models, +including micro-threads aka "Stackless". The inherent complexity that +arises from this ambitious approach is mostly kept away from the Python +interpreter implementation, our main frontend.

+

Socially, PyPy is a collaborative effort of many individuals working +together in a distributed and sprint-driven way since 2003. PyPy would +not have gotten as far as it has without the coding, feedback and +general support from numerous people.

+

Have fun,

+
+

the PyPy release team, [in alphabetical order]

+

Amaury Forgeot d'Arc, Anders Hammerquist, Antonio Cuni, Armin Rigo, +Carl Friedrich Bolz, Christian Tismer, Holger Krekel, +Maciek Fijalkowski, Samuele Pedroni

+

and many others: +https://codespeak.net/pypy/dist/pypy/doc/contributor.html

+
+
+
+
+
+
+
+ + Benjamin Peterson wrote on 2009-04-20 01:21: +
+
+

Congratulations! PyPy is becoming more and more viable every day. I hope I can continue to become more involved in this awesome project.

+
+
+
+
+ + Anonymous wrote on 2009-04-21 01:18: +
+
+

pypy is a very interesting project!

i have a question. do you think pypy-c without jit can ever reach the speed of c-python? why is it slower?

or will you put all the optimization efforts into the jit now? doesn't the performance difference matter because the jit will make it up anyway?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-04-21 04:36: +
+
+

PyPy without jit can (and is sometimes) be faster than cpython, for various reasons, including garbage collector.

On the other hand, we rather won't sacrifice simplicity for speed and we hope that jit will go that part. Also the funny thing is that since we generate our jit, it gets better as interpreter gets simpler, because jit generator is able to find out more on it's own. So in fact we might give up on some optimizations in favor of simplicity, because jit will be happier.

Cheers,
fijal

+
+
+
+
+ + Luis wrote on 2009-04-21 14:04: +
+
+

Sorry for my anxiety, but is there any rough estimation on when the jit will be in a usable state?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-04-21 22:14: +
+
+

Personally, I'm doing it in my free time. That means I'm giving no estimates, because it makes no sense. If you wish to go into some contractual obligations on our sides, we're up to discuss I suppose :-)

+
+
+
+
+ + Luis wrote on 2009-04-21 22:33: +
+
+

Maciej, I know how hard you are working on this. I didn't mean to sound disrespectful and I don't want to bother you... It's just that as everyone else, I'm anxoiusly looking forward to seeing pypy's magic in action. By the way, the new post is very much appreciated. Thanks!

+
+
+
+
+ + Anonymous wrote on 2009-06-29 07:47: +
+
+

I am desperately looking for some help building PyPy. I have posted a an Issue (#443) about my issues in the PyPy site.

If anyone from the release/Dev. team can give me a hand, I would seriously appreciate this!

I can be reached at wnyrodeo@yahoo.com

Thanks.

+
+
+
+ +
+
+
+ + larsr wrote on 2009-04-08 15:25: +
+
+

I found the slides to the python in a sandbox to be useful too.

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-11.html b/blog/index-11.html new file mode 100644 index 000000000..2f35ec091 --- /dev/null +++ b/blog/index-11.html @@ -0,0 +1,2364 @@ + + + + + + +PyPy (old posts, page 11) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

GC improvements

+ +
+

In the last week, I (Armin) have been taking some time off the +JIT work to improve our GCs. More precisely, our GCs now take +one or two words less for every object. This further reduces the +memory usage of PyPy, as we will show at the end.

+ +

Background information: RPython object model

+ +

We first need to understand the RPython object model as +implemented by our GCs and our C backend. (Note that the +object model of the Python interpreter is built on top of +that, but is more complicated -- e.g. Python-level objects +are much more flexible than RPython objects.)

+ +

Consider these two RPython classes:

+ +
+class A:
+    def __init__(self, x):
+        self.x = x
+    def f(self):
+        return self.x * 42
+
+class B(A):
+    def __init__(self, x, y):
+        self.x = x
+        self.y = y
+    def f(self):
+        return self.x + self.y
+
+ +

The instances of A and B look like this in memory (all cells +are one word):

+ +

+ + + + + +
GC headervtable ptr of Ahashx
+

+ + + + + + +
GC headervtable ptr of Bhashxy
+

The first word, the GC header, describes the layout. It +encodes on half a word the shape of the object, including where it +contains further pointers, so that the GC can trace it. The +other half contains GC flags (e.g. the mark bit of a +mark-and-sweep GC).

+ +

The second word is used for method dispatch. It is similar to a +C++ vtable pointer. It points to static data that is mostly a +table of methods (as function pointers), containing e.g. the method f +of the example.

+ +

The hash field is not necessarily there; it is only present in classes +whose hash is ever taken in the RPython program (which includes being +keys in a dictionary). It is an "identity hash": it works like +object.__hash__() in Python, but it cannot just be the address of +the object in case of a GC that moves objects around.

+ +

Finally, the x and y fields are, obviously, used to store the value +of the fields. Note that instances of B can be used in places that +expect a pointer to an instance of A.

+ +

Unifying the vtable ptr with the GC header

+ +

The first idea of saving a word in every object is the observation +that both the vtable ptr and the GC header store information about +the class of the object. Therefore it is natural to try to only have +one of them. The problem is that we still need bits for the GC flags, +so the field that we have to remove is the vtable pointer.

+ +

This means that method dispatch needs to be more clever: it +cannot directly read the vtable ptr, but needs to compute it +from the half-word of the GC header. Fortunately, this can be +done with no extra instruction on the assembler level. Here is +what things will look like in the end, assuming a 32-bit x86 +machine (but note that as usual we just generate portable C).

+ +

The trick for achieving efficiency is that we store all +vtables together in memory, and make sure that they don't take +more than 256 KB in total (16 bits, plus 2 bits of alignment). +Here is what the assembler code (produced by the normal C +compiler, e.g. gcc) for calling a method looks like. Before +the change:

+ +
+MOV EDX, [EAX + 4]               # load the vtable ptr from object EAX
+MOV EDX, [EDX + method_offset]   # load the function pointer from the vtable
+CALL EDX
+
+ +

Instead, we now have:

+ +
+MOVZX EDX, [EAX]     # load the 16-bit part of the GC header from EAX
+MOV EDX, [vtable_start + 4*EDX + method_offset]
+CALL EDX
+
+ +

Note that the complex addressing scheme done by the second MOV +is still just one instruction: the vtable_start and +method_offset are constants, so they are combined. And as the +vtables are anyway aligned at a word boundary, we can use +4*EDX to address them, giving us 256 KB instead of just 64 KB +of vtables.

+ +

Optimizing the hash field

+ +

In PyPy's Python interpreter, all application-level objects +are represented as an instance of some subclass of W_Root. +Since all of these objects could potentially be stored in a +dictionary by the application Python program, all these +objects need a hash field. Of course, in practice, only a +fraction of all objects in a Python program end up having +their hash ever taken. Thus this field of W_Root is wasted +memory most of the time.

+ +

(Up to now, we had a hack in place to save the hash field +on a few classes like W_IntegerObject, but that meant that +the Python expression ``object.__hash__(42)'' would raise +a TypeError in PyPy.)

+ +

The solution we implemented now (done by some Java GCs, among +others) is to add a hash field to an object when the +(identity) hash of that object is actually taken. This means +that we had to enhance our GCs to support this. When objects +are allocated, we don't reserve any space for the hash:

+ +object at 0x74B028 + + + + +
...00...xy
+

When the hash of an object is taken, we use its current memory +address, and set a flag in the GC header saying that this +particular object needs a hash:

+ +object at 0x74B028 + + + + +
...01...xy
+

If the GC needs to move the object to another memory location, +it will make the new version of the object bigger, i.e. it +will also allocate space for the hash field:

+ +object at 0x825F60 + + + + + +
...11...xy0x74B028
+

This hash field is immediately initialized with the old memory +address, which is the hash value that we gave so far for the +object. To not disturb the layout of the object, we always +put the extra hash field at the end. Of course, once set, +the hash value does not change even if the object needs to +move again.

+ +

Results

+ +

Running the following program on PyPy's Python interpreter +with n=4000000:

+ +
+def make_linked_list(n):
+    a = None
+    i = 0
+    while i < n:
+        b = X()
+        b.next = a
+        a = b
+        i += 1
+
+ +

the two optimizations together save 32 MB of RAM (i.e. 8 bytes +per object). The version of PyPy we measured this with was built +as follows:

+ +
+./translate.py --gcremovetypeptr targetpypystandalone --objspace-std-withsharingdict
+
+ +

The total amount of RAM used on a 32-bit Linux is 247 MB, +completing in 10.3 seconds. On CPython, it consumes 684 MB +and takes 89 seconds to complete... This nicely shows that +our GCs are much faster at allocating objects, and that our +objects can be much smaller than CPython's.

+ +

Armin Rigo & Carl Friedrich Bolz

+
+
+
+
+ + Shahms wrote on 2009-10-16 16:53: +
+
+

Not really GC related and you may have covered this in another post, but how does PyPy handle id() in a world where the object may move? Is the hash field reused for this when necessary as well? If so, how do you deal with the possibility of another object being allocated at the same address as the original object? If not, how do you avoid having an object's id() change when it's moved?

+
+
+
+
+ + kbob wrote on 2009-10-16 17:55: +
+
+

Very nice. Using the address for the hash value was especially clever. But how random are those hash values?

+
+
+
+
+ + Alex wrote on 2009-10-16 19:15: +
+
+

kbob: If PyPy is anything like CPython the randomness isn't so important. The CPython dictionary hash collision resolution strategy is extremely efficient, even amongst hashes with very similar values.

+
+
+
+
+ + Lucian wrote on 2009-10-16 19:39: +
+
+

This is all sorts of cool. I can't wait for a mostly-production-ready PyPy with JIT.

On a somewhat related note, how do the JIT and ctypes interact right now, if at all?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2009-10-16 19:43: +
+
+

Shahms: Excellent question! The implementation of id that we have is basically a weak key dict mapping objects to ids on demand. This has the fun side-effect that the ids of PyPy's objects start with 1 and count up from there.

This is rather inefficient (e.g. your garbage collections become linearly slower the more objects you have that have their id taken), but there is not much else you can do. Jython uses a similar solution. For this reason, calling id a lot is essentially discouraged in code you want to run on PyPy.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2009-10-16 19:50: +
+
+

kbob: I think they should be random enough. You get a collision if you ask the hash of object a, then a collection happens that moves a, then you ask object b for its hash and object b happens to be in the place where object a was before. That sounds unlikely.

If you write contrived code that has a loop that repeatedly allocates an object, asks its hash by putting it into a dict and then forces a nursery collection, you can get collision: all those objects will be at the beginning of the nursery when their hash is taken. Unlikely again to occur in practise.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2009-10-16 19:57: +
+
+

Alex: you are right. We use exactly CPython's algorithm for implementing dicts, so having bad hash functions is not a big problem. However, if you really have hash value collisions (e.g. everything hashes to 1) your dict still degenerates to essentially a linear search.

+
+
+
+
+ + Skandalfo wrote on 2009-10-16 20:05: +
+
+

Wow! You guys are my computing heroes.

Whenever I talk to other people about your project, I always state you are the best example I can imagine of REAL innovation in computer languages.

That said, I gather the only thing making id() different from hash() is that you need to guarantee that the values for live objects are always unique.

You could just use the same strategy as with the hash, sticking the id value along the object the next time the object is moved by the GC.

Meanwhile, from the time id() is called to the time the object is moved, you can just temporarily store an {address: id} mapping somewhere. Entries would be removed from the map once the objects get moved. From then on the id would be attached to the object.

If GC cycles are frequent, the map doesn't have to grow too large.

I don't know if the need for id reuse after the id space gets exhausted is important or not. Once you get to the end of the space, you would have to scan the map and heap to find a convenient "hole" to reuse, I suppose.

+
+
+
+
+ + Shahms wrote on 2009-10-16 20:19: +
+
+

Thanks, Carl. Following up what Skandalfo said, (although this is probably a poor forum for such discussions), it seems like you could reuse the hash field for id as well. Given that the minimum size for a Python object is > 1 byte, you should have at least that much space for offsetting the hash/id. As the GC/allocator has to store information about addresses and blocks anyway it should be a relatively simple matter of building and maintaining a bloom filter of offsets in use for a particular base address.

Of course, this also constrains the addresses at which Python objects may be allocated and the lower bits in the address may already be used for other purposes...

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2009-10-16 20:37: +
+
+

Skandalfo, Shahms: I guess there are possible ways to make id a bit faster than what we have now. What we have now is well-tested and works reasonably enough. I assume anyway that there is not too much Python code whose performance depends critically on having an extremely efficient implementation of id (and if there is, I am prepared to ask the author to rewrite the code instead :-) ).

+
+
+
+
+ + Skandalfo wrote on 2009-10-16 20:38: +
+
+

Shahms: I confess I don't understand your proposal. Do you mean you can have at most as many live objects as the available address space divided by the object alignment?

When I talked about id space I wasn't referring to the memory required to store the per-object id value, but the fact that if you assign the id values using sequential values, and those values are, for instance, 64 bit integers, you could theoretically create and destroy a lot of objects in a long lived process and the sequence would wrap around.

About making hash/id the same, I've just checked that CPython does indeed use the id() value as the value returned by the default hash() implementation.

You could just do the same, and use the id value as the "master" one. For hash() you would just call id(). This allows you to use just one value attached to the objects for both functions.

The cost of that approach would be having to assign an id immediately (having to put it into the temporary map, then having to look it up in the map until the next time the object is moved) for the call to hash() (with no call to id()) too.

The good thing compared to the weak key dict, is that the temporary map doesn't need to be garbage collected at all. The entries are removed when objects are moved (or collected).

+
+
+
+
+ + Shahms wrote on 2009-10-16 20:44: +
+
+

Carl, no doubt you're right. I know that I can probably count the number of times I've needed to use id() on one hand and I'm pretty sure the vast majority of those cases was sticking an unhashable object in a dict.

+
+
+
+
+ + Skandalfo wrote on 2009-10-16 20:53: +
+
+

Carl, Shahms: I couldn't agree more about id() not being important.

Probably Guido should have refrained from making it available in CPython at the time. I suppose it was just easy to add it to the language with the memory allocation model of CPython. The fact is that I don't really see any use for id() once you have the "is" operator and the hash() method...

+
+
+
+
+ + Michael Hudson-Doyle wrote on 2009-10-16 22:19: +
+
+

Yay, I remember talking about removing the gc type pointer, oh, about 3.5 years ago :) Cool that it got done, sounds like a neat pair of hacks.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-10-17 01:17: +
+
+

@Lucian:

ctypes and JIT work just fine together.

+
+
+
+
+ + Anonymous wrote on 2009-10-17 09:57: +
+
+

Doesn't deepcopy use id() a lot? I remember once using deepcopy on a complicated structure, resulting in thousands of id() calls.

+
+
+
+
+ + RonnyPfannschmidt wrote on 2009-10-17 10:08: +
+
+

what about pickle - as far as i remember its memo code for dealing with object cycles is using id, too

+
+
+
+
+ + Armin Rigo wrote on 2009-10-17 16:32: +
+
+

Too bad for the current implementation of pickle and deepcopy. The fault in that case is CPython's general view that id() is cheap, despite repeated attempts to convince them otherwise. These attempts have been done notably by guys from Jython, even before PyPy time (indeed id() is a mess for any implementation apart from CPython's simple non-moving GC).

A suitable replacement would be e.g. a 'collections.identitydict' type, if someone feels like going Yet Another Time to python-dev with this issue.

+
+
+
+
+ + Marius Gedminas wrote on 2009-10-17 22:20: +
+
+

When I was writing objgraph I saw no way of traversing arbitrary object graphs without using id().

collections.identitydict sounds like a nice idea. Has anyone written a PEP for it?

+
+
+
+
+ + Anonymous wrote on 2009-10-18 09:14: +
+
+

Is there any possibility to translate pypy under OSX 10.6 as 32bit? Translation works but I get an "ValueError: bad marshal data" when running pypy-c. I assume that is due to the fact that I got a 64bit binary.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-10-18 18:49: +
+
+

@Anonymous:

Try deleting all your .pyc files and see what happens.

+
+
+
+
+ + Armin Rigo wrote on 2009-10-19 10:30: +
+
+

Marius: as I said, feel free to :-) but the current situation is, no, not as far as I know.

+
+
+
+
+ + klaussfreire wrote on 2009-10-19 16:38: +
+
+

Wouldn't it free up the GC from all that burden if only a set of live ids were kept? (ie: no weak dict)

So, when you get an id() call, you check the object to see if there's a cached id (much like the hash hack) - if not, you generate a random (or sequential) unused id and store it both in the "live ids" set and in the object's structure, as done with hash values.

So, successive calls to id() would be as fast as in CPython, and garbage collection would be fast too (only an extra set deletion per object whose id was obtained).

In fact, this set could be implemented as a bit array with "free lists", which could be very very efficient, given that its size will be bound by the number of live objects.

+
+
+
+
+ + Armin Rigo wrote on 2009-10-21 08:11: +
+
+

Claudio: this doesn't work (unless I misunderstood). You cannot add a field like the hash field at any point in time, but only during collections when the object moves.

+
+
+
+
+ + klaussfreire wrote on 2009-10-21 13:34: +
+
+

Yes, I've been thinking about that too.

But that can be patched - the weak key dict could still be used for those objects that haven't been collected yet. Since addition of the id would most likely happen in the nursery, or the first generation at most (big assumption), I don't think the dict would grow very big even under heavy id() usage.

+
+
+
+
+ + omul cu 6233 wrote on 2009-11-02 21:51: +
+
+

Wohoo, nice performance

+
+
+
+
+ + Unknown wrote on 2010-04-14 15:14: +
+
+

I'm astonished a bit by your need to pack vtables together within 256KB. How many bits do you need for mark-and-sweep marking or similar stuff? The usual solution I've seen for this is to use the low two bits of the vtable pointer for flags, usually, and mask them off when reading the vtable pointer. Would it work here?

If that isn't enough, then you have to pack vtables together as you do (maybe in a bigger space if you can use more bits).

+
+
+
+
+ + PJE wrote on 2010-09-22 18:22: +
+
+

I can think of one place where I use a lot of id() calls, and that's in PEAK-Rules' generic function implementation, for indexing "is" tests.

For example, if you have a bunch of methods that test if "x is Something" (for different values of Something), then a dictionary of id()'s is used to identify which of these tests went off. While the total number of Somethings isn't likely to be high, the weakref dict in PyPy means that every 'x' the function is called with will end up burning memory and speed to hold an id forever.

While it's perhaps the case that I could avoid this by using a linear search (ugh) in cases where the number of Somethings is small, it's an example of a place where id() makes an operation neat, fast, and simple in regular Python.

Of course, if there were another way to put arbitrary (i.e possibly-unhashable, comparable only by identity) objects in a dictionary, and then determine whether a given object was one of them, that'd certainly be a suitable substitute.

Or, if PyPy offered a temp_id() that would simply let you *check* identity, without forcing the object to hold onto it, that'd work fine too. Say, if there was a has_id() that told you if an id() is outstanding for the object already, or a get_id() that returned None for an object whose id() had never been taken.

With an API like that, I could prevent memory/speed blowup by not having each call of the function adding more objects to PyPy's id() dict.

(Heck, perhaps such an API should be added across Python versions, i.e., to CPython and Jython as well.)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-09-22 18:30: +
+
+

@PJE PyPy offers collections.identity_dict, or something similar which would have the effect how you like (but internally doesn't use id operation, just the object identity).

+
+
+
+
+ + Anonymous wrote on 2011-05-07 02:34: +
+
+

This program in C# takes 589 milliseconds, and 52 MB RAM. 17x faster, 4.75x less RAM.

+
+
+
+
+ + Anonymous wrote on 2011-09-15 10:37: +
+
+

And in assembly it will be even faster and smaller.

Python has many lovely attributes, but efficiency is not its primary virtue. That said, making it more efficient is still a plus, which this work is doing

+
+
+
+ +

First pypy-cli-jit benchmarks

+ +
+

As the readers of this blog already know, I've been working on porting the +JIT to CLI/.NET for the last months. Now that it's finally possible to get a +working pypy-cli-jit, it's time to do some benchmarks.

+

Warning: as usual, all of this has to be considered to be an alpha version: +don't be surprised if you get a crash when trying to run pypy-cli-jit. Of +course, things are improving very quickly so it should become more and more +stable as days pass.

+

For this time, I decided to run four benchmarks. Note that for all of them we +run the main function once in advance, to let the JIT recognize the hot +loops and emit the corresponding code. Thus, the results reported do +not include the time spent by the JIT compiler itself, but give a good +measure of how good the code generated by the JIT is. At this point in time, +I know that the CLI JIT backend spends way too much time compiling stuff, but +this issue will be fixed soon.

+
+
    +
  • +f1.py: this is the classic PyPy JIT benchmark. It is just a function +that does some computationally intensive work with integers.
  • +
  • +floatdemo.py: this is the same benchmark involving floating point +numbers that has already been described in a previous blog post.
  • +
  • +oodemo.py: this is just a microbenchmark doing object oriented stuff +such as method calls and attribute access.
  • +
  • +richards2.py: a modified version of the classic richards.py, with a +warmup call before starting the real benchmark.
  • +
+
+

The benchmarks were run on a Windows machine with an Intel Pentium Dual Core +E5200 2.5GHz and 2GB RAM, both with .NET (CLR 2.0) and Mono 2.4.2.3.

+

Because of a known mono bug, if you use a version older than 2.1 you need +to pass the option -O=-branch to mono when running pypy-cli-jit, else it +will just loop forever.

+

For comparison, we also run the same benchmarks with IronPython 2.0.1 and +IronPython 2.6rc1. Note that IronPython 2.6rc1 does not work with mono.

+

So, here are the results (expressed in seconds) with Microsoft CLR:

+
+ ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Benchmarkpypy-cli-jitipy 2.0.1ipy 2.6ipy2.01/ pypyipy2.6/ pypy
f10.0280.1450.1365.18x4.85x
floatdemo0.6710.7650.8121.14x1.21x
oodemo1.254.2783.8163.42x3.05x
richards212284426700.36x0.54x
+
+

And with Mono:

+
+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Benchmarkpypy-cli-jitipy 2.0.1ipy2.01/ pypy
f10.0420.69516.54x
floatdemo0.7811.2181.55x
oodemo1.7039.5015.31x
richards27208621.20x
+
+

These results are very interesting: under the CLR, we are between 5x faster +and 3x slower than IronPython 2.0.1, and between 4.8x faster and 1.8x slower +than IronPython 2.6. On the other hand, on mono we are consistently faster +than IronPython, up to 16x. It is also interesting to note that +pypy-cli runs faster on CLR than mono for all benchmarks except richards2.

+

I've not investigated yet, but I think that the culprit is the terrible +behaviour of tail calls on CLR: as I already wrote in another blog post, +tail calls are ~10x slower than normal calls on CLR, while being only ~2x +slower than normal calls on mono. richards2 is probably the benchmark that +makes most use of tail calls, thus explaining why we have a much better result +on mono than CLR.

+

The next step is probably to find an alternative implementation that does not +use tail calls: this probably will also improve the time spent by the JIT +compiler itself, which is not reported in the numbers above but which so far +is surely too high to be acceptable. Stay tuned.

+
+
+
+
+ + Michael Foord wrote on 2009-10-15 15:01: +
+
+

Perhaps you should try another run with the .NET 4 beta. They have at least *mostly* fixed the terrible performance of tail calls there.

Anyway - interesting stuff, keep up the good work. What is the current state of .NET integration with pypy-cli?

+
+
+
+
+ + Antonio Cuni wrote on 2009-10-15 15:17: +
+
+

Oh, I didn't know about .NET 4 beta. Have you got any link that explains how they fixed the tail call stuff? I'll surely give it a try.

About the .NET integration: no news from this front. Nowadays I'm fully concentrated on the JIT because I need some (possibly good :-)) results for my phd thesis. When pypy-cli-jit is super-fast, I'll try to make it also useful :-)

+
+
+
+
+ + Michael Foord wrote on 2009-10-15 15:30: +
+
+

Here's at least one link (with some references) on the tail call improvements in .NET 4:

https://extended64.com/blogs/news/archive/2009/05/10/tail-call-improvements-in-net-framework-4.aspx

+
+
+
+
+ + Michael Foord wrote on 2009-10-15 15:31: +
+
+

I'm also intrigued as to why you didn't benchmark IronPython 2.6 on Mono? I thought that on very recent versions of Mono you could build and run IronPython 2.6 fine now?

+
+
+
+
+ + Michael Foord wrote on 2009-10-15 15:34: +
+
+

Ah, I see now you say that it doesn't work. Hmmm... there are definitely folks who maintain a version that does work (perhaps needing Mono 2.4.3 which I guess is trunk?).

See the download previews here anyway: https://ironpython-urls.blogspot.com/2009/09/more-from-mono-moonlight-2-monodevelop.html

+
+
+
+
+ + Anonymous wrote on 2009-10-15 16:09: +
+
+

I wonder if this paper would be useful? It's a way to do continuations using the stack on .NET. Maybe you can use it to speed up tail calls?

https://www.cs.brown.edu/~sk/Publications/Papers/Published/pcmkf-cont-from-gen-stack-insp/

+
+
+
+
+ + Antonio Cuni wrote on 2009-10-19 11:58: +
+
+

@Michael: from the link you posted, it seems that tail call improvements in .NET 4 are only for x86_64, but my benchmarks were run on 32 bit, so I don't think it makes a difference. Anyway, I'll try to benchmark with .NET 4 soon, thanks for the suggestion.

@Anonymous: the paper is interesting, but I don't think it's usable for our purposes: throwing and catching exceptions is incredibly costly in .NET, we cannot really use them too heavily. The fact that the paper says nothing about performances is also interesting :-)

+
+
+
+ +

PyPy's JIT now supports floats

+ +
+

+Hello. +

+ +

+We've just merged a branch which adds float support to the x86 backend. +This means that floating point operations are now super fast +in PyPy's JIT. Let's have a look at an example, provided by +Alex Gaynor +and stolen from the Factor blog. +

+ +

+The original version of the benchmark was definitely tuned for the performance needs of CPython. +

+

+For running this on PyPy, I changed to a bit simpler version of the program, +and I'll explain a few changes that I did, which reflect the current +limitations of PyPy's JIT. They're not very deep and they might be +already gone while you're reading it: +

+ +
    +
  • Usage of __slots__. This is a bit ridiculous, but we spend quite a bit + of time to speed up normal instances of new-style classes which are + very fast, yet ones with __slots__ are slower. To be fixed soon.
  • + +
  • Usage of reduce. This one is even more obscure, but reduce is not + perceived as a thing producing loops in a program. Moving to + a pure-Python version of reduce fixes the problem.
  • + +
  • Using x ** 2 vs x * x. In PyPy, reading a local variable is a + no-op when JITted (the same as reading a local variable in C). However, + multiplication is a simpler operation than the power operation.
  • +
+

+I also included the original Java benchmark. Please +note that the original Java version is similar to my modified one +(not the one specifically tuned for CPython). +

+ +The performance figures below (for n = 1 000 000), average of 10 runs: + +
    +
  • CPython 2.6: 7.56s +
  • +
  • CPython & psyco 2.6: 4.44s +
  • +
  • PyPy: 1.63s +
  • +
  • Java (JVM 1.6, client mode): 0.77s +
  • +
+

+and while JVM is much faster, it's very good that we can even compare :-) +

+ +Cheers
+fijal +
+
+
+
+ + Anonymous wrote on 2009-10-06 17:26: +
+
+

So it's much faster than Psyco and only about 2x slower than the JVM. That's impressive, as Python is much more dynamic!

Congrats and thanks for the regular updates, it's much appreciated.

+
+
+
+
+ + Luis wrote on 2009-10-06 17:31: +
+
+

Very exciting!
By the way, this result doesn't include the time to generate assembler. Right?

+
+
+
+
+ + Anonymous wrote on 2009-10-06 17:37: +
+
+

Great, you guys are heroes!

Btw, what's the next big hurdle to run real-world programs? Memory use? Threads?

+
+
+
+
+ + Anonymous wrote on 2009-10-06 17:47: +
+
+

Great job! I really appreciate your work.

@Luis: I think, it does include the assembler. I just compiled trunk and ran the modified benchmark on python 2.6 and pypy-c-jit. Best time of 10 runs:
Python 2.6.2: 0.911113977432
Pypy: 0.153664112091
So it's nearly 6x faster for me (including the time for generating the assembler, of course) - even much better than on the posted numbers...I don't know, if cpython was run with the unmodified version of the benchmark though.

+
+
+
+
+ + William wrote on 2009-10-06 19:36: +
+
+

I'd be interested to see the results for a much longer run (n = 10 000 000?).

+
+
+
+
+ + Panos Laganakos wrote on 2009-10-06 19:55: +
+
+

Wicked! Keep the sweetness coming :)

+
+
+
+
+ + Unknown wrote on 2009-10-07 03:15: +
+
+

Very exciting. Thanks! These are nearing "holy crap" numbers.

<mindControl>siiiixty foooouuur biiiiit<mindControl>

:-)

+
+
+
+
+ + René Dudfield wrote on 2009-10-07 11:35: +
+
+

awesome! things are really starting to move along now :)

I tried the same little benchmark with the shedskin python to C++ compiler for comparison:

cpython2.5: 16.2770409584
cpython2.6: 12.2321541309
shedskin: 0.316256999969

Shedskin is 38.6 times faster than cpython2.6, and 51.4 times faster than cpython2.5... and to extrapolate from your numbers 3.9 times faster than the jvm.

Of course that doesn't include the time it takes to generate the C++ and then compile it with g++ (using the old 4.0.1 g++, not the latest 4.4). I also didn't include the python interpreter startup cost.

btw, I found map, reduce and filter all to be faster with pure python versions when using psyco too.

cu!

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-10-07 13:42: +
+
+

@illume

that's a bit unfair comparison, since shedskin is not python. you can compare RPython and shedskin though. RPython is sometimes faster than C even...

And also, yes, in PyPy or psyco time we include compilation time.

Cheers,
fijal

+
+
+
+
+ + Luis wrote on 2009-10-07 14:34: +
+
+

I'm still confused.. if you post the average of 10 runs, and assembler is generated only in the first run, then this time is diluted. Shouldn't you compute the average of 10 runs, but excluding the first one? (that means, running it 11 times and ignoring the first one?).

+
+
+
+
+ + Anonymous wrote on 2009-10-07 18:31: +
+
+

@Luis: no, I think fijal started the pypy-c interpreter 10 times, and each time it generates assembly (it's not cached afaik).

+
+
+
+
+ + Luis wrote on 2009-10-07 19:28: +
+
+

Well, no matter how they measure it, this is definitely within the "Holy Crap" range...

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-10-07 20:37: +
+
+

@Luis:

Maybe I should... I really run this 10 times while assembler was generated only during the first time. But also diluting assembler generation time over runs is kind of real-life effect...

+
+
+
+
+ + Baczek wrote on 2009-10-08 16:06: +
+
+

how about including unladen swallow results?

+
+
+
+
+ + Michael Allman wrote on 2009-10-08 18:26: +
+
+

How come the pypy JIT is compiled AOT to C? I thought the idea of PyPy was to implement a python runtime in python? Why not run the JIT on a python runtime?

Awesome work. I wish the Ruby folk were as motivated...

Cheers.

+
+
+
+
+ + Anonymous wrote on 2009-10-08 18:32: +
+
+

I seem to recall grumblings from C++ programmers a few years ago when Java started supporting multi-core architecture, which made Java execution as fast or faster than C++ with much less development effort (for free with the Java interpreter vs hand-written C++ support).

If your testing machine is a multi-core/processor machine, it might be appropriate to say that PyPy is now as fast as C++ (without explicit multi-core support). Wow!

+
+
+
+
+ + Armin Rigo wrote on 2009-10-09 11:39: +
+
+

Michael: because our goal is to have a general framework, not a Python-centered solution. For example, the JIT generator works mostly out of the box with any other language that we implemented in RPython (which includes Smalltalk).

+
+
+
+
+ + hihhu wrote on 2009-10-09 18:06: +
+
+

Great work!

How large an effort would it be to have eg. Perl or Ruby working with this? Just out of curiosity, I'm trying to understand this project better.

+
+
+
+
+ + Anonymous wrote on 2009-10-09 20:23: +
+
+

In the correct original version of the benchmark there are two calls to sin(). A good compiler optimizes one of them away. A worse compiler don't. So it's more fair to put back the second sin in the Python code too.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-10-11 19:37: +
+
+

@hihu:

It would be a bit easier than writing the interpreter in C, since RPython is much nicer. Also, you get JIT for almost free and decent GC for free. On the other hand, writing interpreters is quite a bit of work on its own.

@Anonymous:

Indeed, well spotted, it would be more fair. However, there is no measurable difference (at least in pypy running time).

PS. We have weekends, too.

Cheers,
fijal

+
+
+
+
+ + della wrote on 2009-10-12 08:48: +
+
+

Would a Pypy implementation of Perl/Ruby/PHP mean that it would be possible to use libraries developed in one language for the other one? That would be very cool indeed.

And, for that matter, would that mean interoperability between python2 and python3 modules when the py3 interpreter will be done? :)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-10-12 15:33: +
+
+

@della.

In general, that would not be that simple. You need to somehow map data types between interpreters in an unclear manner. For example, what would happen if you call Python2.x function passing argument that is py3k dict (which has different interface)?

Cheers,
fijal

+
+
+
+
+ + della wrote on 2009-10-13 09:34: +
+
+

One would imagine having different interfaces for the same objects when accessed from 2.x and 3.x code. Would that be difficult?

Of course, I understand mapping data structures between languages that have many more differences between them than py2 and py3 would definitely be more complex.

+
+
+
+
+ + Anonymous wrote on 2009-11-02 18:08: +
+
+

Not to rain on the parade, but Java's trig functions are very slow outside of -pi/2,pi/2 range to correct terrible fsin/fcos results on Intel x86.

See https://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4857011

Your benchmark should include something to measure the error, or not use trig functions as a benchmark when comparing to Java.

+
+
+
+ +

First results of the JIT

+ +
+

Hi all,

+ +

Just a quick note to tell you that we are progressing on the +JIT front. Here are the running times of the richards +benchmark on my laptop:

+ +
    +
  • 8.18 seconds with CPython 2.5.2; + +
  • +
  • 2.61 seconds with pypy-c-jit (3x faster than CPython); + +
  • +
  • 1.04 seconds if you ignore the time spent making assembler (8x faster than CPython); + +
  • +
  • 1.59 seconds on Psyco, for reference (5x faster than CPython).
  • +
+

Yes, as this table shows, we are spending 1.57 seconds in the JIT +support code. That's too much -- even ridiculously so -- for anything but a +long-running process. We are working on that :-)

+ +

If you want to build your own pypy-c-jit (for x86-32 only for now):

+ +
    +
  • you need a Subversion checkout of trunk; + +
  • +
  • run pypy/translator/goal/translate.py with the -Ojit + option; + +
  • +
  • as usual, wait a long time (and be sure you have more than 1GB of RAM).
  • +
+

For now pypy-c-jit spews a lot of debugging output and +there are a few known +examples where it crashes. As we like to repeat, however, it's a complete JIT: +apart from the crashes (the bugs are probably in the JIT support code), it supports the whole Python language from the start -- in the sense of doing correct things. Future work includes +Python-specific improvements by e.g. tweaking the data structures used to store Python objects so that they are more JIT-friendly.

+ +

EDIT: Oh yes, fijal reminds me that CPython 2.6 is 30% faster than CPython 2.5 on this benchmark (which is mostly my "fault", as I extracted a small part of PyPy and submitted it as a patch to CPython that works particularly well for examples like richards). It does not fundamentally change the fact that we are way faster though.

+
+
+
+
+ + Unknown wrote on 2009-09-27 17:56: +
+
+

This thing just got interesting.

Why this particular benchmark?

+
+
+
+
+ + cjrh wrote on 2009-09-27 19:32: +
+
+

Fantastic!

At this point, it would be a really good idea for the pypy team to prepare downloadable binaries or setup tools, or eggs for making it extremely easy for a new user to try it out. Now that the performance is starting to become interesting, many more people will want to experiment with it and you don't want that enthusiasm hampered by a somewhat involved setup process.

+
+
+
+
+ + Unknown wrote on 2009-09-27 19:40: +
+
+

> it would be a really good idea for the pypy team to prepare downloadable binaries or setup tools, or eggs

I second this notion. I am among the group of people who are quite tempted to try things out, but not sure how much work I'll have to do first.

+
+
+
+
+ + Anonymous wrote on 2009-09-27 20:27: +
+
+

Me too I'd like pre-built binaries

+
+
+
+
+ + Anonymous wrote on 2009-09-27 20:29: +
+
+

I agree. Please put some binaries on your page to make it easier for everyone to survey what you've done!

+
+
+
+
+ + Armin Rigo wrote on 2009-09-27 20:30: +
+
+

This particular benchmark happens to be the one we use; there is no deep reason besides its relative simplicity (but it is not a microbenchmark, it's really computing something). We will of course make more tests with a better range of benchmarks when things start to settle a bit. Right now we are busy developing, and the numbers change every week.

It's also for this reason that there is no nicely-packaged release, sorry :-)
Note that translating your own pypy-c-jit is not a lot of work for you. It is just a lot of work for your CPU :-) You just do "svn co", "cd pypy/translator/goal" and "./translate -Ojit".

+
+
+
+
+ + Anonymous wrote on 2009-09-27 20:41: +
+
+

I would appreciate binaries because I don't have a computer with multi-GB RAM. I tried translating pypy a few months ago but gave up after several hours (the computer was just swapping constantly).

I can wait some longer, but regular binary releases (even if just unstable trunk snapshots) would be useful.

Anyway, keep up the good work! This is looking really promising.

+
+
+
+
+ + Anonymous wrote on 2009-09-28 00:47: +
+
+

Quite nice, thank you..

+
+
+
+
+ + PavPanchekha wrote on 2009-09-28 03:27: +
+
+

Perhaps there is some way to store generated assembler code? I don't know too much about assembler or the JIT backend, but I assume that it'd be possible to stick the generated assembler code into a comment (or, if those don't exist, a docstring) in the .pyc file, so that a library that is commonly imported won't have to waste time generating assembler.

+
+
+
+
+ + Benjamin Peterson wrote on 2009-09-28 03:45: +
+
+

@PavPanchekha We specialize the assembler aggressively, so that probably wouldn't be so useful. We have a lot of room to improve on assembly generation, though.

+
+
+
+
+ + Unknown wrote on 2009-09-28 07:09: +
+
+

Thanks for the update!

+
+
+
+
+ + Anonymous wrote on 2009-09-28 11:40: +
+
+ Anonymous said: I would appreciate binaries because I don't have a computer with multi-GB RAM.

I do have such a computer, but I would still appreciate binaries, because the current trunk does not translate for me:

[translation:ERROR] File "/tmp/pypy/pypy/annotation/annrpython.py", line 227, in addpendingblock
[translation:ERROR] assert s_oldarg.contains(s_newarg)
[translation:ERROR] AssertionError':
[translation:ERROR] .. v1703 = simple_call((function mmap), v1702, map_size_0, (7), (34), (-1), (0))
[translation:ERROR] .. '(pypy.rlib.rmmap:628)alloc' +
+
+
+
+ + Armin Rigo wrote on 2009-09-28 11:59: +
+
+

You are probably missing a dependency. See https://codespeak.net/pypy/dist/pypy/doc/getting-started-python.html#translating-the-pypy-python-interpreter

+
+
+
+
+ + della wrote on 2009-09-28 13:43: +
+
+

Great work! Is it possible to build the 32-bit binary on a 64-bit machine without too much effort? Having those instructions would certainly help us 64-bit people :)

+
+
+
+
+ + Luis wrote on 2009-09-28 14:02: +
+
+

I guess the time spent making assembler is only the first time the code is executed. Is that right? If so, we can consider an 8x speedup as the most accurate result. Or not?

+
+
+
+
+ + nshepperd wrote on 2009-09-28 14:41: +
+
+

@della: I use a 32-bit chroot on my own x64 machine. I don't know if that counts as "too much effort" (certainly it theoretically shouldn't require that), but it has been for me the most painless way to do it.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-09-28 15:16: +
+
+

@Luis: yes, it's only the first time.
Well, depends how you count, but it
can be considered 8x speedup...

+
+
+
+
+ + Armin Rigo wrote on 2009-09-28 20:17: +
+
+

Here are prebuilt C sources (in which "tracing" time was reduced by 20-30% since the blog post):

https://wyvern.cs.uni-duesseldorf.de/~arigo/chain.tar.bz2

Linux x86-32 only. You still need a svn checkout of PyPy, and you still need to compile them with gcc -- but it does not take too long: edit the first entry of the Makefile to point to your checkout of PyPy and type "make". This still assumes that all dependencies have been installed first. Don't complain if the #includes are at the wrong location for your distribution; you would get them right if you translated the whole thing yourself. In fact, don't complain if it does not compile for any reason, please :-) C sources like that are not really supposed to be portable, because they are just intermediates in the translation process.

+
+
+
+
+ + Anonymous wrote on 2009-09-28 21:28: +
+
+

You are probably missing a dependency. See https://codespeak.net/pypy/dist/pypy/doc/getting-started-python.html#translating-the-pypy-python-interpreter

Dear Armin, it seems like this document should mention libexpat1-dev and libssl-dev as dependencies, too. Anyway, I managed to build pypy-c, and here are the results for some small benchmarks I wrote. (Is there a way here at blogger.com to not break the formatting?)

python 2.5 psyco pypy-c
richards 14.9 2.9 3.9
mergesort 27.6 4.8 26.3
convexhull 9.4 5.6 6.3
bigcityskyline 46.9 3.1 7.6
fft 14.1 15.4 25.0

Thank you all for your efforts.

+
+
+
+
+ + Armin Rigo wrote on 2009-09-29 07:47: +
+
+

Thanks for the missing dependencies; added to the development version of the page. Thanks also for the numbers you report. The next obvious thing we miss is float support (coming soon!), which shows in some of your benchmarks.

+
+
+
+
+ + René Dudfield wrote on 2009-09-29 08:06: +
+
+

Hi,

this is so unbelievably awesome, it's going to take me a while to recover from all the awesomeness.

CONGRATS!

ps. a nice improvement for users is to get your ./configure script to find dependencies and report the ones missing, and ones used (s/configure/setup.py/g).

+
+
+
+
+ + Anonymous wrote on 2009-09-29 13:09: +
+
+

nice!

so what is your guess at the moment? how fast can pypy get if you further optimize the jit?

+
+
+
+
+ + Anonymous wrote on 2009-09-29 13:47: +
+
+

Dear Pypy developers, is it possible to switch off the very aggressive JIT logging in pypy-c? First, this could make pypy-c a drop-in replacement for cpython. (Many more beta-testers.) Second, the logging itself seems to be somewhat resource-intensive.

Very cool Mandelbrot ascii art, by the way.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-09-30 08:42: +
+
+

Dear anonymous.

you can compile ./translate.py -Ojit --jit-debug=profile

There is no runtime switch unfortunately, so far.

Cheers,
fijal

+
+
+
+
+ + Anonymous wrote on 2009-09-30 10:32: +
+
+

Thank you! For many of us, the translation-time switch will be just as good.

+
+
+
+
+ + della wrote on 2009-10-01 09:31: +
+
+

I can't seem to compile (32-bit Ubuntu 9.10 chroot), by manually executing the Makefile in /tmp/usession-0/testing_1 I get this traceback:

File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 1210, in (module)
tracker.process(f, g, filename=fn)
File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 229, in process
lines = self.process_function(lines, entrypoint, filename)
File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 244, in process_function
table = tracker.computegcmaptable(self.verbose)
File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 285, in computegcmaptable
self.parse_instructions()
File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 364, in parse_instructions
meth = self.find_missing_visit_method(opname)
File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 390, in find_missing_visit_method
raise UnrecognizedOperation(opname)
__main__.UnrecognizedOperation: jc

there are some type warnings also for pointers, I don't know if they could be any useful. Maybe you can help me?

+
+
+
+
+ + Armin Rigo wrote on 2009-10-01 16:56: +
+
+

Thanks for the report, della. Fixed, if you want to try again. Parsing gcc output is a bit delicate as the exact set of operations used depends on the specific version and command-line options passed to gcc.

+
+
+
+
+ + Armin Rigo wrote on 2009-10-01 16:58: +
+
+

Since the blog post, here are the updated numbers: we run richards.py in 2.10 seconds (almost 4x faster than CPython), and only spend 0.916 seconds actually running the assembler (almost 9x faster than CPython).

+
+
+
+
+ + Anonymous wrote on 2009-10-01 18:25: +
+
+

Very nice. Do you expect to get faster than psyco?

+
+
+
+
+ + Luis wrote on 2009-10-02 01:27: +
+
+

This is very exciting! Please, try to post updates to these figures... thanks!

+
+
+
+
+ + Unhelpful wrote on 2009-10-04 01:13: +
+
+

I was having the same problem as della, and your fix seems to work, but it's breaking somewhere else now. I don't think I have a dependency problem, I can build a working pypy-c without jit. Running make manually produces heaps of warnings about incompatible pointers, some probably harmless (int* vs long int*, these should be the same on x86-32), but others worry me more, like struct stat* vs struct stat64*, or struct SSL* vs char**. I put the complete output of a manual run of make online.

+
+
+
+
+ + Anonymous wrote on 2009-10-04 01:40: +
+
+

Interestingly, the translation itself seems to consume at most about 960MB of ram. It's easy to translate on a system even with only a gig of ram if you stop everything else.

Try switching run levels or the like.

The -Ojit option seems to cause an error in translation with Revision 68125, when translated using Python 2.5.2 on Debian Lenny.

+
+
+
+
+ + proteusguy wrote on 2009-10-04 07:31: +
+
+

First off - congratulations and good job on the great progress. I've been watching this project since the 2007 PyCon in DFW and it's great to see these promising results.

That said, while I know there's still a lot of work to do and this is very much an in-progress thing, I'm very much looking forward to an excuse to try this stuff out in anger - real practical situations. For me that means some statistical calculation engines (Monte Carlo analysis) front ended by web services. In both situations this brings up two constraints: a) must support 64bit (because our data sets rapidly go above 4GB RAM) and b) must not be overly memory hungry (because any significant incremental overhead really hurts when your data sets are already over 4GB RAM).

For now we use Psyco for small stuff but have to re-implement in C++ once we hit that 32-bit limit. PyPy is very exciting as a practical alternative to Psyco because of anticipated 64bit support. I wonder if, due to the existence of Psyco already, that PyPy shouldn't focus first on 64bit instead?

Few things would speed up progress more than getting PyPy used out in the wild - even if only by those of us who appreciate it's very much in flux but still understand how to benefit from it.

I understand you guys have your focus and goals and encourage you to keep up the good work. Just thought I'd throw this out as an idea to consider. I'm sure there are a lot like me anxious to give it a spin.

-- Ben Scherrey

+
+
+
+
+ + Armin Rigo wrote on 2009-10-04 11:25: +
+
+

Andrew: can you update and try again? If you still have the .c files around it is enough to go there and type "make"; otherwise, restart the build. It should still crash, but give us more information about why it does.

+
+
+
+
+ + Unhelpful wrote on 2009-10-04 16:04: +
+
+

The new traceback is:

Traceback (most recent call last):
File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 1211, in <module>
assert fn.endswith('.s')
AssertionError

Is the position in input tracked? that might help, or I could package my .gcmap files.

+
+
+
+
+ + Unhelpful wrote on 2009-10-04 16:15: +
+
+

The trouble seems to be implement.gcmap and implement_9.gcmap. These are both empty, and trigger the assertion error.

Running trackgcroot as the Makefile does, but without those two files, permits compilation to continue, but linking fails with undefined references to various symbols with the prefix 'pypy_g_'.

I suspected the changes might have invalidated the old .gcmap files, so I tried removing them, and got this when it tried to generate implement.gcmap:

Traceback (most recent call last):
File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 1214, in <module>
tracker.process(f, g, filename=fn)
File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 229, in process
lines = self.process_function(lines, entrypoint, filename)
File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 244, in process_function
table = tracker.computegcmaptable(self.verbose)
File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 285, in computegcmaptable
self.parse_instructions()
File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 365, in parse_instructions
insn = meth(line)
File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 741, in visit_jmp
self.conditional_jump(line)
File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 757, in conditional_jump
raise UnrecognizedOperation(line)
__main__.UnrecognizedOperation: jmp T.14141

+
+
+
+
+ + Anonymous wrote on 2009-10-04 16:19: +
+
+

A correction/clarification to last night's post:

There isn't a bug in the -Ojit translation process, I was just missing a dependency that I could've sworn I've installed before.

The translation process only takes < 1GB memory if done without any options. Attempting to translate with the -Ojit option takes at least 2.5GB of RAM, as I tried last night (with it as the only running process) and it consumed my swapfile and ran out of memory.

Is there any documented way to use a translated pypy binary to build other pypy translations? That might help reduce the build requirements, and would also be mighty cool.

+
+
+
+
+ + Armin Rigo wrote on 2009-10-04 16:50: +
+
+

NickDaly: checked in, please try. Also, please come to the mailing list instead of posting here if you have further comments to do... https://codespeak.net/mailman/listinfo/pypy-dev

+
+
+
+
+ + Michael Allman wrote on 2009-10-05 10:56: +
+
+

Is pypy-c-jit written in C or Python or something else? I ask because of the "c" in pypy-c-jit.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2009-10-05 14:28: +
+
+

Michael: It is written in RPython (a subset of Python) but then translated to C. By convention we therefore call the executable-name pypy-c. If the executable also contains a JIT, we call it pypy-c-jit.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2009-10-05 15:48: +
+
+

Ben Scherrey: 64bit support might happen not too far in the future. Not using too much memory is a different problem, that might take a while longer. It has two aspects, one is that the JIT itself uses way too much memory at the moment. We will work on that soon.

The other aspect is making sure that your dataset does not take too much heap. It depends a bit which data structures you use, but it's not likely to be that great right now. That might change at some point, I have some ideas in that direction, but not really time to work on them soon.

+
+
+
+ +

PyPy sprint in Düsseldorf, 6 Nov - 13 Nov

+ +
+

The next PyPy sprint will be held in the Computer Science department of +Heinrich-Heine Universität Düsseldorf from the 6th to the 13th of +November 2009. This is a fully public sprint, everyone is welcome to +join us.

+
+

Topics and goals

+

At the sprint we intend to work on the JIT generator in PyPy and on +applying it to PyPy's Python interpreter.

+

The precise work that will be done is not fixed, as we don't know in +which state the JIT will be in November. However, possible areas of +work might include:

+
    +
  • tweaking the interpreter/objspace to be more JIT-friendly, e.g. +instance implementation code, call code
  • +
  • if there is interest starting non x86-32 JIT backends
  • +
  • trying out existing software to find features where the optimizations +of the JIT could be improved
  • +
  • improving our benchmarking infrastructure
  • +
+

We will give special priority to topics that "non-core" people find +interesting (as long as they are somehow JIT-related).

+

For an introduction of how our JIT-generation process works, please +refer to our blog:

+

https://morepypy.blogspot.com/2009/03/jit-bit-of-look-inside.html

+

There is also a more dense academic paper about the subject:

+

https://codespeak.net/svn/pypy/extradoc/talk/icooolps2009/bolz-tracing-jit-final.pdf

+
+
+

Location

+

The sprint will take place in a seminar room of the computer science +department. It is in the building 25.12 of the university campus. For +travel instructions see

+
+https://stups.cs.uni-duesseldorf.de/anreise/esbahn.php +
+
+
+

Registration

+

If you'd like to come, please subscribe to the pypy-sprint mailing +list and drop a note about your interests and post any questions. +More organisational information will be sent to that list. We'll keep a +list of people which we'll update (which you can do yourself if +you have codespeak commit rights).

+
+
+
+
+
+ + Unknown wrote on 2009-09-25 08:53: +
+
+

Following the svn mailing list, there appears to have been a number of quite large refactorings of the JIT recently. Is there a good description of what they are going to achieve, and what the performance gains are? A blog post with an update would be really cool

+
+
+
+ +

PyPy gets a new compiler

+ +
+

Today, I merged the parser-compiler branch, which I have been working on over the summer. It contained a total rewrite of both PyPy's Python parser and AST compiler. PyPy's old parser was (in)famous internally for being complicated and slow (with many algorithmic complexities greater than O(n)). The new parser is an as-simple-as-I-could-make-it LL(1) parser like CPython's (though it doesn't share the hacks of CPython's parser).

+ +

The new compiler is based on the Abstract Syntax Trees (AST) that CPython 2.5 introduced instead of PyPy's old AST based on the compiler package's. This means that Python code running on PyPy will be able to use the same _ast interface as CPython. PyPy's _ast implementation supports AST features that CPython 2.6 added, including compiling modified AST to bytecode and executing it. In this rewrite, some more obscure compiler features were added, too. For example, jumps in bytecode can now be greater than 65535 bytes! (That's like an if statement with 7000 lines of code in the body.)

+ +

While the PyPy translation toolchain still has many obscure details and hacks, this merge completes the process of making the actual Python interpreter very clean. Hopefully, this will make adding new features much easier and make PyPy less frustrating to maintain as well as providing application level code with an improved AST interface!

+
+
+
+
+ + Jeremy Cowles wrote on 2009-08-25 23:03: +
+
+

Nice, keep up the good work!

+
+
+
+
+ + Anonymous wrote on 2009-08-26 08:12: +
+
+

Thank you.. Keep it up.

+
+
+
+
+ + random user wrote on 2009-08-26 17:52: +
+
+

Very nice. Thanks for all of your work!

+
+
+
+
+ + tobami wrote on 2009-08-31 10:43: +
+
+

Hi, the Gothenburg sprint news are very interesting.

What are your thoughts about a release roadmap?. Do you intend to release a pypy 1.2 with improved compatibility and speed but no JIT, and later include the JIT (version 1.5, maybe?)?.

I think publishing some kind of roadmap would be useful, as a project suffers when its release cycles are BOTH long and unpredictable.

+
+
+
+
+ + tobami wrote on 2009-08-31 10:51: +
+
+

Also, starting from the next stable release, it would be great to publish some kind of benchmarks page to keep track of performance across different versions (cpython 2.6 vs pypy 1.1 vs pypy 1.2 vs pypy with JIT).

Now that I think of it, do you need some kind of help with the website?. I think starting with the next pypy's release, the project will get a lot more visibility and a nicer and better structured website would be a definite plus. If you feel it would be a useful task I could help there.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-08-31 16:05: +
+
+

Hey.

Both, the benchmarks (that would also include say jython) and a nice website for people who actually want to use it would be a very nice addon. We definitely would appreciate some help with it.

If you have any ideas feel free to continue discussion on pypy-dev.

Cheers,
fijal

+
+
+
+
+ + tobami wrote on 2009-08-31 21:46: +
+
+

Hi Maciej, as you suggested, I have subscribed to the pypy-dev mailing list and have started the discussion.

Cheers,

Miquel

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-09-01 15:47: +
+
+

Hey Miguel.

I fail to see your post.

Cheers,
fijal

+
+
+
+
+ + tobami wrote on 2009-09-02 10:28: +
+
+

it got rejected. I have written to pypy-dev-owner to see where the problem is.

Cheers

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-09-05 10:13: +
+
+

@tobami

you should subscribe to the list first.
We get far too much spam to accept
posts from non-members.

Cheers,
fijal

+
+
+
+
+ + tobami wrote on 2009-09-05 21:21: +
+
+

@Maciej,

well, I subscribed first, that is the problem. I now get sent the pypy-dev mailing list, but my post got rejected anyway. And pypy-owner hasn't answered yet.

What can I do?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-09-06 19:57: +
+
+

@tobami

you did something wrong. pypy-dev
is not a moderated list (from
members, that is). Can you leave your mail, so we can no longer spam here? Mine is fijal at merlinux.eu

+
+
+
+ +

Gothenburg JIT sprint report

+ +
+

Finally, we managed to squeeze in some time to write a report about what +has been going on at the mysterious JIT sprint in Gothenburg, Sweden. +The main goals of the sprint were to lay down the groundwork for getting +more JIT work going in the next months and get more of the PyPy developers +up to speed with the current state of the JIT. One of the elements was +to get better stability of the JIT, moving it slowly from being a prototype to +actually working nicely on larger programs.

+ +

The secret goal of the sprint was to seek more speed, which Anto and +Carl Friedrich did even during the break day:

+ + +

We spent the first two days improving test coverage of the x86 backend +and the optimizer. Now we have 100% coverage with unittests +(modulo figleaf bugs), which does not mean anything, but it's better +than before.

+ +

Then we spent quite some time improving the optimizer passes, so +now we generate far less code than before the sprint, because a lot of +it is optimized away. On the interpreter side, we marked more objects +(like code objects) as immutable, so that reading fields from them +can be constant-folded.

+

Another important optimization that we did is to remove consecutive +reading of the same fields from the same structure, if no code in between +can change it.

+

Our JIT is a hybrid environment, where only hot loops of code are jitted +and the rest stays being interpreted. We found out that the performance +of the non-jitted part was suboptimal, because all accesses to python +frames went through an extra layer of indirection. We removed this layer +of indirection, in the case where the jit and the interpreter cannot +access the same frame (which is the common case).

+

We also spent some time improving the performance of our x86 backend, +by making it use more registers and by doing more advanced variable +renaming at the end of loops. It seems that using more registers is not as +much of a win as we hoped, because modern day processors are much +smarter than we thought.

+

The most mind bending part was finding why we lose performance by +making the JIT see more of the interpreter. It took us two very frustrating +days and 36 gray hairs to find out that from the JIT we call a different malloc +function in the Boehm GC, which is by far slower than the version that +we use from the interpreter. This meant that the more we jitted, the +slower our code got, purely because of the mallocs.

+

Now that this is fixed, the world makes much more sense again.

+

A lot of the sprint's work is not directly measurable in the performance +figures, but we did a lot of work that is necessary for performance to +improve in the next weeks. After we have done a bit more work, we should +be able to provide some performance figures for programs that are +more realistic than just loops that count to ten million (which are +very fast already :).

+

Now we're going to enjoy a couple of days off to recover from the sprint.

+

Bästa hälsningar,
+Carl Friedrich, fijal

+
+
+
+
+ + Anonymous wrote on 2009-08-25 14:26: +
+
+

Excellent summary. You should never doubt the value of these updates, they are essential for maintaining awareness.

+
+
+
+
+ + Bourne wrote on 2009-08-25 18:11: +
+
+

Congrats on your impressive work! This sounds more and more promising.

+
+
+
+
+ + Freakazo wrote on 2009-08-28 15:15: +
+
+

Updates like this are extremely interesting to read, and it gives me a month's worth of new terms and technology to learn :D

Can't wait to use Pypy!

+
+
+
+ +

PyPy numeric experiments

+ +
+

+Because PyPy will be presenting at the upcoming euroscipy conference, I have been playing recently with the idea of NumPy and PyPy integration. My idea is to integrate PyPy's JIT with NumPy or at least a very basic subset of it. Time constraints make it impossible to hand write a JIT compiler that understands NumPy. But given PyPy's architecture we actually have a JIT generator, so we don't need to write one :-) +

+ +

+Our JIT has shown that it can speed up small arithmetic examples significantly. What happens with something like NumPy? +

+

+I wrote a very minimal subset of NumPy in RPython, called micronumpy (only single-dimension int arrays that can only get and set items), and a benchmark against it. The point of this benchmark is to compare the performance of a builtin function (numpy.minimum) against the equivalent hand-written function, written in pure Python and compiled by our JIT. +

+

+The goal is to prove that it is possible to write algorithms in Python instead of C without loss of efficiency. Sure, we can write some functions (like minimum in the following example), but there is a whole universe of other ufuncs which would be cool to have in Python instead, assuming this could be done without a huge loss in efficiency. +

+

+Here are the results. This is comparing PyPy svn revision 66303 in the pyjitpl5 branch against python 2.6 with NumPy 1.2.1. The builtin numpy.minimum in PyPy is just a naive implementation in RPython, which is comparable to the speed of a naive implementation written in C (and thus a bit slower than the optimized +version in NumPy): +

+ + + + + + + + + + + + + + + + + +
NumPy (builtin function)0.12s
PyPy's micronumpy (builtin function)0.28s
CPython (pure Python)11s
PyPy with JIT (pure Python)0.91s
+

+As we can see, PyPy's JIT is slower than the optimized NumPy's C version, but still much faster than CPython (12x). +

+

+Why is it slower? When you actually look at assembler, it's pretty obvious that it's atrocious. There's a lot of speedup to be gained out of just doing simple optimizations on resulting assembler. There are also pretty obvious limitations, like x86 backend not being able to emit opcodes for floats or x86_64 not being there. Those limitations are not fundamental in any sense and can be relatively straightforward to overcome. Therefore it seems we can get C-level speeds for pure Python implementations of numeric algorithms using NumPy arrays in PyPy. I think it's an interesting perspective that Python has the potential of becoming less of a glue language and more of a real implementation language in the scientific field. +

+Cheers,
+fijal +
+
+
+
+ + Anonymous wrote on 2009-07-17 20:50: +
+
+

I have the feeling you are confessing PyPy's secret goal ;-).

+
+
+
+
+ + Anonymous wrote on 2009-07-18 08:48: +
+
+

a really efficient python for science: THAT would be a real milestone for dynamic languages; and start their era...

+
+
+
+
+ + tobami wrote on 2009-07-21 10:51: +
+
+

Very, very interesting.

Something I missed though was a real naive C implementation. You state it is about as fast as "PyPy's micronumpy", but it would have been nice to post the numbers. Of course, the problem is that the code would be different (C, instead of Python), but still...

+
+
+
+
+ + Anonymous wrote on 2009-07-22 09:37: +
+
+

What would it take to get this really started? Some of our group would happily help here, if there is a sort of a guideline (a TODO list?) that tells what must be done (i.e. as a friend put it, we would be codemonkeys).

+
+
+
+
+ + Yosef wrote on 2009-07-27 07:19: +
+
+

The difference in pure-python speed is what is most interesting for me, as however much NumPy you use, sometimes important parts of the software still can't be easily vectorized (or at all). If PyPy can let me run compiled NumPy (or Cython) code glued with lightning-fast Python, this leaves me with almost no performance problems. Add to that the convenience of vectorization as a means of writing short, readable code, and it's a winning combination.

+
+
+
+
+ + Zeev wrote on 2009-07-29 09:37: +
+
+

Saying that implementing efficient code generation for floating point code on x86 in your jit is going to be straight forward is disingenuous.

+
+
+
+
+ + René Dudfield wrote on 2009-07-30 04:02: +
+
+

Here's a project using corepy, runtime assembler to create a faster numpy:

https://numcorepy.blogspot.com/

There's also projects like pycuda, and pygpu which generate numpy code to run on GPUs.

It gets many times faster than standard numpy.

pygame uses SDL blitters, and its own blitters - which are specialised array operations for images... these are many times faster than numpy in general - since they are hand optimized assembler, or very efficiently optimised C.

Remember that hand optimized assembler can be 10x faster than even C, and that not all C code is equal.

So it seems that even the pypy generated C code could even be faster.

What about applying pypy to CUDA, or OpenCL C like languages?

cu,

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-08-02 22:17: +
+
+

@ilume

I think you're completely missing the point. These experiments are performed using pure-python code that happens to operate on numpy arrays. Assembler generation happens when interpreting this code by the interpreter, so it's not really even the level of hand-written C. Corenumpy on the other hand is trying to speed up numpy operations itself (which is also a nice goal, but completely different).

Cheers,
fijal

+
+
+
+
+ + Anonymous wrote on 2011-04-13 09:48: +
+
+

Hi Maciej! Would you mind blogging an update on PyPy / C interfaces and NumPy?

I am extensively using NumPy / SciPy / NLopt (apart from the stuff I import from there, my stuff is mostly pure Python algorithms, which the interpreter spends most time working on).

The latest improvements in PyPy JIT really sound like if they could magically dramatically speed up my stuff...

I don't mind trying PyPy out in production if it will yield significant speedups and otherwise debugging why wouldn't it, but I need access to C stuff from within Python.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-04-13 09:53: +
+
+

Stay tuned, I'll blog about it when I have more results. The progress has been slow so far, but it might accelerate

+
+
+
+
+ + Anonymous wrote on 2011-04-13 13:37: +
+
+

Hi! Thanks, can't wait for it... :-)

+
+
+
+ +

ECOOP 2009

+ +
+

Last week (from 6th to 10th of July) Anto, Armin and me (Carl Friedrich) were in +the magnificent city of Genova, Italy at the ECOOP conference. In this blog +post I want to give a (necessarily personal) account of what we did there.

+
+

Workshop days: ICOOOLPS

+

The first two days of the conference were the workshop days. On Monday we +attended the ICOOOLPS workshop, (see the programme of the workshop). We +had gotten two papers accepted at the workshop (one about layering PyPy's JIT +on top of the CLR and one about the basic idea of PyPy's tracing JIT) and +thus gave two presentations at the workshop, one was given by Anto, the other +by me. Both went reasonably well, we got some positive feedback.

+

Nearly all the other talks were rather interesting as well. I particularly liked +the one by Hans Schippers, who presented a machine model built on delegation +called delMDSOC. The model is meant to implement most features that a language +would need that makes it possible to separate cross-cutting concerns. In the +talk at ICOOOLPS he presented an extension to the model that adds concurrency +support, using a combination of actors and coroutines. He then showed that the +concurrency mechanisms of Java, Salsa (an extension of Java adding actors) and +Io can be mapped to this model.

+

Furthermore there were two interesting invited talks, one by Andreas Gal +(Mozilla), and one by Cliff Click (Azul Systems). Andreas explained how +TraceMonkey works. This was very useful for me, because his talk was just before +mine and I could thus kill most of my introduction about tracing JIT compilers +and have more time for the really interesting stuff :-). Cliff talked about +implementing other languages on top of the JVM and some of the pitfalls in +getting them to perform well.

+

All in all, ICOOOLPS was a very enjoyable workshop, also with many interesting +discussions.

+

On Tuesday there were more workshops, but also the PyPy tutorial, so I only went +to a few talks of the COP workshop and spent the rest of the morning +preparing the tutorial (see next section).

+
+
+

Tutorial

+

On Tuesday afternoon we gave a PyPy Tutorial, as part of the ECOOP summer +school. The first lesson we learned was that (as opposed to a community +conference) people don't necessarily want to actually take their laptop out and +try stuff. We gave a slow walk-through about the full life-cycle of development +of a dynamic language interpreter using PyPy's tool-chain: Starting from writing +your interpreter in RPython, testing it on top of CPython to translating it to +C, .NET or Java to actually adding hints to get a JIT inserted.

+

There were about seven people attending the tutorial, a couple of which were +very interested and were asking questions and discussing. Some of the +discussions were even very technical, e.g. one about the details of our +type-inference algorithm for RPython and why we cannot do a bottom-up analysis +but have to use forward-propagation instead.

+

Jan Vitek of Purdue University told of some of the problems of the OVM +project, which is (among other things) a Java implementation in Java (OVM also +wants to support implementing VMs for other languages with it, if I understood +correctly). He said that the project has +essentially gotten too large and complicated, which means that it is very hard +for new people to get into the project. While PyPy doesn't have some of the +problems of a full Java implementation (e.g. right now our concurrency support +is minimal) I definitely think that some of these risks apply to PyPy as well +and we should find ways to improve the situation in this regard. Channeling +Samuele: Somewhere inside the large lumbering blob of PyPy there is an elegant +core trying to get out.

+
+
+

Main Conference

+

From Wednesday till Friday the main conference was happening. Many of the +talks were not all that interesting for me, being quite Java centric. One talk +that I liked a lot was "Making Sense of Large Heaps", which was presented by +Nick Mitchell (IBM). He presented a tool called "Yeti" that can be used to +analyze large heaps of Java programs. The tool uses some clever algorithms and +heuristics to summarize the heap usage of data structures in intelligent ways to +make it easier to find possible memory-wasters in a program. Nick also gave Anto +and me a demo of the tool, where we tried to apply it to pypy-jvm (we found +out that a fifth of the static data in there belongs to the parser/compiler :-( +).

+

On each of the days of the conference there was a keynote. I missed the one by +Simon Peyton-Jones on Wednesday about type classes in Haskell. On Thursday, +David Ungar was awarded the Dahl-Nygaard-Prize for his work on the Self +programming language. Subsequently he gave a really inspiring keynote with the +title "Self and Self: Whys and Wherefores" where he recollected Self's history, +both on a technical as well as on a social level. Parts of the talk were +snippets from the movies Self: The Movie and Alternate Reality Kit, both +of which I highly recommend.

+

The keynote on Friday was by Cliff Click with the title "Java on 1000 Cores: +Tales of Hardware/Software Co-design". He described the custom CPU architecture +that Azul Systems has developed to run Java server applications on hundreds of +cores. The talk mostly talked about the hardware, which I found very interesting +(but some people didn't care for too much). Azul's CPU is essentially 54 in-order +RISC cores in a single processor. The cores have a lot of extensions that make +it easier to run Java on them, e.g. hardware read- and write-barriers, +hardware-transactional-memory and hardware escape-detection (!).

+

In addition to the talks, there is of course always the hallway track (or coffee +track) which is the track where you stand in the hallway and discuss with +people. As usual, this was the most interesting part of the conference. One of +those talks was Anto and me giving a PyPy demo to David Ungar. We had a very +interesting discussion about VM implementation in general and the sort of +debugging tools you need to write in particular. He liked PyPy a lot, which +makes me very happy. He also liked the fact that I have actually read most Self +papers :-).

+
+
+
+
+
+ + Alexander Kellett wrote on 2009-07-16 19:09: +
+
+

The link to delMDSOC should be https://www.hpi.uni-potsdam.de/hirschfeld/projects/delmdsoc/

Alex

+
+
+
+
+ + Anonymous wrote on 2009-07-17 04:10: +
+
+

Glad it went well.

I gather there wasn't any sprint at EuroPython? I was hoping for some news.

If you can get a real python implementation out there that is starting to get faster than CPython, you could get some momentum really quickly.

I hope Unladen Swallow doesn't end up stealing your potential userbase, or at least dividing it.

+
+
+
+
+ + Donovan Preston wrote on 2009-07-18 01:28: +
+
+

I <3 Self.

+
+
+
+
+ + Terrence wrote on 2009-07-18 05:45: +
+
+

Is something like your PyPy Tutorial online somewhere? I have a befunge interpreter that I've been meaning to get working on pypy but I have almost no idea where to begin. I've been reading pypy's code on and off for awhile now and it's very slow going. If there were some way to get up and running faster, I would really like to know about it.

+
+
+
+
+ + Armin Rigo wrote on 2009-07-20 10:25: +
+
+

You can find the tutorial
here but the part written down is quite limited. If you need starting points, look at pypy/translator/goal/target*.py (for example targetjsstandalone, which runs our partial JS interpreter).

+
+
+
+
+ + Terrence wrote on 2009-07-21 08:40: +
+
+

Thank you for the link. I had started with targetpypystandalone.py, which, on reflection, appears to be more towards the deep end of the pool. The javascript target is exactly what I'm looking for.

+
+
+
+ +
+
+
+ + Luis wrote on 2009-07-09 19:37: +
+
+

Please guys, can anyone of you tell something about Europython's vm panel news? I've been searching for comments on blogs in the last days but I couldn't find anything... There were many interesting presentations (hotpy, crosstwiner, psyco v2, etc...) but no comments so far! Is there any significant news in that field? How do these projects compare to pypy...?

+
+
+
+
+ + Kumo wrote on 2009-07-11 00:44: +
+
+

Will you publish the slides of the 2nd talk, as you did with the 1st?

I am looking forward to reading the slides as well as more comments about the talks.

Keep the good work!

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-12.html b/blog/index-12.html new file mode 100644 index 000000000..124d05a95 --- /dev/null +++ b/blog/index-12.html @@ -0,0 +1,1461 @@ + + + + + + +PyPy (old posts, page 12) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Nightly graphs of PyPy's performance

+ +
+

Hello.

+

In the past few months, we made tremendous progress on the JIT front. +To monitor the progress daily, we introduced recently some cool graphs +that plot revision vs performance. They are based on unladen swallow +benchmark runner and they're written entirely in JavaScript, using canvas +via the JQuery and Flot libraries. +It's amazing what you can do in JavaScript these days... They are also +tested via the very good oejskit plugin, that integrates py.test +with JavaScript testing, driven by the command line.

+

As you can probably see, we're very good on some benchmarks and not that +great on others. Some of the bad results come from the fact that while we +did a lot of JIT-related work, other PyPy parts did not see that much +love. Some of our algorithms on the builtin data types are inferior to those +of CPython. This is going to be an ongoing focus for a while.

+

We want to first improve on the benchmarks for a couple +of weeks before doing a release to gather further feedback.

+

Cheers, +fijal

+
+
+
+
+ + Bill Mill wrote on 2010-01-25 17:38: +
+
+

So... what's a revision number that I can use? Am I just supposed to guess? The page should have a reasonable default revision number.

+
+
+
+
+ + Bill Mill wrote on 2010-01-25 17:40: +
+
+

for anyone else looking, 70700 is a reasonable place to start. (The graphs are really nice by the way, I'm not hating!)

+
+
+
+
+ + Anonymous wrote on 2010-01-25 18:12: +
+
+

a couple of suggestions:

1. scale for X axis (dates are likely to be more interesting than revision numbers)

1a. scale for Y axis

2. Add another line: unladen swallow performance

+
+
+
+
+ + Gaëtan de Menten wrote on 2010-01-25 19:36: +
+
+

+1 for Anonymous's suggestions 1 and 2.

+
+
+
+
+ + RPG wrote on 2010-01-25 20:18: +
+
+

This is cool.

Unladen Swallow's perf should also be considered if possible.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-01-25 21:04: +
+
+

Hey.

Regarding revisions - by default it points to the first one we have graphs from, so you can just slice :) Also, yeah, revision numbers and dates should show up, will fix that. We don't build nightly unladen swallow and we don't want to run it against some older version, because they're improving constantly.

Cheers,
fijal

+
+
+
+
+ + Anonymous wrote on 2010-01-25 23:55: +
+
+

Wonderful idea, great implementation (axis are needed, tooltips would be interesting for long series), impressive results.

I hope you guys exploit this to raise interest in PyPy in this pre-release period. Just take a look at the response you get to posts involving numbers, benchmarks, etc. (BTW, keep linking to the funding post) :)

A series of short posts discussing hot topics would be a sure way to keep Pypy around the news until the release, so you get as much feedback as possible.

Suggestions:

- Possible factors in slower results (discuss points in the Some Benchmarking post);

- One-off comparisons to different CPython versions, Unladen Swallow, ShedSkin, [C|J|IronP]ython (revisit old benchmarks posts?);

- Mention oprofile and the need for better profiling tools in blog, so you can crowdsource a review of options;

- Ping the Phoronix Test Suite folks to include Pypy translation (or even these benchmarks) in their tests: Python is an important part of Linux distros;

- Don't be afraid to post press-quotable numbers and pics, blurbs about what Pypy is and how much it's been improving, etc. Mention unrelated features of the interpreter (sandboxable!), the framework (free JIT for other languages), whatever;

- The benchmark platform (code, hardware, plans for new features).

+
+
+
+
+ + Unknown wrote on 2010-01-26 06:32: +
+
+

Regarding comparison with unladen swallow: I think having a point per month would be good enough for comparison purposes.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-01-26 08:53: +
+
+

@Anonymous: Great suggestions! I'll look at these issues. In fact, things like profiling have been high on our todo list, but we should advertise it more. We surely miss someone who'll be good at PR :-)

+
+
+
+
+ + Luis wrote on 2010-02-24 10:51: +
+
+

Something's wrong with plot one's scale: the speed ups are represented by a first line of 2x, a second one of 4x and the third one is 8x. Shouldn't it be 6x instead?

+
+
+
+ +

Accelerating PyPy development by funding

+ +
+

PyPy has recently made some great speed and memory progress towards providing the most efficient Python interpreter out there. We also just announced +our plans for the pypy-1.2 release. Much of this is driven by personal +commitment, by individuals and companies investing time and money. +Now we'd appreciate some feedback and help regarding getting money +into the PyPy project to help its core members (between +5 and 15 people depending how you count) to sustain themselves. We see +several options:

+
    +
  • use a foundation structure and ask for tax-exempt donations to the +project, its developers and infrastructure. We just got +a letter from the Software Freedom Conservancy that they view +our application favourably so this option becomes practical hopefully +soon.
  • +
  • offer to implement certain features like a 64bit JIT-backend, +Numpy for PyPy or a streamlined installation in exchange for money, +contributed in small portions/donations. Do you imagine you or your +company would sponsor PyPy on a small scale for efforts like this? +Any other bits you'd like to see?
  • +
  • offer to implement larger scale tasks by contracting PyPy related companies, +namely Open End and merlinux who have successfully done such +contracts in the past. Please don't hesitate to contact +holger@merlinux.eu and bea@openend.se if you want to start a +conversation on this.
  • +
  • apply for public/state funding - in fact we are likely to get some +funding through Eurostars, more on that separately. Such funding +is usually only 50-60% of actual employment and +project costs, and is tied to research questions rather than +to making PyPy a production-usable interpreter, though.
  • +
+

Anything else we should look out for?

+

cheers & thanks for any feedback, +Maciej and Holger

+
+
+
+
+ + Anonymous wrote on 2009-12-21 18:28: +
+
+

What's the status of possible mobile applications for PyPy? That seems nearer in terms of potential products and thus 'commercial' funding.

+
+
+
+
+ + Po wrote on 2009-12-21 21:57: +
+
+

Have you guys looked into jitting regular expressions?
I am not quite sure how hard it would be but having very fast regexps would be a great selling point for Pypy.

+
+
+
+
+ + Anonymous wrote on 2009-12-22 00:24: +
+
+

What about activating the Python Users Groups around the world? I think the case has to be made for PyPy still to the regular folk, if you will. So - what if you conducted a video showing off its potential, or maybe a series of videos, much like the "Summer of NHibernate" series. All the while, on the same site as the videos, you have a "tips" jar for donations. The videos would serve as a great marketing campaign and would invite the development community into the fold, earning the buy-in you seek. This kind of attention in the community would only serve the project well when attracting the larger fish to the pond.

Just my thoughts. :)

+
+
+
+
+ + holger krekel wrote on 2009-12-22 08:59: +
+
+

@kitblake good point. The main blocker for making PyPy useful on mobile phones is support for GUI apps. Alexander's recent PyPy QT experiments are teasing in this direction. To fully exploit PyPy's super-efficient memory usage we probably need to provide native bindings. That and maybe a GIL-less interpreter would make PyPy a superior choice for mobile devices.

However, GUI-bindings/free threading are orthogonal to the ongoing JIT-efforts. Somehow PyPy suffers a bit from its big potential (there also is stackless and sandboxing etc.). Question is: (How) can we make donation/other funding guide PyPy developments and at the same time support dedicated developers?

+
+
+
+
+ + holger krekel wrote on 2009-12-22 09:07: +
+
+

@Ryan Interesting idea. What would you suppose to see in such a video?

+
+
+
+
+ + Niki wrote on 2009-12-22 09:58: +
+
+

What if new PySide code generator targets RPython?
https://www.pyside.org/

+
+
+
+
+ + Alexander Schremmer wrote on 2009-12-22 21:39: +
+
+

Niki, generally that's a viable approach. Pyside is moving to shiboken, a new framework for generating bindings. Somebody would have to check how large the effort is to port it to RPython.
Currently, Pyside is using Boost::Python AFAIK.

+
+
+
+
+ + Anonymous wrote on 2009-12-30 14:02: +
+
+

could you accept donations via a paypal button or something like that? It's simple and easy but I think it's unlikely to be sufficient.

I'm always amazed at the MoveOn organization... it seems like every week they send out mail like 'hey! we need $400,000 to stop The Man! Can you send us $20?' followed by 'Thanks! We've achieved our goal!'

I don't know how many people or how much each one donates but they always meet their goal!

+
+
+
+
+ + holger krekel wrote on 2010-01-04 15:11: +
+
+

anonymous: yes, establishing some way to accept money via paypal is high on our list. if nothing else we can use some private trusted account. moveon is rather geared towards general politics, i guess, so not directly applicable. But i remember there was some open source market place which allows to bid for certain features ...

+
+
+
+
+ + Anonymous wrote on 2010-01-06 05:18: +
+
+

Have you considered moving to any sort of DVCS (Hg, Git, etc)? Or, given your current management style, does a centralized VCS or a DVCS add more to the project?

Googling "open source bounties", finds Stack Overflow suggesting self-hosting bounties for the best results, which I suppose, makes sense. The people interested in taking bounties would be the ones already at your site. Being one of a million bounty providers on a site wouldn't generate much traffic.

Thinking out loud, moving to a DVCS might actually help the bounty process, assuming you'd want to move in that direction.

+
+
+
+ +

Planning a next release of PyPy

+ +
+

The PyPy core team is planning to make a new release before the next Pycon US.

+

The main target of the 1.2 release is packaging the good results +we have achieved applying our current JIT compiler generator to our +Python interpreter. Some of that progress has been chronicled in +recent posts on the status blog. By releasing them in a +relatively stable prototype we want to encourage people to try them with their +own code and to gather feedback in this way. By construction the JIT compiler +should support all Python features, what may vary are the speedups +achieved (in some cases the JIT may produce worse results than the PyPy +interpreter which we would like to know) and the extra memory required +by it.

+

For the 1.2 release we will focus on the JIT stability first, less on +improving non-strictly JIT areas. The JIT should be good at many things +as shown by previous blog postings. We want the JIT compiler in the +release to work well on Intel 32 bits on Linux, with Mac OS X and +Windows being secondary targets. Which compilation targets work will +depend a bit on contributions.

+

In order to finalize the release we intend to have a concentrated +effort ("virtual sprint") from the 22nd to the 29th of +January. Coordination will happen as usual through the #pypy irc +channel on freenode. Samuele Pedroni will take the role of release +manager as he already did in the past.

+
+
+
+
+ + Anonymous wrote on 2009-12-17 15:37: +
+
+

Good News!
Can't wait to try pypy as my standard python vm on my desktop machine.

Btw: Are there any plans yet for python generators support in the jit?
Because that's the only feature that I'm currently missing when using pypy.
I have some medium sized apps, that I'd like to try, but they often use generators, so these will be slower with jit than without, won't they?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-12-17 16:48: +
+
+

@Anonymous.

Generators won't be sped up by JIT. This does not mean that JIT can't run or can't speed up other parts of your program. And yes, there are plans of supporting that.

Cheers,
fijal

+
+
+
+
+ + servo wrote on 2009-12-18 03:11: +
+
+

I want to get involved in the development of PyPy, but I'm just a student with some experience with compilers. Is there any list of junior contributions that can be done by somebody starting?

Thanks!

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-12-18 12:11: +
+
+

@servo

Show up on #pypy on freenode IRC. We'll find you something, don't worry :-)

+
+
+
+ +
+
+
+ + Anonymous wrote on 2009-12-05 07:56: +
+
+

It would be nice if there are prebuilt binaries in the next release.
Certainly if it's faster there are a lot of graphics based projects where this would be interesting (pygame, pygelt, cocos2d, shoebot etc).

+
+
+
+
+ + Anonymous wrote on 2009-12-05 14:22: +
+
+

@Anonymous:
Probably they would be still slower, because ctypes is very slow in PyPy afaik.
Someone mentioned in irc that the long-term goal for ctypes is, that the jit doesn't use libffi at all but does direct assembler-to-c calls instead, if I remember correctly. - which should be superfast.
That would of course be absolutely awesome. :)
(and it's also the secret reason, why I only use pypy compatible modules for my pyglet game ;)
Unfortunately I don't know if this is going to happen anytime "soon" / before the 1.2 release (at least I can't find it on extradoc/planning/jit.txt) but I know many people who would instantly drop cpython then. :P
Heck, if I only had more clue about, how difficult this is to implement...

+
+
+
+ +

Using CPython extension modules with PyPy, or: PyQt on PyPy

+ +
+
+ +

If you have ever wanted to use CPython extension modules on PyPy, +we want to announce that there is a solution that should be compatible +with quite a few of the available modules. It is neither new nor written +by us, but nevertheless works great with PyPy.

+

The trick is to use RPyC, a transparent, symmetric remote procedure +call library written in Python. The idea is to start a +CPython process that hosts the PyQt libraries +and connect to it via TCP to send RPC commands to it.

+

I tried to run PyQt applications +using it on PyPy and could get quite a bit of the functionality of these +working. Remaining problems include regular segfaults of CPython +because of PyQt-induced memory corruption and bugs because classes +like StandardButtons behave incorrectly when it comes to arithmetical operations.

+

Changes to RPyC needed to be done to support remote unbound __init__ methods, +shallow call by value for list and dict types (PyQt4 methods want real lists and dicts +as parameters), and callbacks to methods (all remote method objects are wrapped into +small lambda functions to ease the call for PyQt4).

+

If you want to try RPyC to run the PyQt application of your choice, you just +need to follow these steps. Please report your experience here in the blog +comments or on our mailing list.

+
+
    +
  1. Download RPyC from the RPyC download page.
  2. +
  3. Download this patch and apply it to RPyC by running +patch -p1 < rpyc-3.0.7-pyqt4-compat.patch in the RPyC directory.
  4. +
  5. Install RPyC by running python setup.py install as root.
  6. +
  7. Run the file rpyc/servers/classic_server.py using CPython.
  8. +
  9. Execute your PyQt application on PyPy.
  10. +
+
+

PyPy will automatically connect to CPython and use its PyQt libraries.

+

Note that this scheme works with nearly every extension library. Look +at pypy/lib/sip.py on how to add new libraries (you need to create +such a file for every proxied extension module).

+

Have fun with PyQt

+

Alexander Schremmer

+
+
+
+
+
+ + intgr wrote on 2009-11-30 13:03: +
+
+

OT: you should separate labels by commas, so that Blogspot recognizes them as distinct labels.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2009-11-30 13:08: +
+
+

intgr: Thanks, done.

+
+
+
+
+ + Anonymous wrote on 2009-11-30 19:38: +
+
+ "regular segfaults of CPython because of PyQt-induced memory corruption and bugs because classes like StandardButtons behave incorrectly when it comes to arithmetical operations."

These sound interesting. Could you please elaborate? A link would suffice, if these are already documented by non-pypy people. Thanks! +
+
+
+
+ + holger krekel wrote on 2009-12-01 09:17: +
+
+

cool stuff, alexander! Generic access to all CPython-provided extensions could remove an important blocker for PyPy usage, and allows incremental migrations.

Besides, I wonder if having two processes, one for application and one for bindings can have benefits to stability.

+
+
+
+
+ + Alexander Schremmer wrote on 2009-12-01 10:28: +
+
+

Dear anonymous,

the StandardButtons bug was already communicated to a Nokia employee.
If you are interested in the segfaults, contact me and I give you the source code that I used for testing.

+
+
+
+
+ + Zemantic dreams wrote on 2009-12-03 06:33: +
+
+

This is an important step forward!

There are probably two reasons why people use extensions: bindings to libraries and performance.

Unfortunately this specific approach does not address performance. Is there anything on horizon that would allow near-CPython API for extensions. So modules would just need to be recompiled against PyPy bindings for CPython API? Probably not 100% compatible, but close?

Any chances of that happening?

Andraz Tori, Zemanta

+
+
+
+
+ + Alexander Schremmer wrote on 2009-12-03 08:51: +
+
+ Any chances of that happening?

In theory, this is possible, but a lot of work. Nobody has stepped up to implement it, yet. +
+
+
+
+ + Unhelpful wrote on 2009-12-04 07:08: +
+
+

Isn't the exposure of refcounts in the CPython C API going to be a bit of a problem for implementing the API on pypy? perhaps a "fake" refcount could be associated with an object when it is first passed to an extension? This could still be problematic if the extension code expects to usefully manipulate the refcount, or to learn anything by examining it...

+
+
+
+
+ + Alexander Schremmer wrote on 2009-12-04 10:10: +
+
+ Isn't the exposure of refcounts in the CPython C API going to be a bit of a problem for implementing the API on pypy?

Indeed, it would be part of the task to introduce support in the GCs for such refcounted objects. Note that real refcounting is necessary because the object could be stored in an C array, invisible to the GC. +
+
+
+
+ + Unhelpful wrote on 2009-12-04 10:32: +
+
+

I'm trying to think of ways around that, but any API change to make objects held only in extensions trackable by the GC would probably be much worse than adding refcounted objects, wouldn't it, unless the extension were written in rpython...

+
+
+
+
+ + handsomegui wrote on 2015-04-28 15:16: +
+
+

Any news on this PyQt on PyPy topic? With the latest PyPy 2.5.1? Thanks.

+
+
+
+ +

Some benchmarking

+ +
+

Hello. +

+

+Recently, thanks to the surprisingly helpful Unhelpful, also known as Andrew Mahone, +we have a decent, if slightly arbitrary, set of performance graphs. +It contains a couple of benchmarks already +seen on this blog as well as some taken from The Great Computer +Language Benchmarks Game. These benchmarks don't even try to represent "real applications" +as they're mostly small algorithmic benchmarks. Interpreters used: +

+
    +
  1. +PyPy trunk, revision 69331 with --translation-backendopt-storesink, which is +now on by default +
  2. +
  3. +Unladen swallow trunk, r900 +
  4. +
  5. CPython 2.6.2 release
  6. +
+

+Here are the graphs; the benchmarks and the runner script are available +

+ + + +And zoomed in for all benchmarks except binary-trees and fannkuch. + + +

+As we can see, PyPy is generally somewhere between the same speed +as CPython to 50x faster (f1int). The places where we're the same +speed as CPython are places where we know we have problems - for example generators are +not sped up by the JIT and they require some work (although not as much by far +as generators & Psyco :-). The glaring inefficiency is in the regex-dna benchmark. +This one clearly demonstrates that our regular expression engine is really, +really, bad and urgently requires attention. +

+

+The cool thing here is, that although these benchmarks might not represent +typical python applications, they're not uninteresting. They show +that algorithmic code does not need to be far slower in Python than in C, +so using PyPy one need not worry about algorithmic code being dramatically +slow. As many readers would agree, that kills yet another usage of C in our +lives :-) +

+Cheers,
+fijal +
+
+
+
+ + Luis wrote on 2009-11-18 22:09: +
+
+

Wow! This is getting really interesting. Congratulations!
By the way, it would be great if you include psyco in future graphs, so speed junkies can have a clearer picture of pypy's progress.

+
+
+
+
+ + Eric Florenzano wrote on 2009-11-18 22:14: +
+
+

Very interesting, congratulations on all the recent progress! It would be very interesting to see how PyPy stacks up against Unladen Swallow on Unladen Swallow's own performance benchmark tests, which do include a bit more real-world scenarios.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-11-18 22:31: +
+
+

@Eric: yes, definitely, we're approaching that set of benchmarks

@Luis: yes, definitely, will try to update tomorrow, sorry.

+
+
+
+
+ + Paddy3118 wrote on 2009-11-19 04:06: +
+
+

It's good, but...

We are still in the realms of micro-benchmarks. It would be good to compare their performances when working on something larger. Django or Zope maybe?

+
+
+
+
+ + Gaëtan de Menten wrote on 2009-11-19 07:52: +
+
+

These last months, you seem to have had almost exponential progress. I guess all those years of research are finally paying off. Congratulations!

Also, another graph for memory pressure would be nice to have. Unladen Shadow is (was?) not very good in that area, and I wonder how PyPy compares.

[nitpick warning]
As a general rule, when mentioning trunk revisions, it's nice to also mention a date so that people know the test was fair. People assume it's from the day you did the tests, and confirming that would be nice.
[/nitpick warning]

+
+
+
+
+ + Antoine wrote on 2009-11-19 09:45: +
+
+

How about benchmarking against CPython trunk as well?

cheers

Antoine.

+
+
+
+
+ + Tony Landis wrote on 2009-11-19 16:02: +
+
+

What about memory consumption? That is almost as important to me as speed.

+
+
+
+
+ + wilk wrote on 2009-11-19 16:04: +
+
+

Congratulations !

Please could you remember us how to build and test pypy-jit ?

+
+
+
+
+ + Anonymous wrote on 2009-11-19 23:38: +
+
+

I'm curious why mandelbrot is much less accelerated than, say, nbody. Does PyPy not JIT complex numbers properly yet?

+
+
+
+
+ + Benjamin Peterson wrote on 2009-11-20 03:03: +
+
+

@wilk ./translate.py -Ojit targetpypystandalone.py

+
+
+
+
+ + Benjamin Peterson wrote on 2009-11-20 03:11: +
+
+

@Anon Our array module is in pure Python and much less optimized than CPython's.

+
+
+
+
+ + Leo wrote on 2009-11-20 07:11: +
+
+

How long until I can do

pypy-c-jit translate.py -Ojit targetpypystandalone.py

?

So far, when I try, I get

NameError: global name 'W_NoneObject' is not defined
https://paste.pocoo.org/show/151829/

+
+
+
+
+ + holger krekel wrote on 2009-11-20 07:37: +
+
+

ASFAIU it's not PyPy's regex engine being "bad" but rather the fact that the JIT generator cannot consider and optimize the loop in the regex engine, as it is a nested loop (the outer one being the bytecode interpretation one).

+
+
+
+
+ + Armin Rigo wrote on 2009-11-20 10:41: +
+
+

@holger: yes, that explains why regexps are not faster in PyPy, but not why they are 5x or 10x slower. Of course our regexp engine is terribly bad. We should have at least a performance similar to CPython.

+
+
+
+
+ + Anonymous wrote on 2009-11-20 15:35: +
+
+

Benjamin, is it really an issue with array? The inner loop just does complex arithmetic. --Anon

+
+
+
+
+ + Benjamin Peterson wrote on 2009-11-20 22:41: +
+
+

@Anon I'm only guessing. Our math is awfully fast.

+
+
+
+
+ + Antonio Cuni wrote on 2009-11-20 23:54: +
+
+

@Anon, @Benjamin
I've just noticed that W_ComplexObject in objspace/std/complexobject.py is not marked as _immutable_=True (as it is e.g. W_IntObject), so it is totally possible that the JIT is not able to optimize math with complexes as it does with ints and floats. We should look into it, it is probably easy to discover

+
+
+
+
+ + vak wrote on 2009-11-20 23:58: +
+
+

guys, sorry, who cares about *seconds*??

why didn't you normalize to the test winners? :)

+
+
+
+
+ + Leo wrote on 2009-11-21 09:06: +
+
+

So, um, has anyone managed to get JIT-ed pypy to compile itself?

When I tried to do this today, I got this:

https://paste.pocoo.org/show/151829/

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-11-21 11:26: +
+
+

@Leo:

yes, we know that bug. Armin is fixing it right now on faster-raise branch.

+
+
+
+
+ + Armin Rigo wrote on 2009-11-21 17:47: +
+
+

antonio: good point. On the second thought, though, it's not a *really* good point because we don't have _immutable_=True on floats either...

+
+
+
+
+ + Leo wrote on 2009-11-21 19:35: +
+
+

@Maciej Great! It'll be awesome to have a (hopefully much faster??) JITted build ... it currently takes my computer more than an hour ...

+
+
+
+
+ + Benjamin Peterson wrote on 2009-11-22 01:45: +
+
+

@Leo it's likely to take tons of memory, though.

+
+
+
+
+ + Anonymous wrote on 2009-11-22 10:13: +
+
+

Would perhaps also be nice to compare the performance with one the current Javascript-Engines(V8, SquirrelFish etc.)

+
+
+
+
+ + Tom Clarke wrote on 2009-11-22 12:08: +
+
+

Nice comparisons - and micro-performance looking good. Congratulations.

HOWEVER - there is no value in having three columns for each benchmark. The overall time is arbitrary, all that matters is relative so you might as well normalise all graphs to CPython = 1.0, for example. The relevant informtion is then easier to see!

+
+
+
+
+ + Unknown wrote on 2009-11-23 19:24: +
+
+

it's called "The Computer Language
Benchmarks Game" these days...

+
+
+
+
+ + Luis wrote on 2009-11-23 21:10: +
+
+

Tom is right, normalizing the graphs to cpython = 1.0 would make them much more readable.
Anyway, this is a very good Job from Unhelpful.
Thanks!

+
+
+
+
+ + Anonymous wrote on 2009-11-27 13:54: +
+
+

Do any of those benchmarks work with shedskin?

+
+
+
+
+ + ¬¬ wrote on 2009-11-30 07:26: +
+
+

glad to see someone did something with my language shootout benchmark comment ;)

+
+
+
+
+ + Anonymous wrote on 2009-12-01 19:07: +
+
+

I checked https://www.looking-glass.us/~chshrcat/python-benchmarks/results.txt but it doesn't have the data for unladen swallow. Where are the number?

+
+
+
+
+ + Term Paper wrote on 2010-02-18 07:05: +
+
+

I'm curious why mandelbrot is much less accelerated than, say, nbody. Does PyPy not JIT complex numbers properly yet?

+
+
+
+ +

Düsseldorf Sprint Report

+ +
+

While the Düsseldorf sprint is dwindling off, we put our minds to the task of retelling +our accomplishments. The sprint was mostly about improving the JIT and we +managed to stick to that task (as much as we managed to stick to anything). The +sprint was mostly filled with doing many small things.

+
+

Inlining

+

Carl Friedrich and Samuele started the sprint trying to tame the JIT's inlining. +Until now, the JIT would try to inline everything in a loop (except other loops) +which is what most tracing JITs actually do. This works great if the resulting +trace is of reasonable length, but if not it would result in excessive memory +consumption and code cache problems in the CPU. So far we just had a limit on +the trace size, and we would abort tracing when the limit was reached. This +would happen again and again for the same loop, which is not useful at all. The +new approach introduced is to be more clever when tracing is aborted by marking +the function with the largest contribution to the trace size as non-inlinable. The +next time this loop is traced, it usually then gives a reasonably sized trace.

+

This gives a problem because now some functions that don't contain loops are not +inlined, which means they never get assembler code for them generated. To remedy +this problem we also make it possible to trace functions from their start (as +opposed to just tracing loops). We do that only for functions that cannot be +inlined (either because they contain loops or they were marked as +non-inlinable as described above).

+

The result of this is that the Python version of the telco decimal benchmark runs +to completion without having to arbitrarily increase the trace length limit. +It's also about 40% faster than running it on CPython. This is one of the first +non-tiny programs that we speed up.

+
+
+

Reducing GC Pressure

+

Armin and Anto used some GC instrumentation to find places in pypy-c-jit +that allocate a lot of memory. This is an endlessly surprising exercise, as +usually we don't care too much about allocations of short-lived objects when +writing RPython, as our GCs usually deal well with those. They found a few +places where they could remove allocations, most importantly by making one of +the classes that make up traces smaller.

+
+
+

Optimizing Chains of Guards

+

Carl Friedrich and Samuele started a simple optimization on the trace level that +removes superfluous guards. A common pattern in a trace is to have stronger +and stronger guards about the same object. As an example, often there is first a +guard that an object is not None, later followed by a guard that it is exactly +of a given class and then even later that it is a precise instance of that +class. This is inefficient, as we can just check the most precise thing in the +place of the first guard, saving us guards (which take memory, as they need resume data). +Maciek, Armin and Anto later improved on that by introducing a new guard that +checks for non-nullity and a specific class in one guard, which allows us to +collapse more chains.

+
+
+

Improving JIT and Exceptions

+

Armin and Maciek went on a multi-day quest to make the JIT and Python-level +exceptions like each other more. So far, raising and catching exceptions would +make the JIT generate code that has a certain amusement value, but is not really +fast in any way. To improve the situation, they had to dig into the exception +support in the Python interpreter, where they found various inefficiencies. They +also had to rewrite the exceptions module to be in RPython (as opposed to +just pure Python + an old hack). Another problem is that tracebacks give you +access to interpreter frames. This forces the JIT to deoptimize things, as +the JIT keeps some of the frame's content in CPU registers or on the CPU stack, +which reflective access to frames prevents. +Currently we try to improve the simple cases where the traceback is never +actually accessed. This work is not completely finished, but some cases are +already significantly faster.

+
+
+

Moving PyPy to use py.test 1.1

+

Holger worked on porting PyPy to use the newly released py.test 1.1. PyPy +still uses some very old support code in its testing infrastructure, which makes +this task a bit annoying. He also gave the other PyPy developers a demo of some +of the newer py.test features and we discussed which of them we want to start +using to improve our tests to make them shorter and clearer. One of the things +we want to do eventually is to have fewer skipped tests than now.

+
+
+

Using a Simple Effect Analysis for the JIT

+

One of the optimizations the JIT does is caching fields that are read out of +structures on the heap. This cache needs to be invalidated at some points, for +example when such a field is written to (as we don't track aliasing much). +Another case is a call in the assembler, as the target function could +arbitrarily change the heap. This of course is imprecise, since most functions +don't actually change the whole heap, and we have an analysis that finds out +which sorts of types of structs and arrays a function can mutate. During the +sprint Carl Friedrich and Samuele integrated this analysis with the JIT, to help +it invalidate caches less aggressively. Later Anto and Carl Friedrich also +ported this support to the CLI version of the JIT.

+
+
+

Miscellaneous

+

Samuele (with some assistance of Carl Friedrich) set up a buildbot slave on a +Mac Mini at the University. This should let us stabilize on Mac OS X. So far +we still have a number of failing tests, but now we are in a situation to +sanely approach fixing them.

+

Anto improved the CLI backend to support the infrastructure for producing the +profiling graphs Armin introduced.

+

The guinea-pigs that were put into Carl Friedrich's care have been fed (which +was the most important sprint task anyway).

+

Samuele & Carl Friedrich

+
+
+
+
+
+ + Anonymous wrote on 2009-11-13 17:57: +
+
+

Great news and a nice read. Out of curiosity, did you also improve performance for the richards or pystone benchmarks?

+
+
+
+
+ + hubert wrote on 2009-11-14 05:05: +
+
+

this is a very fascinating project and i enjoy reading the blog even if i am not really a computer scientist and don't have a very deep understanding of many details. :)

something i always wonder about... wouldn't it be possible to use genetic algorithms in compiler technology? like a python to machine code compiler that evolves to the fastest solution by itself? or is there still not enough computing power for something like that?

+
+
+
+
+ + pollo wrote on 2009-11-14 11:18: +
+
+

Very interesting. Thanks for all your work!

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2009-11-14 14:57: +
+
+

@Anonymous: Richards and Pystone become less and less important as benchmarks, we are trying to look into more application-like larger things now.

+
+
+
+ +

Düsseldorf Sprint Started

+ +
+

The Düsseldorf sprint starts today. Only Samuele and me are there so far, but that should change over the course of the day. We will mostly work on the JIT during this sprint, trying to make it a lot more practical. For that we need to decrease its memory requirements some more and to make it use less aggressive inlining. We will post more as the sprint progresses.

+
+
+
+
+ + kataton wrote on 2009-11-10 07:39: +
+
+

Looking forward to amazing new developments...

+
+
+
+
+ + Luis wrote on 2009-11-12 12:40: +
+
+

Are you planning a new release anytime soon? (hopefully with JIT?)

+
+
+
+
+ + Anonymous wrote on 2009-11-13 13:48: +
+
+

A release is planned for the February-March timeframe.

/Jacob Hallén

+
+
+
+
+ + Armin Rigo wrote on 2009-11-17 10:08: +
+
+

Actually, I would plan the release for the end of the next sprint, which should be in January.

+
+
+
+ +

PyPy on RuPy 2009

+ +
+

Hello. +

+

+It's maybe a bit late to announce, but there will be a PyPy talk +at the RuPy conference this weekend in +Poznan. Precisely, I'll be talking mostly about PyPy's JIT and +how to use it. Unfortunately the talk is on Saturday, at 8:30 in the morning. +

+

+EDIT: Talk is online, together with examples +

+Cheers,
+fijal +
+
+
+
+ + ulrik wrote on 2009-11-03 19:12: +
+
+

I, and many interested with me, appreciate links to slides, videos or transcripts of the talk once it has been held. PyPy is exciting! Good luck in Poznan.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2009-11-03 21:11: +
+
+

All materials for pypy talks are available in talk directory.

Cheers,
fijal

+
+
+
+ +

Logging and nice graphs

+ +
+

Hi all,

+ +

This week I worked on improving the system we use for logging. Well, it was not really a "system" but rather a pile of hacks to measure in custom ways timings and counts and display them. So now, we have a system :-)

+ +

The system in question was integrated in the code for the GC and the JIT, which are two independent components as far as the source is concerned. However, we can now display a unified view. Here is for example pypy-c-jit running pystone for (only) 5000 iterations:

+ + + +

The top long bar represents time. The bottom shows two summaries of the total time taken by the various components, and also plays the role of a legend to understand the colors at the top. Shades of red are the GC, shades of green are the JIT.

+ +

Here is another picture, this time on pypy-c-jit running 10 iterations of richards:

+ + + +

We have to look more closely at various examples, but a few things immediately show up. One thing is that the GC is put under large pressure by the jit-tracing, jit-optimize and (to a lesser extent) the jit-backend components. So large in fact that the GC takes at least 60-70% of the time there. We will have to do something about it at some point. The other thing is that on richards (and it's likely generally the case), the jit-blackhole component takes a lot of time. "Blackholing" is the operation of recovering from a guard failure in the generated assembler, and falling back to the interpreter. So this is also something we will need to improve.

+ +

That's it! The images were generated with the following commands:

+ +
PYPYLOG=/tmp/log pypy-c-jit richards.py
+python pypy/tool/logparser.py draw-time /tmp/log --mainwidth=8000 --output=filename.png
+ +EDIT: nowadays the command-line has changed to:
python rpython/tool/logparser.py draw-time /tmp/log --mainwidth=8000 filename.png
+
+
+
+
+ + pollo wrote on 2009-11-02 01:09: +
+
+

Nice work.
I think you'll cause a revolution when this project delivers its goals, opening python (and other dynamic languages) to a much wider range of uses.

+
+
+
+
+ + René Dudfield wrote on 2009-11-02 07:56: +
+
+

ooh, pretty graphs :) It's been very good to follow pypy progress through the blog.

Can the gc/jit be made to take up a maximum amount of time, or be an incremental process? This is important for things requiring real time - like games, audio, multimedia, robots, ninjas etc.

A note, that some other languages do gc/jit in other threads. But I imagine, pypy is concentrating on single threaded performance at the moment.

I'm sure you're aware of both those things already, but I'm interested to see what the pypy approach to them is?

cu,

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-13.html b/blog/index-13.html new file mode 100644 index 000000000..d7d6f76f1 --- /dev/null +++ b/blog/index-13.html @@ -0,0 +1,1532 @@ + + + + + + +PyPy (old posts, page 13) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Using CPython extension modules with PyPy natively, or: PyPy can load .pyd files with CPyExt!

+ +
+

PyPy is now able to load +and run CPython extension modules (i.e. .pyd and .so files) natively by using the new CPyExt +subsystem. +Unlike the solution presented in another blog post (where extension modules like +numpy etc. were run on CPython and proxied through TCP), this solution does not require +a running CPython anymore. We do not achieve full binary compatibility +yet (like Ironclad), but recompiling the extension is generally enough.

+

The only prerequisite is that the necessary functions of the C API of CPython are already +implemented in PyPy. If you are a user or an author of a module and miss certain functions +in PyPy, we invite you to implement them. Up until now, a lot of people (including a lot of +new committers) have stepped up and implemented a few functions to get their favorite module +running. See the end of this post for a list of names.

+

Regarding speed, we tried the following: even though there is a bit of overhead when running +these modules, we could run the regular expression engine of CPython (_sre.so) and execute +the spambayes benchmark of the Unladen Swallow benchmark suite (cf. speed.pypy.org) and +experience a speedup: +It became two times faster on pypy-c than with the built-in regular +expression engine of PyPy. From Amdahl's Law it follows that the _sre.so must run several +times faster than the built-in engine.

+

Currently pursued modules include PIL and others. Distutils support is nearly ready. +If you would like to participate or want information on how to use this new feature, come and join +our IRC channel #pypy on freenode.

+

Amaury Forgeot d'Arc and Alexander Schremmer

+

Further CPyExt Contributors:

+
    +
  • Alex Gaynor +
  • +
  • Benjamin Peterson +
  • +
  • Jean-Paul Calderone +
  • +
  • Maciej Fijalkowski +
  • +
  • Jan de Mooij +
  • +
  • Lucian Branescu Mihaila +
  • +
  • Andreas Stührk +
  • +
  • Zooko Wilcox-O Hearn
  • +
+
+
+
+
+ + Anonymous wrote on 2010-04-10 03:38: +
+
+

Holy crap, this is huge! Is it available in the PPA already? I guess this would put all benchmarks past CPython speed (except for outliers like the euler14 thing).

+
+
+
+
+ + Anonymous wrote on 2010-04-10 04:09: +
+
+

Great news! What is the status of numpy/scipy support?

+
+
+
+
+ + Alex wrote on 2010-04-10 06:16: +
+
+

@Anonymous I don't think anyone has started trying to test numpy or scipy yet, however fundamentally it's just a matter of implementing missing functions. For me starting on numpy in my next goal, after PIL.

+
+
+
+
+ + Anonymous wrote on 2010-04-10 08:32: +
+
+

This is very good news. JIT compiled Python can never fully replace extension modules (existing ones, or the need for new ones), so extension support should be a high priority for the PyPy project. I hope you can eventually get rid of that overhead.

+
+
+
+
+ + holger krekel wrote on 2010-04-10 14:41: +
+
+

wow, just coming back from vacation and have to say: great news and great work, guys! Historically speaking, this is the third approach to the "ext" module issue and if the promise works out as it seems to do, probably the last as far as leveraging cpy ext modules are concerned! I wonder - does it still make sense to have "native" extension modules, the ones we currently have as "mixed" modules?

+
+
+
+
+ + Anonymous wrote on 2010-04-10 16:14: +
+
+

Let me ask for a bit more detail. I depend on a module (https://homepages.inf.ed.ac.uk/lzhang10/maxent_toolkit.html), that is currently unsupported, as far as I know. I'd really like to port it to pypy. Where to start?

Is it possible that the module runs without modifications? Can I check this simply by building a pypy-trunk, and write "import cmaxent"?

+
+
+
+
+ + Bartosz SKOWRON wrote on 2010-04-10 16:29: +
+
+

@Anonymous: No it's not in the PPA. We provide only the latest release (1.2 in this case) and weekly builds for trunk (which haven't been announced on the blog yet). CPython extension modules live in their own branch. The branch will be merged into the trunk sooner or later.

PS. The weekly builds are available here at https://launchpad.net/~pypy

+
+
+
+
+ + Alexander Schremmer wrote on 2010-04-10 18:43: +
+
+

@Anonymous

To test your module, you need to compile and load it. For compilation, you can use a compiled pypy binary and run setup.py build_ext with your setup file. For hints about manual compilation and module loading, visit our IRC channel.

+
+
+
+
+ + Alexander Schremmer wrote on 2010-04-10 18:45: +
+
+

@holger

MixedModules allow you to implement modules in RPython (using the PyPy API) and Python at the same time. CPyExt is for modules written in C using the CPython API. So both solutions are for different needs.

+
+
+
+
+ + Unknown wrote on 2010-04-10 20:52: +
+
+

what about embedding pypy? will this work too in the future?

the reason i ask is blender. there were some security concerns among blender developers recently. blender uses embedded cpython for scripting. normal scripts (like exporters) which have to be evoked by the user aren't that much of a problem but blender also supports python expressions for animation parameters. without a sandbox downloading and opening .blend files from unknown sources is kind of risky since a malicious python expression theoretically could wipe your harddisk.

pypy with its support for a sandbox could be a very good replacement for cpython in blender (also because of its speed) but if it isn't compatible with the cpython api then a swap probably would be way too much effort.

+
+
+
+
+ + Alexander Schremmer wrote on 2010-04-10 23:07: +
+
+ @horace
what about embedding pypy?

That should work as easy as extending. +
+
+
+
+ + holger krekel wrote on 2010-04-11 16:54: +
+
+

@alexander True, mixed modules are for rpython-implemented modules and need to be translated together with the pypy interpreter and could make use of the JIT. My question more aimed at the issue for which use cases / goals which kind of extension module mechanism makes sense.
IOW, some discussion and web page regarding rpy-ext/ctypes/cpy-ext would make sense, i guess. Or is it somewhere already?

+
+
+
+
+ + Alexander Schremmer wrote on 2010-04-11 17:03: +
+
+ @holger
some discussion and web page regarding rpy-ext/ctypes/cpy-ext would make sense

Yes, someone could write down guidelines. Using the C API runs your module fast in case of CPython. A bit slower on ironpython and PyPy.

Using ctypes gives your module access to these three interpreters as well, but it will run slower. One advantage here is that you do not need to write C to create a wrapper around a library. If your objective is speed and lower memory usage, then CTypes does not work either.

Mixed modules make your module work only on PyPy and provide a decent speed and a mixture of a decent (Python) and a bit harder to grasp (RPython) programming language. This only makes sense as a platform if your users are also using PyPy. +
+
+
+
+ + René Dudfield wrote on 2010-04-12 13:56: +
+
+

Super awesome! Can't wait to get home and try it out.

+
+
+
+
+ + Gary Robinson wrote on 2010-07-10 12:45: +
+
+

It's a few months later, and I'm wondering what progress has been made. Early comments mentioned that nobody had tried numpy or scipy yet -- has that changed?

Also, does this make the multiprocessing library available? Or, is pp (parallel processing) available?

I'm very excited about PyPy because of the JIT. But for my work I also need some form of utilizing multiple CPU's. Right now I'm using unladen swallow with the multiprocessing module.

+
+
+
+
+ + Anonymous wrote on 2011-04-13 10:07: +
+
+

Yup, I'd love to hear about the progress on this.

+
+
+
+
+ + Anonymous wrote on 2011-05-09 13:40: +
+
+

Any chance this will be released sometime?

+
+
+
+
+ + Alexander Schremmer wrote on 2011-05-09 14:01: +
+
+

It was already released, just check out the current PyPy release.

+
+
+
+ +
+
+
+ + Unknown wrote on 2010-04-09 22:51: +
+
+

Interesting read, thank you. By the way, are there any plans to push for 3.x compatibility?

+
+
+
+
+ + Benjamin Peterson wrote on 2010-04-10 02:59: +
+
+

@Fahrrad The plan is to work towards 2.7 compatibility this summer.

+
+
+
+ +

Introducing nightly builds and ubuntu PPA

+ +
+

Hello. +

+

+We're pleased to announce two things that we were constantly asked for: Nightly builds and Ubuntu PPA for 1.2 release made by Bartosz Skowron. There are no nightly build ubuntu packages (yet). +

+

+Nightly builds are what they are - pure pypy executables with JIT compiled in (for linux only now). They require either a pypy checkout or a release download. The main difference is that by default they display more debugging information than release builds and that they contain recent bugfixes and improvements of course :-) +

+Cheers
+fijal +
+
+
+
+ + ben.b.boyer wrote on 2010-03-25 17:08: +
+
+

great!

+
+
+
+
+ + nekto0n wrote on 2010-03-26 09:39: +
+
+

Niiice =) Using PyPy becomes easier.
Could please disable jit on amd64 or perhaps build 32-bit deb for amd64 machines?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-03-26 17:15: +
+
+

@nek0ton building 32bit JIT for 64bit is hard since you need 32bit libraries. We just don't build nightly 64bit (nor release contained it).

+
+
+
+
+ + nekto0n wrote on 2010-03-26 17:24: +
+
+

@fijal Why so? 32bit libraries are available on ubuntu (with ia32 suffix), kernel is build with 32bit support option. Don't see any problem here.
I understand why not to build 64bit release - JIT is the goal.
P.S. Maybe unavailable amd64 build would force someone to digg and fix that issue? =) Are there any guides available to do it?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-03-26 21:29: +
+
+

the reason is precisely what you described - you need custom libraries linked with special suffix or place which is probably distribution dependent.

+
+
+
+
+ + Unknown wrote on 2010-03-31 12:46: +
+
+

What would it take to make a 64 bit native everything (amd64)?

Btw. I noticed the supported modules list seems to be incomplete at https://pypy.org/compat.html
At least os, subprocess seem to be there even if not listed, probably more?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-03-31 16:44: +
+
+

@harri.

The general answer is that both subprocess and os are written in Python (and not C), so we don't list them. However I wonder how we can list things not to confuse people who don't know that. Any ideas (listing all possible modules is a bit too much).

+
+
+
+
+ + Unknown wrote on 2010-04-07 14:39: +
+
+

If the supported modules is over 50% of all, how about just listing modules that still require work? I suspect many people are unaware that PyPy is getting feature complete, usable for real work.

+
+
+
+
+ + Unknown wrote on 2011-07-20 13:08: +
+
+

Any reason the PPA doesn't have a newer 1.5 build for natty?

+
+
+
+ +
+
+
+ + PAOLO BASSO wrote on 2010-03-20 13:15: +
+
+

First of all congratulations for the great work, I can say I am a newbie in Python world but I follow with interest this project. I tryed the release with the JIT compiler with also the parallel python module and the speed gain is sensible. I compared also the performance with psyco on 3 or 4 benchmarks and it seems that the time for the execution is usually more or less the same. Do you think there will be the possibility again for a massive speed improvement in future releases or the level of max performance is not so far? How much faster could it be in the future?

Thanks,

Paolo

+
+
+
+
+ + Luis wrote on 2010-03-22 20:12: +
+
+

Question:
According to the Computer Language Benchmarks Game, there are three benchmarks that perform way slower in Pypy against Python 3 ( see here: https://shootout.alioth.debian.org/u32/benchmark.php?test=all&lang=pypy&lang2=python3 ).

Those are:
1) reverse-complement
2) regex-dna
3) pidgits

I know that regex-dna performs slower because regex haven't been optimized yet, but what's the reason for the other two? Do they use regex too?

+
+
+
+
+ + Anonymous wrote on 2010-03-24 17:02: +
+
+

@Luis pidigits is about using gmpy for cpython vs longs for pypy. It's a bit apples vs oranges. That said, CPython's longs are still faster than pypy's so we definitely can improve. This are needs some love :)

Reverse complement is string benchmark and I did not look but it might be that the speed of str.translate is suboptimal.

Cheers,
fijal, hiding

+
+
+
+ +

Heroes of the 1.2 Release

+ +
+

Now that the release is done I wanted to list and to thank some people that +were essential in the process of getting it out of the door, particularly +because the work of some of them is not very visible usually.

+

Armin Rigo and Maciej Fijałkowski tirelessly worked on most aspects of +the release, be it fixing the last known bugs and performance problems, +packaging or general wizardry.

+

Amaury Forgeot d'Arc made sure that PyPy 1.2 actually supports Windows as a +platform properly and compiled the Windows binaries.

+

Miquel Torres designed and implemented our new speed overview page, +https://speed.pypy.org which is a great tool for us to spot performance +regressions and to showcase our improvements to the general public.

+

tav designed the new user-oriented web page, https://pypy.org which is a lot +nicer for people that only want to use PyPy as a Python implementation (and not +be confused by how PyPy is actually made).

+

Holger Krekel fixed our main development server codespeak.net, even while +being on vacation and not really having online connectivity. Without that, we +couldn't actually have released anything.

+

Bartosz Skowron worked a lot on making Ubuntu packages for PyPy, which is +really cool. Even though he didn't quite finish in time for the release, we will +hopefully get them soon.

+

Thanks to all you guys!

+
+
+
+
+ + Nicola Larosa wrote on 2010-03-13 10:53: +
+
+

Many thanks to all of you for the hard work, PyPy is shaping up very nicely. :-)

+
+
+
+
+ + Bartosz Skowron wrote on 2010-03-13 14:45: +
+
+

Heh, I would finish the Ubuntu package if i didn't have restricted Internet access (only port 80 is working in the hotel where i'm staying now). please wait till Monday :)

+
+
+
+
+ + Philipp Strube wrote on 2010-03-13 14:54: +
+
+

Awesome. Will try this out for our cloud hosting platform.

+
+
+
+ +

Introducing the PyPy 1.2 release

+ +
+

We are pleased to announce PyPy's 1.2 release. +This version 1.2 is a major milestone and it is the first release to ship +a Just-in-Time compiler that is known to be faster than CPython +(and unladen swallow) on some real-world applications (or the best benchmarks +we could get for them). The main theme for the 1.2 release is speed.

+

The JIT is stable and we don't observe crashes. Nevertheless we would +recommend you to treat it as beta software and as a way to try out the JIT +to see how it works for you.

+

Highlights:

+
    +
  • The JIT compiler.
  • +
  • Various interpreter optimizations that improve performance as well as help +save memory. Read our various blog posts about achievements.
  • +
  • Introducing a new PyPy website at pypy.org made by tav and improved +by the PyPy team.
  • +
  • Introducing speed.pypy.org made by Miquel Torres, a new service that monitors our performance +nightly.
  • +
  • There will be ubuntu packages on PyPy's PPA made by Bartosz Skowron, +however various troubles prevented us from having them as of now.
  • +
+

Known JIT problems (or why you should consider this beta software) are:

+
    +
  • The only supported platform is 32bit x86 for now, we're looking for help with +other platforms.
  • +
  • It is still memory-hungry. There is no limit on the amount of RAM that +the assembler can consume; it is thus possible (although unlikely) that +the assembler ends up using unreasonable amounts of memory.
  • +
+

If you want to try PyPy, go to the download page on our excellent new site +and find the binary for your platform. If the binary does not work (e.g. on +Linux, because of different versions of external .so dependencies), or if +your platform is not supported, you can try building from the source.

+

The PyPy release team,
+Armin Rigo, Maciej Fijalkowski and Amaury Forgeot d'Arc

+

Together with
+Antonio Cuni, Carl Friedrich Bolz, Holger Krekel, Samuele Pedroni and many others.

+
+
+
+
+ + Brian Slesinsky wrote on 2010-03-12 18:37: +
+
+

The front page of the new PyPy site should include some of these caveats about it being beta software; it gives the wrong impression about PyPy's current status.

+
+
+
+
+ + Peter wrote on 2010-03-12 18:50: +
+
+

Congratulations! This is great news!

+
+
+
+
+ + stuaxo wrote on 2010-03-12 20:21: +
+
+

Is it possible to install distribute in this?

+
+
+
+
+ + Martijn Faassen wrote on 2010-03-12 20:34: +
+
+

Congrats! Now to port a lot of software onto this!

+
+
+
+
+ + Anonymous wrote on 2010-03-12 21:50: +
+
+

Congratulations! I've been looking forward to this.

Question: does PyPy have an API for creating native modules?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-03-12 22:25: +
+
+

@Anonymous:

if you mean wrapping C libraries we recommend using ctypes.

+
+
+
+
+ + Unknown wrote on 2010-03-12 23:31: +
+
+

awesome! congratulations!

why is spambayes so slow? does it use regular expressions?

+
+
+
+
+ + Isaac Gouy wrote on 2010-03-13 00:28: +
+
+

Why is there a problem with nbody and itertools ?

pypy temporarily in the benchmarks game.

+
+
+
+
+ + Benjamin Peterson wrote on 2010-03-13 01:15: +
+
+

@horace: yes, regexes are probably the problem.

@Issac: combinations is a 2.6 feature, which we don't support.

+
+
+
+
+ + Isaac Gouy wrote on 2010-03-13 01:33: +
+
+ combinations is a 2.6 feature, which we don't support

Would anyone care to contribute a modified working nbody program to the benchmarks game? ;-) +
+
+
+
+ + Armin Rigo wrote on 2010-03-13 03:11: +
+
+

@Isaac: we have nbody_modified in our benchmarks, source code here.

+
+
+
+
+ + Unknown wrote on 2010-03-13 10:38: +
+
+

Thanks for creating windows binaries! I waited long time for that...

+
+
+
+
+ + René Dudfield wrote on 2010-03-13 12:52: +
+
+

Congrats to all the pypy peoples!

+
+
+
+
+ + Vitéz Gábor wrote on 2010-03-13 13:18: +
+
+

Great work! Keep it up!

+
+
+
+
+ + Shin Guey wrote on 2010-03-13 15:46: +
+
+

I just tried the windows binary.

Oh damn, it is really FAST!!!

3x performance gain...
C:\work\bzr-proj>pypy script.py -t i2d -f longdata.txt
solve parallel
m = 1 total = 128
m = 2 total = 16384
m = 3 total = 2097152
Require M stage: 3
Time taken 00:00:05 (907ms)

C:\work\bzr-proj>python script.py -t i2d -f longdata.txt
solve parallel
m = 1 total = 128
m = 2 total = 16384
m = 3 total = 2097152
Require M stage: 3
Time taken 00:00:15 (093ms)

+
+
+
+
+ + Shin Guey wrote on 2010-03-13 15:52: +
+
+

Forgot about the memory usage, python consume ~4MB and pypy consume ~24MB. Pypy need 6x more memory, but I don't care about this in my script since the performance gain is significant.

I really want to know the pypy vs luajit, I think luajit should be much faster. I am in progress in converting my script to lua but that is painful, my knowledge on lua doesn't match with python.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-03-13 16:11: +
+
+

@shin if you have a comparison to LuaJIT, I would be extremely interested to hear the results! I agree that LuaJIT will likely be faster though.

+
+
+
+
+ + Anonymous wrote on 2010-03-13 19:09: +
+
+

can't wait to test it out!

+
+
+
+
+ + large file transfer wrote on 2010-03-14 14:49: +
+
+

I really want to know the pypy vs luajit, I think luajit should be much faster. I am in progress in converting my script to lua but that is painful, my knowledge on lua doesn't match with python.

+
+
+
+
+ + cjrh wrote on 2010-03-15 07:27: +
+
+

Thanks for windows binaries!

+
+
+
+
+ + Anonymous wrote on 2010-04-09 13:47: +
+
+

Congratulations !

Please mention in the download section that VC2005 redistributables are needed to run it on Win !

+
+
+
+ +

Introducing speed.pypy.org

+ +
+

Hello.

+

Some time ago, we introduced our nightly performance graphs. This was a quick +hack to allow us to see performance regressions. Thanks to Miquel Torres, +we can now introduce https://speed.pypy.org, which is a Django-powered web +app sporting a more polished visualisation of our nightly performance runs.

+

While this website is not finished yet, it's already far better than our previous +approach :-)

+

Details about the announcement on pypy-dev can be found here.

+

If you are interested in having something similar for other benchmark runs, contact Miquel (tobami at gmail).

+

Quoting Miquel: "I would also like to note, that if other performance-oriented +opensource projects are interested, I would be willing to see if we can set-up +such a Speed Center for them. There are already people interested in +contributing to make it into a framework to be plugged into buildbots, software +forges and the like. Stay tuned!"

+
+
+
+
+ + Unknown wrote on 2010-03-03 17:12: +
+
+

Excellent! We really ought to deploy this for unladen, too. Unfortunately, I don't think I'll have the time to get that going. :(

+
+
+
+
+ + Unknown wrote on 2010-03-03 19:48: +
+
+

In my mind PyPy with its JIT will/should eventually get us close to matching or beating Java performance for the non-dynamic subset of python. Would that be a fair statment? If so is there some bench mark that allows us to compare that. What that be usefull?

+
+
+
+
+ + Philip Jenvey wrote on 2010-03-03 22:49: +
+
+

I would love to see this become a Python implementation shootout, a single place where we could compare the speeds of CPython/PyPy/Unladen/Jython/IronPython

+
+
+
+
+ + Anonymous wrote on 2010-03-03 23:09: +
+
+

This is great! It's excellent to see the fruits of the pypy jit work so clearly.

I'd also like to see this in place for other Python implementations.

+
+
+
+
+ + matt harrison wrote on 2010-03-03 23:40: +
+
+

Awesome work. One more feature request: Track memory usage.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-03-03 23:56: +
+
+

@matt

Ok. So I've seen those feature request often enough. These benchmarks are not good for tracking memory usage - they'll simply measure the amount interpreter allocates at the beginning. If you provide better ones, we'll do it.

Cheers,
fijal

+
+
+
+
+ + Unknown wrote on 2010-03-04 04:59: +
+
+

With the JIT would a script that does not use the dynamic aspects of python be able to match the speed of Java?

+
+
+
+
+ + tobami wrote on 2010-03-04 09:46: +
+
+

@Reid: maybe I can help you out setting it up. You could actually even begin saving results to speed.pypy.org right away with minimal configuration changes (though I understand you may prefer to have your own site and DB).

+
+
+
+
+ + tobami wrote on 2010-03-04 09:54: +
+
+

@Philip, Anonymous

The first features are catering to trunk development, which was the most urgent thing.

But my plan all along was to implement a third tab for comparing implementations (among other things. See mailing list announcement for details).

So your wish should come to pass :-)

+
+
+
+
+ + Neil wrote on 2010-03-04 12:08: +
+
+

Neat! I still like the original graphs though, it's nice to see the history for all the benchmarks together.

I think the 'average' is pretty meaningless - it implies that a simple average of all the benchmarks will correspond to the typical real-world speed up you will get using pypy with your existing python code, which I don't think is true.

+
+
+
+
+ + tobami wrote on 2010-03-04 12:20: +
+
+

@Neil
a view showing all timeline graphs at once is also planned.

About the average, of course you can not take from it that pypy-c-jit is nearly 3 times as fast as cpython. Because it depends on the particular choice of benchmarks, which right now is not at all representative of actual real-world usage.

Regardless, it is there so that a developer gets an overall feeling for how a given revision change has affected performance across all benchmarks.

We can't avoid the risk of people reaching wrong conclusions, but that is always the case with statistics, averages and benchmarks ;-)

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-03-04 13:14: +
+
+

@sarvi: reaching the speed of Java is a really non-trivial goal, because Sun's JVM has really been highly optimized over many years. I guess it will take us a long time (if at all) to reach such levels of performance.

+
+
+
+
+ + Unknown wrote on 2010-03-05 04:17: +
+
+

I understand JVM is highly optimized.
And overtime and once yall have more momentum industry funding I am sure your VM will get just as optimized. I am sure Google will pick you guys up soon. I have no doubt about it. Unladen Swallow seems a waste of time once yall get more credibility.

Even then I do expect Dynamic scripting capabilities to perform slower the Java.

I am just hoping that eventually the non-dynamic parts of python will perform on par with Java.

And we can all program in just Python and C. :-))

+
+
+
+
+ + della wrote on 2010-03-06 08:55: +
+
+

Great work! BTW, could it be possible to also have a quick link to the source code of the benchmarks in the website?

+
+
+
+
+ + tobami wrote on 2010-03-06 12:51: +
+
+

@della

yeah, such things are missing right now.

An about page, and possibly an explanation (with links to the code) of each benchmark are probably going to be implemented. Currently there is only tooltip explanations for some.

+
+
+
+
+ + Luis wrote on 2010-03-09 01:28: +
+
+

Another silly question:
AFAIK, the benchmark improvements seen lately are due to the way you measure avergages, by excluding warmup time. Seeing that warmup takes time that may be critical in some situations, I wonder if it's possible to somehow "save" the generated jited code so it can be reused after the first time it's generated.
This way, it would be possible to distribute programs already "warmed up", kind of a compiled version of them. Sorry if this doesn't make sense at all... for a clueless ignorant like me, it does!

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-03-09 17:09: +
+
+

@Luis

Hey. It's a valid option, but it's however at least hard (if not next to impossible). There is work planned on reducing warmup time, so it won't matter that much instead.

Cheers,
fijal

+
+
+
+
+ + stuaxo wrote on 2010-03-09 18:02: +
+
+

It would be nice if the timeline had the date on it (only where the date changes, and the beginning + end).

+
+
+
+
+ + large file transfer wrote on 2010-03-17 17:42: +
+
+

I am sure your VM will get just as optimized. I am sure Google will pick you guys up soon. I have no doubt about it. Unladen Swallow seems a waste of time once yall get more credibility.

+
+
+
+ +

Benchmarking twisted

+ +
+

Hello.

+

I recently did some benchmarking of twisted on top of PyPy. For the very +impatient: PyPy is up to 285% faster than CPython. For more patient people, +there is a full explanation of what I did and how I performed measurements, +so they can judge for themselves.

+

The benchmarks are living in twisted-benchmarks and were mostly written +by Jean Paul Calderone. Even though he called them "initial exploratory +investigation into a potential direction for future development resulting +in performance oriented metrics guiding the process of optimization and +avoidance of complexity regressions", they're still much much better than +average benchmarks found out there.

+

The methodology was to run each benchmark for +quite some time (about 1 minute), measuring the number of requests every 5s. +Then I looked at the dump of data and subtracted the time it took +for JIT-capable interpreters to warm up (up to 15s), averaging +everything after that. Averages of requests per second are in the table below (the higher the better):

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
benchname | CPython | Unladen swallow | PyPy
names | 10930 | 11940 (9% faster) | 15429 (40% faster)
pb | 1705 | 2280 (34% faster) | 3029 (78% faster)
iterations | 75569 | 94554 (25% faster) | 291066 (285% faster)
accept | 2176 | 2166 (same speed) | 2290 (5% faster)
web | 879 | 854 (3% slower) | 1040 (18% faster)
tcp | 105M | 119M (7% faster) | 60M (46% slower)
+

To reproduce, run each benchmark with:

+
+benchname.py -n 12 -d 5
+

WARNING: running tcp-based benchmarks that open a new connection for each +request (web & accept) can exhaust some kernel structures; +limit n or wait until the next run if you see drops in requests per second.

+

The first obvious thing is that various benchmarks are more or less amenable +to speedups by JIT compilation. Accept and tcp get the smallest speedups, if any at +all. This is understandable, since JIT is mostly about reducing interpretation +and frame overhead, which is probably not large when it comes to accepting +connections. However, if you actually loop around, doing something, JIT +can give you a lot of speedup.

+

The other obvious thing is that PyPy is the fastest python interpreter +here, almost across the board (Jython and IronPython won't run twisted), +except for raw tcp throughput. However, speedups can vary and I expect +this to improve after the release, as there are points where PyPy can +be improved. Regarding raw tcp throughput - this can be a problem for +some applications and we're looking forward to improving this particular +bit.

+

The main reason to use twisted for this comparison is a lot of support from +twisted team and JP Calderone in particular, especially when it comes to +providing benchmarks. If some open source project wants to be looked at +by PyPy team, please provide a reasonable set of benchmarks and infrastructure.

+

If, however, you're a closed source project fighting with performance problems +of Python, we provide contracting to investigate how +PyPy, and not only PyPy, can speed up your project.

+

Cheers,
+fijal

+

Benchmark descriptions:

+
    +
  • +names - simple DNS server
  • +
  • +web - simple http hello world server
  • +
  • +pb - perspective broker, RPC mechanism for twisted
  • +
  • +iterations - empty twisted loop
  • +
  • +accept - number of tcp connections accepted per second
  • +
  • +tcp - raw socket transfer throughput
  • +
+

Used interpreters:

+
    +
  • CPython 2.6.2 - as packaged by ubuntu
  • +
  • Unladen swallow svn trunk, revision 1109
  • +
  • PyPy svn trunk, revision 71439
  • +
+

Twisted version used: svn trunk, revision 28580

+

Machine: unfortunately 32bit virtual-machine under qemu, running ubuntu karmic, +on top of Quad core intel Q9550 with 6M cache. Courtesy of Michael Schneider.

+
+
+
+
+ + Alexander Solovyov wrote on 2010-03-01 15:42: +
+
+

Would be nice to see at least rough approximation of amount of RAM used by each implementation. :-)

+
+
+
+
+ + Anonymous wrote on 2010-03-01 18:58: +
+
+

Great as always.

I'm looking forward to use PyPy in production with the next stable release in march. =)

+
+
+
+
+ + Yuri Baburov wrote on 2010-03-01 20:37: +
+
+

Is it possible to run the same tests with CPython+Psyco?
That would be really interesting to see!

+
+
+
+
+ + Tim Parkin wrote on 2010-03-01 20:39: +
+
+

Congrats... things continue to look interesting :-)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-03-01 21:31: +
+
+

@Yuri

No, psyco has limitations on frames that break zope.interface which twisted depends on.

+
+
+
+
+ + Doc Button wrote on 2010-03-02 07:02: +
+
+

I agree with Yuri, it would be of interest to record memory stats for each benchmark run.

+
+
+
+
+ + KoObz wrote on 2010-03-02 19:09: +
+
+

Awesome results Maciej.

Question: what's it gonna take for pypy to supplant Cpython?

You're faster and I'm guessing you have nowhere near the manpower of Cpython. Plus, you're written in Python so future work will be much easier. Seems like a no brainer to embrace pypy.

+
+
+
+
+ + Luis wrote on 2010-03-02 23:04: +
+
+

Question: After having read many comments and posts from pypy's developers lately, I got the impression (I might be wrong though), that you are betting all on tracing for getting speedups, (that the slow interpreter will eventually be compensated by the magic of tracing).
However, other projects that rely on tracing seem to favor a dual approach, which is a traditional method-a-time jit (which can evenly speed up all kinds of code) plus tracing for getting the most of highly numerical code (luajit 2.0, mozila's jaegermonkey, for example).

Is this accurate or I'm wrong? Do you think that the current tracing strategy will eventually get speedups for those benchamarks that are currently on par or way bellow cpython? Or will you have to add a more traditional approach for the baseline?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-03-03 00:14: +
+
+

Hey Luis.

That's a very interesting question. I will try answer couple of your points, but feel free to move to pypy-dev mailing list if you want to continue discussion.

We indeed bet on tracing (or jitting in general) to compensate for slower interpretation than CPython. However, our tracing is far more general than spidermonkeys - for example we can trace a whole function from start and not require an actual loop. We hope to generalize tracing so it can eventually trace all constructs.

The main difference between ahead-of-time and tracing is that tracing requires actual run, while ahead-of-time tries to predict what will happen. Results are generally in favor of tracing, although the variation will be larger (tracing does statistically correct branch prediction, not necesarilly always the correct one).

Regarding benchmarks, most of those benchmarks that we're slower than CPython showcase that our tracing is slow (they don't contain warmup). And again, for some of those we'll just include warmup (like twisted.web which is web server, makes sense in my opinion), for other we'll try to make tracing faster. And again, the speed of tracing is not the property of tracing, but rather pypy's limitation right now.

Some other benchmarks are slow because we don't JIT regular expressions (spambayes). This should be fixed, but it's again unrelated to tracing.

To summarize: I don't expect us trying dual approach (one jit is enough fun, believe me), but instead generalizing tracing and making it more efficient. How this will go, we'll see, I hope pretty well.

Cheers,
fijal

+
+
+
+
+ + Antonio Cuni wrote on 2010-03-03 09:09: +
+
+

@Luis

other than Maciek's points, which I subscribe, it should be said
that, since each language has a different semantics, the
efficiency of a traditional "method-at-a-time" JIT can vary
dramatically. In particular, the dynamism of Python is so deep
that a traditional JIT cannot win much: Jython and IronPython do
exactly that, but for most use cases are slower than CPython. If
you are interested, Chapter 2 of my PhD thesis explores these
topics :-)
https://codespeak.net/svn/user/antocuni/phd/thesis/thesis.pdf

+
+
+
+
+ + Anonymous wrote on 2010-03-10 00:35: +
+
+

great results!
As for the warm-up, would it be possible to save some of the tracing decisions in some file (.pyt?) to help on next startup?
-shai

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-03-10 22:08: +
+
+

@Anonymous

Saving the results is hard, but not impossible. There are other possibilities (like keeping process around) though.

Cheers,
fijal

+
+
+
+
+ + Unknown wrote on 2016-07-30 10:28: +
+
+

May I have ur test code?

+
+
+
+ +

Pycon 2010 report

+ +
+

Hello.

+

Greetings to everybody from Pycon 2010 Atlanta. Right now I'm sitting in +a sprint room with people sprinting on various projects, like CPython, +twisted etc. The conference was really great, and I've seen some good talks, +although I've been too exhausted from my own talks to go to too many. +Probably I should stay away from proposing that many talks for the next pycon :-)

+

The highlight of sprints was that we got a common mercurial repository at python.org for python benchmarks. We might be able to come up with +"the python benchmark suite" which will mostly consist +of simple benchmarks using large python libraries, rather than microbenchmarks. +The repository was started by the Unladen Swallow people and we already +have common commit access among PyPy, CPython, Unladen Swallow, Jython +and Iron Python. We don't yet have a common place to run benchmarks, +but we should be able to fix that soon.

+

Regarding the talks, there are online videos for +How to write cross-interpreter python programs and Speed of PyPy talks, +among other talks from Pycon. +There should be a video for my short keynote shortly.

+

The talks were well received as there is interest in PyPy's progress.

+

+

+

Cheers,
+fijal

+
+
+
+
+ + Luis wrote on 2010-02-24 20:53: +
+
+

Hi, I just wanted to say that there's something wrong with the PLOT ONE graphic. The speedups are expressed by horizontal lines (each one is 2x). The third line shows 8x instead of 6x.

+
+
+
+
+ + Anonymous wrote on 2010-02-25 00:56: +
+
+

It was nice meeting you. I hope you have fun in South Africa :)

Antoine.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-02-25 04:43: +
+
+

@Luis

It's called a logarithmic scale. It means you get 2x 4x 8x 16x etc.

+
+
+
+
+ + Luis wrote on 2010-02-25 20:40: +
+
+

@Fijal
I see... please excuse my ignorance :-)

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-14.html b/blog/index-14.html new file mode 100644 index 000000000..fc3e5aa29 --- /dev/null +++ b/blog/index-14.html @@ -0,0 +1,1886 @@ + + + + + + +PyPy (old posts, page 14) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

EuroPython 2010 report

+ +
+

So, EuroPython 2010 is over, I am flying home and it's time to write a report +about the conference from the PyPy point of view.

+

As usual, the conference was very interesting and went very well. The quality +of the talks I attended was high on average and most importantly I could +meet a lot of interesting people to discuss various things.

+

On the first day, Armin, Amaury and I presented the usual PyPy status talk +(here are the slides): +the talk is an extended version of the one that Armin and I presented at +Pycon Italia in May and is divided into three parts: first I talked about the +current status of the project, what is the content of the recent 1.2 and 1.3 +releases and showed a demo of a simple Django application that renders a +Mandelbrot fractal and is measurably faster on PyPy than on CPython. In the +second part of the talk, Armin gave an introduction about the ideas that stand +behind the JIT. Finally, in the third part Amaury explained how the new +cpyext module lets PyPy compile and load existing CPython extensions +written in C.

+

I think that the talk was well received: the only drawback is that there was +no time to answer questions at the end of the presentation. However, we +received a lot of "offline" questions after the talk finished and throughout the +whole conference: it is always great to see that people are interested in our +work, and I'd like to thank everybody for the feedback that they gave to us.

+

PyPy was also mentioned in Mark Shannon's interesting talk, where he +compared the optimization techniques used by PyPy, Unladen Swallow and +HotPy, which is Mark's own PhD project. Moreover, Henrik Vendelbo +gave a talk about how to tweak PyPy to produce a standalone +executable which embeds a whole python application to make deployment easier, +while Andrew Francis explained his implementation of the Go select +statement based on the stackless.py module implemented in PyPy. Personally, +I am glad to see that people start to think of PyPy as a useful starting +point to experiment with new features and use cases that we did not think +about: after all, one of PyPy's explicit goals is to be "flexible and easy to +experiment with".

+

After the conference there were the usual post EuroPython sprints: this +year we had not planned a PyPy sprint, but some people showed interest +in it and since Armin and I happened to be still around the day after the +conference, we decided to do a mini 1-day sprint, with 6 or 7 people +present. Since there were only two core developers it was impossible to use +our usual pairing scheme, in which every newcomer pairs with someone who is +experienced with the source code to gain knowledge of it. However, I think it +was still a successful day of work, and we managed to fix a couple of bugs +that were standing in our issue tracker. Again, I'd like to thank all the +people that came and worked with us during the sprint.

+

In conclusion I really enjoyed the EuroPython 2010 experience: the fact that I +managed to find a place in Birmingham where to eat a good Italian-style "gelato" +helped a lot :-).

+
+
+
+
+ + Anonymous wrote on 2010-07-26 12:22: +
+
+

Just awesome.

+
+
+
+
+ + Paul Boddie wrote on 2010-08-09 23:03: +
+
+

Finding gelato hopefully won't be a problem at next year's EuroPython. ;-)

+
+
+
+ +

CERN Sprint Report – Wrapping C++ Libraries

+ +
+

The last five days we have been sprinting in a meeting room in the Computing +Center at CERN in Genève, Switzerland. Present are Armin Rigo, Antonio Cuni, +Carl Friedrich Bolz and Wim Lavrijsen (LBL). The goal of the sprint was to use +some of the C++ technology developed at CERN to make it possible to use C++ +libraries from PyPy's Python interpreter. For this we used the Reflex +library, which provides reflection information for C++ classes. We discussed +using Reflex in PyPy during the Düsseldorf sprint of 2008, please read +that blog post if you want some more details on how Reflex works. There is +support for this sort of C++/Python integration also for CPython, using the +PyROOT module.

+

The sprint was very successful. On Monday we had a few discussions about how +Reflex could best be integrated with PyPy. One of the goals of the sprint was to +make the approach JIT-friendly from the start, so that calls to C++ libraries +can be reasonably fast. After the discussion we started coding on the +reflex-support branch. This branch adds a new cppyy builtin module to +PyPy's Python interpreter (why we chose that name is left as an exercise to the +reader). This module can be used to load C++ classes, construct instances and +call static and instance methods on them.

+

The work has just started: as of now, the argument and return types of the +methods are restricted to some simple C types, such as int, double and +char* and pointers to class instances. Most of the work necessary to +properly resolve overloaded methods is done, but support for default arguments is not.

+

As an example, suppose there is a C++ class like this:

+
class example01 {
+private:
+    static int count;
+    int somedata;
+public:
+
+    example01(int a) : somedata(a) {
+        count++;
+    }
+    ~example01() {
+        count--;
+    }
+    static int getCount() {
+        return count;
+    }
+
+    int addDataToInt(int a) {
+        return somedata + a;
+    }
+};
+int example01::count = 0;
+
+

You can now use it from PyPy's Python interpreter in the following way, after +you have used Reflex to generate reflection information for the class:

+
import cppyy
+cppyy.load_lib("example01Dict.so") # contains the Reflex information
+example01_class = cppyy.gbl.example01
+instance = example01_class(7)
+assert example01_class.getCount() == 1
+res = instance.addDataToInt(4)
+assert res == 11
+res = instance.addDataToInt(-4)
+assert res == 3
+instance.destruct() # so far explicit destruction needed
+assert example01_class.getCount() == 0
+
+

We also did some very early JIT work and some early performance measurements. +The rough figures are that cppyy is two times faster at calling a simple C++ +method from Python than PyROOT. To get a feeling for how fast things could +go in the end, we also implemented a proof-of-concept for some more advanced JIT +technology (which requires a patch for Reflex and uses a GCC extension). With +this, the speedup over PyROOT is a factor of 20. Of course, this is still a +lot slower than a C++ to C++ method call (probably by at least an order of +magnitude).

+

The sprint was very productive because we managed to get the right people into +the same room working together. Wim has a lot of experience with C++ and Reflex, +and is the author of PyROOT, and of course the others know a lot about PyPy +(at the end of the sprint, Anto was very glad that he stopped using C++ a long +time ago). Also, working at CERN was very cool. The atmosphere is amazing, and +we got to visit the ATLAS control room. Extremely advanced technology, and +also research on a completely different scale than what we are used to.

+
+

Comparing SPUR to PyPy

+ +
+

Recently, I've become aware of the SPUR project of Microsoft Research and +read some of their papers (the tech report "SPUR: A Trace-Based JIT Compiler +for CIL" is very cool). I found the project to be very interesting and since +their approach is in many ways related to what PyPy is doing, I now want to +compare and contrast the two projects.

+
+

A Tracing JIT for .NET

+

SPUR consists of two parts: On the one hand it is a VM for CIL, the +bytecode of the .NET VM. This VM uses a tracing JIT compiler to compile the +programs it is running to machine code. As opposed to most existing VMs that +have a tracing JIT it does not use an interpreter at all. Instead it +contains various variants of a JIT compiler that produce different versions of +each method. Those are:

+
    +
  • a profiling JIT which produces code that does lightweight profiling when +running the compiled method
  • +
  • a tracing JIT which produces code that produces a trace when running the +compiled method
  • +
  • a transfer-tail JIT which is used to produce code which is run to get from a +failing guard back to the normal profiling version of a method
  • +
  • an optimizing JIT that actually optimizes traces and turns them into machine code
  • +
+
+

Optimizations Done by the Optimizing JIT

+

SPUR's optimizing JIT does a number of powerful optimizations on the traces before it +turns them into machine code. Among them are usual compiler optimizations such +as register allocation, common subexpression elimination, loop invariant code +motion, etc.

+

It also performs some optimizations that are specific to the tracing context and +are thus not commonly found in "normal" compilers:

+
    +
  • +guard implication: if a guard is implied by an earlier guard, it is removed
  • +
  • +guard strengthening: if there is a sequence of guards that become stronger +and stronger (i.e. each guard implies the previous one), the first guard in +the sequence is replaced by the last one, and all others are removed. This can +greatly reduce the number of guards and is generally safe. It can shift a +guard failure to an earlier point in the trace, but the failure would have +occurred at some point in the trace anyway.
  • +
  • +load/store optimizations: this is an optimization for memory reads/writes. +If several loads from the same memory location occur without writes in +between, all but the first one are removed. Similarly, if a write to a memory +location is performed, this write is delayed as much as possible. If there is +a write to the same location soon afterwards, the first write can be removed.
  • +
  • +escape analysis: for allocations that occur in a loop, the optimizer checks +whether the resulting object escapes the loop. If not, the allocation is moved +before the loop, so that only one object needs to be allocated, instead of one +every loop iteration.
  • +
  • +user-controlled loop unrolling: not exactly an optimization, but an +interesting feature anyway. It is possible to annotate a CIL method with a +special decorator [TraceUnfold] and then the tracing JIT will fully unroll +the loops it contains. This can be useful for loops than are known to run a +small and fixed number of iterations for each call-site.
  • +
  • +user controlled tracing: The user can also control tracing up to a point. +Methods can be annotated with [NativeCall] to tell the tracer to never +trace their execution. Instead they appear as a direct call in the trace.
  • +
+
+
+
+

A JavaScript Implementation

+

In addition to the tracing JIT I just described, SPUR also contains a JavaScript +implementation for .NET. The approach of this implementation is to translate +JavaScript to CIL bytecode, doing some amount of type inference to detect +variables that have fixed types. All operations where no precise type could be +determined are implemented with calls to a JavaScript runtime system, which does +the necessary type dispatching. The JavaScript runtime is implemented in C#.

+

The JavaScript implementation and the CLI VM with a tracing JIT sound quite +unrelated at first, but together they amplify each other. The tracing JIT traces +the JavaScript functions that have been translated to CIL bytecode. Since the +JavaScript runtime is in C#, it exists as CIL bytecode too. Thus it can be +inlined into the JavaScript functions by the tracer. This is highly beneficial, +since it exposes the runtime type dispatching of the JavaScript operations to +the optimizations of the tracing JIT. Particularly the common subexpression +elimination helps the JavaScript code. If a series of operations is performed on +the same object, the operations will all do the same type checks. All but the +type checks of the first operation can be removed by the optimizer.

+
+

Performance Results

+

The speed results of the combined JavaScript implementation and tracing JIT are +quite impressive. It beats TraceMonkey for most benchmarks in SunSpider (apart +from some string-heavy benchmarks that are quite slow) and can compete with V8 +in many of them. However, all this is steady-state performance and it seems +SPUR's compile time is rather bad currently.

+
+
+

Further Possibilities

+

A further (so far still hypothetical) advantage of SPUR is that the approach can +optimize cases where execution crosses the border of two different systems. If +somebody wrote an HTML layout engine and a DOM in C# to get a web browser and +integrated it with the JavaScript implementation described above, the tracing +JIT could optimize DOM manipulations performed by JavaScript code as well as +callbacks from the browser into JavaScript code.

+

Of course the approach SPUR takes to implement JavaScript is completely +generalizable. It should be possible to implement other dynamic languages in the +same way as JavaScript using SPUR. One would have to write a runtime system for +the language in C#, as well as a compiler from the language into CIL bytecode. +Given these two elements, SPUR's tracing JIT compiler would probably do a +reasonable job at optimizing this other language (of course in practise, the +language implementation would need some tweaking and annotations to make it +really fast).

+
+
+
+

Comparison With PyPy

+

The goals of PyPy and SPUR are very similar. Both projects want to implement +dynamic languages in an efficient way by using a tracing JIT. Both apply the +tracing JIT "one level down", i.e. the runtime system of the dynamic language is +visible to the tracing JIT. This is the crucial point of the approach of both +projects. Since the runtime system of the dynamic language is visible to the +tracing JIT, the JIT can optimize programs in that dynamic language. It does not +itself need to know about the semantics of the dynamic language. This makes the +tracing JIT usable for a variety of dynamic languages. It also means that the +two halves can be implemented and debugged independently.

+

In SPUR, C# (or another language that is compilable to CIL) plays the role of +RPython, and CIL is equivalent to the intermediate format that PyPy's +translation toolchain uses. Both formats operate on a similar abstraction level, +they are quite close to C, but still have support for the object system of their +respective language and are garbage-collected.

+

SPUR supports only a JavaScript implementation so far, which could maybe change in +the future. Thus JavaScript in SPUR corresponds to Python in PyPy, which was the +first dynamic language implemented in PyPy (and is also the reason for PyPy's +existence).

+

There are obviously also differences between the two projects, although many of +them are only skin-deep. The largest difference is the reliance of SPUR on +compilers on all levels. PyPy takes the opposite approach of using interpreters +almost everywhere. The parts of PyPy that correspond to SPUR's compilers are (I +will use the Python implementation of PyPy as an example):

+
    +
  • the JavaScript-to-CIL compiler corresponds to the Python interpreter of PyPy
  • +
  • the profiling JIT corresponds to a part of PyPy's translation toolchain +which adds some profiling support in the process of turning RPython code into +C code,
  • +
  • the tracing JIT corresponds to a special interpreter in the PyPy JIT which +executes an RPython program and produces a trace of the execution
  • +
  • the transfer-tail JIT corresponds to PyPy's blackhole interpreter, also +called fallback interpreter
  • +
  • the optimizing JIT corresponds to the optimizers and backends of PyPy's JIT
  • +
+
+

PyPy's Optimizations

+

Comparing the optimizations that the two projects perform, the biggest +difference is that PyPy does "trace stitching" instead of fully supporting trace +trees. The difference between the two concerns what happens when a new trace +gets added to an existing loop. The new trace starts from a guard in the +existing loop that was observed to fail often. Trace stitching means that the +loop is just patched with a jump to the new trace. SPUR instead recompiles the +whole trace tree, which gives the optimizers more opportunities, but also makes +compilation a lot slower. Another difference is that PyPy does not perform +loop-invariant code motion yet.

+

Many of the remaining optimizations are very similar. PyPy supports guard +implication as well as guard strengthening. It has some load/store +optimizations, but PyPy's alias analysis is quite rudimentary. On the other +hand, PyPy's escape analysis is very powerful. PyPy also has support for the +annotations that SPUR supports, using some decorators in the pypy.rlib.jit +module. User-controlled loop unrolling is performed using the unroll_safe +decorator, tracing of a function can be disabled with the dont_look_inside +decorator.

+

PyPy has a few more annotations that were not mentioned in the SPUR tech report. +Most importantly, it is possible to declare a function as pure, using the +purefunction decorator. PyPy's optimizers will remove calls to a function +decorated that way if the arguments to the call are all constant. In addition it +is possible to declare instances of classes to be immutable, which means that +field accesses on constant instances can be folded away. Furthermore there is +the promote hint, which is spelled x = hint(x, promote=True). This will +produce a guard in the trace, to turn x into a constant after the guard.

+
+
+
+

Summary

+

Given the similarity between the projects' goals, it is perhaps not so +surprising to see that PyPy and SPUR have co-evolved and reached many similar +design decisions. It is still very good to see another project that does many +things in the same way as PyPy.

+
+
+
+
+
+ + Anonymous wrote on 2010-07-04 09:27: +
+
+

Besides being similar projects, is it possible to cross the streams? Could PyPy's CLI backend take the place of the JavaScript-to-CIL compiler (or is that the wrong parallel)?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-07-04 21:55: +
+
+

@Anonymous: I guess you could program stuff in RPython, compile to CIL and get it jitted by SPUR. However, this wouldn't work well for our main RPython programs, which are all interpreters. Using a tracing JIT on an interpreter without doing anything special is not helping much.

+
+
+
+ +

"Blackhole" interpreter

+ +
+

Hi all,

+ +

Here are a few words about the JIT's "great speedup in compiling +time" advertized on the PyPy 1.3 release (see the + +previous blog post). +The exact meaning behind these words needs a fair bit of +explanation, so here it is in case you are interested.

+ +

If you download a version of PyPy 1.3 that includes a JIT +compiler, you get an executable that could be qualified as rather +fat: it actually contains three interpreters. You have on the +one hand the regular Python interpreter. It is here because it's +not possible to JIT-compile every single piece of Python code you +try to run; only the most executed loops are JIT-compiled. They +are JIT-compiled with a tracing interpreter that operates one +level down. This is the second interpreter. This tracing step +is quite slow, but it's all right because it's only invoked on +the most executed loops (on the order of 100 to 1000 times in +total in a run of a Python script that takes anyway seconds or +minutes to run).

+ +

So apart from the JIT compilation itself, we have two worlds in +which the execution proceeds: either by regular interpretation, +or by the execution of assembler code generated by the JIT +compiler. And of course, we need to be able to switch from one +world to the other quickly: during regular interpretation we have +to detect if we already have generated assembler for this piece +of code and if so, jump to it; and during execution of the +assembler, when a "guard" fails, i.e. when we meet a path of +execution for which we did not produce assembler, then we need to +switch back to regular interpretation (or occasionally invoke the +JIT compiler again).

+ +

Let us consider the cost of switching from one world to another. +During regular interpretation, if we detect that we already have +assembler corresponding to this Python loop, then we just jump to +it instead of interpreting the Python loop. This is fairly +cheap, as it involves just one fast extra check per Python loop. +The reverse is harder because "guard" failures can occur at any +point in time: it is possible that the bit of assembler that we +already executed so far corresponds to running the first four and a half +Python opcodes of the loop. The guard that failed just now +is somewhere in the middle of interpreting the fifth opcode -- say, +multiplying these two Python objects.

+ +

It's almost impossible to just "jump" at the right place in the +code of the regular interpreter -- how do you jump inside a +regular function compiled in C, itself in a call chain, resuming +execution of the function from somewhere in the middle?

+ +

So here is the important new bit in PyPy 1.3. Previously, what +we would do is invoke the JIT compiler again in order to follow +what needs to happen between the guard failure and the real end +of the Python opcode. We would then throw away the trace +generated, as the only purpose was to finish running the current +opcode. We call this "blackhole interpretation". After the end +of the Python opcode, we can jump to the regular interpreter +easily.

+ +

Doing so was straightforward, but slow, in case it needs to be +done very often (as is the case in some examples, but not all). +In PyPy 1.3, this blackhole interpretation step has been +redesigned as a time-critical component, and that's where the +third interpreter comes from. It is an interpreter that works +like the JIT compiler, but without the overhead of tracing (e.g. +it does not need to box all values). It was designed from the +ground up for the sole purpose of finishing the execution of the +current Python opcode. The bytecode format that it interprets is +also new, designed for that purpose, and the JIT compiler itself +(the second interpreter) was adapted to it. +The old bytecode format in PyPy 1.2 is gone +(it was more suited for the JIT compiler, but less for blackhole +interpretation).

+ +

In summary, it was a lot of changes in the most front-end-ish +parts of the JIT compiler, even though it was mostly hidden +changes. I hope that this longish blog post helped bring it a +bit more to the light :-)

+
+
+
+
+ + GRon wrote on 2010-06-26 21:06: +
+
+

Interesting, is there any documentation for the different bytecode sets you have/had?

I would be especially interested in the differences, and the reasons for those design decisions.

+
+
+
+
+ + Armin Rigo wrote on 2010-06-26 23:11: +
+
+

I fear not. The bytecode set is quite custom, made to represent RPython code, which is at the level (roughly speaking) of Java -- with a few additional instructions to guide the JIT compiler. The latest version uses a register-based machine, which is more convenient than a Java-like stack-based approach starting from the control flow graphs of RPython functions. It has three independent sets of registers: integers, pointers, and floating-point (pointers are different from integers at this level because the GC needs to track them and possibly move them). Register numbers are encoded in one byte, so there is room for 256 registers of each kind, but in practice doing a simple register allocation step on each graph means that no bytecode ends up using more than ~15 registers. A few parts are needed only by the JIT compiler and not by the blackhole interpreter; these are encoded "off-line" to avoid slowing down the blackhole interpreter.

Well, I could talk at length about all the details of the format, but in truth there is nothing very deep there :-) See the comments in https://codespeak.net/svn/pypy/trunk/pypy/jit/codewriter/codewriter.py as well as the tests like test/test_flatten.py and test/test_regalloc.py.

+
+
+
+
+ + Zeev wrote on 2010-06-27 01:40: +
+
+

Does the PyPy JIT replace a running interpreted loop with a compiled one mid-run or only on the next iteration or only the next time this loop starts?

Is there a way to ask the PyPy interpreter to tell me what it jitted as it ran some code?

Or will it be too difficult for me to relate the produced machine code with my python source code (because it's not a straightforward method jit)?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-06-27 17:00: +
+
+

Hi Zeev.

Only at the next iteration of the loop. However, you have to have at least ~1000 iterations before it happens.

There is a variety of tools that we use for inspecting generated loops. There is no programmable interface from python yet, but there are some external tools.

Run: PYPYJITLOG=jit-log-opt:log pypy

and you'll get a file log which contains all the loops. There are tools in the source checkout pypy/jit/tool, loopviewer.py, showstats.py and traceviewer.py which can help you viewing those loops. They'll contain debug_merge_points which are with info about python opcodes (including functions and file), but they can span several functions. Have fun :)

If you want more info, drop by on #pypy at irc.freenode.net.

Cheers,
fijal

+
+
+
+
+ + Luis wrote on 2010-06-30 02:20: +
+
+

Is this Blackhole interpreter the Jaegermonkey of pypy?

+
+
+
+
+ + Armin Rigo wrote on 2010-07-08 16:26: +
+
+

Luis: no.

+
+
+
+
+ + Antonio Cuni wrote on 2010-07-08 16:47: +
+
+

@Luis: just to expand a bit Armin's answer :-).

Jaegermonkey is a method-by-method compiler that Tracemonkey uses *before* the tracing compiler enters in action. In pypy, this is equivalent to the normal Python interpreter that profiles your loops to find the hot ones, with the obvious difference that Jaegermonkey is a compiler, while ours is an interpreter.

The blackhole interpreter is something that it's used internally by our tracing jit compiler, and AFAIK it has no equivalent in tracemonkey.

+
+
+
+
+ + Luis wrote on 2010-07-08 23:27: +
+
+

I see. Thanks Armin and Antonio.

+
+
+
+ +

PyPy 1.3 released

+ +
+

Hello.

+

We're pleased to announce the release of PyPy 1.3. This release has two major +improvements. First of all, we stabilized the JIT compiler since the 1.2 release, +answered user issues, fixed bugs, and generally improved speed.

+

We're also pleased to announce alpha support for loading CPython extension +modules written in C. While the main purpose of this release is increased +stability, this feature is in alpha stage and it is not yet suited for +production environments.

+
+

Highlights of this release

+
    +
  • +

    We introduced support for CPython extension modules written in C. As of now, +this support is in alpha, and it's very unlikely unaltered C extensions will +work out of the box, due to missing functions or refcounting details. The +support is disabled by default, so you have to do:

    +
    +import cpyext
    +
    +

    before trying to import any .so file. Also, libraries are source-compatible +and not binary-compatible. That means you need to recompile binaries, using +for example:

    +
    +pypy setup.py build
    +
    +

    Details may vary, depending on your build system. Make sure you include +the import cpyext line shown above at the beginning of setup.py or put it in your PYTHONSTARTUP.

    +

    This is an alpha feature. It'll likely segfault. You have been warned!

    +
  • +
  • +

    JIT bugfixes. A lot of bugs reported for the JIT have been fixed, and its +stability greatly improved since the 1.2 release.

    +
  • +
  • +

    Various small improvements have been added to the JIT code, as well as a great +speedup of compiling time.

    +
  • +
+
+

+Cheers,
+Maciej Fijalkowski, Armin Rigo, Alex Gaynor, Amaury Forgeot d'Arc and the PyPy team +

+

+Update: The correct command to build extensions is "pypy setup.py build", not "python setup.py build" as was stated before.

+
+
+
+
+ + Isaac Gouy wrote on 2010-06-27 23:18: +
+
+

fyi benchmarks game

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-06-28 07:04: +
+
+

Thanks. I don't think we improved in any of the areas measured by those benchmarks (even if, only by a tiny bit).

Cheers,
fijal

+
+
+
+ +

A JIT for Regular Expression Matching

+ +
+

This is part 2 of a series, see Part 1 for an introduction. In this post +I want to describe how the JIT generator of the PyPy project can be used to turn +the elegant but not particularly fast regular expression matcher from the first +part into a rather fast implementation. In addition, I will show some speed +measurements against various regular expression implementations.

+

Again, note the disclaimer: This technology could not easily be used +to implement Python's re-module.

+
+

Example Expression and First Numbers

+

The regular expression I will use as an example in the rest of this post is +the expression (a|b)*a(a|b){20}a(a|b)*. It matches all strings that have two +a with exactly 20 characters between them. This regular expression has +the property that the corresponding DFA needs 2**(n+1) different states, where n is the number of characters between the two a (here n = 20). As +an input string, we use a random string (of varying lengths) that does not +match the regular expression. I will give all results as number of chars matched +per second. While this is not a particularly typical regular expression, it +should still be possible to get some ballpark numbers for the speeds of various +implementations – as we will see, the differences between implementations are +huge anyway.

+

All the benchmarks were performed on my laptop, which has an Intel Core2 Duo +P8400 processor with 2.26 GHz and 3072 KB of cache on a machine with 3GB RAM +running Ubuntu Linux 10.04.

+

To get a feeling for the orders of magnitude involved, the CPython re module +(which is implemented in C and quite optimized) can match 2'500'000 chars/s. +Google's new re2 implementation still matches 550'000 chars/s. Google's +implementation is slower, but their algorithm gives complexity and space +guarantees similar to our implementation in the last blog post.

+

On the other end of the performance scale is the pure-Python code from the last +blog post running on CPython. It can match only 12'200 chars/s and is thus 200 +times slower than the re module.

+
+
+

Translating the Matcher

+

The code described in the last blog post is not only normal Python code, but +also perfectly valid RPython code. Nothing particularly dynamic is going on in +the code, thus it can be translated with PyPy's translation toolchain to C code. +The resulting binary is considerably faster and can match 720'000 chars/s, 60 +times faster than the untranslated version.

+

Another approach is to write equivalent versions of the algorithms in lower +level languages. This has been done for C++ by Sebastian Fischer and for Java by +Baltasar Trancón y Widemann. The algorithm is object-oriented enough to be +mapped very closely to the respective languages. The C++ version is +a little bit faster than the RPython version translated to C, at 750'000 chars/s. That's +not very surprising, given their similarity. The Java version is more than twice +as fast, with 1'920'000 chars/s. Apparently the Java JIT compiler is a lot +better at optimizing the method calls in the algorithm or does some other +optimizations. One reason for this could be that the Java JIT can assume that +the classes it sees are all there are (and it will invalidate the generated +machine code if more classes are loaded), whereas the C++ compiler needs to +generate code that works even in the presence of more regular expression +classes.

+
+
+

Generating a JIT

+

To get even more performance out of the RPython code, it is possible to generate +a JIT for it with the help of the PyPy translation toolchain. To do this, the +matching code needs to be extended somewhat by some hints that tell PyPy's JIT +generator how this is to be done. The JIT generator can automatically produce a +JIT compiler from an RPython interpreter of the source language. In our case, +we view the regular expression matcher as an interpreter for regular +expressions. Then the match function corresponds to the +dispatch loop of a traditional interpreter.

+

Our regular expression matcher is a very peculiar interpreter. The matcher +works by running exactly one loop (the one in match) as many times as the +input string is long, irrespective of the "program", i.e. the particular +regular expressions. In addition, within the loop there are no conditions (e.g. +if statements) at all, it is just linear code. This makes it almost perfectly +suited +to the JIT generator, which produces a tracing JIT. A tracing JIT compiles the +hot loops of a program (i.e. regular expression) and has to do extra work if +there are conditions in the loop. In our case, there is exactly one loop per +regular expression, without any condition.

+
+

JIT Hints

+

The hints that are needed for the match function of the last blog post can +be seen here (the function is slightly rewritten, e.g. the JIT only +properly supports a while loop as the main dispatch loop):

+
jitdriver = jit.JitDriver(reds=["i", "result", "s"], greens=["re"])
+
+def match(re, s):
+    if not s:
+        return re.empty
+    # shift a mark in from the left
+    result = re.shift(s[0], 1)
+    i = 1
+    while i < len(s):
+        jitdriver.can_enter_jit(i=i, result=result, s=s, re=re)
+        jitdriver.jit_merge_point(i=i, result=result, s=s, re=re)
+        # shift the internal marks around
+        result = re.shift(s[i], 0)
+        i += 1
+    re.reset()
+    return result
+
+

The jitdriver is an instance describing the data of the interpreter we are +dealing with. The arguments to the constructor need to list all local variables +of the dispatch loop. The local variables are classified into two classes, red +ones and green ones. The green ones hold the objects that make up the program +that the interpreter currently runs and which position in the program is +currently being executed. In a typical bytecode interpreter, the bytecode object +and the program counter would be green. In our case, the regular expression is +the program, so it is green. The rest of the variables are red.

+

The green variables are treated specially by the JIT generator. At runtime, for +a given value of the green variables, one piece of machine code will be +generated. This piece of machine code can therefore assume that the value of +the green variable is constant.

+

There are two additional hints, which are method calls on the +jitdriver instance. The jit_merge_point method marks the beginning of +the main interpreter loop. The can_enter_jit function marks the point where +a loop in the user program can be closed, which in our case is trivial, it's +just at the end of the interpreter loop (for technical reasons it is put at the beginning, because nothing must happen between the can_enter_jit and jit_merge_point invocations).

+

Those are the hints that the JIT generator needs to function at all. We added +some additional hints, that give the JIT generator more information to work +with. Those hints are immutability information, which means that certain +instance fields can not be changed after the object has been constructed. Apart +from the marked field, none of the fields of any of the Regex subclasses +can change. For example for the Char class this is expressed in the +following way:

+
class Char(Regex):
+    _immutable_fields_ = ["c"]
+    def __init__(self, c):
+        ...
+
+

These hints allow the generated JIT to constant-fold reads out of the immutable +fields in some situations.

+
+
+

Adaptions to the Original Code

+

In the introduction above I wrote that the code within the loop in match +uses no conditions. It is indeed true that none of the _shift methods +have an if statement or similar. However, there are some hidden conditions +due to the fact that the and and or boolean operators are used, which +are short-circuiting. Therefore the JIT-version of the code needs to be adapted +to use the non-short-circuiting operators & and |.

+
+
+

JIT Example

+

To get an impression of what the generated machine code looks like, consider the +regular expression (a|b)*. As regular expression objects this would be +Repetition(Alternative(Char('a'), Char('b'))). The machine code in its intermediate, +machine-independent form looks as follows (I have slightly cleaned it up and +added comments for clarity):

+
# arguments of the loop
+# i0 is i in the match function
+# result0 is result in the match function
+# s0 is s in the match function
+[i0, result0, s0] # those are the arguments to the machine code
+char = s0[i0] # read the character
+# read the current mark:
+i5 = ConstPtr(ptr_repetition).marked
+i7 = char == 'a' # is the character equal to 'a'
+i8 = i5 & i7
+i10 = char == 'b' # is the character equal to 'b'
+i11 = i5 & i10
+# write new mark
+ConstPtr(ptr_chara).marked = i8
+i13 = i8 | i11
+# write new mark
+ConstPtr(ptr_charb).marked = i11
+# write new mark
+ConstPtr(ptr_alternative).marked = i13
+# increment the index
+i17 = i0 + 1
+i18 = len(s0)
+# write new mark
+ConstPtr(ptr_repetition).marked = i13
+# check that index is smaller than the length of the string
+i19 = i17 < i18
+if not i19:
+    go back to normally running match
+jump(i17, i13, s0) # start from the top again
+
+

The various ConstPtr(ptr_*) denote constant addresses of parts of the regular +expression tree:

+
    +
  • +ptr_repetition is the Repetition +
  • +
  • +ptr_chara is Char('a') +
  • +
  • +ptr_charb is Char('b') +
  • +
  • +ptr_alternative is the Alternative +
  • +
+

Essentially the machine code reads the next char out of the string, the current +mark out of the Repetition and then performs some boolean operations on +those, writing back the new marks. Note in particular how the generated +machine code does not need to do any method calls to shift and _shift and +that most field reads out of the regular expression classes have been optimized +away, because the fields are immutable. Therefore the machine code does not +need to deconstruct the tree of regular expression objects at all, it just +knows where in memory the various parts of it are, and encodes that directly +into the code.

+
+
+

Performance Results With JIT

+

With the regular expression matcher translated to C and with a generated JIT, +the regular expression performance increases significantly. Our running example +can match 16'500'000 chars/s, which is more than six times faster than the +re module. This is not an entirely fair comparison, because the re +module can give more information than just "matches" or "doesn't match", but +it's still interesting to see. A more relevant comparison is that between the +program with and without a JIT: Generating a JIT speeds the matcher up by more +than 20 times.

+
+
+
+

Conclusion

+

So, what have we actually won? We translated the relatively simple and very slow +regular expression matching algorithm from the last post to C and were thus able +to speed it up significantly. The real win is gained by also generating a JIT +for the matcher, which can be regarded as a simple interpreter. The resulting +matcher is rather fast.

+

The lesson from these posts is not that you can or should write a practical +and general regular expression module in this way – indeed, enhancing the +algorithm to support more features of the re module would be a lot of work +and it is also unclear what the speed results for more realistic regular +expressions would be. However, it makes for a great case study of the JIT +generator. It was relatively straightforward to generate a JIT for the regex +matcher, and the speed results were great (Admittedly I know rather a lot about +PyPy's JIT though). This approach is generalizable to many programs that are +sufficiently "interpreter-like" (whatever that exactly means).

+

All the results that appeared at various points in this blog post can be seen +here:

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Implementationchars/sspeedup over pure Python
Pure Python code12'2001
Python re module2'500'000205
Google's re2 implementation550'00045
RPython implementation translated to C720'00059
C++ implementation750'00061
Java implementation1'920'000157
RPython implementation with JIT16'500'0001352
+
+

Sources

+

All the source code can be found in my Subversion user directory on Codespeak.

+
+
+

Edit:

+

Armin is right (see first comment). I fixed the problem.

+
+
+
+
+ + Armin Rigo wrote on 2010-06-08 13:11: +
+
+

Warning: the first example is wrong: there should be no code executed between can_enter_jit() and jit_merge_point(). In this case, there is the exit condition of the loop. It needs to be rewritten as a "while True:" loop with a "break" just before can_enter_jit().

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-06-08 13:35: +
+
+

@Armin: Damn, you're right. I fixed the blog post.

+
+
+
+
+ + Nelson Elhage wrote on 2010-06-08 15:36: +
+
+

What happens if you don't replace and and or?

Without those changes, the modifications for JIT really are prety small --
mostly just some annotations in the main loop and at toplevel for each
class. With those changes, though, you need to potentially check the entire
codebase of your interpreter.

Pretty fun performance results, though.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-06-08 16:40: +
+
+

@Nelson: If you don't change the "and" and "or" you get a lot of assembler code generated, and it's not particularly fast.

Note that this "and" and "or" business is quite specific to this particular example. Usually you can work more incrementally by generating a JIT, then looking at the produced assembler and then doing some small changes in the interpreter to improve parts of it. Each such change is usually localized to one part of the interpreter improves the performance of some language feature.

This example is not really large enough to show this way of working, though :-). Maybe at some point I should write a walk-through for some interpreter.

+
+
+
+
+ + Kumo wrote on 2010-06-08 22:55: +
+
+

Would it be possible to create a pypy or cpython extension module this way?

+
+
+
+
+ + Jared Forsyth wrote on 2010-06-09 21:27: +
+
+

Could you post your 'test runner' code? I'm running some tests (with your) code and getting drastically different numbers...

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-06-10 10:51: +
+
+

@jabapyth: there is no test runner code. I am simply running something like

genrandom 20 1000000 | time regex-c

What performance results are you getting? Are you sure that you translated jitregex.py with -Ojit? Otherwise the JIT is not put into the executable.

+
+
+
+
+ + Maxim Yegorushkin wrote on 2010-08-02 00:19: +
+
+

boost::regex is not mentioned. It's got both recursive and non-recursive implementations. And it is the base of the standard C++ TR1 regex. Would be interesting to stack it up against other results because it is

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-08-02 08:52: +
+
+

We can't possibly include all regex engines (even if we would like to). However, sources are out there and you can always rerun those benchmarks and see how it compares :-)

Cheers,
fijal

+
+
+
+
+ + Nikhil wrote on 2013-01-12 21:25: +
+
+

I'm not able to access the code on codespeak.net. Has the code been moved to some other place?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-01-12 22:40: +
+
+

The code has been merged to PyPy since I think. Look up cfbolz repos on bitbucket though

+
+
+
+ +

PyPy in Google's Summer of Code 2010

+ +
+

Good news everyone.

+

This year, thanks to Google's generosity and PSF support, we got two and a +half students for PyPy's summer of code. We didn't cut any students, but one +of the projects is a joint project of PyPy and numpy. Hereby I present +descriptions, in my own words with my own opinions and in arbitrary order. For +more details please follow links to particular blogs.

+
+

+Jason Creighton: 64bit JIT backend for PyPy

+

Intel 64bit (and I mean x86_64) compatibility for JIT has been one of the top +requested features (along with GIL removal). While GIL removal is not really an +easy task, having our JIT emit 64bit assembler is sort of easy, thanks to our +JIT backend abstraction. It will likely be faster, thanks to abundance of +registers.

+
+
+

+Bartosz Skowron: Fast ctypes for PyPy

+

Historically, a weak point of PyPy has been compatibility with extension modules. We +have progressed quite a bit in recent years, first introducing ctypes for +pypy then progressing towards CPython extension modules. However, ctypes is +well known to be slow (and it's even slower on PyPy) and writing CPython +extension modules is ugly, and they will only ever work through a compatibility layer +that'll keep them slow. What happens if we try to apply JIT technology to +ctypes? Maybe we can compile calls to C code from Python as direct calls in +compiled assembler? Why not?

+

This project will look at how the JIT technology can be employed to do some +sort of FFI. There is no guarantee we'll get super-fast ctypes as a result, +but it's good to see progress in that area.

+
+
+

+Dan Roberts: Numpy in PyPy

+

This is a joint project of numpy and PyPy. The main objective is to bring +numpy to PyPy, possibly fast. The official mentor for this project is +Stefan van der Walt from the numpy community. During the initial meeting it was +agreed that probably the best way to go would be to support original numpy +with CPython extension compatibility and then provide a minimal native numpy +framework for pypy. The former would retain full compatibility, while the +latter would have JIT integration, in line with our previous +numeric experiments. There would be an explicit interface for converting +one array to another for convenience.

+
+

Overall, I'm very happy to see so much support for PyPy from SoC. I hope all +three proposals will be successful!

+

Cheers,
+fijal & pypy team.

+
+
+
+
+ + Michael Twomey wrote on 2010-06-01 11:48: +
+
+

Some really nice stuff in there, very interested in the potential for JIT + numpy, keep up the good work!

+
+
+
+
+ + Anonymous wrote on 2010-06-01 11:53: +
+
+

Cool projects. Two of them live as PyPy branches:

https://codespeak.net/viewvc/pypy/branch/x86-64-jit-backend/

https://codespeak.net/viewvc/pypy/branch/fast-ctypes/

Where can we follow the NumPy work? :)

+
+
+
+
+ + Unknown wrote on 2010-06-07 16:16: +
+
+

when will pypy catch up with python 3.1? will it happen during the python language moratorium (pep 3003)?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-06-10 20:08: +
+
+

@horace

Depends when you can help :)

+
+
+
+ +

An Efficient and Elegant Regular Expression Matcher in Python

+ +
+

Two weeks ago, I was at the Workshop Programmiersprachen und Rechenkonzepte, +a yearly meeting of German programming language researchers. At the workshop, +Frank Huch and Sebastian Fischer gave a really excellent talk about an +elegant regular expression matcher written in Haskell. One design goal of the +matcher was to run in time linear to the length of the input string (i.e. +without backtracking) and linear in the size of the regular expression. The +memory use should also only be linear in the regular expression.

+

During the workshop, some of the Haskell people and I then implemented the +algorithm in (R)Python. Involved were Frank, Sebastian, Baltasar Trancón y +Widemann, Bernd Braßel and Fabian Reck.

+

In this blog post I want to describe this implementation and show the code of +it, because it is quite simple. In a later post I will show what optimizations +PyPy can perform on this matcher and also do some benchmarks.

+

A Note on terminology: In the rest of the post "regular expression" is meant +in the Computer Science sense, not in the POSIX sense. Most importantly, that +means that back-references are not allowed.

+

Another note: This algorithm could not be used to implement PyPy's re +module! So it won't help to speed up this currently rather slow implementation.

+
+

Implementing Regular Expression Matchers

+

There are two typical approaches to implementing regular expressions. A naive one is +to use a back-tracking implementation, which can lead to exponential matching +times given a sufficiently evil regular expression.

+

The other, more complex one, is to transform the regular expression into a +non-deterministic finite automaton (NFA) and then transform the NFA into a +deterministic finite automaton (DFA). A DFA can be used to efficiently match +a string, the problem of this approach is that turning an NFA into a DFA can +lead to exponentially large automatons.

+

Given this problem of potential memory explosion, a more sophisticated approach +to matching is to not construct the DFA fully, but instead use the NFA for +matching. This requires some care, because it is necessary to keep track of +which set of states the automaton is in (it is not just one state, because the +automaton is non-deterministic).

+

The algorithm described here is essentially equivalent to this approach, however +it does not need an intermediate NFA and represents a state of a corresponding +DFA as a marked regular expression (represented as a tree of nodes). For many +details about an alternative approach to implement regular expressions +efficiently, see Russ Cox's excellent article collection.

+
+
+

The Algorithm

+

In the algorithm the regular expression is represented as a tree of nodes. The +leaves of the tree each match exactly one character (except for the epsilon node, which +matches the empty string). The inner nodes of the tree combine other nodes in +various ways, like alternative, sequence or repetition. Every node in the tree +can potentially have a mark. The meaning of the mark is that a node is marked, +if that sub-expression matches the string seen so far.

+

The basic approach of the algorithm is that for every character of the input +string the regular expression tree is walked and a number of the nodes in the +regular expression are marked. At the end of the string, if the top-level node +is marked, the string matches, otherwise it does not. At the beginning of the +string, one mark gets shifted into the regular expression from the top, and then +the marks that are in the regex already are shifted around for every additional +character.

+

Let's start looking at some code, and an example to make this clearer. The base +class of all regular expression nodes is this:

+
class Regex(object):
+    def __init__(self, empty):
+        # empty denotes whether the regular expression
+        # can match the empty string
+        self.empty = empty
+        # mark that is shifted through the regex
+        self.marked = False
+
+    def reset(self):
+        """ reset all marks in the regular expression """
+        self.marked = False
+
+    def shift(self, c, mark):
+        """ shift the mark from left to right, matching character c."""
+        # _shift is implemented in the concrete classes
+        marked = self._shift(c, mark)
+        self.marked = marked
+        return marked
+
+

The match function which checks whether a string matches a regex is:

+
def match(re, s):
+    if not s:
+        return re.empty
+    # shift a mark in from the left
+    result = re.shift(s[0], True)
+    for c in s[1:]:
+        # shift the internal marks around
+        result = re.shift(c, False)
+    re.reset()
+    return result
+
+

The most important subclass of Regex is Char, which matches one +concrete character:

+
class Char(Regex):
+    def __init__(self, c):
+        Regex.__init__(self, False)
+        self.c = c
+
+    def _shift(self, c, mark):
+        return mark and c == self.c
+
+

Shifting the mark through Char is easy: a Char instance retains a mark +that is shifted in when the current character is the same as that in the +instance.

+

Another easy case is that of the empty regular expression Epsilon:

+
class Epsilon(Regex):
+    def __init__(self):
+        Regex.__init__(self, empty=True)
+
+    def _shift(self, c, mark):
+        return False
+
+

Epsilons never get a mark, but they can match the empty string.

+
+

Alternative

+

Now the more interesting cases remain. First we define an abstract base class +Binary for the case of composite regular expressions with two children, and +then the first subclass Alternative which matches if either of two regular +expressions matches the string (usual regular expressions syntax a|b).

+
class Binary(Regex):
+    def __init__(self, left, right, empty):
+        Regex.__init__(self, empty)
+        self.left = left
+        self.right = right
+
+    def reset(self):
+        self.left.reset()
+        self.right.reset()
+        Regex.reset(self)
+
+class Alternative(Binary):
+    def __init__(self, left, right):
+        empty = left.empty or right.empty
+        Binary.__init__(self, left, right, empty)
+
+    def _shift(self, c, mark):
+        marked_left  = self.left.shift(c, mark)
+        marked_right = self.right.shift(c, mark)
+        return marked_left or marked_right
+
+

An Alternative can match the empty string, if either of its children can. +Similarly, shifting a mark into an Alternative shifts it into both its +children. If either of the children are marked afterwards, the Alternative +is marked too.

+

As an example, consider the regular expression a|b|c, which would be +represented by the objects Alternative(Alternative(Char('a'), Char('b')), Char('c')). +Matching the string "a" would lead to the following marks in +the regular expression objects (green nodes are marked, white ones are +unmarked):

+ +alternativea.gif

At the start of the process, no node is marked. Then the first char is matched, +which adds a mark to the Char('a') node, and the mark will propagate up the +two Alternative nodes.

+
+
+

Repetition

+

The two remaining classes are slightly trickier. Repetition is used to match +a regular expression any number of times (usual regular expressions syntax +a*):

+
class Repetition(Regex):
+    def __init__(self, re):
+        Regex.__init__(self, True)
+        self.re = re
+
+    def _shift(self, c, mark):
+        return self.re.shift(c, mark or self.marked)
+
+    def reset(self):
+        self.re.reset()
+        Regex.reset(self)
+
+

A Repetition can always match the empty string. The mark is shifted into the +child, but if the Repetition is already marked, this will be shifted into +the child as well, because the Repetition could match a second time.

+

As an example, consider the regular expression (a|b|c)* matching the string +abcbac:

+repetition.gif

For every character, one of the alternatives matches, thus the repetition matches +as well.

+
+
+

Sequence

+

The only missing class is that for sequences of expressions, Sequence (usual +regular expressions syntax ab):

+
class Sequence(Binary):
+    def __init__(self, left, right):
+        empty = left.empty and right.empty
+        Binary.__init__(self, left, right, empty)
+
+    def _shift(self, c, mark):
+        old_marked_left = self.left.marked
+        marked_left = self.left.shift(c, mark)
+        marked_right = self.right.shift(
+            c, old_marked_left or (mark and self.left.empty))
+        return (marked_left and self.right.empty) or marked_right
+
+

A Sequence can be empty only if both its children are empty. The mark +handling is a bit delicate. If a mark is shifted in, it will be shifted to the +left child regular expression. If that left child is already marked before the +shift, that mark is shifted to the right child. If the left child can match the +empty string, the right child gets the mark shifted in as well.

+

The whole sequence matches (i.e. is marked), if the left child is marked after +the shift and if the right child can match the empty string, or if the right +child is marked.

+

Consider the regular expression abc matching the string abcd. For the +first three characters, the marks wander from left to right, when the d is +reached, the matching fails.

+sequence.gif +
+
+

More Complex Example

+

As a more complex example, consider the expression ((abc)*|(abcd))(d|e) +matching the string abcabcabcd.

+complex.gif

Note how the two branches of the first alternative match the first abc in +parallel, until it becomes clear that only the left alternative (abc)* can +work.

+
+
+

Complexity

+

The match function above loops over the entire string without going back and +forth. Each iteration goes over the whole tree every time. Thus the complexity +of the algorithm is O(m*n) where m is the size of the regular expression +and n is the length of the string.

+
+
+
+

Summary & Outlook

+

So, what have we achieved now? The code shown here can match regular expressions +with the desired complexity. It is also not much code. By itself, the Python +code shown above is not terribly efficient. In the next post I will show how the +JIT generator can be used to make the simple matcher shown above really fast.

+
+
+
+
+
+ + Marius Gedminas wrote on 2010-05-21 16:41: +
+
+

Have you seen Russ Cox's series of articles about regular expressions?

Google Chrome's regexp library is also interesting.

Google appears to have put a lot of research in efficient regexp algorithms while paying attention to backwards-compatibility concerns, as existing applications often rely on backtracking.

+
+
+
+
+ + kay schluehr wrote on 2010-05-21 20:26: +
+
+ Most importantly, that means that back-references are not allowed.

Limited backreferences can be integrated within this pattern matching scheme. General backreferences are only possible with backtracking but unless you want to solve NP complete problems using POSIX style regexps they might not be necessary. +
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-05-21 21:48: +
+
+

Marius: The Russ Cox site is linked from the article :-).

Kay: Thanks for the link, will check it out.

+
+
+
+
+ + Thomas wrote on 2010-05-21 22:07: +
+
+

I do not use regular expressions very heavily and am very new to pypy in general (1.2 works pretty good for me on my pure python code). From this article I don't see a full explaination why this basic algorithm couldn't be used for pypy. Is it primarily due to concerns about backward compatiblity or something more interesting? I am looking forward to the article to come about applying the JIT.

+
+
+
+
+ + Benjamin Peterson wrote on 2010-05-22 20:58: +
+
+

@Thomas Python's re library requires backtracking.

+
+
+
+
+ + Anonymous wrote on 2010-05-23 00:24: +
+
+

This is just beautiful, I hope some version of this will be available for PyPy users: sandboxing + non-pathological REs sounds like a nice combo.

+
+
+
+
+ + Damian Cugley wrote on 2010-05-24 16:22: +
+
+

Though these regexes can't be used as a drop-in replacement for the re module, if there were strikingly faster it might be worth having them as an alternative. The backtracking features are so seldom required that a faster, non-backtracking algorithm might prove popular with people who worry about matching speed.

+
+
+
+
+ + Anonymous wrote on 2010-05-25 12:53: +
+
+

It would be fun to read an article where you take the real Python regexes and apply PyPy's JIT code generation to them, i.e. when you call re.compile(...), you'd get native code out of it, specialized for the regex being compiled. After all, haven't you used the JIT on "toy" languages before? Regexes are a "toy" language, albeit a useful one..

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-05-25 14:21: +
+
+

Anonymous2: Yes, but PyPy's current implementation of the re module is a bit of a mess, and not really fast. It's rather unclear how easy/possible it would be to generate a good JIT for it.

+
+
+
+
+ + Unhelpful wrote on 2010-06-04 21:17: +
+
+

Instead of a "special" interpreter for REs in RPython, and a JIT for it, what about "compiling" REs to Python bytecode, and letting the existing PyPy JIT trace and compile them if they end up being used often enough? This is probably slower in the case of lots of throwaway REs that are used once, but when a few REs are used repeatedly it ought to work.

+
+
+
+
+ + Anonymous wrote on 2017-11-21 15:28: +
+
+

"As an example, consider the regular expression a|b|c, which would be represented by the objects Alternative(Alternative(Char('a'), Char('b')), Char('c'))"

But how to create such a representation, when you scan input regex literal by literal?

+
+
+
+ +

Running wxPython on top of pypy

+ +
+

Hello,

+

These last three weeks we have been busy working on the cpyext subsystem, which +allows pypy to execute extension modules written with the Python C API.

+

Today we hacked enough to have wxPython compile, and run its wonderful demo. +This: + +cannot be distinguished from the same run with a +standard python interpreter, but this: + +shows an exception that +CPython never produces.

+

wxPython is a big extension module: it has more than 500 classes and 7500 +functions, most of the code is automatically generated by swig. It uses +advanced techniques, like "Original Object Return" and cross-platform +polymorphism, that effectively allows the developer to seamlessly subclass C++ +objects in Python and write GUI applications efficiently.

+

The demo application runs reasonably fast, it feels slower than with CPython, +but I did not activate the JIT option of pypy. It still crashes in some places +(the demo is very comprehensive and covers all the aspects of wxPython), and +threads are expected to not work at the moment.

+

We had to modify the code of wxPython a little, mainly because it often stores +borrowed references into C++ objects. This does not work well in pypy, where +all other counted references can disappear, and where the address of the object +is allowed to change. The solution is to use weak references instead. The patch is here, +it will eventually be merged into the upstream wxPython version.

+

This first real test proves that CPython extensions can be migrated to pypy +without much pain. It also points out some places which can be improved, like +better diagnostics in crashes, better support of distutils...

+

Amaury Forgeot d'Arc

+
+
+
+
+ + René Dudfield wrote on 2010-05-03 17:09: +
+
+

sweet as!

+
+
+
+
+ + Dan Villiom Podlaski Christiansen wrote on 2010-05-03 18:00: +
+
+

Nice! Do you have any plans for making Mac nightlies with this available? I'd love to try out PyPy, but the one time I tried bootstrapping, it used all available memory. After I had let it run overnight but it didn't finish, I killed it…

+
+
+
+
+ + Bourne wrote on 2010-05-03 19:59: +
+
+

This is very good news.

Finishing wxPython and the JIT is probably all that's needed to make PyPy a **great** alternative to CPython. (but I guess you figured that already)

Thanks!

+
+
+
+
+ + Stu wrote on 2010-05-03 23:52: +
+
+

Sweet ! I wonder if pycairo and pygtk... at the moment I don't know if it's cairo or python slowing down my app (I have an idea it's both, but running it in pypy does seem attractive).

+
+
+
+
+ + René Dudfield wrote on 2010-05-04 09:33: +
+
+

Are there docs for how to compile extensions somewhere? I had a quick look, but couldn't find them.

+
+
+
+
+ + The Cannon Family wrote on 2010-05-04 19:28: +
+
+

this is a major accomplishment in terms of usability, many people use Python extension modules, way to go. (and next steps, PIL).

+
+
+
+
+ + Amaury Forgeot d'Arc wrote on 2010-05-04 22:38: +
+
+

PIL also works with PyPy. I've only tried basic tests though (like gif->png conversion)

+
+
+
+
+ + Amaury Forgeot d'Arc wrote on 2010-05-04 22:40: +
+
+

@illume: you have to compile pypy with the option "--withmod-cpyext", then it should be enough to run "/path/to/pypy-c setup.py build"

+
+
+
+
+ + Unknown wrote on 2010-05-07 13:44: +
+
+

Well done! The Italian Python Community has an article on this (here, in Italian)

+
+
+
+
+ + Anonymous wrote on 2010-05-09 09:49: +
+
+

Wow. PyPy is coming along quite nicely :-)

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-15.html b/blog/index-15.html new file mode 100644 index 000000000..a92692394 --- /dev/null +++ b/blog/index-15.html @@ -0,0 +1,1485 @@ + + + + + + +PyPy (old posts, page 15) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

The peace of green

+ +
+

No, we are not going to talk about the environment (i.e., the set of variables +as printed by /usr/bin/env. What else? :-)).

+

After months in which we had a couple of tests failing every day, we finally +managed to turn (almost) everything green today, at least on Linux. Enjoy +this screenshot taken from the nightly build page:

+ + + + +

As usual, the full buildbot results can be seen from the summary page.

+

cheers, +Anto

+
+
+
+
+ + intgr wrote on 2010-10-25 16:32: +
+
+

Am I sensing a PyPy 1.4 release?

+
+
+
+
+ + Antonio Cuni wrote on 2010-10-25 21:05: +
+
+

@intgr: "yes", although we don't have any concrete plan to do a release. But it's true that if we keep all our tests green, doing a release it's much less effort

+
+
+
+ +

PhD Thesis about PyPy's CLI JIT Backend

+ +
+

Hi all,

+

A few months ago I finished my PhD studies and now my thesis is available, +just in case someone does not have anything better to do than read it :-).

+

The title of the thesis is High performance implementation of Python for +CLI/.NET with JIT compiler generation for dynamic languages, and it's mainly +based on my work on the CLI backend for the PyPy JIT (note that the CLI JIT +backend is currently broken on trunk, but it's still working in the cli-jit +branch).

+

The thesis might be useful also for people that are not directly interested in +the CLI JIT backend, as it also contains general information about the inner +workings of PyPy which are independent from the backend: in particular, +chapters 5 and 6 explain how the JIT frontend works.

+
+
Here is the summary of chapters:
+
    +
  1. Introduction
  2. +
  3. The problem
  4. +
  5. Enter PyPy
  6. +
  7. Characterization of the target platform
  8. +
  9. Tracing JITs in a nutshell
  10. +
  11. The PyPy JIT compiler generator
  12. +
  13. The CLI JIT backend
  14. +
  15. Benchmarks
  16. +
  17. Conclusion and Future Work
  18. +
+
+

cheers, +Anto

+
+
+
+
+ + The Cannon Family wrote on 2010-10-22 18:42: +
+
+

congratulations.

+
+
+
+
+ + Eric van Riet Paap wrote on 2010-10-22 18:43: +
+
+

Yes Anto, congratulations!

+
+
+
+
+ + Lino wrote on 2010-10-23 01:26: +
+
+

Impressive work, Antonio.

ciao

+
+
+
+
+ + Anonymous wrote on 2010-10-23 08:36: +
+
+

Very interesting stuff, still busily reading... could you write a short bibtex entry for citation? Thanks

+
+
+
+
+ + glyph wrote on 2010-10-23 20:56: +
+
+

Congratulations!

(but when are we going to see it merged to trunk... ;-))

+
+
+
+
+ + Antonio Cuni wrote on 2010-10-24 10:08: +
+
+

thank you, guys :-)

@anonymous: here you can find the bibtex for the thesis, as wall as for other PyPy related papers: https://codespeak.net/svn/pypy/extradoc/talk/bibtex.bib

@glyph: unfortunately, trunk has diverged a lot since the cli-jit branch, and merging is not an easy issue. There are also fundamental features that on CLI cannot be implemented as efficiently as on x86. It's on my todo list, but no concrete plan so far :-(

+
+
+
+ +

Next PyPy sprint

+ +
+

Hi all,

+ +

The next PyPy sprint is scheduled for the end of the month, from the 25th to the 31st of October 2010. It will be done at the university of Düsseldorf, Germany, where three of us are working.

+ +

Please see this link for more information.

+
+

PyPy in Google's Summer of Code 2010

+ +
+

Hello.

+

This year we had a record of two and a half applications (one was on a cross +section of PyPy and numpy) accepted for the Google +SoC program. Since it ended a couple of weeks ago, we wanted to present the results that +were achieved. All three projects were completed successfully, although the rate +of success varied quite a bit.

+

The Numpy proposal progressed significantly on making numpy compatible with +PyPy's CPython extension module support, but failed to bring PyPy's numpy +implementation into a usable shape (which is a somewhat ambitious goal, one +might argue). The experiments done during the projects are living on the +micronumpy branch.

+

The Fast ctypes proposal did some useful experiments on how to JIT external +calls from PyPy to C, however, the actual code as of now is not very +interesting and it's quite far from providing a full ctypes replacement (or +equivalent).

+

Definitely the most successful proposal was a 64bit (x86_64) backend for PyPy's +JIT. It not only includes a working 64bit JIT (merged into PyPy trunk), but also +a working asmgcc for x86_64 linux platform, that makes it possible to run the JIT +on this architecture with our advanced garbage collectors. One can say that +x86_64 is now no longer a second-class citizen for PyPy, although it definitely +didn't receive as much testing as the x86 platform. Expect this to be a major +selling point for the next PyPy release :-)

+

Cheers, +fijal & the PyPy team

+
+
+
+
+ + Anonymous wrote on 2010-09-24 05:26: +
+
+

Awesome news.

+
+
+
+ +

Using Escape Analysis Across Loop Boundaries for Specialization

+ +
+

This blog post is a successor to the one about escape analysis in PyPy's +JIT. The examples from there will be continued here. This post is a bit +science-fictiony. The algorithm that PyPy currently uses is significantly more +complex and much harder than the one that is described here. The resulting +behaviour is very similar, however, so we will use the simpler version (and we +might switch to that at some point in the actual implementation).

+

In the last blog post we described how escape analysis can be used to remove +many of the allocations of short-lived objects and many of the type dispatches +that are present in a non-optimized trace. In this post we will improve the +optimization to also handle more cases.

+

To understand some more what the optimization described in the last blog post +can achieve, look at the following figure:

+

+ +new lifetimes

+

The figure shows a trace before optimization, together with the lifetime of +various kinds of objects created in the trace. It is executed from top to +bottom. At the bottom, a jump is used to execute the same loop another time. +For clarity, the figure shows two iterations of the loop. +The loop is executed until one of the guards in the trace fails, and the +execution is aborted.

+

Some of the operations within this trace are new operations, which each create a +new instance of some class. These instances are used for a while, e.g. by +calling methods on them, reading and writing their fields. Some of these +instances escape, which means that they are stored in some globally accessible +place or are passed into a function.

+

Together with the new operations, the figure shows the lifetimes of the +created objects. Objects in category 1 live for a while, and are then just not +used any more. The creation of these objects is removed by the +optimization described in the last blog post.

+

Objects in category 2 live for a while and then escape. The optimization of the +last post deals with them too: the new that creates them and +the field accesses are deferred, until the point where the object escapes.

+

The objects in category 3 and 4 are in principle like the objects in category 1 +and 2. They are created, live for a while, but are then passed as an argument +to the jump operation. In the next iteration they can either die (category +3) or escape (category 4).

+

The optimization of the last post considered the passing of an object along a +jump to be equivalent to escaping. It was thus treating objects in category 3 +and 4 like those in category 2.

+

The improved optimization described in this post will make it possible to deal +better with objects in category 3 and 4. This will have two consequences: on +the one hand, more allocations are removed from the trace (which is clearly +good). As a side-effect of this, the traces will also be type-specialized.

+
+

Optimizing Across the Jump

+

Let's look at the final trace obtained in the last post for the example loop. +The final trace was much better than the original one, because many allocations +were removed from it. However, it also still contained allocations:

+
+step 1 +
+

The two new BoxedIntegers stored in p15 and p10 are passed into +the next iteration of the loop. The next iteration will check that they are +indeed BoxedIntegers, read their intval fields and then not use them +any more. Thus those instances are in category 3.

+

In its current state the loop +allocates two BoxedIntegers at the end of every iteration, that then die +very quickly in the next iteration. In addition, the type checks at the start +of the loop are superfluous, at least after the first iteration.

+

The reason why we cannot optimize the remaining allocations away is because +their lifetime crosses the jump. To improve the situation, a little trick is +needed. The trace above represents a loop, i.e. the jump at the end jumps to +the beginning. Where in the loop the jump occurs is arbitrary, since the loop +can only be left via failing guards anyway. Therefore it does not change the +semantics of the loop to put the jump at another point into the trace and we +can move the jump operation just above the allocation of the objects that +appear in the current jump. This needs some care, because the arguments to +jump are all currently live variables, thus they need to be adapted.

+

If we do that for our example trace above, the trace looks like this:

+
+step 2 +
+

Now the lifetime of the remaining allocations no longer crosses the jump, and +we can run our escape analysis a second time, to get the following trace:

+
+step3 +
+

This result is now really good. The code performs the same operations as +the original code, but using direct CPU arithmetic and no boxing, as opposed to +the original version which used dynamic dispatching and boxing.

+

Looking at the final trace it is also completely clear that specialization has +happened. The trace corresponds to the situation in which the trace was +originally recorded, which happened to be a loop where BoxedIntegers were +used. The now resulting loop does not refer to the BoxedInteger class at +all any more, but it still has the same behaviour. If the original loop had +used BoxedFloats, the final loop would use float_* operations +everywhere instead (or even be very different, if the object model had +user-defined classes).

+
+
+

Entering the Loop

+

The approach of placing the jump at some other point in the loop leads to +one additional complication that we glossed over so far. The beginning of the +original loop corresponds to a point in the original program, namely the +while loop in the function f from the last post.

+

Now recall that in a VM that uses a tracing JIT, all programs start by being +interpreted. This means that when f is executed by the interpreter, it is +easy to go from the interpreter to the first version of the compiled loop. +After the jump is moved and the escape analysis optimization is applied a +second time, this is no longer easily possible. In particular, the new loop +expects two integers as input arguments, while the old one expected two +instances.

+

To make it possible to enter the loop directly from the interpreter, there +needs to be some additional code that enters the loop by taking as input +arguments what is available to the interpreter, i.e. two instances. This +additional code corresponds to one iteration of the loop, which is thus +peeled off:

+
+step 4 +
+
+
+

Summary

+

The optimization described in this post can be used to optimize away +allocations in category 3 and improve allocations in category 4, by deferring +them until they are no longer avoidable. A side-effect of these optimizations +is also that the optimized loops are specialized for the types of the variables +that are used inside them.

+
+
+
+
+
+ + Ole Laursen wrote on 2010-09-24 17:18: +
+
+

Interesting, like the previous post. Keep 'em coming. :)

+
+
+
+ +

Escape Analysis in PyPy's JIT

+ +
+

The goal of a just-in-time compiler for a dynamic language is obviously to +improve the speed of the language over an implementation of the language that +uses interpretation. The first goal of a JIT is thus to remove the +interpretation overhead, i.e. the overhead of bytecode (or AST) dispatch and the +overhead of the interpreter's data structures, such as operand stack etc. The +second important problem that any JIT for a dynamic language needs to solve is +how to deal with the overhead of boxing of primitive types and of type +dispatching. Those are problems that are usually not present in statically typed +languages.

+

Boxing of primitive types means that dynamic languages need to be able to handle +all objects, even integers, floats, etc. in the same way as user-defined +instances. Thus those primitive types are usually boxed, i.e. a small +heap-structure is allocated for them, that contains the actual value.

+

Type dispatching is the process of finding the concrete implementation that is +applicable to the objects at hand when doing a generic operation. An +example would be the addition of two objects: The addition needs to check what +the concrete objects that should be added are, and choose the implementation +that is fitting for them.

+

Last year, we wrote a blog post and a paper about how PyPy's meta-JIT +approach works. These explain how the meta-tracing JIT can remove the overhead +of bytecode dispatch. In this post (and probably a followup) we want to explain +how the traces that are produced by our meta-tracing JIT are then optimized to +also remove some of the overhead more closely associated to dynamic languages, +such as boxing overhead and type dispatching. The most important technique to +achieve this is a form of escape analysis that we call virtual objects. +This is best explained via an example.

+
+

Running Example

+

For the purpose of this blog post, we are going to use a very simple object +model, that just supports an integer and a float type. The objects support only +two operations, add, which adds two objects (promoting ints to floats in a +mixed addition) and is_positive, which returns whether the number is greater +than zero. The implementation of add uses classical Smalltalk-like +double-dispatching. These classes could be part of the implementation of a very +simple interpreter written in RPython.

+
class Base(object):
+    def add(self, other):
+        """ add self to other """
+        raise NotImplementedError("abstract base")
+    def add__int(self, intother):
+        """ add intother to self, where intother is a Python integer """
+        raise NotImplementedError("abstract base")
+    def add__float(self, floatother):
+        """ add floatother to self, where floatother is a Python float """
+        raise NotImplementedError("abstract base")
+    def is_positive(self):
+        """ returns whether self is positive """
+        raise NotImplementedError("abstract base")
+
+class BoxedInteger(Base):
+    def __init__(self, intval):
+        self.intval = intval
+    def add(self, other):
+        return other.add__int(self.intval)
+    def add__int(self, intother):
+        return BoxedInteger(intother + self.intval)
+    def add__float(self, floatother):
+        return BoxedFloat(floatother + float(self.intval))
+    def is_positive(self):
+        return self.intval > 0
+
+class BoxedFloat(Base):
+    def __init__(self, floatval):
+        self.floatval = floatval
+    def add(self, other):
+        return other.add__float(self.floatval)
+    def add__int(self, intother):
+        return BoxedFloat(float(intother) + self.floatval)
+    def add__float(self, floatother):
+        return BoxedFloat(floatother + self.floatval)
+    def is_positive(self):
+        return self.floatval > 0.0
+
+

Using these classes to implement arithmetic shows the basic problem that a +dynamic language implementation has. All the numbers are instances of either +BoxedInteger or BoxedFloat, thus they consume space on the heap. Performing many +arithmetic operations produces lots of garbage quickly, thus putting pressure on +the garbage collector. Using double dispatching to implement the numeric tower +needs two method calls per arithmetic operation, which is costly due to the +method dispatch.

+

To understand the problems more directly, let us consider a simple function that +uses the object model:

+
def f(y):
+    res = BoxedInteger(0)
+    while y.is_positive():
+        res = res.add(y).add(BoxedInteger(-100))
+        y = y.add(BoxedInteger(-1))
+    return res
+
+

The loop iterates y times, and computes something in the process. To +understand the reason why executing this function is slow, here is the trace +that is produced by the tracing JIT when executing the function with y +being a BoxedInteger:

+
+# arguments to the trace: p0, p1
+# inside f: res.add(y)
+guard_class(p1, BoxedInteger)
+    # inside BoxedInteger.add
+    i2 = getfield_gc(p1, intval)
+    guard_class(p0, BoxedInteger)
+        # inside BoxedInteger.add__int
+        i3 = getfield_gc(p0, intval)
+        i4 = int_add(i2, i3)
+        p5 = new(BoxedInteger)
+            # inside BoxedInteger.__init__
+            setfield_gc(p5, i4, intval)
+# inside f: BoxedInteger(-100)
+p6 = new(BoxedInteger)
+    # inside BoxedInteger.__init__
+    setfield_gc(p6, -100, intval)
+
+# inside f: .add(BoxedInteger(-100))
+guard_class(p5, BoxedInteger)
+    # inside BoxedInteger.add
+    i7 = getfield_gc(p5, intval)
+    guard_class(p6, BoxedInteger)
+        # inside BoxedInteger.add__int
+        i8 = getfield_gc(p6, intval)
+        i9 = int_add(i7, i8)
+        p10 = new(BoxedInteger)
+            # inside BoxedInteger.__init__
+            setfield_gc(p10, i9, intval)
+
+# inside f: BoxedInteger(-1)
+p11 = new(BoxedInteger)
+    # inside BoxedInteger.__init__
+    setfield_gc(p11, -1, intval)
+
+# inside f: y.add(BoxedInteger(-1))
+guard_class(p0, BoxedInteger)
+    # inside BoxedInteger.add
+    i12 = getfield_gc(p0, intval)
+    guard_class(p11, BoxedInteger)
+        # inside BoxedInteger.add__int
+        i13 = getfield_gc(p11, intval)
+        i14 = int_add(i12, i13)
+        p15 = new(BoxedInteger)
+            # inside BoxedInteger.__init__
+            setfield_gc(p15, i14, intval)
+
+# inside f: y.is_positive()
+guard_class(p15, BoxedInteger)
+    # inside BoxedInteger.is_positive
+    i16 = getfield_gc(p15, intval)
+    i17 = int_gt(i16, 0)
+# inside f
+guard_true(i17)
+jump(p15, p10)
+
+

(indentation corresponds to the stack level of the traced functions).

+

The trace is inefficient for a couple of reasons. One problem is that it checks +repeatedly and redundantly for the class of the objects around, using a +guard_class instruction. In addition, some new BoxedInteger instances are +constructed using the new operation, only to be used once and then forgotten +a bit later. In the next section, we will see how this can be improved upon, +using escape analysis.

+
+
+

Virtual Objects

+

The main insight to improve the code shown in the last section is that some of +the objects created in the trace using a new operation don't survive very +long and are collected by the garbage collector soon after their allocation. +Moreover, they are used only inside the loop, thus we can easily prove that +nobody else in the program stores a reference to them. The +idea for improving the code is thus to analyze which objects never escape the +loop and may thus not be allocated at all.

+

This process is called escape analysis. The escape analysis of +our tracing JIT works by using virtual objects: The trace is walked from +beginning to end and whenever a new operation is seen, the operation is +removed and a virtual object is constructed. The virtual object summarizes the +shape of the object that is allocated at this position in the original trace, +and is used by the escape analysis to improve the trace. The shape describes +where the values that would be stored in the fields of the allocated objects +come from. Whenever the optimizer sees a setfield that writes into a virtual +object, that shape summary is thus updated and the operation can be removed. +When the optimizer encounters a getfield from a virtual, the result is read +from the virtual object, and the operation is also removed.

+

In the example from last section, the following operations would produce two +virtual objects, and be completely removed from the optimized trace:

+
+p5 = new(BoxedInteger)
+setfield_gc(p5, i4, intval)
+p6 = new(BoxedInteger)
+setfield_gc(p6, -100, intval)
+
+

The virtual object stored in p5 would know that it is a BoxedInteger, and that +the intval field contains i4; the one stored in p6 would know that +its intval field contains the constant -100.

+

The following operations, that use p5 and p6 could then be +optimized using that knowledge:

+
+guard_class(p5, BoxedInteger)
+i7 = getfield_gc(p5, intval)
+# inside BoxedInteger.add
+guard_class(p6, BoxedInteger)
+# inside BoxedInteger.add__int
+i8 = getfield_gc(p6, intval)
+i9 = int_add(i7, i8)
+
+

The guard_class operations can be removed, because the classes of p5 and +p6 are known to be BoxedInteger. The getfield_gc operations can be removed +and i7 and i8 are just replaced by i4 and -100. Thus the only +remaining operation in the optimized trace would be:

+
+i9 = int_add(i4, -100)
+
+

The rest of the trace is optimized similarly.

+

So far we have only described what happens when virtual objects are used in +operations that read and write their fields. When the virtual object is used in +any other operation, it cannot stay virtual. For example, when a virtual object +is stored in a globally accessible place, the object needs to actually be +allocated, as it will live longer than one iteration of the loop.

+

This is what happens at the end of the trace above, when the jump operation +is hit. The arguments of the jump are at this point virtual objects. Before the +jump is emitted, they are forced. This means that the optimizer produces code +that allocates a new object of the right type and sets its fields to the field +values that the virtual object has. This means that instead of just the jump, the +following operations are emitted:

+
+p15 = new(BoxedInteger)
+setfield_gc(p15, i14, intval)
+p10 = new(BoxedInteger)
+setfield_gc(p10, i9, intval)
+jump(p15, p10)
+
+

Note how the operations for creating these two instances have been moved down the +trace. It looks like for these operations we actually didn't win much, because +the objects are still allocated at the end. However, the optimization was still +worthwhile even in this case, because some operations that have been performed +on the forced virtual objects have been removed (some getfield_gc operations +and guard_class operations).

+

The final optimized trace of the example looks like this:

+
+# arguments to the trace: p0, p1
+guard_class(p1, BoxedInteger)
+i2 = getfield_gc(p1, intval)
+guard_class(p0, BoxedInteger)
+i3 = getfield_gc(p0, intval)
+i4 = int_add(i2, i3)
+i9 = int_add(i4, -100)
+
+guard_class(p0, BoxedInteger)
+i12 = getfield_gc(p0, intval)
+i14 = int_add(i12, -1)
+
+i17 = int_gt(i14, 0)
+guard_true(i17)
+p15 = new(BoxedInteger)
+setfield_gc(p15, i14, intval)
+p10 = new(BoxedInteger)
+setfield_gc(p10, i9, intval)
+jump(p15, p10)
+
+

The optimized trace contains only two allocations, instead of the original five, +and only three guard_class operations, from the original seven.

+
+
+

Summary

+

In this blog post we described how simple escape analysis within the scope of +one loop works. This optimization reduces the allocation of many intermediate +data structures that become garbage quickly in an interpreter. It also removes a +lot of the type dispatching overhead. In a later post, we will explain how this +optimization can be improved further.

+
+
+
+
+
+ + Anonymous wrote on 2010-09-13 19:38: +
+
+

Beautiful post. I love it when people dare to broach more 'advanced' subjects in blog format.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-09-14 11:49: +
+
+

Thanks a lot :-).

+
+
+
+
+ + jdb wrote on 2010-09-15 10:21: +
+
+

+1, thanks

+
+
+
+ +

EuroPython 2010 Videos available

+ +
+

Hi all,

+

the videos of the talks from EuroPython 2010 are now available on +blip.tv: in particular, there are the three videos of the PyPy talk.

+

Part 1: What's new in PyPy 1.2 and 1.3 (by Antonio Cuni)

+

Part 2: Just in Time compilation (by Armin Rigo)

+

Part 3: cpyext (by Amaury Forgeot d'Arc)

+

Moreover, here is Mark Shannon's talk which compares HotPy, Unladen Swallow +and PyPy:

+
+
+
+
+ + Anonymous wrote on 2010-08-17 18:36: +
+
+

Can you post the links to the blip.tv pages, so I can go there and download the videos?

The blip.tv viewer applet has no such link, and even digging the source isn't helpful (it seems that they use different identifiers for embed applets than for the same videos on their own website). Grrr!

+
+
+
+
+ + Antonio Cuni wrote on 2010-08-17 20:07: +
+
+

Sure, here are the links:
Part 1: https://europythonvideos.blip.tv/file/3981017/

Part 2: https://europythonvideos.blip.tv/file/3981028/

Part 3: https://europythonvideos.blip.tv/file/4000720/

HotPy: https://europythonvideos.blip.tv/file/3980963/

+
+
+
+
+ + Lucian wrote on 2010-08-18 01:10: +
+
+

@Anonymous

You can also click the title of the video in the Blip.tv embedded player.

+
+
+
+
+ + horace wrote on 2010-08-19 18:51: +
+
+

if i remember correctly, a while ago you were looking for alternative names for pypy.

i just came across the Wikipedia article for the "black mamba" which states that it is the fastest snake of the world.

how about the name "black mamba"? :)

+
+
+
+ +

Call for Benchmarks

+ +
+

As you know, a lot of PyPy's recent development effort has gone into speeding up +execution of Python programs. However, an additional good property of PyPy's +Python interpreter is that most objects are represented in a much more compact +way than in CPython. We would like to investigate some more advanced techniques +to reduce the memory usage of Python programs further.

+

To do this it is necessary to investigate the memory behaviour of real programs +with large heaps. For speed measurements there are standard benchmarks, but for +memory improvements there is nothing comparable; the memory behaviour of large +programs is not that well understood. Therefore we are looking for programs that we +can study and use as benchmarks.

+

Specifically we are looking for Python programs with the following properties:

+
    +
  • large heaps of about 10MB-1GB
  • +
  • should have non-trivial runtime as well (in the range of a few seconds), to +judge the speed impact of optimizations
  • +
  • ideally pure-Python programs that don't use extension modules so that they run +under both CPython and PyPy (this is optional, but makes my life much easier).
  • +
+

We are also rather interested in programs that do a lot of string/unicode +processing.

+

We would be grateful for all ideas. Telling us about a program also has the +advantage that we will work on optimizing PyPy for it :-).

+
+
+
+
+ + lasi wrote on 2010-08-17 12:15: +
+
+

I haven't thought very much about it. But Zodb, durus or dobbin could be useful.

+
+
+
+
+ + Zeev wrote on 2010-08-17 12:26: +
+
+

portage, the official Gentoo Linux package manager, does package dependency resolution and can take a few seconds for large updates. It parses package metadata from text files.

+
+
+
+
+ + Peter Goodman wrote on 2010-08-17 12:31: +
+
+

You could run a program that determinizes a large NFA. Given an existing Python program that can determinize an NFA, you could give it an expanded version of the NFA on page 15 here: https://www.springerlink.com/content/cq16j1uv511g793g/fulltext.pdf. Another way is to take some complex NFAs, concatenate them, and determinize.

+
+
+
+
+ + Anonymous wrote on 2010-08-17 13:09: +
+
+

Bazaar and mercurial take a lot of memory (time as well) when updating/merging etc. large repositories, especially if they contain large files.

+
+
+
+
+ + Anonymous wrote on 2010-08-17 13:23: +
+
+

Pylint (https://www.logilab.org/project/pylint) could be a nice target. Pure Python, the size of the heap and run time depend on what kind of code you throw at it.

+
+
+
+
+ + VanL wrote on 2010-08-17 14:15: +
+
+

You could try loading and manipulating a large graph with NetworkX. Pure Python, and the size and runtime could be tuned by varying the size of the graph and the algorithms that are run.

+
+
+
+
+ + Unknown wrote on 2010-08-17 14:51: +
+
+ Whoosh comes to mind. People will always be grateful if you speed up search for them :) +
+
+
+
+ + Anonymous wrote on 2010-08-17 15:15: +
+
+

The CDPedia creates and manipulates its index with a pure-python inverted index implementation.

It could be extracted and made into a benchmark - there are other pure-python inverted indices around, those could also work.

They do tend to use lots and lots of memory, the CDPedia's implementation uses the builtin array module for byte sequence manipulation and bare strings as data store (it's highly optimized for lowering CPython's memory usage), but there are a few dict-heavy places yet.

+
+
+
+
+ + Anonymous wrote on 2010-08-17 16:32: +
+
+

Agreed that Bazaar and Mercurial would be interesting use cases, especially for projects with large revision history graphs.

Memory usage analysis has come up recently on the bzr list:
https://lists.ubuntu.com/archives/bazaar/2010q3/069549.html

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-08-17 16:33: +
+
+

All great ideas, thanks a lot!

+
+
+
+
+ + Anonymous wrote on 2010-08-17 17:42: +
+
+

Python Natural Language Toolkit
https://www.nltk.org/

Give a huge corpus (Wikipedia?) and do any operation on it -- nltk will take huge loads of memory in all kinds of custom objects, lists and tuples.

+
+
+
+
+ + Pingveno wrote on 2010-08-17 21:25: +
+
+

From what I understand, PyExcelerator, a writer/reader for Excel files, takes huge amounts of memory for very large files. It uses pure Python objects for each cell, which kills memory use when you're writing many millions of cells.

+
+
+
+
+ + Dan Stromberg wrote on 2010-08-17 22:53: +
+
+

A couple of possibilities from my own OSS code:

https://stromberg.dnsalias.org/~strombrg/treap

https://stromberg.dnsalias.org/~strombrg/pyindex.html



I'd most likely be happy to relicense the treap code as needed to facilitate inclusion. The pyindex code is under a UCI (I believe it's BSDish) license, and would probably need to remain so.

+
+
+
+
+ + Anonymous wrote on 2010-08-18 15:18: +
+
+

I really didn't think about it much, I'm just trying to chew through my RSS backlog, and ran into a post about pkgcore dealing with memory issues just a few minutes after I read this call for benchmarks.

Maybe you could use that.

+
+
+
+
+ + Anonymous wrote on 2010-08-18 23:45: +
+
+

You might want to look at MiniLight:

https://www.hxa.name/minilight/#comparison

+
+
+
+
+ + none wrote on 2010-08-20 13:40: +
+
+

I'm the author of a scientific application that can be suited to your needs. It runs both with Python 2.x and PyPy, so I bundled a distribution with some example benchmarks if this interests you: https://dl.dropbox.com/u/7931953/pypy-bench.tar.bz2 (see bench.README)

An interesting observation in my opinion is that on small runs, CPython outperforms PyPy but this progressively reverses on longer runs.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-08-20 14:44: +
+
+

@all: thanks for the proposals, I am looking at them.

@Franck: This is probably due to the JIT, which needs some time to compile at the beginning. Later, the assembler exists and executes quickly. Will look at your code, thanks for providing it.

+
+
+
+
+ + Anonymous wrote on 2010-08-20 18:58: +
+
+

Hello, I am the author of a chess program being written entirely in python. I haven't published it yet, because I am a bit ashamed of its poor quality. However it should suffice for the sole purpose of benchmarking. Please drop me a note if you are interested. My email address is: larudwer at freenet dot de

Some Notes:
The Program is just console mode (UCI), no gui.

it eats up all the memory you have

cpython is almost twice as fast as pypy-1.3 on this program and psyco accelerates it by another factor of two.

+
+
+
+
+ + Unknown wrote on 2010-08-21 17:33: +
+
+

You could consider Tahoe-LAFS. A good reason to use it is that it is a practicality-oriented, widely deployed tool with significant memory usage that we routinely spend engineering effort to track and manage.

Here are some graphs of the memory usage of different versions of Tahoe-LAFS over time:

32-bit machine:
https://tahoe-lafs.org/tahoe-figleaf-graph/hanford.allmydata.com-tahoe_memstats.html

64-bit machine:
https://tahoe-lafs.org/tahoe-figleaf-graph/hanford.allmydata.com-tahoe_memstats_64.html

Here are some open tickets about memory usage in our issue tracker:

https://tahoe-lafs.org/trac/tahoe-lafs/query?status=!closed&keywords=~memory&order=priority

The reason not to use Tahoe-LAFS as a subject is that it uses several native-code libraries for the CPU-intensive inner loops (cryptography, erasure coding). I really want those libraries, and hence Tahoe-LAFS, to be usable with cpyext as soon as possible, but I haven't tried and I assume that cpyext isn't 100% there yet.

By the way the easiest way to measure the performance of Tahoe-LAFS would be to run its unit tests and measure the memory usage and runtime. This is not only the easiest way, but it is also a pressing issue for us! Tahoe-LAFS unit tests take too long to run, and this causes problems for us, and we would very much like it if they could run to completion faster.

https://tahoe-lafs.org/trac/tahoe-lafs/ticket/20# unit tests take too long

Here are our buildbots showing unit test runtime among other things:

https://tahoe-lafs.org/buildbot/waterfall?show_events=true&builder=Kyle+OpenBSD-4.6+amd64&builder=hardy-amd64&builder=Arthur+lenny+c7+32bit&builder=Eugen+lenny-amd64&builder=David+A.+OpenSolaris+i386&builder=Ruben+Fedora&builder=Zooko+zomp+Mac-amd64+10.6+py2.6&builder=FreeStorm+WinXP-x86+py2.6&builder=tarballs

+
+
+
+
+ + Adam Sampson wrote on 2010-08-22 16:22: +
+
+

rawdog (disclosure of bias: I wrote it) sounds like it might be of use. It's an RSS aggregator that generates static HTML. Pure Python 2, with lots of string processing, mostly in the feedparser module. Memory usage and runtime depends on how many feeds it's reading and how much history it keeps, since it does everything in memory at the moment, using pickle for persistent state. (With my 800-odd feeds and two-month history, writing the entire store to HTML will use a few hundred meg of memory and run for several minutes.)

A future redesign will use a more sensible database-backed approach...

+
+
+
+
+ + Bob Ziuchkovski wrote on 2010-08-24 00:10: +
+
+

Scapy would be a great one to benchmark. Depending on the size of the packet capture, it can consume quite a bit of proc/mem when loading and dissecting large captures. I run it at work on Cpython and would love to see it running/optimized under pypy. The only problem is that I believe it uses some 2.6 pythonisms.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-08-27 19:03: +
+
+

Thanks again for all the additional pointers. Still investigating all of them.

+
+
+
+
+ + Anonymous wrote on 2010-09-05 05:01: +
+
+

How about Nucular, a search engine written in python by aaron watters.

https://nucular.sourceforge.net/

+
+
+
+
+ + Anonymous wrote on 2010-09-10 20:27: +
+
+

In my view, the natural competitors to PyPy (in the domain of fast interpreters for dynamic languages) are Tracemonkey and V8. Therefore, translations of the Sunspider, V8, and Dromaeo benchmarks would be appropriate.

+
+
+
+
+ + Anonymous wrote on 2010-09-17 14:58: +
+
+

bitbake looks like a good candidate. It's a derivative of portage, and used to crosscompile linux distro for embedded device.

With a non-trivial distro, it can use up to 400Mb. It already uses psyco if available, and it would be interesting to compare speed/memory usage with pypy.

+
+
+
+ +

PyOhio

+ +
+

This weekend I delivered a talk at PyOhio (an annual conference in Columbus, OH, USA) on PyPy and Unladen Swallow. The talk covered reasons that Python, the language, is hard to optimize, why CPython is slow, and a few optimizations that PyPy and Unladen Swallow have implemented. The slides from my talk are online, and the talk was recorded so a video will follow. I gave a similar talk to ChiPy (the Chicago Python user group), which was also recorded and the video is available. Both audiences were excited about the futures for PyPy and Unladen Swallow, and for the future of a faster Python.

+

Alex

+
+
+
+
+ + tucuman87 wrote on 2010-08-05 13:55: +
+
+

I do not understand why is python so hard to optimize- after all, LuaJIT is VERY fast, and I thought Lua has the same dynamical features as python. I'm no Python nor Lua expert, but it would be nice knowing...

Thanks!

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-08-05 13:57: +
+
+

Did you actually watch the video? it's explained there.

+
+
+
+
+ + Anonymous wrote on 2010-08-05 19:27: +
+
+

Any chance of putting the slides somewhere that can be directly downloaded?

+
+
+
+
+ + Alex wrote on 2010-08-05 19:33: +
+
+

There's a link to download the slides on the right hand side. This link: https://www.scribd.com/document_downloads/direct/35240506?extension=pdf&ft=1281033139&lt=1281036749&uahk=mAWsHOEi/etYRUUXWst+oYKiWIU
should also work.

+
+
+
+
+ + tucuman87 wrote on 2010-08-05 20:42: +
+
+

I've seen the video, and the question I asked is not answered: what dynamic feature Python has that Lua doesn't?

is such a specific feature responsible for the fast LuaJIT?

Thanks.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-08-05 20:47: +
+
+

The main meta-reason is that Python is a very complex language compared to lua, so you have to take into account a lot of things that you don't care about in lua. one example is new style classes with insane semantics about descriptors.

+
+
+
+
+ + Luis wrote on 2010-08-06 04:16: +
+
+

The author of Luajit, Mike Pall, participated in a long thread posted here https://lambda-the-ultimate.org/node/3851#comment-57700 , as well as Maciej and others.

There, he said about python:
"Python (the core language) just has different quirks that need to be worked around. But no show stoppers.
What is more challenging, is to efficiently handle Python (the programming environment) with all of its classes and methods. In particular the plethora of container types, since you really want to inline their accessors.
Since I don't believe in multi-language VMs, LuaJIT is pretty Lua-specific on all levels. Your best bet right now is to join the PyPy effort (they already handle the issues I mentioned)."

+
+
+
+ +

Using virtualenv with PyPy

+ +
+

Thanks to the work that was recently done on the sys-prefix branch, it is +now possible to use virtualenv with PyPy.

+

To try it, you need:

+
+
    +
  • a recent version of PyPy: PyPy 1.3 does not contain the necessary logic to +work with virtualenv, so you need a more recent PyPy from subversion +trunk. You can either build it by yourself or download one of our +precompiled nightly builds +
  • +
  • a copy of virtualenv-pypy: this is a fork of virtualenv that contains +all the patches needed to work with PyPy, and hopefully will be merged +back at some point. It should be totally compatible with the official +version of virtualenv, so it is safe to use it even to create non-PyPy +environments. If you notice some weird behavior that does not happen with +the standard virtualenv, please let us know.
  • +
+
+

The directory layout has been redesigned in a way that it is possible to use +virtualenv to install a PyPy both from a precompiled tarball or from an svn +checkout:

+
+# from a tarball
+$ virtualenv -p /opt/pypy-c-jit-76426-linux/bin/pypy my-pypy-env
+
+# from the svn checkout
+$ virtualenv -p /path/to/pypy-trunk/pypy/translator/goal/pypy-c my-pypy-env
+
+

Once the environment has been created, you can enter it as usual. Note that +bin/python is now a symlink to bin/pypy.

+

Enjoy it :-)

+
+
+
+
+ + René Dudfield wrote on 2010-08-02 17:13: +
+
+

Another great step for pypy being used in more productions.

+
+
+
+
+ + Konrad wrote on 2010-08-03 17:50: +
+
+

Good job!

+
+
+
+
+ + Alexei Boronine wrote on 2010-08-05 16:22: +
+
+

I recently made a script called pypyenv for easily installing PyPy in a virtualenv side by side with CPython, sharing site-packages. It will allow one to experiment with PyPy in a working virtualenv without breaking current code.

+
+
+
+
+ + Antonio Cuni wrote on 2010-08-05 19:16: +
+
+

@Alex: nice. pypyenv is obviously something different than virtualenv-pypy, but it might be useful if someone wants to try PyPy.

However, I don't think that sharing site-packages is a good idea: it works as long as you have only pure python packages, but it stops as soon as you build some C extension, as the .so produced by PyPy are incompatible with CPython

+
+
+
+
+ + Alexei Boronine wrote on 2010-08-06 00:58: +
+
+

@Antonio

Interesting point, I'll mention it in the README. (by the way, thank you for your work, PyPy rocks my socks!)

+
+
+
+
+ + Xiong Chiamiov wrote on 2010-08-11 23:31: +
+
+

A bit unrelated, but would it be possible to have nightly releases with consistent filenames? Right now, they all include the svn revision number (I assume that's what it is), which makes it difficult to write a script that downloads and installs the latest version.

Specifically, I'm looking to create an Arch pkgbuild, because it takes too damn long to compile on my notebook, and I don't want to use the stable release.

+
+
+
+
+ + jezdez wrote on 2010-09-03 15:10: +
+
+

FYI, the fixes have been merged in virtualenv's main repo.

+
+
+
+
+ + Anonymous wrote on 2011-12-14 10:28: +
+
+

it seems there's no pypy/translator/goal/pypy-c anymore?
how to init virtualenv from pypy source now?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-12-14 10:31: +
+
+

It's not in a checkout; you have to compile it first.

+
+
+
+
+ + Asmo wrote on 2012-04-20 12:47: +
+
+

Is the information here still valid? Or should virtualenv work fine with pypy?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-04-20 12:50: +
+
+

Virtualenv should work just fine

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-16.html b/blog/index-16.html new file mode 100644 index 000000000..a62b55188 --- /dev/null +++ b/blog/index-16.html @@ -0,0 +1,2092 @@ + + + + + + +PyPy (old posts, page 16) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Leysin Winter sprint

+ +
+

Hi all,

+ + + + +
+ +

The next sprint will be in Leysin, Switzerland, during the week of the 16th-22nd of January 2011.

+ +

Now that we have released 1.4, and plan to release 1.4.1 soon, the sprint is going to be mainly working on fixing issues reported by various users. Of course this does not prevent people from showing up with a more precise interest in mind.

+ +

As usual, the break day on the sprint will likely be a day of skiing :-)

+ +

Hoping to see you there.

+ +
+ +
+

Update: there are actually a number of branches that we want to polish and merge into trunk: at least fast-forward, jit-unroll-loops, arm-backend and jitypes2. For more details, see the announcement.

+
+
+
+
+ + Victor wrote on 2010-12-10 10:24: +
+
+

Armin,
Don't forget to ping pypy-sprint about this to avoid people getting confused again ;)

+
+
+
+
+ + Armin Rigo wrote on 2010-12-10 16:28: +
+
+

Victor: sorry, I don't get you. Do you mean, to tell people about the updates I did to the blog post? Or just to send the announcement to pypy-sprint too (I only sent it to pypy-dev so far)?

+
+
+
+
+ + Victor wrote on 2010-12-10 22:12: +
+
+

I meant just to send the announcement to pypy-sprint too. IIRC, someone looked there for the last sprint and got confused by the announcement of a sprint in the same month of last year.

+
+
+
+ +

PyPy 1.4 release aftermath

+ +
+

A couple days have passed since the announcement of the 1.4 release, and this +is a short summary of what happened afterwards. Let's start with +numbers:

+
    +
  • 16k visits to the release announcement on our blog
  • +
  • we don't have download statistics unfortunately
  • +
  • 10k visits to speed center +
  • +
  • most traffic comes from referring sites, reddit alone creating above a third +of our traffic
  • +
+

Not too bad for a project that doesn't have a well-established user base.

+

Lessons learned:

+
    +
  • Releases are very important. They're still the major way projects communicate +with community, even if we have nightly builds that are mostly stable.
  • +
  • No segfaults were reported, no incompatibilities between JIT and normal +interpretation. We think that proves (or at least provides a lot of +experimental evidence) that our write-once-and-then-transform method is +effective.
  • +
  • A lot of people complained about their favorite module in C not working, we +should have made it clearer that CPyExt is in alpha state. Indeed, we +would like to know which C extension modules do work :-).
  • +
  • Some people reported massive speedups, others reported slowdowns compared +to CPython. Most of those slowdowns relate to modules being inefficient +(or doing happy nonsense), like ctypes. This is expected, given that +not all modules are even jitted (although having them jitted is usually +a matter of a couple of minutes).
  • +
  • Nobody complained about a lack of some stdlib module. We implemented the ones +which are used more often, but this makes us wonder if less used stdlib modules +have any users at all.
  • +
+

In general feedback has been overwhelmingly positive and we would like to +thank everyone trying (and especially those reporting problems)

+

Cheers,
+fijal

+
+
+
+
+ + Bryan Murphy wrote on 2010-12-01 14:26: +
+
+

I love what you guys are doing. Keep up the good work!

+
+
+
+
+ + Leonardo Santagada wrote on 2010-12-01 17:07: +
+
+

There was a complaint about the lack of ssl module by someone trying to use pg8000 with pypy. I wonder if pypy should focus on openssl or on the ssl module.

+
+
+
+
+ + Anonymous wrote on 2010-12-01 20:47: +
+
+

pyglet actually seems to use the MacOS module to create some windows.

+
+
+
+
+ + Paul Boddie wrote on 2010-12-02 00:16: +
+
+

I'm very impressed with what you've all achieved!

I've been testing PyPy 1.4 with some code I'm working on which only depends on two pure-Python non-stdlib libraries, and although the result was a 50% longer running time than with Python 2.5, it's remarkable that the code behaves in the same way and produces the same results. When trying to produce a fully compatible implementation of something, it's no trivial task to get identical behaviour (even though I'm not really using "exotic" or "frivolous" language features): some corner case usually comes along and makes things difficult. To see a non-trivial example work just like "normal Python" is surely evidence that PyPy is ready for a wider audience.

As for my code, I've been doing some profiling generally - it uses things like the array and bisect modules substantially - and will attempt to see how profile-directed improvements affect PyPy's performance.

Keep up the good work!

+
+
+
+
+ + The Cannon Family wrote on 2010-12-03 07:11: +
+
+

A lot of the standard library looks like it has volumes of _legacy_ code depending on it, even if the current bleeding edge people use it less. In my mind supporting essentially all the standard library is a good long term goal, but as pointed out, parts of it can wait. Eventually I would like to see Tkinter support, and I would surmise that it is the most used of the stuff that is not implemented. We use it in a couple items (+/- 10% of total code, not likely to change). I would guess that the situations where these obscure parts of the standard library are being used are the parts where speed is maybe not the most important thing, supporting an existing workflow or parsing legacy data is the key.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-12-05 08:09: +
+
+

@The Cannon Family

The question is why those legacy people would move to PyPy? PyPy is bleeding edge in a way.

Besides a lot of those modules are like audioop or ossaudiodev. I don't see legitimate usecase for those, even in legacy code.

+
+
+
+
+ + Richard Jones wrote on 2010-12-09 04:02: +
+
+

I'm very, very impressed and can't wait to use pypy in a real project. I'm blocked at the moment because I need pyglet on OS X (no MacOS module).

I gave an introduction to cython at the local Python user group and for a lark I ran the original pure-Python code up against the cython version.

cpython: 1.4s
cython: 0.2s
pypy: 0.2s

Hmm :-)

+
+
+
+
+ + Xavier Combelle wrote on 2010-12-15 16:36: +
+
+

I don't know how it is representative but for this usecase
there is a factor 7 between pypy and cpython 2.7

cpython 2.7
>>> timeit.Timer('sum(x for x in xrange(100000))').repeat(10,100)
[1.3338480523322112, 1.5916376967269201, 1.5959533140645483, 1.8427266639818676,
1.3473615220676294, 1.842070271069737, 1.3346074032759319, 1.5859678554627408,
1.8533299541683306, 1.5872797264355398]

pypy 1.4
>>>> timeit.Timer('sum(x for x in xrange(100000))').repeat(10,100)
[7.5079355199007978, 7.9444552948765477, 7.2710836043080178, 7.5406516611307666,
7.5192312421594352, 7.4927645588612677, 7.5075613773735768, 7.5201248774020826,
7.7839006757141931, 7.5898334809973278]

but maybe it is not representative

+
+
+
+ +

We are not heroes, just very patient

+ +
+

Inspired by some of the comments to the release that said "You are heroes", I thought a bit about the longish history of PyPy and hunted around for some of the mailing list posts that started the project. Then I put all this information together into the following timeline:

+ +

There is also a larger version of the timeline. Try to click on some of the events, the links usually go to the sprint descriptions. I also tried to find pictures for the sprints but succeeded for only half of them, if anybody still has some, I would be interested. It's kind of fun to browse around in some of the old sprint descriptions to see how PyPy evolved. Some of the current ideas have been around for a long time, some are new. In the description of the releases I put estimates for the speed of the release.

+
+
+
+
+ + Symbol wrote on 2010-12-01 14:37: +
+
+

Many promising projects bite the dust not due to lack of talent, interest, need or support, but perseverance.

Not only do I believe that pypy has yet to realize its full potential, I believe that it will actually achieve it. And then some.

So again, keep up the good work!!
p.s
(my flattr account is not yet operational ;-<)

+
+
+
+
+ + Symbol wrote on 2010-12-01 14:41: +
+
+

Question,
What do the funds(EU, eurostars) cover?

I see that there had been a burst of activity during the EU period.

Does this mean that funding is a bottleneck to this project? Would the end of the current eurostars funding be an obstacle?

+
+
+
+
+ + Skandalfo wrote on 2010-12-01 19:16: +
+
+

Then you are just very patient heroes :-D

+
+
+
+
+ + holger krekel wrote on 2010-12-01 22:16: +
+
+

Sure, funding does make a difference. There are couple of people currently (Anto, Armin, Carl Friedrich, partially Maciej, me ...) who get some money through the Eurostars project. This does make a difference in terms of how much time can be devoted. I guess there should be a clarifying blog post on this and maybe also some opinions and views on how things can continue after the funding period (which ends second half next year).

+
+
+
+
+ + Nik Haldimann wrote on 2010-12-02 03:28: +
+
+

Amazing how far you have come. Congrats!

I found myself in 3 of those old sprint pictures, and I remember all of them as very good times that overall probably taught me more than the school I was attending during that time.

+
+
+
+
+ + glyph wrote on 2010-12-02 07:21: +
+
+

This timeline sort of makes the point. You are heroes ;). Patience is harder than a few nights of crazy hacking and brilliant ideas.

+
+
+
+
+ + Martijn Faassen wrote on 2010-12-10 12:21: +
+
+

Yeah, you have more heroic patience than I tended to display
cheerleading/criticizing the project.

Currently there's nothing left to criticize for me -- I think everything's being done pretty much right (communication, releases, even work on C-module support!).

But that might change once I start to use the project seriously. :)

+
+
+
+ +

PyPy 1.4: Ouroboros in practice

+ +
+

We're pleased to announce the 1.4 release of PyPy. This is a major breakthrough +in our long journey, as PyPy 1.4 is the first PyPy release that can translate +itself faster than CPython. Starting today, we are using PyPy more for +our every-day development. So may you :) You can download it here:

+

https://pypy.org/download.html

+
+

What is PyPy

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement +for CPython. It is fast (pypy 1.4 and cpython 2.6 comparison).

+

New Features

+

Among its new features, this release includes numerous performance improvements +(which made fast self-hosting possible), a 64-bit JIT backend, as well +as serious stabilization. As of now, we can consider the 32-bit and 64-bit +linux versions of PyPy stable enough to run in production.

+

Numerous speed achievements are described on our blog. Normalized speed +charts comparing pypy 1.4 and pypy 1.3 as well as pypy 1.4 and cpython 2.6 +are available on the benchmark website. For the impatient: yes, we got a lot faster!

+
+
+

More highlights

+
    +
  • PyPy's built-in Just-in-Time compiler is fully transparent and +automatically generated; it now also has very reasonable memory +requirements. The total memory used by a very complex and +long-running process (translating PyPy itself) is within 1.5x to +at most 2x the memory needed by CPython, for a speed-up of 2x.
  • +
  • More compact instances. All instances are as compact as if +they had __slots__. This can give programs a big gain in +memory. (In the example of translation above, we already have +carefully placed __slots__, so there is no extra win.)
  • +
  • +Virtualenv support: now PyPy is fully compatible with virtualenv: note that +to use it, you need a recent version of virtualenv (>= 1.5).
  • +
  • Faster (and JITted) regular expressions - huge boost in speeding up +the re module.
  • +
  • Other speed improvements, like JITted calls to functions like map().
  • +
+

Cheers,
+Carl Friedrich Bolz, Antonio Cuni, Maciej Fijalkowski, +Amaury Forgeot d'Arc, Armin Rigo and the PyPy team

+
+
+
+
+
+ + ipc wrote on 2010-11-26 18:42: +
+
+

congratulations!

+
+
+
+
+ + why wrote on 2010-11-26 18:47: +
+
+

This is unacceptable. Christmas is not until next month!!!

+
+
+
+
+ + Tim Parkin wrote on 2010-11-26 19:09: +
+
+

Massive congratulations - exciting!

+
+
+
+
+ + Unknown wrote on 2010-11-26 19:18: +
+
+

Sweet! Keep up the great work !

+
+
+
+
+ + Anonymous wrote on 2010-11-26 19:41: +
+
+

Woohoo!!

+
+
+
+
+ + Martijn Faassen wrote on 2010-11-26 20:07: +
+
+

Awesome!

+
+
+
+
+ + Anonymous wrote on 2010-11-26 20:59: +
+
+

Hip hip hooooraaaay!!!!

+
+
+
+
+ + ipc wrote on 2010-11-26 22:51: +
+
+

all I want for Christmas is stackless support in a 64-bit pypy-c-jit :) 'two greenlets switching and a partridge in a pear tree!'

+
+
+
+
+ + Unknown wrote on 2010-11-26 23:14: +
+
+

Congratulations. I hope the PPA is going to be updated soon. Too lazy to build it myself, right now. (:

+
+
+
+
+ + Paul Boddie wrote on 2010-11-26 23:29: +
+
+

Is there a -j <number-of-cores> option for the translation process? It's a bit unfortunate that 15 cores on the nice machine I'm using can't be put to use making it translate faster. (Or unfortunate that I didn't read the documentation, maybe.)

+
+
+
+
+ + ipc wrote on 2010-11-26 23:54: +
+
+

--make-jobs=N; only some parts of the translation process are parallel.

+
+
+
+
+ + Anonymous wrote on 2010-11-27 00:10: +
+
+

Eta until numpy scipy?

+
+
+
+
+ + Paul Boddie wrote on 2010-11-27 01:00: +
+
+

The report of 2.4GB usage on x86-64 is accurate, but it took about 7800s on a 2.33GHz Xeon. Next time I'll try and exercise some of the other cores, though.

+
+
+
+
+ + Anonymous wrote on 2010-11-27 04:54: +
+
+

so pypy on average is now about 2x faster than cpython?

and unladen swallow's goal was being 5x faster? was that totally unrealistic?

+
+
+
+
+ + Leonard Ritter wrote on 2010-11-27 10:59: +
+
+

You are my heroes!

+
+
+
+
+ + Symbol wrote on 2010-11-27 11:37: +
+
+

Just Awesome!!!

KUTGW!

+
+
+
+
+ + Daivd wrote on 2010-11-27 12:02: +
+
+

Does this release include the -free branch that was mentioned in the previous post? The 2x memory requirements lead me to believe so.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-27 13:45: +
+
+

@Daivd
yes, it does

@Anonymous
5x improvement is not a well defined goal, however it's a good marketing thing. PyPy is 2x faster on translation, 60x faster on some benchmarks while slower on others. What does it mean to be 5x faster?

+
+
+
+
+ + Christian S. Perone wrote on 2010-11-27 14:23: +
+
+

Sounds great, great work, great thanks !

+
+
+
+
+ + scientist wrote on 2010-11-27 14:34: +
+
+

Do you know why the purely numerical benchmarks nbody and spectral-norm are still so much slower in PyPy compared to e.g. LuaJIT?

+
+
+
+
+ + tobami wrote on 2010-11-27 14:44: +
+
+

This is awesome. PyPy 1.4 addresses the 2 slowest benchmarks, slowspitfire and spambayes. There is no benchmark anymore where PyPy is much slower than CPython.

To me, this marks the first time you can say that PyPy is ready for general "consumption". Congratulations!

PS: The best comparison to appreciate how much of an improvement 1.4 has been is:
https://speed.pypy.org/comparison/?exe=2%2B35,1%2B41,1%2B172&ben=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20&env=1&hor=false&bas=2%2B35&chart=normal+bars

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-27 17:37: +
+
+

@scientist

Sure, because LuaJIT is crazy when it comes to optimizations :-) We'll get there eventually, but purely numerical stuff is not as high on our list as other things.

+
+
+
+
+ + Luis wrote on 2010-11-27 18:37: +
+
+

@maciej: in an old thread (have tracing compilers won?) you replied to Mike Pall saying that pypy was in a way middle ground, that it didn't offer as much opportunities for micro optimizations as luajit.

You were discussing about keeping high level constructions from the user program to perform more tricks.

Has the situation changed?
Do you really think now that you'll get there?

Anyway, let me tell you that you are all already my super heroes :-)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-27 18:46: +
+
+

Heh, I don't remember that :-)

Anyway, LuaJIT has more options for micro-optimizations simply because Lua is a simpler language. That doesn't actually make it impossible for PyPy, it simply makes it harder and more time-consuming (but it's still possible). I still think we can get (but predicting future is hard) where LuaJIT is right now, but racing Mike would be a challenge that we might lose ;-)

That said, even in simple loops there are obvious optimizations to be performed, so we're far from being done. We're going there, but it's taking time ;-)

+
+
+
+
+ + Victor wrote on 2010-11-27 19:33: +
+
+

Congrats to all PyPy developers for making huge contributions to Python performance, JIT and implementation research and delivering an end product that will help many developers to get more done.

IIUC, we still have ARM, jit-unroll-loops, more memory improvements, Python 2.7 (Fast Forward branch) and a bunch of other cool improvements in the works, besides some known interesting targets that will eventually be tackled (e.g. JITted stackless).

I wish more big Python apps and developers would play with PyPy and report the results.

Cheers!

P.S.: Fijal: see https://lambda-the-ultimate.org/node/3851#comment-57715

+
+
+
+
+ + Michal M. wrote on 2010-11-29 18:55: +
+
+

Congratulations.
However, you suggest people used it in production environment - please, give us version compatible at least with CPython 2.6.
I hope that you plan it but at first you wanted to have stable and fast base. :)

+
+
+
+
+ + Amaury Forgeot d'Arc wrote on 2010-12-01 22:21: +
+
+

@Michal:
There is already an ongoing effort to port PyPy to Python 2.7.

But we need some help! It's a good way to become a PyPy developer.
And no, you don't have to be a JIT expert to implement itertools.combinations or asian codecs.

+
+
+
+
+ + Anonymous wrote on 2011-02-09 00:18: +
+
+

kudos to whip-smart guys for this wonderful piece of software.

+
+
+
+ +

Improving Memory Behaviour to Make Self-Hosted PyPy Translations Practical

+ +
+

In our previous blog post, we talked about how fast PyPy can translate +itself compared to CPython. However, the price to pay for the 2x speedup was +a huge amount of memory: actually, it was so huge that a standard -Ojit +compilation could not be completed on 32-bit because it required more than the +4 GB of RAM that are addressable on that platform. On 64-bit, it consumed +8.3 GB of RAM instead of the 2.3 GB needed by CPython.

+

This behavior was mainly caused by the JIT, because at the time we wrote the +blog post the generated assembler was kept alive forever, together with some +big data structure needed to execute it.

+

In the past two weeks Anto and Armin attacked the issue in the jit-free +branch, which has been recently merged to trunk. The branch solves several +issues. The main idea of the branch is that if a +loop has not been executed for a certain amount of time (controlled by the new +loop_longevity JIT parameter) we consider it "old" and no longer needed, +thus we deallocate it.

+

(In the process of doing this, we also discovered and fixed an +oversight in the implementation of generators, which led to generators being +freed only very slowly.)

+

To understand the freeing of loops some more, let's look at how many loops are +actually created during a translation. +The purple line in the following graph shows how many loops (and bridges) are +alive at any point in time with an infinite longevity, which is equivalent to +the situation we had before the jit-free branch. By contrast, the blue +line shows the number of loops that you get in the current trunk: the +difference is evident, as now we never have more than 10000 loops alive, while +previously we got up to about 37000 ones. The time on the X axis is expressed +in "Giga Ticks", where a tick is the value read out of the Time Stamp Counter +of the CPU.

+ + + +

The grey vertical bars represent the beginning of each phase of the +translation:

+
    +
  • +annotate performs control flow graph construction and type inference.
  • +
  • +rtype lowers the abstraction level of the control flow graphs with types to that of C.
  • +
  • +pyjitpl constructs the JIT.
  • +
  • +backendopt optimizes the control flow graphs.
  • +
  • +stackcheckinsertion finds the places in the call graph that can overflow the C stack and inserts checks that raise an exception instead.
  • +
  • +database_c produces a database of all the objects the C code will have to know about.
  • +
  • +source_c produces the C source code.
  • +
  • +compile_c calls the compiler to produce the executable.
  • +
+

You can nicely see how the number of alive graphs drops shortly after the +beginning of a new phase.

+

Those two fixes, freeing loops and generators, improve the memory usage greatly: +now, translating PyPy +on PyPy on 32-bit consumes 2 GB of RAM, while on CPython it consumes 1.1 GB. +This result can even be improved somewhat, because we are not actually freeing +the assembler code itself, but +only the large data structures around it; we can consider it as a residual +memory leak of around 150 MB in this case. This will be fixed in the +jit-free-asm branch.

+

The following graph shows the memory usage in more detail:

+
+
    +
  • the blue line (cpython-scaled) shows the total amount of RAM that the +OS allocates for CPython. Note that the X axis (the time) has been +scaled down so that it spans as much as the PyPy one, to ease the +comparison. Actually, CPython took more than twice as much time as PyPy to +complete the translation
  • +
  • the red line (VmRss) shows the total amount of RAM that the +OS allocates for PyPy: it includes both the memory directly handled by +our GC and the "raw memory" that we need to allocate for other tasks, such +as the assembly code generated by the JIT
  • +
  • the brown line (gc-before) shows how much memory is used by the GC +before each major collection
  • +
  • the yellow line (gc-after) shows how much memory is used by the GC +after each major collection: this represents the amount of memory which is +actually needed to hold our Python objects. The difference between +gc-before and gc-after (the GC delta) is the amount of memory that the GC +uses before triggering a new major collection
  • +
+
+ + + +

By comparing gc-after and cpython-scaled, we can see that PyPy +uses mostly the same amount of memory as CPython for storing the application +objects (due to reference counting the memory usage in CPython is always very +close to the actually necessary memory). The extra memory +used by PyPy is due to the GC delta, to the machine code generated by the JIT +and probably to some other external effect (such as e.g. Memory +Fragmentation).

+

Note that the GC delta can be set arbitrarily low (another recent addition -- +the default value depends on the actual RAM on your computer; it probably +works to translate if your computer has precisely 2 GB, because in this +case the GC delta and thus the total memory usage will be somewhat +lower than reported here), but the cost is to have more +frequent major collections and thus a higher run-time overhead. The same is +true for the memory needed by the JIT, which can be reduced by telling the JIT +to compile less often or to discard old loops more frequently. As often +happens in computer science, there is a trade-off between space and time, and +currently for this particular example PyPy runs twice as fast as CPython by +doubling the memory usage. We hope to improve even more on this trade-off.

+

On 64-bit, things are even better, as shown by the following graph:

+ + + +

The general shape of the lines is similar to the 32-bit graph. However, the +relative difference to CPython is much better: we need about 3 GB of RAM, just +24% more than the 2.4 GB needed by CPython. And we are still more than 2x +faster!

+

The memory saving is due (partly?) to the vtable ptr optimization, which is +enabled by default on 64-bit because it has no speed penalty (see +Unifying the vtable ptr with the GC header).

+

The net result of our work is that now translating PyPy on PyPy is practical +and takes less than 30 minutes. It's impressive how quickly you get used to +translation taking half the time -- now we cannot use CPython any more for that +because it feels too slow :-).

+
+
+
+
+ + crncosta wrote on 2010-11-26 16:29: +
+
+

Big huge improvement since last post. Kudos!! :-)

Please don't get me wrong, but I need to ask: is there any plan to merge pypy into cPython (or even replace it)?

BTW, I'm following the blog (please, keep this regular posts) and planing to make a donation to support your next sprint due to the regular and very well done work.

congratulations again.

+
+
+
+
+ + Martijn Faassen wrote on 2010-11-26 17:09: +
+
+

This is incredibly cool! Congrats!

+
+
+
+
+ + Leonardo Santagada wrote on 2010-11-26 17:38: +
+
+

This is amazing. It was kind of a let down when you reported it used too much memory. But now I can on my laptop translate pypy in 32 and 64 bits using pypy itself :)

The world is good again :)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-26 17:48: +
+
+

@crncosta

There are no plans for merging PyPy to CPython. I don't think "replacing" is a good word, but you can use PyPy for a lot of things already, so it is a viable Python implementation together with CPython.

+
+
+
+
+ + Luis wrote on 2010-11-26 18:20: +
+
+

I am curious... Has there ever been interest from Google to sponsor this project?
I know about unladen swallow, but has anyone there expressed interest in using pypy somewhere in their organization?

Sorry for the off topic question...

+
+
+
+
+ + Peter wrote on 2010-11-26 18:28: +
+
+

Always fascinating to read about the work you're doing. Please keep posting, and keep up the good work. You really are heroes.

+
+
+
+
+ + WI wrote on 2010-11-27 01:09: +
+
+

Wow! this is great news.. keep us posted on what other developments you have.

+
+
+
+
+ + Anonymous wrote on 2010-11-27 05:02: +
+
+

like luis i am also curious about why google doesn't show a lot more interest in pypy. unladen swallow didn't really work out or did it?

+
+
+
+
+ + Symbol wrote on 2010-11-27 11:37: +
+
+

Kudos!

KUTGW!!

+
+
+
+
+ + Anonymous wrote on 2010-11-28 08:33: +
+
+

Congrats!

+
+
+
+ +

Running large radio telescope software on top of PyPy and twisted

+ +
+

Hello.

+

As some of you already know, I've recently started working on a +very large radio telescope at SKA South Africa. This telescope's +operating software runs almost exclusively on Python (several high throughput +pieces are in C or CUDA or directly executed by FPGAs). Some cool telescope pictures:

+ +
+ +
+
+ +
+ +

(photos courtesy of SKA South Africa)

+

Most of the operation software is using the KatCP protocol to talk between devices. +The currently used implementation is Open Source software with a custom home built +server and client. As part of the experiments, I've implemented a Twisted based +version and run it on top of CPython and PyPy for both the default +implementation and the one based on Twisted to see how those perform.

+

There are two testing scenarios: the first one is trying to saturate the connection +by setting up multiple sensors that report state every 10ms, the second one +is measuring a round-trip between sending a request and receiving the response. +Both numbers are measuring the number of requests per 0.2s, so the more the better. The X axis shows the number of simultaneously connected clients.

+

All benchmark code is available in the KatCP repository.

+

The results are as follows:

+ +
+ +
+
+ +
+

As you can see, in general Twisted has larger overhead for a single client +and scales better as the number of clients increases. That is, I think, expected, +since Twisted has extra layers of indirection. The round trip degradation of +Twisted has to be investigated, but for us scenario1 is by far more important.

+

All across the board PyPy performs much better than CPython for both +Twisted and a home-made solution, which I think is a pretty good result.

+

Note: we didn't roll this set up into production yet, but there are high +chances for both twisted and PyPy to be used in some near future.

+

Cheers, +fijal

+
+
+
+
+ + Anonymous wrote on 2010-11-16 01:56: +
+
+

Why not try PyZmq (https://www.zeromq.org/bindings:python):)? the IPython project(https://ipython.scipy.org/moin/) is also moving from Twisted
to PyZmq.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-16 06:23: +
+
+

Sorry this is not an appropriate forum to discuss this. One of the reasons would be that Twisted and PyZmq are doing two completely different things and PyZmq won't work on PyPy.

+
+
+
+
+ + Michal M. wrote on 2010-11-16 07:59: +
+
+

Oh, I envy you. And congratulations.
Keep working.
I wait for 2.6 compatible ver. of PyPy to try it with my little project.

A widząc, że prawdopodobnie rodak, to tym bardziej się cieszę.

+
+
+
+
+ + glyph wrote on 2010-11-16 09:13: +
+
+

Maciej, this is great news. Congratulations.

I look forward to making PyPy+Twisted even faster from the Twisted side :).

+
+
+
+
+ + Alessio Sclocco wrote on 2010-11-17 14:00: +
+
+

Hi Maciej,

You say that there you are mostly using Python and sometimes C, CUDA or FPGAs.
I am writing my master thesis in the Netherlands, it is about the efficient implementation of a beam forming algorithm (the one used by the LOFAR) on modern GPUs using CUDA and OpenCL. Do you have some papers or other material there about the telescope software ? I would be really interested on citing it on the related works part.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-17 16:26: +
+
+

Hey Alessio. I think this blog is not really a good medium for 2-way communication feel free to come to #pypy on irc.freenode.net or write to me directly at fijall at gmail.

In general, we don't want beam forming to be performed on GPU (because it's hard), but rather on custom-built hardware and FPGAs.

+
+
+
+
+ + Anonymous wrote on 2010-11-21 10:40: +
+
+

I have a program using Python and Twisted where I load tested both server and client connections (the program can do both the server and client protocol). I tested both types out to 100 connections (at 50 milli-second polling intervals) while measuring CPU load.

What I found was that when acting as a server it scaled fairly linearly. When acting as the client side however, load rose to a peak about 60 clients, then fell by a third until 80 clients, and then rose again until at 100 clients it reached the same load level as at 60. If you have a similar situation you may need to watch out for this phenomenon.

I also found that using the epoll reactor on Linux made a *big* difference to capacity in my applications, much more so than any normal program optimization efforts that I made. I have multiple clients and multiple server ports all running simultaneously, so I'm not sure how this may translate to your application if you are only using Twisted as a server.

Here's a link to my project web site where I show the connections versus CPU load chart (first chart):

https://mblogic.sourceforge.net/mblogichelp/general/Capacity-en.html

I haven't tested this with PyPy as I don't have a combination of anything that is both 32-bit *and* new enough to run a recent version.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-21 12:16: +
+
+

PyPy has 64bit support these days.

+
+
+
+
+ + Anonymous wrote on 2010-11-22 20:28: +
+
+

I also made the previous anonymous post on the 21st. I haven't been able to get the 64 bit JIT version to run or build. That may be my fault, but I haven't been able to test it (this isn't a problem that I want to waste your time on however).

I have tested the non-JIT Pypy using a simplified version of my server and client programs, using asyncore instead of Twisted. The server and client use a standard industrial automation protocol to talk to each other over a TCP socket. The programs also make heavy use of list slicing and struct.

The non-JIT version passes all the tests I have for the server, and runs my application performance test at roughly 1/3 the speed of CPython 2.6. This is very impressive, as I have never been able to get either IronPython (on Mono) nor Jython to even run the programs, let alone pass my functional tests. The fact that Pypy (non-JIT) can run these programs perfectly without changes is something that I find very promising.

Please continue the good work, and thank you for what you've done so far!

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-23 08:13: +
+
+

Hey, great to hear!

Well, the non-JIT version would rather be slow, but that's fine :) We try very hard to produce a compliant python interpreter and twisted folk helped us greatly with getting all the posix stuff right.

Cheers,
fijal

+
+
+
+
+ + Unknown wrote on 2010-11-26 23:53: +
+
+

I would be very interested if someone could provide some info on how to get twisted working on pyp. I have managed to install twisted in the pypy setup but starting it produces:
AttributeError: 'module' object has no attribute 'load_dynamic'
coming from zope

+
+
+
+ +

Efficiently Implementing Python Objects With Maps

+ +
+

As could be foreseen by my Call for Memory Benchmarks post a while ago, I am +currently working on improving the memory behaviour of PyPy's Python +interpreter. In this blog post I want to describe the various data a Python +instance can store. Then I want to describe how a branch that I did and that was +recently merged implements the various features of instances in a very +memory-efficient way.

+
+

Python's Object Model

+

All "normal" new-style Python instances (i.e. instances of subclasses of object +without added declarations) store two (or possibly three) kinds of information.

+
+

Storing the Class

+

Every instance knows which class it belongs to. This information is accessible +via the .__class__ attribute. It can also be changed to other (compatible +enough) classes by writing to that attribute.

+
+
+

Instance Variables

+

Every instance also has an arbitrary number of attributes stored (also called +instance variables). The instance variables used can vary per instance, which is +not the case in most other class-based languages: traditionally (e.g. in +Smalltalk or Java) the class describes the shape of its instances, +which means that the +set of admissible instance variable names is the same for all instances of a +class.

+

In Python on the other hand, it is possible to add arbitrary attributes to an +instance at any point. The instance behaves like a dictionary mapping attribute +names (as strings) to the attribute values.

+

This is actually how CPython implements instances. Every instance has a +reference to a dictionary that stores all the attributes of the instance. This +dictionary can be reached via the .__dict__ attribute. To make things more +fun, the dictionary can also be changed by writing to that attribute.

+
+
+

Example

+

As an example, consider the following code:

+
class A(object):
+    pass
+
+instance1 = A()
+instance1.x = 4
+instance1.y = 6
+instance1.z = -1
+
+instance2 = A()
+instance2.x = 1
+instance2.y = 2
+instance2.z = 3
+
+

These two instances would look something like this in memory:

+ +

(The picture glosses over a number of details, but it still shows the essential +issues.)

+

This way of storing things is simple, but unfortunately rather inefficient. Most +instances of the same class have the same shape, i.e. the same set of instance +attribute names. That means that the key part of all the dictionaries is +identical (shown grey here). Therefore storing that part repeatedly in all +instances is a waste. In addition, dictionaries are themselves rather large. +Since they are typically implemented as hashmaps, which must not be too full to +be efficient, a dictionary will use something like 6 words on average per key.

+
+
+

Slots

+

Since normal instances are rather large, CPython 2.2 introduced slots, to make +instances consume less memory. Slots are a way to fix the set of attributes an +instance can have. This is achieved by adding a declaration to a class, like +this:

+
class B(object):
+    __slots__ = ["x", "y", "z"]
+
+

Now the instances of B can only have x, y and z as attributes +and don't have a dictionary at all. Instead, the instances of B get +allocated with enough size to hold exactly the number of instance variables that +the class permits. This clearly saves a lot of memory over the dictionary +approach, but has a number of disadvantages. It is obviously less flexible, as +you cannot add additional instance variables to an instance if you happen to +need to do that. It also introduces a set of rules and corner-cases that can +be surprising sometimes (e.g. instances of a subclass of a class with slots that +doesn't have a slots declaration will have a dict).

+
+
+
+

Using Maps for Memory-Efficient Instances

+

As we have seen in the diagram above, the dictionaries of instances of the same +class tend to look very similar and share all the keys. The central idea to use +less memory is to "factor out" the common parts of the instance dictionaries +into a new object, called a "map" (because it is a guide to the landscape of the +object, or something). After that factoring out, the representation of the +instances above looks something like this:

+ +

Every instance now has a reference to its map, which describes what the instance +looks like. The actual instance variables are stored in an array (called +storage in the diagram). In the example here, the map describes that the +instances have three attributes x, y and z. The numbers after the +attributes are indexes into the storage array.

+

If somebody adds a new attribute to one of the instances, the map for that +instance will be changed to another map that also contains the new attribute, +and the storage will have to grow a field with the new attribute. The maps are +immutable, immortal and reused as much as possible. This means that two +instances of the same class with the same set of attributes will have the same +map. This also means that the memory the map itself uses is not too important, +because it will potentially be amortized over many instances.

+

Note that using maps makes instances nearly as small as if the correct slots had +been declared in the class. The only overhead needed is the indirection to the +storage array, because you can get new instance variables, but not new slots.

+

The concept of a "map" that describes instances is kind of old and comes from +the virtual machine for the Self programming language. The optimization was +first described in 1989 in a paper by Chambers, Ungar and Lee with the title An +Efficient Implementation of Self, a Dynamically-Typed Object-Oriented Language +Based on Prototypes. A similar technique is used in Google's V8 JavaScript +engine, where the maps are called hidden classes and in the Rhino +JavaScript engine.

+

The rest of the post describes a number of further details that occur if +instances are implemented using maps.

+
+

Supporting Dictionaries with Maps

+

The default instance representation with maps as shown above works without +actually having a dictionary as part of each instance. If a dictionary is +actually requested, by accessing the .__dict__ attribute, it needs to be +created and cached. The dictionary is not a normal Python dictionary, but a thin +wrapper around the object that forwards all operations to it. From the user's +point of view it behaves like a normal dictionary though (it even has the +correct type).

+

The dictionary needs to be cached, because accessing .__dict__ several times +should always return the same dictionary. The caching happens by using a +different map that knows about the dictionary and putting the dictionary into +the storage array:

+ +

Things become really complex if the fake dict is used in strange ways. As long +as the keys are strings, everything is fine. If somebody adds other keys to the +dict, they cannot be represented by the map any more (which supports only +attributes, i.e. string keys in the __dict__). If that happens, all the +information of the instance will move into the fake dictionary, like this:

+ +

In this picture, the key -1 was added to the instance's dictionary. Since +using the dictionary in arbitrary ways should be rare, we are fine with the +additional time and memory that the approach takes.

+
+
+

Slots and Maps

+

Maps work perfectly together with slots, because the slots can just be stored +into the storage array used by the maps as well (in practice there are some +refinements to that scheme). This means that putting a __slots__ on a +class has mostly no effect, because the instance only stores the values of the +attributes (and not the names), which is equivalent to the way slots are stored +in CPython.

+
+
+
+

Implementation Details

+

In the diagrams above, I represented the maps as flat objects. In practice this +is a bit more complex, because it needs to be efficient to go from one map to +the next when new attributes are added. Thus the maps are organized in a tree.

+

The instances with their maps from above look a bit more like this in practice:

+ +

Every map just describes one attribute of the object, with a name and an +index. Every map also has a back field, that points to another map +describing what the rest of the object looks like. This chain ends with a +terminator, which also stores the class of the object.

+

The maps also contain the information necessary for making a new object of +class A. Immediately after the new object has been created, its map is the +terminator. If the x attribute is added, its map is changed to the +second-lowest map, and so on. The blue arrows show the sequence of maps that +the new object goes through when the attributes x, y, z are added.

+

This representation of maps as chains of objects sounds very inefficient if an +object has many attributes. The whole chain has to be walked to find the index. +This is true to some extent. The problem goes away in the presence of the JIT, +which knows that the chain of maps is an immutable structure, and will thus +optimize away all the chain-walking. If the JIT is not used, there are a few +caches that try to speed up the walking of this chain (similar to the method +cache in CPython and PyPy).

+
+
+

Results

+

It's hard to compare the improvements of this optimization in a fair way, as +the trade-offs are just very different. Just to give an impression, a million +objects of the same class with three fields on a 32bit system takes:

+

without slots:

+
    +
  • 182 MiB memory in CPython
  • +
  • 177 MiB memory in PyPy without maps
  • +
  • 40 MiB memory in PyPy with maps
  • +
+

with slots:

+
    +
  • 45 MiB memory in CPython
  • +
  • 50 MiB memory in PyPy without maps
  • +
  • 40 MiB memory in PyPy with maps
  • +
+

Note how maps make the objects a bit more efficient than CPython using slots. +Also, using slots has no additional effect in PyPy.

+
+
+

Conclusion

+

Maps are a powerful approach to shrinking the memory used by many similar +instances. I think they can be pushed even further (e.g. by adding information +about the types of the attributes) and plan to do so in the following months. +Details will be forthcoming.

+
+
+
+
+
+ + Unknown wrote on 2010-11-13 17:28: +
+
+

Not sure if you are glossing over this, but it seems trivial to avoid the map chain walking by duplicating all of the information in a maps back pointer chain into the map itself. However, the lookup keys are still strings, so your options are some kind of frozen hashtable (which could be nice) or a sorted array.

Both of those still seem much more efficient than chasing pointers.

+
+
+
+
+ + Erez wrote on 2010-11-13 19:08: +
+
+

What about the additional running time overhead?

+
+
+
+
+ + Anonymous wrote on 2010-11-13 20:48: +
+
+

I was surprised not to see any real-world benchmarks (since you collected them earlier). That leaves the impression, that it might be disapointing (knowing that the object/class ratio generally isn't very large).

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-11-13 21:48: +
+
+

@Reid:
I am glossing over the runtime overhead, because the JIT completely removes it, as it knows that the maps are immutable. So you only have a problem if you don't want a JIT, in which case maps indeed make some things a bit slower. Duplicating the information everywhere is possible, but I would like to avoid it (we had a prototype that did it, and it became messy quickly).

@Erez
There is no additional runtime overhead if you have the JIT – in fact, things become faster, because the JIT can turn an attribute access into a array field read out of the storage array at a fixed offset.

@Anonymous
I have not presented any real-world benchmarks, because I actually did not get around to running them. Yes, I collected some and started writing a memory benchmark framework. But I didn't have time for a full analysis yet. I plan to do such an analysis hopefully soon.

Anyway, maps never make anything larger, so it is really just a matter of how many instances there are in practice. This will just depend on the benchmark.

+
+
+
+
+ + Zeev wrote on 2010-11-14 00:13: +
+
+

Does this optimization enable building pypy using pypy without having 16GB of ram?

+
+
+
+
+ + Anonymous wrote on 2010-11-14 01:29: +
+
+

Is there anything intrinsic to PyPy in this, or can this optimisation be used in CPython as well?

+
+
+
+
+ + ot wrote on 2010-11-14 03:16: +
+
+

To remove the chain-walking overhead when the code is not JITted, would it be possible to use a persistent hashtable, like for example the hash trie used in Clojure (see https://blog.higher-order.net/2009/09/08/understanding-clojures-persistenthashmap-deftwice/)? They are quite simple to implement and very fast (almost as fast as a normal hashtable lookup)

+
+
+
+
+ + Allen Short wrote on 2010-11-14 08:34: +
+
+

ot: i'm part way to implementing that for Python.

https://bazaar.launchpad.net/~washort/%2Bjunk/perseus/annotate/head:/perseus/__init__.py

+
+
+
+
+ + ot wrote on 2010-11-14 21:23: +
+
+

@Allen: interesting, I wonder how much code would need to be changed to make it RPython...

+
+
+
+
+ + verte wrote on 2010-11-15 07:45: +
+
+

What are the pypy "with slots" and "without slots" numbers? They are different even though you point out that they have no effect. Is the pypy in question one with sharing dicts?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-15 07:56: +
+
+

@verte mapdicts help pypy objects with or without slots (although much more for the latter). There is no difference in pypy with mapdict between having slots or not having slots.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-11-15 12:58: +
+
+

@Zeev no, the translation results from last week already included this optimization. Maps don't help translation much, because we already added all the necessary slot declarations to save memory on CPython.

@ot Would be possible, yes. Not sure it is worth it, given that the total set of attributes of typical instances is not very large. The data structure looks interesting though.

+
+
+
+
+ + ot wrote on 2010-11-15 13:21: +
+
+

@Carl: yes, it was just hypothetical, "if it is a bottleneck". I think anyway that even with very small instance dictionaries there could be a benefit: most keys would be resolved within the first layer of the trie, so with a single lookup or two at most. But it could be premature optimization.

+
+
+
+
+ + verte wrote on 2010-11-16 00:30: +
+
+

I still don't understand. Was there a difference with __slots__ on pypy before mapdict? Why are the numbers different on pypy without mapdict? Are those the numbers with or without the old sharing dictimpl? If without, what is the memory usage with sharing dicts?

+
+
+
+
+ + barnert wrote on 2014-08-03 23:31: +
+
+

This came up on StackOverflow, but let me answer it here.

While the CPython split-dict implementation in 3.3+ (PEP 412) may be inspired by your design, it's not the same, and it doesn't provide nearly as much savings.

The first difference is that it still has a full dict struct in the instance. For classes without that many attributes (i.e., most of them), the dict struct is almost as big as the hash table, so this means you typically only get half the savings as in PyPy. However, this means the thing you can access by __dict__ isn't created dynamically, it acts exactly like a dict even at the C API level, and it can transparently (again, even at the C API level) convert itself to a combined dict if needed (if the table needs to expand and there's more than one reference to the shared key table). The fact that the difference between 3.2 and 3.3 is completely undetectable to any code that doesn't directly access the hash buckets is a big part of the reason Mark Shannon was able to get everyone to agree to accept it, and as far as I know there's no serious consideration for changing it.

The second difference is that the dict struct's array is kept in the same (sparse) order as the shared key table, rather than being a compact array indexed by the values of the shared key table. This means it's kept at least 1/3rd unloaded, meaning that again you get less space savings than with PyPy. There's less of a good rationale here; there's a small performance cost to the slightly more complicated code needed for indexing PyPy-style, and it would make small combined dicts one word larger (which could affect classes whose instances all have different attributes, or cases where you have a huge number of classes with few instances, or other edge cases). The PEP implies that using the sparse implementation is probably overly conservative, and leaves open the possibility of changing it after 3.3.

+
+
+
+ +

Speeding up PyPy by donations

+ +
+
+

PyPy joins the Software Freedom Conservancy

+ +

Good news. PyPy is now a member of the Software Freedom Conservancy (SFC), +see the SFC blog post. This allows us to manage non-profit monetary aspects of +the project independently from a company or particular persons. So we +can now officially receive donations both from people preferring right or +left sides, see the Donate buttons on our home page and our blog. +And you can use PayPal or Google Checkout. Donations are tax-exempt in the +USA and hopefully soon in Europe as well.

+

What's it going to get used for? For the immediate future we intend to use +the donations for funding travels of core contributors to PyPy sprints +who otherwise can't afford to come. So if you have no time but some +money you can help to encourage coding contributors to care for PyPy. +If we end up with bigger sums we'll see and take suggestions. Money +spending decisions will be done by core PyPy people according to +non-profit guidelines. And we'll post information from time to time +about how much we got and where the money went.

+

If you have any questions regarding the SFC membership or donations +you may send email to sfc at pypy.org which will be observed +by Carl Friedrich Bolz, Jacob Hallen and Holger Krekel - the initial +PyPy SFC representatives on behalf of the PyPy team. Many thanks go +out to Bradley M. Kuhn for helping to implement the PyPy SFC membership.

+

cheers,

+

Holger & Carl Friedrich

+
+
+
+
+
+ + glyph wrote on 2010-11-11 08:13: +
+
+

Congratulations, welcome to the SFC family! It's been great for Twisted. Just donated $25 myself - now go make Twisted faster on PyPy :).

+
+
+
+
+ + holger krekel wrote on 2010-11-11 13:13: +
+
+

Thanks glyph. I realized we should have mentioned Twisted already in the post since you are working through the SFC for some time now. In fact, your being there was a good argument for us to also consider going there, so thanks for that :)

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-11-11 13:14: +
+
+

@glyph cool, thanks! As for making Twisted faster, we already did some of that: https://bit.ly/aGCY6r
No clue how these benchmarks reflect an actual application of course :-)

+
+
+
+ +

A snake which bites its tail: PyPy JITting itself

+ +
+ + + +

We have to admit: even if we have been writing for years about the fantastic +speedups that the PyPy JIT gives, we, the PyPy developers, still don't use it +for our daily routine. Until today :-).

+

Readers brave enough to run translate.py to translate PyPy by themselves +surely know that the process takes quite a long time to complete, about an hour +on super-fast hardware and even more on average computers. Unfortunately, it +happened that translate.py was a bad match for our JIT and thus ran much +slower on PyPy than on CPython.

+

One of the main reasons is that the PyPy translation toolchain makes heavy use +of custom metaclasses, and until a few weeks ago metaclasses disabled some of +the central optimizations which make PyPy so fast. During the recent +Düsseldorf sprint, Armin and Carl Friedrich fixed this problem and +re-enabled all the optimizations even in the presence of metaclasses.

+

So, today we decided that it was time to benchmark again PyPy against itself. +First, we tried to translate PyPy using CPython as usual, with the following +command line (on a machine with an "Intel(R) Xeon(R) CPU W3580 @ 3.33GHz" and +12 GB of RAM, running a 32-bit Ubuntu):

+
+$ python ./translate.py -Ojit targetpypystandalone --no-allworkingmodules
+
+... lots of output, fractals included ...
+
+[Timer] Timings:
+[Timer] annotate                       ---  252.0 s
+[Timer] rtype_lltype                   ---  199.3 s
+[Timer] pyjitpl_lltype                 ---  565.2 s
+[Timer] backendopt_lltype              ---  217.4 s
+[Timer] stackcheckinsertion_lltype     ---   26.8 s
+[Timer] database_c                     ---  234.4 s
+[Timer] source_c                       ---  480.7 s
+[Timer] compile_c                      ---  258.4 s
+[Timer] ===========================================
+[Timer] Total:                         --- 2234.2 s
+
+

Then, we tried the same command line with PyPy (SVN revision 78903, x86-32 JIT +backend, downloaded from the nightly build page):

+
+$ pypy-c-78903 ./translate.py -Ojit targetpypystandalone --no-allworkingmodules
+
+... lots of output, fractals included ...
+
+[Timer] Timings:
+[Timer] annotate                       ---  165.3 s
+[Timer] rtype_lltype                   ---  121.9 s
+[Timer] pyjitpl_lltype                 ---  224.0 s
+[Timer] backendopt_lltype              ---   72.1 s
+[Timer] stackcheckinsertion_lltype     ---    7.0 s
+[Timer] database_c                     ---  104.4 s
+[Timer] source_c                       ---  167.9 s
+[Timer] compile_c                      ---  320.3 s
+[Timer] ===========================================
+[Timer] Total:                         --- 1182.8 s
+
+

Yes, it's not a typo: PyPy is almost two times faster than CPython! +Moreover, we can see that PyPy is faster in each of the individual steps apart +from compile_c, which consists in just a call to make to invoke gcc. +The slowdown comes from the fact that the Makefile also contains a lot of +calls to the trackgcroot.py script, which happens to perform badly on PyPy +but we did not investigate why yet.

+

However, there is also a drawback: on this specific benchmark, PyPy consumes +much more memory than CPython. The reason why the command line above contains +--no-allworkingmodules is that if we include all the modules the +translation crashes when it's complete at 99% because it consumes all the 4GB +of memory which is addressable by a 32-bit process.

+

A partial explanation is that so far the assembler generated by the PyPy JIT +is immortal, and the memory allocated for it is never reclaimed. This is +clearly bad for a program like translate.py which is divided into several +independent steps, and for which most of the code generated in each step could +safely be thrown away when it's completed.

+

If we switch to 64-bit we can address the whole 12 GB of RAM that we have, and +thus translating with all working modules is no longer an issue. This is the +time taken with CPython (note that it does not make sense to compare with the +32-bit CPython translation above, because that one does not include all the +modules):

+
+$ python ./translate.py -Ojit
+
+[Timer] Timings:
+[Timer] annotate                       ---  782.7 s
+[Timer] rtype_lltype                   ---  445.2 s
+[Timer] pyjitpl_lltype                 ---  955.8 s
+[Timer] backendopt_lltype              ---  457.0 s
+[Timer] stackcheckinsertion_lltype     ---   63.0 s
+[Timer] database_c                     ---  505.0 s
+[Timer] source_c                       ---  939.4 s
+[Timer] compile_c                      ---  465.1 s
+[Timer] ===========================================
+[Timer] Total:                         --- 4613.2 s
+
+

And this is for PyPy:

+
+$ pypy-c-78924-64 ./translate.py -Ojit
+
+[Timer] Timings:
+[Timer] annotate                       ---  505.8 s
+[Timer] rtype_lltype                   ---  279.4 s
+[Timer] pyjitpl_lltype                 ---  338.2 s
+[Timer] backendopt_lltype              ---  125.1 s
+[Timer] stackcheckinsertion_lltype     ---   21.7 s
+[Timer] database_c                     ---  187.9 s
+[Timer] source_c                       ---  298.8 s
+[Timer] compile_c                      ---  650.7 s
+[Timer] ===========================================
+[Timer] Total:                         --- 2407.6 s
+
+

The results are comparable with the 32-bit case: PyPy is still almost 2 times +faster than CPython. And it also shows that our 64-bit JIT backend is as good +as the 32-bit one. Again, the drawback is in the consumed memory: CPython +used 2.3 GB while PyPy took 8.3 GB.

+

Overall, the results are impressive: we knew that PyPy can be good at +optimizing small benchmarks and even middle-sized programs, but as far as we +know this is the first example in which it heavily optimizes a huge, real world +application. And, believe us, the PyPy translation toolchain is complex +enough to contain all kinds of dirty tricks and black magic that make Python +lovable and hard to optimize :-).

+
+
+
+
+ + Victor wrote on 2010-11-09 17:50: +
+
+

This is amazing, huge kudos to all PyPy developers!

Do these results include "Håkan's jit-unroll-loops branch" you mentioned in sprint report? When are we going to get a release containing these improvements? And do the nightly builds include them?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-11-09 18:05: +
+
+

@Victor: No, Håkan's branch has not been merged. It still has some problems that we don't quite know how to solve.

The nightly builds include all other improvements though. We plan to do a release at some point soon.

+
+
+
+
+ + Anonymous wrote on 2010-11-09 18:28: +
+
+

This is great!

One question: A while back, after the GSoC project for 64-bit, there was an issue with asmgcc-64 such that the 64-bit GC was slower than it should be.

It appears from the performance described in this post, that that must be resolved now. Is that right?

Thanks,
Gary

+
+
+
+
+ + Leonardo Santagada wrote on 2010-11-09 18:36: +
+
+

There should be a way to not only throw away jit memory but somehow tell pypy to try to not use more than say 3gb of ram so it will not hit swap on 4gb machines.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-09 18:39: +
+
+

@Gary yes, that is correct

+
+
+
+
+ + Anonymous wrote on 2010-11-09 23:00: +
+
+

Wow, cool work!

+
+
+
+
+ + Eric van Riet Paap wrote on 2010-11-09 23:26: +
+
+

Excellent.. congratulations!

+
+
+
+
+ + Unknown wrote on 2010-11-09 23:30: +
+
+

Wow, looks great!

Many thanks for posting the benchmark – and for your relentless work on pypy!

One thing: Could you add tests comparing with programs converted to python3?

+
+
+
+
+ + Antonio Cuni wrote on 2010-11-10 07:21: +
+
+

@ArneBab: I'm not sure what you mean, but consider that at the moment PyPy does not support Python 3, so it does not make sense to compare against it.

+
+
+
+
+ + Michael Foord wrote on 2010-11-10 16:04: +
+
+

PyPy continues to get more and more impressive.

+
+
+
+
+ + Armin Rigo wrote on 2010-11-10 17:02: +
+
+

For reference, at some point (long ago) I tried to use Psyco to speed up translate.py on CPython; but it didn't make any difference -- I'm guessing it's because we have nested scope variables at a few critical points, which Psyco cannot optimize. Now I no longer have a need for that :-)

+
+
+
+
+ + Anonymous wrote on 2010-11-10 23:19: +
+
+

Very cool achievement. I'm curious however to know why compile_c section is slower. I thought it was mostly waiting on external programs to run and so should of been similar time cpython? Congratulations!

+
+
+
+
+ + Antonio Cuni wrote on 2010-11-11 10:28: +
+
+

@Anonymous: you are right when you say that compile_c mostly invokes gcc, but also a python script called trackgcroot.py.

The python script is run with the same interpreter used for translate.py (so pypy in this case), and it happens that it's slower than with cpython.

+
+
+
+
+ + Anonymous wrote on 2010-11-11 13:03: +
+
+

How come the 64 bit timings are so much worse than the 32 bit timings (both CPython and PyPy)?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2010-11-11 13:11: +
+
+

@Anonymous: Because the 64bit version is translating all modules, which simply gives the translator a lot more to do. We cannot do that yet on 32bit due to memory problems.

+
+
+
+
+ + Victor wrote on 2010-11-11 17:26: +
+
+

@cfbolz Well, but you sure can run the 64bit version with the same module list as you did for 32bit... So if running the benchmark again in the same conditions isn't a lot of work, it'd provide yet another interesting data point ;)

+
+
+
+
+ + adimasci wrote on 2010-11-12 07:13: +
+
+

Nice work !

+
+
+
+
+ + Anonymous wrote on 2010-11-12 07:29: +
+
+

In other words: The pypy jit compiler leaks a massive amount of memory. Will you address this issue?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-12 07:58: +
+
+

Technically it's not "leaking". And yes, we're trying to address this issue.

+
+
+
+
+ + Tim Parkin wrote on 2010-11-23 10:10: +
+
+

Yes I think the word you wanted was "uses" instead of "leaks". The latter implies unforeseen problems and errors, the former implies that memory usage hasn't been addressed yet... Just to reiterate - PyPy currently *uses* more memory than CPython.

+
+
+
+
+ + Tim Parkin wrote on 2010-11-23 10:10: +
+
+

Oh and a huge congratulations for this achievement!!!

+
+
+
+ +

Düsseldorf Sprint Report 2010

+ +
+ + +

This year's installment of the yearly PyPy Düsseldorf Sprint is drawing to a +close. As usual, we worked in the seminar room of the programming language +group at the University of Düsseldorf. The sprint was different from previous +ones in that we had fewer people than usual and many actually live in +Düsseldorf all the time.

+

David spent the sprint working on the arm-backend branch, which is adding an +ARM backend to the JIT. With the help of Armin he added support for bridges in +the JIT and generally implemented missing operations, mostly for handling integers so far.

+

Ronny and Anto worked the whole week trying to come up with a scheme for +importing PyPy's SVN history into a mercurial repository without losing too +much information. This is a non-trivial task, because PyPy's history is gnarly. +We are nearly at revision 79000 and when we started using it, Subversion was at +version 0.1. All possible and impossible ways to mangle and mistreat a +Subversion repository have been applied to PyPy's repo, so most of the +importing tools just give up. Ronny and Anto came up with a new plan and new +helper scripts every day, only to then discover another corner case that they +hadn't thought of. Now they might actually have a final plan (but they said +that every day, so who knows?).

+The branch history of PyPy's repository (every box is a branch)

Carl Friedrich and Lukas started working in earnest on memory benchmarks to +understand the memory behaviour of Python code better. They have now +implemented a generic memory benchmark runner and a simple analysis that walks +all objects and collects size information about them. They also added some +benchmarks that were proposed in the comments of the recent call for +benchmarks. As soon as some results from that work are there, we will post +about them.

+

There were also some minor tasks performed during the sprint. Armin implemented +the _bisect module and the dict.popitem method in RPython. Armin and +Carl Friedrich made the new memory-saving mapdict implementation more suitable +to use without the JIT (blog post should come about that too, at some point). +They also made classes with custom metaclasses a lot faster when the JIT is +used.

+

The last three days of the sprint were spent working on Håkan's +jit-unroll-loops branch. The branch is meant to move loop invariants out of +the loop, using techniques very similar to what is described in the recent post +on escape analysis across loop boundaries (see? it will soon stop being +science-fiction). Some of the ideas of this approach also come from LuaJIT +which also uses very aggressive loop invariant code motion in its optimizers. +Moving loop invariants outside of the loop is very useful, because many of the +lookups that Python programs do in loops are loop invariants. An example is if +you call a function in a loop: The global lookup can often be done only once.

+

This branch fundamentally changes some of the core assumptions of the JIT, so +it is a huge amount of work to make it fit with all the other parts and to +adapt all tests. That work is now nearly done, some failing tests remain. The +next steps are to fix them and then do additional tests with the translated +executable and look at the benchmarks.

+
+
+
+
+ + Luis wrote on 2010-11-04 13:58: +
+
+

It's great to see improvements in pypy. At this moment, the only three benchmarks that perform better in cpython than in pypy are spitfire, slow spitfire and twisted_tcp.

What's the reason for the lower performance on these benchmarks? Is it the same reason for the three or there are multiple causes?

Luis

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-04 14:03: +
+
+

Hey.

spitfire and slowspitfire are a 'won't fix' benchmarks (at least in the near future). The spitfire_cstringio is using the same thing, but cStringIO instead of a list of strings.

Twisted_tcp is slightly more complex and has something to do with pushing a lot of data through sockets. In pypy you usually have to copy data before write, because it can potentially be moved in the GC.

Cheers,
fijal

+
+
+
+
+ + Luis wrote on 2010-11-04 19:46: +
+
+

Thanks! I suppose "won't fix" has a meaning in a pypy context. What does it mean?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-04 21:16: +
+
+

won't fix means we won't fix it ;-) To be precise it means we know this program is slow, but also there is a way to write this program to be fast, please use the other way.

+
+
+
+
+ + Luis wrote on 2010-11-04 23:41: +
+
+

So it doesn't make much sense including these benchmarks in speed.pypy.org, don't you think?
Perhaps it should be described somewhere what are the strengths and weaknesses of this implementation, suggesting the right approach for each task. Something like "best practices" or something like that...

+
+
+
+
+ + Maciej Fijalkowski wrote on 2010-11-05 07:36: +
+
+

I think deleting it from the nightly run doesn't make sense. It still measures something and helps us catch regressions.

The document you're proposing is actually a really neat idea. I've already done a couple of presentations on it, so it's only about gathering knowledge ("only").

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-17.html b/blog/index-17.html new file mode 100644 index 000000000..67b1fc482 --- /dev/null +++ b/blog/index-17.html @@ -0,0 +1,2260 @@ + + + + + + +PyPy (old posts, page 17) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

US Trip Report: POPL, Microsoft, IBM

+ +
+

Some notes from my recent trip (from 23rd of January to 17th of February) to the +US, where I presented PyPy at various scientifically oriented places. In +summary, there seems to be quite a bit of interest in PyPy within the research +community, details below.

+
+

PEPM/POPL/STOP

+

From the 24th to the 29th of January I was in Austin, Texas at the POPL +conference, where I gave a talk at one of the workshops, PEPM (Partial +Evaluation and Program Manipulation). The title of our paper is +"Allocation Removal by Partial Evaluation in a Tracing JIT", the abstract is:

+
+The performance of many dynamic language implementations suffers from high +allocation rates and runtime type checks. This makes dynamic languages less +applicable to purely algorithmic problems, despite their growing +popularity. In this paper we present a simple compiler optimization based +on online partial evaluation to remove object allocations and runtime type +checks in the context of a tracing JIT. We evaluate the optimization using +a Python VM and find that it gives good results for all our (real-life) +benchmarks.
+

The talk (slides) seemed to be well-received and there was +a good discussion afterwards. PEPM in general was a very enjoyable workshop +with many interesting talks on partial evaluation (which I am very interested +in) and a great keynote by Olivier Danvy about "A Walk in the Semantic Park".

+

POPL itself was a bit outside of the area I am most knowledgeable in, most of +the talks being on formal topics. Some of the talks that stuck to my mind:

+
    +
  • +"The Design of Kodu: A Tiny Visual Programming Language for Children on the +Xbox 360", the keynote by Matthew MacLaurin from Microsoft Research. I didn't +know about Kodu before, and was very impressed by it.
  • +
+
    +
  • +"Automating String Processing in Spreadsheets using Input-Output Examples" +(paper) by Sumit Gulwani (also from MS Research) describes a plugin to Excel +that can automate many common string processing tasks by giving a couple of +examples, which are then abstracted into a generic string manipulation. Very +cool.
  • +
+
    +
  • +"Dynamic Inference of Static Types for Ruby" (paper) by Michael Furr, +Jong-hoon (David) An, Jeffrey S. Foster and Michael Hicks describes an +approach to type inference that works by observing the actual types seen +during unit-testing. Similar things have been done a few times before, +however, the paper actually gives a correctness result.
  • +
+
    +
  • +"The Essence of Compiling with Traces" (paper) by Shu-Yu Guo and Jens +Palsberg describes a formalization of a simple imperative language and +proves that executing it using trace compilation will do exactly the same +thing as using an interpreter. It also looks at what conditions an +optimization on traces must fulfill to still produce valid results.
  • +
+

After the main conference, I took part in the STOP (Scripts to Programs) +workshop. It had a great keynote "Scripting in a Concurrent World" by John Field +about the Thorn language and a few interesting other talks.

+
+
+

Microsoft Research

+

After POPL I went to Redmond to visit Microsoft Research for a week, +specifically the RiSE group. This is the group that did the SPUR project, +a meta-tracing JIT for C# applied to a JavaScript interpreter in C#. I compared +PyPy to SPUR last year. I am very grateful to Microsoft for inviting me +there.

+

At Microsoft I gave a talk about "PyPy's Approach to Implementing Dynamic +Languages Using a Tracing JIT Compiler", the slides of which can be found +here. The talk was filmed and is online. People seemed to be impressed +with the "product qualities" of PyPy, e.g. the buildbot infrastructure and +speed tracking website.

+

The rest of the time I discussed with various researchers in the RiSE group, +particularly with Nikolai Tillmann. We talked a lot about similarities and +differences between SPUR and PyPy and tried to understand our respective projects +better. SPUR is a really great project and I learned a lot in the discussions, +for example about the optimizations and heuristics their trace compiler uses.

+

Another very cool project done by the RiSE group that I learned more about is +PEX. PEX is a unit test generator for C# that tries to produce unit tests for +so-far untested execution paths within methods. There is an online puzzle +version of it, if you want to get an impression of the technology (including a +very impressive C# IDE in the browser).

+
+
+

IBM

+

For the last part of the trip I stayed in New York City for two weeks, +mostly as a vacation. However, I also visited IBM Watson Research Center for +two days, to which I had been invited by David Edelsohn.

+

The first day I gave the same presentation I had given at Microsoft (with some +improvements to the slides), again it was quite well received. The rest of +the time I spent in (very fruitful) discussions with various people and teams, +among them the Liquid Metal team and the Thorn team.

+

The second day I met with members of the FIORANO group, who are working on +dynamic compilation for dynamic languages and Java. They explored various ways +to speed up Python, both by improving the CPython interpreter as well as with +JIT compilation techniques.

+

Another of their projects is to add a trace compiler to IBM's J9 JVM, about +which the paper "A Trace-based Java JIT Compiler Retrofitted from a +Method-based Compiler" is going to appear at CGO. I discussed tracing JITs with +Peng Wu, one of the authors of that paper. Peng tries to systematically look at +the various heuristics found in the different VMs that use tracing JITs. This +is a very different perspective from the one I usually have, focusing on how to +improve PyPy's specific heuristics. Therefore that discussion helped me think +about the issues more generally.

+

Another goal of the group is to try to find benchmarks that are representative +for typical Python workloads, which is something that has been done very +carefully for Java e.g. when developing the DaCapo benchmark suite. The +benchmarks that the Python community uses have not been selected in such a +careful and measured way, so I think that trying to be more systematic there is +a very worthwhile endeavour.

+
+
+
+
+
+ + holger krekel wrote on 2011-03-05 14:04: +
+
+

Thanks for the interesting overview of your travels and research interactions! I agree that getting better and more systematic benchmarks for Python would be worthwhile.

+
+
+
+
+ + Ivan wrote on 2011-03-07 20:36: +
+
+

I find this project fascinating.

I wonder what's the theoretical limit of this approach for improving the performance of python (or any other language implemented in pypy)?

Do you have any rough estimation on how far you can go? Have you reached a limit or you are just scratching the possibilities?

For example, do you think you can compete with javascript v8 or luajit?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-03-08 13:59: +
+
+

Hi Ivan.

In general I don't think there are limits of approach other than say time and money. Python is a complex language.

Can you come up with an example where PyPy is actually slower than V8 *other* than computer language shootout? Programs on computer language shootout are just not nicely optimized for PyPy.

+
+
+
+
+ + Ivan wrote on 2011-03-08 16:10: +
+
+

Hi Fijall,

I'm afraid I don't know about benchmarks and comparison between these languages, other than the shootout. I guess this is the first reference someone gets when comparing languages, since it's the most popular out there.

But it would be great if there was a resource to compare against other languages. At least, from a marketing point of view, it would be very good for pypy.

May I know why the shootout is not a good parameter?

And, is there any other benchmarks comparing pypy against v8, tracemonkey/jägermonkey, etc..?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-03-08 16:21: +
+
+

Hi Ivan.

Shootout is not good because it contains heavily tuned programs, some of them even massively stretching the benchmark restrictions. They're tailored towards specific implementations, contain specific per-benchmark options etc. Nobody looked at python programs at detail and especially from PyPy perspective. This would need to be done first to compare those fairly, until it's not done, it's comparing naive version to a heavily optimized one and not comparing languages.

From what I measured roughly PyPy comes on par with tracemonkey and about 2x slower V8. But those were very unscientific experiments and I'll deny everything :)

I don't think there is any good cross-language comparison and that's at least partly due to the fact that workloads differ in different languages. Most shootout programs for example are tailored towards C workloads. Optimizing precisely for them (even if you have a good programs) is kind of fun, but it does not represent what we try to achieve, that is speeding up large python programs.

I hope this answers your question.

Cheers,
fijal

+
+
+
+
+ + Anonymous wrote on 2011-03-10 23:15: +
+
+

to me it seems like you have reached the goals of unladen swallow and unladen swallow was a bit of a failure?

if google wants a faster python, why don't they fund you? it would be awesome if the core team could work on it full-time. :)

+
+
+
+ +

PyPy Winter Sprint Report

+ +
+

A few weeks ago I had the great fortune to attend the PyPy winter sprint in Leysin Switzerland. I've wanted to contribute to PyPy for a long time and I thought diving into a sprint might be a good way to get familiar with some of the code. What I wasn't expecting was to be using RPython to implement new methods on built-in Python objects on the first day. The main thing I took away from the sprint was just how easy it is to get involved in developing PyPy (well, some bits of it at least and being surrounded by core developers helps). I wrote up a very short description of how to get started here, but I'll do a longer blog post with examples on my own blog soon(ish).

+The sprint was kicked off by Armin merging the "fast-forward" branch of PyPy onto trunk. "fast-forward" brings PyPy from Python 2.5 compatibility to Python 2.7. Along with this it brought a large number of test failures, as the sterling work done by Benjamin Peterson and Amaury Forgeot d'Arc was not complete. This immediately set the primary sprint goal to reduce the number of test failures.

+We made a great deal of progress on this front, and you can see how close PyPy is now from the buildbots.

+Jacob Hallén and I started working through the list of tests with failures alphabetically. We made short work of test_asyncore and moved onto test_bytes where I was stuck for the rest of the sprint. I spent much of the remaining days working with Laura Creighton on the pypy bytearray implementation to make it more compatible with Python 2.7. This meant adding new methods, changing some of the Python protocol method implementations and even changing the way that bytearray is constructed. All in all great fun and a great introduction to working with RPython.

+A big part of the compatibility with Python 2.7 work was done by Laura and Armin who basically rewrote the math module from scratch. This was needed to incorporate all the improvements made (mostly by Mark Dickinson) in CPython in 2.7. That involved a lot of head-scratching about such subtleties as whether -0.0 should be considered almost equal to 0.0 and other fun problems.


+ + + +
The first meal together, before everyone had arrived
+If you add on top of this the wonderful people, the beautiful scenery, the Swiss cheese fondues, managing to not kill myself with a day's skiing and traditional pypy card games, I can heartily recommend pypy sprints as a close approximation of geek nirvana.

+ + +
View of the mountains from the sprint
+
+Working on 2.7 compatibility wasn't the only work that happened during the sprint. Other activities included:
    +
  • Antonio Cuni worked on the "jittypes" branch. This is a reimplementation of the core of the PyPy ctypes code to make it jittable. The goal is that for common cases the jit should be able to turn ctypes calls from Python into direct C level calls. This work was not completed but very close and is great for the future of integrating C libraries with PyPy. As ctypes is also available in CPython and IronPython, and hopefully will be available in Jython soon, integrating C code with Python through ctypes is the most "implementation portable" technique.
  • +
  • David Schneider continued his work on the JIT backend for ARM. PyPy has been cross-compilable to ARM for a long time, but bringing the JIT to ARM will provide a *fast* PyPy for ARM, which includes platforms like Android. Again David didn't complete this work but did complete the float support.
  • +
  • Håkan Ardo was present for two days and continued his crazy-clever work on JIT optimisations, some of which are described in the Loop invariant code motion blog entry.
  • +
  • Holger Krekel worked on updating the PyPy test suite to the latest version of py.test and also worked with me on the interminable bytearray changes for part of the sprint.
  • +
  • No one was sure what  Maciej Fijałkowski worked on but he seemed to be quite busy.
  • +
+I think that was most of the work done during the actual sprint. There was also a great deal of healthy discussion about the future of PyPy. Expect lots more interesting and exciting developments over the coming year.

+
+
+
+
+ + Anonymous wrote on 2011-02-14 15:00: +
+
+

"There was also a great deal of healthy discussion about the future of PyPy."

World domination?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-02-14 16:19: +
+
+

Very nice report, thanks a lot Michael!

+
+
+
+
+ + Anonymous wrote on 2011-02-15 01:16: +
+
+

> world domination?

why yes of course! the ouroboros is their symbol; PyPy is, evidently, backed by the templars

+
+
+
+
+ + Anonymous wrote on 2011-02-15 16:21: +
+
+

> world domination?

Mongol General: Pypy devs! What is best in life?
Pypy dev: To crush your enemies, see them driven before you, and to hear the lamentation of their women.
Mongol General: That is good! That is good.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-02-15 16:41: +
+
+

@Anonymous: Let's not get too far off-track. Also, I don't really like being ascribed a rather violent quote by (supposedly) Genghis Khan, so stop that please.

+
+
+
+
+ + Anonymous wrote on 2011-02-15 17:44: +
+
+

@Carl, it wasn't Genghis Khan.
It was Conan the Barbarian, impersonated by former California governor.
Not to be taken too seriously... :-)

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-02-15 20:40: +
+
+

@Anonymous: https://www.barbariankeep.com/ctbsecrets.html

+
+
+
+ +

The PyPy San Francisco Bay Area Tour 2011

+ +
+

PyPy is coming to the San Francisco Bay Area in the beginning of March with +a series of talks and a mini sprint.

+ +
    +
  • +

    Wednesday March 2, 4:15 p.m. Armin Rigo gives +a +talk at Stanford. open to the public.

    + +
  • +
  • +

    Thursday March 3, 6:00 p.m. General talk at Yelp, 706 Mission St 9th Floor, + San Francisco CA 94103 open to the public.

    + +
  • +
  • +

    Saturday and Sunday March 5 and 6. + PyPy mini sprint at noisebridge. + 2169 Mission street between 17th and 18th in San Francisco. Open to the public.

    + +
  • +
  • +

    Monday March 7th, 11:30 a.m. Google Tech talk in Mountain View at the + Googleplex. Not open to the public (but the video should be available + later).

    + +
  • +
  • +

    Monday March 7th, 2:30 p.m. Talk at Mozilla in Mountain View. Not + open to the public (but Mozilla developers can videoconference).

    +
  • +
+

From the PyPy project team we will have Armin Rigo, Maciej Fijałkowski +(from 6th March), Laura Creighton and Jacob Hallén and possibly +Christian Tismer attending.

+ +

Most of the talks will focus on (some of) the highlights and the +status of pypy:

+ +
    +
  • most Python benchmarks run much faster than with CPython or Psyco +
  • +
  • the real-world PyPy compiler toolchain itself (200 KLocs) runs twice as fast +
  • +
  • supports x86 32 and 64bit and is in the process of supporting ARM +
  • +
  • full compatibility with CPython (more than Jython/IronPython) +
  • +
  • full (and JIT-ed) ctypes support to call C libraries from Python +
  • +
  • supports Stackless Python (in-progress) +
  • +
  • new "cpyext" layer which integrates existing CPython C extensions +
  • +
  • an experimental super-fast JIT-compilation of calls to C++ libraries +
  • +
+

As is usual for us, there is vastly more material that is available for +us to cover than time, especially when it comes to possible future +directions for PyPy. We want to reserve a certain amount of time at +each talk purely to discuss things that are of interest to audience +members. However, if you already know what you wish we would discuss, +and are attending a talk (or even if you aren't), please let us know. +You can either reply to this blog post, or mail Laura directly at +lac at openend.se .

+ +

Apart from getting more technical and project insight, our travel is +also a good possibility for companies in the SF area to talk to us +regarding contracting. In September 2011 our current "Eurostars" research +project ends and some of us are looking for ways to continue working on +PyPy through consulting, subcontracting or hiring. The two companies, +Open End and merlinux, have successfully done a number of such contracts +and projects in the past. If you want to talk business or get together for +lunch or dinner, let us know! If you would like us to come to your company +and make a presentation, let us know! If you have any ideas about what +we should discuss in a presentation so that you could use it to convince +the powers-that-be at your place of employment that investing time and +money in PyPy would be a good idea, let us know!

+ +

On Tuesday March 8th we will be heading for Atlanta for the Python VM +and Language Summits before attending PyCon. Maciej Fijałkowski and +Alex Gaynor will be giving a talk entitled +Why is +Python slow and how can PyPy help? +Maciej will also be giving the talk +Running +ultra large telescopes in Python which is +partially about his experiences using PyPy in the Square Kilometer Array +project in South Africa. There will be a PyPy Sprint March 14-17. +All are welcome.

+
+
+
+
+ + Dan wrote on 2011-02-13 01:42: +
+
+

I wanted to let everyone know, there is a PSF sponsored code sprint in Portland, Oregon on February 26th starting at 9am. If you're going to be in the area, it promises to be a great time. We've got a great plan for the day which can be see in this google doc. I hope to see some of you there!

--Dan

+
+
+
+
+ + Anonymous wrote on 2011-02-16 00:51: +
+
+

We'll be giving a talk at Dropbox in San Francisco at 16:00 on Friday March 4th.

+
+
+
+
+ + Anonymous wrote on 2011-02-22 05:52: +
+
+

And we'll be dropping by the Google building in San Francisco at 10.45 a.m.
on Tuesday March 1st to chat with
Googlers there and give an informal
talk.

+
+
+
+ +

PyPy faster than C on a carefully crafted example

+ +
+

Good day everyone.

+

The recent round of optimizations, especially loop invariant code motion +has been very good for small to medium examples. There is work ongoing to +make them scale to larger ones, however there are a few examples worth showing +how well they perform. The following example, besides getting benefits +from loop invariants, also shows a difference between static and dynamic +compilation. In fact, after applying all the optimizations C does, only a +JIT can use the extra bit of runtime information to run even faster.

+

The example is as follows. First Python. I create two files, x.py:

+
+def add(a, b):
+  return a + b
+
+

And y.py:

+
+from x import add
+
+def main():
+    i = 0
+    a = 0.0
+    while i < 1000000000:
+        a += 1.0
+        add(a, a)
+        i += 1
+
+main()
+
+

For C, x.c:

+
+double add(double a, double b)
+{
+  return a + b;
+}
+
+

and y.c:

+
+double add(double a, double b);
+
+int main()
+{
+  int i = 0;
+  double a = 0;
+  while (i < 1000000000) {
+    a += 1.0;
+    add(a, a);
+    i++;
+  }
+}
+
+

Results?

+
    +
  • 1.97s - PyPy
  • +
  • 3.07s - C
  • +
+Compilation options: +
    +
  • PyPy trunk (386ed41eae0c), running pypy-c y.py
  • +
  • C - gcc -O3 (GCC 4.4.5 shipped with Ubuntu Maverick)
  • +
+

Hence, PyPy 50% faster than C on this carefully crafted example. The reason +is obvious - static compiler can't inline across file boundaries. In C, +you can somehow circumvent that, however, it wouldn't anyway work +with shared libraries. In Python however, even when the whole import system +is completely dynamic, the JIT can dynamically find out what can be inlined. +That example would work equally well for Java and other decent JITs, it's +however good to see we work in the same space :-)

+

Cheers,
+fijal

+

EDIT: Updated GCC version

+
+
+
+
+ + Anonymous wrote on 2011-02-04 11:43: +
+
+

> The reason is obvious - static compiler can't inline across file boundaries.

That's what link-time optimizations are for, which where added to GCC in 2009; however, your point concerning shared libaries is valid...

+
+
+
+
+ + Zeev wrote on 2011-02-04 11:55: +
+
+

I added a printf("%f\n",a) to the end of the file so the compiler wouldn't optimize the whole thing away. On my Cure 2 Duo 2.33Ghz, I got for gcc -O3:

1000000000.000000

real 0m4.396s
user 0m4.386s
sys 0m0.007s

and for gcc -O3 -flto -fwhole-program:


1000000000.000000

real 0m1.312s
user 0m1.308s
sys 0m0.003s

+
+
+
+
+ + Anonymous wrote on 2011-02-04 11:59: +
+
+

Great work!

Now you just have to identify and remove dead code in your jit. Then you could remove the call to 'add' altogether.

+
+
+
+
+ + Armin Rigo wrote on 2011-02-04 12:23: +
+
+

In this strange example, in our JIT, the call to 'add' is indeed removed because of inlining, and then the addition that occurs in there is removed because of dead code elimination.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-02-04 12:56: +
+
+

@Zeev yes, but C equivalent of Python import is indeed shared libraries, where -fwhole-program no longer works.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-02-04 13:01: +
+
+

@Armin note that even when the result is accumulated (addition is not removed, although the call is still inlined), PyPy is still faster. Not as much though: 2.5s vs 3.0s

+
+
+
+
+ + Anonymous wrote on 2011-02-04 13:23: +
+
+

For completeness's sake, what's the output of `gcc --version` in your example?

+
+
+
+
+ + klauss wrote on 2011-02-04 14:37: +
+
+

Not to mention specialization: python's (and pypy's) add() can add pretty much anything - strings if you will.

The JIT will inline a specialized version particular to the call site, whereas C can only apply generalized optimizations.

+
+
+
+
+ + Greg Milner wrote on 2011-02-05 02:02: +
+
+

Everyone knows Python runs faster than C...

By about 6 weeks.

+
+
+
+
+ + Anonymous wrote on 2011-02-05 13:01: +
+
+

There's another simple case where pypy could (in principle) do very much better than standard C: turn pow(x, i) into sqrt(x*x*x) if i == 3/2, and other reductions. In practice if you don't know what i is at compiletime you often bundle the simplifications into a function (at the cost of some ifs) but a JIT could do a very nice job on this automagically whenever i is fixed, which it usually is.

+
+
+
+
+ + Anonymous wrote on 2011-02-06 14:12: +
+
+

You wrote: "PyPy 50% faster than C on this carefully crafted example".

The truth is: PyPy is 35% faster than the C code (using C as the baseline), because it completes in 65% of the time required by the C version.

The C code takes 50% more time to execute (is slower by 50%, 1.5x slower) than the PyPy code (using PyPy as the baseline).

+
+
+
+
+ + haypo wrote on 2011-02-08 22:58: +
+
+

Test with gcc (Debian 20110126-0ubuntu1) 4.6.0 20110126 (experimental) [trunk revision 169285]: "/usr/lib/gcc-snapshot/bin/gcc [OPTIONS] x.c y.c -o x && time ./x". OPTIONS=-O0: 10.1s; OPTIONS=-O3: 9.1s; OPTIONS=-O3 -flto: 0.002s. Woops, 0.002 second? I checked: the result is correct :-) LTO rocks!

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-02-09 06:43: +
+
+

@haypo print the result so the loop don't get removed as dead code. Besides, the problem is really the fact that's -flto is unfair since python imports more resemble shared libraries than statically-compiled files.

+
+
+
+
+ + Anonymous wrote on 2011-05-05 06:40: +
+
+

In general, if you want to compare the performance of languages, you're actually supposed to try to write the *fastest* implementation in each language. Not just some arbitrary one.

In this example, the program has no output, so both implementations are crap and could be made a lot faster.

Come up with a program that has testable output, and see if someone can't comment with a C program that's faster than your python.

+
+
+
+
+ + Anonymous wrote on 2011-12-01 00:09: +
+
+

RIDICULOUS!

+
+
+
+
+ + Eric wrote on 2012-11-20 15:46: +
+
+

Pypy isn't faster than C, even on this example for multiple reasons:

First it's conceptual: C is almost as optimized as assembly (it's often referred to as a super assembler) so even if Pypy ends-up generating some assembly code, it has first to evaluate the runtime environment to figure out the type of variables and emit assembly code, and all this process is not free... so Pypy can only asymptotically reach the same level as C and assembly.

Second, the test is flawed: I did a slight modification that shouldn't change the results: I've inlined the add() in both python and C. Oh! surprise: Pypy keeps the same time whereas C is 4x faster than before (without inlining).

So to make it fair, we need to use the best capabilities of both languages:
- python: I'm sure the author provided the best python implementation, and the fact that inlining add() doesn't change results kinda proves this)
- C: when you inline the function you get:

[code]

static inline double add_double(double a, double b) {
return a + b;
}

int main()
{
unsigned int i;
double a = 0.0;

for (i = 0; i < N; i++) {
a += 1.0;
add_double(a, a);
}
printf("%f\n", a);
}

[/code]

Results:
C inlined: 1.10s
C: 3.98s
Pypy inlined: 3.30s
Pypy: 3.28s

Conclusion:
- When using the right C code, on the same example C is 3 times faster than Pypy.
- As demonstrated, the statement that Pypy is faster than C is simply biased by a not optimizsed C code.

+
+
+
+
+ + Staff wrote on 2012-11-21 06:07: +
+
+

@Eric This post is not trying to argue that Python is "better" or even faster than C. It is just pointing out that certain classes of optimizations (i.e. whole program optimizations) come naturally to the PyPy JIT.

This is, of course, only one small facet of why a program runs fast. The author admits that it is a contrived example to illustrate the point.

Taking the point to an extreme, one could see a PyPy program run faster than a C program if the C program made many calls to simple shared libraries. For example, if one dynamically links a C stdlib into their program, and uses it heavily, the equivalent python code may conceivably run faster.

+
+
+
+
+ + Eric wrote on 2012-11-21 14:44: +
+
+

Please read the title of this article again: "PyPy faster than C on a carefully crafted example"

Based on a specific example or not it doesn't matter, I'm simply not comfortable with reading strong statement like this that are obvioulsy false to any serious computer scientist and misleading to beginners. It's false because it's the conclusion of a test which is biased.

The root of benchmarking is to get rid of any bias
In this case the obvious bias is that Pypy is optimized and C isn't (as demonstrated above with inline functions).

You can't transpose only what you want in real life and not the other: your argument that in real life the C could use external library hence be slower is valid, but then you have to compare with real life Python scripts which can't be as much optimized by Pypy as this crafted example. So in real life you get a C code that may be slowed down a bit by dynamic linking, and python scripts that are much slower because Pypy isn't ready to match C speed for everything (yet).

If you want to use a crafted Python example, you have to compare it to a crafted C example, so that you can compare apples with apples.

All that is methodology, that said JIT is quite powerful and it's impressive in itself to beat CPython by a large margin.

+
+
+
+
+ + keegano wrote on 2013-02-06 22:53: +
+
+

Eric: Your comments about "real life" are irrelevant - the post is about a specific, contrived example. I don't think anyone would argue that a high-level, garbage-collected language like python could ever beat out C in general - it's simply a demonstration that, in a very specific instance, equivalent code in python and C can run faster in python because of the JIT making optimizations that can't occur at compile time.

+
+
+
+
+ + Eric wrote on 2013-02-06 23:02: +
+
+

You're assuming that python is faster even on this crafted example, but keep in mind that this comparison is biased because the C version isn't optimal.

+
+
+
+
+ + Eric wrote on 2013-02-06 23:02: +
+
+

you're assuming that python is faster even on this crafted example, but keep in mind that this comparison is biased because the C version isn't optimal.

+
+
+
+
+ + Staff wrote on 2013-02-07 01:42: +
+
+

stop feeding this troll

+
+
+
+
+ + Eric wrote on 2013-02-07 10:18: +
+
+

point taken, but do update the article to take into account my remark: both the title and the conclusion of the "demonstration" are false, even on a contrived example as you barely can't find any C code that would be slower than the code generated by your JIT for the simple reason that C is really too close to assembly and that JIT adds an overhead.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-02-07 10:30: +
+
+

Hey Eric.

Your argument is incredibly flawed. You can compile faster version of assembler (or is C the fastest assembler ever?) if you try hard enough. Why not?

+
+
+
+
+ + Eric wrote on 2013-02-07 10:48: +
+
+

Please don't digress, what I say is simple:
The article states that Pypy generates code faster than C on a crafted example.
I demonstrate there is a more optimized C code that the author's one, hence that the whole article is wrong... end of the story.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-02-07 10:52: +
+
+

No, it's a reasonable piece of C. You don't inline your printf code, do you? dynamic linking is a thing that people use.

+
+
+
+
+ + Eric wrote on 2013-02-09 11:38: +
+
+

You're right, people very often use dynamic linking. However the following is not a reasonable piece of Python code:

def add(a, b): return a + b

People rarely use that and more importantly they don't write a loop that calls it 1 billion times.

The point is that the reasoning spans two levels (hence is flawed/biased):
- in Python the author took a crafted piece of Python that is not meaningful in real life because it has the property to do what he wants at the Pypy level
- in C the author uses a very common mechanism that isn't fully optimized (not as much as Python/Ppy is optimized).

I know you will not agree since you're all proud that "Pypy is faster than C" (lol it's nonsense even on a "crafted example") but you have to compare apples with apples.

+
+
+
+
+ + Dvd Fo wrote on 2013-09-20 18:29: +
+
+

@Eric what you don't understand is the point of the article. The actual point is to demonstrate a nice property of PyPy JIT, which is able to generate fast code when it can. Comparing to C in this manner proves that PyPy's generated machine code is relevant with regard to speed.
Of course this example is fragile because it relies on suboptimal C code, but this serves only to prove the point about PyPy.

+
+
+
+
+ + Anonymous wrote on 2013-12-07 06:14: +
+
+

@Eric... Non sense.. Are you a ambassador for C ?

+
+
+
+
+ + Eric wrote on 2013-12-07 08:44: +
+
+

Do argue if you disagree, don't troll.

I think everything have been said already anyway.

+
+
+
+ +

A JIT Backend for ARM Processors

+ +
+
+In the past few months, I have been developing as a part of my master thesis +the ARM backend for the PyPy JIT, in the arm-backend branch. Currently, it is still work in progress: all integer and object operations are working and +the support for floating point is also under development.
+ARM processors are very widely used, being deployed in servers, some netbooks +and mainly mobile devices such as phones and tablets. One of our goals is to be +able to run PyPy on phones, especially on Android. Currently it is not yet possible +to translate and compile PyPy for Android automatically, but there has been +some work on using Android's NDK to compile PyPy's generated C code.
+The JIT Backend targets the application profile of the ARMv7 instruction set +architecture which is found for example in the Cortex-A8 processors used in many Android powered devices and in Apple's A4 processors built into the latest iOS devices. To develop and +test the backend we are using a BeagleBoard-xM which has a 1 GHz ARM +Cortex-A8 and 512 MB of RAM running the ARM port of Ubuntu 10.10.
+Currently on Linux it is possible to translate and cross-compile PyPy's Python +interpreter as well as other interpreters with the ARM JIT backend enabled +using Scratchbox 2 to provide a build environment and the GNU ARM cross +compilation toolchain. So far the backend only supports the Boehm garbage +collector which does not produce the best results combined with the JIT, but we +plan to add support for the other GCs in the future, doing so should increase +the performance of PyPy on ARM.
+While still debugging the last issues with the backend we already can run some +simple benchmarks on Pyrolog, a prolog interpreter written in RPython. +Even using Boehm as the GC the results look very promising. In the benchmarks +we compare Pyrolog to SWI-Prolog, a prolog interpreter written in C, which +is available from the package repositories for Ubuntu's ARM port.
+The benchmarks can be found in the pyrolog-bench repository.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
BenchmarkSWI-Prolog in ms.Pyrolog in ms.Speedup
iterate60.06.010.0
iterate_assert130.06.021.67
iterate_call3310.05.0662.0
iterate_cut60.0359.00.16713
iterate_exception4950.0346.014.306
iterate_failure400.0127.03.1496
iterate_findall740.0No res.
iterate_if140.06.023.333
+The iterate_call benchmark, which constructs a predicate and calls it at +runtime, with a speedup of 662 times over SWI-Prolog is an example where the +JIT can show its strength. The Pyrolog interpreter and the JIT treat +dynamically defined predicates as static ones and can generate optimized code +in both cases, whereas SWI only compiles statically defined rules and has to +fall back to interpretation on dynamic ones.
+For simple benchmarks running on PyPy's Python interpreter we see some speedups +over CPython, but we still need to debug the backend a bit more before we can +show numbers on more complex benchmarks. So, stay tuned.
+
+
+
+
+ + Anonymous wrote on 2011-01-30 10:01: +
+
+

Awesome stuff. I have a panda board and another xm that's usually not doing much if you want to borrow some cycles :-)

When you support floats will you be aiming for hard float? It's the way of the future, I hear...

+
+
+
+
+ + Unknown wrote on 2011-01-30 11:47: +
+
+

I am curious if you had any use for ThumbEE (or Jazelle RCT) to speed up?

+
+
+
+
+ + David Schneider wrote on 2011-01-30 19:05: +
+
+

@mwhudson: thanks it would be great to be able to test on more hardware.

For the float support we still need to investigate a bit, but if possible I would like to target hard floats.

@dbrodie: currently we are targeting the arm state, so not at the moment.

+
+
+
+
+ + Martijn Faassen wrote on 2011-01-31 14:11: +
+
+

One would imagine conserving memory would be an important factor on mobile devices. Even though mobile devices have a growing amount of memory available, it will still be less than desktops for the forseeable future. Memory pressure can create real slowdowns.

A JIT normally takes more memory, but on the other hand PyPy offers features to reduce usage of memory. Could you share some of your thinking on this?

+
+
+
+
+ + Armin Rigo wrote on 2011-02-05 19:51: +
+
+

Martijn: you are describing the situation as well as we (at least I) know it so far: while PyPy has in many cases a lower non-JIT memory usage, the JIT adds some overhead. But it seems to be within ~200MB on "pypy translate.py", which is kind of the extreme example in hugeness. So already on today's high-end boards with 1GB of RAM, it should easily fit. Moreover it can be tweaked, e.g. it's probably better on these systems to increase the threshold at which JITting starts (which also reduces the number of JITted code paths). So I think that the possibility is real.

+
+
+
+
+ + Dan wrote on 2011-04-30 16:40: +
+
+

Showing speedups over repetitive instructions (which caching & JIT are really good at) is irrelevant.

What happens when people use real benchmarks, like constraint-based solvers and non-iterative stuff (maybe take a look at the other benchmarks) ...

Prolog is a declative language, not a sysadmin scripting language.

Also, the SWI implementation adds so many functionalities, it's like making a «Extract chars from an RDBMS vs Text files» benchmark.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-05-02 19:02: +
+
+

@Dan

Why are you so defensive? This benchmark is clearly not about how fast Pyrolog is, but how the ARM JIT backend performs, using trivial Prolog microbenchmarks, with SWI to give a number to compare against.

Pyrolog is a minimal Prolog implementation that is (at least so far) mostly an experiment to see how well PyPy's JIT technology can do on an non-imperative language. This paper contains more interesting benchmarks:

https://portal.acm.org/citation.cfm?id=1836102

+
+
+
+
+ + jamu wrote on 2011-05-16 13:11: +
+
+

Hi,
Is there a way to cross compile on a host machine (but not with scratch box) where I have tool chain and file system for the target?

Any instructions for building with arm back-end?

Cheers

+
+
+
+
+ + David Schneider wrote on 2011-06-08 20:41: +
+
+

@jamu: scratchbox 2 is currently the only option to cross-translate pypy for ARM. You can find some documentation about the cross translation at https://foss.heptapod.net/pypy/pypy/-/tree/branch/arm-backend-2/pypy/doc/arm.rst

+
+
+
+
+ + vak wrote on 2011-09-30 10:12: +
+
+

Sounds very cool, are there any updates?

+
+
+
+ +

PyPy wants you!

+ +
+ + +

If you ever considered contributing to PyPy, but never did so far, this is a +good moment to start! :-)

+

Recently, we merged the fast-forward branch which brings Python 2.7 +compatibility, with the plan of releasing a new version of PyPy as soon as all +tests pass.

+

However, at the moment there are still quite a few failing tests because +of new 2.7 features that have not been implemented yet: many of them are easy +to fix, and doing it represents a good way to get confidence with the code +base, for those who are interested in it. Michael Foord wrote a little howto +explaining the workflow for running lib-python tests.

+

Thus, if you are willing to join us in the effort of having a PyPy compatible +with Python 2.7, probably the most sensible option is to come on the #PyPy IRC +channel on Freenode, so we can coordinate with each other not to fix the same test +twice.

+

Moreover, if you are a student and are considering participating in the next +Google Summer of Code this is a good time to get into pypy. You have the +opportunity to get a good understanding of pypy for when you decide what you +would like to work on over the summer.

+
+
+
+
+ + Oliver Sherouse wrote on 2011-01-21 19:15: +
+
+

Would you mind giving us a hint of what skills programmers would need to be actually useful? I know you don't want to scare anybody off, but PyPy is kind of the ultimate evolution of what you can do with the language, and I get the sense (perhaps wrongly!) that it goes places where desktop-and-web-app guys like me are a bit out of our depth and actually might waste time more than anything else.

I'm asking this here because I'm pretty sure that others are going to be thinking the same thing.

+
+
+
+
+ + nekto0n wrote on 2011-01-21 20:37: +
+
+

Seems a lot of volantiers applied - buildbot.pypy.org renders 502 Proxy Error

+
+
+
+
+ + holger krekel wrote on 2011-01-22 11:35: +
+
+

Nofrak: you ask good questions. I'd say you need to know your way around Python programming in general which you most certainly do if you have done desktop or Web apps in Python.

Secondly, it's important to know a bit about the basic structure of an Python interpreter. Reading some docs, among them Chapter 1 of https://codespeak.net/pypy/trunk/pypy/doc/coding-guide.html#overview-and-motivation should help.

Thirdly, methodology: PyPy is written in a test-driven way, and for the Python interpreter there are several places for tests: one is the (sometimes slightly modified) standard CPython tests in the lib-python/(modified-)2.7.0 directory, another is pypy/objspace/std/test. The implementation of the interpreter mainly is written down in pypy/objspace/std/*.py.

Hope that helps a bit. IRC is a good place to ask for further directions, of course.

+
+
+
+
+ + Anonymous wrote on 2011-01-22 20:31: +
+
+

And then what do we do after fixing a failing test case? For each patch, create a new bug in the bug tracker and attach it?

+
+
+
+
+ + Antonio Cuni wrote on 2011-01-22 22:59: +
+
+

@Anonymous: creating a new issue in the bug tracker is not necessary: you can just come on IRC or write to pypy-dev attaching your patch, or you can e.g. fork the project on bitbucket and send a pull request, or you can send us the mercurial bundle, etc. etc.

There is no really any bureaucracy for this :)

+
+
+
+
+ + Simon JOnes wrote on 2011-01-26 23:12: +
+
+

What is the best IRC channel to go on?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-01-27 05:33: +
+
+

#pypy on freenode

+
+
+
+
+ + Anonymous wrote on 2011-02-16 19:01: +
+
+

What's the story on PyPy for the Python 3 language? Python 3 is over 2 years old, and Python 2.x is looking older every day. I might consider getting involved, but I don't want to feel like I'm spending time contributing to a dead-end branch of the language.

+
+
+
+ +

Loop invariant code motion

+ +
+

Recently, the jit-unroll-loops branch was merged. It implements the +idea described in +Using Escape Analysis Across Loop Boundaries for Specialization. +That post does only talk about virtuals, but the idea turned out +to be more far reaching. After the metainterpreter produces a trace, +several optimizations are applied to the trace before it is turned +into binary code. Removing allocations is only one of them. There are also +for instance +

+
    +
  • Heap optimizations that remove memory accesses by reusing results + previously read from or written to the same location. +
  • +
  • Reusing of the results of pure operations if the same pure + operation is executed twice. +
  • +
  • Removal of redundant guards. +
  • +
  • ... +
  • +
+A lot of these optimizations are in one way or another removing +operations from the trace and/or reusing previous results. All of these +optimizations could benefit from being able to operate across loop +boundaries. Not only in the sense that operations operating on loop +invariants could be moved out of the loop entirely. But also that +results produced at the end of an iteration could be reused at the +beginning of the next even if there are no loop invariants involved. + +

+ +This is achieved by unrolling the trace into two iterations, and +letting the optimizer work on this two-iteration-trace. +The optimizer will now be able to optimize the second iteration more than the +first since it can reuse results from the first iteration. The +optimized version of the first iteration we call the preamble and the +optimized version of the second iteration we call the loop. The +preamble will end with a jump to the loop, while the loop will end +with a jump to itself. This means that the preamble will be executed +once for the first iteration, the loop will be executed for all following +iterations. + +

+

+

+

Sqrt example

+Here is an example of a Python implementation of sqrt using a fairly +simple algorithm + +

+ + + +

+
def sqrt(y, n=10000):
+    x = y / 2
+    while n > 0:
+        n -= 1
+        x = (x + y/x) / 2
+    return x
+
+

+ +If it is called with sqrt(1234.0), +a fairly long trace is produced. From this trace +the optimizer creates +the +following preamble (Loop 1) and loop (Loop 0) + + +

+

+ +

+

+ +Looking at the preamble, it starts by making sure that it is not +currently being profiled, the guard +on i5, and that the function object has not been changed +since the trace was made, the guard on p3. Somewhat +intermixed with that, the +integer variable n is unboxed, by making sure p11 +points to an integer object and reading out the integer value from +that object. +These operations are not needed in the +loop (and have been removed from it) as emitting the same guards again +would be redundant and n becomes a virtual before the +end of the preamble. +

+
+        guard_value(i5, 0, descr=<Guard6>) 
+        guard_nonnull_class(p11, ConstClass(W_IntObject), descr=<Guard7>) 
+        guard_value(p3, ConstPtr(ptr15), descr=<Guard8>) 
+        i16 = getfield_gc_pure(p11, descr=<W_IntObject.inst_intval>)
+
+ +Next comes a test and a guard implementing the while statement +followed by the decrementing of n. These operations appear +both in the preamble and in the loop +
+        i18 = int_gt(i16, 0)
+        guard_true(i18, descr=<Guard9>) 
+        i20 = int_sub(i16, 1)
+
+ +After that the two floating point variables x and y +are unboxed. Again this is only needed in the preamble. Note how the +unboxed value of y, called f23, is passed unchanged +from the preamble to the loop in arguments of the jump +to allow it to be reused. It will not become a virtual +since it is never changed within the loop. +
+        guard_nonnull_class(p12, 17652552, descr=<Guard10>) 
+        guard_nonnull_class(p10, 17652552, descr=<Guard11>) 
+        f23 = getfield_gc_pure(p10, descr=<W_FloatObject.inst_floatval>)
+        f24 = getfield_gc_pure(p12, descr=<W_FloatObject.inst_floatval>)
+
+ +Following that is the actual calculations performed in the loop in +form of floating point operations (since the function was called with +a float argument). These appear in both the loop +and the preamble. +
+        i26 = float_eq(f24, 0.000000)
+        guard_false(i26, descr=<Guard12>) 
+        f27 = float_truediv(f23, f24)
+        f28 = float_add(f24, f27)
+        f30 = float_truediv(f28, 2.000000)
+
+ +Finally there are some tests checking if a signal was received +(such as when the user presses ctrl-C) and thus should execute some +signal handler or if we need to hand over to another thread. This is +implemented with a counter that is decreased once every iteration. It +will go below zero after some specific number of iterations, tunable by +sys.setcheckinterval. The counter is read from and written to +some global location where it also can be made negative by a C-level +signal handler. +
+        i32 = getfield_raw(32479328, descr=<pypysig_long_struct.c_value>)
+        i34 = int_sub(i32, 2)
+        setfield_raw(32479328, i34, descr=<pypysig_long_struct.c_value>)
+        i36 = int_lt(i34, 0)
+        guard_false(i36, descr=<Guard13>) 
+        jump(p0, p1, p2, p4, p10, i20, f30, f23, descr=<Loop0>)
+
+ +

+

+

Bridges

+ +When a guard fails often enough, the meta-interpreter is started again +to produce a new trace starting at the failing guard. The tracing is +continued until a previously compiled loop is entered. This could +either be the same loop that contains the failing guard +or some completely different loop. If it is the same loop, executing +the preamble again may be unnecessary. +It is preferable to end the bridge with a jump directly to +the loop. To achieve this the optimizer tries to produce short + preambles that are inlined at the end of bridges allowing +them to jump directly to the loop. Inlining is better than jumping to +a common preamble because most of the inlined short preamble can +typically be removed again by the optimizer. +Creating such a short +preamble is however not always possible. Bridges jumping to loops for which +no short preamble can be generated have to end with a jump to the +full preamble instead. + +

+ +The short preamble is created by comparing the operations in the +preamble with the operations in the loop. The +operations that are in the preamble but not in the loop +are moved to the short preamble whenever it is safe to move them to +the front of the operations remaining. In other words, the full preamble +is equivalent to the short preamble followed by one iteration of the +loop. + +

+

+ +This much has currently been implemented. To give the full picture +here, there are two more features that +hopefully will be implemented in the near future. +The first is to replace the full preamble, used by the interpreter +when it reaches a compiled loop, with the short preamble. +This is currently not done and is probably not as straight forward as +it might first seem. The problem is where to resume interpreting on a +guard failure. However, implementing that should save some +memory. Not only +because the preamble will become smaller, but mainly because the +guards will appear either in the loop or in the preamble, but not +in both (as they do now). That means there will only be a single bridge and +not potentially two copies once the guards are traced. + +

+

+ +The sqrt example above would with a short preamble result in a trace +like this + +

+

+ +

+If it is executed long enough, the last guard will be traced to form a +bridge. The trace will inherit the virtuals from its parent. This can +be used to optimize away the part of the inlined short preamble +that deals with virtuals. The resulting bridge should look +something like + +
+    [p0, p1, p2, p3, p4, f5, i6]
+    i7 = force_token()
+    setfield_gc(p1, i7, descr=<PyFrame.vable_token>)
+    call_may_force(ConstClass(action_dispatcher), p0, p1, descr=<VoidCallDescr>)
+    guard_not_forced(, descr=<Guard19>) 
+    guard_no_exception(, descr=<Guard20>) 
+
+    guard_nonnull_class(p4, 17674024, descr=<Guard21>) 
+    f52 = getfield_gc_pure(p4, descr=<W_FloatObject.inst_floatval>)
+    jump(p1, p0, p2, p3, p4, i38, f53, f52, descr=<Loop0>)
+
+ +Here the first paragraph comes from the traced bridge and the second +is what remains of the short preamble after optimization. The +box p4 is +not a virtual (it contains a pointer to y which is never +changed), and it is only virtuals +that the bridge inherits from its parents. This is why the last two +operations currently cannot be removed. + + +

+ +Each time the short preamble is inlined, a new copy of each of the +guards in it is generated. Typically the short preamble is inlined in +several places and thus there will be several copies of each of those +guards. +If they fail often enough bridges +from them will be traced (as with all guards). But since there +typically are several copies of each guard the same bridge +will be generated in +several places. To prevent this, mini-bridges from the inlined guards +are produced already during the inlining. These mini-bridges contain +nothing but a jump to the preamble. + +

+

+The mini-bridges need the arguments of the preamble to be able +to jump to it. These arguments contain among other things, boxed +versions of the +variables x and y. Those variables are virtuals in +the loop, and have to be allocated. Currently those allocations +are placed in front of the inlined guard. Moving those allocations into +the mini-bridges is the second feature that +hopefully will be implemented in the near future. + +After this feature is +implemented, the result should look something like +

+

+ + + +

+

+

+

Multiple specialized versions

+ +Floating point operations were generated in the trace above +because sqrt was called with a float argument. If it is +instead called with an int argument, integer operations will be generated. The +somewhat more complex situation is when both ints and floats are +used as arguments. Then the jit needs to generate multiple versions of +the same loop, specialized in different ways. The details, given +below, on how this is achieved are somewhat involved. For the casual +reader it would make perfect sense to skip to the next section here. + +

+ +Consider the case when sqrt is first called with a float +argument (but with n small enough not to generate the +bridge). Then the trace shown above will be +generated. If sqrt is now called with an int argument, the +guard in the preamble testing that the type of the input object is float +will fail: +

+
+        guard_nonnull_class(p12, 17652552, descr=<Guard10>) 
+
+It will fail every iteration, so soon enough a bridge will be +generated from this guard in the preamble. This bridge will end with a +jump to the same loop, and the optimizer will try to inline +the short preamble at the end of it. This will however fail +since now there are two guards on p12. One that makes sure it +is an int and one that makes sure it is a float. The optimizer +will detect that the second guard will always fail and mark the bridge +as invalid. Invalid loops are not passed on to the backend for +compilation. + +

+ +If a loop is detected to be invalid while inlining the short preamble, +the metainterpreter will continue to trace for yet another +iteration of the loop. This new trace can be compiled as above and +will produce a new loop with a new preamble that are now specialized +for int arguments instead of float arguments. The bridge that +previously became invalid will now be tried again. This time inlining +the short preamble of the new loop instead. This will produce a set of +traces connected like this + +

+

+ + +(click for some hairy details) +

+

+ +The height of the boxes in this figure represents how many instructions +they contain (presuming the missing features from the previous section +are implemented). Loop 0 is specialized for floats and its preamble has +been split into two boxes at the failing guard. Loop 2 is specialized +for ints and is larger than Loop 0. This is mainly because the integer +division in python does not map to the integer division of the +machine, but has to be implemented with several instructions (integer +division in python truncates its result towards minus +infinity, while the machine integer division truncates towards +0). Also the height of the bridge is about the same as the height of +Loop 2. This is because it contains a full iteration of the loop. + +

+

+ +

+

A More Advanced Example

+ +Let's conclude with an example that is a bit more advanced, where this unrolling +approach actually outperforms the previous approach. Consider +making a +fixed-point +implementation of the square root using 16 bits of decimals. This can be +done using the same implementation +of sqrt but calling it with an object of a class representing +such fixed-point real numbers: + +

+

+
class Fix16(object):
+    def __init__(self, val, scale=True):
+        if isinstance(val, Fix16):
+            self.val = val.val
+        else:
+            if scale:
+                self.val = int(val * 2**16)
+            else:
+                self.val = val
+
+    def __add__(self, other):
+        return  Fix16(self.val + Fix16(other).val, False)
+
+    def __sub__(self, other):
+        return  Fix16(self.val - Fix16(other).val, False)
+
+    def __mul__(self, other):
+        return  Fix16((self.val >> 8) * (Fix16(other).val >> 8), False)
+
+    def __div__(self, other):
+        return  Fix16((self.val << 16) / Fix16(other).val, False)
+
+ +

+ +Below is a table comparing the runtime of the sqrt function above with +different argument types on different python interpreters. Pypy 1.4.1 +was released before the optimizations described in this post were in place +while they are in place in the +nightly + build from January 5, +denoted pypy in the table. There are also the running times for the same +algorithms implemented in C and compiled with "gcc -O3 +-march=native". Tests were executed on a 2.53GHz Intel Core2 +processor with n=100000000 iterations. +Comparing the integer versions with C may be considered a +bit unfair because of the more advanced integer division operator in +python. The left part of this table shows runtimes of sqrt in +a program containing a single call to sqrt (i.e. only a single +specialized version of the loop is needed). The right part shows the +runtime of sqrt when it has been called with a different +type of argument before. + +

+

+ +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
First callSecond call
floatintFix16  floatintFix16
cpython 28.18 s 22.13 s 779.04 s 28.07 s 22.21 s 767.03 s
pypy 1.4.1 1.20 s 6.49 s 11.31 s 1.20 s 6.54 s 11.23 s
pypy 1.20 s 6.44 s 6.78 s 1.19 s 6.26 s 6.79 s
gcc 1.15 s 1.82 s 1.89 s 1.15 s 1.82 s 1.89 s
+

+ +For this to work in the last case, when Fix16 is the argument type in +the second call, +the trace_limit had to be increased from its default value to prevent +the metainterpreter from aborting while tracing the second version of +the loop. Also sys.setcheckinterval(1000000) was used to prevent the +bridge from being generated. With the bridge the performance of the +last case is significantly worse. Maybe because the optimizer currently +fails to generate a short preamble for it. But the slowdown +seems too big for that to be the only explanation. Below are the runtime +numbers with checkinterval set to its default value of 100: + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
First callSecond call
floatintFix16  floatintFix16
cpython 28.71 s 22.09 s 781.86 s 28.28 s 21.92 s 761.59 s
pypy 1.4.1 1.21 s 6.48 s 11.22 s 1.72 s 7.58 s 12.18 s
pypy 1.21 s 6.27 s 7.22 s 1.20 s 6.29 s 90.47 s
+

+

+

Conclusions

+Even though we are seeing speedups in a variety of different small +benchmarks, more complicated examples are not affected much by these +optimizations. It might partly be because larger examples have longer +and more complicated loops, and thus allowing optimizations to operate +across loop boundary will have a smaller relative effect. Another problem is +that with more complicated examples there will be more bridges, and bridges +are currently not handled very well (most of the time all virtuals are +forced at the end of the bridge as explained above). But moving those +forcings into the mini bridges should fix that. +
+
+
+
+ + Anonymous wrote on 2011-01-13 07:22: +
+
+

Great post.

+
+
+
+
+ + Eric wrote on 2012-11-20 16:17: +
+
+

Do you think you could fix the pictures?
I only see black images with a exclamation marks.

thanks

+
+
+
+
+ + Anonymous wrote on 2013-03-05 01:33: +
+
+

Something has eaten the images. Please fix, if you can.

+
+
+
+ +

PyPy 1.4.1

+ +
+

Here is PyPy 1.4.1 :-)

+ +

Update: Win32 binaries available.

+ +

Enjoy!

+ +

Release announcement

+ +

We're pleased to announce +the 1.4.1 release of PyPy. +This release consolidates all the bug fixes that occurred since the +previous release. To everyone that took the trouble to report +them, we want to say thank you.

+ +

What is PyPy

+ +

PyPy is a very compliant Python interpreter, almost a drop-in +replacement for CPython. Note that it still only emulates Python +2.5 by default; the fast-forward branch with Python 2.7 +support is slowly getting ready but will only be integrated in +the next release.

+ +

In two words, the advantage of trying out PyPy instead of CPython +(the default implementation of Python) is, for now, the +performance. Not all programs are faster in PyPy, but we are +confident that any CPU-intensive task will be much faster, at +least if it runs for long enough (the JIT has a slow warm-up +phase, which can take several seconds or even one minute on the +largest programs).

+ +

Note again that we do support compiling and using C extension +modules from CPython (pypy setup.py install). However, this +is still an alpha feature, and the most complex modules typically +fail for various reasons; others work (e.g. PIL) but take a +serious performance hit. Also, for Mac OS X see below.

+ +

Please note also that PyPy's performance was optimized almost +exclusively on Linux. It seems from some reports that on Windows +as well as Mac OS X (probably for different reasons) the +performance might be lower. We did not investigate much so far.

+ +

More highlights

+ +
    +
  • We migrated to Mercurial (thanks to Ronny Pfannschmidt and + Antonio Cuni for the effort) and moved to bitbucket. The new + command to check out a copy of PyPy is: +
    hg clone https://bitbucket.org/pypy/pypy + +

    +
  • +
  • In long-running processes, the assembler generated by old + JIT-compilations is now freed. There should be no more leak, + however long the process runs. + +

    +
  • +
  • Improve a lot the performance of the binascii module, and + of hashlib.md5 and hashlib.sha. + +

    +
  • +
  • Made sys.setrecursionlimit() a no-op. Instead, we rely purely + on the built-in stack overflow detection mechanism, which also + gives you a RuntimeError -- just not at some exact recursion + level. + +

    +
  • +
  • Fix argument processing (now e.g. pypy -OScpass works like + it does on CPython --- if you have a clue what it does there + :-) ) + +

    +
  • +
  • cpyext on Mac OS X: it still does not seem to work. I get + systematically a segfault in dlopen(). Contributions welcome. + +

    +
  • +
  • Fix two corner cases in the GC (one in minimark, one in + asmgcc+JIT). This notably prevented pypy translate.py -Ojit + from working on Windows, leading to crashes. + +

    +
  • +
  • Fixed a corner case in the JIT's optimizer, leading to Fatal + RPython error: AssertionError. + +

    +
  • +
  • Added some missing built-in functions into the 'os' module. + +

    +
  • +
  • Fix ctypes (it was not propagating keepalive information from + c_void_p). + +
  • +
+
+
+
+
+ + Symbol wrote on 2010-12-22 12:00: +
+
+

Wow, and I thought 1.4.1 would come out after the january sprint!

A christmas present :->

What would be the focus of the january sprint then?

+
+
+
+
+ + Armin Rigo wrote on 2010-12-22 12:09: +
+
+

There are still a number of branches that have not been merged into trunk yet: at least fast-forward (Python 2.7), jit-unroll-loops (better JITting of arithmetic and short loops), arm-backend (JIT support on ARM) and jitypes2 (turn ctypes calls into real assembler-level calls with the JIT). There is also the stackless+JIT integration pending. Finally the sprint will also be a place to try out and run some applications. So it's not like we are out of work :-)

+
+
+
+
+ + Unknown wrote on 2010-12-22 13:10: +
+
+

I'm interested in the performance improvement in hashlib.sha. I haven't seen that one before on https://speed.pypy.org . Could you give me more details?

Regards,

Zooko

+
+
+
+
+ + Armin Rigo wrote on 2010-12-22 13:59: +
+
+

Actually, hashlib.sha was not the same as sha.sha: the former used to be a ctypes call to the OpenSSL lib, whereas the latter uses our built-in sha implementation. So hashlib.sha was faster in theory, but killed by the overhead of using ctypes. Now, at least in a default version of pypy, the hashlib.md5 and .sha are redirected to the built-in md5.md5 and sha.sha.

Another issue was that with the built-in md5.md5 and sha.sha, on 64-bit, there was a 1.5x speed impact due to the C compiler not recognizing an expression that was meant to be a 32-bit integer rotation.

I guess that https://speed.pypy.org don't show this because they use directly md5.md5 or sha.sha, and are on 32-bit.

+
+
+
+
+ + Martijn Faassen wrote on 2010-12-22 14:14: +
+
+

Thanks for PyPy 1.4.1. I reported two issues concerning buildout with PyPy 1.4, and they all got fixed!

So PyPy 1.4.1 is now compatible with buildout, which is really convenient as it makes it easy for me to test other projects.

+
+
+
+
+ + shadinger wrote on 2010-12-28 16:00: +
+
+

I compiled 1.4.1 on Win32 using Visual C++ 2010.

Do you want to add it to the download page?

To whom shall I send it?

Happy new year.

+
+
+
+
+ + Andrei wrote on 2011-01-28 20:08: +
+
+

Hello,

sorry, I'm a bit new here - is it possible that PyPy makes Python run in a browser? Somehow "translating" all the Python into Javascript?

I'm wondering because I saw you run, for example, CLI, so perhaps PyPy may somehow enable Python in a browser?

+
+
+
+
+ + Armin Rigo wrote on 2011-01-29 10:23: +
+
+

Andrei: not directly. We played at some point with translating RPython code to Javascript, but it didn't give enough benefits (because it's not full Python that we can translate, just "RPython"). The alternative would be to translate the whole PyPy interpreter to Javascript, but that would give a result that is both huge (in term of download size) and horribly slow (100x slower than Javascript maybe).

+
+
+
+ +

PyPy migrates to Mercurial

+ +
+

The assiduous readers of this blog surely remember that during the last +Düsseldorf sprint in October, we started the process for migrating our main +development repository from Subversion to Mercurial. Today, after more than +two months, the process has finally been completed :-).

+

The new official PyPy repository is hosted on BitBucket.

+

The migration has been painful because the SVN history of PyPy was a mess and +none of the existing conversion tools could handle it correctly. This was +partly because PyPy started when subversion was still at version 0.9 when some +best-practices were still to be established, and partly because we probably +managed to invent all the possible ways to do branches (and even some of the +impossible ones: there is at least one commit which you cannot do with the +plain SVN client but you have to speak to the server by yourself :-)).

+

The actual conversion was possible thanks to the enormous work done by Ronny +Pfannschmidt and his hackbeil tool. I would like to personally thank Ronny +for his patience to handle all the various requests we asked for.

+

We hope that PyPy development becomes even more approachable now, at least from +a version control point of view.

+
+
+
+
+ + Anonymous wrote on 2010-12-14 20:19: +
+
+

Awesome! Besides simplifying life for potential new contributors, it's very nice to be able to follow progress using the shortlog on bitbucket.org.

+
+
+
+
+ + Vladimir wrote on 2010-12-14 21:08: +
+
+

Over 9000 branches :/

+
+
+
+
+ + Antonio Cuni wrote on 2010-12-14 22:34: +
+
+

@Владимир: 9000? I count 459 on my local repo, which is still a lot, but not so much :-)
Anyway, most of them are closed, it's just that bitbucket displays also those. And I think that the huge number of branches is another evidence of the "we are not heroes" thing :-)
https://morepypy.blogspot.com/2010/12/we-are-not-heroes-just-very-patient.html

+
+
+
+
+ + Michael Foord wrote on 2010-12-15 01:38: +
+
+

Hey, you guys are *my* heroes. :-)

+
+
+
+
+ + Leonardo Santagada wrote on 2010-12-15 13:03: +
+
+

"PyPy is faster than CPython, again" should be the title. Faster at migrating to mercurial

:)

Great work, now pypy could be even more self hosting if it would run hg on it, when it becomes faster than cpython and stable to do so.

+
+
+
+
+ + Bernhard Leiner wrote on 2010-12-15 20:28: +
+
+

PyPy running Mercurial is actually not to far away...

https://markmail.org/message/wjik2ecanvmt463y#query:+page:1+mid:qbdxn3566j2y7piu+state:results

+
+
+
+ +

Oh, and btw: PyPy gets funding through "Eurostars"

+ +
+

There is a supporting reason why we made so many advances in the last year: +funding through Eurostars, a European research funding program. +The title of our proposal (accepted in 2009) is: "PYJIT - a fast +and flexible toolkit for dynamic programming languages based on PyPy". +And the participants are Open End AB, the Heinrich-Heine-Universität +Düsseldorf (HHU), and merlinux GmbH.

+

It's not hard to guess what PYJIT is actually about, is it? +Quoting: "The PYJIT project will deliver a fast and flexible +Just-In-Time Compiler toolkit based on PyPy to the market of dynamic +languages. Our main aim is to showcase our project's results for the +Open Source language Python, providing unprecedented levels of +flexibility and with speed hitherto only available using statically +typed languages." (Details in German or in Swedish :-)

+

A subgoal is to improve our development and testing infrastructure, +mainly showcased by Holger's recent py.test releases, the testing tool +used by PyPy for its 16K tests and the speed.pypy.org infrastructure +(web app programmed by Miquel Torres on his own time).

+

The overall scope of this project is smaller than that of the previous EU project +from 2004 to 2007. The persons that are (or were) getting money to work +on PyPy are Samuele Pedroni (at Open End), Maciej Fijalkowski (as a +subcontractor), Carl Friedrich Bolz, Armin Rigo, Antonio Cuni (all at +HHU), and Holger Krekel (at merlinux) as well as Ronny Pfannschmidt (as +a subcontractor).

+

The Eurostars funding lasts until August 2011. What comes afterwards? +Well, for one, many of the currently funded people have done work without +getting funding in previous years. This will probably continue. +We also have non-funded people in the core group right now and we'll +hope to enlarge it further. But of course there are still large tasks +ahead which may greatly benefit from funding. We have setup a +donation infrastructure and maybe we can win one or more larger +organisations to provide higher or regular sums of money to fund future +development work. Another possibility for companies is to pay +PyPy developers to help and improve PyPy for their particular use cases.

+

And finally, your help, donations and suggestions are always +welcome and overall we hope to convince more and more people it's +worthwhile to invest into PyPy's future.

+
+
+
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-18.html b/blog/index-18.html new file mode 100644 index 000000000..baba44c69 --- /dev/null +++ b/blog/index-18.html @@ -0,0 +1,2884 @@ + + + + + + +PyPy (old posts, page 18) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Using Tkinter and IDLE with PyPy

+ +
+

We are pleased to announce that Tkinter, the GUI library based on TCL/TK, now +works with PyPy.
+Tkinter is composed of two parts:

+
+
    +
  • +_tkinter, a module written in C which interfaces with the TCL world
  • +
  • +Tkinter, a pure Python package which wraps _tkinter to expose the +pythonic API we are used to
  • +
+
+
+
+
+ +
+The PyPy version of _tkinter reuses the C code as found in CPython and +compiles it through the PyPy C-API compatibility layer, cpyext. To make it +work with PyPy, we had to modify it slightly, in order to remove the +dependency on some API functions which are not supported by PyPy. In particular, we +removed the dependency on the PyOS_InputHook variable, which allows a nice +integration of Tkinter and the Python interactive prompt: the result is that, +unlike CPython, in PyPy Tk windows created at the interactive prompt are not +shown until we manually call the mainloop method. Apart from this +inconvenience, all the rest works fine.
+At the moment, _tkinter is not distributed with PyPy because our build +system does not support automatic compilation of C extensions. Instead, it is +necessary to install it manually, either directly from source or by +easy_installing/pip installing tkinter-pypy from PyPI.
+For everything to work correctly, you need a recent build of PyPy: the +following is a step-by-step guide to install _tkinter in a PyPy nightly +build for Linux 64 bit; for other architectures, look at the nightly build +page:
$ wget https://buildbot.pypy.org/nightly/trunk/pypy-c-jit-43485-1615dfd7d8f1-linux64.tar.bz2
+
+$ tar xfv pypy-c-jit-43485-1615dfd7d8f1-linux64.tar.bz2
+
+$ cd pypy-c-jit-43485-1615dfd7d8f1-linux64/
+
+$ wget https://peak.telecommunity.com/dist/ez_setup.py
+
+$ ./bin/pypy ez_setup.py    # install setuptools
+
+$ ./bin/easy_install tkinter-pypy
+
+Once you complete the steps above, you can start using Tkinter from your +python programs. In particular, you can use IDLE, the IDE which is part of +the Python standard library. To start IDLE, type:
$ ./bin/pypy -m idlelib.idle
+
+Have fun :-) +
+
+
+
+ + Unknown wrote on 2011-04-20 15:09: +
+
+

It is sooo ancient. I'd think twice before bundling anything potentially exploitable (read - compiled C modules) with PyPy.

+
+
+
+
+ + RonnyPfannschmidt wrote on 2011-04-20 22:59: +
+
+

i fail to see how this is more exploitable than say ctypes (which is already shipped)

+
+
+
+
+ + Brandon Corfman wrote on 2011-04-22 17:01: +
+
+

I'm really REALLY happy about this ... Tkinter, multiprocessing, and 2.7 support were my remaining roadblocks to using PyPy. I'm d/l now to give it a try with Raven Checkers. I hope that I won't need to look back.

+
+
+
+
+ + Joaquin Abian wrote on 2011-05-13 20:41: +
+
+

I tried to install tkinter on win 7. When I do pypy ez_setup.py I get a traceback that finish with:

File "ez_setup.py", line 212, in main
from setuptools.command.easy_install import main
ZipImportError: 'setuptools.command.install'

Some hint on how to solve it?

+
+
+
+
+ + Antonio Cuni wrote on 2011-05-18 15:13: +
+
+

@Joaquin:
indeed, ez_setup seems not to work on windows. It might be related to this, although I did not investigate further:
https://bugs.pypy.org/issue725

Instead of ez_setup, you can try to follow these instructions and install distribute/pip, which we recommend anyway nowadays:
https://doc.pypy.org/en/latest/getting-started.html#installing-pypy

Note however that tkinter-pypy is not precompiled for windows, so you need to have the necessary developer tools installed. If you manage to build a precompiled binary of tkinter-pypy, I'd be happy to put it in pypi :-)

+
+
+
+
+ + Anonymous wrote on 2011-11-24 16:52: +
+
+

Seems that tcl8.4-dev and tk8.4-dev needs to be installed!
This should be insert into the "install instruction" ;)

+
+
+
+
+ + Daniel Petti wrote on 2012-05-29 19:01: +
+
+

What does "command 'cc' failed with error 1" mean? I keep getting that upon installing tkinter-pypy

+
+
+
+
+ + Anonymous wrote on 2012-10-22 17:27: +
+
+

I'm unable to compile it on Windows (MinGW and also tried with VS 2010). Getting the following error:

fatal error: tcl.h: No such file or directory

My TCL installed under a different directory. How can I point the compiler to use tcl.h file from that directory?

+
+
+
+
+ + Rich Wandell wrote on 2013-05-03 14:47: +
+
+

I am having an incredible amount of problems attempting to build tkinter for pypy on windows. Is there anywhere I can download a pre built version?

+
+
+
+
+ + Anonymous wrote on 2013-10-28 18:14: +
+
+

This is outdated. But how to use Tkinter currently under windows?

+
+
+
+
+ + Unknown wrote on 2014-02-02 11:18: +
+
+

I think I've managed to compile Tkinter for Windows. Could anyone interested please try it out? Just download this archive and extract it into your Pypy folder:
https://dl-web.dropbox.com/get/Public/Tkinter%20for%20Windows.zip?_subject_uid=29914669&w=AACPaRHDWsfcxafgdXsHV405wJNIsKrYzRXZMHwIKPuiNA&dl=1

+
+
+
+
+ + Luis wrote on 2014-05-11 22:35: +
+
+

XJDHDR: The link is not working. Do you still have the file available to download?

+
+
+
+
+ + Unknown wrote on 2014-05-12 17:27: +
+
+

@Luis
The file is still available. Try this link:
https://dl.dropboxusercontent.com/u/29914669/Tkinter%20for%20Windows.zip

Dropbox must have changed something on their end.

+
+
+
+ +

Tutorial Part 2: Adding a JIT

+ +
+

This is the second part of a tutorial written by Andrew Brown. The first +part described how to write an interpreter with PyPy.

+
+

Adding JIT

+

Translating RPython to C is pretty cool, but one of the best features of PyPy +is its ability to generate just-in-time compilers for your interpreter. +That's right, from just a couple hints on how your interpreter is structured, +PyPy will generate and include a JIT compiler that will, at runtime, translate +the interpreted code of our BF language to machine code!

+

So what do we need to tell PyPy to make this happen? First it needs to know +where the start of your bytecode evaluation loop is. This lets it keep track of +instructions being executed in the target language (BF).

+

We also need to let it know what defines a particular execution frame. Since +our language doesn't really have stack frames, this boils down to what's +constant for the execution of a particular instruction, and what's not. These +are called "green" and "red" variables, respectively.

+

Refer back to example2.py for the following.

+

In our main loop, there are four variables used: pc, program, bracket_map, and +tape. Of those, pc, program, and bracket_map are all green variables. They +define the execution of a particular instruction. If the JIT routines see the +same combination of green variables as before, they know execution has jumped back and +must be executing a loop. The variable "tape" is our red variable: it's what's +being manipulated by the execution.

+

So let's tell PyPy this info. Start by importing the JitDriver class and making +an instance:

+
from pypy.rlib.jit import JitDriver
+jitdriver = JitDriver(greens=['pc', 'program', 'bracket_map'],
+        reds=['tape'])
+
+

And we add this line to the very top of the while loop in the mainloop +function:

+
jitdriver.jit_merge_point(pc=pc, tape=tape, program=program,
+        bracket_map=bracket_map)
+
+

We also need to define a JitPolicy. We're not doing anything fancy, so this is +all we need somewhere in the file:

+
def jitpolicy(driver):
+    from pypy.jit.codewriter.policy import JitPolicy
+    return JitPolicy()
+
+

See this example at example3.py

+

Now try translating again, but with the flag --opt=jit:

+
+$ python ./pypy/pypy/translator/goal/translate.py --opt=jit example3.py
+
+

It will take significantly longer to translate with JIT enabled, almost 8 +minutes on my machine, and the resulting binary will be much larger. When it's +done, try having it run the mandelbrot program again. A world of difference: +it now takes 12 seconds, compared to 45 seconds before!

+

Interestingly enough, you can see when the JIT compiler switches from +interpreted to machine code with the mandelbrot example. The first few lines of +output come out pretty fast, and then the program gets a boost of speed and +gets even faster.

+
+
+

A bit about Tracing JIT Compilers

+

It's worth it at this point to read up on how tracing JIT compilers work. +Here's a brief explanation: The interpreter is usually running your interpreter +code as written. When it detects a loop of code in the target language (BF) is +executed often, that loop is considered "hot" and marked to be traced. The next +time that loop is entered, the interpreter gets put in tracing mode where every +executed instruction is logged.

+

When the loop is finished, tracing stops. The trace of the loop is sent to an +optimizer, and then to an assembler which outputs machine code. That machine +code is then used for subsequent loop iterations.

+

This machine code is often optimized for the most common case, and depends on +several assumptions about the code. Therefore, the machine code will contain +guards, to validate those assumptions. If a guard check fails, the runtime +falls back to regular interpreted mode.

+

A good place to start for more information is +https://en.wikipedia.org/wiki/Just-in-time_compilation

+
+
+

Debugging and Trace Logs

+

Can we do any better? How can we see what the JIT is doing? Let's do two +things.

+

First, let's add a function (called get_location below) and pass it to the JitDriver as +get_printable_location; it is used during debug trace logging:

+
def get_location(pc, program, bracket_map):
+    return "%s_%s_%s" % (
+            program[:pc], program[pc], program[pc+1:]
+            )
+jitdriver = JitDriver(greens=['pc', 'program', 'bracket_map'], reds=['tape'],
+        get_printable_location=get_location)
+
+

This function is passed in the green variables, and should return a string. +Here, we're printing out the BF code, surrounding the currently executing +instruction with underscores so we can see where it is.

+

Download this as example4.py and translate it the same as example3.py.

+

Now let's run a test program (test.b, which just prints the letter "A" 15 or so +times in a loop) with trace logging:

+
+$ PYPYLOG=jit-log-opt:logfile ./example4-c test.b
+
+

Now take a look at the file "logfile". This file is quite hard to read, so +here's my best shot at explaining it.

+

The file contains a log of every trace that was performed, and is essentially a +glimpse at what instructions it's compiling to machine code for you. It's +useful to see if there are unnecessary instructions or room for optimization.

+

Each trace starts with a line that looks like this:

+
+[3c091099e7a4a7] {jit-log-opt-loop
+
+

and ends with a line like this:

+
+[3c091099eae17d] jit-log-opt-loop}
+
+

The line right after the opening marker tells you which loop number it is, and how many ops are in it. +In my case, the first trace looks like this:

+ + + +
 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+
  [3c167c92b9118f] {jit-log-opt-loop
+  # Loop 0 : loop with 26 ops
+  [p0, p1, i2, i3]
+  debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)
+  debug_merge_point('+<[>[>_+_<-]>.[<+>-]<<-]++++++++++.', 0)
+  i4 = getarrayitem_gc(p1, i2, descr=<SignedArrayDescr>)
+  i6 = int_add(i4, 1)
+  setarrayitem_gc(p1, i2, i6, descr=<SignedArrayDescr>)
+  debug_merge_point('+<[>[>+_<_-]>.[<+>-]<<-]++++++++++.', 0)
+  debug_merge_point('+<[>[>+<_-_]>.[<+>-]<<-]++++++++++.', 0)
+  i7 = getarrayitem_gc(p1, i3, descr=<SignedArrayDescr>)
+  i9 = int_sub(i7, 1)
+  setarrayitem_gc(p1, i3, i9, descr=<SignedArrayDescr>)
+  debug_merge_point('+<[>[>+<-_]_>.[<+>-]<<-]++++++++++.', 0)
+  i10 = int_is_true(i9)
+  guard_true(i10, descr=<Guard2>) [p0]
+  i14 = call(ConstClass(ll_dict_lookup__dicttablePtr_Signed_Signed), ConstPtr(ptr12), 90, 90, descr=<SignedCallDescr>)
+  guard_no_exception(, descr=<Guard3>) [i14, p0]
+  i16 = int_and(i14, -9223372036854775808)
+  i17 = int_is_true(i16)
+  guard_false(i17, descr=<Guard4>) [i14, p0]
+  i19 = call(ConstClass(ll_get_value__dicttablePtr_Signed), ConstPtr(ptr12), i14, descr=<SignedCallDescr>)
+  guard_no_exception(, descr=<Guard5>) [i19, p0]
+  i21 = int_add(i19, 1)
+  i23 = int_lt(i21, 114)
+  guard_true(i23, descr=<Guard6>) [i21, p0]
+  guard_value(i21, 86, descr=<Guard7>) [i21, p0]
+  debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)
+  jump(p0, p1, i2, i3, descr=<Loop0>)
+  [3c167c92bc6a15] jit-log-opt-loop}
+
+
+

I've trimmed the debug_merge_point lines a bit, they were really long.

+

So let's see what this does. This trace takes 4 parameters: 2 object pointers +(p0 and p1) and 2 integers (i2 and i3). Looking at the debug lines, it seems to +be tracing one iteration of this loop: "[>+<-]"

+

It starts executing the first operation on line 4, a ">", but immediately +starts executing the next operation. The ">" had no instructions, and looks +like it was optimized out completely. This loop must always act on the same +part of the tape, the tape pointer is constant for this trace. An explicit +advance operation is unnecessary.

+

Lines 5 to 8 are the instructions for the "+" operation. First it gets the +array item from the array in pointer p1 at index i2 (line 6), adds 1 to it and +stores it in i6 (line 7), and stores it back in the array (line 8).

+

Line 9 starts the "<" instruction, but it is another no-op. It seems that i2 +and i3, passed into this routine, are the two tape pointers used in this loop, +already calculated. We can also deduce that p1 is the tape array. It's not clear +what p0 is.

+

Lines 10 through 13 perform the "-" operation: get the array value (line 11), +subtract (line 12) and set the array value (line 13).

+

Next, on line 14, we come to the "]" operation. Lines 15 and 16 check whether +i9 is true (non-zero). Looking up, i9 is the array value that we just +decremented and stored, now being checked as the loop condition, as expected +(remember the definition of "]"). Line 16 is a guard: if the condition is not +met, execution jumps somewhere else, in this case to the routine called +<Guard2>, which is passed one parameter: p0.

+

Assuming we pass the guard, lines 17 through 23 are doing the dictionary lookup +to bracket_map to find where the program counter should jump to. I'm not too +familiar with what the instructions are actually doing, but it looks like there +are two external calls and 3 guards. This seems quite expensive, especially +since we know bracket_map will never change (PyPy doesn't know that). We'll +see below how to optimize this.

+

Line 24 increments the newly acquired instruction pointer. Lines 25 and 26 make +sure it's less than the program's length.

+

Additionally, line 27 guards that i21, the incremented instruction pointer, is +exactly 86. This is because it's about to jump to the beginning (line 29) and +the instruction pointer being 86 is a precondition to this block.

+

Finally, the loop closes up at line 28 so the JIT can jump to loop body <Loop0> +to handle that case (line 29), which is the beginning of the loop again. It +passes in parameters (p0, p1, i2, i3).

+
+
+

Optimizing

+

As mentioned, every loop iteration does a dictionary lookup to find the +corresponding matching bracket for the final jump. This is terribly +inefficient: the jump target is not going to change from one loop iteration to the next. +This information is constant and should be compiled in as such.

+

The problem is that the lookups are coming from a dictionary, and PyPy is +treating it as opaque. It doesn't know the dictionary isn't being modified or +isn't going to return something different on each query.

+

What we need to do is provide another hint to the translation to say that the +dictionary query is a pure function, that is, its output depends only on its +inputs and the same inputs should always return the same output.

+

To do this, we use a provided function decorator pypy.rlib.jit.purefunction, +and wrap the dictionary call in a decorated function:

+
@purefunction
+def get_matching_bracket(bracket_map, pc):
+    return bracket_map[pc]
+
+

This version can be found at example5.py

+

Translate again with the JIT option and observe the speedup. Mandelbrot now +only takes 6 seconds! (from 12 seconds before this optimization)

+

Let's take a look at the trace from the same function:

+
[3c29fad7b792b0] {jit-log-opt-loop
+# Loop 0 : loop with 15 ops
+[p0, p1, i2, i3]
+debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)
+debug_merge_point('+<[>[>_+_<-]>.[<+>-]<<-]++++++++++.', 0)
+i4 = getarrayitem_gc(p1, i2, descr=<SignedArrayDescr>)
+i6 = int_add(i4, 1)
+setarrayitem_gc(p1, i2, i6, descr=<SignedArrayDescr>)
+debug_merge_point('+<[>[>+_<_-]>.[<+>-]<<-]++++++++++.', 0)
+debug_merge_point('+<[>[>+<_-_]>.[<+>-]<<-]++++++++++.', 0)
+i7 = getarrayitem_gc(p1, i3, descr=<SignedArrayDescr>)
+i9 = int_sub(i7, 1)
+setarrayitem_gc(p1, i3, i9, descr=<SignedArrayDescr>)
+debug_merge_point('+<[>[>+<-_]_>.[<+>-]<<-]++++++++++.', 0)
+i10 = int_is_true(i9)
+guard_true(i10, descr=<Guard2>) [p0]
+debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)
+jump(p0, p1, i2, i3, descr=<Loop0>)
+[3c29fad7ba32ec] jit-log-opt-loop}
+
+

Much better! Each loop iteration is an add, a subtract, two array loads, two +array stores, and a guard on the exit condition. That's it! This code doesn't +require any program counter manipulation.

+

I'm no expert on optimizations; this tip was suggested by Armin Rigo on the +pypy-dev list. Carl Friedrich has a series of posts on how to optimize your +interpreter that are also very useful: https://bit.ly/bundles/cfbolz/1

+
+
+

Final Words

+

I hope this has shown some of you what PyPy is all about other than a faster +implementation of Python.

+

For those that would like to know more about how the process works, there are +several academic papers explaining the process in detail that I recommend. In +particular: Tracing the Meta-Level: PyPy's Tracing JIT Compiler.

+

See https://readthedocs.org/docs/pypy/en/latest/extradoc.html

+
+
+
+
+
+ + Winston Ewert wrote on 2011-04-06 21:59: +
+
+

Some interpreters are written to evaluate directly from the AST. i.e. they never generate bytecode, instead each node in the ast simply has the code to execute it as a "virtual" function. Could PyPy JIT such an interpreter? Or does it essentially assume a bytecode based interpreter?

+
+
+
+
+ + Anonymous wrote on 2011-04-07 05:56: +
+
+

In theory it should be able to, if it's written in RPython. Perhaps it would be harder to place the hints for the jit engine?

As far as I understand it, it still traces some kind of bytecode (generated from the RPython code), but uses the can_enter_jit hints to determine what to trace and the length of a trace.

If it'll be fast is another question though. Why not give it a try? (E.g. one could implement the LLVM kaleidoscope language in RPython.)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-04-07 06:05: +
+
+

@Winston in theory nothing prevents JIT from working on AST-based interpreters. In practice however, it would require a bit of engineering to convince the JIT that the green (constant) argument is a complex object structure. That's however just engineering

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-04-07 09:24: +
+
+

It's actually not a problem at all to have an AST-based interpreter. In fact, the Prolog uses "ASTs" (Prolog is homoiconic, so the ASTs are just Prologs normal data structures).

Maciej: that's not a problem if your ASTs are actually immutable. If they aren't you have a problem which indeed requires some engineering.

+
+
+
+
+ + Quiz wrote on 2011-04-07 10:45: +
+
+

The effect of the loop "[>+<-]" is

tape[position+1] += tape[position]
tape[position] = 0

We saw that PyPy can optimize the program counter away in this loop--but this loop could be executed in constant time. Will PyPy ever be able to optimize it to that degree?

+
+
+
+
+ + Winston Ewert wrote on 2011-04-10 01:53: +
+
+

Well, you finally motivated me to give it a try. I optimized the BF example and managed to get some pretty nice speed boosts all without dipping into the low level (aside from reading the log)

+
+
+
+
+ + Anonymous wrote on 2011-04-13 09:50: +
+
+

Great article, man! Many thanks and keep on rocking!

+
+
+
+
+ + Anonymous wrote on 2011-08-07 08:47: +
+
+

Great tutorial, but where can I find the 'test.b' file (mentioned for the tracing JIT) for a try?

+
+
+
+
+ + Anonymous wrote on 2012-11-22 10:50: +
+
+

hi guys. can jit merge points not be put inside methods? Going off example3.py, if I take the body of the while loop and move it into a method of the Tape class (along with the jitdriver), all the speed gains go away. can anyone explain why this happens? Thanks!

+
+
+
+
+ + Sarah Mount wrote on 2016-07-30 23:12: +
+
+

BTW the link to https://bit.ly/bundles/cfbolz/1 has bit-rotted.

+
+
+
+ +

Tutorial: Writing an Interpreter with PyPy, Part 1

+ +
+

This is a guest blog post written by Andrew Brown, with help from the PyPy developers +on the pypy-dev mailing list.

+

This tutorial's master copy and supporting files live at +https://bitbucket.org/brownan/pypy-tutorial/

+
+

When I first learned about the PyPy project, it took me a while to figure out +exactly what it was about. For those that don't already know, it's two things:

+
    +
  • A set of tools for implementing interpreters for interpreted languages
  • +
  • An implementation of Python using this toolchain
  • +
+

The second part is probably what most people think PyPy is, but this tutorial +is not about their Python interpreter. It is about writing your own +interpreter for your own language.

+

This is the project I undertook to help myself better understand how PyPy works +and what it's all about.

+

This tutorial assumes you know very little about PyPy, how it works, and even +what it's all about. I'm starting from the very beginning here.

+
+

What PyPy Does

+

Here's a brief overview of what PyPy can do. Let's say you want to write an +interpreted language. This involves writing some kind of source code parser, a +bytecode interpretation loop, and lots of standard library code.

+

That's quite a bit of work for moderately complicated languages, and there's a +lot of low level work involved. Writing the parser and compiler code usually +isn't fun, that's why there are tools out there to generate parsers and +compilers for you.

+

Even then, you still must worry about memory management in your interpreter, +and you're going to be re-implementing a lot if you want data types like +arbitrary precision integers, nice general hash tables, and such. It's enough +to put someone off from implementing their idea for a language.

+

Wouldn't it be nice if you could write your language in an existing high level +language like, for example, Python? That sure would be ideal, you'd get all the +advantages of a high level language like automatic memory management and rich +data types at your disposal. Oh, but an interpreted language interpreting +another language would be slow, right? That's twice as much interpreting going +on.

+

As you may have guessed, PyPy solves this problem. PyPy is a sophisticated +toolchain for analyzing and translating your interpreter code to C code (or JVM +or CLI). This process is called "translation", and it knows how to translate +quite a lot of Python's syntax and standard libraries, but not everything. All +you have to do is write your interpreter in RPython, a subset of the Python +language carefully defined to allow this kind of analysis and translation, and +PyPy will produce for you a very efficient interpreter.

+

Because efficient interpreters should not be hard to write.

+
+
+

The Language

+

The language I've chosen to implement is dead simple. The language runtime +consists of a tape of integers, all initialized to zero, and a single pointer +to one of the tape's cells. The language has 8 commands, described here:

+
+
>
+
Moves the tape pointer one cell to the right
+
+
+
<
+
Moves the tape pointer one cell to the left
+
+
+
Increments the value of the cell underneath the pointer (this is the "+" command)
+
-
+
Decrements the value of the cell underneath the pointer
+
+
+
[
+
If the cell under the current pointer is 0, skip to the instruction after +the matching ]
+
+
+
]
+
Skip back to the matching [ (evaluating its condition)
+
+
+
.
+
Print out a single byte to stdout from the cell under the pointer
+
+
+
,
+
Read in a single byte from stdin to the cell under the pointer
+
+

Any unrecognized bytes are ignored.

+

Some of you may recognize this language. I will be referring to it as BF.

+

One thing to notice is that the language is its own bytecode; there is no +translation from source code to bytecode. This means that the language can be +interpreted directly: the main eval loop of our interpreter will operate right +on the source code. This simplifies the implementation quite a bit.

+
+
+

First Steps

+

Let's start out by writing a BF interpreter in plain old Python. The first step +is sketching out an eval loop:

+
def mainloop(program):
+    tape = Tape()
+    pc = 0
+    while pc < len(program):
+        code = program[pc]
+
+        if code == ">":
+            tape.advance()
+        elif code == "<":
+            tape.devance()
+        elif code == "+":
+            tape.inc()
+        elif code == "-":
+            tape.dec()
+        elif code == ".":
+            sys.stdout.write(chr(tape.get()))
+        elif code == ",":
+            tape.set(ord(sys.stdin.read(1)))
+        elif code == "[" and value() == 0:
+            # Skip forward to the matching ]
+        elif code == "]" and value() != 0:
+            # Skip back to the matching [
+
+        pc += 1
+
+

As you can see, a program counter (pc) holds the current instruction index. The +first statement in the loop gets the instruction to execute, and then a +compound if statement decides how to execute that instruction.

+

The implementations of [ and ] are left out here, but they should change the +program counter to the position of the matching bracket. (The pc then gets +incremented, so the condition is evaluated once when entering a loop, and once +at the end of each iteration.)

+

Here's the implementation of the Tape class, which holds the tape's values as +well as the tape pointer:

+
class Tape(object):
+    def __init__(self):
+        self.thetape = [0]
+        self.position = 0
+
+    def get(self):
+        return self.thetape[self.position]
+    def set(self, val):
+        self.thetape[self.position] = val
+    def inc(self):
+        self.thetape[self.position] += 1
+    def dec(self):
+        self.thetape[self.position] -= 1
+    def advance(self):
+        self.position += 1
+        if len(self.thetape) <= self.position:
+            self.thetape.append(0)
+    def devance(self):
+        self.position -= 1
+
+

As you can see, the tape expands as needed to the right, indefinitely. We +should really add some error checking to make sure the pointer doesn't go +negative, but I'm not worrying about that now.

+

Except for the omission of the "[" and "]" implementation, this code will work +fine. However, if the program has a lot of comments, it will have to skip over +them one byte at a time at runtime. So let's parse those out once and for all.

+

At the same time, we'll build a dictionary mapping between brackets, so that +finding a matching bracket is just a single dictionary lookup. Here's how:

+
def parse(program):
+    parsed = []
+    bracket_map = {}
+    leftstack = []
+
+    pc = 0
+    for char in program:
+        if char in ('[', ']', '<', '>', '+', '-', ',', '.'):
+            parsed.append(char)
+
+            if char == '[':
+                leftstack.append(pc)
+            elif char == ']':
+                left = leftstack.pop()
+                right = pc
+                bracket_map[left] = right
+                bracket_map[right] = left
+            pc += 1
+
+    return "".join(parsed), bracket_map
+
+

This returns a string with all invalid instructions removed, and a dictionary +mapping bracket indexes to their matching bracket index.

+

All we need is some glue code and we have a working BF interpreter:

+
def run(input):
+    program, map = parse(input.read())
+    mainloop(program, map)
+
+if __name__ == "__main__":
+    import sys
+    run(open(sys.argv[1], 'r'))
+
+

If you're following along at home, you'll also need to change the signature of +mainloop() and implement the bracket branches of the if statement. Here's the +complete example: example1.py

+

At this point you can try it out to see that it works by running the +interpreter under python, but be warned, it will be very slow on the more +complex examples:

+
+$ python example1.py 99bottles.b
+
+

You can find mandel.b and several other example programs (not written by me) in +my repository.

+
+
+

PyPy Translation

+

But this is not about writing a BF interpreter, this is about PyPy. So what +does it take to get PyPy to translate this into a super-fast executable?

+

As a side note, there are some simple examples in the pypy/translator/goal +directory of the PyPy source tree that are helpful here. My starting point for +learning this was the example "targetnopstandalone.py", a simple hello world +for PyPy.

+

For our example, the module must define a name called "target" which returns the +entry point. The translation process imports your module and looks for that +name, calls it, and the function object returned is where it starts the +translation.

+
def run(fp):
+    program_contents = ""
+    while True:
+        read = os.read(fp, 4096)
+        if len(read) == 0:
+            break
+        program_contents += read
+    os.close(fp)
+    program, bm = parse(program_contents)
+    mainloop(program, bm)
+
+def entry_point(argv):
+    try:
+        filename = argv[1]
+    except IndexError:
+        print "You must supply a filename"
+        return 1
+
+    run(os.open(filename, os.O_RDONLY, 0777))
+    return 0
+
+def target(*args):
+    return entry_point, None
+
+if __name__ == "__main__":
+    entry_point(sys.argv)
+
+

The entry_point function is passed the command line arguments when you run the +resulting executable.

+

A few other things have changed here too. See the next section...

+
+
+

About RPython

+

Let's talk a bit about RPython at this point. PyPy can't translate arbitrary +Python code because Python is a bit too dynamic. There are restrictions on what +standard library functions and what syntax constructs one can use. I won't be +going over all the restrictions, but for more information see +https://readthedocs.org/docs/pypy/en/latest/coding-guide.html#restricted-python

+

In the example above, you'll see a few things have changed. I'm now using low +level file descriptors with os.open and os.read instead of file objects. +The implementations of "." and "," are similarly tweaked (not shown above). +Those are the only changes needed in this code; the rest is simple enough for +PyPy to digest.

+

That wasn't so hard, was it? I still get to use dictionaries, expandable lists, +and even classes and objects! And if low level file descriptors are too low for +you, there are some helpful abstractions in the rlib.streamio module included +with PyPy's "RPython standard library."

+

For the example thus far, see example2.py

+
+
+

Translating

+

If you haven't already, check out the latest version of PyPy from +their bitbucket.org repository:

+
+$ hg clone https://bitbucket.org/pypy/pypy
+
+

(A recent revision is necessary because of a bugfix that makes my example +possible)

+

The script to run is in "pypy/translator/goal/translate.py". Run this script, +passing in our example module as an argument.

+

[A note added much later: this script has been moved to "rpython/bin/rpython".]

+
+$ python ./pypy/pypy/translator/goal/translate.py example2.py
+
+

(You can use PyPy's python interpreter for extra speed, but it's not necessary)

+

PyPy will churn for a bit, drawing some nice looking fractals to your console +while it works. It takes around 20 seconds on my machine.

+

The result from this is an executable binary that interprets BF programs. +Included in my repository are some example BF programs, including a mandelbrot +fractal generator, which takes about 45 seconds to run on my computer. Try it +out:

+
+$ ./example2-c mandel.b
+
+

Compare this to running the interpreter un-translated on top of python:

+
+$ python example2.py mandel.b
+
+

Takes forever, doesn't it?

+

So there you have it. We've successfully written our own interpreter in RPython +and translated it with the PyPy toolchain.

+
+

(more in the next blog post...)

+
+
+
+
+
+ + Dunk wrote on 2011-04-05 14:10: +
+
+

nice post!

+
+
+
+
+ + DaNmarner wrote on 2011-04-05 16:35: +
+
+

Hmmmmmm, yum.

I'm going to translate this into Chinese, if you don't mind?

+
+
+
+
+ + Anonymous wrote on 2011-04-05 16:56: +
+
+

"devance"? I think you meant "retract".

+
+
+
+
+ + Paul Smith wrote on 2011-04-06 04:09: +
+
+

On my Ubuntu 10.10 laptop, the PyPy BF interpreter ran hanoi in ~20 sec and mandel in ~40 sec. By comparison, the beef BF interpreter (written in C) ran these in ~10 and ~20 sec., respectively. Not too shabby, PyPy.

+
+
+
+
+ + Unknown wrote on 2011-04-06 10:22: +
+
+

Nice article though I'm really missing a simple benchmark between the python interpreter and the pypy interpreter. "Takes forever" vs "45 seconds" isn't as awesome of a conclusion as I'd hoped for.

+
+
+
+
+ + Anonymous wrote on 2011-04-06 14:52: +
+
+

@temptemptemp13: I think you are missing something much more substantial. This article is not about Python at all. It is about how to use the PyPy toolchain to implement a different language - in this case the brainfuck programming language.

While BF isn't a very useful language, it has the nice properties of being very small. Almost all of the language fits in a blog post.

+
+
+
+
+ + Unknown wrote on 2011-04-08 10:32: +
+
+

Thanks. I've finally understood what PyPy is.

+
+
+
+
+ + Anonymous wrote on 2011-04-12 18:24: +
+
+

I like how this article became family-friendly by actually avoiding calling BF by its name :-)

+
+
+
+
+ + Davide wrote on 2011-04-15 03:52: +
+
+

Amazing! Thanks for posting. I was wondering, what's about a pure C or C++ implementations, as close as reasonable to the python one? So I wrote them. You can read more details here, but the bottom line is that PyPy is (marginally) faster than C++, and (marginally) slower than C :-O

+
+
+
+
+ + Antonio Cuni wrote on 2011-04-15 07:53: +
+
+

@Davide: you should compare your C version against the PyPy version WITH the JIT, as explained here:

https://morepypy.blogspot.com/2011/04/tutorial-part-2-adding-jit.html

I bet that PyPy will easily win :-)

+
+
+
+
+ + Anonymous wrote on 2011-12-12 01:15: +
+
+

Nice post. I just want to report that I tried running

/usr/share/pypy-1.6/pypy/translator/goal/translate.py example2.py

and got the following error.
This is with an Ubuntu 1.7 pypy package rebuilt on Debian squeeze (the 1.6 is a typo, it should be 1.7).

[translation:ERROR] Error:
[translation:ERROR] Traceback (most recent call last):
[translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/goal/translate.py", line 308, in main
[translation:ERROR] drv.proceed(goals)
[translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/driver.py", line 809, in proceed
[translation:ERROR] return self._execute(goals, task_skip = self._maybe_skip())
[translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/tool/taskengine.py", line 116, in _execute
[translation:ERROR] res = self._do(goal, taskcallable, *args, **kwds)
[translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/driver.py", line 286, in _do
[translation:ERROR] res = func()
[translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/driver.py", line 441, in task_backendopt_lltype
[translation:ERROR] from pypy.translator.backendopt.all import backend_optimizations
[translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/backendopt/all.py", line 2, in
[translation:ERROR] from pypy.translator.backendopt import removenoops
[translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/backendopt/removenoops.py", line 5, in
[translation:ERROR] from pypy import conftest
[translation:ERROR] File "/usr/share/pypy-1.6/pypy/conftest.py", line 1, in
[translation:ERROR] import py, pytest, sys, os, textwrap, types
[translation:ERROR] ImportError: No module named pytest
[translation] start debugger...
> /usr/share/pypy-1.6/pypy/conftest.py(1)()
-> import py, pytest, sys, os, textwrap, types
(Pdb+)

So, it looks like pytest needs to be installed. This does not appear to be available as a Debian package.

Regards, Faheem Mitha
(faheem at faheem dot info)

+
+
+
+
+ + James Mills wrote on 2013-02-14 05:44: +
+
+

This is a great post for anyone interested in programming languages :) Great post!

+
+
+
+
+ + ℭacilhας, ℒa ℬatalema wrote on 2013-02-23 02:12: +
+
+

Now, with os.read() and os.write():

[translation:ERROR] Error:
[translation:ERROR] Traceback (most recent call last):
[translation:ERROR] File "/opt/local/lib/pypy/src/pypy-pypy-07e08e9c885c/pypy/translator/goal/translate.py", line 303, in main
[translation:ERROR] drv.proceed(goals)
[translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/translator/driver.py", line 771, in proceed
[translation:ERROR] return self._execute(goals, task_skip = self._maybe_skip())
[translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/translator/tool/taskengine.py", line 116, in _execute
[translation:ERROR] res = self._do(goal, taskcallable, *args, **kwds)
[translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/translator/driver.py", line 283, in _do
[translation:ERROR] res = func()
[translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/translator/driver.py", line 319, in task_annotate
[translation:ERROR] s = annotator.build_types(self.entry_point, self.inputtypes)
[translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/annotation/annrpython.py", line 89, in build_types
[translation:ERROR] return self.build_graph_types(flowgraph, inputcells, complete_now=complete_now)
[translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/annotation/annrpython.py", line 142, in build_graph_types
[translation:ERROR] self.complete()
[translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/annotation/annrpython.py", line 217, in complete
[translation:ERROR] raise AnnotatorError(text)
[translation:ERROR] AnnotatorError: -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
[translation:ERROR] Blocked block -- operation cannot succeed
[translation:ERROR]
[translation:ERROR] v1 = ord(v0)
[translation:ERROR] In :
[translation:ERROR] Happened at file /Users/cacilhas/Workspace/Personal/brainfuck/src/brainfuck/parser.py line 29
[translation:ERROR]
[translation:ERROR] ==> tape.set(ord(os.read(0, 1)))
[translation:ERROR]
[translation:ERROR] Known variable annotations:
[translation:ERROR] v0 = SomeString(can_be_None=True)

+
+
+
+
+ + Dvd Fo wrote on 2013-08-26 12:25: +
+
+

I think that your "," implementation is incorrect, os.read returns an empty string on EOF, thus [0] triggers an exception.
According to Wikipedia, setting the cell to 0, -1 or leaving the cell unchanged each may be used to tell EOF apart from other characters.

+
+
+
+
+ + James wrote on 2015-12-02 05:50: +
+
+

I followed this tutorial again several years later :) (just for fun) using the newly published rpython toolchain now available up on PyPi. You can now just: pip install rpython -- I also wanted to point out that recent versions of the RPython toolchain have made advances in what it can translate it seems; specifically I did not need to change the open(...).read() parts to lower level os.read() calls.

+
+
+
+ +

PyPy Göteborg Post-Easter Sprint April 25 - May 1 2011

+ +
+

The next PyPy sprint will be in Gothenburg, Sweden. It is a public sprint, +very suitable for newcomers. We'll focus on making the 1.5 release (if +it hasn't already happened) and whatever interests the Sprint attendees.

+
+

Topics and goals

+

The main goal is to polish and release PyPy 1.5, supporting Python 2.7 +as well as the last few months' improvements in the JIT (provided that +it hasn't already happened). Other topics:

+
    +
  • Going over our documentation, and classifying our docs in terms of +mouldiness. Deciding what needs writing, and maybe writing it.
  • +
  • Helping people get their code running with PyPy
  • +
  • maybe work on EuroPython Training, and talks
  • +
  • Summer of Code preparation
  • +
  • speed.pypy.org
  • +
  • any other programming task is welcome too -- e.g. tweaking the +Python or JavaScript interpreter, Stackless support, and so on.
  • +
+
+
+

Location

+

The sprint will be held in the apartment of Laura Creighton and Jacob Hallén +which is at Götabergsgatan 22 in Gothenburg, Sweden. Here is a map. This is +in central Gothenburg. It is between the tram stops of Vasaplatsen and +Valand, (a distance of 4 blocks) where many lines call -- the 2, 3, 4, 5, +7, 10 and 13.

+

Probably cheapest and not too far away is to book accommodation at SGS +Veckobostader. The Elite Park Avenyn Hotel is a luxury hotel just a +few blocks away. There are scores of hotels a short walk away from the +sprint location, suitable for every budget, desire for luxury, and desire +for the unusual. You could, for instance, stay on a boat. Options are +too numerous to go into here. Just ask in the mailing list or on the blog.

+

Hours will be +from 10:00 until people have had enough. It's a good idea to arrive a +day before the sprint starts and leave a day later. In the middle of +the sprint there usually is a break day and it's usually ok to take +half-days off if you feel like it.

+
+
+

Good to Know

+

Sweden is not part of the Euro zone. One SEK (krona in singular, kronor +in plural) is roughly 1/10th of a Euro (9.36 SEK to 1 Euro).

+

The venue is central in Gothenburg. There is a large selection of +places to get food nearby, from edible-and-cheap to outstanding. We +often cook meals together, so let us know if you have any food allergies, +dislikes, or special requirements.

+

Sweden uses the same kind of plugs as Germany. 230V AC.

+

The Sprint will be held the week following Easter. This means, as always, +that Gothcon will be taking place the weekend before (Easter weekend). +Gothcon, now in its 35th year, is the largest European game players conference. +Some of you may be interested in arriving early for the board games. +The conference site is only in Swedish, alas. You don't need to register +in advance unless you are planning to host a tournament, (and it's too +late for that anyway).

+
+
+

Getting Here

+

If you are coming by train, you will arrive at the Central Station. It is +about 12 blocks to the site from there, or you can take a tram.

+

There are two airports which are local to Göteborg, Landvetter (the main +one) and Gothenburg City Airport (where some budget airlines fly). +If you arrive at Landvetter the airport bus stops right downtown at +Elite Park Avenyn Hotel which is the second stop, 4 blocks from the +Sprint site, as well as the end of the line, which is the Central Station. +If you arrive at Gothenburg City Airport take the bus to the end of the +line. You will be at the Central Station.

+

You can also arrive by ferry, from either Kiel in Germany or Frederikshavn +in Denmark.

+
+
+

Who's Coming?

+

If you'd like to come, please let us know when you will be arriving and +leaving, as well as letting us know your interests. We'll keep a list +of people which we'll update (which you can do yourself if you +have bitbucket pypy commit rights).

+
+
+
+
+
+ + intgr wrote on 2011-04-04 22:37: +
+
+

"e.g. tweaking the Python or JavaScript interpreter"

Are you implying that PyPy has a JavaScript interpreter now?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-04-05 13:58: +
+
+

It had one since a few years. It's not complete though: https://bitbucket.org/pypy/lang-js/overview

+
+
+
+
+ + vak wrote on 2011-04-28 08:59: +
+
+

any updates from the event?

+
+
+
+ +

Controlling the Tracing of an Interpreter With Hints, Part 4: Benchmarks

+ +
+

This is part 4 and the final part of the series on how to speed up an interpreter +written with PyPy by adding JIT hints to the interpreter. Part 1 described how +to control the extent of tracing. Part 2 described how to influence the +optimizer with promotion and pure functions. Part 3 described a simple object +model and how it can be optimized by doing small rewrites. In this (short) post +I present some benchmarks.

+
+

Benchmarks

+

For the benchmarks I ran a subset of the benchmarks on https://speed.pypy.org +with CPython and four different executables of PyPy's Python interpreter (all +with a JIT). The executables contain all combinations of enabling maps (which +make instance attributes fast) and type versions (which make method lookup +fast).

+
    +
  • +pypy-slow: contains neither maps nor type versions.
  • +
  • +pypy-map: contains maps but not type versions.
  • +
  • +pypy-version: contains type versions but not maps.
  • +
  • +pypy-full: contains both maps and type versions
  • +
+

The results are as follows:

+ +

The graph shows the speedup over CPython's numbers. The results are quite +interesting. Maps by themselves do not speed up much over the bare JIT, whereas +type versions alone improve on the JIT baseline in many cases. However, maps +are not useless. In combination with type versions they add a nice improvement +over just type versions in a number of benchmarks (most notably +raytrace-simple and richards but also in crypto-pyaes, django +and go).

+

It's clear that type versions can be arbitrarily effective. A method lookup on a +class can be arbitrarily slow, if the inheritance hierarchy becomes deeper and +deeper. The full lookup is replaced by one promotion if type versions are +enabled.

+

Maps on the other hand always replace one dict lookup with one promotion. Since +dict lookups are already very fast, this by itself does not lead to a gigantic +improvement. Only in combination with type versions do they show their full +potential.

+
+
+
+
+
+ + Winston Ewert wrote on 2011-03-26 20:17: +
+
+

It's not clear to me why version + maps combine so well. Maps should effectively eliminate lookups on the instance dict and versions eliminate lookups on the class dict. Both versions would seem to eliminate different classes of lookups, so I'm not seeing why we have dramatic improvement when using them together.

+
+
+
+
+ + Alex wrote on 2011-03-26 20:19: +
+
+

I'm not an expert at CPU architecture, but ISTM eliminating both can eliminate a large number of memory reads which would help with pipelining and other very low level optimizations.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-26 21:33: +
+
+

@Winston: I actually have no clue :-). The numbers are hard to deny though. I plan to stare at the traces a bit next week, can comment here if I find something interesting.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-27 14:52: +
+
+

@Winston: ok, I probably found out. Your reasoning is too simple because usually you do several lookups on the same object in a row. Every lookup looks first in the class, then in the instance. So it looks a bit like this:

lookup name1 in obj.__class__
lookup name1 in obj.__dict__
lookup name2 in obj.__class__
lookup name2 in obj.__dict__
lookup name2 in obj.__class__
lookup name2 in obj.__dict__

when using maps, every lookup in the dict is simply reading the map, promoting it and then a read. after the promotion of the map, the instance's layout is fully known. however, if type versions are disabled, the lookups in the class are complex operations that are opaque to the JIT. Therefore the JIT assumes they can change the layout and thus the map of the object.

If you also enable type versions, then the class lookups are understandable to the JIT. therefore the JIT can see that the class lookup didn't change the layout of the class. This means that after the first instance lookup, the following instance lookups cost nothing at all.

+
+
+
+
+ + klaussfreire wrote on 2011-03-28 15:04: +
+
+

I think an important improvement brought about by maps is the memory footprint reduction.

It won't matter all the time, but it makes all classes as space-efficient as if they used __slots__, all automagically, which is no small thing.

For programs that handle lots of small objects, this can really make a difference, in memory consumption and speed (less memory to shuffle around will invariably be faster)

Perhaps the benchmark suite doesn't have enough of those cases.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-03-28 22:16: +
+
+

@cfbolz I think one reason why maps+version tags are fast is because we lack jit.unroll_safe on several lookup functions when version tags are disabled. Marking them as unrollable would speed things up.

The reasoning behind this is that old style classes which have maps, but no version tags are much faster than new style classes with version tags disabled.

+
+
+
+
+ + Winston Ewert wrote on 2011-03-30 00:41: +
+
+

Thanks for taking the time to answer my query.

The use of class versions eliminates the opaque function being called because the JIT knows the return will be constant. This allows optimizations to work correctly. But this makes me wonder how much of the improvement is due to class versions and how much is due to lack of opaqueness.

At any rate, I always find the posts on this blog very interesting. It definitely some neat stuff you are doing here.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-30 11:30: +
+
+

@fijal I thought old-style classes had celldicts? That's yet another thing, but your point is still correct.

+
+
+
+
+ + Benjamin wrote on 2011-04-27 22:48: +
+
+

I'd love to see a blog post about conventions to favor or avoid while writing python code to best take advantage of these excellent features. For example, your previous post implied something like this would be faster than changing the class directly:

class Counter(object):
....def __init__(self):
........self.count = 0
....def increment(self):
........self.count += 1

class Many(object):
....counter = Counter()
....def __init__(self):
........self.counter.increment()

Granted, it would be preferable, from a coding standpoint, to just use a simple class attribute, but the adaptations that would likely work best for the pypy JIT seem like far smaller divergences from the 'ideal' python than many other lengths people go to when coding for speed, particularly compared to something like cython.

+
+
+
+ +

A thank you to the PSF

+ +
+

This year's PyCon was an incredible time; several members of the PyPy team were +there, and we'll be blogging more about our experiences in the coming days. +However, we quickly wanted to extend a thank you to the Python Software +Foundation (PSF).

+

As you may have heard, on Friday morning at PyCon Jesse Noller handed the PyPy +team a check for $10,000, on behalf of the PSF. This was in recognition of our +success over the past few years in bringing PyPy from a research project +to a fast, compliant, production-ready Python implementation, and to allow us +to continue our work on making it faster and more up-to-date with upstream +version changes.

+

Beyond the large check, we're grateful for the endorsement this represents, +not only of our work on PyPy, but also of all alternative Python VMs. +The PSF has shifted its focus from representing just CPython to representing +the Python Language, regardless of its implementation, something we are very +appreciative of.

+ +

From left to right, PyPy people present at PyCon 2011: Maciej Fijałkowski, Armin Rigo, Alex Gaynor, Laura Creighton and Jacob Hallén

+ +

Thank you, PSF.

+
+
+
+
+ + Hodgestar wrote on 2011-03-22 00:17: +
+
+

Congratulations! It's great to see the PSF embracing the broader Python ecosystem.

+
+
+
+
+ + Steve wrote on 2011-03-22 03:24: +
+
+

It's nice to be able to offer this support as an indication that we aren't just the CPython Software Foundation. It is a well-deserved award, and we know it will be put to good use.

+
+
+
+
+ + Unknown wrote on 2011-03-23 14:47: +
+
+

Yyes. Keep it Going! =)

+
+
+
+
+ + Unknown wrote on 2011-05-03 08:34: +
+
+

Wow, congratulations! PyPy has gone a long way.

+
+
+
+ +

Controlling the Tracing of an Interpreter With Hints, Part 3: Putting it All Together

+ +
+

This is part 3 of the series on how to speed up an interpreter written with +PyPy by adding JIT hints to the interpreter. Part 1 described how to control +the extent of tracing. Part 2 described how to influence the optimizer with +promotion and pure functions. In this post I describe a worked-out example of +a small object model for a dynamic language and how to make it efficient using +the hints described in the previous posts.

+
+

A Simple Object Model

+

To implement a dynamic language efficiently, the operations on its objects need +to be fast. Most dynamic languages have object models that are made by using +dictionaries everywhere. Let's look at an example of how the JIT can be made to +optimize such operations.

+

For the purpose of this blog post we will use a very simple and bare-bones +object model that just supports very simple classes and instances, without any +inheritance or any fancy features. The model has classes, which contain methods. +Instances have a class. Instances have their own attributes. When looking up an +attribute on an instance, the instance's attributes are searched. If the +attribute is not found there, the class' attributes are searched.

+

To implement this object model, we could use the following RPython code as part +of the interpreter source code:

+
class Class(object):
+    def __init__(self, name):
+        self.name = name
+        self.methods = {}
+
+    def instantiate(self):
+        return Instance(self)
+
+    def find_method(self, name):
+        result = self.methods.get(name)
+        if result is not None:
+            return result
+        raise AttributeError(name)
+
+    def change_method(self, name, value):
+        self.methods[name] = value
+
+
+class Instance(object):
+    def __init__(self, cls):
+        self.cls = cls
+        self.attributes = {}
+
+    def getfield(self, name):
+        result = self.attributes.get(name)
+        if result is not None:
+            return result
+        raise AttributeError(name)
+
+    def write_attribute(self, name, value):
+        self.attributes[name] = value
+
+    def getattr(self, name):
+        try:
+            return self.getfield(name)
+        except AttributeError:
+            return self.cls.find_method(name)
+
+

In this straightforward implementation the methods and attributes are just +stored in dictionaries on the classes/instances. While this object model is very +simple it already contains all the hard parts of Python's object model. Both +instances and classes can have arbitrary fields, and they are changeable at +any time. Moreover, instances can change their class after they have been +created.

+

When using this object model in +an interpreter, a huge amount of time will be spent doing lookups in these +dictionaries. To make the language efficient using a tracing JIT, we need to +find a way to get rid of these dictionary lookups somehow.

+

Let's assume we trace through code that sums three attributes, such as:

+
inst.getattr("a") + inst.getattr("b") + inst.getattr("c")
+
+

The trace could look like this:

+
# inst.getattr("a")
+attributes1 = inst.attributes
+result1 = dict.get(attributes1, "a")
+guard(result1 is not None)
+
+# inst.getattr("b")
+attributes2 = inst.attributes
+v1 = dict.get(attributes2, "b")
+guard(v1 is None)
+cls1 = inst.cls
+methods1 = cls1.methods
+result2 = dict.get(methods1, "b")
+guard(result2 is not None)
+v2 = result1 + result2
+
+# inst.getattr("c")
+attributes3 = inst.attributes
+v3 = dict.get(attributes3, "c")
+guard(v3 is None)
+cls2 = inst.cls
+methods2 = cls2.methods
+result3 = dict.get(methods2, "c")
+guard(result3 is not None)
+
+v4 = v2 + result3
+return(v4)
+
+

In this example, the attribute a is found on the instance, but the +attributes b and c are found on the class. The trace indeed contains +five calls to dict.get, which is slow.

+
+
+

Making Instance Attributes Faster Using Maps

+

The first step in making getattr faster in our object model is to optimize +away the dictionary lookups on the instances. The hints we have looked at in the +two earlier blog posts don't seem to help with the current object model. There is +no pure function to be seen, and the instance is not a candidate for promotion, +because there tend to be many instances.

+

This is a common problem when trying to apply hints. Often, the interpreter +needs a small rewrite to expose the pure functions and nearly-constant objects +that are implicitly there. In the case of instance fields this rewrite is not +entirely obvious. The basic idea is as follows. In theory instances can have +arbitrary fields. In practice however many instances share their layout (i.e. +their set of keys) with many other instances.

+

Therefore it makes sense to factor the layout information out of the instance +implementation into a shared object. This shared layout object is called a +map. Maps are an old idea that comes originally from the SELF language. They are +also used by many JavaScript implementations such as V8. I've written about maps +before, so I won't explain them fully again.

+

The rewritten Instance class using maps looks like this:

+
class Map(object):
+    def __init__(self):
+        self.attribute_indexes = {}
+        self.other_maps = {}
+
+    @purefunction
+    def getindex(self, name):
+        return self.attribute_indexes.get(name, -1)
+
+    @purefunction
+    def new_map_with_additional_attribute(self, name):
+        if name not in self.other_maps:
+            newmap = Map()
+            newmap.attribute_indexes.update(self.attribute_indexes)
+            newmap.attribute_indexes[name] = len(self.attribute_indexes)
+            self.other_maps[name] = newmap
+        return self.other_maps[name]
+
+
+EMPTY_MAP = Map()
+
+class Instance(object):
+    def __init__(self, cls):
+        self.cls = cls
+        self.map = EMPTY_MAP
+        self.storage = []
+
+    def getfield(self, name):
+        map = hint(self.map, promote=True)
+        index = map.getindex(name)
+        if index != -1:
+            return self.storage[index]
+        raise AttributeError(name)
+
+    def write_attribute(self, name, value):
+        map = hint(self.map, promote=True)
+        index = map.getindex(name)
+        if index != -1:
+            self.storage[index] = value
+            return
+        self.map = map.new_map_with_additional_attribute(name)
+        self.storage.append(value)
+
+    def getattr(self, name):
+        try:
+            return self.getfield(name)
+        except AttributeError:
+            return self.cls.find_method(name)
+
+

Instances no longer use dictionaries to store their fields. Instead, they have a +reference to a map, which maps field names to indexes into a storage list. The +storage list contains the actual field values. The maps are shared between +objects with the same layout. Therefore they have to be immutable, which means +that their getindex method is a pure function. When a new attribute is added +to an instance, a new map needs to be chosen, which is done with the +new_map_with_additional_attribute method on the previous map. Now that we have +introduced maps, it is safe to promote the map everywhere, because we assume +that the number of different instance layouts is small.

+

With this changed instance implementation, the trace we had above changes to the +following, where 0xb74af4a8 is the memory address of the Map instance that +has been promoted:

+
# inst.getattr("a")
+map1 = inst.map
+guard(map1 == 0xb74af4a8)
+index1 = Map.getindex(map1, "a")
+guard(index1 != -1)
+storage1 = inst.storage
+result1 = storage1[index1]
+
+# inst.getattr("b")
+map2 = inst.map
+guard(map2 == 0xb74af4a8)
+index2 = Map.getindex(map2, "b")
+guard(index2 == -1)
+cls1 = inst.cls
+methods1 = cls1.methods
+result2 = dict.get(methods1, "b")
+guard(result2 is not None)
+v2 = result1 + result2
+
+# inst.getattr("c")
+map3 = inst.map
+guard(map3 == 0xb74af4a8)
+index3 = Map.getindex(map3, "c")
+guard(index3 == -1)
+cls2 = inst.cls
+methods2 = cls2.methods
+result3 = dict.get(methods2, "c")
+guard(result3 is not None)
+
+v4 = v2 + result3
+return(v4)
+
+

The calls to Map.getindex can be optimized away, because they are calls to +a pure function and they have constant arguments. That means that index1/2/3 +are constant and the guards on them can be removed. All but the first guard on +the map will be optimized away too, because the map cannot have changed in +between. The optimized trace looks like this:

+
# inst.getattr("a")
+map1 = inst.map
+guard(map1 == 0xb74af4a8)
+storage1 = inst.storage
+result1 = storage1[0]
+
+# inst.getattr("b")
+cls1 = inst.cls
+methods1 = cls1.methods
+result2 = dict.get(methods1, "b")
+guard(result2 is not None)
+v2 = result1 + result2
+
+# inst.getattr("c")
+cls2 = inst.cls
+methods2 = cls2.methods
+result3 = dict.get(methods2, "c")
+guard(result3 is not None)
+
+v4 = v2 + result3
+return(v4)
+
+

The index 0 that is used to read out of the storage array is the result +of the constant-folded getindex call. This trace is already much better than +the original one. Now we are down from five dictionary lookups to just two.

+
+
+

Versioning of Classes

+

Instances were optimized making the assumption that the total number of +Instance layouts is small compared to the number of instances. For classes we +will make an even stronger assumption. We simply assume that it is rare for +classes to change at all. This is not totally reasonable (sometimes classes contain +counters or similar things) but for this simple example it is good enough.

+

What we would really like is if the Class.find_method method were pure. +But it cannot be, because it is always possible to change the class itself. +Every time the class changes, find_method can potentially return a +new value.

+

Therefore, we give every class a version number, which is increased every time a +class gets changed (i.e., the content of the methods dictionary changes). +This means that the result of methods.get() for a given (name, +version) pair will always be the same, i.e. it is a pure operation. To help +the JIT to detect this case, we factor it out in a helper method which is +explicitly marked as @purefunction. The refactored Class looks like +this:

+
class VersionTag(object):
+    pass
+
+class Class(object):
+    def __init__(self, name):
+        self.name = name
+        self.methods = {}
+        self.version = VersionTag()
+
+    def find_method(self, name):
+        self = hint(self, promote=True)
+        version = hint(self.version, promote=True)
+        result = self._find_method(name, version)
+        if result is not None:
+            return result
+        raise AttributeError(name)
+
+    @purefunction
+    def _find_method(self, name, version):
+        return self.methods.get(name)
+
+    def change_method(self, name, value):
+        self.methods[name] = value
+        self.version = VersionTag()
+
+

What is interesting here is that _find_method takes the version +argument but it does not use it at all. Its only purpose is to make the call +pure (because when the version number changes, the result of the call might be +different than the previous one).

+

The trace with this new class implementation looks like this:

+
# inst.getattr("a")
+map1 = inst.map
+guard(map1 == 0xb74af4a8)
+index1 = Map.getindex(map1, "a")
+guard(index1 != -1)
+storage1 = inst.storage
+result1 = storage1[index1]
+
+# inst.getattr("b")
+map2 = inst.map
+guard(map2 == 0xb74af4a8)
+index2 = Map.getindex(map2, "b")
+guard(index2 == -1)
+cls1 = inst.cls
+guard(cls1 == 0xb7aaaaf8)
+version1 = cls1.version
+guard(version1 == 0xb7bbbb18)
+result2 = Class._find_method(cls1, "b", version1)
+guard(result2 is not None)
+v2 = result1 + result2
+
+# inst.getattr("c")
+map3 = inst.map
+guard(map3 == 0xb74af4a8)
+index3 = Map.getindex(map3, "c")
+guard(index3 == -1)
+cls2 = inst.cls
+guard(cls2 == 0xb7aaaaf8)
+version2 = cls2.version
+guard(version2 == 0xb7bbbb18)
+result3 = Class._find_method(cls2, "c", version2)
+guard(result3 is not None)
+
+v4 = v2 + result3
+return(v4)
+
+

The calls to Class._find_method can now be optimized away, also the +promotion of the class and the version, except for the first one. The final +optimized trace looks like this:

+
# inst.getattr("a")
+map1 = inst.map
+guard(map1 == 0xb74af4a8)
+storage1 = inst.storage
+result1 = storage1[0]
+
+# inst.getattr("b")
+cls1 = inst.cls
+guard(cls1 == 0xb7aaaaf8)
+version1 = cls1.version
+guard(version1 == 0xb7bbbb18)
+v2 = result1 + 41
+
+# inst.getattr("c")
+v4 = v2 + 17
+return(v4)
+
+

The constants 41 and 17 are the results of the folding of the +_find_method calls. This final trace is now very good. It no longer performs any +dictionary lookups. Instead it contains several guards. The first guard +checks that the map is still the same. This guard will fail if the same +code is executed with an instance that has another layout. The second guard +checks that the class of inst is still the same. It will fail if the trace is +executed with an instance of another class. The third guard checks that the +class did not change since the trace was produced. It will fail if somebody +calls the change_method method on the class.

+
+
+

Real-World Considerations

+

The techniques used above for the simple object model are used for the object +model of PyPy's Python interpreter too. Since Python's object model is +considerably more complex, some additional work needs to be done.

+

The first problem that needs to be solved is that Python supports (multiple) +inheritance. Therefore looking up a method in a class needs to consider the +whole method resolution order. This makes the versioning of classes more +complex. If a class is changed its version changes. At the same time, the +versions of all the classes inheriting from it need to be changed as well, +recursively. This makes class changes expensive, but they should be rare. On the +other hand, a method lookup in a complex class hierarchy is as optimized in the +trace as in our object model here.

+

A downside of the versioning of classes that we haven't yet fixed in PyPy, is +that some classes do change a lot. An example would be a class that keeps a +counter of how many instances have been created so far. This is very slow right +now, but we have ideas about how to fix it in the future.

+

Another optimization is that in practice the shape of an instance is correlated +with its class. In our code above, we allow both to vary independently. +In PyPy's Python interpreter we act somewhat more cleverly. The class of +an instance is not stored on the instance itself, but on the map. This means +that we get one fewer promotion (and thus one fewer guard) in the trace, because the class doesn't need to +be promoted after the map has been.

+
+
+

More General Patterns

+

The techniques we used above to make instance and class lookups faster are +applicable in more general cases than the one we developed them for. A more +abstract view of maps is that of splitting a data-structure into a part that +changes slowly, and a part that changes quickly. In the concrete example of maps +we split the original dictionary into the map (the slow-changing part) and the +storage array (the quick-changing part). All the computation on the +slow-changing part can be constant-folded during tracing so that only the +manipulation of the quick-changing part remains.

+

Similarly, versions can be used to constant-fold arbitrary functions of large data +structures. The version needs to be updated carefully every time the result of +this function can change. Therefore this is useful only if the data structure is +expected to change slowly.

+
+
+

Conclusion

+

In this post I showed how to use purefunction and promote to make a +small but still relevant dynamic object model no longer use any dictionary lookups +after tracing. Instead a number of guards are inserted into the +trace to check whether the assumptions about the objects are still true. This +makes operations on objects seriously faster. I plan to write another small post +that shows the speed benefits for PyPy's Python interpreter for exactly these +operations.

+
+
+
+
+
+ + Unknown wrote on 2011-03-21 19:33: +
+
+

Very clever indeed.
I think and additional speedup can be achieved
by using a technique from smalltalk intrepters: Method lookup cache.
The cache is organized so that function
cache(class, method) returns a pointer to the method.
The early Smalltalk implementors reported pretty spectacular speedups when this cache was implemented.

+
+
+
+
+ + Anonymous wrote on 2011-03-21 20:03: +
+
+

SO MUCH AWESOME.

+
+
+
+
+ + RonnyPfannschmidt wrote on 2011-03-21 22:07: +
+
+

@vadiml: the jit+version tags already acts as method lookup cache for jited code
it basically inlines lookup(class, method)

+
+
+
+
+ + Unknown wrote on 2011-03-22 07:46: +
+
+

@RonnyPfannschmidt: thinking more about it
yes, you're right of course

+
+
+
+
+ + Anonymous wrote on 2011-03-23 18:37: +
+
+

I'm wondering about VersionTag(). The guard you've shown looks at its memory address. Doesn't PyPy use compacting garbage collectors? I seem to recall that from earlier posts about the cost of id().

+
+
+
+
+ + Anonymous wrote on 2011-03-23 20:23: +
+
+

Hmm. And now I think I know why twisted isn't any faster in pypy. I remember looking at the source a few years ago and being horrified to see that they were changing class methods during runtime. I guessed to avoid one layer of dispatch in state machines. Anyway, it's an "optimisation" that will hurt pypy.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-24 09:11: +
+
+

@Marius: You are right. The trace is a bit simplified, in practice there is an indirection so that if the GC moves the object, the trace still works.

@Anonymous: can you find that place in twisted? would be very interesting to see. Also it probably means we should implement these ideas about making changing classes not quite so inefficient.

+
+
+
+ +

Controlling the Tracing of an Interpreter With Hints, Part 2: Controlling Optimization

+ +
+

This is part 2 of a series on how to speed up an interpreter written with PyPy +by adding JIT hints to the interpreter. Part 1 described how to control the +extent of tracing. In this post I will describe how to add hints that +influence the optimizer. If applied correctly these techniques can give +really big speedups by pre-computing parts of what happens at runtime. On the other +hand, if applied incorrectly they might lead to code bloat, thus making the +resulting program actually slower.

+
+

Background

+

Before sending the trace to the backend to produce actual machine code, it is +optimized. The optimizer applies a number of techniques to remove or reduce +the number of operations: most of these are well known compiler optimization +techniques, with the difference that it is easier to apply them in a tracing +JIT because it only has to deal with linear traces. Among the techniques:

+ +

In some places it turns out that if the interpreter author rewrites some parts +of the interpreter with these optimizations in mind the traces that are produced +by the optimizer can be vastly improved.

+

In this post I will describe two hints that allow the interpreter author to +increase the optimization opportunities for constant folding. For constant +folding to work, two conditions need +to be met:

+
    +
  • the arguments of an operation actually need to all be constant, +i.e. statically known by the optimizer
  • +
  • the operation needs to be pure, i.e. always yield the same result given +the same arguments.
  • +
+

The PyPy JIT generator automatically detects the majority of these conditions. +However, for the cases in which the automatic detection does not work, the +interpreter author can apply hints to improve the optimization +opportunities. There is one kind of hint for both of the conditions above.

+

Note: These hints are written by an interpreter developer and applied to the +RPython source of the interpreter. Normal Python users will never see them.

+
+
+

Where Do All the Constants Come From

+

It is worth clarifying what is a "constant" in this context. A variable of +the trace is said to be constant if its value is statically known by the +optimizer.

+

The simplest example of constants are literal values. For example, if in the +RPython source code we have a line like y = x + 1, the second operand will +be a constant in the trace.

+

However, the optimizer can statically know the value of a variable even if it +is not a constant in the original source code. For example, consider the +following fragment of RPython code:

+
if x == 4:
+    y = y + x
+
+

If the fragment is traced with x being 4, the following trace is +produced:

+
+guard(x == 4)
+y = y + x
+
+

In the trace above, the value of x is statically known thanks to the +guard. Remember that a guard is a runtime check. The above trace will run to +completion when x == 4. If the check fails, execution of the trace is +stopped and the interpreter continues to run.

+

There are cases in which it is useful to turn an arbitrary variable +into a constant value. This process is called promotion and it is an old idea +in partial evaluation (it's called "the trick" there). Promotion is also heavily +used by Psyco and by all older versions of PyPy's JIT. Promotion is a technique +that only works well in JIT compilers; in +static compilers it is significantly less applicable.

+

Promotion is essentially a tool for trace specialization. In some places in the +interpreter it would be very useful if a variable were constant, even though it +could have different values in practice. In such a place, promotion is used. The +typical reason to do that is if there is +a lot of computation depending on the value of that variable.

+

Let's make this more concrete. If we trace a call to the following function:

+
def f1(x, y):
+    z = x * 2 + 1
+    return z + y
+
+

We get a trace that looks like this:

+
+v1 = x * 2
+z = v1 + 1
+v2 = z + y
+return(v2)
+
+

Observe how the first two operations could be constant-folded if the value of +x were known. Let's assume that the value of x can vary, but does so +rarely, i.e. only takes a few different values at runtime. If this is the +case, we can add a hint to promote x, like this:

+
def f2(x, y):
+    x = hint(x, promote=True)
+    z = x * 2 + 1
+    return z + y
+
+

The meaning of this hint is that the tracer should pretend that x is a +constant +in the code that follows. When just running the code, the function has no +effect, as it simply returns its first argument. When tracing, some extra work +is done. Let's assume that this changed function is traced with +the arguments 4 and 8. The trace will be the same, except for one +operation at the beginning:

+
+guard(x == 4)
+v1 = x * 2
+z = v1 + 1
+v2 = z + y
+return(v2)
+
+

The promotion is turned into a guard operation in the trace. The guard +captures the value of x as it was at runtime. From the point of view of the +optimizer, this guard is not any different than the one produced by the if +statement in the example above. After the guard, the rest of the trace can +assume that x is equal to 4, meaning that the optimizer will turn this +trace into:

+
+guard(x == 4)
+v2 = 9 + y
+return(v2)
+
+

Notice how the first two arithmetic operations were constant folded. The hope is +that the guard is executed quicker than the multiplication and the addition that +were now optimized away.

+

If this trace is executed with values of x other than 4, the guard will +fail, and execution will continue in the interpreter. If the guard fails often +enough, a new trace will be started from the guard. This other trace will +capture a different value of x. If it is e.g. 2, then the optimized +trace looks like this:

+
+guard(x == 2)
+v2 = 5 + y
+return(v2)
+
+

This new trace will be attached to the guard instruction of the first trace. If +x takes on even more values, a new trace will eventually be made for all of them, +linking them into a chain. This is clearly not desirable, so we should promote +only variables that don't vary much. However, adding a promotion hint will never produce wrong +results. It might just lead to too much assembler code.

+

Promoting integers, as in the examples above, is not used that often. +However, the internals of dynamic language interpreters often +have values that are variable but vary little in the context of parts of a user +program. An example would be the types of variables in a user function. Even +though in principle the argument to a Python function could be any Python type, +in practice the argument types tend to not vary much. Therefore it is possible to +promote the types. In the next blog post I will give a complete example for how +this works.

+
+
+

Declaring New Pure Operations

+

In the last section we saw a way to turn arbitrary variables into constants. All +pure operations on these constants can be constant-folded. This works great for +constant folding of simple types, e.g. integers. Unfortunately, in the context of an +interpreter for a dynamic +language, most operations actually manipulate objects, not simple types. The +operations on objects are often not pure and might even have side-effects. If +one reads a field out of a constant reference to an object this cannot +necessarily be folded away because the object can be mutated. Therefore, another +hint is needed.

+

As an example, take the following class:

+
class A(object):
+    def __init__(self, x, y):
+        self.x = x
+        self.y = y
+
+    def f(self, val):
+        self.y = self.compute() + val
+
+    def compute(self):
+        return self.x * 2 + 1
+
+

Tracing the call a.f(10) of some instance of A yields the following +trace (note how the call to compute is inlined):

+
+x = a.x
+v1 = x * 2
+v2 = v1 + 1
+v3 = v2 + val
+a.y = v3
+
+

In this case, adding a promote of self in the f method to get rid of the +computation of the first few operations does not help. Even if a is a +constant reference to an object, reading the x field does not necessarily +always yield the same value. To solve this problem, there is another annotation, +which lets the interpreter author communicate invariants to the optimizer. In +this case, she could decide that the x field of instances of A is +immutable, and therefore compute +is a pure function. To communicate this, there is a purefunction decorator. +If the code in compute should be constant-folded away, we would change the +class as follows:

+
class A(object):
+    def __init__(self, x, y):
+        self.x = x
+        self.y = y
+
+    def f(self, val):
+        self = hint(self, promote=True)
+        self.y = self.compute() + val
+
+    @purefunction
+    def compute(self):
+        return self.x * 2 + 1
+
+

Now the trace will look like this:

+
+guard(a == 0xb73984a8)
+v1 = compute(a)
+v2 = v1 + val
+a.y = v2
+
+

Here, 0xb73984a8 is the address of the instance of A that was used +during tracing. The call to compute is not inlined, so that the optimizer +has a chance to see it. Since the compute function is marked as pure, and its +argument +is a constant reference, the call will be removed by the optimizer. The final +trace looks like this:

+
+guard(a == 0xb73984a8)
+v2 = 9 + val
+a.y = v2
+
+

(assuming that the x field's value is 4).

+

On the one hand, the purefunction annotation is very powerful. It can be +used to constant-fold arbitrary parts of the computation in the interpreter. +However, the annotation also gives you ample opportunity to mess things up. If a +function is annotated to be pure, but is not really, the optimizer can produce +subtly wrong code. Therefore, a lot of care has to be taken when using this +annotation.

+
+

Observably Pure Functions

+

Why can't we simply write an analysis to find out that the x fields of the +A instances are immutable and deduce that compute is a pure function, +since it only reads the x field and does not have side effects? This might +be possible in this particular case, but in practice the functions that are +annotated with the purefunction decorator are usually more complex. +The easiest example for this is that of a function that uses memoization to +cache its results. If you analyze this function, it looks like the function has +side effects, because it changes the memoizing dictionary. However, because this side +effect is not externally visible, the function from the outside is pure. This is +a property that is not easily detectable by analysis. Therefore, the purity +of this function needs to be annotated.

+
+
+

Immutable Fields

+

One of the most common cases of pure functions is reading immutable +values out of objects. Since this is so common, we have special syntactic sugar +for it. An RPython class can have a class attribute _immutable_fields_ set to +a list of strings, listing the fields that cannot be changed. This is equivalent +to using getters and annotating them with purefunction.

+
+
+
+

Conclusion

+

In this blog post I explained two more hints that can be used in the source code +of the interpreter. They are used to influence what the optimizer does with the +trace. I realize the examples given here are a bit too small; in the next +installment I will give a worked-out example that puts all the pieces together.

+
+
+
+
+
+ + Gaëtan de Menten wrote on 2011-03-16 10:56: +
+
+

Again a very interesting post. I would like some precisions for one sentence:
"If x takes on even more values, a new trace will eventually be made for all of them, linking them into a chain."

Does it mean they are all tried in sequence, or is there some dispatch mechanism? If there isn't, wouldn't it be beneficial to have one in place (probably using a hash table of some sort) when there are more than a few values? Or is the number of "generated branches" never supposed to be large enough to make such an approach worthwhile?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-16 12:27: +
+
+

@Gaëtan:

Right now it's just a linear search always, which is clearly not ideal and we might very well fix this in the future. Currently we have the hope that in practice the number of values is always small, but we never measured.

+
+
+
+ +

Controlling the Tracing of an Interpreter With Hints, Part 1: Controlling the Extent of Tracing

+ +
+

The question I was asked most often during my recent US trip was how exactly +the hints work that interpreter authors can use to improve the execution speed +of the programs running on their interpreters. Since those hints are not really +documented all that well, I decided to write blog posts about them. This is the +first one.

+
+

Background

+

First, let's recap some basics: PyPy's approach to implementing dynamic +languages is to write an interpreter for +the language in RPython. This interpreter can be translated to C and then +further to machine code. The interpreter consists of code in the form of a +large number of generated C functions and some data. Similarly, the user +program consists of functions in the language the interpreter executes.

+

As was explained in a blog post and a paper two years ago, PyPy's JIT is a +meta-tracer. Since we want to re-use our tracer for a variety of languages, we +don't trace the execution of the user program, but instead trace the execution +of the interpreter that is running the program. This means that the traces +don't contain the bytecodes of the language in question, but RPython-level +operations that the interpreter did to execute the program.

+

On the other hand, the loops that are traced by the tracer are the loops in the +user program. This means that the tracer stops tracing after one iteration of +the loop in the user function that is being considered. At this point, it can +have traced many iterations of the interpreter main loop.

+

Here's a diagram of this process:

+ + + +

On the left you see the levels of execution. The CPU executes the binary of +PyPy's Python interpreter, which consists of RPython functions that have been +compiled first to C, then to machine code. Some of these functions contain +loops, others don't. The interpreter runs a Python program written by a +programmer (the user). If the tracer is used, it traces operations on the level +of the interpreter. However, the extent of the trace is determined by the loops +in the user program.

+
+
+

How Far Should Tracing Go

+

When the tracer encounters a function call at the interpreter level, e.g. the +interpreter main loop calling a helper function, it can do one of two things:

+
    +
  1. it can trace into the helper function, effectively inlining it into the trace.
  2. +
  3. it can not trace into the function and instead record a call to that function +as an operation in the trace. Such a call operation in the trace is sometimes +called a residual call.
  4. +
+

As a default, the tracer will try to trace into the helper because that will +give more information to the optimizer, allowing it to do a better job. This is +particularly important for the allocation removal optimization, because if a +freshly allocated object is passed as an argument to a residual call, its +allocation cannot be optimized away.

+

There is a problem however if the helper function itself contains a loop. The +tracer records the linear sequence of operations that are being executed. Thus +when it encounters a loop on the interpreter level it records all the +operations of every iteration of the loop itself, with the net effect of +unrolling it. The only places where the tracer stops and tries to close the +trace is in the main loop of the interpreter. When the tracer encounters the +main loop, it also checks whether the original user loop has been closed, and +thus whether it can stop tracing.

+

For most helper functions in the interpreter that contain loops, fully +unrolling does not make sense. If a loop is unrolled, the trace is specific to +the number of iterations that was seen during tracing. If the trace is later +executed with a different number of iterations, the trace will be left via a +guard failure, which is inefficient. Therefore the default behaviour of the +tracer is to never trace into a function on the interpreter level that contains +a loop, but to trace into all non-looping helper functions.

+

This default behaviour is essentially a heuristic, but one that usually makes +sense. We want to produce just enough traces to make the resulting code +efficient, but not more. Therefore we trace as much as possible (everything by +default) except the functions with loops where tracing would produce code that +is less general than it could be.

+

As an example for a helper with a loop, take string concatenation. It loops over +the characters of both arguments and copies them over into the result string. It +does not make sense to unroll the loops in this function. If we do that, +the resulting trace can only be used for strings of the length that was seen +during tracing. In practice, the string lengths are usually different each run, +meaning that the trace with unrolling is not run to completion in most cases.

+
+
+

Influencing the Default Behaviour

+

Sometimes the default behaviour is not actually what is wanted. This is +something the interpreter author has to decide, usually by looking at the traces +that are produced and deciding that they should be improved. There are two ways +in which the default is wrong:

+
    +
  • +false negatives: if a helper function that does contain a loop should +be traced into, unrolling the loop.
  • +
  • +false positives: if a helper function that does not contain a loop is +inlined into the trace, but the interpreter author decides that this is not +helpful.
  • +
+

If the interpreter author finds false negatives or false positives, she can fix +that by applying a hint to the tracer. These hints take the form of function +decorators (which both live in the pypy.rlib.jit module). In the next two +subsections I will describe these two function decorators and their use.

+
+

Unrolling Functions With Loops

+

The first decorator, used to fix false negatives, is the unroll_safe +decorator. It is used to tell the tracer to always trace into a function that +has a loop, effectively unrolling the loop. This decorator should be used only +if the loop in the helper function is expected to always run for the same number +of iterations. This sounds like a strong restriction, but in practice it is less +severe: the number of iterations only needs to be the same in the context where +the helper function is traced from.

+

It is easiest to understand this condition via an example. Let's look at the +BUILD_TUPLE bytecode in Python. It takes one argument, the length n of +the tuple being built. The bytecode pops n arguments from the stack, turns +them into a tuple and pushes that tuple on the stack. Thus the function that +implements BUILD_TUPLE in PyPy's Python interpreter calls a helper +popvalues which pops n values from the stack and returns them in a list. +This helper is implemented with a loop and would thus not be traced into by +default. The loop in the helper can run for very different numbers of +iterations, because it is used in a variety of places. However, for every +concrete BUILD_TUPLE bytecode, the argument will be constant. Therefore it +is safe (and even necessary) to annotate popvalues with the unroll_safe +decorator.

+

A different example is the implementation of the isinstance builtin. It is +used to check whether an object a is an instance of a class B like +this: isinstance(a, B). The second argument of the function can also be a +tuple of classes to check whether an object is an instance of one of a number of +classes: isinstance(a, (A, B, C, D)). To implement this second case, the +implementation of isinstance contains a loop iterating over the elements of +the tuple. The number of loop iterations can vary, but is usually fixed for each +individual call site which typically just lists a few classes in the source +code. Therefore it is also safe to annotate the implementation of isinstance +with the unroll_safe decorator.

+
+
+

Preventing the Tracing of Functions

+

The second decorator dont_look_inside is used to fix false positives. It +tells the JIT to never trace into the decorated function and just always produce +a residual call instead. This decorator is in many ways less important than the +unrolling one (except for a special situation that I will describe in a +follow-up post). It is used if tracing into a function is not expected to yield +any speed benefits, because the optimizer will not be able to improve it much. +This is often the case if the called helper function does not contain any +"dynamic" behaviour. In such a situation it is better to just leave the function +call in the trace, because that produces less code.

+

An example would be the import mechanism in Python. It's very unlikely that any +performance improvement can be had by turning part of it into assembler. +Therefore we hide it from the tracer by annotating the functions that implement it with +dont_look_inside.

+
+
+
+

Conclusion

+

In this post we discussed two hints that can be used to control precisely which +parts of the interpreter should be meta-traced. If these hints are used +carefully, this can go a long way to making the interpreter produce traces that +contain exactly the interesting part of the execution, and will contain calls to +the functions that can not be optimized by tracing techniques.

+

In the next part of this series I will discuss a different set of hints that can +be used to strongly optimize traces.

+
+
+
+
+
+ + Victor wrote on 2011-03-12 21:28: +
+
+

Would it be possible (i.e. is the code amenable) to programmatically randomly sprinkle these decorators around and compare effects on speed (or on measurable trace quality)?

It would make JIT generation a bit more meta :)

+
+
+
+
+ + Gaëtan de Menten wrote on 2011-03-13 10:42: +
+
+

Thanks for the very interesting post!

Sorry if the following questions are naive, but your post makes me wonder if not tracing at all the functions which contain loops with a varying number of iterations means that no optimization is possible at all for those loops? Also, wouldn't it be possible to detect there is a loop and produce a special kind of trace in that case which does not duplicate the body of the loop? I guess that if it was possible and useful, you'd have done it, so I guess the real question is: why doesn't this work?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-14 09:54: +
+
+

@Victor: yes, there are probably ways to place some of the hints more automatically. However, you will always have to look at the traces and think about how to improve them, so we chose the pragmatic path and didn't do anything magic.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-14 10:02: +
+
+

@Gaëtan: those are excellent questions!

Yes, functions in the interpreter with loops that we do not trace are not optimized at all. For most of these functions this is not a problem, e.g. string concatenation does not have much optimization potential anyway. However, there are some functions with loops (like the implementation of the map builtin) that would benefit from tracing, and we don't have a good general solution for that yet.

One of the ideas for solutions is indeed to try to start new traces in the interpreter functions with loops. We did not get around to playing with this yet, as there are not so many cases in the Python interpreter where this leads to a huge benefit.

+
+
+
+
+ + Gaëtan de Menten wrote on 2011-03-14 13:50: +
+
+

I'm puzzled now. I fail to see why those loops "do not have much optimization potential". I can understand that it's hard to optimize them because of the trace problem but I thought they would benefit from optimization like any other code (eg avoiding boxing/unboxing temporary variables), especially since they are within a loop, hence any gain will be multiplied by the number of iterations.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-14 14:01: +
+
+

@Gaëtan:
is it possible that you are mixing up the two levels involved? The post talked only about functions in the interpreter, not about the functions in pure Python that a user of the interpreter might write. To clarify:

- All loops on the application level, i.e. in the program the user wrote, are traceable and will be traced if they are executed often enough.

- Some loops in the interpreter itself are not. Most of these loops do not do any boxing/unboxing, so they won't benefit from optimization. For some of the loops that would benefit we added some manual hacks to trace them anyway, e.g. for the implementation of "map". Some others still need to be improved, e.g. any, all, zip, ...

+
+
+
+
+ + Unknown wrote on 2011-03-15 14:52: +
+
+

Carl, thanks for the post. The information is very helpful.

While I understand special casing to overwrite the default tracing/not-tracing rules can help performance, I wonder how well are the default heuristics performing. Do you have any bulk part estimation of the performance loss by turning off special casing? And how many hints (related to whether to trace or unroll) do you have to introduce to PyPy?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-15 16:00: +
+
+

Hi Peng,

Thanks :-). No, I didn't really do benchmarks yet, plan to do so in the future (these blog posts will turn into a paper soonish).

There are about 20-30 unroll_safe hints and equally many dont_look_inside hints. Some of them are really important, ie the speed would be abysmal without them. Most of them are really in the bytecode dispatch area, they are cases that e.g. Jython would not have, because in Jython the Python-to-Java compiler takes care of them.

+
+
+
+
+ + Gaëtan de Menten wrote on 2011-03-16 10:45: +
+
+

No, I wasn't confusing the two levels involved (if pypy wasn't optimizing variable-length loops in userlevel code, it wouldn't optimize much I guess).

My point was more theoretical: I guess that, in theory, those loops would benefit from optimizations like any other part of the interpreter. Your answer leads me to believe that *in practice* this isn't an issue because there are either not that many of them in the interpreter and/or they are not in speed critical parts and most of those that are important speed-wise have been taken care of manually in some way or another.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-03-16 12:15: +
+
+

@Gaëtan: yes, that's a good interpretation. At some point we might still think about a more general solution for this problem, to get the remaining rare cases fixed, but for now we have a lot of the common ones covered.

+
+
+
+
+ + Matty wrote on 2017-06-07 12:50: +
+
+

@Gaëtan
Untraceable Interpreter-level loops don't need to be optimized by the jit because they are aggressively optimized by the C compiler (remember that rpython is translated to C)

+
+
+
+ +

Bay Area 2011 Tour Summary

+ +
+

We spent the week in the San Francisco Bay Area showing off PyPy. +Here are notes and photos of the tour.

+
+

Day 1: Google SF

+

Google has offices in downtown San Francisco. They are at a beautiful +place and the views are spectacular. We thank Wesley Chun and Guido van +Rossum for organizing this meeting. Between 25 and 30 engineers showed +up. Some of them were Python programmers, but others were C++ +programmers; and they all seem to have real problems that they want to +solve with PyPy. We didn't have prepared slides so far, so we mostly +ran demos and talked. As predicted, Google would love SWIG support. +They suggested that we rename the translation toolchain (as we vaguely +thought too) to separate it more from PyPy's Python interpreter; up +until today, many had no idea that they could use PyPy for other +languages. All in all, it was very positive and people looked forward +to meeting up at PyCon.

+
+
+

Day 2: Stanford

+ + + + +

This was the most academically-oriented talk. You can find the +abstract, the slides (PgUp/PgDown to navigate) and the video here. +There were around 35 people in the audience, and maybe 1000 real-time +video watchers (who didn't get to ask questions). The live audience +seemed to be a mixture of students, professors, and people from the +local industry. We thank David Allison and Andy Freeman for organizing +it. It has been two or three years since they invited me (Armin) and I +finally managed to get here :-)

+

The slides are longer than the talk; we focused on the JIT because that +was what the audience was most interested in. They were really +impressed at the stability, the tests, and that we don't have lots of +bugs reported in the JIT of our latest public release. We later found +out that many who came to the talk believed that they were going to get +a talk about how we jitted a subset of python because real python is too +hard -- impossible to do. They came to heckle with examples of how +python was impossible. So they were amazed when the first slide of +Armin's presentation was "Python is complicated", and the next slide +"Python is messy". It was a positive outcome. We made new fans :-)

+
+
+

Day 3: Yelp

+ + + + + +

As you can see in the image, tons of people showed up -- ~140. Thanks +to Grace Law, who is the coordinator for the SF Python Meet-up, and to +Jimmy Retzlaff and Ashley King-Bishof from Yelp. Yelp is also located +in downtown San Francisco. This looks like the place to be if you are a +start-up in California (and not in Silicon Valley): lots of enthusiastic +young people are here, and they are hiring. Yelp has an enormous open +space, suitable for huge parties, and the coolest beer dispensers on the +planet, made as a hack-a-thon project by three Yelp engineers (pictured +below):

+ + + + + + + + + +

By the way, their management structure seems to be flat. There are +almost no line managers, i.e. managers for the engineering staff; +instead they self-organize into teams. This is not what you expect +for the USA; things appear to have changed a lot.

+

The talk was in two sections, "PyPy from the user's point of view" and +"How the JIT works". Good feedback; impressed that we support all of +Python 2.7 (including all the modules that are in C in the stdlib), and +impressed that the Python 3.0 conversion is not considered a big deal by +us, although we have no precise date yet. The plan is, of course, just +to tweak the interpreter until it supports both (by adding the necessary +conditions); the other aspects like GC and the JIT will not be affected +at all.

+
+
+

Day 4: Dropbox

+ + + + + + + +

This was another place full of excited, successful young people. The +CTO looks like he turned 30 last week, and he's been CTO for 4 years +now. The three of us were quite obviously the oldest people there. We +felt old. They have another great big open barn complex. It's +loud. Very loud. Loud refrigerators, loud street noise, loud machinery +in the walls doing who knows what, loudly.

+

This was the first tech talk at Dropbox. Thanks to Rian Hunter for +organizing it. They have a big kitchen, and we held the talk in there. +There was a skylight, which made the room too bright, so it was harder to read +the slides than would otherwise be the case. They were jazzed about our +visit, and wanted copies of all the pictures Jacob took before he left.

+

They seemed familiar with Google V8, and thought that how long it took +to build PyPy was a great incentive for us to make PyPy faster. They +are very interested in fast ctypes, fast SWIG, fast Cython. They were +pleased and surprised that we don't have too much JIT bloat (typically +~10% of the total RAM usage).

+

The mobile developers want a smaller Python more than a faster one. +Python takes too much memory given the tiny amount available on a lot of +cell phones. Not that we have an answer to this problem now.

+

They were pleased to learn that we will soon be able to JIT ctypes code. +And the fact that Armin knows many ways to segfault CPython was a bit of +a shock. We talked for an hour after the presentation. Again, a very +positive outcome.

+
+
+

Days 5 and 6: Noisebridge sprint

+ + + +

About six people showed up for the sprint. (Late. Californians really +do start the day at 11.) Noisebridge is a very eclectic place; people +show up to do pretty much everything from sewing to breaking apart +equipment to making robots and beer. It's donation-driven. Thanks to +Jim Stockford for volunteering the space and arranging this and helping +us set up for the sprint.

+

During the sprint, we did a little bit of everything; there was no clear +pattern. Ademan worked on sqlite, Greg Price looked to see if his +software could run on PyPy, Will worked on the documentation, and a few +of us fixed some more 2.7 tests. Alex Gaynor and Fijal joined us, too.

+
+
+

Day 7: Google Mountain View and Mozilla

+

We gave two talks on the 7th day of our trip so we were already quite +exhausted. Fortunately new people joined, so the talks were actually split +between multiple people. We would like to thank Peter Norvig and Ben Bayer +for inviting us to Google and Andreas Gal, Brendan Eich and Dave Herman +for inviting us to Mozilla. Both talks should hopefully appear online +at some point soon, but as of now we don't have a link.

+

It was pretty incredible to find ourselves at Mozilla talking with at +least 15 people who deeply understood the ideas of tracing JITs and +also understood why we made the decision to generate our JIT +instead of writing it. They suffered from having to write a JavaScript +JIT (even multiple ones) by hand, as Armin did with Psyco. He deeply +sympathizes. The discussion afterwards was very successful and we're +looking forward to cooperating with them. Many exciting things were +discussed as possibilities.

+

The next day we went to PyCon, which is ongoing and a topic for yet another +blog post.

+
+
+
+
+
+ + Luis wrote on 2011-03-11 00:29: +
+
+

Great post, but the links are broken...

+
+
+
+
+ + ipc wrote on 2011-03-11 11:39: +
+
+

thank you for sharing! The tour seems like a very good way to draw the attention of a lot of smart and influential people to the fantastic work you've been doing.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-03-11 14:12: +
+
+

@Luis thanks, fixed I hope. bitbucket is not very good at permalinks and I forgot extradoc has "tip" and not "default"

+
+
+
+
+ + Armin Rigo wrote on 2011-03-11 15:31: +
+
+

fijal: bitbucket serves html files as binary or something. This means that at least in Firefox we don't get the "ui" subdirectory, just the raw html. Annoying.

+
+
+
+
+ + Antonio Cuni wrote on 2011-03-11 15:38: +
+
+

@armin: I think that bitbucket's choice is the only reasonable one, else it could be probably exploited to do some sort of Cross-Site Scripting attack

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-03-11 15:52: +
+
+

Eh. That means we should host them somewhere else I fear.

+
+
+
+
+ + Andreas Mueller wrote on 2012-08-16 12:29: +
+
+

The link to the video seems to be broken. At least I can't find the video on the page that is linked to.
Could you please check?
Thanks,
Andy

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-19.html b/blog/index-19.html new file mode 100644 index 000000000..f5f2bbe63 --- /dev/null +++ b/blog/index-19.html @@ -0,0 +1,2818 @@ + + + + + + +PyPy (old posts, page 19) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Realtime image processing in Python

+ +
+

Image processing is notoriously a CPU intensive task. To do it in realtime, +you need to implement your algorithm in a fast language, hence trying to do it +in Python is foolish: Python is clearly not fast enough for this task. Is it? +:-)
+Actually, it turns out that the PyPy JIT compiler produces code which is fast +enough to do realtime video processing using two simple algorithms implemented +by Håkan Ardö.
sobel.py implements a classical way of locating edges in images, the +Sobel operator. It is an approximation of the magnitude of the image +gradient. The processing time is spent on two convolutions between the +image and 3x3-kernels.
magnify.py implements a pixel coordinate transformation that rearranges +the pixels in the image to form a magnifying effect in the center. +It consists of a single loop over the pixels in the output image copying +pixels from the input image.
+You can try by yourself by downloading the appropriate demo:

+
+ +
+To run the demo, you need to have mplayer installed on your system. The +demo has been tested only on linux, it might (or not) work also on other +systems:
$ pypy pypy-image-demo/sobel.py
+
+$ pypy pypy-image-demo/magnify.py
+
+By default, the two demos use an example AVI file. To have more fun, you can +use your webcam by passing the appropriate mplayer parameters to the scripts, +e.g.:
$ pypy demo/sobel.py tv://
+
+By default magnify.py uses nearest-neighbor interpolation. By adding the +option -b, bilinear interpolation will be used instead, which gives +a smoother result:
$ pypy demo/magnify.py -b
+
+There is only a single implementation of the algorithm in +magnify.py. The two different interpolation methods are implemented by +subclassing the class used to represent images and embedding the +interpolation within the pixel access method. PyPy is able to achieve good +performance with this kind of abstraction because it can inline +the pixel access method and specialize the implementation of the algorithm. +In C++ that kind of pixel access method would be virtual and you'll need to use +templates to get the same effect without incurring runtime overhead.
+ + + + +
+The video above shows PyPy and CPython running sobel.py side by +side (PyPy taking input from the webcam, CPython from the test +file). Alternatively, to get a feeling for how much faster PyPy is than +CPython, try to run the demo with the latter. These are the average fps +(frames per second) that I get on my machine (Ubuntu 64 bit, Intel i7 920, 4GB +RAM) when processing the default test.avi video and using the prebuilt +PyPy binary found in the full tarball linked above. For sobel.py:
+
    +
  • PyPy: ~47.23 fps
  • +
  • CPython: ~0.08 fps
  • +
+
+For magnify.py:
+
    +
  • PyPy: ~26.92 fps
  • +
  • CPython: ~1.78 fps
  • +
+
+This means that on sobel.py, PyPy is 590 times faster. On +magnify.py the difference is much less evident and the speedup is "only" +15x.
+It must be noted that this is an extreme example of what PyPy can do. In +particular, you cannot expect (yet :-)) PyPy to be fast enough to run an +arbitrary video processing algorithm in real time, but the demo still proves +that PyPy has the potential to get there. +
+
+
+
+ + Anonymous wrote on 2011-07-07 17:47: +
+
+

Pypy is awesome!

+
+
+
+
+ + Anonymous wrote on 2011-07-07 18:19: +
+
+

I have a n00b problem: On Mac OS X 10.5.8, the precompiled pypy binary crashes with this message:
dyld: Library not loaded: /usr/lib/libssl.0.9.8.dylib

What's up with this? Thanks, and sorry for being offtopic.

+
+
+
+
+ + metapundit.net wrote on 2011-07-07 19:17: +
+
+

I saw this demo recently when Dan Roberts presented at Baypiggies. We broke into spontaneous applause when the pypy runtime ran at a watchable speed after cpython ran at less than 1 frame/second. Very impressive!

+
+
+
+
+ + Anonymous wrote on 2011-07-07 21:07: +
+
+

Anonymous, can you read?

"prebuilt PyPy binaries for linux 32 and 64 bits"
"The demo has been tested only on linux, it might (or not) work also on other systems"

Mac OS X is not Linux.

+
+
+
+
+ + schmichael wrote on 2011-07-07 21:23: +
+
+

Perhaps add a comment to sobel.py explaining what "pypyjit.set_param(trace_limit=200000)" does?

+
+
+
+
+ + Luis wrote on 2011-07-07 22:27: +
+
+

The only change I'd like to see in this project is its name... Trying to gather news from twitter for example, makes me search amongst thousands of comments in japanese (pypy means "boobies" in japanese), other incomprehensible comments in malay and hundreds of music fans of Look-Ka PYPY (WTF??)

+
+
+
+
+ + Anonymous wrote on 2011-07-07 22:58: +
+
+

Other Anonymous: Yes, I can read. I should have given a bit more context, but I was offtopic anyway. My goal was not running the demo, my goal was running pypy. I used the OS X binary from pypy.org. For those who are really good at reading, this was probably clear from the fact that my binary only crashed at library loading time.

+
+
+
+
+ + Antonio Cuni wrote on 2011-07-07 23:03: +
+
+

@Anonymous: most probably, the prebuilt PyPy for Mac Os X was built on a system different (older?) than yours.

For a quick workaround, you can try to do "ln -s /usr/lib/libssl-XXX.dylib /usr/lib/libssl.0.9.8.dylib". This should at least make it working, but of course it might break in case you actually use libssl.

The proper fix is to recompile PyPy by yourself.

+
+
+
+
+ + Antonio Cuni wrote on 2011-07-07 23:08: +
+
+

@schmichael

to avoid the potential problem of infinite tracing, the JIT bails out if it traces "too much", depending on the trace_limit.
In this case, the default trace_limit is not enough to fully optimize the whole algorithm, hence we need to help the JIT by telling it to trace a bit more than usual.

I agree that having to mess up with the internal parameters of the JIT is suboptimal. I plan to address this issue in the next weeks.

+
+
+
+
+ + relet wrote on 2011-07-07 23:43: +
+
+

How does it perform against python-opencv?

+
+
+
+
+ + Anonymous wrote on 2011-07-07 23:47: +
+
+

Antonio: Thanks for the quick reply. Unfortunately pypy can't be misled with the symlink hack: "Reason: Incompatible library version: pypy requires version 0.9.8 or later, but libssl.0.9.8.dylib provides version 0.9.7"

It seems like the prebuilt was created on a 10.6, and it does not work on vanilla 10.5 systems. Not a big deal, but is good to know.

+
+
+
+
+ + Anonymous wrote on 2011-07-08 04:44: +
+
+

Thanks for posting this. pypy is great. I'm trying to figure out how to write modules in RPython. I was sad that I missed the Baypiggies presentation.

+
+
+
+
+ + René Dudfield wrote on 2011-07-08 07:35: +
+
+

Hello,

it's lovely that pypy can do this. This result is amazing, wonderful, and is very kittens. pypy is fast at running python code (*happy dance*).

But.

It also makes kittens cry when you compare to CPython in such a way.

The reality is that CPython users would do this using a library like numpy, opencv, pygame, scipy, pyopengl, freej (the list of real time video processing python libraries is very large, so I won't list them all here).

Of course python can do this task well, and has for more than 10 years.

This code does not take advantage of vectorization through efficient SIMD, multiple cores or graphics hardware, and isn't careful with reusing memory - so is not within an order of magnitude of the speed of CPython code with libraries doing real time video processing.

Anyone within the field would ask about using these features.

Another question they would ask is about pauses. How does the JIT affect pauses in animation? What are the rules for when the JIT warms up, and how can you tell when the code will start running fast? How does the GC affect pauses? If there is a way to turn off the GC, or reuse memory in some way such that the GC won't cause the program to fail(Remember that in realtime a pause is a program fail). Does the GC pool memory of similar size objects automatically? Does the GC work well with 256MB-1GB-16GB sized objects? In a 16GB system, can you use 15GB of objects, and then delete those objects to then use another 15GB of different objects? Or will the program swap, or fragment memory causing pauses?

Please don't make kittens cry. Be realistic with CPython comparisons.


At the moment the python implementation is not as elegant as a vector style implementation. A numpy/matlab/CUDA/OpenCL approach looks really nice for this type of code. One speed up might be to reuse memory, or act in place where possible. For example, not copying the image... unless the GC magically takes care of that for you.

+
+
+
+
+ + Jacob Hallén wrote on 2011-07-08 08:21: +
+
+

@illume:More or less everyone knows that you can speed up your code by writing or using an extension library. Unfortunately this introduces a dependency on the library (for instance libssl mentioned in the comment thread) and it usually increases the complexity of your code.

Using PyPy you can solve computationally intensive problems in plain Python. Writing in Python saves development time. This is what the comparison is all about.

+
+
+
+
+ + René Dudfield wrote on 2011-07-08 12:23: +
+
+

hi @jacob: below is code which runs either multi core, vectorised SIMD, and on a GPU if you like. You'll notice that it is way shorter and more elegant than the 'pure python' code.

def sobelEdgeDetect(im=DImage, p=Position):
....wX = outerproduct([1,2,1],[-1,0,1])
....wY = transpose(wX)

....Gx = convolve(wX,im,p)
....Gy = convolve(wY,im,p)

....return sqrt(Gx**2 + Gy**2)

If pypy is 5x slower than C, and SIMD is 5x faster than C... and using multiple cores is 8x faster than a single core you can see this python code is (5 * 5 * 8) 200x faster than the pypy code. This is just comparing CPU based code. Obviously GPU code for real time image processing is very fast compared to CPU based code.

Things like numpy, pyopengl etc come packaged with various OSes - but choosing those dependencies compared to depending on pypy is a separate issue I guess (but many cpython packaged libraries are packaged for more platforms than pypy).

Of course using tested, and debugged existing code written in python will save you development time: for example using sobel written with the scipy library:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.filters.sobel.html

The fact is CPython is fast enough, more elegant, and will save you time for realtime image processing - unless you ignore the reality that people use CPython libraries for these tasks.

Finally the given code does not prove that the frames are all processed in realtime. They give an average time over all of the frames. Realtime video requires that you meet your target speed for every frame. It would need to be extended to measure each frame to make sure that each frame is within the required time budget.

+
+
+
+
+ + Antonio Cuni wrote on 2011-07-08 12:31: +
+
+

@illume: I think you completely missed the point of the blog post. This is not about "you should use pypy to do video processing", it's about "pypy runs pure python code very fast".

+
+
+
+
+ + René Dudfield wrote on 2011-07-08 12:58: +
+
+

@Antonio Cuni, I'm saying the post reads like cpython can not do "realtime image processing in python" and that pypy can.

+
+
+
+
+ + tismer wrote on 2011-07-08 14:21: +
+
+

@illume:
This example shows pure python code and compares its execution time in cpython and pypy. Nothing else. Writing graphics code in pure python that runs not dreadfully slow was to my knowledge never before shown.
If enough people understand the potential of this technique and put their time into it, we will hopefully come closer to your (5 * 5 * 8) acceleration in pypy, too.
I will for sure work on this.

+
+
+
+
+ + Eventh wrote on 2011-07-08 14:41: +
+
+

SIMD instructions and multi core support is something PyPy has potential to support, given time and funding.

+
+
+
+
+ + Anonymous wrote on 2011-07-08 21:20: +
+
+

The typical optimization path here would be implementing the necessary numpy array operations for the algorithms described. I wonder how a proper numpy implementation would compare.

+
+
+
+
+ + Armin Rigo wrote on 2011-07-09 13:38: +
+
+

I think you are still missing the point of the post. It was not "use pure Python to write your video processing algos". That's of course nonsense, given the amount and quality of existing C extension modules to do that.

The point is that when you want to experiment with writing a new algorithm of any kind, it is now possible to do it in pure Python instead of, say, C code. If later your project needs to move past the experimentation phase, you will have to decide if you want to keep that Python code, rewrite it in C, or (if applicable) use SIMD instructions from Python or from C, or whatever.

The real point of this demo is to show that PyPy makes Python fast enough as an early experimentation platform for almost any kind of algorithm. If you can write in Python instead of in C, you'll save 50% of your time (random estimate); and then for the 5% of projects that go past the experimentation phase and where Python is not enough (other random estimate), spend more time learning other techniques and using them. The result is still in your favor, and it's only going to be more so as PyPy continues to improve.

+
+
+
+
+ + Yaacov wrote on 2011-10-18 23:31: +
+
+

I was hoping to experiment with this amazing demo on my Windows-based computers. Any advice for how I would start making the required changes?

Jacob

+
+
+
+
+ + Anonymous wrote on 2012-07-24 13:38: +
+
+

dead links

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-07-24 13:41: +
+
+

Unfortunately the server died :( I'm not sure where exactly are packaged demos, but they can be run from:

https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/talk/iwtc11/benchmarks/image

+
+
+
+
+ + Unknown wrote on 2012-10-04 22:08: +
+
+
The python code for this seems to be now here:
https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/talk/dls2012/demo +
+
+
+
+ + Unknown wrote on 2012-10-04 22:09: +
+
+

The scripts can be found here:

https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/153804ce4fc3/talk/dls2012/demo

+
+
+
+ +

Global Interpreter Lock, or how to kill it

+ +
+

People that listened to my (Armin Rigo) lightning talk at EuroPython know that +suddenly, we have a plan to remove the Global Interpreter Lock --- the +infamous GIL, the thing in CPython that prevents multiple threads from +actually running in your Python code in parallel.

+

That's not actually new, because Jython has been doing it all along. +Jython works by very carefully adding locks to +all the mutable built-in types, and by relying on the underlying Java +platform to be efficient about them (so that the result is faster than, +say, very carefully adding similar locks in CPython). By "very +carefully", I mean really really carefully; for example, +'dict1.update(dict2)' needs to lock both dict1 and dict2, but if you do +it naively, then a parallel 'dict2.update(dict1)' might cause a +deadlock.

+

All of PyPy, CPython and IronPython have a GIL. But for PyPy we are considering +a quite different approach than Jython's, based on Software +Transactional Memory. This is a recent development in computer +science, and it gives a nicer solution than locking. Here is a short +introduction to it.

+

Say you want to atomically pop an item from 'list1' and append it to +'list2':

+
+def f(list1, list2):
+    x = list1.pop()
+    list2.append(x)
+
+

This is not safe in multithreaded cases (even with the GIL). Say that +you call f(l1, l2) in thread 1 and f(l2, l1) in thread 2. What +you want is that it has no effect at all (x is moved from one list to +the other, then back). But what can occur is that instead the top of +the two lists are swapped, depending on timing issues.

+

One way to fix it is with a global lock:

+
+def f(list1, list2):
+    global_lock.acquire()
+    x = list1.pop()
+    list2.append(x)
+    global_lock.release()
+
+

A finer way to fix it is with locks that come with the lists:

+
+def f(list1, list2):
+    acquire_all_locks(list1.lock, list2.lock)
+    x = list1.pop()
+    list2.append(x)
+    release_all_locks(list1.lock, list2.lock)
+
+

The second solution is a model for Jython's, while the first is a model +for CPython's. Indeed, in CPython's interpreter, we acquire the GIL, +then we do one bytecode (or actually a number of them, like 100), then +we release the GIL; and then we proceed to the next bunch of 100.

+

Software Transactional Memory (STM) gives a third solution:

+
+def f(list1, list2):
+    while True:
+        t = transaction()
+        x = list1.pop(t)
+        list2.append(t, x)
+        if t.commit():
+            break
+
+

In this solution, we make a transaction object and use it in all +reads and writes we do to the lists. There are actually several +different models, but let's focus on one of them. During a transaction, +we don't actually change the global memory at all. Instead, we use the +thread-local transaction object. We store in it which objects we +read from, which objects we write to, and what values we write. It is +only when the transaction reaches its end that we attempt to "commit" +it. Committing might fail if other commits have occurred in between, +creating inconsistencies; in that case, the transaction aborts and +must restart from the beginning.

+

In the same way as the previous two solutions are models for CPython and +Jython, the STM solution looks like it could be a model for PyPy in the +future. In such a PyPy, the interpreter would start a transaction, do +one or several bytecodes, and then end the transaction; and repeat. +This is very similar to what is going on in CPython with the GIL. In +particular, it means that it gives programmers all the same guarantees +as the GIL does. The only difference is that it can actually run +multiple threads in parallel, as long as their code does not interfere +with each other. (In particular, if you need not just the GIL but actual +locks in your existing multi-threaded program, then this will not +magically remove the need for them. You might get an additional built-in +module that exposes STM to your Python programs, if you prefer it over +locks, but that's another question.)

+

Why not apply that idea to CPython? Because we would need to change +everything everywhere. In the example above, you may have noted that I +no longer call 'list1.pop()', but 'list1.pop(t)'; this is a way to tell +that the implementation of all the methods needs to be changed, in order +to do their work "transactionally". This means that instead of really +changing the global memory in which the list is stored, it must instead +record the change in the transaction object. If our interpreter is +written in C, as CPython is, then we need to write it explicitly +everywhere. If it is written instead in a higher-level language, as +PyPy is, then we can add this behavior as a set of translation rules, and +apply them automatically wherever it is necessary. Moreover, it can be +a translation-time option: you can either get the current "pypy" with a +GIL, or a version with STM, which would be slower due to the extra +bookkeeping. (How much slower? I have no clue, but as a wild guess, +maybe between 2 and 5 times slower. That is fine if you have enough +cores, as long as it scales nicely :-)

+

A final note: as STM research is very recent (it started around 2003), +there are a number of variants around, and it's not clear yet which one +is better in which cases. As far as I can tell, the approach described +in "A Comprehensive Strategy for Contention Management in Software +Transactional Memory" seems to be one possible state-of-the-art; it also +seems to be "good enough for all cases".

+

So, when will it be done? I cannot say yet. It is still at the idea +stage, but I think that it can work. How long would it take us to +write it? Again no clue, but we are looking at many months rather +than many days. This is the sort of thing that I would +like to be able to work on full time after the Eurostars funding +runs out on September 1. We are currently looking at ways to use +crowdfunding to raise money so that I can do exactly that. Expect +a blog post about that very soon. But this looks like a perfect +candidate for crowdfunding -- there are at least thousands of you who +would be willing to pay 10s of Euros to Kill the GIL. Now we only +have to make this happen.

+
+
+
+
+ + Michael Foord wrote on 2011-06-29 17:54: +
+
+

If you concurrently run two transactions that interfere with each other - and they both restart on failure - isn't there a possibility that neither would ever complete? How would you mitigate against that? (Fallback to a global lock after a certain number of transaction failures perhaps?)

+
+
+
+
+ + Anonymous wrote on 2011-06-29 18:13: +
+
+

There's a thing that is not clear to me: how do you detect failures during commits?

+
+
+
+
+ + jdhardy wrote on 2011-06-29 18:16: +
+
+

IronPython doesn't have a GIL - it's the same as Jython.

+
+
+
+
+ + Michael Foord wrote on 2011-06-29 18:17: +
+
+

Plus transactions have to be scoped around code that is side-effect free (or you can guarantee containing the side-effects within the transaction). Why STM research was done in Haskell I guess. Anyway, it sounds like a hard problem. That's why Armin is interested I guess... :-)

+
+
+
+
+ + Antonio Cuni wrote on 2011-06-29 18:23: +
+
+

@michael: if two transactions conflict, you rollback only one of those, and from the external the effect is the same as having one locked by the GIL

About side effects: the plan is to close a transaction before a side effect operation and reopen a new one after it: this is what happens already with the GIL, which is released e.g. before I/O calls.

At least, this is how I understand it, and since I'm not Armin I might be wrong :-)

+
+
+
+
+ + Michael Foord wrote on 2011-06-29 18:26: +
+
+

@antonio
Ah, that makes sense. Thanks. :-)

+
+
+
+
+ + Anonymous wrote on 2011-06-29 18:30: +
+
+

This sounds like a great idea...

What happens when transaction interleaves together and fail? Both threads will still continue trying so to me this appears to be somewhat as efficient as locks. (Note I know nothing in this topic and would definitely like to learn more).

+
+
+
+
+ + Sebastian Noack wrote on 2011-06-29 19:14: +
+
+

I don't think that the primary reason STM is slower than the GIL, is the extra bookkeeping, but the fact that things need to be repeated. However, I could imagine, that STM still might yield better response times than acquiring locks, in some cases.

+
+
+
+
+ + Tuomas Jorma Juhani Räsänen wrote on 2011-06-29 20:27: +
+
+

STM is not that "recent" though:

Nir Shavit and Dan Touitou. Software transactional memory. In PODC '95: Proceedings of the fourteenth annual ACM symposium on Principles of distributed computing, pages 204-213, New York, NY, USA, 1995. ACM.

+
+
+
+
+ + xyproto wrote on 2011-06-29 20:34: +
+
+

I can imagine the reason this is efficient is because code often work on different parts of memory in different threads.

+
+
+
+
+ + ChrisW wrote on 2011-06-29 22:17: +
+
+

Hmm, ZODB has this kind of optimistic transaction committing, it results in having to deal with ConflictErrors and slowness from retrying requests when they conflict amongst other pain. If that's the price for losing the GIL, I'll stick with the GIL, thanks...

+
+
+
+
+ + gertjan wrote on 2011-06-29 22:48: +
+
+

Well when it comes to removing the GIL I have always had my hopes on pypy, and I'd be very happy to contribute some coin to make it happen. I'll be looking out for that crowdfunding post.

+
+
+
+
+ + Zemantic dreams wrote on 2011-06-29 23:00: +
+
+

Ok, so where can we give a small contribution?




Andraz Tori, Zemanta

+
+
+
+
+ + Richard wrote on 2011-06-30 00:32: +
+
+

Have you read about Microsoft's abandoned attempt to bring STM to .NET? Have you considered the problems they had?

+
+
+
+
+ + Jon Morgan wrote on 2011-06-30 05:56: +
+
+

Interesting idea, but some questions:
1. What do C extensions do? (extensions designed for CPython that are using GIL methods). Would they still be able to be used, or would they have to be rewritten for PyPy?

2. What happens if repeatable operations are interleaved with operations that are not repeatable? (e.g. logging values to a file - we wouldn't want it to happen twice if there was a conflict, unless of course you are using that logging to trace what is happening...).

+
+
+
+
+ + Ben wrote on 2011-06-30 10:30: +
+
+

@Michael Foord: In state-of-the-art lazy[1] STM systems, the probability of two transactions continually causing each other to restart is minuscule. A transaction only causes another one to restart when it tries to commit. So when somebody restarts, it means that someone else has successfully committed.

[1] In "Lazy" STMs, transactions only get exclusive access to the things they're trying to write to for a very short window of time at the end. This means they have to record writes in a transaction log, as Armin described, because there might be many pending writes for the same object. An alternative design is "eager" STM, where transactions write directly and have to "undo" their writes if they get aborted. Eager systems look good on paper, but in my opinion they're not worth it. With eager STM, the runtime system has to be very carefully designed to avoid livelock (when the system hangs because some transactions constantly abort each other). Lazy STM is almost impossible to livelock in practice, because even if some transactions are highly conflicting at least one of them (almost always) has to commit.

+
+
+
+
+ + Ben wrote on 2011-06-30 10:52: +
+
+

Also, my honours project was implementing most of an STM system, and I've been a long time fan of (and sometime tinkerer with) PyPy, so I would be very interested in where this goes.

And I know this is extremely premature, but if there were enough money coming in for this project and the PyPy team were willing to include outside developers, I would absolutely love to put serious work into this.

+
+
+
+
+ + Armin Rigo wrote on 2011-06-30 11:28: +
+
+

@Richard: reading the web page you point out, Microsoft's STM attempt (like most others I'm aware of) seems to work at a different level: basically as a library for application programmers. I can go through all 4 points and show why they are not relevant in our context:

* any visible I/O (e.g. writing to a file or a log) is going to end the transaction and start the next one, just like the GIL is released and re-acquired around most calls to the C library's write() function

* the 2nd issue is moot, because STM will be an internal detail in PyPy, not a user-visible feature

* the 3rd issue he describes is about "update-in-place" STM, which I believe is not the best solution: we want instead to keep a local log of the changes, and apply them only at commit-time (as described e.g. in the paper I pointed out)

* the final issue is the lack of real successes with STM. Well, we can't do anything about that ahead of time :-)

+
+
+
+
+ + Anonymous wrote on 2011-06-30 11:29: +
+
+

One note on the lock-based example you gave, that locks list1 and then list2: It isn't free of deadlocks!

Having two threads call the function simultaneously with swapped args may cause a deadlock. See the bank account problem.

+
+
+
+
+ + Armin Rigo wrote on 2011-06-30 11:49: +
+
+

@Anonymous: yes, I know it can deadlock. I have hidden the problem into some theoretical function acquire_all_locks(), which should somehow make sure that all locks are atomically acquired, in any order (which I think is possible by first sorting the locks according to their address in memory). I didn't want to put too much emphasis on the negative side of locks :-)

+
+
+
+
+ + Armin Rigo wrote on 2011-06-30 11:51: +
+
+

@Jon Morgan:

1. We would most probably still
have a GIL for the CPython C
extensions. Only one can run at a
time, but any number of PyPy
threads can run at the same time.
(This is because the CPython C
extensions never access PyPy's own
objects directly --- they cannot,
because PyPy's own objects can
move, and the C code is not
prepared for that.)

2. Logging to a file is done with a
call to a function like write().
In CPython and so far in PyPy, the
call to write() is preceded by
"release GIL" and followed by
"re-acquire GIL". In the STM PyPy,
it would be preceded by "end the
current transaction" and "start the
next transaction". This gives the
same behavior. But we may have to
think a bit harder about writes
that are buffered, because it seems
that if all threads write into the
same buffer then it will cause many
transaction conflicts.

Note however that we are talking
here about very short-lived
transactions. Even if you have 20
threads all writing to the same log
file, each thread is going to run
much more than 20 bytecodes between
any two writes to the log file.
You only get conflicts if two of
these threads are running the
write() call at the same time, and
such a conflict only causes one of
the threads to roll back and retry
the write(), not more.

+
+
+
+
+ + Armin Rigo wrote on 2011-06-30 11:54: +
+
+

@tuomasjjrasanen: yes, actually the first paper is from the 80's. But I think that it's only from around 2003 or 2004 that research seriously started, in the sense that papers were produced regularly, from several teams.

+
+
+
+
+ + Kevin Granade wrote on 2011-06-30 14:47: +
+
+

To address the anonymous question near the start of the comments, one way to detect commit collision is to copy a global generation counter at the start of your transaction, and then compare your stored copy to the current generation counter at commit time (after taking a lock), and if no one else has incremented the generation counter, you do so and complete your operation.

So transaction does:
self.generation = global.generation

And commit does:
if lock(global.lock):
if self.generation == global.generation:
global.generation += 1
return True
unlock(global.lock)
return False

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-06-30 16:47: +
+
+

I am not sure what to make out of the solution (=STM) to GIL you proposed in the article. You are essentially suggesting to slow down all Python programs in PyPy by a factor of, say, 4 and hope to recover the loss for a very small percentage of programs on an 8-core machine.

That can't be right. Please tell me I am dreaming ... :)

+
+
+
+
+ + Michael Foord wrote on 2011-06-30 19:29: +
+
+

So if there is only one thread transactions will be disabled?

I wonder how "fine grained" transactions will be: if you have parallel operations working concurrently on a large array do you think you will be able to allow threads to simultaneously modify different areas of the array?

+
+
+
+
+ + Ben wrote on 2011-06-30 21:22: +
+
+

@⚛: That's kind of how parallelization goes. There are overheads, and the only way to make up for them is to hope you have enough parallel speedup. STM (and any approach to this problem based on fine-grained locking) would work best if only a small known set of objects are shared between threads, and only those are synchronized, which unfortunately cannot be the case for a general GIL-removal proposal.

However I think PyPy's JIT could potentially help a little here. The escape analysis PyPy already does can also prove "this value cannot be accessed by another thread" and used to avoid logging some values, since they cannot conflict with parallel transactions. There are probably some more STM-specific optimizations the JIT could do as well.

+
+
+
+
+ + Ben wrote on 2011-06-30 21:27: +
+
+

@Michael Foord: STM definitely can be made as fine-grained as you like. Some existing STM systems operate at the level of machine words. Given that this one will be operating at the interpreter level, I would guess that code working on different sections of the same object (or array) would be able to run in parallel, but I guess it depends on how the tradeoffs play out.

+
+
+
+
+ + Armin Rigo wrote on 2011-06-30 22:12: +
+
+

@⚛: to complete Ben's answer: yes, you are correct, but that's why the translation step "insert STM logic" is never going to be mandatory. You will get either a regular pypy-c-gil or a pypy-c-stm, as two different executables, and you will choose the one most suited for your particular program. I still expect pypy-c-gil to be the most used one, with pypy-c-stm an alternative that is only useful for people with massively multi-threaded programs.

+
+
+
+
+ + EmilK wrote on 2011-07-01 10:55: +
+
+

It would be cool, if the python programmer could mark "uncritical" sections, such that the stm book keeping is disabled for those sections where the programmer knows that there is no concurrency.

+
+
+
+
+ + Jacob Hallén wrote on 2011-07-01 14:17: +
+
+

@EmilK: I think that would be very uncool. You would allow the developer to introduce bugs that would be extremely hard to locate. Parallel programs are quite difficult to get right to start with, and anyone who does not have complete understanding of what constitutes a critical section will be very likely to make an error.

+
+
+
+
+ + Skandalfo wrote on 2011-07-02 20:18: +
+
+

There's an intermediate option between the GIL and the careful locking done by Jython, that I had a look at some time ago for making Python more thread friendly.

Just exchanging the GIL for a global readers-writer lock would allow Python to use way more concurrency. You would run all Python code under a reader lock for operations that were read-only on objects. For modifying built in mutable objects, or for things like the one involving both lists in the Jython example, or when calling into C modules, you would have to acquire the writer version of the lock.

Python threads would relinquish the reader lock each N opcodes, just like it's done now for the GIL, and I guess the acquisition of the writer lock should be given priority over the reader ones.

This approach should be simpler to implement than using the transactional memory approach, and it should be possible to bake it into CPython too. I think I remember having read some discussion about this somewhere, but it didn't seem to come to anything...

+
+
+
+
+ + Armin Rigo wrote on 2011-07-06 14:26: +
+
+

@Skandalfo: this cannot work with CPython, because of reference counting -- every bytecode modifies reference counts, so needs the "write" lock. But it could be a possible idea to consider in PyPy.

+
+
+
+
+ + WhiteLynx wrote on 2011-07-06 19:42: +
+
+

I love this idea.

Just musing on an implementation detail here, but isn't the "lazy" STM implementation's transaction system effectively just an in-memory implementation of copy-on-write semantics? It might be interesting to take a look at other things that have used COW for inspiration. (ZFS and btrfs come to mind) I like the idea that committing a transaction for a given object would just involve changing the object's address in memory to the modified copy.

Also, I'd be interested to see the read/write lock system get implemented, because it seems like it might be a better choice for programs that only use a couple threads.

+
+
+
+
+ + Anonymous wrote on 2011-07-06 21:30: +
+
+

What is wrong with Jython's lock model? Java is a pretty efficient language, no? And there is also no need to acquire locks for objects that you can prove won't cause conflicts...

+
+
+
+
+ + Skandalfo wrote on 2011-07-06 21:47: +
+
+

@Armin Rigo: If the problem for the RW-lock approach in CPython is just about reference count updates and checks, perhaps those could be done via atomic primitives, as supported on most modern architectures. This is what boost::shared_ptr does, IIRC, for the pointers to be thread-safe by default.

+
+
+
+
+ + Armin Rigo wrote on 2011-07-09 13:18: +
+
+

@Skandalfo: right, indeed. I don't know exactly the cost of such atomic operations. Maybe it's fine, but I fear that doing tons of increfs/decrefs all the time (as needed for refcounts in CPython's simple interpreter) has an important cost.

+
+
+
+
+ + Tuure Laurinolli wrote on 2011-07-11 20:10: +
+
+

@Armin Rigo

You'd need similar atomic instructions for an STM implementation too - although perhaps not as many? In any case they should be about as cheap as L1 cache writes unless there's contention, but then things are going to be slow in any case if you have contention. Of course you might have false sharing of objects etc. to muddle things up.

In any case, what sort of semantics would a GIL-free Python have in multi-threaded case, compared to current GIL-infested Python? Each opcode can be assumed to execute atomically?

+
+
+
+
+ + Anonymous wrote on 2011-07-17 12:32: +
+
+

One thread have one interpreter.
Threads interactive like os native thread, use the os interactive method wrap by py.

I want to embed multi interpreter in my c code!

Please kill GIL!!!

+
+
+
+
+ + Raymin wrote on 2011-07-17 12:48: +
+
+

One thread have one interpreter.
Threads interactive like os native thread, use the os interactive method wrap by py.

I want to embed multi interpreter in my c code!

Please kill GIL!!!

+
+
+
+
+ + Armin Rigo wrote on 2011-07-24 13:07: +
+
+

@Tuure Laurinolli: yes, but PyPy has no refcounts. I was just discussing the pro/cons of the proposed locking solution on CPython (which is off-topic as far as this original blog post is concerned). I don't even want to think about STM for CPython :-)

For your second question, from the user's point of view, the semantics we would get with STM are automatically the same as with the GIL, which is why I like the approach.

+
+
+
+
+ + Anonymous wrote on 2011-07-29 14:08: +
+
+

Also, what about the performance if the lazy commit method used in the post? Every transaction will create additional memory? Is that really efficient, IMHO this model is aiming a very small number of use cases??

+
+
+
+
+ + klaussfreire wrote on 2011-10-14 21:26: +
+
+

I can see a use for STM in CPython, too, though. Even though it seems to be not applicable, it need not be true.

I worked on making the reference counting thread-friendly, in the sense that when you have multiple threads reading a big data structure, CPython's reference counting turns all the reads into writes, which is awful for performance.

I wrote a patch to pack all writes in the same memory page (ie, reference pools, external reference counting), and was working on a patch for STM reference count updates.

The thing with STM and reference counting, is that many operations cancel out at the end of the transaction. Like when you just read objects while performing computations, you acquire a reference, work, then release it.

In the end, STM here would remove the need to write to shared memory.

In the process of working on that patch, I can tell CPython can be made to use STM techniques. You have thread-local storage at the VM level already, macros handle almost all reference counting operations, it's all abstracted enough that it might be possible.

For reference counting, the only problem is that STM is way slower for single threaded applications. WAY slower. For multithreaded, it pays off considerably, but CPython guys are very strongly set in favouring single-threaded performance.

+
+
+
+
+ + halfaleague wrote on 2011-10-28 03:55: +
+
+

How can we fund this?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-10-28 07:31: +
+
+

@halfaleague get in contact. pypy@sfconservancy.org is the right address for non-profit funding inquiries.

+
+
+
+
+ + Daniel Waterworth wrote on 2011-12-11 07:40: +
+
+

I managed to write a Haskell STM implementation in a single morning. It may not be the most efficient implementation (I've found it to be about half the speed of the GHC implementation in the limited testing I've done), but it's really simple and only uses atomic CAS.

https://gist.github.com/1454995

+
+
+
+
+ + shawn wrote on 2011-12-31 20:38: +
+
+

have you looked at all at "Worlds" as a simpler interface to STM?

https://www.vpri.org/pdf/tr2011001_final_worlds.pdf

+
+
+
+ +

Report back from our survey

+ +
+

Hi all,

+

I'm here to report back the results of our survey. First, we're very pleased to +report that a number of you guys are happily running PyPy in production! Most +(97%) of the respondents using PyPy are using it because it's faster, but a +further 26% (respondents could choose multiple answers) are using it because of +lower memory usage. Of users who aren't using PyPy, the most common reason was +C extensions, followed by "Other".

+

From reading the extra comments section there are a few things we've learned:

+
    +
  1. Google docs needs a better UI for this stuff
  2. +
  3. A huge number of people want NumPy and SciPy, it was easily the most +requested C extension (25% of respondents said something about NumPy). We've +already blogged on the topic of our plans for NumPy.
  4. +
  5. Having packages in the various OS's repositories would be a big help in +getting users up and running.
  6. +
+

A huge thanks to everyone who responded! Finally, if you're using PyPy in +production we'd love to get a testimonial from you, if you're willing to spare +a few minutes to give us a quote or two please get in contact with us via our +mailing list.

+

Thanks, +Alex

+
+
+
+
+ + Paul wrote on 2011-06-08 10:18: +
+
+

I'm surprised more people didn't mention Python 3 support as a big breaker. I certainly did.

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-06-08 14:16: +
+
+

"... we're very pleased to report that a number of you guys are happilly running PyPy in production"

You decided to keep the actual number of users a secret? Why?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-06-08 14:20: +
+
+

@⚛ I think Alex was simply too lazy to count :-) At some point there were 600 respondents and roughly 10% of them used pypy in production, which is pretty good IMO.

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-06-08 18:05: +
+
+

@Maciej Fijalkowski: Ok, thanks for the clarification.

+
+
+
+
+ + Marko Tasic wrote on 2011-06-08 20:42: +
+
+

I'm using pypy 1.5 with jit in production for highly reliable and responsive distributed and decentralized systems, and I'm happy with it.

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-06-09 07:22: +
+
+

@Marko Tasic: If I may ask a question. You wrote that you are using PyPy for highly reliable systems. I know what you mean, but it seems to me that certain features of Python are in contradiction with high reliability. For example, it is in practice impossible to know at compile-time whether you misspelled a variable or parameter in Python source code. My question would be: why are you using a language which has only rudimentary compile-time error detection to implement a high reliability system?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-06-09 07:58: +
+
+

@⚛ Not even trying to argue with you, the comments on this blog are not a proper place to discuss whether Python is good for high-reliability systems. Please take the discussion somewhere else.

Thanks,
fijal

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-06-09 09:38: +
+
+

@Maciej Fijalkowski: I will of course do what you ask, but I would like you to point me to at least one blog comment that: (1) Is initially saying that Python/PyPy is *good* for task X, and (2) You or somebody else from the PyPy team wrote "Please take the discussion about X somewhere else".

Thanks

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-06-09 09:41: +
+
+

@⚛ The line might be blurry, but "I'm using PyPy for X" or "I'm not using PyPy for X, because ..." is on topic. While "Python can be used for X" or "Python can't be used for X, because ..." is not on topic. This is a fine line between language implementation (which PyPy is about) and language design (which PyPy is not about, python-dev/python-list/python-ideas mailing lists are about that).

Cheers,
fijal

+
+
+
+
+ + Anonymous wrote on 2011-06-11 01:06: +
+
+

What about a FFI to C or C++? Something like LuaJit's FFI, which is really good.

+
+
+
+
+ + Anonymous wrote on 2011-06-15 10:10: +
+
+

Lack of support for numpy and scipy are what keep me from using pypy. Am using python for analysis of ultra high throughput DNA sequencing data.

Would be very curious to see how much performance I could gain by using pypy.

+
+
+
+ +

PyPy Genova-Pegli Post-EuroPython Sprint June 27 - July 2 2011

+ +
+

The next PyPy sprint will be in Genova-Pegli, Italy, the week after EuroPython +(which is in Florence, about 3h away by train). This is a fully public sprint: +newcomers and topics other than those proposed below are welcome.

+
+

+ +Goals and topics of the sprint

+
    +
  • +
    +Now that we have released 1.5, the sprint itself is going to be mainly +working on fixing issues reported by various users. Possible topics +include, but are not limited to:
    +
    +
      +
    • fixing issues in the bug tracker
    • +
    • improve cpyext, the C-API compatibility layer, to support more extension +modules
    • +
    • finish/improve/merge jitypes2, the branch which makes ctypes JIT friendly
    • +
    • general JIT improvements
    • +
    • improve our tools, like the jitviewer or the buildbot infrastructure
    • +
    • make your favorite module/application working on PyPy, if it doesn't yet
    • +
    +
    +
  • +
  • +
    +Of course this does not prevent people from showing up with a more precise +interest in mind. If there are newcomers, we will gladly give introduction +talks.
    +
  • +
  • +
    +Since we are almost on the beach, we can take one day off for summer +relaxation and/or tourist visits nearby :-).
    +
  • +
+
+
+

+ +Exact times

+The work days should be 27 June - 2 July 2011. People may arrive on +the 26th already and/or leave on the 3rd.
+
+

+ +Location & Accommodation

+Both the sprint venue and the lodging will be at Albergo Puppo in +Genova-Pegli, Italy. Pegli is a nice and peaceful little quarter of Genova, +and the hotel is directly on the beach, making it a perfect place for those +who want to enjoy the sea in the middle of the Italian summer, as a quick +search on Google Images shows :-)

+The place has a good ADSL Internet connexion with wireless installed. You can +of course arrange your own lodging anywhere but I definitely recommend lodging +there too.
+Please confirm that you are coming so that we can adjust the reservations as +appropriate. The prices are as follows, and they include breakfast and a +parking place for the car, in case you need it:
+
    +
  • single room: 70 €
  • +
  • double room: 95 €
  • +
  • triple room: 105 €
  • +
+
+Please register by hg:
+https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/genova-pegli-2011/people.txt +
+or on the pypy-dev mailing list if you do not yet have check-in rights:
+https://mail.python.org/mailman/listinfo/pypy-dev +
+In case you want to share a room with someone else but you don't know who, +please let us know (either by writing it directly in people.txt or by writing +on the mailing list) and we will try to arrange it.
+
+
+
+
+ + vak wrote on 2011-05-25 11:39: +
+
+

Hi,

as for upcoming sprint...

The grid on https://speed.pypy.org/timeline/ is a totally great idea. However the benchmark tests listed represent no progress since a long time already.

Q1. Does it mean that the set is not representative any more and should be extended?

Q2. Is it possible to include some micro benchmarks, please? (Oh, please!)

+
+
+
+
+ + vak wrote on 2011-06-14 14:31: +
+
+

no answers -- it's a pity

+
+
+
+ +

PyPy Usage Survey

+ +
+

We've been working on PyPy for a long time. But readers of this blog will know +that in the past year something has changed: we think PyPy is production ready. +And it's not just us, this week LWN.net wrote an article about how PyPy +sped up one of their scripts by a factor of three, noting that, "plans are to +run gitdm under PyPy from here on out". All in all we think PyPy is pretty +great, but not everyone is using it yet, and we want to know why. We want your +feedback on why PyPy isn't ready to be your only Python yet, and how we can +improve it to make that happen.

+

Therefore, we've put together a quick survey, whether you're using PyPy or not +if you could take a few minutes to fill it out and let us know how we're doing +we'd really appreciate it. You can find the form here.

+

Thanks, +The PyPy team

+
+
+
+
+ + Anonymous wrote on 2011-05-16 18:23: +
+
+

We are very interested in using PyPy in production but our project is based on lxml library and both are incompatible. Do you suggest any fix for this? I'm not sure if PyPy would compensate the reduction in performance of a pure Python XML library.

+
+
+
+
+ + Anonymous wrote on 2011-05-16 18:55: +
+
+

Biggest blocker right now is gevent, which I believe would require pypy stackless and JIT to get along plus some work to make gevent use ctypes in place of cpython api.

+
+
+
+
+ + Anonymous wrote on 2011-05-16 19:12: +
+
+

I suggest that you reproduce this survey on StackOverflow (if it's acceptable there, maybe Programmers?) and Quora, maybe Convore too. Posting to comp.lang.python would also help.

+
+
+
+
+ + Anonymous wrote on 2011-05-16 19:22: +
+
+

Pypy needs to either be a dropin replacement for python or provide a significant (order of magnitude) difference in performance that moving to pypy won't be as big of a deal when you lose the support of so many 3rd party libraries.

+
+
+
+
+ + Anonymous wrote on 2011-05-16 19:35: +
+
+

1. Installation is long and non-intuitive. I'd like to see PyPy packaged up for all the major distros + Mac OSX via Fink, Homebrew, and MacPorts.

2. A comprehensive listing of modules that can and cannot be used with PyPy. I'm still not quite clear as to how PyPy interacts with the major web frameworks and WSGI (haven't researched it much either).

3. Drop-in replacement for Python 2.7. I want my scripts that I wrote in Python to run in PyPy with no complications.

+
+
+
+
+ + Pavel wrote on 2011-05-16 19:46: +
+
+

Could you provide the downloads with PGP signatures, please? We would like to use PyPy in production to run our payment processing system backend, but verified integrity and authenticity of its source code is strictly required.

+
+
+
+
+ + Victor wrote on 2011-05-16 20:05: +
+
+ 2. A comprehensive listing of modules that can and cannot be used with PyPy. I'm still not quite clear as to how PyPy interacts with the major web frameworks and WSGI (haven't researched it much either).

This is available at the PyPy Compatibility Wiki (I should update it this week, lots of new information around). +
+
+
+
+ + Anonymous wrote on 2011-05-16 20:20: +
+
+

We would use it across all our deployments (hundreds of thousands of LOCs) and gladly contribute and invest in pypy as soon as you guys implement python3 spec. Literally can't wait.

+
+
+
+
+ + Daniel Kluev wrote on 2011-05-17 06:52: +
+
+

I'd love to use PyPy in some of my projects, but they rely on lots of 3rd-party C/C++-based libraries.

1) lxml, thats an absolute must for most of my applications. Original ETree now lacks many features lxml has, so there is no ready pure-python replacement avail.
2) Some my own boost::python libraries. I didn't actually try to compile them on PyPy, but as I was told on IRC, support for b::p is still marginal.
3) PycURL, PyV8, PyQt, wxPython and so on.

+
+
+
+
+ + Martin Gfeller wrote on 2011-05-17 09:14: +
+
+

We would like to profit from the speedup, but it would be a major piece of work for us, as we're currently running Zope 2.13 (which we could replace, because we make only limited use of it and have our own hybrid database). However, before making an investment, we need to be sure that:


- PyPy won't go away like Psyco did. A kind of "mainstream endorsement" by PSF would be helpful


- numpy and scipy are available

- a decent ODBC package is available (we're using mxODBC) at the moment

- full support on Windows 32 and 64 bit



Best regards, Martin

Swisscom IT Services Finance

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-05-17 09:18: +
+
+

@martin

* numpy, scipy support is on the way

* 32bit windows is done, 64bit windows will happen, it's on the todo list

* PSF has just endorsed PyPy in front of 1000 people crowd on pycon giving us a 10000$ check (https://3.bp.blogspot.com/-yLUKuyRgjdg/TYfklB5Jg4I/AAAAAAAABKM/_5Rv2thqzA0/s1600/pycon_cheque.jpg).

That answers roughly half to 3/4 of your issues, not bad, we're getting there :)

+
+
+
+
+ + Anonymous wrote on 2011-05-17 15:48: +
+
+

I would like to repeat the numpy and scipy thing. I have to add matplotlib, which a lot of people use for plotting. Personally I also cannot live without h5py, which is awesome for storing and handling numerical data. I have no idea if it will work with pypy, because it does require numpy first.

I'm looking forward to pypy becoming faster, better supported, and more popular! I am convinced that it will.

+
+
+
+
+ + wilk wrote on 2011-05-17 16:38: +
+
+

I've a project wich use psyco with a factor 15 (computation of train path) ! yes really, this project is in production (unfortunately not open source) ! I just tried it with pypy 1.5, and it works with the same factor (congratulation to you). So i'm sure that we'll use pypy.

But like my other project, i don't change something wich already works. Most of them don't need speed improvement.

On one scrabble game i'd like to replace a scrabble solver in C (if someone wants to help, it's opensource ?)

I also hope to see a debian package in the next debian release...

Thanks for your work, i follow it !

+
+
+
+
+ + Anonymous wrote on 2011-05-18 13:26: +
+
+

On my server I'm running couple of Django based ecommerce systems. I hope to be running more of them soon (hopefully). There is also PostgreSQL. Still not using PyPy but I just can't wait to check if it will be faster and if so then how much. I don't know yet how to run Django app on production on PyPy but as soon I check and run couple of performance tests I will surely give some feedback.

+
+
+
+
+ + raptor wrote on 2011-05-23 00:53: +
+
+

Its all about compatibility with 3rd party libs, C libs or boost::python. Otherwise those who want to JIT their Python are just going to wait a bit longer for PEP 3146 so they can have a good LLVM based JIT in standard Python.

https://www.python.org/dev/peps/pep-3146/

+
+
+
+
+ + Anonymous wrote on 2011-05-23 03:48: +
+
+

The pypy group should make a full featured ide with a gui designer with built in packaging to .exe and linux .deb and .rpm that only runs the pypy vm. That would bring the desktop application programmers in by the droves.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-05-23 07:35: +
+
+

@Hart: unladen swallow is dead:

https://qinsb.blogspot.com/2011/03/unladen-swallow-retrospective.html

+
+
+
+
+ + Anonymous wrote on 2011-05-23 15:21: +
+
+

Well, basically, it's NumPy, SciPy, Matplotlib and MayaVi. I'm also using Cython to optimize computation intensive code paths, but of course it would be nicer to stick to pure Python and let JIT do it's magic.

+
+
+
+
+ + lazyweb wrote on 2011-05-23 18:44: +
+
+

Arrgh, gevent does not work with pypy? There's my blocker.

+
+
+
+
+ + Gaëtan de Menten wrote on 2011-05-30 12:59: +
+
+

How long are you planning to keep this poll open? I hope you will blog about its results when it's closed...

+
+
+
+
+ + Almir Karic wrote on 2011-06-02 02:49: +
+
+

would love to see the results

+
+
+
+
+ + Anonymous wrote on 2011-06-02 15:21: +
+
+

I'm interesting in psycopg2and PIL libraries.

+
+
+
+
+ + Caetano wrote on 2011-06-02 15:29: +
+
+

The only thing that makes me not using pypy is the lack of supporting python bynaries .so, .pyd, etc.
I know that is a hard feature to implement because is needed to stub the CPython api.
but I think when its done will there is no reasons to not using pypy for anybody.

+
+
+
+
+ + Anonymous wrote on 2011-08-04 21:21: +
+
+

Numpy, scipy, matplotlib, and image are the stick ups for me.

+
+
+
+ +

Server migration in progress

+ +
+

Hi all,

+ +

We are in the process of migrating the hosting machine for PyPy, moving away from codespeak.net and towards a mixture of custom servers (e.g. for buildbot.pypy.org) and wide-scale services (e.g. for the docs, now at readthedocs.org).

+ +

When this is done, a proper announcement will be posted here. In the meantime, we have already moved the mailing lists, now hosted on python.org. The subscriber lists have been copied, so if you didn't notice anything special for the past week, then everything works fine :-) This concerns pypy-dev, pypy-issue and pypy-commit. Two notes: +

+
    +
  • Some settings have not been copied, notably if you used to disable mail delivery. Sorry about that; you have to re-enter such settings. +
  • +
  • Following the move, about 50 addresses have been dropped for being invalid. I'm unsure why they were not dropped earlier, but in case sending mail to you from python.org instead of codespeak.net fails, then you have been dropped from the mailing lists, and you need to subscribe again. +
  • +
+
+
+
+
+ + Henrik Vendelbo wrote on 2011-05-17 16:15: +
+
+

I enjoy PyPy a lot, and would use it for production.

However I tend to have a lot of problems when I upgrade to the latest source as my PyPy modules/extensions break and I will have to reimplement them with the new internal APIs.

It would be great if there was a bit more stability around the structure of main and how to write a module.

+
+
+
+ +

Playing with Linear Programming on PyPy

+ +
+

Fancy hi-level interfaces often come with a high runtime overhead +making them slow. Here is an experiment with building such an +interface using constructions that PyPy should be good at +optimizing. The idea is to allow the JIT in PyPy to remove the +overhead introduced by using a fancy high-level python interface +on top of a low-level C interface. The application considered is +Linear +programming. It is a tool used to solve linear optimization +problems. It can for example be used to find the nonnegative values +x, y and z that gives the maximum value of + +

+
+ +
+ +without violating the constraints + +
+ +
+ +There exist general purpose solvers for these kinds of problems that +are very fast and can literally handle millions of variables. To use +them however the problem has to be transformed into some specific +matrix form, and the coefficients of all the matrices +have to be passed to the solver using some API. This transformation is +a tedious and error prone step that forces you to work with matrix +indexes instead of readable variable names. Also it makes maintaining +an implementation hard since any modification has to be transformed +too. + +

+The example above comes from the manual of +the glpk library. That +manual continues by describing how to convert this problem into the +standard form of glpk (which involves introducing three new variables) +and then gives the c-code needed to call the +library. Relating that c-code to the problem above without the +intermediate explanation of the manual is not easy. A common +solution here is to build a hi-level interface that allows a more +natural way of defining the matrices and/or allow the equations to be +entered symbolically. Unfortunately, such interfaces often become +slow. For the benchmark below for example, +cvxopt +requires 20 minutes to setup a problem that takes 9.43 seconds to solve +(this seems a bit extreme, am I doing something wrong?). + +

+The high-level interface I constructed on top of the +glpk library is +pplp and it allows +the equations to be entered symbolically. The above problem can be +solved using +
+    lp = LinearProgram()
+    x, y, z = lp.IntVar(), lp.IntVar(), lp.IntVar()
+    lp.objective = 10*x + 6*y + 4*z
+    lp.add_constraint( x + y + z <= 100 )
+    lp.add_constraint( 10*x + 4*y + 5*z <= 600 )
+    lp.add_constraint( 2*x + 2*y + 6*z <= 300 )
+    lp.add_constraint( x >= 0 )
+    lp.add_constraint( y >= 0 )
+    lp.add_constraint( z >= 0 )
+
+    maxval = lp.maximize()
+    print maxval
+    print x.value, y.value, z.value
+
+ +

+To benchmark the API I used it to solve a +minimum-cost + flow problem with 154072 nodes and 390334 arcs. The C library + needs 9.43 s to solve this and the pplp interface adds another 5.89 + s under PyPy and 28.17 s under CPython. A large amount of time is + still spent setting up the problem, but it's a significant + improvement over the 20 minutes required on CPython by + cvxopt. It is + probably not designed to be fast on this kind of benchmark. I have + not been able to get cvxopt to work under PyPy. The benchmark used is + available here +
+
+
+
+ + The Cannon Family wrote on 2011-05-11 23:27: +
+
+

for the first equation do you not perhaps mean f(x,y,z) = 10x+6y+4z instead of z = 10x+6y+4z ?

+
+
+
+
+ + Hakan Ardo wrote on 2011-05-12 07:29: +
+
+

Yes, there is a typo there, I'll update the post. Thanx for noting.

+
+
+
+
+ + Winston Ewert wrote on 2011-05-12 14:28: +
+
+

That seems like a lot of overhead for the wrapper, what is up with that? I mean, I'd expect the wrapper to reasonably quickly pass it off to the C library.

+
+
+
+
+ + Anonymous wrote on 2011-05-12 16:48: +
+
+

you should try www.solverfoundation.com using ironpython too.

+
+
+
+
+ + Hakan Ardo wrote on 2011-05-12 18:53: +
+
+

Winston: It is indeed. What cvxopt spends 20 min on I don't know. One guess would be that it is passing the ~2 million coefficients involved to C one by one, possible with a bit of error checking for each of them. As for the 6 s used by pplp, it needs to convert the equations into the matrices glpk wants. That means shuffling the coefficients around a bit and some bookkeeping to keep track of which goes where.

Anonymous: OK, how would the above example look in that case?

+
+
+
+
+ + Hakan Ardo wrote on 2011-05-14 12:24: +
+
+

Thanx for noting, I've fixed the post (again).

+
+
+
+
+ + Unknown wrote on 2011-05-30 18:48: +
+
+

have you tried openopt[1]?

[1] openopt.org

+
+
+
+
+ + Joachim Dahl wrote on 2011-08-05 09:37: +
+
+

Are you distinguishing between the time it takes to setup the optimization problem and the time it takes to actually solve it?

GLPK is a simplex solver written in C, and CVXOPT is an interior point solver written in Python/C and is not particularly optimized for sparse problem. Nevertheless, you should check the you actually formulate a large sparse problem in CVXOPT, and not a dense one.

+
+
+
+ +

NumPy Follow up

+ +
+

Hi everyone. Since yesterday's blog post we got a ton of feedback, so we want +to clarify a few things, as well as share some of the progress we've made, in +only the 24 hours since the post.

+

Reusing the original NumPy

+

First, a lot of people have asked why we cannot just reuse the original NumPy +through cpyext, our CPython C-API compatibility layer. We believe this is +not the best approach, for a few reasons:

+
+
    +
  1. +cpyext is slow, and always will be slow. It has to emulate far too many +details of the CPython object model that don't exist on PyPy (e.g., +reference counting). Since people are using NumPy primarily for speed this +would mean that even if we could have a working NumPy, no one would want to +use it. Also, as soon as the execution crosses the cpyext boundary, it +becomes invisible to the JIT, which means the JIT has to assume the worst +and deoptimize stuff away.
  2. +
  3. NumPy uses many obscure documented and undocumented details of the CPython +C-API. Emulating these is often difficult or impossible (e.g. we can't fix +accessing a struct field, as there's no function call for us to intercept).
  4. +
  5. It's not much fun. Frankly, working on cpyext, debugging the crashes, +and everything else that goes with it is not terribly fun, especially when +you know that the end result will be slow. We've demonstrated we can build +a much faster NumPy, in a way that's more fun, and given that the people +working on this are volunteers, it's important to keep us motivated.
  6. +
+
+

Finally, we are not proposing to rewrite the entirety of NumPy or, god +forbid, BLAS, or any of the low level stuff that operates on C-level arrays, +only the parts that interface with Python code directly.

+

C bindings vs. CPython C-API

+

There are two issues on C code, one has a very nice story, and the other not so +much. First is the case of arbitrary C-code that isn't Python related, things +like libsqlite, libbz2, or any random C shared library on your system. +PyPy will quite happily call into these, and bindings can be developed either +at the RPython level (using rffi) or in pure Python, using ctypes. +Writing bindings with ctypes has the advantage that they can run on every +alternative Python implementation, such as Jython and IronPython. Moreover, +once we merge the jittypes2 branch ctypes calls will even be smoking +fast.

+

On the other hand there is the CPython C-extension API. This is a very specific +API which CPython exposes, and PyPy tries to emulate. It will never be fast, +because there is far too much overhead in all the emulation that needs to be +done.

+

One of the reasons people write C extensions is for speed. Often, with PyPy +you can just forget about C, write everything in pure Python and let the JIT +do its magic.

+

In case the PyPy JIT alone isn't fast enough, or you just want to +use existing C code then it might make sense to split +your C-extension into 2 parts, one which doesn't touch the CPython C-API and +thus can be loaded with ctypes and called from PyPy, and another which does +the interfacing with Python for CPython (where it will be faster).

+

There are also libraries written in C to interface with existing C codebases, +but for which performance is not the primary goal. For these, the right solution +is to try using CPyExt, and if it works that's great, but if it fails the +solution will be to rewrite using ctypes, where it will work on all Python +VMs, not just CPython.

+

And finally there are rare cases where rewriting in RPython makes more sense. +NumPy is one of the few examples of these because we need to be able to give +the JIT hints on how to appropriately vectorize all of the operations on an +array. In general, writing in RPython is not necessary for almost any +library; NumPy is something of a special case because it is so ubiquitous +that every ounce of speed is valuable, and the way people use it leads to +code structure where the JIT benefits enormously from extra hints and the +ability to manipulate memory directly, which is not possible from Python.

+

Progress

+

On a more positive note, after we published the last post, several new people +came and contributed improvements to the numpy-exp branch. We would like to +thank all of them:

+
+
    +
  • nightless_night contributed: An implementation of __len__, fixed bounds +checks on __getitem__ and __setitem__.
  • +
  • brentp contributed: Subtraction and division on NumPy arrays.
  • +
  • MostAwesomeDude contributed: Multiplication on NumPy arrays.
  • +
  • hodgestar contributed: Binary operations between floats and NumPy arrays.
  • +
+
+

Those last two were technically an outstanding branch we finally merged, but +hopefully you get the picture. In addition there was some exciting work done by +regular PyPy contributors. I hope it's clear that there's a place to jump in +for people with any level of PyPy familiarity. If you're interested in +contributing please stop by #pypy on irc.freenode.net, the pypy-dev mailing +list, or send us pull requests on bitbucket.

+

Alex

+
+
+
+
+ + Anonymous wrote on 2011-05-05 23:14: +
+
+

How does this suggestion to use ctypes to interface with external C modules square with the python-dev antipathy towards doing that?

"Given the choice of using either ctypes or an external package, I prefer the external package." Martin v. Löwis

"If it means using ctypes to interface with system C libraries, I'm -10 on it :)" Antoine Pitrou

+
+
+
+
+ + Alex wrote on 2011-05-05 23:19: +
+
+

I don't know what to say for them, besides they apparently don't hate it so much as to remove it from the stdlib :)

+
+
+
+
+ + Michael Foord wrote on 2011-05-06 00:08: +
+
+

Isn't there another fairly major drawback to implementing in RPython - that you can only use it if it is compiled (translated) at the same time as pypy. So effectively pypy *has* to be distributed with all the RPython extensions you will ever use, or you have to retranslate *everything* whenever you add a new extension.

Developing cross-platform, cross-architecture, stuff with ctypes can also be a lot more painful than writing extensions using the Python C API (and having the compiler make some decisions at compile time rather than having to do it all at runtime).

+
+
+
+
+ + Robert Kern wrote on 2011-05-06 04:54: +
+
+

Most of python-dev's "antipathy" towards using ctypes is focused on using ctypes for stdlib modules, not on general principles. For security, stability, and portability reasons, many platforms need to disable ctypes when they build Python. Consequently, there is a policy that no stdlib module can use ctypes. They are not recommending against using ctypes in general.

+
+
+
+
+ + Anonymous wrote on 2011-05-06 05:19: +
+
+

One major worry is how well you will end up tracking NumPy development. Will you evenutally add an "RPython glue" subdir to NumPy's distribution?

+
+
+
+
+ + Anonymous wrote on 2011-05-06 05:59: +
+
+

thanks for the follow-up. I won't argue with points 1 and 3, but I think 2 can be reasonably addressed: I don't think the usage of internal details is pervasive in the code, and most of it is for historical reasons. We cannot remove them altogether from the numpy headers for backward compatibility reasons, but we can replace most of it inside numpy itself.

I am still a bit confused though: from your description, it seems that you intend to fork numpy to replace some pieces from C to RPython, but if I look at the numpy-ext branch, I see a rewrite of numpy in rpython. Maybe you are talking about another code ?

+
+
+
+
+ + Anonymous wrote on 2011-05-06 08:22: +
+
+

I think that the most important part of numpy is array operations (indexing, +-*/, broadcasting, etc). So it would be good enough to implement only array class in RPython and call to numpy using ctypes/cpyext for all other stuff. I've read somewhere about the plans to impose separation between numpy and scipy so numpy holds only implementation of fast arrays and scipy will hold all non-trivial operations on them. IMHO such separation will be ideal for pypy too.

+
+
+
+
+ + Wladimir wrote on 2011-05-06 08:42: +
+
+

Thanks for the clear explanation. I really wondered why it was so hard to re-use the existing numpy.

+
+
+
+
+ + Antoine P. wrote on 2011-05-06 15:02: +
+
+

Thanks Robert for clarifying our position :)

Another issue with ctypes is that it doesn't work on all systems.

Yet another issue with ctypes is that it is currently unmaintained (which won't help fixing portability issues :-)).

+
+
+
+
+ + Anonymous wrote on 2011-05-06 17:26: +
+
+

I am sory for the silly question, but how do I install this module in an existing pypy instalation ?

Thanks for the great job !

+
+
+
+
+ + Anonymous wrote on 2011-05-06 21:15: +
+
+

OK I see ...

hg clone https://foss.heptapod.net/pypy/pypy/-/tree/branch/numpy-exp .....

+
+
+
+
+ + Anonymous wrote on 2011-05-07 03:49: +
+
+

I like the idea of reimplementing part of Numpy in pypy to leverage the JIT in pypy. The existence of numexpr demonstrates the deficiency of Numpy as a Python library. A JIT is much more appropriate for what effectively should be a DSL.

But I would recommend something grander, perhaps for the longer term. I think if pypy could produce do for Python what McVM and McJIT propose to do for Matlab, it would be game-changing for Python and pypy. It would make pypy not only competitive with Matlab in ways that Numpy and Scipy are not yet and may never be, but also with F#. The rapid uptake of F# in financial industry in particular, despite the availability of Matlab, showcases the need for a fast prototyping language that does not rely on calling Fortran code for speed. I know I am looking for such language; Numpy and Python simply don't offer enough power and flexibility. I hope I can choose pypy.

+
+
+
+
+ + Anonymous wrote on 2011-05-11 00:31: +
+
+

Any idea about an eta on merging the jitypes2 branch (and/or a little more info on what it does to speed ctypes up so much)?

+
+
+
+
+ + Antonio Cuni wrote on 2011-05-11 07:33: +
+
+

@anonymous: the jitypes2 branch is mostly ready, but we are hunting two bugs and won't be merged until we fix them.

The speedup comes from the fact that ctypes call are seen by the JIT, and directly compiled into a call to the corresponding C function. Thus, most of the overhead of ctypes itself is optimized away.

+
+
+
+
+ + Unknown wrote on 2011-05-11 19:51: +
+
+

I wonder if an RPython/cython backend might be possible. cython is already my favorite way to write CExtensions and it generates code for both python 2.x and 3.x. It would be great if it could be adapted for PyPy extensions.

+
+
+
+
+ + Anonymous wrote on 2011-05-12 18:51: +
+
+

Hi!

Thanks a lot for the previous post and the follow up! I really appreciate that you could find time to make a write up on the progress that you made so far on this extremely important feature.

This all sounds very cool, but also to me it seems that it's very important to work with NumPy / SciPy developers, so that the parts that have to be replaced would be isolated and maintained in parallel for RPython and C API, or rewritten in ctypes (not sure if this is even possible). This way this eternal catch-up trap that many seem to be afraid of will not happen.

Also, I wonder in how much money this would actually translate. Maybe Enthought could sponsor some development...

Regarding Cython... I also use it to write trivial extensions to implement computation kernels outside Python in C. It would be great if Cython were able to generate something that would work with PyPy as well...

Thanks!

+
+
+
+
+ + Laura Creighton wrote on 2011-05-13 17:55: +
+
+

CLM:We actually have a GSoC student proposal from Romain Guillebert to
investigate this idea.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-05-23 08:55: +
+
+

@Anonymous the idea is that you should not use Cython at all and PyPy's JIT should handle the computational kernel just fine.

+
+
+
+
+ + Anonymous wrote on 2011-07-26 11:18: +
+
+

I don't know why do you decide to use ctypes - in numpy community it is considered as obsolete already for a long time (maybe several years), is not under active development, and now Cython is recommended by default tool for it:

https://docs.scipy.org/doc/numpy/user/misc.html?highlight=cython#interfacing-to-c

Also, I guess you could search for some volunteers to work on numpy-PYPY in numpy-user, scipy-user, scipy-dev mail lists.

I'm interested in operations like hstack, vstack, max, min, argmin, nanmax, nanargmin (along a given axis) etc - are they already available? Or when it will be done?

+
+
+
+ +

Numpy in PyPy - status and roadmap

+ +
+

Hello.

+

NumPy integration is one of the single most requested features for PyPy. This +post tries to describe where we are, what we plan (or what we don't plan), and +how you can help.

+

Short version for the impatient: we are doing experiments, which show that +PyPy+numpy can be faster and better than CPython+numpy. We have a plan on how +to move forward, but at the moment there is a lack of dedicated people or money +to tackle it.

+
+

The slightly longer version

+

Integrating numpy in PyPy has been my pet project on an on-and-off (mostly off) +basis over the past two years. There were some experiments, then a long +pause, and then some more experiments which are documented below.

+

The general idea is not to use the existing CPython module, but to +reimplement numpy in RPython (i.e. the language PyPy is implemented in), thus +letting our JIT achieve extra speedups. The really cool thing about this part +is that numpy will automatically benefit from any general JIT improvements, +without any need for extra tweaking.

+

At the moment, there is a branch called numpy-exp which contains a +translatable version of a very minimal version of numpy in the module called +micronumpy. Example benchmarks show the following:

+ +++++ + + + + + + + + + + + + + + + + + +
 additerate
CPython 2.6.5 with numpy 1.3.00.260s (1x)4.2 (1x)
PyPy numpy-exp @ 3a9d77b789e10.120s (2.2x)0.087 (48x)
+

The add benchmark spends most of the time inside the + operator on +arrays (doing a + a + a + a + a), which in CPython is implemented in C. +As you can see from the table above, the PyPy version is already ~2 times +faster. (numexpr is still faster than PyPy, but we're working on it.)

+

The exact way array addition is implemented is worth another blog post, but in +short it lazily evaluates the expression and computes it at the end, avoiding +intermediate results. This approach scales much better than numexpr +and can lead to speeding up all the operations that you can perform on matrices.

+

The next obvious step to get even more speedups would be to extend the JIT to +use SSE operations on x86 CPUs, which should speed it up by about additional +2x, as well as using multiple threads to do operations.

+

iterate is also interesting, but for entirely different reasons. On CPython +it spends most of the time inside a Python loop; the PyPy version is ~48 times +faster, because the JIT can optimize across the python/numpy boundary. This shows +the potential of this approach: users are not grossly penalized for writing +their loops in Python.

+

The drawback of this approach is that we need to reimplement numpy in RPython, +which takes time. A very rough estimate is that it would be possible to +implement a useful subset of it (for some definition of useful) in a period +of between one and three man-months.

+

It also seems that the result will be faster for most cases and the same speed +as original numpy for other cases. The only problem is finding dedicated +people willing to spend quite some time on this; however, I am willing to +both mentor such a person and encourage him or her.

+

The good starting point for helping would be to look at what's already +implemented in micronumpy modules and try extending it. Adding a - operator +or adding integers would be an interesting start. Drop by on #pypy on +irc.freenode.net or get in contact with developers via some other channel (such +as the pypy-dev mailing list) if you want to help.

+

Another option would be to sponsor NumPy development. In case you're +interested, please get in touch with us or leave your email in comments.

+

Cheers,
+fijal

+
+
+
+
+
+ + Unknown wrote on 2011-05-04 17:30: +
+
+

While the RPython approach does sound valuable long-term, do you know if anyone has experimented with cpyext and the CPython extension module as a near-term alternative?

+
+
+
+
+ + matt harrison wrote on 2011-05-04 17:30: +
+
+

Great post. (I'm another person who would like numpy on pypy).
What are the guidelines for when something should be implemented in RPython? For me personally there are a few instances I would trade some of the dynamicism of Python for speed in my own code.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-05-04 17:35: +
+
+

@Nick the mixed approach (use cpyext and pieces in RPython) sounds maybe valuable short term, but it can burn people easily. RPython-only is way more elegant and gives you wins upfront. Since there is noone willing to invest time in short term approach, this sounds like a no-brainer.

@matt almost nothing should be implemented in RPython, except the interpreter itself. Writing Python should be fast enough. Numpy is a notable example where we want to leverage last bits and pieces of JIT and be really really fast. For example you can't really leverage SSE from Python layer.

+
+
+
+
+ + Davide wrote on 2011-05-04 18:12: +
+
+

Are you in touch with Numpy developers? Are they eager to "stop" using Python and move to RPython? I mean, if this work needs to be redone for each version of Numpy, we will be always lagging behind, and always spend lot of efforts. On the other hand, if Numpy devs will start using the RPython for and let die the pure-Python one, then, the porting effort would me much more meaningful, and I believe it will be easier to find a group of people interested in doing it (myself, maybe)

+
+
+
+
+ + Davide wrote on 2011-05-04 18:13: +
+
+

And what about SciPy?

+
+
+
+
+ + Anonymous wrote on 2011-05-04 18:15: +
+
+

I've got to say that this worries me more than it encourages me.

1) It doesn't sound like this path will lead to easier integration of scipy. If I'm wrong please let me know! But if I'm right, the reality is that most of the reason I care about numpy is because scipy depends on it, and I care about scipy.

2) What about the numpy refactoring effort, which is supposed to be making a better C interface for numpy, which works with IronPython as well as CPython (https://lists.ironpython.com/pipermail/users-ironpython.com/2010-December/014059.html)? Why not just encourage that effort, and leverage it for PyPy integration? Is there a reason it won't work for numpy even though it works for both IronPython and CPython? (

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-05-04 18:27: +
+
+

@Davide it's not Python vs RPython, it's C (which numpy is implemented in) vs RPython. No numpy users will be requires to use RPython for anything.

@Gary I believe you're wrong. The idea stays the same - you can call arbitrary C code that will manipulate raw memory and do what it wants to do. The idea is to implement only the interface part (which uses CPython C API) and not the C part, which will work anyway. So at the end, we hope to leverage that effort. Also we're not microsoft and we can't pay large sums of money to do it and having small subset of numpy that's really fast appeals much more to me than a large effort that only gives numpy for pypy (that's not faster than cpython's one).

+
+
+
+
+ + Davide wrote on 2011-05-04 19:12: +
+
+

@Maciej: It was clear to me that numpy users shouldn't change anything, but I thought you intended to change only the Python part of Numpy, not the C part.

Now, if you plan to change the whole C sections, that's a huge job. What are your plans for dependencies like the BLAS, LAPACK and the likes? Would you reimplement them in RPython as well?

And regardless of the answer, my question is still valid: do you see this project as a "catch-up porting" of Numpy, with the version for CPython going on by itself? Or do you see the RPython fork becoming the mainstream Numpy? And if it's the latter, how that would perform on CPython? I think these questions are the key of the matter.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-05-04 19:18: +
+
+

see my reply above about BLAS/LAPACK etc. Regarding the C part, it's a big task, but I think not too big. Also it's relatively easy to come up with working piece that's not full, nontheless useful.

This won't work on CPython, period.

+
+
+
+
+ + Anonymous wrote on 2011-05-04 19:25: +
+
+

@Maciej -- sorry if I'm being dense, but are you saying that the approach you're outlining will allow for scipy to work with numpy?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-05-04 19:29: +
+
+

@Gary an uneducated guess would be "yes". Probably with some work and scipy refactoring.

+
+
+
+
+ + cool-RR wrote on 2011-05-04 19:34: +
+
+

Thanks for writing this post Maciej! It's great to have some visibility on your plans about this issue.

+
+
+
+
+ + Anonymous wrote on 2011-05-04 19:47: +
+
+

OK. As I've argued before in various pypy groups, I think one of the groups that will most strongly benefit from pypy's speed is the scientific community -- but they need numpy and scipy. So now that I know that this plan will (hopefully) allow for both of those to be used from pypy, I'm encouraged by it.

+
+
+
+
+ + Anonymous wrote on 2011-05-04 19:49: +
+
+

@Maciej: The parts of Scipy written in Python are for the most part not large. The main work would be in reimplementing the C code that uses Numpy's C-API, and figuring out a way to interface with Fortran code.

+
+
+
+
+ + Joseph wrote on 2011-05-04 20:21: +
+
+

You say you lack sufficient resources to put in a large effort, but your answers to CPython extensions is "reimplement everything RPython". Would it not make more sense to improve cpyext so that you get good performance out of it (maybe even JIT compatible)? This seems like a better answer then re-writing every single CPython extension and trying to keep the RPython implementation in sync.

+
+
+
+
+ + Peter Cock wrote on 2011-05-04 20:33: +
+
+

Have you tried micronumpy under Jython? I'm assuming RPython, being just a subset of Python, should also work there, and might double as a way to get (some of) NumPy on Jython.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-05-04 20:34: +
+
+

@Joseph cpyext will always be only a semi-permanent compatibility layer. Making numpy work with cpyext is both unrewarding (hard work with obscure bugs), but also significantly harder to make fast, in some places completely impossible. Yes, it doesn't make sense for all extensions, it doesn't even make sense for most. Numpy is however special, since speed is the reason of it's existence. Also, frankly, when it comes down to my free time "let's make this cool JITed code run 50x faster than CPython" beats "let's stare puzzled at this segfault".

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-05-04 20:35: +
+
+

@Joseph anyway, it's exactly for the same reason "why write another interpreter if you can just improve CPython". Because it's easier at the end.

+
+
+
+
+ + Corbin Simpson wrote on 2011-05-04 21:45: +
+
+

To everybody asking why we cannot just use cpyext: I already tried it. It's not gonna happen without hacking the crap out of numpy. Additionally, it's going to be slow: Numpy is not fast for most operations, because of double-unboxing. Only vector ops are fast. JITing the operations is going to be a big win.

For those of you not believing numpy is slow, look at numexpr (https://code.google.com/p/numexpr/) which implements many of the same ideas that we are planning on implementing.

+
+
+
+
+ + Jonas B. wrote on 2011-05-04 21:45: +
+
+

Extremely exciting! Perhaps this is a good time to document the internals of NumPy a bit better while you scour the source to reimplement in RPython.

Perhaps this is a good fit for a Kickstarter (or similar) project? I believe this requires very talented and dedicated developers and paying them professionally by raising money on the Internet should be possible. It's been done before.

+
+
+
+
+ + Anonymous wrote on 2011-05-04 22:58: +
+
+

Yes, having a couple of Kickstarter projects for PyPy would be nice. It seems the current view is "we'll wait for someone wanting a feature enough to fund it". Picking one or two known valuable features to put on Kickstarter would provide for a nice test: can you raise more money by asking for it in a targeted way?

+
+
+
+
+ + Anonymous wrote on 2011-05-05 01:23: +
+
+

Two comments:

One, you guys need to make up your minds with respect to how people are supposed to interface C code with PyPy, and make one well-supported way. The sooner, the better.

Two, as long as your numpy clone implements the (new-style) Python array interface, it should "just work" with Scipy, with everything else being a Scipy bug. (Correct me if I'm wrong.)

Andreas

+
+
+
+
+ + Anonymous wrote on 2011-05-05 01:58: +
+
+

Doesn't getting SciPy to work involve interfacing with a lot of Fortran code?

+
+
+
+
+ + Unknown wrote on 2011-05-05 04:47: +
+
+

To address some of the criticism you're receiving, it may be worth making another post clarifying the points made in the comments and elsewhere:

- numpy+cpyext has been tried and found wanting (and very hard to debug)
- no developers available that are interested in beating their heads against that particular wall
- pure C and Python components of numpy should remain largely the same
- only the Python bindings layer that uses the CPython C API needs to be reimplemented
- RPython has its own FFI which is PyPy's preferred way to interface to non-Python code (https://pypy.readthedocs.org/en/latest/rffi.html)
- cpyext is a useful tool for compatibility with relatively simple C extensions that don't stress the C API greatly, but numpy is not such an extension.

+
+
+
+
+ + david wrote on 2011-05-05 09:42: +
+
+

Hi maciej, I am david (we quickly met at pycon where I presented myself as a numpy guy).

I think part of the misunderstanding is around the meaning of "numpy in pypy". Rewriting an array class on top of pypy is certainly valuable, and I am in no position to tell other people what to do in their free time. But I don't think it can realistically mean people will be able to use this instead of numpy after 2-3 man months: how will interfacing with BLAS/LAPACK work ? How will interfacing with the vast amount of fortran code in scipy work ?

If cpyext is indeed a dead-end, it would be valuable to know why. Personally, I would certainly be happy to fix parts of numpy that make cpyext impractical, even if it meant it were twice as slow as on cpython. Because I could still benefit from pypy *elsewhere*, without having to rewrite all the numpy/scipy/etc... code.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-05-05 09:53: +
+
+

@david please look above at my responses. there will still be a piece of memory you can pass to LAPACK or BLAS or something. the RPython part is about the interface only and not C-only part. If you want to improve numpy, please separate C-only parts from interface parts as much as possible, using C from RPython is a no-brainer.

+
+
+
+
+ + Dániel Varga wrote on 2011-05-05 10:34: +
+
+

Maciej, let me second Nick's polite request for a more detailed post about the plan.

If even David, an actual numpy developer can misunderstand your description, what do you expect from the unwashed masses of scipy users like me? :) Fortunately it does not take too much effort to alleviate the worries. All you have to do is explain to everyone that the plan takes into account the giant amount of C and Fortran code in numpy/scipy, and takes into account the fact that forking numpy/scipy is infeasible.

+
+
+
+
+ + Bluebird wrote on 2011-05-05 11:49: +
+
+

Didn't you say in another post that the JIT is more efficient at optimizing Python code than RPython ?

+
+
+
+
+ + cournape wrote on 2011-05-05 12:17: +
+
+

@daniel: I don't think there is a misunderstanding as much as different people wanting different things. I believe that Maciej and other pypy people are more interested in leveraging pypy and its JIT to do things which are indeed quite complicated in numpy today (avoid temporary, fast iterators in python, etc...). I have little doubt that pypy is a better platform than cpython to experiment with this kind of thing.

I am more surprised about the claim that numpy is so tied to cpython internals. It certainly depends on the C API, but mostly public API, documented as such.

+
+
+
+
+ + Armin Rigo wrote on 2011-05-05 12:45: +
+
+

@nick: thank you very much for giving all relevant pieces of information that are missing from the original post!

+
+
+
+
+ + glyph wrote on 2011-05-05 19:32: +
+
+

Hey Maciej! This sounds absolutely awesome. I hope you can find someone to do the necessary work. I think you might need to explain a little better in a separate post where that 48x speedup comes from, and why RPython is a necessary part of it. I think I understand why, but clearly some of the commenters don't :).

+
+
+
+
+ + Anonymous wrote on 2011-06-21 22:50: +
+
+

Well, if the answer of "How to make numpy available in pypy" is "do a complicated rewrite of numpy," then I'm pretty skeptical about the pypy project. I primarily use numpy, but also scipy sometimes and Image sometimes. As a user it's most important to me that code runs. Speed is not as critical. For example if I take stddev() of an array I first want that to run, and only secondarily want it efficient. If there's a library that I might want to use, and I can't expend a reasonable amount of effort to wrap it, or else someone else can do that, then I don't find pypy that encouraging at all. Since there are lots of libraries out there, and it has been convincingly argued that Python's primary utility is its library support.

+
+
+
+
+ + Alex wrote on 2011-06-21 22:56: +
+
+

@Anonymous: While you may not be concerned with performance, a great many people are. The only way to have arbitrary numpy stuff work in theory would be CPyExt, but as we've said that's fraught with complications in that a) it won't work out of the box on something that uses as many corners of the CPython C-API as NumPy, and b) will always be slow. Given people's desire for speed with respect to NumPy we consider reimplementing it a reasonable course.

+
+
+
+
+ + Anonymous wrote on 2011-06-22 00:33: +
+
+

Alex -- I'm not saying speed is unimportant. What I'm saying is being able to easily make existing CPython extension modules compile against numpy is very important to people. If there is a 20% slowdown or a 10% speedup of the C extension in many cases that is no big deal. Most importantly it would put PyPy on rather equal standing with CPython. And then the JIT pure Python code might win out for efficiency, so PyPy might be a net win for many users.

On the other hand doing research into lazy evaluation and vectorizing and loop restructuring, can obviously make numpy faster, but is more of a tangent, than being helpful to the majority of users who just want to run their CPython extensions at roughly the same speed under PyPy. Until people can actually run their extensions easily (which I argue is the major value that Python has) I doubt there will be much adoption of PyPy.

Say I can already add lists of floats and take their standard deviation using numpy, using the C extension library. It isn't clear to me why this should be substantially less efficient under PyPy than under CPython.

We see the same issue with Python 3.0 adoption. Personally I think it makes bad language changes such as getting rid of string % operator which I use constantly, so I'd avoid it for that reason. But far more importantly it can't run a lot of the libraries I use, with comparable performance. So it's completely a no go to me for that reason.

So I am suggesting that optimizing a single library by rewriting it, seems a case of premature optimization when most libraries can't even run with PyPy.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-06-22 07:36: +
+
+

It's a tough call, but for me most libraries run under PyPy. There are few that don't but I can usually work around that. Regarding numpy - no one wants slower numpy *no matter what*. Besides, it's not clear whether making numpy behave using CPyext would take less effort than writing it from scratch - the first reasonable subset can be expected *much* faster, when doing a rewrite.

Numpy really *is* special, for all my needs, I want a small subset that performs reasonably well, not a whole thing that performs poorly. It's a matter of taste, but it's also much more fun, which plays a lot in terms of people spending free time on it. Would you rather add functionality for X that you need or fix next obscure segfault?

Cheers,
fijal

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-06-22 10:14: +
+
+

@Anonymous Clarifying: We're hoping to reuse most parts of numpy (and scipy), especially those written in pure C. The "only" part requiring rewriting is the part that uses CPython C API, which is mostly the array interface.

+
+
+
+
+ + Anonymous wrote on 2011-06-23 04:23: +
+
+

Maciej -- I didn't realize large parts of these libraries could be reused. So maybe once the PyPy C extension facilities are working well enough that important 3rd party libraries can be compiled, I'll just switch to PyPy for performance. It sure does sound more fun to make numpy functions compile down to heavily optimized RPython and get big speed gains. But I still maintain that users would appreciate being able to get all arbitrary libraries to build in the first place, e.g. if scipy or library X depends on the numpy C interface, and that gets broken in the PyPy numpy implementation, then users won't be able to use their desired library at all. So I guess I'm just arguing that the most C extension modules that can work with numpy, the better. Since if we wanted fast but no libraries we'd be using C :-).

+
+
+
+
+ + Davide wrote on 2011-06-23 16:19: +
+
+

Maciej (et all),
it looks like this issue isn't clear yet to people. Let's see if I can help.

Numpy is made of 3 "piece" (doesn't matter if they are separate pieces or mingled together, they are there): a pure python part, a pure C part and a C-to-python "glue". All of them are very important to numpy, but the C-to-python glue is special, in that both python and C need to access the same data structures without any conversion or copy (otherwise it will be slow). I'm not sure what exactly numpy is doing for this "special glue" part, but that's the point where pypy suffer: of course pypy works just fine with pure python, and doesn't "care" at all about the C sections. So one option is to rewrite the C-to-python pieces of numpy. I'm sorry but it's still unclear to me if you want also to rewrite the C part or not (here you said kind-of-yes: https://morepypy.blogspot.com/2011/05/numpy-in-pypy-status-and-roadmap.html?showComment=1304533136864#c3499269873134208179 and here you said no: https://morepypy.blogspot.com/2011/05/numpy-in-pypy-status-and-roadmap.html?showComment=1308734098907#c2151221303214453177 so probably you should clarify better)

Now, if I understand it right, your plan is to fork numpy for this purpose (either rewrite the C-to-python glue only, or the C part also). I believe this will fail, and the reason is pretty simple: first, even before you start, you already say that you don't have people/money/time to commit to this project. Second, maintaining a fork is a huge, huge task. You might easily introduce bugs, break features, etc - while people are expecting something that "just works" as a drop-in replacement, so even an "almost success" from a technical point of view, can be a big failure for adopters, if it doesn't behave. Last, but not least, numpy is a moving target, and you'll always play catch up. Is this the game you want to play??

Now, I don't want to tell you what you have to do for fun, but if you want to have chances of success, you have to change the "politics" of your plan. I trust you that technically your plan is fine, but rather than implementing it within a numpy fork (or worse: rewrite), I suggest that you work with the numpy and/or CPython community, to see if you can write a wrapper around cpyext (or whatever they are using for C-to-Python glue). This wrapper (at compile time) should either become cpyext (or whatever) if you are using CPython, or become "something else" if you are using pypy. If you persuade numpy people to use this wrapper you'll have the same numpy code base working as is in CPython and pypy. Sure you will not be exploiting the faster-than-C capabilities of pypy, but you can get there more smoothly: improving the speed one feature at a time, while the rest of the framework is still working and thus useful, and thus increasing its user base, people interested in it (and some of them may become contributors).

Instead your plan sounds like: implement one feature at a time, while the rest of the framework doesn't work and thus nobody uses it in production, let alone cares about its speed. On top of which, you'll be trying to catch-up with numpy.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-06-23 18:12: +
+
+

@Anonymous there are many things I disagree with and I'm not going to fork numpy.

The basis is - I claim there is more use for fast numpy which is incomplete than slow complete numpy.

I would refer you to yet another blog post (personal this time) explaining more why I do what I do: https://lostinjit.blogspot.com

+
+
+
+
+ + Connelly Barnes wrote on 2011-08-24 04:13: +
+
+

Here is a completely different approach taken by IronPython for Scipy+Numpy compatibility:

https://www.johndcook.com/blog/2009/03/19/ironclad-ironpytho/

It's basically a bidirectional FFI. Have a CPython and an IronPython both running, and wrap objects so that IronPython objects can be used by CPython and vice versa. This requires some platform specific binary level compatibility, in their case, DLL hacking, to allow the FFI to work in both directions.

It seems like that approach should be practical for getting all of large libraries such as Scipy or Numpy working in Pypy. Since it's already been demonstrated to work for IronPython.

The above roadmap proposes instead speeding up the core array object by coding it in RPython.

But I wonder if these two approaches could work together. For example Numpy could be configured to use ordinary CPython array objects, or PyPy compiled RPython array objects. Then the FFI just has to take care to wrap objects appropriately that are in the "other interpreter".

Thoughts?

+
+
+
+
+ + Connelly Barnes wrote on 2011-10-13 00:30: +
+
+

As a follow up to my previous comment, I noticed there is a bidirectional FFI for Python called RPyC that was previously discussed on the Pypy blog:

https://morepypy.blogspot.com/2009/11/using-cpython-extension-modules-with.html

I have no idea if it has been tried with Numpy yet.

+
+
+
+ +

PyPy 1.5 Released: Catching Up

+ +
+

We're pleased to announce the 1.5 release of PyPy. This release updates +PyPy with the features of CPython 2.7.1, including the standard library. Thus +all the features of CPython 2.6 and CPython 2.7 are now supported. It +also contains additional performance improvements. You can download it here:

+
+https://pypy.org/download.html +
+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.1. It's fast (pypy 1.5 and cpython 2.6.2 performance comparison) +due to its integrated tracing JIT compiler.

+

This release includes the features of CPython 2.6 and 2.7. It also includes a +large number of small improvements to the tracing JIT compiler. It supports +Intel machines running Linux 32/64 or Mac OS X. Windows is beta (it roughly +works but a lot of small issues have not been fixed so far). Windows 64 is +not yet supported.

+

Numerous speed achievements are described on our blog. Normalized speed +charts comparing pypy 1.5 and pypy 1.4 as well as pypy 1.5 and cpython +2.6.2 are available on our benchmark website. The speed improvement over 1.4 +seems to be around 25% on average.

+
+
+

More highlights

+
    +
  • The largest change in PyPy's tracing JIT is adding support for loop invariant +code motion, which was mostly done by Håkan Ardö. This feature improves the +performance of tight loops doing numerical calculations.
  • +
  • The CPython extension module API has been improved and now supports many more +extensions. For information on which ones are supported, please refer to our +compatibility wiki.
  • +
  • These changes make it possible to support Tkinter and IDLE.
  • +
  • The cProfile profiler is now working with the JIT. However, it skews the +performance in unstudied ways. Therefore it is not yet usable to analyze +subtle performance problems (the same is true for CPython of course).
  • +
  • There is an external fork which includes an RPython version of the +postgresql. However, there are no prebuilt binaries for this.
  • +
  • Our developer documentation was moved to Sphinx and cleaned up.
  • +
  • and many small things :-)
  • +
+

Cheers,

+

Carl Friedrich Bolz, Laura Creighton, Antonio Cuni, Maciej Fijalkowski, +Amaury Forgeot d'Arc, Alex Gaynor, Armin Rigo and the PyPy team

+
+
+
+
+
+ + kost BebiX wrote on 2011-04-30 16:59: +
+
+

Cool. Blog design became blue :-)

+
+
+
+
+ + Anonymous wrote on 2011-04-30 17:37: +
+
+

Unless there is something Intel specific - maybe calling it x86/x86-64 might be a good idea since this suggests that pypy does not work on amd / via chips.

+
+
+
+
+ + Anonymous wrote on 2011-04-30 21:33: +
+
+

do you have plans to add CPython 2.7.1 to speed.pypy.org?

+
+
+
+
+ + Anonymous wrote on 2011-04-30 22:21: +
+
+

Is it just me or does cProfile seem rather broken (at least on Windows)? I get random subtimings that are negative or in the billions.

>>>> cProfile.run("[abs(1) for n in xrange(10**6)]")
1000002 function calls in 1.000 seconds

Ordered by: standard name

ncalls tottime percall cumtime percall filename:lineno(function)
1 -137.813 -137.813 1.000 1.000 :1()
1000000 138.813 0.000 138.813 0.000 {abs}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Prof
iler' objects}

+
+
+
+
+ + Zooko wrote on 2011-04-30 22:34: +
+
+

Where's the flattr button? I want to give you a euro tip again, just like I do every time you blog.

Also: way to go on releasing PyPy 1.5! This project is really growing up!

+
+
+
+
+ + Armin Rigo wrote on 2011-05-01 11:10: +
+
+

Anonymous: cProfile on Windows works for me. It might be details of your Windows version or whatever. Can you open it as a proper bug report? Thanks! https://codespeak.net/issue/pypy-dev/

+
+
+
+
+ + Unknown wrote on 2011-05-01 11:24: +
+
+

Awesome! Looking forward to PyPy on NaCl.

+
+
+
+
+ + Antonio Cuni wrote on 2011-05-01 12:20: +
+
+

@zooko: I don't know why the flattr button went away. I re-uploaded the template to blogger and now it seems to be there again, can you confirm?

+
+
+
+
+ + etal wrote on 2011-05-01 13:40: +
+
+

Great stuff. Do you think PyPy is ready to be re-packaged for Debian yet?

I'm looking at this:
https://bugs.debian.org/538858

I have a feeling the popcon would be quite a bit higher nowadays.

+
+
+
+
+ + Gaëtan de Menten wrote on 2011-05-02 08:19: +
+
+

Congratulations to the whole team. What's coming next now that this large milestone is completed?

+
+
+
+
+ + Anonymous wrote on 2011-05-02 11:17: +
+
+

Is it just me or does the download page still point to the 1.4.1 release?

+
+
+
+
+ + Antonio Cuni wrote on 2011-05-02 11:23: +
+
+

@Anonymous: what is the "download page" you are talking about? For me,
https://pypy.org/download.html

shows only links to PyPy 1.5. Maybe it's a browser cache issue?

+
+
+
+
+ + Anonymous wrote on 2011-05-02 11:31: +
+
+

This is insane.

I clicked on the link multiple times yesterday and today (after restarting firefox) and only now the page refreshed correctly.

Just shows you that anything can happen.

+
+
+
+
+ + vak wrote on 2011-05-03 16:43: +
+
+

btw, regarding https://bitbucket.org/pypy/compatibility/wiki/Home -- i am using pymongo driver under pypy without problems (not yet checked against the fresh pypy 1.5 though)

+
+
+
+
+ + vak wrote on 2011-05-04 09:19: +
+
+

minor thing -- version isn't updated?

Python 2.7.1 (b590cf6de419, Apr 30 2011, 02:00:34)
[PyPy 1.5.0-alpha0 with GCC 4.4.3] on linux2

+
+
+
+
+ + Anonymous wrote on 2011-05-05 12:29: +
+
+

Great news, 25% speedup over PyPy 1.4 is just another great step forward. I'm looking forward for times when Python will be fastest dynamic object-oriented language and it will be more and more popular. I feel that these times are very close thanks to PyPy.

What about adding PyPy to The Computer Language Benchmarks Game?

+
+
+
+
+ + Damian Cugley wrote on 2011-05-07 10:36: +
+
+

I have not yet managed to build C extensions on Mac OS X with distribute/distutils/whatever because sysconfig.get_config_var returns None. Is there a quick way to fix this?

+
+
+
+
+ + Damian Cugley wrote on 2011-05-07 10:38: +
+
+

@anonymous The Computer Language Benchmarks Game only permits one implementation per language, and CPython 3.2 is the implementation they use for Python.

+
+
+
+
+ + Anonymous wrote on 2011-05-07 14:09: +
+
+

Would it be easy to implement mutable builtin classes (for example for adding new methods to int or str) in pypy?

+
+
+
+
+ + Thomas Heller wrote on 2011-06-07 17:38: +
+
+

I'm speechless :-)

This is the first time I use pypy and it works out of the box even with my fancy Windows GUI toolkit (written completely in ctypes) out of the box.

Great work, guys!

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-2.html b/blog/index-2.html new file mode 100644 index 000000000..34687ef1f --- /dev/null +++ b/blog/index-2.html @@ -0,0 +1,707 @@ + + + + + + +PyPy (old posts, page 2) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Visualizing a Python tokenizer

+ +
+

Armin and I have been working on PyPy's parser and bytecode compiler for the Python language in the last few days. Armin implemented several bytecode optimizations that CPython has had for a while whereas I tried to refactor our tokenizer and parser (because our existing parser is rather slow and also not very nice code). Armin is mostly done whereas the new parser is not very far yet.

What is done, however, is the Python tokenizer. It is implemented in the usual way, by using a set of regular expressions to generate a deterministic finite automaton (DFA). This automaton is then turned into a big function which does the actual tokenization. Of course the picture is not quite as simple for Python, because it is not possible to tokenize Python using only regular expressions. To generate the proper "indent" and "dedent" tokens it would be necessary to keep state (the previous indentation levels) which a DFA cannot do. This is solved by postprocessing the tokens that the tokenizer produces to turn whitespace tokens into the proper indent and dedent tokens.

+

For debugging purposes I implemented a visualization tool for DFAs using PyPy's pygame-based graph viewer. The graph viewer is able to visualize interactively any graph given in the graph-description language of Graphviz. Looking at the tokenizing DFA for Python is rather instructive, both for understanding how tokenizing works and (maybe) for understanding the Python language. To try it, download the dot file of the DFA and run from a pypy checkout:

+

+
$ python pypy/bin/dotviewer.py tokenizer.dot
+

The following is a screenshot of the graphviewer: + +

For people who don't want to check out PyPy I generated a (rather big) png for the DFA.

+

Next thing I would like to do (apart from actually finishing the parser, of course :-) ) is visualize the Python grammar itself using syntax diagrams or something similar. So far I couldn't really find a program to do that, though.

+
+
+
+
+ + Anonymous wrote on 2008-01-08 16:11: +
+ +
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2008-01-08 17:52: +
+
+

Hi Robin,

Yes, this is sort of cool already, but it doesn't really give you as much information as the grammar itself. It's more of an overview-like thing.

Cheers,

Carl Friedrich

+
+
+
+
+ + Anonymous wrote on 2008-01-08 18:05: +
+
+

Check out ANTLRWorks or ANTLRStudio for superb visualization of grammars.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2008-01-09 13:57: +
+
+

Yeah, ANTLRWorks seems pretty nice in that respect. Thanks for the hint.

+
+
+
+
+ + Unknown wrote on 2008-01-18 13:52: +
+
+

Hello!

I was considering to use

https://www.antlr.org/wiki/display/ANTLR3/Antlr3PythonTarget

for a toy language project for uni (Computer Engineering, IT) because I guess ANTLR(works) and its mailing list could help me a lot in understanding the very basics of 'grammar design'...

Or at least I hope so! :-)

However, I've also been lurking the PyPy ml for quite a while now and was considering the possibility to implement the whole (*toy*) interpreter in RPython, so to understand a bit more of PyPy's design by actually coding something simple which makes some use of it. :-)

So, would you consider trying to port the current ANTLR's Python runtime to RPython a good way for me to start doing something with PyPy?

Would you consider the thing interesting? I know this possibility had been discussed on IRC some times ago and it wasn't thought to be that useful at last, but maybe you discussed the thing some more since then and changed idea, I don't know...

How would you rate the difficulty of such a task for a PyPy and ANTLR newbie, also? :-)

I wouldn't try doing that right now, anyway, but maybe in March I should manage to get some spare time for it. In the meantime, I'd try to read as many docs and sources as possible...

Tell me your opinion, please! :-)

Cheers,
Matteo

PS: I'm writing here because you were discussing PyPy's Python lexer and somebody wrote about ANTLRworks, but if you think I'd better send this message to the list please just tell me and I'll do so! :-)

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2008-01-18 16:32: +
+
+

Hi Matteo,

I would post to the pypy-dev mailing list instead, it is better to discuss such longish things there. Thanks for your interest!

Cheers,

Carl Friedrich

+
+
+
+
+ + Unknown wrote on 2008-05-19 14:45: +
+
+

Does it use Graphviz's dot utility for rendering?

+
+
+
+
+ + Konrad wrote on 2008-06-04 18:36: +
+
+

@techtonik

Yes, it does.

+
+
+
+
+ + Unknown wrote on 2008-06-04 18:47: +
+
+

That's bad it cann't be used as a standalone product without the need to install Graphvis.

+
+
+
+
+ + Anonymous wrote on 2009-04-22 02:22: +
+ +
+
+
+ + wow power leveling wrote on 2009-04-22 02:27: +
+ +
+
+ +

PyPy Winter Sports Sprint from 12-19th of January in Leysin, Switzerland

+ +
+ + + + + + + +
+

The next PyPy sprint will be held in Leysin, Switzerland, for +the fifth time. The overall idea of the sprint is to continue +working on making PyPy ready for general use.

+
+

The proposed topics are: ctypes, JIT, testing, LLVM. This is +a fully public sprint, so newcomers and other topics are +welcome. And like previous winters, the main side goal is to +have fun in winter sports :-) See the sprint announcement +for details.

+
  +
+
+

Various Performance Improvements

+ +
+

A few days ago, Armin discovered Gnuplot. He wrote a script that turns the results of the nightly benchmark runs into plots (lower is always better, all the numbers of the microbenchmarks are "times slower than CPython"). The corresponding microbenchmarks can be found in the repository. Staring at the plots revealed a strange performance regression around the revision 45000. After some investigation Armin found that a mostly unrelated change had disabled our method cache, which caused the regression. This was fixed. + +In addition, Armin did a few other small tweaks in the interpreter main loop, making sure that small bytecodes are inlined into the main loop. This gave another few percent of performance increase. Together with the GC improvements two weeks ago this leads to the fastest non-JIT PyPy ever. Unfortunately "fastest" is not really very fast yet in absolute terms, with realistic apps being around 3-4 times slower than CPython. Especially calls (in all its variants) are quite slow, which is something we should look into.

+
+
+
+
+ + Anonymous wrote on 2008-01-06 06:05: +
+
+

I'm amazed by the progress you guys have made. 3 - 4 times slower than CPython is actually really good considering what it does!

PyPy is one of the most interesting computer language projects on the net.

+
+
+
+ +

Faster implementation of classic classes merged

+ +
+

Old-style classes have so far been a bit neglected by PyPy's Python interpreter. By default, PyPy makes all classes new-style and you have to use a command-line switch (--oldstyle) at startup or at translation time to change that default. Then you would get a pure-Python implementation of classic classes. This implementation was extremely slow (around 20 times slower than classic classes in CPython). In the past we had hoped that we could get away with mostly only supporting new-style classes, however it seems that real-world software relies on them quite a bit, so we decided to offer a better migration path. + +A while ago I therefore started a re-implementation of classic classes in RPython to speed them up. This work is now finished, the branch I worked on got merged today. Speed for the old-style class benchmarks was improved greatly and I found quite a number of bugs in the old implementation too. New-style classes are still a bit faster than old-style in PyPy though, and this is unlikely to change.

+
+
+
+
+ + Michael Foord wrote on 2007-12-14 18:29: +
+
+

Hey guys - it's great to hear so much about PyPy progress. Keep up the good work (coding *and* blogging of course).

Michael

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2007-12-14 18:38: +
+
+

Hi Michael!

It seems we are slowly getting into this blogging thing. Good to hear that somebody is actually reading that stuff too :-).

Cheers,

Carl Friedrich

+
+
+
+
+ + Anonymous wrote on 2007-12-15 01:05: +
+
+

Of course it is being read, more please :)!

+
+
+
+
+ + Anonymous wrote on 2007-12-20 14:10: +
+
+

Dear Carl and other PyPy developers,
Thank you for all of your hard work in getting PyPy to its present impressive state.
I really enjoy reading about your activities and accomplishments on this blog and on the PyPy irc logs.

-gyro

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2007-12-20 14:23: +
+
+

Hi gyro!

Really impressive that you chew through all the IRC-logs: Even I find that a lot of work sometimes :-)

+
+
+
+ +

Profiling for fun with valgrind

+ +
+

Recently I've been doing a lot of profiling on the PyPy executables to find speed bottlenecks. Valgrind (the original page seems to be down) is an extremely nice tool for doing this. It has several built-in tools that give you different types of profiles. The callgrind mode provides you with a lot of information including relative call costs. The cachegrind tool gives you less information, but what it gives you (e.g. cache misses) is much more accurate. The obvious choice would be to have a way to combine the results of two profiling runs to have both. In the last days I wrote a script that does this. It's available at my user's svn and has a pretty intuitive command line interface. The combining calculations are not perfect yet; total costs of functions can still be a bit bogus (they can sum up to whatever) but at least the relative figures are good. This means that we can stop looking at two different types of graphs now. + +An awesome tool for analyzing the profile data is kcachegrind. + + + +Which also proves that my 12'' display is too small at least for some things :-). + + +Update: pygrind is available under the MIT license.

+
+
+
+
+ + José Fonseca wrote on 2007-12-14 13:11: +
+
+

Nice!

In what license are you releasing pygrind? I would like to integrate pygrind's code into Gprof2Dot (LGPL) to be able to generate nice-looking graphs from cachegrind output.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2007-12-14 13:18: +
+
+

Indeed, good question, thanks. I've updated blog post (I think it satisfies you, if not you can have it under LGPL if you like).

+
+
+
+
+ + José Fonseca wrote on 2007-12-14 14:43: +
+
+

Excellent. Thanks!

+
+
+
+
+ + Aaron Bentley wrote on 2007-12-14 17:57: +
+
+

If you like kcachegrind, you might like using it for profiling python:
https://ddaa.net/blog/python/lsprof-calltree

+
+
+
+ +

PyPy tasks in GHOP

+ +
+

In the latest bunch of tasks that Titus released on Friday for the Google Highly Open Participation Contest there are several that are related to PyPy. Some of them are about presenting PyPy to a technical audience: Task 187, Task 188, Task 189, Task 190. + +Then there are three about Ropes, which are all rather challenging: +

+ +In addition there is a task to use PyPy's sandboxing features to provide an interactive Python tutorial on a web page: Task 220. + +We're really looking forward to working together with some bright students! +
+
+
+
+ + teki wrote on 2007-12-06 01:28: +
+
+

"faster than c" means you can write code in other languages which look like do the same as the c one, but the c one is slower

+
+
+
+
+ + Anonymous wrote on 2007-12-06 09:20: +
+
+

Took me some time to get the joke. =)

+
+
+
+
+ + Fredrik Johansson wrote on 2007-12-06 16:48: +
+
+

Dream on; only imaginary things can be faster than c.

(Relativistic mass formula with square root of a negative number for those who don't get it.)

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2007-12-06 17:09: +
+
+

Sorry, sorry. I think I spent too much time around physicists. Further hint: notice the lower-case c.

+
+
+
+
+ + Eduardo O. Padoan wrote on 2007-12-18 16:06: +
+
+

Benchmarking is relative :)

+
+
+
+ +

Good news from the garbage collection front

+ +
+

It seems that we can do better! Armin fixed a bug in our generational garbage collector, which caused variable sized objects (e.g. arrays) to be allocated outside of the nursery. This resulted in 50% speedup on synthetic benchmarks and about 10-20% on real world ones. Doing some preliminary measures, it seems that we spend roughly 10% of the time in garbage collection, which is good (and there is still some room for improvements!)

+
+
+
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-20.html b/blog/index-20.html new file mode 100644 index 000000000..1cd06117c --- /dev/null +++ b/blog/index-20.html @@ -0,0 +1,2935 @@ + + + + + + +PyPy (old posts, page 20) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Speeding up JSON encoding in PyPy

+ +
+

Hi

+

Recently I put a bit of effort into speeding up JSON in PyPy. I started with +writing a benchmark, which is admittedly not a very good one, but it's +better than nothing (suggestions on how to improve it are welcome!).

+

For this particular benchmark, the numbers are as follows. Note that CPython by +default uses the optimized C extension, while PyPy uses the pure Python one. +PyPy trunk contains another pure Python version which has been optimized +specifically for the PyPy JIT. Detailed optimizations are described later in +this post.

+

The number reported is the time taken for the third run, when things are +warmed up. Full session here.

+ ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + +
CPython 2.622s
CPython 2.73.7s
CPython 2.7 no C extension44s
PyPy 1.534s
PyPy 1.622s
PyPy trunk3.3s
+

Lessons learned:

+
+

Expectations are high

+

A lot of performance critical stuff in the Python world is already written in hand-optimized +C. Writing C (especially when you interface with the CPython C API) is +ugly and takes significant effort. This approach does not scale well when +there is a lot of code to be written or when there is a very tight coupling +between the part to be rewritten and the rest of the code. Still, people would +expect PyPy to be better at "tasks" and not precisely at running equivalent +code, hence a comparison between the C extension and the pure python version +is sound. Fortunately it's possible to outperform the C extension, but it requires +a bit of effort on the programmer side as well.

+
+
+

Often interface between the C and Python part is ugly

+

This is very clear if you look at json module as implemented in CPython's +standard library. Not everything is in C (it would probably be just too +much effort) and the interface to what is in C is guided via profiling not +by what kind of interface makes sense. This especially is evident comparing CPython 2.6 to 2.7. +Just adapting the code to an interface with C made the Python version slower. +Removing this clutter improves the readability a lot and improves PyPy's version +a bit, although I don't have hard numbers.

+
+
+

JitViewer is crucial

+

In case you're fighting with PyPy's performance, jitviewer is worth a shot. +While it's not completely trivial to understand what's going on, it'll +definitely show you what kind of loops got compiled and how.

+
+
+

No nice and fast way to build strings in Python

+

PyPy has a custom thing called __pypy__.builders.StringBuilder. It has +a few features that make it much easier to optimize than other ways like +str.join() or cStringIO.

+
    +
  • You can specify the start size, which helps a lot if you can even provide +a rough estimate on the size of the string (less copying)
  • +
  • Only append and build are allowed. While the string is being built you +can't seek or do anything else. After it's built you can never append any more.
  • +
  • Unicode version available as well as __pypy__.builders.UnicodeBuilder.
  • +
+
+
+

Method calls are ok, immutable globals are ok

+

PyPy's JIT seems to be good enough for at least the simple cases. Calling +methods for common infrastructure or loading globals (instead of rebinding as +locals) is fast enough and improves code readability.

+
+
+

String copying is expensive

+

Edit: see the comment at the end

+

If you use re.sub, the current implementation will always create a copy +of the string even if there was no match to replace. +If you know your regexp is simple, first try to check if there is +anything to replace. This is a pretty hard optimization to +do automatically -- simply matching the regular expression can be too costly +for it to make sense. In our particular example however, the regexp is really +simple, checking ranges of characters. It also seems that this is by far the +fastest way to escape characters as of now.

+
+
+

Generators are slower than they should be

+

I changed the entire thing to simply call builder.append instead of +yielding to the main loop where it would be gathered. This is kind of a PyPy +bug that using generators extensively is slower, but a bit hard to fix. +Especially in cases where there is relatively little data being passed around +(few bytes), it makes sense to gather it first. If I were to implement an +efficient version of iterencode, I would probably handle chunks of +predetermined size, about 1000 bytes instead of yielding data every few bytes.

+
+
+

I must admit I worked around PyPy's performance bug

+

For obscure (although eventually fixable) reasons, this:

+
+for c in s: # s is string
+  del c
+
+

is faster than:

+
+for c in s:
+  pass
+
+

This is a PyPy performance bug and should be fixed, but on a different branch ;-)

+
+
+

PyPy's JIT is good

+

I was pretty surprised, but the JIT actually did make stuff work nicely. +The changes that were done were relatively minor and straightforward, once +the module was cleaned to the normal "pythonic" state. +It is worth noting that it's possible to write code in Python and make it +run really fast, but you have to be a bit careful. Again, jitviewer is your +friend when determining why things are slow. I hope we can write more tools +in the future that would more automatically guide people through potential +performance pitfalls.

+

Cheers, +fijal

+

Edit: I was wrong about re.sub. It just seems that the JIT currently handles match better than sub; this will be fixed soon.

+
+
+
+
+
+ + Ian McKellar wrote on 2011-10-27 17:20: +
+
+

It would be neat to get UnicodeBuilder and StringBuilder in to mainline Python. They'd be more efficient in CPython than existing string construction methods and it would be easier to write more performant portable Python.

+
+
+
+
+ + Yury S wrote on 2011-10-27 17:32: +
+
+

Can you elaborate a bit on the slowness of generators?

+
+
+
+
+ + Alex wrote on 2011-10-27 17:52: +
+
+

Ian: yes it would, python-ideas/dev has had this discussion many times, if you want to convince them of the merit of this idea, feel free to try, but I've gotten weary of this discussion

+
+
+
+
+ + Anonymous wrote on 2011-10-27 23:27: +
+
+

This is not meant to derail the rather nice performance numbers, but I wouldn't call the json/simplejson code "pythonic" in the first place.

+
+
+
+
+ + Gaëtan de Menten wrote on 2011-10-28 07:06: +
+
+

I wonder if using a constant object to dump in each iteration doesn't skew the benchmark in favor of pypy, whereas the jit couldn't optimize as much with a varying object (which is what usually happens in real-life scenarios).

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-10-28 07:27: +
+
+

@Gaetan it certainly could in theory. In practice it does not occur here, but I only know that from looking at traces. However, creating a new object each time would make the benchmark more of an object creation one (probably GC related)

+
+
+
+
+ + Gaëtan de Menten wrote on 2011-10-28 07:42: +
+
+

@Maciej: not if you build the list of objects to dump out of the timed loop, or did I miss something?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-10-28 07:47: +
+
+

True, that might be a bit biggish though. Anyway as I said, it's good enough, JIT does not assume such things are constant. In fact it would execute exactly the same code for similarily shaped objects (different if all objects slightly differ in shape though)

+
+
+
+
+ + James Thiele wrote on 2011-10-28 16:11: +
+
+

Interfacing Python to C isn't ugly if you use Cython.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-10-28 16:31: +
+
+

That is probably a matter of taste which we should not discuss among gentleman, I however find pure python better than Python-Cython-C combination. Also parsing JSON in C is not fun at all.

+
+
+
+
+ + Leonardo Santagada wrote on 2011-10-31 19:15: +
+
+

The guys from ultrajson have a benchmark here https://github.com/esnme/ultrajson/blob/master/python/benchmark.py

and the results are in the README https://github.com/esnme/ultrajson/blob/master/README

would be interesting to run those benchmarks (of course, first warming up the jit), and comparing the results to ultrajson.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-10-31 20:43: +
+
+

feel free leonardo :)

+
+
+
+
+ + Leonardo Santagada wrote on 2011-11-01 14:42: +
+
+

It was just a suggestion on how to improve it, like you asked. If it was just going to be ignored I would not have bothered.

+
+
+
+ +

PyPy Göteborg Post-Hallowe'en Sprint Nov 2nd - Nov 9th

+ +
+

The next PyPy sprint will be in Gothenburg, Sweden. It is a public sprint, +suitable for newcomers. We'll focus on making a public kickoff for +both the numpy/pypy integration project +and the Py3k support project, +as well as whatever interests the Sprint attendees. Since both of these +projects are very new, there will be plenty of work suitable for newcomers +to PyPy.

+

Other topics might include:

+
    +
  • Helping people get their code running with PyPy
  • +
  • work on a FSCons talk?
  • +
  • state of the STM Vinnova project (We most likely, but not for certain will +know whether or not we are approved by this date.)
  • +
+
+

Other Useful dates

+

GothPyCon - Saturday Oct 29.

+

FSCONS Friday Nov 11 - Sunday Nov 13.

+
+
+

Location

+

The sprint will be held in the apartment of Laura Creighton and Jacob Hallén +which is at Götabergsgatan 22 in Gothenburg, Sweden. Here is a map. This is +in central Gothenburg. It is between the tram stops of Vasaplatsen and +Valand, (a distance of 4 blocks) where many lines call -- the 2, 3, 4, 5, +7, 10 and 13.

+

Probably cheapest and not too far away is to book accommodation at SGS +Veckobostader. The Elite Park Avenyn Hotel is a luxury hotel just a +few blocks away. There are scores of hotels a short walk away from the +sprint location, suitable for every budget, desire for luxury, and desire +for the unusual. You could, for instance, stay on a boat. Options are +too numerous to go into here. Just ask in the mailing list or on the blog.

+

Hours will be +from 10:00 until people have had enough. It's a good idea to arrive a +day before the sprint starts and leave a day later. In the middle of +the sprint there usually is a break day and it's usually ok to take +half-days off if you feel like it. Of course, many of you may be interested +in sticking around for FSCons, held the weekend after the sprint.

+
+
+

Good to Know

+

Sweden is not part of the Euro zone. One SEK (krona in singular, kronor +in plural) is roughly 1/10th of a Euro (9.36 SEK to 1 Euro).

+

The venue is central in Gothenburg. There is a large selection of +places to get food nearby, from edible-and-cheap to outstanding. We +often cook meals together, so let us know if you have any food allergies, +dislikes, or special requirements.

+

Sweden uses the same kind of plugs as Germany. 230V AC.

+
+
+

Getting Here

+

If you are coming by train, you will arrive at the Central Station. It is +about 12 blocks to the site from there, or you can take a tram.

+

There are two airports which are local to Göteborg, Landvetter (the main +one) and Gothenburg City Airport (where some budget airlines fly). +If you arrive at Landvetter the airport bus stops right downtown at +Elite Park Avenyn Hotel which is the second stop, 4 blocks from the +Sprint site, as well as the end of the line, which is the Central Station. +If you arrive at Gothenburg City Airport take the bus to the end of the +line. You will be at the Central Station.

+

You can also arrive by ferry, from either Kiel in Germany or Frederikshavn +in Denmark.

+
+
+

Who's Coming?

+

If you'd like to come, please let us know when you will be arriving and +leaving, as well as letting us know your interests. We'll keep a list +of people which we'll update (which you can do yourself if you +have bitbucket pypy commit rights).

+
+
+

Numpy funding and status update

+ +
+

Hi everyone,

+

It's been a little while since we wrote about NumPy on PyPy, so we wanted to +give everyone an update on what we've been up to, and what's up next for us.

+

We would also like to note that we're launching a funding campaign +for NumPy support in PyPy. Details can be found on the donation page.

+

Some of the things that have happened since last we wrote are:

+
    +
  • We added dtype support, meaning you can now create arrays of a bunch of +different types, including bools, ints of various sizes, and floats.
  • +
  • More array methods and ufuncs, including things like comparison methods +(==, >, etc.)
  • +
  • Support for more and more argument types, for example you can index by a +tuple now (only works with tuples of length one, since we only have +single-dimension arrays thus far).
  • +
+

Some of the things we're working on at the moment:

+
    +
  • More dtypes, including complex values and user-defined dtypes.
  • +
  • Subscripting arrays by other arrays as indices, and by bool arrays as masks.
  • +
  • Starting to reuse Python code from the original numpy.
  • +
+

Some of the things on the near horizon are:

+
    +
  • Better support for scalar data, for example did you know that +numpy.array([True], dtype=bool)[0] doesn't return a bool object? +Instead it returns a numpy.bool_.
  • +
  • Multi-dimensional array support.
  • +
+

If you're interested in helping out, we always love more contributors, +Alex, Maciej, Justin, and the whole PyPy team

+
+
+
+
+ + Anonymous wrote on 2011-10-12 23:34: +
+
+

What is the best way to contact people about this? Our company has some interest in sponsoring this work, but it wasn't clear from this or the donations page how to actually talk to anyone about it. Maybe I'm missing the obvious.

+
+
+
+
+ + Alex wrote on 2011-10-12 23:53: +
+
+

Anonymous: The address to contact is "pypy at sfconservancy.org". Thanks!

+
+
+
+
+ + stan wrote on 2011-10-13 00:14: +
+
+

Yay! Time to put my money where my mouth is. :)

+
+
+
+
+ + Anonymous wrote on 2011-10-14 07:31: +
+
+

What does it mean "Starting to reuse Python code from the original numpy"? If it is copy-paste and something will be changed in numpy git trunc, will it be automatically taken into account by your numpy for PyPy?

+
+
+
+
+ + Luis wrote on 2011-10-15 04:05: +
+
+

This is off topic but, congratulations! You already achieved Unladen Swallow's performance goal of 5x faster than cpython on average.

https://code.google.com/p/unladen-swallow/wiki/ProjectPlan#Performance

https://speed.pypy.org/

+
+
+
+
+ + Anonymous wrote on 2011-10-17 08:51: +
+
+

You probably have already seen that, but there is an interesting comment from Travis Oliphant about the porting of numpy to pypy :

https://technicaldiscovery.blogspot.com/2011/10/thoughts-on-porting-numpy-to-pypy.html

+
+
+
+
+ + D wrote on 2011-10-17 10:50: +
+
+

You haven't answered my question about reuse numpy code for 3 days, I guess because you don't know it overall. I'm not 100% agree with neither Travis opinion nor Stefan M comment from https://morepypy.blogspot.com/2011/09/py3k-for-pypy-fundraiser.html , but in answer to Stefan M you say "Since this is open source, people either work on what they like, because it's fun or scratches their itch" and "Improving the [C extensions] support is boring and frustrating". Guys, AFAIK you received FP7 support for developing some soft for users, not for fun. You should spend some efforts for boring yet important work toward the mentioned things, if you would like to obtain further increase of users number and finance support. Also, clarification about reusing CPython numpy code is also highly appreciated.

+
+
+
+ +

More Compact Lists with List Strategies

+ +
+

Since we are getting closer to merging the list-strategy branch I want to try to explain this memory optimization today.

+

Datatypes in PyPy are stored as W_<type>Objects (e.g. W_StringObject to represent strings, W_IntObject to represent ints). This is necessary due to the dynamic nature of Python. So the actual value (e.g. string, integer) is stored inside that box, resulting in an indirection. When having a large amount of such boxed objects, for example in a list, the wasted memory can become quite large.

+

If you have a closer look at such lists, you will see that in many of them only one type of data is stored and only few (and smaller) lists store mixed types. Another thing to observe is that those lists rarely change the types of the objects they contain at runtime. For instance a list of a million integers is very unlikely to suddenly get a string appended to it.

+

List Strategies

+

The goal of this work is to write an optimization that exploits this behaviour. Instead of wrapping all items in a list, we implement lists in a way that they are optimized for storing certain (primitive) datatypes. These implementations store the content of the list in unwrapped form, getting rid of the extra indirection and wrapper objects.

+

One approach would be to add a level of indirection, making each W_ListObject instance point to another object that stores the actual content. For this other object, several implementations would exist, for every datatype we want to store without wrapping it (as well as a general one that deals with arbitrary content). The data layout would look something like this:

+

This approach has the problem that we need two indirections to get to the data and that the implementation instances need memory themselves.

+

What we would like to do is to make the W_ListObject point to an RPython list directly, that contains either wrapped or unwrapped data. This plan has the problem that storing different unwrapped data is not directly possible in RPython.

+

To solve the problem, we use the rerased RPython library module. It allows us to erase the type of an object, in this case lists, and returns something similar to void-star in C, or Object in Java. This object is then stored on the W_ListObject in the field storage. If we want to work with the list, for example to append or delete items, we need to unerase the storage again.

+

Example for rerase:

+
storage = erase([1 ,2 ,3 ,4])
+# storage is an opaque object that you can do nothing with
+....
+l = unerase(storage)
+l.clear()
+
+

Now that we know how to make the W_ListObject point directly to wrapped or unwrapped data, we need to find out how to actually do any operations on this data. This can be accomplished by adding another field to our W_ListObject. This field points to a ListStrategy object. The actual implementation of W_ListObject is now deferred to those ListStrategy classes. For instance, a W_ListObject which holds only integers will use the IntegerListStrategy.

+

When the type of content is being changed, we need to change the used strategy as well as the storage in compatible ways. For example when we add a string to the list of integers we need to switch to the ObjectListStrategy and change the storage to be a list of wrapped objects. Thus the currently used strategy always knows what to do with what is currently in the storage.

+

As you can see, we now save one level of indirection by storing some of the data unwrapped. Of course each operation on a list needs to go via the strategy, but since we save one indirection for each element stored in that list and the Strategy classes are singletons, the benefits outweigh the costs.

+

Currently there are only strategies for integers and strings since many lists seem to have these datatypes. Other strategies, e.g. for floats and unicode strings, are planned. We also implemented two special strategies for empty lists and range-lists. The EmptyListStrategy's storage is None. If objects are added to the list we just switch to the appropriate strategy (determined by the item's type). RangeListStrategies do not store any items at all. Instead they only store values describing the range of the list, i.e. start, step and length. On any operation that changes the data of the list we switch to the IntegerListStrategy.

+

A nice side-effect of storing unwrapped datatypes is that we can implement optimized methods for certain cases. For instance, since comparison of unwrapped integers is now much faster than comparison between arbitrary objects, we can rewrite the sorting methods for lists containing integers.

+

Microbenchmarks

+

Finally here is an early overview of the memory consumption of different Python implementations: CPython, PyPy and PyPy-list which uses list-strategies. To demonstrate how powerful list-strategies can be in the best case, we wrote benchmarks that create a list of integers, a list of strings and a range-list, each with one million elements, and then read out the heap size of the process as reported by the OS.

+

The results are as follows:

+

The savings on integers and strings in this ideal case are quite big.

+

The benchmark for range-lists is a little unfair, since in CPython one could accomplish the same memory behaviour using xrange. However, in PyPy users won't notice that internally the list does not store all items, making it still possible to use all list methods, such as append or delete.

+

Conclusion

+

We hope that list strategies bring memory savings for applications that use homogeneous lists of primitive types. Furthermore, operations on such lists tend to be somewhat faster as well. This also integrates well with the JIT. The list strategies optimizations will be merged into PyPy's default branch at some point in the next months. An equivalent optimization for dictionaries has already been merged (and is part of PyPy 1.6); one for sets is coming in the future.

+

Lukas Diekmann and Carl Friedrich Bolz

+
+
+
+
+ + Winston Ewert wrote on 2011-10-11 13:10: +
+
+

Nice.

But isn't there a small change in semantics to do that? If I push a python int object onto a list and then pop it back off I'll have the exact same object. But if you unwrap the object and store it as a plain int and then repop it I don't have the exact same object. I've got a new object.

+
+
+
+
+ + Anonymous wrote on 2011-10-11 13:20: +
+
+

It seems to be very nice.

By the way, are object attributes optimized the same way? Objects of the same class can be expected to frequently store data of the same type in the same attribute.
I've found a nearly-year-old post on maps ( https://morepypy.blogspot.com/2010/11/efficiently-implementing-python-objects.html ), but it does not mention attribute value types... has this idea been considered?

+
+
+
+
+ + Unknown wrote on 2011-10-11 13:25: +
+
+

I can see float support presenting some interesting challenges being emblematic of a wider issue. It would be very easy for someone to have a list of "floats" but if they populated it with any literals, most likely they'll be integer literals, missing any of the float optimization.

For most apps this won't be a problem but if someone is trying to optimize their application they might see this as a performance heisenbug. For example they write a hard coded list and it is slow, read it from a file and it is fast.

One approach is for there to be a document on some website (that gets out of date) that lists PyPy micro-optimizations. Someone would then need continually audit their code against that list. This doesn't seem practical.

I've seen posted some low level visualization tools. I'd be curious how practical it would be to have a higher level profiler tool integrate with the JIT to detect patterns like the list of mixed float/int situation to flag these micro-optimizations in a more automated fashion.

+
+
+
+
+ + Alex wrote on 2011-10-11 13:27: +
+
+

Winston: Indeed, very clever of you to notice :) However, we noticed as well, going forward integers (and other primitives) identity will be a function of their value, not the identity of their box. This means that for all ints `i is x` if and only if `i == x`. This also means that `id()` is now a function of value for primitives. Don't rely on that though! Just like we don't want people relying on `i is x` if `i == x and -100 < i < 200`, we don't want people relying on this either.

Anonymous:

Yes, this is definitely a consideration, I keep meaning to make time to work on this.

+
+
+
+
+ + evilpies wrote on 2011-10-11 14:08: +
+
+

Well interesting, SpiderMonkey is considering to implement something like this, because NaN-boxing usually wastes a lot of memory.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-10-11 19:52: +
+
+

@Ed I think a float list can accommodate a limited set of integer values (those that can be represented correctly when interpreted as float) without any issue. You would then however need to tag which one is integer and which one is float, having to keep a bitmap. That's certainly possible, but a bit of a mess.

+
+
+
+
+ + Alex wrote on 2011-10-11 20:23: +
+
+

fijal: I think better than obscure hacks like a bitmap allowing integers as floats, perhaps it would be better just to eventually have logging of when you get fallbacks like that. For eventual integration with the jitviewer of course :)

+
+
+
+
+ + Winston Ewert wrote on 2011-10-11 20:54: +
+
+

A general runtime warning system that could say things like: "list of floats decaying to list of objects because of adding int", "two ints being compared via is", etc. might be useful. That could handle any number of situations with surprising semantics or performance.

+
+
+
+
+ + Anonymous wrote on 2011-10-11 21:47: +
+
+

This is very interesting. I have been thinking along somewhat similar lines for a while (but for performance reasons, rather than memory size), and so have already reviewed how I use lists in my own code. In my own programs, having non-uniform data types in a list is extremely rare. However, some lists are lists of lists (or tuples or dictionaries). The most common large lists however tend to be lists of strings.

1) If I correctly understand your explanation of what you are doing, your "list strategies" are effectively marking uniform lists as being either of one of a few known basic types (e.g. IntegerListStrategy), or just a traditional list of objects. Is that correct?

2) Do you think there are any meaningful performance optimisations which could be gained when the list type is known in advance?

3) What about built-in functions such as all(), any(), len(), min(), max(), etc? Would they be able to make use of this to improve their performance?

4) Would the underlying array data format be exposed for people who want to write extensions making direct use of it (e.g. for things like SIMD libraries)?

5) Could this allow a list to be in shared memory and directly accessed by another program?

6) Would the new list format be compatible with a "memoryview" (as str and bytearray are)?

7) My own earlier thoughts had involved marking a list as being of a uniform or non-uniform data when the list is created or altered, and using optimised code for the expected type for uniform lists. One sticky point however was threading, as a different type could be appended in another thread, which means that the consuming function would have to somehow be aware of this. Would your concept have a problem with threading if appending a string to an integer list suddenly required changing the underlying list strategy while another thread was accessing the same list?

8) Python 3.x seems to favour iterators over creating lists (e.g. map, filter, range are replaced by what used to be imap, ifilter, and xrange), and generators were introduced to complement list comprehensions in order to save memory. Does this have any implications for what you are doing?

9) Could your list concept be applied by the CPython developers to CPython? This might help ensure that any subtle semantic issues which arise as a result apply equally to CPython, rather than having people call them "Pypy bugs".

10) What about changing the Python language semantics to allow a user to specify that a list must be of a specific uniform type, and raising a type error if an element(s) of an unexpected type is added to the list? This is actually a language feature that I would like to have in order to catch errors without having to write code to examine each individual data element (as that can be slow and error prone in itself).

11) Finally, why is there such a large start-up memory use in your micro-benchmarks when comparing Pypy-list to CPython? Is this just general overhead from Pypy itself, or is that due to something related to converting the list format to a particular "list strategy"?

+
+
+
+
+ + Alex wrote on 2011-10-11 23:14: +
+
+

Anonymous: Wow a lot of questions, I'll try to answer them :)

1) Yes.

2) Probably not, you get the most performance gains when you have a large list, and if it's large the very-very-very-small initial transition is amortized over many elements.

3) Many of those are pure-python and will thus automatically gain these benefits, max() and min() unfortunately are not.

4) Probably not, we don't expose this data in any other place nor do we have any APIs for it.

5) I suppose in theory, again we have no API for it.

6) No, it wouldn't be, since that's not a part of the list API. We don't define the language, we just implement it (faster).

7) No, there's no problem with this, you simply need to lock (or whatever the equivalent in STM is) the list and do the modifications.

8) No, I don't think it does.

9) Yes, it could be applied to CPython with slightly more difficulty, and it would see the memory gains. However, it would see performance losses (as you do with the array module on CPython) because it would need to box/unbox at every interaction, whereas the JIT is able to remove that.

10) Propose it to python-ideas, we don't define the language.

11) I can't understand the charts, so I can't answer this one.

+
+
+
+
+ + Anonymous wrote on 2011-10-12 03:15: +
+
+

Alex: I'm the anonymous with all the questions. Thank you for your detailed answers. I completely understand that there are side issues that you don't want to deal with at this time.

As for the possible performance effects of the proposed new list data format if applied to CPython, doing the operation: "y = reduce(operator.add, x, 0)" where x is either a list or array of 1,000,000 integers does not seem to produce a measurable difference in speed for me (Python 2.6 on 64 bit Ubuntu). Any differences seem to go either way when the test is repeated, so they seem equivalent within the margin of error. An equivalent for loop yields the same result (except for being slower, of course).

When extracting or replacing slices for lists and arrays (e.g. "y = x[i:i + 50]" and "x[i:i + 50] = y") within a for loop, the array version seems to be significantly *faster* than the list version for large slices (e.g. 50), and approximately the same for small slices (e.g. 5).

Theoretically, yes the implementation with array should always be slower, but I can't seem to get that result when I attempt to measure it. Perhaps I'm doing something wrong, but it appears from the (admittedly minimal) testing that I have done that significant speed penalties for CPython cannot simply be assumed.

I realize that ultimately this isn't a matter for the Pypy developers to concern themselves with, but should the question ever arise I don't think it can be dismissed out of hand.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-10-12 08:18: +
+
+

Some additional thoughts to @Anonymous questions:

3) What about built-in functions such as all(), any(), len(), min(), max(), etc? Would they be able to make use of this to improve their performance?

len does not depend on the content of the list, so it does not win. all, any, min and max could be improved, yes.

7) My own earlier thoughts had involved marking a list as being of a uniform or non-uniform data when the list is created or altered, and using optimised code for the expected type for uniform lists. One sticky point however was threading, as a different type could be appended in another thread, which means that the consuming function would have to somehow be aware of this. Would your concept have a problem with threading if appending a string to an integer list suddenly required changing the underlying list strategy while another thread was accessing the same list?

The JIT does indeed produce special optimized code for the type of list it is currently observing, making operations faster. The fact that another thread could change the type of the list is not a problem, because we have a GIL and thus the JIT knows at which points another thread can run.

10) What about changing the Python language semantics to allow a user to specify that a list must be of a specific uniform type, and raising a type error if an element(s) of an unexpected type is added to the list? This is actually a language feature that I would like to have in order to catch errors without having to write code to examine each individual data element (as that can be slow and error prone in itself).

this already exists. it's called the array module.

11) Finally, why is there such a large start-up memory use in your micro-benchmarks when comparing Pypy-list to CPython? Is this just general overhead from Pypy itself, or is that due to something related to converting the list format to a particular "list strategy"?

The higher startup memory is also there in the PyPy without list strategies, so those have nothing to do with it.

+
+
+
+
+ + Anonymous wrote on 2011-10-13 09:54: +
+
+

I too had trouble understanding the chart. The vertical axis doesn't have negative numbers to represent a delta, just ignore the signs.

The blue area is an algebraically positive area, representing the startup memory use. The yellow area represents the memory use delta after doing the 1e6 items list operations.

+
+
+
+
+ + Armin Rigo wrote on 2011-10-19 13:26: +
+
+

Re list of floats-and-ints: a fully compatible way is to use the NaN-tagging idea from SpiderMonkey, i.e. have a special encoding of NaN that is normally not used, and that still leaves 32 bits of extra information. We would then represent ints in the list as such a NaN-encoded float. (At least it works as long as the integer is not too large, on 64-bit platforms.)

+
+
+
+
+ + Ole Laursen wrote on 2011-11-18 14:55: +
+
+

Neat!

Nice work people. I'm amazed it's so simple to do after all, just switch type based on what the first element is. It must be a big boon for garbage collection, too?

+
+
+
+
+ + Anonymous wrote on 2011-12-16 14:07: +
+
+

The benchmark measures virtual memory (don't know on which architecture); measuring RSS would be more representative of the actual amount of RAM spent storing the data. Presumably it would also be more favourable to PyPy, since moving garbage collection doubles the amount of virtual memory.

+
+
+
+ +
+
+
+ + stan wrote on 2011-09-21 18:21: +
+
+

Two comments:

1. It would be really nice to see a semi-frequently updated progress bar (live is best) with # of dollars and # of contributions for the fundraising.

Part of the excitement created by sites like Kickstarter (and the Humble Indie Bundle and so on) is seeing how your small contribution adds to the whole. A donate button feels more like throwing your money into a dark hole (a very reasonable and worthwhile hole, but a hole nonetheless). Take advantage of some video game psychology and give us that "level up" feedback when we contribute! :)

2. I know you don't want to oversubscribe yourselves, but would you consider doing a similar funding drive for Numpy support? PLEASE???

+
+
+
+
+ + Konstantine Rybnikov wrote on 2011-09-21 18:55: +
+
+

Totally agree with stan about progress bar. Recent novacut's donation campaign showed importance of that a lot (since people saw that they need to hurry up with fundings and did lots of them in last couple of days).

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-09-21 19:30: +
+
+

@stan: 1. progress bar will be coming soon

2. we are actively working on putting up an equivalent page for Numpy support.

+
+
+
+
+ + stan wrote on 2011-09-21 19:45: +
+
+

Awesome! I want to be first in line to pitch $50 into the Numpy jar.

+
+
+
+
+ + Anonymous wrote on 2011-09-21 21:03: +
+
+

Awesome! In fact I regard Python 3 as much more important than any other features you could add now. 10% more performance is not nearly in the same league as Python3 support. Will happily spend some money on this.

+
+
+
+
+ + João Bernardo wrote on 2011-09-22 00:01: +
+
+

Great!! I was waiting for that

+
+
+
+
+ + Anonymous wrote on 2011-09-22 01:03: +
+
+

For complete support that's like $200,000. I understand it's a willing feature, but I don't think the pypy community and followers are that huge.

Btw, nice getting all the benchmarks above CPython 2.6 :)

+
+
+
+
+ + Sojin wrote on 2011-09-22 05:41: +
+
+

Great work guys! I think keeping this amazing project alive is important for the Python eco-system... Here comes my $$.

+
+
+
+
+ + Laurent wrote on 2011-09-22 14:56: +
+
+

I've heard that Py3K support for PyPy will be implemented in Python 2.X anyway. Is that true?

+
+
+
+
+ + Antonio Cuni wrote on 2011-09-22 16:49: +
+
+

@Laurent: to be more precise, py3k will be implemented in RPython, which is indeed a subset of Python 2.

Right now we don't have any plan to port RPython to Python 3: it's not a priority and it won't give any advantage to the PyPy end users.

+
+
+
+
+ + Anonymous wrote on 2011-09-23 00:31: +
+
+

cpython3 is a fork of cpython2, but here you intend to support both versions with the same codebase. Does not this make the task much harder, and peeking into cpython3 code for guidance less useful? Also, isn't it possible that the resulting large set of switch (PYVERSION) {} statements will make the code less readable and maintainable?

Anyway, I have full faith in your assessment of the best approach, but I am still interested in your explanation. :)

+
+
+
+
+ + Zinahe wrote on 2011-09-30 16:25: +
+
+

Just made my donation. GOD SPEED.

I second stan's idea of providing a progress bar showing the overall status of the fundraising.

+
+
+
+
+ + Harald Armin Massa wrote on 2011-09-30 21:28: +
+
+

a) please, please get the pages lac showed in her lightning talk at pycon.uk online.
- There are pictures of people in it, and it is easier to donate to people than to something abstract
- there is text what happened
- there is text that anonymous donation is possible

b) please, work on the feedback. It is CRUCIAL to show the actual state. Giving 5€ and nothing happens is dull. Giving 5€ and a number goes up - good. Giving 500€ and a rendered bar moves a pixel - awesome!

c) I found the Python3PyPy fundraiser easily. I did not find the numpypy fundraiser. Please, put lacs pages up :) if I can vote for them somewhere, please let me know.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-09-30 21:36: +
+
+

@Harald poke lac harder so she deploys it :)

+
+
+
+
+ + Anonymous wrote on 2011-10-05 01:04: +
+
+

It's been a couple of weeks and the progress bar still isn't there. Although there is a link for it that doesn't work.
Please fix this and make it visible without having to click anything.

+
+
+
+
+ + Anonymous wrote on 2011-10-06 23:45: +
+
+

Hi! Please create the same kind of bucket for numpy support. I'm a big fan of Py3k, but I'm an even bigger fan of numpy - and I need it for my work. I'll donate to Py3k now, but I'll donate a bigger sum to both when I see the new bucket.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-10-07 00:10: +
+
+

We're waiting for the final ok of the proposal so it can be said it benefits the public good. Any day now :)

+
+
+
+
+ + Stefan M wrote on 2011-10-08 19:48: +
+
+

* Who needs Python 3 support??? *

It looks like the PyPy project is adding things just to improve something and keep doing something but for whose sake?

What I really need is proper support for C extensions. Without it, people who use Python professionally like myself, cannot switch to PyPy and we are stuck with Cython and/or Psyco.

Who steers the development of Pypy and why would these people refuse to realize what hinders thousands of developers, who would love to use Pypy to make the switch from CPython ???

Please tell me which real software projects use PyPy and for what reason they would need Py3K support!


Go ahead and add more language constructs that you can use to run academic programs even faster and keep ignoring what is really necessary to push Pypy into day-to-day usability

(* frustrated *)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-10-08 20:11: +
+
+

Hi Stefan. There is no one who steers direction in PyPy. Since this is open source, people either work on what they like, because it's fun or scratches their itch. Note that Python 3 work is something that people expressed interest in funding -- if they fund it enough, why wouldn't developers work on it? It's more interesting than most jobs.

With regard to C extensions - it's good enough for many people, like quora to run on PyPy. Improving the support is boring and frustrating, so I don't think anyone would be willing to invest significant amount of his *free time* into that. However, feel free to speak with your money, you know how to find me.

Cheers,
fijal

+
+
+
+
+ + Stefan M wrote on 2011-10-13 03:46: +
+
+

Hi Maciej,

I realize that I came across in a somewhat obnoxious way. Sorry for that - I simply did not realize that PyPy is a true hobbyist project (at least currently).
I wish I could contribute funding but though I am using Python a lot at work, we are a National Lab and struggling to keep our government funding ourselves.

I hope a deep-pocket corporate will fund the Numpy development

Cheers, Stefan

+
+
+
+
+ + Anonymous wrote on 2011-10-26 11:09: +
+
+

i wonder why you don't get (more) funding from google?

you seem to have reached the goal of unladen swallow now and there still is room for improvement.

and it would be peanuts for them anyway. :)

+
+
+
+ +

Wrapping C++ Libraries with Reflection — Status Report One Year Later

+ +
+

Well over a year ago, work was started on the cppyy module which lives in the +reflex-support branch. +Since then, work has progressed at a varying pace and has included a recent +sprint in Düsseldorf, last July.

+

Let's first take a step back and recap why we're interested in doing this, +given that it is perfectly possible to use C++ through generated bindings and +cpyext. +cppyy makes use of reflection information generated for the C++ classes of +interest, and has that reflection information available at run time. +Therefore, it is able to open up complex C++ types to the JIT in a +conceptually similar manner as simple types are open to it. +This means that it is possible to get rid of a lot of the marshalling layers +when making cross-language calls, resulting in much lower call overhead than +is possible when going through the CPython API, or other methods of wrapping.

+

There are two problems that need to be solved: C++ language constructs need to +be presented on the Python side in a natural way; and cross-language impedance +mismatches need to be minimized, with some hints from the user if need be. +For the former, the list of mapped features has grown to a set that is +sufficient to do real work. +There is now support for:

+
+
    +
  • builtin, pointer, and array types
  • +
  • namespaces, classes, and inner classes
  • +
  • global functions, global data
  • +
  • static/instance data members and methods
  • +
  • default variables, object return by value
  • +
  • single and multiple (virtual) inheritance
  • +
  • templated classes
  • +
  • basic STL support and pythonizations
  • +
  • basic (non-global) operator mapping
  • +
+
+

The second problem is harder and will always be an on-going process. +But one of the more important issues has been solved at the recent Düsseldorf +sprint, namely, that of reclaiming C++ objects instantiated from the Python +side by the garbage collector.

+

Performance has also improved, especially that of the nicer "pythonized" +interface that the user actually sees, although it still misses out on +about a factor of 2.5 in comparison to the lower-level interface (which has +gotten uglier, so you really don't want to use that). +Most of this improvement is due to restructuring so that it plays nicer with +the JIT and libffi, both of which themselves have seen improvements.

+

Work is currently concentrated on the back-ends: a CINT back-end is underway +and a LLVM/CLang pre-compiled headers (PCH) back-end is planned. +The latter is needed for this code to be released in the wild, rather than +just used in high energy physics (HEP), as that would be easier to support. +Also, within HEP, CLang's PCH are foreseen to be the future format of +reflection information.

+

At the end of the Düsseldorf sprint, we tried a little code that did something +actually "useful," namely the filling of a histogram with some random values. +We did get it to work, but trying cppyy on a large class library showed +that a good warning system for such things like missing classes was sorely +needed. +That has been added since, and revisiting the histogram example later, here is +an interesting note: the pypy-c run takes 1.5x the amount of time of that +of the compiled, optimized, C++ code. +The run was timed start to finish, including the reflection library loading +and JIT warm-up that is needed in the case of Python, but not for the compiled +C++ code. +However, in HEP, scientists run many short jobs while developing their +analysis codes, before submitting larger jobs on the GRID to run during lunch +time or overnight. +Thus, a more realistic comparison is to include the compilation time needed +for the C++ code and with that, the Python code needs only 55% of the time +required by C++.

+

The choice of a programming language is often a personal one, and such +arguments like the idea that C++ is hard to use typically do not carry much +weight with the in-crowd that studies quantum field dynamics for fun. +However, getting the prompt with your analysis results back faster is a sure +winner. We hope that cppyy will soon have progressed far enough to make it +useful first to particle physicists and then other uses for wrapping C++ +libraries.

+ +Wim Lavrijsen, Carl Friedrich Bolz, Armin Rigo +
+
+
+
+ + René Dudfield wrote on 2011-08-31 10:15: +
+
+

Hi,

nice result. Wrapping C++ code can be even more tiresome than C, especially with large code bases. This will be a very welcome tool.


This question has probably been answered before... but I ask anyway since I couldn't find the answer.

Can the jit information be saved, so it does not need to be worked out again? Assuming all of the dependencies have not changed (.py files, pypy itself, .so files etc). Maybe if location independent code can not be saved, then trace hints or some higher level structure could be saved to inform the jit about what traces to jit? That sounds like a solution to jit warm up for code that is used repeatedly.

cu.

+
+
+
+
+ + Wim Lavrijsen wrote on 2011-08-31 18:53: +
+
+

Hi,

thanks! :)

There was a recent thread on saving JIT information on pypy-dev:

https://mail.python.org/pipermail/pypy-dev/2011-August/008073.html

and the conclusion there was that it is too hard to be of benefit because too many parts contain addresses or calculated variables that were turned into constants.

For our (HEP) purposes, it would be of limited benefit: in the development cycle, the .py's would change all the time, and it is a safe assumption that the user codes that are being developed are the most "hot." If there is anything in the supporting code that is "hot" (most likely in the framework) it'd be in C/C++ at that point anyway.

Rather, I'd like to have an easy way of letting the user determine which portions of the code will be hot. Saving not having to run a hot loop 1000x in interpreted mode before the JIT kicks in, is going to be more valuable in scientific codes where the hot loops tend to be blatantly obvious.

Cheers,
Wim

+
+
+
+
+ + Anonymous wrote on 2011-08-31 19:49: +
+
+

This is great. I have been looking for just such a tool to wrap C++ numerical code.

I guess I have two questions:
1. Is there any documentation on how to use it?
2. It is very important to be able to translate between NumPy data structure and C++ data structure for me, so is there any plan to make this easy?

Thanks for great work.

+
+
+
+
+ + Wim Lavrijsen wrote on 2011-08-31 22:18: +
+
+

Hi,

thanks! :)

ad 1) it's not at the level of being usable in a production environment. I have two known issues to resolve and probably some more unknowns. I've posted a description on pypy-dev, and I'm helping a few patient, very friendly users along. But actual documentation suggests a level of support that currently can't be offered, because all the current (and soon to disappear) caveats would need documenting as well.

ad 2) not sure what data translation you're thinking of, but in the CPython equivalent, support was added for the buffer interface and MemoryView. Those, or something similar, will be there so that numpy arrays etc. can be built from return values, from public data members, and passed into function calls as arguments. Those are not translations, but rather extraction of the data pointers (which is typically intended and the most efficient, to be sure).

Cheers,
Wim

+
+
+
+
+ + wholesale electronics wrote on 2011-12-17 01:23: +
+
+

Maybe if location independent code can not be saved, then trace hints or some higher level structure could be saved to inform the jit about what traces to jit?

+
+
+
+ +

We need Software Transactional Memory

+ +
+

Hi all. Here is (an extract of) a short summary paper about my current position on +Software Transactional Memory as a general tool in the implementation +of Python or Python-like languages. Thanks to people on IRC for discussion on making +this blog post better (lucian, Alex Gaynor, rguillebert, timonator, Da_Blitz). +For the purpose of the present discussion, we are comparing Java with Python +when it comes to multi-threading.

+ +

The problem in complex high-level languages

+

Like Java, the Python language gives guarantees: it is not acceptable +for the Python virtual machine to crash due to incorrect usage of +threads. A primitive operation in Java is something like reading or +writing a field of an object; the corresponding guarantees are along the +lines of: if the program reads a field of an object, and another thread +writes to the same field of the same object, then the program will see +either the old value, or the new value, but not something else entirely, +and the virtual machine will not crash.

+

Higher-level languages like Python differ from Java by the fact that a +"primitive operation" is far more complex. It may for example involve +looking in several hash maps, perhaps doing updates. In general, it is +completely impossible to map every operation that must be atomic to a +single processor instruction.

+ +

Jython: fine-grained locking

+

This problem has been solved "explicitly" in the Jython interpreter that +runs on top of Java. The solution is explicit in the following sense: +throughout the Jython interpreter, every single operation makes careful +use of Java-level locking mechanisms. This is an application of +"fine-grained locking". For example, operations like attribute lookup, +which need to perform look-ups in a number of hash maps, are protected +by acquiring and releasing locks (in __getattribute__).

+

A draw-back of this solution is the attention to detail required. +If even one place misses a lock, then there is either a +bug --- and such bugs occur in cases that are increasingly rare and hard +to debug as the previous bugs are fixed --- or we just file it under "differences +from CPython". There is however the risk of +deadlock, if two threads attempt to lock the same objects in different +order.

+ +

In practice, the situation is actually not as bad as +I may paint it: the number of locks in Jython is reasonable, and allows for +all the "common cases" to work as expected. +(For the uncommon cases, see below.)

+ +

Performance-wise, the Java virtual machine itself comes with locks that +have been heavily optimized over a long period of time, so the +performance is acceptable. However if this solution were coded in C, it +would need a lot of extra work to optimize the locks manually (possibly +introducing more of the subtle bugs).

+ +

CPython: coarse-grained locking

+

CPython, the standard implementation of Python in C, took a different +and simpler approach: it has a single global lock, called the Global +Interpreter Lock (GIL). It uses "coarse-grained locking": the lock is +acquired and released around the whole execution of one bytecode (or +actually a small number of bytecodes, like 100). This solution is +enough to ensure that no two operations can conflict with each other, +because the two bytecodes that invoke them are themselves +serialized by the GIL. It is a solution which avoids --- unlike Jython +--- writing careful lock-acquiring code all over the interpreter. It +also offers even stronger guarantees: every bytecode runs entirely +atomically.

+

Nowadays, the draw-back of the GIL approach is obvious on multi-core +machines: by serializing the execution of bytecodes, starting multiple +threads does not actually let the interpreter make use of more than one core.

+

PyPy, the Python implementation in Python, takes the same approach so +far.

+ +

Existing usage

+

As we have seen, we have the following situation: the existing Python +language, as CPython implements it, offers very strong guarantees about +multi-threaded usage. It is important to emphasize that most existing +multi-threaded Python programs actually rely on such strong guarantees. +This can be seen for example in a program that takes a populated list +and does in several threads:

+
+next_item = global_list.pop()
+
+

This implicitly relies on the fact that pop() will perform atomic +removal from the list. If two threads try to pop() from the same list +at the same time, then the two operations will occur in one order or the +other; but they will not e.g. return the same object to both threads or +mess up the internal state of the list object.

+

With such an example in mind, it should be clear that we do not want a +solution to the multi-core issue that involves dropping these strong +guarantees. It is ok however to lower the barrier, as Jython does; but +any Python implementation must offer some guarantees, or not offer +multi-threading at all. This includes the fact that a lot of methods on +built-in types are supposed to be atomic.

+ +

(It should be noted that not offering multi-threading at all is actually +also a (partial) solution to the problem. Recently, several "hacks" +have appeared that give a programmer more-or-less transparent access to +multiple independent processes (e.g. multiprocessing). While these provide appropriate +solutions in some context, they are not as widely applicable as +multi-threading. As a typical example, they fail to apply when the +multiple cores need to process information that cannot be serialized at +all --- a requirement for any data exchange between several processes.)

+ +

Here is an example of how Jython's consistency is weaker than CPython's GIL. +It takes uncommon examples to show it, and the fact that they do not work +like a CPython programmer expects them to is generally considered as an +implementation detail. Consider:

+
Thread 1:  set1.update(set2)
+Thread 2:  set2.update(set3)
+Thread 3:  set3.update(set1)
+

Each operation is atomic in the case of CPython, but decomposed in two steps +(which can each be considered atomic) in the case of Jython: reading from the +argument, and then updating the target set. Suppose that initially +set1 = {1}, set2 = {2}, set3 = {3}. On CPython, independently of +the order in which the threads run, we will end up with at least one of the +sets being {1, 2, 3}. On Jython, it is possible that all +three sets end up as containing two items only. The example is a bit +far-fetched but should show that CPython's consistency is strictly stronger +than Jython's.

+ +

PyPy

+

PyPy is a Python interpreter much like CPython or Jython, but the way it +is produced is particular. It is an interpreter written in RPython, a +subset of Python, which gets turned into a complete virtual machine (as +generated C code) automatically by a step called the "translation". In +this context, the trade-offs are different from the ones in CPython and +in Jython: it is possible in PyPy, and even easy, to apply arbitrary +whole-program transformations to the interpreter at "translation-time".

+

With this in mind, it is possible to imagine a whole-program +transformation that would add locking on every object manipulated in +RPython by the interpreter. This would end up in a situation similar to +Jython. However, it would not automatically solve the issue of +deadlocks, which is avoided in the case of Jython by careful manual +placement of the locks. (In fact, being deadlock-free is a global +program property that cannot be automatically ensured or verified; any +change to Jython can in theory break this property, and thus introduce +subtle deadlocks. The same applies to non-atomicity.)

+

In fact, we can easily check that if the interpreter accesses (for +both reading and writing) +objects A and B in a bytecode of thread 1, and objects B and A (in the +opposite order) in a bytecode of thread 2 --- and moreover if you need to +have accessed the first object before you can decide that you will need +to access the second object --- then there is no way (apart from the GIL) to avoid +a deadlock while keeping the strong guarantee of atomicity. Indeed, if +both threads have progressed to the middle of the execution of their +bytecode, then A has already been mutated by thread 1 and similarly B +has already been mutated by thread 2. It is not possible to +successfully continue running the threads in that case.

+ +

Using Software Transactional Memory

+

Software Transactional Memory (STM) is an approach that gives a solution +to precisely the above problem. If a thread ended up in a situation +where continuing to run it would be wrong, then we can abort and +rollback. This is similar to the notion of transaction on databases. +In the above example, one or both threads would notice that they are +about to run into troubles and abort. This means more concretely that +they need to have a way to restart execution at the start of the +bytecode, with all the side-effects of what they did so far being either +cancelled or just not committed yet.

+

We think that this capacity to abort and rollback is the missing piece +of the puzzle of multi-threaded implementations of Python. +Actually, according to the presentation of the problem given +above, it is unavoidable that any solution that wants to offer the +same level of consistency and atomicity as CPython would involve +the capacity of aborting and rolling back --- which means precisely +that STM cannot be avoided.

+ +

Ok, but why not settle down with Jython's +approach and put careful locks left and right throughout the interpreter? +Because (1) we would have to consider every operation's atomicity and make decisions +(or steal Jython's) and document them +here; +(2) it would also be really a lot of work, to optimize these locks e.g. with the +JIT as well as the JVM does; and (3) it is not the PyPy way to require manually +tweaking your code everywhere for a feature that should be orthogonal. Point +(3) is probably the most important here: you need to redo the work for every +language you implement in PyPy. +It also implies my own point (4): it is not fun :-)

+ +

In more detail, the process would work as follows. (This gives an +overview of one possible model; it is possible that a different model +will end up being better.) In every thread:

+
    +
  • At the start of a bytecode, we start a "transaction". This means +setting up a thread-local data structure to record a log of what +occurs in the transaction.
  • +
  • We record in the log all objects that are read, as well as the +modifications that we would like to make.
  • +
  • During this time, we detect "read" inconsistencies, shown by the +object's "last-modified" timestamp being later than the start time +of the current transaction, and abort. This prevents the rest of +the code from running with inconsistent values.
  • +
  • If we reach the end of the bytecode without a "read" inconsistency, +then we atomically check for "write" inconsistencies. These are +inconsistencies which arise from concurrent updates to objects +in the other threads --- either our "write" objects, or our "read" +objects.
  • +
  • If no inconsistency is found, we "commit" the transaction by copying +the delayed writes from the log into main memory.
  • +
+

The points at which a transaction starts or ends are exactly the +points at which, in CPython, the Global Interpreter Lock is +respectively acquired and released. If we ignore the fact that (purely for +performance) CPython acquires and releases the GIL only every N bytecodes, +then this means:

+
    +
  1. Before any bytecode we acquire the GIL (start a transaction), and after +the bytecode we release it (ends the transaction); and +
  2. +
  3. Before doing an external call to the C library or the OS we release the GIL +(ends the transaction) and afterwards re-acquire it (start the next transaction). +
  4. +
+So in particular this model is well suited to the STM condition that we cannot +do anything in a transaction that cannot be rolled back, like --- precisely --- +system calls. Indeed, by construction, these system calls occur outside a +transaction, because in CPython they occur with the GIL released. + +

Performance

+

A large number of implementation details are still open for now. +From a user's point of view (i.e. the programmer using Python), +the most relevant one is the overall performance impact. We +cannot give precise numbers so far, and we expect the initial +performance to be abysmally bad (maybe 10x slower); however, with +successive improvements to the locking mechanism, to the global +program transformation inserting the locks, to the garbage +collector (GC), and to the Just-in-Time (JIT) compiler, we +believe that it should be possible to get a roughly reasonable +performance (up to maybe 2x slower). For example, the GC can +maintain flags on the objects to know that they did not escape +their creation thread, and do not need any logging; and the JIT +compiler can aggregate several reads or writes to an object into +one. We believe that these are the kind of optimizations that +can give back a lot of the performance lost.

+ +

The state of STM

+

Transactional Memory is itself a relatively old idea, originating +from a 1986 paper by Tom Knight. At first based on hardware +support, the idea of software-only transactional memory (STM) was +popularized in 1995 and has recently been the focus of intense +research.

+

The approach outlined above --- using STM to form the core of the +implementation of a language --- is new, as far as we know. So +far, most implementations provide STM as a library feature. It +requires explicit usage, often in the form of explicitly +declaring which objects must be protected by STM (object-based +STMs). It is only recently that native STM support has started +to appear, notably in the Clojure language.

+

STM is described on Wikipedia as an approach that "greatly +simplifies conceptual understanding of multithreaded programs and +helps make programs more maintainable by working in harmony with +existing high-level abstractions such as objects and modules." +We actually think that these benefits are important enough to +warrant being exposed to the Python programmer as well, instead +of being used only internally. This would give the Python +programmer a very simple interface:

+
+with atomic:
+    <these operations are executed atomically>
+
+

(This is an old idea. Funny how back in 2003 people, including me, thought that this was a hack. Now I'm writing a blog post to say "it was not a hack; it's explicitly using locks that is a hack." I'm buying the idea of composability.)

+ +

From a practical point of view, I started looking seriously at +the University of Rochester STM (RSTM), a C++ library that has +been a focus of --- and a collection of results from --- recent +research. One particularly representative paper is +A +Comprehensive Strategy for Contention Management in Software +Transactional Memory by Michael F. Spear, Luke Dalessandro, +Virendra J. Marathe and Michael L. Scott.

+ +

Conclusion

+

Taking these ideas and applying them in the context of an +implementation of a complex high-level language like Python comes +with its own challenges. In this context, using PyPy makes sense +as both an experimentation platform and as a platform that is +recently gaining attention for its performance. The alternatives +are unattractive: doing it in CPython for example would mean +globally rewriting the interpreter. In PyPy instead, we write it +as a transformation that is applied systematically at translation-time. +Also, PyPy is a general platform for generating fast interpreters +for dynamic languages; the STM implementation in PyPy would work +out of the box for other language implementations as well, instead +of just for Python.

+

Update: +

+
    +
  • This is mostly me (Armin Rigo) ranting aloud and trying experiments; +this post should not be confused as meaning that the whole PyPy team +will now spend the next years working on it full-time. +As I said it is orthogonal to the actual Python interpreter, and it is in +any case a feature that can be turned on or off during translation; I know +that in many or most use cases, people are more interested in getting a +fast PyPy rather than one which is twice as slow but scales well. +
  • +
  • Nothing I said is really new. For proof, see +Riley and Zilles (2006) +as well as Tabba (2010) who both experimented with Hardware Transactional Memory, turning CPython or PyPy interpreter's GIL into start/end transactions, as I describe here. +
  • +
+
+
+
+
+ + Anonymous wrote on 2011-08-23 13:40: +
+
+

How to handle composability ("with atomic") when something inside composed block turns out to make a system call? With explicit locking, this shouldn't be a problem.

+
+
+
+
+ + ajuc wrote on 2011-08-23 14:43: +
+
+

Re sys calls in transactions:

In clojure it is solved by requiring that code in transaction is side effect free.

You can tag code as having side effects by macro "io!" :

(defn launch-missiles
"Launch attack on remote targets with everything we have."
[]
(io!
(doseq [missile (all-silos)]
(fire missile))))

Then if you try to execut this code in transaction clojure will complain, because you can't really rollback launching nuclear missiles :)

+
+
+
+
+ + ajuc wrote on 2011-08-23 14:49: +
+
+

Ehh, I should've thought more before posting.

Code in transactions need not be side effect free - in fact in clojure side effects are the whole point of transactions. But this code should only change STM controlled variables, not outside world.

And "io!" macro is for marking code that changes things outside of STM.

Sorry for confusion.

+
+
+
+
+ + Armin Rigo wrote on 2011-08-23 14:56: +
+
+

Here are my current hacks in C, based on RSTM: https://bitbucket.org/arigo/arigo/raw/default/hack/stm/c , from the repo https://bitbucket.org/arigo/arigo .

+
+
+
+
+ + Thomas Schilling wrote on 2011-08-23 14:56: +
+
+

Implementing STM at a core level is certainly a nice research topic, but I wonder whether it's the best way forward for Python.

STM works well in Haskell because it has the type system to enforce several constraints. Also most data is immutable in Haskell, so threading is mostly safe by default.

Most Python objects are mutable (by default), so users have to be very careful when using multi-threading. STM gives you a nice, composable primitive to protect your critical sections, but it does not tell where your critical sections are.

You dismiss multiprocessing because of serialization issues, but what about multiprocessing within the same process? You have a VM already, so my guess would be that it wouldn't be that hard to implement software processes (a la Erlang). Sure, using message passing may lead to a fair amount of copying, but I it seems to be much easier to implement and easier to use than shared-memory concurrency + STM.

+
+
+
+
+ + Armin Rigo wrote on 2011-08-23 15:23: +
+
+

@Thomas Schilling: I don't see how having a "multiprocessing" that uses the same process, rather than different processes, makes a difference. In both cases you need to write your threading code specially and care about explicitly transferring objects via shared memory --- either to another OS thread in the same process, or to a different process altogether.

+
+
+
+
+ + René Dudfield wrote on 2011-08-23 16:04: +
+
+

closures

+
+
+
+
+ + Sam Wilson wrote on 2011-08-23 16:32: +
+
+

I'm with illume... look at what Apple has done with blocks. This seems like a very efficient way forward.

Separately, you are missing something about the Java-side.

For many of the data structures in Java there are atomic and non-atomic versions. That is, when you are using a data structure on a single thread, you grab the non-atomic version. This way, you don't pay for the overhead of the locking. But, when you are sharing a data structure between threads, you use the atomic version. As a by-product of history, though it is a nice by-product, you usually get the atomic version by default. That is to say, you have to go looking for trouble by explicitly asking for the non-atomic version.

By baking this into the language, you are forcing a single policy on all programs, rather than letting the programmer choose what policy is going to be best in that scenario. Either that, or they will be forced to put code guards all over the place.

To me, it seems like the language/runtime should provide the most basic of atomic operations, and the run-time library on top should provide the policy. That's the Java approach, in a nutshell. It gives the programmer flexibility and keeps the core runtime simple and easier to optimize.

Granted, you want a high-level language where the programmer doesn't make a lot of these decisions. So... looking at your own arguments... you are expecting an initial 10x performance hit relative to the current GIL-python approach, with hopes of getting it down to 2x performance... If that's the case, why not just stick with the GIL and have Python programmers take advantage of multiprocessing by creating co-operative programs using a message passing API. In some ways, it's a little more TAUP to do it that way, isn't it?

+
+
+
+
+ + nekto0n wrote on 2011-08-23 16:37: +
+
+

What about replaying syscalls? Is it possible that such situation will happen?

+
+
+
+
+ + Armin Rigo wrote on 2011-08-23 16:45: +
+
+

@Anonymous: this case can be handled on a case-by-case basis (e.g. special-casing "prints" to buffer), but it also has a general solution: we turn the transaction into an "inevitable" transaction, i.e. one which cannot fail.

I already have support for this in my demo code, because it is needed to handle the cases where the nesting of the C program is such that setjmp/longjmp can no longer work. The typical example is the RETURN_VALUE bytecode. It starts a transaction, returns to the caller by popping off some C frames, then ends the transaction in the caller. When we return from the C frame of the callee, in the middle of the transaction, we notice that we won't have the setjmp around any longer, so we are not allowed to abort and rollback any more.

Inevitable transactions have the property of being "a bit like" a GIL in the sense that you can only have one in total, and other transactions cannot commit before it does. In case of the RETURN_VALUE, it's a very short transaction so it shouldn't really be a problem. For the case of a user-specified "with atomic:" block, it can make all the other threads pause. Not ideal, but at least better than nothing...

+
+
+
+
+ + TomV wrote on 2011-08-23 16:49: +
+
+

Could you explain a bit more what PyPy currently does to prevent these kinds of problems?

+
+
+
+
+ + nekto0n wrote on 2011-08-23 16:52: +
+
+

@TomV PyPy uses GIL

+
+
+
+
+ + Armin Rigo wrote on 2011-08-23 16:54: +
+
+

@Sam Wilson: as you know, the PyPy approach is to sacrifice nothing to performance for the user, and get reasonably good (if not exactly Java-level) performance anyway :-)

I should also mention generally that for some programs that I have in mind, using a message-passing API would be a complete rewrite (if it is really possible at all), whereas "just" making them multithreaded can be done. The "translate.py" of PyPy falls into this category. It is a program that heavily use objects within objects within objects in a big non-nicely-separable "mess", and I would not dare to think about how to send parts of this object graph over a messaging API and get back localized updates.

Of course there are also other use cases where you can naturally get a model that plays nicely with message passing.

+
+
+
+
+ + Armin Rigo wrote on 2011-08-23 17:03: +
+
+

@nekto0n: that's not really possible in general, because you need to have the return value of the syscall to decide what to do next, which normally means that you have to really do the syscall.

+
+
+
+
+ + nekto0n wrote on 2011-08-23 17:12: +
+
+

@armin please describe what will happen if 2 threads call write() on single socket object? what exactly should/will happen when iterpreter begins to dispatch CALL bytecode?

I think, it's the most questionable part of STM approach.

+
+
+
+
+ + Rodrigo Araújo wrote on 2011-08-23 17:33: +
+
+

some change in my code

https://paste.pocoo.org/show/463085/

+
+
+
+
+ + Armin Rigo wrote on 2011-08-23 17:34: +
+
+

@nekto0n: nothing particular. The two threads will run the calls in parallel, just like CPython, which calls the send() function without any GIL acquired. What exactly occurs depends on the OS and not on the language.

+
+
+
+
+ + Anonymous wrote on 2011-08-23 17:37: +
+
+

I dissagree to the fact that threads whose transactions would be invalidated, are stealing CPU timeshares from other processes / threads.

STM is an 'egoist' aproach

+
+
+
+
+ + kost BebiX wrote on 2011-08-23 20:54: +
+
+

I know this might sound stupid, but is it possible to enable/disable STM on the fly? Like to enable it only for several threads involved.

+
+
+
+
+ + kost BebiX wrote on 2011-08-23 20:55: +
+
+

Or just not open transaction when there's only 1 thread?

+
+
+
+
+ + Unknown wrote on 2011-08-23 22:43: +
+
+

Hi,

I thought a bit about what you said about Jython. Mostly, I was thinking about a way to do this automatically instead of making it explicitly.

I came up with this first draft: https://github.com/albertz/automatic_object_locking

This will obviously also be very slow but it should be possible to optimize this well (similarly to STM). And I think it is much easier than STM.

-Albert

+
+
+
+
+ + Anonymous wrote on 2011-08-24 07:39: +
+
+

Funny to see how Python eats itself like an Ouroboros. Wrong design decisions that made concurrency almost impossible, dirty hacks ("dirty" compared to, for example, Erlang's approach to SMP — almost linear scalability with a number of cores with 10-20% static overhead thanks to locks) that PyPy team are trying to do to solve problems introduced by Guido's ignorance, and a lot of Python "programmers" that don't understand what SMP is. Python is a ghetto, for real.

+
+
+
+
+ + Paul Harrison wrote on 2011-08-24 07:51: +
+
+

Seems like it should be possible to guarantee performance not much worse than with a GIL.

Am I right in thinking there is a locked section where changes are written to memory? The execution before this is effectively just some speculative computation to to speed up the locked section. If it turns out there's an inconsistency, just execute the locked section as you would normally. If the speculative computation is failing most of the time or is slow, switch to not doing it -- and we are back to GIL performance.

+
+
+
+
+ + Armin Rigo wrote on 2011-08-24 10:29: +
+
+

@all: please come to the #pypy irc channel on irc.freenode.net if you want to discuss this further.

+
+
+
+
+ + Thomas Schilling wrote on 2011-08-24 12:01: +
+
+

@Armin: Each in-memory process would use its own part of the heap so there would be no locking necessary except during message sending. You also don't need to have a 1-to-1 mapping of OS threads to processes. You could schedule N processes onto M OS threads (preferably chosen to match the number of CPU cores).

Of course, if you don't want a message-passing model (as you mentioned in another comment) then fine.

My argument is just that: STM is difficult to implement, difficult to make fast, and it still isn't that easy to use. A message passing model is much easier to implement and easier to use for end users. (You can still get deadlocks, but you could provide libraries for standard communication patterns which you only have to get right once, like Erlang's OTP.)

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-08-24 13:39: +
+
+

I think that there is some confusion here about what the underlying problem that you are trying to solve is.

The underlying (fundamental) problem that transactional memory as a method to replace GIL in Python is trying to solve is: automatic parallelization. That *is* hard.

Mediocre implementations of transactional memory are trivial to implement. Almost anybody can do it. Of course, the performance will be horrible.

If we stick to the idea about the underlying problem (automatic parallelization) and keep it in our minds while thinking, it is clear and utterly obvious that *any* implementation of transactional memory which is slower than serial execution is simply missing the target. The target is, obviously, to run the program faster than serial execution. Otherwise, it would be totally pointless.

Based on this reasoning, it is an *obvious* conclusion that a transactional memory implementation simply cannot be allowed to result in lower performance than serial execution of the code. Allowing lower performance would be completely irrational.

We are humans, not animals. Rationality is our distinctive feature. We have to try to follow rationality.

In light of this, saying that "It is OK for transactional memory to result in 2x slowdown" is irrational. I will write it one more time: accepting 2x slowdown is irrational.

Now, it is crucial to note that there are various kinds of performance measurements. And it is OK to slow down one performance indicator while boosting another performance indicator. For example, in web server environment, it is OK to slow down the delivery of individual web pages by a factor 1.3 - while boosting the number of requests per second by 2.3. That is *rational* and perfectly OK. Also, 3x developer productivity boost would be OK.

Following this example, if transactional memory is allowed to slow down performance of the program (compared to serial execution) by 2x, a person who follows rationally would immediately be drawn to seek for the evidence of a greater-than-2x performance boost in another area of the program.

Omitting developer productivity, how are the PyPy developers going to deliver the *mandatory* greater-than-2x performance boost (in some area) without actually solving the underlying hard problems requiring hard-core code analysis?

If PyPy's transactional memory implementation would serialize calls to the Linux kernel (because it is hard to emulate them in user-space), then this alone would prevent some programs to achieve the more-than-2x performance boost. This is because it is impossible to boost program performance (in some areas, given a particular performance indicator) unless the modified program is allowed to call kernel functions out-of-order or in parallel.

-----

Note: I am *not* saying that PyPy should give up. I am just trying to note that you do not seem to know what you are doing. But I may be wrong.

+
+
+
+
+ + Armin Rigo wrote on 2011-08-24 16:50: +
+
+

Of course the sentence "It is OK for transactional memory to result in 2x slowdown" was meant "on one thread". As soon as your program uses more than 2 threads, on a more-than-2-CPUs machine, then you win.

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-08-24 17:20: +
+
+

I read "Tabba (2010)" (Tabba: Adding Concurrency in Python Using a Commercial Processor’s Hardware Transactional Memory Support) just now.


The article:

- benchmark "iterate": This function is not making calls to other functions. The authors are independently running 16 instances of the "iterate" function on a 16-core CPU using 16 threads. The speedup in respect to unmodified CPython is 7x. The slowdown in respect to 16 CPython processes is 2.2x.

- benchmark "count": This is similar to "iterate". The speedup in respect to unmodified CPython is 4.5x. The slowdown in respect to 16 CPython processes is 3.5x.

- benchmark "pystone": This function is making calls to other functions. 16 instances of the "pystone" function on a 16-core CPU using 16 threads. The speedup in respect to unmodified CPython is 0.9x. The slowdown in respect to 16 CPython processes is 17x.


My analysis:

- iterate: The fact that N instances of this function can run in parallel without any interference can be determined easily. The algorithm to determine this is trivial. (Not to mention, the pointless loop in the function can be replaced by a NOP in a dead-code elimination pass).

- count: same as "iterate".

- pystone: It is not trivial to determine whether multiple instances can run in parallel. So, it should presumably run single-threaded.

- The article is *not* mentioning any real problem that was solved by TM in the case of "iterate", "count" or "pystone". That is logical, since the truth is that there is no real problem to solve here. The benchmark functions can be trivially run in 16 CPython Linux processes - anybody can do that (even your grandma).


My summary:

- In case of the two functions for which it *can* be trivially determined whether their instances can run in parallel, the TM approach results in a 2x-3x slowdown compared to the most basic auto-parallelization algorithm.

- In case of the function for which it *cannot* be trivially determined whether multiple instances can run in parallel, the TM approach running on 4-16 threads achieved 90% (loss of 10%) of the speed of single-threaded CPython without TM. On 1 thread, the TM approach is 2.1x slower.


Bottom line:

Call me crazy, but my conclusion from this article is that TM (at least the TM approach from the article) is not working at all.

+
+
+
+
+ + Greg Wilson wrote on 2011-08-24 17:43: +
+
+

Cool to see this happening. What's also cool is the result reported in Rossbach et al's study (https://www.neverworkintheory.org/?p=122): novices using STM did better in simple programming problems than students using traditional mechanisms, even though they thought they had done worse. "Baroque syntax" may be part of the problem; I'm sure the paper's authors would be happy to chat.

+
+
+
+
+ + Timo wrote on 2011-08-27 13:52: +
+
+

⚛, you're missing a very important bit of the paper. In it, the authors say, that the Rock hardware only holds 256 bytes of write-buffer content, while Riley and Zilles¹ determined the average write-buffer size needed for transactions to not fail prematurely would be "less than 640 bytes", which is almost three times as much as Rock offers.

Thus, the big slowdown that the pystone benchmark experiences could be caused by the shortcomings of the TM built into Rock.

I do have to agree, though, that the "benchmarks" used in the paper are not very satisfactory. However, the magical "simple parallelization algorithm" you summon in your comment would break down quite easily shortly after the complexity of the situation increases by just a bit, would it not?

¹ I only briefly glanced over the paper, so if anyone read it more thoroughly, they can feel free to correct me.

+
+
+
+
+ + Unknown wrote on 2011-08-28 00:01: +
+
+

I thought Erlang successfully solved this problem years ago? And I don't think anything scales better than it. So why aren't we just copying them? Message passing, where each thread or process share absolutely nothing, is the sanest and safest way to do concurrent and multi-threaded programming. I mean, you don't even have to worry about locking! STM always seemed complicated to me.

+
+
+
+
+ + Anonymous wrote on 2011-08-30 03:00: +
+
+

is there a branch we can check this out?

+
+
+
+
+ + squeaky_pl wrote on 2011-09-01 10:31: +
+
+

Hardware transactional memory anyone? https://arstechnica.com/hardware/news/2011/08/ibms-new-transactional-memory-make-or-break-time-for-multithreaded-revolution.ars

+
+
+
+
+ + Armin Rigo wrote on 2011-09-21 19:10: +
+
+

@squeaky_pl: thanks for the link. In some way researching this is ultimately doomed: either transactional memory doesn't work, or it does and in 5 or 10 years all CPUs will have good hardware support and will be able to run existing software like CPython with minor changes. :-)

+
+
+
+
+ + staila wrote on 2011-11-03 05:31: +
+
+

We are actually working on implementing this directly into stailaOS.

+
+
+
+
+ + Unknown wrote on 2012-05-12 10:24: +
+
+

@Mystilleef agree 100%

+
+
+
+
+ + Unknown wrote on 2012-07-05 22:42: +
+
+

The high-level semantics that the Python VM provides through the GIL are perfect for most programs, and for most programmer's knowledge about concurrency.

What is the purpose of going after the GIL?

If it's just a performance boost on multiple cores, then an GIOL (global IO lock) implemented on the VM, as the GIL is, should be considered. The VM could run several OS threads blocking them on IO and releasing GIL.

If the purpose is to make concurrent programming easy and correct, it can be proven that it is not possible.

Yet, there are alternatives that don't alter the language or the semantics that can be explored.

Erlang-style message passing can be provided through object proxies implemented on top or beneath the VM, so the threads/processes can even run on different computers.

In short, an Actor model is much preferable to a shared-memory one.

https://en.wikipedia.org/wiki/Actor_model

+
+
+
+
+ + Alex moner wrote on 2014-10-21 17:38: +
+
+

In general, it is completely impossible to map every operation that must be atomic to a single processor instruction.Uni-source

+
+
+
+ +

PyPy 1.6 - kickass panda

+ +
+

We're pleased to announce the 1.6 release of PyPy. This release brings a lot +of bugfixes and performance improvements over 1.5, and improves support for +Windows 32bit and OS X 64bit. This version fully implements Python 2.7.1 and +has beta level support for loading CPython C extensions. You can download it +here:

+
+https://pypy.org/download.html +
+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.1. It's fast (pypy 1.6 and cpython 2.6.2 performance comparison) +due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64 or Mac OS X. Windows 32 +is beta (it roughly works but a lot of small issues have not been fixed so +far). Windows 64 is not yet supported.

+

The main topics of this release are speed and stability: on average on +our benchmark suite, PyPy 1.6 is between 20% and 30% faster than PyPy 1.5, +which was already much faster than CPython on our set of benchmarks.

+

The speed improvements have been made possible by optimizing many of the +layers which compose PyPy. In particular, we improved: the Garbage Collector, +the JIT warmup time, the optimizations performed by the JIT, the quality of +the generated machine code and the implementation of our Python interpreter.

+
+
+

Highlights

+
    +
  • Numerous performance improvements, overall giving considerable speedups:
      +
    • better GC behavior when dealing with very large objects and arrays
    • +
    • +fast ctypes: now calls to ctypes functions are seen and optimized +by the JIT, and they are up to 60 times faster than PyPy 1.5 and 10 times +faster than CPython
    • +
    • improved generators(1): simple generators now are inlined into the caller +loop, making performance up to 3.5 times faster than PyPy 1.5.
    • +
    • improved generators(2): thanks to other optimizations, even generators +that are not inlined are between 10% and 20% faster than PyPy 1.5.
    • +
    • faster warmup time for the JIT
    • +
    • JIT support for single floats (e.g., for array('f'))
    • +
    • optimized dictionaries: the internal representation of dictionaries is now +dynamically selected depending on the type of stored objects, resulting in +faster code and smaller memory footprint. This applies, for example, to dictionaries whose +keys are all strings, or all integers. Other dictionaries are also smaller +due to bugfixes.
    • +
    +
  • +
  • JitViewer: this is the first official release which includes the JitViewer, +a web-based tool which helps you to see which parts of your Python code have +been compiled by the JIT, down to the assembler. The jitviewer 0.1 has +already been released and works well with PyPy 1.6.
  • +
  • The CPython extension module API has been improved and now supports many +more extensions. For information on which ones are supported, please refer to +our compatibility wiki.
  • +
  • Multibyte encoding support: this was one of the last areas in which we were +still behind CPython, but now we fully support them.
  • +
  • Preliminary support for NumPy: this release includes a preview of a very +fast NumPy module integrated with the PyPy JIT. Unfortunately, this does +not mean that you can expect to take an existing NumPy program and run it on +PyPy, because the module is still unfinished and supports only some of the +numpy API. However, barring some details, what works should be +blazingly fast :-)
  • +
  • Bugfixes: since the 1.5 release we fixed 53 bugs in our bug tracker, not +counting the numerous bugs that were found and reported through other +channels than the bug tracker.
  • +
+

Cheers,

+

Hakan Ardo, Carl Friedrich Bolz, Laura Creighton, Antonio Cuni, +Maciej Fijalkowski, Amaury Forgeot d'Arc, Alex Gaynor, +Armin Rigo and the PyPy team

+
+
+
+
+
+ + Anonymous wrote on 2011-08-18 18:59: +
+
+

Finally :) I'm really looking forward to test this code out :)

+
+
+
+
+ + René Dudfield wrote on 2011-08-18 19:01: +
+
+

Congrats team pypy!

+
+
+
+
+ + Anonymous wrote on 2011-08-18 21:15: +
+
+

I look forward to support Python 3

+
+
+
+
+ + Anonymous wrote on 2011-08-18 21:58: +
+
+

"and has beta level support for loading CPython C extensions"

does this mean that the regular Numpy and Scipy can be used with this.

+
+
+
+
+ + almir karic wrote on 2011-08-18 22:54: +
+
+

no.

"Unfortunately, this does not mean that you can expect to take an existing NumPy program and run it on PyPy"

thanks for the release pypy team!

+
+
+
+
+ + Anonymous wrote on 2011-08-19 03:37: +
+
+

Impressive as always. Thanks for releasing such great software.
Keep up the good work.

Anghel

+
+
+
+
+ + profu wrote on 2011-08-19 05:13: +
+
+

Where is the windows version?

+
+
+
+
+ + Anonymous wrote on 2011-08-19 07:36: +
+
+

I did some benchmark with some simple parameterized SELECT statements, and found that pg8000 on pypy 1.6 is more than one time slower than pg8000 on python 2.7.1, while the later is already more than one time slower than psycopg2 on python 2.7.1.

+
+
+
+
+ + Anonymous wrote on 2011-08-19 07:55: +
+
+

Still can't build and run python-ldap extension... :(
That's a deal-breaker for me.

+
+
+
+
+ + Maciej Szumocki wrote on 2011-08-19 08:34: +
+
+

What kind of problems prevent releasing a Windows 64bit version?

+
+
+
+
+ + Lenz wrote on 2011-08-19 12:59: +
+
+

Congrats !!! Realy amazing job !!

By the way, where can I find more informations about the alredy implemented numpy functions ?

Thanks.

+
+
+
+
+ + jensck wrote on 2011-08-19 18:28: +
+
+

Amazing - PyPy just keeps making leaps and bounds forward for compat. and processing performance. I don't know how you guys keep up such pace, but I dig it!

How is typical memory usage these days? It's been a while since anything was reported on its resource usage vs. CPython. Maybe such a benchmark could be added to the speed site?

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-08-19 21:24: +
+
+

PyPy 1.5: 68 seconds
PyPy 1.6: 65 seconds
Python 3.2 (Intel C compiler): 36 seconds

Extrapolation to the future:
PyPy 1.17: 35 seconds ?

Jokes aside, PyPy's compatibility with CPython is good.

+
+
+
+
+ + Anonymous wrote on 2011-08-20 01:05: +
+
+

I'm still most looking forward to the day your jit makes some .pyj files. Initialization time for the jit is a bit high, especially if you use pypy integrated into other scripts where the init time might impact performance, numpy had no dtypes, and laked almost every function, but atleast it's a step in the right direction :)

Memory usage for one of my own test apps (building a one dimensional dict with int keys, and object (sometimes by reference from a second dict) resulted in 76MB to python2.7 and 108MB to pypy 1.6. So memory usage is still a bit behind tho (the pypy runtime was better with around 35% tho).

+
+
+
+
+ + Anonymous wrote on 2011-08-21 14:54: +
+
+

Is there a 32-bit OSX version somewhere? 64-bit seems to eat up memory in my tests...
Impressive stuff, though :-)

+
+
+
+
+ + Anonymous wrote on 2011-08-22 10:56: +
+
+

But man! What's wrong with Windows?

Will the windows version will be dropped?

Version 1.5 does not fully work on Windows and now you release 1.6 you does not provide a windows version...

So, I really want to know if it will be future support for windows.

This will help to decide if pypy will be an option or one just have to find other options to speed up the programs.

Please, clarify this.

Bests,

+
+
+
+
+ + Anonymous wrote on 2011-08-22 18:25: +
+
+

Pypy does support Windows 32bit, with a couple of bugs, the windows support have been improved from 1.5 to 1.6. Perhaps it will be fully working by 1.7.

+
+
+
+
+ + Anonymous wrote on 2011-08-23 09:45: +
+
+

Ok, but where to download PYPY for Win32 ?

+
+
+
+
+ + Anonymous wrote on 2011-08-23 12:48: +
+
+

I believe you got to compile it yourself.

+
+
+
+
+ + vak wrote on 2011-08-24 11:16: +
+
+

just impressive. If you guys could resurrect the numpy operation like:

boolean_array = arr > value

it would be just a dream. This important operation returns not an array, but a value now.

+
+
+
+ +

Visualization of JITted code

+ +
+

Hello.

+

We're proud to announce the first public release of the jitviewer. As of now, +jitviewer is a slightly internal tool that helps you understand how your Python +source code is compiled by PyPy's JIT all the way down to machine code.

+

To install it, you need a very recent version of PyPy +(newer than 9th of August), for example one of the nightly builds:

+
+
    +
  • install pip and distribute either by creating a PyPy virtualenv +or by following the installation instructions.
  • +
  • make sure to have a source code checkout of PyPy and put it in your +PYTHONPATH.
  • +
  • +pip install jitviewer. Note that you need to run the pip +executable which belongs to PyPy, not the globally installed one.
  • +
+
+

Have a look at the README for how to start it, or try the online demo if +you just want to play with it.

+

The jitviewer is a web application written with flask and jinja2. If +you have experience with web development and you want to help PyPy, don't +hesitate to contact us, there are plenty of things to improve in it :-).

+
+

What does the jitviewer really do?

+

At the top of the page, you will see the list of pieces of code which have been +compiled by the JIT. You will see entries for both normal loops and for +"entry bridges". This is not the right place to discuss the difference +between those, but you most probably want to look at loops, because usually +it's where most of the time is spent.

+

Note that for each loop, you will see the name of the function which contains +the first instruction of the loop. However, thanks to the inlining done +by the JIT, it will contain also the code for other functions.

+

Once you select a loop, the jitviewer shows how the JIT has compiled the +Python source code into assembler in a hierarchical way. It displays four +levels:

+
    +
  • +

    Python source code: only the lines shown in azure have been compiled for +this particular loop, the ones in gray have not.

    +
  • +
  • +

    Python bytecode, the one you would get by doing:

    +
    +def f(a, b):
    +   return a + b
    +
    +import dis
    +dis.dis(f)
    +
    +

    The opcodes are e.g. LOAD_FAST, LOAD_GLOBAL etc. The opcodes +which are not in bold have been completely optimized away by the JIT.

    +
  • +
  • +

    Intermediate representation of jit code (IR). This is a combination of +operations (like integer addition, reading fields out of structures) and +guards (which check that the assumptions we made are actually true). Guards +are in red. These operations are "at the same level as C": so, for example, ++ takes two unboxed integers which can be stored into the register +of the CPU.

    +
  • +
  • +

    Assembler: you can see it by clicking on "Show assembler" in the menu on the +right.

    +
  • +
+

Sometimes you'll find that a guard fails often enough that a new piece of +assembler is required to be compiled. This is an alternative path through the +code and it's called a bridge. You can see bridges in the jitviewer when +there is a link next to a guard. For more information about their purpose, look up +the jit documentation.

+
+
+

I'm still confused

+

Jitviewer is not perfect when it comes to explaining what's going on. Feel free +to pop up on IRC or send us a mail to the mailing list, we'll try to explain +and/or improve the situation. Consult the contact page for details.

+

Cheers,
+fijal & antocuni

+
+
+
+
+
+ + Paul Smith wrote on 2011-08-13 20:47: +
+
+

I'm getting a TemplateNotFound jinja2 exception when I run the jitviewer.py as shown in the README.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-08-13 20:48: +
+
+

I think you have to python setup.py install it in a virtualenv. It might not work from the checkout any more.

+
+
+
+
+ + Paul Smith wrote on 2011-08-13 21:31: +
+
+

That fixed it, thanks.

+
+
+
+
+ + Anonymous wrote on 2011-08-14 10:25: +
+
+

Would it be possible to get some screenshots of jitviewer, as the online demo is currently down?

+
+
+
+
+ + Garito wrote on 2011-08-16 19:23: +
+
+

The demo doesn't work

Please, could you put it back?

Thanks a lot!

I'm developing a programming languaje based on mindmaps and I would like to know if my code works with pypy...

+
+
+
+
+ + Unknown wrote on 2011-10-02 08:33: +
+
+

jitviewer repository - https://bitbucket.org/pypy/jitviewer

+
+
+
+ +

PyPy is faster than C, again: string formatting

+ +
+

String formatting is probably something you do just about every day in Python, +and never think about. It's so easy, just "%d %d" % (i, i) and you're +done. No thinking about how to size your result buffer, whether your output +has an appropriate NULL byte at the end, or any other details. A C +equivalent might be:

+
+char x[44];
+sprintf(x, "%d %d", i, i);
+
+

Note that we had to stop for a second and consider how big numbers might get +and overestimate the size (44 = (20 for the length of the biggest number on 64bit + +1 for the sign) * 2 + 1 (for the space) + 1 (NUL byte)), it took the authors of +this post, fijal and alex, 3 tries to get the math right on this :-)

+

This is fine, except you can't even return x from this function, a more +fair comparison might be:

+
+char *x = malloc(44 * sizeof(char));
+sprintf(x, "%d %d", i, i);
+
+

x is slightly overallocated in some situations, but that's fine.

+

But we're not here to just discuss the implementation of string +formatting, we're here to discuss how blazing fast PyPy is at it, with +the new unroll-if-alt branch. Given the Python code:

+
+def main():
+    for i in xrange(10000000):
+        "%d %d" % (i, i)
+
+main()
+
+

and the C code:

+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+int main() {
+    int i = 0;
+    char x[44];
+    for (i = 0; i < 10000000; i++) {
+        sprintf(x, "%d %d", i, i);
+    }
+}
+
+

The Python code was run under PyPy, at the head of the unroll-if-alt branch, and +the C code was compiled with GCC 4.5.2 at -O4 (other optimization levels were tested, +this produced the best performance). It took 0.85 seconds to +execute under PyPy, and 1.63 seconds with the compiled binary. We +think this demonstrates the incredible potential of dynamic +compilation: GCC is unable to inline or unroll the sprintf call, +because it sits inside of libc.

+

Benchmarking the C code:

+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+int main() {
+    int i = 0;
+    for (i = 0; i < 10000000; i++) {
+        char *x = malloc(44 * sizeof(char));
+        sprintf(x, "%d %d", i, i);
+        free(x);
+    }
+}
+
+

Which, as discussed above, is more comparable to the Python, gives a +result of 1.96 seconds.

+

Summary of performance:

+ +++++++ + + + + + + + + + + + + + + + + + + + + + + + +
Platform | GCC (stack) | GCC (malloc) | CPython | PyPy (unroll-if-alt)
Time | 1.63s | 1.96s | 10.2s | 0.85s
relative to C | 1x | 0.83x | 0.16x | 1.9x
+

Overall PyPy is almost 2x faster. This is clearly a win for dynamic +compilation over static - the sprintf function lives in libc and so +cannot be specialized over the constant string, which has to be parsed +every time it's executed. In the case of PyPy, we specialize +the assembler if we detect the left hand string of the modulo operator +to be constant.

+

Cheers,
+alex & fijal

+
+
+
+
+ + salmon wrote on 2011-08-02 19:23: +
+
+

What about '{0}'.format('pypy') ?
Is this also faster?

+
+
+
+
+ + JoeHillen wrote on 2011-08-02 19:59: +
+
+

Where can we see this "unroll-if-alt" branch?

+
+
+
+
+ + Greg Haines wrote on 2011-08-02 20:13: +
+
+

Are you sure the compiler isn't optimizing away the actual execution since you're not doing anything with the result?

+
+
+
+
+ + Thomas Schilling wrote on 2011-08-02 20:18: +
+
+

How are those two loops equivalent? You're not printing anything in the Python loop. I/O buffering etc. can eat quite a bit of runtime. It would also be nice to see what the particular improvements in this "unroll-if-alt" branch are.

+
+
+
+
+ + Anonymous wrote on 2011-08-02 20:19: +
+
+

How about doing something like that:
....
char p[5] = "%d %d"
//and then
sprintf(x, p, i,i);
....

?

+
+
+
+
+ + Andrew Pendleton wrote on 2011-08-02 20:25: +
+
+

@Thomas the C one doesn't print anything, either; sprintf just returns a string. printf is the one that prints.

+
+
+
+
+ + Anonymous wrote on 2011-08-02 20:26: +
+
+

@Thomas the C one doesn't print anything either, so it sounds pretty equivalent to me.

+
+
+
+
+ + Johan Tibell wrote on 2011-08-02 20:28: +
+
+

This doesn't really have anything to do with dynamic compilation, but cross module optimization. There are static compilers, such as the Glasgow Haskell Compiler, that do this. If the compilation strategy depended on runtime data (e.g. measure hot spots), it would be dynamic compilation.

+
+
+
+
+ + Anonymous wrote on 2011-08-02 20:56: +
+
+

*yawn* If you want to see ridiculously fast string formatting, look at the Boost's Spirit library (specifically Karma). Small test case, but point well illustrated: https://www.boost.org/doc/libs/1_47_0/libs/spirit/doc/html/spirit/karma/performance_measurements/numeric_performance/int_performance.html Or look at Spirit's input parser for even integers: https://alexott.blogspot.com/2010/01/boostspirit2-vs-atoi.html

+
+
+
+
+ + Antonio Cuni wrote on 2011-08-02 20:57: +
+
+

@JoeHillen: the unroll-if-alt branch is inside the main pypy repo on bitbucket (together with all the other branches).

@Greg: yes, we checked the generated code, it's not optimized away.

@anonymous: why it should be any faster? String literals in C are constants, it's not that you need to create a new one at each iteration

@Johan: note that the PyPy approach can generate code optimized for a formatting string loaded from a disk, or computed at runtime. No static compiler could do that.

+
+
+
+
+ + Anonymous wrote on 2011-08-02 21:10: +
+
+

What machine are you on that an int is 64 bits? Hardly anybody uses ILP64 or SILP64 data models ( https://en.wikipedia.org/wiki/64-bit#Specific_C-language_data_models ). Maybe a fourth try is in order? :P

+
+
+
+
+ + Johan Tibell wrote on 2011-08-02 21:14: +
+
+

Antonio, that is indeed neat.

+
+
+
+
+ + Unknown wrote on 2011-08-02 22:04: +
+
+

So when are you going to teach PyPy that the result of an unused string formatting can be deleted, and then delete the loop? ;)

I'm not sure how you'd get there from a tracing JIT, though. WIth Python, you still have to call all the formatting and stringification methods because they might have side effects. You only get to know that the entire operation is a no-op after you've inlined everything, but by then it will be at a low enough representation that it's hard to tell.

+
+
+
+
+ + Anonymous wrote on 2011-08-02 22:04: +
+
+

sizeof(char)==1. By definition. Argh.

PS: negative karma for lying headline

+
+
+
+
+ + Anonymous wrote on 2011-08-02 22:15: +
+
+

Check that you're not spending all your time in malloc/free(). Also use the return value from a failed snprintf(), plus 1, to size your output buffer.

+
+
+
+
+ + Unknown wrote on 2011-08-02 22:21: +
+
+

@Anonymous 2: Even if all the time were spent in malloc/free, PyPy has to dynamically allocate the string data structure, as well as provide a buffer to fill with the characters from the integers, since it has no way of knowing how much space will be needed (could be a custom integer class).

However, you're right that malloc and free are slow and a good gc system would have a faster allocator.

+
+
+
+
+ + vsergeev wrote on 2011-08-02 22:24: +
+
+

a quick tip to minimize the math in determining your sprintf buffer size for your experiment:
#include < stdint.h >
len = snprintf(NULL, 0, "%d %d", INT32_MIN, INT32_MIN);
will give you the string length required (not including null terminating byte) to fit the formatted string.

Similarly, %lld and INT64_MIN will do the trick (on the right platform) for 64-bit signed integers.

(not that I advocate fixed sized buffers for formatted strings based on min/max digit lengths for any real application)

+
+
+
+
+ + Anonymous wrote on 2011-08-02 22:33: +
+
+

You wrote:
and compiled with GCC 4.5.2 at -O4

Please read the manual of GCC. There you will see that every optimization level above 3 is handled as it would be 3. '-O4' is nothing else than '-O3'.

It is also known that optimizing with -O3 may lead to several problems at runtime (e.g. memory delays for short programs or memory allocation failure in larger programs).
That's why the recommended optimization level is '2' (or 's' for embedded systems) and not '3'.

Did you test with a realtime kernel?
How about the scheduler?

Maybe you should double check your test environment.

+
+
+
+
+ + Anonymous wrote on 2011-08-02 22:47: +
+
+

For all you complaining about test eviorment. Pypy would still have to do that internaly. If they should be truely comparable, then you need to also include snprintf inside the loop, making C even slower. Also, I doubt you will get 200% performance boost from scheduler change.

unroll-if-alt will be included in 1.6 right? Also when will 1.6 be released?

+
+
+
+
+ + Thomas Schilling wrote on 2011-08-02 22:50: +
+
+

@Andrew, @hobbs: Oh, sorry I overlooked the "s" in "sprintf". It would still be nice compare the generated machine code to explain the differences.

Whenever, someone claims language L1 implementation A is faster than language L2 implementation B there are obvious questions about (1) fairness of comparison, (2) what is being measured. In this case PyPy is specializing on the format string interpreter (does that require library annotations?) which a C compiler could do in principle here (but probably doesn't.) So, I'm always a bit suspicious when I see these kinds of comparisons.

@Johan: GHC's cross-module optimization often comes at the expense of binary compatibility. A JIT has a big advantage here.

+
+
+
+
+ + René Dudfield wrote on 2011-08-02 23:33: +
+
+

The python faster than C day has come! Congrats.

ps. Did you try it with (Link Time Optimization)LTO? that is with gcc the option: -flto ? Also, are you using PGO with gcc?

+
+
+
+
+ + nekto0n wrote on 2011-08-02 23:40: +
+
+

@salmon According to this commit new style formatting is supported too.

Someone correct me if I'm wrong.

+
+
+
+
+ + Anonymous wrote on 2011-08-02 23:49: +
+
+

I think that computation is not correct yet. IIRC, you only get 20 digits in an unsigned 64-bit quantity.

Worse, (again IIRC) sprintf is locale dependent. It may insert thousands separators.

+
+
+
+
+ + Anonymous wrote on 2011-08-03 00:31: +
+
+

This is not a good performance test because all printf function have high constant complexity, without looking at format string, check it

+
+
+
+
+ + Strohan wrote on 2011-08-03 01:54: +
+
+

wouldn't it be better to run your test with a more modern c++ library like cstring?

+
+
+
+
+ + Anonymous wrote on 2011-08-03 03:07: +
+
+

If 1.9x is "almost 2x faster", then what is "1x faster"?

+
+
+
+
+ + Poposhka wrote on 2011-08-03 05:09: +
+
+

post the Assembly code, map files and call graph or it didnt happen!!!!!!!!

+
+
+
+
+ + Reinis I. wrote on 2011-08-03 07:13: +
+
+

"one time faster" is bad English.

+
+
+
+
+ + Anonymous wrote on 2011-08-03 08:38: +
+
+

What performance impact does the malloc/free produce in the C code? AFAIK Python allocates memory in larger chunks from the operating system. Probably Python does not have to call malloc after initialization after it allocated the first chunk.

AFAIK each malloc/free crosses the boundaries between user-mode/kernel-mode.

So, IMHO you should compare the numbers of a C program which
does not allocate dynamic memory more than once and uses an internal memory management system.

These numbers would be interesting.

Have fun

+
+
+
+
+ + Damian Cugley wrote on 2011-08-03 08:44: +
+
+

The point here is not that the Python implementation of formatting is better than the C standard library, but that dynamic optimisation can make a big difference. The first time the formatting operator is called its format string is parsed and assembly code for assembling the output generated. The next 999999 times that assembly code is used without doing the parsing step. Even if sprintf were defined locally, a static compiler can’t optimise away the parsing step, so that work is done redundantly every time around the loop.

In a language like Haskell something similar happens. A string formatting function in the style of sprintf would take a format string as a parameter and return a new function that formats its arguments according to that string. The new function corresponds to the specialized assembly code generated by PyPy’s JIT. I think if you wanted to give the static compiler the opportunity to do optimizations that PyPy does at runtime you would need to use a custom type rather than a string as the formatting spec. (NB my knowledge of functional-language implementation is 20 years out of date so take the above with a pinch of salt.)

+
+
+
+
+ + Dave Kirby wrote on 2011-08-03 12:50: +
+
+

@Anonymous:

The C code shown does not do any malloc/free. The sprintf function formats the string into the char array x, which is allocated on the stack. It is highly unlikely that the sprintf function itself mallocs any memory.

+
+
+
+
+ + Paul Jaros wrote on 2011-08-03 15:45: +
+
+

I'm following the progress on pypy since many years and the potential is and has always been here. And boy, pypy has come a looong way.

You are my favorite open-source project and I am excited to see what will happen next. Go pypy-team, go!

+
+
+
+
+ + Stepan Koltsov wrote on 2011-08-03 18:25: +
+
+

PyPy does nothing 1.9 times faster than C.

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-08-03 19:51: +
+
+

You wrote: "We think this demonstrates the incredible potential of dynamic compilation, ..."

I disagree. You tested a microbenchmark. Claims about compiler or language X winning over Y should be made after observing patterns in real programs. That is: execute or analyse real C programs which are making use of 'sprintf', record their use of 'sprintf', create a statistics out of the recorded data and then finally use the statistical distributions to create Python programs with a similar distribution of calls to '%'.

Trivial microbenchmarks can be deceiving.

+
+
+
+
+ + Anonymous wrote on 2011-08-04 01:11: +
+
+

@Dave Kirby:

There are two C programs there. One on the stack, one with a malloc / free in the loop.

Which one is used for the faster claim?

+
+
+
+
+ + Armin Rigo wrote on 2011-08-04 08:47: +
+
+

@Anonymous: this branch, unroll-if-alt, will not be included in the release 1.6, which we're doing right now (it should be out any day now). It will only be included in the next release, which we hope to do soonish. It will also be in the nightly builds as soon as it is merged.

+
+
+
+
+ + Connelly Barnes wrote on 2011-08-04 20:50: +
+
+

Is string/IO performance in general being worked on in Pypy? Last I looked Pypy showed it was faster than CPython in many cases on its benchmarks page, but for many string/IO intensive tasks I tried Pypy v1.5 on, it was slower.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-08-05 07:06: +
+
+

@Connelly yes, for some definition of working (being thought about). that's one reason why twisted_tcp is slower than other twisted benchmarks. We however welcome simple benchmarks as bugs on the issue tracker

+
+
+
+
+ + tt wrote on 2011-08-05 10:05: +
+
+

This is a horribly flawed benchmark which illustrates absolutely nothing. First of all, an optimizing JIT should be (easily) able to detect that your inner loop has no side effects and optimize it away. Secondly, with code like that you should expect all kinds of weirds transformations by the compiler, hence - you can't be really sure what you are comparing here. As many here have pointed out, you should compare the output assembly.

Anyway, if you really want to do a benchmark like that, do it the right way. Make the loop grow a string by continuous appending and write the string to the file in the end (time the loop only). This way you will get accurate results which really compare the performance of two compilers performing the same task.

+
+
+
+
+ + Anonymous wrote on 2011-08-05 11:28: +
+
+

try in nodejs:

var t = (new Date()).getTime();

function main() {
var x;
for (var i = 0; i < 10000000; i++)
x = i + " " + i;
return x;
}
x = main();

t = (new Date()).getTime() - t;
console.log(x + ", " + t);

+
+
+
+
+ + tt wrote on 2011-08-05 16:15: +
+
+

I have now put a small, slightly more realistic benchmark. I used following code.

Python

def main():
x = ""
for i in xrange(50000):
x = "%s %d" % (x, i)
return x

x = main()

f = open("log.txt", "w")
f.write(x)
f.close()

C
#include
#include
#include


int main() {
int i;
char *x = malloc(0);
FILE *file;

*x = 0x00;

for (i = 0; i < 50000; i++) {
char *nx = malloc(strlen(x) + 16); // +16 bytes to be on the safe side

sprintf(nx, "%s %d", x, i);
free(x);
x = nx;
}

file = fopen("log1.txt","w");
fprintf(file, "%s", x);
fclose(file);
}

JavaScript (NodeJS)

var fs = require('fs');

String.prototype.format = function() {
var formatted = this;
for (var i = 0; i < arguments.length; i++) {
var regexp = new RegExp('\\{'+i+'\\}', 'gi');
formatted = formatted.replace(regexp, arguments[i]);
}
return formatted;
};


function main() {
var x = "";
for (var i = 0; i < 50000; i++)
x = "{0} {1}".format(x, i);
return(x)
}

x = main();
fs.writeFile('log.txt', x)


Note for JS example: I did not want to use the stuff like i + " " + i because it bypasses the format function call. Obviously, using the + operator the nodejs example would be much faster (but pypy probably as well).

Also, I used PyPy 1.5 as I did not find any precompiled PyPy 1.6 for OS X.

Results:

PyPy: real 0m13.307s
NodeJS: real 0m44.350s
C: real 0m1.812s

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-08-05 18:32: +
+
+

@tt: This is a very inefficient C/C++ implementation of the idea "make a loop grow a string by continuous appending and write the string to the file in the end". In addition, it appears to be an uncommon piece of C/C++ code.

+
+
+
+
+ + tt wrote on 2011-08-05 20:03: +
+
+

Well, I never said anything about writing super efficient C code. Anyway, I don't see how you want to implement string formatting more efficiently - if we talk about general usage scenario. You can't really reuse the old string buffer, you basically have to allocate new one each time the string grows. Or pre-allocate a larger string buffer and do some substring copies (which will result in a much more complicated code). Anyway, the malloc() on OS X is very fast.

My point is: even this C code, which you call inefficient is around 6 times faster then pypy 1.5

+
+
+
+
+ + Antiplutocrat wrote on 2011-08-05 21:08: +
+
+

@tt except one of the main points of the post was that they had implemented a *new* feature (unroll-if-alt, I believe) that sped things up a bunch.

I'm not sure how much any comparison that *doesn't* use this new feature is worth ...

So many haters ! ;)

+
+
+
+
+ + Anonymous wrote on 2011-08-06 00:59: +
+
+

The compare is good because both use standard langauge fetatures to do the same thing, using a third part lib is not the same, then I have to code the same implant in RPython and people would still complain do to RPython often being faster then C regardless.

Python could have detected that the loop is not doing anything, but give that one value had a __str__ call it could've broken some code. Anyway, C compiler could also see that you didn't do anything with the value and optimalize it the same way.

+
+
+
+
+ + tt wrote on 2011-08-06 11:05: +
+
+

@Antiplutocrat:
Honestly, I expected a bit more objectivity from posters here. I am really disappointed that you compare me to "haters" (whoever that may be).

Your point about unroll-if-alt is absolutely valid and I myself have explicitly stated that I did not use that feature. At no point I have refuted that the original blog post was wrong - it is still very well possible that PyPy 1.6 is faster then C in this usage scenario. The main goal of my post was to make clear that the original benchmarks were flawed, as they grant the compiler too much space for unpredictable optimizations. I believe that my benchmark code produces more realistic results and I suggest that the authors of this blog entry re-run the benchmark using my code (or something similar, which controls for unpredictable optimizations).

+
+
+
+
+ + Anonymous wrote on 2011-08-06 16:35: +
+
+

@tt: Code is doing something else so it's not the same.

+
+
+
+
+ + Anonymous wrote on 2011-08-12 17:07: +
+
+

Only a quick note OffTopic: in the python FAQ, one could update adding PyPy besides Psyco in the performance tips:

https://docs.python.org/faq/programming.html#my-program-is-too-slow-how-do-i-speed-it-up

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-08-14 18:34: +
+
+

@Anonymous: I agree with your other paragraphs, but not with the one where you wrote that "... OLDER version (4.5.x) of GCC whilst a newer version (4.6.x) is available with major improvements to the optimizer in general".

I am not sure, what "major improvements" in GCC 4.6 do you mean? Do you have benchmark numbers to back up your claim?

As far as well-written C code is concerned, in my opinion, there haven't been any "major improvements" in GCC for more than 5+ years. There have been improvements of a few percent in a limited number of cases - but nothing major.

Even LTO (link-time optimization (and lets hope it will be safe/stable to use when GCC 4.7 is released)) isn't a major boost. I haven't seen LTO being able to optimize calls to functions living in dynamic libraries (the bsearch(3) function would be a nice candidate). And I also haven't seen GCC's LTO being able to optimize calls to/within the Qt GUI library when painting pixels or lines onto the screen.

The main point of the PyPy article was that run-time optimizations in PyPy have a chance of surpassing GCC in certain cases.

Personally, I probably wouldn't willingly choose to work on a project like PyPy - since, err, I believe that hard-core JIT optimizations on a dynamically typed language like Python are generally a bad idea - but I am (in a positive way) eager to see what the PyPy team will be able to do in this field in the years to come.

+
+
+
+
+ + Anonymous wrote on 2011-08-14 20:17: +
+
+

@⚛: I indeed do not have benchmarks for these claims, but GCC 4.6 indeed added some newer optimization techniques to its assortment. Maybe these may not have had a significant influence in said case but they might have somewhere else. I'm merely saying: you can't really compare the latest hot inventions with something that is surpassed (e.g. compare Java 7 to a program output by Visual C++ back from the VS 2003 IDE).

All by all, I'm not saying that Python sucks and don't want to sound like a fanboy (on the contrary, Linux uses a great deal of Python and if this could mean a major speedup, then why the hell not ;).

I guess I was pissed off because the written article sounds very much fanboyish and pro-Python (just look at the title alone).

+
+
+
+
+ + Anonymous wrote on 2012-11-01 17:25: +
+
+

So a loop that doesn't print in Python is compared to a loop in C that does and that was compiled on one of the slowest C compilers out there.

YearOfTheLinuxDesktopIsAtHand(TM)

+
+
+
+
+ + Cees Timmerman wrote on 2012-11-12 15:36: +
+
+

@Anonymous, "the C one doesn't print anything, either; sprintf just returns a string. printf is the one that prints." - Andrew Pendleton, this page, August 2, 2011 9:25 PM

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-21.html b/blog/index-21.html new file mode 100644 index 000000000..cb3683cb1 --- /dev/null +++ b/blog/index-21.html @@ -0,0 +1,2253 @@ + + + + + + +PyPy (old posts, page 21) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Comparing Partial Evaluation and Tracing, Part 1

+ +
+

As part of writing my PhD I am currently thinking about the relationship +between PyPy's meta-tracing approach with various previous ideas to +automatically get a (JIT-)compiler from only an interpreter of a language. One +of the most-researched ideas along these lines is that of partial evaluation. +Partial evaluation has basically the same goals as PyPy when it comes to +compilers: Write an interpreter, and get a compiler for free. The methods for +reaching that goal are a bit different. In this series of blog posts, I am +trying to explore the similarities and differences of partial evaluation and +PyPy's meta-tracing.

+

A Flowgraph Language

+

To be able to clearly understand what "partial evaluation" is and what +"meta-tracing" is I will show an "executable model" of both. To that end, I am +defining a small imperative language and will then show what a partial evaluator +and a tracer for that language look like. All this code will be +implemented in Prolog. (Any pattern-matching functional language would do, but I +happen to know Prolog best. Backtracking is not used, so you can read things +simply as functional programs.) In this post I will start with +the definition of the language, and a partial evaluator for it. The code +written in this blog post can be found fully here: https://paste.pocoo.org/show/541004/

+

The language is conceptually similar to PyPy's flow graphs, but a bit more +restricted. It does not have function calls, only labelled basic blocks +that consist of a series of linearly executed operations, followed by a +conditional or an unconditional jump. Every operation is assigning a value to a +variable, which is computed by applying some operation to some arguments.

+

A simple program to raise x to the yth power in that language looks like +this:

+
+power:
+    res = 1
+    if y goto power_rec else goto power_done
+
+power_rec:
+    res = res * x
+    y = y - 1
+    if y goto power_rec else goto power_done
+
+power_done:
+    print_and_stop(res)
+
+

To represent the same program as Prolog data structures, we use the +following Prolog code:

+
block(power, op1(res, same, const(1),
+             if(y, power_rec, power_done))).
+block(power_rec, op2(res, mul, var(res), var(x),
+                 op2(y, sub, var(y), const(1),
+                 if(y, power_rec, power_done)))).
+block(power_done, print_and_stop(var(res))).
+
+

Every rule of block declares one block by first giving the label of the +block, followed by the code. Code is a series of op1 or op2 statements +terminated by a jump, an if or a print_and_stop. op1 statements +are operations with one argument of the form op1(res_variable, +operation_name, argument, next_statement). Arguments can be either variables +in the form var(name) or constants in the form const(value).

+

To run programs in this flowgraph language, we first need some helper +functionality. The first few helper functions are concerned with the handling of +environments, the data structures the interpreter uses to map variable +names occurring in the program to the variables' current values. In Python, +dictionaries would be used for this purpose, but in Prolog we have to emulate +these by lists of key/value pairs (not very efficient, but good enough):

+
lookup(X, [], _) :- throw(key_not_found(X)).
+lookup(Key, [Key/Value | _], Value) :- !.
+lookup(Key, [_ | Rest], Value) :- lookup(Key, Rest, Value).
+
+write_env([], X, V, [X/V]).
+write_env([Key/_ | Rest], Key, Value, [Key/Value | Rest]) :- !.
+write_env([Pair | Rest], Key, Value, [Pair | NewRest]) :- write_env(Rest, Key, Value, NewRest).
+
+remove_env([], _, []).
+remove_env([Key/_ | Rest], Key, Rest) :- !.
+remove_env([Pair | Rest], Key, [Pair | NewRest]) :- remove_env(Rest, Key, NewRest).
+
+resolve(const(X), _, X).
+resolve(var(X), Env, Y) :- lookup(X, Env, Y).
+
+

The implementation of these functions is not too important. The lookup +function finds a key in an environment list, the write_env function adds a +new key/value pair to an environment, remove_env removes a key. The +resolve function is used to take either a constant or a variable and return +a value. If it's a constant, the value of that constant is returned, if it's a +variable it is looked up in the environment. Note how the last argument of +lookup and resolve is actually a return value, which is the typical +approach in Prolog.

+

So far we have not specified what the primitive operations that can occur in the +program actually mean. For that we define a do_op function which +executes primitive operations:

+
do_op(same, X, X).
+do_op(mul, X, Y, Z) :- Z is X * Y.
+do_op(add, X, Y, Z) :- Z is X + Y.
+do_op(sub, X, Y, Z) :- Z is X - Y.
+do_op(eq, X, Y, Z) :- X == Y -> Z = 1; Z = 0.
+do_op(ge, X, Y, Z) :- X >= Y -> Z = 1; Z = 0.
+do_op(readlist, L, I, X) :- nth0(I, L, X).
+do_op(Op, _, _, _) :- throw(missing_op(Op)).
+
+

Again the last argument is an output variable.

+

Now we can start executing simple operations. For that an interp predicate +is defined. It takes as its first argument the operation to execute and as the +second argument the current environment. E.g. to execute primitive operations +with one or two arguments:

+
interp(op1(ResultVar, Op, Arg, Rest), Env) :-
+    resolve(Arg, Env, RArg),
+    do_op(Op, RArg, Res),
+    write_env(Env, ResultVar, Res, NEnv),
+    interp(Rest, NEnv).
+
+interp(op2(ResultVar, Op, Arg1, Arg2, Rest), Env) :-
+    resolve(Arg1, Env, RArg1),
+    resolve(Arg2, Env, RArg2),
+    do_op(Op, RArg1, RArg2, Res),
+    write_env(Env, ResultVar, Res, NEnv),
+    interp(Rest, NEnv).
+
+

First the arguments are resolved into values. Afterwards the operation is executed, +and the result is written back into the environment. Then interp is called on +the rest of the program. Similarly easy are the unconditional jump and +print_and_stop:

+
interp(jump(L), Env) :-
+    block(L, Block),
+    interp(Block, Env).
+
+
+interp(print_and_stop(Arg), Env) :-
+    resolve(Arg, Env, Val),
+    print(Val), nl.
+
+

In the unconditional jump we simply get the target block and continue executing +that. To execute print_and_stop we resolve the argument, print the value and +then are done.

+

The conditional jump is only slightly more difficult:

+
interp(if(V, L1, L2), Env) :-
+    lookup(V, Env, Val),
+    (Val == 0 ->
+        block(L2, Block)
+    ;
+        block(L1, Block)
+    ),
+    interp(Block, Env).
+
+

First the variable is looked up in the environment. If the variable is zero, +execution continues at the second block, otherwise it continues at the first +block.

+

Given this interpreter, we can execute the above example program like this, on a +Prolog console:

+
$ swipl -s cfglang.pl
+?- block(power, Block), interp(Block, [x/10, y/10]).
+10000000000
+
+

Partial Evaluation of the Flowgraph Language

+

Let's look at what a partial evaluator for this simple flowgraph language would +look like. Partial evaluation (PE), also called specialization, is a program +manipulation technique. PE takes an input program and transforms it into a +(hopefully) simpler and faster output program. It does this by assuming that +some variables in the input program are constants. All operations that act only +on such constants can be folded away. All other operations need to remain in the +output program (called residual program). Thus the partial evaluator proceeds +much like an interpreter, just that it cannot actually execute some operations. +Also, its output is not just a value, but also a list of remaining operations that +could not be optimized away.

+

The partial evaluator cannot use normal environments, because unlike the +interpreter not all variables' values are known to it. It will therefore work on +partial environments, which store just the known variables. For these partial +environments, some new helper functions are needed:

+
plookup(Key, [], var(Key)).
+plookup(Key, [Key/Value | _], const(Value)) :- !.
+plookup(Key, [_ | Rest], Value) :- plookup(Key, Rest, Value).
+
+presolve(const(X), _, const(X)).
+presolve(var(V), PEnv, X) :- plookup(V, PEnv, X).
+
+

The function plookup takes a variable and a partial environment and returns +either const(Value) if the variable is found in the partial environment or +var(Key) if it is not. Analogously, presolve is like resolve, +except that it uses plookup instead of lookup.

+

With these helpers we can start writing a partial evaluator. The following two +rules are where the main optimization in the form of constant folding happens. +The idea is that when the partial evaluator sees an operation that involves +only constant arguments, it can constant-fold the operation, otherwise it +can't:

+
pe(op1(ResultVar, Op, Arg, Rest), PEnv, NewOp) :-
+    presolve(Arg, PEnv, RArg),
+    (RArg = const(C) ->
+        do_op(Op, C, Res),
+        write_env(PEnv, ResultVar, Res, NEnv),
+        RestResidual = NewOp
+    ;
+        remove_env(PEnv, ResultVar, NEnv),
+        NewOp = op1(ResultVar, Op, RArg, RestResidual)
+    ),
+    pe(Rest, NEnv, RestResidual).
+
+pe(op2(ResultVar, Op, Arg1, Arg2, Rest), PEnv, NewOp) :-
+    presolve(Arg1, PEnv, RArg1),
+    presolve(Arg2, PEnv, RArg2),
+    (RArg1 = const(C1), RArg2 = const(C2) ->
+        do_op(Op, C1, C2, Res),
+        write_env(PEnv, ResultVar, Res, NEnv),
+        RestResidual = NewOp
+
+    ;
+        remove_env(PEnv, ResultVar, NEnv),
+        NewOp = op2(ResultVar, Op, RArg1, RArg2, RestResidual)
+    ),
+    pe(Rest, NEnv, RestResidual).
+
+

The pe predicate takes the current operation and a partial environment, and +potentially returns a new operation. To partially evaluate a simple operation, its arguments are +looked up in the partial environment. If all the arguments are constants, the +operation can be executed, and no new operation is produced. Otherwise, we need +to produce a new residual operation which is exactly like the one currently +looked at. Also, the result variable needs to be removed from the partial +environment, because it was just overwritten by an unknown value.

+

The potentially generated residual operation is stored into the output argument +NewOp. The output argument of the recursive call is the last argument of +the newly created residual operation, which will then be filled by the +recursive call. This is a typical approach in Prolog, but may look strange if +you are not familiar with it.

+

Note how the first case of these two rules is just like interpretation. The +second case doesn't really do anything, it just produces a residual operation. +This relationship between normal evaluation and partial evaluation is very +typical.

+

The unconditional jump and print_and_stop are not much more complex:

+
pe(jump(L), PEnv, jump(LR)) :-
+    do_pe(L, PEnv, LR).
+
+pe(print_and_stop(Arg), Env, print_and_stop(RArg)) :-
+    presolve(Arg, Env, RArg).
+
+

To partially evaluate an unconditional jump we again produce a jump. The target +label of that residual jump is computed by asking the partial evaluator to +produce residual code for the label L with the given partial environment. +print_and_stop is simply turned into a print_and_stop. We will see the +code for do_pe soon.

+

Conditional jumps are more interesting:

+
pe(if(V, L1, L2), PEnv, NewOp) :-
+    plookup(V, PEnv, Val),
+    (Val = const(C) ->
+        (C = 0 ->
+            L = L2
+        ;
+            L = L1
+        ),
+        do_pe(L, PEnv, LR),
+        NewOp = jump(LR)
+    ;
+        do_pe(L1, PEnv, L1R),
+        do_pe(L2, PEnv, L2R),
+        NewOp = if(V, L1R, L2R)
+    ).
+
+

First we look up the value of the condition variable. If it is a constant, we +can produce better code, because we know statically that only one path is +reachable. Thus we produce code for that path, and then emit an unconditional +jump there. If the condition variable is not known at partial evaluation time, +we need to partially evaluate both paths and produce a conditional jump in the +residual code.

+

This rule is the one that causes the partial evaluator to potentially do much +more work than the interpreter, because after an if sometimes both paths +need to be explored. In the worst case this process never stops, so a real +partial evaluator would need to ensure somehow that it terminates. There are +many algorithms for doing that, but I will ignore this problem here.

+

Now we need to understand what the do_pe predicate is doing. Its most +important task is to make sure that we don't do the same work twice by +memoizing code that was already partially evaluated in the past. For that it +keeps a mapping of Label, Partial Environment to Label of the residual +code:

+
do_pe(L, PEnv, LR) :-
+    (code_cache(L, PEnv, LR) ->
+        true
+    ;
+        gensym(L, LR),
+        assert(code_cache(L, PEnv, LR)),
+        block(L, Code),
+        pe(Code, PEnv, Residual),
+        assert(block(LR, Residual))
+    ).
+
+

If the code cache indicates that label L was already partially evaluated +with partial environment PEnv, then the previous residual code label +LR +is returned. Otherwise, a new label is generated with gensym, the code cache +is informed of that new label with assert, then the block is partially +evaluated and the residual code is added to the database.

+

For those who know partial evaluation terminology: This partial evaluator is a +polyvariant online partial evaluator. "Polyvariant" means that for every label, +several specialized versions of the block can be generated. "Online" means that +no preprocessing is done before the partial evaluator runs.

+ +

Partial Evaluation Example

+

With this code we can look at the classical example of partial evaluation (it's +probably the "Hello World" of partial evaluation). We +can ask the partial evaluator to compute a power function, where the exponent +y is a fixed number, e.g. 5, and the base x is unknown:

+
?- do_pe(power, [y/5], LR).
+LR = power1.
+
+

To find out which code was produced, we can use listing:

+
?- listing(code_cache)
+code_cache(power, [y/5], power1).
+code_cache(power_rec, [y/5, res/1], power_rec1).
+code_cache(power_rec, [y/4], power_rec2).
+code_cache(power_rec, [y/3], power_rec3).
+code_cache(power_rec, [y/2], power_rec4).
+code_cache(power_rec, [y/1], power_rec5).
+code_cache(power_done, [y/0], power_done1).
+
+?- listing(block)
+.... the block definition of the user program ....
+block(power_done1, print_and_stop(var(res))).
+block(power_rec5, op2(res, mul, var(res), var(x), jump(power_done1))).
+block(power_rec4, op2(res, mul, var(res), var(x), jump(power_rec5))).
+block(power_rec3, op2(res, mul, var(res), var(x), jump(power_rec4))).
+block(power_rec2, op2(res, mul, var(res), var(x), jump(power_rec3))).
+block(power_rec1, op2(res, mul, const(1), var(x), jump(power_rec2))).
+block(power1, jump(power_rec1)).
+
+

The code_cache tells which residual labels correspond to which original +labels under which partial environments. Thus, power1 contains the code of +power under the assumption that y is 5. Looking at the block listing, +the label power1 corresponds to code that simply multiplies res by x +five times without using the variable y at all. The loop that was present +in the original program has been fully unrolled, the loop variable y has +disappeared. Hopefully this is faster than the original program.

+ +

Conclusion

+

In this blog post we saw an interpreter for a simple flow graph language in +Prolog, together with a partial evaluator for it. The partial evaluator +essentially duplicates every rule of the interpreter. If all the arguments of +the current operation are known, it acts like the interpreter, otherwise it +simply copies the operation into the residual code.

+

Partial evaluation can be used for a variety of applications, but the most +commonly cited one is that of applying it to an interpreter. To do that, the +program that the interpreter runs is assumed to be constant by the partial +evaluator. Thus a specialized version of the interpreter is produced that does +not use the input program at all. That residual code can be seen as a compiled +version of the input program.

+

In the next blog post in this series we will look at writing a simple tracer for +the same flowgraph language.

+
+
+
+
+ + 單中杰 wrote on 2012-01-26 16:57: +
+
+

Excellent example and explanation! I look forward to the next installment!

But down with gensym! Instead, you can just let LR=pair(L,PEnv).

+
+
+
+
+ + Armin Rigo wrote on 2012-01-26 17:36: +
+
+

For those not too familiar with Prolog: assert(foo(..)) is not at all like the "assert" of Python or C code. Instead, it adds the rule 'foo(..)' in the database of rules. In other words, it is as if 'foo(..)' was added to the currently running program, as an extra rule.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2012-01-27 10:01: +
+
+

單中杰: Thanks for the compliments.

I really like the idea of getting rid of gensym that way. It had never occurred to me to simply use a non-atomic term as a label, very nice.

+
+
+
+
+ + Anonymous wrote on 2012-01-27 13:29: +
+
+

Very interesting, but I'm a bit confused - what does block(X, Y) do? It isn't defined anywhere.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2012-01-27 13:38: +
+
+

@Anonymous: block(L, O) lists all the labels and operations corresponding to the labels that exist in the user program. See the very beginning of the post. Also, when partial evaluation creates new code it adds new cases to block(L, O), with the statement assert(block(..., ...)).

+
+
+
+ +

PyPy internship at NCAR

+ +
+

Hello, everyone

+

I would like to inform you that there is a very interesting opportunity +for doing an internship at NCAR in the lovely town of Boulder, situated +in the foothills of the Rocky Mountains. Before you read on, make sure you:

+
    +
  • are a student of a US University, who is legally eligible to work in the US
  • +
  • are at least finishing second year this year
  • +
  • apply before February 3rd.
  • +
+

The internship itself will focus on using PyPy (in some way) to provide +a high performance numeric kernel for an atmospheric model, and measuring how +fast we can go. This is very much in line with what the current effort on +NumPy in PyPy is about. The internship will be mentored by Davide del Vento +and I hope to have some influence over where it goes myself :-)

+

A few interesting links:

+ +

Feel free to contact Davide for details about the proposal and pypy-dev or +me directly for details about PyPy.

+

Cheers, +fijal

+
+
+
+
+ + Rahul wrote on 2012-01-16 05:03: +
+
+

It looks like a good opportunity for a student. You can also post it on https://jobs.pythonweekly.com/

+
+
+
+
+ + Cameron Sparr wrote on 2012-02-01 22:56: +
+
+

I've applied for the internship already but was hoping to get some more details so I could make some last-minute edits to my application! Do you have Davide Del Vento's contact info?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-02-02 08:34: +
+
+

send me a mail

+
+
+
+ +

Transactional Memory (II)

+ +
+

Here is an update about the previous blog post about the +Global Interpreter Lock (GIL). In 5 months, the point of view +changed quite a bit.

+

Let me remind you that the GIL is the technique used in both CPython and +PyPy to safely run multi-threaded programs: it is a global lock that +prevents multiple threads from actually running at the same time. The +reason to do that is that it would have disastrous effects in the +interpreter if several threads access the same object concurrently --- to +the point that in CPython even just manipulating the object's reference +counter needs to be protected by the lock.

+

So far, the ultimate goal to enable true multi-CPU usage has been to +remove the infamous GIL from the interpreter, so that multiple threads +could actually run in parallel. It's a lot of work, but this has been +done in Jython. The reason that it has not been done in CPython so far +is that it's even more work: we would need to care not only about +carefully adding fine-grained locks everywhere, but also about reference +counting; and there are a lot more C extension modules that would need +care, too. And we don't have locking primitives as performant as +Java's, which have been hand-tuned for ages (e.g. to use help from the +JIT compiler).

+

But we think we have a plan to implement a different model for using +multiple cores. Believe it or not, this is better than just removing +the GIL from PyPy. You might get to use all your cores without ever +writing threads.

+

You would instead just use some event dispatcher, say from Twisted, from +Stackless, or from your favorite GUI; or just write your own. From +there, you (or someone else) would add some minimal extra code to the +event dispatcher's source code, to exploit the new transactional features +offered by PyPy. Then you would run your program on a +special version of PyPy, and voilà: you get some form of automatic parallelization. +Sounds magic, but the basic idea is simple: start handling multiple +events in parallel, giving each one its own transaction. More about +it later.

+ +

Threads or Events?

+

First, why would this be better than "just" removing the GIL? Because +using threads can be a mess in any complex program. Some authors (e.g. +Lee) have argued that the reason is that threads are fundamentally +non-deterministic. This makes it very hard to reason about them. +Basically the programmer needs to "trim" down the non-determinism (e.g. +by adding locks, semaphores, etc.), and it's hard to be sure when he's +got a sufficiently deterministic result, if only because he can't write +exhaustive tests for it.

+

By contrast, consider a Twisted program. It's not a multi-threaded +program, which means that it handles the "events" one after the other. +The exact ordering of the events is not really deterministic, because +they often correspond to external events; but that's the only source of +non-determinism. The actual handling of each event occurs in a nicely +deterministic way, and most importantly, not in parallel with the +handling of other events. The same is true about other libraries like +GUI toolkits, gevent, or Stackless.

+

(Of course the Twisted and the Stackless models, to cite only these two, +are quite different from each other; but they have in common the fact +that they are not multi-threaded, and based instead on "events" --- +which in the Stackless case means running a tasklet from one switch() +point to the next one.)

+

These two models --- threads or events --- are the two main models we +have right now. The latter is more used in Python, because it is much +simpler to use than the former, and the former doesn't give any benefit +because of the GIL. A third model, which is the only one that gives +multi-core benefits, is to use multiple processes, and do inter-process +communication.

+ +

The problem

+

Consider the case of a big program that has arbitrarily complicated +dependencies. Even assuming a GIL-less Python, this is likely enough to +prevent the programmer from even starting a multi-threaded rewrite, +because it would require a huge mess of locks. He could also consider +using multiple processes instead, but the result is annoying as well: +the complicated dependencies translate into a huge mess of inter-process +synchronization.

+

The problem can also be down-sized to very small programs, like the kind +of hacks that you do and forget about. In this case, the dependencies +might be simpler, but you still have to learn and use subtle locking +patterns or a complex inter-process library, which is overkill for the +purpose.

+

(This is similar to how explicit memory management is not very hard for +small programs --- but still, nowadays a lot of people agree that +automatic memory management is easier for programs of all sizes. I +think the same will eventually be true for using multiple CPUs, but the +correct solution will take time to mature, like garbage collectors did. +This post is a step in hopefully the right direction :-))

+ +

Events in Transactions

+

Let me introduce the notion of independent events: two events are +independent if they don't touch the same set of objects. In a multi-threaded +world, it means that they can be executed in parallel without needing any lock +to ensure correctness.

+

Events might also be mostly independent, i.e. they rarely access the same +object concurrently. Of course, in a multi-threaded world we would still need +locks to ensure correctness, but the point is that the locks are rarely causing +pauses: lock contention is low.

+

Consider again the Twisted example I gave above. There are often several +events pending in the dispatch queue (assuming the program is using 100% +of our single usable CPU, otherwise the whole discussion is moot). The case I am +interested in is the case in which these events are generally mostly +independent, i.e. we expect few conflicts between them. However +they don't have to be proved independent. In fact it is fine if +they have arbitrarily complicated dependencies as described above. The +point is the expected common case. Imagine that you have a GIL-less +Python and that you can, by a wave of your hand, have all the careful +locking mess magically done. Then what I mean here is the case in which +such a theoretical program would run mostly in parallel on multiple +cores, without waiting too often on the locks.

+

In this case, the solution I'm proposing is that with minimal tweaks +in the event dispatch loop, we can +handle multiple events on multiple threads, each in its own transaction. +A transaction is basically a tentative execution of the corresponding +piece of code: if we detect conflicts with other concurrently executing +transactions, we abort the whole transaction and restart it from +scratch.

+

By now, the fact that it can basically work should be clear: multiple +transactions will only get into conflict when modifying the same data +structures, which is the case where the magical wand above would have +put locks. If the magical program could progress without too many +locks, then the transactional program can progress without too many +conflicts. In a way, you get even more than what the magical program +can give you: each event is dispatched in its own transaction, which +means that from each event's point of view, we have the illusion that +nobody else is running concurrently. This is exactly what all existing +Twisted-/Stackless-/etc.-based programs are assuming.

+

Note that this solution, without transactions, already exists in some +other languages: for example, Erlang is all about independent events. +This is the simple case where we can just run them on multiple cores, +knowing by construction of the language that you can't get conflicts. +Of course, it doesn't work for Python or for a lot of other languages. +From that point of view, what I'm suggesting is merely that +transactional memory could be a good model to cope with the risks of +conflicts that come from not having a special-made language.

+ +

Not a perfect solution

+

Of course, transactional memory +(TM) is not a perfect solution either. Right now, the biggest issue is +the performance hit that comes from the software implementation (STM). +In time, hardware support (HTM) is likely to show up and help +mitigate the problem; but I won't deny the fact that in some cases, +because it's simple enough and/or because you really need the top +performance, TM is not the best solution.

+

Also, the explanations above are silent on what is a hard point for TM, +namely system calls. The basic general solution is to suspend other +transactions as soon as a transaction does its first system call, so +that we are sure that the transaction will succeed. Of course this +solution is far from optimal. Interestingly, it's possible to do better +on a case-by-case basis: for example, by adding in-process buffers, we +can improve the situation for sockets, by having recv() store in a +buffer what is received so that it can be re-recv()-ed later if the +transaction is aborted; similarly, send() or writes to log files can be +delayed until we are sure that the transaction will commit.

+

From my point of view, the most important point is that the TM solution +comes from the correct side of the "determinism" scale. With threads, +you have to prune down non-determinism. With TM, you start from a +mostly deterministic point, and if needed, you add non-determinism. The +reason you would want to do so is to make the transactions shorter: +shorter transactions have less risk of conflicts, and when there are +conflicts, fewer things to redo. So making transactions shorter +increases the parallelism that your program can achieve, while at the +same time requiring more care.

+

In terms of an event-driven model, the equivalent would be to divide the +response of a big processing event into several events that are handled +one after the other: for example, the first event sets things up and fires the second +event, which does the actual computation; and afterwards a third event +writes the results back. As a result, the second event's transaction +has little risk of getting aborted. On the other hand, the writing +back needs to be aware of the fact that it's not in the same transaction +as the original setting up, which means that other unrelated +transactions may have run in-between.

+ +

One step towards the future?

+

These, and others, are the problems of the TM approach. They are "new" +problems, too, in the sense that the existing ways of programming don't +have these problems.

+

Still, as you have guessed, I think that it is overall a win, and +possibly a big win --- a win that might be on the same scale for the age +of multiple CPUs as automatic garbage collection was 20 years ago for +the age of RAM size explosion.

+

Stay tuned for more!

+

--- Armin (and reviews by Antonio and Fijal)

+ +
UPDATE: please look at the tiny transaction module I wrote as an example. The idea is to have the same interface as this module, but implemented differently. By making use of transactional memory internally, it should be possible to safely run on multiple CPUs while keeping the very same programmer interface. +
+
+
+
+ + Unknown wrote on 2012-01-14 15:17: +
+
+

Great article, great solution to a big problem...

I am really looking forward to this :-)

As an experiment I have developed Pyworks, which makes objects concurrent and methods asynchronious. But it makes little sense to do performance test on an multicore CPU because of the GIL.

The code for Pyworks can be found at https://bitbucket.org/raindog/pyworks

+
+
+
+
+ + Anonymous wrote on 2012-01-14 15:38: +
+
+

> These two models --- threads or events --- are the two main models we have right now.

Where does Go-style concurrency fit in?

+
+
+
+
+ + gasche wrote on 2012-01-14 16:50: +
+
+

If you go that road, you will certainly find out that Transactional Memory is much, much harder to get right than it looks like in today effectful/imperative languages. Sure, it looks wonderful on paper, but if your language doesn't help you control side-effects it will give you a very hard time.

Currently, there is satisfying STM support in Haskell (because of its tight type-based control of side-effects) and Clojure (because of its tight control on mutability), and it might be getting into Scala.

I doubt Python can easily get such control, at least without an important reorganization of idiomatic practices and frameworks, that go beyond the "let's be event-driven" decision. Which makes your "this is going to work magically" story a bit hard to believe.

There has been intense research on this topic for some decades now, and several attempts at getting it to work in current mainstream languages have mostly failed.

See for example this long retrospective of the STM.NET effort at Microsoft Research, by Joe Duffy:
A (brief) retrospective on transactional memory
or this shorter blog post by Brian Hurt:
The problem with STM: your languages still suck.

I was a bit disappointed that you didn't cite any of the relevant literature in your post. It made me suspicious of "reinventing the wheel"...

+
+
+
+
+ + Anonymous wrote on 2012-01-14 16:57: +
+
+

One major use-case for multithreading involves a large, unchanging data structure which many threads access. I.e., the data structure is loaded by a parent task, then not modified again; a number of threads are then spawned to use it for calculations.

In CPython, the GIL makes this impossible if only because the reference counters need to be protected. With Cython in threads, however, you can turn off the GIL and do some work on C-style data structures.

I'm wondering whether the STM PyPy effort could have a very useful, and very early, benefit: simply enabling an unchanging data structure to be accessed by a number of processors via the kinds of events you describe. There wouldn't be a need for transactions, because the programmer would take responsibility for only sharing unchanging structures between simultaneously-executing events.

But it seems like the basic requirements for this kind of facility might be met in in early stage of STM development. And a solution that allowed multiple processors to access large, unchanging structures would be very useful in certain applications. I know I have one in mind that I'm looking at CPython/Cython for, but I'd rather see if I could get the performance I need from PyPy.

Just thought it was worth mentioning.

+
+
+
+
+ + Armin Rigo wrote on 2012-01-14 19:27: +
+
+

@Anonymous: in the extract you cite I meant "the two main models in Python". As far as I can tell, Go does concurrency by enforcing all communications to go via channels, so I would classify it as a "special-made" language. This solution might be nice and usable, but it does not really work at all in languages like Python.

+
+
+
+
+ + Daniel Waterworth wrote on 2012-01-14 20:27: +
+
+

@Armin, CSP may be built into Go, but IMO this was a mistake, there is no requirement for it to be a language feature; it fits nicer as library. See [python-csp] for a python implementation.

[python-csp] https://code.google.com/p/python-csp/wiki/Tutorial

+
+
+
+
+ + Armin Rigo wrote on 2012-01-14 21:11: +
+
+

@gasche: I know about Haskell, Clojure and Scala, and I just read the two blog posts you pointed to.

I'm not talking about giving explicit TM to the end programmers. I'm instead considering TM as an internal, implementation-only feature. That makes it very similar to GCs.

I know the points and issues of traditional TM systems, which are nicely reported by Joe Duffy in "A (brief) retrospective on transactional memory". These are of course perfectly valid issues, but I think they do not apply (or "not that much") in the particular context I'm talking about. For example, this includes the large sections about nested transactions, and about consistency between the transactional and non-transactional worlds (Weak or Strong Atomicity, The Privatization Problem). Even "Where is the Killer App?" is obvious in this case: any existing Twisted App is potentially a Killer App.

Sorry for not including references to papers. I must admit I don't know any paper that describes a similar use case for TM.

+
+
+
+
+ + Simon Weber wrote on 2012-01-14 21:45: +
+
+

The link to the previous blog post is broken. It should be: https://morepypy.blogspot.com/2011/06/global-interpreter-lock-or-how-to-kill.html

+
+
+
+
+ + Anonymous wrote on 2012-01-15 07:24: +
+
+

> @Armin, CSP may be built into Go, but IMO this was a mistake, there is no requirement for it to be a language feature; it fits nicer as library. See [python-csp] for a python implementation.

Stackless (which PyPy enables) supports Go-style channels as well, no?

https://www.stackless.com/wiki/Channels

+
+
+
+
+ + René Dudfield wrote on 2012-01-15 08:03: +
+
+

Your idea could work for other easy to inject into points, such as loops, and comprehensions. Especially with much of the work in pypy already done for identifying information about loops.

How does this compare to grand central dispatch and blocks? https://en.wikipedia.org/wiki/Grand_Central_Dispatch

Events are a very good way to model concurrency, and are widely used. It is a great place to dispatch concurrency into parallelism.

Closures/blocks provide a fairly decent way to get some of the protection of STM - and in many programs give you the 80% solution. For code that plays nicely and avoids mutable, or global data - this works. Luckily, a lot of event based code is already written in this way. As you say, they are "generally mostly independent".

Making the bad cases a quick fail, like in JavaScript worker threads could be an ok option. As soon as someone tries to access global data(do a system call, access the DOM, or access data outside the closure even), the program would fail there. Then you could fix those cases, or "add non-determinism" as you say. I think I'd prefer fail fast here, rather than have to detect these problems, and have them silently pass by.

You still have scheduling problems, and trying to figure out task size. As well, this does not solve lots of other problems. However, it is cool that it could be applied automatically, and probably 'safely'.

Another random thought... you could probably mark chunks of code as 'pure' as your run through them, and if they do a system call or mutate global data mark them as 'unpure' and don't try them again.

I very much look forward to reading your results as you implement more.

+
+
+
+
+ + Eric van Riet Paap wrote on 2012-01-15 08:56: +
+
+

When Armin gets this excited I'd fasten my seatbelt and put my goggles on.

Thank you for letting me be an (otherwise mostly silent) observer.

Please keep shifting boundaries!

- Eric

+
+
+
+
+ + Armin Rigo wrote on 2012-01-16 10:08: +
+
+

Update: please look at the tiny transaction module I wrote as an example. The idea is to have the same interface as this module, but implemented differently. By making use of transactional memory internally, it should be possible to safely run on multiple CPUs while keeping the very same programmer interface.

https://bitbucket.org/arigo/arigo/raw/default/hack/stm/transactionmodule/

+
+
+
+
+ + René Dudfield wrote on 2012-01-16 12:11: +
+
+

@Armin: That transaction code looks very simple. It seems trivial to implement a map/mapReduce style function on top of your transaction module.

It is a very similar API to worker pool APIs which many thread using programs use. The main difference is that you combine the join() in the run method. It seems that a threaded web server for example could use this? What would happen if each incoming request comes in, and is put into the transaction (and say the 10th request has an error)? Would it be better to use multiple transactions?

Have you thought how thread local storage would work?

+
+
+
+
+ + Armin Rigo wrote on 2012-01-16 12:55: +
+
+

@notme: yes, a web server or anything can use this instead of using threads. It's of course missing a convincing select() or poll() version for that.

The details haven't been thought out; right now an exception interrupts everything. In an STM model it's unclear if concurrent transactions should still be allowed to complete or not. Anyway the point is that exceptions should not really occur because precisely they interrupt everything --- you would typically add instead in every transaction code like "try: .. except: traceback.print_exc()".

Thread local storage: what would be the point?

+
+
+
+
+ + Unknown wrote on 2012-01-18 10:06: +
+
+

I also see no reason for Thread local memory.

I like the idea of thinking about TM in the same line as GC. When you have GC the changes to the language is that you don't need to write free/dealloc.

Having TM would mean that you don't have to write acquire_GIL

+
+
+
+
+ + headius wrote on 2012-01-24 04:22: +
+
+

The devil's in the details.

I'm not sure I buy your conclusions here. STM is not a panacea for solving concurrency issues, and it has some key limitations that limit its general applicability.

On what granularity do you plan to have transactions? How do you know? Perhaps the VM will have enough knowledge of a given thread's activities to limit transactional overhead to only those structures in memory that are shared, but there still needs to be some indirection in case another thread hops in and starts making changes.

Where do transactions start and end? In STMs I know, the in-transaction overhead for reading and writing data is *much* higher, since it needs to know if someone else has committed a transaction first and be able to roll back.

Perhaps this is all intended to be hidden, and you never actually have "threads" that the user can see. But if you're going to parallelize, you'll have threads *somewhere* that are going to contend for resources. If they're going to contend for resources, even in an STM, they're going to have to check for contention, register their interest, and then you're back to the indirection overhead.

Perhaps I'm not understand what your end goal is. You can't simply turn the world into a series of transactions unless you want every read and write to have transaction overhead or you have some clear way of limiting transaction overhead to only where it's needed. You cite Erlang...but Erlang deals with immutable objects, and there's far less need for anything like an STM. Others have mentioned Clojure...but again, Clojure is mostly immutable structures, and transactional overhead is limited to Refs, where you'll make single coarse-grained reads and writes.

Am I missing the point? Are you not suggesting VM-wide STM, with the resulting transactional overhead for every read and write?

+
+
+
+
+ + Armin Rigo wrote on 2012-01-24 10:03: +
+
+

@Charles: Indeed, I am suggesting VM-wide STM, with the resulting transactional overhead for every read and write. I actually got such a VM yesterday (with no GC): it seems to be about 10x slower on a single thread.

Note that even 10x slower is a plus if it scales to dozens of processors. But of course, a better point of view is that some years ago the regular pypy *was* 10x slower than CPython. It took a lot of effort but we managed to make it only 1.5-2x slower. And this is all without counting the JIT. If STM boils down to a generally-not-triggered read barrier before every read, then the performance impact could be well under 2x.

Please note also that I don't care about Java-like performance where even losing 10% of performance would be a disaster. If we end up with a pypy-tm that is 2x slower than a regular pypy, I would be quite happy, and I believe that there is a non-negligible fraction of the Python users that would be, too.

On granularity: for now I'm going with the idea that the granularity is defined "naturally" in the source program as the amount of work done every time some central dispatch loop calls some code. There might be several dispatch loops in total, too. This is true in the cases I can think of: typical Twisted or Stackless programs, pypy's "translate.py", the richards benchmark, etc.

Please look at https://paste.pocoo.org/show/539822/ for an example of what I'm talking about. It's a diff against the standard richards.py: it is a pure Python user program in which I added calls to the new 'transaction' module. At this level there is no hint of Transactional Memory.

+
+
+
+
+ + Armin Rigo wrote on 2012-01-31 17:13: +
+
+

@Gary Robinson: (off-topic:) for this kind of use case, you can use os.fork() after the immutable data is ready. It "kind of works" both in pypy and in cpython, although not really --- in cpython the reference counts are modified, causing the pages to get unshared between processes; and in pypy the garbage collector (GC) has the same effect, so far. It could be solved in pypy by more tweaks to the GC.

+
+
+
+
+ + Anonymous wrote on 2012-02-01 18:43: +
+
+

@armin: @Anonymous: in the extract you cite I meant "the two main models in Python". As far as I can tell, Go does concurrency by enforcing all communications to go via channels, so I would classify it as a "special-made" language. This solution might be nice and usable, but it does not really work at all in languages like Python.

Armin, Stackless Python uses a model that at the API level is very similar to Go. Go borrows from the Bell Labs family of languages (i.e. Newsqueak). The fundamental idea is that message pasing is used to share information between threads/processes/coroutines. In this regard, Go is in the same camp as say, Erlang (although the messaging systems are different).


What I think is interesting and workable for Python are efforts in languages like Polyphonic C# (see the paper "Scalable Join Patterns") and Concurrent/Parallel ML, where lock-free libraries and STM techniques are used under the hood to improve the efficiency of the messaging/synchronisation system. In this fashion, the programmer has a conceptually clean concurrency model and still can make the important decisions about how to partition the problem.

Cheers,
Andrew

+
+
+
+
+ + Anonymous wrote on 2012-02-01 18:59: +
+
+

@daniel@Armin, CSP may be built into Go, but IMO this was a mistake, there is no requirement for it to be a language feature; it fits nicer as library. See [python-csp] for a python library

I have looked at Python-CSP a long time ago. I recall it being verbose. However I use Stackless Python. And using PyPy's stackless.py, I implemented select() and join patterns. Sometimes I wish I had language support: they cut down on silly mistakes and make the code less verbose for simple cases. However what I have found is that the language can get in the way. For instance, in Go, one has to come up with hacks to do some simple like do a select on an arbitrary number of channels. Perhaps I am wrong but I suspect stuff like select()'s design was influenced by the fact Newsqueak was originally designed to make a windowing system easier to write. So one is monitoring only a handful of channels. In constrast, this is not the way Stackless Python programmes are written.

Cheers,
Andrew

+
+
+
+
+ + Armin Rigo wrote on 2012-02-01 20:39: +
+
+

A link to a group that did the same thing (thanks a lot Andrew for this link!):

https://research.microsoft.com/en-us/projects/ame/

In particular the May 2007 paper (HotOS) nicely summarizes exactly what I'm trying to say, and I think it is clearer than me, if I have to judge from feedback :-)

+
+
+
+
+ + Anonymous wrote on 2012-02-27 17:57: +
+
+

Speaking as someone maintaining a large application that uses Twisted, this sounds great.

+
+
+
+ +

NumPyPy progress report - running benchmarks

+ +
+

Hello.

+

We're excited to let you know about some of the great progress we've made on +NumPyPy: both completeness and performance. In this blog entry we mostly +will talk about performance and how much progress we have made so far. +

+

Word of warning: this +work is in progress -- we're maybe half way to where we want to be and there are +many trivial and not so trivial optimizations to be written. (For example, we +haven't even started to implement important optimizations, like vectorization.)

+
+

Benchmark

+

We chose a laplace equation solver, based on SciPy's PerformancePython wiki. +Unfortunately, the different implementations on the wiki page accidentally use +two different algorithms, which have different convergences, and very different +performance characteristics on modern computers. As a result, we implemented +our own versions in both C and Python (with and without NumPy). The full source +can be found in fijal's hack repo, all these benchmarks were performed at +revision 18502dbbcdb3.

+

First, let me describe various algorithms used. Note that some of them contain +PyPy-specific hacks to work around limitations in the current implementation. +These hacks will go away eventually and the performance will improve. +Numerically the algorithms used are identical, however exact data layout in +memory differs between them.

+

A note about all the benchmarks: they each were run once, but the +performance is very stable across runs.

+

Starting with the C version, it implements a trivial laplace equation solver +using two loops and double-reference memory (array of int*). The double +reference does not matter for performance and the two algorithms are +implemented in inline-laplace.c and laplace.c. They were both compiled +with gcc 4.4.5 at -O3. The inline version modifies the array in-place while the non-inline version stores results in a copy. That makes them converge at different rates, hence the different number of iterations.

+

A straightforward version of those in Python is implemented in laplace.py +using, respectively, inline_slow_time_step and slow_time_step. +slow_2_time_step does the same thing, except it copies arrays in-place +instead of creating new copies. Table below compares running PyPy against C:

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
benchnumber of iterationstime per iteration
laplace C2196.3ms
inline-laplace C27820ms
slow python21917ms
slow 2 python21914ms
inline_slow python27823.7ms
+

An important thing to notice is that the data dependency of the inline +version causes a huge slowdown for the C versions. This is not a severe +disadvantage for us though -- the brain-dead Python version takes longer +and PyPy is not able to take advantage of the knowledge that the data is +independent. The results are in the same ballpark as the C versions -- +15% - 170% slower, but the algorithm +one chooses matters more than the language. By comparison, the slow versions +take about 5.75s each on CPython 2.6 per iteration and, by estimation, +are about 200x slower than the PyPy equivalent, if I had the patience to +measure the full run.

+

The next step is to use NumPy expressions. The first problem we run into is +that computing the error requires walking the entire array a second time. This +is fairly inefficient in terms of cache access, so I took the liberty of +computing the errors every 15 steps. This results in the convergence being +rounded to the nearest 15 iterations, but speeds things up considerably. +numeric_time_step takes the most braindead approach of replacing the array +with itself, like this:

+
+u[1:-1, 1:-1] = ((u[0:-2, 1:-1] + u[2:, 1:-1])*dy2 +
+                       (u[1:-1,0:-2] + u[1:-1, 2:])*dx2)*dnr_inv
+
+

We need 3 arrays here -- one is an intermediate (PyPy only needs one, for all of +those subexpressions), one is a copy for computing the error, and one is the +result. This works automatically because in NumPy + or * creates an +intermediate, while NumPyPy avoids allocating the intermediate if possible.

+

numeric_2_time_step works in pretty much the same way:

+
+src = self.u
+self.u = src.copy()
+self.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +
+                      (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv
+
+

except the copy is now explicit rather than implicit.

+

numeric_3_time_step does the same thing, but notice that one doesn't have to copy +the entire array; it's enough to copy the border pieces and fill the rest with +zeros:

+
+src = self.u
+self.u = numpy.zeros((self.nx, self.ny), 'd')
+self.u[0] = src[0]
+self.u[-1] = src[-1]
+self.u[:, 0] = src[:, 0]
+self.u[:, -1] = src[:, -1]
+self.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +
+                      (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv
+
+

numeric_4_time_step is the one that tries hardest to resemble the C version. +Instead of doing an array copy, it actually notices that one can alternate +between two arrays. This is exactly what the C version does. The +remove_invalidates call is a PyPy specific hack - we hope to remove this +call in the near future, but, in short, it promises "I don't have any unbuilt +intermediates that depend on the value of the argument", which means one doesn't +have to compute sub-expressions one is not actually using:

+
+remove_invalidates(self.old_u)
+remove_invalidates(self.u)
+self.old_u[:,:] = self.u
+src = self.old_u
+self.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +
+                      (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv
+
+

This one is the most comparable to the C version.

+

numeric_5_time_step does the same thing, but notices one doesn't have to copy +the entire array, it's enough to just copy the edges. This is an optimization +that was not done in the C version:

+
+remove_invalidates(self.old_u)
+remove_invalidates(self.u)
+src = self.u
+self.old_u, self.u = self.u, self.old_u
+self.u[0] = src[0]
+self.u[-1] = src[-1]
+self.u[:, 0] = src[:, 0]
+self.u[:, -1] = src[:, -1]
+self.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +
+                      (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv
+
+

Let's look at the table of runs. As before, gcc 4.4.5, compiled at -O3, +and PyPy nightly 7bb8b38d8563, on an x86-64 machine. All of the numeric methods +run for 226 steps, slightly more than the 219, rounding to the next 15 when the +error is computed.

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
benchmarkPyPyCPython
numeric21ms35ms
numeric 214ms37ms
numeric 313ms29ms
numeric 411ms31ms
numeric 59.3ms21ms
+

We think that these preliminary results are pretty good. They're not as fast as +the C version (or as fast as we'd like them to be), but we're already much +faster than NumPy on CPython -- almost always by more than 2x on this relatively +real-world example. This is not the end, though. In fact, it's hardly the +beginning! As we continue work, we hope to make even more use of the +high level information that we have. Looking at the assembler generated by +gcc for this example, it's pretty clear we can outperform it thanks to better +aliasing information and hence better possibilities for vectorization. +Stay tuned.

+

EDIT: fixed the benchmark name +

+

EDIT2: added info that first table is about PyPy

+

Cheers, +fijal

+
+
+
+
+
+ + D wrote on 2012-01-10 20:24: +
+
+

Nice to hear, but what we (numpy users) really need is 2-dimensional matrices with basic arithmetic operations (+, -, /, *, sin, cos, pow etc) and other related methods, e.g. min(array,axis), nanmax(array, axis), argmax(array,axis), nanargmin(array, axis) etc. While CPython soft dependent on these operations works more or less fast, with PyPy it mere doesn't work at all. I hope first of all you'll focus on it instead of speed improvement for single-dimensional arrays.
Regards, D.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-01-10 20:27: +
+
+

It would be really cool if you try before complaining. I think all of it works on a nightly build, except the axis argument which is on a branch being worked on.

+
+
+
+
+ + D wrote on 2012-01-10 20:28: +
+
+

Also, IIRC NumPyPy still misses linalg.solve method for solving systems of linear equations, that is highly important for lots of soft. Connecting sparse SLE solver (like umfpack or superlu from scipy.sparse) also would be very essential.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-01-10 20:30: +
+
+

We're working on it. Stay tuned

+
+
+
+
+ + D wrote on 2012-01-10 20:32: +
+
+

Maciej, anything about 2-dimensional matrix implementations with related operations haven't been mentioned in blog, so why I have to know about it? I only installed and tried stable PyPy 1.7, because I had tried building PyPy from sources and found it damned hard, especially for my limited hardware (2 GB RAM).

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-01-10 20:33: +
+
+

Good point, we'll write a blog post what has been implemented as well. Try nightly

+
+
+
+
+ + Adam wrote on 2012-01-10 21:02: +
+
+

A Laplace transform is something quite different to solving Laplace's equation with finite differences...

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-01-10 21:07: +
+
+

fixed, thanks

+
+
+
+
+ + Anonymous wrote on 2012-01-10 21:13: +
+
+

It may be nice to link to the nightly builds so that people can try this out :)

+
+
+
+
+ + Chris LeBlanc wrote on 2012-01-10 23:12: +
+
+

This is excellent! Great work, the potential of this project is very exciting. I was quietly wishing for this since pypy first started.

I use NumPy all the time, and any increase in performance makes a big difference. This is one of the main advantages of NumPyPy over NumPy, so it makes sense to focus on it.

There seems to be lots of complaining about missing features and such, but having a solid foundation to work from seems to be the most important thing. Missing features can be added down the line.

I remember reading a blog post last year about using transactional memory as a way of removing the GIL. If you could combine that with NumPyPy to run numerical tasks in parallel, that would make a lot of scientific programmers very happy. I don't know if this is feasible, but it sure would be nice.

Keep up the good work.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-01-10 23:18: +
+
+

Hi Chris.

We have vague plans how to parallelize numpy expressions without even having to remove the GIL. That way you'll have workers that are able to perform (or help perform) numeric tasks, but the interpreter itself will still run in a single thread. The same goes for GPUs and MIC.

+
+
+
+
+ + Anonymous wrote on 2012-01-11 10:55: +
+
+

Nightly builds
https://buildbot.pypy.org/nightly/trunk

+
+
+
+
+ + Anonymous wrote on 2012-01-11 13:33: +
+
+

Please when you consider parallelizing things, do remember about leaving an explicit switch to turn it off!

I run my Python stuff on clusters through a queuing system and it will be VERY unhappy if single processes use more than one thread without informing the scheduler.

+
+
+
+
+ + Anonymous wrote on 2012-01-11 13:34: +
+
+

Hey, by the way, your progress on NumPy is amazing and highly appreciated.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-01-11 15:31: +
+
+

@Anonymous of course, this is a given that we'll leave the switch to turn it off. It might be not even on by default, that's up for discussion

+
+
+
+
+ + Paul Harrison wrote on 2012-01-12 02:42: +
+
+

Chris, if you haven't considered this already, it's sometimes possible to achieve parallelism with multiple processes using memory mapped files as numpy arrays. It's a bit awkward, but it can also make for an easier path to a computation that is resumable or can be run on a cluster.

GIL removal would be wonderful, but it's a pretty ambitious idea. Then again, these pypy folk seem able to deliver on some pretty amazing stuff.

+
+
+
+
+ + Peter S wrote on 2012-01-16 10:33: +
+
+

I am closely following these developments with numpypy and I just succesfully tested the last nightly build, which I find very impressive!

For research purposes, the main thing we need is scipy.stats.ttest_1samp to work on pypy. Is there an estimation on when scipypy will be available?

+
+
+
+ +

Leysin Winter Sprint

+ +
+

PyPy Leysin Winter Sprint: 15-22nd January 2012

+ +

+The next PyPy sprint will be in Leysin, Switzerland, for the +eighth time. This is a fully public sprint: newcomers and topics +other than those proposed below are welcome.

+ +

Goals and topics of the sprint

+ +
    +
  • Py3k: work towards supporting Python 3 in PyPy + +
  • +
  • NumPyPy: work towards supporting the numpy module in PyPy + +
  • +
  • JIT backends: integrate tests for ARM; look at the PowerPC 64; + maybe try again to write an LLVM- or GCC-based one + +
  • +
  • STM and STM-related topics; or the Concurrent Mark-n-Sweep GC + +
  • +
  • And as usual, the main side goal is to have fun in winter sports :-) + We can take a day off for ski. +
  • +
+

Exact times

+ +

The work days should be 15-21 January 2012 (Sunday-Saturday). The +official plans are for people to arrive on the 14th or the 15th, and to +leave on the 22nd.

+ +

Interested? Read more...

+
+
+
+
+ + Anonymous wrote on 2011-12-28 01:30: +
+
+

How is the STM work going, btw?

Do you have any indications yet on whether it'll be workable in an imperative VM?

+
+
+
+
+ + Anonymous wrote on 2012-01-02 11:49: +
+
+

any news on the win64 port?

+
+
+
+
+ + Klaus Ramelow wrote on 2012-01-07 12:56: +
+
+

Leysin Winter Sprint
Exact times

The work days should be 15-21 January 2011 (Sunday-Saturday).

I assume it will be January 2012

+
+
+
+
+ + Armin Rigo wrote on 2012-01-09 19:54: +
+
+

STM work is slowly progressing, as you must have noticed in pypy-dev.

The Win64 port's progress is unknown, sorry.

+
+
+
+ +

Come see us at PyCon 2012

+ +
+

PyCon 2012 is coming up in just a few short months, and PyPy will be well
+represented there. We'll be delivering a tutorial, two talks, plus we'll be
+around for the sprints.

+

Here are the abstracts for the tutorials and talks:

+
    +
  • +How to get the most out of your PyPy, by Maciej Fijalkowski, Alex Gaynor
    +and Armin Rigo: For many applications PyPy can provide performance benefits
    +right out of the box. However, little details can push your application to
    +perform much better. In this tutorial we'll give you insights on how to push
    +PyPy to its limits. We'll focus on understanding the performance
    +characteristics of PyPy, and learning the analysis tools in order to maximize
    +your applications' performance. This is the tutorial. +
  • +
  • +Why PyPy by example, by Maciej Fijalkowski, Alex Gaynor and Armin Rigo:
    +One of the goals of PyPy is to make existing Python code faster; however an
    +even broader goal was to make it possible to write things in Python that
    +previously would have needed to be written in C or another low-level language. This
    +talk will show examples of this, and describe how they represent the
    +tremendous progress PyPy has made, and what it means for people looking at
    +using PyPy.
  • +
  • +How the PyPy JIT works, by Benjamin Peterson: The Python community is
    +abuzz about the major speed gains PyPy can offer for pure Python code. But how
    +does the PyPy JIT actually work? This talk will discuss how the PyPy JIT is
    +implemented. It will include descriptions of the tracing, optimization, and
    +assembly generation phases. I will demonstrate each step with an example loop.
  • +
+

If you have any questions let us know! We look forward to seeing people at
+PyCon and chatting about PyPy and the entire Python ecosystem.

+

See you there,
+Maciej Fijalkowski, Alex Gaynor, Benjamin Peterson, Armin Rigo, and the entire PyPy team

+
+

Plotting using matplotlib from PyPy

+ +
+

Big fat warning This is just a proof of concept. It barely works. There are +missing pieces left and right, which were replaced with hacks so I can get this +to run and prove it's possible. Don't try this at home, especially your home. +You have been warned.

+

There has been a lot of talking about PyPy not integrating well with the +current scientific Python ecosystem, and numpypy (a NumPy reimplementation +on top of pypy) was dubbed "a fancy array library". I'm going to show that +integration with this ecosystem is possible with our design.

+

First, the demo:

+
+#!/usr/bin/env pypy
+
+# numpy, pypy version
+import numpypy as numpy
+# DRAGONS LIVE THERE (fortunately hidden)
+from embed.emb import import_mod
+
+pylab = import_mod('matplotlib.pylab')
+
+if __name__ == '__main__':
+    a = numpy.arange(100, dtype=int)
+    b = numpy.sin(a)
+    pylab.plot(a, b)
+    pylab.show()
+
+

And you get:

+ + + +

Now, how to reproduce it:

+
    +
  • +

    You need a PyPy without cpyext, I did not find a linker that would support +overriding symbols. Right now there are no nightlies like this, so you have +to compile it yourself, like:

    +
    +./translate.py -Ojit targetpypystandalone.py --withoutmod-cpyext
    +
    +

    That would give you a PyPy that's unable to load some libraries like PIL, but +perfectly working otherwise.

    +
  • +
  • +

    Speaking of which, you need a reasonably recent PyPy.

    +
  • +
  • +

    The approach is generally portable, however the implementation has been +tested only on 64bit linux. A few tweaks might be required.

    +
  • +
  • +

    You need to install python2.6, the python2.6 development headers, and have +numpy and matplotlib installed on that python.

    +
  • +
  • +

    You need a checkout of my hacks directory and put embedded on your +PYTHONPATH, your pypy checkout also has to be on the PYTHONPATH.

    +
  • +
+
+

Er wait, what happened?

+

What didn't happen is we did not reimplement matplotlib on top of PyPy. What +did happen is we embed CPython inside of PyPy using ctypes. We instantiate it +and follow the embedding tutorial for CPython. Since numpy arrays are not +movable, we're able to pass around an integer that represents the memory +address of the array data and reconstruct it in the embedded interpreter. Hence +with relatively little effort we managed to reuse the same array data on both +sides to plot an array. Easy, no?

+

This approach can be extended to support anything that's not too tied with +python objects. SciPy and matplotlib both fall into the same category +but probably the same strategy can be applied to anything, like GTK or QT. +It's just a matter of extending a hack into a working library.

+

To summarize, while we're busy making numpypy better and faster, it seems +that all external libraries on the C side can be done using an embedded Python +interpreter with relatively little effort. To get to that point, I spent +a day and a half to learn how to embed CPython, with very little prior +experience in the CPython APIs. Of course you should still keep as much as +possible in PyPy to make it nice and fast :)

+

Cheers, +fijal

+
+
+
+
+
+ + Kumo wrote on 2011-12-09 04:06: +
+
+

Pretty cool!

+
+
+
+
+ + Eli Bressert wrote on 2011-12-09 20:27: +
+
+

Two thumbs up! This is quite exciting! Looking forward to further followup from this.

How does Scipy look in terms of implementation, e.g. wrapping fortran code with f2py? Could it become achieved?

+
+
+
+
+ + Pankaj wrote on 2011-12-10 06:14: +
+
+

freaking awesome :)

+
+
+
+
+ + Laptop repair wrote on 2011-12-10 13:02: +
+
+

PyPy Version is showing best result, it is giving extra protection to program.

+
+
+
+
+ + dac wrote on 2011-12-13 20:52: +
+
+

Good work. Is this approach your long term plan for supporting scientific python libraries or just a stop-gap solution until "proper" support can be added to pypy (or to the library)?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-12-13 23:06: +
+
+

@dac this can scale to the entire matplotlib/scipy fully. Whether scientific community people will take up a gargantuan task of moving SciPy/matplotlib out of using CPython C API is beyond my knowledge, but even if it'll happen, it won't happen in short-to-mid-term.

So overall I think it's a good midterm solution, that might just stay forever.

+
+
+
+
+ + Anonymous wrote on 2014-01-04 09:23: +
+
+

Another solution containing dragons, that someone might find useful:
1) create new python file, that would print diagrams
2) send data from main program running in pypy to the second python file using call from subprocess
eg. call(["python", "pythondiagrams.py", "-data", str(my_data).replace(" ", ";")]), data should be be text type and contain separator other than space
3) parse input data using argparse and convert them using ast

+
+
+
+
+ + Konstantin Lopuhin wrote on 2014-02-02 12:32: +
+
+

Also, seems that embed must live below the root of pypy source tree (else it fails to create proper paths to ".o" output files in rpython.translator.platform.Platform._make_o_file).

+
+
+
+ +

PyPy 1.7 - widening the sweet spot

+ +
+

We're pleased to announce the 1.7 release of PyPy. As has become a habit, this +release brings a lot of bugfixes and performance improvements over the 1.6 +release. However, unlike the previous releases, the focus has been on widening +the "sweet spot" of PyPy. That is, classes of Python code that PyPy can greatly +speed up should be vastly improved with this release. You can download the 1.7 +release here:

+
+https://pypy.org/download.html +
+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 1.7 and cpython 2.7.1 performance comparison) +due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 32/64 or +Windows 32. Windows 64 work is ongoing, but not yet natively supported.

+

The main topic of this release is widening the range of code which PyPy +can greatly speed up. On average on +our benchmark suite, PyPy 1.7 is around 30% faster than PyPy 1.6 and up +to 20 times faster on some benchmarks.

+
+
+

Highlights

+
    +
  • +

    Numerous performance improvements. There are too many examples of Python +constructs that should now run faster to list them all.

    +
  • +
  • +

    Bugfixes and compatibility fixes with CPython.

    +
  • +
  • +

    Windows fixes.

    +
  • +
  • +

    PyPy now comes with stackless features enabled by default. However, +any loop using stackless features will interrupt the JIT for now, so there is no real +performance improvement for stackless-based programs. Contact pypy-dev for +info on how to help remove this restriction.

    +
  • +
  • +

    NumPy effort in PyPy was renamed numpypy. In order to try using it, simply +write:

    +
    +import numpypy as numpy
    +
    +

    at the beginning of your program. There has been huge progress on numpy in PyPy +since 1.6, the main feature being the implementation of dtypes.

    +
  • +
  • +

    JSON encoder (but not decoder) has been replaced with a new one. This one +is written in pure Python, but is known to outperform CPython's C extension +up to 2 times in some cases. It's about 20 times faster than +the one that we had in 1.6.

    +
  • +
  • +

    The memory footprint of some of our RPython modules has been drastically +improved. This should impact any applications using for example cryptography, +like tornado.

    +
  • +
  • +

    There was some progress in exposing even more CPython C API via cpyext.

    +
  • +
+
+
+

Things that didn't make it, expect in 1.8 soon

+

There is ongoing work which, while it didn't make it into the release, is +probably worth mentioning here. This is what you should probably expect in +1.8 some time soon:

+
    +
  • Specialized list implementation. There is a branch that implements lists of +integers/floats/strings as compactly as array.array. This should drastically +improve performance/memory impact of some applications
  • +
  • NumPy effort is progressing forward, with multi-dimensional arrays coming +soon.
  • +
  • There are two brand new JIT assembler backends, notably for the PowerPC and +ARM processors.
  • +
+
+
+

Fundraising

+

It's maybe worth mentioning that we're running fundraising campaigns for +NumPy effort in PyPy and for Python 3 in PyPy. In case you want to see any +of those happen faster, we urge you to donate to numpy proposal or +py3k proposal. In case you want PyPy to progress, but you trust us with +the general direction, you can always donate to the general pot.

+
+

Cheers,
Maciej Fijałkowski, Armin Rigo and the entire PyPy team

+
+
+
+
+ + Unknown wrote on 2011-11-21 12:29: +
+
+

Could you put a link to some sort of NEWS file, a list of issue tracker tickets, or at least the relevant span of the revision control tool so that I could browse what sorts of changes have gone into trunk since 1.6?

+
+
+
+
+ + Anonymous wrote on 2011-11-21 12:54: +
+
+

"PyPy now comes with stackless features enabled by default"

Could you please tell a bit more about it? Is it just sort of internal optimizations, something under the hood? Or does it mean tail recursion optimization? Or cooperative multitasking with greenlets? What's the API for stackless features?

+
+
+
+
+ + Anonymous wrote on 2011-11-21 14:27: +
+
+

Is it so hard to wait until you have a Windows build before announcing a release?

Or not telling in the release that the Windows binary is available?

+
+
+
+
+ + Benjamin Peterson wrote on 2011-11-21 15:30: +
+
+

@Zooko

hg log -rrelease-1.6:release-1.7

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-11-21 16:38: +
+
+

I am getting a segmentation fault.

+
+
+
+
+ + D wrote on 2011-11-21 18:37: +
+
+

So if I want to run PyPy on my code with numpy I have to replace in each file "import numpy" by "import numpypy", "from numpy import ..." by "from numpypy import ...". And each time I want to switch beween PyPy and CPython, I have to search and replace all those occurrences backward. Well done...

+
+
+
+
+ + Anonymous wrote on 2011-11-21 19:35: +
+
+

Thank you for all your work, it's nice to see how far you have come in so little time! Keep raising the bar.

+
+
+
+
+ + Amaury wrote on 2011-11-21 21:06: +
+
+

@D: Please take it the easy way and add "sys.modules['numpy'] = numpypy" at the start of your program.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-11-21 21:08: +
+
+

@⚛ report a bug to bugs.pypy.org

@D it's gonna stay like this until it's finished. The problem is that most programs won't run out of the box anyway as of now, because of some missing functionality. We'll probably rename it back once it's finished.

+
+
+
+
+ + Armin Rigo wrote on 2011-11-21 21:09: +
+
+

@D: all you need is to create a file "numpy.py" that contains "from numpypy import *". (The real reason we did this temporary renaming is because numpy developers asked us to.)

More likely, though, you are probably going to hit some unimplemented feature anyway, as our numpy(py) is still incomplete.

+
+
+
+
+ + Anonymous wrote on 2011-11-21 22:49: +
+
+

Re: numpypy. The standard in the bad old days with three different and subtly incompatible array libraries was "try: import ...; except: ..."

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-11-22 07:14: +
+
+

@Maciej: I am *not* going to submit a bug report, on purpose. When developing software for the masses, there are always two sets of users. One set comprises the users who report bugs, the other set comprises the users who are experiencing issues but do not report bugs.

The ideal state would be that there are no bugs, but this is only theoretical of course.

As an experiment, I have decided not to tell you any information about the segmentation fault. Nothing. Absolutely nothing.

The question is what measures are you going to take to solve this PyPy issue.

Good luck ...

+
+
+
+
+ + Maciej Fijalkowski wrote on 2011-11-22 08:09: +
+
+

@⚛ we're going to do nothing with that. Most probably you're using a CPython C extension or some illegal ctypes invocation or older version of jinja that did that or something... Besides, there is absolutely no point in trying to fix a bug that noone can potentially provide any information for.

Cheers,
fijal

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-11-22 09:07: +
+
+

@Maciej:

PyPy 1.6 worked OK (but it was slower than CPython).

"we're going to do nothing with that."

OK

"Most probably you're using a CPython C extension or some illegal ctypes invocation or older version of jinja that did that or something..."

I don't think so. GDB says that the EIP register stops at an address which does not seem to belong to the PyPy executable nor to any dynamically loaded library. This leads me to the conclusion that the issue is in the x86 code generated by PyPy.

"Besides, there is absolutely no point in trying to fix a bug that noone can potentially provide any information for."

I am not saying you have to fix it. I am just saying that PyPy 1.7 generates code that segfaults.

Does PyPy employ partial verification when generating x86 code?

+
+
+
+
+ + Jorgen wrote on 2011-11-22 09:28: +
+
+

@Flower

"As an experiment, I have decided not to tell you any information about the segmentation fault. Nothing. Absolutely nothing."

So you want to conduct an experiment into 'How to help out an open source project by withholding crucial information'? And I thought the ideas of my PhD-advisor were bad ...

+
+
+
+
+ + Anonymous wrote on 2011-11-22 09:56: +
+
+

The point he, she, or it is making is that PyPy should contain a theorem prover to verify the code it generates so it is possible to prove mathematically that it never generates bad code—and that anything else is beneath the contempt of a serious computer scientist. If you need information about a segfault in order to debug it, you obviously have not thought it through thoroughly enough.

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-11-22 10:02: +
+
+

@Jorgen and @Maciej:

Well, I previously wrote here that "The question is what measures are you (=the PyPy team) going to take to solve this PyPy issue."

This sentence of mine contained the additional information that: I believe that it is a PyPy issue.

Maciej then wrote: "Most probably you're using a CPython C extension or ... that did that or something". This means he was trying to put the blame on others (C extensions or whatever) rather than admitting that it might be an issue attributable to PyPy and PyPy alone.

Then you (Jorgen) wrote "So you want to conduct an experiment into 'How to help out an open source project by withholding crucial information'?". And that is exactly what I intend to do: to help the PyPy project by withholding crucial information.

It will work.

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2011-11-22 10:14: +
+
+

@Damian:

"... PyPy should contain a theorem prover to verify the code it generates so it is possible to prove mathematically that it never generates bad code"

I believe such a thing is impossible.

+
+
+
+
+ + Anonymous wrote on 2011-11-22 11:36: +
+
+

It's possible if you let the verifier reject legal code. It's probably not realistic though, RPython (or is that the JIT-annotation language?) would have to be designed to be verifiable for whatever property you want to verify.

+
+
+
+
+ + Armin Rigo wrote on 2011-11-22 14:28: +
+
+

@⚛: you're sitting in your own corner of the world thinking that we will try hard to figure out which segfault you could possibly mean, and that it will help the PyPy project :-) I've heard many misconceptions of how Open Source works, but I've never heard this one.

How it really works is: you think you have a genuine segfault and want to report it, in which case you file a bug to https://bugs.pypy.org, and maybe we have to discuss more to figure out why, for example, it appears on your machine and not ours, or which configuration you need to reproduce it; sometimes it can take efforts on both parties to even reproduce the problem.

You are free to not play this game, but then just like Maciej said, you will be fully ignored. Even if it's a real bug, it's likely that over time someone else will report or fix it. I'm not trying to force you to "reveal" it to us; feel free to ignore me. I'm just explaining how I believe Open Source works.

The difference for us is small, because a real bug will be seen and reported by others too. The difference for you is whether you would like to contribute and get our thanks, or don't care about it.

+
+
+
+
+ + Anonymous wrote on 2011-11-22 23:50: +
+
+

The pypy team "could" solve it. But it would be a massive waste of time, and of cource the changes are that they are unable to because of problems in your setup. I most certainly hope no open source team really spend their time on such ghost hunts.

+
+
+
+
+ + Anonymous wrote on 2011-11-23 04:25: +
+
+

https://democreatorreview.blogspot.com/

+
+
+
+
+ + Winston Ewert wrote on 2011-11-23 04:42: +
+
+

Somewhat off the topic of this post, but I'm wondering what the special optimization of string lists would be. I can see obvious benefits to storing ints/floats directly in the list rather then as boxed numbers, but not so much for strings since they have be stored using an indirection anyways.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2011-11-23 09:15: +
+
+

@Winston:

astutely observed (as always). There are two points to string lists:

1) PyPy's strings have one extra indirection, e.g. the data is not stored in the string box. This is due to RPython restrictions. With string lists, one indirection can be removed.

2) If the JIT knows that the full list stores only strings, it can actually generate better code, because it does not need to check the type of the item that was just read out of the list.

+
+
+
+
+ + vacation homes in kissimmee florida wrote on 2011-11-25 09:30: +
+
+

This means he was trying to put the blame on others....

+
+
+
+
+ + wholesale electronics wrote on 2011-12-17 01:20: +
+
+

omething under the hood? Or does it mean tail recursion optimization?

+
+
+
+ +

Gothenburg sprint report

+ +
+

In the past week, we have been busy hacking on PyPy at the Gothenburg sprint, the second of 2011. The sprint was held at Laura's and Jacob's place, and here is a brief report of what happened.


+In the first day we welcomed Mark Pearse, who was new to PyPy and at his first sprint. Mark worked the whole sprint in the new SpecialisedTuple branch, whose aim is to have a special implementation for small 2-items and 3-items tuples of primitive types (e.g., ints or floats) to save memory. Mark paired with Antonio for a couple of days, then he continued alone and did an amazing job. He even learned how to properly do Test Driven Development :-).

+Antonio spent a couple of days investigating whether it is possible to use application checkpoint libraries such as BLCR and DMTCP to save the state of the PyPy interpreter between subsequent runs, thus saving also the JIT-compiled code to reduce the warmup time. The conclusion is that these are interesting technologies, but more work would be needed (either on the PyPy side or on the checkpoint library side) before it can have a practical usage for PyPy users.

+Then, Antonio spent most of the rest of the sprint working on his ffistruct branch, whose aim is to provide a very JIT-friendly way to interact with C structures, and eventually implement ctypes.Structure on top of that. The "cool part" of the branch is already done, and the JIT already can compile set/get of fields into a single fast assembly instruction, about 400 times faster than the corresponding ctypes code. What is still left to do is to add a nicer syntax (which is easy) and to implement all the ctypes peculiarities (which is tedious, at best :-)).

+As usual, Armin did tons of different stuff, including fixing a JIT bug, improving the performance of file.readlines() and working on the STM branch (for Software Transactional Memory), which is now able to run RPython multithreaded programs using software transactions (as long as they don't fill up all the memory, because support for the GC is still missing :-)). Finally, he worked on improving the Windows version of PyPy. While doing so he discovered together with Anto a terrible bug which led to a continuous leak of stack space because the JIT called some functions using the wrong calling convention.

+Håkan, with some help from Armin, worked on the jit-targets branch, whose goal is to heavily refactor the way the traces are internally represented by the JIT, so that in the end we can produce (even :-)) better code than what we do nowadays. More details in this mail.

+Andrew Dalke worked on a way to integrate PyPy with FORTRAN libraries, and in particular the ones which are wrapped by Numpy and Scipy: in doing so, he wrote f2pypy, which is similar to the existing f2py but instead of producing a CPython extension module it produces a pure Python module based on ctypes. More work is needed before it can be considered complete, but f2pypy is already able to produce a wrapper for BLAS which passes most of the tests under CPython, although there's still work left to get it working for PyPy.

+ + + +
Armin and Håkan with Laura's "5x faster" cake
Christian Tismer worked the whole sprint on the branch to make PyPy compatible with Windows 64 bit. This needs a lot of work because a lot of PyPy is written under the assumption that the long type in C has the same bit size as void*, which is not true on Win64. Christian says that in the past Genova-Pegli sprint he completed 90% of the work, and in this sprint he did the other 90% of the work. Obviously, what is left to complete the task is the third 90% :-). More seriously, he estimated a total of 2-4 person-weeks of work to finish it.

+But, all in all, the best part of the sprint has been the cake that Laura baked to celebrate the "5x faster than CPython" achievement. Well, actually our speed page reports "only" 4.7x, but that's because in the meantime we switched from comparing against CPython 2.6 to comparing against CPython 2.7, which is slightly faster. We are confident that we will reach the 5x goal again, and that will be the perfect excuse to eat another cake :-) +
+
+
+
+ + Albien wrote on 2011-11-15 00:40: +
+
+

Freaking amazing guys together!!!

+
+
+
+
+ + Kumo wrote on 2011-11-15 03:28: +
+
+

"5x faster than CPython cake". Sounds delicious.

+
+
+
+
+ + Anonymous wrote on 2011-11-15 10:18: +
+
+

awesome! what do you think? how much room for improvement is there? is 10x possible? :)

+
+
+
+
+ + Luis wrote on 2011-11-15 13:52: +
+
+

Congratulations! I guess that 5x faster (Unladen Swallow's performance goal) means that pypy is now "officially" fast.

As Anonymous asked above, I also wonder how much room for improvement there is from now on.
Have all the low hanging fruits been picked already? Can we expect this pace of improvement to go on for a while? Or you are close to hit the limit?

Well, I know it's hard to predict... I'd just like to know what your heart tells you :-)

Thank you guys for all the hard work!

+
+
+
+
+ + Anonymous wrote on 2011-11-18 15:56: +
+
+

does pygame work with pypy? would be awesome... what about pyopengl?

+
+
+
+
+ + Anonymous wrote on 2011-11-19 00:28: +
+
+

Sorry, but pyopengl require either numpy or Numeric, which unfortunatly ain't supported yet.

+
+
+
+
+ + Anonymous wrote on 2011-12-17 01:06: +
+
+

Five times faster than CPython. Great! How does it compare to C?

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-22.html b/blog/index-22.html new file mode 100644 index 000000000..08c68f632 --- /dev/null +++ b/blog/index-22.html @@ -0,0 +1,2203 @@ + + + + + + +PyPy (old posts, page 22) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Py3k status update #2

+ +
+

This is the second status update about my work on the py3k branch, which I can work on thanks to all of the people who donated to the py3k proposal.

+

Since my previous status update, things have improved a lot: first of all, I fixed the syntax of many more tests, which were failing on the branch because they used constructs which are no longer valid in Python 3, such as u'' strings, the print statement or the old except Exception, e syntax. I have to say that this work is tedious and not very rewarding, but it has to be done anyway, so that the real failures can stand out.

+

Then, I spent most of the rest of the time by killing features which are present in Python 2 and are gone in Python 3.

+

Some of them were easy and mechanical: for example, I removed all the function attributes such as func_code and func_closure, which have been renamed to __code__ and __closure__, and then I had to find and fix all the places which still expected the old ones.

+

Some were trickier: I removed support for the cmp function and the __cmp__ special method, but this also meant that I had to fix a few types which relied on it to be comparable (for example, did you know that the cells contained in __closure__ are comparable?). At the same time, I also removed the old behavior which in Python 2 allows us to compare arbitrary objects with <, > & co.: in Python 3 the only comparisons allowed between incompatible types are == and !=.

+

Speaking of old special methods, __hex__ and __oct__ are gone as well (and I didn't even know about their existence before removing them :-))

+

But the most important breakthrough was the removal of the _file module, containing the implementation of the file type in Python 2, which is now gone since in Python 3 files are handled by the _io module. Killing the module was not straightforward, because some of the importing logic was tightly tied to the internal implementation of files, so it needed some refactoring. Finally, I had to fix the marshal module to correctly detect text files vs. byte files.

+

Besides these things, I fixed tons of smaller issues here and there. As a result, there are many fewer failing tests than a few weeks ago. Obviously the number itself does not mean much, because sometimes fixing a single test takes hours, and some other times by changing one line one fixes tens of tests. But in the end, seeing it drop from 999 to 650 is always nice and rewarding :-).

+

The road for having a pypy3k is still long, but everything is going fine so far. Stay tuned for more updates!

+

cheers, Antonio

+
+
+
+
+
+ + Larry Hastings wrote on 2012-03-02 01:17: +
+
+

You might consider leaving the u prefix in--PEP 414 puts it back, and it just got accepted.

https://www.python.org/dev/peps/pep-0414/

+
+
+
+
+ + Unknown wrote on 2012-03-02 05:57: +
+
+

It's cleaner to flush them out - the forward porting effort is targeting 3.2, so the stdlib should respect that. (i.e. if it runs on PyPy by default, it should run on CPython 3.2 as well).

Otherwise we'll end up with confusing cases of "this runs on 3.2 in PyPy, but CPython reports a syntax error"

+
+
+
+
+ + Unknown wrote on 2012-03-02 05:59: +
+
+

For importing support, you may want to look at the current state of the 3.3 importlib implementation. Brett's on the verge of hooking that up as CPython's native import system - it should be possible for PyPy3k to use it as well.

+
+
+
+
+ + Antonio Cuni wrote on 2012-03-02 08:28: +
+
+

@Larry: yes, I've considered that, but as Nick says we are targeting 3.2, so it's much easier to just kill it in the meantime. Adding it back will be very easy.

@Nick: yes, using importlib is something which I also have considered. However, I'd prefer not to diverge too much from the "default" branch (because we are going to regularly merge default into py3k for a long time). I suppose that as long as the current importing logic works fine, we'll keep it :-)

+
+
+
+
+ + shaurz wrote on 2012-03-03 15:01: +
+
+

The u"" syntax is coming back.

+
+
+
+ +

Py3k status update

+ +
+
Thanks to all the people who donated to the py3k proposal, we managed to collect enough money to start to work on the first step. This is a quick summary of what I did since I began working on this.
+First of all, many thanks to Amaury Forgeot d'Arc, who started the py3k branch months ago, and already implemented lots of features including e.g. switching to "unicode everywhere" and the int/long unification, making my job considerably easier :-)
+I started to work on the branch at the last Leysin sprint together with Romain Guillebert, where we worked on various syntactical changes such as extended tuple unpacking and keyword-only arguments. Working on such features is a good way to learn about a lot of the layers which the PyPy Python interpreter is composed of, because often you have to touch the tokenizer, the parser, the ast builder, the compiler and finally the interpreter.
+Then I worked on improving our test machinery in various ways, e.g. by optimizing the initialization phase of the object space created by tests, which considerably speeds up small test runs, and adding the possibility to automatically run our tests against CPython 3, to ensure that we are not trying to fix a test which is meant to fail :-). I also set up our buildbot to run the py3k tests nightly, so that we can have an up to date overview of what is left to do.
+Finally I started to look at all the tests in the interpreter/ directory, trying to unmangle the mess of failing tests. Lots of tests were failing because of simple syntax errors (e.g., by using the no longer valid except Exception, e syntax or the old print statement), others for slightly more complex reasons like unicode vs bytes or the now gone int/long distinction. Others were failing simply because they relied on new features, such as the new lexical exception handlers.
+To give some numbers, at some point in January we had 1621 failing tests in the branch, while today we are under 1000 (to be exact: 999, and this is why I've waited until today to post the status update :-)).
+Before ending this blog post, I would like to thank once again all the people who donated to PyPy, who let me do this wonderful job. That's all for now, I'll post more updates soon.
+cheers, Antonio
+
+
+
+
+ + Piotr Husiatyński wrote on 2012-02-16 16:12: +
+
+

Will the py3 branch be finally merged into main branch and the pypy will be albe to run both 2.x and 3.x depending on the boot switch or the 2.x support will be dropped or the will be no merge?

+
+
+
+
+ + Antonio Cuni wrote on 2012-02-16 16:23: +
+
+

@Piotr: the work in the py3k branch is destroying some of the python2 semantics, so we won't merge the twos as long as we support python2 (and we'll do it for a long time, because pypy itself is written in python2).
The current plan is just to keep the development in parallel, and regularly merge "default" into "py3k".

+
+
+
+
+ + Alberto Berti wrote on 2012-02-16 18:24: +
+
+

Ciao Antonio,

very good news, i'm glad that my little monetary contribution allowed you to be paid to work on that. Keep us posted!

Cheers,

Alberto

+
+
+
+
+ + Seb wrote on 2012-02-16 23:07: +
+
+

So, it has to be asked: any plan of rewriting PyPy in python 3? :D

+
+
+
+
+ + Antonio Cuni wrote on 2012-02-16 23:30: +
+
+

@Alberto: thank you very much :-)

@Seb: not in the short/middle term (and it's unclear whether we want to go there at all)

+
+
+
+
+ + Echo wrote on 2012-02-17 10:21: +
+
+

Good luck preventing the python2 and python3 branches from convergng; it's what merges do, and the alternative is a lot of cherry-picking. A lot of codebases use feature-flipping instead.

+
+
+
+
+ + Antonio Cuni wrote on 2012-02-17 10:30: +
+
+

@Echo: no, the plan is to regularly merge "default" (i.e. python2) into "py3k". Note that in "default" we are mostly developing other parts than the Python interpreter (e.g., the JIT compiler), so this should not be too much of a problem, although it will be annoying sometimes

+
+
+
+ +

A Larger Example for the Flow Graph Language

+ +
+

Part 4 of Comparing Partial Evaluation to Tracing

+

This is the fourth and final blog post in a series about comparing partial evaluation and +tracing. We've come a long way: In the first post of the series I showed an interpreter for a small flow-graph +language together with a partial evaluator for it. In the second post I showed how a tracer for +the same language works and how it relates to both execution and to partial +evaluation. The third post described an optimizer for traces.

+

In this final post we can compare and contrast the two different approaches of +tracing and partial evaluation by means of an example. The programs in the flow +chart language seen so far have been rather small, so I want to give an example +of a larger program: an interpreter for an extremely simple bytecode +instruction set. I will look at how the partial evaluator deals with that +interpreter, and +what the tracer does with it. The code for +that, as well as all the code of the series can be found here: https://paste.pocoo.org/show/550282/ (some small +additions have been made, such as a nicer way to print traces).

+

A Bytecode Interpreter

+

Writing programs in the flow graph language is painful, but I still want to give +an example that is a bit more interesting than the tiny ones that we've seen so +far. The example is an interpreter for the bytecode of a very trivial +register-based language. The language has four registers, one of which is an +accumulator on which all the actual operations are performed.

+

The opcodes of the language are:

+
    +
  • +jump_if_a, jumps to a target address when the accumulator is non-zero
  • +
  • +mov_a_r0, mov_a_r1, mov_a_r2 move the value of the accumulator to +the respective register
  • +
  • +mov_r0_a, mov_r1_a, mov_r2_a move the value of a register to +the accumulator
  • +
  • +add_r0_to_a, add_r1_to_a, add_r2_to_a add the value of the +register to the accumulator
  • +
  • +decr_a decrement the accumulator
  • +
  • +return_a stop the program and print the accumulator
  • +
+

The interpreter has a main loop that reads the opcode at the current program +counter, does a (lengthy) dispatch to the right bytecode via a series of if +statements and then executes the right opcode. Afterwards the next opcode is +treated equivalently.

+

Here is a part of the source code in the flow graph language. As pseudocode:

+
+bytecode_loop:
+    opcode = bytecode[pc]
+    pc = pc + 1
+    c = opcode == 'jump_if_a'
+    if c goto op_jump_if_a else goto not_jump_if_a
+
+# select the right bytecode via a long series of if statements
+not_jump_if_a:
+    c = opcode == 'mov_a_r0'
+    if c goto op_mov_a_r0 else goto not_mov_a_r0
+not_mov_a_r0:
+    c = opcode == 'mov_a_r1'
+    if c goto op_mov_a_r1 else goto not_mov_a_r1
+...
+
+# bytecode implementations
+op_mov_a_r0:
+    r0 = a
+    goto bytecode_loop
+
+op_jump_if_a:
+    c = a == 0
+    target = bytecode[pc]
+    pc += 1
+    if c goto bytecode_loop else goto op_jump_if_a_jump
+
+op_jump_if_a_jump:
+    pc = target
+    goto bytecode_loop
+...
+
+

And actually working, as Prolog facts (the full implementation can be found at +the link above):

+
% bytecode dispatch loop
+block(bytecode_loop,
+      op2(opcode, readlist, var(bytecode), var(pc),
+      op2(pc, add, var(pc), const(1),
+      op2(c, eq, var(opcode), const(jump_if_a),
+      if(c, op_jump_if_a, not_jump_if_a))))).
+
+% select the right bytecode via a long series of if statements
+block(not_jump_if_a,
+      op2(c, eq, var(opcode), const(mov_a_r0),
+      if(c, op_mov_a_r0, not_mov_a_r0))).
+block(not_mov_a_r0,
+      op2(c, eq, var(opcode), const(mov_a_r1),
+      if(c, op_mov_a_r1, not_mov_a_r1))).
+...
+
+% bytecode implementations
+block(op_jump_if_a,
+      op2(c, eq, var(a), const(0),
+      op2(target, readlist, var(bytecode), var(pc),
+      op2(pc, add, var(pc), const(1),
+      if(c, bytecode_loop, op_jump_if_a_jump))))).
+block(op_jump_if_a_jump,
+      op1(pc, same, var(target),
+      promote(bytecode, bytecode_loop))).
+block(op_mov_a_r0,
+      op1(r0, same, var(a), jump(bytecode_loop))).
+...
+
+

The bytecode_loop block is the main dispatch loop. It reads an opcode out of the +bytecode list at the program counter position, then has a long series of if +statements that compares the current opcode to the various existing opcodes. +The full code of the interpreter can be found under the link above.

+

The bytecodes of the interpreter don't really permit hugely complex +programs, but it can be used to write a program that computes the square of a +number with the following program:

+
+mov_a_r0     # r0 = a
+mov_a_r1     # r1 = a
+# 2:
+mov_r0_a     # r0--
+decr_a
+mov_a_r0
+mov_r2_a     # r2 += a
+add_r1_to_a
+mov_a_r2
+mov_r0_a     # if r0!=0: goto 2
+jump_if_a 2
+mov_r2_a     # return r2
+return_a
+
+

Partially Evaluating the Bytecode Interpreter

+

The partial evaluator from the first blog post can be easily used to partially +evaluate the bytecode interpreter. The static input is the bytecode for +computing the square and the initial program counter value, as given above. The +dynamic input is the content of the accumulator (the number to be squared). +This can be done as follows:

+
?- bytecode_square(B),
+Env = [bytecode/B, pc/0],
+do_pe(bytecode_loop, Env, Label),
+REnv = [a/16, r0/0, r1/0, r2/0],
+interp(jump(Label), REnv), listing(block).
+256
+:- dynamic block/2.
+
+<lots of generated code>
+
+

The code that is generated by the partial evaluation process is somewhat hard to +read. It contains a lot of passages like this:

+
...
+block(op_return_a1, print_and_stop(var(a))).
+block(not_decr_a1, jump(op_return_a1)).
+block(not_add_r2_to_a2, jump(not_decr_a1)).
+block(not_add_r1_to_a2, jump(not_add_r2_to_a2)).
+block(not_add_r0_to_a3, jump(not_add_r1_to_a2)).
+block(not_mov_r2_a3, jump(not_add_r0_to_a3)).
+block(not_mov_r1_a5, jump(not_mov_r2_a3)).
+block(not_mov_r0_a5, jump(not_mov_r1_a5)).
+block(not_mov_a_r27, jump(not_mov_r0_a5)).
+block(not_mov_a_r18, jump(not_mov_a_r27)).
+block(not_mov_a_r09, jump(not_mov_a_r18)).
+block(not_jump_if_a11, jump(not_mov_a_r09)).
+block(bytecode_loop12, jump(not_jump_if_a11)).
+block(op_mov_r2_a2, op1(a, same, var(r2), jump(bytecode_loop12))).
+...
+
+

I.e. lots of blocks that do nothing but jump to another block, interspersed with +some blocks that contain an actual operation. I cleaned the output up manually +and got something like the following (this sort of cleanup is something a good +partial evaluation system would do itself, after partial evaluation has +occurred):

+
block(bytecode_loop1,
+    op1(r0, same, var(a),
+    op1(r1, same, var(a),
+    op1(a, same, var(r0),
+    op2(a, sub, var(a), const(1),
+    op1(r0, same, var(a),
+    op1(a, same, var(r2),
+    op2(a, add, var(a), var(r1),
+    op1(r2, same, var(a),
+    op1(a, same, var(r0),
+    op2(c, eq, var(a), const(0),
+    if(c, bytecode_loop11, op_jump_if_a_jump1)))))))))))).
+
+block(bytecode_loop11,
+    op1(a, same, var(r2),
+    print_and_stop(var(a))).
+
+block(op_jump_if_a_jump1,
+    op1(a, same, var(r0),
+    op2(a, sub, var(a), const(1),
+    op1(r0, same, var(a),
+    op1(a, same, var(r2),
+    op2(a, add, var(a), var(r1),
+    op1(r2, same, var(a),
+    op1(a, same, var(r0),
+    op2(c, eq, var(a), const(0),
+    if(c, bytecode_loop11, op_jump_if_a_jump1)))))))))).
+
+

What do we see here? The partial evaluator has generated a block bytecode_loop1, +which corresponds to the initialization opcodes mov_a_r0 and mov_a_r1 together +with one iteration of the loop. Then it either jumps to a copy of the main loop +(label op_jump_if_a_jump1) or to block bytecode_loop11, which prints the result +and then stops. The residual code does exactly what the bytecode did: It +squares the accumulator then prints that. All the uses of the bytecode and +pc variables are gone.

+

Why did the partial evaluator produce two copies of the main loop that +look the same? The reason for that is that in the second copy, the additional +static information target = 2 is known, where target is a variable in +the interpreter source that stores the jump target, for very brief periods of +time. This additional static information does not have any effect on the +residual code, so the same code is uselessly generated twice. This is an +example of overspecialization.

+

Tracing the Interpreter

+

In this section we will look at what happens if we try to trace the interpreter. +The naive way of doing that yields traces that are not very useful, because they +abort after one iteration. We will look at a way of avoiding this problem. The +problems described in this section are at the core of the paper Tracing the +meta-level: PyPy's tracing JIT compiler (that paper uses a slightly more +advanced version of the bytecode interpreter as an example).

+

To trace the interpreter, it is useful to change the bytecode_loop block from above +to always promote the bytecode and the pc variables, because without +knowing them the trace produced is not really interesting. This is similar to +making these variables static in the partial evaluation example above:

+
block(bytecode_loop,
+      promote(bytecode, bytecode_loop_promote_bytecode)).
+block(bytecode_loop_promote_bytecode,
+      promote(pc, bytecode_loop_promote_pc)).
+block(bytecode_loop_promote_pc,
+      op2(opcode, readlist, var(bytecode), var(pc),
+      op2(pc, add, var(pc), const(1),
+      op2(c, eq, var(opcode), const(jump_if_a),
+      if(c, op_jump_if_a, not_jump_if_a))))).
+...
+
+

The rest of the interpreter stays unchanged.

+

To trace the interpreter we would start naively at the bytecode_loop label, because +that's the label in the interpreter that is jumped to most often (which a +profiler could establish easily). The following command can be used for that +(this output prints traces in a slightly more readable way than in previous blog +posts):

+
?- bytecode_square(B),
+        A = 16, Env = [bytecode/B, pc/2, a/A, r0/A, r1/A, r2/0],
+        do_trace(bytecode_loop, Env).
+trace
+  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
+  guard_value(pc,2,[],bytecode_loop_promote_pc)
+  op2(opcode,readlist,var(bytecode),var(pc))
+  op2(pc,add,var(pc),const(1))
+  op2(c,eq,var(opcode),const(jump_if_a))
+  guard_false(c,[],op_jump_if_a)
+  op2(c,eq,var(opcode),const(mov_a_r0))
+  guard_false(c,[],op_mov_a_r0)
+  op2(c,eq,var(opcode),const(mov_a_r1))
+  guard_false(c,[],op_mov_a_r1)
+  op2(c,eq,var(opcode),const(mov_a_r2))
+  guard_false(c,[],op_mov_a_r2)
+  op2(c,eq,var(opcode),const(mov_r0_a))
+  guard_true(c,[],not_mov_r0_a)
+  op1(a,same,var(r0))
+  loop
+
+opttrace
+  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
+  guard_value(pc,2,[bytecode/[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]],bytecode_loop_promote_pc)
+  op1(a,same,var(r0))
+  op1(bytecode,same,const([mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]))
+  op1(pc,same,const(3))
+  op1(opcode,same,const(mov_r0_a))
+  op1(c,same,const(1))
+  loop
+
+256
+B = [mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a, mov_a_r2, mov_r0_a|...],
+A = 16,
+Env = [bytecode/[mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a|...], pc/2, a/16, r0/16, r1/16, r2/0]
+
+

These traces are very short. They start with promoting the bytecode and the +pc, followed by the execution of the opcode mov_r0_a, which is the +one at position 2 in the given bytecode. Then they increment the pc and +loop back to the beginning. Looking at the optimized trace, it is clear that the +trace is essentially useless. It will run only for one iteration, because in the +second iteration the pc is 3, thus the guard_value at the beginning +will fail.

+

This problem can be solved by tracing more than just one iteration of the +bytecode dispatch loop, which is called meta-tracing. To get this behaviour, in +this simple example it is enough to start (and thus end) tracing at a different +label, op_jump_if_a_jump. This label is hit when the interpreter executes a +jump_if_a bytecode and the jump is taken. In a loop on the level of the +executed bytecode program there is one such jump. Thus tracing from this label, +a full loop in the bytecode program is traced, containing potentially many +iterations of the bytecode dispatch loop in the control flow graph language.

+

Doing that yields the following:

+
?- bytecode_square(B),
+        A = 16, Env = [bytecode/B, pc/11, a/A, r0/A, r1/A, r2/0, target/2],
+        do_trace(op_jump_if_a_jump, Env).
+trace
+  op1(pc,same,var(target))
+  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop)
+  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
+  guard_value(pc,2,[],bytecode_loop_promote_pc)
+  op2(opcode,readlist,var(bytecode),var(pc))
+  op2(pc,add,var(pc),const(1))
+  op2(c,eq,var(opcode),const(jump_if_a))
+  guard_false(c,[],op_jump_if_a)
+  op2(c,eq,var(opcode),const(mov_a_r0))
+  guard_false(c,[],op_mov_a_r0)
+  op2(c,eq,var(opcode),const(mov_a_r1))
+  guard_false(c,[],op_mov_a_r1)
+  op2(c,eq,var(opcode),const(mov_a_r2))
+  guard_false(c,[],op_mov_a_r2)
+  op2(c,eq,var(opcode),const(mov_r0_a))
+  guard_true(c,[],not_mov_r0_a)
+  op1(a,same,var(r0))
+  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
+  guard_value(pc,3,[],bytecode_loop_promote_pc)
+  op2(opcode,readlist,var(bytecode),var(pc))
+  ...
+  lots of operations omitted
+  ...
+  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
+  guard_value(pc,9,[],bytecode_loop_promote_pc)
+  op2(opcode,readlist,var(bytecode),var(pc))
+  op2(pc,add,var(pc),const(1))
+  op2(c,eq,var(opcode),const(jump_if_a))
+  guard_true(c,[],not_jump_if_a)
+  op2(c,eq,var(a),const(0))
+  op2(target,readlist,var(bytecode),var(pc))
+  op2(pc,add,var(pc),const(1))
+  guard_false(c,[],bytecode_loop)
+  loop
+
+opttrace
+  op1(pc,same,var(target))
+  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop)
+  guard_value(pc,2,[bytecode/[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]],bytecode_loop_promote_pc)
+  op1(a,same,var(r0))
+  op2(a,sub,var(a),const(1))
+  op1(r0,same,var(a))
+  op1(a,same,var(r2))
+  op2(a,add,var(a),var(r1))
+  op1(r2,same,var(a))
+  op1(a,same,var(r0))
+  op2(c,eq,var(a),const(0))
+  guard_false(c,[bytecode/[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],pc/11,opcode/jump_if_a,target/2],bytecode_loop)
+  op1(bytecode,same,const([mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]))
+  op1(pc,same,const(11))
+  op1(opcode,same,const(jump_if_a))
+  op1(target,same,const(2))
+  op1(c,same,const(0))
+  loop
+
+256
+B = [mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a, mov_a_r2, mov_r0_a|...],
+A = 16,
+Env = [bytecode/[mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a|...], pc/11, a/16, r0/16, r1/16, r2/0, target/2] .
+
+

That looks better. The trace corresponds to the interpreter running all the +bytecodes in the loop of the squaring function in the example bytecode above. +The optimized code starts with +two guards (checking that the bytecode is still the one for the squaring +function, checking that the pc is 2) and then only does the operations +that actually do the computation. No bytecode dispatching is performed, thus the +interpretation overhead is fully removed, apart from the two guard_value +operations at the beginning.

+

Many of the assignments in the trace are superfluous, e.g. all the copying back +and forth between registers r0, r1, r2 and accumulator a. This +could be easily solved by an even more intelligent optimization utilizing SSA +form.

+

Conclusion About the Interpreter

+

Both partial evaluation and meta-tracing can be used to transform the example +bytecode computing a square into a form that shows the essential computation +that is going on, without the interpretation overhead. The naive partial evaluator +produces lots of extra blocks that just jump around, which could be solved with +a post-processing step. The tracer by itself produces uselessly short traces, +but with a simple trick of starting the trace at a different point the results +become a lot better.

+

In a real meta-tracing system, the meta-tracer would need a way for the author +of the interpreter +to mark which bytecode corresponds to a backward jump. It would also need better +integration with the interpreter to start tracing automatically, as well as +cache the traces. Additionally, it would have to deal better with guards that fail a +lot, attaching new traces to the failing guards. However, all that is "just" +engineering on top of the ideas presented in this series of blog posts.

+

High-Level Conclusion

+

Some concluding high-level thoughts about the similarities of tracing and +partial evaluation: Tracing and partial evaluation try to tackle a similar +problem, that of automatically reducing the interpreter overhead, their +approaches are slightly different though.

+

Tracing is very close to normal evaluation, only keeping some extra information +in the process. But then, the optimizer that is used in a tracer +is again very similar in structure to a partial evaluator. The task of the +optimizer is much simpler though, because it does not need to deal with control +flow at all, just a linear list of operations.

+

So in a sense tracing is taking those parts of partial evaluation that work (the +"just evaluate those things that you can, and leave the others") and replacing +the parts that don't (controlling unfolding) by a much more pragmatic mechanism. +That mechanism observes actual execution runs of the program to choose control +flow paths that are typical. At the same time, the tracer's focus is on loops, +because they are where most programs spend significant amounts of time.

+

Another point of view of tracing is that it is a form of partial evaluation that +replaces the control components of a partial evaluator with an oracle (the +actual execution runs) that provides the information about which paths to look at.

+

Already in the quite trivial interpreter here the effects of this are visible. +The simple partial evaluator over-specializes the loop and produces two +identical versions of it, that aren't different. The tracer doesn't, and it +also generates only code for the loop itself, not for the initialization +opcodes.

+

That's it for this series. To those that made it, thanks for following along. +Also thanks to Samuele and Sven, who consistently gave me good feedback on the +posts before I put them here.

+
+

PyPy 1.8 - business as usual

+ +
+

We're pleased to announce the 1.8 release of PyPy. As usual, this +release brings a lot of bugfixes, together with performance and memory +improvements over the 1.7 release. The main highlight of the release +is the introduction of list strategies, which make homogeneous lists +more efficient both in terms of performance and memory. This release +also upgrades us from Python 2.7.1 compatibility to 2.7.2. Otherwise +it's "business as usual" in the sense that performance improved +roughly 10% on average since the previous release.

+

you can download the PyPy 1.8 release here:

+
+https://pypy.org/download.html +
+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 1.8 and cpython 2.7.1 performance comparison) +due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 32/64 or +Windows 32. Windows 64 work has been stalled, we would welcome a volunteer +to handle that.

+
+
+

Highlights

+
    +
  • +

    List strategies. Now lists that contain only ints or only floats should +be as efficient as storing them in a binary-packed array. It also improves +the JIT performance in places that use such lists. There are also special +strategies for unicode and string lists.

    +
  • +
  • +

    As usual, numerous performance improvements. There are many examples +of python constructs that now should be faster; too many to list them.

    +
  • +
  • +

    Bugfixes and compatibility fixes with CPython.

    +
  • +
  • +

    Windows fixes.

    +
  • +
  • +

    NumPy effort progress; for the exact list of things that have been done, +consult the numpy status page. A tentative list of things that have +been done:

    +
      +
    • multi dimensional arrays
    • +
    • various sizes of dtypes
    • +
    • a lot of ufuncs
    • +
    • a lot of other minor changes
    • +
    +

    Right now the numpy module is available under both numpy and numpypy +names. However, because it's incomplete, you have to import numpypy first +before doing any imports from numpy.

    +
  • +
  • +

    New JIT hooks that allow you to hook into the JIT process from your python +program. There is a brief overview of what they offer.

    +
  • +
  • +

    Standard library upgrade from 2.7.1 to 2.7.2.

    +
  • +
+
+
+

Ongoing work

+

As usual, there is quite a bit of ongoing work that either didn't make it to +the release or is not ready yet. Highlights include:

+
    +
  • Non-x86 backends for the JIT: ARMv7 (almost ready) and PPC64 (in progress)
  • +
  • Specialized type instances - allocate instances as efficient as C structs, +including type specialization
  • +
  • More numpy work
  • +
  • Since the last release there was a significant breakthrough in PyPy's +fundraising. We now have enough funds to work on the first stages of numpypy +and py3k. We would like to thank again everyone who donated.
  • +
  • It's also probably worth noting, we're considering donations for the +Software Transactional Memory project. You can read more about our plans +
  • +
+

Cheers,
+The PyPy Team

+
+
+
+
+
+ + Anonymous wrote on 2012-02-10 11:08: +
+
+

As usual, excellent work!
The faster Pypy becomes, the less the need to use limited languages just for speed considerations.
List specialization is really cool and seems to boost performance and reduce memory usage considerably. I'd love seeing specializations for tuples of ints/floats/strings as structs.
On a side note, what stops people from using RPython as a compiled language (in terms of speed) with a nicer syntax?

+
+
+
+
+ + Daivd wrote on 2012-02-10 12:57: +
+
+

Well done!

I find nothing on the comparison page about memory (maybe because it's called speed.pypy.org...). How are you stacking up against CPython there, on benchmarks and real word examples? I realize a JIT will always need some memory overhead, but perhaps you have done enough clever things now, like list strategies, to be competitive anyway?

+
+
+
+
+ + halfaleague wrote on 2012-02-10 14:03: +
+
+

I would donate to this.
Would this give us 'true' multithreading? a la clojure?

+
+
+
+
+ + Unknown wrote on 2012-02-10 17:45: +
+
+

Seems like you guys are ahead of the curve with STM: https://arstechnica.com/business/news/2012/02/transactional-memory-going-mainstream-with-intel-haswell.ars

+
+
+
+
+ + kurdakov wrote on 2012-02-12 11:29: +
+
+

Did anybody test if it works with
pypy?

https://github.com/mvantellingen/psycopg2-ctypes

would be great to have out of the box postgresql support for Django

+
+
+
+
+ + Joko Susilo wrote on 2012-02-13 11:02: +
+
+

i will try it first

+
+
+
+
+ + Anonymous wrote on 2012-02-13 14:23: +
+
+

Just donated for py3k. I think it would make sense to allow donations for STM as well.

+
+
+
+
+ + One Wellness Place wrote on 2012-04-20 10:44: +
+
+

I will try it.

+
+
+
+ +

Introductory Article About RPython

+ +
+ Laurence Tratt from King's College London has written a long and detailed introduction to the goals and significance of RPython over on his blog. Laurie has been implementing his Converge Language in RPython in the last months. He is one of the first people external to the PyPy team who have pushed a sizeable RPython-based VM quite far, adding and tuning JIT hints. The post describes some of that work and his impressions of RPython and PyPy.

+"RPython, to my mind, is an astonishing project. It has, almost single-handedly, opened up an entirely new approach to VM implementation. As my experience shows, creating a decent RPython VM is not a huge amount of work (despite some frustrations). In short: never again do new languages need come with unusably slow VMs. That the the PyPy / RPython team have shown that these ideas scale up to a fast implementation of a large, real-world language (Python) is another feather in their cap." 
+
+
+
+
+ + Luis wrote on 2012-02-10 01:38: +
+
+

My English is not very good, but I suspect "Introductionary" is not a word. I would use "introductory" instead.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2012-02-10 10:07: +
+
+

It probably wasn't before, but now it is! (and a pretty nice word, no?)

Fixed.

+
+
+
+
+ + Luis wrote on 2012-02-10 23:56: +
+
+

"It probably wasn't before, but now it is! (and a pretty nice word, no?)"

Well, it surely sounds more sophisticated :-)

+
+
+
+ +

Optimizing Traces of the Flow Graph Language

+ +
+

Part 3 of Comparing Partial Evaluation to Tracing

+

This is the third blog post in a series about comparing partial evaluation and +tracing. In the first post of the series I introduced a small flow-graph +language together with an interpreter for it. Then I showed a partial evaluator +for the language. In the second post of the series I showed how a tracer for +the same language works and how it relates to both execution and to partial +evaluation. Then I added support for promotion to that tracer.

+

In this post I will show how to optimize the traces that are produced by the +tracer and compare the structure of the optimizer to that of partial +evaluation.

+

The code from this post can be found here: https://paste.pocoo.org/show/547304/

+

Optimizing Traces

+

In the last post we saw how to produce a linear trace with guards by +interpreting a control flow graph program in a special mode. A trace always ends with +a loop statement, which jumps to the beginning. The tracer is just logging +the operations that are done while interpreting, so the trace can contain +superfluous operations. On the other hand, the trace also contains some of the +runtime values through promotions and some decisions made on them which can be +exploited by optimization. An example for this is the trace produced by the +promotion example from the last post:

+
op2(c,ge,var(i),const(0),
+guard_true(c,[],l_done,
+guard_value(x,5,[],b2,
+op2(x2,mul,var(x),const(2),
+op2(x3,add,var(x2),const(1),
+op2(i,sub,var(i),var(x3),
+loop))))))
+
+

After the guard_value(x, 5, ...) operation, x is known to be 5: If +it isn't 5, execution falls back to the interpreter. Therefore, operations +on x after the guard can be constant-folded. To do that sort of +constant-folding, +an extra optimization step is needed. That optimization step walks along the +trace, remembers which variables are constants and what their values are using a +partial environment. The optimizer removes operations that have only constant +arguments and leaves the others in the trace. This process is actually +remarkably similar to partial evaluation: Some variables are known to be +constants, operations on only constant arguments are optimized away, the rest +remains.

+

The code for optimizing operations looks as follows:

+
optimize(op1(ResultVar, Op, Arg, Rest), PEnv, NewOp) :-
+    presolve(Arg, PEnv, RArg),
+    (RArg = const(C) ->
+        do_op(Op, C, Res),
+        write_env(PEnv, ResultVar, Res, NEnv),
+        NewOp = RestResidual
+    ;
+        remove_env(PEnv, ResultVar, NEnv),
+        NewOp = op1(ResultVar, Op, RArg, RestResidual)
+    ),
+    optimize(Rest, NEnv, RestResidual).
+
+optimize(op2(ResultVar, Op, Arg1, Arg2, Rest), PEnv, NewOp) :-
+    presolve(Arg1, PEnv, RArg1),
+    presolve(Arg2, PEnv, RArg2),
+    (RArg1 = const(C1), RArg2 = const(C2) ->
+        do_op(Op, C1, C2, Res),
+        write_env(PEnv, ResultVar, Res, NEnv),
+        NewOp = RestResidual
+    ;
+        remove_env(PEnv, ResultVar, NEnv),
+        NewOp = op2(ResultVar, Op, RArg1, RArg2, RestResidual)
+    ),
+    optimize(Rest, NEnv, RestResidual).
+
+

Just like partial evaluation! It even reuses the helper function presolve +from the partial evaluator and a partial environment PEnv. When the +arguments of the operation are known constants in the partial environment, the +operation can be executed at optimization time and removed from the trace. +Otherwise, the operation has to stay in the output trace. In that case the result variable +(as in the partial evaluator) needs to be removed from the partial environment, +because it was just overwritten by an unknown result.

+

Now we need to deal with guards in the trace.

+
optimize(guard_true(V, [], L, Rest), PEnv, NewOp) :-
+    plookup(V, PEnv, Val),
+    (Val = const(C) ->
+        NewOp = RestResidual
+    ;
+        NewOp = guard_true(V, PEnv, L, RestResidual)
+    ),
+    optimize(Rest, PEnv, RestResidual).
+
+optimize(guard_false(V, [], L, Rest), PEnv, NewOp) :-
+    plookup(V, PEnv, Val),
+    (Val = const(C) ->
+        NewOp = RestResidual,
+        NEnv = PEnv
+    ;
+        write_env(PEnv, V, 0, NEnv),
+        NewOp = guard_false(V, PEnv, L, RestResidual)
+    ),
+    optimize(Rest, NEnv, RestResidual).
+
+

When the variable that is being guarded is actually known to be a constant, we +can remove the guard. Note that it is not possible that the guard of that +constant fails: The tracer recorded the operation while running with real +values, therefore the guards have to succeed for values the optimizer +discovers to be constant.

+

guard_false is slightly different from guard_true: after the former we +know that the argument is actually 0. After guard_true we only know that +it is not equal to zero, but not which precise value it has.

+

Another point to note in the optimization of guards is that the second argument +of the guard operation, which was so far always just an empty list, is now +replaced by the partial environment PEnv. I will discuss further down why +this is needed.

+

Optimizing guard_value is very similar, except that it really gives precise +information about the variable involved:

+
optimize(guard_value(V, C, [], L, Rest), PEnv, NewOp) :-
+    plookup(V, PEnv, Val),
+    (Val = const(C1) ->
+        NewOp = RestResidual,
+        NEnv = PEnv
+    ;
+        write_env(PEnv, V, C, NEnv),
+        NewOp = guard_value(V, C, PEnv, L, RestResidual)
+    ),
+    optimize(Rest, NEnv, RestResidual).
+
+

This operation is the main way the optimizer gains constant variables that +it then exploits to do constant-folding on later operations. This is a chief +difference from partial evaluation: There the optimizer knows the value of some +variables from the start. When optimizing traces, at the beginning the value of +no variable is known. Knowledge about some variables is only later gained +through guards.

+

Now we are missing what happens with the loop statement. In principle, it is +turned into a loop statement again. However, at the loop statement a few +additional operations need to be emitted. The reason is that we optimized away +operations and thus assignments when the result value of the variable was a +constant. That means the involved variable still potentially has some older +value. The next iteration of the loop would continue with this older value, +which is obviously wrong. Therefore we need to emit some assignments before the +loop statement, one per entry in the partial environment:

+
optimize(loop, PEnv, T) :-
+    generate_assignments(PEnv, T).
+
+generate_assignments([], loop).
+generate_assignments([Var/Val | Tail], op1(Var, same, const(Val), T)) :-
+    generate_assignments(Tail, T).
+
+

As an example of how generate_assignments works, let's look at +the following example. When the partial environment is [x/5, y/10], the +following assignments are generated:

+
?- generate_assignments([x/5, y/10], Out).
+Out = op1(x, same, const(5), op1(y, same, const(10), loop)).
+
+

That's all the code of the optimizer. While the basic structure is quite similar to partial evaluation, +it's a lot less complex as well. What made the partial evaluator hard was that +it needs to deal with control flow statements and with making sure that code is +reused if the same block is partially evaluated with the same constants. Here, +all these complexities go away. The tracer has already removed all control flow +and replaced it with guards and one loop operation at the end. Thus, the +optimizer can simply do one pass over the operations, removing some (with some +extra care around the loop statement).

+

With this machinery in place, we can optimize the trace from the promotion +example of the last post:

+
?- optimize(
+    guard_value(x,3,[],b2,
+    op2(x2,mul,var(x),const(2),
+    op2(x3,add,var(x2),const(1),
+    op2(i,sub,var(i),var(x3),
+    op2(c,ge,var(i),const(0),
+    guard_true(c,[],l_done, loop)))))),
+    [],
+    LoopOut).
+LoopOut = guard_value(x, 3, [], b2, op2(i, sub, var(i), const(7), op2(c, ge, var(i), const(0), guard_true(c, [x/3, x2/6, x3/7], l_done, op1(x, same, const(3), op1(x2, same, const(6), op1(x3, same, const(7), loop)))))))
+
+

More readably, the optimized version is:

+
guard_value(x, 3, [], b2,
+op2(i, sub, var(i), const(7),
+op2(c, ge, var(i), const(0),
+guard_true(c, [x/3, x2/6, x3/7], l_done,
+op1(x, same, const(3),
+op1(x2, same, const(6),
+op1(x3, same, const(7),
+loop)))))))
+
+

As intended, the operations on x after the guard_value have all been +removed. However, some additional assignments (to x, x2, x3) at the end have been generated as +well. The assignments look superfluous, but the optimizer does not have +enough information to easily recognize this. That can be fixed, but only at the +cost of additional complexity. (A real system would transform the trace into +static single assignment form to answer such questions.)

+

Resuming to the Interpreter

+

Why does the code above need to add the partial environment to +the guards that cannot be optimized away? The reason is related to why we needed +to generate assignments before the loop statement. The problem is that the optimizer +removes assignments to variables when it knows the values of these variables. +That means that when switching back from running the optimized trace to the +interpreter, a number of variables are not updated in the environment, making +the execution in the interpreter incorrect.

+

In the example above, this applies to the variables x2 and x3. When the +second guard fails, they have not been assigned in the optimized case. +Therefore, the guard lists them and their (always constant) values.

+

When switching back these assignments need to be made. Thus we need to adapt the +resume_interp function from the last blog post as follows:

+
write_resumevars([], Env, Env).
+write_resumevars([Key / Value | Rest], Env, NEnv) :-
+    write_env(Env, Key, Value, Env1),
+    write_resumevars(Rest, Env1, NEnv).
+
+resume_interp(Env, ResumeVars, L) :-
+    write_resumevars(ResumeVars, Env, NEnv),
+    block(L, Block),
+    interp(Block, NEnv).
+
+

On resuming, the ResumeVars (a former partial environment) are simply added +back to the normal environment before going back to the interpreter.

+

The data attached to guards about what needs to be done to resume to the +interpreter when the guard fails is often a very complex part of a tracing +system. The data can become big, yet most guards never fail. Therefore, most +real systems try hard to compress the attached data or try to share it between +subsequent guards.

+

Summary

+

In this post we have shown how to optimize traces by applying a variant of the +partial evaluation principle: Perform all the operations that have only constant +arguments, leave the others alone. However, optimizing traces is much simpler, +because no control flow is involved. All the questions about control flow have +already been solved by the tracing component.

+

In the next and final post of the series I will show a larger example of how +tracing and partial evaluation can be used to optimize a small bytecode +interpreter.

+
+

Almost There - PyPy's ARM Backend

+ +
+
+In this post I want to give an update on the status of the ARM backend for PyPy's JIT and describe some of the issues and details of the backend.
+
+

+ + + + +Current Status

+It has been more than a year that I have been working on the ARM backend. Now it is in a shape where we can measure meaningful numbers and also ask for some feedback. Since the last post about the backend we have added support for floating point operations as well as for PyPy's framework GCs. Another area of work was to keep up with the constant improvements done in the main development branch, such as out-of-line guards, labels, etc. It has been possible for about a year to cross-translate the PyPy Python interpreter and other interpreters such as Pyrolog, with a JIT, to run benchmarks on ARM. Up until now there remained some hard to track bugs that would cause the interpreter to crash with a segmentation fault in certain cases when running with the JIT on ARM. Lately it was possible to run all benchmarks without problems, but when running the translation toolchain itself it would crash. During the last PyPy sprint in Leysin Armin and I managed to fix several of these hard to track bugs in the ARM backend with the result that it is now possible to run the PyPy translator on ARM itself (at least until it runs out of memory), which is a kind of litmus test for the backend itself and used to crash before. Just to point it out, we are not able to complete a PyPy translation on ARM, because on the hardware we have currently available there is not enough memory. But up to the point we run out of memory the JIT does not hit any issues.

+
+
+

+ + + + +Implementation Details

+The hardware requirements to run the JIT on ARM follow those for Ubuntu on ARM which targets ARMv7 with a VFP unit running in little endian mode. The JIT can be translated without floating point support, but there might be a few places that need to be fixed to fully work in this setting. We are targeting the ARM instruction set, because at least at the time we decided to use it, it seemed to be the best choice in terms of speed while having some size overhead compared to the Thumb2 instruction set. It appears that the Thumb2 instruction set should give comparable speed with better code density but has a few restrictions on the number of registers available and the use of conditional execution. Also the implementation is a bit easier using a fixed width instruction set and we can use the full set of registers in the generated code when using the ARM instruction set.

+
+
+

+ + + + +The calling convention on ARM

+The calling convention on ARM uses 4 of the general purpose registers to pass arguments to functions, further arguments are passed on the stack. The presence of a floating point unit is not required for ARM cores, for this reason there are different ways of handling floats with relation to the calling convention. There is a so called soft-float calling convention that is independent of the presence of a floating point unit. For this calling convention floating point arguments to functions are stored in the general purpose registers and on the stack. Passing floats around this way works with software and hardware floating point implementations. But in presence of a floating point unit it produces some overhead, because floating point numbers need to be moved from the floating point unit to the core registers to do a call and moved back to the floating point registers by the callee. The alternative calling convention is the so-called hard-float calling convention which requires the presence of a floating point unit but has the advantage of getting rid of the overhead of moving floating point values around when performing a call. Although it would be better in the long term to support the hard-float calling convention, we need to be able to interoperate with external code compiled for the operating system we are running on. For this reason at the moment we only support the soft-float to interoperate with external code. We implemented and tested the backend on a BeagleBoard-xM with a Cortex-A8 processor running Ubuntu 11.04 for ARM.

+
+
+

+ + + + +Translating for ARM

+The toolchain used to translate PyPy currently is based on Scratchbox2. Scratchbox2 is a cross-compiling environment. Development had stopped for a while, but it seems to have revived again. We run a 32-bit Python interpreter on the host system and perform all calls to the compiler using a Scratchbox2 based environment. A description of how to set up the cross translation toolchain can be found here.

+
+
+

+ + + + +Results

+The current results on ARM, as shown in the graph below, show that the JIT currently gives a speedup of about 3.5 times compared to CPython on ARM. The benchmarks were run on the aforementioned BeagleBoard-xM with a 1GHz ARM Cortex-A8 processor and 512MB of memory. The operating system on the board is Ubuntu 11.04 for ARM. We measured the PyPy interpreter with the JIT enabled and disabled comparing each to CPython Python 2.7.1+ (r271:86832) for ARM. The graph shows the speedup or slowdown of both PyPy versions for the different benchmarks from our benchmark suite normalized to the runtime of CPython. The data used for the graph can be seen below.
+ +
+
+The speedup is less than the speedup of 5.2 times we currently get on x86 on our own benchmark suite (see https://speed.pypy.org for details). There are several possible reasons for this. Comparing the results for the interpreter without the JIT on ARM and x86 suggests that the interpreter generated by PyPy, without the JIT, has a worse performance when compared to CPython than it does on x86. Also it is quite possible that the code we are generating with the JIT is not yet optimal. Also there are some architectural constraints that produce some overhead. One of these differences is the handling of constants: most ARM instructions only support 8 bit (that can be shifted) immediate values, larger constants need to be loaded into a register, something that is not necessary on x86.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
BenchmarkPyPy JITPyPy no JIT
ai0.4844397800473.72756749625
chaos0.08072916919342.2908692212
crypto_pyaes0.07111148322453.30112318509
django0.09777432455192.56779947601
fannkuch0.2104237356982.49163632938
float0.1542753346752.12053281495
go0.3304830342025.84628320479
html5lib0.6292643898623.60333138526
meteor-contest0.9847474269122.93838610037
nbody_modified0.2369695930821.40027234936
pyflate-fast0.3674471918072.72472422146
raytrace-simple0.02905274614371.97270054339
richards0.0345755735533.29767342015
slowspitfire0.7866425519083.7397367403
spambayes0.6603243794563.29059863111
spectral-norm0.0636107837314.01788986233
spitfire0.436171311652.72050579076
spitfire_cstringio0.2555387021341.7418593111
telco0.1029189304133.86388866047
twisted_iteration0.1227239868054.33632475491
twisted_names2.423677971352.99878698076
twisted_pb1.309918374314.48877805486
twisted_tcp0.9270333540552.8161624665
waf1.020598119321.03793427321
+
+

+

+ + + + +The next steps and call for help

+Although there probably still are some remaining issues which have not surfaced yet, the JIT backend for ARM is working. Before we can merge the backend into the main development line there are some things that we would like to do first, in particular we are looking for a way to run all the PyPy tests to verify that things work on ARM before we can merge. Additionally there are some other long-term ideas. To do this we are looking for people willing to help, either by contributing to implement the open features or that can help us with hardware to test.

+The incomplete list of open topics:
    +
  • We are looking for a better way to translate PyPy for ARM than the one described above. I am not sure if there currently is hardware with enough memory to directly translate PyPy on an ARM based system, this would require between 1.5 and 2 GB of memory. A fully QEMU based approach could also work, instead of Scratchbox2 that uses QEMU under the hood.
  • +
  • Test the JIT on different hardware.
  • +
  • Experiment with the JIT settings to find the optimal thresholds for ARM.
  • +
  • Continuous integration: We are looking for a way to run the PyPy test suite to make sure everything works as expected on ARM, here QEMU also might provide an alternative.
  • +
  • A long term plan would be to port the backend to ARMv5 ISA and improve the support for systems without a floating point unit. This would require to implement the ISA and create different code paths and improve the instruction selection depending on the target architecture.
  • +
  • Review of the generated machine code the JIT generates on ARM to see if the instruction selection makes sense for ARM.
  • +
  • Build a version that runs on Android.
  • +
  • Improve the tools, i.e. integrate with jitviewer.
  • +
+So if you are interested or willing to help in any way contact us.
+
+
+
+
+ + Michael Hudson-Doyle wrote on 2012-02-02 00:20: +
+
+

Awesome news. We might be able to donate some time in the Linaro validation lab to running tests, I'll see what we can do...

+
+
+
+
+ + Anonymous wrote on 2012-02-02 21:55: +
+
+

"Just to point it out, we are not able to complete a PyPy translation on ARM, because on the hardware we have currently available there is not enough memory."

Can't you just add more swap?

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2012-02-03 09:08: +
+
+

You wrote: "The speedup is less than the speedup of 5.2 times you currently get on x86."

The removed comment was meant to point out that the author of the blog post does not (and cannot) know the actual speedups (and slowdowns) people are getting on their machines.

The speedup of 5.2 you mentioned is contradicting my own experience.

I suggest you rewrite your sentence into "The speedup is less than the speedup of 5.2 times we currently get on x86."

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-02-03 09:15: +
+
+

"The speedup of 5.2 you mentioned is contradicting my own experience."

Did you run the very same benchmark suite or some arbitrary programs? If arbitrary programs then sorry, but it's seriously impossible for us to optimize stuff we don't know about. Please submit bug tracker issues for that.

I agree the speedup should be qualified "on our own benchmark suite", but if you don't contribute benchmarks, you can't complain.

+
+
+
+
+ + David Schneider wrote on 2012-02-03 11:30: +
+
+

@⚛ to avoid misunderstandings I updated the sentence in question to make it clear that I was comparing the performance of the ARM backend running our own benchmark suite to the results of the benchmarks as shown on speed.pypy.org.

+
+
+
+
+ + Naos wrote on 2012-02-03 23:19: +
+
+

Every time I visit comments to blog posts on this page I see some hater or two or even more who, don't know why, have wierd problems without a reason. People chill out, you get this brilliant piece of software and you do not have to pay for it.

+
+
+
+
+ + Jan Ziak (atomsymbol) wrote on 2012-02-04 08:36: +
+
+

@Naos: I do *not* hate PyPy. I like it and want to make it better. To do that, I am using a method different from your method. I would like the PyPy team to figure out how to run my benchmark faster without me disclosing the name of the benchmark. I believe that in the end this method will lead to a couple of universal optimizations in PyPy.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-02-04 09:38: +
+
+

@⚛ Contrary to what you might believe, this ends up with you being annoying and nothing else. If you think we don't think all the time about general improvements, you're wrong, but pointless, unscientific complaining is not welcomed here.

+
+
+
+
+ + Anonymous wrote on 2012-02-06 10:55: +
+
+

will it work on the raspberry pi or does it use a different arm architecture?

(i am confused by arms. :) apparently it isn't like in the x86 world where everything is compatible with each other.)

+
+
+
+
+ + Anonymous wrote on 2012-02-06 11:46: +
+
+

@Anonymous

> The hardware requirements to run the JIT on ARM follow those for Ubuntu on ARM which targets ARMv7 with a VFP unit running in little endian mode.

This is higher than Raspberry Pi's ARMv6.

+
+
+
+
+ + Anonymous wrote on 2012-02-06 22:14: +
+
+ A long term plan would be to port the backend to ARMv5 ISA and improve the support for systems without a floating point unit. This would require to implement the ISA and create different code paths and improve the instruction selection depending on the target architecture.

so someday it will support the v6 too? or isn't v5 and v6 compatible either? +
+
+
+
+ + David Schneider wrote on 2012-02-09 14:18: +
+
+

@Anonymous ARMv6 should be backwards compatible to the ARMv5 ISA. So it should be possible to use the JIT on ARMv6 once it works for ARMv5.

+
+
+
+
+ + Anonymous wrote on 2012-02-10 08:10: +
+
+

Really nice with raspberry Pi being released and ARM proto boards gaining traction. Looking forward to develop w/ this on rPi

+
+
+
+
+ + d.q. wrote on 2012-03-14 16:42: +
+
+

I have an arm system with enough swap, microsd or cifs or both... though a setup like this will probably take until next release to translate pypy, won't it?
Can also do usermode qemu of course.

+
+
+
+
+ + David Schneider wrote on 2012-04-18 10:05: +
+
+

@d.q. It would at least take until the next release, if not several...
A qemu based solution would be interesting. If you are interested I would propose you join #pypy on IRC to discuss

+
+
+
+
+ + Anonymous wrote on 2013-03-01 01:28: +
+
+

I'm pretty sure the guys over at Boundary Devices are making a SabreLite variant with 2GB ram.

https://boundarydevices.com/imx6-options-single-dual-core-2gb-ddr/

You may need to email them directly to purchase it still, but they're responsive and a pleasure to work with.

+
+
+
+
+ + Eric van Riet Paap wrote on 2013-03-19 14:15: +
+
+

Anyone try to create a qemu based solution? Even if only a build system. I would be interested in having some documentation about how to build pypy for Raspberry Pi. I know it's a non-compatible ARM version but we have to start somewhere. Plus I have a very RPi's laying around to play with...

+
+
+
+
+ + Eric van Riet Paap wrote on 2013-03-19 14:17: +
+
+

Anyone tried to build with a qemu setup? I have several Raspberry Pi's around that I could play with. I know the JIT will not work because of the difference in ARM versions but it's a start.

+
+
+
+ +

A Simple Tracer for the Flow Graph Language

+ +
+

Part 2 of Comparing Partial Evaluation to Tracing

+

This is the second blog post in a series about comparing partial evaluation and +tracing. In the first post of the series I introduced a small flow-graph +language together with an interpreter for it. Then I showed a partial evaluator +for the language. In this post I will show how a tracer for the same language +works and how it relates to both execution and to partial evaluation. +The code from this post can be found here: https://paste.pocoo.org/show/543542/

+

Tracing Execution

+

The idea of a tracer (for the described language and also in general) is to do completely normal +interpretation but at the same time keep a log of all the normal operations +(i.e. non-control-flow operations) that were performed. This continues until the +tracer executes the code block where it started at, in which case the trace +corresponds to a closed loop. Then tracing stops and the last operation is +replaced by a jump to the start. After tracing has ended, the trace can be +executed, optionally optimizing it before that.

+

To write a tracer, we start from the rules of the interpreter, rename the +predicate to trace and add some extra arguments. Thus, the following rules +in the interpreter:

+
interp(op1(ResultVar, Op, Arg, Rest), Env) :-
+    resolve(Arg, Env, RArg),
+    do_op(Op, RArg, Res),
+    write_env(Env, ResultVar, Res, NEnv),
+    interp(Rest, NEnv).
+
+interp(op2(ResultVar, Op, Arg1, Arg2, Rest), Env) :-
+    resolve(Arg1, Env, RArg1),
+    resolve(Arg2, Env, RArg2),
+    do_op(Op, RArg1, RArg2, Res),
+    write_env(Env, ResultVar, Res, NEnv),
+    interp(Rest, NEnv).
+
+

become the following rules in the tracer:

+
trace(op1(ResultVar, Op, Arg, Rest), Env, op1(ResultVar, Op, Arg, T), TraceAnchor) :-
+    resolve(Arg, Env, RArg),
+    do_op(Op, RArg, Res),
+    write_env(Env, ResultVar, Res, NEnv),
+    trace(Rest, NEnv, T, TraceAnchor).
+
+trace(op2(ResultVar, Op, Arg1, Arg2, Rest), Env, op2(ResultVar, Op, Arg1, Arg2, T), TraceAnchor) :-
+    resolve(Arg1, Env, RArg1),
+    resolve(Arg2, Env, RArg2),
+    do_op(Op, RArg1, RArg2, Res),
+    write_env(Env, ResultVar, Res, NEnv),
+    trace(Rest, NEnv, T, TraceAnchor).
+
+

Note how the bodies of the trace rules correspond exactly to the bodies of +the interp rules, the only difference is the recursive call to trace. +The meaning of the arguments of trace is as follows: The first and second argument are +the operation currently executed and the environment, +like in the interpreter. The argument +after that is an output argument that collects the currently traced operation, +in the example above it is exactly like the operation that was executed. +TraceAnchor is additional information about the trace that is being built +right now, most of the time it is just handed on to the recursive call of +trace. We will see later what it contains.

+

The rule for print_and_stop is very simple, as execution (and therefore also +tracing) simply stops there:

+
trace(print_and_stop(V), Env, print_and_stop(V), _) :-
+    resolve(V, Env, Val),
+    print(Val), nl.
+
+

Left are the rules for the control operations jump and if. A trace +linearizes one execution path, it contains no jumps. However, when a jump to the +starting label is reached, tracing should stop. Therefore, the implementation of +jump contains two cases:

+
trace(jump(L), Env, T, TraceAnchor) :-
+    (TraceAnchor = traceanchor(L, FullTrace) ->
+        T = loop,
+        write(trace), nl, write(FullTrace), nl,
+        do_optimize(FullTrace, OptTrace),
+        write(opttrace), nl, write(OptTrace), nl,
+        runtrace(OptTrace, Env, OptTrace)
+    ;
+        block(L, Block),
+        trace(Block, Env, T, TraceAnchor)
+    ).
+
+

Let's dissect this code in small increments. First, we see what TraceAnchor +is. It is a term of the form +traceanchor(StartLabel, FullTrace). StartLabel is a label in the program +where tracing started (and where it should end as well, when the loop is +closed). The argument FullTrace is an accumulator which contains the full +trace that is being built right now.

+

The condition at the start of the rule checks whether the target label L is +the same as the one stored in the trace anchor. If that is the case, we can stop +tracing. The rest of the trace T is assigned the operation loop, which +jumps back to the beginning of the trace. Afterwards we print and optimize the +trace, then run it, using the FullTrace part of the traceanchor.

+

If the label we jump to is not the StartLabel we simply continue tracing +without recording any operation. This part of the rule is again extremely +similar to the interpretation of jump.

+

For now, we will not use any interesting optimizations, just return the +unoptimized trace unchanged:

+
do_optimize(FullTrace, FullTrace).
+
+

The missing operation now is if. An if statement needs special treatment, +because it is a way where control flow can diverge from the trace. The trace is +linear, therefore it can only record one of the two possible paths. When +executing the trace it is possible for the other path to be taken. Therefore +we need to make sure that the same conditions that were true or false during +tracing are still true or false during the execution of the trace. This is done +with a guard operation, which checks for this condition. The following rule +implements it:

+
trace(if(V, L1, L2), Env, T, TraceAnchor) :-
+    lookup(V, Env, Val),
+    (Val == 0 ->
+        L = L2, T = guard_false(V, [], L1, NT)
+    ;
+        L = L1, T = guard_true(V, [], L2, NT)
+    ),
+    trace(jump(L), Env, NT, TraceAnchor).
+
+

It is very similar to the interp rule of if. The rule inserts a +guard_true into the trace if the condition is true, and a guard_false if +the condition is false. The arguments of the guard are: The variable that is +being guarded, an empty list (the reason for that will be explained in a later +post), the label where execution needs to continue when the guard fails and the +rest of the trace.

+

Let's also add a small helper predicate that can be used to conveniently start +tracing:

+
do_trace(L, Env) :-
+    block(L, StartBlock),
+    trace(StartBlock, Env, ProducedTrace, traceanchor(L, ProducedTrace)).
+
+

The predicate takes a label and an environment and executes the label with the +given environment by first producing a trace, then executing the trace and +eventually jumping back to interpretation, if a guard fails. It does this by +reading the code at label L with the block statement, and then calling +trace with an unbound variable ProducedTrace to hold the trace and a trace +anchor that contains the label where tracing started and the produced trace +variable.

+

With that predicate and the trace so far we can already trace the power +implementation from the last blog post, just not execute the trace (which we +will do in the next section):

+
?- do_trace(power_rec, [res/1, x/10, y/20]).
+trace
+op2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))
+opttrace
+op2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))
+...
+
+

The computed trace is:

+
+op2(res,mul,var(res),var(x),
+op2(y,sub,var(y),const(1),
+guard_true(y,[],power_done,
+loop)))
+
+

which is exactly the content of the loop from power_rec. Note how the if +is turned into a guard_true which jumps to power_done if the guard +fails.

+

A real tracing system would need a way for the tracer to get started, e.g. by +doing profiling in an interpreter and starting the tracer for labels that are +jumped to often. Also, traces for the same label are usually cached in some way. +These details are left out in this simple model.

+

Executing Traces

+

In a real tracing system, the traces would be turned into machine code and +executed by the CPU. In our small model, we will simply write another +interpreter for them. This interpreter is very simple and looks again very +similar to interp.

+
runtrace(op1(ResultVar, Op, Arg, Rest), Env, TraceFromStart) :-
+    resolve(Arg, Env, RArg),
+    do_op(Op, RArg, Res),
+    write_env(Env, ResultVar, Res, NEnv),
+    runtrace(Rest, NEnv, TraceFromStart).
+
+runtrace(op2(ResultVar, Op, Arg1, Arg2, Rest), Env, TraceFromStart) :-
+    resolve(Arg1, Env, RArg1),
+    resolve(Arg2, Env, RArg2),
+    do_op(Op, RArg1, RArg2, Res),
+    write_env(Env, ResultVar, Res, NEnv),
+    runtrace(Rest, NEnv, TraceFromStart).
+
+

These rules are completely equivalent to the interp rules for op1 and +op2. runtrace needs an extra argument, TraceFromStart, which is +always just handed over to the recursive call of runtrace.

+

When the end of the trace is reached and the loop statement is encountered, +we simply start from the beginning:

+
runtrace(loop, Env, TraceFromStart) :-
+    runtrace(TraceFromStart, Env, TraceFromStart).
+
+

The remaining question is what to do when encountering guards. In that case the +guard condition needs to be checked. If the guard succeeds, executing the trace can +continue. Otherwise the trace is aborted and the interpreter resumes execution:

+
runtrace(guard_true(V, ResumeVars, L, Rest), Env, TraceFromStart) :-
+    lookup(V, Env, Val),
+    (Val == 0 ->
+        resume_interp(Env, ResumeVars, L)
+    ;
+        runtrace(Rest, Env, TraceFromStart)
+    ).
+
+runtrace(guard_false(V, ResumeVars, L, Rest), Env, TraceFromStart) :-
+    lookup(V, Env, Val),
+    (Val == 0 ->
+        runtrace(Rest, Env, TraceFromStart)
+    ;
+        resume_interp(Env, ResumeVars, L)
+    ).
+
+
+resume_interp(Env, [], L) :-
+    block(L, Block),
+    interp(Block, Env).
+
+

Note how the execution is handed over to the interpreter at the label that was +encoded as the third argument in the guard operation. +What the ResumeVars are for we will see in a later post. For now we assume +that it is always an empty list.

+

With this interpreter for traces we can now trace and then execute the example:

+
:- do_trace(power_rec, [res/1, x/10, y/20]).
+trace
+op2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))
+opttrace
+op2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))
+100000000000000000000
+
+

Of course this example is not very exciting, because the trace looks more or less exactly +like the original code as well. There will be more exciting examples in a later +post.

+

Extension: Promotion

+

As it is, the tracer does not actually add much to the interpreter. It +linearizes control flow, but nothing deeply advanced happens. In this section I +will add a crucial but simple to implement extension to the control flow language that allows the tracer +to do more interesting things. This extension is called promotion.

+

Promotion is basically a hint that the programmer can add to her control flow +graph program. A promotion is an operation promote(V, L) that takes a +variable V and a label L. When the interpreter runs this statement, it +simply jumps to the label L and ignores the variable:

+
interp(promote(_, L), Env) :-
+    interp(jump(L), Env).
+
+

However, the tracer does something much more interesting. For the tracer, the +promote statement is a hint that it would be very useful to know the value +of V and that the rest of the trace should keep that value as a constant. +Therefore, when the tracer encounters a promotion, it inserts a special kind of +guard called guard_value:

+
trace(promote(V, L), Env, guard_value(V, Val, [], L, T), TraceAnchor) :-
+    lookup(V, Env, Val),
+    trace(jump(L), Env, T, TraceAnchor).
+
+

The guard_value is an interesting operation, because it freezes the current +value FVal of variable V into the trace. When the trace is executed, the +guard checks that the current value of the variable and the frozen value are the +same. If yes, execution continues, if not, the trace is aborted:

+
runtrace(guard_value(V, FVal, ResumeVars, L, Rest), Env, TraceFromStart) :-
+    lookup(V, Env, Val),
+    (Val == FVal ->
+        runtrace(Rest, Env, TraceFromStart)
+    ;
+        resume_interp(Env, ResumeVars, L)
+    ).
+
+

What can this operation be used for? It's a way to communicate to the tracer +that variable V is not changing very often and that it is therefore useful +to freeze the current value into the trace. This can be done even without +knowing the value of V in advance.

+

Let's look at a (slightly contrived) example:

+
+l:
+    c = i >= 0
+    if c goto b else goto l_done
+
+l_done:
+    print_and_stop(var(i))
+
+b:
+    promote(x, b2)
+
+b2:
+    x2 = x * 2
+    x3 = x2 + 1
+    i = i - x3
+    goto l
+
+

Encoded in Prolog syntax:

+
block(l, op2(c, ge, var(i), const(0),
+         if(c, b, l_done))).
+block(l_done, print_and_stop(var(i))).
+
+block(b, promote(x, b2)).
+block(b2, op2(x2, mul, var(x), const(2),
+          op2(x3, add, var(x2), const(1),
+          op2(i, sub, var(i), var(x3),
+          jump(l))))).
+
+

This is a simple loop that counts down in steps of x * 2 + 1, whatever x +might be, until i >= 0 is no longer true. Assuming that x doesn't change +often, it is worth promoting it to be able to constant-fold x * 2 + 1 and +not have to redo it every iteration. This is done with the promotion of x +(of course optimizing this loop with loop invariant code motion would work as +well, because x doesn't actually change during the loop).

+

To trace this, we can run the following query:

+
?- do_trace(b, [i/100, x/5]).
+trace
+guard_value(x,5,[],b2,op2(x2,mul,var(x),const(2),op2(x3,add,var(x2),const(1),op2(i,sub,var(i),var(x3),op2(c,ge,var(i),const(0),guard_true(c,[],l_done,loop))))))
+opttrace
+guard_value(x,5,[],b2,op2(x2,mul,var(x),const(2),op2(x3,add,var(x2),const(1),op2(i,sub,var(i),var(x3),op2(c,ge,var(i),const(0),guard_true(c,[],l_done,loop))))))
+-10
+
+

Writing the trace in a more readable way:

+
guard_value(x,5,[],b2,
+op2(x2,mul,var(x),const(2),
+op2(x3,add,var(x2),const(1),
+op2(i,sub,var(i),var(x3),
+op2(c,ge,var(i),const(0),
+guard_true(c,[],l_done,
+loop))))))
+
+

After the guard_value the operations performed on x could be +constant-folded away, because the guard ensures that x is 5 before +execution continues. To actually do the constant-folding we would need some +optimization component that optimizes traces. This will be done in the next blog +post.

+

In this section I mostly talked about how promotion is realized in the tracer, +not what to use it for and how to use it. Promotion is one of the most important +ingredients that's responsible for the success of PyPy's tracing approach. How +this works is discussed in detail in the paper "Runtime feedback in a +meta-tracing JIT for efficient dynamic languages".

+

Conclusion

+

In this blog post we have seen a very minimalistic tracer and an interpreter for +the produced traces. The tracer is very much like the original interpreter, it +just also keeps track of which operations were executed, in addition to +executing the program. Tracing stops when a loop is closed, then the trace can +be optimized and run. Running a trace continues until a failing guard is hit. At +that point, execution goes back to the normal interpreter (and stays there, in +this very simple implementation).

+

I also presented an extension of tracing that makes it possible to add a hint +called promote to the original program that tells the tracer to feed back a +runtime value into the trace and freeze it there. This extension would be +impossible to do in the partial evaluator from the last post, because partial +evaluation is done strictly before run time, so if a variable isn't already +known, its likely runtime value cannot be found out.

+

In the next post I will show how to optimize traces before executing them and +how the optimizer for traces is related to partial evaluation.

+
+
+
+
+ + larsr wrote on 2012-02-01 13:54: +
+
+

Hi, these posts are great!

A question: shouldn't runtrace resume tracing instead of running the interpreter (in resume_interp)?

And perhaps a clarification: when the blog post calls do_trace all of the necessary code has not been shown yet, so one can't really follow along at the keyboard there just yet.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2012-02-02 15:36: +
+
+

@larsr: thanks!

Yes, in principle you are right that there could be a mechanism that starts to trace from the point where a guard fails. This is an element of tracing JITs that the current code leaves off, it would need to be solved together with the caching of traces.

A lot of things are just sketched in this implementation, e.g. only one trace ever is started, once you end up in the interpreter the tracer never starts again.

+
+
+
+
+ + Anonymous wrote on 2012-03-04 11:46: +
+
+

trace(if(V, L1, L2), Env, T, TraceAnchor) :-
lookup(V, Env, Val),
(Val == 0 ->
L = L2, T = guard_false(V, [], L1, NT)
;
L = L1, T = guard_true(V, [], L2, NT)
),
trace(jump(L), Env, NT, TraceAnchor). This trac is okay, but python IDE is not Adroid supported. ビーグレン

+
+
+
+
+ + quadhier wrote on 2020-08-04 15:52: +
+
+

Hi! Great posts. But the source code url is invalid now, could you please provide the source code again? THANKS!

+
+
+
+ +

NumPyPy status update

+ +
+

Hello.

+

This is just a quick status update on the NumPy in PyPy project that very +recently became my day job. I should give my thanks once again to Getco, +Nate Lawson and other contributors who donated above $40000 towards the goal.

+

Recently we (Alex Gaynor, Matti Picus and me) implemented a few interesting things +that a lot of people use:

+
    +
  • more ufuncs
  • +
  • most ufuncs now accept the axis parameter (except all and any)
  • +
  • fixed string representation of arrays, now it's identical to numpy (uses +pretty much the same code)
  • +
  • +ndarray.flat should be working correctly
  • +
  • +ndarray.flatten, ndarray.ravel, ndarray.take +
  • +
  • indexing arrays by boolean arrays of the same size
  • +
  • and various bugfixes.
  • +
+

We would also like to introduce the nightly report of numpy status. This +is an automated tool that does package introspection. While it gives some +sort of idea how much of numpy is implemented, it is by no means the authority. +Your tests should be the authority. It won't report whether functions +support all kinds of parameters (for example masked arrays and the out parameter +are completely unsupported) or that functions work at all. We also +reserve the right to incorporate jokes in that website, so don't treat it +that seriously overall :-)

+

Thanks, and stay tuned. We hope to post here regular updates on the +progress.

+

Cheers,
+fijal & the PyPy team

+
+
+
+
+ + Anonymous wrote on 2012-01-28 14:54: +
+
+

I use "out" parameter very often in my code (with numpy.take), without this one my code would run much worse (because huge arrays of hundreds MB would copy many times inside a big cycle). How currently the "out" parameter is handled (warning, error, nothing)?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-01-28 15:01: +
+
+

It just errors with more or less acceptable error message. Note that pypy does not create intermediates for most of operations, so if you have a lot of them chained actually using out will be worse than not using it.

+
+
+
+
+ + Anonymous wrote on 2012-01-29 23:31: +
+
+

I'm new to python but not to Cpython/numpy/scipy/matplotlib and I fail to understand what you are doing.

* In a nutshell, what's numpypy? Is it a rewrite of the numpy code to make it compatible with pypy? or are you working on pypy itself to be able to run numpy as it is??

* if numpypy is a rewrite of numpy, that's good but how do you plan to keep numpy and numpypy sync (in terms of functionalities)??

* Using numpy with pypy will be great but what about scipy qnd matplotlib??
Many users need at least these two modules on top of numpy;

I would be very happy with pypy being able to work with unpachted numpy/scipy/matplotlib.

I think your website should summarise these issues on its front page.

+
+
+
+ +

Py3k and Numpy First Stage: Thanks to all who Gave

+ +
+

Last year was quite successful for PyPy fundraising through the Software Freedom Conservancy, and Conservancy and PyPy are very excited to announce that enough was raised to begin the first stages on the Py3k and Numpy grant proposals.

+

As of the end of 2011, 135 different individuals gave to the Py3k campaign, and 114 to the Numpy campaign. We thank each of you who donated to help make this work possible. Meanwhile, if you haven't given to support these projects, we do hope you'll give generously now to help fund their second stages later this year!

+

We're also particularly excited that a few donors gave particularly large donations to support this work; those big donations really filled in the gap to help us get started!

+

Specifically, we're pleased to announce that Google donated $35000 towards implementing Python 3 in PyPy. Google's general support of the Python community is well known, and their specific support of our grant proposal is much appreciated.

+

Meanwhile, Numpy was supported in part by contributions from Nate Lawson, Cantab Capital Partners, and Getco, as well as more than a hundred other contributors.

+

With these donations combined with many others, we're now starting work on both projects. This week, the Conservancy signed contracts with Antonio Cuni and Benjamin Peterson to work towards the Stage 1.1 goals in Py3k proposal (and is negotiating for another contractor as well), and with Maciej Fijałkowski to work towards the Stage 1 goals in the Numpy proposal.

+

In 2012, PyPy will continue regular sprint meetings, at which Py3K and Numpy efforts will certainly have a place. We have some limited funds to fund travels of contributors to those meetings.

+

We're very thankful for all who donated so far to support these efforts, and we hope that now that work has begun, even more donors will come forward to help us finish the job. In the meantime, watch for the commits showing up from these developers and other contributors in the PyPy repositories!

+

Cheers, The PyPy Team

+
+
+
+
+ + Gaëtan de Menten wrote on 2012-01-28 20:35: +
+
+

It seems strange to me that Amaury Forgeot d'Arc wasn't the first one to be contracted for working on Py3k support. From the commit messages, he seems to have done most of the work in the py3k branch so far, or is he the unnamed third contractor?

+
+
+
+
+ + Anonymous wrote on 2012-01-28 23:12: +
+
+

What about a Py2k8, is there any hope? Will at least 2.7 still be supported?

+
+
+
+
+ + Amaury Forgeot d'Arc wrote on 2012-01-28 23:22: +
+
+

@Gaëtan: The reason is simple: I already have a regular day job, 40 hours a week, and I cannot have another remuneration without consent of my employer.

Actually I started the py3k branch before the funding proposal, and even before that I've been trying different ways to do the transition from str to unicode.
Then, my understanding of the JIT and other optimizations is very poor. And there are important changes to do around the representation of unicode for example, or the int/long unification, if we want pypy3k to be as fast as 2.7.

I am quite happy of the current state: some people are paid to do and finish the real job, and volunteers can have fun and help in some parts, working on the most interesting project around Python.

+
+
+
+
+ + Amaury Forgeot d'Arc wrote on 2012-01-28 23:27: +
+
+

@Anonymous: there won't be any Python 2.8 (search for PEP404 for the reasons), but as stated in the py3k grant proposal: https://pypy.org/py3donate.html
"The goal of the PyPy community is to support both Python 2 and Python 3 for the forseeable future"

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-23.html b/blog/index-23.html new file mode 100644 index 000000000..9020d9595 --- /dev/null +++ b/blog/index-23.html @@ -0,0 +1,1512 @@ + + + + + + +PyPy (old posts, page 23) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

STM with threads

+ +
+

Hi all,

+

A quick update. The first version of pypy-stm based on regular
+threads
is ready. Still having no JIT and a 4-or-5-times performance
+hit, it is not particularly fast, but I am happy that it turns out not
+to be much slower than the previous thread-less attempts. It is at
+least fast enough to run faster (in real time) than an equivalent no-STM
+PyPy, if fed with an eight-threaded program on an eight-core machine
+(provided, of course, you don't mind it eating all 8 cores' CPU power
+instead of just one :-).

+

You can download and play around with this binary for Linux 64. It
+was made from the stm-thread branch of the PyPy repository (translate.py --stm -O2 targetpypystandalone.py). (Be sure
+to put it where it can find its stdlib, e.g. by putting it inside the
+directory from the official 1.9 release.)

+

This binary supports the thread module and runs without the GIL.
+So, despite the factor-of-4 slow-down issue, it should be the fourth
+complete Python interpreter in which we can reasonably claim to have
+resolved the problem of the GIL. (The first one was Greg Stein's Python
+1.4, re-explored here; the second one is Jython; the third one is
IronPython.) Unlike the previous three, it is also the first one to
+offer full GIL semantics to the programmer, and additionally
thread.atomic (see below). I should also add that we're likely to
+see in the next year a 5th such interpreter, too, based on Hardware
+Transactional Memory (same approach as with STM, but using e.g.
Intel's HTM).

+

The binary I linked to above supports all built-in modules from PyPy,
+apart from signal, still being worked on (which can be a bit
+annoying because standard library modules like subprocess depend on
+it). The sys.get/setcheckinterval() functions can be used to tweak
+the frequency of the automatic commits. Additionally, it offers
thread.atomic, described in the previous blog post as a way to
+create longer atomic sections (with the observable effect of preventing
+the "GIL" from being released during that time). A complete
transaction.py module based on it is available from the sources.

+

The main missing features are:

+
    +
  • the signal module;
  • +
  • the Garbage Collector, which does not do major collections so far, only
    +minor ones;
  • +
  • and finally, the JIT, which needs some amount of integration to generate
    +the correctly-tweaked assembler.
  • +
+

Have fun!

+

Armin.

+
+
+
+
+ + Anonymous wrote on 2012-06-12 08:11: +
+
+

STM has such much potential. I wonder if it gets the attention of the hacker community it deserves. And if not, why not? I hope this is getting more recognition in the future.

+
+
+
+
+ + Paul Jaros wrote on 2012-06-12 08:12: +
+
+

Ah... didn't mean to post it anonymously.

+
+
+
+
+ + Unknown wrote on 2012-06-13 11:21: +
+
+

Nice!

+
+
+
+
+ + Armin Rigo wrote on 2012-06-13 15:19: +
+
+

@Paul: my guess would be that the majority of people that know STM are still looking at it from the point of view of short or very short transactions, as a replacement of locking. Even gcc 4.7 got an STM extension, but it cannot be used with long-running transactions: the performance is not at all tuned for this case, and you cannot express things you need in real long-running transactions, like interrupting them for I/O.

Moreover the single-core 4x performance hit is usually far more than what people are willing to accept --- not realizing that in many cases it will soon be outdated, as a way of measuring performance: the future is toward many-cores machines.

+
+
+
+
+ + Anonymous wrote on 2012-06-14 16:11: +
+
+

For a casual Python programmer like me, how does STM affect the way I write my programs? I know about suggested benefits of STM on multi-core machines. However, what I'm asking is what is it that I have to do differently to get that benefit ?

Thanks

+
+
+
+
+ + Armin Rigo wrote on 2012-06-15 07:42: +
+
+

@Anonymous: https://foss.heptapod.net/pypy/pypy/-/tree/branch//stm-thread/pypy/doc/stm.rst

+
+
+
+ +

PyPy 1.9 - Yard Wolf

+ +
+

We're pleased to announce the 1.9 release of PyPy. This release brings mostly
+bugfixes, performance improvements, other small improvements and overall
+progress on the numpypy effort.
+It also brings an improved situation on Windows and OS X.

+

You can download the PyPy 1.9 release here:

+
https://pypy.org/download.html
+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for
+CPython 2.7. It's fast (pypy 1.9 and cpython 2.7.2 performance comparison)
+due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64 or
+Windows 32. Windows 64 work is still stalling, we would welcome a volunteer
+to handle that.

+
+
+

Thanks to our donors

+

But first of all, we would like to say thank you to all people who
+donated some money to one of our four calls:

+
+

Thank you all for proving that it is indeed possible for a small team of
+programmers to get funded like that, at least for some
+time. We want to include this thank you in the present release
+announcement even though most of the work is not finished yet. More
+precisely, neither Py3k nor STM are ready to make it in an official release
+yet: people interested in them need to grab and (attempt to) translate
+PyPy from the corresponding branches (respectively py3k and
stm-thread).

+
+
+

Highlights

+
    +
  • This release still implements Python 2.7.2.
  • +
  • Many bugs were corrected for Windows 32 bit. This includes new
    +functionality to test the validity of file descriptors; and
    +correct handling of the calling conventions for ctypes. (Still not
    +much progress on Win64.) A lot of work on this has been done by Matti Picus
    +and Amaury Forgeot d'Arc.
  • +
  • Improvements in cpyext, our emulator for CPython C extension modules.
    +For example PyOpenSSL should now work. We thank various people for help.
  • +
  • Sets now have strategies just like dictionaries. This means for example
    +that a set containing only ints will be more compact (and faster).
  • +
  • A lot of progress on various aspects of numpypy. See the numpy-status
    +page for the automatic report.
  • +
  • It is now possible to create and manipulate C-like structures using the
    +PyPy-only _ffi module. The advantage over using e.g. ctypes is that
    _ffi is very JIT-friendly, and getting/setting of fields is translated
    +to few assembler instructions by the JIT. However, this is mostly intended
    +as a low-level backend to be used by more user-friendly FFI packages, and
    +the API might change in the future. Use it at your own risk.
  • +
  • The non-x86 backends for the JIT are progressing but are still not
    +merged (ARMv7 and PPC64).
  • +
  • JIT hooks for inspecting the created assembler code have been improved.
    +See JIT hooks documentation for details.
  • +
  • +select.kqueue has been added (BSD).
  • +
  • Handling of keyword arguments has been drastically improved in the best-case
    +scenario: proxy functions which simply forward *args and **kwargs
    +to another function now perform much better with the JIT.
  • +
  • List comprehension has been improved.
  • +
+
+
+

JitViewer

+

There will be a corresponding 1.9 release of JitViewer which is guaranteed to work
+with PyPy 1.9. See the JitViewer docs for details.

+

Cheers,
+The PyPy Team

+
+
+
+
+
+ + Dmitrey wrote on 2012-06-08 11:11: +
+
+

I have took a look at the mentioned numpypy table (https://buildbot.pypy.org/numpy-status/latest.html), and it lies in many ways. At first, some methods marked as "done" and undone yet, e.g. consider searchsorted:
>>>> from numpypy import searchsorted
>>>> searchsorted([1,2,3],[2,3])
Traceback (most recent call last):
File "", line 1, in
File "/home/dmitrey/Install/pypy-c-jit-55492-ac392fb76904-linux/lib_pypy/numpypy/core/fromnumeric.py", line 763, in searchsorted
raise NotImplementedError('Waiting on interp level method')
NotImplementedError: Waiting on interp level method

(and AFAIK there are many other similar numpypy funcs that are present in dir(numpypy), but only raise NotImplementedError).

At 2nd, some funcs like all and any, also mentioned there as "done", don't work with "axis" parameter and thus also should be unmarked.

FYI as a temporary replacement for some missing in PyPy yet numpy funcs (atleast_1d, atleast_2d, hstack, vstack, cumsum, isscalar, asscalar, asfarray, flatnonzero, tile, zeros_like, ones_like, empty_like, where, searchsorted;
with "axis" parameter: nan(arg)min, nan(arg)max, all, any )

I have implemented them in AppLevel (thus PyPy developers refuce to commit them, but some users could be interested right now), see https://openopt.org/PyPy for more details and my sincere opinion on the situation.

Best wishes for PyPy developers and users, D.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-06-08 12:52: +
+
+

Hi Dmitrey, nice to hear from you.

The page is automatically generated - we should probably just disable those functions, I can't remember the exact reason why they're there in the first place.

When it comes to missing arguments - you just can't help it. It's an automatically generated page that should give only an overview.

As far as your patches go - yes, we need tests and we also need tests that cover corner cases. This is very important for us, we can live without the rest (like implementations on the interp-level). We do care about quality a lot.

Cheers,
fijal

+
+
+
+
+ + Dmitrey wrote on 2012-06-08 15:24: +
+
+

hi fijal,
as far as I remember, main reasons of PyPy developers (I don't remember namely) to reject my funcs propositions were AppLevel vs InterpLevel, not corner testcases (they even said "don't start the func, it must be InterpLevel"). Thus to speedup OpenOpt port on PyPy I went other way and as you probably have seen that some OpenOpt Suite functionality is already available in PyPy and works some times faster.

If apperplevel is ok for some of those funcs mentioned above, you or any other PyPy programmer can take anything from the code; as for me, I have lots of other things to do with my projects, especially now, before regular release, and thus cannot allocate time to create testcases for the numpy funcs.

BTW, what about fancy indexing with int arrays (https://bugs.pypy.org/issue1130) - when it will be implemented? It's very important for many Python projects and hangs for a long time already.

+
+
+
+
+ + Peter Thomson wrote on 2012-06-10 16:56: +
+
+

Congratulations to the new release to the best and most awesome team there is. We work daily with Python and PyPy and always look forward to the latest release :-)

+
+
+
+ +

Py3k status update #4

+ +
+

This is the fourth status update about our work on the py3k branch, which we
+can work on thanks to all of the people who donated to the py3k proposal.

+

For various reasons, less work than usual has been done since the last status
+update. However, some interesting things happened anyway.

+

As readers know, so far we spent most of the effort in fixing all PyPy's own
+tests which started to fail for various py2/py3 differences. Most of them
+failed for shallow reasons, e.g. syntactic changes or the int/long
+unifications. Others failed for subtle differences and needed a bit more care,
+for example the fact that unbound methods are gone in Py3k.

+

The good news is that finally we are seeing the light at the end of the
+tunnel. Most of them have been fixed. For some other tests, we introduced the
+concept of "py3k-skipping": some optimizations and modules are indeed failing,
+but right now we are concentrating on completing the core language and so we
+are not interested in those. Once the core language is done, we will be
+able to easily find and work on the py3k-skipped tests. In particular, for
+now we disabled the Int and String dict strategies, which are broken
+because of the usual int/long unification and str vs bytes. As for modules,
+for now _continuation (needed for stackless) and _multiprocessing do
+not work yet.

+

Another non-trivial feature we implemented is the proper cleaning of exception
+variables when we exit except blocks. This is a feature which touches
+lots of levels of PyPy, starting from astcompiler, down to the bytecode
+interpreter. It took two days of headache, but in the end we made it :-).

+

Additionally, Amaury did a lot of improvements to cpyext, which had been
+broken since forever on this branch.

+

As for the next plans, now that things are starting to work and PyPy's own
+tests mostly pass, we can finally start to run the compiled PyPy against
+CPython's test suite. It is very likely that we will have tons of failures at
+the beginning, but once we start to fix them one by one, a Py3k-compatible
+PyPy will be closer and closer.

+
+
+
+
+ + Connelly Barnes wrote on 2012-06-27 18:28: +
+
+

Does anyone actually use Python 3? That whole project of Guido's reminds me of "things you should never do: rewrites."

https://www.neilgunton.com/doc/?o=1&doc_id=8583

+
+
+
+
+ + Unknown wrote on 2012-06-29 10:55: +
+
+

I cheered at your update when I saw it originally - but did not write this here.

Since no one else did that, yet, I want to go back to fix the mistake:

Great work!

I’m anxious to see my python3 code running under pypy!

+
+
+
+
+ + z1r0un wrote on 2013-08-06 04:24: +
+
+

@Connelly Barnes:
Wat.
I use Python3 almost exclusively, mainly because filter, map, and friends return iterators as FSM intended. I haven't done much string work, but that's another major win. And it's not like 2.7's EOL'd.
In short, un-bunch your knickers and roll with the times.

+
+
+
+ +

STM update: back to threads?

+ +
+

Hi again,

+Here is another update on the status of Software Transactional Memory on PyPy.

+Those of you who have been closely following this blog since last year know that, from the very first post about STM, I explored various design ideas about the API that we should get when programming in Python.

+I went full circle, and now I am back to where I started (with one important difference: a very roughly working implementation of pypy-stm).

+What I realized is that the "thread" module is not that bad after all --- I mean, yes, it is a horribly low-level interface, but it is general enough to build various interesting things on top of it. What the "stm-thread" branch of PyPy contains is, basically, the regular "thread" module in which the GIL was replaced with STM. It gives multicore capabilities to any program based on multiple threads. (This is so far exactly the same idea as the one being investigated for Hardware Transactional Memory. It is roughly also what you would get if you managed to convince GCC 4.7 to compile CPython using STM.)

+Now while this might already be quite interesting to some people, here is how it relates to all I said previously: namely, threads are bad, and some new "transaction" module would be a better idea.

+There is one new core functionality in the "stm-thread" branch: it is "thread.atomic", a context manager that can be used in a "with" statement (exact name subject to change). In terms of the GIL, it prevents the GIL from being released in the "with" block. In terms of STM, it prevents a "transaction break", which means that the whole "with" statement runs in one single transaction. (From the Python programmer's point of view, the net effect is the same.)

+So far, no ground-breaking news. But what I missed previously is that this is enough to give multicore capabilities even to a program that is not using threads so far. It is possible to rewrite an equivalent of the old transaction module in a few pages of pure Python, using "thread.atomic". Something along the following lines: start N threads that each reads from a Queue.Queue() the next job to do, and does it in a "with thread.atomic" block. The STM version of PyPy is then able to run these atomic blocks concurrently. The key point is that the slightly delicate handling of threads should be nicely hidden inside the new "transaction" module, and from outside the observed behavior would be exactly as if the transactions that we schedule are run serially.

+The point I kept missing was that, yes, this sounds like nonsense, because it seems that we create N threads just to serialize their work again in "thread.atomic" sections. In fact this would be nonsense in any model that would "just" remove the GIL to let multiple threads run concurrently without crashing. Indeed, you have multiple threads, but their atomic blocks would be again a sort of GIL: only one of them would run at a time. And this is indeed the simple model of execution that you get even with STM --- but not the model of performance. The performance with STM scales with the number of cores, as long as there is enough non-conflicting work to do.

+So in summary the complete circle back to the starting point is that threads might be a good low-level model. It lends itself naturally to, say, a kind of program in which the main thread polls file descriptors using select() or the Linux epoll(), and the work received is split along N other threads --- which is the kind of program you would naturally write in other languages that don't have a GIL, say Java. The other threads can then use "thread.atomic" blocks to protect sections of their work. The traditional Transactional Memory point of view is that you use such blocks to guard the short sections of code that communicate with other threads or modify global state, but nothing prevents you from using much larger sections: you should be able to scale them up to the size of a native "unit of work", so that every unit is naturally atomic. And then it's only a matter of design: you can tweak an existing module that does the thread pooling to add one "with thread.atomic"; or do it yourself from scratch; or (if the design is compatible enough) just plug in the proposed pure-Python "transaction" module. Or if you feel like it you can even use threads directly (but keep in mind that using threads too explicitly is not a composable abstraction, whereas higher-level designs typically are).

+At the end of the day, you can write or reuse programs whose global structure you are already familiar with, for example with a thread pool (that can be hidden in a library if you prefer), or any other structure with or without explicit threads. But you can do so without all the mess that comes with threads like locks and deadlocks. From that angle it is really similar to Garbage Collection: e.g. the Boehm GC (now used by GCC itself) lets you write C code like you are used to, but forgetting all you had to learn about careful explicit memory management.

+
+
+
+
+ + Benjamin wrote on 2012-05-08 04:38: +
+
+

So I'm not sure if I fully grok STM, but my basic understanding of the workflow for a transaction is this:

1. Make a copy of whatever it is you're planning to use, ie, 'stuff'.
2. Do anything that doesn't have side effects (writing to memory/disk).
3. Acquire a lock & compare the state of the parts of 'stuff' you want to change to the current state.
4a. If 'stuff to write' is unchanged, write it and release lock.
4b. Otherwise, release lock and restart transaction.

With the context manager, how is 'stuff' determined? Does it record everything in locals()? That seems like it might be excessive. Would it make sense to expose 'stuff' to the programmer?

If you were to expose 'stuff' to the programmer, I'd think you'd want a new local context where the only variables available were those explicitly specified as 'stuff' (and builtins, etc) so as to avoid congruency accidents. Something like:

with atomic(f, x, y, z, q) as f, x, y, z, q:
z += f(x, y)
y = x
x = q.pop()

This would also help remind folks to keep their transactions small.

Furthermore, this could easily be transformed into a very useful (function) decorator that uses the function's arguments as the 'stuff'.

Am I missing something? Are my suggestions reasonable?

+
+
+
+
+ + Unknown wrote on 2012-05-08 06:09: +
+
+ this might give you some insight into another approach for passing messages (aka information) between threads which might be GIL friendly. +
+
+
+
+ + Frankier wrote on 2012-05-08 07:29: +
+
+

@Benjamin:

My understanding is STM is using these type of transactions: https://en.wikipedia.org/wiki/Optimistic_concurrency_control

+
+
+
+
+ + Armin Rigo wrote on 2012-05-08 08:17: +
+
+

@Benjamin: no, that's not reasonable at all in the context of large transactions. "Help remind folks to keep their transactions small" is precisely what I don't want: I want large transactions. This might be harder to do efficiently, it might be more conflict-prone, etc.; but what I don't want is the classical situation where you have to be very careful about keeping your transactions as small as possible, because that's just as hard and error-prone as using locks.

What I want is for "the average programmer" to not use the "thread" module at all, including "thread.atomic". This should be part of a library that does thread pooling and dispatching (large) transactions.

+
+
+
+
+ + Kristján Valur wrote on 2012-05-08 11:33: +
+
+

You know, of course, that stackless has an "atomic" property, and stacklesslib has an stacklesslib.utils.atomic ctxtmgr.

I recently modified stackless so that the "atomic" property also inhibited GIL release, so that inter-thread tasklet operations could be made safe.

On a whim I scoured the python archives and found that such a property had been proposed to cPython but rejected (unwisely imho) in favor of general locking.

Perhaps we can get them to reconsider?

+
+
+
+
+ + Kristján Valur wrote on 2012-05-08 11:41: +
+
+

Oh, and btw:
an "atomic" property in regular cPython (and stackless) of course only prevents preemptive release of the GIL. Any blocking IO calls will still cause a "co-operative" GIL release. For this reason, "atomic" cannot be replace generic locks completely.

How does this play with longer "transactions" in STM?

+
+
+
+
+ + Armin Rigo wrote on 2012-05-08 11:54: +
+
+

@Kris: ah, interesting. You did the same as what I attempted in my hack of CPython at https://bitbucket.org/arigo/cpython-withatomic . This didn't really work out, though, because the stdlib (including file objects) use regular locks. A simple "print" in an atomic block could lead to deadlocks: the atomic block can block waiting for the stdout's file lock to be released, but it does so without releasing the GIL. Now the lock would typically be released by another thread --- if only it could grab the GIL for a short while.

You can see the workaround I found in the last few commit messages of the above repository, but I'm not satisfied with it... In general I'm still unsure what the best way is. For now in pypy-stm I'm going to hack on a case-by-case basis to convert the locks to atomic sections.

Perhaps it is possible to do it semi-generically, e.g. convert all syntactically nested "with lock:" statements in the user code into "with atomic:" statements (similar to next year's Intel CPUs, which will have "lock elision" to help convert from lock-based to HTM programs). As far as I know, this idea doesn't work in all situations, e.g. if you acquire a lock in one thread and release it in another thread.

As far as I can say, this issue is the main blocker preventing any further progress on the CPython side. It is certainly the reason I stopped pushing for it last year.

+
+
+
+
+ + Armin Rigo wrote on 2012-05-08 11:58: +
+
+

@Kris: ah, ok: you have a version of "atomic" that doesn't prevent the GIL from being released around I/O calls. This is different from the version described in this post, which is also what I assumed in my previous answer. In a "with atomic" block, the GIL is not released under any circumstance (equivalently, the whole "atomic" block runs as a single transaction), so that the programmer can assume that a "with atomic" block is truly atomic.

+
+
+
+
+ + Unknown wrote on 2012-05-08 12:59: +
+
+

How would a code example look for thread.atomic?

+
+
+
+
+ + Armin Rigo wrote on 2012-05-08 13:23: +
+
+

@Arne: here is an example using directly thread.atomic. In your multithreaded application, at some point, you want to remove an item from list1 and add it to list2, knowing that list1 and list2 are also accessed by other threads. Then you write:

with thread.atomic:
x = list1.pop()
list2.append(x)

This is a classical STM example. What I'm pushing for is not that, though: it is for not writing multithreaded code in the first place. With the proper library code you can write code like the first few lines of transaction. The library code would itself use thread.atomic, but not you directly.

+
+
+
+
+ + Kristján Valur wrote on 2012-05-08 15:02: +
+
+

Yes, sorry for not being clear, Armin. But an "atomic" flag that inhibits involountary thread switching is useful too, because it is a fast "lock" around all kinds of code:

with atomic:
foo = foo+1 #multi-threading-safe

without the overhead of real locks.
In our GIL world, real locks only benefit areas that incur thread-blocking operations such as IO.

Anyway, that is off-topic, I suppose :)

+
+
+
+
+ + Kristján Valur wrote on 2012-05-08 15:06: +
+
+

Of course, we cannot replace thread._Lock with an "atomic" equivalent, because it is a non-recursive entity, also used for such things as condition variables!.

Not a very wise move, in retrospect.

+
+
+
+
+ + Armin Rigo wrote on 2012-05-08 16:38: +
+
+

@Kris: indeed. I found out a way that should in all cases either work or raise an exception if unsupported (and not unexpectedly deadlock).

The unsupported situation is: we are in a "with atomic" block trying to acquire a lock, and this lock is acquired already. In this case, there is nothing the interpreter can do automatically. It can only complain rather than deadlocking: no other thread is going to run in parallel to release the lock.

This should let the "common use case" work, which is locks used as scoped mutexes. Caveat: only as long as you use them either only in "with atomic" blocks --- because they appear to be fully serialized, so the mutex will never block --- or only outside "with atomic" blocks.

This leaves the case of mixed usage as unsupported, but I don't see how it could reasonably be supported.

So for now, pypy-stm will raise "oups deadlock" if you try to use "print" statements both inside and outside atomic blocks in parallel... that's the best I could come up with so far.

+
+
+
+
+ + Anonymous wrote on 2012-05-09 00:38: +
+
+

thanks for the article. might want to reword "This is so far exactly the idea same than the one being investigated for Hardware Transactional Memory.". :)

+
+
+
+
+ + Ole Laursen wrote on 2012-05-11 12:20: +
+
+

To expand slightly on what someone else commented, there was a talk not too long ago by some guys who found out using queues to communicate between threads can be pretty hefty bottleneck. They were using the JVM.

The talk is interesting because they actually measured the stuff they do and compared it with how it affects the CPU pipelines/caches. The queue discussion is around 32 minutes into the talk.

It's perhaps not relevant for pypy-stm at the moment, but it's definitely relevant for anyone interested in high-performance multithreaded code.

+
+
+
+
+ + Dima Q wrote on 2012-05-18 10:03: +
+
+

Good job, Armin!

This is exactly what Python needs, and if turns out hard rather than insanely hard, all the better!

+
+
+
+
+ + Jonas W. wrote on 2012-05-21 17:51: +
+
+

I am not entirely sure about the concept which is being implemented in PyPy-stm or better, which is planned for a parallel PyPy in the future.

I think am a pretty conservative programmer, and I actually dislike the idea of running code twice because of conflicts which could have been foreseen at development time ;). I still see the advantages STM brings regarding development time.

So I'm wondering about a point which was not entirely clear in your post. You're saying you don't want people to (be forced to?) write short transactions. However, I could still in a project which is both CPU and memory intensive try to keep the thread.atomic sections as small as possible to avoid unneccessary overheads but still get effective logs?

+
+
+
+
+ + Armin Rigo wrote on 2012-05-21 22:42: +
+
+

@Jonas: it is not always obvious at development time -- to say the least -- how to avoid all conflicts. Think about how hard it is to add automatic GC to C++ in a large project: it's messy but you might get pretty far with just reference counting -- until some point when you loose because of cyclic references. If instead you had used a proper GC-managed language, the problem would just not exist. It's the same about Transactional Memory and conflicts: you can either think harder and harder about using locks correctly, until your programs becomes really messy; then you give up and use TM, solving the issue instantly and letting you think again about your original problem.

Regarding the transaction size: with a good implementation, big transactions should not be slower than small transactions. The only potential drawback of having big transactions is that the risks of conflicts might increase (depending on your program).

Note that this question has a different answer for Python than for C, where code outside transactions runs faster than code within transactions. It is not so in Python. The reason is that transactions are always needed in Python: either explicitly, or implicitly in order to protect the interpreter structures (in replacement of the famous GIL).

+
+
+
+
+ + Connelly Barnes wrote on 2012-05-30 05:53: +
+
+

Is there any plan to add type declarations as some optional mode in PyPy, like Cython allows? Because PyPy can sometimes give some speed up, but when it doesn't it seems the alternative for the user is to go back to CPython + Cython.

+
+
+
+
+ + Unknown wrote on 2012-06-05 12:26: +
+
+

@Armin: Looks nice!

But you’re right: The explicit transaction still looks nicer.

I think though, that both can nicely complement each other:

(1) The transaction is efficient for pushing out parts of the code from the main run to get it multithreaded (think “#pragma omp parallel for” from OpenMP).

(2) The thread.atomic is efficient for protecting stuff inside a threaded application. Also I like that I don’t have to explicitely state which variables I want to protect. And I like that it is not full locking: If I don’t actually get a conflict, other code still runs in parallel.

The first actually looks more interesting though, because it might be possible to make every for-loop run like this, as long as later runs are not dependent on the result of previous runs. This would require quite heavy runtime analysis, though.

+
+
+
+ +

STM update (and thanks everybody)

+ +
+

A short update on the Software Transactional Memory (STM) side. Let me remind you that the work is to add STM internally into PyPy, with the goal of letting the user's programs run on multiple cores after a minor adaptation. (The goal is not to expose STM to the user's program.) I will soon write some official documentation that explains in more detail exactly what you get. For now you can read the previous blog posts, and you can also find technical details in the call for donation itself; or directly look at how I adapted the examples linked to later in this post.

+

I have now reached the point where the basics seem to work. There is no integration with the JIT so far; moreover the integration with the Garbage Collection subsystem is not finished right now, but at least it is "not crashing in my simple tests and not leaking memory too quickly". (It means that it is never calling __del__ so far, although it releases memory; and when entering transactional mode or when going to the next transaction, all live objects become immortal. This should still let most not-too-long-running programs work.)

+

If you want to play with it, you can download this binary (you need to put it in a place with the paths lib-python and lib_pypy, for example inside the main directory from a regular nightly tarball or from a full checkout). This version was compiled for Linux x86 32-bit from the stm-gc branch on the 25th of April. It runs e.g. the modified version of richards. This branch could also be translated for Linux x86-64, but not for other OSes nor other CPUs for now.

+

The resulting pypy-stm exposes the same interface as the pure Python transaction module, which is an emulator (running on CPython or any version of PyPy) which can be used to play around and prepare your programs. See the comments in there. A difference is that the real pypy-stm doesn't support epoll right now, so it cannot be used yet to play with a branch of Twisted that was already adapted (thanks Jean-Paul Calderone); but that's coming soon. For now you can use it to get multi-core usage on purely computational programs.

+

I did for example adapt PyPy's own translate.py: see the tweak in rpython/rtyper.py. Lines 273-281 are all that I needed to add, and they are mostly a "simplification and parallelization" of the lines above. There are a few more places in the whole translate.py that could be similarly modified, but overall it is just that: a few places. I did not measure performance, but I checked that it is capable of using multiple cores in the RTyping step of translation, with --- as expected --- some still-reasonable number of conflicts, particularly at the beginning when shared data structures are still being built.

+

On a few smaller, more regular examples like richards, I did measure the performance. It is not great, even taking into account that it has no JIT so far. Running pypy-stm with one thread is roughly 5 times slower than running a regular PyPy with no JIT (it used to be better in previous versions, but they didn't have any GC; nevertheless, I need to investigate). However, it does seem to scale. At least, it scales roughly as expected on my 2-real-cores, 4-hyperthreaded-cores laptop (i.e. for N between 1 and 4, the N-threaded pypy-stm performs similarly to N independent pypy-stm's running one thread each).

+

And finally...

+

...a big thank you to everyone who contributed some money to support this! As you see on the PyPy site, we got more than 6700$ so far in only 5 or 6 weeks. Thanks to that, my contract started last Monday, and I am now paid a small salary via the Software Freedom Conservancy (thanks Bradley M. Kuhn for organizational support from the SFC). Again, thank you everybody!

+

UPDATE: The performance regression was due to disabling an optimization, the method cache, which caused non-deterministic results --- the performance could vary by a factor of two. Today, as a workaround, I made the method cache transaction-local for now; it is only effective for transactions that run for long enough (maybe 0.1ms or 1ms), but at least it is there in this situation. In the version of richards presented above, the transactions are too short to make a difference (around 0.015ms).

+
+
+
+
+ + Anonymous wrote on 2012-04-27 20:37: +
+
+

I don't get it. It's great that pypy libs and so on will be multithreaded with good performance, but how does that help you to write a multithreaded program with good performance, if you don't expose the tools you used to do that?

+
+
+
+
+ + Alexander Sedov wrote on 2012-04-27 20:44: +
+
+

Interface is exposed; transaction module it is.

+
+
+
+
+ + Texatril wrote on 2012-04-27 20:44: +
+
+

I think the idea is that the GIL would be gone since internally the interpreter would use STM, and at the programmer level, you would be free to use the normal threading mechanisms

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-04-27 20:49: +
+
+

@Texatril no, the point is you would not have to. You write a normal event-based program with transaction module and boom it works. It's easier than writing correct multithreaded code.

+
+
+
+
+ + Anonymous wrote on 2012-04-27 20:50: +
+
+

Ah, you kinda contradicted yourself by saying the goal wasn't to expose STM to users' programs, but then saying that it exposed the same API as the transaction module.

The transaction module is pretty horrible though. Might I suggest a better syntax than the transaction module? Something like exceptions would be better:

begin:
...
commit

or:

transaction:
...
rollback:
retry

perhaps with an option (in a later version?) to replace the "retry" with alternate code.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-04-27 20:52: +
+
+

@Anonymous that would be a bad API, because you cannot fail a transaction. It'll be automatically retried until it finishes. That's in-line with correct programs, just multithreaded

+
+
+
+
+ + Unknown wrote on 2012-04-28 09:37: +
+
+

Anonymous: the user level API is any asynchronous event handling framework that uses the transaction library internally to handle events in parallel.

So, for example, you take *any* Twisted program and run it on pypy-stm and it will use the available number of cores to process events without losing any of the normal correctness guarantees of event-based programming.

+
+
+
+
+ + Armin Rigo wrote on 2012-04-29 07:43: +
+
+

The goal is really not to expose STM to the user. The pure Python transaction module is a working implementation, running on a single core but running. The fact that pypy-stm provides an alternate implementation, based on STM and giving multi-core usage --- this is the implementation detail.

That's why it has the kind of API you see, and not some STM syntax like "begin: rollback: commit". I also dislike custom keywords, because then we can no longer run the program on CPython or non-STM-based PyPys. But I know I am no language designer myself, so the details are open for discussion.

Nick: thanks for the precisions. Note however that the transaction module is also meant to be used directly, e.g. in CPU-intensive computational programs that don't use any event framework, like I did in rpython/rtyper.py.

+
+
+
+
+ + Unknown wrote on 2012-05-01 06:10: +
+
+

That sounds great!

From the code I wondered, though, if it’s not actually only 2 lines:

for block in pending:
transaction.add(self.specialize_block, block)
transaction.run()

That sounds like map() - for example like the futures module:

with concurrent.futures.ThreadExecutor() as e:
e.map(...)

Similarly something like

with transaction.Runner() as r:
r.map(self.specialize_block, block)

might be easier.

Anyway: Your STM project sounds great!

+
+
+
+
+ + Armin Rigo wrote on 2012-05-01 08:51: +
+
+

@arne: right, maybe. It points to a similarity, at least. This simple example corresponds nicely to map(), but in other examples (like richards) we add() more transactions from within transactions. Nevertheless, using the "with ... as t:" syntax might work, by passing the "t" inside transactions in order to call t.map() or t.add() on it too.

This would also open the door to naturally nest these constructs. Right now if you call transaction.run() inside a transaction, you get an error. Such a case is more work to support in the current implementation, but from the surface it looks like a transaction.Runner() kind of interface should allow us to express what we need.

+
+
+
+
+ + Unknown wrote on 2012-05-03 19:51: +
+
+

@Armin: Nice! Congrats for the great project!

+
+
+
+ +

NumPy on PyPy progress report

+ +
+

Hello.

+

A lot of things happened in March, like pycon. I was also busy doing other +things (pictured), so apologies for the late numpy status update.

+

However, a lot of things have happened and numpy continues to be one of the +main points of entry for hacking on PyPy. Apologies to all the people whose +patches I don't review in a timely manner, but seriously, you do a lot of +work.

+

This list of changes is definitely not exhaustive, and I might be forgetting +important contributions. In a loose order:

+
    +
  • +

    Matti Picus made out parameter work for a lot of (but not all) +functions.

    +
  • +
  • +

    We merged record dtypes support. The only missing dtypes left are complex +(important), datetime (less important) and object (which will probably +never be implemented because it makes very little sense and is a mess with moving GCs).

    +
  • +
  • +

    Taavi Burns and others implemented lots of details, including lots of ufuncs. +On the completely unscientific measure of "implemented functions" on +numpypy status page, we're close to 50% of numpy working. In reality +it might be more or less, but after complex dtypes we're getting very close +to running real programs.

    +
  • +
  • +

    Bool indexing of arrays of the same size should work, leaving only +arrays-of-ints indexing as the last missing element of fancy indexing.

    +
  • +
  • +

    I did some very early experiments on SSE. This work is seriously +preliminary - in fact the only implemented operation is addition of +float single-dimension numpy arrays. However, results are encouraging, +given that our assembler generator is far from ideal:

    + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +

    Numpy

    +
    +

    PyPy SSE

    +
    +

    PyPy

    +
    +

    GCC non-looped

    +
    +

    GCC looped

    +
    +

    a+b

    +
    +

    0.6s

    +
    +

    0.3s

    +
    +

    0.4s

    +
    +

    0.3s

    +
    +

    0.25s

    +
    +

    a+b+c

    +
    +

    1.9s

    +
    +

    0.35s

    +
    +

    0.5s

    +
    +

    0.7s

    +
    +

    0.32s

    +
    +

    a+b+c+d+e

    +
    +

    3.2s

    +
    +

    0.36s

    +
    +

    0.8s

    +
    +

    1.7s

    +
    +

    0.51s

    +
    +

    The benchmark repo is available. GCC was run with -O3, no further +options specified. PyPy was run with default options, the SSE branch is under +backend-vector-ops, but it's not working completely yet.

    +

    One might argue that C and Python are not the same code - indeed they are not. +It just shows one possible approach to writing numeric code.

    +
  • +
+

Next step would be to just continue implementing missing features such as

+
    +
  • specialised arrays, i.e. masked arrays and matrices
  • +
  • core modules such as fft, linalg, random.
  • +
  • numpy's testing framework
  • +
+

The future is hard to predict, but we're not far off!

+

Cheers,
fijal

+ +

UPDATE: Indeed, string and unicode dtypes are not supported yet. They're as important as the complex dtype.

+
+
+
+
+ + Jeff Terrace wrote on 2012-04-17 18:53: +
+
+

I think the string dtype is missing too?

+
+
+
+
+ + Anonymous wrote on 2012-04-17 19:57: +
+
+

Hello,

May you get a bit more precise on the GCC test ?

For instance, is the GCC code using SSE too ? Is it written in a single loop (x[i] = a[i] + b[i] + c[i]) or in several consecutive loops first a+b then (a+b) + c ?

Just to know :-)

+
+
+
+
+ + Winston Ewert wrote on 2012-04-17 20:03: +
+
+

One thing I'll note is that I do from time to time use the object dtype. Occasionally, I've got multidimensional arrays of objects, and the array operations from numpy are useful. I don't really get a speed advantage there, but the interface from numpy is useful. But its not super necessary and certainly not a priority.

+
+
+
+
+ + Anonymous wrote on 2012-04-17 20:04: +
+
+

Sorry, didn't RTFA completely. I just had a look at the C code.

Still, a question: is PyPy doing the optimization of combining operations in one step ?

A "good" Fortran compiler should be able to do those optimizations, for instance.

+
+
+
+
+ + Gaël wrote on 2012-04-17 21:17: +
+
+

You should compare to numpy with a JIT, such as numexpr, it would be interesting to see whether PyPy is able to beat the numexpr JIT.

+
+
+
+
+ + x wrote on 2012-04-17 22:55: +
+
+

Very cool!

+
+
+
+
+ + Armin Rigo wrote on 2012-04-18 10:07: +
+
+

"busy doing other things (pictured)". Pictured where? :-)

+
+
+
+
+ + Ralf Gommers wrote on 2012-04-18 20:45: +
+
+

Hi, Numpy masked arrays, matrices and the testing framework are pure Python, so why do you need to implement them?

+
+
+
+
+ + Alex wrote on 2012-04-18 22:20: +
+
+

Ralf, we don't have to implement the pure-python stuff, so much as we need to make sure the features of NumPy's core that they depend on are implemented.

+
+
+
+
+ + EOL (Eric O LEBIGOT) wrote on 2012-04-19 10:17: +
+
+

Support for objects is actually quite useful: please reconsider adding it.

Here is a very useful case: the manipulation of arrays of numbers with uncertainties (special uncertainties.UFloat objects). Numbers with uncertainties behave very much like regular numbers: it is very useful to be able to use the regular NumPy syntax for array operations, for calculating matrix inverses when the matrices contain number with uncertainties, etc. I know many people use these features.

It would be *great* (read: irreplaceable :) to have support for the object NumPy dtype.

+
+
+
+
+ + Unknown wrote on 2012-04-19 13:47: +
+
+

This sounds really cool!

And it would be awesome if you’d manage to coordinate with numpy, so the projects merge to a single python codebase with two separate backends: One C-Extension based for CPython and one Pure-Python based for pypy.

+
+
+
+
+ + Anonymous wrote on 2012-04-19 17:19: +
+
+

Any chance comparing with Fortran? There are assumptions about pointers and alignment that Fortran compiler can make.

+
+
+
+
+ + Unknown wrote on 2012-04-20 15:26: +
+
+

Nice...but what is the next step?
Numpy alone is not that useful.

"We" need at least scipy and matplotlib.

Are you going to port all these modules? I don't think so.

One way forward could be to have numpy in pypy and at least scipy and matplotlib working with the pypy C api at a decent speed.

What do you think?

+
+
+
+
+ + Anonymous wrote on 2012-04-22 20:07: +
+
+

What about pickling? I'd love to experiment with hybrid CPython/PyPy execution using some magic from the multiprocessing module or a similar parallel computation framework.

+
+
+
+
+ + Anonymous wrote on 2012-07-30 21:02: +
+
+

Hello,

This is a very promising result, thank you for sharing it.
Could you give a few more details about the differences wrt to numpy?

What would people have to do to use numpypy with scipy?

+
+
+
+
+ + Raul Durand wrote on 2012-08-06 16:52: +
+
+

I think the numpy.linalg module is pretty important.
How to move efforts into this?

+
+
+
+
+ + Raul Durand wrote on 2012-08-06 16:53: +
+
+

I think the numpy.linalg module is pretty important.
How to move efforts into this?

+
+
+
+ +

PyCon 2012 wrap up

+ +
+

So, PyCon happened. This was the biggest PyCon ever and probably the biggest +gathering of Python hackers ever.

+

From the PyPy perspective, a lot at PyCon was about PyPy. Listing things:

+
    +
  • David Beazley presented an excellent keynote describing his experience +diving head-first into PyPy and at least partly failing. He, however, did +not fail to explain bits and pieces about PyPy's architecture. +Video is available.
  • +
  • We gave tons of talks, including the tutorial, why pypy by example +and pypy's JIT architecture +
  • +
  • We had a giant influx of new committers, easily doubling the number of pull +requests ever created for PyPy. The main topics for newcomers were numpy and +py3k, disproving what David said about PyPy being too hard to dive into ;)
  • +
  • Guido argued in his keynote that Python is not too slow. In the meantime, +we're trying to prove him correct :-)
  • +
+

We would like to thank everyone who talked to us, shared ideas and especially +those who participated in sprints - we're always happy to welcome newcomers!

+

I'm sure there are tons of things I forgot, but thank you all!

+

Cheers, +fijal

+
+
+
+
+ + Dave Beazley wrote on 2012-04-14 00:16: +
+
+

I'm so happy to be proven wrong!

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-04-14 09:36: +
+
+

I think "proven" is a bit strong word, we're trying though :)

+
+
+
+ +

Py3k status update #3

+ +
+
This is the third status update about my work on the py3k branch, which I can work on thanks to all of the people who donated to the py3k proposal.

+A lot of work has been done during the last month: as usual, the list of changes is too big to be reported in a detailed way, so this is just a summary of what happened.

+One of the most active areas was killing old and deprecated features. In particular, we killed support for the __cmp__ special method and its cousins, the cmp builtin function and keyword argument for list.sort() and sorted(). Killing is easy, but then you have to fix all the places which break because of this, including all the types which relied on __cmp__ to be comparable, fixing all the tests which tried to order objects which are no longer orderable now, or implementing new behavior like forbidding calling hash() on objects which implement __eq__ but not __hash__.

+Among the other features, we killed lots of now-gone functions in the operator module, the builtins apply(), reduce() and buffer, and the os.* functions to deal with temporary files, which have been deprecated in favour of the new tempfile module.

+The other topic which can't be missing from a py3k status update is, as usual, string-vs-unicode. In this round, we fixed bugs in string formatting (in particular to teach format() to always use unicode strings) and various corner cases when calling the (possibly overridden) __str__ method on subclasses of str. Believe me, you don't want to know the precise rules :-).

+Other features which we worked on and fixed tests include, but are not limited to, marshal, hashlib, zipimport, _socket and itertools, plus the habitual endless lists of tests which fail for shallow reasons such as the syntactic differences, int vs long, range() vs list(range()) etc. As a result, the number of failing tests dropped from 650 to 235: we are beginning to see the light at the end of the tunnel :-)

+Benjamin finished implementing Python 3 syntax. Most of it was small cleanups and tweaks to be compatible with CPython such as making True and False keywords and preventing . . . (note spaces between dots) from being parsed as Ellipsis. Larger syntax additions included keyword only arguments and function annotations.

+Finally, we did some RPython fixes, so that it is possible again to translate PyPy in the py3k branch. However, the resulting binary is a strange beast which mixes python 2 and python 3 semantics, so it is unusable for anything but showing friends how cool it is.

+I would like to underline that I was not alone in doing all this work. In particular, a lot of people joined the PyPy sprint at Pycon and worked on the branch, as you can clearly see in this activity graph. I would like to thank all who helped!
+
+cheers,
+Antonio and Benjamin
+
+
+
+
+ + Unknown wrote on 2012-04-11 11:19: +
+
+

Very cool work!

Thanks for the update! I‘ll need to see if I can already let it hit my own python3 project (I had to convert that to python2.x to make it run with pypy, being able to get rid of that step would be really cool!)

Do you already have prebuilt binaries of pypy3?

+
+
+
+
+ + Antonio Cuni wrote on 2012-04-14 11:21: +
+
+

I don't think that there is any chance that a python3 project will run as of now, there are still tons of features missing. So far my job as mostly been to fix all the failing tests in the PyPy testsuite. When I'll have finished, I'll be able to start with new features.

And, for the same reason: no prebuilt binaries yet, sorry.

+
+
+
+
+ + Unknown wrote on 2012-04-19 13:41: +
+
+

OK, thanks for the info!

I’m anxious to test it, once you give it a chance to run simple unicode-using code!

+
+
+
+
+ + Anonymous wrote on 2012-05-30 20:01: +
+
+

Pocoo's pastebin has unfortunately permanently shut down. Any chance you could repaste how cool it is somewhere else?

+
+
+
+ +

PyPy sprint in Leipzig, Germany (June 22-27)

+ +
+

The next PyPy sprint will be held --- for the first time in a while --- +in a place where we haven't been so far: Leipzig, Germany, at the +Python Academy's Teaching Center. It will take place from the 22nd +to the 27th of June 2012, before EuroPython. Thanks to Mike Müller for +organizing it!

+

This is a fully public sprint, everyone is welcome to join us. All days are +full sprint days, so it is recommended to arrive the 21st and leave the 28th.

+

Topics and goals

+

Open. Here are some goals:

+
    +
  • numpy: progress towards completing the numpypy module; try to +use it in real code
  • +
  • stm: progress on Transactional Memory; try out the transaction module on real code.
  • +
  • jit optimizations: there are a number of optimizations we can still +try out or refactor.
  • +
  • work on various, more efficient data structures for Python language. +A good example would be lazy string slicing/concatenation or more efficient +objects.
  • +
  • any other PyPy-related topic is fine too.
  • +
+

Grants

+

For students, we have the possibility to support some costs via PyPy +funds. Additionally, we can support you applying for grants from the +PSF and other sources.

+

Registration

+

If you'd like to come, please sign up either by announcing yourself on +pypy-dev, or by directly adding yourself to the list of people. +(We need to have a head count for the organization.) If you are new to +the project please drop a note about your interests and post any +questions.

+

More...

+

For more information, please see the sprint announcement.

+
+

Call for donations for Software Transactional Memory

+ +
+

Hi all,

+ +

The Software Transactional Memory +call for donations is up. From the proposal:

+ + + + + +
+Previous attempts on Hardware Transactional Memory focused on parallelizing existing programs written using the thread or threading modules. However, as argued here, this may not be the most practical way to achieve real multithreading; it seems that better alternatives would offer good scalability too. Notably, Transactional Memory could benefit any event-based system that is written to dispatch events serially (Twisted-based, most GUI toolkit, Stackless, gevent, and so on). The events would internally be processed in parallel, while maintaining the illusion of serial execution, with all the corresponding benefits of safety. This should be possible with minimal changes to the event dispatchers. This approach has been described by the Automatic Mutual Exclusion work at Microsoft Research, but not been implemented anywhere (to the best of our knowledge). +

+Note that, yes, this gives you both sides of the coin: you keep using your non-thread-based program (without worrying about locks and their drawbacks like deadlocks, races, and friends), and your programs benefit from all your cores. +

+In more details, a low-level built-in module will provide the basics to start transactions in parallel; but this module will be only used internally in a tweaked version of, say, a Twisted reactor. Using this reactor will be enough for your existing Twisted-based programs to actually run on multiple cores. You, as a developer of the Twisted-based program, have only to care about improving the parallelizability of your program (e.g. by splitting time-consuming transactions into several parts; the exact rules will be published in detail once they are known). +
+

The point is that your program is always correct, and can be tweaked to improve performance. This is the opposite from what explicit threads and locks give you, which is a performant program which you need to tweak to remove bugs. Arguably, this approach is the reason for why you use Python in the first place :-)

+ +

Armin

+
+
+
+
+ + Konstantine Rybnikov wrote on 2012-03-08 21:13: +
+
+

Great news, really looking into experimenting with that, good luck!

My question is: will it map to os thread being created on each event dispatch or can it potentially be somehow optimized? I mean, you can potentially end up with code that has tons of small events, and creating os thread on each event would slow down your program.

+
+
+
+
+ + Anonymous wrote on 2012-03-08 23:22: +
+
+

@k_bx it's not like that at all. There are links in the proposal that may enlighten, depending on what you already know.

+
+
+
+
+ + Armin Rigo wrote on 2012-03-09 01:49: +
+
+

Indeed, it is creating a pool of N threads and reusing them, where N is configurable. Ideally it should default to the number of cores you have, detected in some (sadly non-portable) way.

+
+
+
+
+ + Anonymous wrote on 2012-03-09 09:06: +
+
+

Are any of you affiliated with a university? Since this is research, maybe you can get a grant for a post-doc or a PhD position.

+
+
+
+
+ + Anonymous wrote on 2012-03-09 10:03: +
+
+

Trivial comment - on the donation page in the "What is Transactional Memory?" section, I think a (TM) has been turned into a superscript TM (as in trademark).

+
+
+
+
+ + Steve Phillips wrote on 2012-03-10 00:50: +
+
+

This sounds exciting for the kinds of Python programs that would benefit from TM, but can anyone give a ballpark estimate of what percentage of programs that might be?

Personally, I write various (non-evented) Python scripts (replacements for Bash scripts, IRC bot, etc) and do a lot of Django web dev. It's not clear that I or similar people would benefit from Transactional Memory.

Is that correct?

+
+
+
+
+ + Anonymous wrote on 2012-03-10 01:23: +
+
+

Could u update the donation page? It doesn't seem to be tallying the amounts.

I am really excited to see this work even if it is pure research (I donated $200). It would be awesome if

stm:
....pre:
........# init transaction state
....trans:
........# parallel stuff

So it would be easy to retry failed transactions or be able to reorder them for contention or perf.

+
+
+
+
+ + kurdakov wrote on 2012-03-17 17:07: +
+
+

offtopic:

there is a project to help bring C# and C++ together

https://github.com/mono/cxxi
and fork https://github.com/kthompson/cxxi

in essence: there is a generation step which allows then to easily use C++ objects in C# and vice versa.

considering that ctypes are very much like p/invoke, it looks like pypy might have something similar for python/C++ environments , this might allow much easier to port, for example, Blender to use pypy as scripting language.

+
+
+
+
+ + Arne Babenhauserheide wrote on 2012-03-22 14:08: +
+
+

Could you post an example snippet of code which would benefit from that?

I ask because I have trouble really imagining example code.

Something minimal with the least possible amount of extension modules which I could just throw into the pypy and pypy-tm interpreter and see the difference.

+
+
+
+
+ + Armin Rigo wrote on 2012-04-02 15:12: +
+
+

I wrote a minimal example here:

https://foss.heptapod.net/pypy/pypy/-/tree/branch//stm-gc/lib_pypy/transaction.py

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-24.html b/blog/index-24.html new file mode 100644 index 000000000..c20f9e19f --- /dev/null +++ b/blog/index-24.html @@ -0,0 +1,2420 @@ + + + + + + +PyPy (old posts, page 24) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

CFFI release 0.3

+ +
+

Hi everybody,

+

We released CFFI 0.3. This is the first release that supports more +than CPython 2.x :-)

+
    +
  • CPython 2.6, 2.7, and 3.x are supported (3.3 definitely, but maybe 3.2 or earlier too)
  • +
  • +PyPy trunk is supported.
  • +
+

In more detail, the main news is:

+
    +
  • support for PyPy. You need to get a trunk version of PyPy, which +comes with the built-in module _cffi_backend to use with the CFFI +release. For testing, you can download the Linux 32/64 versions of +PyPy trunk. The OS/X and Windows versions of _cffi_backend +are not tested at all so far, so probably don't work yet.
  • +
  • support for Python 3. It is unknown which exact version is +required; probably 3.2 or even earlier, but we need 3.3 to run the +tests. The 3.x version is not a separate source; it runs out of the same sources. Thanks Amaury for starting this port.
  • +
  • the main change in the API is that you need to use ffi.string(cdata) +instead of str(cdata) or unicode(cdata). The motivation for this +change was the Python 3 compatibility. If your Python 2 code used to +contain str(<cdata 'char *'>), it would interpret the memory content +as a null-terminated string; but on Python 3 it would just return a +different string, namely "<cdata 'char *'>", and proceed without even +a crash, which is bad. So ffi.string() solves it by always returning +the memory content as an 8-bit string (which is a str in Python 2 and +a bytes in Python 3).
  • +
  • other minor API changes are documented at +https://cffi.readthedocs.org/ (grep for version 0.3).
  • +
+

Upcoming work, to be done before release 1.0:

+
    +
  • expose to the user the module cffi.model in a possibly refactored +way, for people that don't like (or for some reason can't easily use) +strings containing snippets of C declarations. We are thinking about +refactoring it in such a way that it has a ctypes-compatible +interface, to ease porting existing code from ctypes to cffi. Note +that this would concern only the C type and function declarations, not +all the rest of ctypes.
  • +
  • CFFI 1.0 will also have a corresponding PyPy release. We are thinking +about calling it PyPy 2.0 and including the whole of CFFI (instead of +just the _cffi_backend module like now). In other words it will +support CFFI out of the box --- we want to push forward usage of CFFI +in PyPy :-) +
  • +
+

Cheers,

+

Armin Rigo and Maciej Fijałkowski

+
+

C++ objects in cppyy, part 1: Data Members

+ +
+

The cppyy module makes it possible to call into C++ from PyPy through the +Reflex package. +Documentation and setup instructions are +available here. +Recent work has focused on STL, low-level buffers, and code quality, but also +a lot on pythonizations for the +CINT backend, which is +mostly for High Energy Physics (HEP) use only. +A +previous posting walked +through the high-level structure and organization of the module, where it was +argued why it is necessary to write cppyy in RPython and generate bindings at +run-time for the best performance. +This posting details how access to C++ data structures is provided and is part +of a series of 3 postings on C++ object representation in Python: the second +posting will be about method dispatching, the third will tie up several odds +and ends by showing how the choices presented here and in part 2 work together +to make features such as auto-casting possible. + + +

+

Wrapping Choices

+ +

Say we have a plain old data type (POD), which is the simplest possible +data structure in C++. +Like for example: + +

+
    struct A {
+        int    m_i;
+        double m_d;
+    };
+ +

What should such a POD look like when represented in Python? +Let's start by looking at a Python data structure that is functionally +similar, in that it also carries two public data members of the desired +types. +Something like this: + +

+
    class A(object):
+        def __init__(self):
+            self.m_i = 0
+            self.m_d = 0.
+ +

Alright, now how to go about connecting this Python class with the former +C++ POD? +Or rather, how to connect instances of either. +The exact memory layout of a Python +A +instance is up to Python, and likewise the layout of a C++ +A instance is up +to C++. +Both layouts are implementation details of the underlying language, language +implementation, language version, and the platform used. +It should be no surprise then, that for example an +int in C++ looks +nothing like a +PyIntObject, even +though it is perfectly possible, in both cases, to point out in memory where +the integer value is. +The two representations can thus not make use of the same block of memory +internally. +However, the requirement is that the access to C++ from Python looks and feels +natural in its use, not that the mapping is exact. +Another requirement is that we want access to the actual object from both +Python and C++. +In practice, it is easier to provide natural access to C++ from Python than +the other way around, because the choices of memory layout in C++ are far more +restrictive: the memory layout defines the access, as the actual class +definition is gone at run-time. +The best choice then, is that the Python object will act as a proxy to the C++ +object, with the actual data always being in C++. + +

+

From here it follows that if the +m_i data member +lives in C++, then Python needs some kind of helper to access it. +Conveniently, since version 2.2, Python has a +property construct +that can take a getter and setter function that are called when the property +is used in Python code, and present it to the programmer as if it were a data +member. +So we arrive at this (note how the +property instance +is a variable at the class level): + +

+
    class A(object):
+        def __init__(self):
+            self._cppthis = construct_new_A()
+        m_i = property(get_m_i, set_m_i)
+        m_d = property(get_m_d, set_m_d)
+ +

The +construct_new_A +helper is not very interesting (the reflection layer can provide for it +directly), and methods are a subject for part 2 of this posting, so focus on +get_m_i +and set_m_i. +In order for the getter to work, the method needs to have access to the C++ +instance for which the Python object is a proxy. +On access, Python will call the getter function with the proxy instance for +which it is called. +The proxy has a +_cppthis data +member from which the C++ instance can be accessed (think of it as a pointer) +and all is good, at least for +m_i. +The second data member +m_d, however, +requires some more work: it is located at some offset into +_cppthis. +This offset can be obtained from the reflection information, which lets the +C++ compiler calculate it, so details such as +byte padding +are fully accounted for. +Since the setter also needs the offset, and since both share some more details +such as the containing class and type information of the data member, it is +natural to create a custom property class. +The getter and setter methods then become bound methods of an instance of that +custom property, +CPPDataMember, and +there is one such instance per data member. +Think of something along these lines: + +

+
    def make_datamember(cppclass, name):
+        cppdm = cppyy.CPPDataMember(cppclass, name)
+        return property(cppdm.get, cppdm.set)
+ +where the +make_datamember +function replaces the call to +property in the +class definition above. + +

Now hold on a minute! +Before it was argued that Python and C++ can not share the same underlying +memory structure, because of choices internal to the language. +But if on the Python side choices are being made by the developer of the +language bindings, that is no longer a limitation. +In other words, why not go through e.g. the Python extension API, and do +this: + +

+
    struct A_pyproxy {
+        PyObject_HEAD
+        int    m_i;
+        double m_d;
+    };
+ +

Doing so would save on +malloc overhead and remove +a pointer indirection. +There are some technical issues specific to PyPy for such a choice: there is +no such thing as +PyPyObject_HEAD +and the layout of objects is not a given as that is decided only at +translation time. +But assume that those issues can be solved, and also accept that there is no +problem in creating structure definitions like this at run-time, since the +reflection layer can provide both the required size and access to the +placement +new operator +(compare e.g. CPython's +struct module). +There is then still a more fundamental problem: it must be possible to take +over ownership in Python from instances created in C++ and vice-versa. +With a proxy scheme, that is trivial: just pass the pointer and do the +necessary bookkeeping. +With an embedded object, however, not every use case can be implemented: e.g. +if an object is created in Python, passed to C++, and deleted in C++, it +must have been allocated independently. +The proxy approach is therefore still the best choice, although embedding +objects may provide for optimizations in some use cases. + + +

+

Inheritance

+ +

The next step, is to take a more complicated C++ class, one with inheritance +(I'm leaving out details such as constructors etc., for brevity): + +

+
    class A {
+    public:
+        virtual ~A() {}
+        int    m_i;
+        double m_d;
+    };
+
+    class B : public A {
+    public:
+        virtual ~B() {}
+        int    m_j;
+    };
+ +

From the previous discussion, it should already be clear what this will look +like in Python: + +

+
    class A(object):
+        def __init__(self):
+            self._cppthis = construct_new_A()
+        m_i = make_datamember('A', 'm_i')
+        m_d = make_datamember('A', 'm_d')
+
+    class B(A):
+        def __init__(self):
+            self._cppthis = construct_new_B()
+        m_j = make_datamember('B', 'm_j')
+ +

There are some minor adjustments needed, however. +For one, the offset of the +m_i data member +may be no longer zero: it is possible that a virtual function dispatch table +(vtable) +pointer is added at the beginning of +A (an alternative +is to have the vtable pointer at the end of the object). +But if +m_i is handled the +same way as +m_d, with the +offset provided by the compiler, then the compiler will add the bits, if any, +for the vtable pointer and all is still fine. +A real problem could come in however, with a call of the +m_i property on +an instance of +B: in that case, +the _cppthis +points to a B +instance, whereas the getter/setter pair expect an +A instance. +In practice, this is usually not a problem: compilers will align +A and +B and calculate +an offset for +m_j from the start +of A. +Still, that is an implementation detail (even though it is one that can be +determined at run-time and thus taken advantage of by the JIT), so it can not +be relied upon. +The m_i getter +thus needs to take into account that it can be called with a derived type, +and so it needs to add an additional offset. +With that modification, the code looks something like this (as you would have +guessed, this is getting more and more into pseudo-code territory, although it +is conceptually close to the actual implementation in cppyy): + +

+
    def get_m_i(self):
+        return int(self._cppthis + offset(A, m_i) + offset(self.__class__, A))
+ +

Which is a shame, really, because the offset between +B and +A is going +to be zero most of the time in practice, and the JIT can not completely +elide +the offset calculation (as we will see later; it is easy enough to elide if +self.__class__ is +A, though). +One possible solution is to repeat the properties for each derived class, i.e. +to have a +get_B_m_i etc., but +that looks ugly on the Python side and anyway +does not work in all cases: e.g. with multiple inheritance where there are +data members with the same name in both bases, or if +B itself has a +public data member called +m_i that shadows +the one from A. +The optimization then, is achieved by making +B in charge of the +offset calculations, by making +offset a method of +B, like so: + +

+
    def get_m_i(self):
+        return int(self._cppthis + offset(A, m_i) + self.offset(A))
+ +

The insight is that by scanning the inheritance hierarchy of a derived +class like B, you +can know statically whether it may sometimes need offsets, or whether the +offsets are always going to be zero. +Hence, if the offsets are always zero, the method +offset on +B will +simply return the literal +0 as its +implementation, with the JIT taking care of the rest through inlining and +constant folding. +If the offset could be non-zero, then the method will perform an actual +calculation, and it will let the JIT elide the call only if possible. + + +

+

Multiple Virtual Inheritance

+ +

Next up would be multiple inheritance, but that is not very interesting: we +already have the offset calculation between the actual and base class, which +is all that is needed to resolve any multiple inheritance hierarchy. +So, skip that and move on to multiple virtual inheritance. +That this is going to be a tad more complicated will be clear if you show the +following code snippet to any old C++ hand and see how they respond. +Most likely you will be told: "Don't ever do that." +But if code can be written, it will be written, and so for the sake of the +argument, what would this look like in Python: + +

+
    class A {
+    public:
+        virtual ~A() {}
+        int m_a;
+    };
+
+    class B : public virtual A {
+    public:
+        virtual ~B() {}
+        int m_b;
+    };
+
+    class C : public virtual A {
+    public:
+        virtual ~C() {}
+        int m_c;
+    };
+
+    class D : public virtual B, public virtual C {
+    public:
+        virtual ~D() {}
+        int m_d;
+    };
+ +

Actually, nothing changes from what we have seen so far: the scheme as laid +out above is fully sufficient. +For example, D +would simply look like: + +

+
    class D(B, C):
+        def __init__(self):
+            self._cppthis = construct_new_D()
+        m_d = make_datamember('D', 'm_d')
+ +

Point being, the only complication added by the multiple virtual +inheritance, is that navigation of the C++ instance happens with pointers +internal to the instance rather than with offsets. +However, it is still a fixed offset from any location to any other location +within the instance as its parts are laid out consecutively in memory (this is +not a requirement, but it is the most efficient, so it is what is used in +practice). +But what you can not do, is determine the offset statically: you need a live +(i.e. constructed) object for any offset calculations. +In Python, everything is always done dynamically, so that is of itself not a +limitation. +Furthermore, +self is already +passed to the offset calculation (remember that this was done to put the +calculation in the derived class, to optimize the common case of zero +offset), thus a live C++ instance is there precisely when it is needed. +The call to the offset calculation is hard to elide, since the instance will +be passed to a C++ helper and so the most the JIT can do is guard on the +instance's memory address, which is likely to change between traces. +Instead, explicit caching is needed on the base and derived types, allowing +the JIT to elide the lookup in the explicit cache. + + +

+

Static Data Members and Global Variables

+ +

That, so far, covers all access to instance data members. +Next up are static data members and global variables. +A complication here is that a Python +property needs to +live on the class in order to work its magic. +Otherwise, if you get the property, it will simply return the getter function, +and if you set it, it will disappear. +The logical conclusion then, is that a +property +representing a static or global variable, needs to live on the class of the +class, or the metaclass. +If done directly though, that would mean that every static data member is +available from every class, since all Python classes have the same metaclass, +which is class +type (and which is +its own metaclass). +To prevent that from happening and because +type is actually +immutable, each proxy class needs to have its own custom metaclass. +Furthermore, since static data can also be accessed on the instance, the +class, too, gets a +property object +for each static data member. +Expressed in code, for a basic C++ class, this looks as follows: + +

+
    class A {
+    public:
+        static int s_i;
+    };
+ +

Paired with some Python code such as this, needed to expose the static +variable both on the class and the instance level: + +

+
    meta_A = type(CppClassMeta, 'meta_A', [CPPMetaBase], {})
+    meta_A.s_i = make_datamember('A', 's_i')
+
+    class A(object):
+        __metaclass__ = meta_A
+        s_i = make_datamember('A', 's_i')
+ +

Inheritance adds no complications for the access of static data per se, but +there is the issue that the metaclasses must follow the same hierarchy as the +proxy classes, for the Python method resolution order (MRO) to work. +In other words, there are two complete, parallel class hierarchies that map +one-to-one: a hierarchy for the proxy classes and one for their metaclasses. + +

+

A parallel class hierarchy is used also in other highly dynamic, +object-oriented environments, such as for example +Smalltalk. +In Smalltalk as well, class-level constructs, such as class methods and data +members, are defined for the class in the metaclass. +A metaclass hierarchy has further uses, such as lazy loading of nested +classes and member templates (this would be coded up in the base class of all +metaclasses: +CPPMetaBase), and +makes it possible to distribute these over different reflection libraries. +With this in place, you can write Python code like so: + +

+
    >>>> from cppyy.gbl import A
+    >>>> a = A()
+    >>>> a.s_i = 42
+    >>>> print A.s_i == a.s_i
+    True
+    >>>> # etc.
+ +

The implementation of the getter for +s_i is a lot +easier than for instance data: the static data lives at a fixed, global, +address, so no offset calculations are needed. +The same is done for global data or global data living in namespaces: +namespaces are represented as Python classes, and global data are implemented +as properties on them. +The need for a metaclass is one of the reasons why it is easier for namespaces +to be classes: module objects are too restrictive. +And even though namespaces are not modules, you still can, with +some limitations, +import from +them anyway. + +

+

It is common that global objects themselves are pointers, and therefore it +is allowed that the stored +_cppthis is not a +pointer to a C++ object, but rather a pointer to a pointer to a C++ object. +A double pointer, as it were. +This way, if the C++ code updates the global pointer, it will automatically +reflect on the Python side in the proxy. +Likewise, if on the Python side the pointer gets set to a different variable, +it is the pointer that gets updated, and this will be visible on the C++ side. +In general, however, the same caveat as for normal Python code applies: in +order to set a global object, it needs to be set within the scope of that +global object. +As an example, consider the following code for a C++ namespace +NS with +global variable +g_a, which behaves +the same as Python code for what concerns the visibility of changes to the +global variable: + +

+
    >>>> from cppyy.gbl import NS, A
+    >>>> from NS import g_a
+    >>>> g_a = A(42)                     # does NOT update C++ side
+    >>>> print NS.g_a.m_i
+    13                                   # the old value happens to be 13
+    >>>> NS.g_a = A(42)                  # does update C++ side
+    >>>> print NS.g_a.m_i
+    42
+    >>>> # etc.
+ + +

Conclusion

+ +

That covers all there is to know about data member access of C++ classes in +Python through a reflection layer! +A few final notes: RPython does not support metaclasses, and so the +construction of proxy classes (code like +make_datamember +above) happens in Python code instead. +There is an overhead penalty of about 2x over pure RPython code associated +with that, due to extra guards that get inserted by the JIT. +A factor of 2 sounds like a lot, but the overhead is tiny to begin with, and +2x of tiny is still tiny and it's not easy to measure. +The class definition of the custom property, +CPPDataMember, is +in RPython code, to be transparent to the JIT. +The actual offset calculations are in the reflection layer. +Having the proxy class creation in Python, with structural code in RPython, +complicates matters if proxy classes need to be constructed on-demand. +For example, if an instance of an as-of-yet unseen type is returned by a +method. +Explaining how that is solved is a topic of part 2, method calls, so stay +tuned. + +

+

This posting laid out the reasoning behind the object representation of C++ +objects in Python by cppyy for the purpose of data member access. +It explained how the chosen representation of offsets gives rise to a very +pythonic representation, which allows Python introspection tools to work as +expected. +It also explained some of the optimizations done for the benefit of the JIT. +Next up are method calls, which will be described in part 2.

+
+
+
+
+ + Sindwiller wrote on 2012-09-12 13:50: +
+
+

On a related note, do you know when Reflex will discard gccxml? I'm using Boost.Python with Ogre3D (among other things) right now and I'm looking into the pypy option. Gccxml, however, complains about some C++11 related stuff (which is somewhat odd, to the least, as I don't expose any Ogre-internal class or anything like that).

+
+
+
+
+ + Wim Lavrijsen wrote on 2013-02-27 23:28: +
+
+

Reflex itself will be discarded in favor of clang from llvm. That is, however, still experimental, but we're getting there.

+
+
+
+
+ + heemanshu bhalla wrote on 2013-10-03 14:18: +
+
+

Complete explanation of static data members with classes and program go to link :-

https://geeksprogrammings.blogspot.in/2013/09/static-data-members.html

+
+
+
+ +

Multicore Programming in PyPy and CPython

+ +
+

Hi all,

+

This is a short "position paper" kind of post about my view (Armin +Rigo's) on the future of multicore programming in high-level languages. +It is a summary of the +keynote presentation at EuroPython. As I learned by talking with people +afterwards, I am not a good enough speaker to manage to convey a deeper +message in a 20-minute talk. I will try instead to convey it in a +250-line post...

+

This is about three points:

+
    +
  1. We often hear about people wanting a version of Python running without +the Global Interpreter Lock (GIL): a "GIL-less Python". But what we +programmers really need is not just a GIL-less Python --- we need a +higher-level way to write multithreaded programs than using directly +threads and locks. One way is Automatic Mutual Exclusion (AME), which +would give us an "AME Python".
  2. +
  3. A good enough Software Transactional Memory (STM) system can be used +as an internal tool to do that. +This is what we are building into an "AME PyPy".
  4. +
  5. The picture is darker for CPython, though there is a way too. The +problem is that when we say STM, we think about either GCC 4.7's STM +support, or Hardware Transactional Memory (HTM). However, both +solutions are enough for a "GIL-less CPython", but not +for "AME CPython", due to capacity limitations. For the latter, we +need somehow to add some large-scale STM into the compiler.
  6. +
+

Let me explain these points in more detail.

+
+

GIL-less versus AME

+

The first point is in favor of the so-called Automatic Mutual Exclusion +approach. The issue with using threads (in any language with or without +a GIL) is that threads are fundamentally non-deterministic. In other +words, the programs' behaviors are not reproducible at all, and worse, +we cannot even reason about it --- it becomes quickly messy. We would +have to consider all possible combinations of code paths and timings, +and we cannot hope to write tests that cover all combinations. This +fact is often documented as one of the main blockers towards writing +successful multithreaded applications.

+

We need to solve this issue with a higher-level solution. Such +solutions exist theoretically, and Automatic Mutual Exclusion (AME) is +one of them. The idea of AME is that we divide the execution of each +thread into a number of "atomic blocks". Each block is well-delimited +and typically large. Each block runs atomically, as if it acquired a +GIL for its whole duration. The trick is that internally we use +Transactional Memory, which is a technique that lets the system run the +atomic blocks from each thread in parallel, while giving the programmer +the illusion that the blocks have been run in some global serialized +order.

+

This doesn't magically solve all possible issues, but it helps a lot: it +is far easier to reason in terms of a random ordering of large atomic +blocks than in terms of a random ordering of lines of code --- not to +mention the mess that multithreaded C is, where even a random ordering +of instructions is not a sufficient model any more.

+

What do such atomic blocks look like? For example, a program might +contain a loop over all keys of a dictionary, performing some +"mostly-independent" work on each value. This is a typical example: +each atomic block is one iteration through the loop. By using the +technique described here, we can run the iterations in parallel +(e.g. using a thread pool) but using AME to ensure that they appear to +run serially.

+

In Python, we don't care about the order in which the loop iterations +are done, because we are anyway iterating over the keys of a dictionary. +So we get exactly the same effect as before: the iterations still run in +some random order, but --- and that's the important point --- they +appear to run in a +global serialized order. In other words, we introduced parallelism, but +only under the hood: from the programmer's point of view, his program +still appears to run completely serially. Parallelisation as a +theoretically invisible optimization... more about the "theoretically" +in the next paragraph.

+

Note that randomness of order is not fundamental: there are techniques +building on top of AME that can be used to force the order of the +atomic blocks, if needed.

+
+
+

PyPy and STM/AME

+

Talking more precisely about PyPy: the current prototype pypy-stm is +doing precisely this. In pypy-stm, the length of the atomic blocks is +selected in one of two ways: either explicitly or automatically.

+

The automatic selection gives blocks corresponding to some small number +of bytecodes, in which case we have merely a GIL-less Python: multiple +threads will appear to run serially, with the execution randomly +switching from one thread to another at bytecode boundaries, just like +in CPython.

+

The explicit selection is closer to what was described in the previous +section: someone --- the programmer or the author of some library that +the programmer uses --- will explicitly put with thread.atomic: in +the source, which delimits an atomic block. For example, we can use +it to build a library that can be used to iterate over the keys of a +dictionary: instead of iterating over the dictionary directly, we would +use some custom utility which gives the elements "in parallel". It +would give them by using internally a pool of threads, but enclosing +every handling of an element into such a with thread.atomic block.

+

This gives the nice illusion of a global serialized order, and thus +gives us a well-behaving model of the program's behavior.

+

Restating this differently, +the only semantical difference between pypy-stm and +a regular PyPy or CPython is that it has thread.atomic, which is a +context manager that gives the illusion of forcing the GIL to not be +released during the execution of the corresponding block of code. Apart +from this addition, they are apparently identical.

+

Of course they are only semantically identical if we ignore performance: +pypy-stm uses multiple threads and can potentially benefit from that +on multicore machines. The drawback is: when does it benefit, and how +much? The answer to this question is not immediate. The programmer +will usually have to detect and locate places that cause too many +"conflicts" in the Transactional Memory sense. A conflict occurs when +two atomic blocks write to the same location, or when A reads it, +B writes it, but B finishes first and commits. A conflict +causes the execution of one atomic block to be aborted and restarted, +due to another block committing. Although the process is transparent, +if it occurs more than occasionally, then it has a negative impact on +performance.

+

There is no out-of-the-box perfect solution for solving all conflicts. +What we will need is more tools to detect them and deal with them, data +structures that are made aware of the risks of "internal" conflicts when +externally there shouldn't be one, and so on. There is some work ahead.

+

The point here is that from the point of view of the final programmer, +we get conflicts that we should resolve --- but at any point, our +program is correct, even if it may not be yet as efficient as it could +be. This is the opposite of regular multithreading, where programs are +efficient but not as correct as they could be. In other words, as we +all know, we only have resources to do the easy 80% of the work and not +the remaining hard 20%. So in this model we get a program that has 80% +of the theoretical maximum of performance and it's fine. In the regular +multithreading model we would instead only manage to remove 80% of the +bugs, and we are left with obscure rare crashes.

+
+
+

CPython and HTM

+

Couldn't we do the same for CPython? The problem here is that +pypy-stm is implemented as a transformation step during translation, +which is not directly possible in CPython. Here are our options:

+
    +
  • We could review and change the C code everywhere in CPython.
  • +
  • We use GCC 4.7, which supports some form of STM.
  • +
  • We wait until Intel's next generation of CPUs comes out ("Haswell") +and use HTM.
  • +
  • We write our own C code transformation within a compiler (e.g. LLVM).
  • +
+

I will personally file the first solution in the "thanks but no thanks" +category. If anything, it will give us another fork of CPython that +will painfully struggle to keep not more than 3-4 versions behind, and +then eventually die. It is very unlikely to be ever merged into the +CPython trunk, because it would need changes everywhere. Not to +mention that these changes would be very experimental: tomorrow we might +figure out that different changes would have been better, and have to +start from scratch again.

+

Let us turn instead to the next two solutions. Both of these solutions +are geared toward small-scale transactions, but not long-running ones. +For example, I have no clue how to give GCC rules about performing I/O +in a transaction --- this seems not supported at all; and moreover +looking at the STM library that is available so far to be linked with +the compiled program, it assumes short transactions only. By contrast, +when I say "long transaction" I mean transactions that can run for 0.1 +seconds or more. To give you an idea, in 0.1 seconds a PyPy program +allocates and frees on the order of ~50MB of memory.

+

Intel's Hardware Transactional Memory solution is both more flexible and +comes with a stricter limit. In one word, the transaction boundaries +are given by a pair of special CPU instructions that make the CPU enter +or leave "transactional" mode. If the transaction aborts, the CPU +cancels any change, rolls back to the "enter" instruction and causes +this instruction to return an error code instead of re-entering +transactional mode (a bit like a fork()). The software then detects +the error code. Typically, if transactions are rarely cancelled, it is +fine to fall back to a GIL-like solution just to redo these cancelled +transactions.

+

About the implementation: this is done by recording all the changes that +a transaction wants to do to the main memory, and keeping them invisible +to other CPUs. This is "easily" achieved by keeping them inside this +CPU's local cache; rolling back is then just a matter of discarding a +part of this cache without committing it to memory. From this point of +view, there is a lot to bet that we are actually talking about the +regular per-core Level 1 and Level 2 caches --- so any transaction that +cannot fully store its read and written data in the 64+256KB of the L1+L2 +caches will abort.

+

So what does it mean? A Python interpreter overflows the L1 cache of +the CPU very quickly: just creating new Python function frames takes a +lot of memory (on the order of magnitude of 1/100 of the whole L1 +cache). Adding a 256KB L2 cache into the picture helps, particularly +because it is highly associative and thus avoids a lot of fake conflicts. +However, as long as the HTM support is limited to L1+L2 caches, +it is not going to be enough to run an "AME Python" with any sort of +medium-to-long transaction. It can +run a "GIL-less Python", though: just running a few hundred or even +thousand bytecodes at a time should fit in the L1+L2 caches, for most +bytecodes.

+

I would vaguely guess that it will take on the order of 10 years until +CPU cache sizes grow enough for a CPU in HTM mode to actually be able to +run 0.1-second transactions. (Of course in 10 years' time a lot of other +things may occur too, including the whole Transactional Memory model +being displaced by something else.)

+
+
+

Write your own STM for C

+

Let's discuss now the last option: if neither GCC 4.7 nor HTM are +sufficient for an "AME CPython", then we might want to +write our own C compiler patch (as either extra work on GCC 4.7, or an +extra pass to LLVM, for example).

+

We would have to deal with the fact that we get low-level information, +and somehow need to preserve interesting high-level bits through the +compiler up to the point at which our pass runs: for example, whether +the field we read is immutable or not. (This is important because some +common objects are immutable, e.g. PyIntObject. Immutable reads don't +need to be recorded, whereas reads of mutable data must be protected +against other threads modifying them.) We can also have custom code to +handle the reference counters: e.g. not consider it a conflict if +multiple transactions have changed the same reference counter, but just +resolve it automatically at commit time. We are also free to handle I/O +in the way we want.

+

More generally, the advantage of this approach over both the current GCC +4.7 and over HTM is that we control the whole process. While this still +looks like a lot of work, it looks doable. It would be possible to come +up with a minimal patch of CPython that can be accepted into core +without too much trouble (e.g. to mark immutable fields and tweak the +refcounting macros), and keep all the cleverness inside the compiler +extension.

+
+
+

Conclusion

+

I would assume that a programming model specific to PyPy and not +applicable to CPython has little chance of catching on, as long as PyPy is +not the main Python interpreter (which looks unlikely to change anytime +soon). Thus as long as only PyPy has AME, it looks like it will not +become the main model of multicore usage in Python. However, I can +conclude with a more positive note than during the EuroPython +conference: it is a lot of work, but there is a more-or-less reasonable +way forward to have an AME version of CPython too.

+

In the meantime, pypy-stm is around the corner, and together with +tools developed on top of it, it might become really useful and used. I +hope that in the next few years this work will trigger enough motivation +for CPython to follow the ideas.

+
+
+
+
+
+ + JohnLenton wrote on 2012-08-09 12:29: +
+
+

A question: does a “donate towards STM/AME in pypy” also count as a donation towards the CPython work? Getting the hooks in CPython to allow exploration and implementation of this seems at least as important as the pypy work. In fact, I think it’s quite a bit more important.

+
+
+
+
+ + Armin Rigo wrote on 2012-08-09 12:55: +
+
+

@John: I didn't foresee this development at the start of the year, so I don't know. It's a topic that would need to be discussed internally, likely with feedback from past donators.

Right now of course I'm finishing the basics of pypy-stm (working on the JIT now), and from there on there is a lot that can be done as pure Python, like libraries of better-suited data structures --- and generally gaining experience that would anyway be needed for CPython's work.

+
+
+
+
+ + Anonymous wrote on 2012-08-09 15:53: +
+
+

With HTM you don't have to have a one-to-one mapping between your application transactions and the hardware interface. You can also have an STM, that is implemented using HTM. So you may do all the book-keeping yourself in software, but then at commit time use HTM.

+
+
+
+
+ + Nat Tuck wrote on 2012-08-09 16:37: +
+
+

No. We really do want a GIL-free Python. Even if that means we sometimes need to deal with locks.

Right now a high end server can have 64 cores. That means that parallel python code could run faster than serial C code.

STM and other high level abstractions are neat, but they're no substitute for just killing the damn GIL.

+
+
+
+
+ + Anonymous wrote on 2012-08-09 17:32: +
+
+

What does 'just killing the damn GIL' mean without something like STM? Do you consider it acceptable for Python primitives not to be threadsafe?

If you intend to run 64 cores, then what is the exact reason you need threading and can't use multiprocessing?

+
+
+
+
+ + Anonymous wrote on 2012-08-09 19:54: +
+
+

Jesus Christ why don't we all just spend 5 min fiddling with the multiprocessing module and learn how to partition execution and queues like we partition sequences of statements into functions? So sick of GIL articles and the obsession with not learning how to divide up the work and communicate. In some ways the need to recognize narrow channels where relatively small amounts of data are being channeled through relatively intense blocks of execution and create readable, explicit structure around those blocks might actually improve the comprehensibility of some code I've seen. Getting a little tired of seeing so much effort by excellent, essential, dedicated Python devs getting sucked up by users who won't get it.

I think users are driving this speed-for-free obsession way to far. If anything bugs in a magical system are harder to find than understanding explicit structure and explicit structure that's elegant is neither crufty nor slow. Eventually, no interpreter will save a bad programmer. Are we next going to enable the novice "Pythonista" to forego any knowledge of algorithms?

We -need- JIT on production systems to get response times down for template processing without micro-caching out the wazoo. These types of services are already parallel by nature of the servers and usually I/O bound except for the few slow parts. Cython already serves such an excellent roll for both C/C++ API's AND speed AND optimizing existing python code with minimal changes. JIT PyPy playing well with Cython would make Python very generally uber. Users who actually get multiprocessing and can divide up the workflow won't want a slower implementation of any other kind. Getting a somewhat good solution for 'free' is not nearly as appealing as the additional headroom afforded by an incremental user cost (adding some strong typing or patching a function to work with pypy/py3k).

+
+
+
+
+ + Unknown wrote on 2012-08-09 19:59: +
+
+

template processing. lol.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-08-09 21:27: +
+
+

@Anonymous.

I welcome you to work out how to make pypy translation process parallel using any techniques you described.

+
+
+
+
+ + Benjamin wrote on 2012-08-10 07:27: +
+
+

I get the overall goals and desires and I think they are fabulous. However, one notion that seems counterintuitive to me is the desire for large atomic operations.

Aside from the nomenclature (atomic generally means smallest possible), my intuition is that STM would generally operate more efficiently by having fewer roll-backs with small atomic operations and frequent commits. This leads me to assume there is some sort of significant overhead involved with the setup or teardown of the STM 'wrapper'.

From a broader perspective, I get that understanding interlacing is much easier with larger pieces, but larger pieces of code don't lend themselves to wide distribution across many cores like small pieces do.

It seems, to me, that you're focusing heavily on the idea of linearly written code magically functioning in parallel and neglecting the idea of simple, low-cost concurrency, which might have a much bigger short-term impact; and which, through use, may shed light on better frameworks for reducing the complexity inherent in concurrency.

+
+
+
+
+ + Armin Rigo wrote on 2012-08-10 08:57: +
+
+

@Anonymous: "So you may do all the book-keeping yourself in software, but then at commit time use HTM.": I don't see how (or the point), can you be more explicit or post a link?

@Anonymous: I'm not saying that STM is the final solution to all problems. Some classes of problems have other solutions that work well so far and I'm not proposing to change them. Big servers can naturally handle big loads just by having enough processes. What I'm describing instead is a pure language feature that may or may not help in particular cases --- and there are other cases than the one you describe where the situation is very different and multiprocessing doesn't help at all. Also, you have to realise that any argument "we will never need feature X because we can work around it using hack Y" is bound to lose eventually: at least some people in some cases will need the clean feature X because the hack Y is too complicated to learn or use correctly.

@Benjamin: "atomic" actually means "not decomposable", not necessarily "as small as possible". This focus on smallness of transaction IMO is an artefact of last decade's research focus. In my posts I tend to focus on large transaction as a counterpoint: in the use cases I have in mind there is no guarantee that all transactions will be small. Some of them may be, but others not, and this is a restriction. In things like "one iteration through this loop = one transaction", some of these iterations go away and do a lot of stuff.

+
+
+
+
+ + Unknown wrote on 2012-08-10 18:15: +
+
+

Transactional programming is neat. So are Goroutines and functional-style parallelism. On the other hand, I think that C and C++ (or at least C1x and C++11) get one thing completely right: they don't try to enforce any particular threading model. For some problems (like reference counts, as you mention), you really do want a different model. As long as other languages force me to choose a single model, my big projects will stay in C/C++.

+
+
+
+
+ + Benjamin wrote on 2012-08-10 21:17: +
+
+

@Armin I'd love to hear your thoughts (benefits, costs, entrenched ideas, etc.) on large vs small transactions at some point. Though I suspect that would be a post unto itself.

+
+
+
+
+ + Armin Rigo wrote on 2012-08-10 22:04: +
+
+

@Benjamin: a user program might be optimized to reduce its memory usage, for example by carefully reusing objects instead of throwing them away, finding more memory-efficient constructs, and so on. But in many cases in Python you don't care too much. Similarly, I expect that it's possible to reduce the size of transactions by splitting them up carefully, hoping to get some extras in performance. But most importantly I'd like a system where the programmer didn't have to care overmuch about that. It should still work reasonably well for *any* size, just like a reasonable GC should work for any heap size.

If I had to describe the main issue I have against HTM, it is that beyond some transaction size we lose all parallelism because it has to fall back on the GIL.

Well, now that I think about it, it's the same in memory usage: if you grow past the RAM size, the program is suddenly swapping, and performance becomes terrible. But RAM sizes are so far much more generous than maximum hardware transaction sizes.

+
+
+
+
+ + Unknown wrote on 2012-08-12 08:26: +
+
+

There are two key concurrency patterns to keep in mind when considering Armin's STM work:

1. Event-loop based applications that spend a lot of time idling waiting for events.

2. Map-reduce style applications where only the reduce step is particularly prone to resource contention, but the map step is read-heavy (and thus hard to split amongst multiple processes)

For both of those use cases, splitting out multiple processes often won't pay off due to either the serialisation overhead or the additional complexity needed to make serialisation possible at all.

Coarse-grained STM, however, should pay off handsomely in both of those scenarios: if the CPU bound parts of the application are touching different data structures, or are only *reading* any shared data, with any writes being batched for later application, then the STM interaction can be built in to the event loop or parallel execution framework.

Will STM help with threading use cases where multiple threads are simultaneously reading and writing the same data structure? No, it won't. However, such applications don't exploit multiple cores effectively even with free threading, because their *lock* contention will also be high.

As far as "just kill the GIL" goes, I've already written extensively on that topic: https://python-notes.boredomandlaziness.org/en/latest/python3/questions_and_answers.html#but-but-surely-fixing-the-gil-is-more-important-than-fixing-unicode

+
+
+
+
+ + klaussfreire wrote on 2012-08-13 23:35: +
+
+

Option 5, implement STM on the operating system. Linux already has COW for processes, imagine COW-MERGE for threads.

When you start transactional mode, all pages are marked read-only, thread-private and COW. When you commit, dirty pages are merged with the processes' page maps, unless conflicts arise (the process already has dirty pages).

A simple versioning system and version checks would take care of conflict detection.

I just wonder how difficult it would be designing applications that can run on this model (conflicts at page level vs object level).

Thread-private allocation arenas are entirely possible to avoid new objects from creating conflicts all the time, so it would be a matter of making read-only use of objects really read-only, something I've done incrementally in patches already. Reference counts have to be externalized (taken out of PyObject), for instance.

+
+
+
+
+ + Armin Rigo wrote on 2012-08-14 09:12: +
+
+

@klaussfreire: that approach is a cool hack but unlikely to work in practice in a language like Python, because the user doesn't control at all what objects are together with what other objects on the same pages. Even with the reference counts moved out of the way I guess you'd have far too many spurious conflicts.

+
+
+
+
+ + klaussfreire wrote on 2012-08-14 15:43: +
+
+

@Armin, well, Python itself does know.

In my half-formed idea in my head, python would use thread-local versions of the integer pool and the various free lists, and allocation of new objects would be served from an also thread-local arena (while in a transaction).

Read-write access to shared objects, yes, would be a little bit unpredictable. That's why I was wondering how good (if at all) it would work for Python.

+
+
+
+
+ + Wim Lavrijsen wrote on 2012-08-14 20:18: +
+
+

@klaussfreire

is this perhaps what you are looking for: https://plasma.cs.umass.edu/emery/grace

Cheers,
Wim

+
+
+
+
+ + klaussfreire wrote on 2012-08-14 21:50: +
+
+

Damn. And I thought I was being original. I can already spot a few key places where kernel-based support would be superior (not only raw performance, but also transparency), but in general, that's exactly what I was talking about, sans transaction retrials.

+
+
+
+
+ + Mark D. wrote on 2012-08-16 04:23: +
+
+

0.1 second transactions? With hardware transactional memory the general idea is transactions about ten thousand times smaller. A dozen memory modifications maybe.

It would be prohibitively expensive, hardware wise, to implement conflict detection for transactions much larger than that, to say nothing of the occurrence of conflicts requiring rollback and re-execution if such enormously large transactions were executed optimistically.

+
+
+
+
+ + Armin Rigo wrote on 2012-08-19 11:58: +
+
+

@Mark D.: I don't know if "a dozen memory modifications" comes from real work in the field or is just a guess. My own guess would be that Intel Haswell easily supports hundreds of modifications, possibly thousands. Moreover the built-in cache coherency mechanisms should be used here too, in a way that scales with the cache size; this means they should not be "prohibitively expensive".
Of course I know that in 0.1 seconds we do far more than thousands of writes, but I think that nothing strictly limits the progression of future processors in that respect.

The occurrence of conflicts in large transactions depends on two factors. First, "true conflicts", which is the hard problem, but which I think should be relatively deterministic and debuggable with new tools. Second, "false conflicts", which is the HTM/STM mechanism detecting a conflict when there is none. To handle large transactions this should occur with a probability very, very close to 0% for each memory access. In pypy-stm it is 0%, but indeed, with HTM it depends on how close to 0% they can get. I have no data on that.

+
+
+
+
+ + Ole Laursen wrote on 2012-09-06 15:04: +
+
+

I'm a little late, but regarding the simple let's-do-the-loop-concurrently example, if pypy-stm ends up working out as hoped, would it be relatively easy for pypy to do it automatically without having to use parallel loop thing explicitly?

I have a hunch the answer would be yes, but that the hard part is figuring out when it makes sense and how to do the split (each thread needs a good chunk to work on).

On the other hand, GCC has OpenMP which does seem really convenient and also looks like it has (or rather an implementation of that would have to have) solved part of this problem.

Many years ago, I read about research in auto-parallelising compilers and it struck me as a really hard problem. But if you can just do some magic with the loops, perhaps it's an attainable goal?

+
+
+
+
+ + Unknown wrote on 2012-09-06 21:02: +
+
+

I really believe that concurrency - like memory allocation, GC and safe arrays - should be done without the user thinking about it...

Languages like Erlang, ABCL and Concurrent Object Oriented C solve this quite elegantly.

Just make every Object a "process" (thread/greenlet) and every return value a Future and you are done :-)

+
+
+
+
+ + Anonymous wrote on 2015-09-22 07:53: +
+
+

Ammm... Jython 2.7.0 !

All pure Python syntax using threading instantly go MULTI-CORE! All you need to do is replace the 'p' with a 'j' in your command and voila!

;)

+
+
+
+ +

NumPyPy non-progress report

+ +
+
+

Hello everyone.

+

Not much has happened in the past few months with numpypy development. A part +of the reason was doing other stuff for me, a part of the reason was +various unexpected visa-related admin, a part of the reason was EuroPython +and a part was long-awaited holiday.

+

The thing that's maybe worth mentioning is that it does not mean the donations +disappeared in the mist. PyPy developers are being paid to work on NumPyPy on +an hourly basis - that means if I decide to take holidays or work on something +else, the money is simply staying in the account until later.

+

Thanks again for all the donations, I hope to get back to this topic soon!

+

Cheers,
+fijal

+
+
+
+
+
+
+ + Stephen Weber wrote on 2012-08-09 00:37: +
+
+

Thanks for the non-update, I trust you that all is well. Rest helps us work better!

+
+
+
+
+ + Unknown wrote on 2012-08-13 13:25: +
+
+

Please don’t worry too much about the money lost/not-lost. The important part is that you enjoy the programming. For you, because that’s more fun and for us because more fun for the programmer means better code.

+
+
+
+ +

CFFI release 0.2.1

+ +
+

Hi everybody,

+

We released CFFI 0.2.1 (expected to be 1.0 soon). CFFI is a way to call C from Python.

+

EDIT: Win32 was broken in 0.2. Fixed.

+

This release is only for CPython 2.6 or 2.7. PyPy support is coming in
+the ffi-backend branch, but not finished yet. CPython 3.x would be
+easy but requires the help of someone.

+

The package is available on bitbucket as well as documented. You
+can also install it straight from the python package index: pip install cffi

+
    +
  • Contains numerous small changes and support for more C-isms.
  • +
  • The biggest news is the support for installing packages that use
    ffi.verify() on machines without a C compiler. Arguably, this
    +lifts the last serious restriction for people to use CFFI.
  • +
  • Partial list of smaller changes:
      +
    • mappings between 'wchar_t' and Python unicodes
    • +
    • the introduction of ffi.NULL
    • +
    • a possibly clearer API for ffi.new(): e.g. to allocate a single int and obtain a pointer to it, use ffi.new("int *") instead of the old
      ffi.new("int") +
    • +
    • and of course a plethora of smaller bug fixes
    • +
    +
  • +
  • CFFI uses pkg-config to install itself if available. This helps
    +locate libffi on modern Linuxes. Mac OS/X support is available too
    +(see the detailed installation instructions). Win32 should work out
    +of the box. Win64 has not been really tested yet.
  • +
+

Cheers,
+Armin Rigo and Maciej Fijałkowski

+
+

Prototype PHP interpreter using the PyPy toolchain - Hippy VM

+ +
+

Hello everyone.

+

I'm proud to release the result of a Facebook-sponsored study on the feasibility of +using the RPython toolchain to produce a PHP interpreter. The rules were +simple: two months; one person; get as close to PHP as possible, implementing +enough warts and corner cases to be reasonably sure that it answers hard +problems in the PHP language. The outcome is called Hippy VM and implements +most of the PHP 1.0 language (functions, arrays, ints, floats and strings). +This should be considered an alpha release.

+

The resulting interpreter is obviously incomplete – it does not support all +modern PHP constructs (classes are completely unimplemented), builtin functions, +grammar productions, web server integration, builtin libraries +etc., etc.. It's just complete enough for me to reasonably be able to +say that – given some engineering effort – it's possible to provide a rock-solid +and fast PHP VM using PyPy technologies.

+

The result is available in a Bitbucket repo and is released under the MIT +license.

+
+

Performance

+

The table below shows a few benchmarks comparing Hippy VM to Zend (a standard +PHP interpreter available in Linux distributions) and HipHop VM (a PHP-to-C++ +optimizing compiler developed by Facebook). The versions used were Zend 5.3.2 +(Zend Engine v2.3.0) and HipHop VM heads/vm-0-ga4fbb08028493df0f5e44f2bf7c042e859e245ab +(note that you need to check out the vm branch to get the newest version).

+

The run was performed on 64-bit Linux running on a Xeon W3580 with 8M of +L2 cache, which was otherwise unoccupied.

+

Unfortunately, I was not able to run it on the JITted version of HHVM, the new effort by Facebook, +but people involved with the project told me it's usually slower or comparable with the compiled HipHop. +Their JITted VM is still alpha software, so I'll update it as soon as I have the info.

+
+ ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
benchmarkZendHipHop VMHippy VMHippy / ZendHippy / HipHop
arr2.7710.508+-0%0.274+-0%10.1x1.8x
fannkuch21.2397.248+-0%1.377+-0%15.4x5.3x
heapsort1.7390.507+-0%0.192+-0%9.1x2.6x
binary_trees3.2230.641+-0%0.460+-0%7.0x1.4x
cache_get_scb3.3500.614+-0%0.267+-2%12.6x2.3x
fib2.3570.497+-0%0.021+-0%111.6x23.5x
fasta1.4990.233+-4%0.177+-0%8.5x1.3x
+
+

The PyPy compiler toolchain provides a way to implement a dynamic +language interpreter in a high-level language called RPython. This is +a language which is lower-level than Python, but still higher-level than +C or C++: for example, RPython is a garbage-collected language. The killer +feature is that the toolchain will generate a JIT for your interpreter which +will be able to leverage most of the work that has been done on speeding up Python +in the PyPy project. The resulting JIT is generated for your interpreter, and is not Python-specific. +This was one of the toolchain's original design decisions – in contrast to e.g. the JVM, +which was initially only used to interpret Java and later adjusted to serve as a platform for +dynamic languages.

+

Another important difference is that there is no common bytecode to which you compile both your +language and Python, so you don't inherit problems presented when implementing language X on top of, +say, Parrot VM or the JVM. The PyPy toolchain does not impose constraints on the semantics of +your language, whereas the benefits of the JVM only apply to languages that map well onto Java concepts.

+

To read more about creating your own interpreters using the PyPy toolchain, +read more blog posts or an excellent article by Laurence Tratt.

+
+
+

PHP deviations

+

The project's biggest deviation from the PHP specification is probably +that GC is no longer reference counting. That means that the object finalizer, when +implemented, will not be called directly at the moment of object death, but +at some later point. There are possible future developments to alleviate that +problem, by providing "refcounted" objects when leaving the current scope. +Research has to be done in order to achieve that.

+
+
+

Assessment

+

The RPython toolchain seems to be a cost-effective choice for writing +dynamic language VMs. It both provides a fast JIT and gives you +access to low-level primitives when you need them. A good example is +in the directory hippy/rpython which contains the implementation +of an ordered dictionary. An ordered dictionary is not a primitive +that RPython provides – it's not necessary for the goal of +implementing Python. Now, implementing it on top of a normal dictionary +is possible, but inefficient. RPython provides a way to work +directly at a lower level, if you desire to do so.

+

Things that require improvements in RPython:

+
    +
  • Lack of mutable strings on the RPython level ended up being a problem. +I ended up using lists of characters; which are efficient, but inconvenient, +since they don't support any string methods.
  • +
  • Frame handling is too conservative and too Python-specific, especially around +the calls. It's possible to implement less general, but simpler and faster +frame handling implementation in RPython.
  • +
+
+
+

Status of the implementation

+

Don't use it! It's a research prototype intended to assess the feasibility +of using RPython to create dynamic language VMs. The most notable +feature that's missing is reasonable error reporting. That said, I'm +confident it implements enough of the PHP language to prove that the full +implementation will present the same performance characteristics.

+
+
+

Benchmarks

+

The benchmarks are a selection of computer language shootout benchmarks, as well +as cache_get_scb, which is a part of old Facebook code. All benchmarks other +than this one (which is not open source, but definitely the most interesting :( ) are +available in the bench directory. The Python program to run them is called +runner.py and is in the same directory. It runs them 10 times, cutting off the first +3 runs (to ignore the JIT warm-up time) and averaging the rest. As you can see +the standard deviation is fairly minimal for all interpreters and runs; if +it's omitted it means it's below 0.5%.

+

The benchmarks were not selected for their ease of optimization – the optimizations +in the interpreter were written specifically for this set of benchmarks. No special JIT +optimizations were added, and barring what's mentioned below a vanilla PyPy 1.9 checkout +was used for compilation.

+
+
+

So, how fast will my website run if this is completed?

+

The truth is that I lack the benchmarks to be able to answer that right now. The core +of the PHP language is implemented up to the point where I'm confident +that the performance will not change as we get more of the PHP going.

+
+
+

How do I run it?

+

Get a PyPy checkout, apply the diff if you want to squeeze out the last +bits of performance and run pypy-checkout/pypy/bin/rpython targethippy.py to +get an executable that resembles a PHP interpreter. You can also directly run +python targethippy.py file.php, but this will be about 2000x slower.

+
+
+

RPython modifications

+

There was a modification that I did to the PyPy source code; the diff +is available. It's trivial, and should simply be made optional in the +RPython JIT generator, but it was easier just to do it, given the very constrained time +frame.

+
    +
  • +gen_store_back_in_virtualizable was disabled. This feature is +necessary for Python frames but not for PHP frames. PHP frames +do not have to be kept alive after we exit a function.
  • +
+
+
+

Future

+

Hippy is a cool prototype that presents a very interesting path towards a fast +PHP VM. However, at the moment I have too many other open source commitments +to take on the task of completing it in my spare time. I do think that this project +has a lot of potential, but I will not commit to any further development at +this time. If you send pull requests I'll try to review them. I'm also open +to having further development on this project funded, so if you're interested +in this project and the potential of a fast PHP interpreter, please get in +touch.

+
+

Cheers,
+fijal

+

EDIT: Fixed the path to the rpython binary

+
+
+
+
+ + Anonymous wrote on 2012-07-13 23:26: +
+
+

it's cool. Next on the list Javascript to Python/PyPy converter...

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-07-13 23:34: +
+
+

Please read the blog post first. It's *not* a PHP to Python converter. There is also a started JS implementation in https://bitbucket.org/pypy/lang-js, but JS is kind of useless without a browser.

+
+
+
+
+ + Anonymous wrote on 2012-07-14 00:30: +
+
+

JS to pypy would be useful when time comes to running all those node based apps in prod ;)

Also, Java to PyPy would be a cool experiment too - jvm's way too bloated...

+
+
+
+
+ + Christian Heimes wrote on 2012-07-14 01:42: +
+
+

Do I read the numbers correctly? The fibonacci test runs more than 110 times faster in your experimental, 2 months old VM than in the default Zend VM? That's amazing!

It took me a while to figure out the meaning of the numbers. Please add units and explain that small is faster.

Christian

+
+
+
+
+ + Unknown wrote on 2012-07-14 02:27: +
+
+

Nice, Python surprising when

+
+
+
+
+ + Konstantine Rybnikov wrote on 2012-07-14 07:25: +
+
+

Cool. When will your pypy converter convert my c++ programs to python? Can't wait until that happens! Anyway, nice work!

p.s.: sarcasm

+
+
+
+
+ + Benedikt Morbach wrote on 2012-07-14 10:22: +
+
+

Hey there, nice work.

Do you have any numbers or estimates how memory consumption compares?

+
+
+
+
+ + Ole Laursen wrote on 2012-07-14 11:56: +
+
+

I hope you get funding for researching the refcount thing. Being able to predict when something gets whacked is just really convenient and something PyPy Python can benefit from too.

While GC may be more efficient, the unpredictable nature of it does become a problem in production in some cases.

For instance, for a webapp written with Django and CPython, when a request is over I know that the stuff that was allocated is now gone unless I put something in a global data structure. I suspect many applications have similar patterns where you perform a big operation after which it's natural to have a clean up.

+
+
+
+
+ + Inactive Account wrote on 2012-07-15 00:21: +
+
+

Wow, this is wonderful.
You rock.

I surely hope you get funding.

If I didn't live in Brazil, and our currency wasn't so weak, and my income wasn't so low, I would definitely donate some dozens of dollars.

Keep the good work

+
+
+
+
+ + Tom wrote on 2012-07-15 19:02: +
+
+

I would like to see how this compares to the Phalanger project. Which runs PHP in the .NET runtime.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-07-15 19:05: +
+
+

About phalanger: the short answer is that I don't have windows and comparisons on mono would be a bit ingenuine. The longer answer is that I don't expect phalanger to particularly excel compared to Zend.

For example, compare the performance of IronPython and CPython. The same reasons apply as they do towards JVM or Parrot - this is IMO not the right way for dynamic languages.

+
+
+
+
+ + Anonymous wrote on 2012-07-15 20:16: +
+
+

Does the Zend test include APC as well? That's the current standard way to run php scripts...

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-07-15 20:29: +
+
+

Yes, although APC does not change anything in *this* set of benchmarks, precisely because you run everything in-process (within the same interpreter instance even).

+
+
+
+
+ + Reini Urban wrote on 2012-07-16 16:25: +
+
+

Love this effort and esp. the benchmarks! Great work

Referring to your mentioning of JVM and parrot:

You consider as disadvantage to be tied to an existing set of VM opcodes to implement many languages. You were talking about .NET (which had to add Iron-style dynamic reflection later) or the JVM.

parrot already has all the functionality the JVM or .NET was missing and even more (e.g. dynamic types loadable as plugins) and considers it as advantage to share opcodes and bytecode libraries across different languages.

But parrot cannot compete with your speed yet.

+
+
+
+
+ + SM wrote on 2012-07-16 17:36: +
+
+

Very interesting project. It would be nice if you used a recent version of PHP for comparisons - 5.3.2 is over 2 years old and one version behind. Try something like 5.4.4.

+
+
+
+
+ + Reinis I. wrote on 2012-07-18 20:59: +
+
+

> JS is kind of useless without a browser

This would have been more true before Node.js, but now it's false.

+
+
+
+
+ + Arne Babenhauserheide wrote on 2012-07-18 22:18: +
+
+

Wow, 1.5x to 20x faster than a PHP-compiler and 7x to 100x faster than PHP itself… congrats!

+
+
+
+
+ + Anonymous wrote on 2012-07-24 11:01: +
+
+

Offtopic: not trying to sound offensive or pushy, but what happened to numpypy development? I'm regularly checking https://buildbot.pypy.org/numpy-status/latest.html, and it looks like its development is stale for several months.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-07-24 11:06: +
+
+

@Anonymous not much. I'll write a non-progress blog post some time soon.

+
+
+
+
+ + Anonymous wrote on 2012-07-24 11:46: +
+
+

@Fijal
Thank you!

+
+
+
+
+ + Dima Tisnek wrote on 2012-08-08 09:33: +
+
+

Awesome proof of concept!

Can you post memory footprint comparison, please?

And perhaps a quick overview what these test cases cover, arithmetic, function call overhead, dynamic language features?

Thanks for your hard work, without likes of you OSS would never exist!

+
+
+
+
+ + Anonymous wrote on 2013-02-03 15:15: +
+
+

Just in case anyone *is* interested in implementing PHP on the Parrot Virtual Machine, you don't have to tie yourself to the PVM bytecodes.

You can write your PHP compiler entirely in NQP (Not Quite Perl) which in turn produces parrot bytecode for you.

This is important for two reasons:

First, NQP is a mid level language, and is relatively easy to write in, and doesn't require you to know anything at all about the PVM.

Second, although NQP *presently* only targets PVM, there's an in-progress backend which targets the Java Virtual Machine! Early benchmarks suggest that it is already faster than perl5, and there are many optimizations and speedups to come.

Thus, if you were to write a PHP compiler in NQP, you could target either the Parrot Virtual machine, or (in the future) the Java virtual machine.

+
+
+
+
+ + Unknown wrote on 2013-02-03 15:16: +
+
+

Just in case anyone *is* interested in implementing PHP on the Parrot Virtual Machine, you don't have to tie yourself to the PVM bytecodes.

You can write your PHP compiler entirely in NQP (Not Quite Perl) which in turn produces parrot bytecode for you.

This is important for two reasons:

First, NQP is a mid level language, and is relatively easy to write in, and doesn't require you to know anything at all about the PVM.

Second, although NQP *presently* only targets PVM, there's an in-progress backend which targets the Java Virtual Machine! Early benchmarks suggest that it is already faster than perl5, and there are many optimizations and speedups to come.

Thus, if you were to write a PHP compiler in NQP, you could target either the Parrot Virtual machine, or (in the future) the Java virtual machine.

+
+
+
+ +

Py3k status update #5

+ +
+

This is the fifth status update about our work on the py3k branch, which we
+can work on thanks to all of the people who donated to the py3k proposal.

+

Apart from the usual "fix shallow py3k-related bugs" part, most of my work in
+this iteration has been to fix the bootstrap logic of the interpreter, in
+particular to setup the initial sys.path.

+

Until a few weeks ago, the logic to determine sys.path was written entirely
+at app-level in pypy/translator/goal/app_main.py, which is automatically
+included inside the executable during translation. The algorithm is more or
+less like this:

+
    +
  1. find the absolute path of the executable by looking at sys.argv[0]
    +and cycling through all the directories in PATH +
  2. +
  3. starting from there, go up in the directory hierarchy until we find a
    +directory which contains lib-python and lib_pypy +
  4. +
+

This works fine for Python 2 where the paths and filenames are represented as
+8-bit strings, but it is a problem for Python 3 where we want to use unicode
+instead. In particular, whenever we try to decode an 8-bit string into a
+unicode string, PyPy asks the _codecs built-in module to find the suitable
+codec. Then, _codecs tries to import the encodings package, to list
+all the available encodings. encodings is a package of the standard
+library written in pure Python, so it is located inside
lib-python/3.2. But at this point in time we have yet to add
lib-python/3.2 to sys.path, so the import fails. Bootstrap problem!

+

The hard part was to find the problem: since it is an error which happens so
+early, the interpreter is not even able to display a traceback, because it
+cannot yet import traceback.py. The only way to debug it was through some
+carefully placed print statements and the help of gdb. Once the problem was
+found, the solution was as easy as moving part of the logic to RPython,
+where we don't have bootstrap problems.

+

Once the problem was fixed, I was able to finally run all the CPython tests
+against the compiled PyPy. As expected there are lots of failures, and fixing
+them will be the topic of my next months.

+
+
+
+
+ + Anonymous wrote on 2012-07-10 17:10: +
+
+

Would be nice to have a PyPy distribution embeded in OpenOffice 3.4.2

+
+
+
+
+ + haypo wrote on 2012-07-11 10:18: +
+
+

I solved a similar issue in Python 3.2. Python 3 did use the wrong encoding to encode/decode filenames. When I tried to use the filesystem encoding instead, I had an ugly bootstrap issue with encodings implemented in Python (whereas ASCII, latin1 and utf-8 are implemented in C with a fast-path).

The solution is to use C function to encode to/decode from the locale encoding, because the filesystem encoding is the locale encoding. mbstowcs() and wcstombs() are used until the Python codec machinery is ready.

+
+
+
+
+ + Anonymous wrote on 2012-07-13 15:58: +
+
+

Did you try to compare PyPy to Pythran? According to his author, Pythran is on some benchmarks 30x faster than PyPy: https://linuxfr.org/users/serge_ss_paille/journaux/pythran-python-c#comment-1366988

see also the manual here: https://github.com/serge-sans-paille/pythran/blob/master/MANUAL

What do you think of this approach of translating Python to C++ ?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-07-13 17:54: +
+
+

@Anonymous - there is extremely little point in comparing python with whatever-looks-like-python-but-is-not. It's beyond the scope of this blog for sure.

+
+
+
+
+ + Anonymous wrote on 2012-07-13 21:11: +
+
+

To be fair to @Anonymous, the pypy developers commonly compare pypy to C in benchmarks so it's not so unreasonable. The point is only that one should understand that they are different languages, not that all comparisons between languages are pointless.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-07-13 21:19: +
+
+

Oh yes sure. It's as productive to compare pypy to shedskin as it is to compare pypy with g77. It still *is* or might be a valuable comparison, but it is important to keep in mind that those languages are different.

+
+
+
+
+ + Unknown wrote on 2012-08-13 17:30: +
+
+

Any news on the py3k side?

That’s actually what’s most interesting to me on a practical level and it would be nice to know how long it will still take till I can test it :)

+
+
+
+
+ + Antonio Cuni wrote on 2012-08-14 10:06: +
+
+

@arne due to EuroPython and some personal issues not much has happened on the py3k side in the past month.

It is hard to give estimates about when things will be ready, because it depends a lot on how much time I'll be able to dedicate to it. At this point, most of the major features are implemented and I am fixing all the smaller ones which are highlighted by failing CPython tests. However, sometimes a small feature might take much more time to fix than a big one.

+
+
+
+ +

EuroPython sprint

+ +
+

Hi all,

EuroPython is next week. We will actually be giving a presentation on Monday, in one of the plenary talks: PyPy: current status and GIL-less future. This is the first international PyPy keynote we give, as far as I know, but not the first keynote about PyPy [David Beazley's video] :-)

+The other talks are PyPy JIT under the hood and to some extent Performance analysis tools for JITted VMs. This year we are also trying out a help desk. Finally, we will have the usual sprint after EuroPython on Saturday and Sunday.

+See you soon!

+Armin.

+
+
+
+
+ + holger krekel wrote on 2012-06-28 10:35: +
+
+

Don't you consider the David Beazley keynote at Pycon 2012 as a talk about PyPy? (even if not from a core dev)

+
+
+
+
+ + Armin Rigo wrote on 2012-06-28 10:38: +
+
+

That's what the link "the first keynote about PyPy" is about. It's a link to the pypy blog where we talk about David's keynote. I did not find a direct page at us.pycon.org...

+
+
+
+
+ + Armin Rigo wrote on 2012-06-28 10:39: +
+
+

That's what the link "the first keynote about PyPy" is about. It's a link to the pypy blog where we talk about David's keynote. I did not find a direct page at us.pycon.org...

+
+
+
+ +

Architecture of Cppyy

+ +
+

The cppyy module makes it possible to call into C++ from PyPy through the +Reflex package. +Work started about two years ago, with a follow-up sprint a year later. +The module has now reached an acceptable level of maturity and initial +documentation with setup instructions, as well as a list of the currently +supported language features, are now available here. +There is a sizable (non-PyPy) set of unit and application tests that is still +being worked through, not all of them of general applicability, so development +continues its current somewhat random walk towards full language coverage. +However, if you find that cppyy by and large works for you except for certain +specific features, feel free to ask for them to be given higher priority.

+

Cppyy handles bindings differently than what is typically found in other +tools with a similar objective, so this update walks through some of these +differences, and explains why choices were made as they are.

+

The most visible difference, is from the viewpoint of the Python programmer +interacting with the module. +The two canonical ways of making Python part of a larger environment, are to +either embed or extend it. +The latter is done with so-called extension modules, which are explicitly +constructed to be very similar in their presentation to the Python programmer +as normal Python modules. +In cppyy, however, the external C++ world is presented from a single entrance +point, the global C++ namespace (in the form of the variable cppyy.gbl). +Thus, instead of importing a package that contains your C++ classes, usage +looks like this (assuming class MyClass in the global namespace):

+
+>>>> import cppyy
+>>>> m = cppyy.gbl.MyClass()
+>>>> # etc.
+
+

This is more natural than it appears at first: C++ classes and functions are, +once compiled, represented by unique linker symbols, so it makes sense to give +them their own unique place on the Python side as well. +This organization allows pythonizations of C++ classes to propagate from one +code to another, ensures that all normal Python introspection (such as +issubclass and isinstance) works as expected in all cases, and that it +is possible to represent C++ constructs such as typedefs simply by Python +references. +Achieving this unified presentation would clearly require a lot of internal +administration to track all C++ entities if they each lived in their own, +pre-built extension modules. +So instead, cppyy generates the C++ bindings at run-time, which brings us to +the next difference.

+

Then again, that is not really a difference: when writing or generating a +Python extension module, the result is some C code that consists of calls into +Python, which then gets compiled. +However, it is not the bindings themselves that are compiled; it is the code +that creates the bindings that gets compiled. +In other words, any generated or hand-written extension module does exactly +what cppyy does, except that they are much more specific in that the bound +code is hard-wired with e.g. fixed strings and external function calls. +The upshot is that in Python, where all objects are first-class and run-time +constructs, there is no difference whatsoever between bindings generated at +run-time, and bindings generated at ... well, run-time really. +There is a difference in organization, though, which goes back to the first +point of structuring the C++ class proxies in Python: given that a class will +settle in a unique place once bound, instead of inside a module that has no +meaning in the C++ world, it follows that it can also be uniquely located in +the first place. +In other words, cppyy can, and does, make use of a class loader to +auto-load classes on-demand.

+

If, at this point, this all reminds you a bit of ctypes, just with some extra +bells and whistles, you would be quite right. +In fact, internally cppyy makes heavy use of the RPython modules that form the +guts of ctypes. +The difficult part of ctypes, however, is the requirement to annotate +functions and structures. +That is not very pleasant in C, but in C++ there is a whole other level of +complexity in that the C++ standard specifies many low-level details, that are +required for dispatching calls and understanding object layout, as +"implementation defined." +Of course, in the case of Open Source compilers, getting at those details is +doable, but having to reverse engineer closed-source compilers gets old rather +quickly in more ways than one. +More generally, these implementation defined details prevent a clean interface, +i.e. without a further dependency on the compiler, into C++ like the one that +the CFFI module provides for C. +Still, once internal pointers have been followed, offsets have been calculated, +this objects have been provided, etc., etc., the final dispatch into binary +C++ is no different than that into C, and cppyy will therefore be able to make +use of CFFI internally, like it does with ctypes today. +This is especially relevant in the CLang/LLVM world, where stub functions +are done away with. +To get the required low-level details then, cppyy relies on a back-end, rather +than getting it from the programmer, and this is where Reflex (together with +the relevant C++ compiler) comes in, largely automating this tedious process.

+

There is nothing special about Reflex per se, other than that it is relatively +lightweight, available, and has proven to be able to handle huge code bases. +It was a known quantity when work on cppyy started, and given the number +of moving parts in learning PyPy, that was a welcome relief. +Reflex is based on gccxml, and can therefore handle pretty much any C or +C++ code that you care to throw at it. +It is also technically speaking obsolete as it will not support C++11, since +gccxml won't, but its expected replacement, based on CLang/LLVM, is not +quite there yet (we are looking at Q3 of this year). +In cppyy, access to Reflex, or any back-end for that matter, is through a +thin C API (see the schematic below): cppyy asks high level questions to the +back-end, and receives low-level results, some of which are in the form of +opaque handles. +This ensures that cppyy is not tied to any specific back-end. +In fact, currently it already supports another, CINT, but that back-end is +of little interest outside of High Energy Physics (HEP). +The Python side is always the same, however, so any Python code based on cppyy +does not have to change if the back-end changes. +To use the system, a back-end specific tool (genreflex for Reflex) is +first run on a set of header files with a selection file for choosing the +required classes. +This produces a C++ file that must be compiled into a shared library, and a +corresponding map file for the class loader. +These shared libraries, with their map files alongside, can be put anywhere +as long as they can be located through the standard paths for the dynamic +loader. +With that in place, the setup is ready, and the C++ classes are available to +be used from cppyy.

+ +

So far, nothing that has been described is specific to PyPy. +In fact, most of the technologies described have been used for a long time +on CPython already, so why the need for a new, PyPy-specific, module? +To get to that, it is important to first understand how a call is mediated +between Python and C++. +In Python, there is the concept of a PyObject, which has a reference count, a +pointer to a type object, and some payload. +There are APIs to extract the low-level information from the payload for use +in the C++ call, and to repackage any results from the call. +This marshalling is where the bulk of the time is spent when dispatching. +To be absolutely precise, most C++ extension module generators produce slow +dispatches because they don't handle overloads efficiently, but even in there, +they still spend most of their time in the marshalling code, albeit in calls +that fail before trying the next overload. +In PyPy, speed is gained by having the JIT unbox objects into the payload only, +allowing it to become part of compiled traces. +If the same marshalling APIs were used, the JIT is forced to rebox the payload, +hand it over through the API, only to have it unboxed again by the binding. +Doing so is dreadfully inefficient. +The objective of cppyy, then, is to keep all code transparent to the JIT until +the absolute last possible moment, i.e. the call into C++ itself, therefore +allowing it to (more or less) directly pass the payload it already has, with +an absolute minimal amount of extra work. +In the extreme case when the binding is not to a call, but to a data member of +an object (or to a global variable), the memory address is delivered to the +JIT and this results in direct access with no overhead. +Note the interplay: cppyy in PyPy does not work like a binding in the CPython +sense that is a back-and-forth between the interpreter and the extension. +Instead, it does its work by being transparent to the JIT, allowing the JIT to +dissolve the binding. 
+And with that, we have made a full circle: if to work well with the JIT, and +in so doing achieve the best performance, you can not have marshalling or do +any other API-based driving, then the concept of compiled extension modules is +out, and the better solution is in run-time generated bindings.

+

That leaves one final point. +What if you do want to present an extension module-like interface to +programmers that use your code? +But of course, this is Python: everything consists of first-class objects, +whose behavior can be changed on the fly. +In CPython, you might hesitate to make such changes, as every overlay or +indirection results in quite a bit of overhead. +With PyPy, however, these layers are all optimized out of existence, making +that a non-issue.

+

This posting laid out the reasoning behind the organization of cppyy. +A follow-up is planned, to explain how C++ objects are handled and +represented internally.

+

Wim Lavrijsen

+
+
+
+
+ + Fernando Perez wrote on 2012-06-25 21:00: +
+
+

Thanks for this excellent post; any chance you'll make it to Scipy'2012 in Austin? I still remember your talk at one of the very old Scipys at Caltech as one of the best we've had; it would be great to catch up on the implications of your continued work on this front since. With the recent progress on cython and numpy/numba, fresh ideas on the C++ front are a great complement.

+
+
+
+
+ + Sebastien Binet wrote on 2012-06-26 09:28: +
+
+

Wim,

I know you are quite attached to details so I was surprised by:

"""
Reflex is based on gccxml, and can therefore handle pretty much any C or C++ code that you care to throw at it
"""

but that's not true: gccxml being an interesting and useful hack of the C++ frontend of GCC, it can only correctly parse the subset of C which is valid C++.

here are a few links:
https://stackoverflow.com/questions/1201593/c-subset-of-c-where-not-examples

https://en.wikipedia.org/wiki/Compatibility_of_C_and_C%2B%2B

I discovered it the hard way...

+
+
+
+
+ + Anonymous wrote on 2012-06-26 09:45: +
+
+

@Sebastien, GCC-XML must be able to parse the entirety of C, since it has to support "extern C" blocks, mustn't it?

+
+
+
+
+ + Sebastien Binet wrote on 2012-06-26 12:30: +
+
+

"extern C" is "just" modifying the symbol mangling mechanism of the identifiers inside the extern-C block.

just try this example from the link I posted earlier:
https://stackoverflow.com/questions/1201593/c-subset-of-c-where-not-examples

"""
struct A { struct B { int a; } b; int c; };
struct B b; // ill-formed: b has incomplete type (*not* A::B)
"""

even if you create a foo.h like so:

"""
#ifdef __cplusplus
extern "C" {
#endif

struct A { struct B { int a; } b; int c; };
struct B b;
#ifdef __cplusplus
}
#endif
"""

and compile some main.c/cxx (which just includes that header) with gcc/g++, you'll get:

"""
$ gcc main.c
$ echo $?
0

$ g++ main.cxx
In file included from main.cxx:2:0:
foo.h:7:12: error: aggregate ‘B b’ has incomplete type and cannot be defined
zsh: exit 1 g++ main.cxx
"""

gccxml is using the C++ parser, thus my first remark :}

+
+
+
+
+ + Sebastien Binet wrote on 2012-06-26 12:54: +
+
+

Also, as we are in the nitpicking and parsing department, any C++ keyword which isn't a C one, can be correctly used in a C file, making that file landing in the valid-C-which-isnt-in-the-C++-subset-of-C
(e.g.: class,new,this to name a few of the most popular types or identifiers one can find in C codebases)

+
+
+
+
+ + Wim Lavrijsen wrote on 2012-06-26 17:59: +
+
+

@Fernando: no, no travel for me anytime soon. If Py4Science is still going, though, I can always walk down the hill, of course. :)

I've seen Numba (Stefan brought it up on the pypy-dev list), but it appears to be focused on C. With LLVM, we are using the AST directly. I don't think you can drive C++ through llvm-py.

@Sebastien: the "details" that you are missing are in that "pretty much any" is not the same as "all." Worse, Reflex has a whole toolchain of gccxml, genreflex, C++ compiler, and finally the Reflex API. You lose information at every step along the way. It's one more reason for CLang/LLVM, but as said, that's for Q3/2012.

Note though that there are two kinds of C headers that one may encounter. Those that are in a pure C environment, and those for mixed C/C++ use (e.g. Python.h and the system headers). In the former case, no-one would drag in the dependency on a C++ compiler, just to use Reflex. Using e.g. CFFI is a much better option. In the other case, there is no problem either way.

Cheers,
Wim

+
+
+
+
+ + Anonymous wrote on 2012-06-27 11:46: +
+
+

On a similar note, what's the state of embedding PyPy into C++ (or does cppyy make that case fully obsolete?)?

+
+
+
+
+ + Wim Lavrijsen wrote on 2012-06-27 18:16: +
+
+

@anonymous: there was a recent thread on pypy-dev, showing a successful embedding: https://mail.python.org/pipermail/pypy-dev/2012-March/009661.html

If done through C++, you can use the Python C-API (through cpyext), but AFAIK, that doesn't play nicely with threads yet.

Cheers,
Wim

+
+
+
+
+ + Matthias wrote on 2012-06-28 16:50: +
+
+

From my past experience wrapping a C++ library to python is a whole lot more than just being able to call functions and having objects.

For example using a binding generator like SWIG you need to annotate your source, because the source alone does not have sufficient information to generate proper bindings (at least no bindings that feel python-like).

So I am wondering how Cppyy behaves in this area.

E.g. how does this play with templates? I will probably still need to define up-front which instantiations I need to be available in python?

How does it deal with object ownership? E.g. what happens if the C++ code decides to delete an object that python still points to? Or how are shared pointers dealt with?

How is type mapping handled? E.g. you might want to call functions taking MyString with "standard" python strings instead of having to construct MyString() objects first and then passing those.

+
+
+
+
+ + Wim Lavrijsen wrote on 2012-06-28 18:36: +
+
+

@Matthias: there are several follow-up posts planned to explain everything in detail, so just a few quick answers now.

Pythonizations are handled automatically based on signature, otherwise by allowing user defined pythonization functions.

Template instantiations are still needed in the Reflex world, but with CLang/LLVM, those can be generated by the backend (CINT can perform the instantiations automatically as well).

Object ownership can be handled heuristically if the C++ side behaves (this is e.g. the case for most of ROOT). If that's not the case, extra annotations per function or per object are needed. In addition, communication with the memory regulator (a tracker of all proxies on the python side) through a callback on both sides is possible.

Type mappings happen through custom converters that are to be coded up in either Python or C++. Standard mappings (e.g. the use of std::string in the way that you describe for MyString) have been added by default. Type mappings can also be done based on signature in some cases.

Not everything of the above is implemented in cppyy yet, but all have been solved before in PyROOT on CPython. It's just a matter of time to implement things for cppyy. The important point, however, is that none of this needs a separate language: most of it can be handled automatically, with a little work of the programmer in python proper or, worst case, with a C++ helper.

Cheers,
Wim

+
+
+
+
+ + Anonymous wrote on 2013-09-20 06:58: +
+
+

Hmm is anyone else experiencing problems with the pictures on this blog loading?
I'm trying to find out if its a problem on my end or if it's the
blog. Any feed-back would be greatly appreciated.

my site ... Splendyr REview - https://livingwaychristianfriendshipgroup.com/members/starcormi/activity/932712/ -

+
+
+
+ +

Release 0.1 of CFFI

+ +
+
+

Hi.

+

We're pleased to announce the first public release, 0.1 of CFFI, a way to call C from Python.
+(This release does not support PyPy yet --- but we announce it here as it is planned for the
+next release :-)

+

The package is available on bitbucket as well as documented. You can also install it
+straight from the python package index (pip).

+

The aim of this project is to provide a convenient and reliable way of calling C code from Python.
+The interface is based on LuaJIT's FFI and follows a few principles:

+
    +
  • The goal is to call C code from Python. You should be able to do so
    +without learning a 3rd language: every alternative requires you to learn
    +their own language (Cython, SWIG) or API (ctypes). So we tried to
    +assume that you know Python and C and minimize the extra bits of API that
    +you need to learn.
  • +
  • Keep all the Python-related logic in Python so that you don't need to
    +write much C code (unlike CPython native C extensions).
  • +
  • Work either at the level of the ABI (Application Binary Interface)
    +or the API (Application Programming Interface). Usually, C
    +libraries have a specified C API but often not an ABI (e.g. they may
    +document a "struct" as having at least these fields, but maybe more).
    +(ctypes works at the ABI level, whereas Cython or native C extensions
    +work at the API level.)
  • +
  • We try to be complete. For now some C99 constructs are not supported,
    +but all C89 should be, including macros (and including macro "abuses",
    +which you can manually wrap in saner-looking C functions).
  • +
  • We attempt to support both PyPy and CPython (although PyPy support is not
    +complete yet) with a reasonable path for other Python implementations like
    +IronPython and Jython.
  • +
  • Note that this project is not about embedding executable C code in
    +Python, unlike Weave. This is about calling existing C libraries
    +from Python.
  • +
+
+

Status of the project

+

Consider this as a beta release. Creating CPython extensions is fully supported and the API should
+be relatively stable; however, minor adjustments of the API are possible.

+

PyPy support is not yet done and this is a goal for the next release. There are vague plans to make this the
+preferred way to call C from Python that can reliably work between PyPy and CPython.

+

Right now CFFI's verify() requires a C compiler and header files to be available at run-time.
+This limitation will be lifted in the near future and it'll contain a way to cache the resulting binary.

+

Cheers,

+Armin Rigo and Maciej Fijałkowski

+
+
+
+
+
+
+ + intgr wrote on 2012-06-19 00:28: +
+
+

Will the CFFI be any JIT-friendlier than PyPy's ctypes?

+
+
+
+
+ + Anonymous wrote on 2012-06-19 16:46: +
+
+

What's the difference between CFFI and CPyExt?

+
+
+
+
+ + RonnyPfannschmidt wrote on 2012-06-19 18:04: +
+
+

@intgr yes

@anon cffi is a FFI, cpyext is a api emulation they are completely different things

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-25.html b/blog/index-25.html new file mode 100644 index 000000000..2b4759374 --- /dev/null +++ b/blog/index-25.html @@ -0,0 +1,1363 @@ + + + + + + +PyPy (old posts, page 25) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

PyPy related internship at NCAR

+ +
+
+

Hello everyone

+

I would like to advertise a PyPy-related summer internship at +the National Center for Atmospheric Research, which is located in lovely +Boulder, Colorado. As for the last year, the mentor will be Davide del Vento, +with my possible support on the PyPy side.

+

The full details of the application are to be found on +the internship description and make sure you read the requirements +first. Important requirements:

+
    +
  • Must currently be enrolled in a United States university.
  • +
  • Only students authorized to work for any employer in the United +States will be considered for the SIParCS program.
  • +
  • Must be a graduate or undergraduate who has completed their sophomore year.
  • +
+

If you happen to fulfill the requirements, to me this sounds like +a great opportunity to spend a summer at NCAR in Boulder hacking on atmospheric +models using PyPy.

+

Cheers, +fijal

+
+
+
+
+
+
+ + Anonymous wrote on 2012-12-07 23:35: +
+
+

You can post it on https://jobs.pythonweekly.com/

+
+
+
+ +

Py3k status update #8

+ +
+

This is the eighth status update about our work on the py3k branch, which
+we can work on thanks to all of the people who donated to the py3k
+proposal
.

+

Just a short update on November's work: we're now passing about 194 of
+approximately 355 modules of CPython's regression test suite, up from passing
+160 last month. Many test modules only fail a small number of individual tests
+now.

+

We'd like to thank Amaury Forgeot d'Arc for his contributions, in particular he
+has made significant progress on updating CPyExt for Python 3 this month.

+

Some other highlights:

+
    +
  • +test_marshal now passes, and there's been significant progress on
    +pickling (thanks Kenny Levinsen and Amaury for implementing
    int.{to,from}_bytes)
  • +
  • We now have a _posixsubprocess module
  • +
  • More encoding-related fixes, which affect many failing tests
  • +
  • +_sre was updated and now test_re almost passes
  • +
  • Exception behavior is almost complete per the Python 3 specs, what's mostly
    +missing now are the new __context__ and __traceback__ attributes (PEP
    +3134
    )
  • +
  • Fixed some crashes and deadlocks occurring during the regression tests
  • +
  • We merged the unicode-strategies branch both to default and to py3k: now we
    +have versions of lists, dictionaries and sets specialized for unicode
    +elements, as we already had for strings.
  • +
  • However, string-specialized containers are still faster in some cases
    +because there are shortcuts which have not been implemented for unicode yet
    +(e.g., constructing a set of strings from a list of strings). The plan is to
    +completely kill the shortcuts and improve the JIT to produce the fast
    +version automatically for both the string and unicode versions, to have a
    +more maintainable codebase without sacrificing the speed. The autoreds
    +branch (already merged) was a first step in this direction.
  • +
+

cheers,
+Philip&Antonio

+
+
+
+
+ + Anonymous wrote on 2012-12-05 22:14: +
+
+

Well done. PyPy is one the most interesting projects out there today.
Keep up the amazing work guys!

J.

+
+
+
+
+ + Anonymous wrote on 2013-01-03 14:56: +
+
+

thank you for your work!!

+
+
+
+ +

PyPy San Francisco Sprint Dec 1st - Dec 2nd 2012

+ +
+

The next PyPy sprint will be in San Francisco, California. It is a
+public sprint, suitable for newcomers. It will run on Saturday December 1st and
+Sunday December 2nd. The goals for the sprint are continued work towards the
+2.0 release as well as code cleanup, we of course welcome any topic which
+contributors are interested in working on.

+

Some other possible topics are:

+
    +
  • running your software on PyPy
  • +
  • work on PyPy's numpy (status)
  • +
  • work on STM (status)
  • +
  • JIT improvements
  • +
  • any exciting stuff you can think of
  • +
+

If there are newcomers, we'll run the usual introduction to hacking on
+PyPy.

+

Location

+

The sprint will be held at the Rackspace Office:

+

620 Folsom St, Ste 100
+San Francisco

+

The doors will open at 10AM both days, and run until 6PM both days.

+

Thanks to David Reid for helping get everything set up!

+
+
+
+
+ + Mike Pavone wrote on 2012-11-29 22:49: +
+
+

Hi, I'm interested in getting involved with PyPy development and would love to attend the sprint to get started, but I'm not sure I can make it both days. Would it be okay to just participate Sunday or would that not make sense?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-11-29 23:06: +
+
+

absolutely

+
+
+
+
+ + Jean-Paul Calderone wrote on 2012-11-30 19:16: +
+
+

Awww jeez, you guys couldn't wait a couple more weeks? Have fun. If anyone's still in the bay area after Dec 10th give a holler.

+
+
+
+
+ + Anonymous wrote on 2012-12-03 01:17: +
+
+

It would have helped a lot if this sprint was announced more in advance. I just missed it because I didn't bother to check the PyPy blog last week.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-12-03 16:35: +
+
+

I'm sorry, but we didn't know more in advance.

+
+
+
+
+ + Anonymous wrote on 2012-12-25 20:48: +
+
+

STM update looks interesting and promising!

+
+
+
+ +

PyPy 2.0 beta 1

+ +
+
+

We're pleased to announce the 2.0 beta 1 release of PyPy. This release is +not a typical beta, in a sense the stability is the same or better than 1.9 +and can be used in production. It does however include a few performance +regressions documented below that don't allow us to label it as 2.0 final. +(It also contains many performance improvements.)

+

The main features of this release are support for ARM processors and +compatibility with CFFI. It also includes +numerous improvements to the numpy in pypy effort, cpyext and performance.

+

You can download the PyPy 2.0 beta 1 release here:

+
+https://pypy.org/download.html +
+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.3. It's fast (pypy 2.0 beta 1 and cpython 2.7.3 +performance comparison) due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. It also supports ARM machines running Linux. +Windows 64 work is still stalling, we would welcome a volunteer +to handle that.

+
+
+

How to use PyPy?

+

We suggest using PyPy from a virtualenv. Once you have a virtualenv +installed, you can follow instructions from pypy documentation on how +to proceed. This document also covers other installation schemes.

+
+
+

Regressions

+

Reasons why this is not PyPy 2.0:

+
    +
  • the ctypes fast path is now slower than it used to be. In PyPy +1.9 ctypes was either incredibly faster or slower than CPython depending whether +you hit the fast path or not. Right now it's usually simply slower. We're +probably going to rewrite ctypes using cffi, which will make it +universally faster.
  • +
  • +cffi (an alternative to interfacing with C code) is very fast, but +it is missing one optimization that will make it as fast as a native +call from C.
  • +
  • +numpypy lazy computation was disabled for the sake of simplicity. +We should reenable this for the final 2.0 release.
  • +
+
+
+

Highlights

+
    +
  • +cffi is officially supported by PyPy. You can install it normally by +using pip install cffi once you have installed PyPy and pip. +The corresponding 0.4 version of cffi has been released.
  • +
  • ARM is now an officially supported processor architecture. +PyPy now works on soft-float ARM/Linux builds. Currently ARM processors +supporting the ARMv7 and later ISA that include a floating-point unit are +supported.
  • +
  • This release contains the latest Python standard library 2.7.3 and is fully +compatible with Python 2.7.3.
  • +
  • It does not however contain hash randomization, since the solution present +in CPython is not solving the problem anyway. The reason can be +found on the CPython issue tracker.
  • +
  • +gc.get_referrers() is now faster.
  • +
  • Various numpy improvements. The list includes:
      +
    • axis argument support in many places
    • +
    • full support for fancy indexing
    • +
    • +complex128 and complex64 dtypes
    • +
    +
  • +
  • +JIT hooks are now a powerful tool to introspect the JITting process that +PyPy performs.
  • +
  • +**kwds usage is much faster in the typical scenario
  • +
  • operations on long objects are now as fast as in CPython (from +roughly 2x slower)
  • +
  • We now have special strategies for dict/set/list which contain +unicode strings, which means that now such collections will be both faster +and more compact.
  • +
+
+
+

Things we're working on

+

There are a few things that did not make it to the 2.0 beta 1, which +are being actively worked on. Greenlets support in the JIT is one +that we would like to have before 2.0 final. Two important items that +will not make it to 2.0, but are being actively worked on, are:

+
    +
  • Faster JIT warmup time.
  • +
  • Software Transactional Memory.
  • +
+

Cheers,
+Maciej Fijalkowski, Armin Rigo and the PyPy team

+
+
+
+
+
+
+
+ + Anonymous wrote on 2012-11-22 16:51: +
+
+

Good job! 2 things:
1) the link to the .tar.bz for Linux 64 (libc 2.13) links to a corrupted file (bz2 claims it is corrupted, and its MD5 hash doesn't match the one on the page)

2) the link to the benchmark on this page: https://speed.pypy.org/comparison/?exe=1%2B785,2%2B472&ben=1,34,27,2,25,3,46,4,5,41,42,22,44,6,39,7,8,45,23,24,9,10,11,12,13,40,14,15,35,36,37,38,16,28,30,32,29,33,17,18,19,20,43&env=1,2&hor=true&bas=2%2B472&chart=normal+bars

is empty -- no charts were plotted. (I've turned off all my adblocking).

+
+
+
+
+ + Anonymous wrote on 2012-11-22 16:52: +
+
+

Oops, the chart appears now -- it took a long time to load.

+
+
+
+
+ + Unknown wrote on 2012-11-22 17:46: +
+
+

The OSX binary segfaults on a Lion 64bit. I tried both 2.0-beta1 and a nightly build. Notice, 1.9 works perfectly.

+
+
+
+
+ + Unknown wrote on 2012-11-23 05:51: +
+
+

I would be more than happy to give it a shot if there was solid PostgreSQL support - otherwise it is a no-go for me.

+
+
+
+
+ + Anonymous wrote on 2012-11-23 19:34: +
+
+

Issue 1257 still not fixed (memory leak when using web.py framework).

+
+
+
+
+ + Anonymous wrote on 2012-11-23 19:35: +
+
+

For PostgreSQL it works with psycopg2ct.

+
+
+
+
+ + Gabriel wrote on 2012-11-30 09:26: +
+
+

Just announced on the IRC channel: psycopg2cffi. They ported it for speed, but from my CFFI experience, I think the biggest advantage is maintainability.

+
+
+
+
+ + Anonymous wrote on 2012-12-04 19:44: +
+
+

I think I should give a try to this.
Goona give a shot
---------------
www.insecuregeek.blogspot.com

+
+
+
+
+ + Unknown wrote on 2013-01-07 15:07: +
+
+

If we can get greenlet support in the JIT that'd be fantastic - my non-blocking driver for MongoDB, Motor, will need it before it's usable with PyPy. Thanks for the amazing work!

+
+
+
+ +

Py3k status update #7

+ +
+

This is the seventh status update about our work on the py3k branch, which
+we can work on thanks to all of the people who donated to the py3k
+proposal
.

+

The biggest news is that this month Philip started to work on py3k in parallel
+to Antonio. As such, there was an increased amount of activity.

+

The py3k buildbots now fully translate the branch every night and run the
+Python standard library tests.

+

We currently pass 160 out of approximately 355 modules of CPython's standard
+test suite, fail 144 and skip approximately 51.

+

Some highlights:

+
    +
  • dictviews (the objects returned by dict.keys/values/items) have been greatly
    +improved, and now they fully support set operators
  • +
  • a lot of tests have been fixed wrt complex numbers (and in particular the
    __complex__ method)
  • +
  • _csv has been fixed and now it correctly handles unicode instead of bytes
  • +
  • more parser fixes, py3k list comprehension semantics; now you can no longer
    +access the list comprehension variable after it finishes
  • +
  • 2to3'd most of the lib_pypy modules (pypy's custom standard lib
    +replacements/additions)
  • +
  • py3-enabled pyrepl: this means that finally readline works at the command
    +prompt, as well as builtins.input(). pdb seems to work, as well as
    fancycompleter to get colorful TAB completions :-)
  • +
  • py3 round
  • +
  • further tightening/cleanup of the unicode handling (more usage of
    +surrogateescape, surrogatepass among other things)
  • +
  • as well as keeping up with some big changes happening on the default branch
    +and of course various other fixes.
  • +
+

Finally, we would like to thank Amaury Forgeot d'Arc for his significant
+contributions.

+

cheers,
+Philip&Antonio

+
+
+
+
+ + Unknown wrote on 2012-11-03 20:23: +
+
+

Very cool!

Thank you for your work!

+
+
+
+
+ + Anonymous wrote on 2012-11-04 05:32: +
+
+

Great work!

+
+
+
+
+ + Anonymous wrote on 2012-11-06 05:22: +
+
+

thanks for sharing.

+
+
+
+
+ + Unknown wrote on 2012-11-08 21:26: +
+
+

How do I compile/translate it for testing the py3k branch?

How much optimization is already possible?

+
+
+
+
+ + Antonio Cuni wrote on 2012-11-13 08:59: +
+
+

@arne: you can just use the usual translate.py command inside the py3k branch.
Or download one of the nightly builds:
https://buildbot.pypy.org/nightly/py3k/

however, note that:
- JIT is not enabled (yet)
- no focus has been put on performances (yet :)) so it is probably slower than even the non-jitted python2

+
+
+
+
+ + Anonymous wrote on 2012-11-22 07:14: +
+
+

when will pypy-2.0 be available ?

+
+
+
+
+ + Anonymous wrote on 2012-11-22 07:15: +
+
+

when will pypy-2.0 be avaliable ?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-11-22 08:12: +
+
+

2.0 beta 1 - today. 2.0 final - no date yet.

+
+
+
+
+ + Anonymous wrote on 2012-11-22 10:19: +
+
+

looking forward to see the release note of pypy-2.0 b1

+
+
+
+
+ + Unknown wrote on 2012-11-22 12:17: +
+
+

@antonio: for me the translate with goal pypy (interpreter) did not work, so I asked.

I’ll try again. Thanks!

+
+
+
+
+ + Antonio Cuni wrote on 2012-11-22 15:05: +
+
+

@arne: it's surely possible that translation is broken at some revision, it's all work in progress :). If you go to the nightly build page, you can see which for which revision translation did work

+
+
+
+ +

NumPy status update #5

+ +
+
+ +

Hello.

+

I'm quite excited to inform that work on NumPy in PyPy has been restarted +and there has been quite a bit of progress on the NumPy front in PyPy in the +past two months. Things that happened:

+
    +
  • +complex dtype support - thanks to matti picus, NumPy on PyPy now supports +complex dtype (only complex128 so far, there is work on the other part)
  • +
  • +big refactoring - probably the biggest issue we did was finishing +a big refactoring that disabled some speedups (notably lazy computation +of arrays), but lowered the barrier of implementing cool new features.
  • +
  • +fancy indexing support - all fancy indexing tricks should now work, +including a[b] where b is an array of integers.
  • +
  • +newaxis support - now you can use newaxis features
  • +
  • improvements to ``intp``, ``uintp``, ``void``, ``string`` and record dtypes
  • +
+

Features that have active branches, but haven't been merged:

+
    +
  • float16 dtype support
  • +
  • +missing ndarray attributes - this is a branch to finish all attributes +on ndarray, hence ending one chapter.
  • +
  • +pickling support for numarray - hasn't started yet, but next on the list
  • +
+

More importantly, we're getting very close to being able to import the python part +of the original numpy with only import modifications and running its tests. +Most tests will fail at this point, however it'll be a good start for another +chapter :-)

+

Cheers,
+fijal

+
+
+
+
+
+
+ + Dmitrey wrote on 2012-11-01 17:11: +
+
+

Hi,
are sort() and argsort(), preferably with axis parameter, in nearest future plans?

Regards, Dmitrey.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-11-01 17:13: +
+
+

Hi Dmitrey.

argsort (with axis) is already implemented on a branch, sort coming later (it's further in the alphabet, I'm at g now ;-)

+
+
+
+
+ + Anonymous wrote on 2012-11-01 17:14: +
+
+

hey, cool progress!
numpypy.complex64(complex(4., 3.)) works for me on nightlies, FWIW

+
+
+
+ +

Cape Town 2012 sprint report

+ +
+
+

Hello.

+

We're about to finish a PyPy sprint in Cape Town, South Africa that was +one of the smallest done so far, only having Armin Rigo and Maciej Fijalkowski +with Alex Gaynor joining briefly at the beginning, however also one of the +longest, lasting almost 3 weeks. The sprint theme seems to be predominantly +"no new features" and "spring cleaning". We overall removed about 20k lines +of code in the PyPy source tree. The breakdown of things done and worked on:

+
    +
  • +

    We killed SomeObject support in annotation and rtyper. This is a modest +code saving, however, it reduces the complexity of RPython and also, +hopefully, improves compile errors from RPython. We're far from done +on the path to have comprehensible compile-time errors, but the first +step is always the hardest :)

    +
  • +
  • +

    We killed some magic in specifying the interface between builtin functions +and Python code. It used to be possible to write builtin functions like this:

    +
    +def f(space, w_x='xyz'):
    +
    +

    which will magically wrap 'xyz' into a W_StringObject. Right now, instead, +you have to write:

    +
    +@unwrap_spec(w_x=WrappedDefault('xyz'))
    +def f(space, w_x):
    +
    +

    which is more verbose, but less magical.

    +
  • +
  • +

    We killed the CExtModuleBuilder which is the last remaining part of +the infamous extension compiler that could in theory build C extensions +for CPython in RPython. This was never working very well and the main +part was killed long ago.

    +
  • +
  • +

    We killed various code duplications in the C backend.

    +
  • +
  • +

    We killed microbench and a bunch of other small-to-medium unused +directories.

    +
  • +
  • +

    We killed llgraph JIT backend and rewrote it from scratch. Now the llgraph +backend is not translatable, but this feature was rarely used and caused +a great deal of complexity.

    +
  • +
  • +

    We progressed on continulet-jit-3 branch, up to the point of merging +it into result-in-resops branch, which also has seen a bit of progress.

    +

    Purpose of those two branches:

    +
      +
    • +continulet-jit-3: enable stackless to interact with the JIT by killing +global state while resuming from the JIT into the interpreter. This has +multiple benefits. For example it's one of the stones on the path to +enable STM for PyPy. It also opens new possibilities for other optimizations +including Python-Python calls and generators.
    • +
    • +result-in-resops: the main goal is to speed up the tracing time of PyPy. +We found out the majority of time is spent in the optimizer chain, +which faces an almost complete rewrite. It also simplifies the storage +of the operations as well as the number of implicit invariants that have +to be kept in mind while developing.
    • +
    +
  • +
  • +

    We finished and merged the excellent work by Ronan Lamy which makes the +flow object space (used for abstract interpretation during RPython +compilation) independent from the Python interpreter. This means +we've achieved an important milestone on the path of separating the RPython +translation toolchain from the PyPy Python interpreter.

    +
  • +
+

Cheers,
+fijal & armin

+
+
+
+

Py3k status update #6

+ +
+

This is the sixth status update about our work on the py3k branch, which we
+can work on thanks to all of the people who donated to the py3k proposal.

+

The coolest news is not about what we did in the past weeks, but what we will
+do in the next: I am pleased to announce that Philip Jenvey has been
+selected by the PyPy community to be funded for his upcoming work on py3k,
+thanks to your generous donations. He will start to work on it shortly, and he
+will surely help the branch to make faster progress. I am also particularly
+happy of this because Philip is the first non-core developer who is getting
+paid with donations: he demonstrated over the past months to be able to work
+effectively on PyPy, and so we were happy to approve his application for the
+job. This means that anyone can potentially be selected in the future, the
+only strict requirement is to have a deep interest in working on PyPy and to
+prove to be able to do so by contributing to the project.

+

Back to the status of the branch. Most of the work since the last status
+update has been done in the area of, guess what? Unicode strings. As usual,
+this is one of the most important changes between Python 2 and Python 3, so
+it's not surprising. The biggest news is that now PyPy internally supports
+unicode identifiers (such as names of variables, functions, attributes, etc.),
+whereas earlier it supported only ASCII bytes strings. The change is still
+barely visible from the outside, because the parser still rejects non-ASCII
+identifiers, however you can see it with a bit of creativity:

+
>>>> def foo(x): pass
+>>>> foo(**{'àèìòù': 42})
+Traceback (most recent call last):
+  File "<console>", line 1, in <module>
+TypeError: foo() got an unexpected keyword argument 'àèìòù'
+
+

Before the latest changes, you used to get question marks instead of the
+proper name for the keyword argument. Although this might seem like a small
+detail, it is a big step towards a proper working Python 3 interpreter and it
+required a couple of days of headaches. A spin-off of this work is that now
+RPython has better built-in support for unicode (also in the default branch):
+for example, it now supports unicode string formatting (using the percent
+operator) and the methods .encode/.decode('utf-8').

+

Other than that there is the usual list of smaller issues and bugs that got
+fixed, including (but not limited to):

+
    +
  • teach the compiler when to emit the new opcode DELETE_DEREF (and
    +implement it!)
  • +
  • detect when we use spaces and TABs inconsistently in the source code, as
    +CPython does
  • +
  • fix yet another bug related to the new lexically scoped exceptions (this
    +is the last one, hopefully)
  • +
  • port some of the changes that we did to the standard CPython 2.7 tests to
    +3.2, to mark those which are implementation details and should not be run on
    +PyPy
  • +
+

Finally, I would like to thank Amaury Forgeot d'Arc and Ariel Ben-Yehuda for
+their work on the branch; among other things, Amaury recently worked on
cpyext and on the PyPy _cffi_backend, while Ariel submitted a patch to
+implement PEP 3138.

+
+
+
+
+ + Ernst Sjöstrand wrote on 2012-09-26 10:48: +
+
+

Following your work, great to see progress!

+
+
+
+
+ + Anonymous wrote on 2012-10-05 16:43: +
+
+

Python 3.3 has some absolutely crucial fixes (finally! to Unicode). I'd go as far as to say that Python 3.3 is the first Pyton version of all that is truly suitable for the full range of internationalized apps. So I wonder a bit about the set target for the PyPy3 work being 3.2. Any chance it can be 3.2 with the 3.3 Unicode implementation?

+
+
+
+
+ + Antonio Cuni wrote on 2012-10-05 17:35: +
+
+

we chose to target 3.2 because at the time 3.3 was a moving target. Now we could indeed decide to retarget 3.3, but I'm not sure it's a good idea. There is still a lot of work to be done for 3.2, and adding more features would only shift the end to later.

+
+
+
+
+ + Anonymous wrote on 2012-10-07 19:41: +
+
+

I know this might be an odd request but... Has the pypy team ever considered inquiring Mozilla about embedding pypy into their browser? I say this because Google is embedding a vm from chromium to support dart. I do not think this is ideal for an open web. Pypy, on the other hand, would be ideal as an open web vm! Think about it!

+
+
+
+
+ + rental mobil jakarta wrote on 2012-10-08 15:15: +
+
+

Nice article, thanks for the information.

+
+
+
+
+ + Arne Babenhauserheide wrote on 2012-10-17 09:08: +
+
+

That sounds great!

Thank you for your work - and for keeping us up to date!

+
+
+
+
+ + Unknown wrote on 2012-10-17 09:11: +
+
+

I think main change in 3.3 is that they allow u'' as syntax for indicating a string (just inactive syntax for easing the porting of python2 code: '' is exactly equal to u'').

+
+
+
+ +

PyPy Cape Town Sprint Oct 7th - Oct 21st 2012

+ +
+
+

Hello everyone!

+

The next PyPy sprint will be in Cape Town, South Africa. It is a +public sprint, suitable for newcomers. It starts a couple of days +after PyCon South Africa, which is on the 4th and 5th of October. +This is a relatively unusual sprint in that it is hosted halfway +across the world from where most contributors live, so we plan to +spend some time during those two weeks doing sprinting and some time +doing touristy stuff. The goals for the sprint are general progress +and whatever people are interested in.

+

Possible topics:

+
    +
  • PyPy release 2.0
  • +
  • running your software on PyPy
  • +
  • work on PyPy's numpy (status)
  • +
  • work on STM (status)
  • +
  • JIT improvements
  • +
  • any exciting stuff you can think of
  • +
+

If there are newcomers, we'll run the usual introduction to hacking on +PyPy.

+
+

Location

+

The sprint will be held either in the apartment of fijal, which is in +Tamboerskloof, Cape Town, or in the offices of the Praekelt +Foundation, located in Woodstock, Cape Town. The Praekelt Foundation +has offered to host us, if needed.

+

Cape Town, as a very touristy place, has tons of accommodation ranging +in quality from good to amazing. Depending on the sprint location you +might need a car.

+
+
+

Good to Know

+

You probably don't need a visa for South Africa -- consult Wikipedia. +South Africa is a lovely place with lots of stuff to do. You can see +penguins, elephants, lions and sharks all on one day (or better yet, +on multiple days).

+

There is a wide selection of good restaurants within a reasonable +distance of the sprint venue (depending on the venue, either walking +or driving).

+

The power plug is some weird derivative of an old-english standard, +but adapters are easily acquired.

+
+
+

Who's Coming?

+

If you'd like to come, please let us know when you will be arriving +and leaving, as well as what your interests are. We'll keep a list of +people which we'll update (or you can do so yourself if you have +bitbucket pypy commit rights).

+
+

Cheers,
+fijal +

+
+
+
+
+
+
+ + Anonymous wrote on 2012-09-07 11:16: +
+
+

Why pypy is three times slower than python2.6 + psyco2 ??

# text parser:
# python2.7 - 0.94s
# python2.7 + cython - 0.73s
# pypy1.9 - 0.68s
# python2.5 + psyco1.6 - 0.31s
# python2.6 + psyco2 - 0.23s

"python2.6 + psyco2" is 3.3 times faster than pypy1.9, why ??

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-09-07 13:48: +
+
+

Obviously if you don't provide a benchmark we're completely clueless.

+
+
+
+
+ + Anonymous wrote on 2012-09-09 13:31: +
+
+

I found that "cStringIO" is extremely slow in pypy1.9 (almost three times slower than python2.7), I'm using a lot of cStringIO in my text parser. here is my benchmark:

import time, cStringIO

def test1():
text = '1234567890' * 1024 * 256
sio = cStringIO.StringIO()
ts = time.time()
for ch in text: sio.write(ch)
print 'ts', time.time() - ts

try:
import psyco
psyco.full()
except:
pass


test1()
test1()
test1()

# python2.7 0.45s
# psyco2 0.26s
# pypy-1.9 1.30s

+
+
+
+
+ + Arne Babenhauserheide wrote on 2012-09-12 15:29: +
+
+

You could try using StringIO instead of cStringIO. pypy can optimize that much better.

Here’s an adapted example:

------ ------ ------

import time, StringIO, cStringIO

def csio():
text = '1234567890' * 1024 * 256
sio = cStringIO.StringIO()
ts = time.time()
for ch in text: sio.write(ch)
print 'ts', time.time() - ts

def nsio():
text = '1234567890' * 1024 * 256
sio = StringIO.StringIO()
ts = time.time()
for ch in text: sio.write(ch)
print 'ts', time.time() - ts


print "cStringIO"
csio()
csio()
csio()

print "StringIO"
nsio()
nsio()
nsio()

------ ------ ------

Results for me with pypy 1.9:

$ python stringiotest.py
cStringIO
ts 0.636300086975
ts 0.63633108139
ts 0.636710882187
StringIO
ts 3.35502791405
ts 3.34557986259
ts 3.33949017525
$ bin/pypy stringiotest.py
cStringIO
ts 1.05391597748
ts 0.528824090958
ts 0.530929803848
StringIO
ts 0.359623908997
ts 0.277186870575
ts 0.273662090302

+
+
+
+
+ + Anonymous wrote on 2012-09-13 13:25: +
+
+

thanks, it works with StringIO.

+
+
+
+
+ + Unknown wrote on 2012-09-13 13:26: +
+
+

Increase the amount of iterations for even higher speedups:

text = '1234567890' * 1024 * 256 * 16





$ bin/pypy stringiotest.py
cStringIO
ts 224.367353201
ts 140.621050835
ts 140.672322035
StringIO
ts 5.80670285225
ts 4.95937395096
ts 4.82084798813

$ python stringiotest.py
cStringIO
ts 9.54650998116
ts 9.60773801804
ts 9.56916093826
StringIO
ts 47.1465728283
ts 47.145359993
ts 47.1618230343


Interestingly pypy with StringIO is twice as fast as python with cStringIO. But pypy with cStringIO is slow.

So pypy with StringIO might still require 2x as much time as python2.6+psyco2.

But remember that this compares pure python code on pypy with hand-optimized C-code+psyco.

+
+
+
+
+ + Unknown wrote on 2012-09-13 13:29: +
+
+

Glad to help :)

The cool part here is that pypy allows us to replace many C-modules with nicely readable python-code and still get a fast program.

And that your custom code gets the same speedups.

+
+
+
+
+ + Anonymous wrote on 2012-09-13 13:32: +
+
+

in order to import StringIO as cStringIO. how to confirm my script is running pypy? not python ?

+
+
+
+
+ + how to climb wrote on 2012-09-13 16:12: +
+
+

thanks for the post dear. nice blog.

+
+
+
+
+ + Unknown wrote on 2012-09-14 10:04: +
+
+

you could just import sys:

import sys
ispypy = hasattr(sys, "pypy_version_info")

+
+
+
+ +

NumPy on PyPy status update

+ +
+
+

Hello everyone.

+

It's been a while since we posted a numpy work update, but I'm pleased to +inform you that work on it has been restarted. A lot of the work has been +done by Matti Picus, who is one of the newest contributors to the PyPy +project. None of the work below has been merged so far, it's work in progress:

+
    +
  • Complex dtype support.
  • +
  • Fixing incompatibilities between numpy and pypy's version.
  • +
  • Refactoring numpypy to simplify the code and make it easier for new +contributors.
  • +
  • Reuse most of the numpy's pure python code without modifications.
  • +
+

Finishing this is also the plan for the next month.

+

Cheers,
+fijal

+
+
+
+
+
+
+ + Anonymous wrote on 2012-09-05 20:59: +
+
+

Exciting stuff!

It would be great to see a write-up of what, if anything, still remains to be done after this merge to have full compatibility with numpy.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2012-09-05 21:31: +
+
+

Once we have a better idea about the numpy's test status we'll post it. That would be probably on the next month's update report.

+
+
+
+
+ + Unknown wrote on 2012-09-07 15:03: +
+
+

Great to hear that!

I’m anxious to see numpy on pypy bear fruit, so I can test it with some model of a model I experiment with.

+
+
+
+
+ + Raul Durand wrote on 2012-12-04 12:31: +
+
+

Pypy and numpypy are just great!
I will be able to move some projects completely to pypy after Linalg implementation.
In the meanwhile I just noticed that vectorized operations as dot product in numpypy are not yet as fast as in numpy.

+
+
+
+
+ + Raul Durand wrote on 2012-12-04 12:32: +
+
+

Pypy and numpypy are just great!
I will be able to move some projects completely to pypy after Linalg implementation.
In the meanwhile I just noticed that vectorized operations as dot product in numpypy are not yet as fast as in numpy.

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-26.html b/blog/index-26.html new file mode 100644 index 000000000..6c72d4e73 --- /dev/null +++ b/blog/index-26.html @@ -0,0 +1,1583 @@ + + + + + + +PyPy (old posts, page 26) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

So, you want to try PyPy

+ +
+
+

Hello.

+

During the PyCon trip multiple people asked me how exactly they could run +their stuff on PyPy to get the speedups. Now, in an ideal world, +you would just swap CPython with PyPy, everything would run tons of times +faster and everyone would live happily ever after. However, we don't live in +an ideal world and PyPy does not speed up everything you could +potentially run. Chances are that you can run your stuff quite a bit faster, but +it requires quite a bit more R&D than just that. This blog post is an attempt to +explain certain steps that might help. So here we go:

+
    +
  • Download and install PyPy. 2.0 beta 1 or upcoming 2.0 beta 2 would be a good +candidate; it's not called a beta for stability reasons.
  • +
  • Run your tests on PyPy. There is absolutely no need for fast software that +does not work. There might be some failures. Usually they're harmless (e.g. +you forgot to close the file); either fix them or at least inspect them. In +short, make sure stuff works.
  • +
  • Inspect your stack. In particular, C extensions, while sometimes working, are +a potential source of instability and slowness. Fortunately, +since the introduction of cffi, the ecosystem of PyPy-compatible software +has been growing. Things I know are written with PyPy in mind:
      +
    • the new version of pyOpenSSL will support PyPy via cffi
    • +
    • +psycopg2cffi is the most actively maintained postgres binding for PyPy, +with pg8000 reported working
    • +
    • mysql has a ctypes based implementation (although a cffi-based one would +be definitely better)
    • +
    • PyPy 2.0 beta 2 will come with sqlite-using-cffi
    • +
    • lxml-cffi
    • +
    • +uWSGI, while working, is almost certainly not the best choice. Try +tornado, twisted.web, cyclone.io, gunicorn or gevent +(note: gevent support for PyPy is not quite finished; will write about it +in a separate blog post, but you can't just use the main branch of gevent)
    • +
    • consult (and contribute to) pypy compatibility wiki for details (note +that it's community maintained, might be out of date)
    • +
    +
  • +
+
    +
  • Have benchmarks. If you don't have benchmarks, then performance does not +matter for you. Since PyPy's warm-up time is bad (and yes, we know, we're +working on it), you should leave ample time for warm-ups. Five to ten seconds +of continuous computation should be enough.
  • +
  • Try them. If you get lucky, the next step might be to deploy and be happy. +If you're unlucky, profile and try to isolate bottlenecks. They might be in +a specific library or they might be in your code. The better you can isolate +them, the higher your chances of understanding what's going on.
  • +
  • Don't take it for granted. PyPy's JIT is very good, but there is a variety +of reasons that it might not work how you expect it to. A lot of times it +starts off slow, but a little optimization can improve the speed as much as +10x. Since PyPy's runtime is less mature than CPython's, there are higher +chances of finding an obscure corner of the standard library that might be +atrociously slow.
  • +
  • Most importantly, if you run out of options and you have a reproducible +example, please report it. A pypy-dev email, popping into #pypy +on irc.freenode.net, or getting hold of me on twitter are good ways. +You can also contact me directly at fijall at gmail.com. While +it's cool if the example is slow, a lot of problems only show up on large +and convoluted examples. As long as I can reproduce it on my machine or I can +log in somewhere, I am usually happy to help.
  • +
  • I typically use a combination of jitviewer, valgrind and +lsprofcalltree to try to guess what's going on. These tools are all +useful, but use them with care. They usually require quite a bit of +understanding before being useful. Also sometimes they're just plain useless +and you need to write your own analysis.
  • +
+

I hope this summary of steps to take is useful. We hear a lot of stories +of people trying PyPy, most of them positive, but some of them negative. +If you just post "PyPy didn't work for me" on your blog, that's +cool too, but you're missing an opportunity. The reasons may vary from +something serious like "this is a bad pattern for PyPy GC" to something +completely hilarious like "oh, I left this sys._getframe() somewhere +in my hot loops for debugging" or "I used the logging module which uses +sys._getframe() all over the place".

+

Cheers,
+fijal

+
+
+
+
+
+
+ + Unknown wrote on 2013-03-28 09:45: +
+
+

waiting for gevent's support

+
+
+
+
+ + Anonymous wrote on 2013-03-28 13:39: +
+
+

Just curious, why is uwsgi not the best choice?

+
+
+
+
+ + Unknown wrote on 2013-03-28 21:28: +
+
+

I'm also curious what are the issues with uWSGI.

+
+
+
+
+ + Unknown wrote on 2013-03-28 22:12: +
+
+

As the main uWSGI author i can only confirm the post. Embedding pypy in c applications (not the inverse) is still hacky, and afaik uWSGi is the only project trying to do it. So albeit the combo works, it is only a proof of concept that require still lot of effort (both from pypy and uWSGI) to be production-ready.

+
+
+
+
+ + Jacob Stoner wrote on 2013-03-28 23:06: +
+
+

looking forward to the post on gevent with pypy

+
+
+
+
+ + Josell wrote on 2013-03-29 05:04: +
+
+

Ruby or nothing. Sorry.

+
+
+
+
+ + Anonymous wrote on 2013-04-02 13:05: +
+
+

thanks for share...

+
+
+
+
+ + Anonymous wrote on 2013-04-02 14:46: +
+
+

will there maybe be an asm.js backend for pypy? :) that would be kind of nice. finally python in the browser.

to me it seems like asm.js will be more successful than google's native client since it is much simpler to implement and since it is a subset of javascript it already works everywhere, just slower.

+
+
+
+ +

Numpy status update and developer announcement

+ +
+
+ +

Hello, some good news!

+

First the update:

+
    +
  • +dtype support - NumPy on PyPy now supports non-native storage formats. +Due to a lack of true support for longdoubles in rpython, we decided to back +out the support of longdouble-as-double which was misleading.
  • +
  • +missing ndarray attributes - work has been made toward supporting the +complete set of attributes +on ndarrays. We are progressing alphabetically, and have made it to d. +Unsupported attributes, and unsupported arguments to attribute calls +will raise a NotImplementedError.
  • +
  • +pickling support for numarray - hasn't started yet, but next on the list
  • +
  • There has been some work on exposing FFI routines in numpypy.
  • +
  • Brian Kearns has made progress in improving the numpypy namespace. +The python numpypy submodules now more closely resemble their numpy +counterparts. Also, translated _numpypy submodules are now more properly +mapped to the numpy core c-based submodules, furthering the goal of being +able to install numpy as a pure-python module with few modifications.
  • +
+

And now the good news:

+

While our funding drive over 2012 did not reach our goal, we still managed to +raise a fair amount of money in donations. So far we only managed to spend around $10 000 of it. +We issued a call for additional developers, and are glad to welcome Romain Guillebert and Ronan Lamy +to the numpypy team. Hopefully we will be able to report on speedier progress soon.

+

Cheers,
+Matti Picus, Maciej Fijalkowski

+
+
+
+
+
+
+ + cournape wrote on 2013-03-19 08:46: +
+
+

Regarding long double, that's clearly something you should not waste your time on. I think the way it was implemented in numpy is not good, and I generally advise against it (the only real use I can see is if you need to interoperate with binary formats that use it, but even there, the complete platform specificity of it is a killer).

+
+
+
+
+ + Power Cords wrote on 2013-03-20 06:15: +
+
+

Joining of two additional developers is a good sign for Numpy and so we hope that they will now focus on speedier progress soon.

+
+
+
+ +

Py3k status update #10

+ +
+

This is the tenth status update about our work on the py3k branch, which we
+can work on thanks to all of the people who donated to the py3k proposal.

+

There's been significant progress since the last update: the linux x86-32
+buildbot
now passes 289 out of approximately 354 modules (with 39 skips) of
+CPython's regression test suite.

+

That means there are only 26 test module failures left! The list of major items
+remaining for 3.2 compatibility is now short enough to list here, with their
+related tests:

+
    +
  • Tokenizer support for non-ascii identifiers
  • +
+
    +
  • test_importlib
  • +
  • test_pep263
  • +
+ +
    +
  • test_memoryview
  • +
+
    +
  • multiprocessing module currently deadlocks
  • +
+
    +
  • test_multiprocessing
  • +
+
    +
  • Buggy handling of the new extended unpacking syntax by the compiler:
  • +
+
    +
  • test_unpack_ex
  • +
+
    +
  • The new Global Interpreter Lock and new thread signal handling
  • +
+
    +
  • test_threading
  • +
  • test_threadsignals
  • +
  • test_sys
  • +
+
    +
  • Upgrade unicodedata to 6.0.0 (requires updates to the actual unicodedata
    +generation script)
  • +
+
    +
  • test_ucn
  • +
  • test_unicode
  • +
  • test_unicodedata
  • +
+ +
    +
  • test_capi (currently crashes)
  • +
+
    +
  • Update int's hash code to match CPython's (float's is already updated on the
    py3k-newhash branch. note that PyPy 2.x doesn't even totally match
    +CPython's hashing)
  • +
+
    +
  • test_decimal
  • +
  • test_fractions
  • +
  • test_numeric_tower
  • +
+
    +
  • Miscellaneous:
  • +
+
    +
  • test_complex
  • +
  • test_float
  • +
  • test_peepholer
  • +
  • test_range
  • +
  • test_sqlite (a new cffi based version seems to be coming)
  • +
  • test_ssl
  • +
  • test_struct
  • +
  • test_subprocess
  • +
  • test_sys_settrace
  • +
  • test_time
  • +
+

Additionally there are still a number of failures in PyPy's internal test
+suite. These tests are usually run against untranslated versions of PyPy during
+development. However we've now begun running them against a fully translated
+version of PyPy on the buildbot too (thanks to Amaury for setting this
+up). This further ensures that our tests and implementation are sane.

+

We're getting closer to producing an initial alpha release. Before that happens
+we'd like to see:

+
    +
  • further test fixes
  • +
  • the results of test runs on other major platforms (e.g. linux x86-64 and osx
    +seem to have some additional failures as of now)
  • +
  • some basic real world testing
  • +
+

Finally I'd like to thank Manuel Jacob for his various contributions over the
+past month, including fixing the array and ctypes modules among other things,
+and also Amaury Forgeot d'Arc for his ongoing excellent contributions.

+

cheers,
+Phil

+
+
+
+
+ + Ernst Sjöstrand wrote on 2013-03-05 20:47: +
+
+

A chart with failing tests over time would be cool. Or, just work on fixing those tests! :-)

+
+
+
+
+ + René Dudfield wrote on 2013-03-06 10:54: +
+
+

Congrats!

+
+
+
+
+ + Arne Babenhauserheide wrote on 2013-03-07 10:59: +
+
+

That’s really, really, REALLY COOL!

+
+
+
+
+ + Power Cords wrote on 2013-03-12 13:57: +
+
+

Cool. How many errors have been fixed in current update? Is there any log available?

+
+
+
+ +

10 years of PyPy

+ +
+
+ +

+

From a software engineering perspective, 10 years is indistinguishable +from infinity, so I don't care what happens 10 years from now -- as +long as you don't blame me. :-)

- Guido van Rossum, Python creator. +

10 years is indeed a long time. PyPy was created approximately 10 years ago, +with the exact date being lost in the annals of the version control system. +We've come a long way during those 10 years, from a "minimal Python" that +was supposed to serve mostly as an educational tool, through to a vehicle for +academic research to a high performance VM for Python and beyond.

+

Some facts from the PyPy timeline:

+
    +
  • In 2007, at the end of the EU funding period, we promised the JIT was just around the corner. +It turned out we misjudged it pretty badly -- the first usable PyPy was released in 2010.
  • +
  • At some point we decided to have a JavaScript backend so one could compile RPython programs +to JavaScript and run them in a browser. Turned out it was a horrible idea.
  • +
  • Another option we tried was using RPython to write CPython C extensions. Again, it turned out RPython +is a bad language and instead we made a fast JIT, so you don't have to write C extensions.
  • +
  • We made N attempts to use LLVM. Seriously, N is 4 or 5. But we haven't fully given up yet :-) +They all run into issues one way or another.
  • +
  • We were huge fans of ctypes at the beginning. Up to the point where we tried to make +a restricted subset with static types, called rctypes for RPython. Turned out to be horrible. +Twice.
  • +
  • We were very hopeful about creating a JIT generator from the beginning. But the first one failed miserably, +generating too much assembler. The second failed too. The third first burned down and then failed. +However, we managed to release a working JIT in 2010, against all odds.
  • +
  • Martijn Faassen used to ask us "how fast is PyPy" so we decided to name an option enabling all +optimizations "--faassen". Then "--no-faassen" was naturally added too. Later we +decided to grow up and renamed it to "-O2", and now "-Ojit".
  • +
  • The first time the Python interpreter successfully compiled to C, it segfaulted because the code generator used signed chars instead of unsigned chars...
  • +
  • To make it more likely to be accepted, the proposal for the EU project contained basically every feature under the sun a language could have. This proved to be annoying, because we had to actually implement all that stuff. Then we had to do a cleanup sprint where we deleted 30% of codebase and 70% of features.
  • +
  • At one sprint someone proposed a new software development methodology: 'Terminology-Driven Programming' means to pick a fancy name, then discuss what it could mean, then implement it. Examples: timeshifter, rainbow interpreter, meta-space bubble, hint annotations (all but one of these really existed).
  • +
  • There is a conspiracy theory that the reason why translation is so slow is because time is stored away during it, which is later retrieved when an actual program runs to make them appear faster
  • +
+

Overall, it was a really long road. However, 10 years later we are in +good shape. A quick look at the immediate future: we are approaching +PyPy 2.0 with stackless+JIT and cffi support, +the support for Python 3 is taking shape, non-standard +extensions like STM are slowly getting ready (more soon), and there are +several non-Python interpreters around the corner (Hippy, Topaz and more).

+

Cheers,
+fijal, arigo, hodgestar, cfbolz and the entire pypy team.

+ + +
+
+
+
+
+
+ + Anonymous wrote on 2013-02-28 22:43: +
+
+

My best wishes to whole PyPy team! And thanks for all the hard work!

+
+
+
+
+ + Anonymous wrote on 2013-02-28 23:01: +
+
+

You guys rock!

+
+
+
+
+ + Anonymous wrote on 2013-02-28 23:04: +
+
+

Best blog posting - ever! Heres to another 10 pypy years and N llvm endeavours. -- rxe

+
+
+
+
+ + Anonymous wrote on 2013-02-28 23:33: +
+
+

You've made a great work so far, please continue with it!!

+
+
+
+
+ + Vanessa wrote on 2013-03-01 00:37: +
+
+

Only those who dare to fail greatly can ever achieve greatly. --RFK
Congrats, guys!

+
+
+
+
+ + Anonymous wrote on 2013-03-01 01:45: +
+
+

Congratulations and thank you for the great work, looking forward to the next 10 years!

+
+
+
+
+ + dmatos wrote on 2013-03-01 02:16: +
+
+

Great work!

+
+
+
+
+ + Anonymous wrote on 2013-03-01 06:20: +
+
+

How will PyPy impact Python future and it's adoption as preferred language?

+
+
+
+
+ + Anonymous wrote on 2013-03-01 08:23: +
+
+

indeed: congratulations and much respect for the perseverance and hard work you have put into this project over the years!

+
+
+
+
+ + Gaëtan de Menten wrote on 2013-03-01 08:42: +
+
+

First, congratulations for keeping at it for 10 years! PyPy is one of the most interesting project I know of.

This blog post is also very interesting but by reading it I can't help but think: are all those "failures" documented somewhere in one place? It could be a very interesting read.

Or more specifically:
* Why was the JavaScript backend a horrible idea?
* Why is RPython a bad language (for writing CPython extensions)?
* What went wrong in the different attempts at using LLVM?
* What were those "70% of features" that were dropped after the EU project?

+
+
+
+
+ + glyph wrote on 2013-03-01 09:16: +
+
+

Congratulations! Here's to another 10 years!

And the JavaScript backend was a great idea - bring it back! It's certainly better than the other Python-to-JS translators out there, at least in terms of actually parsing some Python. I want Python in my browser!

+
+
+
+
+ + kayhayen wrote on 2013-03-01 11:29: +
+
+

I was and always will be impressed by PyPy. And the self-critic of this post only furthers it. You are cool people, looking forward to meet you again.

+
+
+
+
+ + Anonymous wrote on 2013-03-01 12:12: +
+
+

I remember 10 years ago, when I decided to learn to program... I didn't know what language to choose, and someone suggested python. It was someone I approached through a mailing list, and he was passionate explaining why python is so special.

I remember reading about it being cool but with a "performance problem". However, there were some nerds out there talking about a minimal python, that would eventually become a fast python, so I said "cool, perhaps in a few months there will be a fast python...".

I spent ten years following silently this story, and I'm happy to say "Happy birthday Pypy!".

I've never met any of you, but I feel I know you.
You showed me the value of perseverance, that every failure is one step closer to success.

Congratulations and a big THANK YOU!
Luis Gonzalez, from Buenos Aires.

+
+
+
+
+ + Paul Jaros wrote on 2013-03-01 14:12: +
+
+

PyPy is my favorite open-source project. Best of wishes for the future development.
May you find all the funding you need, become the leading STM Implementation and become the defacto Python standard.

+
+
+
+
+ + Stefane Fermigier wrote on 2013-03-01 14:34: +
+
+

+1 on Gaëtan de Menten's comment.

+
+
+
+
+ + Daniel wrote on 2013-03-01 22:06: +
+
+

One more +1 on Gaëtan de Menten's comment. :)

+
+
+
+
+ + Anonymous wrote on 2013-03-02 01:06: +
+
+

You are incredible people and you do such cool stuff! Best of luck to you and keep up the great work!

+
+
+
+
+ + Arne Babenhauserheide wrote on 2013-03-02 11:03: +
+
+

Thank you for the great post - and thank you for sticking to it and finding ways to get time to make it work - including to add everything under the sun into that EU project to be able to go full-time!

You’re a great example how to really do stuff right - by actually doing it and keeping at it through every stumbling block on the way.

Happy birthday - and thank you for pypy!

+
+
+
+
+ + Jan Brohl wrote on 2013-03-03 12:32: +
+
+

+1 on Gaëtan de Menten's comment.

+
+
+
+
+ + Anonymous wrote on 2013-03-04 14:11: +
+
+

I'd also like to see the failures documented. Trying and failing is a great way to learn - but even better is to learn from other's failures.

+
+
+
+
+ + Anonymous wrote on 2013-03-05 11:49: +
+
+

Great work guys! Happy birthday PyPy!

+
+
+
+
+ + Электроник wrote on 2013-03-10 01:34: +
+
+

Thanks for making fast Python possible and creating a masterpiece in process!
About Terminology-Driven Programming: let me guess, the only nonexistent thing is a timeshifter? Three other names make a lot of sense in context of PyPy.

+
+
+
+
+ + Armin Rigo wrote on 2013-03-23 16:42: +
+
+

Электроник: no :-) Try again.

+
+
+
+ +

cppyy status update

+ +
+

The cppyy module +provides C++ bindings for PyPy by using the reflection information extracted +from C++ header files by means of the +Reflex package. +In order to support C++11, the goal is to move away from Reflex and instead use +cling, an interactive +C++ interpreter, as the backend. +Cling is based on llvm's +clang. + +The use of a real compiler under the hood has the advantage that it is now +possible to cover every conceivable corner case. +The disadvantage, however, is that every corner case actually has to be +covered. +Life is somewhat easier when calls come in from the python interpreter, as +those calls have already been vetted for syntax errors and all lookups are +well scoped. +Furthermore, the real hard work of getting sane responses from and for C++ +in an interactive environment is done in cling, not in the bindings. +Nevertheless, it is proving a long road (but for that matter clang does not +support all of C++11 yet), so here's a quick status update showing that good +progress is being made. + +

+

The following example is on CPython, not PyPy, but moving a third +(after Reflex and +CINT) backend into place +underneath cppyy is straightforward compared to developing the backend +in the first place. + +Take this snippet of C++11 code +(cpp11.C): + +

+

+
    constexpr int data_size() { return 5; }
+
+    auto N = data_size();
+
+    template<class L, class R>
+    struct MyMath {
+       static auto add(L l, R r) -> decltype(l+r) { return l + r; }
+    };
+
+    template class MyMath<int, int>;
+ +

As a practical matter, most usage of new C++11 features will live in +implementations, not in declarations, and are thus never seen by the bindings. +The above example is therefore somewhat contrived, but it will serve to show +that these new declarations actually work. +The new features used here are +constexpr, +auto, and +decltype. +Here is how you could use these from CPython, using the +PyROOT +package, which has more than a passing resemblance to cppyy, as one is based +on the other: + +

+

+
    import ROOT as gbl
+    gbl.gROOT.LoadMacro('cpp11.C')
+
+    print 'N =', gbl.N
+    print '1+1 =', gbl.MyMath(int, int).add(1,1)
+ +which, when entered into a file +(cpp11.py) and executed, +prints the expected results: + +

+
    $ python cpp11.py
+    N = 5
+    1+1 = 2
+ +In the example, the C++ code is compiled on-the-fly, rather than first generating +a dictionary as is needed with Reflex. +A deployment model that utilizes stored pre-compiled information is foreseen +to work with larger projects, which may have to pull in headers from many places. + +

Work is going to continue first on C++03 on cling with CPython (about 85% of +unit tests currently pass), with a bit of work on C++11 support on the side. +Once fully in place, it can be brought into a new backend for cppyy, after +which the remaining parts of C++11 can be fleshed out for both interpreters. + +

+

Cheers,
+Wim Lavrijsen

+
+
+
+
+ + Anonymous wrote on 2013-02-28 00:17: +
+
+

How would memory management work for C++ objects which own PyPy objects? In CPython, or any similar reference counting system, a C++ class can hold only references via special smart pointers. These smart pointers don't need to be registered in any way with the outer class, since there's no need for a garbage collector to traverse from the outer object to the inner smart pointer instances.

For decent garbage collection to work, presumably one needs to be able to enumerate the PyPy objects pointed to by a C++ object. How would this work?

+
+
+
+
+ + Wim Lavrijsen wrote on 2013-02-28 00:34: +
+
+

Right now, there are no PyPy objects exposed as such, but only PyObjects through cpyext in support of the python C-API. In cppyy, cpyext is used for any interface that has a PyObject* as argument or return value. It is cpyext that takes care of marrying the ref-count API with the garbage collector.

Don't pin me down on the details, but from what I understand of cpyext, a wrapper object with the proper C layout is created, and given a life line by putting it in an internal container holding all such objects safe from the gc simply by existing. When the ref count hits zero, the life line gets removed. Object identity is preserved by finding objects in the internal container and reusing them.

+
+
+
+ +

PyCon Silicon Valley and San Francisco visit

+ +
+
+ +

Hello everyone.

+

We (Armin Rigo and Maciej Fijalkowski) are visiting San Francisco/Silicon Valley +for PyCon and beyond. Alex Gaynor, another core PyPy dev is living there +permanently. My visiting dates are 12-28 of March, Armin's 11-21st. +If you want us to give a talk at your company or simply catch up with us +for a dinner +please get in touch. Write to pypy-dev@python.org, if you want this publicly +known or simply send me a mail at fijall@gmail.com if you don't want it public.

+

Cheers,
+fijal

+
+
+
+

Announcing Topaz, an RPython powered Ruby interpreter

+ +
+

Hello everyone

+ +

Last week, Alex Gaynor announced the first public release of +Topaz, +a Ruby interpreter written in RPython. This is the culmination of a +part-time effort over the past 10 months to provide a Ruby interpreter +that implements enough interesting constructs in Ruby to show that the +RPython toolchain can produce a Ruby implementation fast enough to +beat what is out there.

+ +

Disclaimer

+ +

Obviously the implementation is very incomplete currently in terms of +available standard library. We are working on getting it useable. If +you want to try it, grab a +nightly build.

+ +

We have run some benchmarks from the +Ruby benchmark suite +and the +metatracing VMs experiment. The +preliminary results are promising, but at this point we are missing so +many method implementations that most benchmarks won't run yet. So instead of +performance, I'm going to talk about the high-level structure of the +implementation.

+ +

Architecture

+ +

Topaz interprets a custom bytecode set. The basics are similar to +Smalltalk VMs, with bytecodes for loading and storing locals and +instance variables, sending messages, and stack management. Some +syntactical features of Ruby, such as defining classes and modules, +literal regular expressions, hashes, ranges, etc also have their own +bytecodes. The third kind of bytecodes are for control flow constructs +in Ruby, such as loops, exception handling, break, continue, etc.

+ +

In trying to get from Ruby source code to bytecode, we found that the +easiest way to support all of the Ruby syntax is to write a custom +lexer and use an RPython port of PLY +(fittingly called RPly) to create the +parser from the Ruby yacc grammar.

+ +

The Topaz interpreter uses an ObjectSpace (similar to how PyPy does +it), to interact with the Ruby world. The object space contains all +the logic for wrapping and interacting with Ruby objects from the +VM. Its __init__ method sets up the core classes, initial globals, +and creates the main thread (the only one right now, as we do not have +threading, yet).

+ +

Classes are mostly written in Python. We use ClassDef objects to +define the Ruby hierarchy and attach RPython methods to Ruby via +ClassDef decorators. These two points warrant a little explanation.

+ +
Hierarchies
+ +

All Ruby classes ultimately inherit from BasicObject. However, most +objects are below Object (which is a direct subclass of +BasicObject). This includes objects of type Fixnum, Float, +Class, and Module, which may not need all of the facilities of +full objects most of the time.

+ +

Most VMs treat such objects specially, using tagged pointers to +represent Fixnums, for example. Other VMs (for example from the +SOM Family) +don't. In the latter case, the implementation hierarchy matches the +language hierarchy, which means that objects like Fixnum share a +representation with all other objects (e.g. they have class pointers +and some kind of instance variable storage).

+ +

In Topaz, implementation hierarchy and language hierarchy are +separate. The first is defined through the Python inheritance. The +other is defined through the ClassDef for each Python class, where the +appropriate Ruby superclass is chosen. The diagram below shows how the +implementation class W_FixnumObject inherits directly from +W_RootObject. Note that W_RootObject doesn't have any attrs, +specifically no storage for instance variables and no map (for +determining the class - we'll get to that). These attributes are +instead defined on W_Object, which is what most other implementation +classes inherit from. However, on the Ruby side, Fixnum correctly +inherits (via Numeric and Integer) from Object.

+ +
+ +
+ +

This simple structural optimization gives a huge speed boost, but +there are VMs out there that do not have it and suffer performance +hits for it.

+ +
Decorators
+ +

Ruby methods can have symbols in their names that are not allowed as +part of Python method names, for example !, ?, or =, so we +cannot simply define Python methods and expose them to Ruby by the +same name.

+ +

For defining the Ruby method name of a function, as well as argument +number checking, Ruby type coercion and unwrapping of Ruby objects to +their Python equivalents, we use decorators defined on ClassDef. When +the ObjectSpace initializes, it builds all Ruby classes from their +respective ClassDef objects. For each method in an implementation +class that has a ClassDef decorator, a wrapper method is generated and +exposed to Ruby. These wrappers define the name of the Ruby method, +coerce Ruby arguments, and unwrap them for the Python method.

+ +

Here is a simple example:

+ +
@classdef.method("*", times="int")
+def method_times(self, space, times):
+    return self.strategy.mul(space, self.str_storage, times)
+
+ +

This defines the method * on the Ruby String class. When this is +called, the first argument is converted into a Ruby Fixnum object +using the appropriate coercion method, and then unwrapped into a plain +Python int and passed as argument to method_times. The wrapper +method also supplies the space argument.

+ +

Object Structure

+ +

Ruby objects have dynamically defined instance variables and may +change their class at any time in the program (a concept called +singleton class +in Ruby - it allows each object to have unique behaviour). To still +efficiently access instance variables, you want to avoid dictionary +lookups and let the JIT know about objects of the same class that have +the same instance variables. Topaz, like PyPy (which got it from +Self), implements instances using maps, which transforms dictionary +lookups into array accesses. See the +blog post +for the details.

+ +

This is only a rough overview of the architecture. If you're +interested, get in touch on +#topaz.freenode.net, follow the +Topaz Twitter account or contribute +on GitHub.

+ +Tim Felgentreff +
+
+
+
+ + Shin Guey wrote on 2013-02-12 19:25: +
+
+

Interesting. Although I code a lot in python but still quite like Ruby. Am looking forward for a fast ruby...

+
+
+
+
+ + Unknown wrote on 2013-02-12 20:37: +
+
+

Does this mean that JVM is now obsolete?

+
+
+
+
+ + Anonymous wrote on 2013-02-13 14:36: +
+
+

Don't worry. JVM will outlive you and your grandgrandchildren.

+
+
+
+
+ + smurfix wrote on 2013-02-17 09:05: +
+
+

"Its __init__ method", not "It's".

+
+
+
+ +

CFFI 0.5

+ +
+

Hi all,

+ +

A short notice to tell you that CFFI 0.5 was released. This +contains a number of small improvements from 0.4, but seems to otherwise +be quite stable since a couple of months --- no change since January 10, +apart from the usual last-minute fixes for Python 3 and for Windows.

+ +

Have fun!

+ +

Armin

+
+
+
+
+ + Dirkjan Ochtman wrote on 2013-02-08 11:53: +
+
+

Nice! I've added it to the Gentoo package repository; all the tests passed without any issues, this time.

+
+
+
+
+ + mattip wrote on 2013-03-31 14:41: +
+
+

Note that pypy uses a builtin cffi_backend which must match the cffi version. As of March 31 for instance nightly builds work with cffi 0.6

+
+
+
+ +

NumPyPy 2013 Developer Position

+ +
+
+
+

Introduction

+

Proposed herein is a part-time fellowship for developing NumPy in PyPy. +The work will initially consist of 100 hours +with the possibility of extension, until the funds run out. +Development and improvement of PyPy's NumPyPy (as +with most Open Source and Free Software) is done as a collaborative process +between volunteer, paid, and academic contributors. Due to a successful funding +drive but a lack of contributors willing to work directly for PyPy, we find +ourselves in the enviable situation of being able to offer this position.

+
+
+

Background

+

PyPy's developers make all PyPy software available to the public +without charge, under PyPy's Open Source copyright license, the +permissive MIT License. PyPy's license assures that PyPy is equally +available to everyone freely on terms that allow both non-commercial +and commercial activity. This license allows for academics, for-profit +software developers, volunteers and enthusiasts alike to collaborate +together to make a better Python implementation for everyone.

+

NumPy support for PyPy is licensed similarly, and therefore NumPy in +PyPy support can directly help researchers and developers who seek to +do numeric computing but want an easier programming language to use +than Fortran or C, which are typically used for these +applications. Being licensed freely to the general public means that +opportunities to use, improve and learn about how NumPy in PyPy works +itself will be generally available to everyone.

+
+
+

The Need for a Part-Time Developer

+

NumPy project in PyPy has seen some slow, but steady progress since we started +working about a year ago. On one hand, +it's actually impressive what we could deliver with the effort undertaken, +on the other hand, we would like to see the development accelerated.

+

PyPy has strict coding, testing, documentation, and review standards, +which ensures excellent code quality, continually improving +documentation and code test coverage, and minimal regressions. A +part-time developer will be able to bring us closer to the goal of +full numpy-api implementation and speed improvements.

+
+
+

Work Plan

+

The current proposal is split into two parts:

+
    +
  • +

    Compatibility:

    +

    This part covers the core NumPy Python API. We'll implement most NumPy APIs +that are officially documented and we'll pass most of NumPy's tests that +cover documented APIs and are not implementation details. +Specifically, we don't plan to:

    +
      +
    • implement NumPy's C API
    • +
    • implement other scientific libraries, like SciPy, matplotlib or biopython
    • +
    • implement details that are otherwise agreed by consensus to not have a place +in PyPy's implementation of NumPy or agreed with NumPy community +to be implementation details
    • +
    +
  • +
  • +

    Speed:

    +

    This part will cover significant speed improvements in the JIT that would +make numeric computations faster. This includes, but is not necessarily +limited to:

    +
      +
    • write a set of benchmarks covering various use cases
    • +
    • teaching the JIT backend (or multiple backends) how to deal with vector +operations, like SSE
    • +
    • experiments with automatic parallelization using multiple threads, akin +to numexpr
    • +
    • improving the JIT register allocator that will make a difference, especially +for tight loops
    • +
    +

    As with all speed improvements, it's relatively hard to predict exactly +how it'll cope, however we expect the results to be within an order +of magnitude of handwritten C equivalent.

    +
  • +
+
+
+

Position Candidate

+

We would like people who are proficient in NumPy and PyPy (but don't have to be +core developers of either) to step up. The developer selection will be done +by consensus of PyPy core developers and consulted with the Software Freedom +Conservancy for lack of conflict of interest. The main criterion will be +past contributions to the PyPy project, but they don't have to be significant +in size.

+

A candidate for the Developer position will demonstrate the following:

+
    +
  • The ability to write clear, stable, suitable and tested code
  • +
  • The ability to understand and extend the JIT capabilities used in NumPyPy.
  • +
  • A positive presence in PyPy's online community on IRC and the mailing +list.
  • +
+

Ideally the Developer will also:

+
    +
  • Have familiarity with the infrastructure of the PyPy project (including +bug tracker and buildbot).
  • +
  • Have worked to provide education or outreach on PyPy in other forums such as +workshops, conferences, and user groups.
  • +
+

Conservancy and PyPy are excited to announce the Developer Position. +Remuneration for the position will be at the rate of 60 USD per hour, through +the Software Freedom Conservancy.

+

PyPy community is promising to provide necessary guidance and help into +the current codebase, however we expect a successful candidate to be able +to review code and incorporate external patches within two months of the +starting date of the contract.

+

Candidates should submit their proposal (including their CV) to:

+

pypy-z@python.org

+

The deadline for this initial round of proposals is February 1, 2013.

+
+
+
+
+
+
+
+ + Anonymous wrote on 2013-01-26 11:37: +
+
+

I was wondering, why is PyPy so eager to support NumPy of all things? Surely there are things more interesting to a general python/pypy user base. Can someone clarify that for me?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-01-26 11:40: +
+
+

There was a numpy fundraiser due to popular demand. Feel free to suggest a different fundraiser if you want something else. I would be willing to even do a survey.

+
+
+
+
+ + Anonymous wrote on 2013-01-26 14:56: +
+
+

The thing is, the most interesting use of Python is in science, IMHO at least. And absolute majority of python scientific libraries use numpy as base. So, it would be awesome to have fast and robust numpy compatible library running on pypy.

+
+
+
+
+ + Armin Rigo wrote on 2013-01-26 17:28: +
+
+

The deadline seems too tight: it's next Friday.

+
+
+
+
+ + Anonymous wrote on 2013-01-26 18:31: +
+
+

It's been said before but as a long time NumPy and SciPy user, please please please don't call this project NumPy. It's great for PyPy to have an nd-array lib and for sure NumPy has some of the best semantics and user API for that so by all means make it compatible, but giving it the same name just makes tremendous confusion for users. For scientific users without the C-API which allows most of the widely used scientific extensions it is simply not "numpy".

+
+
+
+
+ + Wes Turner wrote on 2013-01-26 22:24: +
+
+

@201301261931

As NumPyPy intends to implement NumPy APIs, as a non-contributor, I feel like NumPyPy is a good name.

So then the package names would be:

* https://pypi.python.org/pypi/numpy
* https://pypi.python.org/pypi/numpypy

@201301261237

IMHO, this is not the forum for discussing what sort of pony you would like?

+
+
+
+
+ + Anonymous wrote on 2013-01-27 16:19: +
+
+

FWIW I think that numpypy to work is hugely important for the acceptance of pypy. Simple things like using matplotlib are crucial to lots of people who aren't using much of the rest of scipy, for example.

+
+
+
+
+ + Rahul Chaudhary wrote on 2013-01-28 01:31: +
+
+

You can post it on https://jobs.pythonweekly.com/ and it will be included in Python Weekly newsletter too.

+
+
+
+
+ + Anonymous wrote on 2013-01-30 01:36: +
+
+

I am following each of your announcements with great interest.
JIT optimization of array manipulations would enormously benefit my daily work.

Even though I am trying hard to follow the discussion, I have difficulty understanding the issues at hand, and what numpypy is going to be when it is finished.

Probably I am not the only one, considering the sometimes controversial discussion.

My current understanding is this:
All python code in numpy will run much better under pypy.

The problem are the external libraries. Depending on the type, there will be different approaches.

I assume that you will re-write a large part of the c-part of numpy directly in python, and then make use of the JIT optimizer. That would be the approach for all of the algorithms that are currently written in c, but could be easily re-implemented in python.
Something like ufunc_object.c could probably be rewritten in python without a loss of speed.
Of course, even though this would still run under normal python, it would be far to slow.

Then you have external dlls, like BLAS. I assume you will call them differently (ctypes?), and not as extension modules. If you use ctypes, it will still run under normal python, maybe a bit slower.

Then you have parts that are currently written in c, but that you can neither re-implement in python, nor call as a dll. Will you re-write those in c, using a different c-api? Or re-write them, so that they can be called using ctypes?


Maybe you give a short general overview about the issues with the c-api and what you are doing?

Something like. "Currently the function numpy.dot is written as a c-extension. It makes extensive use of PyArray_GETITEM. This limits the optimizer. We are therefore completely rewriting the function in python"

What is the best approach for a user like me, who makes heavy use of numpy, but also scipy and my own extension modules, cython and f2py?

Should I preferably write future modules as dlls, so that they can be called with ctypes (or cffi or something else), instead of making extension modules?

Do you think it will be possible at all to use scipy, which makes much more use of non-python libraries, or do you think that scipy will have to be re-written?

+
+
+
+
+ + Alendit wrote on 2013-02-09 12:09: +
+
+

Just a question - the donation figures on the homepage seem to be the same for the last 6 month or so. Is there really no donation or aren't they updated anymore.

+
+
+
+ +

Py3k status update #9

+ +
+

This is the ninth status update about our work on the py3k branch, which
+we can work on thanks to all of the people who donated to the py3k
+proposal
.

+

Just a very short update on December's work: we're now passing about 223 of
+approximately 355 modules of CPython's regression test suite, up from passing
+194 last month.

+

Some brief highlights:

+
    +
  • More encoding related issues were addressed. e.g. now most if not all the
    +multibytecodec test modules pass.
  • +
  • Fixed some path handling issues (test_os, test_ntpath and
    test_posixpath now pass)
  • +
  • We now pass test_class, test_descr and almost test_builtin (among
    +other things): these are notable as they are fairly extensive test suites of
    +core aspects of the language.
  • +
  • Amaury Forgeot d'Arc continued making progress on CPyExt (thanks again!)
  • +
+

cheers,
+Phil

+
+
+
+
+ + Unknown wrote on 2013-01-14 10:58: +
+
+

Nice! Thank you for your update!

+
+
+
+
+ + Kevin S. Smith wrote on 2013-01-24 17:24: +
+
+

The update was expected. Thank you for your update. Hope to see more.

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-27.html b/blog/index-27.html new file mode 100644 index 000000000..bbb02e4c3 --- /dev/null +++ b/blog/index-27.html @@ -0,0 +1,1582 @@ + + + + + + +PyPy (old posts, page 27) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

EuroPython

+ +
+

Hi all,

+ +

A short note: if you're at EuroPython right now and wondering if PyPy is +dead because you don't see the obviously expected talk about PyPy, don't +worry. PyPy is still alive and kicking. The truth is two-fold: (1) we +missed the talk deadline (duh!)... but as importantly, (2) for various +reasons we chose not to travel to Florence this year after our trip to +PyCon US. (Antonio Cuni is at Florence but doesn't have a talk about PyPy +either.)

+ +

Armin

+
+
+
+
+ + rokujyouhitoma wrote on 2013-07-04 20:25: +
+
+

I think of it for a moment. >dead.
Also...I can not meet you at EuroPython :(

See you next time!
From Japanese Pythonista.

+
+
+
+ +

Py3k status update #11

+ +
+

This is the 11th status update about our work on the py3k branch, which we
+can work on thanks to all of the people who donated to the py3k proposal.

+

Here's some highlights of the progress made since the previous update:

+
    +
  • PyPy py3k now matches CPython 3's hash code for
    +int/float/complex/Decimal/Fraction
  • +
  • Various outstanding unicode identifier related issues were
    +resolved. E.g. test_importlib/pep263/ucn/unicode all now fully pass. Various
    +usage of identifiers (in particular type and module names) have been fixed to
    +handle non-ascii names -- mostly around display of reprs and exception
    +messages.
  • +
  • The unicodedata database has been upgraded to 6.0.0.
  • +
  • Windows support has greatly improved, though it could still use some more
    +help (but so does the default branch to a certain degree).
  • +
  • Probably the last of the parsing related bugs/features have been taken care
    +of.
  • +
  • Of course various other smaller miscellaneous fixes
  • +
+

This leaves the branch w/ only about 5 outstanding failures of the stdlib test
+suite:

+
    +
  • +

    test_float

    +

    1 failing test about containment of floats in collections.

    +
  • +
  • +

    test_memoryview

    +

    Various failures: requires some bytes/str changes among other things (Manuel
    +Jacob has some progress on this on the py3k-memoryview branch)

    +
  • +
  • +

    test_multiprocessing

    +

    1 or more tests deadlock on some platforms

    +
  • +
  • +

    test_sys and test_threading

    +

    2 failing tests for the New GIL's new API

    +
  • +
+

Probably the biggest feature left to tackle is the New GIL.

+

We're now pretty close to pushing an initial release. We had planned for one
+around PyCon, but having missed that we've put some more effort into the branch
+to provide a more fully-fledged initial release.

+

Thanks to the following for their contributions: Manuel Jacob, Amaury Forgeot
+d'Arc, Karl Ramm, Jason Chu and Christian Hudon.

+

cheers,
+Phil

+
+
+
+
+ + Anonymous wrote on 2013-06-14 12:20: +
+
+

In my new project I'm using Python3.
I can't when I will run it with PyPy.

Thanks for your work!

+
+
+
+
+ + Unknown wrote on 2013-06-14 20:29: +
+
+

I just donated and found this post :) Great work!

+
+
+
+
+ + Paul Jaros wrote on 2013-06-17 08:12: +
+
+

The "new GIL" picked my curiosity. Was is it? Is it related to the STM or is it a separate thing?

Also, thanks for the update.

+
+
+
+
+ + Philip Jenvey wrote on 2013-06-18 19:35: +
+
+

The new GIL is briefly explained here: https://docs.python.org/3.4/whatsnew/3.2.html#multi-threading

Additionally, David Beazly has done a couple talks/blog posts about the problems of the old GIL and how the new GIL has improved over the old design.

+
+
+
+
+ + Paul Jaros wrote on 2013-06-19 12:02: +
+
+

Thanks for the link

+
+
+
+
+ + randomlessly wrote on 2013-06-22 17:37: +
+
+

kkk @Tom Li

+
+
+
+
+ + Unknown wrote on 2013-06-23 09:15: +
+
+

Will the pre-release already be optimized?

+
+
+
+
+ + Tony wrote on 2013-07-31 08:36: +
+
+

This is cool!

+
+
+
+ +

STM on the drawing board

+ +
+

Hi all!

+ +

This is an update about the Software Transactional Memory subproject of +PyPy. I have some good news of progress. Also, +Remi Meier will +likely help me this summer. He did various +investigations with PyPy-STM for his Master's Thesis and contributed back +a lot of ideas and some code. Welcome again Remi!

+ +

I am also sorry that it seems to advance so slowly. Beyond the usual +excuses --- I was busy with other things, e.g. releasing PyPy 2.0 --- I +would like to reassure people: I'm again working on it, and the financial +contributions are still there and reserved for STM (almost half the money is +left, a big thank you again if you contributed!).

+ +

The real reason for the apparent slowness, though, is that it is really +a research project. It's possible to either have hard deadlines, or to +follow various tracks and keep improving the basics, but not both at the +same time.

+ +

During the past month where I have worked again on STM, I worked still on +the second option; and I believe it was worth every second of it. Let me try +to convince you :-)

+ +

The main blocker was that the STM subsystem, written in C, and the +Garbage Collection (GC) subsystem, written in RPython, were getting +harder and harder to coordinate. So what I did instead is to give up +using RPython in favor of using only C for both. C is a good language +for some things, which includes low-level programming where we must take +care of delicate multithreading issues; RPython is not a good fit in +that case, and wasn't designed to be.

+ +

I started a fresh Mercurial repo +which is basically a stand-alone C library. This library (in heavy development +right now!) gives any C +program some functions to allocate and track GC-managed objects, and +gives an actual STM+GC combination on these objects. It's possible +(though rather verbose) to use it directly in C programs, like in a +small example interpreter. Of course the eventual purpose is to link it +with PyPy during translation to C, with all the verbose calls +automatically generated.

+ +

Since I started this, bringing the GC closer to the STM, I kept finding +new ways that the two might interact to improve the performance, maybe +radically. Here is a summary of the current ideas.

+ +

When we run +multiple threads, there are two common cases: one is to access (read and write) +objects that have only been seen by the current thread; the other is to read +objects seen by all threads, like in Python the modules/functions/classes, +but not to write to them. Of course, writing to the same object from +multiple threads occurs too, and it is handled correctly (that's the whole +point), but it is a relatively rare case.

+ +

So each object is classified as "public" or "protected" (or "private", +when they belong to the current transaction). Newly created objects, once +they are no longer private, remain protected until +they are read by a different thread. Now, the point is to use very +different mechanisms for public and for protected objects. Public +objects are visible by all threads, but read-only in memory; to change +them, a copy must be made, and the changes are written to the copy (the +"redolog" approach to STM). Protected objects, on the other hand, are +modified in-place, with (if necessary) a copy of them being made +for the sole purpose of a possible abort of the transaction (the "undolog" +approach).

+ +

This is combined with a generational GC similar to PyPy's --- but here, +each thread gets its own nursery and does its own "minor collections", +independently of the others.

+ +

So objects are by default protected; when another thread tries to follow a +pointer to them, then it is that other thread's job to carefully "steal" +the object and turn it public (possibly making a copy of it if needed, +e.g. if it was still a young object living in the original nursery).

+ +

The same object can exist temporarily in multiple versions: any number +of public copies; at most one active protected copy; and optionally one +private copy per thread (this is the copy as currently seen by the +transaction in progress on that thread). The GC cleans up the +unnecessary copies.

+ +

These ideas are variants and extensions of the same basic idea +of keeping multiple copies with revision numbers to track them. +Moreover, "read barriers" and "write barriers" are used by the C program +calling into this library in order to be sure that it is accessing the +right version of the object. In the currently investigated variant +I believe it should be possible to have rather cheap +read barriers, which would definitely be a major speed improvement over +the previous variants. Actually, as far as I know, it would be a major +improvement over most of the other existing STMs: in them, the typical read barrier +involves following chains of pointers, and checking some dictionary to see if this +thread has a modified local copy of the object. The difference with a +read barrier that can resolve most cases in a few CPU cycles should be +huge.

+ +

So, this is research :-) It is progressing, and at some point I'll be +satisfied with it and stop rewriting everything; and then the actual +integration into PyPy should be straightforward (there is already code +to detect where the read and write barriers need to be inserted, where +transactions can be split, etc.). Then there is support for the +JIT to be written, and so on. But more about it later.

+ +

The purpose of this post was to give you some glimpses into what I'm +working on right now. As usual, no plan for release yet. But you can +look forward to seeing the C library progress. I'll probably also start +soon some sample interpreter in C, to test the waters (likely a +revival of duhton). +If you know nothing about Python but all about the C-level +multithreading issues, now is a good time to get involved :-)

+ +

Thanks for reading!

+ +

Armin

+
+
+
+
+ + Paul Jaros wrote on 2013-06-06 12:48: +
+
+

Thanks for the update. I was wondering since some time how the progress in STM has come along.
Good job also :)

+
+
+
+
+ + Tuure Laurinolli wrote on 2013-06-18 05:27: +
+
+

Do you have a description of the read and write barriers required somewhere? How does requiring a copy to be made of protected objects upon modification work with e.g. large arrays?

+
+
+
+
+ + David wrote on 2013-08-04 18:06: +
+
+

Check out John Carmack's brainstorm on an integrated STM+GC system concept which is sort of "globally phased compacting GC+STM". He doesn't use the term STM, but the concept is the same.

https://www.youtube.com/watch?v=1PhArSujR_A&feature=player_detailpage&t=1354

+
+
+
+ +

NumPyPy status update

+ +
+

Hello everyone,

+May was the first month I was paid to work on NumPyPy (thanks to all who donated!); here is what I worked on during this period:

+
    +
  • It is now possible to use subarrays.
  • +
  • It is now possible to pickle ndarrays (including those using subarrays), dtypes and scalars; the pickling protocol is the same as numpy's.
  • +
+
+
+
+
+For June, I plan to work on the nditer class; it seems that there's enough work for an entire month.
+
+Cheers
+Romain Guillebert +
+
+
+
+ + Anonymous wrote on 2013-06-03 18:49: +
+
+

What's numpypy's recommended way for a C/cffi extension to get a pointer to the data?

Thanks,
Andreas

+
+
+
+
+ + Anonymous wrote on 2013-06-04 08:37: +
+
+

Excellent work!

+
+
+
+
+ + Anonymous wrote on 2013-06-04 10:34: +
+
+

Thanks! But pickling sliced arrays doesn't work yet (tested with nightly build pypy-c-jit-64739-f556942951f9-linux):

import cPickle as pickle
import numpypy as numpy
a = numpy.arange(10.)[::2]
print a # [ 0. 2. 4. 6. 8.]
p = pickle.dumps(a)
print pickle.loads(p) # [ 0. 1. 2. 3. 4.] oops!

+
+
+
+
+ + Romain Guillebert wrote on 2013-06-04 19:55: +
+
+

@Anonymous

Thanks for reporting it, it's fixed

+
+
+
+
+ + Anonymous wrote on 2013-06-04 21:16: +
+
+

Great to hear about the progress, keep up the good work!

+
+
+
+
+ + Anonymous wrote on 2013-12-05 22:23: +
+
+

It is working very well for me, thanks!

Now, is there any way to load the resulting pickle in cPython?

numpy.save and numpy.load do work between pypy and cPython, but my arrays are embedded in larger data structures.

The motivation is that I would like to run a numerical program and store some results, and then load the results and plot them with matplotlib (which does not work on pypy).

Here is the error in cPython:

>>> pickle.load(open('/tmp/x', 'r+b'))
Traceback (most recent call last):
File "", line 1, in
File "/usr/lib/python2.7/pickle.py", line 1378, in load
return Unpickler(file).load()
File "/usr/lib/python2.7/pickle.py", line 858, in load
dispatch[key](self)
File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
klass = self.find_class(module, name)
File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
__import__(module)
ImportError: No module named _numpypy.multiarray

+
+
+
+
+ + Anonymous wrote on 2013-12-05 22:28: +
+
+

It is working great, thanks!

Now, is there any way to load the resulting pickle in cPython?

numpy.save and numpy.load do work between pypy and cPython, but my arrays are embedded in larger data structures.

The motivation is that I would like to run a numerical program and store some results, and then load the results and plot them with matplotlib (which does not work on pypy).

Here is the error in cPython:

>>> pickle.load(open('/tmp/x', 'r+b'))
Traceback (most recent call last):
File "", line 1, in
File "/usr/lib/python2.7/pickle.py", line 1378, in load
return Unpickler(file).load()
File "/usr/lib/python2.7/pickle.py", line 858, in load
dispatch[key](self)
File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
klass = self.find_class(module, name)
File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
__import__(module)
ImportError: No module named _numpypy.multiarray

+
+
+
+ +

PyPy 2.0.2 - Fermi Panini

+ +
+

We're pleased to announce PyPy 2.0.2. This is a stable bugfix release +over 2.0 and 2.0.1. You can download it here:

+
+https://pypy.org/download.html +
+

It fixes a crash in the JIT when calling external C functions (with +ctypes/cffi) in a multithreaded context.

+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.0 and cpython 2.7.3 performance comparison) +due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. Support for ARM is progressing but not bug-free yet.

+
+
+

Highlights

+

This release contains only the fix described above. A crash (or wrong +results) used to occur if all these conditions were true:

+
    +
  • your program is multithreaded;
  • +
  • it runs on a single-core machine or a heavily-loaded multi-core one;
  • +
  • it uses ctypes or cffi to issue external calls to C functions.
  • +
+

This was fixed in the branch emit-call-x86 (see the example file +bug1.py).

+

Cheers, +arigo et al. for the PyPy team

+
+
+
+
+
+ + Valentin wrote on 2013-07-22 09:12: +
+
+

This is cool!

+
+
+
+ +

PyPy 2.0.1 - Bohr Smørrebrød

+ +
+

We're pleased to announce PyPy 2.0.1. This is a stable bugfix release +over 2.0. You can download it here:

+
+https://pypy.org/download.html +
+

The fixes are mainly about fatal errors or crashes in our stdlib. See +below for more details.

+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.0 and cpython 2.7.3 performance comparison) +due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. Support for ARM is progressing but not bug-free yet.

+
+
+

Highlights

+ +

Cheers, +arigo et al. for the PyPy team

+
+
+

Numpy Status Update

+ +
+

Hello Everyone,

+I have been working on NumPyPy since the end of April, and here is a short update:

+
    +
  • I implemented pickling support for ndarrays and dtypes; it will be compatible with numpy's pickling protocol once the "numpypy" module is renamed to "numpy".
  • +
  • I am now working on subarrays.
  • +
+
+
+
+
+I would also like to thank everyone who donated and allowed me to work on this.
+
+
+
+
+Cheers,
+
+Romain Guillebert
+
+
+
+
+ + Anonymous wrote on 2013-05-12 11:09: +
+
+

No, thank you! Cannot wait till the day PyPy fully supports NumPy.

+
+
+
+
+ + Anonymous wrote on 2013-05-13 00:19: +
+
+

I second the anonymous comment above. The day PyPy fully supports NumPy is the day I switch from CPython.

+
+
+
+
+ + Paul Jaros wrote on 2013-05-13 08:32: +
+
+

Aww... Anonymous.

@Romain Guillebert Thank you for the hard work you are putting into it. I will be testing my code with the current release.

+
+
+
+
+ + Anonymous wrote on 2013-05-13 18:38: +
+
+

This (and to a lesser extent Python 3 support) is the only thing holding me back from switching to PyPy for all of my python programming. Thank you very much for this fantastic project!

+
+
+
+
+ + Paul Jaros wrote on 2013-05-14 22:44: +
+
+

Results from running my own little Benchmark: Labyrinth Generator
Array Size: 77711x711:

C-Code:
4.45 Seconds, ~50M Memory Usage.

Pypy with standard List:
14.5 Seconds, ~750M Memory Usage.

Pypy with Numpypy:
11.0 Seconds, ~78M Memory Usage.

Pretty impressive if you ask me. Older Numpypy where about as fast as the standard List. Also Pypy is approaching C-Performance with bigger steps than I dared hoping for.

CPython Benchmark intentionally left out... it takes ages.

+
+
+
+
+ + Anonymous wrote on 2013-05-15 14:50: +
+
+

It's great to see a progress in important libraries support.

Speed is important, but when we get acceptable speed then library support is what we need.

+
+
+
+ +

PyPy 2.0 - Einstein Sandwich

+ +
+
+

We're pleased to announce PyPy 2.0. This is a stable release that brings +a swath of bugfixes, small performance improvements and compatibility fixes. +PyPy 2.0 is a big step for us and we hope in the future we'll be able to +provide stable releases more often.

+

You can download the PyPy 2.0 release here:

+
+https://pypy.org/download.html +
+

The two biggest changes since PyPy 1.9 are:

+
    +
  • stackless is now supported including greenlets, which means eventlet +and gevent should work (but read below about gevent)
  • +
  • PyPy now contains release 0.6 of cffi as a builtin module, which +is the preferred way of calling C from Python that works well on PyPy
  • +
+

If you're using PyPy for anything, it would help us immensely if you fill out +the following survey: https://bit.ly/pypysurvey This is for the developers' +eyes and we will not make any information public without your agreement.

+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.0 and cpython 2.7.3 performance comparison) +due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. Windows 64 work is still stalling, we would welcome a volunteer +to handle that. ARM support is on the way, as you can see from the recently +released alpha for ARM.

+
+
+

Highlights

+
    +
  • Stackless including greenlets should work. For gevent, you need to check +out pypycore and use the pypy-hacks branch of gevent.
  • +
  • cffi is now a module included with PyPy. (cffi also exists for +CPython; the two versions should be fully compatible.) It is the +preferred way of calling C from Python that works on PyPy.
  • +
  • Callbacks from C are now JITted, which means XML parsing is much faster.
  • +
  • A lot of speed improvements in various language corners, most of them small, +but speeding up some particular corners a lot.
  • +
  • The JIT was refactored to emit machine code which manipulates a "frame" +that lives on the heap rather than on the stack. This is what makes +Stackless work, and it could bring another future speed-up (not done yet).
  • +
  • A lot of stability issues fixed.
  • +
  • Refactoring much of the numpypy array classes, which resulted in removal of +lazy expression evaluation. On the other hand, we now have more complete +dtype support and support more array attributes.
  • +
+

Cheers,
+fijal, arigo and the PyPy team

+
+
+
+
+
+
+
+ + Unknown wrote on 2013-05-09 20:01: +
+
+

I read this as gevent needs a special branch but eventlet doesn't. Is that correct, or does eventlet require you to use that branch as well?

+
+
+
+
+ + Anonymous wrote on 2013-05-10 01:04: +
+
+

Congrats guys! Thanks so much for all your hard work. Python is awesome, and PyPy makes it more awesome!

+
+
+
+
+ + Robert wrote on 2013-05-10 13:39: +
+
+

Are we going to get lazy expression evaluation in numpypy back sometime?

+
+
+
+
+ + Wim Lavrijsen wrote on 2013-05-10 17:26: +
+
+

Another thing that's new, is that cppyy is enabled, albeit that you need to install the Reflex library separately. See (Linux only, sorry): https://doc.pypy.org/en/latest/cppyy.html#installation

+
+
+
+
+ + Unknown wrote on 2013-07-20 15:24: +
+
+

I'd not say eventlet just works. In this example: https://eventlet.net/doc/examples.html#web-crawler I keep receiving:

File "/usr/lib/pypy/lib-python/2.7/socket.py", line 430, in read
data = self._sock.recv(left)
File "/home/divius/Projects/!demo/eventlet/env/site-packages/eventlet/greenio.py", line 251, in recv
return fd.recv(buflen, flags)
File "/usr/lib/pypy/lib-python/2.7/socket.py", line 188, in recv
return self._sock.recv(buffersize, flags=flags)
error: [Errno 9] Bad file descriptor

+
+
+
+
+ + Armin Rigo wrote on 2013-07-21 22:24: +
+
+

See https://bugs.pypy.org/issue1492. This was reported and we believe we fixed it on trunk.

+
+
+
+ +

PyPy 2.0 alpha for ARM

+ +
+
+ +

Hello.

+

We're pleased to announce an alpha release of PyPy 2.0 for ARM. This is mostly +a technology preview, as we know the JIT is not yet stable enough for the +full release. However please try your stuff on ARM and report back.

+

This is the first release that supports a range of ARM devices - anything with +ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard, Chromebook, +Cubieboard, etc.) that supports VFPv3 should work. We provide builds with +support for both ARM EABI variants: hard-float and, for some older operating +systems, soft-float.

+

This release comes with a list of limitations, consider it alpha quality, +not suitable for production:

+
    +
  • stackless support is missing.
  • +
  • assembler produced is not always correct, but we successfully managed to +run large parts of our extensive benchmark suite, so most stuff should work.
  • +
+

You can download the PyPy 2.0 alpha ARM release here (including a deb for raspbian):

+
+https://pypy.org/download.html +
+

Part of the work was sponsored by the Raspberry Pi foundation.

+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.3. It's fast due to its integrated tracing JIT compiler.

+

This release supports ARM machines running Linux 32bit. Both hard-float +armhf and soft-float armel builds are provided. armhf builds are +created using the Raspberry Pi custom cross-compilation toolchain based on +gcc-arm-linux-gnueabihf and should work on ARMv6 and ARMv7 devices running at +least debian or ubuntu. armel builds are built using gcc-arm-linux-gnueabi +toolchain provided by ubuntu and currently target ARMv7. If there is interest +in other builds, such as gnueabi for ARMv6 or without requiring a VFP let us +know in the comments or on IRC.

+
+
+

Benchmarks

+

Everybody loves benchmarks. Here is a table of our benchmark suite +(for ARM we don't provide it yet on https://speed.pypy.org, +unfortunately).

+

This is a comparison of Cortex A9 processor with 4M cache and Xeon W3580 with +8M of L3 cache. The set of benchmarks is a subset of what we run for +https://speed.pypy.org that finishes in reasonable time. The ARM machine +was provided by Calxeda. +Columns are respectively:

+
    +
  • benchmark name
  • +
  • PyPy speedup over CPython on ARM (Cortex A9)
  • +
  • PyPy speedup over CPython on x86 (Xeon)
  • +
  • speedup on Xeon vs Cortex A9, as measured on PyPy
  • +
  • speedup on Xeon vs Cortex A9, as measured on CPython
  • +
  • relative speedup (how much bigger the x86 speedup is over ARM speedup)
  • +
+ ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Benchmark | PyPy vs CPython (arm) | PyPy vs CPython (x86) | x86 vs arm (pypy) | x86 vs arm (cpython) | relative speedup
ai | 3.61 | 3.16 | 7.70 | 8.82 | 0.87
bm_mako | 3.41 | 2.11 | 8.56 | 13.82 | 0.62
chaos | 21.82 | 17.80 | 6.93 | 8.50 | 0.82
crypto_pyaes | 22.53 | 19.48 | 6.53 | 7.56 | 0.86
django | 13.43 | 11.16 | 7.90 | 9.51 | 0.83
eparse | 1.43 | 1.17 | 6.61 | 8.12 | 0.81
fannkuch | 6.22 | 5.36 | 6.18 | 7.16 | 0.86
float | 5.22 | 6.00 | 9.68 | 8.43 | 1.15
go | 4.72 | 3.34 | 5.91 | 8.37 | 0.71
hexiom2 | 8.70 | 7.00 | 7.69 | 9.56 | 0.80
html5lib | 2.35 | 2.13 | 6.59 | 7.26 | 0.91
json_bench | 1.12 | 0.93 | 7.19 | 8.68 | 0.83
meteor-contest | 2.13 | 1.68 | 5.95 | 7.54 | 0.79
nbody_modified | 8.19 | 7.78 | 6.08 | 6.40 | 0.95
pidigits | 1.27 | 0.95 | 14.67 | 19.66 | 0.75
pyflate-fast | 3.30 | 3.57 | 10.64 | 9.84 | 1.08
raytrace-simple | 46.41 | 29.00 | 5.14 | 8.23 | 0.62
richards | 31.48 | 28.51 | 6.95 | 7.68 | 0.91
slowspitfire | 1.28 | 1.14 | 5.91 | 6.61 | 0.89
spambayes | 1.93 | 1.27 | 4.15 | 6.30 | 0.66
sphinx | 1.01 | 1.05 | 7.76 | 7.45 | 1.04
spitfire | 1.55 | 1.58 | 5.62 | 5.49 | 1.02
spitfire_cstringio | 9.61 | 5.74 | 5.43 | 9.09 | 0.60
sympy_expand | 1.42 | 0.97 | 3.86 | 5.66 | 0.68
sympy_integrate | 1.60 | 0.95 | 4.24 | 7.12 | 0.60
sympy_str | 0.72 | 0.48 | 3.68 | 5.56 | 0.66
sympy_sum | 1.99 | 1.19 | 3.83 | 6.38 | 0.60
telco | 14.28 | 9.36 | 3.94 | 6.02 | 0.66
twisted_iteration | 11.60 | 7.33 | 6.04 | 9.55 | 0.63
twisted_names | 3.68 | 2.83 | 5.01 | 6.50 | 0.77
twisted_pb | 4.94 | 3.02 | 5.10 | 8.34 | 0.61
+

It seems that Cortex A9, while significantly slower than Xeon, has higher +slowdowns with a large interpreter (CPython) than a JIT compiler (PyPy). This +comes as a surprise to me, especially since our ARM assembler is not nearly +as polished as our x86 assembler. As for the causes, various people mentioned +the branch predictor, but I would not like to speculate without actually knowing.

+
+
+

How to use PyPy?

+

We suggest using PyPy from a virtualenv. Once you have a virtualenv +installed, you can follow instructions from pypy documentation on how +to proceed. This document also covers other installation schemes.

+

We would not recommend using PyPy on ARM in production just yet; +however, the day of a stable PyPy ARM release is not far off.

+

Cheers,
+fijal, bivab, arigo and the whole PyPy team

+
+
+
+
+
+
+
+ + Anonymous wrote on 2013-05-08 14:43: +
+
+

Congratulations!

+
+
+
+
+ + Anonymous wrote on 2013-05-08 14:43: +
+
+

Congratulations!

+
+
+
+
+ + Rasmus wrote on 2013-05-09 12:43: +
+
+

This is truly amazing! Great work and I'm very interested about the future.

+
+
+
+
+ + João Magalhães wrote on 2013-05-10 22:48: +
+
+

This is really great news especially for the raspberry pi guys.

Congratulations !!!

+
+
+
+
+ + Verona wrote on 2013-07-30 06:19: +
+
+

This is cool!

+
+
+
+
+ + xaRD wrote on 2015-01-25 12:13: +
+
+

Where I get the source code for the benchmark's you have used?

+
+
+
+
+ + Armin Rigo wrote on 2015-01-26 09:59: +
+
+

https://foss.heptapod.net/pypy/benchmarks/

+ +
+
+
+ +

PyPy 2.0 beta 2 released

+ +
+
+

We're pleased to announce the 2.0 beta 2 release of PyPy. This is a major +release of PyPy and we're getting very close to 2.0 final, however it includes +quite a few new features that require further testing. Please test and report +issues, so we can have a rock-solid 2.0 final. It also includes a performance +regression of about 5% compared to 2.0 beta 1 that we hope to fix before +2.0 final. The ARM support is not working yet and we're working hard to +make it happen before the 2.0 final. The new major features are:

+
    +
  • JIT now supports stackless features, that is greenlets and stacklets. This +means that JIT can now optimize the code that switches the context. It enables +running eventlet and gevent on PyPy (although gevent requires some +special support that's not quite finished, read below).
  • +
  • This is the first PyPy release that includes cffi as a core library. +Version 0.6 comes included in the PyPy library. cffi has seen a lot of +adoption among library authors and we believe it's the best way to wrap +C libraries. You can see examples of cffi usage in _curses.py and +_sqlite3.py in the PyPy source code.
  • +
+

You can download the PyPy 2.0 beta 2 release here:

+
+https://pypy.org/download.html +
+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.3. It's fast (pypy 2.0 beta 2 and cpython 2.7.3 +performance comparison) due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. It also supports ARM machines running Linux, however this is +disabled for the beta 2 release. +Windows 64 work is still stalling, we would welcome a volunteer +to handle that.

+
+
+

How to use PyPy?

+

We suggest using PyPy from a virtualenv. Once you have a virtualenv +installed, you can follow instructions from pypy documentation on how +to proceed. This document also covers other installation schemes.

+
+
+

Highlights

+
    +
  • +cffi is officially supported by PyPy. It comes included in the standard +library, just use import cffi +
  • +
  • stackless support - eventlet just works and gevent requires pypycore +and pypy-hacks branch of gevent (which mostly disables cython-based +modules)
  • +
  • callbacks from C are now much faster. pyexpat is about 3x faster, cffi +callbacks around the same
  • +
  • +__length_hint__ is implemented (PEP 424)
  • +
  • a lot of numpy improvements
  • +
+
+
+

Improvements since 1.9

+
    +
  • +JIT hooks are now a powerful tool to introspect the JITting process that +PyPy performs
  • +
  • various performance improvements compared to 1.9 and 2.0 beta 1
  • +
  • operations on long objects are now as fast as in CPython (from +roughly 2x slower)
  • +
  • we now have special strategies for dict/set/list which contain +unicode strings, which means that now such collections will be both faster +and more compact.
  • +
+
+
+
+
+
+
+
+ + Anonymous wrote on 2013-04-08 08:30: +
+
+

why do you ship with pypy sqlite version 3.5.9 (windows version),
this is an old version which doesn't support wal mode

2008-May-12 - Version 3.5.9

+
+
+
+
+ + Anonymous wrote on 2013-04-08 16:40: +
+
+

Congratulations! And hope the ARM version of PyPy together with ARM v6 support will also coming soon.

+
+
+
+
+ + Anonymous wrote on 2013-04-08 17:26: +
+
+

Can you explain "performance regression of about 5% " and also "various performance improvements compared to 1.9 and 2.0 beta 1"?

What is faster and what is slower?

+
+
+
+
+ + Anonymous wrote on 2013-04-12 11:59: +
+
+

And we've got a lot of segfaults with beta2…

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-04-12 12:11: +
+
+

@Anonymous - please report those. It's impossible for us to determine what's going on without reporting back.

+
+
+
+
+ + Mak Sim wrote on 2013-04-18 10:59: +
+
+

Thank you for great job.
Do you plan to release 64-bit binaries for Windows?
I'm trying to build from tag "pypy-2.0-beta2" under Windows7 x64, with MSVC compiler AMD-64, and I've got an exception:

[translation:ERROR] TypeError: <Struct PyTypeObject { c_ob_refcnt, c__pad0, c__pad1, c__pad2, c__pad3, c_ob_type, c_ob_size, c__pad4, c__pad5, c__pad6, c__pad7, c_tp_name, c_tp_basicsize, c_tp_itemsize, c_tp_dealloc, c_tp_print, c_tp_getattr, c_tp_setattr, c_tp_compare, c_tp_repr, c_tp_as_number, c_tp_as_sequence, c_tp_as_mapping, c_tp_hash, c_tp_call, c_tp_str, c_tp_getattro, c_tp_setattro, c_tp_as_buffer, c_tp_flags, c__pad8, c__pad9, c__pad10, c__pad11, c_tp_doc, c_tp_traverse, c_tp_clear, c_tp_richcompare, c_tp_weaklistoffset, c__pad12, c__pad13, c__pad14, c__pad15, c_tp_iter, c_tp_iternext, c_tp_methods, c_tp_members, c_tp_getset, c_tp_base, c_tp_dict, c_tp_descr_get, c_tp_descr_set, c_tp_dictoffset, c__pad16, c__pad17, c__pad18, c__pad19, c_tp_init, c_tp_alloc, c_tp_new, c_tp_free, c_tp_is_gc, c_tp_bases, c_tp_mro, c_tp_cache, c_tp_subclasses, c_tp_weaklist, c_tp_del, c__pad20, c__pad21, c__pad22, c__pad23, c__pad24, c__pad25, c__pad26, c__pad27 }> instance field 'c_ob_refcnt':
[translation:ERROR] expects <INT>
[translation:ERROR] got <Signed>

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-04-18 11:01: +
+
+

As it says in the release announcement, win64 is not supported. You need to build a 32bit binary (using 32bit Python)

+
+
+
+
+ + Egypt News wrote on 2013-04-22 05:04: +
+
+

great news, waiting for the final v2.0

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-28.html b/blog/index-28.html new file mode 100644 index 000000000..0c91d660c --- /dev/null +++ b/blog/index-28.html @@ -0,0 +1,1157 @@ + + + + + + +PyPy (old posts, page 28) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Update on STM

+ +
+

Hi all,

+ +

A quick update on Software Transactional Memory. We are +working on two fronts.

+ +

On the one hand, the integration of the "c4" C library with PyPy is done +and works well, but is still subject to improvements. The "PyPy-STM" +executable (without the JIT) +seems to be stable, as far as it has been tested. It runs a simple +benchmark like Richards with a 3.2x slow-down over a regular JIT-less +PyPy.

+ +

The main factor of this slow-down: the numerous "barriers" in +the code --- checks that are needed a bit everywhere to verify that a +pointer to an object points to a recent enough version, and if not, to +go to the most recent version. These barriers are inserted automatically +during the translation; there is no need for us to manually put 42 million +barriers in the source code of PyPy. But this automatic insertion uses a +primitive algorithm right now, which usually ends up putting more barriers than the +theoretical optimum. I (Armin) am trying to improve that --- and progressing: +last week the slow-down was around 4.5x. This is done in the branch +stmgc-static-barrier.

+ +

On the other hand, Remi is progressing on the JIT integration in +the branch stmgc-c4. +This has been working in simple cases for a couple of weeks now, but the +resulting "PyPy-JIT-STM" often crashes. This is because while the +basics are not really hard, we keep hitting new issues that must be +resolved.

+ +

The basics are that whenever the JIT is about to generate +assembler corresponding to a load or a store in a GC object, it must +first generate a bit of extra assembler that corresponds to the barrier +that we need. This works fine by now (but could benefit from the same +kind of optimizations described above, to reduce the number of barriers). +The additional issues are all more subtle. I will describe the current +one as an example: it is how to write constant pointers inside the assembler.

+ +

Remember that the STM library classifies objects as either +"public" or "protected/private". A "protected/private" object +is one which has not been seen by another thread so far. +This is essential as an optimization, because we know that no +other thread will access our protected or private objects in parallel, +and thus we are free to modify their content in place. By contrast, +public objects are frozen, and to do any change, we first need to +build a different (protected) copy of the object. See this +blog +post for more details.

+ +

So far so good, but the JIT will sometimes (actually often) hard-code +constant pointers into the assembler it produces. For example, this is the +case when the Python code being JITted creates an instance of a known class; +the corresponding assembler produced by the JIT will reserve the memory for +the instance and then write the constant type pointer in it. This type +pointer is a GC object (in the simple model, it's the Python class object; +in PyPy it's actually the "map" object, which is +a different story).

+ +

The problem right now is that this constant pointer may point to a +protected object. This is a problem because the same piece of assembler +can later be executed by a different thread. If it does, then this +different thread will create instances whose type pointer is bogus: looking +like a protected object, but actually protected by a different thread. +Any attempt to use this type pointer to change anything on the class +itself will likely crash: the threads will all think they can safely change it +in-place. To fix this, we need to make sure we only write pointers to +public objects in the assembler. This is a bit involved because we need +to ensure that there is a public version of the object to start with.

+ +

When this is done, we will likely hit the next problem, and the next one; +but at some point it should converge (hopefully!) and we'll give you our first +PyPy-JIT-STM ready to try. Stay tuned :-)

+ +

A bientôt,

+ +

Armin.

+
+
+
+
+ + Anonymous wrote on 2013-08-19 11:06: +
+
+

*assembly

+
+
+
+
+ + Unknown wrote on 2013-08-20 21:31: +
+
+

Thanks for the update; glad it's coming together! I'm really looking forward to seeing how it stacks up once the JIT work is complete.

Do you think that it'll be possible to ever get better than a 2x slowdown for serial operations? Or is that the minimal possible? Naively, it makes sense that it'll never be as fast, but if 1.5x or lower were possible, that would be very exciting.

Also, is the end goal that you would have a module you import to "turn on" STM? Or would it always be a separate build of pypy, just like JIT/JIT-less?

+
+
+
+
+ + Armin Rigo wrote on 2013-08-21 09:05: +
+
+

@Christopher: the slow-down we'll get is still unknown, but I fear it won't really go well under 2x.

I see it mainly as a separate build: either you want to run all these barrier instructions everywhere (which gives the slow-down) or not. It could be possible in theory to have a version that has the barriers everywhere, but creates JIT-generated assembler that doesn't, and thus runs almost as fast as a regular PyPy as long as you don't "turn on" STM. We will see if that makes sense.

+
+
+
+
+ + Armin Rigo wrote on 2013-08-21 09:12: +
+
+

@Anonymous: ah, thanks :-) I think I now learned the difference between "assembler" and "assembly" in English, which was never quite clear to me. Note that in french the same word ("assembleur") is used to mean both terms.

+
+
+
+
+ + Unknown wrote on 2013-08-22 17:14: +
+
+

@Armin: Ah, I see. Well, from a user's perspective, what I most write in python these days is either GUI applications (for which I've never been able to use pypy due to lack of bindings, but that's another issue entirely), or for small services, for which pypy has provided a rather nice speed improvement.

In a perfect world, I'd be able to use pypy for both of these tasks, not using STM for my GUI applications, but turning it on for the services I write (well, once they reach a certain point where I'd gain something from concurrency).

I suspect having a separate build would make such a use-case awkward.

Also, my interest is a bit self-motivated; at work we currently use node.js for a lot of our services. Pypy compares decently for a lot of our tasks, but is not 'clearly better'. Once STM is stable, however, several of our services that we've struggled scaling to multiple cores on node.js could be rewritten in pypy STM, and should scale much easier. (Manual process management is painful!)

Again, if pypy STM were a separate build, we'd have to manage having both installed in the case where we have servers running services that need concurrency, or ones that work well enough with a very fast async implementation. Not impossible, just a bit awkward. :)

Either way, I'm pretty excited!

+
+
+
+
+ + Unknown wrote on 2013-10-16 15:22: +
+
+

Are there any plans or experiments going on related to Hardware Transactional Memory?

+
+
+
+
+ + Armin Rigo wrote on 2013-10-16 15:55: +
+
+

@Ignacio Hernandez: for HTM, our position is still as described last year in: https://morepypy.blogspot.com/2012/08/multicore-programming-in-pypy-and.html

+
+
+
+ +

NumPyPy Status Update

+ +
+

Hello everyone

+As expected, nditer is a lot of work. I'm going to pause my work on it for now and focus on simpler and more important things. Here is a list of what I implemented:

+
    +
  • Fixed a bug on 32 bit that made int32(123).dtype == dtype("int32") fail
  • +
  • Fixed a bug on the pickling of array slices
  • +
  • The external loop flag is implemented on the nditer class
  • +
  • The c_index, f_index and multi_index flags are also implemented
  • +
  • Add dtype("double") and dtype("str")
  • +
  • C-style iteration is available for nditer
  • +
+Cheers
+Romain Guillebert +
+
+
+
+ + René Dudfield wrote on 2013-08-09 10:17: +
+
+

Nice work :)

+
+
+
+
+ + Arne Babenhauserheide wrote on 2013-08-12 09:38: +
+
+

thanks for the update!

+
+
+
+ +

PyPy 2.1 - Considered ARMful

+ +
+

We're pleased to announce PyPy 2.1, which targets version 2.7.3 of the Python
+language. This is the first release with official support for ARM processors in the JIT.
+This release also contains several bugfixes and performance improvements.

+

You can download the PyPy 2.1 release here:

+
https://pypy.org/download.html
+

We would like to thank the Raspberry Pi Foundation for supporting the work
+to finish PyPy's ARM support.

+

The first beta of PyPy3 2.1, targeting version 3 of the Python language, was
+just released, more details can be found here.

+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.1 and cpython 2.7.2 performance comparison) +due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows +32. This release also supports ARM machines running Linux 32bit - anything with +ARMv6 (like the Raspberry Pi) or ARMv7 (like the Beagleboard, +Chromebook, Cubieboard, etc.) that supports VFPv3 should work. Both +hard-float armhf/gnueabihf and soft-float armel/gnueabi builds are +provided. The armhf builds for Raspbian are created using the Raspberry Pi +custom cross-compilation toolchain +based on gcc-arm-linux-gnueabihf and should work on ARMv6 and +ARMv7 devices running Debian or Raspbian. The armel builds are built +using the gcc-arm-linux-gnueabi toolchain provided by Ubuntu and +currently target ARMv7.

+

Windows 64 work is still stalling, we would welcome a volunteer +to handle that.

+
+
+

Highlights

+
    +
  • JIT support for ARM, architecture versions 6 and 7, hard- and soft-float ABI
  • +
  • Stacklet support for ARM
  • +
  • Support for os.statvfs and os.fstatvfs on unix systems
  • +
  • Improved logging performance
  • +
  • Faster sets for objects
  • +
  • Interpreter improvements
  • +
  • During packaging, compile the CFFI based TK extension
  • +
  • Pickling of numpy arrays and dtypes
  • +
  • Subarrays for numpy
  • +
  • Bugfixes to numpy
  • +
  • Bugfixes to cffi and ctypes
  • +
  • Bugfixes to the x86 stacklet support
  • +
  • Fixed issue 1533: fix an RPython-level OverflowError for space.float_w(w_big_long_number).
  • +
  • Fixed issue 1552: GreenletExit should inherit from BaseException.
  • +
  • Fixed issue 1537: numpypy __array_interface__
  • +
  • Fixed issue 1238: Writing to an SSL socket in PyPy sometimes failed with a "bad write retry" message.
  • +
+

Cheers,

+

David Schneider for the PyPy team.

+
+
+
+
+
+ + Anonymous wrote on 2013-08-02 03:42: +
+
+

What about gevent support in this release? i am waiting for full support to switch to pypy on production

+
+
+
+
+ + Armin Rigo wrote on 2013-08-02 08:02: +
+
+

Some issues with gevent were fixed. You need to try it out and report any remaining issues, if any.

+
+
+
+
+ + Unknown wrote on 2013-08-02 08:43: +
+
+

If i read well, you did not use any ThumbEE instructions for your Arm support ? So there is room for improvement ?

+
+
+
+
+ + Armin Rigo wrote on 2013-08-02 09:44: +
+
+

ThumbEE is deprecated nowadays.

+
+
+
+
+ + Unknown wrote on 2013-08-07 15:12: +
+
+

Has cdecimal been backported into either version of PyPy yet? If not, any near-term plan to do so?

+
+
+
+
+ + Armin Rigo wrote on 2013-08-08 08:07: +
+
+

cdecimal is purely a speed gain. On PyPy the pure Python decimal.py is accelerated by the JIT, though it is probably possible to gain some small extra factor by rewriting it directly in RPython.

If your problem is merely that project X has listed cdecimal in its dependencies, then we could add a "cdecimal.egg-info" file that says "yup, it's installed" and be done (assuming that the API is really the same one as decimal.py).

+
+
+
+
+ + Amaury Forgeot d'Arc wrote on 2013-08-08 23:18: +
+
+

cdecimal is actually based on a C library (libmpdec). Maybe a ffi-based binding could give interesting results.

+
+
+
+
+ + Anonymous wrote on 2013-08-16 12:00: +
+
+

Importing sqlite3 incurs a huge delay in the latest armhf jit nightly (15 August).

+
+
+
+
+ + Anonymous wrote on 2013-08-26 12:55: +
+
+

Will PyPy PPA be updated? https://launchpad.net/~pypy/+archive/ppa

+
+
+
+ +

PyPy Demo Evening in London, August 27, 2013

+ +
+

As promised in the London sprint announcement we are organising a PyPy demo +evening during the London sprint on Tuesday, August 27 2013, 18:30-19:30 (BST). The +description of the event is below. If you want to come, please register on the +Eventbrite page.

+
+

PyPy is a fast Python VM. Maybe you've never used PyPy and want to find out +what use it might be for you? Or you and your organisation have been using it +and you want to find out more about how it works under the hood? If so, this +demo session is for you!

+

Members of the PyPy team will give a series of lightning talks on PyPy: its +benefits; how it works; research currently being undertaken to make it +faster; and unusual uses it can be put to. Speakers will be available +afterwards for informal discussions. This is the first time an event like +this has been held in the UK, and is a unique opportunity to speak to core +people. Speakers confirmed thus far include: Armin Rigo, Maciej Fijałkowski, +Carl Friedrich Bolz, Lukas Diekmann, Laurence Tratt, Edd Barrett.

+

The venue for this talk is the Software Development Team, King's College +London. The main entrance is on the Strand, from where the room for the event +will be clearly signposted. Travel directions can be found at +https://www.kcl.ac.uk/campuslife/campuses/directions/strand.aspx

+

If you have any questions about the event, please contact Laurence Tratt

+
+

PyPy3 2.1 beta 1

+ +
+

We're pleased to announce the first beta of the upcoming 2.1 release of
+PyPy3. This is the first release of PyPy which targets Python 3 (3.2.3)
+compatibility.

+

We would like to thank all of the people who donated to the py3k proposal
+for supporting the work that went into this and future releases.

+

You can download the PyPy3 2.1 beta 1 release here:

+
https://pypy.org/download.html#pypy3-2-1-beta-1
+
+

Highlights

+
    +
  • The first release of PyPy3: support for Python 3, targeting CPython 3.2.3!
      +
    • There are some known issues including performance regressions (issues
      #1540 & #1541) slated to be resolved before the final release.
    • +
    +
  • +
+
+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for
+CPython 2.7.3 or 3.2.3. It's fast due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows
+32. Also this release supports ARM machines running Linux 32bit - anything with
ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard,
+Chromebook, Cubieboard, etc.) that supports VFPv3 should work.

+

Windows 64 work is still stalling and we would welcome a volunteer to handle
+that.

+
+
+

How to use PyPy?

+

We suggest using PyPy from a virtualenv. Once you have a virtualenv
+installed, you can follow instructions from pypy documentation on how
+to proceed. This document also covers other installation schemes.

+

Cheers,
+the PyPy team

+
+
+
+
+
+ + Arne Babenhauserheide wrote on 2013-07-31 08:47: +
+
+

This is *really* cool!

Thank you for realizing pypy for python3! This should make it much easier to continue work on one of my projects (it was on hold, because pypy made it much faster, but I had to convert from python3 to python2 for running it, and that became a maintenance nightmare.

+
+
+
+
+ + Anonymous wrote on 2013-08-02 11:30: +
+
+

So how does one build PyPy3? It doesn't seem to be documented anywhere.

+
+
+
+
+ + Anonymous wrote on 2013-08-02 12:02: +
+
+

Sorry never mind. I thought it was being developed in the same codebase, but now I realize there's a separate branch for PyPy3 that must be used to build the Python3 version.

+
+
+
+ +

PyPy 2.1 beta 2

+ +
+

We're pleased to announce the second beta of the upcoming 2.1 release of PyPy.
+This beta adds one new feature to the 2.1 release and contains several bugfixes listed below.

+

You can download the PyPy 2.1 beta 2 release here:

+
https://pypy.org/download.html
+
+

Highlights

+
    +
  • Support for os.statvfs and os.fstatvfs on unix systems.
  • +
  • Fixed issue 1533: fix an RPython-level OverflowError for space.float_w(w_big_long_number).
  • +
  • Fixed issue 1552: GreenletExit should inherit from BaseException.
  • +
  • Fixed issue 1537: numpypy __array_interface__
  • +
  • Fixed issue 1238: Writing to an SSL socket in pypy sometimes failed with a "bad write retry" message.
  • +
  • +distutils: copy CPython's implementation of customize_compiler, don't call
    +split on environment variables, honour CFLAGS, CPPFLAGS, LDSHARED and
    +LDFLAGS.
  • +
  • During packaging, compile the CFFI tk extension.
  • +
+
+
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for
+CPython 2.7.3. It's fast due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows
+32. Also this release supports ARM machines running Linux 32bit - anything with
ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard,
+Chromebook, Cubieboard, etc.) that supports VFPv3 should work.

+

Windows 64 work is still stalling, we would welcome a volunteer
+to handle that.

+
+
+

How to use PyPy?

+

We suggest using PyPy from a virtualenv. Once you have a virtualenv
+installed, you can follow instructions from pypy documentation on how
+to proceed. This document also covers other installation schemes.

+

Cheers,
+The PyPy Team.

+
+
+

PyPy San Francisco Sprint July 27th 2013

+ +
+

The next PyPy sprint will be in San Francisco, California. It is a public
+sprint, suitable for newcomers. It will run on Saturday July 27th.

+

Some possible things people will be hacking on at the sprint:

+
    +
  • running your software on PyPy
  • +
  • making your software fast on PyPy
  • +
  • improving PyPy's JIT
  • +
  • improving Twisted on PyPy
  • +
  • any exciting stuff you can think of
  • +
+

If there are newcomers, we'll run an introduction to hacking on PyPy.

+

Location
+The sprint will be held at the Rackspace Office:

+

620 Folsom St, Ste 100

+

The doors will open at 10AM and the sprint will run until 6PM.

+
+
+
+
+ + Garen wrote on 2013-07-26 04:29: +
+
+

s/2012/2013/;

+
+
+
+
+ + Anonymous wrote on 2013-07-30 11:39: +
+
+

You think you might get more folks if you gave more than 24 hours notice?

Just saying...

+
+
+
+ +

PyPy London Sprint (August 26 - September 1 2013)

+ +
+

The next PyPy sprint will be in London, United Kingdom for the first +time. This is a fully public sprint. PyPy sprints are a very good way +to get into PyPy development and no prior PyPy knowledge is necessary.

+

Goals and topics of the sprint

+

For newcomers:

+
    +
  • bring your application/library and we'll help you port it to PyPy, +benchmark and profile
  • +
  • come and write your favorite missing numpy function
  • +
  • help us work on developer tools like jitviewer
  • +
+

We'll also work on:

+
    +
  • refactoring the JIT optimizations
  • +
  • STM and STM-related topics
  • +
  • anything else attendees are interested in
  • +
+

Exact times

+

The work days should be August 26 - September 1 2013 (Monday-Sunday). +The official plans are for people to arrive on the 26th, and +to leave on the 2nd. There will be a break day in the middle. +We'll typically start at 10:00 in the morning.

+

Location

+

The sprint will happen within a room of King's College's Strand +Campus in Central London, UK. There are some travel instructions on how to +get there. We are being hosted by Laurence Tratt and the Software +Development Team.

+

Demo Session

+

If you don't want to come to the full sprint, but still want to chat a +bit, we are planning to have a demo session on Tuesday August 27. We +will announce this separately on the blog. If you are interested, please +leave a comment.

+

Registration

+

If you want to attend, please register by adding yourself to the +"people.txt" file in Mercurial:

+
+https://bitbucket.org/pypy/extradoc/
+https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/london-2013
+
+

or on the pypy-dev mailing list if you do not yet have check-in rights:

+
+https://mail.python.org/mailman/listinfo/pypy-dev
+
+

Remember that you may need a (insert country here)-to-UK power adapter. +Please note that the UK is not within the Schengen zone, so non-EU and +non-Switzerland citizens may require a specific visa. Please check travel +regulations. Also, the UK uses pound sterling (GBP).

+
+
+
+
+ + griff wrote on 2013-07-19 15:05: +
+
+

I'd be up for joining Andrew :)

+
+
+
+
+ + Unknown wrote on 2013-07-19 15:06: +
+
+

Cannot quite get a week off for this, but would be very interested in the demo session on the Tuesday.

+
+
+
+
+ + Daniel wrote on 2013-07-22 11:06: +
+
+

I would be very interested in the demo session on the Tuesday 27th.

+
+
+
+
+ + Carin Robert wrote on 2013-08-24 07:57: +
+
+

Does the demo session happen on August 27th only? What are the timings?

+
+
+
+
+ + Armin Rigo wrote on 2013-08-24 10:42: +
+
+

@Carin: https://morepypy.blogspot.ch/2013/08/preliminary-london-demo-evening-agenda.html

+
+
+
+ +

Software Transactional Memory lisp experiments

+ +
+
+

As covered in the previous blog post, the STM subproject of PyPy has been +back on the drawing board. The result of this experiment is an STM-aware +garbage collector written in C. This is finished by now: thanks to Armin's +and Remi's work, we have a fully functional garbage collector and an STM system +that can be used from any C program with enough effort. Using it is more than +a little mundane, since you have to insert write and read barriers by hand +everywhere in your code that reads or writes to garbage collector controlled +memory. In the PyPy integration, this manual work is done automatically +by the STM transformation in the interpreter.

+

However, to experiment some more, we created a minimal +lisp-like/scheme-like interpreter +(called Duhton) that closely follows CPython's implementation strategy. +For anyone familiar with CPython's source code, it should be pretty +readable. This interpreter works like a normal and very basic lisp variant; +however, it comes with a transaction builtin that lets you spawn transactions +using the STM system. We implemented a few demos that let you play with the +transaction system. All the demos are running without conflicts, which means +there are no conflicting writes to global memory and hence the demos are very +amenable to parallelization. They exercise:

+
    +
  • arithmetics - demo/many_sqare_roots.duh +
  • +
  • read-only access to globals - demo/trees.duh +
  • +
  • read-write access to local objects - demo/trees2.duh +
  • +
+

The latter ones are very similar to the classic gcbench. STM-aware +Duhton can be found in the stmgc repo, while the STM-less Duhton, +which uses refcounting, can be found in the duhton repo under the base +branch.

+

Below are some benchmarks. Note that this is a little like comparing apples to +oranges, since the single-threaded duhton uses a refcounting GC vs. a generational +GC for the STM version. Future pypy benchmarks will compare more apples to apples. +Moreover, none of the benchmarks has any conflicts. Time is the total time +that the benchmark took (not the CPU time) and there was very little variation +in the consecutive runs (definitely below 5%).

+ +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
benchmark | 1 thread (refcount) | 1 thread (stm) | 2 threads | 4 threads
square | 1.9s | 3.5s | 1.8s | 0.9s
trees | 0.6s | 1.0s | 0.54s | 0.28s
trees2 | 1.4s | 2.2s | 1.1s | 0.57s
+

As you can see, the slowdown for STM vs single thread is significant +(1.8x, 1.7x, 1.6x respectively), but still lower than 2x. However, running +on multiple threads parallelizes the problem almost perfectly.

+

While this is a significant milestone, we hope the next blog post will cover +an STM-enabled pypy that's fully working, with JIT work ongoing.

+

Cheers,
+fijal on behalf of Remi Meier and Armin Rigo

+

+
+
+
+
+
+ + Anonymous wrote on 2013-07-12 13:06: +
+
+

I hacked a bit; inserted likely hint on early exit on spinlock acquisition, Haswell xacquire/xrelease hints on spinlock acquisition and release, and compiled with Haswell optimized flags.

Resulting scaling from 1 to 4 threads for tests were 1.92, 1.87 and 1.88. I think that's already quite close to 2.

I think this is OK, but not extraordinary.

+
+
+
+
+ + Anonymous wrote on 2013-07-12 13:12: +
+
+

Just to clarify my above comment: those were average factors of scaling per doubling of threads. So, 4-thread version ran actually 3.67, 3.50 and 3.54 times faster than single-threaded version.

+
+
+
+
+ + Armin Rigo wrote on 2013-07-12 13:15: +
+
+

Cool that you hacked on it! Note however that spinlock acquisition is not a blocker in these examples --- we implement STM mostly without locks, and locks are acquired rarely. Running independent code without getting STM conflicts means that each thread will in practice only acquire its own lock. And a single global lock is used for major GC --- but there, the large amount of work done means that using the Haswell xacquire/xrelease hints is just counterproductive.

"Resulting scaling from 1 to 4 threads" doesn't mean anything, as in some examples it scales perfectly, and in other examples it doesn't scale at all (as expected).

+
+
+
+
+ + Anonymous wrote on 2013-07-12 13:39: +
+
+

All your arguments are valid, and I didn't really expect much from hinting, just decided to try. It would seem that Haswell is still inching towards higher multicore scalability - probably thanks to improved atomic and fence ops in general. It's a benefit for those workloads that should conceptually scale well...

+
+
+
+
+ + Glen Newton wrote on 2013-07-13 18:19: +
+
+

You really need to go above 4 threads: 8,16,32, and 64 at least. Then plot out the overhead of the STM related to this level of threading. If your benchmark is too small, alter it so that it makes sense to try and solve it with 64 threads.

+
+
+
+
+ + Armin Rigo wrote on 2013-07-14 06:31: +
+
+

@glen: we're focusing right now on the machines we have, which are standard Intels with 4, 8, or at most 12 cores. I believe it is interesting too, and it's what people have right now in their own desktop or laptop computers. Obviously the scalability to larger numbers of cores is important as well, but we can't simply disregard any result involving less than 64 cores.

+
+
+
+
+ + Anonymous wrote on 2013-07-17 17:20: +
+
+

This is a really great news.

Wish you all the best with further work!

+
+
+
+ +

PyPy 2.1 beta

+ +
+

We're pleased to announce the first beta of the upcoming 2.1 release of PyPy. This beta contains many bugfixes and improvements, including numerous improvements to the numpy in pypy effort. The main feature is that the ARM processor support is no longer considered alpha level.

+We would like to thank the Raspberry Pi Foundation for supporting the work to finish PyPy's ARM support.


+You can download the PyPy 2.1 beta release here:

+
+https://pypy.org/download.html +
+
+

+
+

+

+Highlights

+
    +
  • Bugfixes to the ARM JIT backend, so that ARM is now an officially
    +supported processor architecture
  • +
  • Stacklet support on ARM
  • +
  • Interpreter improvements
  • +
  • Various numpy improvements
  • +
  • Bugfixes to cffi and ctypes
  • +
  • Bugfixes to the stacklet support
  • +
  • Improved logging performance
  • +
  • Faster sets for objects
  • +
+
+
+

+
+

+

+What is PyPy?

+PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7.3. It's fast due to its integrated tracing JIT compiler. This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows 32. Also this release supports ARM machines running Linux 32bit - anything with ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard, Chromebook, Cubieboard, etc.) that supports VFPv3 should work. Both hard-float armhf/gnueabihf and soft-float armel/gnueabi builds are provided. armhf builds for Raspbian are created using the Raspberry Pi
custom cross-compilation toolchain based on gcc-arm-linux-gnueabihf and should work on ARMv6 and ARMv7 devices running Debian or Raspbian. armel builds are built using the gcc-arm-linux-gnueabi toolchain provided by Ubuntu and currently target ARMv7.

+Windows 64 work is still stalling, we would welcome a volunteer to handle that.
+
+

+
+

+

+How to use PyPy?

+We suggest using PyPy from a virtualenv. Once you have a virtualenv installed, you can follow instructions from pypy documentation on how to proceed. This document also covers other installation schemes.

+Cheers,

+the PyPy team.
+
+
+
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-29.html b/blog/index-29.html new file mode 100644 index 000000000..9eca3182e --- /dev/null +++ b/blog/index-29.html @@ -0,0 +1,1533 @@ + + + + + + +PyPy (old posts, page 29) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

PyPy 2.2 - Incrementalism

+ +
+

We're pleased to announce PyPy 2.2, which targets version 2.7.3 of the Python language. The main highlight of this release is the introduction of the incremental garbage collector, sponsored by the Raspberry Pi Foundation.
+This release also contains several bugfixes and performance improvements.
+You can download the PyPy 2.2 release here:

+
https://pypy.org/download.html
We would like to thank our donors for the continued support of the PyPy project. We showed quite a bit of progress on all three projects (see below) and we're slowly running out of funds. Please consider donating more so we can finish those projects! The three projects are:
    +
  • Py3k (supporting Python 3.x): the release PyPy3 2.2 is imminent.
  • +
  • STM (software transactional memory): a preview will be released very soon, as soon as we fix a few bugs
  • +
  • NumPy: the work done is included in the PyPy 2.2 release. More details below.
  • +
+
+

What is PyPy?

PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (pypy 2.2 and cpython 2.7.2 performance comparison) due to its integrated tracing JIT compiler.
+This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows 32, or ARM (ARMv6 or ARMv7, with VFPv3).
+Work on the native Windows 64 is still stalling, we would welcome a volunteer to handle that.
+
+

Highlights

+
    +
  • Our Garbage Collector is now "incremental". It should avoid almost all pauses due to a major collection taking place. Previously, it would pause the program (rarely) to walk all live objects, which could take arbitrarily long if your process is using a whole lot of RAM. Now the same work is done in steps. This should make PyPy more responsive, e.g. in games. There are still other pauses, from the GC and the JIT, but they should be on the order of 5 milliseconds each.
  • +
  • The JIT counters for hot code were never reset, which meant that a process running for long enough would eventually JIT-compile more and more rarely executed code. Not only is it useless to compile such code, but as more compiled code means more memory used, this gives the impression of a memory leak. This has been tentatively fixed by decreasing the counters from time to time.
  • +
  • NumPy has been split: now PyPy only contains the core module, called _numpypy. The numpy module itself has been moved to https://bitbucket.org/pypy/numpy and numpypy disappeared. You need to install NumPy separately with a virtualenv: pip install git+https://bitbucket.org/pypy/numpy.git; or directly: git clone https://bitbucket.org/pypy/numpy.git; cd numpy; pypy setup.py install.
  • +
  • non-inlined calls have less overhead
  • +
  • Things that use sys.settrace are now JITted (like coverage)
  • +
  • JSON decoding is now very fast (JSON encoding was already very fast)
  • +
  • various buffer copying methods experience speedups (like list-of-ints to int[] buffer from cffi)
  • +
  • We finally wrote (hopefully) all the missing os.xxx() functions, including os.startfile() on Windows and a handful of rare ones on Posix.
  • +
  • numpy has a rudimentary C API that cooperates with cpyext +
  • +
Cheers,
+Armin Rigo and Maciej Fijalkowski
+
+
+
+
+ + Armin Rigo wrote on 2013-11-14 11:52: +
+
+

The Win32 build is here, thanks Matti! https://bitbucket.org/pypy/pypy/downloads/pypy-2.2-win32.zip

+
+
+
+
+ + foobie42 wrote on 2013-11-14 14:23: +
+
+

Congrats! adb push pypypypy /sdcard/!

+
+
+
+
+ + Anonymous wrote on 2013-11-14 19:38: +
+
+

@foobie42 that's what I've done just a second ago! Gotta unpack raspbian chroot zip now...

+
+
+
+
+ + Wilfred wrote on 2013-11-16 10:55: +
+
+

Is speed.pypy.org still updated? The second graph on https://speed.pypy.org/ only shows 2.0 beta and trunk, and https://speed.pypy.org/comparison/ doesn't offer 2.1 or 2.2 either.

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-11-16 11:56: +
+
+

I managed to update it, check it out now

+
+
+
+
+ + Unknown wrote on 2014-02-24 21:39: +
+
+

Do you have plans to support python 3.3 features?

+
+
+
+ +

Py3k status update #12

+ +
+

This is the 12th status update about our work on the py3k branch, which we
+can work on thanks to all of the people who donated to the py3k proposal.

+

Here's an update on the recent progress:

+
    +
  • Thank you to everyone who has provided initial feedback on the PyPy3 2.1 beta
    +1 release. We've gotten a number of bug reports, most of which have been
    +fixed.
  • +
  • As usual, we're continually keeping up with changes from the default
    +branch. Oftentimes these merges come at a cost (conflicts and/or
    +reintegration of py3k changes) but occasionally we get goodies for free, such
    +as the recent JIT optimizations and incremental garbage collection.
  • +
  • We've been focusing on re-optimizing Python 2 int sized (machine sized)
    +integers:
  • +
+

We have a couple of known, notable speed regressions in the PyPy3 beta release
+vs regular PyPy. The major one is with Python 2.x int sized (or machine
+sized) integers.

+

Python 3 drops the distinction between int and long types. CPython 3.x
+accomplishes this by removing the old int type entirely and renaming the long
+type to int. Initially, we've done the same for PyPy3 for the sake of
+simplicity and getting everything working.

+

However PyPy's JIT is capable of heavily optimizing these machine sized integer
+operations, so this came with a regression in performance in this area.

+

We're now in the process of solving this. Part of this work also involves some
+house cleaning on these numeric types which also benefits the default branch.

+

cheers,
+Phil

+
+
+
+
+ + Armin Rigo wrote on 2013-11-13 08:33: +
+
+

We should note that the re-optimization is different than CPython's. In the latter they use a "long" implementation which they heavily optimized for the common case of small integers. In PyPy instead we use two really different implementations (like "int" and "long" on Python 2); they just happen to be exposed at the user level with the same Python type in Python 3.

+
+
+
+
+ + Anonymous wrote on 2013-11-13 21:58: +
+
+

I just have to say, the PyPy team is doing a great job.

Well done guys!

+
+
+
+
+ + Alessandro wrote on 2014-01-04 05:47: +
+
+

I know nothing on pypy, but I'm interested. I have a doubt: Will the PyPy version with python 3 support leverage all of the progress of the python 2 pypy version?

Like for example, will current numpypy be able to work on PyPy3k ?

+
+
+
+ +

Making coverage.py faster under PyPy

+ +
+

If you've ever tried to run your programs with coverage.py under PyPy,
+you've probably experienced some incredible slowness. Take this simple
+program:

+
def f():
+    return 1
+
+
+def main():
+    i = 10000000
+    while i:
+        i -= f()
+
+main()
+
+

Running time coverage.py run test.py five times, and looking at the best
+run, here's how PyPy 2.1 stacks up against CPython 2.7.5:

+ +++++ + + + + + + + + + + + + + + + + + +
PythonTimeNormalized to CPython
CPython 2.7.53.879s1.0x
PyPy 2.153.330s13.7x slower
+

Totally ridiculous. I got turned onto this problem because on one of my
+projects CPython takes about 1.5 minutes to run our test suite on the build
+bot, but PyPy takes 8-10 minutes.

+

So I sat down to address it. And the results:

+ +++++ + + + + + + + + + + + + + + + + + + + + + + +
PythonTimeNormalized to CPython
CPython 2.7.53.879s1.0x
PyPy 2.153.330s13.7x slower
PyPy head1.433s2.7x faster
+

Not bad.

+
+

Technical details

+

So how'd we do it? Previously, using sys.settrace() (which coverage.py
+uses under the hood) disabled the JIT. Except it didn't just disable the JIT,
+it did it in a particularly insidious way — the JIT had no idea it was being
+disabled!

+

Instead, every time PyPy discovered that one of your functions was a hotspot,
+it would start tracing to observe what the program was doing, and right when it
+was about to finish, coverage would run and cause the JIT to abort. Tracing
+is a slow process, it makes up for it by generating fast machine code at the
+end, but tracing is still incredibly slow. But we never actually got to the
+"generate fast machine code" stage. Instead we'd pay all the cost of tracing,
+but then we'd abort, and reap none of the benefits.

+

To fix this, we adjusted some of the heuristics in the JIT, to better show it
+how sys.settrace(<tracefunc>) works. Previously the JIT saw it as an opaque
+function which gets the frame object, and couldn't tell whether or not it
+messed with the frame object. Now we let the JIT look inside the
<tracefunc> function, so it's able to see that coverage.py isn't
+messing with the frame in any weird ways, it's just reading the line number and
+file path out of it.

+

I asked several friends in the VM implementation and research field if they
+were aware of any other research into making VMs stay fast when debugging tools
+like coverage.py are running. No one I spoke to was aware of any (but I
+didn't do a particularly exhaustive review of the literature, I just tweeted at
+a few people), so I'm pleased to say that PyPy is quite possibly the first VM
+to work on optimizing code in debugging mode! This is possible because of our
+years spent investing in meta-tracing research.

+
+

Happy testing,
+Alex

+
+
+
+
+ + John Doe wrote on 2013-10-26 20:40: +
+
+

No, you're not the first to make this pretentious mistake.

What's the report for code that was actually eliminated by optimizations? Was it covered? Was it not?

+
+
+
+
+ + Anonymous wrote on 2013-10-27 17:53: +
+
+

You misunderstand John Doe. The coverage report is for the user's Python code, which isn't optimized, eliminated, or otherwise modified. The PyPy speedups come from a clever reimplementation of the interpreter that runs the user's Python code, and this article was explaining how they found and fixed a big slowdown that happens to be triggered by a common test-related library.

+
+
+
+
+ + Armin Rigo wrote on 2013-10-27 19:37: +
+
+

@John Doe: sadly, we fail to understand exactly what part of the blog post you're answering to in your sentence "No, you're not the first to make this pretentious mistake". Can you please give more context and elaborate a bit?

+
+
+
+
+ + John M. Camara wrote on 2013-10-27 21:09: +
+
+

@Armin: I believe John Doe is talking about the last paragraph as I believe the JVM also does not disable optimizations when using debug tools.

If this is the case then his comment is silly as Alex clearly stated he didn't do an exhaustive search.

+
+
+
+
+ + Unknown wrote on 2013-10-30 07:17: +
+
+

This sounds similar to the -Og setting of GCC, which enables all optimizations which do not interfere with debugging.

+
+
+
+ +

Update on STM

+ +
+

Hi all,

+

The sprint in London was a lot of fun and very fruitful. In the last +update on STM, Armin was working on improving and specializing the +automatic barrier placement. There is still a lot to do in that area, +but that work is merged now. Specializing and improving barrier placement +is still to be done for the JIT.

+

But that is not all. Right after the sprint, we were able to squash +the last obvious bugs in the STM-JIT combination. However, the performance +was nowhere near to what we want. So until now, we fixed some of the most +obvious issues. Many come from RPython erring on the side of caution +and e.g. making a transaction inevitable even if that is not strictly +necessary, thereby limiting parallelism. Another problem came from +increasing counters every time a guard fails, which caused transactions +to conflict on these counter updates. Since these counters do not have +to be completely accurate, we update them non-transactionally now with +a chance of small errors.

+

There are still many such performance issues of various complexity left +to tackle: we are nowhere near done. So stay tuned or contribute :)

+ +

Performance

+

Now, since the JIT is all about performance, we want to at least +show you some numbers that are indicative of things to come. +Our set of STM benchmarks is very small unfortunately +(something you can help us out with), so this is +not representative of real-world performance. We tried to +minimize the effect of JIT warm-up in the benchmark results.

+

The machine these benchmarks were executed on has 4 physical +cores with Hyper-Threading (8 hardware threads).

+

Raytracer from stm-benchmarks: +Render times in seconds for a 1024x1024 image:

+ +++++ + + + + + + + + + + + + + + + + + + + + + + +
InterpreterBase time: 1 thread8 threads (speedup)
PyPy-2.12.472.56 (0.96x)
CPython81.173.4 (1.1x)
PyPy-STM50.210.8 (4.6x)
+

For comparison, disabling the JIT gives 148s on PyPy-2.1 and 87s on +PyPy-STM (with 8 threads).

+

Richards from PyPy repository on the stmgc-c4 +branch: +Average time per iteration in milliseconds:

+ +++++ + + + + + + + + + + + + + + + + + + + + + + +
InterpreterBase time: 1 thread8 threads (speedup)
PyPy-2.115.615.4 (1.01x)
CPython239237 (1.01x)
PyPy-STM371116 (3.2x)
+

For comparison, disabling the JIT gives 492ms on PyPy-2.1 and 538ms on +PyPy-STM.

+ +

Try it!

+

All this can be found in the PyPy repository on the stmgc-c4 +branch. +Try it for yourself, but keep in mind that this is still experimental +with a lot of things yet to come. Only Linux x64 is supported right +now, but contributions are welcome.

+

You can download a prebuilt binary from here: +https://bitbucket.org/pypy/pypy/downloads/pypy-oct13-stm.tar.bz2 +(Linux x64 Ubuntu >= 12.04). This was made at revision bafcb0cdff48.

+ +

Summary

+

What the numbers tell us is that PyPy-STM is, as expected, +the only one of the three interpreters where multithreading gives a large +improvement in speed. What they also tell us is that, obviously, the +result is not good enough yet: it still takes longer on an 8-threaded +PyPy-STM than on a regular single-threaded PyPy-2.1. However, as you +should know by now, we are good at promising speed and delivering it... +years later :-)

+

But it has been two years already since PyPy-STM started, and this is +our first preview of the JIT integration. Expect major improvements +soon: with STM, the JIT generates code that is completely suboptimal in +many cases (barriers, allocation, and more). Once we improve this, the +performance of the STM-JITted code should come much closer to PyPy 2.1.

+

Cheers

+

Remi & Armin

+
+
+
+
+ + tobami wrote on 2013-10-16 21:14: +
+
+

To see a multithreading speed up in a python interpreter is awesome!

For next update, I would suggest to do the benchmarking turning off hyperthreading and measuring 1, 2 and 4 threads. That would give a better picture of how the STM implementation scales with threads/cores.

+
+
+
+
+ + Mak Sim wrote on 2013-10-17 09:22: +
+
+

Guys you are doing great job!

+
+
+
+
+ + Anonymous wrote on 2013-10-17 13:34: +
+
+

STM | Société de transport de Montréal ?

+
+
+
+
+ + Anonymous wrote on 2013-10-17 17:07: +
+
+

STM stands for Software Transactional Memory and is a way to run multiple non-conflicting tasks at the same time and make it appear as if they had run in sequence.

+
+
+
+
+ + LKRaider wrote on 2013-10-23 21:45: +
+
+

A bit off-topic, but just came across this paper:

"Speculative Staging for Interpreter Optimization
(...)
-- we report that our optimization makes the CPython interpreter up to more than four times faster, where our interpreter closes the gap between and sometimes even outperforms PyPy's just-in-time compiler."
https://arxiv.org/abs/1310.2300

+
+
+
+ +

Incremental Garbage Collector in PyPy

+ +
+
+
+ +

Hello everyone.

+

We're pleased to announce that as of today, +the default PyPy comes with a GC that has much smaller pauses than yesterday.

+

Let's start with explaining roughly what GC pauses are. In CPython each +object has a reference count, which is incremented each time we create +references and decremented each time we forget them. This means that objects +are freed each time they become unreachable. That is only half of the story +though. First note that when the last reference to a large tree of +objects goes away, you have a pause: all the objects are freed. Your +program is not progressing at all during this pause, and this pause's +duration can be arbitrarily large. This occurs at deterministic times, +though. But consider code like this:

+
+class A(object):
+     pass
+
+a = A()
+b = A()
+a.item = b
+b.item = a
+del a
+del b
+
+

This creates a reference cycle. It means that while we deleted references to +a and b from the current scope, they still have a reference count of 1, +because they point to each other, even though the whole group has no references +from the outside. CPython employs a cyclic garbage collector which is used to +find such cycles. It walks over all objects in memory, starting from some known +roots, such as type objects, variables on the stack, etc. This solves the +problem, but can create noticeable, nondeterministic GC pauses as the heap +becomes large and convoluted.

+

PyPy essentially has only the cycle finder - it does not bother with reference +counting, instead it walks alive objects every now and then (this is a big +simplification, PyPy's GC is much more complex than this). Although this might +sound like a missing feature, it is really one of the reasons why PyPy is so +fast, because at the end of the day the total time spent in managing the +memory is lower in PyPy than CPython. However, as a result, PyPy also has the +problem of GC pauses.

+

To alleviate this problem, which is essential for +applications like games, we started to work on incremental GC, which spreads +the walking of objects and cleaning them across the execution time in smaller +intervals. The work was sponsored by the Raspberry Pi foundation, started +by Andrew Chambers and finished by Armin Rigo and Maciej Fijałkowski.

+
+
+

Benchmarks

+

Everyone loves benchmarks. We did not measure any significant speed difference +on our quite extensive benchmark suite on speed.pypy.org. The main +benchmark that we used for other comparisons was translating the topaz +ruby interpreter using various versions of PyPy and CPython. The exact +command was python <pypy-checkout>/bin/rpython -O2 --rtype targettopaz.py. +Versions:

+
    +
  • topaz - dce3eef7b1910fc5600a4cd0afd6220543104823
  • +
  • pypy source - defb5119e3c6
  • +
  • pypy compiled with minimark (non-incremental GC) - d1a0c07b6586
  • +
  • pypy compiled with incminimark (new, incremental GC) - 417a7117f8d7
  • +
  • CPython - 2.7.3
  • +
+

The memory usage of CPython, PyPy with minimark and PyPy with incminimark is +shown here. Note that this benchmark is quite bad for PyPy in general, the +memory usage is higher and the amount of time taken is longer. This is due +to the JIT warmup being both memory hungry and inefficient (see below). +But first, the new GC is not worse than the old one.

+ +
+

EDIT: Red line is CPython, blue is incminimark (new), green is minimark (old)

+ +

The image was obtained by graphing the output of memusage.py.

+

However, the GC pauses are significantly smaller. For PyPy the way to +get GC pauses is to measure time between start and stop while running stuff +with PYPYLOG=gc-collect:log pypy program.py, for CPython, the magic +incantation is gc.set_debug(gc.DEBUG_STATS) and parsing the output. +For what it is worth, the average and total for CPython, as well as the total +number of events are not directly comparable since it only shows the cyclic +collector, not the reference counts. The only comparable thing is the +number of long pauses and their duration. In the table below, pause duration +is sorted into 8 buckets, each meaning "below that or equal to the threshold". +The output is generated using the gcanalyze tool.

+

CPython:

+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + +
150.1ms300.2ms450.3ms600.5ms750.6ms900.7ms1050.8ms1200.9ms
54175321101
+

PyPy minimark (non-incremental GC):

+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + +
216.4ms432.8ms649.2ms865.6ms1082.0ms1298.4ms1514.8ms1731.2ms
2714646533
+

PyPy incminimark (new incremental GC):

+ ++++++++++ + + + + + + + + + + + + + + + + + + + + + + +
15.7ms31.4ms47.1ms62.8ms78.6ms94.3ms110.0ms125.7ms
25512122410002
+

As we can see, while there is still work to be done (the 100ms ones could +be split among several steps), we did improve the situation quite drastically +without any actual performance difference.

+

Note about the benchmark - we know it's a pretty extreme case of JIT +warmup, we know we suck on it, we're working on it and we're not afraid of +showing PyPy is not always the best ;-)

+
+
+

Nitty gritty details

+

Here are some nitty gritty details for people really interested in +Garbage Collection. This was done as a patch to "minimark", our current +GC, and called "incminimark" for now. The former is a generational +stop-the-world GC. New objects are allocated "young", which means that +they initially live in the "nursery", a special zone of a few MB of +memory. When the nursery is full, a "minor collection" step moves the +surviving objects out of the nursery. This can be done quickly (a few +milliseconds) because we only need to walk through the young objects that +survive --- usually a small fraction of all young objects; and also by +far not all objects that are alive at this point, but only the young +ones. However, from time to time this minor collection is followed by a +"major collection": in that step, we really need to walk all objects to +classify which ones are still alive and which ones are now dead +("marking") and free the memory occupied by the dead ones ("sweeping"). +You can read more details here.

+

This "major collection" is what gives the long GC pauses. To fix this +problem we made the GC incremental: instead of running one complete +major collection, we split its work into a variable number of pieces and +run each piece after every minor collection for a while, until there are +no more pieces. The pieces are each doing a fraction of marking, or a +fraction of sweeping. It adds some few milliseconds after each of these +minor collections, rather than requiring hundreds of milliseconds in one +go.

+

The main issue is that splitting the major collections means that the +main program is actually running between the pieces, and so it can +change the pointers in the objects to point to other objects. This is +not a problem for sweeping: dead objects will remain dead whatever the +main program does. However, it is a problem for marking. Let us see +why.

+

In terms of the incremental GC literature, objects are either "white", +"gray" or "black". This is called tri-color marking. See for example +this blog post about Rubinius, or this page about LuaJIT or the wikipedia description. The +objects start as "white" at the beginning of marking; become "gray" when +they are found to be alive; and become "black" when they have been fully +traversed. Marking proceeds by scanning grey objects for pointers to +white objects. The white objects found are turned grey, and the grey +objects scanned are turned black. When there are no more grey objects, +the marking phase is complete: all remaining white objects are truly +unreachable and can be freed (by the following sweeping phase).

+

In this model, the important part is that a black object can never point +to a white object: if the latter remains white until the end, it will be +freed, which is incorrect because the black object itself can still be +reached. How do we ensure that the main program, running in the middle +of marking, will not try to write a pointer to a white object into a black +object? This requires a "write barrier", i.e. a piece of code that runs +every time we set a pointer into an object or array. This piece of code +checks if some (hopefully rare) condition is met, and calls a function +if that is the case.

+

The trick we used in PyPy is to consider minor collections as part of +the whole, rather than focus only on major collections. The existing +minimark GC had always used a write barrier of its own to do its job, +like any generational GC. This existing write barrier is used to detect +when an old object (outside the nursery) is modified to point to a young +object (inside the nursery), which is essential information for minor +collections. Actually, although this was the goal, the actual write +barrier code is simpler: it just records all old objects into which we +write any pointer --- to a young or old object. As we found out over +time, doing so is not actually slower, and might actually be a +performance improvement: for example, if the main program does a lot of +writes into the same old object, we don't need to check over and over +again if the written pointer points to a young object or not. We just +record the old object in some list the first time, and that's it.

+

The trick is that this unmodified write barrier works for incminimark +too. Imagine that we are in the middle of the marking phase, running +the main program. The write barrier will record all old objects that +are being modified. Then at the next minor collection, all surviving +young objects will be moved out of the nursery. At this point, as we're +about to continue running the major collection's marking phase, we +simply add to the list of pending gray objects all the objects that we +just considered --- both the objects listed as "old objects that are +being modified", and the objects that we just moved out of the nursery. +A fraction of the former list were black objects; so this means that +they are turned back from the black to the gray color. This technique +implements nicely, if indirectly, what is called a "backward write +barrier" in the literature. The backwardness is about the color that +needs to be changed in the opposite of the usual direction "white -> +gray -> black", thus making more work for the GC. (This is as opposed +to "forward write barrier", where we would also detect "black -> white" +writes but turn the white object gray.)

+

In summary, I realize that this description is less about how we turned +minimark into incminimark, and more about how we differ from the +standard way of making a GC incremental. What we really had to do to +make incminimark was to write logic that says "if the major collection +is in the middle of the marking phase, then add this object to the list +of gray objects", and put it at a few places throughout minor +collection. Then we simply split a major collection into increments, +doing marking or sweeping of some (relatively arbitrary) number of +objects before returning. That's why, after we found that the existing +write barrier would do, it was not much actual work, and could be done +without major changes. For example, not a single line from the JIT +needed adaptation. All in all it was relatively painless work. ;-) +

+

Cheers,
armin and fijal

+
+
+
+
+
+
+
+ + H* wrote on 2013-10-15 14:24: +
+
+

Nice work! :)

+
+
+
+
+ + Unknown wrote on 2013-10-15 19:10: +
+
+

Thank you for this nice explanation.

Which mechanism do you use for not adding twice an old object in the list of modified old objects?

+
+
+
+
+ + René Dudfield wrote on 2013-10-15 21:56: +
+
+

Thank you! thank you! thank you! Game dev on pypy just leveled up!

+
+
+
+
+ + Anonymous wrote on 2013-10-15 22:08: +
+
+

Very clever! But eh, your graphs show that your program is using 2-3x the memory of CPython. How much faster is your program overall in exchange for this hugely larger memory usage?

+
+
+
+
+ + Armin Rigo wrote on 2013-10-16 07:00: +
+
+

@François: a flag on the object. All old objects have this flag initially, and we use it to detect if the write barrier must trigger. We remove it when the write barrier has triggered once. We re-add it during the following minor collection.

@Anonymous: this program is slower on PyPy too. The point of the benchmark is to show that incminimark gives the same results as minimark, and to show that the JIT has bad cases. Running the same program for a much longer time (5-10x) lets PyPy slowly catch up and eventually beat CPython by a factor 2. The memory usage is evening out at around 4 or 4.5GB (and I'd expect even larger examples to show lower consumption on PyPy, but that's mostly a guess).

+
+
+
+
+ + Anonymous wrote on 2013-10-16 10:03: +
+
+

Thanks for moving Python forward!

How does incminimark compare to Azul C4 JVM GC and Hotspot's G1 GC?
In other words are there strong guarantees that for big heap sizes e.g. 12 GB the GC pauses will not exceed some value e.g. 100ms?

+
+
+
+
+ + Anonymous wrote on 2013-10-16 10:21: +
+
+

Sounds like great progress, but I hope you understand that even 15-30ms is way too much for games. That's 1-2 frames. It needs to be an order of magnitude less to ensure smooth FPS.

Do you have plans to give the program any say in whether the GC should strive for low latency vs. high throughput?

+
+
+
+
+ + vdp wrote on 2013-10-16 11:13: +
+
+

Great writeup, explaining that kind of concept in a clear way is not easy. And well done on the unequivocal improvements :)

@Anonymous Yes you'll still miss some frames, but compared to a 1 second pause, pypy suddenly became usable for games. 55fps (over what duration did those 25K collections happen ?) is not perfect, but most users won't notice. That said, it *would* be nice to be able to tune latency vs throughput.

+
+
+
+
+ + Armin Rigo wrote on 2013-10-16 12:36: +
+
+

@Anonymous: our incminimark comes with no serious strong guarantee. I still think it's enough for most games, say, if "almost all" the pauses are around 10ms. It's also tweakable (see the PYPY_GC_* environment variables documented in rpython/memory/gc/incminimark.py, and try to call something like gc.collect(1) at the end of each frame).

Anyway, at around the same time scale is the time spent JITting, which also causes apparent pauses in the program. I think that fixing it all with really strong guarantees is a much, much harder problem. CPython doesn't give any guarantee either, as explained at the start of the blog post.

+
+
+
+
+ + Anonymous wrote on 2013-10-16 15:02: +
+
+

@Armin Rigo: Yeah, it's no use pushing GC pauses much lower than other pauses, but that just means other things need improving as well. ;) If I had to draw an arbitrary line, I'd say half a frame (i.e. 8ms for 60fps, 4ms for 120fps 3D) is probably a good target for the maximum.

The thing with CPython is that you can turn the GC off and still have everything non-cyclic collected. So with enough attention to detail you can avoid GC pauses completely.

BTW, is any work ongoing with regard to fully concurrent GC?

+
+
+
+
+ + Armin Rigo wrote on 2013-10-16 15:14: +
+
+

You *cannot* avoid GC pauses in CPython: see the first paragraph of the blog post. You can only make the GC pauses deterministic, by disabling the cyclic collector. Then you can hack the program as needed to reduce GC pauses if there are some.

+
+
+
+
+ + Unknown wrote on 2013-10-16 22:14: +
+
+

Thanks for this really educational and accessible explanation - it's rare to find such a concise and clear piece of writing that a non expert can understand on the subject of GC.

+
+
+
+
+ + Unknown wrote on 2013-10-21 06:17: +
+
+

This needs visualization of processes to win Wikipedia article of the month.

+
+
+
+
+ + Michael Hudson-Doyle wrote on 2013-10-22 02:29: +
+
+

You can also get arbitrarily long "gc" pauses in CPython by removing the last reference to some deeply nested data structure...

+
+
+
+
+ + Dima Q wrote on 2013-11-10 13:07: +
+
+

Wow PyPy keeps paying off!
I am so glad you guys have time (and hopefully funding) to push the dynamic language world forward!

+
+
+
+
+ + Franck wrote on 2013-11-21 12:19: +
+
+

If you fork() the whole process and do the marking on the frozen forked copy (on write) then you can be fully incremental without pauses, as long as you've got enough spare system memory compared to process size (as the main process keeps growing while you're marking and the pathological case of copy on write is 2x, however unlikely).

+
+
+
+ +

Numpy Status Update

+ +
+

Hi everyone

+Thanks to the people who donated money to the numpy proposal, here is what I've been working on recently :

+- Fixed conversion from a numpy complex number to a python complex number
+- Implement the rint ufunc
+- Make numpy.character usable as a dtype
+- Fix ndarray(dtype=str).fill()
+- Various fixes on boolean and fancy indexing

+Cheers
+Romain

+
+

PyCon South Africa & sprint

+ +
+

Hi all,

+ +

For those of you that happen to be from South Africa: don't miss +PyCon ZA 2013, next October 3rd and 4th! +Like last year, a few of us will be there. There will be the first talk +about STM getting ready (a +blog post about that should follow soon).

+ +

Moreover, general sprints will continue on the weekend (5th and 6th). +Afterwards, Fijal will host a longer PyPy sprint (marathon?) with me +until around the 21st. You are welcome to it as well! Write to the mailing list or to fijal directly (fijall +at gmail.com), or simply in comments of this post.

+ +

--- Armin

+
+
+
+
+ + Anonymous wrote on 2013-10-09 21:49: +
+
+

Hey lads, any chance of a 64-bit arm pypy build?

now that hardware is finally generally available...

I'm sure someone at the conference has the hw, perhaps already rooted?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-10-09 21:52: +
+
+

we don't have access to 64bit ARM. feel free to help us. also it's quite a bit of work

+
+
+
+
+ + Nickolas wrote on 2013-10-10 06:17: +
+
+

Hey Armin

Thanks for the awesome presentations :-)

I'm very excited to try it out soon. I was wondering, would it not be useful to try and get the "with atomic" statement at the very least working on regular CPython? (just operating on the GIL, or simulated with a lock). This could smooth over migration somewhat?

Also, thanks for your live demo of cffi, It is so much simpler than ctypes :-)

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-10-10 07:06: +
+
+

Hi Nikolas.

with atomic can be trivially available on CPython (and not do anything beyond have a lock)

+
+
+
+ +
+
+
+ + Anonymous wrote on 2013-08-30 20:41: +
+
+

Is there a better link to the slides? Watching them on the blog is difficult

+
+
+
+
+ + Anonymous wrote on 2013-08-30 22:28: +
+
+

Clicking the full screen button makes them easy to read for me. Maybe try that?

+
+
+
+
+ + Anonymous wrote on 2013-09-03 16:53: +
+
+

Could there perhaps be videos from the presentation?

big up for good work!

+
+
+
+
+ + Unknown wrote on 2013-09-10 14:22: +
+
+

i know, such questions probably get on your nerves, but do you think you will ever reach a 10x average on speed.pypy.org? :)

+
+
+
+ +

NumPy road forward

+ +
+
+

Hello everyone.

+

This is the roadmap for numpy effort in PyPy as discussed on the London sprint. +First, the highest on our priority list is to finish the low-level part +of the numpy module. What +we'll do is to finish the RPython part of numpy and provide a pip installable +numpypy repository that includes the pure python part of Numpy. This would +contain the original Numpy with a few minor changes.

+

Second, we need to work on the JIT support that will make NumPy on PyPy +faster. In detail:

+
    +
  • reenable the lazy loop evaluation
  • +
  • optimize bridges, which is depending on optimizer refactorings
  • +
  • SSE support
  • +
+

On the compatibility front, there were some independent attempts into +making the following stuff working:

+
    +
  • f2py
  • +
  • C API (in fact, PyArray_* API is partly present in the nightly builds of +PyPy)
  • +
  • matplotlib (both using PyArray_* API and embedding CPython runtime in PyPy)
  • +
  • scipy
  • +
+

In order to make all of the above happen faster, it would be helpful to raise +more funds. You can donate to PyPy's NumPy project on our website. Note +that PyPy is a member of SFC which is a 501(c)(3) US non-profit, so donations +from US companies can be tax-deducted.

+

Cheers,
+fijal, arigo, ronan, rguillebert, anto and others

+
+
+
+
+
+
+ + Pim wrote on 2013-08-27 16:41: +
+
+

Thanks for the update. I'm hoping the other presentations can also be summarized here for those who couldn't attend this (very interesting) mini-conference.

+
+
+
+
+ + Dan wrote on 2013-08-28 20:11: +
+
+

Thanks for the info! I can't wait to play with it.
I only have a very rudimentary understanding of numpypy and pypy, so please forgive if this is a stupid question:

Will there be a way to do additional high level optimization steps before the JIT level?

I.e. elimination of temporaries for matrices, expression optimization and so on?

Basically check if the expression should be handled by the pypy JIT, or if it should be passed on to something like numexpr
https://code.google.com/p/numexpr/
that will itself hand over the code to optimized vendor libraries?

I am a bit concerned that while the pypy JIT optimizations are without question very impressive and probably close to optimal to what can be done for generic code, the performance issues with numerical code are very different.

Any JIT will (please correct me if I am wrong, this would be a significant breakthrough) never be able to even come close to what a vendor library like the MKL can do.

The comparison will be even more to the disadvantage of the JIT if one uses a library like Theano that runs the code on the GPU.

For my work, beating c for speed is not enough anymore, the challenges are how to run the computation in parallel, how to call optimized libraries without pain and how to use a GPU without re-writing the entire program and learning about a completely new system.

Will libraries like numexpr, numba and theano be able to run under pypy, and will it eventually be possible to automatically hand over numerical expressions automatically to these libraries?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-09-11 14:26: +
+
+

Hi Dan.

Yes, pypy will do the removal of temporary matrices, this is a very basic optimization that we had, but disabled for a while to simplify development.

I don't think numba, numexpr or theano would ever work on PyPy (I would ask their authors though), but I personally think we can match their performance or even exceed it, time will tell though.

Cheers,
fijal

+
+
+
+
+ + Anonymous wrote on 2013-09-14 00:40: +
+
+

Hi Maciej,

Thanks for the answer.

A pypy that matches what numba or theano can do, all without doing any extra annotation, would not only be a huge breakthrough for pypy, it will be a gigantic step forward for the entire numerics community.

Thank you and keep up the good work,


Dan

+
+
+
+
+ + Armin Rigo wrote on 2013-09-14 13:31: +
+
+

@Anonymous: I'd like to point out again that all this NumPy work would get more traction and faster development within PyPy if we could manage to interest (and get contributions from) anyone that comes from the scientific community. Ourselves, we are looking at this topic as a smallish part of the whole Python world, so we disagree (to a point) with your comment "a huge breakthrough for pypy". :-)

+
+
+
+ +

Preliminary London Demo Evening Agenda

+ +
+

We now have a preliminary agenda for the demo evening in London next week. It takes place on Tuesday, August 27 2013, 18:30-19:30 (BST) at King's College London, Strand. The preliminary agenda is as follows:

+ + +

All the talks are lightning talks. Afterwards there will be plenty of time for discussion.

+ +

There are still free spots; if you want to come, please register on the Eventbrite page. Hope to see you there!

+
+
+
+
+ + Anonymous wrote on 2013-08-20 13:35: +
+
+

Will the video of the talks be available online?

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2013-08-20 13:36: +
+
+

@Anonymous: unfortunately there are no plans to film the event, no :-(

+
+
+
+
+ + Paddy3118 wrote on 2013-08-21 15:17: +
+
+

Another request for videos of the event to be made available. Please.

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-3.html b/blog/index-3.html new file mode 100644 index 000000000..6ffd6e2af --- /dev/null +++ b/blog/index-3.html @@ -0,0 +1,971 @@ + + + + + + +PyPy (old posts, page 3) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

Python Finalizers Semantics, Part 1

+ +
+

Python's garbage collection semantics is very much historically grown and +implementation-driven. Samuele Pedroni therefore likes to call it the "'there +is no such thing as too much chocolate'-approach to GC semantics" :-). In this +two-part post series I am going to talk about the semantics of finalization +(__del__ methods) in CPython and PyPy.

+

The current behaviour is mostly all a consequence of the fact that CPython uses +reference counting for garbage collection. The first consequence is that if +several objects die at the same time, their finalizers are called in a +so-called topological order, which is a feature that some GCs have that +CPython offers by chance. This ensures that in a __del__ method, none of the +attributes of the object have had their __del__ called yet. A simple +example:

+
+class B(object):
+    def __init__(self, logfile):
+        self.logfile = logfile
+    def __del__(self):
+        self.logfile.write("done doing stuff")
+b = B(file("logfile.txt", "w"))
+
+

If the instance of B dies now, both it and the logfile are dead. They will +get their __del__s called and it's important that the file's __del__ +gets called second, because otherwise the __del__ of B would try to +write to a closed file.

+

The correct ordering happens completely automatically if you use reference +counting: Setting b to None will decref the old value of b. This reduces +the reference count of this instance to 0, so the finalizer will be called. +After the __del__ has finished, this object will be freed and all the +objects it points to decrefed as well, which decreases the reference count of +the file to 0 and calls its __del__ as well, which closes the file.

+

The behaviour of PyPy's semispace and generational GCs wasn't very nice so far: +it just called the finalizers in an essentially random order. Last week Armin +came up with a somewhat complicated algorithm that solves this by emulating +CPython's finalization order, which we subsequently implemented. So PyPy does +what you expect now! The Boehm GC does a topological ordering by default, so it +wasn't a problem there.

+

A small twist on the above is when +there is a cycle of objects involving finalizers: +In this case a topological ordering is not possible, so that CPython refuses to +guess the finalization order and puts such cycles into gc.garbage. This +would be very hard for PyPy to do, since our GC implementation is essentially +independent from the Python interpreter. The same GCs work for our other +interpreters after all too. Therefore we decided to break such a cycle at an +arbitrary place, which doesn't sound too insane. The insane thing is for +a Python program to create a cycle of objects with finalizers and depend +on the order in which the finalizers are called. Don't do that :-) (After +all, CPython wouldn't even call the finalizers in this case.)

+
+
+
+
+ + SamB wrote on 2015-03-15 05:46: +
+
+

The link to the "somewhat complicated algorithm" is a bit broken, but you can still get to it at the web archive.

+
+
+
+
+ + Armin Rigo wrote on 2015-03-30 08:07: +
+
+

Thanks, link updated.

+
+
+
+ +

PyPy presence on various conferences in the near future

+ +
+

Hello! I will have the pleasure of presenting PyPy on various conferences in the near future. They're (in chronological order): + +

+
    +
  • Studencki Festiwal Informatyczny in Krakow, POLAND 6-8 March 2008. I think this might be only interesting for polish people (website, in polish)
  • + +
  • Pycon Chicago, IL, USA. 14-17 March 2008. There should be also a PyPy sprint afterwards, including newbie-friendly tutorial, everybody is welcome to join us! (Provided that I'll get the US visa, which seems to be non-trivial issue for a polish citizen)
  • +
  • RuPy, Poznan, POLAND 13-14 April 2008 (website). This is a small but very friendly Ruby and Python conference. Last year was amazing; I can strongly recommend going there (Poznan is only 2h by train from Berlin and also has its own airport).
  • +
+ +Hope to see you at those places!

+ +Cheers,
+fijal +
+
+
+
+ + Michael Foord wrote on 2008-02-12 14:04: +
+
+

Hey - I'll be at both the Polish conferences talking about IronPython. I hope you will be talking in English!

Look forward to meeting up with you.

Michael Foord

+
+
+
+
+ + Maciej Fijalkowski wrote on 2008-02-12 14:56: +
+
+

Cheers Michael. Looking forward to see you!

At rupy definitely. At sfi it depends on them (I'll try to, also that I have noone to help me with slides in polish :)

+
+
+
+
+ + Konrad wrote on 2008-02-15 23:52: +
+
+

Hey Fijal.

I think the Academic IT Festival in Cracow would be interesting not only for polish people. Large part of the talks will be given in English.

Here's the link to the english version of the festival website: https://www.sfi.org.pl/news

Konrad Delong, SFI :)

+
+
+
+ +

Buildbots and Better Platform Support

+ +
+

In the last days we improved platform-support of PyPy's Python interpreter. +Jean-Paul Calderone has been tirelessly working for some time now on setting up a +buildbot for translating and testing PyPy. So far the basic mechanisms are +working and the buildbot is running on various machines, including some that +Michael Schneider (bigdog) lets us use, one of them being a Windows machine, +the other one with a 64bit Linux (lots of thanks to those two, you are +awesome!).

+

What is still missing is a nice way to visualize the test results to quickly see +which tests have started failing on which platforms. There is a prototype +already, which still needs some tweaking.

+

The availability of these machines has triggered some much-needed bug-fixing in +PyPy to make our Python interpreter work better on Windows and on 64 bit Linux. +Maciek and Michael Schneider worked on this quite a bit last week, with the +result that PyPy supports many more extension modules now on Windows and 64 bit +Linux. Since we now have the buildbot the hope is that the support also won't +disappear soon :-).

+
+
+
+
+ + Unknown wrote on 2008-02-06 20:37: +
+
+

Cool
I just found your blog and now I am going to read it every day:)
I love reading about the progress you guys are making on PyPy.

+
+
+
+ +

RPython can be faster than C

+ +
+

(yes, C as in language, not c as in speed of light). I looked recently at the great computer language shootout, for some benchmarks and to make some speed comparisons. I took this benchmark, modified it to be rpythonic-enough and compared speeds. The code is here (the only change from the Python version was to create a class instead of tuple, so actually this version is more OO). Also the benchmark is very likely flawed because it favours better GCs :).
+So, here we go: + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Language:Time of run (for N=14):
Python version running on Python 2.5.1, distribution25.5s
Python version running on PyPy with generational GC45.5s
Python with psyco20s
RPython translated to C using PyPy's generational GC0.42s
compiling the Haskell version with GHC 6.6.11.6s
compiling the C version with gcc 4.1.2 -O3 -fomit-frame-pointer0.6s
+

+ +Also worth noticing is that when using psyco with the original version (with tuples) it is very fast (2s).

+ +So, PyPy's Python interpreter is 80% slower than CPython on this (not too horrible), but RPython is 40% faster than gcc here. Cool. The result is mostly due to our GC, which also proves that manual memory-management can be slower than garbage collection in some situations. Please note that this result does not mean that RPython is meant for you. It requires a completely different mindset than the one used to program in Python. Don't say you weren't warned! :-) +
+
+
+
+ + Jonathan Ellis wrote on 2008-01-21 16:02: +
+
+

"It requires a completely different mindset than the one used to program in Python."

Can you elaborate? "RPython for Python programmers" would be an excellent addition to the docs or this blog. :)

+
+
+
+
+ + Michael Foord wrote on 2008-01-21 16:14: +
+
+

I agree with Jonathan. There are many Python programmers who would *love* to be able to write Python extensions with RPython.

I know that this is already possible, but there are two issues:

* Lack of documentation on programming with RPython (I realise that this is a moving target)
* Last I heard, the refcounting implementation made RPython extensions inefficient

If these two issues were resolved (or mostly resolved) then a lot more people might start using the PyPy toolchain.

Aside from my grousing, it looks like PyPy is becoming more impressive by the day. Congratulations.

+
+
+
+
+ + Leonardo Santagada wrote on 2008-01-21 16:20: +
+
+

As of today you can't write CPython extensions in RPython.

Why not ask for the great computer language shootout to include RPython as one of their benchmarking languages? This could be a good and free advertising for the pypy project.

+
+
+
+
+ + Silveira Neto wrote on 2008-01-21 16:55: +
+
+

mmm
0.42
42
The answer...

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2008-01-21 17:06: +
+
+

Hi Michael,

Leonardo is correct, the extension compiler was removed from SVN in November. We had many discussions about this step, but eventually it turned out to be necessary for many reasons. The extcompiler never was that useful in the first place because the produced extensions weren't fast (one of the reasons being the bad refcounting indeed).

The other reasons were that the extcompiler was impossible to maintain and was actually preventing progress, because it kept code alive that we wanted to get rid of.

So at the moment you cannot use PyPy any more to produce CPython extensions, only standalone programs.

It's completely possible that the extcompiler will be reborn in the future, but at the moment our priorities are really to make PyPy a good Python and not do tons of things on the side.

Cheers,

Carl Friedrich

+
+
+
+
+ + Antonio Cuni wrote on 2008-01-21 18:06: +
+
+

I would also say nowadays it's already possible to write extension modules in RPython... but just for PyPy, not for CPython :-).

Jokes apart, if someone is really interested in writing part of their application in RPython (despite our warnings :-)), targeting PyPy could be an interesting alternative, as long as you don't need external libraries and the speed gain is more than what you lose in other areas where PyPy is actually slower.

+
+
+
+
+ + Justin wrote on 2008-01-21 21:17: +
+
+

I think a lot of people are interested in using RPython for performance reasons. But about nobody will leave CPython atm, because extension modules are not working.

At the moment, I wouldn't leave CPython since all I am doing is heavily based on scipy. And so my only option is (a) to wait PyPy being able to compile extensions for CPython or (b) PyPy making use of CPython extensions.

As long as this is not going to happen, I probably will not use RPython for serious projects. :/

+
+
+
+
+ + Isaac Gouy wrote on 2008-01-25 17:11: +
+
+ "Also the benchmark is very likely flawed because it favours better GCs :)"

Why would that be a flaw? Note: this is an adaptation of a benchmark for testing GC


Leonardo Santagada said "Why not ask for the great computer language shootout to include RPython ..."

FAQ Why don't you include language X? +
+
+
+
+ + Unknown wrote on 2008-01-25 21:03: +
+
+

Once the RPython was translated to C by PyPy how did you compile the C?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2008-01-25 21:31: +
+
+

> Why would that be a flaw? Note: this is an adaptation of a benchmark for testing GC

I know, but I realised it after posting :) (We even have the original somewhere around to compare GCs). Also, honestly a lot of python versions rely on libraries written in C, hence it took me a while to find one that is "pure enough".

> Once the RPython was translated to C by PyPy how did you compile the C?

With the very same options as bare gcc. -O3 -fomit-frame-pointer

+
+
+
+
+ + Anonymous wrote on 2008-01-28 20:18: +
+
+

Did you try any of the other computer language shootout benchmarks?

+
+
+
+
+ + _ wrote on 2008-02-14 03:14: +
+
+

More objects != OO.

Perhaps you meant to say that it more closely reflects the domain?

No, I don't know how I ended up on this blog post.

+
+
+
+
+ + Ariel Balter wrote on 2009-10-15 04:25: +
+
+

How does RPython compare to Python Shedskin?

+
+
+
+
+ + Patric Dexheimer wrote on 2010-09-01 15:12: +
+
+

"Can you elaborate? "RPython for Python programmers" would be an excellent addition to the docs or this blog. :)"

+1 on this.

Greetings from Brazil!

+
+
+
+ +

PyPy.NET goes Windows Forms

+ +
+ +

After having spent the last few days on understanding PyPy's JIT, +today I went back hacking the clr module. As a result, it is now +possible to import and use external assemblies from pypy-cli, +including Windows Forms

+

Here is a screenshot of the result you get by typing the following at +the pypy-cli interactive prompt:

+
+>>>> import clr
+>>>> clr.AddReferenceByPartialName("System.Windows.Forms")
+>>>> clr.AddReferenceByPartialName("System.Drawing")
+>>>> from System.Windows.Forms import Application, Form, Label
+>>>> from System.Drawing import Point
+>>>>
+>>>> frm = Form()
+>>>> frm.Text = "The first pypy-cli Windows Forms app ever"
+>>>> lbl = Label()
+>>>> lbl.Text = "Hello World!"
+>>>> lbl.AutoSize = True
+>>>> lbl.Location = Point(100, 100)
+>>>> frm.Controls.Add(lbl)
+>>>> Application.Run(frm)
+
+

Unfortunately at the moment you can't do much more than this, because +we still miss support for delegates and so it's not possible to +handle events. Still, it's a step in the right direction :-).

+
+

Improve .NET Integration

+ +
+

A while ago Amit Regmi, a student from Canada, started working on the +clr module improvements branch as a university project.

+

During the sprint Carl Friedrich, Paul and I worked more on it and +brought it to a mergeable state.

+

It adds a lot of new features to the clr module, which is the +module that allows integration between pypy-cli (aka PyPy.NET) and +the surrounding .NET environment:

+
+
    +
  • full support to generic classes;
  • +
  • a new importer hook, allowing things like from System import +Math and so on;
  • +
  • .NET classes that implement IEnumerator are treated +as Python iterators; i.e. it is possible to iterate over them +with a for loop.
  • +
+
+

This is an example of a pypy-cli session:

+
+>>>> from System import Math
+>>>> Math.Abs(-42)
+42
+>>>> from System.Collections.Generic import List
+>>>> mylist = List[int]()
+>>>> mylist.Add(42)
+>>>> mylist.Add(43)
+>>>> mylist.Add("foo")
+Traceback (most recent call last):
+  File "<console>", line 1, in <interactive>
+TypeError: No overloads for Add could match
+>>>> mylist[0]
+42
+>>>> for item in mylist: print item
+42
+43
+
+

This is still to be considered an alpha version; there are a few known +bugs and probably a lot of unknown ones :-), so don't expect it to +work on every occasion. Still, it's a considerable step towards the real +world :-).

+
+

Crashing Other People's Compilers

+ +
+

Over the years PyPy has (ab?)used various external software for different +purposes, and we've discovered bugs in nearly all of them, mostly by pushing them +to their limits. For example, many compilers are not happy with 200MB of +source in one file. The Microsoft C compiler has a limit of 65536 lines of code +per file and the CLI was raising "System.InvalidProgramException: Method +pypy.runtime.Constants:.cctor () is too complex.", where too complex probably +means "too long". Just for fun, today we collected all projects we could think of +in which we found bugs:

+
+ +
+

So one could say that PyPy is really just the most expensive debugging tool +ever :-).

+
+
+
+
+ + Michael Hudson-Doyle wrote on 2008-01-19 10:53: +
+
+

You know, the one piece of external software we most depend on is one we haven't found bugs in: gcc (at least, I can't remember any problems). That's pretty impressive.

I think you can probably add gdb to the list though.

+
+
+
+
+ + Alok wrote on 2008-01-20 13:16: +
+
+

Can you, maybe, give a few examples of what you did. Linking to items about them if you wrote about it.

+
+
+
+
+ + Unknown wrote on 2008-01-21 21:46: +
+
+

I'd be interested in knowing which projects were the most receptive to the bug reports.

+
+
+
+
+ + Carl Friedrich Bolz-Tereick wrote on 2008-01-21 22:24: +
+
+

Hi Brett,

I think the project most receptive to bug reports is LLVM, where bugs that we find are usually fixed within a small number of days. I think in general Open Source projects react quite well, as you would expect. A negative example is graphviz, which still segfaults despite us producing a patch which fixes the problem.

Microsoft proves to be completely unapproachable, it seems you have to pay them if you want to report a bug (should be the other way round, of course :-)).

+
+
+
+
+ + Unknown wrote on 2008-01-21 23:19: +
+
+

@Carl:

Thanks for the info, Carl. I have been contemplating trying to rely on them for compiling Python for testing purposes, especially with clang coming along (although I am waiting for them to address a bug I found =). Good to know they are responsive.

And yes, it is really unfortunate that Microsoft doesn't make reporting bugs easy, but I guess no one wants to deal with the number of reports they would most likely get. =)

+
+
+
+
+ + /SiD wrote on 2008-02-13 22:14: +
+
+

Regarding Microsoft bug reports, there's Connect. And I've got some degree of success with it.

+
+
+
+ +

Leysin Winter Sport Sprint Started

+ +
+ + +

The Leysin sprint started yesterday morning in the usual location. The view is spectacular (see photo), the weather mostly sunny. The following people are sprinting: +

+
    +
  • Maciej Fijalkowski
  • +
  • Armin Rigo
  • +
  • Toby Watson
  • +
  • Paul deGrandis
  • +
  • Antonio Cuni
  • +
  • Carl Friedrich Bolz
  • +
So it is a rather small sprint.

We started working on various features and performance improvements for the high level backends (JVM and .NET) and on implementing ctypes for PyPy. Later this week we plan to spend a few days on the JIT, because Anto and I both need to get into it for our respective university projects.

+
+
+
+
+ + ajaksu wrote on 2008-01-14 22:29: +
+
+

For those curious about what is going on: SVN commits

Great work, guys! :)

+
+
+
+ +

Finding GC roots: using LLVM or parsing assembler files from GCC

+ +
+

PyPy contains a framework for writing custom Garbage Collectors, and a few simple GCs have been written in this framework. A common issue with all these GCs is how to find all the stack roots, i.e. all the pointers to live GC-managed objects currently stored in local variables, in all the callers of the current function. The current solution is to maintain a custom shadow stack of roots, where all functions push and pop copies of their local variables of type "GC pointer". Clearly this is an overhead. Can we remove it?

+ +

LLVM has recently grown some support for this. By emitting markers in the LLVM source and with the help of a bit of custom C++ code, we can generate stack maps for the functions compiled by LLVM. Then, with 100% non-portable code in our framework GC's root finding algorithm, we can walk the machine stack and locate where in each stack frame LLVM stores the GC pointers. (Yes, I mean non-portable: LLVM offers no help for doing that. Maybe it will at some point, though I didn't manage to explain why this is an issue to people working on this in LLVM so far...). I've tried that approach in the llvmgcroot branch. Over the manually-managed shadow stack, this gives speed improvements which are, very roughly, on the order of 5%.

+ +

Note that this prevents some optimizations in LLVM, because it forces it to allocate all local variables of type "GC pointer" in the stack; it cannot keep them in registers and it must assume that they can be changed more or less at any time (as moving GCs do). Can we do better?

+ +

Actually, yes. We can even do better in the C backend, using a GCC hack. GCC has this nice extension: +

+
asm("bla", constraints);
+This is meant to generate assembler instructions directly from C. Internally, GCC considers the whole asm() as a single regular instruction of its intermediate language; the constraints are expressed in the same way as the constraints for all the prebuilt intermediate language instructions. They express things like input and output operands of the instruction, whether they can live in memory or in registers, whether the whole instruction has side-effects, etc. The nice thing about asm() is that it doesn't kill any optimization whatsoever in GCC - it's your job to make sure that you use the correct constraints. + +

So what I've tried in the asmgcroot branch is to use asm() as markers. In this branch, the C backend produces code like this after each function call, for each local variable containing a live GC pointer:

+ +
asm("/* GCROOT %0 */" : "=g"(localvar) : "0"(localvar) : "memory");
+ +

This causes GCC to emit the following line in the assembler file it generates:

+ +
/* GCROOT register-or-memory-containing-localvar */
+ +

I won't go into the details of the asm() line above - the constraints are just enough to make sure that GCC doesn't optimize too much, but don't prevent most optimizations from occurring. For example, the localvar can be in a register.

+ +

The assembler will just ignore the line above; it is a comment. But what we can do is write our own tool parsing the assembler files. This tool locates the /* GCROOT */ comments and follows where the register or memory location in the comment comes from (to do this it must follow the control flow and data flow of the function). This allows it to build a stack map: for each call instruction it knows exactly which registers and frame stack locations contain a live GC pointer. The stack map is then emitted in an extra assembler file that we link with the rest. As with LLVM above, the stack map is then used at run-time by non-portable code written in our GC's stack root tracker.

+ +

Yes, that's rather insane. But at least, we don't need to modify the assembler file - just read it. If GCC is too clever in its optimizations, the custom parser will get lost and complain cleanly; but I think that it is relatively safe in the sense that GCC optimizations should not be able to make the custom parser produce wrong results.

+ +

The branch is not merged because it's probably too insane to merge (not to mention, it's probably not portable to non-GCC compilers, and it is completely platform-specific). Still, it gives good results, better than the pure LLVM approach - on the order of 10% to 25% speed-ups for pypy-c.

+
+
+
+
+ + Anonymous wrote on 2008-01-11 21:18: +
+
+

How does Objective-C 2.0 handle this same problem?

+
+
+
+
+ + Armin Rigo wrote on 2008-01-12 09:04: +
+
+

Obviously it depends on the compiler, but the basic idea is that the natural place to support this is in the compiler itself. For example, instead of parsing the assembler produced by GCC, it would probably be possible to extend GCC to cleanly generate stack maps. (This is basically what I tried to do with LLVM, which gives a plug-in API to do that.)

After a bit of googling, GCC doesn't seem to support Objective-C 2.0 yet. Moreover, the current Objective-C run-time library simply uses the conservative Boehm collector.

+
+
+
+
+ + Anonymous wrote on 2008-01-15 08:28: +
+
+

ObjC 2 does not use Boehm. The collecting thread suspends other threads and conservatively scans their stacks. It picks up values in registers by querying the kernel for the suspended thread state. It depends heavily on Mach.

+
+
+
+
+ + Anonymous wrote on 2008-01-16 17:39: +
+
+

llvm-gcc fully supports inline asm, so you could use the same hack you use with GCC with your llvm backend.

Also, you might be interested in https://llvm.org/PR1917, which proposes a method of identifying GC pointers that doesn't disable most optimizations.

+
+
+
+
+ + Barry Kelly wrote on 2010-04-08 21:10: +
+
+

To Anonymous saying Objective C 2 not using Boehm: that may be true (I don't know the details), but the Boehm GC also suspends other threads, conservatively scans their stacks and picks up values in registers using the OS.

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-30.html b/blog/index-30.html new file mode 100644 index 000000000..3fadd0ea6 --- /dev/null +++ b/blog/index-30.html @@ -0,0 +1,971 @@ + + + + + + +PyPy (old posts, page 30) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

NumPy on PyPy - Progress in February

+ +
+

More progress was made on the NumPy front in the past month. On the compatibility front, we now pass ~130 more tests from NumPy's suite since the end of January. Currently, we pass 2336 tests out of 3265 tests run, with many of the failures representing portions of NumPy that we don't plan to implement in the near future (object dtypes, unicode, etc). There are still some failures that do represent issues, such as special indexing cases and failures to respect subclassed ndarrays in return values, which we do plan to resolve. There are also some unimplemented components and ufuncs remaining which we hope to implement, such as nditer and mtrand. Overall, the most common array functionality should be working.

+Additionally, I began to take a look at some of the loops generated by our code. One widely used loop is dot, and we were running about 5x slower than NumPy's C version. I was able to optimize the dot loop and also the general array iterator to get us to ~1.5x NumPy C time on dot operations of various sizes. Further progress in this area could be made by using CFFI to tie into BLAS libraries, when available. Also, work remains in examining traces generated for our other loops and checking for potential optimizations.

+To try out PyPy + NumPy, grab a nightly PyPy and install our NumPy fork. Feel free to report comments/issues to IRC, our mailing list, or bug tracker. Thanks to the contributors to the NumPy on PyPy proposal for supporting this work.

+Cheers,
+Brian

+
+
+
+
+ + Anonymous wrote on 2014-03-09 06:05: +
+
+

Thanks! It would be easier to repost this if the title contained pypy: "numpy in pypy - progress in February"

+
+
+
+
+ + Canesin wrote on 2014-03-17 12:33: +
+
+

It would be great if the first performance optimizations were actually wrappers to BLAS; there is an outstanding BSD-licensed BLAS at https://github.com/xianyi/OpenBLAS

+
+
+
+
+ + Armin Rigo wrote on 2014-03-17 20:50: +
+
+

I believe the "performance optimizations" mentioned in the blog post are unrelated to BLAS. BLAS is about calling an external library. You can't optimize that, you just merely call it. The performance optimizations are about things like computing the matrix "a + b + c", which can be done without computing the intermediate result "a + b".

+
+
+
+
+ + Canesin wrote on 2014-03-18 11:16: +
+
+

Armin, I agree with you. What I'm trying to say is that maybe to make the BLAS interface is going to be very easy, give great performance and people will use it most of the time if you bundle it.

+
+
+
+ +

Py3k status update #13

+ +
+

This is the 13th status update about our work on the py3k branch, which we
+can work on thanks to all of the people who donated to the py3k proposal.

+

We're just finishing up a cleanup of int/long types. This work helps the py3k
+branch unify these types into the Python 3 int and restore JIT compilation of
+machine sized integers.

+

This cleanup also removes multimethods from these types. PyPy has
+historically used a clever implementation of multimethod dispatch for declaring
+methods of the __builtin__ types in RPython.

+

This multimethod scheme provides some convenient features for doing this,
+however we've come to the conclusion that it may be more trouble than it's
+worth. A major problem of multimethods is that they generate a large amount of
+stub methods which burden the already lengthy and memory hungry RPython
+translation process. Also, their implementation and behavior can be somewhat
+complicated/obscure.

+

The alternative to multimethods involves doing the work of the type checking
+and dispatching rules in a more verbose, manual way. It's a little more work in
+the end but less magical.

+

Recently, Manuel Jacob finished a large cleanup effort of the
+unicode/string/bytearray types that also removed their multimethods. This work
+also benefits the py3k branch: it'll help with future PEP 393 (or PEP 393
+alternative) work. This effort was partly sponsored by Google's Summer of
+Code: thanks Manuel and Google!

+

Now there's only a couple major pieces left in the multimethod removal (the
+float/complex types and special marshaling code) and a few minor pieces that
+should be relatively easy.

+

In conclusion, there's been some good progress made on py3k and multimethod
+removal this winter, albeit a bit slower than we would have liked.

+

cheers,
+Phil

+
+
+
+
+ + Armin Rigo wrote on 2014-02-18 09:41: +
+
+

The str/unicode/bytearray refactoring is not completely done yet.

+
+
+
+ +

Rewrites of the STM core model -- again

+ +
+

Hi all,

+ +

A quick note about the Software Transactional Memory (STM) front.

+ +

Since the previous +post, we believe we progressed a lot by discovering an alternative +core model for software transactions. Why do I say "believe"? It's +because it means again that we have to rewrite from scratch the C +library handling STM. This is currently work in progress. Once this is +done, we should be able to adapt the existing pypy-stm to run on top of +it without much rewriting efforts; in fact it should simplify the +difficult issues we ran into for the JIT. So while this is basically +yet another restart similar to last +June's, the difference is that the work that we have already put in the PyPy +part (as opposed to the C library) remains.

+ +

You can read about the basic ideas of this new C library here. +It is still STM-only, not HTM, but because it doesn't constantly move +objects around in memory, it would be easier to adapt an HTM version. +There are even potential ideas about a hybrid TM, like using HTM but +only to speed up the commits. It is based on a Linux-only system call, remap_file_pages() +(poll: who heard about it before? :-). As previously, the work is done +by Remi Meier and myself.

+ +

Currently, the C library is incomplete, but early experiments show good +results in running duhton, +the interpreter for a minimal language created for the purpose of +testing STM. Good results means we brought down the slow-downs from +60-80% (previous version) to around 15% (current version). This number +measures the slow-down from the non-STM-enabled to the STM-enabled +version, on one CPU core; of course, the idea is that the STM version +scales up when using more than one core.

+ +

This means that we are looking forward to a result that is much better +than originally predicted. The pypy-stm has chances to run at a +one-thread speed that is only "n%" slower than the regular pypy-jit, for +a value of "n" that is optimistically 15 --- but more likely some number +around 25 or 50. This is seriously better than the original estimate, +which was "between 2x and 5x". It would mean that using pypy-stm is +quite worthwhile even with just two cores.

+ +

More updates later...

+ +

Armin

+
+
+
+
+ + Anonymous wrote on 2014-02-10 17:29: +
+
+

Did you consider existing STM libraries in your implementation? It might be worthwhile to take a look at stasis (https://code.google.com/p/stasis/) which has a pretty complete set of features.

https://www.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-2.pdf

+
+
+
+
+ + Armin Rigo wrote on 2014-02-10 20:22: +
+
+

Stasis is not really applicable here: it's a Transactional Storage system, which despite the attempt of this paper to generalize it, is not going to apply successfully in the context of PyPy.

+
+
+
+
+ + Armin Rigo wrote on 2014-02-10 20:22: +
+
+

More comments on Hacker News.

+
+
+
+
+ + Dima Tisnek wrote on 2014-02-11 13:32: +
+
+

poll response: I've heard of remap_file_pages! :)

I was wondering how to use this call when I learnt of it, but couldn't figure anything out except possibly database applications (similar) and sort algorithms (too limited). I think this call may be used when manipulating framebuffer too, there was something about having multiple mappings [to hardware] some readonly, some not.

I would like to [possibly] disagree with your statement in c7 README "Most probably, this comes with no overhead once the change is done..."

TLB cache is a limited resource and may easily be contended on large systems. Regular mmap could [in theory] use huge TLB pages, remapped individual pages cannot.

In addition there is a small penalty during first access to the remapped page, though you may consider it amortized depending on remap/reuse ratio.

Granted it's still small stuff.

Reserving one register is a cool trick, and I find it quite acceptable. It too has a small penalty, but the benefits surely outweigh those!

+
+
+
+
+ + Armin Rigo wrote on 2014-02-11 13:39: +
+
+

@Dima: Thanks for the feedback! Note that "%gs" is a special register that is usually not used: there is no direct way to read/write its actual value. It needs to be done with a syscall, at least before very recent CPUs. It can only be used in addressing instructions as an additional offset.

+
+
+
+
+ + Arne Babenhauserheide wrote on 2014-02-21 08:04: +
+
+

just 15% slower sounds wonderful!

+
+
+
+ +

NumPy Status Update - December/January

+ +
+

Work continued on the NumPy + PyPy front steadily in December and more lightly in January. The continued focus was compatibility, targeting incorrect or unimplemented features that appeared in multiple NumPy test suite failures. We now pass ~2/3 of the NumPy test suite. The biggest improvements were made in these areas:

+- Bugs in conversions of arrays/scalars to/from native types
+- Fix cases where we would choose incorrect dtypes when initializing or computing results
+- Improve handling of subclasses of ndarray through computations
+- Support some optional arguments for array methods that are used in the pure-python part of NumPy
+- Support additional attributes in arrays, array.flags, and dtypes
+- Fix some indexing corner cases that arise in NumPy testing
+- Implemented part of numpy.fft (cffti and cfftf)

+Looking forward, we plan to continue improving the correctness of the existing implemented NumPy functionality, while also beginning to look at performance. The initial focus for performance will be to look at areas where we are significantly worse than CPython+NumPy. Those interested in trying these improvements out will need a PyPy nightly, and an install of the PyPy NumPy fork. Thanks again to the NumPy on PyPy donors for funding this work.

+
+
+
+
+ + Anatoly Vostryakov wrote on 2014-02-06 21:38: +
+
+

Many thanks for your work! Looking forward to support a full functionality of numpy in pypy!

+
+
+
+
+ + Anonymous wrote on 2014-02-06 22:21: +
+
+

> We now pass ~2/3 of the NumPy test suite.

Is the test coverage of numpy high enough so that a 100% green numpypy can be considered a full port? (Honest question, I have no background information suggesting the opposite.)

+
+
+
+
+ + Anonymous wrote on 2014-02-08 18:58: +
+
+

Great news that you are making progress on numpy. I can't wait!

+
+
+
+
+ + Anonymous wrote on 2014-02-13 14:57: +
+
+

I can't wait to use Numpypy to speed up scientific analysis.

Are there any updates on using numpypy with a plotting package such as matplotlib?

+
+
+
+
+ + Armin Rigo wrote on 2014-02-18 13:43: +
+
+

https://mail.python.org/pipermail/pypy-dev/2014-February/012209.html

+
+
+
+ +

NumPy Status Update - November

+ +
+

Since the PyPy 2.2 release last month, more progress has been made on the NumPy compatibility front. Initial work has been directed by running the NumPy test suite and targeting failures that appear most frequently, along with fixing the few bugs reported on the bug tracker.

+Improvements were made in these areas:
+- Many missing/broken scalar functionalities were added/fixed. The scalar API should match up more closely with arrays now.
+- Some missing dtype functionality was added (newbyteorder, hasobject, descr, etc)
+- Support for optional arguments (axis, order) was added to some ndarray functions
+- Fixed some corner cases for string/record types

+Most of these improvements went onto trunk after 2.2 was split, so if you're interested in trying them out or running into problems on 2.2, try the +nightly.

+Thanks again to the NumPy on PyPy donors who make this continued progress possible.

+Cheers,
+Brian

+
+
+
+
+ + Anonymous wrote on 2013-12-17 13:45: +
+
+

This is fantastic news! I can't wait until I can run my numpy scripts under pypy as easily as I can my standard python scripts.

+
+
+
+ +

PyGame CFFI

+ +
+
+ +

One of the RaspberryPi's goals is to be a fun toolkit for school children (and adults!) to learn programming and electronics with. Python and pygame are part of this toolkit. Recently the RaspberryPi Foundation funded parts of the effort of porting of pypy to the Pi -- making Python programs on the Pi faster!

+

Unfortunately pygame is written as a Python C extension that wraps SDL which means performance of pygame under pypy remains mediocre. To fix this pygame needs to be rewritten using cffi to wrap SDL instead.

+

RaspberryPi sponsored a CTPUG (Cape Town Python User Group) hackathon to put together a proof-of-concept pygame-cffi. The day was quite successful - we got a basic version of the bub'n'bros client working on pygame-cffi (and on PyPy). The results can be found on github with contributions from the five people present at the sprint.

+

While far from complete, the proof of concept does show that there are no major obstacles to porting pygame to cffi and that cffi is a great way to bind your Python package to C libraries.

+

Amazingly, we managed to have machines running all three major platforms (OS X, Linux and Windows) at the hackathon so the code runs on all of them!

+

We would like to thank the Praekelt foundation for providing the venue and The Raspberry Pi foundation for providing food and drinks!

+

Cheers,
+Simon Cross, Jeremy Thurgood, Neil Muller, David Sharpe and fijal.

+
+
+
+
+
+
+ + René Dudfield wrote on 2013-12-09 14:21: +
+
+

Why not use the ctypes based pysdl2?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-12-09 16:19: +
+
+

first of all pygame depends on SDL 1. Second ctypes kinda suck and I don't quite buy its stability (especially with changing APIs, though it can be less of an issue with SDL). It's also slow on pypy

+
+
+
+
+ + René Dudfield wrote on 2013-12-09 17:09: +
+
+

Ah, ok. Very nice work anyway. It's impressive what you all managed to get done in the sprint :)

Here's some information from pygame land about where the project is heading.

SDL 1 is the past, and the SDL developers are no longer putting out releases. However, I think many people will continue to patch it up for many years. SDL 2 is the future and after many years finally has a release out (2 now). pysdl2 is part of the future of pygame. pysdl2 matches the SDL 2 API as closely as possible. A pygame API ontop of pysdl2 is the future of pygame.

ctypes is no good for some platforms like iOS, and the web and pypy apparently. Although note, that pysdl2 already 'works' on top of pypy.

https://bitbucket.org/marcusva/py-sdl2/
https://pysdl2.readthedocs.org/en/latest/


Happy hacking :)

+
+
+
+
+ + Anonymous wrote on 2013-12-09 18:56: +
+
+

Amazing - you consider a messy cffi implementation (sometimes it builds on platform X, sometimes it does not, sometimes it works, sometimes it does not) a better choice over ctypes?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-12-09 19:16: +
+
+

@Anonymous - your comment is pretty loaded, but we do think cffi is better than ctypes on all platforms, that's why we came up with cffi in the first place. I think cffi FAQ contains an answer to that.

+
+
+
+
+ + Armin Rigo wrote on 2013-12-10 09:30: +
+
+

@Rene: if pysdl2 is a bare-metal ctypes wrapper, writing a similar cffi wrapper instead should be very straightforward (even more than the current pygame-cffi). But do you know if pygame is really going that route, and if so, how soon?

+
+
+
+
+ + Unknown wrote on 2013-12-10 23:33: +
+
+

I've been looking at cffi since it was first mentioned on our Pygame mailing list. It does look promising. I see only two, buffer related, issues that need to be resolved.

First, PyPy lacks an array export mechanism comparable to the CPython PEP 3118 buffer protocol. Instead, only the NumPy Array Interface, version: 3 is available. Though Pygame supports both the Python and C sides of the interface, it relies on CPython's reference counting for timely buffer release [1]. Periodic garbage collection is too unpredictable.

Second, the cffi module does not support CPython api function calls. So a cffi Pygame could not support the buffer protocol on CPython.

A possible solution to the first issue is for PyPy to use an extended array interface that includes a PEP 3118 like buffer release callback. I am working to resolve the second issue: [Issue13797] Allow objects implemented in pure Python to export PEP 3118 buffers.

[1] Add PEP 3118 (new) buffer support to Pygame surfaces

+
+
+
+
+ + Anonymous wrote on 2013-12-15 21:32: +
+
+

Hm, I can't get this to work on Ubuntu 12.04 doing the following

virtualenv -p /usr/bin/pypy pypy
cd pypy
source bin/activate
pip install git+https://github.com/eliben/pycparser.git
pip install hg+https://github.com/eliben/pycparser.git
pip install hg+https://foss.heptapod.net/cffi/cffi
git clone https://github.com/CTPUG/pygame_cffi.git
cd pygame_cffi/
pypy
import pygame

>>>> import pygame
Traceback (most recent call last):
File "", line 1, in
File "pygame/__init__.py", line 9, in
from pygame.color import Color
File "pygame/color.py", line 3, in
from pygame._sdl import ffi, sdl
File "pygame/_sdl.py", line 6, in
ffi = cffi.FFI()
File "/home/me/Documents/python/pygame/pypy/site-packages/cffi/api.py", line 56, in __init__
import _cffi_backend as backend
ImportError: No module named _cffi_backend


dpkg -l pypy
...
ii pypy 1.8+dfsg-2 fast alternative implementation of Python - PyPy interpreter


Do I need a newer pypy? Am I missing something else?

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-12-15 21:48: +
+
+

yes, you need a vastly newer pypy

+
+
+
+
+ + Unknown wrote on 2013-12-16 18:49: +
+
+

I am +1 on porting PySDL2 to CFFI instead of pygame.

+
+
+
+
+ + Unknown wrote on 2016-05-03 01:01: +
+
+
great! what's current status of it? I really can't wait to use Pygame on a PI through pypy. +
+
+
+
+ + Armin Rigo wrote on 2016-05-04 10:16: +
+
+

Development occurs at https://github.com/CTPUG/pygame_cffi nowadays.

+
+
+
+ +

PyPy Leysin Winter Sprint (11-19th January 2014)

+ +
+

The next PyPy sprint will be in Leysin, Switzerland, for the ninth time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.

+

Goals and topics of the sprint

+
    +
  • Py3k: work towards supporting Python 3 in PyPy
  • +
  • NumPyPy: work towards supporting the numpy module in PyPy
  • +
  • STM: work towards supporting Software Transactional Memory
  • +
  • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off for ski.
  • +
+

Exact times

+

For a change, and as an attempt to simplify things, I specified the +dates as 11-19 January 2014, where 11 and 19 are travel days. We will +work full days between the 12 and the 18. You are of course allowed to +show up for a part of that time only, too.

+

Location & Accommodation

+

Leysin, Switzerland, "same place as before". Let me refresh your +memory: both the sprint venue and the lodging will be in a very spacious +pair of chalets built specifically for bed & breakfast: +https://www.ermina.ch/. The place has a good ADSL Internet connexion +with wireless installed. You can of course arrange your own lodging +anywhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue), but I definitely recommend +lodging there too -- you won't find a better view anywhere else (though +you probably won't get much worse ones easily, either :-)

+

Please confirm that you are coming so that we can adjust the +reservations as appropriate. The rate so far has been around 60 CHF a +night all included in 2-person rooms, with breakfast. There are larger +rooms too (less expensive per person) and maybe the possibility to get a +single room if you really want to.

+

Please register by Mercurial:

+
+https://bitbucket.org/pypy/extradoc/
+https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2014
+
+

or on the pypy-dev mailing list if you do not yet have check-in rights:

+
+https://mail.python.org/mailman/listinfo/pypy-dev +
+

You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around -- bring a EU-format power strip if you +have one.

+
+
+
+
+ + Pim wrote on 2014-01-29 11:57: +
+
+

Very interested to know how far you got, especially STM

+
+
+
+
+ + Armin Rigo wrote on 2014-01-29 21:24: +
+
+

I'll do a proper post about STM, but in the meantime: we progressed on STM-C7, without hitting an obstacle so far, so hopes are high :-)

+
+
+
+ +

PyPy 2.2.1 - Incrementalism.1

+ +
+

We're pleased to announce PyPy 2.2.1, which targets version 2.7.3 of the Python +language. This is a bugfix release over 2.2.

+

You can download the PyPy 2.2.1 release here:

+
+https://pypy.org/download.html +
+

What is PyPy?

+

PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.2 and cpython 2.7.2 performance comparison) +due to its integrated tracing JIT compiler.

+

This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows +32, or ARM (ARMv6 or ARMv7, with VFPv3).

+

Work on the native Windows 64 is still stalling, we would welcome a volunteer +to handle that.

+

Highlights

+

This is a bugfix release. The most important bugs fixed are:

+
    +
  • an issue in sockets' reference counting emulation, showing up +notably when using the ssl module and calling makefile().
  • +
  • Tkinter support on Windows.
  • +
  • If sys.maxunicode==65535 (on Windows and maybe OS/X), the json +decoder incorrectly decoded surrogate pairs.
  • +
  • some FreeBSD fixes.
  • +
+

Note that CFFI 0.8.1 was released. Both versions 0.8 and 0.8.1 are +compatible with both PyPy 2.2 and 2.2.1.

+

Cheers, +Armin Rigo & everybody

+
+
+
+
+ + renaud wrote on 2013-11-27 15:06: +
+
+

what about pypy3-2.2?
by the way, thank you!

+
+
+
+
+ + Armin Rigo wrote on 2013-11-28 08:58: +
+
+

Waiting for answers from https://mail.python.org/pipermail/pypy-dev/2013-November/011965.html.

+
+
+
+ +

CFFI 0.8

+ +
+

Hi all,

+ +

CFFI 0.8 for CPython (2.6-3.x) has been released.

+ +

Quick download: pip install cffi --upgrade +
Documentation: https://cffi.readthedocs.org/en/release-0.8/

+ +

What's new: a number of small fixes; ffi.getwinerror(); integrated support for C99 variable-sized structures; multi-thread safety.

+ +

--- Armin

+ +

Update: CFFI 0.8.1, with fixes on Python 3 on OS/X, and some FreeBSD fixes (thanks Tobias).

+
+

NumPy status update

+ +
+ Here is what has been happening with NumPy in PyPy in October thanks to the people who donated to the NumPyPy proposal:

The biggest change is that we shifted to using an external fork of numpy rather than a minimal numpypy module. The idea is that we will be able to reuse most of the upstream pure-python numpy components, replacing the C modules with appropriate RPython micronumpy pieces at the correct places in the module namespace.

The numpy fork should work just as well as the old numpypy for functionality that existed previously, and also include much new functionality from the pure-python numpy pieces that simply hadn't been imported yet in numpypy. However, this new functionality will not have been "hand picked" to only include pieces that work, so you may run into functionality that relies on unimplemented components (which should fail with user-level exceptions).

This setup also allows us to run the entire numpy test suite, which will help in directing future compatibility development. The recent PyPy release includes these changes, so download it and let us know how it works! And if you want to live on the edge, the nightly includes even more numpy progress made in November.

To install the fork, download the latest release, and then install numpy either separately with a virtualenv: pip install git+https://bitbucket.org/pypy/numpy.git; or directly: git clone https://bitbucket.org/pypy/numpy.git; cd numpy; pypy setup.py install.

EDIT: if you install numpy as root, you may need to also import it once as root before it works: sudo pypy -c 'import numpy'

Along with this change, progress was made in fixing internal micronumpy bugs and increasing compatibility:
    +
  • Fixed a bug with strings in record dtypes
  • +
  • Fixed a bug where the multiplication of an ndarray with a Python int or float resulted in loss of the array's dtype
  • +
  • Fixed several segfaults encountered in the numpy test suite (suite should run now without segfaulting)
  • +
+
We also began working on __array_prepare__ and __array_wrap__, which are necessary pieces for a working matplotlib module.

Cheers,
+Romain and Brian +
+
+
+
+ + Anonymous wrote on 2013-11-16 09:28: +
+
+

Hi,

Thanks for all your efforts on pypy-*, we really appreciate it!

I'm trying to compile numpy with pypy-2.2-osx64 but the building process (manual and pip) fails with:
AttributeError: 'module' object has no attribute 'get_makefile_filename'

Full build log: https://pastebin.com/S4dybCV0

Any idea how to resolve this?

Thanks,
t

+
+
+
+
+ + Maciej Fijalkowski wrote on 2013-11-16 10:06: +
+
+

Hey

Please put such reports to bugs.pypy.org so they don't get lost.

Thanks!
fijal

+
+
+
+
+ + Brian Kearns wrote on 2013-11-16 17:43: +
+
+

Installation on OS X was fixed.

+
+
+
+
+ + Sau wrote on 2014-03-12 05:47: +
+
+

I am getting an error when installing numpy for pypy 2.2.1:

https://stackoverflow.com/questions/22342769/error-when-installing-numpy-for-pypy2-2-1

+
+
+
+
+ + Sau wrote on 2014-03-12 05:47: +
+
+

I am getting an error when installing numpy for pypy 2.2.1:

https://stackoverflow.com/questions/22342769/error-when-installing-numpy-for-pypy2-2-1

+
+
+
+ +
+
+ +
+
+
+ +
+ + + + \ No newline at end of file diff --git a/blog/index-31.html b/blog/index-31.html new file mode 100644 index 000000000..cc845681d --- /dev/null +++ b/blog/index-31.html @@ -0,0 +1,1942 @@ + + + + + + +PyPy (old posts, page 31) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
+
+
+

A Field Test of Software Transactional Memory Using the RSqueak Smalltalk VM

+ +
+

+Extending the Smalltalk RSqueakVM with STM

+

by Conrad Calmez, Hubert Hesse, Patrick Rein and Malte Swart supervised by Tim Felgentreff and Tobias Pape

+

+Introduction

+

After pypy-stm we can announce that through the RSqueakVM (which used to be called SPyVM) a second VM implementation supports software transactional memory. RSqueakVM is a Smalltalk implementation based on the RPython toolchain. We have added STM support based on the STM tools from RPython (rstm). The benchmarks indicate that linear scale up is possible, however in some situations the STM overhead limits speedup.

+

The work was done as a master's project at the Software Architecture Group of Professor Robert Hirschfeld at the Hasso Plattner Institut at the University of Potsdam. We - four students - worked about one and a half days per week for four months on the topic. The RSqueakVM was originally developed during a sprint at the University of Bern. When we started the project we were new to the topic of building VMs / interpreters.

+

We would like to thank Armin, Remi and the #pypy IRC channel who supported us over the course of our project. We also like to thank Toni Mattis and Eric Seckler, who have provided us with an initial code base.

+

+Introduction to RSqueakVM

+

As the original Smalltalk implementation, the RSqueakVM executes a given Squeak Smalltalk image, containing the Smalltalk code and a snapshot of formerly created objects and active execution contexts. These execution contexts are scheduled inside the image (greenlets) and not mapped to OS threads. Thereby the non-STM RSqueakVM runs on only one OS thread.

+

+Changes to RSqueakVM

+

The core adjustments to support STM were inside the VM and transparent from the view of a Smalltalk user. Additionally we added Smalltalk code to influence the behavior of the STM. As the RSqueakVM has run in one OS thread so far, we added the capability to start OS threads. Essentially, we added an additional way to launch a new Smalltalk execution context (thread). But in contrast to the original one this one creates a new native OS thread, not a Smalltalk internal green thread.

+ +

STM (with automatic transaction boundaries) already solves the problem of concurrent access on one value as this is protected by the STM transactions (to be more precise one instruction). But there are cases where the application relies on the fact that a bigger group of changes is executed either completely or not at all (atomic). Without further information transaction borders could be in the middle of such a set of atomic statements. rstm allows to aggregate multiple statements into one higher level transaction. To let the application mark the beginning and the end of these atomic blocks (high-level transactions), we added two more STM specific extensions to Smalltalk.

+ +

+Benchmarks

+

RSqueak was executed in a single OS thread so far. rstm enables us to execute the VM using several OS threads. Using OS threads we expected a speed-up in benchmarks which use multiple threads. We measured this speed-up by using two benchmarks: a simple parallel summation where each thread sums up a predefined interval and an implementation of Mandelbrot where each thread computes a range of predefined lines.

+ +

To assess the speed-up, we used one RSqueakVM compiled with rstm enabled, but once running the benchmarks with OS threads and once with Smalltalk green threads. The workload always remained the same and only the number of threads increased. To assess the overhead imposed by the STM transformation we also ran the green threads version on an unmodified RSqueakVM. All VMs were translated with the JIT optimization and all benchmarks were run once before the measurement to warm up the JIT. As the JIT optimization is working it is likely to be adopted by VM creators (the baseline RSqueakVM did that) so that results with this optimization are more relevant in practice than those without it. We measured the execution time by getting the system time in Squeak. The results are:

+

+Parallel Sum Ten Million

+ + + +
+ +
+ +
Benchmark Parallel Sum 10,000,000
+ + + + + + + + + + + + + + + +
Thread Count RSqueak green threads RSqueak/STM green threads RSqueak/STM OS threads Slow down from RSqueak green threads to RSqueak/STM green threads Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads
1 168.0 ms 240.0 ms 290.9 ms 0.70 0.83
2 167.0 ms 244.0 ms 246.1 ms 0.68 0.99
4 167.8 ms 240.7 ms 366.7 ms 0.70 0.66
8 168.1 ms 241.1 ms 757.0 ms 0.70 0.32
16 168.5 ms 244.5 ms 1460.0 ms 0.69 0.17
+

+Parallel Sum One Billion

+ + + +
+ +
+ +
Benchmark Parallel Sum 1,000,000,000
+
+ + + + + + + + + + + + + + + + + + + + +
Thread Count RSqueak green threads RSqueak/STM green threads RSqueak/STM OS threads Slow down from RSqueak green threads to RSqueak/STM green threads Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads
1 16831.0 ms 24111.0 ms 23346.0 ms 0.70 1.03
2 17059.9 ms 24229.4 ms 16102.1 ms 0.70 1.50
4 16959.9 ms 24365.6 ms 12099.5 ms 0.70 2.01
8 16758.4 ms 24228.1 ms 14076.9 ms 0.69 1.72
16 16748.7 ms 24266.6 ms 55502.9 ms 0.69 0.44
+

+Mandelbrot Iterative

+ + + +
+ +
+ +
Benchmark Mandelbrot
+ + + + + + + + + + + + + + +
Thread Count RSqueak green threads RSqueak/STM green threads RSqueak/STM OS threads Slow down from RSqueak green threads to RSqueak/STM green threads Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads
1 724.0 ms 983.0 ms 1565.5 ms 0.74 0.63
2 780.5 ms 973.5 ms 5555.0 ms 0.80 0.18
4 781.0 ms 982.5 ms 20107.5 ms 0.79 0.05
8 779.5 ms 980.0 ms 113067.0 ms 0.80 0.01
+

+Discussion of benchmark results

+

First of all, the ParallelSum benchmarks show that the parallelism is actually paying off, at least for sufficiently large embarrassingly parallel problems. Thus RSqueak can also benefit from rstm.

+

On the other hand, our Mandelbrot implementation shows the limits of our current rstm integration. We implemented two versions of the algorithm one using one low-level array and one using two nested collections. In both versions, one job only calculates a distinct range of rows and both lead to a slowdown. The summary of the state of rstm transactions shows that there are a lot of inevitable transactions (transactions which must be completed). One reason might be the interactions between the VM and its low-level extensions, so called plugins. We have to investigate this further.

+

+Limitations

+

Although the current VM setup is working well enough to support our benchmarks, the VM still has limitations. First of all, as it is based on rstm, it has the current limitation of only running on 64-bit Linux.

+

Besides this, we also have two major limitations regarding the VM itself. First, the atomic interface exposed in Smalltalk is currently not working, when the VM is compiled using the just-in-time compiler transformation. Simple examples such as concurrent parallel sum work fine while more complex benchmarks such as chameneos fail. The reasons for this are currently beyond our understanding. Second, Smalltalk supports green threads, which are threads which are managed by the VM and are not mapped to OS threads. We currently support starting new Smalltalk threads as OS threads instead of starting them as green threads. However, existing threads in a Smalltalk image are not migrated to OS threads, but remain running as green threads.

+

+Future work for STM in RSqueak

+The work we presented showed interesting problems, we propose the following problem statements for further analysis:
    +
  • +Inevitable transactions in benchmarks. This looks like it could limit other applications too so it should be solved.
  • +
  • +Collection implementation aware of STM: The current implementation of collections can cause a lot of STM collisions due to their internal memory structure. We believe it could bear potential for performance improvements, if we replace these collections in an STM enabled interpreter with implementations with less STM collisions. As already proposed by Remi Meier, bags, sets and lists are of particular interest.
  • +
  • Finally, we exposed STM through language features such as the atomic method, which is provided through the VM. Originally, it was possible to model STM transaction barriers implicitly by using clever locks, now it's exposed via the atomic keyword. From a language design point of view, the question arises whether this is a good solution and what features an stm-enabled interpreter must provide to the user in general? Of particular interest are for example, access to the transaction length and hints for transaction borders and their performance impact.
  • +
+
    +

    +Details for the technically inclined

    +
      +
    • +Adjustments to the interpreter loop were minimal.
    • +
    • STM works on bytecode granularity, that means there is an implicit transaction border after every bytecode executed. Possible alternatives: only break transactions after certain bytecodes, break transactions on one abstraction layer above, e.g. object methods (setter, getter).
    • +
    • rstm calls were exposed using primitives (a way to expose native code in Smalltalk), this was mainly used for atomic.
    • +
    • Starting and stopping OS threads is exposed via primitives as well. Threads are started from within the interpreter.
    • +
    • For Smalltalk enabled STM code we currently have different image versions. However another way to add, load and replace code to the Smalltalk code base is required to make a switch between STM and non-STM code simple.
    • +
    +
      +

      +Details on the project setup

      +

      From a non-technical perspective, a problem we encountered was the huge roundtrip times (on our machines up to 600s, 900s with JIT enabled). This led to a tendency of bigger code changes ("Before we compile, let's also add this"), lost flow ("What were we doing before?") and different compiled interpreters in parallel testing ("How is this version different from the others?") As a consequence it was harder to test and correct errors. While this is not as much of a problem for other RPython VMs, RSqueakVM needs to execute the entire image, which makes running it untranslated even slower.

      +

      +Summary

      +

      The benchmarks show that speed up is possible, but also that the STM overhead in some situations can eat up the speedup. The resulting STM-enabled VM still has some limitations: As rstm is currently only running on 64-bit Linux the RSqueakVM is doing so as well. Even though it is possible for us now to create new threads that map to OS threads within the VM, the migration of existing Smalltalk threads keeps being problematic.

      +

      We showed that an existing VM code base can benefit of STM in terms of scaling up. Further it was relatively easy to enable STM support. This may also be valuable to VM developers considering to get STM support for their VMs.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-08-09 14:10: +
      +
      +

      "We showed that an existing VM code base can benefit of STM in terms of scaling up." I dispute this conclusion: in the benchmarks, it seems that the non-STM version is scaling up well, even better than the STM+OS-threads version. But how can the non-STM version scale at all? It shouldn't: that's a property of RPython. And why is the STM+OS-threads version faster even with just 1 thread? I think you need to answer these questions first. Right now it screams "you are running buggy benchmarks" to me.

      +
      +
      +
      +
      + + Stefan Marr wrote on 2014-08-10 09:09: +
      +
      +

      I concur with Armin, the conclusions are problematic in the light of the current numbers.

      Could you give some more details on the benchmarks? Can I find the Smalltalk code somewhere?

      Things that come to mind are details about the scheduler. In the RoarVM, that was also one of the issues (which we did not solve). The standard Squeak scheduling data structure remains unchanged I suppose? How does that interact with the STM, is it problematic that each STM thread updates this shared data structure during every scheduling operation?

      Also, more basic, are you making sure that the benchmark processes are running with highest priority (80, IIRC), to avoid interference with other processes in the image?

      On the language level, something that could also have an impact on the results is closures. How are they implemented? I suppose similar to the way the CogVM implements them? I suppose, you make sure that closures are not shared between processes?

      And finally, what kind of benchmark harness are you using? Did you have a look at SMark? (https://smalltalkhub.com/#!/~StefanMarr/SMark)
      We used that one for the RoarVM, and it provides various options to do different kinds of benchmarks, including weak-scaling benchmarks, which I would find more appropriate for scalability tests. Weak-scaling means, you increase the problem size with the number of cores. That replicates the scenario where the problem itself is not really parallelizable, but you can solve more problems at the same time in parallel. It also makes sure that each process/thread does the identical operations (if set up correctly).

      Well, all those questions aside, interesting work :) Hope to read more soon ;)

      +
      +
      +
      +
      + + Unknown wrote on 2014-08-10 20:13: +
      +
      +

      You definitely hit a really weak spot in our report... Today we investigated the ParallelSum benchmark again. So far, we've found out that it was indeed partially a problem with the priority of the benchmark process. The preliminary benchmark results make more sense now and as soon as we have stable ones we will update them.

      I'll still try to address some of your questions right now. :)

      1. Benchmark code
      I've just wrapped up the current version of our benchmarks and put them in our repository. You can find the two Squeak4.5 images at the stmgc-c7 branch of the RSqueak Repository . You can find the benchmarks in the CPB package. The Squeak4.5stm image needs the RSqueak/STM VM.

      2. Scheduler data structures
      Yes, the scheduling data structure is completely unchanged. We have only added a new subclass of Process which overwrites fork and calls a different primitive. However, these Processes are not managed by the Smalltalk scheduler, so there should be no synchronization issues here.

      3. Interference of other processes:
      This is probably the source of the "speed-up" we observe on the normal RSqueakVM. With more threads we might get a bigger portion of the total runtime. So far, the benchmarks already ran in a VM mode which disables the Smalltalk GUI thread, however in the traces we found that the event handler is still scheduled every now and then. We've done it as you suggested, Stefan, and set the priority to 80 (or 79 to not mess up the timer interrupt handler).

      4. Benchmark harness
      We actually use SMark and also made sure the timing operations of RSqueak do their job correctly. However we are probably not using SMark at its full potential.

      +
      +
      +
      +
      + + Unknown wrote on 2014-08-11 10:12: +
      +
      +

      I've just updated the benchmarks. All benchmark processes are now running with the Smalltalk process priority of 79 (80 is the highest). The single-threaded VMs now show the expected behavior.

      +
      +
      +
      +
      + + Unknown wrote on 2014-08-11 14:11: +
      +
      +

      To further clarify on the Mandelbrot benchmarks: After a discussion with Stefan, I have changed the Mandelbrot implementation. Each job now only has private data and does not read or write in any shared data structure. Still the benchmark results remain the same and we can still observe a high proportion of inevitable transactions.

      As Armin pointed out, and which would be a next step, we would need to figure out which parts of the interpreter might cause systematic conflicts.

      +
      +
      +
      + +

      PyPy-STM: first "interesting" release

      + +
      +

      Hi all,

      + +

      PyPy-STM is now reaching a point where we can say it's good enough to be +a GIL-less Python. (We don't guarantee there are no more bugs, so please +report them :-) The first official STM release:

      + +
        +
      • +pypy-stm-2.3-r2-linux64 +
        (UPDATE: this is release r2, fixing a systematic segfault at start-up on some systems) +
      • +
      +

      This corresponds roughly to PyPy 2.3 (not 2.3.1). It requires 64-bit +Linux. More precisely, this release is built for Ubuntu 12.04 to 14.04; +you can also rebuild it +from source by getting the branch stmgc-c7. You need +clang to compile, and you need a patched +version of llvm.

      + +

      This version's performance can reasonably be compared with a regular +PyPy, where both include the JIT. Thanks for following the meandering progress of PyPy-STM over the past three years --- we're finally getting somewhere really interesting! We cannot thank enough all contributors to the previous PyPy-STM money pot that made this possible. And, although this blog post is focused on the results from that period of time, I have of course to remind you that we're running a second call for donation for future work, which I will briefly mention again later.

      + +

      A recap of what we did to get there: around the start of the year we found a new model, a "redo-log"-based STM which uses a couple of hardware tricks to not require chasing pointers, giving it (in this context) exceptionally cheap read barriers. This idea was developed over the following months and (relatively) easily integrated with the JIT compiler. The most recent improvements on the Garbage Collection side are closing the gap with a regular PyPy (there is still a bit more to do there). There is some preliminary user documentation.

      + +

      Today, the result of this is a PyPy-STM that is capable of running pure Python code on multiple threads in parallel, as we will show in the benchmarks that follow. A quick warning: this is only about pure Python code. We didn't try so far to optimize the case where most of the time is spent in external libraries, or even manipulating "raw" memory like array.array or numpy arrays. To some extent there is no point because the approach of CPython works well for this case, i.e. releasing the GIL around the long-running operations in C. Of course it would be nice if such cases worked as well in PyPy-STM --- which they do to some extent; but checking and optimizing that is future work.

      + +

      As a starting point for our benchmarks, when running code that +only uses one thread, we get a slow-down between 1.2 and 3: at worst, +three times as slow; at best only 20% slower than a regular +PyPy. This worst case has been brought down --it used to be 10x-- by +recent work on "card marking", a useful GC technique that is also +present in the regular PyPy (and about which I don't find any blog post; +maybe we should write one :-) The main remaining issue is fork(), or +any function that creates subprocesses: it works, but is very slow. To +remind you of this fact, it prints a line to stderr when used.

      + +

      Now the real main part: when you run multithreaded code, it scales very nicely with two +threads, and less-than-linearly but still not badly with three or four +threads. Here is an artificial example:

      + +
          total = 0
      +    lst1 = ["foo"]
      +    for i in range(100000000):
      +        lst1.append(i)
      +        total += lst1.pop()
      + +

      We run this code N times, once in each of N threads +(full +benchmark). Run times, best of three:

      + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Number of threadsRegular PyPy (head)PyPy-STM
      N = 1real 0.92s
      +user+sys 0.92s
      real 1.34s
      +user+sys 1.34s
      N = 2real 1.77s
      +user+sys 1.74s
      real 1.39s
      +user+sys 2.47s
      N = 3real 2.57s
      +user+sys 2.56s
      real 1.58s
      +user+sys 4.106s
      N = 4real 3.38s
      +user+sys 3.38s
      real 1.64s
      +user+sys 5.35s
      +

      (The "real" time is the wall clock time. The "user+sys" time is the +recorded CPU time, which can be larger than the wall clock time if +multiple CPUs run in parallel. This was run on a 4x2 cores machine. +For direct comparison, avoid loops that are so trivial +that the JIT can remove all allocations from them: right now +PyPy-STM does not handle this case well. It has to force a dummy allocation +in such loops, which makes minor collections occur much more frequently.)

      + +

      Four threads is the limit so far: only four threads can be executed in +parallel. Similarly, the memory usage is limited to 2.5 GB of GC +objects. These two limitations are not hard to increase, but at least +increasing the memory limit requires fighting against more LLVM bugs. +(Include here snark remarks about LLVM.)

      + +

      Here are some measurements from more real-world benchmarks. This time, +the amount of work is fixed and we parallelize it on T threads. The first benchmark is just running translate.py on a trunk PyPy. The last +three benchmarks are here.

      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      BenchmarkPyPy 2.3(PyPy head)PyPy-STM, T=1T=2T=3T=4
      +translate.py --no-allworkingmodules
      +(annotation step)
      184s(170s)386s (2.10x)n/a
      multithread-richards
      +5000 iterations
      24.2s(16.8s)52.5s (2.17x)37.4s (1.55x)25.9s (1.07x)32.7s (1.35x)
      mandelbrot
      +divided in 16-18 bands
      22.9s(18.2s)27.5s (1.20x)14.4s (0.63x)10.3s (0.45x)8.71s (0.38x)
      btree2.26s(2.00s)2.01s (0.89x)2.22s (0.98x)2.14s (0.95x)2.42s (1.07x)
      +

      This shows various cases that can occur:

      + +
        +
      • The mandelbrot example runs with minimal overhead and very good parallelization. +It's dividing the plane to compute in bands, and each of the T threads receives the +same number of bands. + +
      • +
      • Richards, a classical benchmark for PyPy (tweaked to run the iterations +in multiple threads), is hard to beat on regular PyPy: +we suspect that the difference is due to the fact that a lot of +paths through the loops don't allocate, triggering the issue already +explained above. Moreover, the speed of Richards was again improved +dramatically recently, in trunk. + +
      • +
      • The translation benchmark measures the time translate.py +takes to run the first phase only, "annotation" (for now it consumes too much memory +to run translate.py to the end). Moreover the timing starts only after the large number of +subprocesses spawned at the beginning (mostly gcc). This benchmark is not parallel, but we +include it for reference here. The slow-down factor of 2.1x is still too much, but +we have some idea about the reasons: most likely, again the Garbage Collector, missing the regular PyPy's +very fast small-object allocator for old objects. Also, translate.py +is an example of application that could, with +reasonable efforts, be made largely parallel in the future using atomic blocks. + +
      • +
      • Atomic blocks are also present in the btree benchmark. I'm not completely sure +but it seems that, in this case, the atomic blocks create too many +conflicts between the threads for actual parallelization: the base time is very good, +but running more threads does not help at all. +
      • +
      +

      As a summary, PyPy-STM looks already useful to run CPU-bound multithreaded +applications. We are certainly still going to fight slow-downs, but it +seems that there are cases where 2 threads are enough to outperform a regular +PyPy, by a large margin. Please try it out on your own small examples!

      + +

      And, at the same time, please don't attempt to retrofit threads inside +an existing large program just to benefit from PyPy-STM! +Our goal is not to send everyone down the obscure route of multithreaded +programming and its dark traps. We are finally going to shift our main +focus to phase 2 of our +research (donations welcome): how to enable a better way of writing multi-core programs. +The starting point is to fix and test atomic blocks. Then we will have to +debug common causes of conflicts and fix them or work around them; and +try to see how common frameworks like Twisted can be adapted.

      + +

      Lots of work ahead, but lots of work behind too :-)

      + +

      Armin (thanks Remi as well for the work).

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-07-05 16:22: +
      +
      +

      You're just extracting and running the "bin/pypy"? It works for me on a very close configuration, Ubuntu 14.04 too...

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-07-05 20:13: +
      +
      +

      Yes. Sorry, it doesn't make sense to me. You need to debug with gdb, probably with an executable that has got the debugging symbols. You need to either build it yourself, or recompile the pregenerated sources from: https://cobra.cs.uni-duesseldorf.de/~buildmaster/misc/pypy-c-r72356-stm-jit-SOURCE.txz

      +
      +
      +
      +
      + + Ernst Sjöstrand wrote on 2014-07-05 23:40: +
      +
      +

      If I try virtualenv I get:
      virtualenv stmtest -p Projekt/pypy-stm-2.3-linux64/bin/pypy
      Running virtualenv with interpreter Projekt/pypy-stm-2.3-linux64/bin/pypy
      [forking: for now, this operation can take some time]
      [forking: for now, this operation can take some time]
      New pypy executable in stmtest/bin/pypy
      [forking: for now, this operation can take some time]
      ERROR: The executable stmtest/bin/pypy is not functioning
      ERROR: It thinks sys.prefix is u'/home/ernst' (should be u'/home/ernst/stmtest')
      ERROR: virtualenv is not compatible with this system or executable

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-07-06 08:48: +
      +
      +

      @Ernst: sorry, it works fine for me as well. I tried the pypy-stm provided here, both on a Ubuntu 12.04 and a Ubuntu 14.04 machine. Maybe you have a too old virtualenv? Does it work with regular PyPy?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-07-07 12:37: +
      +
      +

      Thanks to the author of the now-deleted comments, we could track and fix a bug that only shows up on some Linux systems. If pypy-stm systematically segfaults at start-up for you too, try the "2.3-r2" release (see update in the post itself).

      +
      +
      +
      +
      + + Anonymous wrote on 2014-07-07 20:00: +
      +
      +

      This is exciting! One minor bug in the actual post: you can describe slowdown / speedup in two different ways, with total time as a percentage of original time, or with time difference as a percentage of original time. You mention a 20% slowdown (clearly using the latter standard) and then a 300% slowdown, which you describe as 3x (suggesting that you use the former standard). To be consistent , you should either describe them as 120% and 300%, respectively (using the former standard), or 20% and 200%, respectively (using the latter standard).

      Thanks!

      +
      +
      +
      +
      + + Unknown wrote on 2014-07-07 21:35: +
      +
      +

      Hi again,

      just to play around a little I've put together https://github.com/Tinche/stm-playground for myself.

      I picked a generic CPU-bound problem (primality testing) and tried comparing multithreaded implementations in CPython 2.7, ordinary PyPy and PyPy-STM.

      I figured this would be easily parallelizable (low conflicts) but it doesn't seem to be the case - I don't get all my cores pegged using the STM.

      bench-threadpool.py, on my machine, gives about the same time for CPython and PyPy-STM, while ordinary PyPy totally smokes them both (even with the GIL :), one order of magnitude difference (20 sec vs 2 sec).

      bench-threadpool-naive will crash the STM interpreter on my system. :)

      Getting away from threads, CPython will actually beat PyPy in a multi-process scenario by a factor of 2, which I found surprising. CPython does indeed use up all my cores 100% while dealing with a process pool, while PyPy won't even come close.

      For the same workload, PyPy is actually faster running multithreaded with the GIL than multi-process, and fastest running with only 1 thread (expected, with the GIL only being overhead in this scenario).

      +
      +
      +
      +
      + + Pim wrote on 2014-07-07 21:40: +
      +
      +

      This is good news. For many of my applications, an important feature in the next phase will be the optimization for [..] the built-in dictionary type, for which we would like accesses and writes using independent keys to be truly independent [..]. My applications are mostly server applications (Twisted-based and others) that store state information on sessions/transactions in a small number of dictionaries that can have hundreds or thousands of entries concurrently, and would be accessed constantly.

      I'm glad I donated and plan to do so again in the future :-)

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-07-08 10:47: +
      +
      +

      @Tin: I would tweak bench-queue.py to avoid a million inter-thread communications via the queue. For example, run 1000 check_primes instead of just 1 for every number received from the queue.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-07-08 11:17: +
      +
      +

      @Tin: ...no, I tried too and it doesn't seem to help. We'll need to look into this in more details....

      +
      +
      +
      +
      + + Unknown wrote on 2014-07-08 21:04: +
      +
      +

      @Armin I've pushed a version of bench-queue with a tweakable batch size and concurrency level. Doing the work in batches of, say, 1000 does indeed make it go faster with all implementations.

      I've noticed pypy-stm runs have a large variance. It's not like I'm doing scientific measurements here, but for the queue test I'm getting runtimes from ~15 sec to ~27 sec, whereas for example ordinary PyPy is in the range 4.6 sec - 4.9 sec, and CPython ~22.5 - ~24.7, again, relatively close. Again, this is just something I noticed along the way and not the result of serious benchmarking in isolation.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-07-10 08:44: +
      +
      +

      Ooooof. Ok, I found out what is wrong in bench-queue. The issue is pretty technical, but basically if you add "with __pypy__.thread.atomic:" in the main top-level loop in worker(), then it gets vastly faster. On my machine it beats the real-time speed of a regular pypy. See https://bpaste.net/show/450553/

      It clearly needs to be fixed...

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-07-10 09:31: +
      +
      +

      Added an answer to the question "what about PyPy3?": https://pypy.readthedocs.org/en/latest/stm.html#python-3

      +
      +
      +
      +
      + + Unknown wrote on 2014-07-12 01:03: +
      +
      +

      @Armin, cool! I've found that the thread pool version can be sped up ~2-3x by wrapping the contents of check_prime with 'atomic' too.

      One more observation: with the atomic context manager, on PyPy-STM the queue implementation will beat the thread pool implementation (slightly), which isn't the case for CPython or ordinary PyPy.

      +
      +
      +
      +
      + + geerk wrote on 2014-07-16 08:16: +
      +
      +

      This is exciting news! I think pypy is the future of python.

      +
      +
      +
      +
      + + Canesin wrote on 2014-07-19 15:40: +
      +
      +

      If you guys did a facelift on the website like your HippyVM I believe the project would gain a lot of momentum, it is unfortunate but true that most company managers would visit it and think it is not industrial quality if an employee comes saying that they should sponsor developing something in PyPy.

      +
      +
      +
      +
      + + Anonymous wrote on 2014-07-20 11:26: +
      +
      +

      r2 still doesn't work for me (ubuntu 14.04, intel Core2 CPU T7400)
      bash: ./pypy: cannot execute binary file: Exec format error

      +
      +
      +
      +
      + + isomorph wrote on 2014-07-31 05:46: +
      +
      +

      this is a question for the guys developing PyPy... i am completely new to Python so please bear with me.

      here is what i don't understand: it seems to me that you are reinventing the wheel because doesn't the Oracle or Azul Systems JVM already provide a super performant GC and JIT? even STM is becoming available. and since Jython can run on the JVM, why do PyPy at all?

      wouldn't a JVM compliant implementation of Python be more performant than PyPy or CPython?

      or am i missing something here?

      any pointers greatly appreciated. thanks.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-08-04 08:04: +
      +
      +

      Having a JIT in the JVM is very different from having a JIT that can understand Python. For proof, the best (and only) implementation of Python on the JVM, Jython, is running at around CPython speed (generally a bit slower). I suspect that STM is similarly not designed for the purposes to which Jython would put it and would thus perform poorly. The only part that would probably work out of the box would be the GC. A more subtle argument against starting from the JVM is that of semantic mismatch. See for example https://www.stups.uni-duesseldorf.de/mediawiki/images/5/51/Pypy.pdf

      +
      +
      +
      +
      + + isomorph wrote on 2014-08-04 14:44: +
      +
      +

      awesome! thanks a lot armin. :D

      +
      +
      +
      + +

      PyPy3 2.3.1 - Fulcrum

      + +
      +

      We're pleased to announce the first stable release of PyPy3. PyPy3
      +targets Python 3 (3.2.5) compatibility.

      +

      We would like to thank all of the people who donated to the py3k proposal
      +for supporting the work that went into this.

      +

      You can download the PyPy3 2.3.1 release here:

      +
      https://pypy.org/download.html#pypy3-2-3-1
      +
      +

      Highlights

      +
        +
      • The first stable release of PyPy3: support for Python 3!
      • +
      • The stdlib has been updated to Python 3.2.5
      • +
      • Additional support for the u'unicode' syntax (PEP 414) from Python 3.3
      • +
      • Updates from the default branch, such as incremental GC and various JIT
        +improvements
      • +
      • Resolved some notable JIT performance regressions from PyPy2:
      • +
      +
        +
      • Re-enabled the previously disabled collection (list/dict/set) strategies
      • +
      • Resolved performance of iteration over range objects
      • +
      • Resolved handling of Python 3's exception __context__ unnecessarily forcing
        +frame object overhead
      • +
      +
      +
      +

      What is PyPy?

      +

      PyPy is a very compliant Python interpreter, almost a drop-in replacement for
      +CPython 2.7.6 or 3.2.5. It's fast due to its integrated tracing JIT compiler.

      +

      This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows,
      +and OpenBSD,
      +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.

      +

      While we support 32 bit python on Windows, work on the native Windows 64
      +bit python is still stalling, we would welcome a volunteer
      +to handle that.

      +
      +
      +

      How to use PyPy?

      +

      We suggest using PyPy from a virtualenv. Once you have a virtualenv
      +installed, you can follow instructions from pypy documentation on how
      +to proceed. This document also covers other installation schemes.

      +

      Cheers,
      +the PyPy team

      +
      +
      +
      +
      +
      + + Omer Katz wrote on 2014-06-24 08:26: +
      +
      +

      Can we get some benchmarks much like we have for PyPy and CPython 2.7?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-06-24 09:06: +
      +
      +

      As far as I know, a majority of the benchmarks we use have never been ported to Python 3. So it's far more complicated than just push a switch.

      +
      +
      +
      +
      + + jusic wrote on 2014-06-25 08:25: +
      +
      +

      Awesome, congrats on the new release! Finally some stable PyPy goodness for Python 3 as well :)

      +
      +
      +
      +
      + + Anonymous wrote on 2014-06-27 05:37: +
      +
      +

      Woo! This is exciting! (Now we just need to upgrade to 3.4... : ) )

      +
      +
      +
      +
      + + geerk wrote on 2014-06-28 09:06: +
      +
      +

      Glad to hear that PyPy is now for python 3. Great work!

      +
      +
      +
      +
      + + Unknown wrote on 2014-07-03 15:04: +
      +
      +

      This is great!

      Now I can finally test PyPy on some code for which I wanted to test it for years!

      (backporting to py2 was too painful)

      Thank you very much!

      +
      +
      +
      + +

      PyPy 2.3.1 - Terrestrial Arthropod Trap Revisited

      + +
      +
      We're pleased to announce PyPy 2.3.1, a feature-and-bugfix improvement over our recent 2.3 release last month.

      +This release contains several bugfixes and enhancements among the user-facing improvements:
        +
      • The built-in struct module was renamed to _struct, solving issues with IDLE and other modules
      • +
      • Support for compilation with gcc-4.9
      • +
      • A CFFI-based version of the gdbm module is now included in our binary bundle
      • +
      • Many issues were resolved since the 2.3 release on May 8
      • +
      +
      +You can download the PyPy 2.3.1 release here:

      https://pypy.org/download.html

      PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (pypy 2.3.1 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

      +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux. 
      +We would like to thank our donors for the continued support of the PyPy project.

      +The complete release notice is here.

      +Please try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!

      +Cheers, The PyPy Team
      +
      +

      PyPy 2.3 - Terrestrial Arthropod Trap

      + +
      +
      +
      +We’re pleased to announce PyPy 2.3, which targets version 2.7.6 of the Python language. This release updates the stdlib from 2.7.3, jumping directly to 2.7.6.

      +This release also contains several bugfixes and performance improvements, many generated by real users finding corner cases. CFFI has made it easier than ever to use existing C code with both cpython and PyPy, easing the transition for packages like cryptography, Pillow (Python Imaging Library [Fork]), a basic port of pygame-cffi, and others.

      +PyPy can now be embedded in a hosting application, for instance inside uWSGI

      +You can download the PyPy 2.3 release here:

      https://pypy.org/download.html

      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (pypy 2.3 and cpython 2.7.x performance comparison; note that cpython's speed has not changed since 2.7.2) due to its integrated tracing JIT compiler.

      +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux. 
      +
      +We would like to thank our donors for the continued support of the PyPy project.

      +The complete release notice is here

      +Cheers, The PyPy Team
      +
      +
      +
      +
      + + Anonymous wrote on 2014-05-10 05:20: +
      +
      +

      Hi Why don't you accept Bitcoin as one of donation methods? Bitcoin makes it easier to donate your project

      I believe that you add it and announce it here, there will be several posts in Reddit and others sources that help you to collect funds

      +
      +
      +
      +
      + + Anonymous wrote on 2014-05-10 06:40: +
      +
      +

      right, i think so

      +
      +
      +
      +
      + + Anonymous wrote on 2014-05-10 22:21: +
      +
      +

      Hey,
      Just wondering, does v2.3 contains the fix for issue 1683 titled "BytesIO leaks like hell"?

      https://bugs.pypy.org/issue1683

      +
      +
      +
      +
      + + Eric van Riet Paap wrote on 2014-05-12 21:40: +
      +
      +

      The bug status is set to resolved so one would expect it to be fixed. Please reopen the bug report if you think differently.

      +
      +
      +
      +
      + + Unknown wrote on 2014-05-14 10:59: +
      +
      +

      There is no info about what exactly made CFFI easier in this release.

      +
      +
      +
      +
      + + Unknown wrote on 2014-05-14 20:21: +
      +
      +

      Hello pypy team! If you have not seen this post... https://www.rfk.id.au/blog/entry/pypy-js-faster-than-cpython/, I think you will find it to be quite interesting!

      +
      +
      +
      + +

      NumPy on PyPy - Status Update

      + +
      +

      Work on NumPy on PyPy continued in March, though at a lighter pace than the previous few months. Progress was made on both compatibility and speed fronts. Several behavioral issues reported to the bug tracker were resolved. The most significant of these was probably the correction of casting to built-in Python types. Previously, int/long conversions of numpy scalars such as inf/nan/1e100 would return bogus results. Now, they raise or return values, as appropriate.

      +On the speed front, enhancements to the PyPy JIT were made to support virtualizing the raw_store/raw_load memory operations used in numpy arrays. Further work remains here in virtualizing the alloc_raw_storage when possible. This will allow scalars to have storages but still be virtualized when possible in loops.

      +Aside from continued work on compatibility/speed of existing code, we also hope to begin implementing the C-level components of other numpy modules such as mtrand, nditer, linalg, and so on. Several approaches could be taken to get C-level code in these modules working, ranging from reimplementing in RPython to interfacing with existing code with CFFI, if possible. The appropriate approach depends on many factors and will probably vary from module to module.

      To try out PyPy + NumPy, grab a nightly PyPy and install our NumPy fork. Feel free to report comments/issues to IRC, our mailing list, or bug tracker. Thanks to the contributors to the NumPy on PyPy proposal for supporting this work.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-24 23:22: +
      +
      +

      Trying to install scipy on top gives me an error while compiling scipy/cluster/src/vq_module.c; isn't scipy yet supported?

      +
      +
      +
      +
      + + Anonymous wrote on 2014-04-30 12:38: +
      +
      +

      scipy is not supported. Sometimes scipy functions are in fact in numpy in which case you can just copy the code. Otherwise you need to start learning cffi.

      +
      +
      +
      +
      + + Yichao Yu wrote on 2014-05-18 02:07: +
      +
      +

      You mentioned storage and scalar types. Is it related to this bug

      +
      +
      +
      +
      + + vak wrote on 2014-08-14 09:19: +
      +
      +

      what is the status about incorporating BLAS library?

      +
      +
      +
      +
      + + Anonymous wrote on 2014-09-22 21:52: +
      +
      +

      How far is running Pandas on Pypy? Will it be just a recompile when Numpy is ported, or is it heavy work to port Pandas to Pypy after Numpy is done? Should I look for another solution rather than plan to run Pandas on Pypy?

      +
      +
      +
      +
      + + Unknown wrote on 2014-11-13 10:07: +
      +
      +

      Pandas on PyPy would indeed be very interesting for huge analysis runs.

      +
      +
      +
      +
      + + Jami wrote on 2014-11-18 17:14: +
      +
      +

      Any news on the NumPy front? I check this blog for such stuff every week and also contributed to the funding drive.

      I fully understand that developers skilled enough to work on such a project are hard to come by even with money, and NumPy support isn't probably the most technologically exciting aspect of PyPy.

      Just even a few lines on the latest development or some milestones would show that the project is alive (although I fully understand that writing blog posts isn't everybody's favorite thing). And some kind of summary that in what shape the developers think the code is in. If you prefer coding to blogging, maybe implementing some kind of time-series graph for the numpypy-status page could be nice also (I keep checking it out but can never remember what was the state last time I checked). Maybe I can see if I can do a quick hack via eg archive.org for this.

      I think also a huge boost would be to have even a hacky temporary way to interface with Matplotlib and/or SciPy, as it's quite hard to do many practical analyses without these. I'd probably try to do my analyses in such an environment and perhaps even implement/fix at least things that are my own itches. There was the 2011 hack, but it doesn't seem to be elaborated anywhere. I could live with (or even prefer, so it definitely won't become the permanent version) an ugly, slow, memory-hungry and unstable hack that would spam the stderr with insulting messages. But without any way of interfacing the existing stuff it's just too much work for the more complicated analyses.

      I'm trying to track the https://bitbucket.org/pypy/numpy branch but it's a bit hard to see the bigger picture just from the commits. Even just some tags and/or meta-issues could be helpful. I'm also a bit confused on where (repo-wise) the development is actually happening. There are some sort of fresh NumPy-branches in the numpy tree. The micronumpy-project is probably dead or merged into the pypy/numpy-branch?

      PS. Please don't take this as too strong criticism. I prefer to just silently code away myself too. Just what would be nice to see as somebody eagerly waiting to use Pypy in numerical stuff.

      +
      +
      +
      +
      + + Maciej Fijalkowski wrote on 2014-11-24 12:00: +
      +
      +

      Hey Jami

      We'll try to write a blog post shortly

      +
      +
      +
      + +

      STM results and Second Call for Donations

      + +
      +

      Hi all,

      + +

      We now have a preliminary version of PyPy-STM +with the JIT, from the new STM documentation +page. This PyPy-STM is still not quite useful, failing to top the +performance of a regular PyPy by a small margin on most benchmarks, but +it's definitely getting there :-) The overheads with the JIT are still +a bit too high. (I've been tracking an obscure bug for days. +It turned out to be a simple buffer overflow. But if anybody has +a clue about why a hardware watchpoint in gdb, set on one of the garbled +memory locations, fails to trigger but the memory ends up being modified +anyway... and, it turns out, by just a regular pointer write... ideas +welcome.)

      + +

      But I go off-topic :-) The main point of this post is to announce the +2nd Call for Donations about +STM. We achieved most of the goals laid out in the first call. We +even largely overachieved them in terms of raw performance, even if +there are many cases that are unreasonably slow for now. So, after the +successful research, we are launching a second proposal about the +development part of the project:

      + +
        +
      1. +

        Polish PyPy-STM to get a consistently reasonable speed, 25%-40% +slower than a regular JITted PyPy when running single-threaded code. Of +course it is supposed to scale nicely as long as there are no +user-visible conflicts.

        + +
      2. +
      3. +

        Focus on developing the Python-facing interface: both internal things +(e.g. do dictionaries need to be more TM-friendly in general?) as well +as directly visible things (e.g. some profiler-like interface to explore +common conflicts in a program).

        + +
      4. +
      5. Regular multithreaded code should benefit out of the box, but the +final goal is to explore and tweak some existing non-multithreaded +frameworks and improve their TM-friendliness. So existing programs +using Twisted or Stackless, for example, should run on multiple cores +without any major change.

      6. +
      +

      See the full call for more +details! I'd like to thank Remi Meier for getting involved. And a big +thank you to everybody who contributed money on the first call. It +took more time than anticipated, but it's there in good but rough shape. +Now it needs a lot of polishing :-)

      + +

      Armin

      +
      +
      +
      +
      + + Dmitrey wrote on 2014-05-03 19:48: +
      +
      +

      it would be good to have compiled stm version for something more recent than Ubuntu 12.04, e.g. 14.04, preferably with numpy included, to simplify numpy installation. Or, maybe, that version for 12.04 works with 14.04?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-05-10 14:56: +
      +
      +

      Yes, Ubuntu 14.04 seems to run fine any PyPy compiled for Ubuntu 12.04. Numpy probably works in pypy-stm, but being a module that accesses matrix data as "external" raw memory, it does not support multi-core execution.

      +
      +
      +
      + +

      pygame_cffi: pygame on PyPy

      + +
      +
      +

      The Raspberry Pi aims to be a low-cost educational tool that anyone can use to learn about electronics and programming. Python and pygame are included in the Pi's programming toolkit. And since last year, thanks in part to sponsorship from the Raspberry Pi Foundation, PyPy also works on the Pi (read more here).

      +

      With PyPy working on the Pi, game logic written in Python stands to gain an awesome performance boost. However, the original pygame is a Python C extension. This means it performs poorly on PyPy and negates any speedup in the Python parts of the game code.

      +

      One solution to making pygame games run faster on PyPy, and eventually on the Raspberry Pi, comes in the form of pygame_cffi. pygame_cffi uses CFFI to wrap the underlying SDL library instead of a C extension. A few months ago, the Raspberry Pi Foundation sponsored a Cape Town Python User Group hackathon to build a proof-of-concept pygame using CFFI. This hackathon was a success and it produced an early working version of pygame_cffi.

      +

      So for the last 5 weeks Raspberry Pi has been funding work on pygame_cffi. The goal was a complete implementation of the core modules. We also wanted benchmarks to illuminate performance differences between pygame_cffi on PyPy and pygame on CPython. We are happy to report that those goals were met. So without further ado, here's a rundown of what works.

      +
      +

      Current functionality

      + + +Invention screenshot: + +
      + +Mutable mamba screenshot: + +
      + +

      With the above-mentioned functionality in place we could get 10+ of the pygame examples to work, and a number of PyWeek games. At the time of writing, if a game doesn't work it is most likely due to an unimplemented transform or draw function. That will be remedied soon.

      +
      +
      +

      Performance

      +

      In terms of performance, pygame_cffi on PyPy is showing a lot of promise. It beats pygame on CPython by a significant margin in our events processing and collision detection benchmarks, while blit and fill benchmarks perform similarly. The pygame examples we checked also perform better.

      + +
      + +
      + +

      However, there is still work to be done to identify and eliminate bottlenecks. On the Raspberry Pi performance is markedly worse compared to pygame (barring collision detection). The PyWeek games we tested also performed slightly worse. Fortunately there is room for improvement in various places.

      + +Invention & Mutable Mamba (x86) + +
      + +Standard pygame examples (Raspberry Pi) + +
      + +

      Here's a summary of some of the benchmarks. Relative speed refers to the frame rate obtained in pygame_cffi on PyPy relative to pygame on CPython.

      + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Benchmark | Relative speed (pypy speedup)
      Events (x86) | 1.41
      Events (Pi) | 0.58
      N² collision detection on 100 sprites (x86) | 4.14
      N² collision detection on 100 sprites (Pi) | 1.01
      Blit 100 surfaces (x86) | 1.06
      Blit 100 surfaces (Pi) | 0.60
      Invention (x86) | 0.95
      Mutable Mamba (x86) | 0.72
      stars example (x86) | 1.95
      stars example (Pi) | 0.84
      +
      +

      OpenGL

      +

      Some not-so-great news is that PyOpenGL performs poorly on PyPy since PyOpenGL uses ctypes. This translates into a nasty reduction in frame rate for games that use OpenGL surfaces. It might be worthwhile creating a CFFI-powered version of PyOpenGL as well.

      +
      +
      +
      +

      Where to now?

      +

      Work on pygame_cffi is ongoing. Here are some things that are in the pipeline:

      +
        +
      • Get pygame_cffi on PyPy to a place where it is consistently faster than pygame on CPython.
      • +
      • Implement the remaining modules and functions, starting with draw and transform.
      • +
      • Improve test coverage.
      • +
      • Reduce the time it takes for CFFI to parse the cdef. This makes the initial pygame import slow.
      • +
      +

      If you want to contribute you can find pygame_cffi on Github. +Feel free to find us on #pypy on freenode or post issues on github.

      +

      Cheers,
      +Rizmari Versfeld

      +
      +
      +
      +
      +
      +
      +
      + + Unknown wrote on 2014-03-28 01:04: +
      +
      +

      Pygame should be an excellent way to benchmark the performance of pypy, so this is great! I wanted to let you fellas know of another project that is using pypy that looks really neat as well... https://github.com/rfk/pypyjs

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-28 12:46: +
      +
      +

      pygame seems outdated, because it is based on first SDL version.

      It will be interesting to see CFFI comparison for newer, SDL2 bindings, such as PySDL2, which is ctypes based at the moment.

      https://pypi.python.org/pypi/PySDL2

      +
      +
      +
      +
      + + Maciej Fijalkowski wrote on 2014-03-28 15:02: +
      +
      +

      Anatoly, pygame is outdated but has no clear replacement. PySDL2 is nice, but it's only a low level binding, it does not really help in the case of writing games.

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-28 18:31: +
      +
      +

      Is it not wrapping the current SDL? I thought that it was... On github it says it's a pygame based wrapper(copies the api) for SDL, would that not make it the current SDL?

      +
      +
      +
      +
      + + Anonymous wrote on 2014-03-29 00:37: +
      +
      +

      I looked into PyOpenGL's code to see if there is an easy way to upgrade to CFFI.

      It's a bag of cats EVERYWHERE.

      ctypes are defined all over the place, unlike most ctypes->cffi projects, where there is a single source file (api.py) that is easy to convert due to it being the raw interface to the C library.

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-29 06:41: +
      +
      +

      @Maciej, pygame includes a lot of helpers and good documentation, but it is not perspective technology to play with. I'd say there are more interesting libs out there that gain more interesting results and speeding up dynamic binding for them would be very cool to make things like these - https://devart.withgoogle.com/ - possible.


      @Anonymous, if I were to provide OpenGL bindings, I'd start with looking at https://github.com/p3/regal project and binding generator in scripts/

      +
      +
      +
      +
      + + Temia Eszteri wrote on 2014-03-29 18:42: +
      +
      +

      I've actually been working to see if I can get my own Pygame release, Sky Eraser, optimised enough to work on a Raspberry Pi -- it'd be worth seeing how implementing it under this configuration would work on top of the optimisations I've been working on in the background (boy are there a lot to make).

      I might also be rewriting the APIs for Allegro 5.1 as an experiment though, to test under both CPython and PyPy.

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-29 21:15: +
      +
      +

      I started to work on a newer and experimental OpenGL wrapper for Python, proudly blessed PyOpenGLng.

      In comparison to PyOpenGL, it generates the requested OpenGL API from the OpenGL XML Registry and uses an automatic translator to map the C API to Python. The translator is quite lightweight in comparison to PyOpenGL source code. And it is already able to run a couple of examples for OpenGL V3 and V4.

      Actually the wrapper uses ctypes. But I am looking for tips to do the same for cffi, as well as feedback on performance and comments.

      The project is hosted on https://github.com/FabriceSalvaire/PyOpenGLng.

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-30 08:16: +
      +
      +

      @Fabrice, how is your newer and experimental OpenGL wrapper generator better than existing ones? I am not saying that there is a NIH effect - probably some omission from documentation.

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-30 08:19: +
      +
      +

      I mean that if PyOpenGL doesn't use wrapper generator then there are a couple around not limiting themselves to Python. I am especially interested to know the comparison with regal.

      +
      +
      +
      +
      + + Alecks Gates wrote on 2014-03-30 22:20: +
      +
      +

      It was my impression that OpenGL isn't hardware accelerated on the pi anyway... or am I incorrect?

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-31 10:17: +
      +
      +

      @anatoly: The only real replacement for pygame which I know is pyglet. It is not quite as game-optimized as pygame, but very versatile and a joy to use.

      https://pyglet.org

      +
      +
      +
      +
      + + David wrote on 2014-04-01 20:05: +
      +
      +

      I've actually made a CFFI OpenGL binding, as part of my successor to my old PyGL3Display project. It's not hosted anywhere yet, but I'll see about getting up somewhere soon.

      +
      +
      +
      +
      + + David wrote on 2014-04-02 14:32: +
      +
      +

      And... done. A mostly drop-in replacement for PyOpenGL on CFFI, or at least for OpenGL 3.2 core spec.

      https://www.dropbox.com/s/rd44asge17xjbn2/gl32.zip

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-02 14:35: +
      +
      +

      @Arne, pyglet rocks, because it is just `clone and run` unlike all other engines. But it looks a little outdated, that's why I started to look for alternatives.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-02 14:38: +
      +
      +

      @David, if you want people to comment on this, Bitbucket would be a better way to share sources than Dropbox.

      +
      +
      +
      +
      + + David wrote on 2014-04-02 14:57: +
      +
      +

      @anatoly techtonick:
      Actually, it'll end up on Launchpad in the near future (probably within 2 weeks?). However, it's the output of a wrapper generator and the wrapper generator is in pretty poor shape at the moment, in terms of packaging its output. I just figured people might be able to use it in the near future, even if it is in 'source-code-dump' form. If there's a better temporary home for it somewhere, I'm all ears.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-02 15:08: +
      +
      +

      @David, why reinvent the wheel? There are many wrapper generators around. Also, your project is not a replacement for PyOpenGL, because of GPL restrictions.

      +
      +
      +
      +
      + + David wrote on 2014-04-02 15:39: +
      +
      +

      @anatoly

      I never claimed my project is a replacement for PyOpenGL - it's not API compatible, for a start. Regarding license, it'll probably get changed for the bindings at some point, probably to 3-clause BSD.

      On the wrapper generator: Really, the only actively maintained wrapper generator for Python that I'm aware of (which isn't project specific) is SWIG, which is not appropriate (at the very least, googling for 'python wrapper generator -swig' doesn't seem to give many results). In any case, the wrapper generator isn't a lot of code.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-03 07:28: +
      +
      +

      @anatoly: pyglet seems to be in maintenance mode right now. There are commits every few days, but only small stuff.

      On the other hand I understand that: pyglet supplies everything a backend for a game-engine needs (I use it¹), so the next step should be to use it for many games and see whether shared needs arise.

      ¹: See https://1w6.org/deutsch/anhang/programme/hexbattle-mit-zombies and https://bitbucket.org/ArneBab/hexbattle/

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-03 10:35: +
      +
      +

      @David, I am speaking about OpenGL specific wrapper generators. I've added information to this page - https://www.opengl.org/wiki/Related_toolkits_and_APIs#OpenGL_loading_libraries

      The OpenGL generator in Python is included in regal project here https://github.com/p3/regal/scripts

      pyglet also has one.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-03 10:36: +
      +
      +

      Sorry, the correct link is https://github.com/p3/regal/tree/master/scripts

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-03 10:39: +
      +
      +

      @Arne, kissing elves trick is low. =) Otherwise looks wesnothy and 2D. I don't see why it should use OpenGL. 3D models would be cool.

      I'd try to make it run on PySDL2 with "from sdl2.ext.api import pyglet". There is no pyglet API there, but would be interesting to see if it is possible to provide one.

      +
      +
      +
      +
      + + David wrote on 2014-04-03 15:58: +
      +
      +

      @anatoly

      Pyglet's GL wrapper generator creates a lot of chained functions (fairly slow in cPython). I'm also not sure if there's enough development activity in Pyglet to allow modifying core code, and given the size of the Pyglet project I'm not going to fork it. PyOpenGL has more or less the same issues.

      Regal appears to be a very large project (a 68MB checkout), which has a scope much greater than just its wrapper generator - the sheer scope of the project does cause some barriers to entry. I'm still looking through, but I am fairly certain that it would take more effort to adapt Regals binding generator than I have expended on my own.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-03 21:06: +
      +
      +

      @anatoly: I like kissing elves ☺ (and when I get to write the next part of the story, I intend to keep them as player characters: That someone starts out in an intimate moment does not mean he or she is watchmeat).

      @David: I guess modifying core-code in pyglet is not that big of a problem, especially *because* it is mostly being maintained right now: Little danger of breaking the in-progress work of someone else.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-03 21:09: +
      +
      +

      @anatoly: more specifically, I do not consider intimate moments as cheap (and WTactics has the image, so I could pull this off). Instead I try to rid myself of baseless inhibitions, though that’s not always easy: Killing off no longer needed societal conditioning is among the hardest battles…

      +
      +
      +
      +
      + + David wrote on 2014-04-04 01:23: +
      +
      +

      @Arne: Maybe it'd be worth looking at integrating it then; however, it really is a completely different approach - gl32 is a source code writer, whereas Pyglet uses Pythons inbuilt metaprogramming capabilities - and so it would be completely rewriting a large chunk of Pyglets core. Once I've got the binding generator finalised, it might be worth seeing if it's possible to replace Pyglet's OpenGL bindings with these ones.

      That said, in the interest of full disclosure: I'm not a fan of Pyglets per object draw method, again in the interests of speed. The per object draw method that Pyglet encourages with its API is not very scalable and eliminates a large number of the advantages of using OpenGL. So whilst I might see if gl32 can be plugged in for interesting benchmarks/proof-of-concept, I probably wouldn't try to get it bug-free and integrated into upstream Pyglet.

      +
      +
      +
      +
      + + David wrote on 2014-04-04 15:26: +
      +
      +

      @Arne: Regarding Pyglet integration - it seems it would require a lot of work. There's two major issues - firstly, Pyglet only has raw OpenGL bindings, which are used everywhere and hence the "more pythonic" bindings of gl32 would be hard to integrate without editing every file using GL in Pyglet. Secondly, Pyglet uses GL functions which were removed in 3.2, and hence are not in gl32, so the API generator would have to be extended to handle any special cases on these functions.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-07 17:23: +
      +
      +

      @David: The per-object draw-method is very convenient for programming. As soon as you need more performance, most of the objects are grouped into batches, though. That way only the draw method of the batch is called and the batch can do all kinds of optimizations.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-07 17:25: +
      +
      +

      For Python 3.2 you might find useful stuff in the python-3 port of pyglet, though that hasn’t been released, yet, IIRC.

      +
      +
      +
      +
      + + David wrote on 2014-04-07 21:26: +
      +
      +

      @Arne:

      I'd argue that objects with Z-order would be more convenient programmatically, but frankly that's a matter of opinion. (Incidentally, this is something I'm working on as well, and I think I'm mostly done on it).

      However, per-object-draw is only one concern I have on Pyglets speed credentials, as I do not believe Pyglet was written with speed as a design goal. For a different example, see pyglet.graphics.vertexbuffer; copying a ctypes object into a list in order to get slices to work is not a smart thing to do, performance wise!

      I'm not sure where you got Python 3.2 from, but what I meant was that currently I'm restricting myself to OpenGL 3.2, which means that certain older OpenGL functions do not exist. Pyglet uses some of these removed functions (e.g. glPushClientAttrib), and hence the bindings I'm generating at the moment do not provide all the features Pyglet uses.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-04-08 04:47: +
      +
      +

      I'd like to remind readers of these comments that this thread has gone farther and farther from both the original post and the whole blog -- which is supposed to be related to PyPy. I'm rather sure that you're now discussing performance on CPython, which in this case is very different from performance on PyPy (or would be if it supported all packages involved). Maybe move this discussion somewhere more appropriate?

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-09 11:47: +
      +
      +

      @Armin: You’re right… actually I would be pretty interested, though, whether pypy also has a performance issue with pyglet's chained functions.

      +
      +
      +
      +
      + + David wrote on 2014-04-09 14:30: +
      +
      +

      @Arne: In principle, PyPy seems to handle Pyglet's chained functions relatively well (non-scientifically running the Astraea example's title screen sees CPU usage start very high, but eventually drops to about 80% of cPython's after the JIT warms up). There is one caveat preventing better testing: the moment keyboard input is given to Astraea on PyPy, PyPy segfaults.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-10 09:05: +
      +
      +

      @David: That is really important feedback for Armin and Anatoly, I think.

      +
      +
      +
      +
      + + Unknown wrote on 2014-04-10 09:06: +
      +
      +

      @David: Can you give some more background on the error (how to get the code, how to reproduce the segfault)?

      +
      +
      +
      +
      + + David wrote on 2014-04-15 11:35: +
      +
      +

      @Arne: It's as simple as running the Astraea example in Pyglet and pressing a key (under PyPy 2.2, Pyglet 1.2-beta, Ubuntu 14.04). As far as I remember, this has been the case for some time (at least as far back as Ubuntu 12.10/PyPy 2.0 beta - although back then the major issue was PyPy using a lot more CPU; I didn't report this then due to a blog post at the time saying how cTypes would be rewritten). The error reported by Apport is "Cannot access memory at address 0x20"

      Doing a cursory scan through other examples, the noisy and text_input examples also have problems. noisy segfaults when a spawned ball collides with a boundary (occasionally giving a partial rpython traceback); text_input appears to have a random chance of any of the input boxes being selectable.

      Maybe it's time to file a proper bug report on this...

      +
      +
      +
      +
      + + David wrote on 2014-04-15 14:09: +
      +
      +

      @Arne: I've now submitted a bug on the PyPy Bug tracker (Issue 1736), with more detail etc. Probably best to move conversation on any Pyglet related issues over there.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-04-16 11:50: +
      +
      +

      Maybe indeed :-)

      +
      +
      +
      +
      + + Anonymous wrote on 2015-01-24 15:22: +
      +
      +

      I came up with a funny idea: why not make emscripten generate code targeting RPython, so that we can use C/C++ in PyPy directly? An LLVM-to-RPython compiler, how about this?

      +
      +
      +
      + +

      STMGC-C7 with PyPy

      + +
      +

      Hi all,

      + +

      Here is one of the first full PyPy's +(edit: it was r69967+, but the general list of versions is currently here) +compiled with the new StmGC-c7 +library. It has no JIT so far, but it runs some small +single-threaded benchmarks by taking around 40% more time than a +corresponding non-STM, no-JIT version of PyPy. It scales --- up to two +threads only, which is the hard-coded maximum so far in the c7 code. +But the scaling looks perfect in these small benchmarks without +conflict: starting two threads each running a copy of the benchmark +takes almost exactly the same amount of total time, simply using two +cores.

      + +

      Feel free to try it! It is not actually useful so far, because it is +limited to two cores and CPython is something like 2.5x faster. One of +the important next steps is to re-enable the JIT. Based on our current +understanding of the "40%" figure, we can probably reduce it with +enough efforts; but also, the JIT should be able to easily produce +machine code that suffers a bit less than the interpreter from these +effects. This seems to mean that we're looking at 20%-ish slow-downs +for the future PyPy-STM-JIT.

      + +

      Interesting times :-)

      + +

      For reference, this is what you get by downloading the +PyPy binary linked above: a Linux 64 binary (Ubuntu 12.04) that +should behave mostly like a regular PyPy. (One main missing feature is +that destructors are never called.) It uses two cores, but obviously +only if the Python program you run is multithreaded. The only new +built-in feature is with __pypy__.thread.atomic: this gives +you a way to enforce that a block of code runs "atomically", which means +without any operation from any other thread randomly interleaved.

      + +

      If you want to translate it yourself, you need a trunk version of clang +with three patches applied. That's the number of bugs that we couldn't +find workarounds for, not the total number of bugs we found by (ab)using +the address_space feature...

      + +

      Stay tuned for more!

      + +

      Armin & Remi

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-03-16 20:32: +
      +
      +

      The provided pypy-c crashes when calling fork(). Sadly fork() is indirectly called by a lot of things, including the subprocess module --- which can be executed just by importing random modules...

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-17 08:39: +
      +
      +

      That sounds pretty huge!

      Do you require clang for that? (why is it named on https://foss.heptapod.net/pypy/pypy/-/tree/branch//stmgc-c7/TODO )

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-03-17 20:42: +
      +
      +

      Only clang has the address_space extension mentioned in the blog post; gcc does not.

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-19 13:51: +
      +
      +

      I want to hear more talks on this. When is your next talk... pycon 2014? It would be hilarious if the pypy group were able to create naive concurrency in python, no one would have seen that coming! Many would have thought, "surely Haskell", or some other immutable, static language would get us there first. But no, it might just be that pypy allows any language that targets it to be concurrent, kiss style...amazing! Anyway, enough gushing, time for a random question. Mainstream vms like the JVM have added ways of speeding up dynamic languages, what advantages does pypy have over these traditional vms(other than the concurrency one that might come to fruition)? I think this would be a good question to answer at the next talk for pypy.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-03-20 06:54: +
      +
      +

      As it turns out there will be no PyPy talk at PyCon 2014.

      The JVM runs Jython at a speed that is around that of CPython. PyPy runs substantially faster than this. One difference is that PyPy contains a small number of annotations targeted specifically towards RPython's JIT generator, whereas the JVM has no support for this.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-03-20 07:37: +
      +
      +

      Update containing the most obvious fixes: https://cobra.cs.uni-duesseldorf.de/~buildmaster/misc/pypy-c-r70103-70091-stm.tbz2 (Ubuntu 12.04 Linux 64-bit)

      +
      +
      +
      +
      + + Unknown wrote on 2014-03-20 15:45: +
      +
      +

      Oh, I do not want to know personally about the superiority of pypy vs the jvm. I was just suggesting a talking point; basically, show others that pypy is a better alternative(for dynamic languages, possibly all languages with naive concurrency working!) then llvm, jvm, etc... I do have a question though, would you suppose that performance of pypy-stm would be better than that of something like the approach clojure has? I have heard that immutable data structures are nice for correctness but that they are bad for performance.

      +
      +
      +
      +
      + + Anonymous wrote on 2014-03-21 17:21: +
      +
      +

      So PyPy-STM is Python without GIL? And it's possible to make it only 20% slower than "regular" PyPy? That would be quite an achievement.

      Could you publish a build of PyPy-STM for Debian Stable?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-03-22 12:24: +
      +
      +

      The PyPy-STM we have so far doesn't include any JIT. If you want to try it out anyway on other Linux platforms than Ubuntu, you need to translate it yourself, or possibly hack around with symlinks and LD_LIBRARY_PATH.

      +
      +
      +
      +
      + + Anonymous wrote on 2014-03-22 12:44: +
      +
      +

      > The PyPy-STM we have so far doesn't include any JIT

      Yep, that's what blog post said :) But also PyPy-STM doesn't include GIL, does it?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-03-23 07:44: +
      +
      +

      Indeed, which is the point :-) You're welcome to try it out, but I'm just saying that I don't want to go to great lengths to provide precompiled binaries that work on Linux XYZ when I could basically release an updated version every couple of days... It's still experimental and in-progress. Early versions are limited to two cores; later versions to 4 cores. We still have to determine the optimal number for this limit; maybe around 8? (higher numbers imply a bit of extra overheads) It's an example of in-progress work. Another example is that so far you don't get feedback from cross-transaction conflicts; you used to in previous versions, but we didn't port it yet.

      +
      +
      +
      + + +
      +
      + +
      +
      +
      + +
      + + + + \ No newline at end of file diff --git a/blog/index-32.html b/blog/index-32.html new file mode 100644 index 000000000..344640da5 --- /dev/null +++ b/blog/index-32.html @@ -0,0 +1,1563 @@ + + + + + + +PyPy (old posts, page 32) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
      +
      +
      +

      Faster, more memory efficient and more ordered dictionaries on PyPy

      + +
      +
      +

      Hello everyone!

      +

      As of today, we merged the latest branch that brings better dictionaries to PyPy by default. The work is based on an idea by Raymond Hettinger on python-dev, with prior work done notably in Java.  It was done by Maciej Fijałkowski and Armin Rigo, with Laurence Tratt recently prodding us to finish it.  (Earlier work going in a similar direction includes Alex Gaynor's work on ordered dicts in Topaz, which was also used in the Hippy VM.  Each of these pieces of work is itself based on the original dict implementation in RPython, whose origins fade in the Subversion prehistory of PyPy.)  Coincidentally, a very similar idea has been implemented in Zend PHP very recently. Zend implementation description.

      +

      This post covers the basics of design and implementation as well as some basic benchmarks.

      +
      +
      +

      Dictionaries are now ordered!

      +

      One surprising part is that the new design, besides being more +memory efficient, is ordered by design: it preserves the +insertion order.  This is not forbidden by the Python language, which allows any order.  It makes the collections.OrderedDict subclass much faster than before: it is now a thin subclass of dict.  Obviously, we recommend that any portable Python program continues to use OrderedDict when ordering is important.  Note that a non-portable program might rely on more: for example, a **keywords argument now receives the keywords in the same order as the one in which they were given in the call.  (Whether such a thing might be called a language design change or not is a bit borderline.)  The point is that Python programs that work on CPython or previous versions of PyPy should continue to work on PyPy.

      +

      There is one exception, though.  The iterators of the OrderedDict subclass are now working just like the ones of the dict builtin: they will raise RuntimeError when iterating if the dictionary was modified.  In the CPython design, the class OrderedDict explicitly doesn't worry about that, and instead you get some result that might range from correct to incorrect to crashes (i.e. random Python exceptions).

      +
      +
      +

      Original PyPy dictionary design

      +

      Originally, PyPy dictionaries, as well as CPython dictionaries +are implemented as follows (simplified view):

      +
      +struct dict {
      +   long num_items;
      +   dict_entry* items;   /* pointer to array */
      +}
      +
      +struct dict_entry {
      +   long hash;
      +   PyObject* key;
      +   PyObject* value;
      +}
      +
      +

      Where items is a sparse array, with 1/3 to 1/2 of the items being NULL. +The average space occupied by a dictionary is 3 * WORD * 12/7 plus some small constant (the smallest dict has 8 entries, which is +8 * 3 * WORD + 2 * WORD = 26 WORDs).

      +
      +
      +

      New PyPy dictionary design

      +

      The new PyPy dictionary is split in two arrays:

      +
      +struct dict {
      +    long num_items;
      +    variable_int *sparse_array;
      +    dict_entry* compact_array;
      +}
      +
      +struct dict_entry {
      +    long hash;
      +    PyObject *key;
      +    PyObject *value;
      +}
      +
      +

      Here, compact_array stores all the items in order of insertion, while sparse_array is a 1/2 to 2/3 full array of integers. The integers themselves are of the smallest size necessary for indexing the compact_array. So if compact_array has less than 256 items, then sparse_array will be made of bytes; if less than 2^16, it'll be two-byte integers; and so on.

      +

      This design saves quite a bit of memory. For example, on 64-bit systems we can, but almost never do, index more than 4 billion elements; and for small dicts, the extra sparse_array takes very little space.  For example, a 100-element dict on 64-bit would take on average 100 * 12/7 * WORD * 3 =~ 4100 bytes with the original design, while with the new design it takes 100 * 12/7 + 3 * WORD * 100 =~ 2600 bytes, quite a significant saving.

      +
      +
      +

      GC friendliness

      +

      The obvious benefit of having more compact dictionaries is an increased cache friendliness. In modern CPUs cache misses are much more costly than doing additional simple work, like having an additional level of (in-cache) indirection. Additionally, there is a GC benefit coming from it. When doing a minor collection, the GC has to visit all the GC fields in old objects that can point to young objects. In the case of large arrays, this can prove problematic since the array grows and with each minor collection we need to visit more and more GC pointers. In order to avoid it, large arrays in PyPy employ a technique called "card marking" where the GC only visits "cards" or subsets of arrays that were modified between collections. The problem with dictionaries was that by design modifications in a dictionary occur randomly, hence a lot of cards used to get invalidated. In the new design, however, new items are typically appended to the compact_array, hence invalidate much fewer cards --- which improves GC performance.  (The new sparse_array is an array of integers, so it does not suffer from the same problems.)

      +
      +
      +

      Deletion

      +

      Deleting entries from dictionaries is not very common, but important in a few use cases.  To preserve order, when we delete an entry, we mark the entry as removed but don't otherwise shuffle the remaining entries.  If we repeat this operation often enough, there will be a lot of removed entries in the (originally compact) array.  At this point, we need to do a "packing" operation, which moves all live entries to the start of the array (and then reindexes the sparse array, as the positions changed).  This works well, but there are use cases where previously no reindexing was ever needed, so it makes these cases a bit slower (for example when repeatedly adding and removing keys in equal number).

      +
      +
      +

      Benchmarks

      +

      The PyPy speed benchmarks show mostly small effects, see changes. The microbenchmarks that we did show large improvements on large and very large dictionaries (particularly, building dictionaries of at least a couple of hundred items is now twice as fast) and break-even on small ones (between 20% slower and 20% faster depending very much on the usage patterns and sizes of dictionaries). The new dictionaries enable various optimization possibilities which we're going to explore in the near future.

      +

      Cheers,
      +fijal, arigo and the PyPy team

      +
      +
      +
      +
      +
      +
      + + Unknown wrote on 2015-01-22 16:26: +
      +
      +

      This is outstanding work, PyPy team. Keep on keeping on!

      +
      +
      +
      +
      + + Wilfred Hughes wrote on 2015-01-22 16:41: +
      +
      +

      Fantastic!

      https://pypy.org/performance.html states that large dicts are a weakness of pypy -- is still the case overall, or is this work sufficient to favour pypy over cpython for large dict work in general?

      +
      +
      +
      +
      + + John M. Camara wrote on 2015-01-23 01:35: +
      +
      +

      Wilfred - With the ordered dict changes that bullet item is no longer true.

      +
      +
      +
      +
      + + EM Lazzarin wrote on 2015-01-23 23:20: +
      +
      +

      Awesome work and thanks. Pypy would be ahead of the game if PEP 468 were accepted.

      +
      +
      +
      +
      + + JSZ wrote on 2015-01-24 19:04: +
      +
      +

      How is deleting an element implemented? It sounds like it would take O(n) work to remove an element from the middle of the compact array.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-01-25 06:58: +
      +
      +

      JSZ: the array gets holes. If a lot of items are deleted it can no longer be called "compact", but if it becomes too sparse it is recompacted and rehashed.

      +
      +
      +
      +
      + + Anonymous wrote on 2015-01-28 11:09: +
      +
      +

      There are lots of things to like about this approach!

      Did you find any problems with cache misses? With linear probing, the keys are accessed sequentially (cache friendly), but with this method the keys are accessed in random order.

      +
      +
      +
      +
      + + Carl Friedrich Bolz-Tereick wrote on 2015-01-28 11:13: +
      +
      +

      @Anonymous: The old approach didn't use linear probing either, so in that regard nothing changed.

      +
      +
      +
      +
      + + Anonymous wrote on 2015-01-28 11:45: +
      +
      +

      @carl - ah I see, thats interesting.

      Well then, what about storing the hashes with the indices?
      * Another chunk of memory saved. Only the lowest N bits need be stored that way instead of the full 64 bits. (Big assumption that rehashing on bit size change is ok)

      * The nice thing is that the dense part (cache miss!) need only be accessed if the hash matches.

      I think if I was doing this, I'd skip 8 bit indices and have 16 bit minimum so rehashing would be very rare.

      +
      +
      +
      +
      + + Carl Friedrich Bolz-Tereick wrote on 2015-01-28 12:04: +
      +
      +

      two problems with that:

      - since the hash functions can be written in python, recomputing a hash from a key is potentially expensive

      - why would you want to throw away bits from the hash? comparing the full hashes as a first check to see whether equality has a chance to succeed is very useful. the equality function can again be written in python, so is potentially very slow.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-01-28 16:03: +
      +
      +

      @Anonymous: about starting at 16-bit instead of 8-bit: it doesn't give any benefit, because rehashing is needed anyway to grow the sparse table. As long as its size is at most 256, then there is no point in storing 16-bit numbers instead of 8-bit numbers. In theory we could store N-bit numbers for the optimal value of N (= 4, 6, 8, 10...) and pay only the cost of additional complexity for individual reads and writes, not for rehashing.

      +
      +
      +
      +
      + + Anonymous wrote on 2015-01-28 21:39: +
      +
      +

      Ah indeed. I am thinking of implementing this in C++ which has coloured my thoughts somewhat. In my case, key equality checks are for the most part cheap. Thus the size/compute tradeoffs may be a bit different.

      Thanks for your thoughts.

      +
      +
      +
      +
      + + Dustin Boswell wrote on 2015-02-04 23:05: +
      +
      +

      Just curious, was there no slowdown from adding this extra level of indirection? For the case of accessing a random key from a cold dictionary, won't the lookup incur 2 cache misses now (one on each array), compared to just 1 for the original design?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-02-05 15:11: +
      +
      +

      @Dustin: there are certainly slow-downs in some cases. If the dictionary is cold, then indeed there is one extra cache miss. It seems to be quickly compensated, though, by the fact that if then you do a few more accesses to the same dict, you are likely to get fewer cache misses, simply because of the more compact layout. Also, the index array is often single bytes, so it can be fully in the cache very quickly.

      +
      +
      +
      +
      + + Alhabshi3k wrote on 2015-02-11 08:44: +
      +
      +

      Thank you for improving pypy performance and features. Your project and method is promising in improvement weakness aspect of dynamic languages. At the same time, pypy should provide an simplicity of Python rather than diversity , where diversity is the reality but simplicity is the case.

      Making dictionaries ordered by default is part of simplicity; in this effort I wish integrating the features of "defaultdict" as method and properties of the the default basic dictionary.

      similar case , integrating "deque" features (as well ,method and properties) as part of pypy list datatype.

      Usually I wonder why python team didn't integrate the features of these "collections" ( as they say "High-performance container datatypes" ) within original python basic datatype, as we all know , everything in Python is an Object. and I don't think it is a pythonic way to do things in diversity.

      Anyhow , keep on your development and team spirit.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-02-11 09:07: +
      +
      +

      @Alhabshi3k: indeed, you're right in that "defaultdict" could be replaced with an alternate constructor of the regular dicts. I'm not sure why it is not so. For deques, it is maybe a question of performance, but particularly of underlying C-level memory layout: CPython can't easily add appendleft() and popleft() to regular lists while still keeping the same C API, notably PyList_GET_ITEM() and PySequence_Fast_ITEMS() --- though that is debatable.

      We could support that in PyPy, but that is arguably more of a language change than just making dicts ordered with no new user-visible API.

      +
      +
      +
      +
      + + Unknown wrote on 2018-02-06 20:31: +
      +
      +

      You say for 100 elements, the new design's compact array uses 3 * WORD * 100 memory, right? So no extra capacity whatsoever? Then what do you do when I insert another element? Allocate a new array with 3 * WORD * 101 memory and copy all data there (and write the new element at the end)? That would be highly inefficient. So I don't believe you're honest about the memory usage.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2018-02-06 21:08: +
      +
      +

      The actual items are stored in a list which, like a list object, is slightly overallocated. Maybe the text in the blog post missed that and it should add a "k": the average is "100 * 12/7 + 3 * WORD * 100 * k" for an average value of k around 17/16. That's around 2700 instead of 2600.

      +
      +
      +
      + +

      Leysin Winter Sprint (20-28th February 2015)

      + +
      +

      The next PyPy sprint will be in Leysin, Switzerland, for the tenth time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.

      + +

      Goals and topics of the sprint

      + +

      The details depend on who is here and ready to work. We might touch +topics such as:

      + +

      +
        +
      • cleaning up the optimization step in the JIT, change the register +allocation done by the JIT's backend, or improvements to the +warm-up time + +

        +
      • +
      • STM (Software Transaction Memory), notably: try to come up with +benchmarks, and measure them carefully in order to test and improve +the conflict reporting tools, and more generally to figure out how +practical it is in large projects to avoid conflicts + +

        +
      • +
      • vmprof - a statistical profiler for CPython and PyPy work, including +making it more user friendly. + +

        +
      • +
      • Py3k (Python 3.x support), NumPyPy (the numpy module) + +

        +
      • +
      • +added: cffi 1.0, trying out pygame+cffi on Raspberry Pi devices + +
      • +
      • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off for skiing. +
      • +
      +

      Exact times

      + +

      For a change, and as an attempt to simplify things, I specified the +dates as 20-28 February 2015, where 20 and 28 are travel days. We will +work full days between the 21st and the 27th. You are of course allowed to +show up for a part of that time only, too.

      + +

      Location and Accommodation

      + +

      Leysin, Switzerland, "same place as before". Let me refresh your +memory: both the sprint venue and the lodging will be in a very spacious +pair of chalets built specifically for bed & breakfast: +Ermina. The place has a good ADSL Internet connection +with wireless installed. You can of course arrange your own lodging +anywhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue), but I definitely recommend +lodging there too -- you won't find a better view anywhere else (though +you probably won't get much worse ones easily, either :-)

      + +

      Please confirm that you are coming so that we can adjust the +reservations as appropriate. In the past, the rates were around 60 CHF a +night all included in 2-person rooms, with breakfast. Now, the rooms +available are either single-person (or couple), or rooms for 3 persons. +The latter choice is recommended and should be under 60 CHF per person.

      + +

      Please register by Mercurial, or on the pypy-dev mailing list if you do not yet have check-in rights.

      + +

      You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around, and at least one EU-format power strip.

      +
      +
      +
      +
      + + Unknown wrote on 2015-01-16 21:07: +
      +
      +

      Hi,

      During this sprint, ss it plan to work on yield form syntax, or more generally, Python 3.3 support ?

      I'm very interested to test PyPy with AsyncIO.

      Regards

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-01-17 10:21: +
      +
      +

      @Ludovic, we don't have precise plans. If there is someone also interested in Python 3, then yes, this kind of work would be nice. (Note that I see some tests about "yield from" in the py3.3 branch, which may mean that it was implemented already.)

      +
      +
      +
      +
      + + Anonymous wrote on 2015-01-24 07:38: +
      +
      +

      Great news. Thanks! PyPy 3

      +
      +
      +
      + +

      September donations and thank you to the Python Software Foundation!

      + +
      +
      + +

      Hello everyone!

      +

      We would like to show you a short update on the PyPy funding. +We gathered a total of $15,986 in the month of September and as per +earlier agreement, the Python Software Foundation donated $10,000 +to PyPy. We would like to thank everyone participating and the PSF in +particular for supporting the PyPy project and making our work possible!

      +

      We've been working hard on the goals outlined in the funding proposals.

      +
        +
      • +PyPy Python 3 support has been in beta for a while and it's already +being used by many people, as seen per the number of reported bugs. +We're currently supporting 3.2, planning on moving towards 3.4 in the +future.
      • +
      • Software Transactional Memory has been a successful research project, +with first real world results shown during the Warsaw sprint.
      • +
      • More detailed update on numpy will be published soon. A little spoiler is +that we're planning on addressing matplotlib, scipy and the larger ecosystem +to some extent. Stay tuned!
      • +
      +

      Again, thanks to everyone who donated and happy Thanksgiving to everyone +on that side of the world!

      +

      Cheers,
      +fijal and the entire PyPy team

      + +
      +
      +
      +
      +
      +
      + + Alessandro wrote on 2014-11-29 02:53: +
      +
      +

      Fantastic work!

      I'm a Python 3 user, as such the PyPy3 was great for me!

      And good news for Numpypy, it would indeed be awesome for supporting the numeric ecosystem.

      +
      +
      +
      +
      + + Anonymous wrote on 2014-11-30 08:31: +
      +
      +

      It would be amazing if pypy supported numpy and matplotlib!!

      +
      +
      +
      +
      + + Anonymous wrote on 2014-12-04 12:35: +
      +
      +

      This is great news! I've been waiting for scipy and matplotlib for a while, now it's finally on the roadmap.

      +
      +
      +
      +
      + + Anonymous wrote on 2015-01-06 12:37: +
      +
      +

      Any news on the Numpy update?

      +
      +
      +
      + +

      Tornado without a GIL on PyPy STM

      + +
      +

      This post is by Konstantin Lopuhin, who tried PyPy STM during the +Warsaw sprint.

      +

      Python has a GIL, right? Not quite - PyPy STM is a python implementation +without a GIL, so it can scale CPU-bound work to several cores. +PyPy STM is developed by Armin Rigo and Remi Meier, +and supported by community donations. +You can read more about it in the +docs.

      +

      Although PyPy STM is still a work in progress, in many cases it can already +run CPU-bound code faster than regular PyPy, when using multiple cores. +Here we will see how to slightly modify the Tornado IO loop to use the +transaction +module. +This module is described +in the docs and is really simple to use - please see an example there. +An event loop of Tornado, or any other asynchronous +web server, looks like this (with some simplifications):

      +
      +while True:
      +    for callback in list(self._callbacks):
      +        self._run_callback(callback)
      +    event_pairs = self._impl.poll()
      +    self._events.update(event_pairs)
      +    while self._events:
      +        fd, events = self._events.popitem()
      +        handler = self._handlers[fd]
      +        self._handle_event(fd, handler, events)
      +
      +

      We get IO events, and run handlers for all of them; these handlers can +also register new callbacks, which we run too. When using such a framework, +it is very nice to have a guarantee that all handlers are run serially, +so you do not have to put any locks. This is an ideal case for the +transaction module - it gives us guarantees that things appear +to be run serially, so in user code we do not need any locks. We just +need to change the code above to something like:

      +
      +while True:
      +    for callback in list(self._callbacks):
      +        transaction.add(                # added
      +            self._run_callback, callback)
      +    transaction.run()                   # added
      +    event_pairs = self._impl.poll()
      +    self._events.update(event_pairs)
      +    while self._events:
      +        fd, events = self._events.popitem()
      +        handler = self._handlers[fd]
      +        transaction.add(                # added
      +            self._handle_event, fd, handler, events)
      +    transaction.run()                   # added
      +
      +

      The actual commit is +here, +- we had to extract a little function to run the callback.

      +
      +

      Part 1: a simple benchmark: primes

      +

      Now we need a simple benchmark, let's start with +this +- just calculate a list of primes up to the given number, and return it +as JSON:

      +
      +def is_prime(n):
      +    for i in xrange(2, n):
      +        if n % i == 0:
      +            return False
      +    return True
      +
      +class MainHandler(tornado.web.RequestHandler):
      +    def get(self, num):
      +        num = int(num)
      +        primes = [n for n in xrange(2, num + 1) if is_prime(n)]
      +        self.write({'primes': primes})
      +
      +

      We can benchmark it with siege:

      +
      +siege -c 50 -t 20s https://localhost:8888/10000
      +
      +

      But this does not scale. The CPU load is at 101-104 %, and we handle 30 % +fewer requests per second. The reason for the slowdown is STM overhead, +which needs to keep track of all writes and reads in order to detect conflicts. +And the reason for using only one core is, obviously, conflicts! +Fortunately, we can see what these conflicts are, if we run code like this +(here 4 is the number of cores to use):

      +
      +PYPYSTM=stm.log ./primes.py 4
      +
      +

      Then we can use print_stm_log.py +to analyse this log. It lists the most expensive conflicts:

      +
      +14.793s lost in aborts, 0.000s paused (1258x STM_CONTENTION_INEVITABLE)
      +File "/home/ubuntu/tornado-stm/tornado/tornado/httpserver.py", line 455, in __init__
      +    self._start_time = time.time()
      +File "/home/ubuntu/tornado-stm/tornado/tornado/httpserver.py", line 455, in __init__
      +    self._start_time = time.time()
      +...
      +
      +

      There are only three kinds of conflicts; they are described in +stm source. +Here we see that two threads call into an external function to get the current time, +and we cannot roll back any of them, so one of them must wait till the other +transaction finishes. +For now we can hack around this by disabling this timing - this is only +needed for internal profiling in tornado.

      +

      If we do it, we get the following results (but see caveats below):

      + + + + + +
      + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Impl. | req/s
      PyPy 2.4 | 14.4
      CPython 2.7 | 3.2
      PyPy-STM 1 | 9.3
      PyPy-STM 2 | 16.4
      PyPy-STM 3 | 20.4
      PyPy STM 4 | 24.2
      +
          + +
      +

      As we can see, in this benchmark PyPy STM using just two cores +can beat regular PyPy! +This is not linear scaling, there are still conflicts left, and this +is a very simple example but still, it works!

      +

      But it's not that simple yet :)

      +

      First, these are best-case numbers after long (much longer than for regular +PyPy) warmup. Second, it can sometimes crash (although removing old pyc files +fixes it). Third, benchmark meta-parameters are also tuned.

      +

      Here we get relatively good results only when there are a lot of concurrent +clients - as a result, a lot of requests pile up, the server is not keeping up +with the load, and the transaction module is busy with work running these piled-up +requests. If we decrease the number of concurrent clients, results get slightly worse. +Another thing we can tune is how heavy each request is - again, if we ask for +primes up to a lower number, then less time is spent doing calculations, +more time is spent in tornado, and results get much worse.

      +

      Besides the time.time() conflict described above, there are a lot of others. +The bulk of time is lost in these two conflicts:

      +
      +14.153s lost in aborts, 0.000s paused (270x STM_CONTENTION_INEVITABLE)
      +File "/home/ubuntu/tornado-stm/tornado/tornado/web.py", line 1082, in compute_etag
      +    hasher = hashlib.sha1()
      +File "/home/ubuntu/tornado-stm/tornado/tornado/web.py", line 1082, in compute_etag
      +    hasher = hashlib.sha1()
      +
      +13.484s lost in aborts, 0.000s paused (130x STM_CONTENTION_WRITE_READ)
      +File "/home/ubuntu/pypy/lib_pypy/transaction.py", line 164, in _run_thread
      +    got_exception)
      +
      +

      The first one is presumably calling into some C function from stdlib, and we get +the same conflict as for time.time() above, but it can be fixed on the PyPy +side, as we can be sure that computing sha1 is pure.

      +

      It is easy to hack around this one too, just removing etag support, but if +we do it, performance is much worse, only slightly faster than regular PyPy, +with the top conflict being:

      +
      +83.066s lost in aborts, 0.000s paused (459x STM_CONTENTION_WRITE_WRITE)
      +File "/home/arigo/hg/pypy/stmgc-c7/lib-python/2.7/_weakrefset.py", line 70, in __contains__
      +File "/home/arigo/hg/pypy/stmgc-c7/lib-python/2.7/_weakrefset.py", line 70, in __contains__
      +
      +

      Comment by Armin: It is unclear why this happens so far. We'll investigate...

      +

      The second conflict (without etag tweaks) originates +in the transaction module, from this piece of code:

      +
      +while True:
      +    self._do_it(self._grab_next_thing_to_do(tloc_pending),
      +                got_exception)
      +    counter[0] += 1
      +
      +

      Comment by Armin: This is a conflict in the transaction module itself; ideally, +it shouldn't have any, but in order to do that we might need a little bit +of support from RPython or C code. So this is pending improvement.

      +

      Tornado modification used in this blog post is based on 3.2.dev2. +As of now, the latest version is 4.0.2, and if we +apply +the same changes to this version, then we no longer get any scaling on this benchmark, +and there are no conflicts that take any substantial time.

      +

      Comment by Armin: There are two possible reactions to a conflict. We can either +abort one of the two threads, or (depending on the circumstances) just +pause the current thread until the other one commits, after which the +thread will likely be able to continue. The tool ``print_stm_log.py`` +did not report conflicts that cause pauses. It has been fixed very +recently. Chances are that on this test it would report long pauses and +point to locations that cause them.

      +
      +
      +

      Part 2: a more interesting benchmark: A-star

      +

      Although we have seen that PyPy STM is not all moonlight and roses, +it is interesting to see how it works on a more realistic application.

      +

      astar.py +is a simple game where several players move on a map +(represented as a list of lists of integers), +build and destroy walls, and ask the server to give them +shortest paths between two points +using A-star search, adapted from an ActiveState recipe.

      +

      The benchmark bench_astar.py +is simulating players, and tries to put the main load on A-star search, +but also does some wall building and destruction. There are no locks +around map modifications, as normal tornado is executing all callbacks +serially, and we can keep this guarantee with atomic blocks of PyPy STM. +This is also an example of a program that is not trivial +to scale to multiple cores with separate processes (assuming +more interesting shared state and logic).

      +

      This benchmark is very noisy due to the randomness of client interactions +(it may also be non-linear), so only lower and upper bounds for the +number of requests are reported:

      + ++++ + + + + + + + + + + + + + + + + + + + + + + +
      Impl.req/s
      PyPy 2.45 .. 7
      CPython 2.70.5 .. 0.9
      PyPy-STM 12 .. 4
      PyPy STM 42 .. 6
      +

      Clearly this is a very bad benchmark, but still we can see that scaling is worse +and STM overhead is sometimes higher. +The bulk of conflicts come from the transaction module (we have seen it +above):

      +
      +91.655s lost in aborts, 0.000s paused (249x STM_CONTENTION_WRITE_READ)
      +File "/home/ubuntu/pypy/lib_pypy/transaction.py", line 164, in _run_thread
      +    got_exception)
      +
      +

      Although it is definitely not ready for production use, you can already try +to run things, report bugs, and see what is missing in user-facing tools +and libraries.

      +

      Benchmarks setup:

      + +
      +
      +
      +
      +
      + + Anonymous wrote on 2014-11-18 07:00: +
      +
      +

      "Clearly this is a very benchmark" - looks like you've missed a word here ;)

      +
      +
      +
      +
      + + crusaderky wrote on 2014-11-19 00:01: +
      +
      +

      in bench_astar.py, you are doing the following queries:
      - try to move: 85%
      - build a wall: 10.5% [(1-.85)*.7]
      - erase something: 0.45% [(1-.85)*(1-.7)*.1]
      - show map: 4.05% [(1-.85)*(1-.7)*(1-.1)]

      I doubt that's intentional.... :P

      +
      +
      +
      +
      + + crusaderky wrote on 2014-11-19 01:01: +
      +
      +

      Correct me if I misunderstood the theory of PyPy-STM, but in the A* test there's nothing that prevents a get() to read the game map while MapChangeHandler.put() is running (that is, while the system is in an incoherent status)?

      Shouldn't MapChangeHandler.put() be wrapped in a exclusive write lock, and all the get() handlers be wrapped with a shared read lock?

      +
      +
      +
      +
      + + Konstantin Lopuhin wrote on 2014-11-19 20:45: +
      +
      +

      > Clearly this is a very benchmark" - looks like you've missed a word here ;)

      Oh, yes, that word is "bad" :)

      > Shouldn't MapChangeHandler.put() be wrapped in a exclusive write lock, and all the get() handlers be wrapped with a shared read lock?

      Here all request handlers are already wrapped inside atomic blocks, but this is hidden from us in (modified) tornado. So we do not need any locks (as in normal tornado too, because normal tornado is single threaded). If request handlers conflict, then we just lose performance, not correctness. This is one of the main points of PyPy STM: it can support multithreaded code without needing to use locks.

      Regarding the probabilities: yes, that's not quite intentional)

      +
      +
      +
      + +

      PyPy IO improvements

      + +
      +
      + +
      +
      +

      Hello everyone!

      +

      We've wrapped up the Warsaw sprint, so I would like to describe some +branches which have been recently merged and which improved the I/O and the +GC: gc_no_cleanup_nursery and gc-incminimark-pinning.

      +

      The first branch was started by Wenzhu Man for her Google Summer of Code +and finished by Maciej Fijałkowski and Armin Rigo. +The PyPy GC works by allocating new objects in the young object +area (the nursery), simply by incrementing a pointer. After each minor +collection, the nursery has to be cleaned up. For simplicity, the GC used +to do it by zeroing the whole nursery.

      +

      This approach has bad effects on the cache, since you zero a large piece of +memory at once and do unnecessary work for things that don't require zeroing +like large strings. We mitigated the first problem somewhat with incremental +nursery zeroing, but this branch removes the zeroing completely, thus +improving the string handling and recursive code (since jitframes don't +require zeroed memory either). I measured the effect on two examples: +a recursive implementation of fibonacci and gcbench, +to measure GC performance.

      +

      The results for fibonacci and gcbench are below (normalized to cpython +2.7). Benchmarks were run 50 times each (note that the big standard +deviation comes mostly from the warmup at the beginning, true figures +are smaller):

      +

      +

      + ++++++ + + + + + + + + + + + + + + + + + + + + +
      benchmarkCPythonPyPy 2.4PyPy non-zero
      fibonacci4.8+-0.15 (1.0x)0.59+-0.07 (8.1x)0.45+-0.07 (10.6x)
      gcbench22+-0.36 (1.0x)1.34+-0.28 (16.4x)1.02+-0.15 (21.6x)
      +

      The second branch was done by Gregor Wegberg for his master's thesis and finished +by Maciej Fijałkowski and Armin Rigo. Because of the way it works, the PyPy GC from +time to time moves the objects in memory, meaning that their address can change. +Therefore, if you want to pass pointers to some external C function (for +example, write(2) or read(2)), you need to ensure that the objects they are +pointing to will not be moved by the GC (e.g. when running a different thread). +PyPy up to 2.4 solves the problem by copying the data into or from a non-movable buffer, which +is obviously inefficient. +The branch introduces the concept of "pinning", which allows us to inform the +GC that it is not allowed to move a certain object for a short period of time. +This introduces a bit of extra complexity +in the garbage collector, but improves the I/O performance quite drastically, +because we no longer need the extra copy to and from the non-movable buffers.

      +

      In this benchmark, which does I/O in a loop, +we either write a number of bytes from a freshly allocated string into +/dev/null or read a number of bytes from /dev/full. I'm showing the results +for PyPy 2.4, PyPy with non-zero-nursery and PyPy with non-zero-nursery and +object pinning. Those are wall times for cases using os.read/os.write +and file.read/file.write, normalized against CPython 2.7.

      +

      Benchmarks were done using PyPy 2.4 and revisions 85646d1d07fb for +non-zero-nursery and 3d8fe96dc4d9 for non-zero-nursery and pinning. +The benchmarks were run once, since the standard deviation was small.

      +

      + +

      +
      +

      The Y axis is speed, normalized to CPython, the more the better

      + +

      What we can see is that os.read and os.write both improved greatly +and outperform CPython now for each combination. file operations are +a little more tricky, and while those branches improved the situation a bit, +the improvement is not as drastic as in os versions. It really should not +be the case and it showcases how our file buffering is inferior to CPython. +We plan on removing our own buffering and using FILE* in C in the near future, +so we should outperform CPython on those too (since our allocations are cheaper). +If you look carefully in the benchmark, the write function is copied three times. +This hack is intended to avoid JIT overspecializing the assembler code, which happens +because the buffering code was written way before the JIT was done. In fact, our buffering +is hilariously bad, but if stars align correctly it can be JIT-compiled to something +that's not half bad. Try removing the hack and seeing how the performance of the last +benchmark drops :-) Again, this hack should be absolutely unnecessary once we remove +our own buffering, stay tuned for more.

      +

      Cheers,
      +fijal

      +
      +
      +
      +
      + + Yichao Yu wrote on 2014-11-05 18:32: +
      +
      +

      Sounds great!!!

      Just wondering, will the pin-memory also improves the situation when passing strings/other buffers to c functions (e.g. via cffi)?

      +
      +
      +
      +
      + + Anonymous wrote on 2014-11-05 21:54: +
      +
      +

      Hey,

      In your benchmark, the following loop:
      for i in range(num):
      os.write(fd, " " * num2)

      Is not hoisted out by CPython (whereas I guess PyPy does hoist it).
      Which means that the buffer written is basically allocated/freed upon each loop.

      If you want to measure pure I/O performance (so let's say a zero-copy setting), it should be hoisted manually out of the loop for CPython, like this:

      payload = b" " * num2
      for i in range(num):
      os.write(fd, payload)

      Then, the results go from:

      fwrite 100 bytes, 1.93us per write
      fwrite 1000 bytes, 2.57us per write
      fwrite 10000 bytes, 6.73us per write
      file_write 100 bytes, 0.99us per write
      file_write 1000 bytes, 1.68us per write
      file_write 10000 bytes, 4.71us per write


      to

      fwrite 100 bytes, 1.38us per write
      fwrite 1000 bytes, 1.48us per write
      fwrite 10000 bytes, 1.38us per write
      file_write 100 bytes, 0.65us per write
      file_write 1000 bytes, 0.96us per write
      file_write 10000 bytes, 2.32us per write

      Also, might be worth trying with binary mode.

      Anyway, keep up the great work!

      +
      +
      +
      +
      + + Maciej Fijalkowski wrote on 2014-11-06 06:10: +
      +
      +

      PyPy does not hoist the buffer allocation here. The benchmark specifically allocated/frees the buffer every loop, since we want the object written fresh (otherwise pinning is not needed), but also we think that writing a new object (as opposed to the constant buffer) is really more of a common case. Yes, you get an overhead of allocation measured too, but the case here is that we wanted to measure the IO of fresh objects, not old ones

      +
      +
      +
      + +

      PyPy3 2.4.0 released

      + +
      +
      We're pleased to announce the availability of PyPy3 2.4.0!

      +This release contains several bugfixes and enhancements. Among the user-facing improvements specific to PyPy3:
        +
      • Better Windows compatibility, e.g. the nt module functions _getfinalpathname & _getfileinformation are now supported (the former is required for the popular pathlib library for example)
      • +
      • Various fsencode PEP 383 related fixes to the posix module (readlink, uname, ttyname and ctermid) and improved locale handling
      • +
      • Switched the default binary name on POSIX distributions from 'pypy' to 'pypy3' (which symlinks to 'pypy3.2')
      • +
      • Fixed a couple different crashes related to parsing Python 3 source code
      • +
      +
      +And improvements shared with the recent PyPy 2.4.0 release:
        +
      • internal refactoring in string and GIL handling which led to significant speedups
      • +
      • improved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. In simpler terms - we closed what looked like a memory leak
      • +
      • Windows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i
      • +
      • Many issues were resolved since the 2.3.1 release in June
      • +
      +
      +You can download PyPy3 2.4.0 here https://pypy.org/download.html.

      PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and 3.2.5. It's fast (pypy 2.4 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

      +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux. 
      +We would like to thank our donors for the continued support of the PyPy project.

      +The complete release notice is here.

      +Please try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!

      +Cheers, The PyPy Team

      +
      +
      +
      +
      +
      + + Unknown wrote on 2014-10-22 14:39: +
      +
      +

      Great news. Thanks!

      +
      +
      +
      +
      + + Anonymous wrote on 2014-10-28 13:15: +
      +
      +

      Great work, thanks!

      +
      +
      +
      +
      + + Unknown wrote on 2014-10-30 14:46: +
      +
      +

      That’s great - thanks!

      And the portable release directly works for my keyboard evolution! (it’s roughly 2.5x faster than cPython).

      +
      +
      +
      +
      + + Unknown wrote on 2014-10-30 14:58: +
      +
      +

      Correction: After some warmup time, pypy is more than 2.8x faster than cPython.

      +
      +
      +
      + +

      Couchbase contribution to PyPy

      + +
      +
      +

      Hello everyone!

      +

      We always offer to put on the blog info about our sponsors who donate substantial amounts of money. So far most people decided to stay anonymous, so this is the first blog post describing our sponsor and his relationship to PyPy, hopefully not the last. We'll also publish a full blog post about the PSF-matched fundraiser soon. This is a guest post by Brent Woodruff from Couchbase.

      +

      +

      +
      + +

      +Couchbase is a leading NoSQL document database that provides a flexible data model, high performance, scalability, and high availability. Couchbase is a commercially supported open source project. Visit us at https://www.couchbase.com and https://github.com/couchbase. +

      +

      +Couchbase Inc. donated $2000.00, and employees of Couchbase personally contributed a disclosed additional $230.00, towards Pypy progress during the September funding drive. These funds will see a match from the Python Software Foundation. +

      +

      +Pypy is primarily used by Couchbase employees to perform product analysis and troubleshooting using internally developed tools. Every customer of Couchbase benefits from the use of Pypy; both due to the rapid development provided by Python, and the speed of the resulting tools provided by the Pypy JIT interpreter. +

      +

      +“PyPy is great - it gave us a 4x speedup in our CPU-intensive internal application over CPython” +-Dave Rigby and Daniel Owen, Couchbase Engineers +

      +

      +Additionally, Couchbase has a preliminary CFFI based Couchbase client available for Pypy users. +

      + +
      +
      +
      +
      +
      +
      + + Unknown wrote on 2014-10-14 22:42: +
      +
      +

      Definitely wouldn't have thought to put PyPy and Couchbase in the same sentence, but this is very good of them! Glad to see the support.

      +
      +
      +
      +
      + + Anonymous wrote on 2014-10-15 09:34: +
      +
      +

      Thanks for the donation. Could you give a bit more detail of how hard it was to make your code compatible with PyPy?

      +
      +
      +
      +
      + + Anonymous wrote on 2014-10-15 13:28: +
      +
      +

      Hello from Couchbase. With regards to making our code compatible with PyPy, I can only comment on our internal tooling. Those are currently all pure Python, so it was trivial. We used modules that work with PyPy already: namely pyparsing, LEPL, and tornado. The tools all run under both CPython and PyPy unmodified.

      +
      +
      +
      + +

      PyPy 2.4.0 released, 9 days left in funding drive

      + +
      +
      +We're pleased to announce the availability of PyPy 2.4.0; faster, fewer bugs, and updated to the python 2.7.8 stdlib.

      +This release contains several bugfixes and enhancements. Among the user-facing improvements:
        +
      • internal refactoring in string and GIL handling which led to significant speedups
      • +
      • improved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. In simpler terms - we closed what looked like a memory leak
      • +
      • Windows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i
      • +
      • Many issues were resolved since the 2.3.1 release in June
      • +
      +
      +You can download PyPy 2.4.0 here https://pypy.org/download.html.

      +We would like to also point out that in September, the Python Software Foundation will match funds for any donations up to $10k, so head over to our website and help this mostly-volunteer effort out.

      PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and 3.2.5. It's fast (pypy 2.4 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

      +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux. 
      +We would like to thank our donors for the continued support of the PyPy project.

      +The complete release notice is here.

      +Please try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!

      +Cheers, The PyPy Team

      +
      +
      +
      +
      +
      + + Unknown wrote on 2014-10-13 18:32: +
      +
      +

      How did the funding drive work out?

      +
      +
      +
      + +

      PyPy 2.4-beta just in time for PSF's funding drive

      + +
      +
      +We're pleased to announce the availability of PyPy 2.4-beta1; faster, fewer bugs, and updated to the python 2.7.8 stdlib.

      +This release contains several bugfixes and enhancements. Among the user-facing improvements:
        +
      • internal refactoring in string and GIL handling which led to significant speedups
      • +
      • improved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. In simpler terms - we closed what looked like a memory leak
      • +
      • Windows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i
      • +
      • Many issues were resolved since the 2.3.1 release in June
      • +
      +
      +You can download the PyPy 2.4-beta1 release here https://pypy.org/download.html.

      +We would like to also point out that in +September, the Python Software Foundation will match funds for +any donations up to $10k, so head over to our website and help this mostly-volunteer effort out.

      PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and 3.2.5. It's fast (pypy 2.4 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

      +This + release supports x86 machines running Linux 32/64, Mac OS X 64, +Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, +with VFPv3) running Linux. 
      +We would like to thank our donors for the continued support of the PyPy project.

      +The complete release notice is here.

      +Please + try it out and let us know what you think. We especially welcome +success stories, please tell us about how it has helped you!

      +Cheers, The PyPy Team

      +News Flash from the beta release cycle:
        +
      • Note that the beta release mistakenly identifies itself in sys.pypy_version_info as releaselevel=='final', please do not mistake this for a final version
      • +
      • The beta can hit an "Illegal instruction" exception in jitted code on ARMv6 processors like the RaspberryPi. This will be fixed for the release.
      • +
      +
      +
      +
      +
      +
      +
      +
      +
      + + Unknown wrote on 2014-09-09 13:11: +
      +
      +

      Short testing note:

      ./pypy: error while loading shared libraries: libtinfo.so.5: cannot open shared object file: No such file or directory

      64 bit Linux version tested on Gentoo GNU/Linux.

      +
      +
      +
      +
      + + Unknown wrote on 2014-09-09 13:14: +
      +
      +

      ah, found it: https://github.com/squeaky-pl/portable-pypy

      +
      +
      +
      +
      + + Unknown wrote on 2014-09-12 13:51: +
      +
      +

      Is there a chance to get pylab/matplotlib running with pypy as you showed in 2011?

      +
      +
      +
      +
      + + Unknown wrote on 2014-09-15 22:12: +
      +
      +

      I just had a very, very good experience with pypy:

      https://github.com/ArneBab/freenet-fundraising/blob/master/slides.org#routing-simulation

      with cpython it needs a day for a simulation with 100k nodes. In pypy it needs a few minutes!

      +
      +
      +
      +
      + + Carlo Pires wrote on 2014-09-27 20:41: +
      +
      +

      Still waiting support for Python3.4.1 to test my "big" application.

      +
      +
      +
      + +

      Python Software Foundation Matching Donations this Month

      + +
      +

      We're extremely excited to announce that for the month of September, any amount
      +you donate to PyPy will be matched (up to $10,000) by the Python Software
      +Foundation
      .

      +

      This includes any of our ongoing fundraisers: NumPyPy, STM, Python3, or our
      +general fundraising.

      +

      Here are some of the things your previous donations have helped accomplish:

      +
        +
      • Getting PyPy3 completed (currently 3.2, with 3.3 work underway)
      • +
      • New research and production engineering on STM for PyPy
      • +
      • Lots of progress on NumPy for PyPy
      • +
      • Significant performance improvements
      • +
      +

      You can see a preview of what's coming in our next 2.4 release in the draft
      +release notes
      .

      +

      Thank you to all the individuals and companies which have donated so far.

      +

      So please, donate today: https://pypy.org/

      +

      (Please be aware that the donation progress bars are not live updating, so
      +don't be afraid if your donation doesn't show up immediately).

      +
      +
      +
      +
      + + Unknown wrote on 2014-09-02 08:51: +
      +
      +

      aaand donated ☺

      Thank you, Python Software Foundation!

      +
      +
      +
      +
      + + Anonymous wrote on 2014-09-04 10:57: +
      +
      +

      I think you should be careful about your claims for numpy. It's a great idea and I am sure lots of people would be very interested in anything you do but I for one see very little progress on it.

      +
      +
      +
      +
      + + handsomegui wrote on 2014-09-05 05:59: +
      +
      +

      It would be nice to have a bitcoin donation address for donation.

      +
      +
      +
      +
      + + Unknown wrote on 2014-09-05 13:30: +
      +
      +

      Donated!

      +
      +
      +
      +
      + + Canesin wrote on 2014-09-05 16:00: +
      +
      +

      +1 on the bitcoin address for donation

      +
      +
      +
      +
      + + L. Simon wrote on 2014-09-05 20:32: +
      +
      +

      Consider me another request for a Bitcoin address. I'm in for a few millibits if you provide one.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2014-09-06 16:47: +
      +
      +

      Sorry for the bitcoin requests... setting up a new payment system just for a few millibits is not worth it at all.

      +
      +
      +
      + +
      +
      + +
      +
      +
      + +
      + + + + \ No newline at end of file diff --git a/blog/index-33.html b/blog/index-33.html new file mode 100644 index 000000000..b498f2d20 --- /dev/null +++ b/blog/index-33.html @@ -0,0 +1,2338 @@ + + + + + + +PyPy (old posts, page 33) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
      +
      +
      +

      PyPy 2.6.0 release

      + +
      +
      +
      +
      +

      +PyPy 2.6.0 - Cameo Charm

      +
      +We’re pleased to announce PyPy 2.6.0, only two months after PyPy 2.5.1. We are particularly happy to update cffi to version 1.1, which makes the popular ctypes-alternative even easier to use, and to support the new vmprof statistical profiler.
      +
      +
      +
      +You can download the PyPy 2.6.0 release here:
      +
      +
      +
      + +
      +
      +
      +
      +We would like to thank our donors for the continued support of the PyPy project, and for those who donate to our three sub-projects, as well as our volunteers and contributors.
      +
      +
      +
      +Thanks also to Yury V. Zaytsev and David Wilson who recently started running nightly builds on Windows and MacOSX buildbots.
      +
      +
      +
      +We’ve shown quite a bit of progress, but we’re slowly running out of funds. Please consider donating more, or even better convince your employer to donate, so we can finish those projects! The three sub-projects are:
      +
      +
      +
        +
      • +Py3k (supporting Python 3.x): We have released a Python 3.2.5 compatible version we call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version
      • +
      • +STM (software transactional memory): We have released a first working version, and continue to try out new promising paths of achieving a fast multithreaded Python
      • +
      • +NumPy which requires installation of our fork of upstream numpy, available on bitbucket +
      • +
      +
      +
      +
      +We would also like to encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better. Nine new people contributed since the last release, you too could be one of them.
      +
      +

      +What is PyPy?

      +
      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
      +
      +
      +
      +This release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows, OpenBSD, freebsd), as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.
      +
      +
      +
      +While we support 32 bit python on Windows, work on the native Windows 64 bit python is still stalling, we would welcome a volunteer to handle that. We also welcome developers with other operating systems or dynamic languages to see what RPython can do for them.
      +
      +
      +
      +
      +

      +Highlights

      +
        +
      • Python compatibility:
          +
        • Improve support for TLS 1.1 and 1.2
        • +
        • Windows downloads now package a pypyw.exe in addition to pypy.exe
        • +
        • Support for the PYTHONOPTIMIZE environment variable (impacting builtin’s __debug__ property)
        • +
        • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy.
        • +
        +
      • +
      • New features:
          +
        • Add preliminary support for a new lightweight statistical profiler vmprof, which has been designed to accommodate profiling JITted code
        • +
        +
      • +
      • Numpy:
          +
        • Support for object dtype via a garbage collector hook
        • +
        • Support for .can_cast and .min_scalar_type as well as beginning a refactoring of the internal casting rules
        • +
        • Better support for subtypes, via the __array_interface__, __array_priority__, and __array_wrap__ methods (still a work-in-progress)
        • +
        • Better support for ndarray.flags
        • +
        +
      • +
      • Performance improvements:
          +
        • Slight improvement in frame sizes, improving some benchmarks
        • +
        • Internal refactoring and cleanups leading to improved JIT performance
        • +
        +
          +
        • Improved IO performance of zlib and bz2 modules
        • +
        • We continue to improve the JIT’s optimizations. Our benchmark suite is now over 7 times faster than cpython
        • +
        +
      • +
      +
      +
      +
      +
      +Please try it out and let us know what you think. We welcome success stories, experiments, or benchmarks, we know you are using PyPy, please tell us about it!
      +Cheers
      +The PyPy Team
      +
      +
      +

      +
      +
      +
      +
      +
      +
      +
      +
      + + mattip wrote on 2015-06-01 16:32: +
      +
      +

      PyPy 2.6.0 - Cameo Charm since PyPy looks best in profile (well, vmprof anyway)

      +
      +
      +
      +
      + + Anonymous wrote on 2015-06-01 17:57: +
      +
      +

      How is matplotlib state in numpypy ?

      +
      +
      +
      +
      + + mattip wrote on 2015-06-02 10:51: +
      +
      +

      No GUI backend, but this fork should work (version 1.4) for non-interactive plotting
      https://github.com/mattip/matplotlib
      You will need to install our fork of numpy as a prerequisite
      https://bitbucket.org/pypy/numpy

      Help with the cffi port of WxPython could get us a GUI backend (or a updated matplotlib)
      https://doc.pypy.org/en/latest/project-ideas.html#make-more-python-modules-pypy-friendly

      +
      +
      +
      +
      + + Anonymous wrote on 2015-06-02 12:07: +
      +
      +

      Thanks for the information

      +
      +
      +
      + +

      CFFI 1.0.1 released

      + +
      +

      CFFI 1.0.1 final has now been released for CPython! CFFI is a (CPython and PyPy) module to interact with C code from Python.

      +

      The main news from CFFI 0.9 is the new way to build extension modules: +the "out-of-line" mode, where you have a separate build script. When +this script is executed, it produces the extension module. This comes +with associated Setuptools support that fixes the headache of +distributing your own CFFI-using packages. It also massively cuts +down the import times.

      +

      Although this is a major new version, it should be fully +backward-compatible: existing projects should continue to work, in +what is now called the "in-line mode".

      +

      The documentation has been reorganized and split into a few pages. +For more information about this new "out-of-line" mode, as well as +more general information about what CFFI is and how to use it, read the Goals and proceed to +the Overview.

      +

      Unlike the 1.0 beta 1 version, the final version also supports the +out-of-line mode for projects using ffi.dlopen(), instead of only +ffi.verify().

      +

      PyPy support: PyPy needs integrated support for efficient JITting, +so you cannot install a different version of CFFI on top of an +existing PyPy. You need to wait for the upcoming PyPy 2.6 to use +CFFI 1.0---or get a nightly build.

      +

      My thanks again to the PSF (Python Software Foundation) for their +financial support!

      + +UPDATE:

      Bug with the first example "ABI out-of-line": variadic functions (like printf, ending in a "..." argument) crash. Fixed in CFFI 1.0.2.

      +
      +
      +
      +
      + + Unknown wrote on 2015-05-22 17:21: +
      +
      +

      it's really great!

      +
      +
      +
      +
      + + Unknown wrote on 2015-05-22 23:32: +
      +
      +

      Awesome! Thanks for this. I think is the best way to make extension modules for cpython and pypy.

      +
      +
      +
      +
      + + Unknown wrote on 2015-05-22 23:33: +
      +
      +

      Awesome! Thanks for this. I think is the best way to make extension modules for cpython and pypy.

      +
      +
      +
      + +

      CFFI 1.0 beta 1

      + +
      +

      Finally! CFFI 1.0 is almost ready. CFFI gives Python developers a convenient way to call external C libraries. Here "Python" == "CPython or PyPy", but this post is mostly about the CPython side of CFFI, as the PyPy version is not ready yet.

      +

      On CPython, you can download the version +"1.0.0b1" either by looking for the cffi-1.0 branch in +the repository, or by +saying

      + +pip install "cffi>=1.0.dev0" + +

      (Until 1.0 final is ready, +pip install cffi will still give you version 0.9.2.)

      +

      The main news: you can now explicitly generate and compile a CPython C +extension module from a "build" script. Then in the rest of your +program or library, you no longer need to import cffi at all. +Instead, you simply say:

      +
      +from _my_custom_module import ffi, lib
      +
      +

      Then you use ffi and lib just like you did in your +verify()-based project in CFFI 0.9.2. (The lib is what used to +be the result of verify().) The details of how you use them +should not have changed at all, so that the rest of your program should +not need any update.

      +
      +

      Benefits

      +

      This is a big step towards standard practices for making and +distributing Python packages with C extension modules:

      +
        +
      • on the one hand, you need an explicit compilation step, triggered +here by running the "build" script;
      • +
      • on the other hand, what you gain in return is better control over +when and why the C compilation occurs, and more standard ways to write +distutils- or setuptools-based setup.py files (see below).
      • +
      +

      Additionally, this completely removes one of the main drawbacks of using +CFFI to interface with large C APIs: the start-up time. In some cases +it could be extreme on slow machines (cases of 10-20 seconds on ARM +boards occur commonly). Now, the import above is instantaneous.

      +

      In fact, none of the pure Python cffi package is needed any more at +runtime (it needs only an internal extension module from CFFI, which +can be installed by doing "pip install cffi-runtime" [*] if you only need that). +The ffi object you get by the import above is of a +completely different class written entirely in C. The two +implementations might get merged in the future; for now they are +independent, but give two compatible APIs. The differences are that +some methods like cdef() and verify() and set_source() are +omitted from the C version, because it is supposed to be a complete FFI +already; and other methods like new(), which take as parameter a +string describing a C type, are faster now because that string is parsed +using a custom small-subset-of-C parser, written in C too.

      +
      +
      +

      In practice

      +

      CFFI 1.0 beta 1 was tested on CPython 2.7 and 3.3/3.4, on Linux and to +some extent on Windows and OS/X. Its PyPy version is not ready yet, +and the only docs available so far are those below.

      +

      This is beta software, so there might be bugs and details may change. We are interested in hearing any feedback (irc.freenode.net #pypy) or bug reports.

      +

      To use the new features, create a source file that is not imported by the rest of +your project, in which you place (or move) the code to build the FFI +object:

      +
      +# foo_build.py
      +import cffi
      +ffi = cffi.FFI()
      +
      +ffi.cdef("""
      +    int printf(const char *format, ...);
      +""")
      +
      +ffi.set_source("_foo", """
      +    #include <stdio.h>
      +""")   # and other arguments like libraries=[...]
      +
      +if __name__ == '__main__':
      +    ffi.compile()
      +
      +

      The ffi.set_source() replaces the ffi.verify() of CFFI 0.9.2. +Calling it attaches the given source code to the ffi object, but this call doesn't +compile or return anything by itself. It may be placed above the ffi.cdef() +if you prefer. Its first argument is the name of the C extension module +that will be produced.

      +

      Actual compilation (including generating the complete C sources) occurs +later, in one of two places: either in ffi.compile(), shown above, +or indirectly from the setup.py, shown next.

      +

      If you directly execute the file foo_build.py above, it will +generate a local file _foo.c and compile it to _foo.so (or the +appropriate extension, like _foo.pyd on Windows). This is the +extension module that can be used in the rest of your program by saying +"from _foo import ffi, lib".

      +
      +
      +

      Distutils

      +

      If you want to distribute your program, you write a setup.py using +either distutils or setuptools. Using setuptools is generally +recommended nowadays, but using distutils is possible too. We show it +first:

      +
      +# setup.py
      +from distutils.core import setup
      +import foo_build
      +
      +setup(
      +    name="example",
      +    version="0.1",
      +    py_modules=["example"],
      +    ext_modules=[foo_build.ffi.distutils_extension()],
      +)
      +
      +

      This is similar to the CFFI 0.9.2 way. It only works if cffi was +installed previously, because otherwise foo_build cannot be +imported. The difference is that you use ffi.distutils_extension() +instead of ffi.verifier.get_extension(), because there is no longer +any verifier object if you use set_source().

      +
      +
      +

      Setuptools

      +

      The modern way is to write setup.py files based on setuptools, which +can (among lots of other things) handle dependencies. It is what you +normally get with pip install, too. Here is how you'd write it:

      +
      +# setup.py
      +from setuptools import setup
      +
      +setup(
      +    name="example",
      +    version="0.1",
      +    py_modules=["example"],
      +    setup_requires=["cffi>=1.0.dev0"],
      +    cffi_modules=["foo_build:ffi"],
      +    install_requires=["cffi-runtime"],    # see [*] below
      +)
      +
      +

      Note that "cffi" is mentioned on three lines here:

      +
        +
      • the first time is in setup_requires, which means that cffi will +be locally downloaded and used for the setup.
      • +
      • the second mention is a custom cffi_modules argument. This +argument is handled by cffi as soon as it is locally downloaded. It +should be a list of "module:ffi" strings, where the ffi part +is the name of the global variable in that module.
      • +
      • the third mention is in install_requires. It means that in +order to install this example package, "cffi-runtime" must also be +installed. This is (or will be) a PyPI entry that only contains a +trimmed down version of CFFI, one that does not include the pure +Python "cffi" package and its dependencies. None of it is needed at +runtime.
      • +
      +

      [*] NOTE: The "cffi-runtime" PyPI entry is not ready yet. For now, use "cffi>=1.0.dev0" instead. Considering PyPy, which has got a built-in "_cffi_backend" module, the "cffi-runtime" package could never be upgraded there; but it would still be nice if we were able to upgrade the "cffi" pure Python package on PyPy. This might require some extra care in writing the interaction code. We need to sort it out now...

      +
      +
      +

      Thanks

      +

      Special thanks go to the PSF (Python Software Foundation) for their +financial support, without which this work---er... it might likely have occurred anyway, but at an unknown future date :-)

      +

      (For reference, the amount I asked for (and got) is equal to one +month of what a Google Summer of Code student gets, for work that will +take a bit longer than one month. At least I personally am running mostly +on such money, and so I want to thank the PSF again for their +contribution to CFFI---and while I'm at it, thanks to all other +contributors to PyPy---for making this job more than an unpaid hobby on +the side :-)

      +

      Armin Rigo

      +
      +
      +
      +
      +
      + + Mahmoud wrote on 2015-05-05 20:59: +
      +
      +

      This is great news! We're loving using CFFI via cryptography and PyOpenSSL.

      +
      +
      +
      +
      + + Unknown wrote on 2015-05-05 21:37: +
      +
      +

      An easier way to install cffi 1.0 beta releases is with

      pip install --pre cffi

      The --pre flag indicates pre-releases are acceptable for installation.

      +
      +
      +
      +
      + + Unknown wrote on 2015-05-06 08:54: +
      +
      +

      That's great news! Hard to read though if you're not familiar with CFFI behaviour from before.

      +
      +
      +
      + +

      PyPy-STM 2.5.1 released

      + +
      +

      PyPy-STM 2.5.1 - Mawhrin-Skel

      + +

      We're pleased to announce PyPy-STM 2.5.1, codenamed Mawhrin-Skel. +This is the second official release of PyPy-STM. You can download +this release here (64-bit Linux only):

      +
      +https://pypy.org/download.html +
      +

      Documentation:

      +
      +https://pypy.readthedocs.org/en/latest/stm.html +
      +

      PyPy is an implementation of the Python programming language which focuses +on performance. So far we've been relentlessly optimizing for the single +core/process scenario. PyPy STM brings to the table a version of PyPy +that does not have the infamous Global Interpreter Lock, hence can run +multiple threads on multiple cores. Additionally it comes with a set +of primitives that make writing multithreaded applications a lot easier, +as explained below (see TransactionQueue) and in the documentation.

      +

      Internally, PyPy-STM is based on the Software Transactional Memory +plug-in called stmgc-c7. This version comes with a relatively +reasonable single-core overhead but scales only up to around 4 cores +on some examples; the next version of the plug-in, stmgc-c8, is in +development and should address that limitation (as well as reduce the +overhead). These versions only support 64-bit Linux; we'd welcome +someone to port the upcoming stmgc-c8 to other (64-bit) platforms.

      +

      This release passes all regular PyPy tests, except for a few +special cases. In other words, you should be able to drop in +PyPy-STM instead of the regular PyPy and your program should still +work. See current status for more information.

      +

      This work was done by Remi Meier and Armin Rigo. Thanks to all donors +for crowd-funding the STM work so far! As usual, it took longer +than we would have thought. I really want to thank the people that +kept making donations anyway. Your trust is greatly appreciated!

      +
      +

      What's new?

      +

      Compared to the July 2014 release, the main addition is a way to +get reports about STM conflicts. This is an essential new feature.

      +

      To understand why this is so important, consider that if you already +played around with the previous release, chances are that you didn't +get very far. It probably felt like a toy: on very small examples it +would nicely scale, but on any larger example it would not scale at +all. You didn't get any feedback about why, but the underlying reason +is that, in a typical large example, there are some STM conflicts that +occur all the time and that won't be immediately found just by +thinking. This prevents any parallelization.

      +

      Now PyPy-STM is no longer a black box: you have a way to learn about +these conflicts, fix them, and try again. The tl;dr version is to run:

      +
      +    PYPYSTM=stmlog ./pypy-stm example.py
      +    ./print_stm_log.py stmlog
      +
      +

      More details in the STM user guide.

      +
      +
      +

      Performance

      +

      The performance is now more stable than it used to be. More +precisely, the best case is still "25%-40% single-core slow-down with +very good scaling up to 4 threads", but the average performance seems +not too far from that. There are still dark spots --- notably, the +JIT is still slower to warm up, though it was improved a lot. These +are documented in the current status section. Apart from +that, we should not get more than 2x single-core slow-down in the +worst case. Please report such cases as bugs!

      +
      +
      +

      TransactionQueue

      +

      As explained before, PyPy-STM is more than "just" a Python without +GIL. It is a Python in which you can do minor tweaks to your +existing, non-multithreaded programs and get them to use multiple +cores. You identify medium- or large-sized, likely-independent parts +of the code and ask PyPy-STM to run these parts in parallel. An +example would be every iteration of some outermost loop over all items +of a dictionary. This is done with a new API: +transaction.TransactionQueue(). See help(TransactionQueue) or +read more about it in the STM user guide.

      +

      This is not a 100% mechanical change: very likely, you need to hunt +for and fix "STM conflicts" that prevent parallel execution (see +docs). However, at all points your program runs correctly, and you +can stop the hunt when you get acceptable performance. You don't get +deadlocks or corrupted state.

      +
      +

      Thanks for reading!
      +Armin, Remi, Fijal

      +
      +
      +
      +
      + + Unknown wrote on 2015-03-31 09:45: +
      +
      +

      From your explanation in this post, STM sounds similar to OpenMP. Can you explain the differences?

      → https://openmp.org/wp/openmp-specifications/

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-03-31 10:20: +
      +
      +

      This is explained in https://pypy.readthedocs.org/en/latest/stm.html#how-to-write-multithreaded-programs-the-10-000-feet-view

      +
      +
      +
      +
      + + Unknown wrote on 2015-03-31 15:14: +
      +
      +

      Nice - thanks!

      »TransactionQueue is in part similar: your program needs to have “some chances” of parallelization before you can apply it. But I believe that the scope of applicability is much larger with TransactionQueue than with other approaches. It usually works without forcing a complete reorganization of your existing code, and it works on any Python program which has got latent and imperfect parallelism. Ideally, it only requires that the end programmer identifies where this parallelism is likely to be found«

      If I understand that correctly, for STM the parallelism only needs to be likely and can be imperfect, because it can recover from errors.

      This would fix a whole class of problems I experienced in OpenMP Fortran code: Turning a crash or (worse) undefined behavior into a mere performance loss - and that’s really cool!

      Thank you for working on that!

      +
      +
      +
      +
      + + Anonymous wrote on 2015-04-23 13:07: +
      +
      +

      Why do you always ask for money if nothing actually works?

      +
      +
      +
      +
      + + Maciej Fijalkowski wrote on 2015-04-23 15:17: +
      +
      +

      the alternative is to ask for money for stuff that already works, and that's a terrible strategy. suggest better alternatives

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-04-24 00:53: +
      +
      +

      Your comment suggests PyPy-STM doesn't actually work for you. If you have found a bug, please contribute a bug report, even if only if you have an example of program that should parallelize and doesn't; such bug reports are very useful. Alternatively, you're complaining that PyPy-STM is useless for you. Maybe I've been bad at explaining what you should expect and not expect from it in the first place, so I've given you wrong expectations. In that case, sorry. (The 3rd alternative would be that you're just trolling, but let's discard it for now.)

      +
      +
      +
      + +

      PyPy 2.5.1 Released

      + +
      +
      +
      + +
      +

      +PyPy 2.5.1 - Pineapple Bromeliad

      +We’re pleased to announce PyPy 2.5.1, Pineapple Bromeliad following on the heels of 2.5.0. You can download the PyPy 2.5.1 release here:
      + +
      +We would like to thank our donors for the continued support of the PyPy +project, and for those who donate to our three sub-projects, as well as our +volunteers and contributors. +We’ve shown quite a bit of progress, but we’re slowly running out of funds. +Please consider donating more, or even better convince your employer to donate, +so we can finish those projects! The three sub-projects are:
        +
      • +
        +
        +Py3k (supporting Python 3.x): We have released a Python 3.2.5 compatible version we call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version
        +
         
        +
        +
      • +
      • +
        +STM (software transactional memory): We have released a first working version, +and continue to try out new promising paths of achieving a fast multithreaded Python
        +
        +
        +
        +
      • +
      • +
        +NumPy which requires installation of our fork of upstream numpy, +available on bitbucket +
        +
      • +
      +We would also like to encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on pypy, or general help with making +RPython’s JIT even better.

      +

      +What is PyPy?

      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.

      + +This release supports x86 machines on most common operating systems +(Linux 32/64, Mac OS X 64, Windows, and OpenBSD), +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.

      +While we support 32 bit python on Windows, work on the native Windows 64 +bit python is still stalling; we would welcome a volunteer +to handle that.

      +
      +
      +

      +Highlights

      +
        +
      • The past months have seen PyPy mature and grow, as RPython becomes the go-to +solution for writing fast dynamic language interpreters. Our separation of +RPython from the python interpreter PyPy is now much clearer in the +PyPy documentation and we now have separate RPython documentation. +Tell us what still isn’t clear, or even better help us improve the documentation.
      • +
      +
      +
      +
        +
      • We merged version 2.7.9 of python’s stdlib. From the python release notice:
          +
        • The entirety of Python 3.4’s ssl module has been backported. +See PEP 466 for justification.
        • +
        • HTTPS certificate validation using the system’s certificate store is now +enabled by default. See PEP 476 for details.
        • +
        • SSLv3 has been disabled by default in httplib and its reverse dependencies +due to the POODLE attack.
        • +
        • The ensurepip module has been backported, which provides the pip +package manager in every Python 2.7 installation. See PEP 477.
        • +
        +
        +
      • +
      • The garbage collector now ignores parts of the stack which did not change +since the last collection, another performance boost
      • +
      +
        +
      • errno and LastError are saved around cffi calls so things like pdb will not +overwrite it
      • +
      +
        +
      • We continue to asymptotically approach a score of 7 times faster than cpython +on our benchmark suite, we now rank 6.98 on latest runs
      • +
      + +Please try it out and let us know what you think. We welcome +success stories, experiments, or benchmarks. We know you are using PyPy, so please tell us about it!
      + +Cheers
      + +The PyPy Team
      +
      +
      +
      +
      +
      +
      +
      +
      + + Anonymous wrote on 2015-03-26 11:10: +
      +
      +

      You mentioned about speed of PyPy over CPython. I'm interesting in memory footprint too in addition to speed up. Please, add to speed.pypy.org memory footprint metric. It's harder to find cheap and huge amount of memory for VPS than slow old cpu. Nice to know minimal memory requirements for django sites on pypy.

      +
      +
      +
      +
      + + Anonymous wrote on 2015-03-26 11:15: +
      +
      +

      Is scores from speed.pypy.org applied to PyPy3 too? Later it was written PyPy3 was not fast as PyPy2.

      +
      +
      +
      +
      + + Maciej Fijalkowski wrote on 2015-03-26 11:56: +
      +
      +

      Memory footprint is tricky to measure. PyPy usually starts at 60M (as opposed to say 6 for cpython), but then data structures are smaller. We'll try to get some measurments going on some point. Benchmarking is hard :-)

      No, PyPy3 is not as fast as PyPy2. We should really look into it at some point.

      +
      +
      +
      + +

      Pydgin: Using RPython to Generate Fast Instruction-Set Simulators

      + +
      +
      + +

      Note: This is a guest blog post by Derek Lockhart and Berkin Ilbeyi from +Computer Systems Laboratory of Cornell University.

      +

      In this blog post I'd like to describe some recent work on using the RPython +translation toolchain to generate fast instruction set simulators. +Our open-source framework, Pydgin [a], provides a domain-specific +language (DSL) embedded in Python for concisely describing instruction set +architectures [b] and then uses these descriptions to generate fast, +JIT-enabled simulators. +Pydgin will be presented at the IEEE International Symposium on Performance +Analysis of Systems and Software (ISPASS) and in this post we provide a +preview of that work. +In addition, we discuss some additional progress updates that occurred after +the publishing deadline and will not appear in the final paper [1].

      +

      Our area of research expertise is computer architecture, which is perhaps an +unfamiliar topic for some readers of the PyPy blog. +Below we provide some brief background on hardware simulation in the field of +computer architecture, as well as some context as to why instruction set +simulators in particular are such an important tool.

      +
      +

      Simulators: Designing Hardware with Software

      +

      For computer architects in both academia and industry, a key step in designing +new computational hardware (e.g., CPUs, GPUs, and mobile system-on-chips) is +simulation [c] of the target system. +While numerous models for simulation exist, three classes are particularly +important in hardware design.

      +

      Functional Level models simulate the behavior of the target system. +These models are useful for creating a "golden" reference which can serve as an +executable specification or alternatively as an emulation platform for software +development.

      +

      Cycle Level models aim to simulate both the behavior and the approximate +timing of a hardware component. +These models help computer architects explore design tradeoffs and quickly +determine things like how big caches should be, how many functional units are +needed to meet throughput targets, and how the addition of a custom accelerator +block may impact total system performance.

      +

      Register-Transfer Level (RTL) models specify the behavior, timing, and +resources (e.g., registers, wires, logic gates) of a hardware component. +RTL models are bit-accurate hardware specifications typically written in a +hardware description language (HDL) such as Verilog or VHDL. +Once verified through extensive simulation, HDL specifications can be passed +into synthesis and place-and-route tools to estimate area/energy/timing or to +create FPGA or ASIC prototypes.

      +

      An instruction set simulator (ISS) is a special kind of +functional-level model that simulates the behavior of a processor or +system-on-chip (SOC). ISSs serve an important role in hardware design +because they model the instruction set architecture (ISA) interface: the +contractual boundary between hardware designers and software developers. +ISSs allow hardware designers to quickly experiment with adding new processor +instructions while also allowing software developers to build new compilers, +libraries, and applications long before physical silicon is available.

      +
      +
      +

      Instruction-Set Simulators Must be Fast and Productive

      +

      Instruction-set simulators are more important than ever because the ISA +boundary has become increasingly fluid. +While Moore's law has continued to deliver larger numbers of transistors +which computer architects can use to build increasingly complex chips, limits +in Dennard scaling have restricted how these transistors can be used [d]. +In more simple terms, thermal constraints (and energy constraints in mobile +devices) have resulted in a growing interest in pervasive specialization: +using custom accelerators to more efficiently perform compute intensive tasks. +This is already a reality for designers of mobile SOCs who continually add new +accelerator blocks and custom processor instructions in order to achieve higher +performance with less energy consumption. +ISSs are indispensable tools in this SOC design process for both hardware +architects building the silicon and software engineers developing the software +stack on top of it.

      +

      An instruction set simulator has two primary responsibilities: 1) accurately +emulating the external execution behavior of the target, and 2) providing +observability by accurately reproducing the target's internal state (e.g., +register values, program counter, status flags) at each time step. +However, other qualities critical to an effective ISS are simulation +performance and designer productivity. +Simulation performance is important because shorter simulation times allow +developers to more quickly execute and verify large software applications. +Designer productivity is important because it allows hardware architects to +easily experiment with adding new instructions and estimate their impact on +application performance.

      +

      To improve simulation performance, high-performance ISSs use dynamic binary +translation (DBT) as a mechanism to translate frequently visited blocks of +target instructions into optimized sequences of host instructions. +To improve designer productivity, many design toolchains automatically generate +ISSs from an architectural description language (ADL): a special +domain-specific language for succinctly specifying instruction encodings and +instruction semantics of an ISA. +Very few existing systems have managed to encapsulate the design complexity of +DBT engines such that high-performance, DBT-accelerated ISSs could be +automatically generated from ADLs [e]. +Unfortunately, tools which have done so are either proprietary software or +leave much to be desired in terms of performance or productivity.

      +
      +
      +

      Why RPython?

      +

      Our research group learned of the RPython translation toolchain through our +experiences with PyPy, which we had used in conjunction with our Python +hardware modeling framework to achieve significant improvements in simulation +performance [2]. +We realized that the RPython translation toolchain could potentially be adapted +to create fast instruction set simulators since the process of interpreting +executables comprised of binary instructions shared many similarities with the +process of interpreting bytecodes in a dynamic-language VM. +In addition, we were inspired by PyPy's meta-tracing approach to JIT-optimizing +VM design which effectively separates the process of specifying a language +interpreter from the optimization machinery needed to achieve good performance.

      +

      Existing ADL-driven ISS generators have tended to use domain-specific +languages that require custom parsers or verbose C-based syntax that +distracts from the instruction specification. +Creating an embedded-ADL within Python provides several benefits over these +existing approaches including a gentler learning curve for new users, access to +better debugging tools, and easier maintenance and extension by avoiding a +custom parser. +Additionally, we have found that the ability to directly execute Pydgin +ISA descriptions in a standard Python interpreter such as CPython or PyPy +significantly helps debugging and testing during initial ISA exploration. +Python's concise, pseudocode-like syntax also manages to map quite closely to +the pseudocode specifications provided by many ISA manuals [f].

      +
      +
      +

      The Pydgin embedded-ADL

      +

      Defining a new ISA in the Pydgin embedded-ADL requires four primary pieces of +information: the architectural state (e.g. register file, program counter, +control registers), the bit encodings of each instruction, the instruction +fields, and the semantic definitions for each instruction. Pydgin aims to make +this process as painless as possible by providing helper classes and functions +where possible.

      +

      For example, below we provide a truncated example of the ARMv5 instruction +encoding table. Pydgin maintains encodings of all instructions in a centralized +encodings data structure for easy maintenance and quick lookup. The +user-provided instruction names and bit encodings are used to automatically +generate decoders for the simulator. Unlike many ADLs, Pydgin does not require +that the user explicitly specify instruction types or mask bits for field +matching because the Pydgin decoder generator can automatically infer decoder +fields from the encoding table.

      +
      +encodings = [
      +  ['adc',      'xxxx00x0101xxxxxxxxxxxxxxxxxxxxx'],
      +  ['add',      'xxxx00x0100xxxxxxxxxxxxxxxxxxxxx'],
      +  ['and',      'xxxx00x0000xxxxxxxxxxxxxxxxxxxxx'],
      +  ['b',        'xxxx1010xxxxxxxxxxxxxxxxxxxxxxxx'],
      +  ['bl',       'xxxx1011xxxxxxxxxxxxxxxxxxxxxxxx'],
      +  ['bic',      'xxxx00x1110xxxxxxxxxxxxxxxxxxxxx'],
      +  ['bkpt',     '111000010010xxxxxxxxxxxx0111xxxx'],
      +  ['blx1',     '1111101xxxxxxxxxxxxxxxxxxxxxxxxx'],
      +  ['blx2',     'xxxx00010010xxxxxxxxxxxx0011xxxx'],
      +  # ...
      +  ['teq',      'xxxx00x10011xxxxxxxxxxxxxxxxxxxx'],
      +  ['tst',      'xxxx00x10001xxxxxxxxxxxxxxxxxxxx'],
      +]
      +
      +

      A major goal of Pydgin was ensuring instruction semantic definitions map to ISA +manual specifications as much as possible. The code below shows one such +definition for the ARMv5 add instruction. +A user-defined Instruction class (not shown) specifies field names that can +be used to conveniently access bit positions within an instruction (e.g. +rd, rn, S). +Additionally, users can choose to define their own helper functions, such as +the condition_passed function, to create more concise syntax that better +matches the ISA manual.

      +
      +def execute_add( s, inst ):
      +  if condition_passed( s, inst.cond() ):
      +    a,   = s.rf[ inst.rn() ]
      +    b, _ = shifter_operand( s, inst )
      +    result = a + b
      +    s.rf[ inst.rd() ] = trim_32( result )
      +
      +    if inst.S():
      +      if inst.rd() == 15:
      +        raise FatalError('Writing SPSR not implemented!')
      +      s.N = (result >> 31)&1
      +      s.Z = trim_32( result ) == 0
      +      s.C = carry_from( result )
      +      s.V = overflow_from_add( a, b, result )
      +
      +    if inst.rd() == 15:
      +      return
      +
      +  s.rf[PC] = s.fetch_pc() + 4
      +
      +

      Compared to the ARM ISA Reference manual shown below, the Pydgin instruction +definition is a fairly close match. Pydgin's definitions could certainly be +made more concise by using a custom DSL; however, this would lose many of the +debugging benefits afforded by a well-supported language such as Python and +additionally require using a custom parser that would likely need modification +for each new ISA.

      +
      +if ConditionPassed(cond) then
      +   Rd = Rn + shifter_operand
      +   if S == 1 and Rd == R15 then
      +     if CurrentModeHasSPSR() then CPSR = SPSR
      +     else UNPREDICTABLE
      +   else if S == 1 then
      +     N Flag = Rd[31]
      +     Z Flag = if Rd == 0 then 1 else 0
      +     C Flag = CarryFrom(Rn + shifter_operand)
      +     V Flag = OverflowFrom(Rn + shifter_operand)
      +
      +

      Creating an ISS that can run real applications is a rather complex task, even +for a bare metal simulator with no operating system such as Pydgin. +Each system call in the C library must be properly implemented, and +bootstrapping code must be provided to set up the program stack and +architectural state. +This is a very tedious and error prone process which Pydgin tries to +encapsulate so that it remains as transparent to the end user as possible. +In future versions of Pydgin we hope to make bootstrapping more painless and +support a wider variety of C libraries.

      + + +
      +
      +

      Pydgin Performance

      +

      In order to achieve good simulation performance from Pydgin ISSs, significant +work went into adding appropriate JIT annotations to the Pydgin library +components. +These optimization hints, which allow the JIT generated by the RPython +translation toolchain to produce more efficient code, have been specifically +selected for the unique properties of ISSs. +For the sake of brevity, we do not talk about the exact optimizations here but +a detailed discussion can be found in the ISPASS paper [1]. +In the paper we evaluate two ISSs, one for a simplified MIPS ISA and another +for the ARMv5 ISA, whereas below we only discuss results for the ARMv5 ISS.

      +

      The performance of Pydgin-generated ARMv5 ISSs was compared against +several reference ISSs: the gem5 ARM atomic simulator (gem5), +interpretive and JIT-enabled versions of SimIt-ARM (simit-nojit and +simit-jit), and QEMU. +Atomic models from the gem5 simulator were chosen for comparison due to their wide +usage amongst computer architects [g]. +SimIt-ARM was selected because it is currently the highest performance +ADL-generated DBT-ISS publicly available. +QEMU has long been held as the gold-standard for DBT simulators due to its +extremely high performance; however, QEMU is generally intended for usage as an +emulator rather than a simulator [c] and therefore achieves its excellent +performance at the cost of observability. +Unlike QEMU, all other simulators in our study faithfully track architectural +state at an instruction level rather than block level. +Pydgin ISSs were generated with and without JITs using the RPython translation +toolchain in order to help quantify the performance benefit of the meta-tracing +JIT.

      +

      The figure below shows the performance of each ISS executing applications from +the SPEC CINT2006 benchmark suite [h]. +Benchmarks were run to completion on the high-performance DBT-ISSs +(simit-jit, pydgin-jit, and QEMU), but were terminated after only +10 billion simulated instructions for the non-JITed interpretive ISSs +(these would require many hours, in some cases days, to run to completion). +Simulation performance is measured in MIPS [i] and plotted on a log +scale due to the wide variance in performance. +The WHMEAN group summarizes each ISS's performance across all benchmarks +using the weighted harmonic mean.

      + +
      + +

      A few points to take away from these results:

      +
        +
      • ISSs without JITs (gem5, simit-nojit, and pydgin-nojit) demonstrate +relatively consistent performance across applications, whereas ISSs with JITs +(simit-jit, pydgin-jit, and QEMU) demonstrate much greater +performance variability from application-to-application.
      • +
      • The gem5 atomic model demonstrates particularly miserable performance, only +2-3 MIPS!
      • +
      • QEMU lives up to its reputation as a gold-standard for simulator performance, +leading the pack on nearly every benchmark and reaching speeds of 240-1120 +MIPS.
      • +
      • +pydgin-jit is able to outperform simit-jit on four of the +applications, including considerable performance improvements of 1.44–1.52× +for the applications 456.hmmer, 462.libquantum, and 471.omnetpp +(managing to even outperform QEMU on 471.omnetpp).
      • +
      • +simit-jit is able to obtain much more consistent performance (230-459 +MIPS across all applications) than pydgin-jit (9.6-659 MIPS). This is +due to simit-jit's page-based approach to JIT optimization compared to +pydgin-jit's tracing-based approach.
      • +
      • +464.h264ref displays particularly bad pathological behavior in Pydgin’s +tracing JIT and is the only application to perform worse on pydgin-jit +than pydgin-nojit (9.6 MIPS vs. 21 MIPS).
      • +
      +

      The pathological behavior demonstrated by 464.h264ref was of particular +concern because it caused pydgin-jit to perform even worse than having no +JIT at all. RPython JIT logs indicated that the reason for this performance +degradation was a large number of tracing aborts due to JIT traces growing too +long. However, time limitations before the publication deadline prevented us +from investigating this issue thoroughly.

      +

      Since the deadline we've applied some minor bug fixes and made some small +improvements in the memory representation. +More importantly, we've addressed the performance degradation in 464.h264ref +by increasing trace lengths for the JIT. +Below we show how the performance of 464.h264ref changes as the +trace_limit parameter exposed by the RPython JIT is varied from the default +size of 6000 operations.

      + + +
      + +

      By quadrupling the trace limit we achieve an 11x performance improvement in +464.h264ref. +The larger trace limit allows the JIT to optimize long code paths that were +previously triggering trace aborts, greatly helping amortize the costs of +tracing. +Note that arbitrarily increasing this limit can potentially hurt performance if +longer traces are not able to detect optimizable code sequences.

      +

      After performing similar experiments across the applications in the SPEC +CINT2006 benchmark suite, we settled on a trace limit of 400,000 operations. +In the figure below we show how the updated Pydgin ISS (pydgin-400K) improves +performance across all benchmarks and fixes the performance degradation +previously seen in 464.h264ref. Note that the non-JITted simulators have been +removed for clarity, and simulation performance is now plotted on a +linear scale to more clearly distinguish the performance gap between +each ISS.

      + +
      + +

      With these improvements, we are now able to beat simit-jit on all but two +benchmarks. In future work we hope to further close the gap with QEMU as well.

      +
      +
      +

      Conclusions and Future Work

      +

      Pydgin demonstrates that the impressive work put into the RPython translation +toolchain, designed to simplify the process of building fast dynamic-language +VMs, can also be leveraged to build fast instruction set simulators. +Our prototype ARMv5 ISS shows that Pydgin can generate ISSs with performance +competitive to SimIt-ARM while also providing a more productive development +experience: RPython allowed us to develop Pydgin with only four person-months +of work. +Another significant benefit of the Pydgin approach is that any performance +improvements applied to the RPython translation toolchain immediately benefit +Pydgin ISSs after a simple software download and retranslation. +This allows Pydgin to track the continual advances in JIT technology introduced +by the PyPy development team.

      +

      Pydgin is very much a work in progress. There are many features we would like +to add, including:

      +
        +
      • more concise syntax for accessing arbitrary instruction bits
      • +
      • support for other C libraries such as glibc, uClibc, and musl +(we currently only support binaries compiled with newlib)
      • +
      • support for self-modifying code
      • +
      • features for more productive debugging of target applications
      • +
      • ISS descriptions for other ISAs such as RISC-V, ARMv8, and x86
      • +
      • automatic generation of compilers and toolchains from Pydgin descriptions
      • +
      +

      In addition, we think there are opportunities for even greater performance +improvements with more advanced techniques such as:

      +
        +
      • automatic generation of optimized instruction decoders
      • +
      • optimizations for floating-point intensive applications
      • +
      • multiple tracing-JITs for parallel simulation of multicore SOCs
      • +
      • a parallel JIT compilation engine as proposed by Böhm et al. [3] +
      • +
      +

      We hope that Pydgin can be of use to others, so if you try it out please let us +know what you think. Feel free to contact us if you find any of the above +development projects interesting, or simply fork the project on GitHub and hack +away!

      +

      -- Derek Lockhart and Berkin Ilbeyi

      +
      +
      +

      Acknowledgements

      +

      We would like to sincerely thank Carl Friedrich Bolz and Maciej Fijalkowski for their feedback on the Pydgin publication and their guidance on improving the JIT performance of our simulators. We would also like to thank the whole PyPy team for their incredible work on PyPy and the RPython translation toolchain. Finally, thank you to our research advisor, Prof. Christopher Batten, and the sponsors of this work which include the National Science Foundation, the Defense Advanced Research Projects Agency, and Intel Corporation.

      +
      +
      +

      Footnotes

      + ++++ + + + + +
      [a]Pydgin loosely stands for [Py]thon [D]SL for [G]enerating +[In]struction set simulators and is pronounced the same as “pigeon”. The +name is inspired by the word “pidgin” which is a grammatically simplified +form of language and captures the intent of the Pydgin embedded-ADL. +https://github.com/cornell-brg/pydgin +
      + ++++ + + + + +
      [b]Popular instruction set architectures (ISAs) include MIPS, ARM, +x86, and more recently RISC-V.
      + ++++ + + + + +
      [c] +(1, 2) For a good discussion of simulators vs. emulators, please see the +following post on StackOverflow: +https://stackoverflow.com/questions/1584617/simulator-or-emulator-what-is-the-difference +
      + ++++ + + + + +
      [d]https://en.wikipedia.org/wiki/Dark_silicon
      + ++++ + + + + +
      [e]Please see the Pydgin paper for a more detailed discussion of prior work.
      + ++++ + + + + +
      [f] +

      For more examples of Pydgin ISA specifications, please see the ISPASS +paper [1] or the Pydgin source code on GitHub.

      +

      Pydgin instruction definitions for a simple MIPS-inspired ISA can be +found here:

      + +

      Pydgin instruction definitions for a simplified ARMv5 ISA can be found +here:

      + +
      + ++++ + + + + +
      [g] +

      gem5 is a cycle-level simulation framework that contains both +functional-level (atomic) and cycle-level processor models. Although +primarily used for detailed, cycle-approximate processor simulation, +gem5's atomic model is a popular tool for many ISS tasks.

      + +
      + ++++ + + + + +
      [h]All performance measurements were taken on an unloaded server-class +machine.
      + ++++ + + + + +
      [i]Millions of instructions per second.
      +
      +
      +

      References

      + ++++ + + + + +
      [1] +(1, 2, 3)

      Derek Lockhart, Berkin Ilbeyi, and Christopher Batten. "Pydgin: +Generating Fast Instruction Set Simulators from Simple Architecture +Descriptions with Meta-Tracing JIT Compilers." IEEE Int'l Symp. on +Performance Analysis of Systems and Software (ISPASS), Mar. 2015.

      + +
      + ++++ + + + + +
      [2] +

      Derek Lockhart, Gary Zibrat, and Christopher Batten. "PyMTL: A Unified +Framework for Vertically Integrated Computer Architecture Research." 47th +ACM/IEEE Int'l Symp. on Microarchitecture (MICRO-47), Dec. 2014.

      + +
      + ++++ + + + + +
      [3]I. Böhm, B. Franke, and N. Topham. Generalized Just-In-Time Trace +Compilation Using a Parallel Task Farm in a Dynamic Binary Translator. +ACM SIGPLAN Conference on Programming Language Design and Implementation +(PLDI), Jun 2011.
      +
      + +
      +
      +
      +
      +
      +
      + + Anonymous wrote on 2015-03-30 12:14: +
      + +
      +
      + +

      Experiments in Pyrlang with RPython

      + +
      +

      Pyrlang is an Erlang BEAM bytecode interpreter written in RPython.

      +

      It implements approximately 25% of BEAM instructions. It can support +integer calculations (but not bigint), closures, exception handling, +some operators to atom, list and tuple, user modules, and multi-process +in single core. Pyrlang is still in development.

      +

      There are some differences between BEAM and the VM of PyPy:

      +
        +
      • BEAM is a register-based VM, whereas the VM in PyPy is stack-based.
      • +
      • There is no traditional call-stack in BEAM. The Y register in BEAM is +similar to a call-stack, but the Y register can sometimes store some +variables.
      • +
      • There are no typical language-level threads and OS-level threads in +BEAM; only language-level processes, whose behavior is very similar +to the actor model.
      • +
      +

      Regarding bytecode dispatch loop, Pyrlang uses a while loop to fetch +instructions and operands, call the function corresponding to every +instruction, and jump back to the head of the while loop. Due to the +differences between the RPython call-stack and BEAM’s Y register, we +decided to implement and manage the Y register by hand. On the other +hand, PyPy uses RPython’s call stack to implement Python’s call stack. +As a result, the function for the dispatch loop in PyPy calls itself +recursively. This does not happen in Pyrlang.

      +

      The Erlang compiler (erlc) usually compiles the bytecode instructions +for function invocation into CALL (for normal invocation) and CALL_ONLY +(for tail recursive invocation). You can use a trampoline semantic to +implement it:

      +
        +
      • CALL instruction: The VM pushes the current instruction pointer (called +the program counter in PyPy) to the Y register, and jumps to the +destination label. When encountering a RETURN instruction, the VM +pops the instruction pointer from the Y register and returns to the +location of the instruction pointer to continue executing the outer +function.
      • +
      • CALL_ONLY instruction: The VM simply jumps to the destination label, +without any modification of the Y register. As a result, the tail +recursive invocation never increases the Y register.
      • +
      +

      The current implementation only inserts the JIT hint of can_enter_jit +following the CALL_ONLY instruction. This means that the JIT only +traces the tail-recursive invocation in Erlang code, which has a very +similar semantic to the loop in imperative programming languages like +Python.

      +

      We have also written a single scheduler to implement the language level +process in a single core. There is a runnable queue in the scheduler. On +each iteration, the scheduler pops one element (which is a process +object with dispatch loop) from the queue, and executes the dispatch +loop of the process object. In the dispatch loop, however, there is a +counter called “reduction” inside the dispatch loop. The reduction +decrements during the execution of the loop, and when the reduction +becomes 0, the dispatch loop terminates. Then the scheduler pushes that +element into the runnable queue again, and pops the next element from the +queue, and so on.

      +

      We are planning to implement a multi-process scheduler for multi-core +CPUs, which will require multiple schedulers and even multiple runnable +queues for each core, but that will be another story. :-)

      +
      +

      Methods

      +

      We wrote two benchmark programs of Erlang:

      +
        +
      • FACT: A benchmark to calculate the factorial in a tail-recursive +style, but because we haven’t implemented big int, we do a remainder +calculation to the argument for the next iteration, so the number +never overflows.
      • +
      • REVERSE: The benchmark creates a reversed list of numbers, such as +[20000, 19999, 19998, …], and applies a bubble sort to it.
      • +
      +
      +
      +

      Results

      +
      +

      The Value of Reduction

      +

      We used REVERSE to evaluate the JIT with different values of +reduction:

      + + +

      The X axis is the value of reduction, and the Y axis is the execution +time (in seconds).

      +

      It seems that when the value of reduction is small, the reduction +influences the performance significantly, but when reduction becomes +larger, it only increases the speed very slightly. In fact, we use 2000 +as the default reduction value (as well as the reduction value in the +official Erlang interpreter).

      +

      Surprisingly, the trace is always generated even when the reduction is +very small, such as 0 (which means the dispatch loop can only run for a +very limited number of iterations, and the language level process +executes fewer instructions than an entire loop in one switch of the +scheduler). The generated trace is almost the same, regardless of +different reduction values.

      +

      Actually, the RPython JIT only cares what code it meets, but does not +care who executes it, thus the JIT always generates the results above. +The trace even can be shared among different threads if they execute the +same code.

      +

      The overhead at low reduction value may be due to the scheduler, which +switches between different processes too frequently, or from the +too-frequent switching between bytecode interpreter and native code, but +not from JIT itself.

      +

      Here is more explanation from Armin Rigo:

      +
      +“The JIT works well because you’re using a scheme where some counter +is decremented (and the soft-thread interrupted when it reaches +zero) only once in each app-level loop. The soft-thread switch is +done by returning to some scheduler, which will resume a different +soft-thread by calling it. It means the JIT can still compile each +of the loops as usual, with the generated machine code containing +the decrease-and-check-for-zero operation which, when true, exits +the assembler.”
      +
      +
      +

      Fair Process Switching vs. Unfair Process Switching

      +

      We are also concerned about the timing for decreasing reduction value. +In our initial version of Pyrlang, we decrease reduction value at every +local function invocation, module function invocation, and BIF (built-in +function) invocation, since this is what the official Erlang interpreter +does. However, since the JIT in RPython basically traces the target +language loop (which is the tail recursive invocation in Pyrlang) it is +typically better to keep the loop whole during a switch of the language +level process. We modified Pyrlang, and made the reduction decrement +only occur after CALL_ONLY, which is actually the loop boundary of the +target language.

      +

      Of course, this strategy may cause an “unfair” execution among language +level processes. For example, if one process has only a single +long-sequence code, it executes until the end of the code. On the other +hand, if a process has a very short loop, it may be executed by very +limited steps then be switched out by the scheduler. However, in the +real world, this “unfairness” is usually considered acceptable, and is +used in many VM implementations including PyPy for improving the overall +performance.

      +

      We compared these two versions of Pyrlang in the FACT benchmark. The +reduction decrement is quite different because there are some BIF +invocations inside the loop. In the old version the process can be +suspended at loop boundaries or other function invocation, but in the +new version, it can be suspended only at loop boundaries.

      +

      We show that the strategy is effective, removing around 7% of the +overhead. We have also compared it in REVERSE, but since there are no +extra invocations inside the trace, it cannot provide any performance +improvement. In the real world, we believe there is usually more than +one extra invocation inside a single loop, so this strategy is effective +for most cases.

      +
      +
      +

      Comparison with Default Erlang and HiPE

      +

      We compared the performance of Pyrlang with the default Erlang +interpreter and the HiPE (High Performance Erlang) compiler. HiPE is an +official Erlang compiler that can compile Erlang source code to native +code. The speed of Erlang programs obviously improves but loses its +generality instead.

      +

      Please note that Pyrlang is still in development, so in some situations +it does less work than the default Erlang interpreter, such as not +checking integer overflow when dealing with big integer, and not +checking and adding locks when accessing message queues in the +language-level process, so is therefore faster. The final version of +Pyrlang may be slower.

      +

      We used the two benchmark programs above, and made sure both of them are +executed for more than five seconds to cover the JIT warm-up time for +RPython. The experiment environment is an OS X 10.10 machine with a 3.5 GHz +6-core Intel Xeon E5 CPU and 14GB 1866 MHz DDR3 ECC memory.

      +

      Let’s look at the result of FACT. The graph shows that Pyrlang runs +177.41% faster on average than Erlang, and runs at almost the same speed +as HiPE. However, since we haven’t implemented big integer in Pyrlang, +the arithmetical operators do not do any extra overflow checking. It is +reasonable that the final version for Pyrlang will be slower than the +current version and HiPE.

      + +

      As for REVERSE, the graph shows that Pyrlang runs 45.09% faster than +Erlang, but 63.45% slower than HiPE on average. We think this is +reasonable because there are only few arithmetical operators in this +benchmark so the speeds of these three implementations are closer. +However, we observed that at the scale of 40,000, the speed of Pyrlang +slowed down significantly (111.35% slower than HiPE) compared with the +other two scales (56.38% and 22.63% slower than HiPE).

      +

      Until now we can only hypothesize why Pyrlang slows down at that scale. +We guess that the overhead might be from GC. This is because the BEAM +bytecode provides some GC hints to help the default Erlang compiler to +perform some GC operations immediately. For example, using GC_BIF +instead of a BIF instruction tells the VM that there may be a GC +opportunity, and tells the VM how many live variables should be around +one instruction. In Pyrlang we do not use these kinds of hints but rely +on RPython’s GC totally. When there are a huge number of objects during +runtime, (as for REVERSE, it should be the Erlang list object) the speed +therefore slows down.

      + +

      Ruochen Huang

      +
      +
      +
      +
      +
      +
      + + peterfirefly wrote on 2015-02-26 12:14: +
      +
      +

      'there is a counter-call “reduction”' should probably be:

      'there is a counter called “reduction”'.

      +
      +
      +
      + +

      linalg support in pypy/numpy

      + +
      +
      +

      +

      +

      +Introduction

      +PyPy's numpy support has matured enough that it can now support the lapack/blas libraries through the numpy.linalg module. To install the version of numpy this blog post refers to, install PyPy version 2.5.0 or newer, and run this:

      pypy -m pip install git+https://bitbucket.org/pypy/numpy.git
      +
      +
      +This update is a major step forward for PyPy's numpy support. Many of the basic matrix operations depend on linalg, even matplotlib requires it to display legends (a pypy-friendly version of matplotlib 1.3 is available at https://github.com/mattip/matplotlib).

      +A number of improvements and adaptations, some of which are in the newly-released PyPy 2.5.0, made this possible:
        +
      • Support for an extended frompyfunc(), which in the PyPy version supports much of the ufunc API (signatures, multiple dtypes) allowing creation of pure-python, jit-friendly ufuncs. An additional keyword allows choosing between out = func(in) or func(in, out) ufunc signatures. More explanation follows.
      • +
      • Support for GenericUfuncs via PyPy's (slow) capi-compatibility layer. The underlying mechanism actually calls the internal implementation of frompyfunc().
      • +
      • A cffi version of _umath_linalg. Since cffi uses dlopen() to call into shared objects, we added support in the numpy build system to create non-python shared libraries from source code in the numpy tree. We also rewrote parts of the c-based _umath_linalg.c.src in python, renamed numpy's umath_linalg capi module to umath_linalg_capi, and use it as a shared object through cffi.
      • +
      +
      +

      +Status

      +We have not completely implemented all the linalg features. dtype resolution via casting is missing, especially for complex ndarrays, which leads to slight numerical errors where numpy uses a more precise type for intermediate calculations. Other missing features in PyPy's numpy support may have implications for complete linalg support.

      +Some OSX users have noticed they need to update pip to version 6.0.8 to overcome a regression in pip, and it is not clear if we support all combinations of blas/lapack implementations on all platforms.

      +Over the next few weeks we will be ironing out these issues.
      +
      +

      +Performance

      +A simple benchmark is shown below, but let's state the obvious: PyPy's JIT and the iterators built into PyPy's ndarray implementation will in most cases be no faster than CPython's numpy. The JIT can help where there is a mixture of python and numpy-array code. We do have plans to implement lazy evaluation and to further optimize PyPy's support for numeric python, but numpy is quite good at what it does.
      +
      +

      +HowTo for PyPy's extended frompyfunc

      +The magic enabling blas support is a rewrite of the _umath_linalg c-based module as a cffi-python module that creates ufuncs via frompyfunc. We extended the numpy frompyfunc to allow it to function as a replacement for the generic ufunc available in numpy only through the c-api.

      +We start with the basic frompyfunc, which wraps a python function into a ufunc:
       
      +
      def times2(in0):
      +    return in0 * 2
      +ufunc = frompyfunc(times2, 1, 1)
      +
      +
      +In cpython's numpy the dtype of the result is always object, which is not implemented (yet) in PyPy, so this example will fail. While the utility of object dtypes can be debated, in the meantime we add a non-numpy-compatible keyword argument dtypes to frompyfunc. If dtypes=['match'] the output dtype will match the dtype of the first input ndarray:

      ufunc = frompyfunc(times2, 1, 1, dtype=['match'])
      +ai = arange(24).reshape(3, 4, 2)
      +ao = ufunc(ai)
      +assert  (ao == ai * 2).all()
      +
      +
      +I hear you ask "why is the dtypes keyword argument a list?" This is so we can support the Generalized Universal Function API, which allows specifying a number of specialized functions and the input-output dtypes each specialized function accepts.
      +Note that the function feeds the values of ai one at a time, the function operates on scalar values. To support more complicated ufunc calls, the generalized ufunc API allows defining a signature, which specifies the layout of the ndarray inputs and outputs. So we extended frompyfunc with a signature keyword as well.
      +We add one further extension to frompyfunc: we allow a Boolean keyword stack_inputs to specify the argument layout of the function itself. If the function is of the form:
       
      +
      out0, out1, ... = func(in0, in1,...)
      +
      +
      +then stack_inputs is False. If it is True the function is of the form:
       
      +
      func(in0, in1, ... out0, out1, ...)
      +
      +
      +Here is a complete example of using frompyfunc to create a ufunc, based on this link:
       
      +
      def times2(in_array, out_array):
      +    in_flat = in_array.flat
      +    out_flat = out_array.flat
      +    for i in range(in_array.size):
      +        out_flat[i] = in_flat[i] * 2
      +ufunc = frompyfunc([times2, times2], 1, 1,
      +                signature='(i)->(i)',
      +                dtypes=[dtype(int), dtype(int),
      +                        dtype(float), dtype(float),
      +                       ],
      +                stack_inputs=True,
      +                )
      +ai = arange(10, dtype=int)
      +ai2 = ufunc(ai)
      +assert all(ai2 == ai * 2)
      +
      +
      +Using this extended syntax, we rewrote the lapack calls into the blas functions in pure python, no c needed. Benchmarking showed that this approach was actually much slower than using the upstream umath_linalg module via cpyext, as can be seen in the following benchmarks. This is due to the need to copy c-aligned data into Fortran-aligned format. Our __getitem__ and __setitem__ iterators are not as fast as pointer arithmetic in C. So we next tried a hybrid approach: compile and use numpy's umath_linalg python module as a shared object, and call the optimized specific wrapper function from it.
      +
      +

      +Benchmarks

      +Here are some benchmarks, running a tight loop of the different versions of linalg.inv(a), where a is a 10x10 double ndarray. The benchmark ran on an i7 processor running ubuntu 14.04 64 bit:
      + + + + + + + + + + + + + +
      Impl. Time after warmup
      CPython 2.7 + numpy 1.10.dev + lapack 8.9 msec/1000 loops
      PyPy 2.5.0 + numpy + lapack via cpyext 8.6 msec/1000 loops
      PyPy 2.5.0 + numpy + lapack via pure python + cffi 19.9 msec/1000 loops
      PyPy 2.5.0 + numpy + lapack via python + c + cffi 9.5 msec/1000 loops
      +
      +
      +
      +
      +While no general conclusions may be drawn from a single micro-benchmark, it does indicate that there is some merit in the approach taken.

      +Conclusion

      +PyPy's numpy now includes a working linalg module. There are still some rough corners, but hopefully we have implemented the parts you need. While the speed of the isolated linalg function is no faster than CPython and upstream numpy, it should not be significantly slower either. Your use case may see an improvement if you use a mix of python and lapack, which is the usual case.

      +Please let us know how it goes. We love to hear success stories too.

      +We still have challenges at all levels of programming, and are always looking for people willing to contribute, so stop by on IRC at #pypy.

      +mattip and the PyPy Team
      +
      +
      +
      +
      +
      + + Olivier Grisel wrote on 2015-02-24 10:20: +
      +
      +

      Interesting work although benchmarking linear algebra routines on 10x10 arrays feels wrong: typical linear algebra applications use hundreds or thousands of dimensions. Would you mind re-running those benchmarks on 1000x1000 arrays instead? The use of the CPU cache and multiple threads can be very impacting for such workloads.

      Also some numpy / scipy developers are working on supporting OpenBLAS as the default BLAS/LAPACK for the Windows wheel packages and maybe later for the OSX packages as well.

      Under Linux (Debian / Ubuntu) it's pretty easy to have libblas.so / liblapack.so be symlinks to either ATLAS or OpenBLAS using the update-alternatives system.

      +
      +
      +
      +
      + + Maciej Fijalkowski wrote on 2015-02-24 10:23: +
      +
      +

      What the blog post somehow fails to mention is that we do not reimplement those but reuse whatever underlying library is there. The measurements of the actual speed are then not that interesting, because we're only interested in the overhead of the call.

      +
      +
      +
      +
      + + Olivier Grisel wrote on 2015-02-24 10:26: +
      +
      +

      It might still be interesting to run that kind of benchmarks on more realistic workloads (maybe in addition to some micro-workloads) to see the importance of the remaining overhead in a typical usage scenario.

      +
      +
      +
      +
      + + mattip wrote on 2015-02-24 16:39: +
      +
      +

      The most interesting benchmark is probably the one only you can run, i.e. how does pypy perform for you on your workload.

      As far as lapack vs openblas, we will try to imitate what numpy does. If cpython/numpy supports a variation of lapack and pypy/numpy doesn't, that should be considered a bug.

      Please let us know how it works for you.

      +
      +
      +
      +
      + + Olivier Grisel wrote on 2015-02-24 17:50: +
      +
      +

      > The most interesting benchmark is probably the one only you can run, i.e. how does pypy perform for you on your workload.

      I agree, but inverting a 10x10 matrix is probably not representative of anybody's workload.

      While it's important not to introduce too much overhead in the bindings, I think it's also good to keep in mind that an overhead of the order of the micro-second is completely negligible compared to the execution time of a typical linear algebra operation running on realistically sized data. Hence my original remark.

      > As far as lapack vs openblas, we will try to imitate what numpy does. If cpython/numpy supports a variation of lapack and pypy/numpy doesn't, that should be considered a bug.

      Just to clarify OpenBLAS is an implementation of the standard BLAS API that also includes the official LAPACK implementation from netlib linked against its own optimized BLAS routines. The 2 main open source optimized implementations of BLAS/LAPACK supported by numpy & scipy are ATLAS and OpenBLAS.

      +
      +
      +
      +
      + + Romain Guillebert wrote on 2015-02-24 19:12: +
      +
      +

      > While it's important not to introduce too much overhead in the bindings, I think it's also good to keep in mind that an overhead of the order of the micro-second is completely negligible compared to the execution time of a typical linear algebra operation running on realistically sized data. Hence my original remark.

      But then you're just benchmarking the underlying library, which is the exact same library as numpy.

      +
      +
      +
      +
      + + Olivier Grisel wrote on 2015-02-24 20:21: +
      +
      +

      > But then you're just benchmarking the underlying library, which is the exact same library as numpy.

      Yes I agree. I just want to highlight that for most common real life use cases, a small performance overhead in those LAPACK bindings is almost never a problem.

      Otherwise your readers might be misled into thinking that the "PyPy 2.5.0 + numpy + lapack via pure python + cffi" version is significantly suboptimal (2x slowdown!) while in practice a couple of additional microseconds might be completely undetectable compared to the actual execution time of the "inv" function that typically lasts more than a millisecond on anything that is non-toy data.

      +
      +
      +
      +
      + + Romain Guillebert wrote on 2015-02-24 20:57: +
      +
      +

      ok, makes sense :)

      +
      +
      +
      +
      + + Anonymous wrote on 2015-02-24 21:12: +
      +
      +

      Additional data point: repeatedly inverting a ~10x10 matrix is exactly what I need performance on - for running an extended Kalman Filter. : )

      +
      +
      +
      +
      + + Olivier Grisel wrote on 2015-02-24 21:18: +
      +
      +

      > Additional data point: repeatedly inverting a ~10x10 matrix is exactly what I need performance on - for running an extended Kalman Filter. : )

      Fair enough: so there actually exists a use case for that benchmark. Optimizing the bindings overhead might thus be worthy in the end.

      +
      +
      +
      +
      + + Yaacov wrote on 2015-03-01 22:07: +
      +
      +

      I love hearing about the progress and wish I could test on my benchmarks. Any chance of windows support?

      +
      +
      +
      +
      + + mattip wrote on 2015-03-02 11:01: +
      +
      +

      Yaacov what is missing for you to try it?
      Here is the way I verify that the code works on windows 7 64 bit and windows 8.1:

      download and install compiler
      https://www.microsoft.com/en-us/download/details.aspx?id=44266

      download pypy and open the zip
      https://bitbucket.org/pypy/pypy/downloads/pypy-2.5.0-win32.zip

      install pip into pypy
      https://bootstrap.pypa.io/get-pip.py

      install numpy into pypy
      pip install git+https://bitbucket.org/pypy/numpy.git

      +
      +
      +
      +
      + + Yaacov wrote on 2015-03-06 04:14: +
      +
      +

      I get a traceback ending in

      \appdata\local\temp\pip-wdyqtr-build\numpy\distutils\mi
      sc_util.py", line 872, in _get_configuration_from_setup_py

      config = setup_module.configuration(*args)

      File "numpy\linalg\setup.py", line 85, in configuration

      library_dirs = [sys.real_prefix + '/include',

      AttributeError: 'module' object has no attribute 'real_prefix'

      in a warning "using unoptimized lapack"

      +
      +
      +
      +
      + + Unknown wrote on 2015-03-06 19:24: +
      +
      +

      I still get non-existing conjugate method error when using, e.g., linalg.pinv. Any plan on getting this working?

      +
      +
      +
      +
      + + mattip wrote on 2015-03-07 22:01: +
      +
      +

      fixed, try a nightly (from tomorrow)

      +
      +
      +
      +
      + + Derek Z wrote on 2015-03-13 14:56: +
      +
      +

      I got an error message:

      OSError: Cannot load library /usr/local/Cellar/pypy/2.5.0/libexec/site-packages/numpy/linalg/libumath_linalg_cffi.so: dlopen(/usr/local/Cellar/pypy/2.5.0/libexec/site-packages/numpy/linalg/libumath_linalg_cffi.so, 2): image not found

      Is there anything I am not doing right for the installation? I have pypy 2.5, and Mac OS 10.10.

      +
      +
      +
      +
      + + mattip wrote on 2015-03-13 15:18: +
      +
      +

      Are you installing via pip? If so, we have had reports of older versions of pip failing. You should have pip 6.0.8 or later. See https://bitbucket.org/pypy/numpy/issue/21

      +
      +
      +
      +
      + + melbic wrote on 2015-03-19 11:13: +
      +
      +

      Same problem here. (OSX 10.10) I've got the newest pip (6.0.8) and setuptools (14.0.3) version installed.

      +
      +
      +
      +
      + + melbic wrote on 2015-03-19 11:13: +
      +
      +

      Same problem here. (OSX 10.10) I've got the newest pip (6.0.8) and setuptools (14.0.3) version installed.

      +
      +
      +
      +
      + + mattip wrote on 2015-03-19 17:01: +
      +
      +

      I can't reproduce this as I do not have a MacOS machine. The place to follow this up is on our issue tracker, https://bitbucket.org/pypy/numpy/issue/21

      It would be most helpful to attach a full log from "pip install" and 'pypy -c "import numpy"' to that issue

      +
      +
      +
      +
      + + Nimrod wrote on 2015-03-29 15:34: +
      +
      +

      One way pypy might be able to outperform numpy is by eliminating temporaries.

      Just converting the BLAS functions to chain operations efficiently and sometimes update in-place rather than allocating and de-allocating arrays should help a lot.

      +
      +
      +
      +
      + + Unknown wrote on 2015-04-05 23:07: +
      +
      +

      This is great! But I can't use this for almost any of my code before np.einsum is supported :/ IMO, it is a super useful function for almost anything. Any plans for supporting it?

      +
      +
      +
      +
      + + mattip wrote on 2015-04-07 09:01: +
      +
      +

      Koos Zevenhoven - we have plans to implement all of numpy. With that, it looks like einsum will take quite a bit of work

      +
      +
      +
      + +

      NumPyPy status - January 2015

      + +
      +

      Hi Everyone

      +Here is what has been done in January thanks to the funding of NumPyPy, I would like to thank all the donors and tell you that you can still donate :

      +
        +
      • I have focused on implementing the object dtype this month, it is now possible to store objects inside ndarrays using the object dtype
      • +
      • It is also possible to add an object ndarray to any other ndarray (implementing other operators is trivial)
      • +
      +
      +The next things I plan on working on are :
      +
      +
        +
      • Implementing the missing operations for object arrays
      • +
      • Implementing garbage collection support for object arrays (currently, storing an object inside an ndarray doesn't keep the object alive)
      • +
      • Packaging NumPyPy on PyPI
      • +
      +
      +Cheers
      +
      +
      +Romain
      +
      +
      +
      +
      + + Anonymous wrote on 2015-02-12 02:15: +
      +
      +

      Thanks for the post! This sounds pretty cool.

      The previous post suggested that there would be an update in regards to linalg. Does this mean linalg is working? Is having a working linalg what stands in the way of a working matplotlib? Thanks for answering what might be a naive question!

      +
      +
      +
      +
      + + mattip wrote on 2015-02-12 22:27: +
      +
      +

      Linalg is basically usable with the usual caveats: use PyPy 2.5.0 or later, use pypy/numpy from the bitbucket repo, you can even use matplotlib from my fork at https://github.com/mattip/matplotlib but there is no gui backend available yet, so you can only save the plots to files. Watch this space for the promised blog post, hopefully next week.

      +
      +
      +
      +
      + + Anonymous wrote on 2015-02-13 10:34: +
      +
      +

      Great to hear there is some progress on numpy!

      About matplotlib @mattip. Maybe a GSoC project for the GUI?

      +
      +
      +
      +
      + + Jami wrote on 2015-03-06 20:01: +
      +
      +

      Regarding matplotlib, I whipped up a quick hack that can do at least very simple matplotlib stuff. Based on running a "slave" CPython using RpyC, as I recall was already done in 2011 or so demos.

      Simple stuff can run unmodified, although can be of course slow if there's a lot or frequent data passing from PyPy to CPython.

      Could be probably quite easily done in the other direction too, i.e. running PyPy from CPython.

      https://github.com/jampekka/cpyproxy

      +
      +
      +
      + +

      PyPy 2.5.0 released

      + +
      +
      +
      +
      +

      +PyPy 2.5.0 - Pincushion Protea +

      +We’re pleased to announce PyPy 2.5, which contains significant performance +enhancements and bug fixes.
      +You can download the PyPy 2.5.0 release here:
      + +
      +We would like to thank our donors for the continued support of the PyPy +project, and for those who donate to our three sub-projects, as well as our +volunteers and contributors (10 new committers joined PyPy since the last +release). +We’ve shown quite a bit of progress, but we’re slowly running out of funds. +Please consider donating more, or even better convince your employer to donate, +so we can finish those projects! The three sub-projects are:
        +
      • +
        +
        +Py3k (supporting Python 3.x): We have released a Python 3.2.5 compatible version
        +
        +
        +we call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version
        +
        +
        +
      • +
      • +
        +STM (software transactional memory): We have released a first working version, +and continue to try out new promising paths of achieving a fast multithreaded Python
        +
      • +
      • +
        +NumPy which requires installation of our fork of upstream numpy, +available on bitbucket +
        +
      • +
      +
      +

      +What is PyPy?

      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.
      +This release supports x86 machines on most common operating systems +(Linux 32/64, Mac OS X 64, Windows, and OpenBSD), +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.
      +While we support 32 bit python on Windows, work on the native Windows 64 +bit python is still stalling, we would welcome a volunteer +to handle that.
      +
      +

      +Highlights

      +
        +
      • The past months have seen pypy mature and grow, as rpython becomes the goto +solution for writing fast dynamic language interpreters. Our separation of +rpython and the python interpreter PyPy is now much clearer in the +PyPy documentation and we now have separate RPython documentation.
      • +
      • We have improved warmup time as well as jitted code performance: more than 10% +compared to pypy-2.4.0. +We no longer zero-out memory allocated in the gc nursery by default, work that +was started during a GSoC.
      • +
      • Passing objects between C and PyPy has been improved. We are now able to pass +raw pointers to C (without copying) using pinning. This improves I/O; +benchmarks that use networking intensively improved by about 50%. File() +operations still need some refactoring but are already showing a 20% +improvement on our benchmarks. Let us know if you see similar improvements.
      • +
      • Our integrated numpy support gained much of the GenericUfunc api in order to +support the lapack/blas linalg module of numpy. This dovetails with work in the +pypy/numpy repository to support linalg both through the (slower) cpyext capi +interface and also via (the faster) pure python cffi interface, using an +extended frompyfunc() api. We will soon post a separate blog post specifically +about linalg and PyPy.
      • +
      • Dictionaries are now ordered by default, see the blog post +
      • +
      • Our nightly translations use --shared by default, including on OS/X and linux
      • +
      • We now more carefully handle errno (and GetLastError, WSAGetLastError) tying +the handlers as close as possible to the external function call, in non-jitted +as well as jitted code.
      • +
      • Issues reported with our previous release were resolved after reports from users on +our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at +#pypy.
      • +
      +We have further improvements on the way: rpython file handling, +finishing numpy linalg compatibility, numpy object dtypes, a better profiler, +as well as support for Python stdlib 2.7.9.
      +Please try it out and let us know what you think. We especially welcome +success stories, we know you are using PyPy, please tell us about it!
      +Cheers
      +The PyPy Team
      +
      +
      +
      +
      +
      +
      +
      + + Anonymous wrote on 2015-02-04 07:48: +
      +
      +

      Many-many thanks for your work!

      +
      +
      +
      +
      + + rndblnch wrote on 2015-02-04 15:59: +
      +
      +

      any release schedule for pypy3-2.5?
      how can we help with pypy3?

      +
      +
      +
      +
      + + Jami wrote on 2015-02-09 11:48: +
      +
      +

      Sorry to nag, but when are the news about Scipy/Matplotlib compatibility plans coming? I've been checking daily since the November 28th teaser!

      +
      +
      +
      + +
      +
      + +
      +
      +
      + +
      + + + + \ No newline at end of file diff --git a/blog/index-34.html b/blog/index-34.html new file mode 100644 index 000000000..0d137bfc1 --- /dev/null +++ b/blog/index-34.html @@ -0,0 +1,1782 @@ + + + + + + +PyPy (old posts, page 34) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
      +
      +
      +

      Using CFFI for embedding

      + +
      +

      Introduction

      + +

      CFFI has been a great success so far to call C libraries in your +Python programs, in a way that is both simple and that works across +CPython 2.x and 3.x and PyPy.

      + +

      This post assumes that you know what CFFI is and how to use it in +API mode (ffi.cdef(), ffi.set_source(), ffi.compile()). +A quick overview can be found in this paragraph.

      + +

      The major news of CFFI 1.4, released last December, was that you can +now declare C functions with extern "Python" in the cdef(). +These magic keywords make the function callable from C (where it is +defined automatically), but calling it will call some Python code +(which you attach with the @ffi.def_extern() decorator). This is +useful because it gives a more straightforward, faster and +libffi-independent way to write callbacks. For more details, see the +documentation.

      + +

      You are, in effect, declaring a static family of C functions which +call Python code. The idea is to take pointers to them, and pass them +around to other C functions, as callbacks. However, the idea of a set +of C functions which call Python code opens another path: embedding +Python code inside non-Python programs.

      + +

      Embedding

      + +

      Embedding is traditionally done using the CPython C API: from C code, +you call Py_Initialize() and then some other functions like +PyRun_SimpleString(). In the simple cases it is, indeed, simple +enough; but it can become a complicated story if you throw in +supporting application-dependent object types; and a messy story if +you add correctly running on multiple threads, for example.

      +

      Moreover, this approach is specific to CPython (2.x or 3.x). It does +not work at all on PyPy, which has its own very different, minimal +embedding API.

      + +

      The new-and-coming thing about CFFI 1.5, meant as replacement of the +above solutions, is direct embedding support---with no fixed API at +all. The idea is to write some Python script with a cdef() which +declares a number of extern "Python" functions. When running the +script, it creates the C source code and compiles it to a +dynamically-linked library (.so on Linux). This is the same as in +the regular API-mode usage. What is new is that these extern +"Python" can now also be exported from the .so, in the C +sense. You also give a bit of initialization-time Python code +directly in the script, which will be compiled into the .so too.

      +

      This library can now be used directly from any C program (and it is +still importable in Python). It exposes the C API of your choice, +which you specified with the extern "Python" declarations. You +can use it to make whatever custom API makes sense in your particular +case. You can even directly make a "plug-in" for any program that +supports them, just by exporting the API expected for such plugins.

      + +

      Trying it out on CPython

      + +

      This is still being finalized, but please try it out. You can +see embedding.py directly online for a quick glance. Or +see below the instructions on Linux with CPython 2.7 (CPython 3.x and +non-Linux platforms are still a work in progress right now, but this +should be quickly fixed):

      +
        +
      • +

        get the branch static-callback-embedding of CFFI:

        +
        +hg clone https://foss.heptapod.net/cffi/cffi
        +hg up static-callback-embedding
        +
        +
      • +
      • +

        make the _cffi_backend.so:

        +
        +python setup_base.py build_ext -f -i
        +
        +
      • +
      • +

        run embedding.py in the demo directory:

        +
        +cd demo
        +PYTHONPATH=.. python embedding.py
        +
        +
      • +
      • +

        this produces _embedding_cffi.c. Run gcc to build it. On Linux:

        +
        +gcc -shared -fPIC _embedding_cffi.c -o _embedding_cffi.so  \
        +    -lpython2.7 -I/usr/include/python2.7
        +
        +
      • +
      • +

        try out the demo C program in embedding_test.c:

        +
        +gcc embedding_test.c _embedding_cffi.so
        +PYTHONPATH=.. LD_LIBRARY_PATH=. ./a.out
        +
        +
      • +
      +

      Note that if you get ImportError: cffi extension module +'_embedding_cffi' has unknown version 0x2701, it means that the +_cffi_backend module loaded is a pre-installed one instead of the +more recent one in "..". Be sure to use PYTHONPATH=.. for now. (Some installations manage to be confused enough to load the system-wide cffi even if another version is in the PYTHONPATH. I think a virtualenv can be used to work around this issue.)

      + +

      Try it out on PyPy

      + +

      Very similar steps can be followed on PyPy, but it requires the +cffi-static-callback-embedding branch of PyPy, which you must +first translate from sources. The difference is then that you need to +adapt the first gcc command line: replace -lpython2.7 with +-lpypy-c and to fix the -I path (and possibly add a -L +path).

      + +

      More details

      + +

      How it works, more precisely, is by automatically initializing CPython/PyPy +the first time any of the extern "Python" +functions is called from the C program. This is done using locks in case of multi-threading, +so several threads can concurrently do this "first call". This should work even if two +different threads call the first time a function from two different +embedded CFFI extensions that happen to be linked with the same program. Explicit initialization is +never needed.

      + +

      The custom initialization-time Python code you put in +ffi.embedding_init_code() is executed at that time. If this code +starts to be big, you can move it to independent modules or packages. +Then the initialization-time Python code only needs to import them. In +that case, you have to carefully set up sys.path if the modules are +not installed in the usual Python way.

      +

      If the Python code is big and full of dependencies, a better alternative +would be to use virtualenv. How to do that is not fully fleshed out so +far. You can certainly run the whole program with the environment +variables set up by the virtualenv's activate script first. There +are probably other solutions that involve using gcc's +-Wl,-rpath=\$ORIGIN/ or -Wl,-rpath=/fixed/path/ options to load +a specific libpython or libpypy-c library. If you try it out and it +doesn't work the way you would like, please complain :-)

      +

      Another point: right now this does not support CPython's notion of +multiple subinterpreters. The logic creates a single global Python +interpreter, and runs everything in that context. Maybe a future +version would have an explicit API to do that — or maybe it should be +the job of a 3rd-party extension module to provide a Python interface +over the notion of subinterpreters...

      +

      More generally, any feedback is appreciated.

      +

      Have fun,

      +

      Armin

      +
      +
      +
      +
      + + Armin Rigo wrote on 2016-01-07 13:29: +
      +
      +

      Thanks to apollo13 on irc for early feedback. Main change: the code put in embedding_init_code() should now start with "from xx import ffi", where "xx" is the name of the module (first argument to set_source()). The goal is to clearly say that you need the same line in other modules imported from there.

      +
      +
      +
      +
      + + Unknown wrote on 2016-01-11 14:42: +
      +
      +

      This is very exciting! Just waiting for Python 3.x support now. :)

      +
      +
      +
      +
      + + Armin Rigo wrote on 2016-01-12 07:54: +
      +
      +

      Python 3 is implemented and tested now.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2016-01-12 18:48: +
      +
      +

      Windows support is now done (tested on Python 2.7). Expect a release soon :-)

      +
      +
      +
      +
      + + Armin Rigo wrote on 2016-01-16 18:02: +
      + +
      +
      +
      + + Anonymous wrote on 2016-02-08 00:10: +
      +
      +

      Excellent feature!!

      CFFI rocks, and the documentation keeps improving :)

      +
      +
      +
      +
      + + d.q. wrote on 2016-03-10 10:51: +
      +
      +

      Awesome, pypyInstaller in cross-hairs!

      +
      +
      +
      + +

      Leysin Winter Sprint (20-27th February 2016)

      + +
      +

      The next PyPy sprint will be in Leysin, Switzerland, for the eleventh time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.

      +
      +

      Goals and topics of the sprint

      +

      The details depend on who is here and ready to work. The list of +topics is mostly the same as last year (did PyPy become a mature +project with only long-term goals?):

      +
        +
      • cpyext (CPython C API emulation layer): various speed and +completeness topics
      • +
      • cleaning up the optimization step in the JIT, change the register +allocation done by the JIT's backend, or more improvements to the +warm-up time
      • +
      • finish vmprof - a statistical profiler for CPython and PyPy
      • +
      • Py3k (Python 3.x support), NumPyPy (the numpy module)
      • +
      • STM (Software Transaction Memory), notably: try to come up with +benchmarks, and measure them carefully in order to test and improve +the conflict reporting tools, and more generally to figure out how +practical it is in large projects to avoid conflicts
      • +
      • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off for ski.
      • +
      +
      +
      +

      Exact times

      +

      I have booked the week from Saturday 20 to Saturday 27. It is fine to +leave either the 27 or the 28, or even stay a few +more days on either side. The plan is to work full days between the 21 +and the 27. You are of course allowed to show up for a part of that +time only, too.

      +
      +
      +

      Location & Accommodation

      +

      Leysin, Switzerland, "same place as before". Let me refresh your +memory: both the sprint venue and the lodging will be in a +pair of chalets built specifically for bed & breakfast: +https://www.ermina.ch/. The place has a good ADSL Internet connection +with wireless installed. You can also arrange your own lodging +elsewhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue).

      +

      Please confirm that you are coming so that we can adjust the +reservations as appropriate.

      +

      The options of rooms are a bit more limited than in previous years +because the place for bed-and-breakfast is shrinking: what is +guaranteed is only one double-bed room and a bigger room with 5-6 +individual beds (the latter at 50-60 CHF per night, breakfast +included). If there are more people that would prefer a single room, +please contact me and we'll see what choices you have. There is a +choice of hotels, many of them reasonably priced for Switzerland.

      +

      Please register by Mercurial:

      +
      +https://bitbucket.org/pypy/extradoc/
      https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2016 +
      +

      or on the pypy-dev mailing list if you do not yet have check-in rights:

      +
      +https://mail.python.org/mailman/listinfo/pypy-dev +
      +

      You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around, and at least one EU-format power strip.

      +
      +
      +

      PyPy 4.0.1 released please update

      + +
      +
      +
      +
      +
      +

      +PyPy 4.0.1

      +
      +We have released PyPy 4.0.1, three weeks after PyPy 4.0.0. We have fixed a few critical bugs in the JIT compiled code, reported by users. We therefore encourage all users of PyPy to update to this version. There are a few minor enhancements in this version as well.

      +You can download the PyPy 4.0.1 release here:
      + +
      +We would like to thank our donors for the continued support of the PyPy project.
      +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.

      +

      +
      +

      +

      +CFFI update

      +
      +While not applicable only to PyPy, cffi is arguably our most significant contribution to the python ecosystem. PyPy 4.0.1 ships with cffi-1.3.1 with the improvements it brings.
      +
      +

      +

      +What is PyPy?

      +
      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
      +We also welcome developers of other dynamic languages to see what RPython can do for them.
      +This release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux, and the big- and little-endian variants of ppc64 running Linux.
      +
      +

      +

      +Other Highlights (since 4.0.0 released three weeks ago)

      +

      +

      +
        +
      • +Bug Fixes
          +
        • Fix a bug when unrolling double loops in JITted code
        • +
        • Fix multiple memory leaks in the ssl module, one of which affected CPython as well (thanks to Alex Gaynor for pointing those out)
        • +
        • Use pkg-config to find ssl headers on OS-X
        • +
        • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy
        • +
        +
      • +
      • +New features
          +
        • Internal cleanup of RPython class handling
        • +
        • Support stackless and greenlets on PPC machines
        • +
        • Improve debug logging in subprocesses: use PYPYLOG=jit:log.%d for example to have all subprocesses write the JIT log to a file called ‘log.%d’, with ‘%d’ replaced with the subprocess’ PID.
        • +
        • Support PyOS_double_to_string in our cpyext capi compatibility layer
        • +
        +
      • +
      • +Numpy
          +
        • Improve support for __array_interface__
        • +
        • Propagate most NAN mantissas through float16-float32-float64 conversions
        • +
        +
      • +
      • +Performance improvements and refactorings
          +
        • Improvements in slicing byte arrays
        • +
        • Improvements in enumerate()
        • +
        • Silence some warnings while translating
        • +
        +
      • +
      +Please update, and continue to help us make PyPy better.

      +Cheers
      +The PyPy Team
      +
      +
      +
      +
      +
      +
      +
      +
      + + Marius Gedminas wrote on 2015-11-20 11:20: +
      +
      +

      I'd love to upgrade and see if that makes my segfault go away, but the builds at https://launchpad.net/~pypy/+archive/ubuntu/ppa?field.series_filter=precise are two weeks old?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-11-20 12:06: +
      +
      +

      Hi Marius! How about directing such complaints to the maintainer of the PPA instead of us? :-)

      +
      +
      +
      +
      + + Gerd Puin wrote on 2015-11-27 05:46: +
      +
      +

      Where are the benchmark instructions for the official set?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-11-27 09:57: +
      +
      +

      https://bitbucket.org/pypy/benchmarks , file runner.py. This file has various options; try this: ``python runner.py --changed /path/to/pypy``. This example would compare the speed on top of your system's python and on top of /path/to/pypy. Try also ``--fast`` if you're not patient enough.

      +
      +
      +
      +
      + + Unknown wrote on 2015-11-28 02:54: +
      +
      +

      can I run pandas in PyPy. I am using Python for Data Science

      +
      +
      +
      +
      + + Gerd Puin wrote on 2015-11-28 04:20: +
      +
      +

      Thanks Armin, that got me a result.json file - is there a tool to present the data in a more human-readable way?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-11-29 08:03: +
      +
      +

      The command itself prints a human-readable result at the end; you can ignore result.json.

      +
      +
      +
      +
      + + Gerd Puin wrote on 2015-11-29 18:48: +
      +
      +

      I see. Just an idea - maybe the results could be reviewed on speed.pypy.org via a web interface?

      Cheers!

      +
      +
      +
      + +

      PyPy 4.0.0 Released - A Jit with SIMD Vectorization and More

      + +
      +
      +

      +PyPy 4.0.0

      +We’re pleased and proud to unleash PyPy 4.0.0, a major update of the PyPy python 2.7.10 compatible interpreter with a Just In Time compiler. We have improved warmup time and memory overhead used for tracing, added vectorization for numpy and general loops where possible on x86 hardware (disabled by default), refactored rough edges in rpython, and increased functionality of numpy.
      +You can download the PyPy 4.0.0 release here:
      + +
      +We would like to thank our donors for the continued support of the PyPy project.
      +We would also like to thank our contributors (7 new ones since PyPy 2.6.0) and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on PyPy, or general help with making RPython’s JIT even better.

      +New Version Numbering

      +
      +Since the past release, PyPy 2.6.1, we decided to update the PyPy 2.x.x versioning directly to PyPy 4.x.x, to avoid confusion with CPython 2.7 and 3.5. Note that this version of PyPy uses the stdlib and implements the syntax of CPython 2.7.10.
      +

      +Vectorization

      +
      +Richard Plangger began work in March and continued over a Google Summer of Code to add a vectorization step to the trace optimizer. The step recognizes common constructs and emits SIMD code where possible, much as any modern compiler does. This vectorization happens while tracing running code, so it is actually easier at run-time to determine the availability of possible vectorization than it is for ahead-of-time compilers.
      +Availability of SIMD hardware is detected at run time, without needing to precompile various code paths into the executable.
      +The first version of the vectorization has been merged in this release; since it is so new, it is off by default. To enable the vectorization in built-in JIT drivers (like numpy ufuncs), add --jit vec=1; to enable all implemented vectorization, add --jit vec_all=1
      +Benchmarks and a summary of this work appear here +
      +

      +Internal Refactoring: Warmup Time Improvement and Reduced Memory Usage

      +
      +Maciej Fijalkowski and Armin Rigo refactored internals of RPython that now allow PyPy to more efficiently use guards in jitted code. They also rewrote unrolling, leading to a warmup time improvement of 20% or so. The reduction in guards also means a reduction in the use of memory, also a savings of around 20%.
      +
      +

      +Numpy

      +
      +Our implementation of numpy continues to improve. ndarray and the numeric dtypes are very close to feature-complete; record, string and unicode dtypes are mostly supported. We have reimplemented numpy linalg, random and fft as cffi-1.0 modules that call out to the same underlying libraries that upstream numpy uses. Please try it out, especially using the new vectorization (via --jit vec=1 on the command line) and let us know what is missing for your code.
      +
      +

      +CFFI

      +
      +While not applicable only to PyPy, cffi is arguably our most significant contribution to the python ecosystem. Armin Rigo continued improving it, and PyPy reaps the benefits of cffi-1.3: improved management of object lifetimes, __stdcall on Win32, ffi.memmove(), and percolate const, restrict keywords from cdef to C code.
      +
      +

      +What is PyPy?

      +
      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
      +We also welcome developers of other dynamic languages to see what RPython can do for them.
      +This release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.
      +We also introduce support for the 64 bit PowerPC hardware, specifically Linux running the big- and little-endian variants of ppc64.
      +
      +

      +Other Highlights (since 2.6.1 release two months ago)

      +
        +
      • +Bug Fixes
          +
        • Applied OPENBSD downstream fixes
        • +
        • Fix a crash on non-linux when running more than 20 threads
        • +
        • In cffi, ffi.new_handle() is more cpython compliant
        • +
        • Accept unicode in functions inside the _curses cffi backend exactly like cpython
        • +
        • Fix a segfault in itertools.islice()
        • +
        • Use gcrootfinder=shadowstack by default, asmgcc on linux only
        • +
        • Fix ndarray.copy() for upstream compatibility when copying non-contiguous arrays
        • +
        • Fix assumption that lltype.UniChar is unsigned
        • +
        • Fix a subtle bug with stacklets on shadowstack
        • +
        • Improve support for the cpython capi in cpyext (our capi compatibility layer). Fixing these issues inspired some thought about cpyext in general, stay tuned for more improvements
        • +
        • When loading dynamic libraries, in case of a certain loading error, retry loading the library assuming it is actually a linker script, like on Arch and Gentoo
        • +
        • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy
        • +
        +
      • +
      • +New features:
          +
        • Add an optimization pass to vectorize loops using x86 SIMD intrinsics.
        • +
        • Support __stdcall on Windows in CFFI
        • +
        • Improve debug logging when using PYPYLOG=???
        • +
        • Deal with platforms with no RAND_egd() in OpenSSL
        • +
        +
      • +
      • +Numpy:
          +
        • Add support for ndarray.ctypes
        • +
        • Fast path for mixing numpy scalars and floats
        • +
        • Add support for creating Fortran-ordered ndarrays
        • +
        • Fix casting failures in linalg (by extending ufunc casting)
        • +
        • Recognize and disallow (for now) pickling of ndarrays with objects embedded in them
        • +
        +
      • +
      • +Performance improvements and refactorings:
          +
        • Reuse hashed keys across dictionaries and sets
        • +
        • Refactor JIT internals to improve warmup time by 20% or so at the cost of a minor regression in JIT speed
        • +
        • Recognize patterns of common sequences in the JIT backends and optimize them
        • +
        • Make the garbage collector more incremental over external_malloc() calls
        • +
        • Share guard resume data where possible which reduces memory usage
        • +
        • Fast path for zip(list, list)
        • +
        • Reduce the number of checks in the JIT for lst[a:]
        • +
        • Move the non-optimizable part of callbacks outside the JIT
        • +
        • Factor in field immutability when invalidating heap information
        • +
        • Unroll itertools.izip_longest() with two sequences
        • +
        • Minor optimizations after analyzing output from vmprof and trace logs
        • +
        • Remove many class attributes in rpython classes
        • +
        • Handle getfield_gc_pure* and getfield_gc_* uniformly in heap.py
        • +
        • Improve simple trace function performance by lazily calling fast2locals and locals2fast only if truly necessary
        • +
        +
      • +
      +
      +
      +
      +
      +
      +Please try it out and let us know what you think. We welcome feedback, we know you are using PyPy, please tell us about it!
      +Cheers
      +The PyPy Team
      +
      +
      +
      +



      +
      +
      +
      +
      +
      +
      + + Gerrit Slonzer wrote on 2015-10-29 14:17: +
      +
      +

      With the SIMD run-time detection implemented, has the --jit-backend option become redundant?

      +
      +
      +
      +
      + + stryker wrote on 2015-10-29 18:07: +
      +
      +

      Will a similar release be coming for Python 3.5?

      +
      +
      +
      +
      + + John M. Camara wrote on 2015-10-29 21:44: +
      +
      +

      @Gerrit, they are 2 different things. One is the option to say you are interested in the SIMD support and the other is a check if SIMD support is available in the HW if you are interested in using it. I'm sure once SIMD support has been used for some time it will eventually be enabled by default but since it is new and potentially could have some unknown issues you have to explicitly enable it at this time.

      +
      +
      +
      +
      + + Niklas B wrote on 2015-10-30 10:07: +
      +
      +

      Awesome, can't wait to try it

      +
      +
      +
      +
      + + Unknown wrote on 2015-10-30 19:31: +
      +
      +

      Well done, thx!

      +
      +
      +
      +
      + + Travis Griggs wrote on 2015-10-31 00:31: +
      +
      +

      I keep watching the progress of PyPy with excitement. Cool things happening here. But I continue to be disappointed that it doesn't tip towards Python3. It's dead to me until that becomes the majority effort. :(

      +
      +
      +
      +
      + + Carl Friedrich Bolz-Tereick wrote on 2015-10-31 00:35: +
      +
      +

      The PyPy project contains a large plurality of interests. A lot of the people working on it are volunteers. So PyPy3 will happen if people within the project become interested in that part, or if new people with that interest join the project. At the moment, this seems not happening, which we can all be sad about. However, blaming anybody with differing interest for that situation feels a bit annoying to me.

      +
      +
      +
      +
      + + Travis Griggs wrote on 2015-10-31 07:15: +
      +
      +

      Well said, I apologize for any whining tone. It was not my intent to blame or complain. It really was just meant as a lamentation. Thanks for all you do.

      +
      +
      +
      +
      + + PeteVine wrote on 2015-10-31 17:47: +
      +
      +

      What happened to my comment? Surely the benchmark I was proposing is not censorable...

      +
      +
      +
      +
      + + Maciej Fijalkowski wrote on 2015-10-31 18:14: +
      +
      +

      @PeteVine you posted a random executable from dropbox claiming to have pypy with x87 backend. PyPy does not have an x87 backend and this raises suspicions this was just some malware. Now if you want someone to compare one thing against some other thing, please link to sources and not random binaries so the person comparing can look themselves. Additionally, you did not post a benchmark, just a link to the binary.

      +
      +
      +
      +
      + + PeteVine wrote on 2015-10-31 19:29: +
      +
      +

      Well, I was suggesting benchmarking the 32-bit backends to see how much difference SIMD makes - x87 means the standard fpu whereas the default uses SSE2. I know it's processor archaeology so you may have forgotten pypy even had it ;)

      The ready-to-use pypy distro (built by me) was meant for anyone in possession of a real set of benchmarks (not synthetic vector stuff) to be able to try it quickly.

      And btw, you could have simply edited the dropbox link out. I'd already tested py3k using this backend and mentioned it in one of the issues on bitbucket so it's far from random.

      @ all the people asking about pypy3 - you have the python 3.2 compatible pypy (py3k) at your disposal even now.

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-10-31 20:54: +
      +
      +

      @PeteVine: to clarify, PyPy has no JIT backend emitting the old-style x87 fpu instructions. What you are posting is very likely a PyPy whose JIT doesn't support floats at all. It emits calls to already-compiled functions, like the one doing addition of float objects, instead of writing a SSE2 float addition on unboxed objects.

      Instead, use the official PyPy and run it with vectorization turned on and off (as documented) on the same modern machine. This allows an apple-to-apple comparison.

      +
      +
      +
      +
      + + PeteVine wrote on 2015-10-31 22:18: +
      +
      +

      Thanks for clarifying, I must have confused myself after seeing it was i486 compatible.

      Are you saying the only difference between the backends I wanted to benchmark would boil down to jit-emitting performance and not actual pypy performance? (I must admit I tried this a while ago with fibonacci and there was no difference at all).

      In other words, even before vectorization functionality was added, shouldn't it be possible to detect that the non-SSE2 backend is running on newer hardware and use the available SIMD? (e.g. for max. compatibility)

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-11-03 11:33: +
      +
      +

      @PeteVine Sorry, I don't understand your questions. Why do you bring the JIT-emitting performance to the table? And why fibonacci (it's not a benchmark with floats at all)? And I don't get the last question either ("SIMD" = "vectorization").

      To some people, merely dropping the word "SIMD" into a performance discussion makes them go "ooh nice" even if they don't have a clue what it is. I hope you're more knowledgeable than that and that I'm merely missing your point :-)

      +
      +
      +
      +
      + + PeteVine wrote on 2015-11-03 13:49: +
      +
      +

      The last part should have been pretty clear as it was referring to the newly added --jit vec=1 so it's not me who's dropping SIMD here (shorthand for different instruction sets) as can be seen in the title of this blog post.

      All this time I was merely interested in comparing the two 32-bit backends, that's all. One's using the i486/x87 instruction set regardless of any jit codes, the other is able to take advantage of anything up to SSE2. The quick fibonacci test was all I did so you could have pointed me to a real set of benchmarks instead of throwing these little jabs :)

      +
      +
      +
      +
      + + Carl Friedrich Bolz-Tereick wrote on 2015-11-03 15:12: +
      +
      +

      @PeteVine: ok, there is a misunderstanding somewhere, I think. Let me try to clarify: PyPy's JIT has always used non-SIMD SSE2 instructions to implement floating point operations. We have a slow mode where only x87 instructions are used, but usually don't fall back to that, and it does not make sense to compare against that mode.

      What the new release experimentally added is support for SIMD SSE instructions using autoparallelization when --jit vec=1 is given. This only works if your program uses numpy arrays or other simple list processing code. For details on that (and for benchmarks) it's probably best to read Richard Plangger's blog.

      Does that make sense?

      +
      +
      +
      +
      + + PeteVine wrote on 2015-11-03 15:46: +
      +
      +

      Great, love that explanation! :)

      But please, I'd really like to see how much of a handicap the much-maligned non-SSE2 backend incurs. Could you recommend a set of python (not purely computational) benchmarks so I can put this peevee of mine to rest/test?

      Anyways, @Armin Rigo is a great educator himself judging from his patient replies in the bugtracker! So yeah, kudos to you guys!

      +
      +
      +
      +
      + + Carl Friedrich Bolz-Tereick wrote on 2015-11-03 15:59: +
      +
      +

      If you want to try a proper performance evaluation, the official benchmark set is probably the right one: https://bitbucket.org/pypy/benchmarks/

      However, none of these benchmarks are exercising the new autovectorization. If you're particularly interested in that part, use the benchmarks from Richard's blog.

      +
      +
      +
      +
      + + NortonCommander4ever wrote on 2015-11-09 14:33: +
      +
      +

      Is there a readme on how to use these benchmarks somewhere? (preferably written with windows users in mind, if you know what I mean:))

      +
      +
      +
      + +

      Automatic SIMD vectorization support in PyPy

      + +
      +
      +Hi everyone,

      +it took some time to catch up with the JIT refactorings merged this summer. But, (drums) we are happy to announce that:

      +The next release of PyPy,  "PyPy 4.0.0", will ship the new auto vectorizer +

      +The goal of this project was to increase the speed of numerical applications in both the NumPyPy library and for arbitrary Python programs. In PyPy we have focused a lot on improvements in the 'typical python workload', which usually involves object and string manipulations, mostly for web development. We're hoping with this work that we'll continue improving the other very important Python use case - numerics.

      +What it can do! +

      +It targets numerics only. It +will not execute object manipulations faster, but it is capable of +enhancing common vector and matrix operations.
      +Good news is that it is not specifically targeted for the NumPy library and the PyPy +virtual machine. Any interpreter (written in RPython) is able to make use +of the vectorization. For more information about that take a look here, or consult the documentation. For the time being it is not turned on by default, so be sure to enable it by specifying --jit vec=1 before running your program.

      +If your language (written in RPython) contains many array/matrix operations, you can easily integrate the optimization by adding the parameter 'vec=1' to the JitDriver.

      +NumPyPy Improvements +

      + +Let's take a look at the core functions of the NumPyPy library (*).
      The following tests show the speedup of the core functions commonly used in Python code interfacing with NumPy, on CPython with NumPy, on the PyPy 2.6.1 released several weeks ago, and on PyPy 15.11 to be released soon. Timeit was used to test the time needed to run the operation in the plot title on various vector (lower case) and square matrix (upper case) sizes displayed on the X axis. The Y axis shows the speedup compared to CPython 2.7.10. This means that higher is better.

      + +
      +
      +
      +
      +
      +In comparison to PyPy 2.6.1, the speedup greatly improved. The hardware support really strips down the runtime of the vector and matrix operations. There is another operation we would like to highlight: the dot product.
      It is a very common operation in numerics and PyPy now (given a moderate sized matrix and vector) decreases the time spent in that operation. See for yourself:

      + +
      +
      +
      +These are nice improvements in the NumPyPy library and we got to a competitive level only making use of SSE4.1.

      +Future work   +

      +
      This is not the end of the road. The GSoC project showed that it is possible to implement this optimization in PyPy. There might be other improvements we can make to carry this further:
        +
      • Check alignment at runtime to increase the memory throughput of the CPU
      • +
      • Support the AVX vector extension which (at least) doubles the size of the vector register
      • +
      • Handle each and every corner case in Python traces to enable it  globally
      • +
      • Do not rely only on loading operations to trigger the analysis, there might be cases where combination of floating point values could be done in parallel
      • +
      +Cheers,
      +The PyPy Team

      +(*) The benchmark code can be found here; it was run using this configuration: i7-2600 CPU @ 3.40GHz (4 cores). +

      +
      +
      +
      +
      +
      + + Nax wrote on 2015-10-20 20:27: +
      +
      +

      Which BLAS are u using for CPython Numpy? OpenBlas?

      +
      +
      +
      +
      + + crusaderky wrote on 2015-10-20 22:20: +
      +
      +

      How does it compare to numexpr on those benchmarks?

      Also, any plan of addressing one of the killer features of numexpr, that is the fact that an operation like y += a1*x1 + a2*x2 + a3*x3 will create 5 temporary vectors and make a horrible usage of the CPU cache?

      +
      +
      +
      +
      + + Anonymous wrote on 2015-10-21 05:03: +
      +
      +

      I don't know anyone who uses NumPy for arrays with less than 128 elements.

      Your own benchmark shows NumPypy is much slower than NumPy for large arrays...

      +
      +
      +
      +
      + + Unknown wrote on 2015-10-21 08:44: +
      +
      +

      NumPyPy is currently not complete. Trying to evaluate any numexpr gives a strange error. I guess the problem is a missing field not exported by NumPyPy.
      However we will see how far we can get with this approach. I have made some thoughts on how we could make good use of graphics cards, but this is future work.

      +
      +
      +
      +
      + + René Dudfield wrote on 2015-10-21 11:14: +
      +
      +

      Nice work!

      +
      +
      +
      + +

      PowerPC backend for the JIT

      + +
      +

      Hi all,

      + +

      PyPy's JIT now supports the 64-bit PowerPC architecture! This is the +third architecture supported, in addition to x86 (32 and 64) and ARM +(32-bit only). More precisely, we support Linux running the big- and the +little-endian variants of ppc64. Thanks to IBM for funding this work!

      + +

      The new JIT backend has been merged into "default". You should be able +to translate PPC versions +as usual +directly on the machines. For +the foreseeable future, I will compile and distribute binary versions +corresponding to the official releases (for Fedora), but of course I'd +welcome it if someone else could step in and do it. Also, it is unclear +yet if we will run a buildbot.

      + +

      To check that the result performs well, I logged in to a ppc64le machine +and ran the usual benchmark suite of PyPy (minus sqlitesynth: sqlite +was not installed on that machine). I ran it twice at a difference of +12 hours, as an attempt to reduce risks caused by other users suddenly +using the machine. The machine was overall relatively quiet. Of +course, this is scientifically not good enough; it is what I could come +up with given the limited resources.

      + +

      Here are the results, where the numbers are speed-up factors between the +non-jit and the jit version of PyPy. The first column is x86-64, for +reference. The second and third columns are the two ppc64le runs. All +are Linux. A few benchmarks are not reported here because the runner +doesn't execute them on non-jit (however, apart from sqlitesynth, they +all worked).

      + +
      +    ai                        13.7342        16.1659     14.9091
      +    bm_chameleon               8.5944         8.5858        8.66
      +    bm_dulwich_log             5.1256         5.4368      5.5928
      +    bm_krakatau                5.5201         2.3915      2.3452
      +    bm_mako                    8.4802         6.8937      6.9335
      +    bm_mdp                     2.0315         1.7162      1.9131
      +    chaos                     56.9705        57.2608     56.2374
      +    sphinx
      +    crypto_pyaes               62.505         80.149     79.7801
      +    deltablue                  3.3403         5.1199      4.7872
      +    django                    28.9829         23.206       23.47
      +    eparse                     2.3164         2.6281       2.589
      +    fannkuch                   9.1242        15.1768     11.3906
      +    float                     13.8145        17.2582     17.2451
      +    genshi_text               16.4608        13.9398     13.7998
      +    genshi_xml                 8.2782         8.0879      9.2315
      +    go                         6.7458        11.8226     15.4183
      +    hexiom2                   24.3612        34.7991     33.4734
      +    html5lib                   5.4515         5.5186       5.365
      +    json_bench                28.8774        29.5022     28.8897
      +    meteor-contest             5.1518         5.6567      5.7514
      +    nbody_modified            20.6138        22.5466     21.3992
      +    pidigits                   1.0118          1.022      1.0829
      +    pyflate-fast               9.0684        10.0168     10.3119
      +    pypy_interp                3.3977         3.9307      3.8798
      +    raytrace-simple           69.0114       108.8875    127.1518
      +    richards                  94.1863       118.1257    102.1906
      +    rietveld                   3.2421         3.0126      3.1592
      +    scimark_fft
      +    scimark_lu
      +    scimark_montecarlo
      +    scimark_sor
      +    scimark_sparsematmul
      +    slowspitfire               2.8539         3.3924      3.5541
      +    spambayes                  5.0646         6.3446       6.237
      +    spectral-norm             41.9148        42.1831     43.2913
      +    spitfire                   3.8788         4.8214       4.701
      +    spitfire_cstringio          7.606         9.1809      9.1691
      +    sqlitesynth
      +    sympy_expand               2.9537         2.0705      1.9299
      +    sympy_integrate            4.3805         4.3467      4.7052
      +    sympy_str                  1.5431         1.6248      1.5825
      +    sympy_sum                  6.2519          6.096      5.6643
      +    telco                     61.2416        54.7187     55.1705
      +    trans2_annotate
      +    trans2_rtype
      +    trans2_backendopt
      +    trans2_database
      +    trans2_source
      +    twisted_iteration         55.5019        51.5127     63.0592
      +    twisted_names              8.2262         9.0062      10.306
      +    twisted_pb                12.1134         13.644     12.1177
      +    twisted_tcp                4.9778          1.934      5.4931
      +
      +    GEOMETRIC MEAN               9.31           9.70       10.01
      +
      + +

      The last line reports the geometric mean of each column. We see that +the goal was reached: PyPy's JIT actually improves performance by a +factor of around 9.7 to 10 times on ppc64le. By comparison, it "only" +improves performance by a factor 9.3 on Intel x86-64. I don't know why, +but I'd guess it mostly means that a non-jitted PyPy performs slightly +better on Intel than it does on PowerPC.

      + +

      Why is that? Actually, if we do the same comparison with an ARM +column too, we also get higher numbers there than on Intel. +When we discovered that a few years ago, we guessed that +on ARM running the whole interpreter in +PyPy takes up a lot of resources, e.g. of instruction cache, which the +JIT's assembler doesn't need any more after the process is warmed up. +And caches are much bigger on Intel. However, PowerPC is much closer +to Intel, so this argument doesn't work for PowerPC. +But there are other more subtle +variants of it. Notably, Intel is doing crazy things about branch +prediction, which likely helps a big interpreter---both the non-JITted +PyPy and CPython, and both for the interpreter's main loop itself and +for the numerous indirect branches that depend on the types of the +objects. Maybe the PowerPC is as good as Intel, and so this argument +doesn't work either. Another one would be: +on PowerPC I did notice that gcc itself is not +perfect at optimization. During development of this backend, I often +looked at assembler produced by gcc, and there are a number of small +inefficiencies there. All these are factors that slow down the +non-JITted version of PyPy, but don't influence the speed of the +assembler produced just-in-time.

      + +

      Anyway, this is just guessing. The fact remains that PyPy can now +be used on PowerPC machines. Have fun!

      + +

      A bientôt,

      + +

      Armin.

      +
      +

      PyPy memory and warmup improvements (2) - Sharing of Guards

      + +
      +
      +

      Hello everyone!

      +

      This is the second part of the series of improvements in warmup time and +memory consumption in the PyPy JIT. This post covers recent work on sharing guard +resume data that was recently merged to trunk. It will be a part +of the next official PyPy release. To understand what it does, let's +start with a loop for a simple example:

      +
      +class A(object):
      +    def __init__(self, x, y):
      +        self.x = x
      +        self.y = y
      +
      +    def call_method(self, z):
      +        return self.x + self.y + z
      +
      +def f():
      +    s = 0
      +    for i in range(100000):
      +        a = A(i, 1 + i)
      +        s += a.call_method(i)
      +
      +

      At the entrance of the loop, we have the following set of operations:

      +
      +
      guard(i5 == 4)
      +
      guard(p3 is null)
      +p27 = p2.co_cellvars +p28 = p2.co_freevars +
      guard_class(p17, 4316866008, descr=<Guard0x104295e08>)
      +p30 = p17.w_seq +
      guard_nonnull(p30, descr=<Guard0x104295db0>)
      +i31 = p17.index +p32 = p30.strategy +
      guard_class(p32, 4317041344, descr=<Guard0x104295d58>)
      +p34 = p30.lstorage +i35 = p34..item0 +
      +

      The above operations get executed at the entrance, so each time we call f(). They ensure +all the optimizations done below stay valid. Now, as long as nothing +out of the ordinary happens, they only ensure that the world around us never changed. However, if e.g. someone puts new +methods on class A, any of the above guards might fail. Despite the fact that it's a very unlikely +case, PyPy needs to track how to recover from such a situation. Each of those points needs to keep the full +state of the optimizations performed, so we can safely deoptimize them and reenter the interpreter. +This is vastly wasteful since most of those guards never fail, hence some sharing between guards +has been performed.

      +

      We went a step further - when two guards are next to each other or the +operations in between them don't have side effects, we can safely redo the operations or, to put it +simply, resume in the previous guard. That means every now and again we execute a few +operations extra, but not storing extra info saves quite a bit of time and memory. This is similar to the approach that LuaJIT takes, which is called sparse snapshots.

      + +

      +I've done some measurements on annotating & rtyping translation of pypy, which +is a pretty memory hungry program that compiles a fair bit. I measured, respectively:

      +
        +
      • total time the translation step took (annotating or rtyping)
      • +
      • time it took for tracing (that excludes backend time for the total JIT time) at +the end of rtyping.
      • +
      • memory the GC feels responsible for after the step. The real amount of memory +consumed will always be larger and the coefficient of savings is in the 1.5-2x range
      • +
      +

      Here is the table:

      + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      branchtime annotationtime rtypingmemory annotationmemory rtypingtracing time
      default317s454s707M1349M60s
      sharing302s430s595M1070M51s
      win4.8%5.5%19%26%17%
      +

      Obviously pypy translation is an extreme example - the vast majority of the code out there +does not have that many lines of code to be jitted. However, it's at the very least +a good win for us :-)

      +

      We will continue to improve the warmup performance and keep you posted!

      +

      Cheers,
      +fijal

      +
      +
      +
      +
      +
      +
      + + Ernst Sjöstrand wrote on 2015-10-05 20:14: +
      +
      +

      "when two guards are next to each other or the operations in between them don't have side effects, we can safely redo the operations or to simply put, resume in the previous guard"
      Wait... "side effects", "redo"... Does this have synergies with STM?

      +
      +
      +
      +
      + + Maciej Fijalkowski wrote on 2015-10-06 05:45: +
      +
      +

      A side-effect-free operation is one that does not have any side effects. This means that you can execute the operation again (e.g. reading a field or adding two numbers) and it will affect nothing but its result. As for redo - well, it has nothing to do with STM, but doing pure operations again can be sometimes useful (in short - if you have i = a + b, you don't remember the i, just a, b and that i = a + b)

      +
      +
      +
      + +

      PyPy warmup improvements

      + +
      +
      + +

      Hello everyone!

      +

      I'm very pleased to announce that we've just managed to merge +the optresult branch. +Under this cryptic name is the biggest JIT refactoring we've done in a couple +years, mostly focused on the warmup time and memory impact of PyPy.

      +

      To understand why we did that, let's look back in time - back when we +got the first working JIT prototype in 2009 we were focused exclusively +on achieving peak performance with some consideration towards memory usage, but +without serious consideration towards warmup time. This means we accumulated +quite a bit of technical debt over time that we're trying, with difficulty, +to address right now. This branch mostly does not affect the peak performance +- it should however help you with short-living scripts, like test runs.

      +

      We identified warmup time to be one of the major pain points for pypy users, +along with memory impact and compatibility issues with the CPython C extension +world. While we can't address all the issues at once, we're trying to address +the first two in the work contributing to this blog post. I will write +a separate article on the last item.

      +

      To see how much of a problem warmup is for your program, you can run your +program with PYPYLOG=jit-summary:- environment variable set. +This should show you something like this:

      +
      +(pypy-optresult)fijal@hermann:~/src/botbot-web$ PYPYLOG=jit-summary:- python orm.py 1500
      +[d195a2fcecc] {jit-summary
      +Tracing:            781     2.924965
      +Backend:            737     0.722710
      +TOTAL:                      35.912011
      +ops:                1860596
      +recorded ops:       493138
      +  calls:            81022
      +guards:             131238
      +opt ops:            137263
      +opt guards:         35166
      +forcings:           4196
      +abort: trace too long:      22
      +abort: compiling:   0
      +abort: vable escape:        22
      +abort: bad loop:    0
      +abort: force quasi-immut:   0
      +nvirtuals:          183672
      +nvholes:            25797
      +nvreused:           116131
      +Total # of loops:   193
      +Total # of bridges: 575
      +Freed # of loops:   6
      +Freed # of bridges: 75
      +[d195a48de18] jit-summary}
      +
      +

      This means that the total (wall clock) time was 35.9s, out of which we spent +2.9s tracing 781 loops and 0.72s compiling them. The remaining couple were +aborted (trace too long is normal, vable escape means someone called +sys._getframe() or equivalent). You can do the following things:

      +
        +
      • compare the numbers with pypy --jit off and see at which number of +iterations pypy jit kicks in
      • +
      • play with the thresholds: +pypy --jit threshold=500,function_threshold=400,trace_eagerness=50 was +much better in this example. What this does is to lower the threshold +for tracing loops from the default of 1039 to 500, the threshold for tracing +functions from the start from 1619 to 400 and the threshold for tracing bridges +from 200 to 50. Bridges are "alternative paths" that JIT did not take that +are being additionally traced. We believe in sane defaults, so we'll try +to improve upon those numbers, but generally speaking there is no one-size +fits all here.
      • +
      • if the tracing/backend time stays high, come and complain to us with +benchmarks, we'll try to look at them
      • +
      +

      Warmup, as a number, is notoriously hard to measure. It's a combination of:

      +
        +
      • pypy running interpreter before jitting
      • +
      • pypy needing time to JIT the traces
      • +
      • additional memory allocations needed during tracing to accommodate bookkeeping +data
      • +
      • exiting and entering assembler until there is enough coverage of assembler
      • +
      +

      We're working hard on making a better assessment of this number, stay tuned :-)

      +
      +

      Speedups

      +

      Overall we measured about 50% speed improvement in the optimizer, which reduces +the overall warmup time between 10% and 30%. The very +obvious warmup benchmark got a speedup from 4.5s to 3.5s, almost +30% improvement. Obviously the speedups on benchmarks would vastly +depend on how much warmup time is there in those benchmarks. We observed +annotation of pypy to decrease by about 30% and the overall translation +time by about 7%, so your mileage may vary.

      +

      Of course, as usual with the large refactoring of a crucial piece of PyPy, +there are expected to be bugs. We are going to wait for the default branch +to stabilize so you should see warmup improvements in the next release. +If you're not afraid to try, nightlies will already have them.

      +

      We're hoping to continue improving upon warmup time and memory impact in the +future, stay tuned for improvements.

      +
      +
      +

      Technical details

      +

      The branch does "one" thing - it changes the underlying model of how operations +are represented during tracing and optimizations. Let's consider a simple +loop like:

      +
      +[i0, i1]
      +i2 = int_add(i0, i1)
      +i3 = int_add(i2, 1)
      +i4 = int_is_true(i3)
      +guard_true(i4)
      +jump(i3, i2)
      +
      +

      The original representation would allocate a Box for each of i0 - i4 +and then store those boxes in instances of ResOperation. The list of such +operations would then go to the optimizer. Those lists are big - we usually +remove 90% of them during optimizations, but they can be a couple thousand +elements. Overall, allocating those big lists takes a toll on warmup time, +especially due to the GC pressure. The branch removes the existence of Box +completely, instead using a link to ResOperation itself. So say in the above +example, i2 would refer to its producer - i2 = int_add(i0, i1) with +arguments getting special treatment.

      +

      That alone reduces the GC pressure slightly, but a reduced number +of instances also lets us store references on them directly instead +of going through expensive dictionaries, which were used to store optimizing +information about the boxes.

      +

      Cheers!
      +fijal & arigo

      +
      + +
      +
      +
      +

      PyPy 2.6.1 released

      + +
      +
      +
      +
      +
      +

      +PyPy 2.6.1

      +We’re pleased to announce PyPy 2.6.1, an update to PyPy 2.6.0 released June 1. +We have fixed many issues, updated stdlib to 2.7.10, cffi to version 1.3, extended support for +the new vmprof statistical profiler for multiple threads, and increased +functionality of numpy.
      +You can download the PyPy 2.6.1 release here:
      + +
      +We would like to thank our donors for the continued support of the PyPy +project, and our volunteers and contributors.

      +We would also like to encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on pypy, or general help with making +RPython’s JIT even better.

      +

      +What is PyPy?

      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.

      +This release supports x86 machines on most common operating systems +(Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.

      +We also welcome developers of other +dynamic languages to see what RPython can do for them.
      +
      +

      +Highlights

      +
        +
      • Bug Fixes
          +
        • Revive non-SSE2 support
        • +
        • Fixes for detaching _io.Buffer*
        • +
        • On Windows, close (and flush) all open sockets on exiting
        • +
        • Drop support for ancient macOS v10.4 and before
        • +
        • Clear up contention in the garbage collector between trace-me-later and pinning
        • +
        • Issues reported with our previous release were resolved after reports from users on +our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at +#pypy.
        • +
        +
      • +
      • New features:
          +
        • cffi was updated to version 1.3
        • +
        • The python stdlib was updated to 2.7.10 from 2.7.9
        • +
        • vmprof now supports multiple threads and OS X
        • +
        • The translation process builds cffi import libraries for some stdlib +packages, which should prevent confusion when package.py is not used
        • +
        • better support for gdb debugging
        • +
        • freebsd should be able to translate PyPy “out of the box” with no patches
        • +
        +
      • +
      • Numpy:
          +
        • Better support for record dtypes, including the align keyword
        • +
        • Implement casting and create output arrays accordingly (still missing some corner cases)
        • +
        • Support creation of unicode ndarrays
        • +
        • Better support ndarray.flags
        • +
        • Support axis argument in more functions
        • +
        • Refactor array indexing to support ellipses
        • +
        • Allow the docstrings of built-in numpy objects to be set at run-time
        • +
        • Support the buffered nditer creation keyword
        • +
        +
      • +
      • Performance improvements:
          +
        • Delay recursive calls to make them non-recursive
        • +
        • Skip loop unrolling if it compiles too much code
        • +
        • Tweak the heapcache
        • +
        • Add a list strategy for lists that store both floats and 32-bit integers. +The latter are encoded as nonstandard NaNs. Benchmarks show that the speed +of such lists is now very close to the speed of purely-int or purely-float +lists.
        • +
        • Simplify implementation of ffi.gc() to avoid most weakrefs
        • +
        • Massively improve the performance of map() with more than +one sequence argument
        • +
        +
      • +
      +Please try it out and let us know what you think. We welcome +success stories, experiments, or benchmarks. We know you are using PyPy, so please tell us about it!
      +Cheers
      +The PyPy Team
      +
      +
      +
      +
      +
      +
      +
      +
      + + Anonymous wrote on 2015-09-02 13:28: +
      +
      +

      Cool! Really nice, thank you. Any ETA for Python 3.3 compatibility in pypy?

      +
      +
      +
      +
      + + xndxn wrote on 2015-09-03 17:37: +
      +
      +

      Thanks!

      +
      +
      +
      +
      + + Anonymous wrote on 2015-09-03 18:56: +
      +
      +

      Thanks!

      +
      +
      +
      +
      + + Anonymous wrote on 2015-09-04 05:01: +
      +
      +

      Still waiting for PyPy3's update. The latest version of PyPy is much faster than the latest version of PyPy3. Please update soon. :)

      +
      +
      +
      +
      + + PeteVine wrote on 2015-09-14 00:03: +
      +
      +

      Contrary to what the front page is still saying, the non-SSE2 backend for older x86 processors is fully working and can be built from source, which takes almost 7h on a 2.2GHz Athlon XP.

      You can download a 2.6.1 build from here:

      https://www.dropbox.com/sh/6i7ktwv9551asfc/AADOd55Br0lDJRH8HsKpbIwTa?dl=0

      It should work on any P2 class processor.

      +
      +
      +
      + +

      PyPy and ijson - a guest blog post

      + +
      +
      +This gem was posted in the ijson issue tracker after some discussion on #pypy, and Dav1dde kindly allowed us to repost it here:

      "So, I was playing around with parsing huge JSON files (19GiB, testfile is ~520MiB) and wanted to try a sample code with PyPy, turns out, PyPy needed ~1:30-2:00 whereas CPython 2.7 needed ~13 seconds (the pure python implementation on both pythons was equivalent at ~8 minutes).

      "Apparantly ctypes is really bad performance-wise, especially on PyPy. So I made a quick CFFI mockup: https://gist.github.com/Dav1dde/c509d472085f9374fc1d

      +Before:

      CPython 2.7:
          python -m emfas.server size dumps/echoprint-dump-1.json
          11.89s user 0.36s system 98% cpu 12.390 total 

      +PYPY:
          python -m emfas.server size dumps/echoprint-dump-1.json
          117.19s user 2.36s system 99% cpu 1:59.95 total


      +After (CFFI):

      CPython 2.7:
           python jsonsize.py ../dumps/echoprint-dump-1.json
           8.63s user 0.28s system 99% cpu 8.945 total 

      +PyPy:
           python jsonsize.py ../dumps/echoprint-dump-1.json
           4.04s user 0.34s system 99% cpu 4.392 total +

      "
      +
      +
      +

      Dav1dde goes into more detail in the issue itself, but we just want to emphasize a few significant points from this brief interchange:
        +
      • His CFFI implementation is faster than the ctypes one even on CPython 2.7.
      • +
      • PyPy + CFFI is faster than CPython even when using C code to do the heavy parsing.
      • +
      + The PyPy Team
      +
      +
      +
      +
      +
      +
      +
      +
      +
      + + Alendit wrote on 2015-06-18 08:38: +
      +
      +

      Maybe it's time to discuss inclusion of CFFI into standard library again?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2015-06-18 09:52: +
      +
      +

      If CPython decides to include it in its stdlib, I can make sure it is updated as needed. I don't have the energy to discuss its inclusion myself, so if it happens it will be "championed" by someone else. Nowadays, I personally think inclusion has as many drawbacks as advantages, even if CFFI 1.x shouldn't evolve a lot in the foreseeable future after the 1.0 step.

      +
      +
      +
      +
      + + v3ss wrote on 2015-07-18 22:14: +
      +
      +

      The problem is converting existing libs to use cffi. Only very few percent of Libs are ready for python3.x and with this trend , not even 1% of libs will be converted to work with CFFI.
      That makes PyPy adoption a lot slower.

      Is there really no chance of improving ctypes?

      +
      +
      +
      +
      + + Maciej Fijalkowski wrote on 2015-07-19 05:39: +
      +
      +

      you would think, but these days vast majority of popular C bindings come with cffi equivalents. In fact cffi is vastly more popular than ctypes ever was.

      +
      +
      +
      + +
      +
      + +
      +
      +
      + +
      + + + + \ No newline at end of file diff --git a/blog/index-35.html b/blog/index-35.html new file mode 100644 index 000000000..aae5ec531 --- /dev/null +++ b/blog/index-35.html @@ -0,0 +1,1867 @@ + + + + + + +PyPy (old posts, page 35) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
      +
      +
      +

      Reverse debugging for Python

      + +
      +
      +

      RevPDB

      +

      A "reverse debugger" is a debugger where you can go forward and +backward in time. It is an uncommon feature, at least in the open +source world, but I have no idea why. I have used undodb-gdb and +rr, which are reverse debuggers for C code, and I can only say that +they saved me many, many days of poking around blindly in gdb.

      +

      The PyPy team is pleased to give you "RevPDB", a reverse-debugger +similar to rr but for Python.

      +

      An example is worth a thousand words. Let's say your big Python +program has a bug that shows up inconsistently. You have nailed it +down to something like:

      +
        +
      • start x.py, which does stuff (maybe involving processing files, +answering some web requests that you simulate from another terminal, +etc.);
      • +
      • sometimes, after a few minutes, your program's state becomes +inconsistent and you get a failing assert or another exception.
      • +
      +

      This is the case where RevPDB is useful.

      +

      RevPDB is available only on 64-bit Linux and OS/X right now, but should +not be too hard to port to other OSes. It is very much alpha-level! +(It is a debugger full of bugs. Sorry about that.) I believe it is +still useful---it helped me in one real use case already.

      +
      +
      +

      How to get RevPDB

      +

      The following demo was done with an alpha version for 64-bit Linux, +compiled for Arch Linux. I won't provide the binary; it should be +easy enough to retranslate (much faster than a regular PyPy because it +contains neither a JIT nor a custom GC). Grab the PyPy sources from +Mercurial, and then:

      +
      +hg update reverse-debugger
      +# or "hg update ff376ccacb36" for exactly this demo
      +cd pypy/goal
      +../../rpython/bin/rpython -O2 --revdb targetpypystandalone.py  \
      +                  --withoutmod-cpyext --withoutmod-micronumpy
      +
      +

      and possibly rename the final pypy-c to pypy-revdb to avoid +confusion.

      +

      Other platforms than 64-bit Linux and OS/X need some fixes before they work.

      +
      +
      +

      Demo

      +

      For this demo, we're going to use this x.py as the "big program":

      +
      +import os
      +
      +class Foo(object):
      +    value = 5
      +
      +lst1 = [Foo() for i in range(100)]
      +lst1[50].value += 1
      +for x in lst1:
      +    x.value += 1
      +
      +for x in lst1:
      +    if x.value != 6:
      +        print 'oops!'
      +        os._exit(1)
      +
      +

      Of course, it is clear what occurs in this small example: the check +fails on item 50. For this demo, the check has been written with +os._exit(1), because this exits the program immediately. If it +was written with an assert, then its failure would execute things +in the traceback module afterwards, to print the traceback; it +would be a minor mess just to find the exact point of the failing +assert. (This and other issues are supposed to be fixed in the +future, but for now it is alpha-level.)

      +

      Anyway, with a regular assert and a regular post-mortem pdb, +we could observe that x.value is indeed 7 instead of 6 when the +assert fails. Imagine that the program is much bigger: how would we +find the exact chain of events that caused this value 7 to show up on +this particular Foo object? This is what RevPDB is for.

      +

      First, we need for now to disable Address Space Layout Randomization +(ASLR), otherwise replaying will not work. This is done once with the +following command line, which changes the state until the next +reboot:

      +
      +echo 0 | sudo tee /proc/sys/kernel/randomize_va_space
      +
      +

      UPDATE: the above is no longer necessary from revision ff376ccacb36.

      +

      Run x.py with RevPDB's version of PyPy instead of the regular +interpreter (CPython or PyPy):

      +
      +PYPYRDB=log.rdb ./pypy-revdb x.py
      +
      +

      This pypy-revdb executable is like a slow PyPy executable, running +(for now) without a JIT. This produces a file log.rdb which +contains a complete log of this execution. (If the bug we are +tracking occurs rarely, we need to re-run it several times until we +get the failure. But once we got the failure, then we're done with +this step.)

      +

      Start:

      +
      +rpython/translator/revdb/revdb.py log.rdb
      +
      +

      We get a pdb-style debugger. This revdb.py is a normal Python +program, which you run with an unmodified Python; internally, it looks +inside the log for the path to pypy-revdb and runs it as needed (as +one forking subprocess, in a special mode).

      +

      Initially, we are at the start of the program---not at the end, like +we'd get in a regular debugger:

      +
      +File "<builtin>/app_main.py", line 787 in setup_bootstrap_path:
      +(1)$
      +
      +

      The list of commands is available with help.

      +

      Go to the end with continue (or c):

      +
      +(1)$ continue
      +File "/tmp/x.py", line 14 in <module>:
      +...
      +  lst1 = [Foo() for i in range(100)]
      +  lst1[50].value += 1
      +  for x in lst1:
      +      x.value += 1
      +
      +  for x in lst1:
      +      if x.value != 6:
      +          print 'oops!'
      +>         os._exit(1)
      +(19727)$
      +
      +

      We are now at the beginning of the last executed line. The number +19727 is the "time", measured in number of lines executed. We can go +backward with the bstep command (backward step, or bs), line +by line, and forward again with the step command. There are also +commands bnext, bcontinue and bfinish and their forward +equivalents. There is also "go TIME" to jump directly to the specified +time. (Right now the debugger only stops at "line start" +events, not at function entry or exit, which makes some cases a bit +surprising: for example, a step from the return statement of +function foo() will jump directly to the caller's caller, if the +caller's current line was return foo() + 2, because no "line +start" event occurs in the caller after foo() returns to it.)

      +

      We can print Python expressions and statements using the p +command:

      +
      +(19727)$ p x
      +$0 = <__main__.Foo object at 0xfffffffffffeab3e>
      +(19727)$ p x.value
      +$1 = 7
      +(19727)$ p x.value + 1
      +8
      +
      +

      The "$NUM =" prefix is only shown when we print an object that +really exists in the debugged program; that's why the last line does +not contain it. Once a $NUM has been printed, then we can use +it in further expressions---even at a different point in time. It +becomes an anchor that always refers to the same object:

      +
      +(19727)$ bstep
      +
      +File "/tmp/x.py", line 13 in <module>:
      +...
      +
      +  lst1 = [Foo() for i in range(100)]
      +  lst1[50].value += 1
      +  for x in lst1:
      +      x.value += 1
      +
      +  for x in lst1:
      +      if x.value != 6:
      +>         print 'oops!'
      +          os._exit(1)
      +(19726)$ p $0.value
      +$1 = 7
      +
      +

      In this case, we want to know when this value 7 was put in this +attribute. This is the job of a watchpoint:

      +
      +(19726)$ watch $0.value
      +Watchpoint 1 added
      +updating watchpoint value: $0.value => 7
      +
      +

      This watchpoint means that $0.value will be evaluated at each line. +When the repr() of this expression changes, the watchpoint activates +and execution stops:

      +
      +(19726)$ bcontinue
      +[searching 19629..19726]
      +[searching 19338..19629]
      +
      +updating watchpoint value: $0.value => 6
      +Reverse-hit watchpoint 1: $0.value
      +File "/tmp/x.py", line 9 in <module>:
      +  import os
      +
      +  class Foo(object):
      +      value = 5
      +
      +  lst1 = [Foo() for i in range(100)]
      +  lst1[50].value += 1
      +  for x in lst1:
      +>     x.value += 1
      +
      +  for x in lst1:
      +      if x.value != 6:
      +          print 'oops!'
      +          os._exit(1)
      +(19524)$
      +
      +

      Note that using the $NUM syntax is essential in watchpoints. You +can't say "watch x.value", because the variable x will go out +of scope very soon when we move forward or backward in time. In fact +the watchpoint expression is always evaluated inside an environment +that contains the builtins but not the current locals and globals. +But it also contains all the $NUM, which can be used to refer to +known objects. It is thus common to watch $0.attribute if $0 +is an object, or to watch len($1) if $1 is some list. The +watch expression can also be a simple boolean: for example, "watch +$2 in $3" where $3 is some dict and $2 is some object that +you find now in the dict; you would use this to find out the time when +$2 was put inside $3, or removed from it.

      +

      Use "info watchpoints" and "delete <watchpointnum>" to manage +watchpoints.

      +

      There are also regular breakpoints, which you set with "b +FUNCNAME". It breaks whenever there is a call to a function that +happens to have the given name. (It might be annoying to use for a +function like __init__() which has many homonyms. There is no +support for breaking on a fully-qualified name or at a given line +number for now.)

      +

      In our demo, we stop at the line x.value += 1, which is where the +value was changed from 6 to 7. Use bcontinue again to stop at the +line lst1[50].value += 1, which is where the value was changed from +5 to 6. Now we know how this value attribute ends up being 7.

      +
      +(19524)$ bcontinue
      +[searching 19427..19524]
      +[searching 19136..19427]
      +
      +updating watchpoint value: $0.value => 5
      +Reverse-hit watchpoint 1: $0.value
      +File "/tmp/x.py", line 7 in <module>:
      +  import os
      +
      +  class Foo(object):
      +      value = 5
      +
      +  lst1 = [Foo() for i in range(100)]
      +> lst1[50].value += 1
      +  for x in lst1:
      +      x.value += 1
      +
      +  for x in lst1:
      +      if x.value != 6:
      +...
      +(19422)$
      +
      +

      Try to use bcontinue yet another time. It will stop now just before +$0 is created. At that point in time, $0 refers to +an object that does not exist yet, so the watchpoint now evaluates to +an error message (but it continues to work as before, with that error +message as the string it currently evaluates to).

      +
      +(19422)$ bcontinue
      +[searching 19325..19422]
      +
      +updating watchpoint value: $0.value => RuntimeError:
      +               '$0' refers to an object created later in time
      +Reverse-hit watchpoint 1: $0.value
      +File "/tmp/x.py", line 6 in <module>:
      +  import os
      +
      +  class Foo(object):
      +      value = 5
      +
      +> lst1 = [Foo() for i in range(100)]
      +  lst1[50].value += 1
      +  for x in lst1:
      +      x.value += 1
      +
      +  for x in lst1:
      +...
      +(19371)$
      +
      +

      In big programs, the workflow is similar, just more complex. Usually +it works this way: we find interesting points in time with some +combination of watchpoints and some direct commands to move around. +We write down on a piece of (real or virtual) paper these points in +history, including most importantly their time, so that we can +construct an ordered understanding of what is going on.

      +

      The current revdb can be annoying and sometimes even crash; but +the history you reconstruct can be kept. All the times and +expressions printed are still valid when you restart revdb. The +only thing "lost" is the $NUM objects, which you need to print +again. (Maybe instead of $0, $1, ... we should use $<big +number>, where the big number identifies uniquely the object by its +creation time. These numbers would continue to be valid even after +revdb is restarted. They are more annoying to use than just +$0 though.)

      +

      Screencast: Here's a (slightly typo-y) screencast of cfbolz using the reverse debugger: +

      +
      +
      +

      Current issues

      +

      General issues:

      +
        +
      • If you are using revdb on a log that took more than a few +minutes to record, then it can be painfully slow. This is because +revdb needs to replay again big parts of the log for some +operations.
      • +
      • The pypy-revdb is currently missing the following modules:
          +
        • +thread (implementing multithreading is possible, but not done +yet);
        • +
        • +cpyext (the CPython C API compatibility layer);
        • +
        • +micronumpy (minor issue only);
        • +
        • +_continuation (for greenlets).
        • +
        +
      • +
      • Does not contain a JIT, and does not use our fast garbage +collectors. You can expect pypy-revdb to be maybe 3 times +slower than CPython.
      • +
      • Only works on Linux and OS/X. There is no fundamental reason for +this restriction, but it is some work to fix.
      • +
      • Replaying a program uses a lot more memory; maybe 15x as much as +during the recording. This is because it creates many forks. If +you have a program that consumes 10% of your RAM or more, you will +need to reduce MAX_SUBPROCESSES in process.py.
      • +
      +

      Replaying also comes with a bunch of user interface issues:

      +
        +
      • +Attempted to do I/O or access raw memory: we get this whenever +trying to print some expression that cannot be evaluated with +only the GC memory---or which can, but then the __repr__() +method of the result cannot. We need to reset the state with +bstep + step before we can print anything else. However, +if only the __repr__() crashes, you still see the $NUM = +prefix, and you can use that $NUM afterwards.
      • +
      • +id() is globally unique, returning a reproducible 64-bit number, +so sometimes using id(x) is a workaround for when using x +doesn't work because of Attempted to do I/O issues (e.g. p +[id(x) for x in somelist]).
      • +
      • as explained in the demo, next/bnext/finish/bfinish might jump +around a bit non-predictably.
      • +
      • similarly, breaks on watchpoints can stop at apparently unexpected +places (when going backward, try to do "step" once). The issue is +that it can only stop at the beginning of every line. In the +extreme example, if a line is foo(somelist.pop(getindex())), +then somelist is modified in the middle. Immediately before +this modification occurs, we are in getindex(), and +immediately afterwards we are in foo(). The watchpoint will +stop the program at the end of getindex() if running backward, +and at the start of foo() if running forward, but never +actually on the line doing the change.
      • +
      • watchpoint expressions must not have any side-effect at all. If +they do, the replaying will get out of sync and revdb.py will +complain about that. Regular p expressions and statements can +have side-effects; these effects are discarded as soon as you move +in time again.
      • +
      • sometimes even "p import foo" will fail with Attempted to do +I/O. Use instead "p import sys; foo = sys.modules['foo']".
      • +
      • use help to see all commands. backtrace can be useful. +There is no up command; you have to move in time instead, +e.g. using bfinish to go back to the point where the current +function was called.
      • +
      +
      +
      +

      How RevPDB is done

      +

      If I had to pick the main advantage of PyPy over CPython, it is that +we have got with the RPython translation toolchain a real place for +experimentation. Every now and then, we build inside RPython some +feature that gives us an optionally tweaked version of the PyPy +interpreter---tweaked in a way that would be hard to do with CPython, +because it would require systematic changes everywhere. The most +obvious and successful examples are the GC and the JIT. But there +have been many other experiments along the same lines, from the +so-called stackless transformation in the early days, to the STM +version of PyPy.

      +

      RevPDB works in a similar way. It is a version of PyPy in which some +operations are systematically replaced with other operations.

      +

      To keep the log file at a reasonable size, we duplicate the content of +all GC objects during replaying---by repeating the same actions on +them, without writing anything in the log file. So that means that in +the pypy-revdb binary, the operations that do arithmetic or +read/write GC-managed memory are not modified. Most operations are +like that. However, the other operations, the ones that involve +either non-GC memory or calls to external C functions, are tweaked. +Each of these operations is replaced with code that works in two +modes, based on a global flag:

      +
        +
      • in "recording" mode, we log the result of the operation (but not the +arguments);
      • +
      • in "replaying" mode, we don't really do the operation at all, but +instead just fetch the result from the log.
      • +
      +

      Hopefully, all remaining unmodified operations (arithmetic and GC +load/store) are completely deterministic. So during replaying, every +integer or non-GC pointer variable will have exactly the same value as +it had during recording. Interestingly, it means that if the +recording process had a big array in non-GC memory, then in the +replaying process, the array is not allocated at all; it is just +represented by the same address, but there is nothing there. When we +record "read item 123 from the array", we record the result of the +read (but not the "123"). When we replay, we're seeing again the same +"read item 123 from the array" operation. At that point, we don't +read anything; we just return the result from the log. Similarly, +when recording a "write" to the array, we record nothing (this write +operation has no result); so that when replaying, we redo nothing.

      +

      Note how that differs from anything managed by GC memory: GC objects +(including GC arrays) are really allocated, writes really occur, and +reads are redone. We don't touch the log in this case.

      +
      +
      +

      Other reverse debuggers for Python

      +

      There are already some Python experiments about reverse debugging. +This is also known as "omniscient debugging". However, I claim that +the result they get to is not very useful (for the purpose presented +here). How they work is typically by recording changes to some +objects, like lists and dictionaries, in addition to recording the +history of where your program passed through. However, the problem of +Python is that lists and dictionaries are not the end of the story. +There are many, many, many types of objects written in C which are +mutable---in fact, the immutable ones are the exception. You can try +to systematically record all changes, but it is a huge task and easy +to forget a detail.

      +

      In other words it is a typical use case for tweaking the RPython +translation toolchain, rather than tweaking the CPython (or PyPy) +interpreter directly. The result that we get here with RevPDB is more +similar to rr anyway, in that only a relatively small number of +external events are recorded---not every single change to every single +list and dictionary.

      +

      Some links:

      + +

      For C:

      + +
      +
      +

      Future work

      +

      As mentioned above, it is alpha-level, and only works on Linux and OS/X. +So the plans for the immediate future are to fix the various +issues described above, and port to more operating systems. The core of the system +is in the C file and headers in rpython/translator/revdb/src-revdb.

      +

      For interested people, there is also the Duhton interpreter and its +reverse-debugger branch, which is where I prototyped the RPython +concept before moving to PyPy. The basics should work for any +interpreter written in RPython, but they require some specific code to +interface with the language; in the case of PyPy, it is in +pypy/interpreter/reverse_debugging.py.

      +

      In parallel, there are various user interface improvements that people +could be interested in, like a more "pdb++" experience. (And the script +at rpython/translator/revdb/revdb.py should be moved out into some +more "official" place, and the reverse-debugger branch should be +merged back to default.)

      +

      I would certainly welcome any help!

      +

      -+- Armin

      +
      +
      +
      +
      +
      + + Rachmad Imam Tarecha wrote on 2016-07-08 13:57: +
      +
      +

      I think python is hard programming language, :D

      +
      +
      +
      +
      + + mrh1997 wrote on 2016-07-09 22:59: +
      +
      +

      I am really impressed!
      Especially of the fact that you did the Job within one month.

      I had the idea of such a tool, too some time ago (with exactly the same approach, but in CPython instead of PyPy).
      But I failed to implement it, as in CPython I had to do a lot more modifications...

      +
      +
      +
      +
      + + Armin Rigo wrote on 2016-07-10 18:31: +
      +
      +

      Seems to work out of the box on OS/X. I've updated it in the blog post.

      +
      +
      +
      +
      + + Ron Barak wrote on 2016-07-14 22:50: +
      +
      +

      Erratum:
      RevPDB is only available only on 64-bit Linux -> RevPDB is available only on 64-bit Linux

      +
      +
      +
      +
      + + Armin Rigo wrote on 2016-07-15 08:55: +
      +
      +

      Thanks for the typo.

      +
      +
      +
      + +

      PyPy2 v5.3 released - major C-extension support improvements

      + +
      +
      +We have released PyPy2.7 v5.3, about six weeks after PyPy 5.1 and a week after +PyPy3.3 v5.2 alpha 1, the first PyPy release targeting 3.3 +compatibility. This new PyPy2.7 release includes major improvements for the +C-API compatibility layer. In addition to complete support +for lxml, we now pass most (more than 95%) of the upstream numpy test suite. We can build and run scipy and matplotlib as well. Most of the failures have to do with (ab)use of the C-API, for instance writing to a read-only pointer obtained from PyString_AsString().

      +Note that the C-API compatibility layer is significantly slower than CPython, as explained in the blog post about the new strategy for reflection of C objects into the PyPy interpreter.

      +We updated cffi to version 1.7 (incremental changes which provide a nicer developer experience, documented here). We would encourage developers to move their C-extension modules to cffi, but are willing to help you work through issues with existing code; come to #pypy on IRC and let us know how we can help you help us do better.

      +You can download the PyPy2 v5.3 release here:
      + +
      +We would like to thank our donors for their continued support of the PyPy +project. We would also like to thank our contributors and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on PyPy, or general help +with making RPython’s JIT even better.

      +

      +What is PyPy?

      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (PyPy and CPython 2.7 performance comparison) due to its integrated tracing JIT compiler.

      +We also welcome developers of other dynamic languages to see what RPython can do for them.

      +This release supports:
        +
      • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD)
      • +
      • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux
      • +
      • big- and little-endian variants of PPC64 running Linux
      • +
      • +s390x running Linux
      • +
      +
      +
      +

      +Other Highlights

      +

      +(since the release of PyPy 5.1 in April, 2016)

      +
        +
      • +
        +New features: +
        +
          +
        • +
          +Merge a major expansion of the C-API support in cpyext, also expand cpyext tests to allow running them after translation as well as untranslated
          +
        • +
        • +
          +Instead of “GIL not held when a CPython C extension module +calls PyXxx”, we now silently acquire/release the GIL. Helps with +C extension modules that call some PyXxx() functions without +holding the GIL (arguably, they are theoretically buggy).
          +
        • +
        • +
          +Support command line -v to trace import statements
          +
        • +
        • +
          +Revive traceviewer, a tool to use pygame to view traces
          +
          +
          +
          +
        • +
        +
      • +
      • +
        +Numpy via our internal _numpypy module: +
        +
          +
        • Implement ufunc.outer
        • +
        • Move PyPy-specific numpypy headers to a subdirectory (also changed the repo +accordingly)
        • +

        +
      • +
      • +
        +Performance improvements: +
        +
          +
        • Use bitstrings to compress lists of descriptors that are attached to an +EffectInfo
        • +
        • Remove most of the _ovf, _zer and _val operations from RPython. Kills +quite some code internally, and allows the JIT to do better +optimizations: for example, app-level code like x / 2 or x % 2 +can now be turned into x >> 1 or x & 1, even if x is possibly +negative.
        • +
        • Rework the way registers are moved/spilled in before_call()
        • +
        +
        +
      • +
      • +
        +Internal refactorings: +
        +
          +
        • Refactor code to better support Python3-compatible syntax
        • +
        • Reduce the size of generated C sources during translation by +eliminating many many unused struct declarations (Issue #2281)
        • +
        • Reduce the size of generated code by using the same function objects in +all generated subclasses
        • +
        • Share cpyext Py* function wrappers according to the signature, shrinking the +translated libpypy.so by about 10% (without the JIT)
        • +
        +
      • +
      +Please update, and continue to help us make PyPy better. +Cheers
      + +The PyPy Team
      +
      +
      +
      +
      +
      + + Anonymous wrote on 2016-06-09 19:48: +
      +
      +

      "We can build and run scipy and matplotlib as well."

      That's exciting. Are there special instructions needed to build and run spicy and matplotlib with PyPy to see how well it presently works for particular applications? Or is it not even really ready for outsiders to knock it around yet?

      +
      +
      +
      +
      + + mattip wrote on 2016-06-09 23:26: +
      +
      +

      No special instructions, just build from source (binaries precompiled for cpython will not work) using "pypy setup.py install", and let us know how it goes. The order should be numpy, matplotlib, scipy (we have reports that pygtk works too fwiw).

      There have already been some bug reports, so you might want to patch your pymem.h header in pypy/include with this changeset https://bitbucket.org/pypy/pypy/commits/68486f0f79c649514, and if you are on OSX you may need to patch numpy/distutils/fcompiler/gnu.py with this patch https://bitbucket.org/pypy/numpy/commits/50bff5807e09721acc4d778ce8ffdef86e2f4c50

      +
      +
      +
      +
      + + Canesin wrote on 2016-06-12 17:38: +
      +
      +

      Great work as usual!

      +
      +
      +
      + +

      PyPy3.3 v5.2 alpha 1 released

      + +
      +

      We're pleased to announce the first alpha release of PyPy3.3 v5.2. This is the
      +first release of PyPy which targets Python 3.3 (3.3.5) compatibility.

      +

      We would like to thank all of the people who donated to the py3k proposal
      +for supporting the work that went into this and future releases.

      +

      You can download the PyPy3.3 v5.2 alpha 1 release here:

      +
      https://pypy.org/download.html#python-3-3-5-compatible-pypy3-3-v5-2
      +
      +

      Highlights

      + +
      +
      +

      What is PyPy?

      +

      PyPy is a very compliant Python interpreter, almost a drop-in replacement for
      +CPython 2.7.10 and one day 3.3.5. It's fast due to its integrated tracing JIT
      +compiler.

      +

      We also welcome developers of other dynamic languages to see what RPython
      +can do for them.

      +

      This release supports:

      +
        +
      • +x86 machines on most common operating systems except Windows
        +(Linux 32/64, Mac OS X 64, OpenBSD, FreeBSD),
      • +
      • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
      • +
      • big- and little-endian variants of PPC64 running Linux,
      • +
      • +s390x running Linux
      • +
      +

      Please try it out and let us know what you think. We welcome feedback, we know
      +you are using PyPy, please tell us about it!

      +

      We'd especially like to thank these people for their contributions to this
      +release:

      +

      Manuel Jacob, Ronan Lamy, Mark Young, Amaury Forgeot d'Arc, Philip Jenvey,
      +Martin Matusiak, Vasily Kuznetsov, Matti Picus, Armin Rigo and many others.

      +

      Cheers

      +

      The PyPy Team

      +
      +
      +
      +
      +
      + + rnbdlnch wrote on 2016-05-31 09:33: +
      +
      +

      thank you!!!

      +
      +
      +
      +
      + + Unknown wrote on 2016-05-31 12:47: +
      +
      +

      Many, many thanks!

      +
      +
      +
      +
      + + Unknown wrote on 2016-05-31 13:30: +
      +
      +

      As a follow-up: Did asyncio work previously? Anyway, it does now as 'yield from' is there. Beautiful!

      +
      +
      +
      +
      + + Anonymous wrote on 2016-06-01 01:52: +
      +
      +

      Great News!!! Thank you!!!

      +
      +
      +
      +
      + + Robert wrote on 2016-06-03 23:04: +
      +
      +

      Excited! Can't wait for the 3.4 compatibility!

      +
      +
      +
      +
      + + Sean Vieira wrote on 2016-07-05 19:33: +
      +
      +

      Hip, hip, huzzah!

      +
      +
      +
      +
      + + Hai Zaar wrote on 2016-07-25 14:59: +
      +
      +

      Great news guys! Did you consider skipping 3.3/3.4 support all together and going straight for 3.5 compatibility?

      +
      +
      +
      +
      + + Armin Rigo wrote on 2016-07-31 14:22: +
      +
      +

      We'll be working next on 3.5 support.

      +
      +
      +
      + +

      PyPy 5.1.1 bugfix released

      + +
      +
      +
      +We have released a bugfix for PyPy 5.1, due to a regression in installing third-party packages depending on numpy (using our numpy fork available at https://bitbucket.org/pypy/numpy ).

      Thanks to those who reported the issue. We also fixed a regression in translating PyPy which increased the memory required to translate. Improvement will be noticed by downstream packagers and those who translate rather than
      download pre-built binaries.
      +
      +

      +What is PyPy?

      +
      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

      We also welcome developers of other dynamic languages to see what RPython can do for them.

      This release supports:
      +
        +
      • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),
      • +
      • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
      • +
      • big- and little-endian variants of PPC64 running Linux,
      • +
      • +s390x running Linux
      • +
      +
      +Please update, and continue to help us make PyPy better.

      Cheers

      The PyPy Team
      +
      +
      +
      +

      PyPy 5.1 released

      + +
      +
      +
      +
      +We have released PyPy 5.1, about a month after PyPy 5.0.

      +This release includes more improvements to warmup time and memory requirements, extending the work done on PyPy 5.0. We have seen an additional reduction of about 20% in memory requirements, and up to 30% warmup time improvement; more detail is in the blog post.

      +We also now have full support for the IBM s390x. Since this support is in RPython, any dynamic language written using RPython, like PyPy, will automagically be supported on that architecture.

      +We updated cffi to 1.6 (cffi 1.6 itself will be released shortly), and continue to improve support for the wider python ecosystem using the PyPy interpreter.

      +You can download the PyPy 5.1 release here:
      + +
      +We would like to thank our donors for the continued support of the PyPy project.
      +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.
      +

      +What is PyPy?

      +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

      +We also welcome developers of other dynamic languages to see what RPython can do for them.

      +This release supports:
        +
      • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),
      • +
      • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
      • +
      • big- and little-endian variants of PPC64 running Linux,
      • +
      • +s390x running Linux
      • +
      +
      +
      +
      +

      +Other Highlights

      +

      +(since the release of PyPy 5.0 in March, 2016)

      +
        +
      • +

        +New features:

        +
          +
        • A new jit backend for the IBM s390x, which was a large effort over the past few months.
        • +
        • Add better support for PyUnicodeObject in the C-API compatibility layer
        • +
        • Support GNU/kFreeBSD Debian ports in vmprof
        • +
        • Add __pypy__._promote
        • +
        • Make attrgetter a single type for CPython compatibility
        • +
        +
        +
      • +
      • +

        +Bug Fixes

        +
          +
        • Catch exceptions raised in an exit function
        • +
        • Fix a corner case in the JIT
        • +
        • Fix edge cases in the cpyext refcounting-compatible semantics (more work on cpyext compatibility is coming in the cpyext-ext branch, but isn’t ready yet)
        • +
        • Try harder to not emit NEON instructions on ARM processors without NEON support
        • +
        • Improve the rpython posix module system interaction function calls
        • +
        • Detect a missing class function implementation instead of calling a random function
        • +
        • Check that PyTupleObjects do not contain any NULLs at the point of conversion to W_TupleObjects
        • +
        • In ctypes, fix _anonymous_ fields of instances
        • +
        • Fix JIT issue with unpack() on a Trace which contains half-written operations
        • +
        • Fix sandbox startup (a regression in 5.0)
        • +
        • Fix possible segfault for classes with mangled mro or __metaclass__
        • +
        • Fix isinstance(deque(), Hashable) on the pure python deque
        • +
        • Fix an issue with forkpty()
        • +
        • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy
        • +
        +
        +
      • +
      • +

        +Numpy:

        +
          +
        • Implemented numpy.where for a single argument
        • +
        • Indexing by a numpy scalar now returns a scalar
        • +
        • Fix transpose(arg) when arg is a sequence
        • +
        • Refactor include file handling, now all numpy ndarray, ufunc, and umath functions exported from libpypy.so are declared in pypy_numpy.h, which is included only when building our fork of numpy
        • +
        • Add broadcast
        • +
        +
        +
      • +
      • +

        +Performance improvements:

        +
          +
        • Improve str.endswith([tuple]) and str.startswith([tuple]) to allow JITting
        • +
        • Merge another round of improvements to the warmup performance
        • +
        • Cleanup history rewriting in pyjitpl
        • +
        • Remove the forced minor collection that occurs when rewriting the assembler at the start of the JIT backend
        • +
        • Port the resource module to cffi
        • +
          +
        • +
        • +

          +Internal refactorings:

          +
            +
          • Use a simpler logger to speed up translation
          • +
          • Drop vestiges of Python 2.5 support in testing
          • +
          • Update rpython functions with ones needed for py3k
          • +
          +
        • +
        +
        +
        +
        +
        +
        +
        +
        +Please update, and continue to help us make PyPy better.
        +Cheers
        +The PyPy Team
        +
        +
        +
        +






        +
        +
        +
        +

        PyPy Enterprise Edition

        + +
        +

        With the latest additions, PyPy's JIT now supports the Z architecture on Linux. The newest architecture revision (also known as s390x, or colloquially referred to as "big iron") is the 64-bit extension for IBM mainframes. Currently only Linux 64 bit is supported (not z/OS nor TPF).
        +This is the fourth assembler backend supported by PyPy in addition to x86 (32 and 64), ARM (32-bit only) and PPC64 (both little- and big-endian). It might seem that we are getting the hang of new architectures. Thanks to IBM for funding this work!

        +

        +History

        +When I went to university one lecture covered the prediction of Thomas Watson in 1943. His famous quote "I think there is a world market for maybe five computers ...", turned out not to be true.

        +However, even 70 years later, mainframes are used more often than you think. They back critical tasks requiring a high level of stability/security and offer high hardware and computational utilization rates by virtualization.

        +With the new PyPy JIT backend we are happy to present a fast Python virtual machine for mainframes and contribute more free software running on s390x.

        +Meta tracing +

        +Even though the JIT backend has been tested on PyPy, it is not restricted to  the Python programming language. Do you have a great idea for a DSL, or another language that should run on mainframes? Go ahead and just implement your interpreter using RPython.

        +How do I get a copy? +

        +PyPy can be built using the usual instructions found here. As soon as the next PyPy version has been released we will provide binaries. Until then you can just grab a nightly here. We are currently busy getting the next version of PyPy ready, so an official release will be rolled out soon.

        +Comparing s390x to x86 +

        +The goal of this comparison is not to scientifically evaluate the benefits/disadvantages on s390x, but rather to see that PyPy's architecture delivers the same benefits as it does on other platforms. Similar to the comparison done for PPC I ran the benchmarks using the same setup. The first column is the speedup of the PyPy JIT VM compared to the speedup of a pure PyPy interpreter 1). Note that the s390x's OS was virtualized.

        +  Label               x86     s390x      s390x (run 2)

          ai                 13.7      12.4       11.9
          bm_chameleon        8.5       6.3        6.8
          bm_dulwich_log      5.1       5.0        5.1
          bm_krakatau         5.5       2.0        2.0
          bm_mako             8.4       5.8        5.9
          bm_mdp              2.0       3.8        3.8
          chaos              56.9      52.6       53.4
          crypto_pyaes       62.5      64.2       64.2
          deltablue           3.3       3.9        3.6
          django             28.8      22.6       21.7
          eparse              2.3       2.5        2.6
          fannkuch            9.1       9.9       10.1
          float              13.8      12.8       13.8
          genshi_text        16.4      10.5       10.9
          genshi_xml          8.2       7.9        8.2
          go                  6.7       6.2       11.2
          hexiom2            24.3      23.8       23.5
          html5lib            5.4       5.8        5.7
          json_bench         28.8      27.8       28.1
          meteor-contest      5.1       4.2        4.4
          nbody_modified     20.6      19.3       19.4
          pidigits            1.0      -1.1       -1.0
          pyflate-fast        9.0       8.7        8.5
          pypy_interp         3.3       4.2        4.4
          raytrace-simple    69.0     100.9       93.4
          richards           94.1      96.6       84.3
          rietveld            3.2       2.5        2.7
          slowspitfire        2.8       3.3        4.2
          spambayes           5.0       4.8        4.8
          spectral-norm      41.9      39.8       42.6
          spitfire            3.8       3.9        4.3
          spitfire_cstringio  7.6       7.9        8.2
          sympy_expand        2.9       1.8        1.8
          sympy_integrate     4.3       3.9        4.0
          sympy_str           1.5       1.3        1.3
          sympy_sum           6.2       5.8        5.9
          telco              61.2      48.5       54.8
          twisted_iteration  55.5      41.9       43.8
          twisted_names       8.2       9.3        9.7
          twisted_pb         12.1      10.4       10.2
          twisted_tcp         4.9       4.8        5.2


          Geometric mean:    9.31      9.10       9.43


        +As you can see the benefits are comparable on both platforms.
        +Of course this is scientifically not good enough, but it shows a tendency. s390x can achieve the same results as you can get on x86.

        +Are you running your business application on a mainframe? We would love to get some feedback. Join us on IRC and tell us if PyPy made your application faster!

        +plan_rich & the PyPy Team

        1) PyPy revision for the benchmarks: 4b386bcfee54 +
        +

        Warmup improvements: more efficient trace representation

        + +
        +
        +

        Hello everyone.

        +

        I'm pleased to inform you that we've finished another round of +improvements to the warmup performance of PyPy. Before I go +into details, I'll recap the achievements that we've made since we started +working on the warmup performance. I picked a random PyPy from November 2014 +(which is definitely before we started the warmup work) and compared it with +a recent one, after 5.0. The exact revisions are respectively ffce4c795283 +and cfbb442ae368. First let's compare pure warmup benchmarks that +can be found in our benchmarking suite. Out of those, +pypy-graph-alloc-removal numbers should be taken with a grain of salt, +since other work could have influenced the results. +The rest of the benchmarks mentioned are bottlenecked purely by warmup times.

        +

        You can see how much your program spends in warmup running +PYPYLOG=jit-summary:- pypy your-program.py under "tracing" and "backend" +fields (in the first three lines). An example looks like that:

        +
        +[e00c145a41] {jit-summary
        +Tracing:        71      0.053645 <- time spent tracing & optimizing
        +Backend:        71      0.028659 <- time spent compiling to assembler
        +TOTAL:                  0.252217 <- total run time of the program
        +
        +

        The results of the benchmarks

        + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        | benchmark | time - old | time - new | speedup | JIT time - old | JIT time - new |
        | function_call | 1.86s | 1.42s | 1.3x | 1.12s | 0.57s |
        | function_call2 | 5.17s | 2.73s | 1.9x | 4.2s | 1.6s |
        | bridges | 2.77s | 2.07s | 1.3x | 1.5s | 0.8s |
        | pypy-graph-alloc-removal | 2.06s | 1.65s | 1.25x | 1.25s | 0.79s |
        +

        As we can see, the overall warmup benchmarks got up to 90% faster with +JIT time dropping by up to 2.5x. We have more optimizations in the pipeline, +with an idea how to transfer some of the JIT gains into more of a total program +runtime by jitting earlier and more eagerly.

        +
        +

        Details of the last round of optimizations

        +

        Now the nitty gritty details - what did we actually do? I covered a lot of +warmup improvements in the past blog posts so I'm going to focus on +the last change, the jit-leaner-frontend branch. This last change is simple: instead of using +pointers to store the "operations" objects created during tracing, we use a compact list of +16-bit integers (with 16-bit pointers in between). On a 64-bit machine the memory wins are +tremendous - using 16-bit pointers makes the new representation 4x more efficient than full 64-bit pointers. +Additionally, the smaller representation has much better cache behavior and much less +pointer chasing in memory. It also has a better defined lifespan, so we don't need to +bother tracking it with the GC, which also saves quite a bit of time.

        +

        The change sounds simple, but the details in the underlying data mean that +everything in the JIT had to be changed which took quite a bit of effort :-)

        +

        Going into the future on the JIT front, we have an exciting set of optimizations, +ranging from faster loops through faster warmup to using better code generation +techniques and broadening the kind of program that PyPy speeds up. Stay tuned +for the updates.

        +

        We would like to thank our commercial partners for making all of this possible. +The work has been performed by baroquesoftware and would not be possible +without support from people using PyPy in production. If your company uses +PyPy and wants it to do more, or does not use PyPy but has performance problems +with its Python installation, feel free to get in touch with me. Trust me, using +PyPy ends up being a lot cheaper than rewriting everything in Go :-)

        +

        Best regards,
        +Maciej Fijalkowski

        +
        +
        +
        +
        +
        +
        +
        + + Peter wrote on 2016-04-08 08:55: +
        +
        +

        It would be nice to compare speed with C-Python and on short benchmarks, as that is where warmup time matters the most

        +
        +
        +
        +
        + + Maciej Fijalkowski wrote on 2016-04-08 09:14: +
        +
        +

        Those benchmarks are very synthetic warmup-oriented ones. It means you exec() piece of code and then run it 2000 times and then exec again. Any other short-running programs have a lot more noise where you have multiple effects taking place and it would be really hard to compare between old and new pypy. That said it's a fair requirement, we have one more branch in the pipeline and I'll try to get more real world data.

        +
        +
        +
        + +

        PyPy 5.0.1 bugfix released

        + +
        +
        +

        +PyPy 5.0.1 +

        +
        +We have released a bugfix for PyPy 5.0, after reports that the newly released +lxml 3.6.0, which now supports PyPy 5.0 +, can crash on large files. +Thanks to those who reported the crash. Please update, downloads are available +at

        pypy.org/download.html

        +The changes between PyPy 5.0 and 5.0.1 are only two bug fixes: one in +cpyext, which fixes notably (but not only) lxml; and another for a +corner case of the JIT.

        +What is PyPy?

        +
        +
        +
        +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It’s fast (PyPy and CPython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.
        + +We also welcome developers of other +dynamic languages to see what RPython can do for them.
        + +This release supports x86 machines on most common operating systems +(Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD), +newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux, and the +big- and little-endian variants of PPC64 running Linux.

        +Please update, and continue to help us make PyPy better.

        +Cheers
        + +The PyPy Team
        +
        +
        +
        +
        +
        + + Armin Rigo wrote on 2016-03-28 03:39: +
        +
        +

        ppc64 released four days ago, and big-endian updated just now to fix an important bug. There are other big-endian bugs left which we're fixing as we go along; they will be in the next official release.

        +
        +
        +
        + +

        PyPy 5.0 released

        + +
        +
        +

        +PyPy 5.0

        +We have released PyPy 5.0, about three months after PyPy 4.0.1. We encourage all users of PyPy to update to this version.

        +You can download the PyPy 5.0 release here:
        + +
        +We would like to thank our donors for the continued support of the PyPy project.
        +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.

        +

        +Faster and Leaner

        +
        +We continue to improve the warmup time and memory usage of JIT-related metadata. The exact effects depend vastly on the program you’re running and can range from insignificant to warmup being up to 30% faster and memory dropping by about 30%.
        +

        +

        +C-API Upgrade

        +
        +We also merged a major upgrade to our C-API layer (cpyext), simplifying the interaction between c-level objects and PyPy interpreter level objects. As a result, lxml (prerelease) with its cython compiled component passes all tests on PyPy. The new cpyext is also much faster. This major refactoring will soon be followed by an expansion of our C-API compatibility.
        +

        +

        +Profiling with vmprof supported on more platforms

        +
        +vmprof has been a go-to profiler for PyPy on linux for a few releases and we’re happy to announce that thanks to the cooperation with jetbrains, vmprof now works on Linux, OS X and Windows on both PyPy and CPython.
        +
        +

        +

        +CFFI

        +While not applicable only to PyPy, cffi is arguably our most significant contribution to the python ecosystem. PyPy 5.0 ships with cffi-1.5.2 which now allows embedding PyPy (or CPython) in a C program.
        +
        +

        +

        +What is PyPy?

        +
        +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
        +We also welcome developers of other dynamic languages to see what RPython can do for them.
        +This release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux, and 64 bit PowerPC hardware, specifically Linux running the big- and little-endian variants of ppc64.
        +
        +

        +

        +Other Highlights (since 4.0.1 released in November 2015)

        +
          +
        • New features:
            +
          • Support embedding PyPy in a C-program via cffi and static callbacks in cffi.
            +This deprecates the old method of embedding PyPy
          • +
          • Refactor vmprof to work cross-operating-system, deprecate using buggy
            +libunwind on Linux platforms. Vmprof even works on Windows now.
          • +
          • Support more of the C-API type slots, like tp_getattro, and fix C-API
            +macros, functions, and structs such as _PyLong_FromByteArray(),
            +PyString_GET_SIZE, f_locals in PyFrameObject, Py_NAN, co_filename in
            +PyCodeObject
          • +
          • Use a more stable approach for allocating PyObjects in cpyext. (see
            blog post). Once the PyObject corresponding to a PyPy object is created,
            +it stays around at the same location until the death of the PyPy object.
            +Done with a little bit of custom GC support. It allows us to kill the
            +notion of “borrowing” inside cpyext, reduces 4 dictionaries down to 1, and
            +significantly simplifies the whole approach (which is why it is a new
            +feature while technically a refactoring) and allows PyPy to support the
            +popular lxml module (as of the next release) with no PyPy specific
            +patches needed
          • +
          • Make the default filesystem encoding ASCII, like CPython
          • +
          • Use hypothesis in test creation, which is great for randomizing tests
          • +
            +
          • +
          • Bug Fixes
              +
            • Backport always using os.urandom for uuid4 from cpython and fix the JIT as well
              +(issue #2202)
            • +
            • More completely support datetime, optimize timedelta creation
            • +
            • Fix for issue #2185 which caused an inconsistent list of operations to be
              +generated by the unroller, appeared in a complicated Django app
            • +
            • Fix an elusive issue with stacklets on shadowstack which showed up when
              +forgetting stacklets without resuming them
            • +
            • Fix entrypoint() which now acquires the GIL
            • +
            • Fix direct_ffi_call() so failure does not bail out before setting CALL_MAY_FORCE
            • +
            • Fix (de)pickling long values by simplifying the implementation
            • +
            • Fix RPython rthread so that objects stored as threadlocal do not force minor
              +GC collection and are kept alive automatically. This improves performance of
              +short-running Python callbacks and prevents resetting such object between
              +calls
            • +
            • Support floats as parameters to itertools.islice()
            • +
            • Check for the existence of CODESET, ignoring it should have prevented PyPy
              +from working on FreeBSD
            • +
            • Fix for corner case (likely shown by Krakatau) for consecutive guards with
              +interdependencies
            • +
            • Fix applevel bare class method comparisons which should fix pretty printing
              +in IPython
            • +
            • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy
            • +
              +
            • +
            • Numpy:
                +
              • Updates to numpy 1.10.2 (incompatibilities and not-implemented features
                +still exist)
              • +
              • Support dtype=((‘O’, spec)) union while disallowing record arrays with
                +mixed object, non-object values
              • +
              • Remove all traces of micronumpy from cpyext if –withoutmod-micronumpy option used
              • +
              • Support indexing filtering with a boolean ndarray
              • +
              • Support partition() as an app-level function, together with a cffi wrapper
                +in pypy/numpy, this now provides partial support for partition()
              • +
                +
              • +
              • Performance improvements:
                  +
                • Optimize global lookups
                • +
                • Improve the memory signature of numbering instances in the JIT. This should
                  +massively decrease the amount of memory consumed by the JIT, which is
                  +significant for most programs. Also compress the numberings using variable-
                  +size encoding
                • +
                • Optimize string concatenation
                • +
                • Use INT_LSHIFT instead of INT_MUL when possible
                • +
                • Improve struct.unpack by casting directly from the underlying buffer.
                  +Unpacking floats and doubles is about 15 times faster, and integer types
                  +about 50% faster (on 64 bit integers). This was then subsequently
                  +improved further in optimizeopt.py.
                • +
                • Optimize two-tuple lookups in mapdict, which improves warmup of instance
                  +variable access somewhat
                • +
                • Reduce all guards from int_floordiv_ovf if one of the arguments is constant
                • +
                • Identify permutations of attributes at instance creation, reducing the
                  +number of bridges created
                • +
                • Greatly improve re.sub() performance
                • +
                  +
                • +
                • Internal refactorings:
                    +
                  • Refactor and improve exception analysis in the annotator
                  • +
                  • Remove unnecessary special handling of space.wrap().
                  • +
                  • Support list-resizing setslice operations in RPython
                  • +
                  • Tweak the trace-too-long heuristic for multiple jit drivers
                  • +
                  • Refactor bookkeeping (such a cool word - three double letters) in the
                    +annotator
                  • +
                  • Refactor wrappers for OS functions from rtyper to rlib and simplify them
                  • +
                  • Simplify backend loading instructions to only use four variants
                  • +
                  • Simplify GIL handling in non-jitted code
                  • +
                  • Refactor naming in optimizeopt
                  • +
                  • Change GraphAnalyzer to use a more precise way to recognize external
                    +functions and fix null pointer handling, generally clean up external
                    +function handling
                  • +
                  • Remove pure variants of getfield_gc_* operations from the JIT by
                    +determining purity while tracing
                  • +
                  • Refactor databasing
                  • +
                  • Simplify bootstrapping in cpyext
                  • +
                  • Refactor rtyper debug code into rpython.rtyper.debug
                  • +
                  • Separate structmember.h from Python.h. Also enhance creating api functions
                    +to specify which header file they appear in (previously only pypy_decl.h)
                  • +
                  • Fix tokenizer to enforce universal newlines, needed for Python 3 support
                  • +
                  +
                • +
                +Please try it out and let us know what you think. We welcome feedback, we know you are using PyPy, please tell us about it!
                +Cheers
                +The PyPy Team
                +
                +
                +
                +
                +
                + + HelpingHand wrote on 2016-03-10 22:30: +
                +
                +

                What is the status on finally getting a functional x64 build for windows? I am mainly interested in embedding PyPy and unless there is support for it, I will continue to avoid it.

                +
                +
                +
                +
                + + mathgl wrote on 2016-03-11 05:05: +
                +
                +

                does new cpyext help for supporting numpy?

                +
                +
                +
                +
                + + mattip wrote on 2016-03-11 08:06: +
                +
                +

                HelpingHand: work on x64 for windows [0] is awaiting a champion, with either the skill to do it or with the deep pockets to sponsor it. If you are interested, please come to #pypy on IRC to discuss it

                [0] https://doc.pypy.org/en/latest/windows.html#what-is-missing-for-a-full-64-bit-translation

                +
                +
                +
                +
                + + mattip wrote on 2016-03-11 08:09: +
                +
                +

                mathgl: yes, we are cautiously optimistic that if we now flesh out cpyext to support enough of the C-API that vanilla numpy might just work. Stay tuned for further developments

                +
                +
                +
                +
                + + Martin Gfeller wrote on 2016-03-11 08:57: +
                +
                +

                I've asked Brett Cannon, a well-known Pythonista working at Microsoft, about whether they could sponsor or undertake Windows 64-bit work.

                If you have a substantial use case requiring the speed of PyPy, large address spaces and Windows, it might help.

                +
                +
                +
                +
                + + Unknown wrote on 2016-03-11 10:52: +
                +
                +

                What happened to the speed graph on speed.pypy.org? The speedups for earlier versions of PyPy before 5.0 suddenly are much higher than they used to be. Compare for example against the graph of a couple of weeks ago (https://web.archive.org/web/20160228102615/https://speed.pypy.org/)

                Version 28/2 11/3
                1.5 3.18x 4.86x
                2.1 6.12x 7.50x
                2.4.0 6.22x 7.61x
                2.6.1 7.05x 8.58x

                Has the benchmark been changed, the timing method, the speed computation, hardware used, etc? More importantly, which version is "correct"?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2016-03-11 10:56: +
                +
                +

                Hi Paul.

                We rerun all benchmarks on old Pythons and it shows now a different subset of benchmarks. I must admit I don't know why the main site chooses some benchmarks and not others, it's certainly not deliberate. Any single number you use is not correct, a bit by definition - we suggest you look in details what the benchmarks do or even better, benchmark yourself. We'll look why it's showing a different subset

                +
                +
                +
                +
                + + Unknown wrote on 2016-03-11 11:07: +
                +
                +

                Great news! Awesome!

                +
                +
                +
                +
                + + mattip wrote on 2016-03-11 12:40: +
                +
                +

                Paul Melis, Maciej Fijalkowski - indeed there was a bug; I reran the old benchmarks but only ~half ran to completion. I reverted the bad run, now results are like they used to be. Thanks for pointing it out

                +
                +
                +
                +
                + + Unknown wrote on 2016-03-14 03:52: +
                +
                +

                When is release of pypy3 5.0?
                I'd like also to get the profit of pypy5.0 by a condition of support of python 3.2.5.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2016-03-17 15:07: +
                +
                +

                lxml 3.6.0 released with support for PyPy 5.x.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2016-03-20 11:10: +
                +
                +

                Before trying out lxml 3.6.0, upgrade to PyPy 5.0.1: the release 5.0.0 does not reliably work with it.

                +
                +
                +
                + +

                C-API Support update

                + +
                +

                As you know, PyPy can emulate the CPython C API to some extent. In this post I will describe an important optimization that we merged to improve the performance and stability of the C-API emulation layer.

                + +

                The C-API is implemented by passing around PyObject * pointers in the C code. The problem with providing the same interface with PyPy is that +objects don't natively have the same PyObject * structure at all; and +additionally their memory address can change. PyPy handles the +difference by maintaining two sets of objects. More precisely, starting +from a PyPy object, it can allocate on demand a PyObject structure +and fill it with information that points back to the original PyPy +objects; and conversely, starting from a C-level object, it can allocate +a PyPy-level object and fill it with information in the opposite +direction.

                + +

                I have merged a rewrite of the interaction between C-API C-level objects +and PyPy's interpreter level objects. This is mostly a simplification +based on a small hack in our garbage collector. This hack makes the +garbage collector aware of the reference-counted PyObject +structures. When it considers a pair consisting of a PyPy object and a +PyObject, it will always free either none or both of them at the +same time. They both stay alive if either there is a regular GC +reference to the PyPy object, or the reference counter in the +PyObject is bigger than zero.

                + +

                This gives a more stable result. Previously, a PyPy object might grow a +corresponding PyObject, lose it (when its reference counter goes to +zero), and later have another corresponding PyObject re-created at a +different address. Now, once a link is created, it remains alive until +both objects die.

                + +

                The rewrite significantly simplifies our previous code (which used to be +based on at least 4 different dictionaries), and should make using the +C-API somewhat faster (though it is still slower than using pure +python or cffi).

                + +

                A side effect of this work is that now PyPy actually supports the upstream lxml package---which is one +of the most popular packages on PyPI. (Specifically, you need version +3.5.0 with this pull +request to remove old PyPy-specific hacks that were not really +working. See +details.) At this point, we no longer recommend using the +cffi-lxml alternative: although it may still be faster, it might be +incomplete and old.

                + +

                We are actively working on extending our C-API support, and hope to soon +merge a branch to support more of the C-API functions (some numpy news +coming!). Please try +it out and let us know how it works for you.

                + +

                Armin Rigo and the PyPy team

                +
                +
                +
                +
                + + mathgl wrote on 2016-02-25 16:40: +
                +
                +

                wow, s good news. When trying to pick up a new lib, I always check whether it supports pypy first.

                +
                +
                +
                +
                + + Anonymous wrote on 2016-02-26 14:57: +
                +
                +

                Really looking forward to hearing news from the numpy front!

                +
                +
                +
                +
                + + Unknown wrote on 2016-02-26 18:42: +
                +
                +

                Great. Maybe now Odoo will work with PyPy!

                +
                +
                +
                +
                + + Anonymous wrote on 2016-02-28 09:58: +
                +
                +

                Great, in particular the native lxml. This is used in many large production systems that will now be even more interested in PyPy.

                +
                +
                +
                + +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-36.html b/blog/index-36.html new file mode 100644 index 000000000..2b0fc3472 --- /dev/null +++ b/blog/index-36.html @@ -0,0 +1,1373 @@ + + + + + + +PyPy (old posts, page 36) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                Async HTTP benchmarks on PyPy3

                + +
                +
                +Hello everyone, +
                +
                +
                +
                +
                +Since Mozilla announced funding, we've been working quite hard on delivering you a working Python 3.5. +
                +
                +  +
                +
                +We are almost ready to release an alpha version of PyPy 3.5. Our goal is to release it shortly after the sprint. Many modules have already been ported and  it can probably run many Python 3 programs already. We are happy to receive any feedback after the next release.  +
                +
                +
                +
                +
                +To show that the heart (asyncio) of Python 3 is already working we have prepared some benchmarks. They are done by Paweł Piotr Przeradowski @squeaky_pl for an HTTP workload on several asynchronous IO libraries, namely the relatively new asyncio and curio libraries and the battle-tested tornado, gevent and Twisted libraries. To see the benchmarks check out https://github.com/squeaky-pl/zenchmarks and the instructions for reproducing can be found inside README.md in the repository. Raw results can be obtained from https://github.com/squeaky-pl/zenchmarks/blob/master/results.csv. +
                +
                +
                +
                +
                +The + purpose of the presented benchmarks is showing that the upcoming PyPy release +is already working with unmodified code that runs on CPython 3.5. PyPy +also manages to make them run significantly faster. +
                +
                +
                +
                +
                +The + benchmarks consist of HTTP servers implemented on the top of the mentioned +libraries. All the servers are single-threaded relying on underlying +event loops to provide concurrency. Access logging was disabled to +exclude terminal I/O from the results. The view code consists of a +lookup in a dictionary mapping ASCII letters to verses from the famous +Zen of Python. If a verse is found the view returns it, otherwise a 404 +Not Found response is served. The 400 Bad Request and 500 Internal +Server Error cases are also handled. +
                +
                +
                +
                +
                +The workload was generated with the wrk HTTP benchmarking tool. It is run with one thread opening up to 100 +concurrent connections for 2 seconds and repeated 1010 times to get +consecutive measures. There is a Lua script provided + that instructs wrk to continuously send 24 different requests that hit +different execution paths (200, 404, 400) in the view code. Also it is +worth noting that wrk will only count 200 responses as successful so the actual request per second throughput is higher. +
                +
                +
                +
                +
                +For your convenience all the used libraries versions are vendored into the benchmark repository. There is also a precompiled portable version of wrk provided + that should run on any reasonably recent (10 year old or newer) Linux +x86_64 distribution. The benchmark was performed on a public cloud scaleway x86_64 server launched in a Paris data center. The server was running +Ubuntu 16.04.01 LTS and reported Intel(R) Xeon(R) CPU D-1531 @ 2.20GHz +CPU. CPython 3.5.2 (shipped by default in Ubuntu) was benchmarked +against a pypy-c-jit-90326-88ef793308eb-linux64 snapshot of the 3.5 compatibility branch of PyPy. +
                +
                +
                +
                +
                + +
                +
                +  +
                +
                +  +
                +
                +  +
                +
                +  +
                +
                +We want to thank Mozilla for supporting our work! +
                +
                +
                +
                +
                +Cheers, +
                +
                +fijal, squeaky_pl and the PyPy Team +
                +
                +
                +
                +
                +
                +
                +
                + + Benjamin wrote on 2017-03-02 00:37: +
                +
                +

                This is fantastic! How close to ready is the async/await syntax? Any chance it could be snuck in the 3.5 release?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2017-03-02 07:55: +
                +
                +

                As far as I know, curio (and maybe asyncio) wouldn't run if we didn't properly support async/await already.

                +
                +
                +
                +
                + + Konstantin Lopuhin wrote on 2017-03-02 09:44: +
                +
                +

                Great news, you are doing awesome work! Any chance cpyext will be included in the alpha?

                +
                +
                +
                +
                + + Ronan Lamy wrote on 2017-03-02 21:49: +
                +
                +

                cpyext will be included. We expect C-API support to be approximately on par with pypy2, e.g. the pypy3 nightlies have nearly complete support for numpy.

                +
                +
                +
                +
                + + Unknown wrote on 2017-03-03 18:35: +
                +
                +

                Awesome work!

                +
                +
                +
                +
                + + Unknown wrote on 2017-03-03 22:30: +
                +
                +

                @Benjamin, async def / async for / async with / await were all introduced in Python 3.5.

                +
                +
                +
                +
                + + Unknown wrote on 2017-03-03 22:30: +
                +
                +

                This is wonderful work, congrats!

                +
                +
                +
                +
                + + stuaxo wrote on 2017-03-04 16:22: +
                +
                +

                This is great. It would be good to include some alternate asyncio back-ends as well if they work with pypy.

                For instance, my current project uses libuv and gbulb in different components.

                +
                +
                +
                +
                + + Anonymous wrote on 2017-03-07 17:03: +
                +
                +

                Will this work with uvloop? I'm curious as I would like to get Sanic running on this! :-)

                +
                +
                +
                +
                + + Armin Rigo wrote on 2017-03-08 22:49: +
                +
                +

                From what I've read on #pypy (sorry if I'm messing something up): uvloop is a drop-in replacement of asyncio, but asyncio is faster on PyPy. PyPy's JIT for pure Python code beats the overheads of the CPython API compatibility layer (in this case, via Cython). Moreover, considering the whole application, using asyncio on PyPy easily beats using uvloop on CPython. So, as long as it remains a fully compatible replacement, you can "drop it out" and use asyncio instead on PyPy.

                +
                +
                +
                + +

                Leysin Winter Sprint: 25/26th Feb. - 4th March 2017

                + +
                +

                The next PyPy sprint will be in Leysin, Switzerland, for the twelfth time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.

                +
                +

                Goals and topics of the sprint

                +

                The list of topics is very open.

                +
                  +
                • The main topic is Python 3.5 support in PyPy, as most py3.5 +contributors should be present. It is also a good topic if you have +no or limited experience with PyPy contribution: we can easily find +something semi-independent that is not done in py3.5 so far, and +do pair-programming with you.
                • +
                • Any other topic is fine too: JIT compiler optimizations, CFFI, +the RevDB reverse debugger, improving the speed of your program on +PyPy, etc.
                • +
                • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off (for ski or anything else).
                • +
                +
                +
                +

                Exact times

                +

                Work days: starting 26th Feb (~noon), ending March 4th (~noon).

                +

                I have pre-booked the week from Saturday Feb 25th to Saturday March 4th. +If it is possible for you to arrive Sunday before mid-afternoon, then +you should get a booking from Sunday only. The break day should be +around Wednesday.

                +

                It is fine to stay a few more days on either side, or conversely to book +for a part of that time only.

                +
                +
                +

                Location & Accommodation

                + +

                Leysin, Switzerland, "same place as before".

                + +
                + +

                Let me refresh your +memory: both the sprint venue and the lodging will be in a +pair of chalets built specifically for bed & breakfast: +https://www.ermina.ch/. The place has a good ADSL Internet connection +with wireless installed. You can also arrange your own lodging +elsewhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue).

                +

                Please confirm that you are coming so that we can adjust the +reservations as appropriate.

                +

                The options of rooms are a bit more limited than on previous years +because the place for bed-and-breakfast is shrinking; but we should +still have enough room for us. The price is around 60 CHF, breakfast +included, in shared rooms (3 or 4 people). If there are people that +would prefer a double or single room, please contact me and we'll see +what choices you have. There is also a choice of hotels in Leysin.

                +

                Please register by Mercurial:

                +
                +https://bitbucket.org/pypy/extradoc/ +https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2017/ +
                +

                or on the pypy-dev mailing list if you do not yet have check-in rights:

                +
                +https://mail.python.org/mailman/listinfo/pypy-dev +
                +

                You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around, and at least one EU-format power strip.

                +
                +
                +

                PyPy2.7 v5.6 released - stdlib 2.7.12 support, C-API improvements, and more

                + +
                +
                +
                +
                +
                +

                +

                +We have released PyPy2.7 v5.6 [0], about two months after PyPy2.7 v5.4. This new PyPy2.7 release includes the upstream stdlib version 2.7.12.

                +We continue to make incremental improvements to our C-API compatibility layer (cpyext). We pass all but 12 of the over-6000 tests in the upstream NumPy test suite, and have begun examining what it would take to support Pandas and PyQt.

                +Work proceeds at a good pace on the PyPy3.5 version due to a grant from the Mozilla Foundation, and some of those changes have been backported to PyPy2.7 where relevant.

                +The PowerPC and s390x backends have been enhanced with the capability to use SIMD instructions for micronumpy loops.

                +We changed timeit to now report average +/- standard deviation, which is better than the misleading minimum value reported in CPython.

                +We now support building PyPy with OpenSSL 1.1 in our built-in _ssl module, as well as maintaining support for previous versions.

                CFFI has been updated to 1.9, improving an already great package for interfacing with C.

                +As always, this release fixed many issues and bugs raised by the growing community of PyPy users. We strongly recommend updating. You can download the PyPy2.7 v5.6 release here:
                + +
                +Downstream packagers have been hard at work. The Debian package is already available, and the portable PyPy versions are also ready, for those who wish to run PyPy on other Linux distributions like RHEL/Centos 5.

                +We would like to thank our donors for the continued support of the PyPy project.

                +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.
                +

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                +We also welcome developers of other dynamic languages to see what RPython can do for them.
                +This release supports:
                +
                +
                  +
                • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +
                +
                +
                +

                +What else is new?

                +
                +(since the release of PyPy 5.4 in August, 2016)
                +
                +There are many incremental improvements to RPython and PyPy, the complete listing is here. +
                +
                +Please update, and continue to help us make PyPy better.

                +Cheers, The PyPy team

                +[0] We skipped 5.5 since we share a code base with PyPy3, and PyPy3.3-v.5.5-alpha was released last month
                +
                +
                +
                +
                +
                +
                +
                +
                + + Anonymous wrote on 2016-11-13 01:32: +
                +
                +

                I am really liking the regular updates! Nice to hear about cpyext and PyQt! Do desktop ui's apps gain alot of performance from being on pypy? Would kivy go faster seeing as it has a large chunk of widgets implemented in python?

                +
                +
                +
                +
                + + Unknown wrote on 2016-11-13 07:09: +
                +
                +

                All core features in Kivy are implemented in Cython. PyPy is slower with Cython.

                +
                +
                +
                +
                + + Anonymous wrote on 2016-11-13 10:55: +
                +
                +

                isn't the cpyext going to be the answer for pyQt and cython? Or are you saying pyQt should perform greater?

                +
                +
                +
                +
                + + mathgl wrote on 2016-11-17 07:00: +
                +
                +

                cpyext make them work instead of faster at the moment.

                +
                +
                +
                +
                + + Unknown wrote on 2018-04-16 07:23: +
                +
                +

                The python interpreter size is 3.5MB where as pypy intepreter size is almost 40MB. As it has huge size difference it is impossible to replace in embedded projects

                Is there any way to reduce it or any suggestions to implement in embedded area.Why is this difference.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2018-04-16 16:05: +
                +
                +

                Please ask on pypy's irc channel: #pypy at freenode.net, or the pypy-dev mailing list. This blog post is old, it is pointless to ask questions here about it---you're unlikely to get an answer.

                +
                +
                +
                + +

                Vectorization extended. PowerPC and s390x

                + +
                +
                +We are happy to announce that JIT support in both the PowerPC backend and the
                +s390x backend has been enhanced. Both can now vectorize loops via SIMD
                +instructions. Special thanks to IBM for funding this work.

                +If you are not familiar with this topic you can read more details here.
                +
                +There are many more enhancements under the hood. Most notably, all pure operations are now delayed until the latest possible point. In some cases indices have been calculated more than once or they needed an additional register, because the old value is still used. Additionally it is now possible to load quadword-aligned memory in both PPC and s390x (x86 currently cannot do that).

                +NumPy & CPyExt +

                +The community and core developers have been moving CPyExt towards a complete, but emulated, layer for CPython C extensions. This is great, because the one restriction preventing the wider deployment of PyPy in several scenarios will hopefully be removed. However, we advocate not to use CPyExt, but rather to not write C code at all (let PyPy speed up your Python code) or use cffi.

                +The work done here to support vectorization helps micronumpy (NumPyPy) to speed up operations for PPC and s390x. So why is PyPy supporting both NumPyPy and NumPy, do we actually need both? Yes, there are places where gcc can beat the JIT, and places where the tight integration between NumPyPy and PyPy is more performant. We do have plans to integrate both, hijacking the C-extension method calls to use NumPyPy where we know NumPyPy can be faster.

                +Just to give you an idea why this is a benefit:

                +NumPy arrays can carry custom dtypes and apply user defined python functions on the arrays. How could one optimize this kind of scenario? In a traditional setup, you cannot. But as soon as NumPyPy is turned on, you can suddenly JIT compile this code and vectorize it.

                +Another example is element access that occurs frequently, or any other calls that cross between Python and the C level frequently.

                +Benchmarks +

                +Let's have a look at some benchmarks reusing mikefc's numpy benchmark suite (find the forked version here). I only ran a subset of microbenchmarks, showing that the core functionality is
                functioning properly. Additionally it has been rewritten to use perf instead of the timeit stdlib module.

                +Setup +

                +x86 runs on an Intel i7-2600 clocked at 3.40GHz using 4 cores. PowerPC runs on the Power 8 clocked at 3.425GHz providing 160 cores. Last but not least, the mainframe machine is clocked at up to 4 GHz, but fully virtualized (as it is common for such machines). Note that PowerPC is a non-private remote machine. It is used by many users and it is crowded with processes. It is hard to extract a stable benchmark there.

                +x86 ran on Fedora 24 (kernel version of 4.8.4), PPC ran on Fedora 21 (kernel version 3.17.4) and s390x ran on Redhat Linux 7.2 (kernel version 3.10.0). Respectively, numpy on cpython had openblas available on x86, no blas implementation was present on s390x and PPC provided blas and lapack.

                +As you can see all machines run very different configurations. It does not make sense to compare across platforms, but rather implementations on the same platform.







                +Blue shows CPython 2.7.10+ available on that platform using the latest NumPy (1.11). Micro NumPy is used for PyPy. PyPy+ indicates that the vectorization optimization is turned on.
                +All bar charts show the median value of all runs (5 samples, 100 loops, 10 inner loops, for the operations on vectors (not matrices) the loops are set to 1000). PyPy additionally gets 3 extra executions to warm up the JIT.

                +The comparison is really comparing speed of machine code. It compares the PyPy's JIT output vs GCC's output. It has little to do with the speed of the interpreter.

                +Both new SIMD backends speed up the numeric kernels. Sometimes it is near to the speed of CPython, sometimes it is faster. The maximum parallelism very much depends on the extension emitted by the compiler. All three SIMD backends have the same vector register size (which is 128 bit). This means that all three behave similarly but ppc and s390x gain more because they can load 128bit of memory from quadword aligned memory.

                +Future directions

                +Python is achieving rapid adoption in data science. This is currently a trend emerging in Europe, and Python is already heavily used for data science in the USA and many other places around the world.


                +PyPy can make a valuable contribution for data scientists, helping them to rapidly write scientific programs in Python and run them at near native speed. If you happen to be in that situation, we are eager to hear your feedback or resolve your issues and also work together to improve the performance of your
                +code. Just get in touch!


                +Richard Plangger (plan_rich) and the PyPy team
                +
                +
                +
                +
                +
                + + Anonymous wrote on 2016-11-03 20:06: +
                +
                +

                As you are talking about GCC beating your JIT, you are using your own vectorizing compiler right?
                I wonder if this is a feasible approach. Can you really compete with the years if not decades of work that went into the vectorizers of GCC and LLVM?
                Wouldn't it make more sense to plug into GCC's and LLVM's JIT API's (yes GCC has a JIT) for this type of code?
                What does PyPy bring to the table that the existing JIT's do not for numerical code?

                +
                +
                +
                +
                + + Anonymous wrote on 2016-11-07 06:44: +
                +
                +

                It's good to see pypy making progress on using python as a toolkit for data science. In addition to numpy, pandas/scipy also needs to work well for me to switch.

                Also, a lot of data science is currently being run on windows and the x64 port of pypy hasn't had much traction in the last several years. If these 2 issues are solved (pandas/scipy being supported on a x64 windows pypy) then there should be no reason to keep using CPython.

                +
                +
                +
                +
                + + mathgl wrote on 2016-11-08 05:22: +
                +
                +

                I think most of pypy dev/users use Linux/MacOsx only, so there is no strong motivation to support win64 at the moment.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2016-11-09 16:16: +
                +
                +

                Not necessarily the users---there are some on Windows. But the point is that we have not a single developer on Windows. Until someone comes forward with a serious offer for either code or money, Win64 will not get magically done.

                +
                +
                +
                + +

                PyPy3 5.5.0 released

                + +
                +

                We're pleased to announce the release of PyPy3 v5.5.0. Coming four months after PyPy3.3 v5.2, it improves compatibility with Python 3.3 (3.3.5). We strongly recommend updating from previous PyPy3 versions.

                +We would like to thank all of the people who donated to the py3k proposal for supporting the work that went into this release.

                +You can download the PyPy3.3 v5.5.0 release here: https://pypy.org/download.html

                +
                  +
                • Improved Python 3.3.5 support.
                • +
                    +
                  • os.get_terminal_size(), time.monotonic(), str.casefold() 
                  • +
                  • faulthandler module
                  • +
                  • There are still some missing features, such as a PEP 393-like space efficient string representation, as well as some performance regressions (e.g. issue #2305). The focus for this release has been updating to 3.3 compatibility. Windows is also not yet supported.
                  • +
                  +
                • +ensurepip is also included (it's only included in CPython 3 >= 3.4).
                • +
                • Buffer interface improvements (numpy on top of cpyext)
                • +
                • Several JIT improvements (force-virtual-state, residual calls)
                • +
                • Search path for libpypy-c.so has changed (helps with cffi embedding on linux distributions)
                • +
                • Improve the error message when the user forgot the "self" argument of a method
                • +
                • Many more small improvements, please head over to our documentation for more information
                • +
                +

                +Towards Python 3.5

                +
                +
                +We have started to work on Python 3.5, which is a version used by many software projects. It seems to get wide adoption. We are happy to be part of the Mozilla Open Source Support (MOSS) initiative.
                +
                +
                +
                +
                +Nevertheless we want to give our users the chance to use PyPy in their Python 3 projects, thus we have prepared this release.
                +
                +

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7.10 and 3.3.5. It's fast due to its integrated tracing JIT compiler.

                We also welcome developers of other dynamic languages to see what RPython can do for them.

                +This release supports:
                  +
                • x86 machines on most common operating systems except Windows 
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux 
                • +
                • big- and little-endian variants of PPC64 running Linux 
                • +
                • s390x running Linux
                • +
                +Please try it out and let us know what you think. We welcome feedback, we know
                +you are using PyPy, please tell us about it!

                +Cheers

                +The PyPy Team +
                +
                +
                +
                + + Mak Sim wrote on 2016-10-13 07:51: +
                +
                +

                Great! Wayting for windows build.

                +
                +
                +
                +
                + + Anonymous wrote on 2016-10-14 05:08: +
                +
                +

                Excellent news. Thank you!

                +
                +
                +
                +
                + + Butla wrote on 2016-10-17 15:52: +
                +
                +

                Wow! 3.5? That would be incredible. Shouldn't there be more hype around JITted asyncio applications?

                +
                +
                +
                +
                + + Unknown wrote on 2016-10-21 07:49: +
                +
                +

                I was really touched.
                \(^o^)/

                +
                +
                +
                +
                + + Unknown wrote on 2016-11-06 01:19: +
                +
                +

                Butla: I do totally agree, pypy not only for numeric code anymore but also for parallel production servers.

                +
                +
                +
                +
                + + Unknown wrote on 2016-12-02 15:49: +
                +
                +

                The performance difference between 5.5 and 5.2 is awesome! For my heavy string and lists-of-strings processing tool, 5.5 needs about 25% less time for the same task. Thank you so much!

                +
                +
                +
                +
                + + Dagur wrote on 2017-05-24 14:08: +
                +
                +

                What is the status on windows support?

                +
                +
                +
                + +

                RevDB released, v5.4.1

                + +
                +

                Hi all,

                + +

                +The first beta version of RevDB is out! Remember that RevDB is a reverse debugger for Python. The idea is that it is a debugger that can run forward and backward in time, letting you more easily understand your subtle bug in your big Python program.

                + +

                +RevDB should work on almost any Python program. Even if you are normally only using CPython, trying to reproduce the bug with RevDB is similar to trying to run the program on a regular PyPy---usually it just works, even if not quite always. + +

                +

                +News from the alpha version in the previous blog post include notably support for: +

                +
                  +
                • Threads. +
                • +
                • CPyExt, the compatibility layer of PyPy that can run CPython C extension modules. +
                • +
                +as well as many other improvements. + +

                +You need to build it yourself for now. It is tested on 64-bit Linux. 32-bit Linux, OS/X, and other POSIX platforms should all either work out of the box or be just a few fixes away (contributions welcome). Win32 support is a lot more involved but not impossible.

                + +

                +See https://bitbucket.org/pypy/revdb/ for more information!

                + +

                Armin

                +
                +

                PyPy 5.4.1 bugfix released

                + +
                +
                +We have released a bugfix for PyPy2.7-v5.4.0, released last week, due to the following issues:

                +
                  +
                • Update list of contributors in documentation and LICENSE file, this was unfortunately left out of 5.4.0. My apologies to the new contributors
                • +
                • Allow tests run with -A to find libm.so even if it is a script not a dynamically loadable file
                • +
                • Bump sys.setrecursionlimit() when translating PyPy, for translating with CPython
                • +
                • Tweak a float comparison with 0 in backendopt.inline to avoid rounding errors
                • +
                • Fix for an issue for translating the sandbox
                • +
                • Fix for an issue where unicode.decode('utf8', 'custom_replace') messed up the last byte of a unicode string sometimes
                • +
                • Update built-in cffi to version 1.8.1
                • +
                • Explicitly detect that we found as-yet-unsupported OpenSSL 1.1, and crash translation with a message asking for help porting it
                • +
                • Fix a regression where a PyBytesObject was forced (converted to a RPython object) when not required, reported as issue #2395
                • +
                +
                +Thanks to those who reported the issues.

                +What is PyPy?

                +
                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                +We also welcome developers of other dynamic languages to see what RPython can do for them.

                +This release supports:
                +
                  +
                • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +Please update, and continue to help us make PyPy better.

                +Cheers

                +The PyPy Team
                +
                +
                +

                PyPy2 v5.4 released - incremental improvements and enhancements

                + +
                +
                +We have released PyPy2.7 v5.4, a little under two months after PyPy2.7 v5.3. +This new PyPy2.7 release includes incremental improvements to our C-API +compatibility layer (cpyext), enabling us to pass over 99% of the upstream +numpy test suite.

                +We updated built-in cffi support to version 1.8, +which now supports the “limited API” mode for c-extensions on +CPython >=3.2.

                +We improved tooling for the PyPy JIT, and expanded VMProf +support to OpenBSD and DragonFly BSD.

                +As always, this release fixed many issues and bugs raised by the +growing community of PyPy users. We strongly recommend updating.

                +You can download the PyPy2 v5.4 release here:
                + +
                +We would like to thank our donors for their continued support of the PyPy +project. We would also like to thank our contributors and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, testing and adapting popular modules to run on PyPy, or general help +with making RPython’s JIT even better.

                +

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (PyPy and CPython 2.7 performance comparison) due to its integrated tracing JIT compiler.

                +We also welcome developers of other dynamic languages to see what RPython can do for them.

                +This release supports:
                  +
                • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD)
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux
                • +
                • big- and little-endian variants of PPC64 running Linux
                • +
                • +s390x running Linux
                • +
                +
                +
                +

                +What is New?

                +

                +(since the release of PyPy 5.3 in June, 2016)

                +There are many incremental improvements to RPython and PyPy, the complete listing is here. Mozilla generously sponsored work toward python 3.5 compatibility, and we are beginning to see some cross-over improvements of RPython and PyPy2.7 as a result.

                +Please update, and continue to help us make PyPy better. +Cheers

                +The PyPy Team
                +
                +
                +
                +
                + + stuaxo wrote on 2016-08-31 22:52: +
                +
                +

                Is this available on the PPA ?

                (if it is, which one, possibly I have the wrong one) - at the moment I have

                Get:2 https://ppa.launchpad.net/pypy/ppa/ubuntu xenial/main amd64 pypy amd64 5.3.1+dfsg-1~ppa1~ubuntu16.04 [7,754 kB]

                ?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2016-09-01 09:11: +
                +
                +

                This is outside our control and should be checked with the 3rd-party provider for the particular platform (in this case, the PPA).

                +
                +
                +
                + +

                PyPy Tooling Upgrade: JitViewer and VMProf

                + +
                +

                We are happy to announce a major JitViewer (JV) update.
                +JV allows you to inspect RPython's internal compiler representation (the language in which PyPy is implemented) including the generated machine code of your program. It can graphically show you details of the JIT compiled code and helps you pinpoint issues in your program.

                +VMProf is a statistical CPU profiler for python imposing very little overhead at runtime.

                +Both VMProf and JitViewer share a common goal: Present useful information for your python program.
                +The combination of both can reveal more information than either alone.
                +That is the reason why they are now both packaged together.
                +We also updated vmprof.com with various bug fixes and changes including an all new interface to JV.

                +This work was done with the goal of improving tooling and libraries around the Python/PyPy/RPython ecosystem.
                +Some of the tools we have developed:

                +
                  +
                • +CFFI - Foreign Function Interface that avoids CPyExt (CFFI docs)
                • +
                • +RevDB - A reverse debugger for python (RevDB blog post)
                • +
                +
                +and of course the tools we discuss here:

                  +
                • +VMProf - A statistical CPU profiler (VMProf docs)
                • +
                • +JitViewer - Visualization of the log file produced by RPython (JitLog docs)
                • +
                +

                +A "brand new" JitViewer

                +
                +JitViewer has two pieces: you create a log file when running your program, and then use a graphic tool to view what happened.

                +The old logging format was a hard-to-maintain, plain-text-logging facility. Frequent changes often broke internal tools.
                +Additionally, the logging output of a long running program required a lot of disk space.

                +Our new binary format encodes data densely, makes use of some compression (gzip), and tries to remove repetition where possible.
                +It also supports versioning for future proofing and can be extended easily.

                +And *drumroll* you no longer need to install a tool to view the log yourself
                +anymore! The whole system moved to vmprof.com and you can use it any time.

                +Sounds great. But what can you do with it? Here are two examples for a PyPy user:

                +
                PyPy crashed? Did you discover a bug?

                +
                +For some hard to find bugs it is often necessary to look at the compiled code. The old
                +procedure often required you to upload a plain text file which was hard to parse and to look through.

                +A better way to share a crash report is to install the ``vmprof`` module from PyPI and execute either of the two commands:

                +# this program does not crash, but has some weird behaviour
                $ pypy -m jitlog --web <your program args>
                ...
                PyPy Jitlog: https://vmprof.com/#/<hash>/traces
                # this program segfaults
                $ pypy -m jitlog -o /tmp/log <your program args>
                ...
                <Segfault>
                $ pypy -m jitlog --upload /tmp/log
                PyPy Jitlog: https://vmprof.com/#/<hash>/traces


                +Providing the link in the bug report allows PyPy developers to browse and identify potential issues.

                +Speed issues

                +
                +VMProf is a great tool to find hot spots that consume a lot of time in your program. As soon as you have identified code that runs slowly, you can switch to jitlog and maybe pinpoint certain aspects that do not behave as expected. You will find an overview, and are able to browse the generated code. If you cannot make sense of all that, you can just share the link with us and we can have a look too.

                +
                Future direction

                +
                +We hope that the new release will help both PyPy developers and PyPy users resolve potential issues and easily point them out.

                +Here are a few ideas for what might come in the next few releases:


                  +
                •  Combination of CPU profiles and the JITLOG (sadly did not make it into the current release).
                • +
                • Extend vmprof.com to be able to query vmprof/jitlog.
                  An example query for vmprof: 'methods.callsites() > 5' and
                  for the jitlog would be 'traces.contains('call_assembler').hasbridge('*my_func_name*')'.
                • +
                • Extend the jitlog to capture the information of the optimization stage.
                • +
                +

                +Richard Plangger (plan_rich) and the PyPy team
                +
                +
                +
                +
                +
                +
                + + phd wrote on 2016-08-11 20:29: +
                +
                +

                https://www.vmprof.com/ doesn't work, but https://vmprof.com/ does. Please fix your DNS.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2016-08-14 08:43: +
                +
                +

                @phd: thanks, fixed the link inside the blog post. ``www.vmprof.com`` was never intended to work---but that could be considered as a bug; if you feel like it should, please open an issue.

                +
                +
                +
                + +

                PyPy gets funding from Mozilla for Python 3.5 support

                + +
                +

                "Python 2.x versus Python 3.x": this is by now an old question. In the eyes of some people Python 2 is here to stay, and in the eyes of others Python has long been 3 only.

                + +

                PyPy's own position is that PyPy will support Python 2.7 forever---the RPython language in which PyPy is written is a subset of 2.7, and we have no plan to upgrade that. But at the same time, we want to support 3.x. This is particularly true now: a relatively recent development is that Python 3.5 seems to attract more and more people. The "switch" to Python 3.x might be starting to happen.

                + +

                Correspondingly, PyPy has been searching for a while for a way to support a larger-scale development effort. The goal is to support not just any old version of Python 3.x, but Python 3.5, as this seems to be the version that people are switching to. PyPy is close to supporting all of Python 3.3 now; but the list of what is new in Python 3.4 and 3.5 is far, far longer than anyone imagines. The long-term goal is also to get a version of "PyPy3" that is as good as "PyPy2" is, including its performance and its cpyext layer (CPython C API interoperability), for example.

                + +

                So, the end result: Mozilla recently decided to award $200,000 to Baroque Software to work on PyPy as part of its Mozilla Open Source Support (MOSS) initiative. This money will be used to implement the Python 3.5 features in PyPy. Within the next year, we plan to use the money to pay four core PyPy developers half-time to work on the missing features and on some of the big performance and cpyext issues. This should speed up the progress of catching up with Python 3.x significantly. We are extremely thankful to Mozilla for supporting us in this way, and will keep you updated on the progress via this blog.

                +
                +
                +
                +
                + + Dave wrote on 2016-08-09 17:46: +
                +
                +

                Great to hear of this development. I'm one of those "Python has long been 3 only" developers, but have had an eye on PyPy for a long time and even donated several times. Planning to switch to PyPy when 3.5 support lands.

                +
                +
                +
                +
                + + Unknown wrote on 2016-08-09 17:56: +
                +
                +

                glad to hear that.

                To me, the time to switch to py3 depends upon the maturity of pypy 3.

                I have used pypy 2 for a while in production, so far so good.

                +
                +
                +
                +
                + + Alessandro wrote on 2016-08-09 18:05: +
                +
                +

                Great news!

                I'm one of those "Python 3 only". The switch was terrible for the community, but Python 3 is superior than 2 in my opinion.

                RPython 3 would be great to, but it's propably complete inviable.

                +
                +
                +
                +
                + + Ronan wrote on 2016-08-09 18:29: +
                +
                +

                Fantastic news! Thanks for your work, and thanks Mozilla for their support :)

                +
                +
                +
                +
                + + Shen wrote on 2016-08-09 21:36: +
                +
                +

                Awesome !

                +
                +
                +
                +
                + + Unknown wrote on 2016-08-10 00:07: +
                +
                +

                Is there any chance optional typing information will be used to help the JIT?

                +
                +
                +
                +
                + + Anonymous wrote on 2016-08-10 00:16: +
                +
                +

                200,000 sounds like a lot of money but it is two developers for a year at less than Silicon Valley wages.

                +
                +
                +
                +
                + + Anonymous wrote on 2016-08-10 00:45: +
                +
                +

                I think you are overpaying yourselves. But hey, it's your money.

                +
                +
                +
                +
                + + Anonymous wrote on 2016-08-10 06:06: +
                +
                +

                2.7 forever!

                +
                +
                +
                +
                + + cclauss wrote on 2016-08-10 07:38: +
                +
                +

                This is huge news! Corporate sponsorship of open source projects is a beautiful thing for us all. Congrats to the PyPy team. You really deserve this kind of support for all of your hard work and perseverance over the years.

                Given that Python 3.6 will be going beta next month, perhaps that should be your target instead of 3.5 but you know your craft better than I do.

                Those of you who would be interested to pay Mozilla back for this investment might want to help port the following 9 Mozilla projects to Python 3:
                * mozrunner, moznetwork, mozdevice, mozprocess, mozprofile, mozfile, mozinfo, mozlog, mozcrash

                These nine Python 2 projects are all in the Top 150 PyPI downloads of all time and each of them has been downloaded with pip more than 5 million times. Currently 92% of the Top 200 PyPI packages are Python3 compatible. Converting these 9 Mozbase modules to Python 3 would bump that number up to 96.5%. It would also probably push us over the line where 50% of the Top 4,000 PyPI packages are Python 3 compatible. This kind of momentum would be welcome news as the Python community continues our move to Python 3.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2016-08-10 09:46: +
                +
                +

                Why Python 3.5 instead of 3.6? That's because 3.5 is the version that attracts people. Python 3.6 will only be out of beta in december and my own wild guess is that it won't immediately attract all the 2.7 people, who grew accustomed to sticking to a known unchanging version. So the deal with Mozilla is to get a PyPy 3.5 working as well as possible; it is better than getting a PyPy 3.6 that (like current versions of PyPy 3) has downsides in completeness and performance.

                +
                +
                +
                +
                + + Unknown wrote on 2016-08-10 12:23: +
                +
                +

                Great news! I Can't wait to the moment we get an stable Python3 PyPy! Congratulations!

                +
                +
                +
                +
                + + Unknown wrote on 2016-08-10 12:24: +
                +
                +

                Great news! I Can't wait to the moment we get an stable Python3 PyPy! Congratulations!

                +
                +
                +
                +
                + + guillem wrote on 2016-08-10 14:08: +
                +
                +

                TL;DR. Python 3.5 is a "good enough" and seems a future-proof language.

                I was in charge of deciding which version of Python to use at my job. We started the development of a framework supporting Python 2.7 and Python >=3.4, but we recently switched to a Python 3 only development. The whole python 2 vs 3 thing was quite confusing to the operations department and developers that are not proficient with Python.

                There was a quite thorough assessment of the features, and we decided to stick to Python 3.5, at least for the next decade or so. On the Python 3 side, one of the reasons was that the async/await syntax allows junior developers to understand (more or less) asynchronous programming, while coroutines+decorators are quite a mess. We still have some Red Hat instances that use Python 3.4, but as soon as we get rid of them, everything will be Python 3.5.

                +
                +
                +
                +
                + + touilleMan wrote on 2016-08-10 15:24: +
                +
                +

                Awesome news ! Long live Pypy ;-)

                +
                +
                +
                +
                + + Anonymous wrote on 2016-08-10 18:28: +
                +
                +

                Super great news!

                +
                +
                +
                +
                + + Anonymous wrote on 2016-08-10 21:12: +
                +
                +

                This is amazing news! I use Python3 extensively in private research projects. Unfortunately, I have been in the position of choosing between Python3 and having a high-performance interpreter implementation. Testing the PyPy 3.3.5 alpha shows all-around disappointing performance in our string-manipulation-heavy projects making intense use of unicode, so I can only recommend our team to stay with CPython for both performance and compatibility.

                I am very excited to hear that PyPy3 will be getting more of the specific attention it deserves, with Python 3.5 support to boot!

                +
                +
                +
                +
                + + PvdE wrote on 2016-08-11 13:03: +
                +
                +

                Good news, and you definitely deserve it. But I guess this will take virtually all of the PyPy team's resources for the next one-two years, what does this means for other in-progress innovations, in particular STM-GC? I donated but it looks like the money is not being spent.

                If continuing the improvements to CPython support means PyPy will become an option for more programs, that would be great. The (few) red lines in https://packages.pypy.org/ and lower performance for others are still blockers for many users.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2016-08-11 16:40: +
                +
                +

                @PvdE: I'll admit that the STM-GC is not actively going anywhere right now. STM is hard, unsurprizingly. There is still a bit being done by Remi (working at an academic institution where he doesn't need the money). For me, I am part of the Python 3.5 team, but I might also come back to STM-GC soon. We expect not all our resources to be consumed by Python 3.5. In fact, the money covers four half-time jobs (and flexibility allows someone to do less than half-time while someone else does more).

                +
                +
                +
                +
                + + Anonymous wrote on 2016-08-17 04:04: +
                +
                +

                This is great news.

                A developer's decision to target Python 3 over Python 2 in their projects is more fundamental than deciding which interpreter to use. People use Python 3 because it's future-proof, to take advantage of its new features and to do their bit in driving the Python ecosystem forward. For me and I imagine many others, being curious about PyPy hasn't been enough to override all those factors.

                I think there's a huge untapped audience out there waiting for first-class support for modern Python in PyPy before giving it a shot. I hope to see a big bump in PyPy adoption with this move, and a corresponding bump in funding and support for PyPy's development.

                Thanks for your fantastic work!

                +
                +
                +
                +
                + + JP wrote on 2016-08-31 17:45: +
                +
                +

                Great news! Will this include making numpy work with Python3 pypy? That's the main thing preventing me from evaluating pypy for my Python3-only OpenGL application.

                +
                +
                +
                +
                + + mattip wrote on 2016-09-07 09:05: +
                +
                +

                @JP cpyext compatibility is one of the milestones, and we currently pass over 99% of the upstream NumPy test suite using PyPy 2.7, so it all should Just Work

                +
                +
                +
                +
                + + Anonymous wrote on 2016-10-04 16:54: +
                +
                +

                Getting PyPy3 to 3.5 status is a good start considering that the current LTS version of Ubuntu (16.04) has 3.5 and that is going to be supported for a while.

                +
                +
                +
                +
                + + Anonymous wrote on 2016-12-17 22:26: +
                +
                +

                Great news! Pypy's lack of Python 3 support is the biggest reason I haven't switched yet. It technically supports most of 3.3 already, but since Pypy3 is slower than CPython, it may as well not exist. Hopefully you'll also work out the performance issues in Pypy 3 as well.

                +
                +
                +
                + +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-37.html b/blog/index-37.html new file mode 100644 index 000000000..4229892d7 --- /dev/null +++ b/blog/index-37.html @@ -0,0 +1,1794 @@ + + + + + + +PyPy (old posts, page 37) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                How to make your code 80 times faster

                + +
                +
                +I often hear people who are happy because PyPy makes their code 2 times faster +or so. Here is a short personal story which shows PyPy can go well beyond +that.

                DISCLAIMER: this is not a silver bullet or a general recipe: it worked in +this particular case, it might not work so well in other cases. But I think it +is still an interesting technique. Moreover, the various steps and +implementations are shown in the same order as I tried them during the +development, so it is a real-life example of how to proceed when optimizing +for PyPy.

                +Some months ago I played a bit with evolutionary algorithms: the ambitious +plan was to automatically evolve a logic which could control a (simulated) +quadcopter, i.e. a PID controller (spoiler: it doesn't fly).

                +The idea is to have an initial population of random creatures: at each +generation, the ones with the best fitness survive and reproduce with small, +random variations.

                +However, for the scope of this post, the actual task at hand is not so +important, so let's jump straight to the code. To drive the quadcopter, a +Creature has a run_step method which runs at each delta_t (full +code):
                class Creature(object):
                +    INPUTS = 2  # z_setpoint, current z position
                +    OUTPUTS = 1 # PWM for all 4 motors
                +    STATE_VARS = 1
                +    ...
                +
                +    def run_step(self, inputs):
                +        # state: [state_vars ... inputs]
                +        # out_values: [state_vars, ... outputs]
                +        self.state[self.STATE_VARS:] = inputs
                +        out_values = np.dot(self.matrix, self.state) + self.constant
                +        self.state[:self.STATE_VARS] = out_values[:self.STATE_VARS]
                +        outputs = out_values[self.STATE_VARS:]
                +        return outputs
                +
                +
                  +
                • +inputs is a numpy array containing the desired setpoint and the current +position on the Z axis;
                • +
                • +outputs is a numpy array containing the thrust to give to the motors. To +start easy, all the 4 motors are constrained to have the same thrust, so +that the quadcopter only travels up and down the Z axis;
                • +
                • +self.state contains arbitrary values of unknown size which are passed from +one step to the next;
                • +
                • +self.matrix and self.constant contain the actual logic. By putting +the "right" values there, in theory we could get a perfectly tuned PID +controller. These are randomly mutated between generations.
                • +
                +run_step is called at 100Hz (in the virtual time frame of the simulation). At each +generation, we test 500 creatures for a total of 12 virtual seconds each. So, +we have a total of 600,000 executions of run_step at each generation.

                +At first, I simply tried to run this code on CPython; here is the result:
                $ python -m ev.main
                +Generation   1: ... [population = 500]  [12.06 secs]
                +Generation   2: ... [population = 500]  [6.13 secs]
                +Generation   3: ... [population = 500]  [6.11 secs]
                +Generation   4: ... [population = 500]  [6.09 secs]
                +Generation   5: ... [population = 500]  [6.18 secs]
                +Generation   6: ... [population = 500]  [6.26 secs]
                +
                +Which means ~6.15 seconds/generation, excluding the first.

                +Then I tried with PyPy 5.9:
                $ pypy -m ev.main
                +Generation   1: ... [population = 500]  [63.90 secs]
                +Generation   2: ... [population = 500]  [33.92 secs]
                +Generation   3: ... [population = 500]  [34.21 secs]
                +Generation   4: ... [population = 500]  [33.75 secs]
                +
                +Ouch! We are ~5.5x slower than CPython. This was kind of expected: numpy is +based on cpyext, which is infamously slow. (Actually, we are working on +that and on the cpyext-avoid-roundtrip branch we are already faster than +CPython, but this will be the subject of another blog post.)

                +So, let's try to avoid cpyext. The first obvious step is to use numpypy +instead of numpy (actually, there is a hack to use just the micronumpy +part). Let's see if the speed improves:
                $ pypy -m ev.main   # using numpypy
                +Generation   1: ... [population = 500]  [5.60 secs]
                +Generation   2: ... [population = 500]  [2.90 secs]
                +Generation   3: ... [population = 500]  [2.78 secs]
                +Generation   4: ... [population = 500]  [2.69 secs]
                +Generation   5: ... [population = 500]  [2.72 secs]
                +Generation   6: ... [population = 500]  [2.73 secs]
                +
                +So, ~2.7 seconds on average: this is 12x faster than PyPy+numpy, and more than +2x faster than the original CPython. At this point, most people would be happy +and go tweeting how PyPy is great.

                +In general, when talking of CPython vs PyPy, I am rarely satisfied with a 2x +speedup: I know that PyPy can do much better than this, especially if you +write code which is specifically optimized for the JIT. For a real-life +example, have a look at capnpy benchmarks, in which the PyPy version is +~15x faster than the heavily optimized CPython+Cython version (both have been +written by me, and I tried hard to write the fastest code for both +implementations).

                +So, let's try to do better. As usual, the first thing to do is to profile and +see where we spend most of the time. Here is the vmprof profile. We spend a +lot of time inside the internals of numpypy, and allocating tons of temporary +arrays to store the results of the various operations.

                +Also, let's look at the jit traces and search for the function run: +this is the loop in which we spend most of the time, and it is composed of 1796 +operations. The operations emitted for the line np.dot(...) + +self.constant are listed between lines 1217 and 1456. Here is the excerpt +which calls np.dot(...); most of the ops are cheap, but at line 1232 we +see a call to the RPython function descr_dot; by looking at the +implementation we see that it creates a new W_NDimArray to store the +result, which means it has to do a malloc():
                + +
                +
                +The implementation of the + self.constant part is also interesting: +contrary to the former, the call to W_NDimArray.descr_add has been inlined by +the JIT, so we have a better picture of what's happening; in particular, we +can see the call to __0_alloc_with_del____ which allocates the +W_NDimArray for the result, and the raw_malloc which allocates the +actual array. Then we have a long list of 149 simple operations which set the +fields of the resulting array, construct an iterator, and finally do a +call_assembler: this is the actual logic to do the addition, which was +JITted independently; call_assembler is one of the operations to do +JIT-to-JIT calls:
                + +
                +
                +All of this is very suboptimal: in this particular case, we know that the +shape of self.matrix is always (2, 3): so, we are doing an incredible +amount of work, including calling malloc() twice for the temporary arrays, just to +call two functions which ultimately do a total of 6 multiplications +and 6 additions. Note also that this is not a fault of the JIT: CPython+numpy +has to do the same amount of work, just hidden inside C calls.

                +One possible solution to this nonsense is a well known compiler optimization: +loop unrolling. From the compiler point of view, unrolling the loop is always +risky because if the matrix is too big you might end up emitting a huge blob +of code, possibly useless if the shape of the matrices changes frequently: this +is the main reason why the PyPy JIT does not even try to do it in this case.

                +However, we know that the matrix is small, and always of the same +shape. So, let's unroll the loop manually:
                class SpecializedCreature(Creature):
                +
                +    def __init__(self, *args, **kwargs):
                +        Creature.__init__(self, *args, **kwargs)
                +        # store the data in a plain Python list
                +        self.data = list(self.matrix.ravel()) + list(self.constant)
                +        self.data_state = [0.0]
                +        assert self.matrix.shape == (2, 3)
                +        assert len(self.data) == 8
                +
                +    def run_step(self, inputs):
                +        # state: [state_vars ... inputs]
                +        # out_values: [state_vars, ... outputs]
                +        k0, k1, k2, q0, q1, q2, c0, c1 = self.data
                +        s0 = self.data_state[0]
                +        z_sp, z = inputs
                +        #
                +        # compute the output
                +        out0 = s0*k0 + z_sp*k1 + z*k2 + c0
                +        out1 = s0*q0 + z_sp*q1 + z*q2 + c1
                +        #
                +        self.data_state[0] = out0
                +        outputs = [out1]
                +        return outputs
                +
                +In the actual code there is also a sanity check which asserts that the +computed output is the very same as the one returned by Creature.run_step.

                +So, let's try to see how it performs. First, with CPython:
                $ python -m ev.main
                +Generation   1: ... [population = 500]  [7.61 secs]
                +Generation   2: ... [population = 500]  [3.96 secs]
                +Generation   3: ... [population = 500]  [3.79 secs]
                +Generation   4: ... [population = 500]  [3.74 secs]
                +Generation   5: ... [population = 500]  [3.84 secs]
                +Generation   6: ... [population = 500]  [3.69 secs]
                +
                +This looks good: 60% faster than the original CPython+numpy +implementation. Let's try on PyPy:
                Generation   1: ... [population = 500]  [0.39 secs]
                +Generation   2: ... [population = 500]  [0.10 secs]
                +Generation   3: ... [population = 500]  [0.11 secs]
                +Generation   4: ... [population = 500]  [0.09 secs]
                +Generation   5: ... [population = 500]  [0.08 secs]
                +Generation   6: ... [population = 500]  [0.12 secs]
                +Generation   7: ... [population = 500]  [0.09 secs]
                +Generation   8: ... [population = 500]  [0.08 secs]
                +Generation   9: ... [population = 500]  [0.08 secs]
                +Generation  10: ... [population = 500]  [0.08 secs]
                +Generation  11: ... [population = 500]  [0.08 secs]
                +Generation  12: ... [population = 500]  [0.07 secs]
                +Generation  13: ... [population = 500]  [0.07 secs]
                +Generation  14: ... [population = 500]  [0.08 secs]
                +Generation  15: ... [population = 500]  [0.07 secs]
                +
                +Yes, it's not an error. After a couple of generations, it stabilizes at around +~0.07-0.08 seconds per generation. This is around 80 (eighty) times faster +than the original CPython+numpy implementation, and around 35-40x faster than +the naive PyPy+numpypy one.

                +Let's look at the trace again: it no longer contains expensive calls, and +certainly no more temporary malloc() s. The core of the logic is between +lines 386-416, where we can see that it does fast C-level multiplications and +additions: float_mul and float_add are translated straight into +mulsd and addsd x86 instructions.

                +As I said before, this is a very particular example, and the techniques +described here do not always apply: it is not realistic to expect an 80x +speedup on arbitrary code, unfortunately. However, it clearly shows the potential of PyPy when +it comes to high-speed computing. And most importantly, it's not a toy +benchmark which was designed specifically to have good performance on PyPy: +it's a real world example, albeit small.

                +You might also be interested in the talk I gave at last EuroPython, in which I +talk about a similar topic: "The Joy of PyPy JIT: abstractions for free" +(abstract, slides and video).

                +

                +How to reproduce the results

                +
                $ git clone https://github.com/antocuni/evolvingcopter
                +$ cd evolvingcopter
                +$ {python,pypy} -m ev.main --no-specialized --no-numpypy
                +$ {python,pypy} -m ev.main --no-specialized
                +$ {python,pypy} -m ev.main
                +
                +
                +
                +
                +
                +
                +
                + + Unknown wrote on 2017-11-02 21:23: +
                +
                +

                Isn't this a factor 80 slowdown because of a design error? Normally, one should store all creatures in a big numpy array and evaluate run_step on all creatures at once.

                +
                +
                +
                +
                + + Unknown wrote on 2017-11-27 10:48: +
                +
                +

                I don't understand - how do you figure out that line 1232 is not cheap?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2017-11-28 09:40: +
                +
                +

                @anatoly: line 1232 is a call to descr_dot: if you look at the implementation, you see that it does lots of things including mallocs, and those we know are not cheap at all

                +
                +
                +
                +
                + + homm wrote on 2018-05-21 15:17: +
                +
                +

                Have you tried the third argument of numpy.dot, out to avoid memory alocation?

                +
                +
                +
                + +

                (Cape of) Good Hope for PyPy

                + +
                +
                +
                +
                +Hello from the other side of the world (for most of you)!

                +With the excuse of coming to PyCon ZA during the last two weeks Armin, +Ronan, Antonio and sometimes Maciek had a very nice and productive sprint in +Cape Town, as pictures show :). We would like to say a big thank you to +Kiwi.com, which sponsored part of the travel costs via its awesome Sourcelift +program to help Open Source projects.

                + + +
                Armin, Anto and Ronan at Cape Point
                +
                +Armin, Ronan and Anto spent most of the time hacking at cpyext, our CPython +C-API compatibility layer: during the last years, the focus was to make it +work and be compatible with CPython, in order to run existing libraries such +as numpy and pandas. However, we never paid too much attention to performance, +so the net result is that with the latest released version of PyPy, C +extensions generally work but their speed ranges from "slow" to "horribly +slow".

                +For example, these very simple microbenchmarks measure the speed of +calling (empty) C functions, i.e. the time you spend to "cross the border" +between RPython and C. (Note: this includes the time spent doing the loop in regular Python code.) These are the results on CPython, on PyPy 5.8, and on +our newest in-progress version:

                $ python bench.py     # CPython
                +noargs      : 0.41 secs
                +onearg(None): 0.44 secs
                +onearg(i)   : 0.44 secs
                +varargs     : 0.58 secs
                +
                +
                +
                +
                +
                $ pypy-5.8 bench.py   # PyPy 5.8
                +noargs      : 1.01 secs
                +onearg(None): 1.31 secs
                +onearg(i)   : 2.57 secs
                +varargs     : 2.79 secs
                +
                +
                +
                +
                +
                $ pypy bench.py       # cpyext-refactor-methodobject branch
                +noargs      : 0.17 secs
                +onearg(None): 0.21 secs
                +onearg(i)   : 0.22 secs
                +varargs     : 0.47 secs
                +
                +
                +
                +
                +
                
                +
                
                +So yes: before the sprint, we were ~2-6x slower than CPython. Now, we are
                +faster than it!
                +To reach this result, we did various improvements, such as:
                +
                +
                  +
                1. teach the JIT how to look (a bit) inside the cpyext module;
                2. +
                3. write specialized code for calling METH_NOARGS, METH_O and +METH_VARARGS functions; previously, we always used a very general and +slow logic;
                4. +
                5. implement freelists to allocate the cpyext versions of int and +tuple objects, as CPython does;
                6. +
                7. the cpyext-avoid-roundtrip branch: crossing the RPython/C border is +slowish, but the real problem was (and still is for many cases) we often +cross it many times for no good reason. So, depending on the actual API +call, you might end up in the C land, which calls back into the RPython +land, which goes to C, etc. etc. (ad libitum).
                8. +
                +
                +The branch tries to fix such nonsense: so far, we fixed only some cases, which +are enough to speed up the benchmarks shown above. But most importantly, we +now have a clear path and an actual plan to improve cpyext more and +more. Ideally, we would like to reach a point in which cpyext-intensive +programs run at worst at the same speed as CPython.

                +The other big topic of the sprint was Armin and Maciej doing a lot of work on the +unicode-utf8 branch: the goal of the branch is to always use UTF-8 as the +internal representation of unicode strings. The advantages are various: +
                +
                  +
                • decoding a UTF-8 stream is super fast, as you just need to check that the +stream is valid;
                • +
                • encoding to UTF-8 is almost a no-op;
                • +
                • UTF-8 is always a more compact representation than the currently +used UCS-4. It's also almost always more compact than CPython 3.5 latin1/UCS2/UCS4 combo;
                • +
                • smaller representation means everything becomes quite a bit faster due to lower cache pressure.
                • +
                +
                +Before you ask: yes, this branch contains special logic to ensure that random +access of single unicode chars is still O(1), as it is on both CPython and the +current PyPy.
                +We also plan to improve the speed of decoding even more by using modern processor features, like SSE and AVX. Preliminary results show that decoding can be done 100x faster than the current setup. +

                +In summary, this was a long and profitable sprint, in which we achieved lots +of interesting results. However, what we liked even more was the privilege of +doing commits from awesome places such as the top of Table Mountain:

                + + +
                + + +
                The panorama we looked at instead of staring at cpyext code
                +
                +
                +
                +
                + + Nickolas wrote on 2017-10-18 22:59: +
                +
                +

                It was awesome meeting you all, and I'm so stoked about the recent PyPy improvements :-D

                +
                +
                +
                +
                + + Anonymous wrote on 2017-10-19 06:31: +
                +
                +

                Fantastic news. Many Python users need to use some of these many specialized CPython-based extension modules for which there is no CFFI alternative extensively and as a result have not benefited much, or not at all, from PyPy's speed advantages. These improvements could make PyPy the default Python for many of us.

                +
                +
                +
                +
                + + Anonymous wrote on 2017-10-19 07:57: +
                +
                +

                Could you give a hint to how you're doing O(1) individual character access in UTF-8 strings? Not that I'd find such a requirement particularly necessary (might be handy for all-ASCII strings, but easy to flat those cases), but how is it done? I can figure O(log(n)) ways with up to O(n) storage overhead or O(sqrt(n)) with up to O(sqrt(n)) storage overhead, but O(1) w/o the O(n) storage overhead of having UTF-32 around?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2017-10-19 09:51: +
                +
                +

                Hi Anonymous.

                It's O(1) time with O(n) storage overhead, but the constants can be manipulated to have 10% or 25% overhead, and only if ever indexed and not ascii at that.

                +
                +
                +
                +
                + + intgr wrote on 2017-10-20 15:33: +
                +
                +

                Really excited to hear about the Unicode representation changes; it should finally make PyPy significantly faster at Unicode manipulation than CPython 3.6 is. It seems this has been bogging down PyPy's advantage at Unicode-heavy workloads like webapp template rendering.

                Even without O(1) access to characters by index, I think it's a great idea to use UTF-8 internally, since that's the prevalent encoding for input/output pretty much everywhere. Accessing Unicode characters by index is an antipattern in most situations and UCS-2/UTF-16 is becoming irrelevant.

                +
                +
                +
                +
                + + Oscar Smith wrote on 2017-10-20 16:10: +
                +
                +

                I would also be really interested in a quick blogpost at some point about how to get O(1) indexing without greater storage overhead than just using UTF-32

                +
                +
                +
                + +

                PyPy v5.9 Released, Now Supports Pandas, NumPy

                + +
                +
                +The PyPy team is proud to release both PyPy3.5 v5.9 (a beta-quality interpreter for Python +3.5 syntax) and PyPy2.7 v5.9 (an interpreter supporting +Python 2.7 syntax).

                  +
                • NumPy and Pandas now work on PyPy2.7 (together with Cython 0.27.1). Many other modules +based on C-API extensions work on PyPy as well.
                • +
                +
                  +
                • Cython 0.27.1 (released very recently) supports more projects with PyPy, both +on PyPy2.7 and PyPy3.5 beta. Note version 0.27.1 is now the minimum +version that supports this version of PyPy, due to some interactions with +updated C-API interface code.
                • +
                +
                  +
                • We optimized the JSON parser for recurring string keys, which should decrease +memory use by up to 50% and increase parsing speed by up to 15% for large JSON files +with many repeating dictionary keys (which is quite common).
                • +
                +
                  +
                • +CFFI, which is part of the PyPy release, has been updated to 1.11.1, +improving an already great package for interfacing with C. CFFI now supports +complex arguments in API mode, as well as char16_t and char32_t and has +improved support for callbacks.
                • +
                +
                  +
                • Issues in the C-API compatibility layer that appeared as excessive memory +use were cleared up and other incompatibilities were resolved. The C-API +compatibility layer does slow down code which crosses the python-c interface +often. Some fixes are in the pipeline for some of the performance issues, and we still recommend +using pure python on PyPy or interfacing via CFFI.
                • +
                +
                +Please let us know if your use case is slow, we have ideas how to make things +faster but need real-world examples (not micro-benchmarks) of problematic code.

                +Work sponsored by a Mozilla grant continues on PyPy3.5; we continue on the path to the goal of a complete python 3.5 implementation. Of course the bug fixes and performance enhancements +mentioned above are part of both PyPy2.7 and PyPy3.5 beta.

                +As always, this release fixed many other issues and bugs raised by the +growing community of PyPy users. We strongly recommend updating.

                +You can download the v5.9 releases here (note that we provide PyPy3.5 binaries for only Linux 64bit for now):

                + +
                +We would like to thank our donors and contributors, and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on PyPy, or general help +with making RPython’s JIT even better.

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 (stdlib version 2.7.13), and CPython 3.5 (stdlib version 3.5.3). It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                +We also welcome developers of other dynamic languages to see what RPython can do for them.

                +The PyPy 2.7 release supports:
                +
                +
                  +
                • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +
                +

                +What else is new?

                +
                +PyPy 5.8 was released in June, 2017.
                +
                +There are many incremental improvements to RPython and PyPy, the complete listing is here.
                +
                +Please update, and continue to help us make PyPy better.

                +Cheers, The PyPy team
                +
                +
                +
                +
                + + Anonymous wrote on 2017-10-05 19:36: +
                +
                +

                Pypy3 works very well with flask. Good Job and thanx.



                cheers Rob

                +
                +
                +
                +
                + + Carlos Vega wrote on 2017-10-08 16:56: +
                +
                +

                Good job ! 😀🎉
                It would be great if you could update https://packages.pypy.org/
                I'm going to donate again ! your work is awesome.

                +
                +
                +
                +
                + + melin wrote on 2017-11-02 13:57: +
                +
                +

                Pypy test run pands two or three times slower than pyhon

                df = sparkSession.sql("select * from test_users_dt").toPandas()
                for index, row in df.iterrows():
                result = 0

                for key in range(0, 10000000):
                event_type = row.event_type
                if key > 234:
                result = result + 1
                len(event_type + "123")

                print(result)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2017-11-02 14:00: +
                +
                +

                @melin

                We know that. We're in the process of improving that by merging various cpyext improvement branches. Stay tuned.

                +
                +
                +
                +
                + + Unknown wrote on 2017-11-20 09:14: +
                +
                +

                So this means the numpy port for pypy is redundant now right? We can use the original python numpy package?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2017-11-20 09:48: +
                +
                +

                @Eitan

                yes, but look at this FAQ for a longer explanation: https://doc.pypy.org/en/latest/faq.html#what-about-numpy-numpypy-micronumpy

                +
                +
                +
                + +

                Let's remove the Global Interpreter Lock

                + +
                +
                +

                Hello everyone

                +

                The Python community has been discussing removing the Global Interpreter Lock for +a long time. +There have been various attempts at removing it: +Jython or IronPython successfully removed it with the help of the underlying +platform, and some have yet to bear fruit, like gilectomy. Since our February sprint in Leysin, +we have experimented with the topic of GIL removal in the PyPy project. +We believe that the work done in IronPython or Jython can be reproduced with +only a bit more effort in PyPy. Compared to that, removing the GIL in CPython is a much +harder topic, since it also requires tackling the problem of multi-threaded reference +counting. See the section below for further details.

                +

                As we announced at EuroPython, what we have so far is a GIL-less PyPy +which can run very simple multi-threaded, nicely parallelized, programs. +At the moment, more complicated programs probably segfault. The +remaining 90% (and another 90%) of the work consists of putting locks in strategic +places so PyPy does not segfault during concurrent accesses to +data structures.

                +

                Since such work would complicate the PyPy code base and our day-to-day work, +we would like to judge the interest of the community and the commercial +partners to make it happen (we are not looking for individual +donations at this point). We estimate a total cost of $50k, +out of which we already have backing for about 1/3 (with a possible 1/3 +extra from the STM money, see below). This would give us a good +shot at delivering a good proof-of-concept working PyPy with no GIL. If we can get a $100k +contract, we will deliver a fully working PyPy interpreter with no GIL as a release, +possibly separate from the default PyPy release.

                +

                People asked several questions, so I'll try to answer the technical parts +here.

                +

                What would the plan entail?

                +

                We've already done the work on the Garbage Collector to allow running multi-threaded +programs in RPython. "All" that is left is adding locks on mutable +data structures everywhere in the PyPy codebase. Since it would significantly complicate +our workflow, we require real interest in that topic, backed up by +commercial contracts in order to justify the added maintenance burden.

                +

                Why did the STM effort not work out?

                +

                STM was a research project that proved that the idea is possible. However, +the amount of user effort that is required to make programs run in a +parallelizable way is significant, and we never managed to develop tools +that would help in doing so. At the moment we're not sure if more work +spent on tooling would improve the situation or if the whole idea is really doomed. +The approach also ended up adding significant overhead on single threaded programs, +so in the end it is very easy to make your programs slower. (We have some money +left in the donation pot for STM which we are not using; according to the rules, we +could declare the STM attempt failed and channel that money towards the present +GIL removal proposal.)

                +

                Wouldn't subinterpreters be a better idea?

                +

                Python is a very mutable language - there are tons of mutable state and +basic objects (classes, functions,...) that are compile-time in other +languages but runtime and fully mutable in Python. In the end, sharing +things between subinterpreters would be restricted to basic immutable +data structures, which defeats the point. Subinterpreters suffer from the same problems as +multiprocessing with no additional benefits. +We believe that reducing mutability to implement subinterpreters is not viable without seriously impacting the +semantics of the language (a conclusion which applies to many other +approaches too).

                +

                Why is it easier to do in PyPy than CPython?

                +

                Removing the GIL in CPython has two problems:

                +
                  +
                • how do we guard access to mutable data structures with locks and
                • +
                • what to do with reference counting that needs to be guarded.
                • +
                +

                PyPy only has the former problem; the latter doesn't exist, +due to a different garbage collector approach. Of course the first problem +is a mess too, but at least we are already half-way there. Compared to Jython +or IronPython, PyPy lacks some data structures that are provided by JVM or .NET, +which we would need to implement, hence the problem is a little harder +than on an existing multithreaded platform. However, there is good research +and we know how that problem can be solved.

                +

                Best regards,
                +Maciej Fijalkowski

                +
                +
                +
                +
                +
                +
                + + Patrick wrote on 2017-08-14 18:03: +
                +
                +

                Where can one donate? Is there a specific page for it? :)

                +
                +
                +
                +
                + + Anonymous wrote on 2017-08-14 20:12: +
                +
                +

                Where can we we donate or forward a link to managing directors for corporate donations?

                +
                +
                +
                +
                + + funny_falcon wrote on 2017-08-14 21:29: +
                +
                +

                Neither .Net, nor Java put locks around every mutable access. Why the hell PyPy should?

                +
                +
                +
                +
                + + Unknown wrote on 2017-08-15 00:29: +
                +
                +

                It sounds to me like you are just looking for money to spend. I see no reliable or commercial deliverable coming out of this effort (you listed a bucketload of caveats already). If it were doable in $100k, it would have been done long ago, no? Caveat Emptor to those who toss their money at this.

                +
                +
                +
                +
                + + Unknown wrote on 2017-08-15 06:05: +
                +
                +

                200+ comments about this article are at: https://news.ycombinator.com/item?id=15008636

                +
                +
                +
                +
                + + Zunzster wrote on 2017-08-15 06:20: +
                +
                +

                @funny_falcon: I don't read this as them arguing for putting "putting locks around *every* mutable access". Rather, just the core shared-mutable pieces of the run-time library and infrastructure, which in .NET and the JVM are provided by the VM itself for Jython and IronPython but which PyPy has to implement.

                @scott_taggart: Your vision seems limited. Perhaps you aren't familiar with the PyPy team's strong history of delivering. It may well be 'doable in $100K' but how is that supposed to have spontaneously happened already without a viable plan and a trusted team which is exactly what the PyPy project is?

                I always thought the STM concept was really clever and elegant in theory but that the overhead involved, both in recording and rollback-retries, could impact forward progress too much to be viable in practice. Essentially, STM and locks are dual's of each other, with STM having better composition and locks less overhead.

                At least with a more traditional locking approach, the locks are still being inserted by the interpreter/library, so they can be reasoned about more carefully (and even instrumented programmatically) to avoid some of the classic problems with lock-based designs whilst regaining the performance lost to STM overhead.

                If anyone can pull it off, the PyPy team can :-)

                +
                +
                +
                +
                + + Unknown wrote on 2017-08-15 08:31: +
                +
                +

                +1

                +
                +
                +
                +
                + + Unknown wrote on 2017-08-15 09:19: +
                +
                +

                Why not rather implement immutable datastructures like Clojure does?

                +
                +
                +
                +
                + + Anonymous wrote on 2017-08-15 12:42: +
                +
                +

                Oh, just shut up and take my money.

                +
                +
                +
                +
                + + Anonymous wrote on 2017-08-15 14:10: +
                +
                +

                I have been very impressed with the PyPy developers accomplishments to date and sincerely hope that they find corporate sponsors for this worthwhile endeavor.

                +
                +
                +
                +
                + + Unknown wrote on 2017-08-15 20:23: +
                +
                +

                How can people donate? $50k seems a bargain for such an important achievement. That's pocket change to most moderately sized companies.

                +
                +
                +
                +
                + + Joce wrote on 2017-08-16 05:17: +
                +
                +

                Sounds good, perhaps time to mark the STM effort as stale?

                +
                +
                +
                +
                + + Unknown wrote on 2017-09-13 23:22: +
                +
                +

                This would be awesome, please. :(

                +
                +
                +
                +
                + + PvdE wrote on 2017-10-04 06:59: +
                +
                +

                I donated to the original STM and would be happy if it were reallocated to this.

                +
                +
                +
                + +

                Binary wheels for PyPy

                + +
                +

                Hi,

                +this is a short blog post, just to announce the existence of this Github repository, which contains binary PyPy wheels for some selected packages. The availability of binary wheels means that you can install the packages much more quickly, without having to wait for compilation.

                +
                +
                +
                +At the moment of writing, these packages are available:

                  +
                • numpy
                • +
                • scipy
                • +
                • pandas
                • +
                • psutil
                • +
                • netifaces
                • +
                +
                +For now, we provide only wheels built on Ubuntu, compiled for PyPy 5.8.
                +In particular, it is worth noting that they are not manylinux1 wheels, which means they might not work on other Linux distributions. For more information, see the explanation in the README of the above repo.

                +Moreover, the existence of the wheels does not guarantee that they work correctly 100% of the time. They still depend on cpyext, our C-API emulation layer, which is still work-in-progress, although it has become better and better during the last months. Again, the wheels are there only to save compilation time.

                +To install a package from the wheel repository, you can invoke pip like this:

                $ pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy
                +
                +
                +
                +Happy installing!
                +
                +
                +
                +
                + + Unknown wrote on 2017-07-27 11:16: +
                +
                +

                Very nice. The main reason I can't actively recommend PyPy to others is that I would have to help them install all packages, where for CPython I can just say "conda install foo". Working on efforts like this is extremely useful for the community.

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2017-10-02 08:56: +
                +
                +

                Speaking of which if those were conda packages, that would make it much easier for me. And if pytables and pyyaml worked in pypy (a few years ago they did not and I have no idea what is their current state) and were packaged too, I could finally try pypy on my real projects, and not just toy examples.

                +
                +
                +
                + +

                PyPy v5.8 released

                + +
                +
                +The PyPy team is proud to release both PyPy2.7 v5.8 (an interpreter supporting +Python 2.7 syntax), and a beta-quality PyPy3.5 v5.8 (an interpreter for Python +3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release. Note that PyPy3.5 supports Linux 64bit only for now.

                +This new PyPy2.7 release includes the upstream stdlib version 2.7.13, and +PyPy3.5 includes the upstream stdlib version 3.5.3.

                +We fixed critical bugs in the shadowstack rootfinder garbage collector +strategy that crashed multithreaded programs and very rarely showed up +even in single threaded programs.

                +We added native PyPy support to profile frames in the vmprof statistical +profiler.

                +The struct module functions pack* and unpack* are now much faster, +especially on raw buffers and bytearrays. Microbenchmarks show a 2x to 10x +speedup. Thanks to Gambit Research for sponsoring this work.

                +This release adds (but disables by default) link-time optimization and +profile guided optimization of the base interpreter, which may make +unjitted code run faster. To use these, translate with appropriate +options. Be aware of issues with gcc toolchains, though.

                +Please let us know if your use case is slow, we have ideas how to make things +faster but need real-world examples (not micro-benchmarks) of problematic code.

                +Work sponsored by a Mozilla grant continues on PyPy3.5; numerous fixes from +CPython were ported to PyPy and PEP 489 was fully implemented. Of course the +bug fixes and performance enhancements mentioned above are part of both PyPy +2.7 and PyPy 3.5.

                CFFI, which is part of the PyPy release, has been updated to an unreleased 1.10.1, +improving an already great package for interfacing with C.

                +Anyone using NumPy 1.13.0 must upgrade PyPy to this release since we implemented some previously missing C-API functionality. Many other c-extension modules now work with PyPy; let us know if yours does not.

                +As always, this release fixed many issues and bugs raised by the +growing community of PyPy users. We strongly recommend updating.

                +You can download the v5.8 release here:
                + +
                +We would like to thank our donors and contributors, and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on PyPy, or general help +with making RPython’s JIT even better.

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                +We also welcome developers of other dynamic languages to see what RPython can do for them.
                +The PyPy 2.7 release supports:
                +
                +
                  +
                • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +
                +

                +What else is new?

                +
                +PyPy 5.7 was released in March, 2017.
                +
                +There are many incremental improvements to RPython and PyPy, the complete listing is here. +
                +
                +Please update, and continue to help us make PyPy better.

                +Cheers, The PyPy team

                +
                +
                +
                +
                +
                + + Unknown wrote on 2017-06-09 12:11: +
                +
                +

                Great news! Thank you!

                +
                +
                +
                +
                + + Albert Le Blanc wrote on 2017-06-09 12:37: +
                +
                +

                Can we get a comprehensive update on Numpypy? It has gone really quiet since the days when Alex Gaynor used to talk at Pycon etc about the work which has been going on since what 2010/11? The repo has issues that are not looked at. I would really like an honest appraisal of what was learned in the Numpypy project and what is the future of Numpy (Scipy too) & PyPy because the situation for developers like myself is that we're caught between a rock and a hard place. PyPy consistently allows us to write code and explore algorithms in Python!! Whereas CPython forces you into C/Cython continually. PyPy is a great dream in my heart. What you guys are doing - allowing me to write Python and it be fast. What other language forces you so much to write in another language when performance is a consideration? The speed difference between Node.js and Python 3 is laughable. PyPy for the win!!!!

                But....and it's a big but I am one of those devs who extensively is addicted to numeric arrays, not because I'm a 'quant' or an astronomer or rocket scientist but because Numpy arrays are simply better for many solutions than Python's other data structures. And once leveraged, giving that up to go to PyPy is impossible. It forces you to choose between numpy + slower python (CPython) or slower Numpy and faster python (PyPy).

                Numpypy was a great dream, the best of both. But it seems to have failed, proven to be too difficult or does it simply need more money? I would appreciate a public update (if one exists, please link to it). Because the sadness for me is that a genuinely fast Python runtime will never be usable until the Numpy/Scipy world works and you get the fast python and as fast numpy.

                I would really like to help, raise money whatever but maybe I'm out of the loop and the plan has changed?

                +
                +
                +
                +
                + + Johny JKJK wrote on 2017-06-09 12:38: +
                +
                +

                Is it possible to resurrect pypy uwsgi integration for pypy3.5?

                +
                +
                +
                +
                + + mattip wrote on 2017-06-10 18:37: +
                +
                +

                Hi Albert. We have decided that a better route is to use upstream NumPy for compatibility. We are a small group, and reimplementing all of the c code in NumPy for Numpypy would be a never ending, close to impossible task.

                However, we do have a different long-term plan to combine numpy and python. Since our c-api emulation layer is slow, perhaps we can "hijack" the most common python calls that cross that emulation border and make them fast. This would utilize much of NumPyPy but would mean that only a subset of the extensive NumPy library would need to be implemented and maintained. We have a branch that demonstrates a proof-of-concept for simple item access (ctypedef struct). Help on the PyPy project is always welcome, come to #pypy on IRC and we can discuss it further.

                +
                +
                +
                +
                + + v3ss wrote on 2017-06-27 18:51: +
                +
                +

                Regarding Beta Status on PyPy 5.8 3.5.x , What are the main missing points?
                What are the current know issues for PyPy 5.8-3.5.x ?

                +
                +
                +
                + +

                PyPy 5.7.1 bugfix released

                + +
                +
                +We have released a bugfix PyPy2.7-v5.7.1 and PyPy3.5-v5.7.1 beta (Linux 64bit), +due to the following issues:
                +
                +
                  +
                • correctly handle an edge case in dict.pop (issue 2508)
                • +
                • fix a regression to correctly handle multiple inheritance in a C-API type +where the second base is an app-level class with a __new__ function
                • +
                • fix a regression to fill a C-API type’s tp_getattr slot from a +__getattr__ method (issue 2523)
                • +
                +
                +
                +Thanks to those who reported issues and helped test out the fixes

                +You can download the v5.7.1 release here:
                + +
                +

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                +We also welcome developers of other dynamic languages to see what RPython can do for them.
                +The PyPy 2.7 release supports:
                +
                +
                  +
                • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +
                +Please update, and continue to help us make PyPy better.

                +Cheers, The PyPy team

                +
                +
                +
                +
                +
                + + Anonymous wrote on 2017-04-04 14:20: +
                +
                +

                any chance for a Mac OS X PyPy3 distribution?
                compilation from sources fails …

                thanks for the great work by the way !

                +
                +
                +
                +
                + + aiguy wrote on 2017-05-15 11:24: +
                +
                +

                Tried looking for a pypy wishlist but couldn't find one. So hopefully somebody reads comments.

                My three biggest pypy wishes are for...

                1. faster csv file reading by replacing Python library code with compiled C code which I understand from 4 years ago is still slower than cPython so is still on the todo list.

                2. Update SQLite to latest version in pypy distribution since they have made some great speed enhancements in recent releases.

                3. Create an containerized downloadable Docker distribution for PyPy which allows for easy deployment of PyPy projects to other machines. platforms and thumbs drives. This would also allow easier setup of multiple PyPy microservices and encapsulation of multiple pypy environments on the same machine.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2017-05-15 16:40: +
                +
                +

                @aiguy: csv is written in C already nowadays. Please report an issue with reproducible examples if you find that PyPy is still a lot slower than CPython at reading large-ish csv files.

                For SQLite, I guess you're talking about Windows. We have plans to update it at some point.

                For Docker, that's outside the scope of the PyPy team and should be done (or is done already?) by other people.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2017-05-15 18:24: +
                +
                +

                There are maintained Docker files here, IIRC: https://hub.docker.com/_/pypy/

                +
                +
                +
                + +

                Native profiling in VMProf

                + +
                +

                We are happy to announce a new release for the PyPI package vmprof.
                +It is now able to capture native stack frames on Linux and Mac OS X to show you bottlenecks in compiled code (such as CFFI modules, Cython or C Python extensions). It supports PyPy, CPython versions 2.7, 3.4, 3.5 and 3.6. Special thanks to Jetbrains for funding the native profiling support.

                +
                +vmprof logo +
                +
                +
                +
                What is vmprof?

                If you have already worked with vmprof you can skip the next two sections. If not, here is a short introduction:

                The goal of vmprof package is to give you more insight into your program. It is a statistical profiler. Another prominent profiler you might already have worked with is cProfile. It is bundled with the Python standard library.

                vmprof's distinct feature (from most other profilers) is that it does not significantly slow down your program execution. The employed strategy is statistical, rather than deterministic. Not every function call is intercepted, but it samples stack traces and memory usage at a configured sample rate (usually around 100hz). You can imagine that this creates a lot less contention than doing work before and after each function call.

                As mentioned earlier cProfile gives you a complete profile, but it needs to intercept every function call (it is a deterministic profiler). Usually this means that you have to capture and record every function call, but this takes a significant amount of time.

                The overhead vmprof consumes is roughly 3-4% of your total program runtime or even less if you reduce the sampling frequency. Indeed it lets you sample and inspect much larger programs. If you failed to profile a large application with cProfile, please give vmprof a shot.

                vmprof.com or PyCharm

                +
                +There are two major alternatives to the command-line tools shipped with vmprof:
                +
                  +
                • A web service on vmprof.com +
                • +
                • PyCharm Professional Edition
                • +
                +
                +While the command line tool is only good for quick inspections, vmprof.com + and PyCharm complement each other providing deeper insight into your +program. With PyCharm you can view the per-line profiling results inside + the editor. With vmprof.com you get a handy visualization of the profiling results as a flame chart and memory usage graph.
                +
                +
                +
                +
                +
                +
                +Since the PyPy Team runs and maintains the service on vmprof.com (which is by the way free and open-source), I’ll explain some more details here. On vmprof.com you can inspect the generated profile interactively instead of looking at console output. What is sent to vmprof.com? You can find details here.
                +
                +
                Flamegraph: Accumulates and displays the most frequent codepaths. It allows you to quickly and accurately identify hot spots in your code. The flame graph below is a very short run of richards.py (Thus it shows a lot of time spent in PyPy's JIT compiler).

                + +
                +

                List all functions (optionally sorted): the equivalent of the vmprof command line output in the web.

                + +
                +
                Memory curve: A line plot that shows how many MBytes have been consumed over the lifetime of your program (see more info in the section below).

                + +
                +Native programs

                The new feature introduced in vmprof 0.4.x allows you to look beyond the Python level. As you might know, Python maintains a stack of frames to save the execution. Up to now the vmprof profiles only contained that level of information. But what if your program jumps to native code (such as calling gzip compression on a large file)? Up to now you would not see that information.

                +Many packages make use of the CPython C API (which we discourage; please look up cffi for a better way to call C). Have you ever had the issue that you know that your performance problems reach down into native code, but you could not profile it properly? Now you can!

                Let's inspect a very simple Python program to find out why a program is significantly slower on Linux than on Mac:

                import numpy as np
                +n = 1000
                +a = np.random.random((n, n))
                +b = np.random.random((n, n))
                +c = np.dot(np.abs(a), b)



                +Take two NxN random matrix objects and create a dot product. The first argument to the dot product provides the absolute value of the random matrix.

                + + + + + + + + + + + + + + + + + + + + + + +
                RunPythonNumPyOSn=... Took
                [1]CPython 3.5.2NumPy 1.12.1Mac OS X, 10.12.3n=5000~9 sec
                [2]CPython 3.6.0NumPy 1.12.1Linux 64, Kernel 4.9.14n=1000~26 sec
                +
                +Note that the Linux machine operates on a 5 times smaller matrix, still it takes much longer. What is wrong? Is Linux slow? CPython 3.6.0? Well no, let's inspect [1] and [2] (shown below in that order).
                + +
                +
                + +
                +
                [2] runs on Linux, spends nearly all of the time in PyArray_MatrixProduct2, if you compare to [1] on Mac OS X, you'll see that a lot of time is spent in generating the random numbers and the rest in cblas_matrixproduct.

                +Blas has a very efficient implementation so you can achieve the same on Linux if you install a blas implementation (such as openblas).

                +Usually you can spot potential program source locations that take a lot of time and might be the first starting point to resolve performance issues.

                Beyond Python programs

                +It is not unthinkable that the strategy can be reused for native programs. Indeed this can already be done by creating a small cffi wrapper around an entry point of a compiled C program. It would even work for programs compiled from other languages (e.g. C++ or Fortran). The resulting function names are the full symbol name embedded into either the executable symboltable or extracted from the dwarf debugging information. Most of those will be compiler specific and contain some cryptic information.

                Memory profiling
                +We thankfully received a code contribution from the company Blue Yonder. They have built a memory profiler (for Linux and Mac OS X) on top of vmprof.com that displays the memory consumption for the runtime of your process.

                +You can run it the following way:

                $ python -m vmprof --mem --web script.py

                +By adding --mem, vmprof will capture memory information and display it in the dedicated view on vmprof.com. You can open that view by clicking the 'Memory' switch in the flamegraph view.

                There is more

                +Some more minor highlights contained in 0.4.x:
                  +
                • VMProf support for Windows 64 bit (No native profiling)
                • +
                • VMProf can read profiles generated by another host system
                • +
                • VMProf is now bundled in several binary wheel for fast and easy installation (Mac OS X, Linux 32/64 for CPython 2.7, 3.4, 3.5, 3.6)
                • +
                +Future plans - Profile Streaming

                +vmprof has not reached the end of development. There are many features we could implement. But there is one feature that could be a great asset to many Python developers.

                +Continuous delivery of your statistical profile, or in short, profile streaming. One of the great strengths of vmprof is that it consumes very little overhead. It is not a crazy idea to run this in production.

                +It would require a smart way to stream the profile in the background to vmprof.com and new visualizations to look at much more data your Python service produces.

                +If that sounds like a solid vmprof improvement, don't hesitate to get in touch with us (e.g. IRC #pypy, mailing list pypy-dev, or comment below)

                You can help!

                +There are some immediate things other people could help with. Either by donating time or money (yes we have occasional contributors which is great)!
                  +
                • We gladly received a code contribution for the memory profiler. But there was not enough time to finish the migration completely. Sadly it is a bit brittle right now.
                • +
                • We would like to spend more time on other visualizations. This should include to give a much better user experience on vmprof.com (like a tutorial that explains the visualization that we already have). 
                • +
                • Build Windows 32/64 bit wheels (for all CPython versions we currently support)
                • +
                +We are also happy to accept google summer of code projects on vmprof for new visualizations and other improvements. If you qualify and are interested, don't hesitate to ask!

                +Richard Plangger (plan_rich) and the PyPy Team

                +[1] Mac OS X https://vmprof.com/#/567aa150-5927-4867-b22d-dbb67ac824ac
                +[2] Linux64 https://vmprof.com/#/097fded2-b350-4d68-ae93-7956cd10150c +
                +

                PyPy2.7 and PyPy3.5 v5.7 - two in one release

                + +
                +
                +
                +The PyPy team is proud to release both PyPy2.7 v5.7 (an interpreter supporting +Python v2.7 syntax), and a beta-quality PyPy3.5 v5.7 (an interpreter for Python +v3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release. Note that PyPy3.5 only supports Linux 64bit for now.

                +This new PyPy2.7 release includes the upstream stdlib version 2.7.13, and PyPy3.5 (our first in the 3.5 series) includes the upstream stdlib version 3.5.3.

                +We continue to make incremental improvements to our C-API compatibility layer (cpyext). PyPy2 can now import and run many C-extension packages, among the most notable are Numpy, Cython, and Pandas. Performance may be slower than CPython, especially for frequently-called short C functions. Please let us know if your use case is slow, we have ideas how to make things faster but need real-world examples (not micro-benchmarks) of problematic code.

                +Work proceeds at a good pace on the PyPy3.5 version due to a grant from the Mozilla Foundation, hence our first 3.5.3 beta release. Thanks Mozilla !!! While we do not pass all tests yet, asyncio works and as these benchmarks show it already gives a nice speed bump. We also backported the f"" formatting from 3.6 (as an exception; otherwise “PyPy3.5” supports the Python 3.5 language).

                CFFI has been updated to 1.10, improving an already great package for interfacing with C.

                +We now use shadowstack as our default gcrootfinder even on Linux. The alternative, asmgcc, will be deprecated at some future point. While about 3% slower, shadowstack is much more easily maintained and debuggable. Also, the performance of shadowstack has been improved in general: this should close the speed gap between other platforms and Linux.

                +As always, this release fixed many issues and bugs raised by the growing community of PyPy users. We strongly recommend updating.

                +You can download the v5.7 release here:
                + +
                +We would like to thank our donors for the continued support of the PyPy project.
                +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.
                +

                +

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                +We also welcome developers of other dynamic languages to see what RPython can do for them.
                +The PyPy 2.7 release supports:
                +
                +
                  +
                • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +
                +

                +

                +What else is new?

                +
                +(since the releases of PyPy 2.7 and 3.3 at the end of 2016)
                +
                +There are many incremental improvements to RPython and PyPy, the complete listing is here. +
                +
                +Please update, and continue to help us make PyPy better.

                +Cheers, The PyPy team

                +
                +
                +
                +
                +
                +
                +
                + + Sergei wrote on 2017-03-21 10:33: +
                +
                +

                Awesome! Thanks, guys.

                +
                +
                +
                +
                + + Baczek wrote on 2017-03-21 11:19: +
                +
                +

                > We also backported the f"" formatting from 3.6 (as an exception; otherwise “PyPy3.5” supports the Python 3.5 language).

                Could you also support just the syntax part of variable type declarations? It'll make using mypy that much nicer.

                +
                +
                +
                +
                + + Unknown wrote on 2017-03-21 12:16: +
                +
                +

                Awesome! Thanks a lot!

                +
                +
                +
                +
                + + Mike wrote on 2017-03-23 10:06: +
                +
                +

                Hello.
                Thanks for pypy!
                I have a question: Is there any big company who using pypy in production?
                Thanks

                +
                +
                +
                +
                + + Canesin wrote on 2017-03-23 14:40: +
                +
                +

                Great work as usual! Is there any plan to benefit from programs using PEP 484 syntax ?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2017-03-23 16:16: +
                +
                +

                @Canesin: benefit for performance? No. The PEP itself says "Using type hints for performance optimizations is left as an exercise for the reader". But that's a misleading comment. There is no useful optimization that we can apply from the knowledge "argument 1 is an int", because that could also be an arbitrarily-large integer and/or an instance of a subclass of int. And if it really turns out to be almost always a regular machine-sized integer, then PyPy's JIT will figure it out by itself. PEP 484 is totally pointless for performance. (It is probably useful for other reasons outside the scope of this comment.)

                +
                +
                +
                +
                + + Miro Hrončok wrote on 2017-03-29 17:13: +
                +
                +

                Excellent news! Is PyPy3 support for 32bit Linux planned? Thanks for info.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2017-04-01 08:13: +
                +
                +

                Miro: yes, we plan to have support for the same set of platforms. The various Posix platforms are not too much work, and Windows will follow, too.

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2017-04-02 09:44: +
                +
                +

                Is there anybody working on win64? It is a bit frustrating to see pypy maturing quickly to the point that I could probably use it soon in production... if only it worked on win64.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2017-04-02 09:48: +
                +
                +

                Gaëtan: no. We need either outside contributions or, more likely, money to make it happen. Just like what occurred with Mozilla for Python 3.

                +
                +
                +
                + +

                Leysin Winter Sprint Summary

                + +
                + Today + is the last day of our yearly sprint event in Leysin. We had lots of +ideas on how to enhance the current state of PyPy, we went skiing and +had interesting discussions around virtual machines, the Python +ecosystem, and other real world problems.
                +  +
                +
                +

                +Why don't you join us next time? +

                +
                +
                +A usual PyPy sprints day goes through the following stages: +
                +
                +
                +
                +
                +
                  +
                1.  Planning Session: Tasks from previous days that have seen progress or +are completed are noted in a shared document. Everyone adds new tasks +and then assigns themselves to one or more tasks (usually in pairs). As +soon as everybody is happy with their task and has a partner to work +with, the planning session is concluded and the work can start.
                2. +
                3. +Discussions: A sprint is a good occasion to discuss difficult +and important topics in person. We usually sit down in a separate area +in the sprint room and discuss until a) nobody wants to discuss anymore +or b) we found a solution to the problem. The good thing is that usually +the outcome is b). +
                4. +
                5. +Lunch: For lunch we prepare sandwiches and other finger food. +
                6. +
                7. +Continue working until dinner, which we eat at a random restaurant in Leysin. +
                8. +
                9. +Goto 1 the next day, if sprint has not ended.
                10. +
                +
                +
                +Sprints + are open to everybody and help newcomers to get started with PyPy (we usually + pair you with a developer familiar with PyPy). They are perfect to +discuss and find solutions to problems we currently face. If you are +eager to join next year, please don't hesitate to register next year +around January. +
                +
                +  +
                +
                +

                +Sprint Summary    +

                +Sprint goals included to work on the following topics: +
                +
                +
                  +
                • Work towards releasing PyPy 3.5 (it will be released soon)
                • +
                • +CPython Extension (CPyExt) modules on PyPy +
                • +
                • Have fun in winter sports (a side goal)
                • +
                +
                +
                +

                +Highlights +

                +

                +

                +

                + +

                +
                +
                +
                  +
                • +We have spent lots of time debugging and fixing memory issues on CPyExt. + In particular, we fixed a serious memory leak where taking a memoryview + would prevent numpy arrays from ever being freed. More work is still required to ensure that our GC always releases arrays in a timely +manner. +
                • +
                • +Fruitful discussions and progress about how to flesh out some details about the unicode representation in PyPy. Our current goal is to use utf-8 as the unicode representation internally and have fast vectorized operations (indexing, check if valid, ...). +
                • +
                • +PyPy will participate in GSoC 2017 and we will try to allocate more resources to that than last year. +
                • +
                • +Profile and think about some details how to reduce the starting size of the interpreter. The starting point would be to look at the parser and reduce the amount of strings to keep alive. +
                • +
                • Found a topic for a student's master thesis: correctly freeing cpyext reference cycles.
                • +
                • Run lots of Python3 code on top of PyPy3 and resolve issues we found along the way.
                • +
                • +Initial work on making RPython thread-safe without a GIL. +
                • +
                +
                +
                +

                +List of attendees +

                +
                +
                +- Stefan Beyer +
                +
                +- Antonio Cuni +
                +
                +- Maciej Fijalkowski +
                +
                +- Manuel Jacob +
                +
                +- Ronan Lamy +
                +
                +- Remi Meier +
                +
                +- Richard Plangger +
                +
                +- Armin Rigo +
                +
                +- Robert Zaremba +
                +
                +  +
                +
                +  +
                +
                +
                +
                +
                +
                + +
                +
                +
                +
                +
                +We + would like to thank our donors for the continued support of the PyPy +project and we are looking forward to next year's sprint in Leysin. +
                +
                +
                +
                +
                +
                +The PyPy Team +
                +
                +
                +
                +



                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-38.html b/blog/index-38.html new file mode 100644 index 000000000..297a9debd --- /dev/null +++ b/blog/index-38.html @@ -0,0 +1,2444 @@ + + + + + + +PyPy (old posts, page 38) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                Inside cpyext: Why emulating CPython C API is so Hard

                + +
                +
                +cpyext is PyPy's subsystem which provides a compatibility +layer to compile and run CPython C extensions inside PyPy. Often people ask +why a particular C extension doesn't work or is very slow on PyPy. +Usually it is hard to answer without going into technical details. The goal of +this blog post is to explain some of these technical details, so that we can +simply link here instead of explaining again and again :).
                +From a 10,000-foot view, cpyext is PyPy's version of "Python.h". Every time +you compile an extension which uses that header file, you are using cpyext. +This includes extensions explicitly written in C (such as numpy) and +extensions which are generated from other compilers/preprocessors +(e.g. Cython).
                +At the time of writing, the current status is that most C extensions "just +work". Generally speaking, you can simply pip install them, +provided they use the public, official C API instead of poking at private +implementation details. However, the performance of cpyext is generally +poor. A Python program which makes heavy use of cpyext extensions +is likely to be slower on PyPy than on CPython.
                +Note: in this blog post we are talking about Python 2.7 because it is still +the default version of PyPy: however most of the implementation of cpyext is +shared with PyPy3, so everything applies to that as well.
                +

                +C API Overview

                +In CPython, which is written in C, Python objects are represented as PyObject*, +i.e. (mostly) opaque pointers to some common "base struct".
                +CPython uses a very simple memory management scheme: when you create an +object, you allocate a block of memory of the appropriate size on the heap. +Depending on the details, you might end up calling different allocators, but +for the sake of simplicity, you can think that this ends up being a call to +malloc(). The resulting block of memory is initialized and cast to +PyObject*: this address never changes during the object lifetime, and the +C code can freely pass it around, store it inside containers, retrieve it +later, etc.
                +Memory is managed using reference counting. When you create a new reference to +an object, or you discard a reference you own, you have to increment or +decrement the reference counter accordingly. When the reference counter goes to +0, it means that the object is no longer used and can safely be +destroyed. Again, we can simplify and say that this results in a call to +free(), which finally releases the memory which was allocated by malloc().
                +Generally speaking, the only way to operate on a PyObject* is to call the +appropriate API functions. For example, to convert a given PyObject* to a C +integer, you can use PyInt_AsLong(); to add two objects together, you can +call PyNumber_Add().
                +Internally, PyPy uses a similar approach. All Python objects are subclasses of +the RPython W_Root class, and they are operated by calling methods on the +space singleton, which represents the interpreter.
                +At first, it looks very easy to write a compatibility layer: just make +PyObject* an alias for W_Root, and write simple RPython functions +(which will be translated to C by the RPython compiler) which call the +space accordingly:
                def PyInt_AsLong(space, o):
                +    return space.int_w(o)
                +
                +def PyNumber_Add(space, o1, o2):
                +    return space.add(o1, o2)
                +
                +Actually, the code above is not too far from the real +implementation. However, there are tons of gory details which make it much +harder than it looks, and much slower unless you pay a lot of attention +to performance.
                +
                +

                +The PyPy GC

                +To understand some of cpyext's challenges, you need to have at least a rough +idea of how the PyPy GC works.
                +Contrary to popular belief, the "Garbage Collector" is not only about +collecting garbage: instead, it is generally responsible for all memory +management, including allocation and deallocation.
                +Whereas CPython uses a combination of malloc/free/refcounting to manage +memory, the PyPy GC uses a completely different approach. It is designed +assuming that a dynamic language like Python behaves the following way:
                +
                  +
                • You create, either directly or indirectly, lots of objects.
                • +
                • Most of these objects are temporary and very short-lived. Think e.g. of +doing a + b + c: you need to allocate an object to hold the temporary +result of a + b, then it dies very quickly because you no longer need it +when you do the final + c part.
                • +
                • Only a small fraction of the objects survive and stay around for a while.
                • +
                +
                +So, the strategy is: make allocation as fast as possible; make deallocation of +short-lived objects as fast as possible; find a way to handle the remaining +small set of objects which actually survive long enough to be important.
                +This is done using a Generational GC: the basic idea is the following:
                +
                  +
                1. We have a nursery, where we allocate "young objects" very quickly.
                2. +
                3. When the nursery is full, we start what we call a "minor collection".
                    +
                  • We do a quick scan to determine the small set of objects which survived so +far
                  • +
                  • We move these objects out of the nursery, and we place them in the +area of memory which contains the "old objects". Since the address of the +objects changes, we fix all the references to them accordingly.
                  • +
                  +
                4. +
                +
                  +
                1. now the nursery contains only objects which "died young". We can +discard all of them very quickly, reset the nursery, and use the same area +of memory to allocate new objects from now.
                2. +
                +
                +In practice, this scheme works very well and it is one of the reasons why PyPy +is much faster than CPython. However, careful readers have surely noticed +that this is a problem for cpyext. On one hand, we have PyPy objects which +can potentially move and change their underlying memory address; on the other +hand, we need a way to represent them as fixed-address PyObject* when we +pass them to C extensions. We surely need a way to handle that.
                +
                +

                +PyObject* in PyPy

                +Another challenge is that sometimes, PyObject* structs are not completely +opaque: there are parts of the public API which expose to the user specific +fields of some concrete C struct. For example the definition of PyTypeObject +which exposes many of the tp_* slots to the user. +Since the low-level layout of PyPy W_Root objects is completely different +than the one used by CPython, we cannot simply pass RPython objects to C; we +need a way to handle the difference.
                +So, we have two issues so far: objects can move, and incompatible +low-level layouts. cpyext solves both by decoupling the RPython and the C +representations. We have two "views" of the same entity, depending on whether +we are in the PyPy world (the movable W_Root subclass) or in the C world +(the non-movable PyObject*).
                PyObject* are created lazily, only when they are actually needed. The +vast majority of PyPy objects are never passed to any C extension, so we don't +pay any penalty in that case. However, the first time we pass a W_Root to +C, we allocate and initialize its PyObject* counterpart.
                +The same idea applies also to objects which are created in C, e.g. by calling +PyObject_New(). At first, only the PyObject* exists and it is +exclusively managed by reference counting. As soon as we pass it to the PyPy +world (e.g. as a return value of a function call), we create its W_Root +counterpart, which is managed by the GC as usual.
                +Here we start to see why calling cpyext modules is more costly in PyPy than in +CPython. We need to pay some penalty for all the conversions between +W_Root and PyObject*.
                +Moreover, the first time we pass a W_Root to C we also need to allocate +the memory for the PyObject* using a slowish "CPython-style" memory +allocator. In practice, for all the objects which are passed to C we pay more +or less the same costs as CPython, thus effectively "undoing" the speedup +guaranteed by PyPy's Generational GC under normal circumstances.
                + +
                +

                +Crossing the border between RPython and C

                +There are two other things we need to care about whenever we cross the border +between RPython and C, and vice-versa: exception handling and the GIL.
                +In the C API, exceptions are raised by calling PyErr_SetString() (or one of +many other functions which have a similar effect), which basically works by +creating an exception value and storing it in some global variable. The +function then signals that an exception has occurred by returning an error value, +usually NULL.
                +On the other hand, in the PyPy interpreter, exceptions are propagated by raising the +RPython-level OperationError exception, which wraps the actual app-level +exception values. To harmonize the two worlds, whenever we return from C to +RPython, we need to check whether a C API exception was raised and if so turn it +into an OperationError.
                +We won't dig into details of how the GIL is handled in cpyext. +For the purpose of this post, it is enough to know that whenever we enter +C land, we store the current thread id into a global variable which is +accessible also from C; conversely, whenever we go back from C to RPython, we +restore this value to 0.
                +Similarly, we need to do the inverse operations whenever we need to cross the +border between C and RPython, e.g. by calling a Python callback from C code.
                +All this complexity is automatically handled by the RPython function +generic_cpy_call. If you look at the code you see that it takes care of 4 +things:
                +
                  +
                1. Handling the GIL as explained above.
                2. +
                3. Handling exceptions, if they are raised.
                4. +
                5. Converting arguments from W_Root to PyObject*.
                6. +
                7. Converting the return value from PyObject* to W_Root.
                8. +
                +
                +So, we can see that calling C from RPython introduces some overhead. +Can we measure it?
                +Assuming that the conversion between W_Root and PyObject* has a +reasonable cost (as explained by the previous section), the overhead +introduced by a single border-cross is still acceptable, especially if the +callee is doing some non-negligible amount of work.
                +However this is not always the case. There are basically three problems that +make (or used to make) cpyext super slow:
                +
                  +
                1. Paying the border-crossing cost for trivial operations which are called +very often, such as Py_INCREF.
                2. +
                3. Crossing the border back and forth many times, even if it's not strictly +needed.
                4. +
                5. Paying an excessive cost for argument and return value conversions.
                6. +
                +
                +The next sections explain in more detail each of these problems.
                +
                +

                +Avoiding unnecessary roundtrips

                +Prior to the 2017 Cape Town Sprint, cpyext was horribly slow, and we were +well aware of it: the main reason was that we never really paid too much +attention to performance. As explained in the blog post, emulating all the +CPython quirks is basically a nightmare, so better to concentrate on +correctness first.
                +However, we didn't really know why it was so slow. We had theories and +assumptions, usually pointing at the cost of conversions between W_Root +and PyObject*, but we never actually measured it.
                +So, we decided to write a set of cpyext microbenchmarks to measure the +performance of various operations. The result was somewhat surprising: the +theory suggests that when you do a cpyext C call, you should pay the +border-crossing costs only once, but what the profiler told us was that we +were paying the cost of generic_cpy_call several times more than what we expected.
                +After a bit of investigation, we discovered this was ultimately caused by our +"correctness-first" approach. For simplicity of development and testing, when +we started cpyext we wrote everything in RPython: thus, every single API call +made from C (like the omnipresent PyArg_ParseTuple(), PyInt_AsLong(), etc.) +had to cross back the C-to-RPython border. This was especially daunting for +very simple and frequent operations like Py_INCREF and Py_DECREF, +which CPython implements as a single assembly instruction!
                +Another source of slow down was the implementation of PyTypeObject slots. +At the C level, these are function pointers which the interpreter calls to do +certain operations, e.g. tp_new to allocate a new instance of that type.
                +As usual, we have some magic to implement slots in RPython; in particular, +_make_wrapper does the opposite of generic_cpy_call: it takes a +RPython function and wraps it into a C function which can be safely called +from C, handling the GIL, exceptions and argument conversions automatically.
                +This was very handy during the development of cpyext, but it might result in +some bad nonsense; consider what happens when you call the following C +function:
                static PyObject* foo(PyObject* self, PyObject* args)
                +{
                +    PyObject* result = PyInt_FromLong(1234);
                +    return result;
                +}
                +
                +
                  +
                1. you are in RPython and do a cpyext call to foo: RPython-to-C;
                2. +
                3. +foo calls PyInt_FromLong(1234), which is implemented in RPython: +C-to-RPython;
                4. +
                5. the implementation of PyInt_FromLong indirectly calls +PyIntType.tp_new, which is a C function pointer: RPython-to-C;
                6. +
                7. however, tp_new is just a wrapper around an RPython function, created +by _make_wrapper: C-to-RPython;
                8. +
                9. finally, we create our RPython W_IntObject(1234); at some point +during the RPython-to-C crossing, its PyObject* equivalent is +created;
                10. +
                11. after many layers of wrappers, we are again in foo: after we do +return result, during the C-to-RPython step we convert it from +PyObject* to W_IntObject(1234).
                12. +
                +Phew! After we realized this, it was not so surprising that cpyext was very +slow :). And this was a simplified example, since we are not passing a +PyObject* to the API call. When we do, we need to convert it back and +forth at every step. Actually, I am not even sure that what I described was +the exact sequence of steps which used to happen, but you get the general +idea.
                +The solution is simple: rewrite as much as we can in C instead of RPython, +to avoid unnecessary roundtrips. This was the topic of most of the Cape Town +sprint and resulted in the cpyext-avoid-roundtrip branch, which was +eventually merged.
                +Of course, it is not possible to move everything to C: there are still +operations which need to be implemented in RPython. For example, think of +PyList_Append: the logic to append an item to a list is complex and +involves list strategies, so we cannot replicate it in C. However, we +discovered that a large subset of the C API can benefit from this.
                +Moreover, the C API is huge. While we invented this new way of writing +cpyext code, we still need to +convert many of the functions to the new paradigm. Sometimes the rewrite is +not automatic +or straightforward. cpyext is a delicate piece of software, so it happens often +that we make a mistake and end up staring at a segfault in gdb.
                +However, the most important takeaway is that the performance improvements we got +from this optimization are impressive, as we will detail later.
                +
                +

                +Conversion costs

                +The other potential big source of slowdown is the conversion of arguments +between W_Root and PyObject*.
                +As explained earlier, the first time you pass a W_Root to C, you need to +allocate its PyObject* counterpart. Suppose you have a foo function +defined in C, which takes a single int argument:
                for i in range(N):
                +    foo(i)
                +
                +To run this code, you need to create a different PyObject* for each value +of i: if implemented naively, it means calling N times malloc() +and free(), which kills performance.
                +CPython has the very same problem, which is solved by using a free list to +allocate ints. So, what we did was to simply steal the code from CPython +and do the exact same thing. This was also done in the +cpyext-avoid-roundtrip branch, and the benchmarks show that it worked +perfectly.
                +Every type which is converted often to PyObject* must have a very fast +allocator. At the moment of writing, PyPy uses free lists only for ints and +tuples: one of the next steps on our TODO list is certainly to use this +technique with more types, like float.
                +Conversely, we also need to optimize the conversion from PyObject* to +W_Root: this happens when an object is originally allocated in C and +returned to Python. Consider for example the following code:
                import numpy as np
                +myarray = np.random.random(N)
                +for i in range(len(myarray)):
                +    myarray[i]
                +
                +At every iteration, we get an item out of the array: the return type is an +instance of numpy.float64 (a numpy scalar), i.e. a PyObject*: this is +something which is implemented by numpy entirely in C, so completely +opaque to cpyext. We don't have any control on how it is allocated, +managed, etc., and we can assume that allocation costs are the same as on +CPython.
                +As soon as we return these PyObject* to Python, we need to allocate +their W_Root equivalent. If you do it in a small loop like in the example +above, you end up allocating all these W_Root inside the nursery, which is +a good thing since allocation is super fast (see the section above about the +PyPy GC).
                +However, we also need to keep track of the W_Root to PyObject* link. +Currently, we do this by putting all of them in a dictionary, but it is very +inefficient, especially because most of these objects die young and thus it +is wasted work to do that for them. Currently, this is one of the biggest +unresolved problems in cpyext, and it is what causes the two microbenchmarks +allocate_int and allocate_tuple to be very slow.
                +We are well aware of the problem, and we have a plan for how to fix it. The +explanation is too technical for the scope of this blog post as it requires a +deep knowledge of the GC internals to be understood, but the details are +here.
                +
                +

                +C API quirks

                +Finally, there is another source of slowdown which is beyond our control. Some +parts of the CPython C API are badly designed and expose some of the +implementation details of CPython.
                +The major example is reference counting. The Py_INCREF / Py_DECREF API +is designed in a way that forces other implementations to emulate +refcounting even in presence of other GC management schemes, as explained +above.
                +Another example is borrowed references. There are API functions which do +not incref an object before returning it, e.g. PyList_GetItem(). This is +done for performance reasons because we can avoid a whole incref/decref pair, +if the caller needs to handle the returned item only temporarily: the item is +kept alive because it is in the list anyway.
                +For PyPy, this is a challenge: thanks to list strategies, lists are often +represented in a compact way. For example, a list containing only integers is +stored as a C array of long. How to implement PyList_GetItem? We +cannot simply create a PyObject* on the fly, because the caller will never +decref it and it will result in a memory leak.
                +The current solution is very inefficient. The first time we do a +PyList_GetItem, we convert the whole list to a list of +PyObject*. This is bad in two ways: the first is that we potentially pay a +lot of unneeded conversion cost in case we will never access the other items +of the list. The second is that by doing that we lose all the performance +benefit granted by the original list strategy, making it slower for the +rest of the pure-python code which will manipulate the list later.
                PyList_GetItem is an example of a bad API because it assumes that the list +is implemented as an array of PyObject*: after all, in order to return a +borrowed reference, we need a reference to borrow, don't we?
                +Fortunately, (some) CPython developers are aware of these problems, and there +is an ongoing project to design a better C API which aims to fix exactly +this kind of problem.
                +Nonetheless, in the meantime we still need to implement the current +half-broken APIs. There is no easy solution for that, and it is likely that +we will always need to pay some performance penalty in order to implement them +correctly.
                +However, what we could potentially do is to provide alternative functions +which do the same job but are more PyPy friendly: for example, we could think +of implementing PyList_GetItemNonBorrowed or something like that: then, C +extensions could choose to use it (possibly hidden inside some macro and +#ifdef) if they want to be fast on PyPy.
                +
                +

                +Current performance

                +During the whole blog post we claimed cpyext is slow. How +slow is it, exactly?
                +We decided to concentrate on microbenchmarks for now. It should be evident +by now there are simply too many issues which can slow down a cpyext +program, and microbenchmarks help us to concentrate on one (or few) at a +time.
                +The microbenchmarks measure very simple things, like calling functions and +methods with the various calling conventions (no arguments, one argument, +multiple arguments); passing various types as arguments (to measure conversion +costs); allocating objects from C, and so on.
                +Here are the results from the old PyPy 5.8 relative and normalized to CPython +2.7, the lower the better:

                + +
                +
                + +
                +
                + +
                +
                +PyPy was horribly slow everywhere, ranging from 2.5x to 10x slower. It is +particularly interesting to compare simple.noargs, which measures the cost +of calling an empty function with no arguments, and simple.onearg(i), +which measures the cost of calling an empty function passing an integer argument: +the latter is ~2x slower than the former, indicating that the conversion cost +of integers is huge.
                +PyPy 5.8 was the last release before the famous Cape Town sprint, when we +started to look at cpyext performance seriously. Here are the performance data for +PyPy 6.0, the latest release at the time of writing:
                + +
                +

                +The results are amazing! PyPy is now massively faster than before, and for +most benchmarks it is even faster than CPython: yes, you read it correctly: +PyPy is faster than CPython at doing CPython's job, even considering all the +extra work it has to do to emulate the C API. This happens thanks to the JIT, +which produces speedups high enough to counterbalance the slowdown caused by +cpyext.
                +There are two microbenchmarks which are still slower though: allocate_int +and allocate_tuple, for the reasons explained in the section about +Conversion costs.
                +
                +

                +Next steps

                +Despite the spectacular results we got so far, cpyext is still slow enough to +kill performance in most real-world code which uses C extensions extensively +(e.g., the omnipresent numpy).
                +Our current approach is something along these lines:
                +
                  +
                1. run a real-world small benchmark which exercises cpyext
                2. +
                3. measure and find the major bottleneck
                4. +
                5. write a corresponding microbenchmark
                6. +
                7. optimize it
                8. +
                9. repeat
                10. +
                +
                +On one hand, this is a daunting task because the C API is huge and we need to +tackle functions one by one. On the other hand, not all the functions are +equally important, and it is enough to optimize a relatively small subset to +improve many different use cases.
                +Whereas a year ago we announced that we had a working answer to run C extensions in +PyPy, we now have a clear picture of what are the performance bottlenecks, and +we have developed some technical solutions to fix them. It is "only" a matter +of tackling them, one by one. It is worth noting that most of the work was +done during two sprints, for a total of 2-3 person-months of work.
                +We think this work is important for the Python ecosystem. PyPy has established +a baseline for performance in pure python code, providing an answer for the +"Python is slow" detractors. The techniques used to make cpyext performant +will let PyPy become an alternative for people who mix C extensions with +Python, which, it turns out, is just about everyone, in particular those using +the various scientific libraries. Today, many developers are forced to seek +performance by converting code from Python to a lower language. We feel there +is no reason to do this, but in order to prove it we must be able to run both +their python and their C extensions performantly, then we can begin to educate +them how to write JIT-friendly code in the first place.
                +We envision a future in which you can run arbitrary Python programs on PyPy, +with the JIT speeding up the pure Python parts and the C parts running as fast +as today: the best of both worlds!
                +
                +
                +
                +
                +
                + + AlbertMietus wrote on 2018-09-22 08:37: +
                +
                +

                Thanks fo this nice article!

                —Albert

                +
                +
                +
                +
                + + Pixy Misa wrote on 2018-09-22 09:58: +
                +
                +

                Great work guys! I should benchmark some of my apps again - a couple of things that were dependent on C extensions didn't show much speedup previously.

                +
                +
                +
                +
                + + Anonymous wrote on 2018-09-22 15:55: +
                +
                +

                Great work man !

                +
                +
                +
                + +

                The First 15 Years of PyPy — a Personal Retrospective

                + +
                +

                A few weeks ago I (=Carl Friedrich Bolz-Tereick) gave a keynote at ICOOOLPS in +Amsterdam with the above title. I was very happy to have been given that +opportunity, since a number of our papers have been published at ICOOOLPS, +including the very first one I published when I'd just started my PhD. I decided +to turn the talk manuscript into a (longish) blog post, to make it available to a wider audience. +Note that this blog post describes my personal recollections and research, it is +thus necessarily incomplete and coloured by my own experiences.

                +

                PyPy has turned 15 years old this year, so I decided that that's a good reason +to dig into and talk about the history of the project so far. I'm going to do +that using the lens of how performance developed over time, which is from +something like 2000x slower than CPython, to roughly 7x faster. In this post +I am going to present the history of the project, and also talk about some +lessons that we learned.

                +

                The post does not make too many assumptions about any prior knowledge of what +PyPy is, so if this is your first interaction with it, welcome! I have tried to +sprinkle links to earlier blog posts and papers into the writing, in case you +want to dive deeper into some of the topics.

                +

                As a disclaimer, in this post I am going to mostly focus on ideas, and not +explain who had or implemented them. A huge number of people contributed to the +design, the implementation, the funding and the organization of PyPy over the +years, and it would be impossible to do them all justice.

                + +
                +

                2003: Starting the Project

                +

                On the technical level PyPy is a Python interpreter written in Python, which is +where the name comes from. It also has an automatically generated JIT compiler, +but I'm going to introduce that gradually over the rest of the blog post, so +let's not worry about it too much yet. On the social level PyPy is an +interesting mixture of an open source project, that sometimes had research done +in it.

                +

                The project got started in late 2002 and early 2003. To set the stage, at that +point Python was a significantly less popular language than it is today. Python +2.2 was the version at the time, Python didn't even have a bool type yet.

                +

                In fall 2002 the PyPy project was started by a number of Python programmers on a +mailing list who said +something like (I am exaggerating somewhat) "Python is the greatest most +wonderful most perfect language ever, we should use it for absolutely +everything. Well, what aren't we using it for? The Python virtual machine itself +is written in C, that's bad. Let's start a project to fix that."

                +

                Originally that project was called "minimal python", or "ptn", later gradually +renamed to PyPy. Here's the mailing list post to announce the project more +formally:

                +
                Minimal Python Discussion, Coding and Sprint
                +--------------------------------------------
                +
                +We announce a mailinglist dedicated to developing
                +a "Minimal Python" version.  Minimal means that
                +we want to have a very small C-core and as much
                +as possible (re)implemented in python itself.  This
                +includes (parts of) the VM-Code.
                +

                Why would that kind of project be useful? Originally it wasn't necessarily meant +to be useful as a real implementation at all, it was more meant as a kind of +executable explanation of how Python works, free of the low level details of +CPython. But pretty soon there were then also plans for how the virtual machine +(VM) could be bootstrapped to be runnable without an existing Python +implementation, but I'll get to that further down.

                +
                + + +
                +

                2003: Implementing the Interpreter

                +

                In early 2003 a group of Python people met in Hildesheim (Germany) for the first +of many week long development sprints, organized by Holger Krekel. During that +week a group of people showed up and started working on the core interpreter. +In May 2003 a second sprint was organized by Laura Creighton and Jacob Halén in +Gothenburg (Sweden). And already at that sprint enough of the Python bytecodes +and data structures were implemented to make it possible to run a program that +computed how much money everybody had to pay for the food bills of the week. And +everybody who's tried that for a large group of people knows that that’s an +amazingly complex mathematical problem.

                +

                In the next two years, the project continued as an open source project with +various contributors working on it in their free time, and meeting for the +occasional sprint. In that time, the rest of the core interpreter and the core +data types were implemented.

                +

                There's not going to be any other code in this post, but to give a bit of a +flavor of what the Python interpreter at that time looked like, here's the +implementation of the DUP_TOP bytecode after these first sprints. As you can +see, it's in Python, obviously, and it has high level constructs such as method +calls to do the stack manipulations:

                +
                def DUP_TOP(f):
                +    w_1 = f.valuestack.top()
                +    f.valuestack.push(w_1)
                +

                Here's the early code for integer addition:

                +
                def int_int_add(space, w_int1, w_int2):
                +    x = w_int1.intval
                +    y = w_int2.intval
                +    try:
                +        z = x + y
                +    except OverflowError:
                +        raise FailedToImplement(space.w_OverflowError,
                +                                space.wrap("integer addition"))
                +    return W_IntObject(space, z)
                +

                (the current implementations look slightly but not fundamentally different.)

                +
                + + +
                +

                Early organizational ideas

                +

                Some of the early organizational ideas of the project were as follows. Since the +project was started on a sprint and people really liked that style of working +PyPy continued to be developed on various subsequent sprints.

                +

                From early on there was a very heavy emphasis on testing. All the parts of the +interpreter that were implemented had a very careful set of unit tests to make +sure that they worked correctly. From early on, there was a continuous +integration infrastructure, which grew over time (nowadays it is very natural +for people to have automated tests, and the concept of green/red builds: but +embracing this workflow in the early 2000s was not really mainstream yet, and +it is probably one of the reasons behind PyPy's success).

                +

                At the sprints there was also an emphasis on doing pair programming to make +sure that everybody understood the codebase +equally. There was also a heavy emphasis on writing good code and on regularly +doing refactorings to make sure that the codebase remained nice, clean and +understandable. Those ideas followed from the early thoughts that PyPy would be +a sort of readable explanation of the language.

                +

                There was also a pretty fundamental design decision made at the time. That was +that the project should stay out of language design completely. Instead it would +follow CPython's lead and behave exactly like that implementation in all cases. +The project therefore committed to being almost quirk-to-quirk compatible and to +implement even the more obscure (and partially unnecessary) corner cases of +CPython.

                +

                All of these principles continue pretty much still today (There are a few places +where we had to deviate from being completely compatible, they are documented +here).

                +
                + + +
                +

                2004-2007: EU-Funding

                +

                While all this coding was going on it became clear pretty soon that the goals +that various participants had for the project would be very hard to achieve with +just open source volunteers working on the project in their spare time. +Particularly also the sprints became expensive given that those were just +volunteers doing this as a kind of weird hobby. Therefore a couple of people of +the project got together to apply for an EU grant in the framework programme 6 +to solve these money problems. In mid-2004 that application proved to be +successful. And so the project got a grant of 1.3 million Euro for +two years to be able to employ some of the core developers and to make it +possible for them to work on the project full time. The EU grant went to seven +small-to-medium companies and Uni Düsseldorf. The budget also contained money to +fund sprints, both for the employed core devs as well as other open source +contributors.

                + +

                The EU project started in December 2004 and that was a fairly heavy change in +pace for the project. Suddenly a lot of people were working full time on it, and +the pace and the pressure picked up quite a lot. Originally it had been a +leisurely project people worked on for fun. But afterwards people discovered +that doing this kind of work full time becomes slightly less fun, particularly +also if you have to fulfill the ambitious technical goals that the EU proposal +contained. And the proposal indeed contained a bit of everything to increase its +chance of acceptance, such as aspect oriented programming, semantic web, logic +programming, constraint programming, and so on. Unfortunately it +turned out that those things then have to be implemented, which can be called +the first thing we learned: if you promise something to the EU, you'll have to +actually go do it (After the funding ended, a lot of these features were +actually removed from the project again, at a cleanup sprint).

                +
                + + +
                +

                2005: Bootstrapping PyPy

                +

                So what were the actually useful things done as part of the EU project?

                +

                One of the most important goals that the EU project was meant to solve was the +question of how to turn PyPy into an actually useful VM for Python. The +bootstrapping plans were taken quite directly from Squeak, which is a Smalltalk +VM written in a subset of Smalltalk called Slang, which can then be bootstrapped +to C code. The plan for PyPy was to do something similar, to define a restricted +subset of Python called RPython, restricted in such a way that it should be +possible to statically compile RPython programs to C code. Then the Python +interpreter should only use that subset, of course.

                +

                The main difference from the Squeak approach is that Slang, the subset of Squeak +used there, is actually quite a low level language. In a way, you could almost +describe it as C with Smalltalk syntax. RPython was really meant to be a +much higher level language, much closer to Python, with full support for single +inheritance classes, and most of Python's built-in data structures.

                + +
                +

                (BTW, you don’t have to understand any of the illustrations in this blog post, +they are taken from talks and project reports we did over the years so they are +of archaeological interest only and I don’t understand most of them myself.)

                +

                From 2005 on, work on the RPython type inference engine and C backend started in +earnest, which was sort of co-developed with the RPython language definition and +the PyPy Python interpreter. This is also roughly the time that I joined the +project as a volunteer.

                +

                And at the second sprint I went to, in July 2005, two and a half years after the +project got started, we managed to bootstrap the PyPy interpreter to C for the +first time. When we ran the compiled program, it of course immediately +segfaulted. The reason for that was that the C backend had turned characters +into signed chars in C, while the rest of the infrastructure assumed that they +were unsigned chars. After we fixed that, the second attempt worked and we +managed to run an incredibly complex program, something like 6 * 7. That +first bootstrapped version was really really slow, a couple of hundred times +slower than CPython.

                + +
                +

                The bootstrapping process of RPython has a number of nice benefits, a big one +being that a number of the properties of the generated virtual machine don't +have to be expressed in the interpreter. The biggest example of this is garbage +collection. RPython is a garbage collected language, and the interpreter does +not have to care much about GC in most cases. When the C source code is +generated, a GC is automatically inserted. This is a source of great +flexibility. Over time we experimented with a number of different GC +approaches, from reference counting to Boehm to our current incremental +generational collector. As an aside, for a long time we were also working on +other backends to the RPython language and hoped to be able to target Java and +.NET as well. Eventually we abandoned this strand of work, however.

                +
                + + +
                +

                RPython's Modularity Problems

                +

                Now we come to the first thing I would say we learned in the project, which is +that the quality of tools we thought of as internal things still matters a lot. +One of the biggest technical mistakes we've made in the project was that we +designed RPython without any kind of story for modularity. There is no concept +of modules in the language or any other way to break up programs into smaller +components. We always thought that it would be ok for RPython to be a little bit +crappy. It was meant to be this sort of internal language with not too many +external users. And of course that turned out to be completely wrong later.

                +

                That lack of modularity led to various problems that persist until today. The +biggest one is that there is no separate compilation for RPython programs at +all! You always need to compile all the parts of your VM together, which leads +to infamously bad compilation times.

                +

                Also by not considering the modularity question we were never forced to fix +some internal structuring issues of the RPython compiler itself. +Various layers of the compiler keep very badly defined and porous interfaces between +them. This was made possible by being able to work with all the program information in one heap, +making the compiler less approachable and maintainable than it maybe could be.

                +

                Of course this mistake just got more and more costly to fix over time, +and so it means that so far nobody has actually done it. +Not thinking more carefully about RPython's design, particularly its +modularity story, is in my opinion the biggest technical mistake the project +made.

                +
                + + +
                +

                2006: The Meta-JIT

                +

                After successfully bootstrapping the VM we did some fairly straightforward +optimizations on the interpreter and the C backend and managed to reduce the +slowdown versus CPython to something like 2-5 times slower. That's great! But of +course not actually useful in practice. So where do we go from here?

                +

                One of the not so secret goals of Armin Rigo, one of the PyPy founders, was to +use PyPy together with some advanced partial evaluation magic sauce to +somehow automatically generate a JIT compiler from the interpreter. The goal was +something like, "you write your interpreter in RPython, add a few annotations +and then we give you a JIT for free for the language that that interpreter +implements."

                +

                Where did the wish for that approach come from, why not just write a JIT for +Python manually in the first place? Armin had actually done just that before he +co-founded PyPy, in a project called Psyco. Psyco was an extension module for +CPython that contained a method-based JIT compiler for Python code. And Psyco +proved to be an amazingly frustrating compiler to write. There were two main +reasons for that. The first reason was that Python is actually quite a complex +language underneath its apparent simplicity. The second reason for the +frustration was that Python was and is very much an alive language, that gains +new features in the language core in every version. So every time a new Python +version came out, Armin had to do fundamental changes and rewrites to Psyco, and +he was getting pretty frustrated with it. So he hoped that that effort could be +diminished by not writing the JIT for PyPy by hand at all. Instead, the goal was +to generate a method-based JIT from the interpreter automatically. By taking the +interpreter, and applying a kind of advanced transformation to it, that would +turn it into a method-based JIT. And all that would still be translated into a +C-based VM, of course.

                +
                +

                Slide from Psyco presentation at EuroPython 2002

                +
                + + +
                +

                The First JIT Generator

                +

                From early 2006 on until the end of the EU project a lot of work went into +writing such a JIT generator. The idea was to base it on runtime partial +evaluation. Partial evaluation is an old idea in computer science. It's supposed +to be a way to automatically turn interpreters for a language into a compiler +for that same language. Since PyPy was trying to generate a JIT compiler, which +is in any case necessary to get good performance for a dynamic language like +Python, the partial evaluation was going to happen at runtime.

                +

                There are various ways to look at partial evaluation, but if you've never heard +of it before, a simple way to view it is that it will compile a Python function +by gluing together the implementations of the bytecodes of that function and +optimizing the result.

                +

                The main new ideas of PyPy's partial-evaluation based JIT generator as opposed +to earlier partial-evaluation approaches are the ideas of "promote" and the idea +of "virtuals". Both of these techniques had already been present (in a slightly +less general form) in Psyco, and the goal was to keep using them in PyPy. Both +of these techniques also still remain in use today in PyPy. I'm +going on a slight technical diversion now, to give a high level explanation of +what those ideas are for.

                +
                +
                + + +
                +

                Promote

                +

                One important ingredient of any JIT compiler is the ability to do runtime +feedback. Runtime feedback is most commonly used to know something about which +concrete types are used by a program in practice. Promote is basically a way to +easily introduce runtime feedback into the JIT produced by the JIT generator. +It's an annotation the implementer of a language can use to express their wish +that specialization should happen at this point. This mechanism can be used to +express all kinds of runtime feedback, moving values from the interpreter +into the compiler, whether they be types or other things.

                +
                + + +
                +

                Virtuals

                +

                Virtuals are a very aggressive form of partial escape analysis. A dynamic +language often puts a lot of pressure on the garbage collector, since most +primitive types (like integers, floats and strings) are boxed in the heap, and +new boxes are allocated all the time.

                +

                With the help of virtuals a very significant portion of all allocations in the +generated machine code can be completely removed. Even if they can't be removed, +often the allocation can be delayed or moved into an error path, or even +into a deoptimization path, and thus disappear from the generated machine code +completely.

                +

                This optimization really is the super-power of PyPy's optimizer, since it +doesn't work only for primitive boxes but for any kind of object allocated on +the heap with a predictable lifetime.

                +

                As an aside, while this kind of partial escape analysis is sort of new for +object-oriented languages, it has actually existed in Prolog-based partial +evaluation systems since the 80s, because it's just extremely natural there.

                +
                + + +
                +

                JIT Status 2007

                +

                So, back to our history. We're now in 2007, at the end of the EU project (you +can find the EU-reports we wrote during the projects here). The EU project +successfully finished, we survived the final review with the EU. So, what's the +2007 status of the JIT generator? It works kind of, it can be applied to PyPy. It +produces a VM with a JIT that will turn Python code into machine code at runtime +and run it. However, that machine code is not particularly fast. Also, it tends +to generate many megabytes of machine code even for small Python programs. While +it's always faster than PyPy without JIT, it's only sometimes faster than +CPython, and most of the time Psyco still beats it. On the one hand, this is +still an amazing achievement! It's arguably the biggest application of partial +evaluation at this point in time! On the other hand, it was still quite +disappointing in practice, particularly since some of us had believed at the +time that it should have been possible to reach and then surpass the speed of +Psyco with this approach.

                +
                + + +
                +

                2007: RSqueak and other languages

                +

                After the EU project ended we did all kinds of things. Like sleep for a month +for example, and have the cleanup sprint that I already mentioned. We also had a +slightly unusual sprint in Bern, with members of the Software Composition +Group of Oscar Nierstrasz. As I wrote above, PyPy had been heavily influenced +by Squeak Smalltalk, and that group is a heavy user of Squeak, so we wanted to +see how to collaborate with them. At the beginning of the sprint, we decided +together that the goal of that week should be to try to write a Squeak virtual +machine in RPython, and at the end of the week we'd gotten surprisingly far with +that goal. Basically most of the bytecodes and the Smalltalk object system +worked, we had written an image loader and could run some benchmarks (during the +sprint we also regularly updated a blog, the success of which led us to start +the PyPy blog).

                + +
                +

                The development of the Squeak interpreter was very interesting for the project, +because it was the first real step that moved RPython from being an +implementation detail of PyPy to being a more interesting project in its own right. +Basically a language to write interpreters in, with the eventual promise to get +a JIT for that language almost for free. That Squeak implementation is now +called RSqueak ("Research Squeak").

                +

                I'll not go into more details about any of the other language implementations in +RPython in this post, but over the years we've had a large variety +of them done by various people and groups, most of them as research vehicles, +but also some as real language implementations. Some very cool research results +came out of these efforts, here's a slightly outdated list of some of them.

                +

                The use of RPython for other languages complicated the PyPy narrative a lot, and +in a way we never managed to recover the simplicity of the original project +description "PyPy is Python in Python". Because now it's something like "we have +this somewhat strange language, a subset of Python, that's called RPython, and +it's good to write interpreters in. And if you do that, we'll give you a JIT for +almost free. And also, we used that language to write a Python implementation, +called PyPy.". It just doesn't roll off the tongue as nicely.

                +
                + + +
                +

                2008-2009: Four More JIT Generators

                +

                Back to the JIT. After writing the first JIT generator as part of the EU +project, with somewhat mixed results, we actually wrote several more JIT +generator prototypes with different architectures to try to solve some of the +problems of the first approach. To give an impression of these prototypes, +here’s a list of them.

                +
                  +
                • The second JIT generator we started working on in 2008 behaved exactly like +the first one, but had a meta-interpreter based architecture, to make it more +flexible and easier to experiment with. The meta-interpreter was called +the "rainbow interpreter", and in general the JIT is an area where we went +somewhat overboard with borderline silly terminology, with notable +occurrences of "timeshifter", "blackhole interpreter" etc.

                • +
                • The third JIT generator was an experiment based on the second one which +changed +compilation strategy. While the previous two had compiled many control flow +paths of the currently compiled function eagerly, that third JIT was sort of +maximally lazy and stopped compilation at every control flow split to avoid +guessing which path would actually be useful later when executing the code. +This was an attempt to reduce the problem of the first JIT generating way too +much machine code. Only later, when execution went down one of the not yet +compiled paths would it continue compiling more code. This gives an effect +similar to that of lazy basic block versioning.

                • +
                • The fourth JIT generator was a pretty strange prototype, a runtime partial +evaluator for Prolog, to experiment with various specialization trade-offs. It +had an approach that we gave a not at all humble name, called "perfect +specialization".

                • +
                • The fifth JIT generator is the one that we are still using today. Instead of +generating a method-based JIT compiler from our interpreter we switched to +generating a tracing JIT compiler. Tracing JIT compilers were sort of the +latest fashion at the time, at least for a little while.

                • +
                +
                + + +
                +

                2009: Meta-Tracing

                +

                So, how did that tracing JIT generator work? A tracing JIT generates code by +observing and logging the execution of the running program. This yields a +straight-line trace of operations, which are then optimized and compiled into +machine code. Of course most tracing systems mostly focus on tracing loops.

                +

                As we discovered, it's actually quite simple to apply a tracing JIT to a generic +interpreter, by not tracing the execution of the user program directly, but by +instead tracing the execution of the interpreter while it is running the user +program (here's the paper we wrote about this approach).

                +

                So that's what we implemented. Of course we kept the two successful parts of the +first JIT, promote and virtuals (both links go to the papers about these +features in the meta-tracing context).

                +
                +
                + + +
                +

                Why did we Abandon Partial Evaluation?

                +

                So one question I sometimes get asked when telling this story is, why did +we think that tracing would work better than partial evaluation (PE)? One of the +hardest parts of compilers in general and partial evaluation based systems in +particular is the decision when and how much to inline, how much to specialize, +as well as the decision when to split control flow paths. In the PE based JIT +generator we never managed to control that question. Either the JIT would +inline too much, leading to useless compilation of all kinds of unlikely error +cases. Or it wouldn't inline enough, preventing necessary optimizations.

                +

                Meta tracing solves this problem with a hammer, it doesn't make particularly +complex inlining decisions at all. It instead decides what to inline by +precisely following what a real execution through the program is doing. Its +inlining decisions are therefore very understandable and predictable, and it +basically only has one heuristic based on whether the called function contains a +loop or not: If the called function contains a loop, we'll never inline it, if +it doesn't we always try to inline it. That predictability is actually what was +the most helpful, since it makes it possible for interpreter authors to +understand why the JIT did what it did and to actually influence its inlining +decisions by changing the annotations in the interpreter source. It turns out +that simple is better than complex.

                +
                + + +
                +

                2009-2011: The PyJIT Eurostars Project

                +

                While we were writing all these JIT prototypes, PyPy had sort of reverted back +to being a volunteer-driven open source project (although some of us, like +Antonio Cuni and I, had started working for universities and other project +members had other sources of funding). But again, while we did the work it +became clear that to get an actually working fast PyPy with generated JIT we +would need actual funding again for the project. So we applied to the EU again, +this time for a much smaller project with less money, in the Eurostars +framework. We got a grant for three participants, merlinux, OpenEnd and Uni +Düsseldorf, on the order of a bit more than half a million euro. That money was +specifically for JIT development and JIT testing infrastructure.

                +
                +
                + + +
                +

                Tracing JIT improvements

                +

                When writing the grant we had sat together at a sprint and discussed extensively +and decided that we would not switch JIT generation approaches any more. We all +liked the tracing approach well enough and thought it was promising. So instead +we agreed to try in earnest to make the tracing JIT really practical. So in the +Eurostars project we started with implementing sort of fairly standard JIT +compiler optimizations for the meta-tracing JIT, such as:

                +
                  +
                • constant folding

                • +
                • dead code elimination

                • +
                • loop invariant code motion (using LuaJIT's approach)

                • +
                • better heap optimizations

                • +
                • faster deoptimization (which is actually a bit of a mess in the +meta-approach)

                • +
                • and dealing more efficiently with Python frame objects and the +features of Python's debugging facilities

                • +
                +
                + + +
                +

                2010: speed.pypy.org

                +

                In 2010, to make sure that we wouldn't accidentally introduce speed regressions +while working on the JIT, we implemented infrastructure to build PyPy and run +our benchmarks nightly. Then, the https://speed.pypy.org website was implemented +by Miquel Torres, a volunteer. The website shows the changes in benchmark +performance compared to the previous n days. It didn't sound too important at +first, but this was (and is) a fantastic tool, and an amazing motivator over the +next years, to keep continually improving performance.

                +
                +
                + + +
                +

                Continuous Integration

                +

                This actually leads me to something else that I'd say we learned, which is that +continuous integration is really awesome, and completely transformative to have +for a project. This is not a particularly surprising insight nowadays in the +open source community, it's easy to set up continuous integration on Github +using Travis or some other CI service. But I still see a lot of research +projects that don't have tests, that don't use CI, so I wanted to mention it +anyway. As I mentioned earlier in the post, PyPy has a quite serious testing +culture, with unit tests written for new code, regression tests for all bugs, +and integration tests using the CPython test suite. Those tests are run +nightly on a number of architectures and operating systems.

                +

                Having all this kind of careful testing is of course necessary, since PyPy is +really trying to be a Python implementation that people actually use, not just +write papers about. But having all this infrastructure also had other benefits, +for example it allows us to trust newcomers to the project very quickly. +Basically after your first patch gets accepted, you immediately get commit +rights to the PyPy repository. If you screw up, the tests (or the code reviews) +are probably going to catch it, and that reduction to the barrier to +contributing is just super great.

                +

                This concludes my advertisement for testing in this post.

                +
                + + +
                +

                2010: Implementing Python Objects with Maps

                +

                So, what else did we do in the Eurostars project, apart from adding traditional +compiler optimizations to the tracing JIT and setting up CI infrastructure? +Another strand of work, that went on sort of concurrently to the JIT generator +improvements, were deep rewrites in the Python runtime, and the Python data +structures. I am going to write about two exemplary ones here, maps and storage strategies.

                +

                The first such rewrite is fairly standard. Python instances are similar to +JavaScript objects, in that you can add arbitrary attributes to them at runtime. +Originally Python instances were backed by a dictionary in PyPy, but of course +in practice most instances of the same class have the same set of attribute +names. Therefore we went and implemented Self-style maps (which are often +called hidden classes in the JS world) to represent instances instead. This +has two big benefits: it allows you to generate much better machine code for +instance attribute access and makes instances use a lot less memory.

                +
                +
                + + +
                +

                2011: Container Storage Strategies

                +

                Another important change in the PyPy runtime was rewriting the Python container +data structures, such as lists, dictionaries and sets. A fairly straightforward +observation about how those are used is that in a significant percentage of +cases they contain type-homogeneous data. As an example it's quite common to +have lists of only integers, or lists of only strings. So we changed the list, +dict and set implementations to use something we called storage strategies. With +storage strategies these data structures use a more efficient representation if +they contain only primitives of the same type, such as ints, floats, strings. +This makes it possible to store the values without boxing them in the underlying +data structure. Therefore read and write access are much faster for such type +homogeneous containers. Of course when later another data type gets added to +such a list, the existing elements need to all be boxed at that point, which is +expensive. But we did a study and found out that that happens quite rarely in +practice. A lot of that work was done by Lukas Diekmann.

                +
                +
                + + +
                +

                Deep Changes in the Runtime are Necessary

                +

                These two are just two examples for a number of fairly fundamental changes in +the PyPy runtime and PyPy data structures, probably the two most important ones, +but we did many others. That leads me to another thing we learned. If you want +to generate good code for a complex dynamic language such as Python, it's +actually not enough at all to have a good code generator and good compiler +optimizations. That's not going to help you, if your runtime data-structures +aren't in a shape where it's possible to generate efficient machine code to +access them.

                +

                Maybe this is well known in the VM and research community. However it's the main +mistake that in my opinion every other Python JIT effort has made in the last 10 +years, where most projects said something along the lines of "we're not +changing the existing CPython data structures at all, we'll just let LLVM +inline enough C code of the runtime and then it will optimize all the overhead +away". That never works very well.

                +
                + + +
                +

                JIT Status 2011

                +

                So, here we are at the end of the Eurostars project, what's the status of the JIT? Well, it +seems this meta-tracing stuff really works! We finally started actually +believing in it, when we reached the point in 2010 where self-hosting PyPy was +actually faster than bootstrapping the VM on CPython. Speeding up the +bootstrapping process is something that Psyco never managed at all, so we +considered this a quite important achievement. At the end of +Eurostars, we were about 4x faster than CPython on our set of benchmarks.

                +
                + + +
                +

                2012-2017: Engineering and Incremental Progress

                +

                In 2012 the Eurostars project was finished and PyPy reverted yet another time back +to being an open source project. From then on, we've had a more diverse set of +sources of funding: we received some crowd funding via the Software Freedom +Conservancy and contracts of various sizes from companies to implement various +specific features, often handled by Baroque Software. Over the next couple of +years +we revamped various parts of the VM. We improved the GC in major ways. We +optimized the implementation of the JIT compiler to improve warmup times. We +implemented backends for various CPU architectures (including PowerPC and +s390x). We tried to reduce the number of performance cliffs and make the JIT +useful in a broader set of cases.

                +

                Another strand of work was to push quite significantly to be more +compatible with CPython, particularly the Python 3 line as well as extension +module support. Other compatibility improvements we made were making sure that +virtualenv works with PyPy, better support for distutils and setuptools and +similar improvements. The continually improving performance as well as better +compatibility with the ecosystem tools led to the first few users of PyPy in +industry.

                +
                +
                + + +

                CPyExt

                +

                Another very important strand of work that took a lot of effort in recent years +was CPyExt. One of the main blockers of PyPy adoption had always been the fact +that a lot of people need specific C-extension modules at least in some parts of +their program, and telling them to reimplement everything in Python is just not +a practical solution. Therefore we worked on CPyExt, an emulation layer to make +it possible to run CPython C-extension modules in PyPy. Doing that was a very +painful process, since the CPython extension API leaks a lot of CPython +implementation details, so we had to painstakingly emulate all of these details +to make it possible to run extensions. That this works at all remains completely +amazing to me! But nowadays CPyExt is even getting quite good, a lot of the big +numerical libraries such as Numpy and Pandas are now supported (for a while +we had worked hard on a reimplementation of Numpy called NumPyPy, but +eventually realized that it would never be complete and useful enough). +However, calling CPyExt modules from PyPy can still be very slow, +which makes it impractical for some applications; +that's why we are working on it.

                +

                Not thinking about C-extension module emulation earlier in the project history +was a pretty bad strategic mistake. It had been clear for a long time that +getting people to just stop using all their C-extension modules was never going +to work, despite our efforts to give them alternatives, such as cffi. So we +should have thought of a story for all the existing C-extension modules earlier +in the project. Not starting CPyExt earlier was mostly a failure of our +imagination (and maybe a too high pain threshold): We didn't believe this kind +of emulation was going to be practical, until somebody went and tried it.

                +
                + + +
                +

                Python 3

                +

                Another main +focus of the last couple of years has been to catch up with the CPython 3 line. +Originally we had ignored Python 3 for a little bit too long, and were trailing +several versions behind. In 2016 and 2017 we had a grant from the Mozilla open +source support program of $200'000 to be able to catch up with Python 3.5. This +work is now basically done, and we are starting to target CPython 3.6 and will +have to look into 3.7 in the near future.

                +
                + + +
                +

                Incentives of OSS compared to Academia

                +

                So, what can be learned from those more recent years? One thing we can observe +is that a lot of the engineering work we did in that time is not really science +as such. A lot of the VM techniques we implemented are kind of well known, and +catching up with new Python features is also not particularly deep researchy +work. Of course this kind of work is obviously super necessary if you want +people to use your VM, but it would be very hard to try to get research funding +for it. PyPy managed quite well over its history to balance phases of more +research oriented work, and more product oriented ones. But getting this balance +somewhat right is not easy, and definitely also involves a lot of luck. And, as +has been discussed a lot, it's actually very hard to find funding for open +source work, both within and outside of academia.

                +
                +
                +

                Meta-Tracing really works!

                +

                Let me end with what, in my opinion, is the main positive technical result of PyPy the +project. Which is that the whole idea of using a meta-tracing JIT can really +work! Currently PyPy is about 7 times faster than CPython on a broad set of +benchmarks. Also, one of the very early motivations for using a meta-jitting +approach in PyPy, which was to not have to adapt the JIT to new versions of +CPython, proved to work: indeed we didn't have to change anything in the JIT +infrastructure to support Python 3.

                +

                RPython has also worked and improved performance for a number of other +languages. Some of these interpreters had wildly different architectures. +AST-based interpreters, bytecode based, CPU emulators, really inefficient +high-level ones that allocate continuation objects all the time, and so on. This +shows that RPython also gives you a lot of freedom in deciding how you want to +structure the interpreter and that it can be applied to languages of quite +different paradigms.

                +

                I'll end with a list of the people that have contributed code to PyPy over its +history, more than 350 of them. I'd like to thank all of them and the various +roles they played. To the next 15 years!

                + +
                + +
                +
                + +

                Acknowledgements

                +

                A lot of people helped me with this blog post. Tim Felgentreff made me give the +keynote, which led me to start collecting the material. Samuele Pedroni +gave essential early input when I just started planning the talk, and also gave +feedback on the blog post. Maciej Fijałkowski gave me feedback on the post, in +particular important insight about the more recent years of the project. Armin +Rigo discussed the talk slides with me, and provided details about the early +expectations about the first JIT's hoped-for performance. Antonio Cuni gave +substantial feedback and many very helpful suggestions for the blog post. +Michael Hudson-Doyle also fixed a number of mistakes in the post and rightfully +complained about the lack of mention of the GC. Christian Tismer provided +access to his copy of early Python-de mailing list posts. Matti Picus pointed +out a number of things I had forgotten and fixed a huge number of typos and +awkward English, including my absolute inability to put commas correctly. +All remaining errors are of course my own.

                +
                + +

                update: fixed confusing wording in the maps section.

                +
                +
                +
                +
                + + Peterino wrote on 2018-09-14 00:24: +
                +
                +

                Congratulations on your story, and the fantastic achievements!

                Interestingly, from my personal experience I can't confirm the "PyPy is faster than CPython" claim. Maybe you can help me understand. I'm running a simple set of tests against a subset of Python versions of CPython and against PyPy, for a few years. For what I saw in that time, PyPy - including PyPy3 now - was always the slowest, usually by a factor of 2 compared to the mean of all CPython versions. See the results on Travis, for example: https://travis-ci.org/painless-software/painless-continuous-delivery

                Why is it that way? When is PyPy really faster? Are the benchmarks you run tailored to a specific area of software development?

                And then the final thing I've not yet understood about PyPy: What is the ultimate plan? Should it ever replace CPython, one day? When it proves to be both faster and less laborious to maintain, maybe?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2018-09-14 07:58: +
                +
                +

                Hi Peterino! Thanks for testing your project on PyPy! PyPy is often slower on unit tests, as is explained here:
                https://alexgaynor.net/2013/jul/15/your-tests-are-not-benchmark/

                PyPy is best at CPU bound python code. It will never replace CPython but will keep existing as an alternative python implementation.

                +
                +
                +
                +
                + + MikeABKK wrote on 2020-04-18 18:47: +
                +
                +

                > ...AST-based interpreters, ... really inefficient high-level ones that allocate continuation objects all the time, and so on.

                Does anyone have any links to these sorts of interpreters? I am interested in both. I'd very much appreciate any references.

                +
                +
                +
                + +

                Repeating a Matrix Multiplication Benchmark

                + +
                +

                I watched Hennessy & Patterson's Turing award lecture recently:

                + + + +

                In it, there's a slide comparing the performance of various matrix +multiplication implementations, using Python (presumably CPython) as a baseline +and comparing that against various C implementations (I couldn't find the +linked paper yet):

                + + + +

                I expected the baseline speedup of switching from CPython to C to be +higher and I also wanted to know what performance PyPy gets, so I did my own +benchmarks. This is a problem that Python is completely unsuited for, so it +should give very exaggerated results.

                +

                The usual disclaimers apply: All benchmarks are lies, benchmarking of +synthetic workloads even more so. My implementation is really naive (though I +did optimize it a little bit to help CPython), don't use any +of this code +for anything real. The benchmarks ran on my rather old Intel i5-3230M laptop +under Ubuntu 17.10.

                +

                With that said, my results were as follows:

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Implementation | time | speedup over CPython | speedup over PyPy
                CPython | 512.588 ± 2.362 s | 1 × |
                PyPy | 8.167 ± 0.007 s | 62.761 ± 0.295 × | 1 ×
                'naive' C | 2.164 ± 0.025 s | 236.817 ± 2.918 × | 3.773 ± 0.044 ×
                NumPy | 0.171 ± 0.002 s | 2992.286 ± 42.308 × | 47.678 ± 0.634 ×
                +

                This is running 1500x1500 matrix multiplications with (the same) random matrices. Every +implementation is run 50 times in a fresh process. The results are averaged, +the errors are bootstrapped 99% confidence intervals.

                +

                So indeed the speedup that I got from switching from CPython to C is quite a bit higher than +47x! PyPy is much better than CPython, but of course can't really compete +against GCC. And then the real professionals (numpy/OpenBLAS) are in a whole +'nother league. The speedup of the AVX numbers in the slide above is even +higher than my NumPy numbers, which I assume is the result of my old CPU with +two cores, vs. the 18 core CPU with AVX support. +Lesson confirmed: leave matrix multiplication to people who +actually know what they are doing.

                +
                +
                +

                How to ignore the annoying Cython warnings in PyPy 6.0

                + +
                +
                +
                +
                +
                +If you install any Cython-based module in PyPy 6.0.0, it is very likely that you get a warning like this:
                +
                >>>> import numpy
                +/data/extra/pypy/6.0.0/site-packages/numpy/random/__init__.py:99: UserWarning: __builtin__.type size changed, may indicate binary incompatibility. Expected 888, got 408
                +  from .mtrand import *
                +
                +
                +The TL;DR version is: the warning is a false alarm, and you can hide it by doing:
                +
                $ pypy -m pip install pypy-fix-cython-warning
                +
                +
                +The package does not contain any module, only a .pth file which installs a warning filter at startup.
                +

                +Technical details

                +
                +This happens because whenever Cython compiles a pyx file, it generates C code which does a sanity check on the C size of PyType_Type. PyPy versions up to 5.10 are buggy and report the incorrect size, so Cython includes a workaround to compare it with the incorrect value, when on PyPy.
                +
                +PyPy 6 fixed the bug and now PyType_Type reports the correct size; however, Cython still tries to compare it with the old, buggy value, so it (wrongly) emits the warning.
                +
                +Cython 0.28.2 includes a fix for it, so that C files generated by it no longer emit the warning. However, most packages are distributed with pre-cythonized C files. For example, numpy-1.14.2.zip includes C files which were generated by Cython 0.26.1: if you compile it you still get the warning, even if you locally installed a newer version of Cython.

                +There is not much that we can do on the PyPy side, apart from waiting for all the Cython-based packages to do a new release which includes C files generated by a newer Cython.  In the meantime, installing this module will silence the warning. +
                +
                +
                +
                +
                +
                +
                +
                +
                +
                + + Ralf Gommers wrote on 2018-04-29 04:42: +
                +
                +

                I've opened an issue to allow easier workarounds via a Cython compile flag: https://github.com/cython/cython/issues/2221

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2018-05-02 10:34: +
                +
                +

                Thanks, although note that this flag would not help much in this case. Even if it were there, the package author would still have to recompile/republish each package in order to get rid of them. And once you do that, the warning vanishes anyway in the case of PyPy :)

                +
                +
                +
                +
                + + Amaroq Starwind wrote on 2018-05-10 19:12: +
                +
                +

                PyPy looks awesome. I can't wait for Python 3.6.5 support and/or a Windows x86-64 version! Though it would be unlikely, Anaconda and/or IdleX support would be awesome too.

                +
                +
                +
                + +

                PyPy2.7 and PyPy3.5 v6.0 dual release

                + +
                +
                +The PyPy team is proud to release both PyPy2.7 v6.0 (an interpreter supporting +Python 2.7 syntax), and a PyPy3.5 v6.0 (an interpreter supporting Python +3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release.
                +This release is a feature release following our previous 5.10 incremental +release in late December 2017. Our C-API compatibility layer cpyext is +now much faster (see the blog post) as well as more complete. We have made +many other improvements in speed and CPython compatibility. Since the changes +affect the included python development header files, all c-extension modules must +be recompiled for this version.
                +Until we can work with downstream providers to distribute builds with PyPy, we +have made wheels available for some common packages. You may +compile them yourself using pip install --no-build-isolation <package>; the +--no-build-isolation flag is currently needed for pip v10.
                +First-time python users are often stumped by silly typos and omissions when +getting started writing code. We have improved our parser to emit more friendly +syntax errors, making PyPy not only faster but more friendly.
                +The GC now has hooks to gain more insights into its performance.
                +The default Matplotlib TkAgg backend now works with PyPy, as do pygame and pygobject.
                +We updated the cffi module included in PyPy to version 1.11.5, and the +cppyy backend to 0.6.0. Please use these to wrap your C and C++ code, +respectively, for a JIT friendly experience.
                +As always, this release is 100% compatible with the previous one and fixed +several issues and bugs raised by the growing community of PyPy users. +We strongly recommend updating.
                +The Windows PyPy3.5 release is still considered beta-quality. There are open +issues with unicode handling especially around system calls and c-extensions.
                +The utf8 branch that changes internal representation of unicode to utf8 did not +make it into the release, so there is still more goodness coming. We also +began working on a Python3.6 implementation, help is welcome.
                +You can download the v6.0 releases here:
                + +
                +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.
                +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython’s JIT even better.
                +

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.
                +We also welcome developers of other dynamic languages to see what RPython +can do for them.
                +The PyPy release supports:
                +
                +
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +
                +
                +
                +

                +What else is new?

                +
                +PyPy 5.10 was released in Dec, 2017.
                +
                +There are many incremental improvements to RPython and PyPy, the complete listing is here.
                +
                +Please update, and continue to help us make PyPy better.

                +Cheers, The PyPy team
                +
                +
                +
                +
                +
                + + Anonymous wrote on 2018-04-27 10:51: +
                +
                +

                Good news! Gratz PyPy Dev Core people!

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2018-05-02 10:13: +
                +
                +

                Congratulations to the team! This is getting more interesting with each release!

                FWIW (not much, I know), I personally need two more things to start using pypy at work:
                * Windows 64bit support
                * pypy-specific conda packages for a few popular third-party packages (numpy, pandas, pytables, xlwings, ...)

                If you would do a funding campaign specifically for either of those, I would donate, as I guess many people would.

                +
                +
                +
                + +

                Improving SyntaxError in PyPy

                + +
                +

                For the last year, my halftime job has been to teach non-CS uni students +to program in Python. While doing that, I have been trying to see what common +stumbling blocks exist for novice programmers. There are many +things that could be said here, but a common theme that emerges is +hard-to-understand error messages. One source of such error messages, +particularly when starting out, is SyntaxErrors.

                +

                PyPy's parser (mostly following the architecture of CPython) uses a +regular-expression-based tokenizer with some cleverness to deal with +indentation, and a simple LR(1) parser. Both of these components obviously +produce errors for invalid syntax, but the messages are not very helpful. Often, +the message is just "invalid syntax", without any hint of what exactly is wrong. +In the last couple of weeks I have invested a little bit of effort to make them a +tiny bit better. They will be part of the upcoming PyPy 6.0 release. Here are +some examples of what changed.

                +
                +

                Missing Characters

                +

                The first class of errors occurs when a token is missing; often there is only one +valid token that the parser expects. This happens most commonly by leaving out +the ':' after control flow statements (which is the syntax error I personally +still make at least a few times a day). In such situations, the parser will now +tell you which character it expected:

                +
                +>>>> # before
                +>>>> if 1
                +  File "<stdin>", line 1
                +    if 1
                +       ^
                +SyntaxError: invalid syntax
                +>>>>
                +
                +>>>> # after
                +>>>> if 1
                +  File "<stdin>", line 1
                +    if 1
                +       ^
                +SyntaxError: invalid syntax (expected ':')
                +>>>>
                +
                +

                Another example of this feature:

                +
                +>>>> # before
                +>>>> def f:
                +  File "<stdin>", line 1
                +    def f:
                +        ^
                +SyntaxError: invalid syntax
                +>>>>
                +
                +>>>> # after
                +>>>> def f:
                +  File "<stdin>", line 1
                +    def f:
                +         ^
                +SyntaxError: invalid syntax (expected '(')
                +>>>>
                +
                +
                +
                +

                Parentheses

                +

                Another source of errors is unmatched parentheses. Here, PyPy has always had +slightly better error messages than CPython:

                +
                +>>> # CPython
                +>>> )
                +  File "<stdin>", line 1
                +    )
                +    ^
                +SyntaxError: invalid syntax
                +>>>
                +
                +>>>> # PyPy
                +>>>> )
                +  File "<stdin>", line 1
                +    )
                +    ^
                +SyntaxError: unmatched ')'
                +>>>>
                +
                +

                The same is true for parentheses that are never closed (the call to eval is +needed to get the error, otherwise the repl will just wait for more input):

                +
                +>>> # CPython
                +>>> eval('(')
                +  File "<string>", line 1
                +    (
                +    ^
                +SyntaxError: unexpected EOF while parsing
                +>>>
                +
                +>>>> # PyPy
                +>>>> eval('(')
                +  File "<string>", line 1
                +    (
                +    ^
                +SyntaxError: parenthesis is never closed
                +>>>>
                +
                +

                What I have now improved is the case of parentheses that are matched wrongly:

                +
                +>>>> # before
                +>>>> (1,
                +.... 2,
                +.... ]
                +  File "<stdin>", line 3
                +    ]
                +    ^
                +SyntaxError: invalid syntax
                +>>>>
                +
                +>>>> # after
                +>>>> (1,
                +.... 2,
                +.... ]
                +  File "<stdin>", line 3
                +    ]
                +    ^
                +SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' on line 1
                +>>>>
                +
                +
                +
                +

                Conclusion

                +

                Obviously these are just some very simple cases, and there is still a lot of +room for improvement (one huge problem is that only a single SyntaxError is +ever shown per parse attempt, but fixing that is rather hard).

                +

                If you have a favorite unhelpful SyntaxError message you love to hate, please +tell us in the comments and we might try to improve it. Other kinds of +non-informative error messages are also always welcome!

                +
                +
                +
                +
                +
                + + stuaxo wrote on 2018-04-10 11:36: +
                +
                +

                This is great, I've been thinking along these lines when it comes to python errors for a while.

                This kind of improvements would be great for the long-suffering python web developers too.

                +
                +
                +
                +
                + + stuaxo wrote on 2018-04-10 11:38: +
                +
                +

                Despite my typo-ridden comment, English is my first language :(

                +
                +
                +
                +
                + + René Dudfield wrote on 2018-04-10 16:00: +
                +
                +

                I've seen people struggle with lambda.

                >>> lambda x:
                File "", line 1
                lambda x:
                ^
                SyntaxError: invalid syntax

                +
                +
                +
                +
                + + smurfix wrote on 2018-04-10 16:55: +
                +
                +

                Upon a syntax error, you might want to scan forward until the next line with the current(ly-broken) statement's indent (or maybe until there's a dedent to below that level (except when already at top level, obviously)), then resume parsing.

                +
                +
                +
                +
                + + André Roberge wrote on 2018-04-11 00:12: +
                +
                +

                I applaud this initiative. This is something that I have attempted to do on https://reeborg.ca/reeborg.html (only for code run in the editor, not for the repl). I also tried to provide translations when using languages other than English. I think it would be great if you could somehow provide a hook to easily add translations.

                +
                +
                +
                +
                + + Benjamin wrote on 2018-04-11 07:07: +
                +
                +

                Missing commas between elements in data structures is probably my most common syntax error, especially when dealing with nested data structures or structures split across multiple lines. And while they're something I can recognize very easily, the actual error message isn't especially helpful, particularly when the next element after a missing comma is on the following line.

                +
                +
                +
                +
                + + Unknown wrote on 2018-04-11 14:38: +
                +
                +

                Thanks for the explanation. It all makes sense now that I know Python uses regular expressions in its parser. When Idle points to a random space character within the indentation, off to the left of a code block implemented in compliance with every recognized convention, boldly proclaiming "syntax error", I know precisely which vestigial anti-Pythonic Bell Labs holdover to resent. Again.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2018-04-12 20:23: +
                +
                +

                Everybody thanks for the suggestions! I've added these to my collections of things I might want to fix.

                @smurfix there is a huge amount of scientific papers on approaches how to do stuff like that, I am currently working through them (slowly)

                @Unknown do you have an example for this behaviour?

                +
                +
                +
                +
                + + Noah F. San Tsorvutz wrote on 2018-04-13 19:39: +
                +
                +

                Sorry for the 'unknown' status ... In fact, it happened again today. I can send a screenshot, if that will help, confirming the presence of a red highlighted space, among many seemingly non-offending spaces, within the left margin indentation. Let me see if it is still happening when I try to run that code ... No, that exact SNAFU has moved on, but I now have an example of a syntax error being highlighted within a comment. Is that interesting?

                +
                +
                +
                +
                + + Amaroq Starwind wrote on 2018-05-10 19:06: +
                +
                +

                I would love to see this get updated to Python 3.6.5. I'm currently using that for my programs, and even after looking at the changelogs between Python versions, I'm not sure what I'd lose by moving down to 3.5.3 so that I could use PyPy.

                I'm also curious about things like IdleX and Anaconda. Would those be, hypothetically speaking, mergeable with PyPy?

                +
                +
                +
                + +

                Leysin Winter Sprint 2018: review

                + +
                +

                Like every year, the PyPy developers and a couple of newcomers + gathered in Leysin, Switzerland, to share their thoughts and + contribute to the development of PyPy.

                +

                As always, we had interesting discussions about how we could + improve PyPy, to make it the first choice for even more + developers. We also made some progress with current issues, like + compatibility with Python 3.6 and improving the performance of + CPython extension modules, where we fixed a lot of bugs and gained + new insights about where and how we could tweak PyPy.

                +

                We were very happy about the number of new people who joined us + for the first time, and hope they enjoyed it as much as everyone + else.

                +

                Topics

                + We worked on the following topics (and more!):
                  +
                • Introductions for newcomers
                • +
                • Python 3.5 and 3.6 improvements
                • +
                • CPyExt performance improvements and GC implementation
                  +
                • +
                • JIT: guard-compatible implementation
                  +
                • +
                • Pygame performance improvements
                • +
                • Unicode/UTF8 implementation
                  +
                • +
                • CFFI tutorial/overview rewrite +
                • +
                • py3 test runners refactoring
                • +
                • RevDB improvements
                  +
                • +
                + The weather was really fine for most of the week, with only + occasional snow and fog. We started our days with a short (and + sometimes not so short) planning session and enjoyed our dinners in + the great restaurants in the area. Some of us even started earlier + and continued till late night. It was a relaxed, but also very + productive atmosphere. On our break day on Wednesday, we enjoyed the + great conditions and went skiing and hiking. +

                Attendees

                +
                  +
                • Arianna
                • +
                • Jean-Daniel
                  +
                • +
                • Stefan Beyer
                • +
                • Floris Bruynooghe
                  +
                • +
                • Antonio Cuni
                • +
                • René Dudfield
                • +
                • Manuel Jacob
                • +
                • Ronan Lamy
                • +
                • Remi Meier
                • +
                • Matti Picus
                  +
                • +
                • Armin Rigo
                • +
                • Alexander Schremmer
                  +
                • +
                + Leysin is easily reachable from Geneva Airport, so feel free to join + us next time!


                Cheers,
                + Stefan

                +
                +

                PyPy 5.10.1 bugfix release for python 3.5

                + +
                +
                +We have released a bug fix PyPy3.5-v5.10.1 +due to the following issues:
                +
                +
                  +
                • Fix time.sleep(float('nan')) which would hang on Windows
                • +
                • Fix missing errno constants on Windows
                • +
                • Fix issue 2718 for the REPL on Linux
                • +
                • Fix an overflow in converting int secs to nanosecs (issue 2717 )
                • +
                • Using kwarg 'flag' to os.setxattr had no effect
                • +
                • Fix the winreg module for unicode entries in the registry on Windows
                • +
                +
                +
                +Note that many of these fixes are for our new beta version of PyPy3.5 on Windows. There may be more unicode problems in the Windows beta version, +especially concerning directory- and file-names with non-ASCII +characters.

                +On macOS, we recommend you wait for the +Homebrew package to prevent issues with third-party packages. For other supported platforms our downloads are available now.
                +Thanks to those who reported the issues.

                +

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.
                + +We also welcome developers of other dynamic languages to see what RPython +can do for them.
                + +This PyPy 3.5 release supports:
                +
                +
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bits, macOS 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +
                +Please update, and continue to help us make PyPy better.
                + +Cheers
                + +The PyPy Team
                +
                +
                +
                +

                Leysin Winter sprint: 17-24 March 2018

                + +
                + + + +
                + +

                The next PyPy sprint will be in Leysin, Switzerland, for the thirteenth +time. This is a fully public sprint: newcomers and topics other than +those proposed below are welcome.

                + +

                (Note: this sprint is independent from the suggested April-May sprint in +Poland.)

                + +

                Goals and topics of the sprint

                + +

                The list of topics is open, but here is our current list:

                + +
                +
                +
                +
                  +
                • cffi tutorial/overview rewrite +
                • +
                • py3 test runners are too complicated +
                • +
                • make win32 builds green +
                • +
                • make packaging more like cpython/portable builds +
                • +
                • get CI builders for PyPy into mainstream projects (Numpy, Scipy, lxml, uwsgi) +
                • +
                • get more of scientific stack working (tensorflow?) +
                • +
                • cpyext performance improvements +
                • +
                • General 3.5 and 3.6 improvements +
                • +
                • JIT topics: guard-compatible, and the subsequent research project to save and reuse traces across processes +
                • +
                • finish unicode-utf8 +
                • +
                • update www.pypy.org, speed.pypy.org (web devs needed) +
                • +
                +

                As usual, the main side goal is to have fun in winter sports :-) +We can take a day off (for ski or anything else).

                + +

                Exact times

                + +

                Work days: starting March 18th (~noon), ending March 24th (~noon).

                + +

                Please see announcement.txt for more information.

                +
                +
                +
                +
                + + Евгений Демченко wrote on 2018-01-09 05:49: +
                +
                +

                Can we expect a python 3.6 support released anytime soon? Thanks!

                +
                +
                +
                +
                + + Oscar Smith wrote on 2018-01-13 01:34: +
                +
                +

                It would be nice to have tensorflow working on pypy, even if there aren't many real world cases where this is useful, as most tensorflow does not use python for much heavy lifting.

                +
                +
                +
                +
                + + Eric van Riet Paap wrote on 2018-02-09 22:24: +
                +
                +

                Hi PyPy-team!

                While I was checking out a reinforcement learning repo I thought it would benefit a lot from having the games it was learning in something faster than CPython. So I had another look at PyPy. Tensorflow I could not install so I am really happy that this is on the agenda for the next sprint!

                good luck and have fun!

                Eric

                +
                +
                +
                + +

                PyPy2.7 and PyPy3.5 v5.10 dual release

                + +
                +
                +

                The PyPy team is proud to release both PyPy2.7 v5.10 (an interpreter supporting +Python 2.7 syntax), and a final PyPy3.5 v5.10 (an interpreter for Python +3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release.

                +

                This release is an incremental release with very few new features, the main +feature being the final PyPy3.5 release that works on linux and OS X with beta +windows support. It also includes fixes for vmprof cooperation with greenlets.

                +

                Compared to 5.9, the 5.10 release contains mostly bugfixes and small improvements. +We have in the pipeline big new features coming for PyPy 6.0 that did not make +the release cut and should be available within the next couple months.

                +

                As always, this release is 100% compatible with the previous one and fixed +several issues and bugs raised by the growing community of PyPy users. +As always, we strongly recommend updating.

                +

                There are quite a few important changes that are in the pipeline that did not +make it into the 5.10 release. Most important are speed improvements to cpyext +(which will make numpy and pandas a bit faster) and the utf8 branch that changes +the internal representation of unicode to utf8, which should help especially the +Python 3.5 version of PyPy.

                +

                This release concludes the Mozilla Open Source grant for having a compatible +PyPy 3.5 release and we're very grateful for that. Of course, we will continue +to improve PyPy 3.5 and probably move to 3.6 during the course of 2018.

                +

                You can download the v5.10 releases here:

                +
                +https://pypy.org/download.html +
                +

                We would like to thank our donors for the continued support of the PyPy +project.

                +

                We would also like to thank our contributors and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on pypy, or general help +with making RPython's JIT even better.

                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7 and CPython 3.5. It's fast (PyPy and CPython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                The PyPy release supports:

                +
                +
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +
                +
                +

                Changelog

                +
                  +
                • improve ssl handling on windows for pypy3 (makes pip work)
                • +
                • improve unicode handling in various error reporters
                • +
                • fix vmprof cooperation with greenlets
                • +
                • fix some things in cpyext
                • +
                • test and document the cmp(nan, nan) == 0 behaviour
                • +
                • don't crash when calling sleep with inf or nan
                • +
                • fix bugs in _io module
                • +
                • inspect.isbuiltin() now returns True for functions implemented in C
                • +
                • allow the sequences future-import, docstring, future-import for CPython bug compatibility
                • +
                • Issue #2699: non-ascii messages in warnings
                • +
                • posix.lockf
                • +
                • fixes for FreeBSD platform
                • +
                • add .debug files, so builds contain debugging info, instead of being stripped
                • +
                • improvements to cppyy
                • +
                • issue #2677 copy pure c PyBuffer_{From,To}Contiguous from cpython
                • +
                • issue #2682, split firstword on any whitespace in sqlite3
                • +
                • ctypes: allow ptr[0] = foo when ptr is a pointer to struct
                • +
                • matplotlib will work with tkagg backend once matplotlib pr #9356 is merged
                • +
                • improvements to utf32 surrogate handling
                • +
                • cffi version bump to 1.11.2
                • +
                +Maciej Fijalkowski, Matti Picus and the whole PyPy team +
                +
                +
                +
                +
                +
                +
                + + Unknown wrote on 2017-12-25 21:25: +
                +
                +

                Thanks a lot! What a christmas present!

                +
                +
                +
                +
                + + stuaxo wrote on 2017-12-25 23:32: +
                +
                +

                Great work and happy Xmas :)

                +
                +
                +
                +
                + + Joce wrote on 2017-12-26 04:40: +
                +
                +

                Thanks for the Christmas present of a new release!

                I'm having issues with the official builds of 5.10 for macOS x64 (the "high sierra" version for pypy3):

                With pypy3: dyld: Library not loaded: /usr/local/opt/libffi/lib/libffi.6.dylib
                Referenced from: /Users/joce/devtools/python/pypy3-v5.10.0-osx64/bin/libpypy3-c.dylib
                Reason: image not found
                Abort trap: 6

                With pypy2: dyld: Library not loaded: /usr/local/opt/openssl/lib/libssl.1.0.0.dylib
                Referenced from: /Users/joce/devtools/python/pypy2-v5.10.0-osx64/bin/libpypy-c.dylib
                Reason: image not found
                Abort trap: 6

                Given that I have no homebrew or macports installed (and never have on that fresh-ish install of high sierra), /usr/local/opt/ doesn't even exist, so it's not an appropriate folder for the linker to search in.

                +
                +
                +
                +
                + + Anonymous wrote on 2018-01-05 21:23: +
                +
                +

                Excellent work! I look forward to using PyPy with my 3.5-compatible applications!

                +
                +
                +
                + +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-39.html b/blog/index-39.html new file mode 100644 index 000000000..25a789ce6 --- /dev/null +++ b/blog/index-39.html @@ -0,0 +1,2757 @@ + + + + + + +PyPy (old posts, page 39) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                PyPy JIT for Aarch64

                + +
                +
                + +

                Hello everyone.

                +

                We are pleased to announce the availability of the new PyPy for AArch64. This +port brings PyPy's high-performance just-in-time compiler to the AArch64 +platform, also known as 64-bit ARM. With the addition of AArch64, PyPy now +supports a total of 6 architectures: x86 (32 & 64bit), ARM (32 & 64bit), PPC64, +and s390x. The AArch64 work was funded by ARM Holdings Ltd. and Crossbar.io.

                +

                PyPy has a good record of boosting the performance of Python programs on the +existing platforms. To show how well the new PyPy port performs, we compare the +performance of PyPy against CPython on a set of benchmarks. As a point of +comparison, we include the results of PyPy on x86_64.

                +

                Note, however, that the results presented here were measured on a Graviton A1 +machine from AWS, which comes with a very serious word of warning: Graviton A1's +are virtual machines, and, as such, they are not suitable for benchmarking. If +someone has access to a beefy enough (16G) ARM64 server and is willing to give +us access to it, we are happy to redo the benchmarks on a real machine. One +major concern is that while a virtual CPU is 1-to-1 with a real CPU, it is not +clear to us how CPU caches are shared across virtual CPUs. Also, note that by no +means is this benchmark suite representative enough to average the results. Read +the numbers individually per benchmark.

                +

                The following graph shows the speedups on AArch64 of PyPy (hg id 2417f925ce94) compared to +CPython (2.7.15), as well as the speedups on an x86_64 Linux laptop +comparing the most recent release, PyPy 7.1.1, to CPython 2.7.16.

                + +
                + +

                In the majority of benchmarks, the speedups achieved on AArch64 match those +achieved on the x86_64 laptop. Over CPython, PyPy on AArch64 achieves speedups +between 0.6x and 44.9x. These speedups are comparable to x86_64, where the +numbers are between 0.6x and 58.9x.

                +

                The next graph compares the speedups achieved on AArch64 to the speedups +achieved on x86_64, i.e., how great the speedup is on AArch64 vs. the same +benchmark on x86_64. This comparison should give a rough idea about the +quality of the generated code for the new platform.

                + +
                + +

                Note that we see a large variance: There are generally three groups of +benchmarks - those that run at more or less the same speed, those that +run at 2x the speed, and those that run at 0.5x the speed of x86_64.

                +

                The variance and disparity are likely related to a variety of issues, mostly due +to differences in architecture. What is however interesting is that, compared +to measurements performed on older ARM boards, the branch predictor on the +Graviton A1 machine appears to have improved. As a result, the speedups achieved +by PyPy over CPython are smaller than on older ARM boards: sufficiently branchy +code, like CPython itself, simply runs a lot faster. Hence, the advantage +of the non-branchy code generated by PyPy's just-in-time compiler is smaller.

                +

                One takeaway here is that many possible improvements for PyPy have yet to be +implemented. This is true for both of the above platforms, but probably more so +for AArch64, which comes with a large number of CPU registers. The PyPy backend +was written with x86 (the 32-bit variant) in mind, which has a really low number +of registers. We think that we can improve in the area of emitting more modern +machine code, which may have a higher impact on AArch64 than on x86_64. There are +also a number of missing features in the AArch64 backend. These features are +currently implemented as expensive function calls instead of inlined native +instructions, something we intend to improve.

                +

                Best,

                +

                Maciej Fijalkowski, Armin Rigo and the PyPy team

                + +
                +
                +
                +
                +
                +
                + + Unknown wrote on 2019-07-25 18:59: +
                +
                +

                Hey - I can provide access to several flavors of beefy bare-metal arm64 hardware as part of the Works on Arm project, for your benchmark efforts.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2019-07-25 21:22: +
                +
                +

                Awesome! Send me an email - fijall at gmail

                +
                +
                +
                +
                + + Anonymous wrote on 2019-07-29 15:57: +
                +
                +
                Does this work well with pypy3 ? +
                +
                +
                +
                + + Armin Rigo wrote on 2019-07-29 20:02: +
                +
                +

                Yes, it works with any RPython-based interpreter (including pypy2 and pypy3).

                +
                +
                +
                + +

                PyPy 7.1.1 Bug Fix Release

                + +
                +
                +The PyPy team is proud to release a bug-fix release version 7.1.1 of PyPy, which +includes two different interpreters:
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.
                • +
                • PyPy3.6-beta: the second official release of PyPy to support 3.6 +features.
                • +
                +
                +
                +
                +
                +The interpreters are based on much the same codebase, thus the double +release.

                +This bugfix fixes bugs related to large lists, dictionaries, and sets, some corner cases with unicode, and PEP 3118 memory views of ctype structures. It also fixes a few issues related to the ARM 32-bit backend. For the complete list see the changelog.

                +You can download the v7.1.1 releases here:
                + +
                +
                +As always, this release is 100% compatible with the previous one and fixed +several issues and bugs raised by the growing community of PyPy users. +We strongly recommend updating.

                +The PyPy3.6 release is rapidly maturing, but is still considered beta-quality.

                +The PyPy team
                +
                +

                An RPython JIT for LPegs

                + +
                +

                The following is a guest post by Stefan Troost, in which he describes the work he did in his bachelor thesis:

                + +

                In this project we have used the RPython infrastructure to generate an RPython +JIT for a +less-typical use-case: string pattern matching. The work in this project is +based on Parsing Expression Grammars and +LPeg, an implementation of PEGs +designed to be used in Lua. In this post I will showcase some of the work that +went into this project, explain PEGs in general and LPeg in particular, and +show some benchmarking results.

                +

                +Parsing Expression Grammars

                +

                Parsing Expression Grammars (PEGs) are a type of formal grammar similar to +context-free grammars, with the main difference being that they are unambiguous. +This is achieved by redefining the ambiguous choice operator of CFGs (usually +noted as |) as an ordered choice operator. In practice this means that if a +rule in a PEG presents a choice, a PEG parser should prioritize the leftmost +choice. Practical uses include parsing and pattern-searching. In comparison to +regular expressions PEGs stand out as being able to be parsed in linear time, +being strictly more powerful than REs, as well as being arguably more readable.

                +

                +LPeg

                +

                LPeg is an implementation of PEGs written in C to be used in the Lua +programming language. A crucial detail of this implementation is that it parses +high level function calls, translating them to bytecode, and interpreting that +bytecode. Therefore, we are able to improve that implementation by replacing +LPeg's C interpreter with an RPython JIT. I use a modified version of LPeg to +parse PEGs and pass the generated Intermediate Representation, the LPeg +bytecode, to my VM.

                +

                +The LPeg Library

                +

                The LPeg Interpreter executes bytecodes created by parsing a string of commands +using the LPeg library. Our JIT supports a subset of the LPeg library, with +some of the more advanced or obscure features being left out. Note that this +subset is still powerful enough to do things like parse JSON.

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Operator | Description
                lpeg.P(string) | Matches string literally
                lpeg.P(n) | Matches exactly n characters
                lpeg.P(-n) | Matches only if fewer than n characters are left in the input
                lpeg.S(string) | Matches any character in string (Set)
                lpeg.R("xy") | Matches any character between x and y (Range)
                pattern^n | Matches at least n repetitions of pattern
                pattern^-n | Matches at most n repetitions of pattern
                pattern1 * pattern2 | Matches pattern1 followed by pattern2
                pattern1 + pattern2 | Matches pattern1 or pattern2 (ordered choice)
                pattern1 - pattern2 | Matches pattern1 if pattern2 does not match
                -pattern | Equivalent to ("" - pattern)
                +

                As a simple example, the pattern lpeg.P"ab"+lpeg.P"cd" would match either the +string ab or the string cd.

                +

                To extract semantic information from a pattern, captures are needed. These are +the following operations supported for capture creation.

                + + + + + + + + + + + + + + + +
                Operation | What it produces
                lpeg.C(pattern) | the match for pattern plus all captures made by pattern
                lpeg.Cp() | the current position (matches the empty string)
                +

                (tables taken from the LPeg documentation)

                +

                These patterns are translated into bytecode by LPeg, at which point we are able +to pass them into our own VM.

                +

                +The VM

                +

                The state of the VM at any point is defined by the following variables:

                +
                  +
                • +PC: program counter indicating the current instruction
                • +
                • +fail: an indicator that some match failed and the VM must backtrack
                • +
                • +index: counter indicating the current character of the input string
                • +
                • +stackentries: stack of return addresses and choice points
                • +
                • +captures: stack of capture objects
                • +
                +

                The execution of bytecode manipulates the values of these variables in order to +produce some output. How that works and what that output looks like will be +explained now.

                +

                +The Bytecode

                +

                For simplicity’s sake I will not go over every individual bytecode, but instead +choose some that exemplify the core concepts of the bytecode set.

                +

                +generic character matching bytecodes

                +
                  +
                • +

                  any: Checks if there are any characters left in the inputstring. If it succeeds +it advances the index and PC by 1, if not the bytecode fails.

                  +
                • +
                • +

                  char c: Checks if there is another character in the input and if that +character is equal to c. Otherwise the bytecode fails.

                  +
                • +
                • +

                  set c1-c2: Checks if there is another character in the input and if that +character is between (including) c1 and c2. Otherwise the bytecode fails.

                  +
                • +
                +

                These bytecodes are the easiest to understand with very little impact on the +VM. What it means for a bytecode to fail will be explained when +we get to control flow bytecodes.

                +

                To get back to the example, the first half of the pattern lpeg.P"ab" could be +compiled to the following bytecodes:

                +
                char a
                +char b
                +
                +

                +control flow bytecodes

                +
                  +
                • +

                  jmp n: Sets PC to n, effectively jumping to the n’th bytecode. Has no defined +failure case.

                  +
                • +
                • +

                  testchar c n: This is a lookahead bytecode. If the current character is equal +to c it advances the PC but not the index. Otherwise it jumps to n.

                  +
                • +
                • +

                  call n: Puts a return address (the current PC + 1) on the stackentries stack +and sets the PC to n. Has no defined failure case.

                  +
                • +
                • +

                  ret: Opposite of call. Removes the top value of the stackentries stack (if +the string of bytecodes is valid this will always be a return address) and +sets the PC to the removed value. Has no defined failure case.

                  +
                • +
                • +

                  choice n: Puts a choice point on the stackentries stack. Has no defined +failure case.

                  +
                • +
                • +

                  commit n: Removes the top value of the stackentries stack (if the string of +bytecodes is valid this will always be a choice point) and jumps to n. Has no +defined failure case.

                  +
                • +
                +

                Using testchar we can implement the full pattern lpeg.P"ab"+lpeg.P"cd" with +bytecode as follows:

                +
                testchar a -> L1
                +any
                +char b
                +end
                +any
                +L1: char c
                +char d
                +end
                +
                +

                The any bytecode is needed because testchar does not consume a character +from the input.

                +

                +Failure Handling, Backtracking and Choice Points

                +

                A choice point consists of the VM’s current index and capturestack as well as a +PC. This is not the VM’s PC at the time of creating the +choicepoint, but rather the PC where we should continue trying to find +matches when a failure occurs later.

                +

                Now that we have talked about choice points, we can talk about how the VM +behaves in the fail state. If the VM is in the fail state, it removes entries +from the stackentries stack until it finds a choice point. Then it backtracks +by restoring the VM to the state defined by the choice point. If no choice +point is found this way, no match was found in the string and the VM halts.

                +

                Using choice points we could implement the example lpeg.P"ab" + lpeg.P"cd" in +bytecodes in a different way (LPEG uses the simpler way shown above, but for +more complex patterns it can’t use the lookahead solution using testchar):

                +
                choice L1
                +char a
                +char b
                +commit
                +end
                +L1: char c
                +char d
                +end
                +
                +

                +Captures

                +

                Some patterns require the VM to produce more output than just “the pattern +matched” or “the pattern did not match”. Imagine searching a document for an +IPv4 address and all your program responded was “I found one”. In order to +receive additional information about our inputstring, captures are used.

                +

                +The capture object

                +

                In my VM, two types of capture objects are supported, one of them being the +position capture. It consists of a single index referencing the point in the +inputstring where the object was created.

                +

                The other type of capture object is called simplecapture. It consists of an +index and a size value, which are used to reference a substring of the +inputstring. In addition, simplecaptures have a variable status indicating they +are either open or full. If a simplecapture object is open, that means that its +size is not yet determined, since the pattern we are capturing is of variable +length.

                +

                Capture objects are created using the following bytecodes:

                +
                  +
                • +

                  Fullcapture Position: Pushes a positioncapture object with the current index +value to the capture stack.

                  +
                • +
                • +

                  Fullcapture Simple n: Pushes a simplecapture object with current index value +and size=n to the capture stack.

                  +
                • +
                • +

                  Opencapture Simple: Pushes an open simplecapture object with current index +value and undetermined size to the capture stack.

                  +
                • +
                • +

                  closecapture: Sets the top element of the capturestack to full and sets its +size value using the difference between the current index and the index of +the capture object.

                  +
                • +
                +

                +The RPython Implementation

                +

                These, and many more bytecodes were implemented in an RPython-interpreter. +By adding jit hints, we were able to generate an efficient JIT. +We will now take a closer look at some implementations of bytecodes.

                +
                ...
                +        elif instruction.name == "any":
                +            if index >= len(inputstring):
                +                fail = True
                +            else:
                +                pc += 1
                +                index += 1
                +
                +...
                +
                +

                The code for the any-bytecode is relatively straight-forward. It either +advances the pc and index or sets the VM into the fail state, +depending on whether the end of the inputstring has been reached or not.

                +
                ...
                +        if instruction.name == "char":
                +            if index >= len(inputstring):
                +                fail = True
                +            elif instruction.character == inputstring[index]:
                +                pc += 1
                +                index += 1
                +            else:
                +                fail = True
                +...
                +
                +

                The char-bytecode also looks as one would expect. If the VM’s string index is +out of range or the character comparison fails, the VM is put into the +fail state, otherwise the pc and index are advanced by 1. As you can see, the +character we’re comparing the current inputstring to is stored in the +instruction object (note that this code-example has been simplified for +clarity, since the actual implementation includes a jit-optimization that +allows the VM to execute multiple successive char-bytecodes at once).

                +
                ...
                +        elif instruction.name == "jmp":
                +            pc = instruction.goto
                +...
                +
                +

                The jmp-bytecode comes with a goto value which is a pc that we want +execution to continue at.

                +
                ...
                +        elif instruction.name == "choice":
                +            pc += 1
                +            choice_points = choice_points.push_choice_point(
                +                instruction.goto, index, captures)
                +...
                +
                +

                As we can see here, the choice-bytecode puts a choice point onto the stack that +may be backtracked to if the VM is in the fail-state. This choice point +consists of a pc to jump to which is determined by the bytecode. +But it also includes the current index and captures values at the time the choice +point was created. An ongoing topic of jit optimization is which data structure +is best suited to store choice points and return addresses. Besides naive +implementations of stacks and single-linked lists, more case-specific +structures are also being tested for performance.

                +

                +Benchmarking Result

                +

                In order to find out how much it helps to JIT LPeg patterns we ran a small +number of benchmarks. We used an otherwise idle Intel Core i5-2430M CPU with +3072 KiB of cache and 8 GiB of RAM, running with 2.40GHz. The machine was +running Ubuntu 14.04 LTS, Lua 5.2.3 and we used GNU grep 2.16 as a point of +comparison for one of the benchmarks. The benchmarks were run 100 times in +a new process each. We measured the full runtime of the called process, +including starting the process.

                +

                Now we will take a look at some plots generated by measuring the runtime of +different iterations of my JIT compared to lua and using bootstrapping to +generate a sampling distribution of mean values. The plots contain a few different +variants of pypeg, only the one called "fullops" is important for this blog post, however.

                + +
                + +

                This is the plot for a search pattern that searches a text file for valid URLs. +As we can see, if the input file is as small as 100 kb, the benefits of JIT +optimizations do not outweigh the time required to generate the +machine code. As a result, all of our attempts perform significantly slower +than LPeg.

                + +
                + +

                This is the plot for the same search pattern on a larger input file. As we can +see, for input files as small as 500 kb our VM already outperforms LPeg’s. An +ongoing goal of continued development is to get this lower boundary as small as +possible.

                + +
                + +

                The benefits of a JIT compared to an Interpreter become more and more relevant +for larger input files. Searching a file as large as 5 MB makes this fairly +obvious and is exactly the behavior we expect.

                + +
                + +

                This time we are looking at a different more complicated pattern, one that parses JSON used on a +50 kb input file. As expected, LPeg outperforms us, however, something +unexpected happens as we increase the filesize.

                + +
                + +

                Since LPeg has a defined maximum depth of 400 for the choicepoints and +returnaddresses Stack, LPeg by default refuses to parse files as small as +100kb. This raises the question if LPeg was intended to be used for parsing. +Until a way to increase LPeg’s maximum stack depth is found, no comparisons to +LPeg can be performed at this scale. This has been a low priority in the past +but may be addressed in the future.

                +

                To conclude, we see that at sufficiently high filesizes, our JIT outperforms +the native LPeg-interpreter. This lower boundary is currently as low as 500kb +in filesize.

                +

                +Conclusion

                +

                Writing a JIT for PEGs has proven itself to be a challenge worth pursuing, as +the expected benefits of a JIT compared to an Interpreter have been achieved. +Future goals include getting LPeg to be able to use parsing patterns on larger +files, further increasing the performance of our JIT and comparing it to other +well-known programs serving a similar purpose, like grep.

                +

                The prototype implementation that I described in this post can be found +on Github +(it's a bit of a hack in some places, though).

                +
                +

                PyPy v7.1 released; now uses utf-8 internally for unicode strings

                + +
                +
                +The PyPy team is proud to release version 7.1.0 of PyPy, which includes +two different interpreters:
                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7
                • +
                • PyPy3.6-beta: this is the second official release of PyPy to support 3.6 +features, although it is still considered beta quality.
                • +
                +
                +
                +The interpreters are based on much the same codebase, thus the double +release.

                +This release, coming fast on the heels of 7.0 in February, finally merges the +internal refactoring of unicode representation as UTF-8. Removing the +conversions from strings to unicode internally led to a nice speed bump. We merged the utf-8 changes to the py3.5 branch (Python3.5.3) but will concentrate on 3.6 going forward.

                +We also improved the ability to use the buffer protocol with ctype structures +and arrays.

                +The CFFI backend has been updated to version 1.12.2. We recommend using CFFI +rather than c-extensions to interact with C, and cppyy for interacting with +C++ code.
                + You can download the v7.1 releases here:
                + +
                +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.

                +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython’s JIT even better.
                +

                +What is PyPy? +

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                +We also welcome developers of other dynamic languages to see what RPython +can do for them.
                + +This PyPy release supports:
                  +
                +
                +
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • big- and little-endian variants of PPC64 running Linux
                • +
                •  ARM32 although we do not supply downloadable binaries at this time
                • +
                • +s390x running Linux
                • +
                +
                +

                +What else is new? +

                +PyPy 7.0 was released in February, 2019. +There are many incremental improvements to RPython and PyPy, for more information see the changelog.

                +Please update, and continue to help us make PyPy better.


                +Cheers, The PyPy team +
                +
                +
                +
                +
                +
                +
                +
                + + Anonymous wrote on 2019-03-28 09:52: +
                +
                +

                Hi,

                I get this error when trying to run my app with the new PyPy release (pypy 2.7 syntax on Windows):

                'C:\pypy2\lib_pypy\_sqlite3_cffi.pypy-41.pyd': The specified module could not be found


                The file specified in the error message (\lib_pypy\_sqlite3_cffi.pypy-41.pyd) is in the folder so whatever is missing is not quite so obvious.

                +
                +
                +
                +
                + + Noah F. San Tsorvutz wrote on 2019-03-29 14:27: +
                +
                +

                One question about using utf8 text encoding, internally.

                Is text handling code much different now, in PyPy, vs. cPython?

                If handling characters ( code points ) within the ASCII range
                is more like Python v.2.x, that would be very good news to
                at least one old fart who is having trouble even treating
                print as a function ...

                Thanks!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2019-03-31 08:00: +
                +
                +

                @Noah The answer is complicated because CPython changed its internals more than once. The current CPython 3.x stores unicode strings as an array of same-sized characters; if your string contains even one character over 0xffff then it's an array of 4 bytes for all the characters. Sometimes CPython *also* caches the UTF8 string, but doesn't use it much. The new PyPy is very different: it uses the UTF8 string *only*, and it works for both PyPy 2.7 or 3.x.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2019-03-31 08:04: +
                +
                +

                @Anonymous It works for me. Please open a bug report on https://bugs.pypy.org and give more details...

                +
                +
                +
                +
                + + Anonymous wrote on 2019-03-31 12:09: +
                +
                +

                Hi Armin,

                I can't log in to bugs.pypy.org but the problem is very easy to replicate, you only need to test this and it fails (v6.0.0 works fine but both v7.0.0 and 7.1.0 fail):

                try:
                import sqlite3
                except Exception as e:
                print str(e)

                The error is:
                'C:\pypy27v710\lib_pypy\_sqlite3_cffi.pypy-41.pyd': The specified module could not be found

                I've tested it on two different Win10 PCs (32bit PyPy on 64bit Win10) and both exhibit the same behaviour.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2019-03-31 16:29: +
                +
                +

                It is not so easy, because it works fine for me (win10 too). Please file a regular bug report. If you can't then we have another problem to solve first...

                +
                +
                +
                +
                + + Anonymous wrote on 2019-03-31 18:06: +
                +
                +

                Hi Armin,

                I've got the answer: With PyPy version >= 7.0.0 you have to add PyPy's root folder to PATH in Environment Variables, that wasn't required with versions <= 6.0.0

                +
                +
                +
                +
                + + Armin Rigo wrote on 2019-04-01 08:15: +
                +
                +

                https://foss.heptapod.net/pypy/pypy/-/issues/2988/windows-cant-find-_sqlite3_cffipypy-41pyd

                +
                +
                +
                +
                + + Anonymous wrote on 2019-04-02 19:10: +
                +
                +

                Hi Armin,

                Moving the dlls to lib_pypy is a nice easy workaround, thank you.

                And thanks to everybody in the PyPy team for their excellent work.

                +
                +
                +
                + +

                PyPy v7.0.0: triple release of 2.7, 3.5 and 3.6-alpha

                + +
                +
                +The PyPy team is proud to release the version 7.0.0 of PyPy, which includes +three different interpreters:
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7
                • +
                • PyPy3.5, which supports Python 3.5
                • +
                • PyPy3.6-alpha: this is the first official release of PyPy to support 3.6 +features, although it is still considered alpha quality.
                • +
                +
                +All the interpreters are based on much the same codebase, thus the triple +release.
                +Until we can work with downstream providers to distribute builds with PyPy, we +have made packages for some common packages available as wheels.
                +The GC hooks, which can be used to gain more insights into its +performance, have been improved and it is now possible to manually manage the +GC by using a combination of gc.disable and gc.collect_step. See the +GC blog post.
                +We updated the cffi module included in PyPy to version 1.12, and the +cppyy backend to 1.4. Please use these to wrap your C and C++ code, +respectively, for a JIT friendly experience.
                +As always, this release is 100% compatible with the previous one and fixed +several issues and bugs raised by the growing community of PyPy users. +We strongly recommend updating.
                +The PyPy3.6 release and the Windows PyPy3.5 release are still not production +quality so your mileage may vary. There are open issues with incomplete +compatibility and c-extension support.
                +The utf8 branch that changes internal representation of unicode to utf8 did not +make it into the release, so there is still more goodness coming. +You can download the v7.0 releases here:
                +https://pypy.org/download.html +
                +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.
                +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython's JIT even better.
                +

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.5 and 3.6. It's fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.
                +We also welcome developers of other dynamic languages to see what RPython +can do for them.
                +The PyPy release supports:
                +
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                +
                +Unfortunately at the moment of writing our ARM buildbots are out of service, +so for now we are not releasing any binary for the ARM architecture.
                +
                +

                +What else is new?

                +PyPy 6.0 was released in April, 2018. +There are many incremental improvements to RPython and PyPy, the complete listing is here.

                +Please update, and continue to help us make PyPy better.


                +Cheers, The PyPy team +
                +
                +
                +
                +
                +
                + + Anonymous wrote on 2019-02-11 20:18: +
                +
                +
                I would be very happy, if at some point request-html would work. Thank you for your great work.


                cheers
                Rob +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2019-02-11 22:06: +
                +
                +

                @Rob can you please file an issue with how we can reproduce the problem?

                +
                +
                +
                +
                + + Anonymous wrote on 2019-02-15 13:54: +
                +
                +

                requests-html seems to work with pypy 3.6 -v7.0, but the normal requests not.


                This Code works with cpython

                from requests_html import HTMLSession
                import requests

                def get_url():
                    session = HTMLSession()
                    #r = session.get('https://www.kernel.org/', verify='kernel.org.crt')
                    r = session.get('https://www.kernel.org/')
                    url = r.html.xpath('//*[@id="latest_link"]/a/@href')
                    return url[0]

                def download():
                    with open('last_stable_kernel.txt', 'rt') as last_kernel:
                        last_kernel = last_kernel.read()
                    url = get_url()
                    if url != last_kernel:
                        print('New kernel found !!!\n')
                        print('Downloading from this url: \n' + url )
                        res = requests.get(url, stream = True)
                        if res.status_code == requests.codes.ok: # Check the download
                            print('Download complete\n')
                        print('Writing file to disk.')
                        kernel = open('latest_kernel.tar.xz', 'wb')
                        for file in res.iter_content(1024):
                            kernel.write(file)
                        kernel.close()
                        with open('last_stable_kernel.txt','wt') as last_kernel:
                            last_kernel.write(url)
                        return True

                    else:
                        print('I have allready the newest kernel !')
                        return False

                if __name__ == "__main__":
                download()

                +
                +
                +
                +
                + + Anonymous wrote on 2019-02-15 14:01: +
                +
                +

                The pybench2.0 looks good. (except string mapping)


                Test minimum average operation overhead
                -------------------------------------------------------------------------------
                BuiltinFunctionCalls: 0ms 5ms 0.01us 0.005ms
                BuiltinMethodLookup: 0ms 1ms 0.00us 0.006ms
                CompareFloats: 0ms 1ms 0.00us 0.005ms
                CompareFloatsIntegers: 0ms 1ms 0.00us 0.003ms
                CompareIntegers: 0ms 1ms 0.00us 0.007ms
                CompareInternedStrings: 0ms 1ms 0.00us 0.023ms
                CompareLongs: 0ms 1ms 0.00us 0.004ms
                CompareStrings: 0ms 0ms 0.00us 0.016ms
                ComplexPythonFunctionCalls: 12ms 14ms 0.07us 0.007ms
                ConcatStrings: 0ms 1ms 0.00us 0.017ms
                CreateInstances: 8ms 12ms 0.11us 0.013ms
                CreateNewInstances: 8ms 13ms 0.16us 0.012ms
                CreateStringsWithConcat: 0ms 1ms 0.00us 0.014ms
                DictCreation: 11ms 13ms 0.03us 0.005ms
                DictWithFloatKeys: 48ms 50ms 0.06us 0.010ms
                DictWithIntegerKeys: 10ms 11ms 0.01us 0.016ms
                DictWithStringKeys: 11ms 13ms 0.01us 0.016ms
                ForLoops: 3ms 7ms 0.28us 0.003ms
                IfThenElse: 0ms 1ms 0.00us 0.012ms
                ListSlicing: 22ms 24ms 1.69us 0.004ms
                NestedForLoops: 9ms 10ms 0.01us 0.002ms
                NestedListComprehensions: 8ms 11ms 0.92us 0.002ms
                NormalClassAttribute: 5ms 6ms 0.01us 0.011ms
                NormalInstanceAttribute: 4ms 5ms 0.00us 0.022ms
                PythonFunctionCalls: 0ms 2ms 0.01us 0.007ms
                PythonMethodCalls: 59ms 66ms 0.29us 0.012ms
                Recursion: 6ms 7ms 0.15us 0.009ms
                SecondImport: 65ms 74ms 0.74us 0.003ms
                SecondPackageImport: 67ms 70ms 0.70us 0.003ms
                SecondSubmoduleImport: 89ms 92ms 0.92us 0.004ms
                SimpleComplexArithmetic: 0ms 1ms 0.00us 0.007ms
                SimpleDictManipulation: 12ms 16ms 0.01us 0.008ms
                SimpleFloatArithmetic: 0ms 1ms 0.00us 0.010ms
                SimpleIntFloatArithmetic: 0ms 1ms 0.00us 0.010ms
                SimpleIntegerArithmetic: 0ms 1ms 0.00us 0.010ms
                SimpleListComprehensions: 6ms 9ms 0.72us 0.003ms
                SimpleListManipulation: 3ms 5ms 0.00us 0.011ms
                SimpleLongArithmetic: 0ms 1ms 0.00us 0.007ms
                SmallLists: 3ms 4ms 0.01us 0.007ms
                SmallTuples: 0ms 1ms 0.00us 0.007ms
                SpecialClassAttribute: 5ms 6ms 0.01us 0.011ms
                SpecialInstanceAttribute: 4ms 5ms 0.00us 0.022ms
                StringMappings: 838ms 846ms 3.36us 0.017ms
                StringPredicates: 5ms 6ms 0.01us 0.144ms
                StringSlicing: 0ms 1ms 0.00us 0.019ms
                TryExcept: 0ms 0ms 0.00us 0.012ms
                TryFinally: 0ms 2ms 0.01us 0.007ms
                TryRaiseExcept: 0ms 1ms 0.01us 0.009ms
                TupleSlicing: 36ms 38ms 0.15us 0.003ms
                WithFinally: 0ms 2ms 0.01us 0.007ms
                WithRaiseExcept: 0ms 1ms 0.02us 0.013ms
                -------------------------------------------------------------------------------
                Totals: 1359ms 1461ms



                Best regards
                Rob

                +
                +
                +
                + +

                Düsseldorf Sprint Report 2019

                + +
                +

                Hello everyone!

                +

                We are happy to report a successful and well attended sprint that is wrapping up +in Düsseldorf, Germany. In the last week we had eighteen people sprinting +at the Heinrich-Heine-Universität Düsseldorf on various topics.

                +
                +

                Totally serious work going on here constantly.

                +
                +

                A big +chunk of the sprint was dedicated to various discussions, since we did not +manage to gather the core developers in one room in quite a while. +Discussion topics included:

                +
                  +
                • Funding and general sustainability of open source.
                • +
                • Catching up with CPython 3.7/3.8 – we are planning to release 3.6 some time +in the next few months and we will continue working on 3.7/3.8.
                • +
                • What to do with VMprof
                • +
                • How can we support Cython inside PyPy in a way that will be understood +by the JIT, hence fast.
                • +
                • The future of supporting the numeric stack on pypy – we have made significant +progress in the past few years and most of the numeric stack works out of the box, +but deployment and performance remain problems. Improving on those problems +remains a very important focus for PyPy as a project.
                • +
                • Using the presence of a CPython developer (Łukasz Langa) and a Graal Python developer +(Tim Felgentreff) we discussed ways to collaborate in order to improve Python +ecosystem across implementations.
                • +
                • Pierre-Yves David and Georges Racinet from octobus gave us an exciting demo +on Heptapod, which adds mercurial support to gitlab.
                • +
                • Maciej and Armin gave demos of their current (non-PyPy-related) project VRSketch.
                • +
                +
                + +

                Visiting the Landschaftspark Duisburg Nord on the break day

                +
                + +

                Some highlights of the coding tasks worked on:

                +
                  +
                • Aarch64 (ARM64) JIT backend work has been started, we are able to run the first +test! Tobias Oberstein from Crossbar GmbH and Rodolph Perfetta from ARM joined the +sprint to help kickstart the project.
                • +
                • The long running math-improvements branch that was started by Stian Andreassen got merged +after bugfixes done by Alexander Schremmer. It should improve operations on large integers.
                • +
                • The arcane art of necromancy was used to revive the long-dormant regalloc branch started +and nearly finished by Carl Friedrich Bolz-Tereick. The branch got merged and gives +some modest speedups across the board.
                • +
                • Andrew Lawrence worked on MSI installer for PyPy on windows.
                • +
                • Łukasz worked on improving failing tests on the PyPy 3.6 branch. He knows very obscure +details of CPython (e.g. how pickling works), hence we managed to progress very quickly.
                • +
                • Matti Picus set up a new benchmarking server for PyPy 3 branches.
                • +
                • The Utf8 branch, which changes the internal representation of unicode might be finally +merged at some point very soon. We discussed and improved upon the last few +blockers. It gives significant speedups in a lot of cases handling strings.
                • +
                • Zlib was missing a couple of methods, which were added by Ronan Lamy and Julian Berman.
                • +
                • Manuel Jacob fixed RevDB failures.
                • +
                • Antonio Cuni and Matti Picus worked on 7.0 release which should happen in a few days.
                • +
                +

                Now we are all quite exhausted, and are looking forward to catching up on sleep.

                +

                Best regards, +Maciej Fijałkowski, Carl Friedrich Bolz-Tereick and the whole PyPy team.

                +
                +
                +
                +
                + + Juan Luis Cano wrote on 2019-02-09 18:19: +
                +
                +

                Congratulations for the sprint, folks! Any plans to leverage the manylinux2010 infrastructure and about producing PyPy compatible wheels soon?

                +
                +
                +
                +
                + + Anonymous wrote on 2019-02-10 15:29: +
                +
                +

                Nice work, looking forward to Python 3.6 and beyond! Is there anywhere to view the Python 3 benchmarks like there is for PyPy2?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2019-02-11 08:22: +
                +
                +

                Hi Juan! Yes, we are going to work on manylinux2010 support to have PyPy wheels soon.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2019-02-11 08:24: +
                +
                +

                @Anonymous yes, being able to view PyPy3 benchmarking results is the goal of the new benchmarking server, will still take a bit of work to hook everything up.

                +
                +
                +
                + +

                PyPy for low-latency systems

                + +
                +

                +PyPy for low-latency systems

                +Recently I have merged the gc-disable branch, introducing a couple of features +which are useful when you need to respond to certain events with the lowest +possible latency. This work has been kindly sponsored by Gambit Research +(which, by the way, is a very cool and geeky place where to work, in case you +are interested). Note also that this is a very specialized use case, so these +features might not be useful for the average PyPy user, unless you have the +same problems as described here.

                +The PyPy VM manages memory using a generational, moving Garbage Collector. +Periodically, the GC scans the whole heap to find unreachable objects and +frees the corresponding memory. Although at a first look this strategy might +sound expensive, in practice the total cost of memory management is far less +than e.g. on CPython, which is based on reference counting. While maybe +counter-intuitive, the main advantage of a non-refcount strategy is +that allocation is very fast (especially compared to malloc-based allocators), +and deallocation of objects which die young is basically for free. More +information about the PyPy GC is available here.

                +As we said, the total cost of memory management is less on PyPy than on +CPython, and it's one of the reasons why PyPy is so fast. However, one big +disadvantage is that while on CPython the cost of memory management is spread +all over the execution of the program, on PyPy it is concentrated into GC +runs, causing observable pauses which interrupt the execution of the user +program.
                +To avoid excessively long pauses, the PyPy GC has been using an incremental +strategy since 2013. The GC runs as a series of "steps", letting the user +program make progress between steps.

                +The following chart shows the behavior of a real-world, long-running process:
                + +
                +

                +The orange line shows the total memory used by the program, which +increases linearly while the program progresses. Every ~5 minutes, the GC +kicks in and the memory usage drops from ~5.2GB to ~2.8GB (this ratio is controlled +by the PYPY_GC_MAJOR_COLLECT env variable).
                +The purple line shows aggregated data about the GC timing: the whole +collection takes ~1400 individual steps over the course of ~1 minute: each +point represents the maximum time a single step took during the past 10 +seconds. Most steps take ~10-20 ms, although we see a horrible peak of ~100 ms +towards the end. We have not yet investigated what causes it, but we +suspect it is related to the deallocation of raw objects.

                +These multi-millisecond pauses are a problem for systems where it is important +to respond to certain events with a latency which is both low and consistent. +If the GC kicks in at the wrong time, it might cause unacceptable pauses during +the collection cycle.

                +Let's look again at our real-world example. This is a system which +continuously monitors an external stream; when a certain event occurs, we want +to take an action. The following chart shows the maximum time it takes to +complete one of such actions, aggregated every minute:

                + +
                +
                +You can clearly see that the baseline response time is around ~20-30 +ms. However, we can also see periodic spikes around ~50-100 ms, with peaks up +to ~350-450 ms! After a bit of investigation, we concluded that most (although +not all) of the spikes were caused by the GC kicking in at the wrong time.

                +The work I did in the gc-disable branch aims to fix this problem by +introducing two new features to the gc module:
                +
                  +
                • +gc.disable(), which previously only inhibited the execution of +finalizers without actually touching the GC, now disables the GC major +collections. After a call to it, you will see the memory usage grow +indefinitely.
                • +
                • +gc.collect_step() is a new function which you can use to manually +execute a single incremental GC collection step.
                • +
                +
                +It is worth specifying that gc.disable() disables only the major +collections, while minor collections still run. Moreover, thanks to the +JIT's virtuals, many objects with a short and predictable lifetime are not +allocated at all. The end result is that most objects with short lifetime are +still collected as usual, so the impact of gc.disable() on memory growth +is not as bad as it could sound.

                +Combining these two functions, it is possible to take control of the GC to +make sure it runs only when it is acceptable to do so. For an example of +usage, you can look at the implementation of a custom GC inside pypytools. +The peculiarity is that it also defines a "with nogc():" context manager +which you can use to mark performance-critical sections where the GC is not +allowed to run.

                +The following chart compares the behavior of the default PyPy GC and the new +custom GC, after a careful placing of nogc() sections:

                + +
                +
                +The yellow line is the same as before, while the purple line shows the new +system: almost all spikes have gone, and the baseline performance is about 10% +better. There is still one spike towards the end, but after some investigation +we concluded that it was not caused by the GC.

                +Note that this does not mean that the whole program became magically +faster: we simply moved the GC pauses in some other place which is not +shown in the graph: in this specific use case this technique was useful +because it allowed us to shift the GC work in places where pauses are more +acceptable.

                +All in all, a pretty big success, I think. These functionalities are already +available in the nightly builds of PyPy, and will be included in the next +release: take this as a New Year present :)

                +Antonio Cuni and the PyPy team +
                +
                +
                +
                + + stuaxo wrote on 2019-01-08 18:47: +
                +
                +

                Could see this being handy for python game libraries too.

                +
                +
                +
                +
                + + samantha wrote on 2019-01-08 22:40: +
                +
                +

                I am a bit surprised as these functions have been available for a long time in python gc module. So I suppose the news is a better performing one in pypy?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2019-01-09 02:46: +
                +
                +

                @samantha: ``gc.collect_step()`` is new.

                +
                +
                +
                + +

                PyPy Winter Sprint Feb 4-9 in Düsseldorf

                + +
                +
                +
                +
                +

                + PyPy Sprint February 4th-9th 2019 in Düsseldorf

                +
                +
                +The next PyPy sprint will be held in the Computer Science department of Heinrich-Heine Universität Düsseldorf from the 4th to the 9th of February 2019 (nine years after the last sprint there). This is a fully public sprint, everyone is welcome to join us.
                +

                +Topics and goals

                +
                +
                +
                  +
                • improve Python 3.6 support
                • +
                • discuss benchmarking situation
                • +
                • progress on utf-8 branches
                • +
                • cpyext performance and completeness
                • +
                • packaging: are we ready to upload to PyPI?
                • +
                    +
                  • +issue 2617  - we expose too many functions from lib-pypy.so
                  • +
                  • +manylinux2010 - will it solve our build issues?
                  • +
                  • formulate an ABI name and upgrade policy
                  • +
                  +
                +
                  +
                • +memoryview(ctypes.Structure) does not create the correct format string
                • +
                • discussing the state and future of PyPy and the wider Python ecosystem
                • +
                +
                +
                +

                +Location

                +
                +The sprint will take place in seminar room 25.12.02.55 of the computer science department.  It is in the building 25.12 of the university campus, second floor. Travel instructions
                +
                +

                +Exact times

                +
                +Work days: starting February 4th (10:00), ending February 9th (~afternoon). The break day will probably be Thursday.
                +

                +Registration

                +
                +
                +Please register by Mercurial::
                +https://bitbucket.org/pypy/extradoc/
                +
                +https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/ddorf2019/people.txt

                +or on the pypy-dev mailing list if you do not yet have check-in rights:
                +
                + +
                +
                +
                +
                +Looking forward to seeing everyone there!
                +
                +
                +
                +
                +
                + + Anonymous wrote on 2018-12-27 13:00: +
                +
                +

                The travel instructions link is a redirect to a 404 page.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2018-12-27 13:33: +
                +
                +

                Thanks! Fixed.

                +
                +
                +
                + +

                Funding for 64-bit Armv8-a support in PyPy

                + +
                +
                + +

                Hello everyone

                + +

                At PyPy we are trying to support a relatively wide range of platforms. We have PyPy working on OS X, Windows and various flavors of linux (and unofficially various flavors of BSD) on the software side, with hardware side having x86, x86_64, PPC, 32-bit Arm (v7) and even zarch. This is harder than for other projects, since PyPy emits assembler on the fly from the just in time compiler and it requires a significant amount of work to port it to a new platform.

                + +

                We are pleased to announce that Arm Limited, together with Crossbar.io GmbH, are sponsoring the development of 64-bit Armv8-a architecture support through Baroque Software OU, which would allow PyPy to run on a new variety of low-power, high-density servers with that architecture. We believe this will be beneficial for the funders, for the PyPy project as well as for the wider community.

                + +

                The work will commence soon and will be done some time early next year with expected speedups either comparable to x86 speedups or, if our current experience with ARM holds, more significant than x86 speedups.

                + +

                Best,
                +Maciej Fijalkowski and the PyPy team

                +
                +
                +
                +
                +
                +
                + + GG boy wrote on 2018-12-01 13:59: +
                +
                +

                Good job

                +
                +
                +
                +
                + + Mahmoud Hashemi wrote on 2018-12-09 19:44: +
                +
                +

                Nice! Congrats!

                +
                +
                +
                + +

                Guest Post: Implementing a Calculator REPL in RPython

                + +
                +

                This is a tutorial style post that walks through using the RPython translation +toolchain to create a REPL that executes basic math expressions.

                + +

                We will do that by scanning the user's input into tokens, compiling those +tokens into bytecode and running that bytecode in our own virtual machine. Don't +worry if that sounds horribly complicated, we are going to explain it step by +step.

                + +

                This post is a bit of a diversion while on my journey to create a compliant +lox implementation +using the RPython translation toolchain. The +majority of this work is a direct RPython translation of the low level C +guide from Bob Nystrom (@munificentbob) in the +excellent book craftinginterpreters.com +specifically the chapters 14 – 17.

                + +

                The road ahead

                + +

                As this post is rather long I'll break it into a few major sections. In each section we will +have something that translates with RPython, and at the end it all comes together.

                + + +

                A REPL

                + +

                So if you're a Python programmer you might be thinking this is pretty trivial right?

                + +

                I mean if we ignore input errors, injection attacks etc couldn't we just do something +like this:

                + +
                """
                +A pure python REPL that can parse simple math expressions
                +"""
                +while True:
                +    print(eval(raw_input("> ")))
                +
                + +

                Well it does appear to do the trick:

                + +
                $ python2 section-1-repl/main.py
                +> 3 + 4 * ((1.0/(2 * 3 * 4)) + (1.0/(4 * 5 * 6)) - (1.0/(6 * 7 * 8)))
                +3.1880952381
                +
                + +

                So can we just ask RPython to translate this into a binary that runs magically +faster?

                + +

                Let's see what happens. We need to add two functions for RPython to +get its bearings (entry_point and target) and call the file targetXXX:

                + +

                targetrepl1.py

                + +
                def repl():
                +    while True:
                +        print eval(raw_input('> '))
                +
                +
                +def entry_point(argv):
                +    repl()
                +    return 0
                +
                +
                +def target(driver, *args):
                +    return entry_point, None
                +
                + +

                Which at translation time gives us this admonishment that accurately tells us +we are trying to call a Python built-in raw_input that is unfortunately not +valid RPython.

                + +
                $ rpython ./section-1-repl/targetrepl1.py
                +...SNIP...
                +[translation:ERROR] AnnotatorError: 
                +
                +object with a __call__ is not RPython: <built-in function raw_input>
                +Processing block:
                + block@18 is a <class 'rpython.flowspace.flowcontext.SpamBlock'> 
                + in (target1:2)repl 
                + containing the following operations: 
                +       v0 = simple_call((builtin_function raw_input), ('> ')) 
                +       v1 = simple_call((builtin_function eval), v0) 
                +       v2 = str(v1) 
                +       v3 = simple_call((function rpython_print_item), v2) 
                +       v4 = simple_call((function rpython_print_newline)) 
                +
                + +

                Ok so we can't use raw_input or eval but that doesn't faze us. Let's get +the input from a stdin stream and just print it out (no evaluation).

                + +

                targetrepl2.py

                + +
                from rpython.rlib import rfile
                +
                +LINE_BUFFER_LENGTH = 1024
                +
                +
                +def repl(stdin):
                +    while True:
                +        print "> ",
                +        line = stdin.readline(LINE_BUFFER_LENGTH)
                +        print line
                +
                +
                +def entry_point(argv):
                +    stdin, stdout, stderr = rfile.create_stdio()
                +    try:
                +        repl(stdin)
                +    except:
                +        return 0
                +
                +
                +def target(driver, *args):
                +    return entry_point, None
                +
                + +

                Translate targetrepl2.py – we can add an optimization level if we +are so inclined:

                + +
                $ rpython --opt=2 section-1-repl/targetrepl2.py
                +...SNIP...
                +[Timer] Timings:
                +[Timer] annotate                       ---  1.2 s
                +[Timer] rtype_lltype                   ---  0.9 s
                +[Timer] backendopt_lltype              ---  0.6 s
                +[Timer] stackcheckinsertion_lltype     ---  0.0 s
                +[Timer] database_c                     --- 15.0 s
                +[Timer] source_c                       ---  1.6 s
                +[Timer] compile_c                      ---  1.9 s
                +[Timer] =========================================
                +[Timer] Total:                         --- 21.2 s
                +
                + +

                No errors!? Let's try it out:

                + +
                $ ./target2-c 
                +1 + 2
                +>  1 + 2
                +
                +^C
                +
                + +

                Ahh our first success – let's quickly deal with the flushing fail by using the +stdout stream directly as well. Let's print out the input in quotes:

                + +
                from rpython.rlib import rfile
                +
                +LINE_BUFFER_LENGTH = 1024
                +
                +
                +def repl(stdin, stdout):
                +    while True:
                +        stdout.write("> ")
                +        line = stdin.readline(LINE_BUFFER_LENGTH)
                +        print '"%s"' % line.strip()
                +
                +
                +def entry_point(argv):
                +    stdin, stdout, stderr = rfile.create_stdio()
                +    try:
                +        repl(stdin, stdout)
                +    except:
                +        pass
                +    return 0
                +
                +
                +def target(driver, *args):
                +    return entry_point, None
                +
                + +

                Translation works, and the test run too:

                + +
                $ ./target3-c 
                +> hello this seems better
                +"hello this seems better"
                +> ^C
                +
                + +

                So we are in a good place with taking user input and printing output... What about +the whole math evaluation thing we were promised? For that we can probably leave +our RPython REPL behind for a while and connect it up at the end.

                + +

                A virtual machine

                + +

                A virtual machine is the execution engine of our basic math interpreter. It will be very simple, +only able to do simple tasks like addition. I won't go into any depth to describe why we want +a virtual machine, but it is worth noting that many languages including Java and Python make +this decision to compile to an intermediate bytecode representation and then execute that with +a virtual machine. Alternatives are compiling directly to native machine code like (earlier versions of) the V8 +JavaScript engine, or at the other end of the spectrum executing an abstract syntax tree – +which is what the Truffle approach to building VMs is based on.

                + +

                We are going to keep things very simple. We will have a stack where we can push and pop values, +we will only support floats, and our VM will only implement a few very basic operations.

                + +

                OpCodes

                + +

                In fact our entire instruction set is:

                + +
                OP_CONSTANT
                +OP_RETURN
                +OP_NEGATE
                +OP_ADD
                +OP_SUBTRACT
                +OP_MULTIPLY
                +OP_DIVIDE
                +
                + +

                Since we are targeting RPython we can't use the nice enum module from the Python standard +library, so instead we just define a simple class with class attributes.

                + +

                We should start to get organized, so we will create a new file +opcodes.py and add this:

                + +
                class OpCode:
                +    OP_CONSTANT = 0
                +    OP_RETURN = 1
                +    OP_NEGATE = 2
                +    OP_ADD = 3
                +    OP_SUBTRACT = 4
                +    OP_MULTIPLY = 5
                +    OP_DIVIDE = 6
                +
                + +

                Chunks

                + +

                To start with we need to get some infrastructure in place before we write the VM engine.

                + +

                Following craftinginterpreters.com +we start with a Chunk object which will represent our bytecode. In RPython we have access +to Python-esque lists so our code object will just be a list of OpCode values – which are +just integers. A list of ints, couldn't get much simpler.

                + +

                section-2-vm/chunk.py

                + +
                class Chunk:
                +    code = None
                +
                +    def __init__(self):
                +        self.code = []
                +
                +    def write_chunk(self, byte):
                +        self.code.append(byte)
                +
                +    def disassemble(self, name):
                +        print "== %s ==\n" % name
                +        i = 0
                +        while i < len(self.code):
                +            i = disassemble_instruction(self, i)
                +
                + +

                From here on I'll only present minimal snippets of code instead of the whole lot, but +I'll link to the repository with the complete example code. For example the +various debugging helpers, including disassemble_instruction, aren't particularly interesting +to include verbatim. See the github repo for full details.

                + +

                We need to check that we can create a chunk and disassemble it. The quickest way to do this +is to use Python during development and debugging then every so often try to translate it.

                + +

                Getting the disassemble part through the RPython translator was a hurdle for me as I +quickly found that many str methods such as format are not supported, and only very basic +% based formatting is supported. I ended up creating helper functions for string manipulation +such as:

                + +
                def leftpad_string(string, width, char=" "):
                +    l = len(string)
                +    if l > width:
                +        return string
                +    return char * (width - l) + string
                +
                + +

                Let's write a new entry_point that creates and disassembles a chunk of bytecode. We can +set the target output name to vm1 at the same time:

                + +

                targetvm1.py

                + +
                def entry_point(argv):
                +    bytecode = Chunk()
                +    bytecode.write_chunk(OpCode.OP_ADD)
                +    bytecode.write_chunk(OpCode.OP_RETURN)
                +    bytecode.disassemble("hello world")
                +    return 0
                +
                +def target(driver, *args):
                +    driver.exe_name = "vm1"
                +    return entry_point, None
                +
                + +

                Running this isn't going to be terribly interesting, but it is always nice to +know that it is doing what you expect:

                + +
                $ ./vm1 
                +== hello world ==
                +
                +0000 OP_ADD       
                +0001 OP_RETURN    
                +
                + +

                Chunks of data

                + +

                Ref: https://www.craftinginterpreters.com/chunks-of-bytecode.html#constants

                + +

                So our bytecode is missing a very crucial element – the values to operate on!

                + +

                As with the bytecode we can store these constant values as part of the chunk +directly in a list. Each chunk will therefore have a constant data component, +and a code component.

                + +

                Edit the chunk.py file and add the new instance attribute constants as an +empty list, and a new method add_constant.

                + +
                    def add_constant(self, value):
                +        self.constants.append(value)
                +        return len(self.constants) - 1
                +
                + +

                Now to use this new capability we can modify our example chunk +to write in some constants before the OP_ADD:

                + +
                    bytecode = Chunk()
                +    constant = bytecode.add_constant(1.0)
                +    bytecode.write_chunk(OpCode.OP_CONSTANT)
                +    bytecode.write_chunk(constant)
                +
                +    constant = bytecode.add_constant(2.0)
                +    bytecode.write_chunk(OpCode.OP_CONSTANT)
                +    bytecode.write_chunk(constant)
                +
                +    bytecode.write_chunk(OpCode.OP_ADD)
                +    bytecode.write_chunk(OpCode.OP_RETURN)
                +
                +    bytecode.disassemble("adding constants")
                +
                + +

                Which still translates with RPython and when run gives us the following disassembled +bytecode:

                + +
                == adding constants ==
                +
                +0000 OP_CONSTANT  (00)        '1'
                +0002 OP_CONSTANT  (01)        '2'
                +0004 OP_ADD       
                +0005 OP_RETURN
                +
                + +

                We won't go down the route of serializing the bytecode to disk, but this bytecode chunk +(including the constant data) could be saved and executed on our VM later – like a Java +.class file. Instead we will pass the bytecode directly to our VM after we've created +it during the compilation process.

                + +

                Emulation

                + +

                So those four instructions of bytecode combined with the constant value mapping +00 -> 1.0 and 01 -> 2.0 describe individual steps for our virtual machine +to execute. One major point in favor of defining our own bytecode is we can +design it to be really simple to execute – this makes the VM really easy to implement.

                + +

                As I mentioned earlier this virtual machine will have a stack, so let's begin with that. +Now the stack is going to be a busy little beast – as our VM takes instructions like +OP_ADD it will pop off the top two values from the stack, and push the result of adding +them together back onto the stack. Although dynamically resizing Python lists +are marvelous, they can be a little slow. RPython can take advantage of a constant sized +list which doesn't make our code much more complicated.

                + +

                To do this we will define a constant sized list and track the stack_top directly. Note +how we can give the RPython translator hints by adding assertions about the state that +the stack_top will be in.

                + +
                class VM(object):
                +    STACK_MAX_SIZE = 256
                +    stack = None
                +    stack_top = 0
                +
                +    def __init__(self):
                +        self._reset_stack()
                +
                +    def _reset_stack(self):
                +        self.stack = [0] * self.STACK_MAX_SIZE
                +        self.stack_top = 0
                +
                +    def _stack_push(self, value):
                +        assert self.stack_top < self.STACK_MAX_SIZE
                +        self.stack[self.stack_top] = value
                +        self.stack_top += 1
                +
                +    def _stack_pop(self):
                +        assert self.stack_top >= 0
                +        self.stack_top -= 1
                +        return self.stack[self.stack_top]
                +
                +    def _print_stack(self):
                +        print "         ",
                +        if self.stack_top <= 0:
                +            print "[]",
                +        else:
                +            for i in range(self.stack_top):
                +                print "[ %s ]" % self.stack[i],
                +        print
                +
                + +

                Now we get to the main event, the hot loop, the VM engine. Hope I haven't built it up too +much, it is actually really simple! We loop until the instructions tell us to stop +(OP_RETURN), and dispatch to other simple methods based on the instruction.

                + +
                    def _run(self):
                +        while True:
                +            instruction = self._read_byte()
                +
                +            if instruction == OpCode.OP_RETURN:
                +                print "%s" % self._stack_pop()
                +                return InterpretResultCode.INTERPRET_OK
                +            elif instruction == OpCode.OP_CONSTANT:
                +                constant = self._read_constant()
                +                self._stack_push(constant)
                +            elif instruction == OpCode.OP_ADD:
                +                self._binary_op(self._stack_add)    
                +
                + +

                Now the _read_byte method will have to keep track of which instruction we are up +to. So add an instruction pointer (ip) to the VM with an initial value of 0. +Then _read_byte is simply getting the next bytecode (int) from the chunk's code:

                + +
                    def _read_byte(self):
                +        instruction = self.chunk.code[self.ip]
                +        self.ip += 1
                +        return instruction
                +
                + +

                + +

                If the instruction is OP_CONSTANT we take the constant's address from the next byte +of the chunk's code, retrieve that constant value and add it to the VM's stack.

                + +
                    def _read_constant(self):
                +        constant_index = self._read_byte()
                +        return self.chunk.constants[constant_index]
                +
                + +

                Finally our first arithmetic operation OP_ADD, what it has to achieve doesn't +require much explanation: pop two values from the stack, add them together, push +the result. But since a few operations all have the same template we introduce a +layer of indirection – or abstraction – by introducing a reusable _binary_op +helper method.

                + +
                    @specialize.arg(1)
                +    def _binary_op(self, operator):
                +        op2 = self._stack_pop()
                +        op1 = self._stack_pop()
                +        result = operator(op1, op2)
                +        self._stack_push(result)
                +
                +    @staticmethod
                +    def _stack_add(op1, op2):
                +        return op1 + op2
                +
                + +

                + +

                Note we tell RPython to specialize _binary_op on the first argument. This causes +RPython to make a copy of _binary_op for every value of the first argument passed, +which means that each copy contains a call to a particular operator, which can then be +inlined.

                + +

                To be able to run our bytecode the only thing left to do is to pass in the chunk +and call _run():

                + +
                    def interpret_chunk(self, chunk):
                +        if self.debug_trace:
                +            print "== VM TRACE =="
                +        self.chunk = chunk
                +        self.ip = 0
                +        try:
                +            result = self._run()
                +            return result
                +        except:
                +            return InterpretResultCode.INTERPRET_RUNTIME_ERROR
                +
                + +

                targetvm3.py connects the pieces:

                + +
                def entry_point(argv):
                +    bytecode = Chunk()
                +    constant = bytecode.add_constant(1)
                +    bytecode.write_chunk(OpCode.OP_CONSTANT)
                +    bytecode.write_chunk(constant)
                +    constant = bytecode.add_constant(2)
                +    bytecode.write_chunk(OpCode.OP_CONSTANT)
                +    bytecode.write_chunk(constant)
                +    bytecode.write_chunk(OpCode.OP_ADD)
                +    bytecode.write_chunk(OpCode.OP_RETURN)
                +
                +    vm = VM()
                +    vm.interpret_chunk(bytecode)
                +
                +    return 0
                +
                + +

                I've added some trace debugging so we can see what the VM and stack is doing.

                + +

                The whole thing translates with RPython, and when run gives us:

                + +
                ./vm3
                +== VM TRACE ==
                +          []
                +0000 OP_CONSTANT  (00)        '1'
                +          [ 1 ]
                +0002 OP_CONSTANT  (01)        '2'
                +          [ 1 ] [ 2 ]
                +0004 OP_ADD       
                +          [ 3 ]
                +0005 OP_RETURN    
                +3
                +
                + +

                Yes we just computed the result of 1+2. Pat yourself on the back.

                + +

                At this point it is probably valid to check that the translated executable is actually +faster than running our program directly in Python. For this trivial example under +Python2/pypy this targetvm3.py file runs in the 20ms – 90ms region, and the +compiled vm3 runs in <5ms. Something useful must be happening during the translation.

                + +

                I won't go through the code adding support for our other instructions as they are +very similar and straightforward. Our VM is ready to execute our chunks of bytecode, +but we haven't yet worked out how to take the entered expression and turn that into +this simple bytecode. This is broken into two steps, scanning and compiling.

                + +

                Scanning the source

                + +

                All the source for this section can be found in +section-3-scanning.

                + +

                The job of the scanner is to take the raw expression string and transform it into +a sequence of tokens. This scanning step will strip out whitespace and comments, +catch errors with invalid tokens and tokenize the string. For example the input +"( 1 + 2 )" would get tokenized into LEFT_PAREN, NUMBER(1), PLUS, NUMBER(2), RIGHT_PAREN.

                + +

                As with our OpCodes we will just define a simple Python class to define an int +for each type of token:

                + +
                class TokenTypes:
                +    ERROR = 0
                +    EOF = 1
                +    LEFT_PAREN = 2
                +    RIGHT_PAREN = 3
                +    MINUS = 4
                +    PLUS = 5
                +    SLASH = 6
                +    STAR = 7
                +    NUMBER = 8
                +
                + +

                A token has to keep some other information as well – keeping track of the location and +length of the token will be helpful for error reporting. The NUMBER token clearly needs +some data about the value it is representing: we could include a copy of the source lexeme +(e.g. the string 2.0), or parse the value and store that, or – what we will do in this +blog – use the location and length information as pointers into the original source +string. Every token type (except perhaps ERROR) will use this simple data structure:

                + +
                class Token(object):
                +
                +    def __init__(self, start, length, token_type):
                +        self.start = start
                +        self.length = length
                +        self.type = token_type
                +
                + +

                Our soon to be created scanner will create these Token objects which refer back to +addresses in some source. If the scanner sees the source "( 1 + 2.0 )" it would emit +the following tokens:

                + +
                Token(0, 1, TokenTypes.LEFT_PAREN)
                +Token(2, 1, TokenTypes.NUMBER)
                +Token(4, 1, TokenTypes.PLUS)
                +Token(6, 3, TokenTypes.NUMBER)
                +Token(10, 1, TokenTypes.RIGHT_PAREN)
                +
                + +

                Scanner

                + +

                Let's walk through the scanner implementation method +by method. The scanner will take the source and pass through it once, creating tokens +as it goes.

                + +
                class Scanner(object):
                +
                +    def __init__(self, source):
                +        self.source = source
                +        self.start = 0
                +        self.current = 0
                +
                + +

                The start and current variables are character indices in the source string that point to +the current substring being considered as a token.

                + +

                For example in the string "(51.05+2)" while we are tokenizing the number 51.05 +we will have start pointing at the 5, and advance current character by character +until the character is no longer part of a number. Midway through scanning the number +the start and current values might point to 1 and 4 respectively:

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                012345678
                "(""5""1"".""0""5""+""2"")"
                 ^ ^
                +

                From current=4 the scanner peeks ahead and sees that the next character (5) is +a digit, so will continue to advance.

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                012345678
                "(""5""1"".""0""5""+""2"")"
                 ^ ^
                +

                When the scanner peeks ahead and sees the "+" it will create the number +token and emit it. The method that carries out this tokenizing is _number:

                + +
                    def _number(self):
                +        while self._peek().isdigit():
                +            self.advance()
                +
                +        # Look for decimal point
                +        if self._peek() == '.' and self._peek_next().isdigit():
                +            self.advance()
                +            while self._peek().isdigit():
                +                self.advance()
                +
                +        return self._make_token(TokenTypes.NUMBER)
                +
                + +

                It relies on a few helpers to look ahead at the upcoming characters:

                + +
                    def _peek(self):
                +        if self._is_at_end():
                +            return '\0'
                +        return self.source[self.current]
                +
                +    def _peek_next(self):
                +        if self._is_at_end():
                +            return '\0'
                +        return self.source[self.current+1]
                +
                +    def _is_at_end(self):
                +        return len(self.source) == self.current
                +
                + +

                If the character at current is still part of the number we want to call advance +to move on by one character.

                + +
                    def advance(self):
                +        self.current += 1
                +        return self.source[self.current - 1]
                +
                + +

                Once the isdigit() check fails in _number() we call _make_token() to emit the +token with the NUMBER type.

                + +
                    def _make_token(self, token_type):
                +        return Token(
                +            start=self.start,
                +            length=(self.current - self.start),
                +            token_type=token_type
                +        )
                +
                + +

                Note again that the token is linked to an index address in the source, rather than +including the string value.

                + +

                Our scanner is pull based, a token will be requested via scan_token. First we skip +past whitespace and depending on the characters emit the correct token:

                + +
                    def scan_token(self):
                +        # skip any whitespace
                +        while True:
                +            char = self._peek()
                +            if char in ' \r\t\n':
                +                self.advance()
                +            break
                +
                +        self.start = self.current
                +
                +        if self._is_at_end():
                +            return self._make_token(TokenTypes.EOF)
                +
                +        char = self.advance()
                +
                +        if char.isdigit():
                +            return self._number()
                +
                +        if char == '(':
                +            return self._make_token(TokenTypes.LEFT_PAREN)
                +        if char == ')':
                +            return self._make_token(TokenTypes.RIGHT_PAREN)
                +        if char == '-':
                +            return self._make_token(TokenTypes.MINUS)
                +        if char == '+':
                +            return self._make_token(TokenTypes.PLUS)
                +        if char == '/':
                +            return self._make_token(TokenTypes.SLASH)
                +        if char == '*':
                +            return self._make_token(TokenTypes.STAR)
                +
                +        return ErrorToken("Unexpected character", self.current)
                +
                + +

                + +

                If this was a real programming language we were scanning, this would be the point where we +add support for different types of literals and any language identifiers/reserved words.

                + +

                At some point we will need to parse the literal value for our numbers, but we leave that +job for some later component, for now we'll just add a get_token_string helper. To make +sure that RPython is happy to index arbitrary slices of source we add range assertions:

                + +
                    def get_token_string(self, token):
                +        if isinstance(token, ErrorToken):
                +            return token.message
                +        else:
                +            end_loc = token.start + token.length
                +            assert end_loc < len(self.source)
                +            assert end_loc > 0
                +            return self.source[token.start:end_loc]
                +
                + +

                A simple entry point can be used to test our scanner with a hard coded +source string:

                + +

                targetscanner1.py

                + +
                from scanner import Scanner, TokenTypes, TokenTypeToName
                +
                +
                +def entry_point(argv):
                +
                +    source = "(   1   + 2.0 )"
                +
                +    scanner = Scanner(source)
                +    t = scanner.scan_token()
                +    while t.type != TokenTypes.EOF and t.type != TokenTypes.ERROR:
                +        print TokenTypeToName[t.type],
                +        if t.type == TokenTypes.NUMBER:
                +            print "(%s)" % scanner.get_token_string(t),
                +        print
                +        t = scanner.scan_token()
                +    return 0
                +
                + +

                RPython didn't complain, and lo it works:

                + +
                $ ./scanner1 
                +LEFT_PAREN
                +NUMBER (1)
                +PLUS
                +NUMBER (2.0)
                +RIGHT_PAREN
                +
                + +

                Let's connect our REPL to the scanner.

                + +

                targetscanner2.py

                + +
                from rpython.rlib import rfile
                +from scanner import Scanner, TokenTypes, TokenTypeToName
                +
                +LINE_BUFFER_LENGTH = 1024
                +
                +
                +def repl(stdin, stdout):
                +    while True:
                +        stdout.write("> ")
                +        source = stdin.readline(LINE_BUFFER_LENGTH)
                +
                +        scanner = Scanner(source)
                +        t = scanner.scan_token()
                +        while t.type != TokenTypes.EOF and t.type != TokenTypes.ERROR:
                +            print TokenTypeToName[t.type],
                +            if t.type == TokenTypes.NUMBER:
                +                print "(%s)" % scanner.get_token_string(t),
                +            print
                +            t = scanner.scan_token()
                +
                +
                +def entry_point(argv):
                +    stdin, stdout, stderr = rfile.create_stdio()
                +    try:
                +        repl(stdin, stdout)
                +    except:
                +        pass
                +    return 0
                +
                + +

                With our REPL hooked up we can now scan tokens from arbitrary input:

                + +
                $ ./scanner2
                +> (3 *4) - -3
                +LEFT_PAREN
                +NUMBER (3)
                +STAR
                +NUMBER (4)
                +RIGHT_PAREN
                +MINUS
                +MINUS
                +NUMBER (3)
                +> ^C
                +
                + +

                Compiling expressions

                + +

                References

                + +
                  +
                • https://www.craftinginterpreters.com/compiling-expressions.html
                • + +
                • https://effbot.org/zone/simple-top-down-parsing.htm
                • +
                +

                The final piece is to turn this sequence of tokens into our low level +bytecode instructions for the virtual machine to execute. Buckle up, +we are about to write us a compiler.

                + +

                Our compiler will take a single pass over the tokens using +Vaughan Pratt’s +parsing technique, and output a chunk of bytecode – if we do it +right it will be compatible with our existing virtual machine.

                + +

                Remember the bytecode we defined above is really simple – by relying +on our stack we can transform a nested expression into a sequence of +our bytecode operations.

                + +

                To make this more concrete let's go through by hand translating an +expression into bytecode.

                + +

                Our source expression:

                + +
                (3 + 2) - (7 * 2)
                +
                + +

                If we were to make an abstract syntax tree we'd get something +like this:

                + +

                + +

                Now if we start at the first sub expression (3+2) we can clearly +note from the first open bracket that we must see a close bracket, +and that the expression inside that bracket must be valid on its +own. Not only that but regardless of the inside we know that the whole +expression still has to be valid. Let's focus on this first bracketed +expression, let our attention recurse into it so to speak.

                + +

                This gives us a much easier problem – we just want to get our virtual +machine to compute 3 + 2. In this bytecode dialect we would load the +two constants, and then add them with OP_ADD like so:

                + +
                OP_CONSTANT  (00) '3.000000'
                +OP_CONSTANT  (01) '2.000000'
                +OP_ADD
                +
                + +

                The effect of our vm executing these three instructions is that sitting +pretty at the top of the stack is the result of the addition. Winning.

                + +

                Jumping back out from our bracketed expression, our next token is MINUS, +at this point we have a fair idea that it must be used in an infix position. +In fact whatever token followed the bracketed expression it must be a +valid infix operator, if not the expression is over or had a syntax error.

                + +

                Assuming the best from our user (naive), we handle MINUS the same way +we handled the first PLUS. We've already got the first operand on the +stack, now we compile the right operand and then write out the bytecode +for OP_SUBTRACT.

                + +

                The right operand is another simple three instructions:

                + +
                OP_CONSTANT  (02) '7.000000'
                +OP_CONSTANT  (03) '2.000000'
                +OP_MULTIPLY
                +
                + +

                Then we finish our top level binary expression and write a OP_RETURN to +return the value at the top of the stack as the execution's result. Our +final hand compiled program is:

                + +
                OP_CONSTANT  (00) '3.000000'
                +OP_CONSTANT  (01) '2.000000'
                +OP_ADD
                +OP_CONSTANT  (02) '7.000000'
                +OP_CONSTANT  (03) '2.000000'
                +OP_MULTIPLY
                +OP_SUBTRACT
                +OP_RETURN
                +
                + +

                Ok that wasn't so hard was it? Let's try make our code do that.

                + +

                We define a parser object which will keep track of where we are, and +whether things have all gone horribly wrong:

                + +
                class Parser(object):
                +    def __init__(self):
                +        self.had_error = False
                +        self.panic_mode = False
                +        self.current = None
                +        self.previous = None
                +
                + +

                The compiler will also be a class, we'll need one of our Scanner instances +to pull tokens from, and since the output is a bytecode Chunk let's go ahead +and make one of those in our compiler initializer:

                + +
                class Compiler(object):
                +
                +    def __init__(self, source):
                +        self.parser = Parser()
                +        self.scanner = Scanner(source)
                +        self.chunk = Chunk()
                +
                + +

                Since we have this (empty) chunk of bytecode we will make a helper method +to add individual bytes. Every instruction will pass from our compiler into +an executable program through this simple method.

                + +
                    def emit_byte(self, byte):
                +        self.current_chunk().write_chunk(byte)
                +
                + +

                To quote from Bob Nystrom on the Pratt parsing technique:

                + +
                +

                the implementation is a deceptively-simple handful of deeply intertwined code

                +
                + +

                I don't actually think I can do justice to this section. Instead I suggest +reading his treatment in +Pratt Parsers: Expression Parsing Made Easy +which explains the magic behind the parsing component. Our only major difference is +instead of creating an AST we are going to directly emit bytecode for our VM.

                + +

                Now that I've absolved myself from taking responsibility in explaining this somewhat +tricky concept, I'll discuss some of the code from +compiler.py, and walk through what happens +for a particular rule.

                + +

                I'll jump straight to the juicy bit: the table of parse rules. We define a ParseRule +for each token, and each rule comprises:

                + +
                  +
                • an optional handler for when the token is used as a prefix (e.g. the minus in (-2)),
                • + +
                • an optional handler for when the token is used infix (e.g. the slash in 2/47)
                • + +
                • a precedence value (a number that determines what is of higher precedence)
                • +
                +
                rules = [
                +    ParseRule(None,              None,            Precedence.NONE),   # ERROR
                +    ParseRule(None,              None,            Precedence.NONE),   # EOF
                +    ParseRule(Compiler.grouping, None,            Precedence.CALL),   # LEFT_PAREN
                +    ParseRule(None,              None,            Precedence.NONE),   # RIGHT_PAREN
                +    ParseRule(Compiler.unary,    Compiler.binary, Precedence.TERM),   # MINUS
                +    ParseRule(None,              Compiler.binary, Precedence.TERM),   # PLUS
                +    ParseRule(None,              Compiler.binary, Precedence.FACTOR), # SLASH
                +    ParseRule(None,              Compiler.binary, Precedence.FACTOR), # STAR
                +    ParseRule(Compiler.number,   None,            Precedence.NONE),   # NUMBER
                +]
                +
                + +

                These rules really are the magic of our compiler. When we get to a particular +token such as MINUS we see if it is an infix operator and if so we've gone and +got its first operand ready. At all times we rely on the relative precedence; consuming +everything with higher precedence than the operator we are currently evaluating.

                + +

                In the expression:

                + +
                2 + 3 * 4
                +
                + +

                The * has higher precedence than the +, so 3 * 4 will be parsed together +as the second operand to the first infix operator (the +) which follows +the BEDMAS +order of operations I was taught at high school.

                + +

                To encode these precedence values we make another Python object moonlighting +as an enum:

                + +
                class Precedence(object):
                +    NONE = 0
                +    DEFAULT = 1
                +    TERM = 2        # + -
                +    FACTOR = 3      # * /
                +    UNARY = 4       # ! - +
                +    CALL = 5        # ()
                +    PRIMARY = 6
                +
                + +

                What happens in our compiler when turning -2.0 into bytecode? Assume we've just +pulled the token MINUS from the scanner. Every expression has to start with some +type of prefix – whether that is:

                + +
                  +
                • a bracket group (,
                • + +
                • a number 2,
                • + +
                • or a prefix unary operator -.
                • +
                +

                Knowing that, our compiler assumes there is a prefix handler in the rule table – in +this case it points us at the unary handler.

                + +
                    def parse_precedence(self, precedence):
                +        # parses any expression of a given precedence level or higher
                +        self.advance()
                +        prefix_rule = self._get_rule(self.parser.previous.type).prefix
                +        prefix_rule(self)
                +
                + +

                + +

                unary is called:

                + +
                    def unary(self):
                +        op_type = self.parser.previous.type
                +        # Compile the operand
                +        self.parse_precedence(Precedence.UNARY)
                +        # Emit the operator instruction
                +        if op_type == TokenTypes.MINUS:
                +            self.emit_byte(OpCode.OP_NEGATE)
                +
                + +

                Here – before writing the OP_NEGATE opcode we recurse back into parse_precedence +to ensure that whatever follows the MINUS token is compiled – provided it has +higher precedence than unary – e.g. a bracketed group. +Crucially at run time this recursive call will ensure that the result is left +on top of our stack. Armed with this knowledge, the unary method just +has to emit a single byte with the OP_NEGATE opcode.

                + +

                Test compilation

                + +

                Now we can test our compiler by outputting disassembled bytecode +of our user entered expressions. Create a new entry_point +targetcompiler:

                + +
                from rpython.rlib import rfile
                +from compiler import Compiler
                +
                +LINE_BUFFER_LENGTH = 1024
                +
                +
                +def entry_point(argv):
                +    stdin, stdout, stderr = rfile.create_stdio()
                +
                +    try:
                +        while True:
                +            stdout.write("> ")
                +            source = stdin.readline(LINE_BUFFER_LENGTH)
                +            compiler = Compiler(source, debugging=True)
                +            compiler.compile()
                +    except:
                +        pass
                +    return 0
                +
                + +

                Translate it and test it out:

                + +
                $ ./compiler1 
                +> (2/4 + 1/2)
                +== code ==
                +
                +0000 OP_CONSTANT  (00) '2.000000'
                +0002 OP_CONSTANT  (01) '4.000000'
                +0004 OP_DIVIDE    
                +0005 OP_CONSTANT  (02) '1.000000'
                +0007 OP_CONSTANT  (00) '2.000000'
                +0009 OP_DIVIDE    
                +0010 OP_ADD       
                +0011 OP_RETURN
                +
                + +

                Now if you've made it this far you'll be eager to finally connect everything +together by executing this bytecode with the virtual machine.

                + +

                End to end

                + +

                All the pieces slot together rather easily at this point, create a new +file targetcalc.py and define our +entry point:

                + +
                from rpython.rlib import rfile
                +from compiler import Compiler
                +from vm import VM
                +
                +LINE_BUFFER_LENGTH = 4096
                +
                +
                +def entry_point(argv):
                +    stdin, stdout, stderr = rfile.create_stdio()
                +    vm = VM()
                +    try:
                +        while True:
                +            stdout.write("> ")
                +            source = stdin.readline(LINE_BUFFER_LENGTH)
                +            if source:
                +                compiler = Compiler(source, debugging=False)
                +                compiler.compile()
                +                vm.interpret_chunk(compiler.chunk)
                +    except:
                +        pass
                +    return 0
                +
                +
                +def target(driver, *args):
                +    driver.exe_name = "calc"
                +    return entry_point, None
                +
                + +

                + +

                Let's try catch it out with a double negative:

                + +
                $ ./calc 
                +> 2--3
                +== VM TRACE ==
                +          []
                +0000 OP_CONSTANT  (00) '2.000000'
                +          [ 2.000000 ]
                +0002 OP_CONSTANT  (01) '3.000000'
                +          [ 2.000000 ] [ 3.000000 ]
                +0004 OP_NEGATE    
                +          [ 2.000000 ] [ -3.000000 ]
                +0005 OP_SUBTRACT  
                +          [ 5.000000 ]
                +0006 OP_RETURN    
                +5.000000
                +
                + +

                Ok well let's evaluate the first 50 terms of the +Nilakantha Series:

                + +
                $ ./calc
                +> 3 + 4 * ((1/(2 * 3 * 4)) + (1/(4 * 5 * 6)) - (1/(6 * 7 * 8)) + (1/(8 * 9 * 10)) - (1/(10 * 11 * 12)) + (1/(12 * 13 * 14)) - (1/(14 * 15 * 16)) + (1/(16 * 17 * 18)) - (1/(18 * 19 * 20)) + (1/(20 * 21 * 22)) - (1/(22 * 23 * 24)) + (1/(24 * 25 * 26)) - (1/(26 * 27 * 28)) + (1/(28 * 29 * 30)) - (1/(30 * 31 * 32)) + (1/(32 * 33 * 34)) - (1/(34 * 35 * 36)) + (1/(36 * 37 * 38)) - (1/(38 * 39 * 40)) + (1/(40 * 41 * 42)) - (1/(42 * 43 * 44)) + (1/(44 * 45 * 46)) - (1/(46 * 47 * 48)) + (1/(48 * 49 * 50)) - (1/(50 * 51 * 52)) + (1/(52 * 53 * 54)) - (1/(54 * 55 * 56)) + (1/(56 * 57 * 58)) - (1/(58 * 59 * 60)) + (1/(60 * 61 * 62)) - (1/(62 * 63 * 64)) + (1/(64 * 65 * 66)) - (1/(66 * 67 * 68)) + (1/(68 * 69 * 70)) - (1/(70 * 71 * 72)) + (1/(72 * 73 * 74)) - (1/(74 * 75 * 76)) + (1/(76 * 77 * 78)) - (1/(78 * 79 * 80)) + (1/(80 * 81 * 82)) - (1/(82 * 83 * 84)) + (1/(84 * 85 * 86)) - (1/(86 * 87 * 88)) + (1/(88 * 89 * 90)) - (1/(90 * 91 * 92)) + (1/(92 * 93 * 94)) - (1/(94 * 95 * 96)) + (1/(96 * 97 * 98)) - (1/(98 * 99 * 100)) + (1/(100 * 101 * 102)))
                +
                +== VM TRACE ==
                +          []
                +0000 OP_CONSTANT  (00) '3.000000'
                +          [ 3.000000 ]
                +0002 OP_CONSTANT  (01) '4.000000'
                +...SNIP...
                +0598 OP_CONSTANT  (101) '102.000000'
                +          [ 3.000000 ] [ 4.000000 ] [ 0.047935 ] [ 1.000000 ] [ 10100.000000 ] [ 102.000000 ]
                +0600 OP_MULTIPLY  
                +          [ 3.000000 ] [ 4.000000 ] [ 0.047935 ] [ 1.000000 ] [ 1030200.000000 ]
                +0601 OP_DIVIDE    
                +          [ 3.000000 ] [ 4.000000 ] [ 0.047935 ] [ 0.000001 ]
                +0602 OP_ADD       
                +          [ 3.000000 ] [ 4.000000 ] [ 0.047936 ]
                +0603 OP_MULTIPLY  
                +          [ 3.000000 ] [ 0.191743 ]
                +0604 OP_ADD       
                +          [ 3.191743 ]
                +0605 OP_RETURN    
                +3.191743
                +
                + +

                We just executed 605 virtual machine instructions to compute pi to 1dp!

                + +

                This brings us to the end of this tutorial. To recap we've walked through the whole +compilation process: from the user providing an expression string on the REPL, scanning +the source string into tokens, parsing the tokens while accounting for relative +precedence via a Pratt parser, generating bytecode, and finally executing the bytecode +on our own VM. RPython translated what we wrote into C and compiled it, meaning +our resulting calc REPL is really fast.

                + +
                +

                “The world is a thing of utter inordinate complexity and richness and strangeness that is absolutely awesome.”

                + +

                ― Douglas Adams

                +
                + +

                Many thanks to Bob Nystrom for writing the book that inspired this post, and thanks to +Carl Friedrich and Matt Halverson for reviewing.

                + +

                ― Brian (@thorneynzb)

                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-4.html b/blog/index-4.html new file mode 100644 index 000000000..bc979b6aa --- /dev/null +++ b/blog/index-4.html @@ -0,0 +1,910 @@ + + + + + + +PyPy (old posts, page 4) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                Wrapping pyrepl in the readline API

                + +
                +

                If you translate a pypy-c with --allworkingmodules and start it, you will probably not notice anything strange about its prompt - except when typing multiline statements. You can move the cursor up and continue editing previous lines. And the history is multiline-statements-aware as well. Great experience! Ah, and completion using tab is nice too.

                + +

                Truth be told, there is nothing new here: it was all done by Michael Hudson's pyrepl many years ago. We had already included pyrepl in PyPy some time ago. What is new is a pure Python readline.py which exposes the most important parts of the API of the standard readline module by wrapping pyrepl under the hood, without needing the GNU readline library at all. The PyPy prompt is based on this, benefitting automagically from pyrepl's multiline editing capabilities, with minor tweaks so that the prompt looks much more like CPython's than a regular pyrepl prompt does.

                + +

                You can also try and use this multiline prompt with CPython: check out pyrepl at https://codespeak.net/svn/pyrepl/trunk/pyrepl and run the new pythoni1 script.

                +
                +

                Other April's Fools Ideas

                + +
                +

                While discussing what to post as an April Fool's joke yesterday, we had a +couple of other ideas, listed below. Most of them were rejected because they are +too incredible, others because they are too close to our wish list.

                +
                  +
                • quantum computer backend
                • +
                • Perl6 interpreter in RPython
                • +
                • Ruby backend to allow running "python on rails"
                • +
                • mandatory static typing at app-level, because it's the only way to increase +performance
                • +
                • rewrite PyPy in Haskell, because we discovered that dynamic typing is just +not suitable for a project of this size
                • +
                • a C front-end, so that we can interpret the C source of Python C extensions +and JIT it. This would work by writing an interpreter for LLVM bytecode in +RPython.
                • +
                • an elisp backend
                • +
                • a TeX backend (use PyPy for your advanced typesetting needs)
                • +
                • an SQL JIT backend, pushing remote procedures into the DB engine
                • +
                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2008-04-02 16:20: +
                +
                +

                PoR - Python on Rails would be the funniest one...

                +
                +
                +
                +
                + + mernen wrote on 2008-04-02 18:07: +
                +
                +

                Oh, the C interpreter would be so awesome. The quantum computer backend, in the hands of a good writer, could become an excellent joke too, no matter how obviously fake. I'd love to see the ones about static typing too.

                +
                +
                +
                +
                + + Unknown wrote on 2008-04-11 23:14: +
                +
                +

                TeX backend ...
                You would be amazed a just how useful that stunt would be (grinBigly)
                of course that's sort of what PS is all about, eh.
                Did something similar (smaller scale) several decades ago. Great fun and extremely useful.

                Thanks for the grins.

                +
                +
                +
                +
                + + mernen wrote on 2008-06-05 21:20: +
                +
                +

                Whoah, anonymous, for a second or two this spam site almost looked convincing.

                +
                +
                +
                + +

                Trying to get PyPy to run on Python 3.0

                + +
                +

                As you surely know, Python 3.0 is coming; recently, they released +Python 3.0 alpha 3, and the final version is expected around +September.

                +

                As suggested by the migration guide (in the PEP 3000), we started by applying +2to3 to our standard interpreter, which is written in RPython (though +we should call it RPython 2.4 now, as opposed to RPython 3.0 -- see +below).

                +

                Converting was not seamless, but most of the resulting bugs were due to the +new dict views, str/unicode changes and the missing "reduce" built-in. +After forking and refactoring both our interpreter and the 2to3 script, +the Python interpreter runs on Python 3.0 alpha 3!

                +

                Next step was to run 2to3 over the whole translation toolchain, +i.e. the part of PyPy which takes care of analyzing the interpreter in +order to produce efficient executables; after the good results we got +with the standard interpreter, we were confident that it would have +been relatively easy to run 2to3 over it: unfortunately, it was not +:-(.

                +

                After letting 2to3 run for days and days uninterrupted, we decided to +kill it: we assume that the toolchain is simply too complex to be +converted in a reasonable amount of time.

                +

                So, we needed to think of something else; THE great idea we had was to +turn everything upside-down: if we can't port PyPy to Py3k, we can +always port Py3k to PyPy!

                +

                Under the hood, the 2to3 conversion tool operates as a graph +transformer: it takes the graph of your program (in the form of a Python +2.x source file) and returns a transformed graph of the same program +(in the form of a Python 3.0 source file). Since the entire translation +toolchain of PyPy is based on graph transformations, we could reuse it +to modify the behaviour of the 2to3 tool. We wrote a general +graph-inverter algorithm which, as the name suggests, takes a graph +transformation and builds the inverse transformation; then, we applied +the graph inverter to 2to3, getting something that we called 3to2: it +is important to underline that 3to2 was built by automatically +analysing 2to3 and reversing its operation with only the help of a few +manual hints. For this reason and because we are not keeping generated +files under version control, we do not need to maintain this new tool in +the Subversion repository.

                +

                Once we built 3to2, it was relatively easy to pipe its result to our +interpreter, getting something that can run Python 3.0 programs.

                +

                Performance-wise, this approach has the problem of being slower at +import time, because it needs to run (automatically) 3to2 every time +the source is modified; in the future, we plan to apply our JIT +techniques also to this part of the interpreter, trying to mitigate the +slowdown until it is not noticeable anymore to the final user.

                +

                In the next weeks, we will work on the transformation (and probably publish +the technique as a research paper, with a title like "Automatic Program +Reversion on Intermediate Languages").

                +

                UPDATE: In case anybody didn't guess or didn't spot the acronym: The above +was an April Fool's joke. Nearly nothing of it is true.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-04-01 15:33: +
                +
                +

                "After letting 2to3 run for days and days uninterrupted, we decided to kill it: we assume that the toolchain is simply too complex to be converted in a reasonable amount of time."

                That was silly. Twisted got converted. I suppose that not even a meta-programing-languages-framework can be bigger thaan Twisted. Better luck next year.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-04-01 16:02: +
                +
                +

                I have a working implementation of the Parrot virtual machine in Py3K. After running it through your converter (my hosting service only runs 2.1!), I find that my implementation now only supports Perl 5 and Snobol. What gives?

                +
                +
                +
                +
                + + fumanchu wrote on 2008-04-01 16:54: +
                +
                +

                Nice acronym, that.

                +
                +
                +
                +
                + + Paddy3118 wrote on 2008-04-01 19:37: +
                +
                +

                Nice one ;-)

                And the best I've read all day!

                - Paddy.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-04-17 20:54: +
                +
                +

                Looks like hosting python 2.5 scripts on a Py3k interpreter might become a USP for PyPY ;)

                +
                +
                +
                + +

                Py-Lib 0.9.1 released

                + +
                +

                The Py-Lib 0.9.1 release is out! The Py-Lib is a very important support +library that PyPy uses for a lot of things – most importantly it contains +py.test, which PyPy uses for testing.

                +

                This is mostly a bugfix release, with a couple of new features sneaked in. +Most important changes:

                +
                  +
                • some new functionality (authentication, export, locking) in py.path's +Subversion APIs
                • +
                • numerous small fixes in py.test's rsession (experimental pluggable session) +and generative test features
                • +
                • some fixes in the py.test core
                • +
                +

                Download/Install: https://codespeak.net/py/0.9.1/download.html

                +

                Documentation/API: https://codespeak.net/py/0.9.1/index.html

                +

                UPDATE: the py-lib is now easy-installable with:

                +
                +easy_install py
                +
                +
                +

                PyPy Summer of Code Participation

                + +
                +

                As in the last years, PyPy will again participate in Google's Summer of Code +program under the umbrella of the Python Software Foundation. Unfortunately we +were a bit disorganized this year, so that our project ideas are only put up +now. The list of project ideas of PyPy can be found here.

                +

                Any interested student should mail to our mailing list or just come to the +#pypy channel on irc.freenode.net to discuss things.

                +
                +

                ctypes configuration tool

                + +
                +

                As a part of implementing ctypes, we decided to make coding using ctypes better on its own (regardless of which Python interpreter you use). The concrete problem we're trying to solve is to make ctypes code more platform-independent than it is. Say you want to create a ctypes type for size_t: ctypes itself provides no mechanism for doing that, so you need to use a concrete integer type (c_int, c_long, c_short etc.). Your code either becomes platform dependent if you pick one of them or is littered with conditionals for all sorts of platforms. We created a small library, called ctypes_configure (which is actually a variation of something we use somewhere in the PyPy source tree), which tries to solve some platform dependencies by compiling and running small chunks of C code through a C compiler. It's sort of like configure in the Linux world, except for Python using ctypes. +

                +To install the library, you can just type easy_install ctypes_configure. The code is in an svn repository on codespeak and there is even some documentation and sample code. Also, even though the code lives in the pypy repository, it depends only on pylib, not on the whole of pypy. +
                +The library is in its early infancy (but we think it is already rather useful). In the future we could add extra features: for example, it might be possible to check whether the argtypes that are attached to the external functions are consistent with what is in the C headers, so that the following code wouldn't segfault but give a nice error +

                +
                +libc = ctypes.CDLL("libc.so")
                +time = libc.time
                +time.argtypes = [ctypes.c_double, ctypes.c_double]
                +time(0.0, 0.0)
                +
                + +Also, we plan to add a way to install a package that uses ctypes_configure in such a way that the installed library doesn't need to call the C compiler any more later. +
                +
                +
                +
                + + Anonymous wrote on 2008-03-18 09:52: +
                +
                +

                Cool - it even works on Windows!.

                BTW: The content-type of the documentation seems wrong, firefox displays the html instead of rendering it.

                +
                +
                +
                +
                + + PJE wrote on 2008-03-18 16:51: +
                +
                +

                Since easy_install can compile C code, why not just compile an extension module with the configuration? Then, other modules can just import the pre-built configuration.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-03-18 17:50: +
                +
                +

                Sure. It's an obvious extension. I just got this from pypy source code and released separately. If it'll happen to be useful, I'll add more features.

                +
                +
                +
                +
                + + Unknown wrote on 2008-05-11 02:43: +
                +
                +

                Re: PJE

                I'm no expert (and I'm half asleep), but your approach sounds like it might run afoul of changes introduced by upgrading something without regenerating the pre-built configuration.

                +
                +
                +
                + +

                Bittorrent on PyPy

                + +
                +

                Hi all,

                + +

                Bittorrent now runs on PyPy! I tried the no-GUI BitTornado version (btdownloadheadless.py). It behaves correctly and I fixed the last few obvious places which made noticeable pauses. (However we know that there are I/O performance issues left: we make too many internal copies of the data, e.g. in a file.read() or os.read().)

                + +

                We are interested in people trying out other real-world applications that, like the GUI-less Bittorrent, don't have many external dependencies to C extension modules. Please report all the issues to us!

                + +

                The current magic command line for creating a pypy-c executable with as many of CPython's modules as possible is:

                + +
                +  cd pypy/translator/goal
                +  ./translate.py --thread targetpypystandalone.py --allworkingmodules --withmod-_rawffi --faassen
                +
                + +

                (This gives you a thread-aware pypy-c, which requires the Boehm gc library. The _rawffi module gives you ctypes support but is only tested for Linux at the moment.)

                +
                +
                +
                +
                + + Panos Laganakos wrote on 2008-03-18 12:23: +
                +
                +

                Pretty kewl stuff from PyPy :)

                +
                +
                +
                +
                + + Orangeman wrote on 2008-04-07 13:30: +
                +
                +

                I have a guide on most popular P2P technologies at https://sriraminhell.blogspot.com/2007/08/p2p-brief-introduction.html and on Bit Torrent at https://sriraminhell.blogspot.com/2007/08/peer-to-peer-ii-bit-torrent.html . Cheers!!

                +
                +
                +
                +
                + + Unknown wrote on 2008-04-19 11:53: +
                +
                +

                What rev number did you build on? I tried with the latest source from svn, but got an error almost immediately. No module named py.

                +
                +
                +
                + +

                As fast as CPython (for carefully taken benchmarks)

                + +
                +

                Good news everyone. A tuned PyPy compiled to C is nowadays as fast as CPython on the richards benchmark and slightly faster on the gcbench benchmark. +
                IMPORTANT: These are very carefully taken benchmarks where we expect pypy to be fast! PyPy is still quite a bit slower than CPython on other benchmarks and on real-world applications (but we're working on it). The point of this post is just that for the first time (not counting JIT experiments) we are faster than CPython on *one* example :-) +
                +The exact times as measured on my notebook (which is a Core Duo machine) are here: +
                +Compiled pypy with options: +
                +./translate.py --gcrootfinder=asmgcc --gc=generation targetpypystandalone.py --allworkingmodules --withmod-_rawffi --faassen + +(allworkingmodules and withmod-_rawffi are very likely irrelevant to those benchmarks) +
                +CPython version 2.5.1, release. +

                +
                  +
                • richards 800ms pypy-c vs 809ms cpython (1% difference)
                • +
                • gcbench 53700ms pypy-c vs 60215ms cpython (11% difference)
                • +
                +PyPy shines on gcbench, which is mostly just about allocating and freeing many objects. Our gc is simply better than refcounting, even though we've got shortcomings in other places. +
                + +About richards, there is a catch. We use a method cache optimization, and have an optimization which helps to avoid creating bound methods each time a method is called. This speeds up the benchmark by about 20%. Although a method cache was even implemented for CPython, it didn't make its way to the core because some C modules directly modify the dictionary of new-style classes. In PyPy, the greater level of abstraction means that this operation is just illegal. +
                +
                +
                +
                + + Anonymous wrote on 2008-03-05 13:08: +
                +
                +

                This is GREAT news!

                Keep up the good work guys, i will be closely following you all!

                Cheers!

                +
                +
                +
                +
                + + Anonymous wrote on 2008-03-06 22:40: +
                +
                +

                I have been watching PyPy for some time now and this news along with the ctypes news has me excited.

                +
                +
                +
                +
                + + Unknown wrote on 2008-03-07 10:06: +
                +
                +

                Great work.

                It is wonderful to see PyPy making progress towards the overall goal!

                +
                +
                +
                +
                + + Unknown wrote on 2008-03-12 18:02: +
                +
                +

                Awsome. :)

                +
                +
                +
                +
                + + Anonymous wrote on 2008-03-14 16:23: +
                +
                +

                Will PyPy be released before Duke Nukem gets released? Pray please tell and enlighten!
                Cursing all you skilled hackers for not doing an amd64 port of Psycho and pursuing something that will be irrelevant when it materializes.
                Have fun any way.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-03-14 21:54: +
                +
                +

                Yeah, it will be released usable. For real. We're getting into having nice and usable python interpreter, ctypes is a good example of feature that is ready to use. How fast it'll be? I don't know, hopefully faster than psyco.

                +
                +
                +
                +
                + + Unknown wrote on 2008-03-17 23:02: +
                +
                +

                "Will PyPy be released before Duke Nukem gets released?"

                I doubt it: Duke Nukem was released in 1991...

                https://en.wikipedia.org/wiki/Duke_Nukem_%28computer_game%29

                If you want to make a wisecrack, at least try to deliver it correctly.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-03-20 09:34: +
                +
                +

                And pypy 1.0 was released one year ago ... https://aspn.activestate.com/ASPN/Mail/Message/python-announce/3461501

                +
                +
                +
                + +

                Running Pyglet on PyPy

                + +
                +

                As part of our efforts of making PyPy's Python interpreter usable we put quite some effort into interfacing with external libraries. We were able, in quite a short amount of time (I think beginning really from Leysin sprint, or slightly earlier) to provide a prototype of the ctypes library. It is written in completely normal Python, at applevel, based on a very thin wrapper around the libffi library. This makes development a lot easier, but it makes the resulting ctypes implementation rather slow. The implementation is not complete yet and it will still need quite some effort to make it feature-complete (ctypes has lots of details and special cases and +do-what-I-mean magic). Yet another point will be to make it faster, but that's for much later. +
                +The implementation is good enough to run those parts of Pyglet that don't depend on PIL (which PyPy doesn't have). Here are a few pictures of running Pyglet demos on top of compiled pypy-c. + + + +To compile a version of PyPy that supports ctypes, use this highly sophisticated command line + +
                ./translate.py --gc=generation ./targetpypystandalone.py --allworkingmodules --withmod-_rawffi +
                +Note: this works on linux only right now. +
                +The list of missing small ctypes features is quite extensive, but I consider the current implementation to be usable for most common cases. I would love to hear about libraries written in pure python (using ctypes), to run them on top of PyPy and use them as test cases. If someone knows such a library, please provide a link.

                +
                +
                +
                +
                + + Richard Jones wrote on 2008-02-21 01:17: +
                +
                +

                This is very cool news indeed! The second screenshot seems to show a strange artefact though on the inside of the torus. Is that running the unmodified examples/opengl.py code, or has the example been modified to display a second torus? It should also be noted that pyglet is perfectly usable without PIL (as long as you have libgdk installed under Linux which almost everyone will do).

                +
                +
                +
                +
                + + Richard Jones wrote on 2008-02-21 03:16: +
                +
                +

                ps. it's "pyglet" with a little "p" :)

                +
                +
                +
                +
                + + René Dudfield wrote on 2008-02-21 04:04: +
                +
                +

                Very cool :)

                Here's two more ctypes things... which you probably know about already, but eh :)

                https://pyopengl.sf.net/
                https://www.pygame.org/ctypes/

                cu,

                +
                +
                +
                +
                + + Anonymous wrote on 2008-02-21 08:52: +
                +
                +

                Congratulations from me!

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-02-21 09:07: +
                +
                +

                hi illume,

                yep, we know about those, thank you anyway :-).

                Cheers,

                Carl Friedrich

                +
                +
                +
                +
                + + Anonymous wrote on 2008-02-21 13:53: +
                +
                +

                Very exciting to see my humble 'triangles' pyglet demo being used in the blue colored screenshot. If anyone's interested, the code for that is here: https://tartley.com/?p=264

                I should remember to put an explicit CC license on my whole site.

                +
                +
                +
                +
                + + Justin wrote on 2008-02-29 08:58: +
                +
                +

                https://utidylib.berlios.de/

                needs only ctypes, iirc.

                +
                +
                +
                +
                + + Gerhard Häring wrote on 2008-03-05 16:07: +
                +
                +

                Please try the ctypes-based pysqlite reimplementation at https://hg.ghaering.de/pysqlite3/

                It's meant to become the "sqlite3" module for PyPy.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-03-05 21:15: +
                +
                +

                Hi Gerhard,

                yip, we know about this sqlite implementation, thank you! We are already using it for our tests (and most of its tests already work).

                Cheers,

                Carl Friedrich

                +
                +
                +
                + +

                Python Finalizers Semantics, Part 2: Resurrection

                + +
                +

                Continuing the last blog post about GC semantics in Python.

                +

                Another consequence of reference counting is that resurrection is easy to +detect. A dead object can resurrect itself if its finalizer stores it into a +globally reachable position, like this:

                +
                +class C(object):
                +    def __init__(self, num):
                +        self.num = num
                +    def __del__(self):
                +        global c
                +        if c is None:
                +            c = self
                +c = C(1)
                +while c is not None:
                +    c = None
                +    print "again"
                +
                +

                This is an infinite loop in CPython: Every time c is set to None in the +loop, the __del__ method resets it to the C instance again (note that +this is terribly bad programming style, of course. In case anybody was wondering +:-)). CPython can detect resurrection by checking whether the reference count +after the call to __del__ has gotten bigger.

                +

                There exist even worse examples of perpetual resurrection in particular in +combination with the cycle GC. If you want to see a particularly horrible one, +see this discussion started by Armin Rigo. In the ensuing thread Tim Peters +proposes to follow Java's example and call the finalizer of every object at most +once.

                +

                In PyPy the resurrection problem is slightly more complex, since we have GCs +that run collection from time to time and don't really get to know at which +precise time an object dies. If the GC discovers during a collection that an +object is dead, it will call the finalizer after the collection is finished. If +the object is then dead at the next collection, the GC does not know whether +the object was resurrected by the finalizer and then died in the meantime or +whether it was not resurrected. Therefore it seemed sanest to follow Tim's +solution and to never call the finalizer of an object a second time, which has +many other benefits as well.

                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-40.html b/blog/index-40.html new file mode 100644 index 000000000..0257726c5 --- /dev/null +++ b/blog/index-40.html @@ -0,0 +1,1988 @@ + + + + + + +PyPy (old posts, page 40) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                A new chapter for PyPy

                + +
                +

                PyPy winds down its membership in the Software Freedom Conservancy

                + +

                Conservancy and PyPy's great work together

                + +

                PyPy joined Conservancy in +the second half of 2010, shortly after the release of +PyPy 1.2, the first version to contain a fully functional JIT. In 2013, PyPy +started supporting ARM, bringing its just-in-time speediness to many more devices and began working toward supporting NumPy to help +scientists crunch their numbers faster. Together, PyPy and Conservancy ran successful fundraising drives and facilitated payment +and oversight for contractors and code sprints.

                + +

                Conservancy supported PyPy's impressive growth as it expanded support for +different hardware platforms, greatly improved the performance of C extensions, +and added support for Python 3 as the language itself evolved.

                + +

                The road ahead

                + +

                Conservancy provides a fiscal and organizational home for projects that find the +freedoms and guardrails that come along with a charitable home advantageous for +their community goals. While this framework was a great fit for the early PyPy +community, times change and all good things must come to an end.

                + +

                PyPy will remain a free and open source project, but the community's structure +and organizational underpinnings will be changing and the PyPy community will be +exploring options outside of the charitable realm for its next phase of growth +("charitable" in the legal sense -- PyPy will remain a community project).

                + +

                During the last year PyPy and Conservancy have worked together to properly +utilise the generous donations made by stalwart PyPy enthusiasts over the years +and to wrap up PyPy's remaining charitable obligations. PyPy is grateful for +the Conservancy's help in shepherding the project toward its next chapter.

                + +

                Thank yous

                +

                From Conservancy:

                +

                +
                "We are happy that Conservancy was able to help PyPy bring important software +for the public good during a critical time in its history. We wish the +community well and look forward to seeing it develop and succeed in new ways."
                +
                +
                — Karen Sandler, Conservancy's Executive Director
                +

                +

                From PyPy:

                +

                +
                +

                "PyPy would like to thank Conservancy for their decade long support in +building the community and wishes Conservancy continued success in their +journey promoting, improving, developing and defending free and open source +software."

                +

                — Simon Cross & Carl Friedrich Bolz-Tereick, on behalf of PyPy.

                +
                +

                +
                +
                + +

                About

                + +

                PyPy is a multi-layer python interpreter with a built-in JIT compiler that runs +Python quickly across different computing environments. +Software Freedom Conservancy (Conservancy) is a charity that provides a home +to over forty free and open source software projects.

                +
                +
                +
                +
                + + intgr wrote on 2020-08-12 23:36: +
                +
                +

                This post has lots of words but unfortunately contains almost no information. What impact does this change have on PyPy? What is the new chapter?

                +
                +
                +
                +
                + + Rick Sanchez wrote on 2020-08-13 06:33: +
                +
                +

                What does PyPy do? Why should I use it over other Python compilers?

                +
                +
                +
                +
                + + Anonymous wrote on 2020-08-13 08:38: +
                +
                +

                @intgr the wind-down with the SFC hasn't been smooth and this is the politically-neutral, agreed-by-both-parties post. PyPy remains the same free and open-source project. Essentially we just switched to a different money-handler. We're announcing it in the next blog post.

                +
                +
                +
                +
                + + Florian wrote on 2020-08-19 11:10: +
                +
                +

                As https://bitbucket.org/pypy/pypy/downloads/pypy2.7-v7.3.1-linux32.tar.bz2 is down (due to heptapod move ?)

                Where can we download pypy binaries ?

                + +
                +
                +
                +
                + + Armin Rigo wrote on 2020-08-19 22:44: +
                +
                +

                The page https://pypy.org/download.html contains the updated links.

                +
                +
                +
                + +

                PyPy 7.3.1 released

                + +
                +
                +The PyPy team is proud to release the version 7.3.1 of PyPy, which includes +two different interpreters:
                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.13
                • +
                • PyPy3.6: which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.9.
                • +
                +
                +
                +The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, no APIs have changed since the 7.3.0 release +in December, but read on to find out what is new.

                +Conda Forge now supports PyPy as a Python interpreter. The support right now +is being built out. After this release, many more c-extension-based +packages can be successfully built and uploaded. This is the result of a lot of +hard work and good will on the part of the Conda Forge team. A big shout out +to them for taking this on.

                +We have worked with the Python packaging group to support tooling around +building third party packages for Python, so this release updates the pip and +setuptools installed when executing pypy -mensurepip to pip>=20. This +completes the work done to update the PEP 425 python tag from pp373, +meaning “PyPy 7.3 running python3”, to pp36, meaning “PyPy running Python +3.6” (the format is recommended in the PEP). The tag itself was +changed in 7.3.0, but older pip versions build their own tag without querying +PyPy. This means that wheels built for the previous tag format will not be +discovered by pip from this version, so library authors should update their +PyPy-specific wheels on PyPI.

                +Development of PyPy is transitioning to https://foss.heptapod.net/pypy/pypy. +This move was covered more extensively in the blog post from last month.

                +The CFFI backend has been updated to version 1.14.0. We recommend using CFFI +rather than c-extensions to interact with C, and using cppyy for performant +wrapping of C++ code for Python. The cppyy backend has been enabled +experimentally for win32; try it out and let us know how it works.

                +Enabling cppyy requires a more modern C compiler, so win32 is now built +with MSVC160 (Visual Studio 2019). This is true for PyPy 3.6 as well as for 2.7.

                +We have improved warmup time by up to 20%, performance of io.StringIO to +match if not be faster than CPython, and improved JIT code generation for +generators (and generator expressions in particular) when passing them to +functions like sum and map that consume them. Performance of closures has also been improved in certain situations.

                +As always, this release fixed several issues and bugs raised by the growing +community of PyPy users. We strongly recommend updating. Many of the fixes are +the direct result of end-user bug reports, so please continue reporting issues +as they crop up.
                + +You can find links to download the v7.3.1 releases here:
                + +
                +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.

                +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 13 new contributors, +thanks for pitching in.

                +If you are a Python library maintainer and use c-extensions, please consider +making a cffi / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.
                +

                +

                +What is PyPy? +

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6, and soon 3.7. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                +We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +This PyPy release supports:
                +
                +
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                • 64-bit ARM machines running Linux.
                • +
                +
                +
                +
                +

                +What else is new? +

                +For more information about the 7.3.1 release, see the full changelog.

                +Please update, and continue to help us make PyPy better.

                +Cheers,
                +The PyPy team +
                +


                +The PyPy Team 
                +
                +
                +
                +

                Leysin 2020 Sprint Report

                + +
                +

                At the end of February ten of us gathered in Leysin, Switzerland to work on
                +a variety of topics including HPy, PyPy Python 3.7 support and the PyPy
                +migration to Heptapod.

                +
                + +
                +
                +We had a fun and productive week. The snow was beautiful. There was skiing
                +and lunch at the top of Berneuse, cooking together, some late nights at
                +the pub next door, some even later nights coding, and of course the
                +obligatory cheese fondue outing.

                +There were a few of us participating in a PyPy sprint for the first time
                +and a few familiar faces who had attended many sprints. Many different
                +projects were represented including PyPy, HPy, GraalPython,
                Heptapod, and rust-cpython. The atmosphere was relaxed and welcoming, so if
                +you're thinking of attending the next one -- please do!

                +Topics worked on:

                +HPy

                +HPy is a new project to design and implement a better API for extending
                +Python in C. If you're unfamiliar with it you can read more about it at
                HPy.

                +A lot of attention was devoted to the Big HPy Design Discussion which
                +took up two full mornings. So much was decided that this will likely
                +get its own detailed write-up, but bigger topics included:
                  +
                • the HPy GetAttr, SetAttr, GetItem and SetItem methods,
                • +
                • HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions
                  +that pass void* pointers to callbacks,
                • +
                • avoiding having va_args as part of the ABI,
                • +
                • exception handling,
                • +
                • support for creating custom types.
                • +
                +Quite a few things got worked on too:
                  +
                • implemented support for writing methods that take keyword arguments with
                  +HPy_METH_KEYWORDS,
                • +
                • implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,
                • +
                • started implementing support for adding custom types,
                • +
                • started implementing dumping JSON objects in ultrajson-hpy,
                • +
                • refactored the PyPy GIL to improve the interaction between HPy and
                  +PyPy's cpyext,
                • +
                • experimented with adding HPy support to rust-cpython.
                • +
                +And there was some discussion of the next steps of the HPy initiative
                +including writing documentation, setting up websites and funding, and
                +possibly organising another HPy gathering later in the year.

                +PyPy

                +
                  +
                • Georges gave a presentation on the Heptapod topic and branch workflows
                  +and showed everyone how to use hg-evolve.
                • +
                • Work was done on improving the PyPy CI buildbot post the move to
                  +heptapod, including a light-weight pre-merge CI and restricting
                  +when the full CI is run to only branch commits.
                • +
                • A lot of work was done improving the -D tests.
                • +
                +

                +Miscellaneous

                +
                  +
                • Armin demoed VRSketch and NaN Industries in VR, including an implementation
                  +of the Game of Life within NaN Industries!
                • +
                • Skiing!
                • +
                +

                +Aftermath

                +Immediately after the sprint large parts of Europe and the world were
                +hit by the COVID-19 epidemic. It was good to spend time together before
                +travelling ceased to be a sensible idea and many gatherings were cancelled.

                +Keep safe out there everyone.

                +The HPy & PyPy Team & Friends

                In joke for those who attended the sprint: Please don't replace this blog post
                +with its Swedish translation (or indeed a translation to any other language :).
                +
                +
                +
                +
                + + Pim wrote on 2020-03-30 13:04: +
                +
                +

                How does HPY relate to CFFI?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2020-03-30 16:44: +
                +
                +

                @Pim: CFFI allows to wrap C code in pure Python.
                HPy allows to write Python extensions in C.

                For example, you can't write a new class in CFFI, and CFFI functions can't receive Python objects as arguments

                +
                +
                +
                + +

                PyPy and CFFI have moved to Heptapod

                + +
                +
                +
                +
                +
                +It has been a very busy month, not so much because of deep changes in the JIT of PyPy but more around the development, deployment, and packaging of the project.
                +

                +  +

                +

                +Hosting +

                +The biggest news is that we have moved the center of our development off Bitbucket and to the new https://foss.heptapod.net/pypy. This is a friendly fork of Gitlab called heptapod that understands Mercurial and is hosted by Clever Cloud. When Atlassian decided to close down Mercurial hosting on bitbucket.org, PyPy debated what to do. Our development model is based on long-lived branches, and we want to keep the ability to immediately see which branch each commit came from. Mercurial has this, git does not (see our FAQ). Octobus, whose business is Mercurial, developed a way to use Mercurial with Gitlab called heptapod. The product is still under development, but quite usable (i.e., it doesn't get in the way). Octobus partnered with Clever Cloud hosting to offer community FOSS projects hosted on Bitbucket who wish to remain with Mercurial a new home. PyPy took them up on the offer, and migrated its repos to https://foss.heptapod.net/pypy. We were very happy with how smooth it was to import the repos to heptapod/GitLab, and are learning the small differences between Bitbucket and GitLab. All the pull requests, issues, and commits kept the same ids, but work is still being done to attribute the issues, pull requests, and comments to the correct users. So from now on, when you want to contribute to PyPy, you do so at the new home.

                +CFFI, which previously was also hosted on Bitbucket, has joined the PyPy group at https://foss.heptapod.net/pypy/cffi.
                +
                +

                +  +

                +

                +Website +

                +Secondly, thanks to work by https://baroquesoftware.com/ in leading a redesign and updating the logo, the https://www.pypy.org website has undergone a facelift. It should now be easier to use on small-screen devices. Thanks also to the PSF for hosting the site.
                +
                +

                +  +

                +

                +Packaging +

                +Also, building PyPy from source takes a fair amount of time. While we provide downloads in the form of tarballs or zipfiles, and some platforms such as debian and Homebrew provide packages, traditionally the downloads have only worked on a specific flavor of operating system. A few years ago squeaky-pl started providing portable builds. We have adopted that build system for our linux offerings, so the nightly downloads and release downloads should now work on any glibc platform that has not gone EndOfLife. So there goes another excuse not to use PyPy. And the "but does it run scipy" excuse also no longer holds, although "does it speed up scipy" still has the wrong answer. For that we are working on HPy, and will be sprinting soon.
                +The latest versions of pip, wheel, and setuptools, together with the manylinux2010 standard for linux wheels and tools such as multibuild or cibuildwheels (well, from the next version) make it easier for library developers to build binary wheels for PyPy. If you are having problems getting going with this, please reach out.
                +
                +
                +

                +  +

                +

                +Give it a try +

                +Thanks to all the folks who provide the infrastructure PyPy depends on. We hope the new look will encourage more involvement and engagement. Help prove us right!

                +The PyPy Team
                +
                +
                +
                +
                +
                +
                +
                + + Matěj Cepl wrote on 2020-02-16 21:20: +
                +
                +

                Could you elaborate on “this is not always possible with Git”, please? This is too brief statement for my taste, and it doesn't make much sense (merging a branch certainly doesn't make it go away, and it is certainly possible to find to which branch a commit used to belong before merging).

                +
                +
                +
                +
                + + Armin Rigo wrote on 2020-02-16 22:35: +
                +
                +

                https://doc.pypy.org/en/latest/faq.html#why-doesn-t-pypy-use-git-and-move-to-github

                +
                +
                +
                +
                + + Miro Hrončok wrote on 2020-02-18 07:25: +
                +
                +

                Were issue attachments migrated properly?

                +
                +
                +
                +
                + + mattip wrote on 2020-02-19 07:34: +
                +
                +

                > Were issue attachments migrated properly?

                No. They should be migrated in a follow-up. Here is the heptapod issue about attachments https://foss.heptapod.net/heptapod/foss.heptapod.net/issues/37

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2020-02-19 08:19: +
                +
                +

                I deleted one anonymous comment that was insulting the project decision. Please make your points constructively or don't make them.

                +
                +
                +
                + +

                Leysin Winter sprint 2020: Feb 29 - March 8th

                + +
                + The next PyPy sprint will be in Leysin, Switzerland, for the fourteenth +time. This is a fully public sprint: newcomers and topics other than +those proposed below are welcome.



                +Goals and topics of the sprint

                +The list of topics is open.  For reference, we would like to work at least partially on the following topics:
                +As usual, the main side goal is to have fun in winter sports :-) +We can take a day off (for ski or anything else).

                +Times and accommodation

                +The sprint will occur for one week starting on Saturday, the 29th of February, to Sunday, the 8th of March 2020 (dates were pushed back one day!)  It will occur in Les Airelles, a different bed-and-breakfast place from the traditional one in Leysin.  It is a nice old house at the top of the village.

                We have a 4- or 5-people room as well as up to three double-rooms.  Please register early!  These rooms are not booked for the sprint in advance, and might be already taken if you end up announcing yourself late.  We have a big room for up to 7 people with nice view, which might be split in two or three sub-rooms; plus possibly separately-booked double rooms if needed. (But it is of course always possible to book at a different place in Leysin.)

                +For more information, see our repository or write to me directly at armin.rigo@gmail.com. +
                +

                PyPy 7.3.0 released

                + +
                +
                +The PyPy team is proud to release the version 7.3.0 of PyPy, which includes +two different interpreters:
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.13
                • +
                • PyPy3.6: which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.9.
                • +
                +
                +
                +
                +
                +The interpreters are based on much the same codebase, thus the double +release.

                +We have worked with the python packaging group to support tooling around +building third party packages for python, so this release changes the ABI tag +for PyPy.

                +Based on the great work done in portable-pypy, the linux downloads we +provide are now built on top of the manylinux2010 CentOS6 docker image. +The tarballs include the needed shared objects to run on any platform that +supports manylinux2010 wheels, which should include all supported versions of +debian- and RedHat-based distributions (including Ubuntu, CentOS, and Fedora).

                +The CFFI backend has been updated to version 1.13.1. We recommend using CFFI +rather than c-extensions to interact with C.
                + +The built-in cppyy module was upgraded to 1.10.6, which +provides, among others, better template resolution, stricter enum handling, +anonymous struct/unions, cmake fragments for distribution, optimizations for +PODs, and faster wrapper calls. We recommend using cppyy for performant +wrapping of C++ code for Python.

                +The vendored pyrepl package for interaction inside the REPL was updated.

                +Support for codepage encoding and decoding was added for Windows.

                +As always, this release fixed several issues and bugs raised by the growing +community of PyPy users. We strongly recommend updating. Many of the fixes are +the direct result of end-user bug reports, so please continue reporting issues +as they crop up.
                + +You can download the v7.3 releases here:
                + +
                +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.

                +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular packages to run +on pypy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 3 new contributors, +thanks for pitching in.

                +If you are a python library maintainer and use c-extensions, please consider making a cffi / cppyy version of your library that would be performant on PyPy. If you are stuck with using the C-API, you can use docker images with PyPy built in or the multibuild system to build wheels.

                +

                +What is PyPy? +

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                +We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +This PyPy release supports:
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bit, Mac OS X 64-bit, Windows 32-bit, OpenBSD, FreeBSD)
                • +
                +
                +
                +
                  +
                • big- and little-endian variants of PPC64 running Linux +
                • +
                +
                +
                +
                  +
                • +s390x running Linux
                • +
                +
                +
                +
                  +
                • 64-bit ARM machines running Linux
                • +
                +Unfortunately at the moment of writing our ARM buildbots are out of service, +so for now we are not releasing any binary for the ARM architecture (32-bit), although PyPy does support ARM 32-bit processors.

                +

                +What else is new? +

                +PyPy 7.2 was released in October, 2019. +There are many incremental improvements to RPython and PyPy. For more information about the 7.3.0 release, see the full changelog.

                +Please update, and continue to help us make PyPy better.

                +Cheers,
                +The PyPy team +
                +

                +
                +
                +
                +

                HPy kick-off sprint report

                + +
                +

                Recently Antonio, Armin and Ronan had a small internal sprint in the beautiful +city of Gdańsk to kick-off the development of HPy. Here is a brief report of +what was accomplished during the sprint.

                +
                +

                What is HPy?

                +

                The TL;DR answer is "a better way to write C extensions for Python".

                +

                The idea of HPy was born during EuroPython 2019 in Basel, where there was an +informal meeting which included core developers of PyPy, CPython (Victor +Stinner and Mark Shannon) and Cython (Stefan Behnel). The ideas were later also +discussed with Tim Felgentreff of GraalPython, to make sure they would also be +applicable to this very different implementation. Windel Bouwman of RustPython +is following the project as well.

                +

                All of us agreed that the current design of the CPython C API is problematic +for various reasons and, in particular, because it is too tied to the current +internal design of CPython. The end result is that:

                + +
                  +
                • alternative implementations of Python (such as PyPy, but not only) have a +hard time loading and executing existing C extensions;
                • +
                • CPython itself is unable to change some of its internal implementation +details without breaking the world. For example, as of today it would be +impossible to switch from using reference counting to using a real GC, +which in turn makes it hard, for example, to remove the GIL, as gilectomy +attempted.
                • +
                +

                HPy tries to address these issues by following two major design guidelines:

                +
                  +
                1. objects are referenced and passed around using opaque handles, which are +similar to e.g., file descriptors in spirit. Multiple, different handles +can point to the same underlying object, handles can be duplicated and +each handle must be released independently of any other duplicate.
                2. +
                3. The internal data structures and C-level layout of objects are not +visible nor accessible using the API, so each implementation is free to +use what fits best.
                4. +
                +

                The other major design goal of HPy is to allow incremental transition and +porting, so existing modules can migrate their codebase one method at a time. +Moreover, Cython is considering optionally generating HPy code, so extension +modules written in Cython would be able to benefit from HPy automatically.

                +

                More details can be found in the README of the official HPy repository.

                +
                +
                +

                Target ABI

                +

                When compiling an HPy extension you can choose one of two different target ABIs:

                + +
                  +
                • +HPy/CPython ABI: in this case, hpy.h contains a set of macros and +static inline functions. At compilation time this translates the HPy API +into the standard C-API. The compiled module will have no performance +penalty, and it will have a "standard" filename like +foo.cpython-37m-x86_64-linux-gnu.so.
                • +
                • +Universal HPy ABI: as the name implies, extension modules compiled +this way are "universal" and can be loaded unmodified by multiple Python +interpreters and versions. Moreover, it will be possible to dynamically +enable a special debug mode which will make it easy to find e.g., open +handles or memory leaks, without having to recompile the extension.
                • +
                +

                Universal modules can also be loaded on CPython, thanks to the +hpy_universal module which is under development. An extra layer of +indirection enables loading extensions compiled with the universal ABI. Users +of hpy_universal will face a small performance penalty compared to the ones +using the HPy/CPython ABI.

                +

                This setup gives several benefits:

                + +
                  +
                • Extension developers can use the extra debug features given by the +Universal ABI with no need to use a special debug version of Python.
                • +
                • Projects which need the maximum level of performance can compile their +extension for each relevant version of CPython, as they are doing now.
                • +
                • Projects for which runtime speed is less important will have the choice of +distributing a single binary which will work on any version and +implementation of Python.
                • +
                +
                +
                +

                A simple example

                +

                The HPy repo contains a proof of concept module. Here is a simplified +version which illustrates what a HPy module looks like:

                +
                +#include "hpy.h"
                +
                +HPy_DEF_METH_VARARGS(add_ints)
                +static HPy add_ints_impl(HPyContext ctx, HPy self, HPy *args, HPy_ssize_t nargs)
                +{
                +    long a, b;
                +    if (!HPyArg_Parse(ctx, args, nargs, "ll", &a, &b))
                +        return HPy_NULL;
                +    return HPyLong_FromLong(ctx, a+b);
                +}
                +
                +
                +static HPyMethodDef PofMethods[] = {
                +    {"add_ints", add_ints, HPy_METH_VARARGS, ""},
                +    {NULL, NULL, 0, NULL}
                +};
                +
                +static HPyModuleDef moduledef = {
                +    HPyModuleDef_HEAD_INIT,
                +    .m_name = "pof",
                +    .m_doc = "HPy Proof of Concept",
                +    .m_size = -1,
                +    .m_methods = PofMethods
                +};
                +
                +
                +HPy_MODINIT(pof)
                +static HPy init_pof_impl(HPyContext ctx)
                +{
                +    HPy m;
                +    m = HPyModule_Create(ctx, &moduledef);
                +    if (HPy_IsNull(m))
                +        return HPy_NULL;
                +    return m;
                +}
                +
                +

                People who are familiar with the current C-API will surely notice many +similarities. The biggest differences are:

                + +
                  +
                • Instead of PyObject *, objects have the type HPy, which as +explained above represents a handle.
                • +
                • You need to explicitly pass an HPyContext around: the intent is +primarily to be future-proof and make it easier to implement things like +sub-interpreters.
                • +
                • +HPy_METH_VARARGS is implemented differently than CPython's +METH_VARARGS: in particular, these methods receive an array of HPy +and its length, instead of a fully constructed tuple: passing a tuple +makes sense on CPython where you have it anyway, but it might be an +unnecessary burden for alternate implementations. Note that this is +similar to the new METH_FASTCALL which was introduced in CPython.
                • +
                • HPy relies a lot on C macros, which most of the time are needed to support +the HPy/CPython ABI compilation mode. For example, HPy_DEF_METH_VARARGS +expands into a trampoline which has the correct C signature that CPython +expects (i.e., PyObject *(*)(PyObject *self, PyObject *args)) and +which calls add_ints_impl.
                • +
                +
                +
                +

                Sprint report and current status

                +

                After this long preamble, here is a rough list of what we accomplished during +the week-long sprint and the days immediately after.

                +

                On the HPy side, we kicked-off the code in the repo: at the moment of writing +the layout of the directories is a bit messy because we moved things around +several times, but we identified several main sections:

                + +
                  +
                1. +

                  A specification of the API which serves both as documentation and as an +input for parts of the projects which are automatically +generated. Currently, this lives in public_api.h.

                  +
                2. +
                3. +

                  A set of header files which can be used to compile extension modules: +depending on whether the flag -DHPY_UNIVERSAL_ABI is passed to the +compiler, the extension can target the HPy/CPython ABI or the HPy +Universal ABI

                  +
                4. +
                5. +

                  A CPython extension module called hpy_universal which makes it +possible to import universal modules on CPython

                  +
                6. +
                7. +

                  A set of tests which are independent of the implementation and are meant +to be an "executable specification" of the semantics. Currently, these +tests are run against three different implementations of the HPy API:

                  + +
                    +
                  • the headers which implement the "HPy/CPython ABI"
                  • +
                  • the hpy_universal module for CPython
                  • +
                  • the hpy_universal module for PyPy (these tests are run in the PyPy repo)
                  • +
                  +
                8. +
                +

                Moreover, we started a PyPy branch in which to implement the +hpy_universal module: at the moment of writing PyPy can pass all the HPy +tests apart from the ones which allow conversion to and from PyObject *. +Among the other things, this means that it is already possible to load the +very same binary module in both CPython and PyPy, which is impressive on its +own :).

                +

                Finally, we wanted a real-life use case to show how to port a module to HPy +and to do benchmarks. After some searching, we chose ultrajson, for the +following reasons:

                + +
                  +
                • it is a real-world extension module which was written with performance in +mind
                • +
                • when parsing a JSON file it does a lot of calls to the Python API to +construct the various parts of the result message
                • +
                • it uses only a small subset of the Python API
                • +
                +

                This repo contains the HPy port of ultrajson. This commit shows an example +of what the porting looks like.

                +

                ujson_hpy is also a very good example of incremental migration: so far +only ujson.loads is implemented using the HPy API, while ujson.dumps +is still implemented using the old C-API, and both can coexist nicely in the +same compiled module.

                +
                +
                +

                Benchmarks

                +

                Once we have a fully working ujson_hpy module, we can finally run +benchmarks! We tested several different versions of the module:

                + +
                  +
                • +ujson: this is the vanilla implementation of ultrajson using the +C-API. On PyPy this is executed by the infamous cpyext compatibility +layer, so we expect it to be much slower than on CPython
                • +
                • +ujson_hpy: our HPy port compiled to target the HPy/CPython ABI. We +expect it to be as fast as ujson +
                • +
                • +ujson_hpy_universal: same as above but compiled to target the +Universal HPy ABI. We expect it to be slightly slower than ujson on +CPython, and much faster on PyPy.
                • +
                +

                Finally, we also ran the benchmark using the builtin json module. This is +not really relevant to HPy, but it might still be interesting as a +reference data point.

                +

                The benchmark is very simple and consists of parsing a big JSON file 100 +times. Here is the average time per iteration (in milliseconds) using the +various versions of the module, CPython 3.7 and the latest version of the hpy +PyPy branch:

                + +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                 CPythonPyPy
                ujson154.32633.97
                ujson_hpy152.19 
                ujson_hpy_universal168.78207.68
                json224.59135.43
                +

                As expected, the benchmark proves that when targeting the HPy/CPython ABI, HPy +doesn't impose any performance penalty on CPython. The universal version is +~10% slower on CPython, but gives an impressive 3x speedup on PyPy! It is +worth noting that the PyPy hpy module is not fully optimized yet, and we +expect to be able to reach the same performance as CPython for this particular +example (or even more, thanks to our better GC).

                +

                All in all, not a bad result for two weeks of intense hacking :)

                +

                It is also worth noting that PyPy's builtin json module does really +well in this benchmark, thanks to the recent optimizations that were described +in an earlier blog post.

                +
                +
                +

                Conclusion and future directions

                +

                We think we can be very satisfied about what we have got so far. The +development of HPy is quite new, but these early results seem to indicate that +we are on the right track to bring Python extensions into the future.

                +

                At the moment, we can anticipate some of the next steps in the development of +HPy:

                + +
                  +
                • Think about a proper API design: what we have done so far has +been a "dumb" translation of the API we needed to run ujson. However, +one of the declared goals of HPy is to improve the design of the API. There +will be a trade-off between the desire of having a clean, fresh new API +and the need to be not too different than the old one, to make porting +easier. Finding the sweet spot will not be easy!
                • +
                • Implement the "debug" mode, which will help developers to find +bugs such as leaking handles or using invalid handles.
                • +
                • Instruct Cython to emit HPy code on request.
                • +
                • Eventually, we will also want to try to port parts of numpy to HPy to +finally solve the long-standing problem of sub-optimal numpy +performance in PyPy.
                • +
                +

                Stay tuned!

                + +
                +
                +
                +
                +
                + + Anonymous wrote on 2019-12-18 16:22: +
                +
                +

                Is HPy going to be C(++)-specific? Will you consider the feasibility of implementing that API in other languages, such as Rust? Extensive usage of macros is something that's more difficult to generate bindings for.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2019-12-18 19:00: +
                +
                +

                At the moment HPy is two things:

                - A C API: here the goal is to have something which is easy to write and to migrate from existing C extensions. The macros are mostly needed to overcome limitations of C as a language

                - an ABI: this is independent from C: any language can decide what is the best API to generate extensions compatible with such an ABI

                +
                +
                +
                +
                + + Anonymous wrote on 2019-12-18 23:53: +
                +
                +

                This sounds really interesting.

                What does this mean for the future of CFFI?

                +
                +
                +
                +
                + + René Dudfield wrote on 2019-12-19 07:41: +
                +
                +

                Great work!

                Especially happy with the consideration of incremental adoption.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2019-12-19 09:35: +
                +
                +

                @Unknown: CFFI solves a different problem, which is how to wrap an existing C library which does not need to manipulate Python objects. As such, it will continue its development independently than HPy, as far as I can see

                +
                +
                +
                +
                + + Anonymous wrote on 2019-12-20 08:39: +
                +
                +
                Hi PyPy team, thanks for your great work but I found this:

                import sqlite3
                print sqlite3.version

                2.6.0


                sqlite3 is SQLite 2?

                Any chance of a dot release to bring sqlite3 up to date? +
                +
                +
                +
                + + Anonymous wrote on 2019-12-20 09:09: +
                +
                +

                import sqlite3
                print sqlite3.sqlite_version

                D'oh!
                Sorry about that :-)

                +
                +
                +
                + +

                PyPy v7.2 released

                + +
                +
                +The PyPy team is proud to release the version 7.2.0 of PyPy, which includes +two different interpreters:
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.13
                • +
                +
                  +
                • PyPy3.6: which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.9.
                • +
                +
                +
                +
                +
                +The interpreters are based on much the same codebase, thus the double +release.

                +As always, this release is 100% compatible with the previous one and fixed +several issues and bugs raised by the growing community of PyPy users. +We strongly recommend updating. Many of the fixes are the direct result of +end-user bug reports, so please continue reporting issues as they crop up.

                +You can download the v7.2 releases here:
                + +
                +With the support of Arm Holdings Ltd. and Crossbar.io, this release supports +the 64-bit aarch64 ARM architecture. More about the work and the +performance data around this welcome development can be found in the blog +post.

                + +This release removes the “beta” tag from PyPy3.6. While there may still be some +small corner-case incompatibilities (around the exact error messages in +exceptions and the handling of faulty codec errorhandlers) we are happy with +the quality of the 3.6 series and are looking forward to working on a Python +3.7 interpreter.

                +We updated our benchmark runner at https://speed.pypy.org to a more modern +machine and updated the baseline python to CPython 2.7.11. Thanks to Baroque +Software for maintaining the benchmark runner.

                +The CFFI-based _ssl module was backported to PyPy2.7 and updated to use +cryptography version 2.7. Additionally, the _hashlib, and crypt (or +_crypt on Python3) modules were converted to CFFI. This has two +consequences: end users and packagers can more easily update these libraries +for their platform by executing (cd lib_pypy; ../bin/pypy _*_build.py). +More significantly, since PyPy itself links to fewer system shared objects +(DLLs), on platforms with a single runtime namespace like linux, different CFFI +and c-extension modules can load different versions of the same shared object +into PyPy without collision (issue 2617).

                +Until downstream providers begin to distribute c-extension builds with PyPy, we +have made packages for some common packages available as wheels.

                +The CFFI backend has been updated to version 1.13.0. We recommend using CFFI +rather than c-extensions to interact with C, and cppyy for interacting with +C++ code.

                +Thanks to Anvil, we revived the PyPy Sandbox, (soon to be released) which allows total control +over a Python interpreter’s interactions with the external world.

                +We implemented a new JSON decoder that is much faster, uses less memory, and +uses a JIT-friendly specialized dictionary. More about that in the recent blog post

                +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. +
                +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 27 new contributors, +so thanks for pitching in.

                +

                +What is PyPy? +

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                +We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +This PyPy release supports:
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bit, Mac OS X 64-bit, Windows 32-bit, OpenBSD, FreeBSD)
                • +
                +
                +
                +
                  +
                • big- and little-endian variants of PPC64 running Linux +
                • +
                +
                +
                +
                  +
                • +s390x running Linux
                • +
                +
                +
                +
                  +
                • 64-bit ARM machines running Linux
                • +
                +
                +
                +
                +
                +Unfortunately at the moment of writing our ARM buildbots are out of service, +so for now we are not releasing any binary for the ARM architecture (32-bit), although PyPy does support ARM 32-bit processors.

                +

                +What else is new? +

                +PyPy 7.1 was released in March, 2019. +There are many incremental improvements to RPython and PyPy. For more information about the 7.2.0 release, see the full changelog.

                +Please update, and continue to help us make PyPy better.

                +Cheers,
                +The PyPy team +
                +

                +
                +
                +
                +

                PyPy's new JSON parser

                + +
                +

                +Introduction

                +In the last year or two I have worked on and off on making PyPy's +JSON faster, particularly when parsing large +JSON files. In this post I am going to document those techniques and +measure their performance impact. Note that I am quite a lot more +constrained in what optimizations I can apply here, compared to some of +the much more advanced approaches like +Mison, +Sparser or +SimdJSON because I don't want to +change the json.loads API that Python programs expect, and because I +don't want to only support CPUs with wide SIMD extensions. With a more +expressive API, more optimizations would be possible.
                +There are a number of problems of working with huge JSON files: +deserialization takes a long time on the one hand, and the resulting +data structures often take a lot of memory (usually they can be many +times bigger than the size of the file they originated from). Of course +these problems are related, because allocating and initializing a big +data structure takes longer than a smaller data structure. Therefore I +always tried to attack both of these problems at the same time.
                +One common theme of the techniques I am describing is that of optimizing +the parser for how JSON files are typically used, not how they could +theoretically be used. This is a similar approach to the way dynamic +languages are optimized more generally: most JITs will optimize for +typical patterns of usage, at the cost of less common usage patterns, +which might even become slower as a result of the optimizations.

                +Maps

                +The first technique I investigated is to use maps in the JSON parser. +Maps, also called hidden classes or shapes, are a fairly common way to +(generally, not just in the context of JSON parsing) optimize instances +of +classes +in dynamic language VMs. Maps exploit the fact that while it is in +theory possible to add arbitrary fields to an instance, in practice most +instances of a class are going to have the same set of fields (or one of +a small number of different sets). Since JSON dictionaries or objects +often come from serialized instances of some kind, this property often +holds in JSON files as well: dictionaries often have the same fields in +the same order, within a JSON file.
                +This property can be exploited in two ways: on the one hand, it can be +used to again store the deserialized dictionaries in a more memory +efficient way by not using a hashmap in most cases, but instead +splitting the dictionary into a shared description of the set of keys +(the map) and an array of storage with the values. This makes the +deserialized dictionaries smaller if the same set of keys is repeated a +lot. This is completely transparent to the Python programmer, the +dictionary will look completely normal to the Python program but its +internal representation is different.
                +One downside of using maps is that sometimes files will contain many +dictionaries that have unique key sets. Since maps themselves are quite +large data structures and since dictionaries that use maps contain an +extra level of indirection we want to fall back to using normal hashmaps +to represent the dictionaries where that is the case. To prevent this we +perform some statistics at runtime, how often every map (i.e. set of +keys) is used in the file. For uncommonly used maps, the map is +discarded and the dictionaries that used the map converted into using a +regular hashmap.

                +Using Maps to Speed up Parsing

                +Another benefit of using maps to store deserialized dictionaries is that +we can use them to speed up the parsing process itself. To see how this +works, we need to understand maps a bit better. All the maps produced as +a side-effect of parsing JSON form a tree. The tree root is a map that +describes the object without any attributes. From every tree node we +have a number of edges going to other nodes, each edge for a specific +new attribute added:

                +This map tree is the result of parsing a file that has dictionaries with +the keys a, b, c many times, the keys a, b, f less often, and also some +objects with the keys x, y.
                +When parsing a dictionary we traverse this tree from the root, according +to the keys that we see in the input file. While doing this, we +potentially add new nodes, if we get key combinations that we have never +seen before. The set of keys of a dictionary parsed so far are +represented by the current tree node, while we can store the values into +an array. We can use the tree of nodes to speed up parsing. A lot of the +nodes only have one child, because after reading the first few keys of +an object, the remaining ones are often uniquely determined in a given +file. If we have only one child map node, we can speculatively parse the +next key by doing a memcmp between the key that the map tree says is +likely to come next and the characters that follow the ',' that started +the next entry in the dictionary. If the memcmp reports that the two are equal this +means that the speculation paid off, and we can transition to the new map +that the edge points to, and parse the corresponding value. If not, we +fall back to general code that parses the string, handles escaping rules +etc. This trick was explained to me by some V8 engineers, the same trick +is supposedly used as part of the V8 JSON parser.
                +This scheme doesn't immediately work for map tree nodes that have more +than one child. However, since we keep statistics anyway about how often +each map is used as the map of a parsed dictionary, we can speculate +that the most common map transition is taken more often than the others +in the future, and use that as the speculated next node.
                +So for the example transition tree shown in the figure above the key +speculation would succeed for objects with keys a, b, c. For objects +with keys a, b, f the speculation would succeed for the first two +keys, but not for the third key f. For objects with the keys +x, y the speculation would fail for the first key x but succeed +for the second key y.
                +For real-world datasets these transition trees can become a lot more +complicated, for example here is a visualization of a part of the +transition tree generated for parsing a New York Times dataset:
                + +

                +Caching Strings

                +A rather obvious observation we can use to improve performance of the +parser is the fact that string values repeat a lot in most JSON files. +For strings that are used as dictionary keys this is pretty obvious. +However it happens also for strings that are used as values in +dictionaries (or are stored in lists). We can use this fact to +intern/memoize strings and save memory. This is an approach that many +JSON parsers use, including +CPython's. +To do this, I keep a dictionary of strings that we have seen so far +during parsing and look up new strings that are deserialized. If we have +seen the string before, we can re-use the deserialized previous string. +Right now I only consider utf-8 strings for caching that do not contain +any escapes (whether stuff like \", \n or escaped unicode chars).
                +This simple approach works extremely well for dictionary keys, but needs +a number of improvements to be a win in general. The first observation +is that computing the hash to look up the string in the dictionary of +strings we've seen so far is basically free. We can compute the hash +while scanning the input for the end of the string we are currently +deserializing. Computing the hash while scanning doesn't increase the +time spent scanning much. This is not a new idea, I am sure many other +parsers do the same thing (but CPython doesn't seem to).
                +Another improvement follows from the observation that inserting every +single deserialized non-key string into a hashmap is too expensive. +Instead, we insert strings into the cache more conservatively, by +keeping a small ring buffer of hashes of recently deserialized strings. +The hash is looked for in the ring buffer, and only if the hash is +present we insert the string into the memoization hashmap. This has the +effect of only inserting strings into the memoization hashmap that +re-occur a second time not too far into the file. This seems to give a +good trade-off between still re-using a lot of strings but keeping the +time spent updating and the size of the memoization hashmap low.
                +Another twist is that in a lot of situations caching strings is not +useful at all, because it will almost never succeed. Examples of this +are UUIDs (which are unique), or the content of a tweet in a JSON file +with many tweets (which is usually unique). However, in the same file it +might be useful to cache e.g. the user name of the Twitter user, because +many tweets from the same person could be in such a file. Therefore the +usefulness of the string cache depends on which fields of objects we are +deserializing the value of. Therefore we keep statistics per map field +and disable string memoization per individual field if the cache hit +rate falls below a certain threshold. This gives the best of both +worlds: in the cases where string values repeat a lot in certain fields +we use the cache to save time and memory. But for those fields that +mostly contain unique strings we don't waste time looking up and adding +strings in the memoization table. Strings outside of dictionaries are +quite rare anyway, so we just always try to use the cache for them.
                +The following pseudocode sketches the code to deserialize a string in +the input at a given position. The function also takes a map, which is +the point in the map tree that we are currently deserializing a field +of (if we are deserializing a string in another context, some kind of +dummy map can be used there).
                
                +def deserialize_string(pos, input, map):
                +    # input is the input string, pos is the position of the starting " of
                +    # the string
                +
                +    # find end of string, check whether it contains escape codes,
                +    # compute hash, all at the same time
                +    end, escapes, hash = find_end_of_string(pos + 1, input)
                +    if end == -1:
                +        raise ParseError
                +    if escapes:
                +        # need to be much more careful with escaping
                +        return deserialize_string_escapes(pos, input)
                +    
                +    # should we cache at all?
                +    if map.cache_disabled():
                +        return input[pos + 1:end]
                +
                +    # if string is in cache, return it
                +    if hash in cache:
                +        map.cache_hit += 1
                +        return cache[hash]
                +
                +    result = input[pos + 1:end]
                +    map.cache_miss += 1
                +
                +    # if hash is in the ring buffer of recently seen hashes,
                +    # add the string to the cache
                +    if hash in ring_buffer:
                +        cache[hash] = result
                +    else:
                +        ring_buffer.write(hash)
                +    return result
                +
                +
                +
                +

                +Evaluation

                +To find out how much the various techniques help, I implemented a number +of JSON parsers in PyPy with different combinations of the techniques +enabled. I compared the numbers with the JSON parser of CPython 3.7.3 +(simplejson), with ujson, with the JSON parser of Node 12.11.1 (V8) and with +RapidJSON (in DOM mode).
                +I collected a number of medium-to-large JSON files to try the JSON +parsers on:
                  +
                • +Censys: A subset of the Censys port and +protocol scan data for websites in the Alexa top million domains
                • +
                • +Gharchive: Github activity from +January 15-23, 2015 from Github Archive
                • +
                • +Reddit: Reddit +comments from May 2009
                • +
                • Rosie: The nested matches produced using the Rosie pattern +language all.things pattern on a log +file
                • +
                • Nytimes: Metadata of a collection of New York Times articles
                • +
                • Tpch: The TPC-H database benchmark's deals table as a JSON file
                • +
                • Twitter: A JSON export of the @pypyproject Twitter account data
                • +
                • Wikidata: A file storing a subset of the Wikidata fact dump from Nov +11, 2014
                • +
                • +Yelp: A file of yelp +businesses
                • +
                +Here are the file sizes of the benchmarks:
                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                BenchmarkFile Size [MiB]
                Censys898.45
                Gharchive276.34
                NYTimes12.98
                Reddit931.65
                Rosie388.88
                TPCH173.86
                Wikidata119.75
                Yelp167.61
                +I measured the times of each benchmark with a number of variations +of the improved PyPy algorithms:
                  +
                • PyPyBaseline: The PyPy JSON parser as it was before my work with JSON +parsing started (PyPy version 5.8)
                • +
                • PyPyKeyStringCaching: Memoizing the key strings of dictionaries, but +not the other strings in a json file, and not using maps to represent +dictionaries (this is the JSON parser that PyPy has been shipping since +version 5.9, in the benchmarks I used 7.1).
                • +
                • PyPyMapNoCache: Like PyPyKeyStringCaching, but using maps to +represent dictionaries. This includes speculatively parsing the next +key using memcmp, but does not use string caching of non-key strings.
                • +
                • PyPyFull: Like PyPyMapNoCache but uses a string cache for all +strings, not just keys. This is equivalent to what will be released soon as part of PyPy 7.2.
                • +
                +In addition to wall clock time of parsing, I also measured the increase +in memory use of each implementation after the input string has been +deserialized, i.e. the size of the in-memory representation of every +JSON file.

                +Contributions of Individual Optimizations

                +Let's first look at the contributions of the individual optimizations to the +overall performance and memory usage.

                +All the benchmarks were run 30 times in new processes, all the numbers are +normalized to PyPyFull.
                +The biggest individual improvement to both parsing time and memory used comes +from caching just the keys in parsed dictionaries. This is the optimization in +PyPy's JSON parser that has been implemented for a while already. To understand +why this optimization is so useful, let's look at some numbers about each +benchmark, namely the number of total keys across all dictionaries in each +file, as well as the number of unique keys. As we can see, for all benchmarks +the number of unique keys is significantly smaller than the number of keys in +total.
                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                BenchmarkNumber of keysNumber of unique keys
                Censys14 404 234163
                Gharchive6 637 881169
                NYTimes417 33760
                Reddit25 226 39721
                Rosie28 500 1015
                TPCH6 700 00045
                Wikidata6 235 0881 602
                Yelp5 133 91461
                +The next big jump in deserialization time and memory comes from introducing +maps to represent deserialized dictionaries. With PyPyMapNoCache +deserialization time goes down because it's much cheaper to walk the tree +of maps and store all deserialized objects into an array of values than to +build hashmaps with the same keys again and again. Memory use goes down +for the same reason: it takes a lot less memory to store the shared +structure of each set of keys in the map, as opposed to repeating it again +and again in every hashmap.
                +We can look at some numbers about every benchmark again. The table shows how +many map-based dictionaries are deserialized for every benchmark, and how many +hashmap-backed dictionaries. We see that the number of hashmap-backed +dictionaries is often zero, or at most a small percentage of all dictionaries +in each benchmark. Yelp has the biggest number of hashmap-backed dictionaries. +The reason for this is that the input file contains hashmaps that store +combinations of various features of Yelp businesses, and a lot of these +combinations are totally unique to a business. Therefore the heuristics +determine that it's better to store these using hashmaps.
                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                BenchmarkMap DictsRegular Dicts% Regular Dicts
                Censys4 049 2351 0420.03
                Gharchive955 30100.00
                NYTimes80 39300.00
                Reddit1 201 25700.00
                Rosie6 248 96600.00
                TPCH1 000 00000.00
                Wikidata1 923 46046 9052.38
                Yelp443 14052 05110.51
                + +We can also look at numbers about how often the memcmp-based speculative +parsing of the next key of a given map succeeds. Looking at statistics +about each benchmark, we can see that the speculation of what key we +expect next pays off in a significant percentage of cases, between 63% for +Wikidata where the dictionary structures are quite irregular, and 99% for +Reddit, where all the dictionaries have the same set of keys.
                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                BenchmarkNumber of KeysMap Transitions% Successful Speculation
                Censys14 404 23414 403 24365.79
                Gharchive6 637 8816 637 88186.71
                NYTimes417 337417 33779.85
                Reddit25 226 39725 226 397100.00
                Rosie28 500 10128 500 10190.37
                TPCH6 700 0006 700 00086.57
                Wikidata6 235 0885 267 74463.68
                Yelp5 133 9144 593 98090.43
                geomean82.04
                +General string caching is the most unclear optimization. On the one hand its +impact on memory usage is quite substantial, leading to a 20% reduction for +Gharchive and Reddit, up to a 2× improvement for Yelp. On the other hand, the +effect on performance is less clear, since it even leads to a slowdown in +Gharchive and Reddit, and generally only a small improvement. Choosing the +right heuristic for when to disable the cache also has somewhat unclear effects +and is definitely a topic worthy of further investigation.

                +Comparison against other JSON Decoders

                +To get a more general feeling of the performance and memory usage of the +improved PyPy parser, we compare it against CPython's built-in json +parser, ujson for CPython, Node's (V8) JSON parser and RapidJSON. For +better context for the memory usage I also show the file size of the input +files.
                +These benchmarks are not really an apples-to-apples comparison. All of the +implementations use different in-memory representations of strings in +the deserialized data-structure (Node uses two bytes per character in +a string, in CPython it +depends but 4 bytes on my +machine, PyPyBaseline uses four bytes, PyPy and RapidJSON use utf-8). But +it's still interesting to get some ballpark numbers. The results are as +follows:

                +As we can see, PyPyFull handily beats CPython and ujson, with a geometric +mean of the improvement of about 2.5×. The memory improvement can be even +more extreme, with an improvement of over 4× against CPython/ujson in some +cases (CPython gives better memory sizes, because its parser caches the +keys of dictionaries as well). Node is often more than 50% slower, whereas +RapidJSON beats us easily, by a factor of 2× on average.

                +Conclusions

                +While the speedup I managed to achieve over the course of this project is +nice and I am certainly happy to beat both CPython and Node, I am +ultimately still annoyed that RapidJSON manages to maintain such a clear +lead over PyPyFull, and would like to get closer to it. One problem that +PyPy suffers compared to RapidJSON is the overhead of garbage collection. +Deserializing large JSON files is pretty much the worst case for the +generational GC that PyPy uses, since none of the deserialized objects die +young (and the GC expects that most objects do). That means that a lot of +the deserialization time of PyPy is wasted allocating the resulting +objects in the nursery, and then copying them into the old generation. +Somehow, this should be done in better ways, but all my attempts to not +have to do the copy did not seem to help much. So maybe more improvements +are possible, if I can come up with more ideas.
                +On the memory side of things, Node/V8 is beating PyPy clearly which might +indicate more general problems in how we represent Python objects in +memory. On the other hand, I think it's cool that we are competitive with +RapidJSON in terms of memory and often within 2× of the file size.
                +An effect that I didn't consider at all in this blog post is the fact that +accessing the deserialized objects with constant strings is also faster +than with regular dictionaries, due to them being represented with maps. +More benchmarking work to do in the future!
                +If you have your own programs that run on PyPy and use the json parser +a lot, please measure them on the new code and let me know whether you see +any difference! +
                +
                +
                +
                + + Unknown wrote on 2019-10-09 09:49: +
                +
                +

                Great work! Excited for the new release.

                This makes me wonder if maps are (or can be) used for identical dicts which are constructed in a tight loop (e.g. from a CSV or SQLAlchemy rows).

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2019-10-09 11:20: +
                +
                +

                thanks! yes, that should be possible somehow and would indeed be quite cool. If you give us a real benchmark, we can think about it (maybe I should start with csv.DictReader?)

                +
                +
                +
                +
                + + Alexander Hultnér | Hultnér Technologies wrote on 2019-10-09 12:52: +
                +
                +

                Excellent work!

                These posts are always a pleasure to read, and the improvements in PyPy are wonderful.
                Thanks to you and everyone who's involved in making PyPy the great product it is today, do keep the great work up!

                +
                +
                +
                +
                + + peak wrote on 2019-10-15 03:12: +
                +
                +

                Great work, but could you please provide links (preferably of the permalink variety) to the datasets you used for benchmarking? Thanks.

                +
                +
                +
                + +

                A second life for the Sandbox

                + +
                +

                Hi all,

                Anvil is a UK-based company sponsoring one month of work to revive PyPy's +"sandbox" mode and upgrade it to PyPy3. Thanks to them, sandboxing will be +given a second life!

                +The sandboxed PyPy is a special version of PyPy that runs +fully isolated. It gives a safe way to execute arbitrary Python +programs (whole programs, not small bits of code inside your larger Python +program). Such scripts can be fully untrusted, and they can try to do +anything—there are no syntax-based restrictions, for example—but whatever +they do, any communication with the external world is not actually done but +delegated to the parent process. This is similar but much more flexible than +Linux's Seccomp approach, and it is more lightweight than setting up a full +virtual machine. It also works without operating system support.

                +However, during the course of the years the sandbox mode of PyPy has been +mostly unmaintained and unsupported by the core developers, mostly because of +a lack of interest by users and because it took too much effort to maintain +it.

                +Now we have found that we have an actual user, Anvil. As far as I can tell +they are still using a very old version of PyPy, the last one that supported +sandboxing. This is where this contract comes from: the goal is to modernize sandboxing and port it to PyPy3.

                +Part of my motivation for accepting this work is that I may have found a way to +tweak the protocol on the pipe between the sandboxed PyPy and the parent +controller process. This should make the sandboxed PyPy more resilient against +future developments and easier to maintain; at most, in the future some tweaks will be needed in the +controller process but hopefully not deep inside the guts of the sandboxed +PyPy. Among the advantages, such a more robust solution should mean that we +can actually get a working sandboxed PyPy—or sandboxed PyPy3 or sandboxed +version of any other interpreter written in RPython—with just an extra +argument when calling rpython to translate this interpreter. If everything +works as planned, sandboxing may be given a second life.

                +Armin Rigo

                +
                +
                +
                +
                + + mark wrote on 2020-03-16 11:10: +
                +
                +

                Hi Armin,

                I like your initiative a lot - I think it is very useful to have a safe execution environment for python scripts (a lot can be done, once this is achieved).
                Please keep us updated about the state of development.
                I am wondering, if it is already in a usable condition - descriptions diverge here.
                Thanks, mark

                +
                +
                +
                + +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-41.html b/blog/index-41.html new file mode 100644 index 000000000..0337d07de --- /dev/null +++ b/blog/index-41.html @@ -0,0 +1,1013 @@ + + + + + + +PyPy (old posts, page 41) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                #pypy IRC moves to Libera.Chat

                + +
                +
                +

                Following the example of many other FOSS projects, the PyPy team has +decided to move its official #pypy IRC channel from Freenode to +Libera.Chat: irc.libera.chat/pypy

                +

                The core devs will no longer be present on the Freenode channel, so we recommend +joining the new channel as soon as possible.

                +

                wikimedia.org has a +nice guide on +how to set up your client to migrate from Freenode to Libera.Chat.

                +

                Read more…

                +
                +
                +

                PyPy v7.3.5: bugfix release of python 2.7 and 3.7

                + +
                +

                PyPy v7.3.5: release of 2.7 and 3.7

                +

                We are releasing a PyPy 7.3.5 with bugfixes for PyPy 7.3.4, released April 4. +PyPy 7.3.4 was the first release that runs on windows 64-bit, so that support +is still "beta". We are releasing it in the hopes that we can garner momentum +for its continued support, but are already aware of some problems, for instance +it errors in the NumPy test suite (issue 3462). Please help out with testing +the release and reporting successes and failures, financially supporting our +ongoing work, and helping us find the source of these problems.

                +
                  +
                • The new windows 64-bit builds improperly named c-extension modules +with the same extension as the 32-bit build (issue 3443)

                • +
                • Use the windows-specific PC/pyconfig.h rather than the posix one

                • +
                • Fix the return type for _Py_HashDouble which impacts 64-bit windows

                • +
                • A change to the python 3.7 sysconfig.get_config_var('LIBDIR') was wrong, +leading to problems finding libpypy3-c.so for embedded PyPy (issue 3442).

                • +
                • Instantiate distutils.command.install schema for PyPy-specific +implementation_lower

                • +
                • Delay thread-checking logic in greenlets until the thread is actually started +(continuation of issue 3441)

                • +
                • +

                  Four upstream (CPython) security patches were applied:

                  +
                    +
                  • BPO 42988 to remove pydoc.getfile

                  • +
                  • BPO 43285 to not trust the PASV response in ftplib.

                  • +
                  • BPO 43075 to remove a possible ReDoS in urllib AbstractBasicAuthHandler

                  • +
                  • BPO 43882 to sanitize urls containing ASCII newline and tabs in +urllib.parse

                  • +
                  +
                • +
                • Fix for json-specialized dicts (issue 3460)

                • +
                • Specialize ByteBuffer.setslice which speeds up binary file reading by a +factor of 3

                • +
                • When assigning the full slice of a list, evaluate the rhs before clearing the +list (issue 3440)

                • +
                • On Python2, PyUnicode_Contains accepts bytes as well as unicode.

                • +
                • Finish fixing _sqlite3 - untested _reset() was missing an argument +(issue 3432)

                • +
                • Update the packaged sqlite3 to 3.35.5 on windows. While not a bugfix, this +seems like an easy win.

                • +
                +

                We recommend updating. These fixes are the direct result of end-user bug +reports, so please continue reporting issues as they crop up.

                +

                You can find links to download the v7.3.5 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our renovated blog site via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better.

                +

                If you are a python library maintainer and use C-extensions, please consider +making a CFFI / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +soon 3.8. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                This PyPy release supports:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32/64 bits, OpenBSD, FreeBSD)

                • +
                • big- and little-endian variants of PPC64 running Linux,

                • +
                • s390x running Linux

                • +
                • 64-bit ARM machines running Linux.

                • +
                +
                +

                PyPy does support ARM 32 bit processors, but does not release binaries.

                +
                +
                +

                Some Ways that PyPy uses Graphviz

                + +
                +

                Some ways that PyPy uses Graphviz

                +

                Somebody wrote this super cool thread on Twitter about using Graphviz to make +software visualize its internal state:

                +

                PyPy is using this approach a lot too and I collected a few screenshots of that +technique on Twitter and I thought it would make a nice blog post too!

                +

                The most important view early in the project, and the way that our Graphviz +visualizations got started was that we implemented a way to look at the control +flow graphs of our RPython functions after type inference. They are in static +single information form (SSI), a variant of SSA form. Hovering over the +variables shows the inferred types in the footer:

                +/images/2021-graphviz-02-cfg-types.png

                There's another view that shows the inferred call graph of the program:

                +/images/2021-graphviz-05-call-graph.png

                A related viewer shows the inferred class hierarchy (in this case the exception +hierarchy) and you can focus on a single class, which will show you its base +classes and all the methods and instance attributes that were found:

                +/images/2021-graphviz-03-classhier.png/images/2021-graphviz-04-classhier-detailed.png

                We also have a view to show us the traces that are produced by the tracing JIT +tests. This viewer doesn't really scale to the big traces that the full Python +interpreter produces, but it's really useful during testing:

                +/images/2021-graphviz-06-trace.png

                Then there are more traditional tree views, e.g. here is a parse tree for a small +piece of Python source code:

                +/images/2021-graphviz-07-parse-tree.png

                Parsing-related we have visualized the DFAs of the parser in the past, +though the code is unfortunately lost.

                +

                All these visualizations are made by walking the relevant data structures and +producing a Graphviz input file using a bit of string manipulation, which is +quite easy to do. Knowing a bit of Graphviz is a really useful skill, it's +super easy to make throwaway visualizations.

                +

                For example here is a one-off thing I did when debugging our JSON parser to +show the properties of the objects used in a huge example json file:

                +/images/2021-graphviz-08-json-parser.png

                On top of graphviz, we have a custom tool called the dotviewer, which is +written in Python and uses Pygame to give you a zoomable, pannable, searchable +way to look at huge Graphviz graphs. All the images in this post are +screenshots of that tool. In its simplest form it takes any .dot files as +input.

                +

                Here's a small video of the dotviewer, moving around and searching in the json graph. +By writing a bit of extra Python code the dotviewer can also be extended to add +hyperlinks in the graphs to navigate to different views (for example, we did +that for the callgraphs above).

                +

                All in all this is a really powerful approach to understand the behaviour of +some code, or when debugging complicated problems and we have gotten a +huge amount of mileage out of this over the years. It can be seen as an instance +of moldable development ("a way of programming through which you construct +custom tools for each problem"). And it's really easy to get into! The Graphviz +language is quite a simple text-based language that can be applied to a huge +amount of different visualization situations.

                +
                +
                +

                PyPy v7.3.4: release of python 2.7 and 3.7

                + +
                +

                PyPy v7.3.4: release of python 2.7 and 3.7

                +

                The PyPy team is proud to release the version 7.3.4 of PyPy, which includes +two different interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.10. We no longer refer to +this as beta-quality as the last incompatibilities with CPython (in the +re module) have been fixed.

                • +
                +
                +

                We are no longer releasing a Python3.6 version, as we focus on updating to +Python 3.8. We have begun streaming the advances towards this goal on Saturday +evenings European time on https://www.twitch.tv/pypyproject. If Python3.6 is +important to you, please reach out as we could offer sponsored longer term +support.

                +

                The two interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release include binary Windows 64 support, +faster numerical instance fields, and a preliminary HPy backend.

                +

                A new contributor (Ondrej Baranovič - thanks!) took us up on the challenge to get +windows 64-bit support. The work has been merged and for the first time we +are releasing a 64-bit Windows binary package.

                +

                The release contains the biggest change to PyPy's implementation of the +instances of user-defined classes in many years. The optimization was +motivated by the report of performance problems running a numerical particle +emulation. We implemented an optimization that stores int and float +instance fields in an unboxed way, as long as these fields are type-stable +(meaning that the same field always stores the same type, using the principle +of type freezing). This gives significant performance improvements on +numerical pure-Python code, and other code where instances store many integers +or floating point numbers.

                +

                There were also a number of optimizations for methods around strings and bytes, +following user reported performance problems. If you are unhappy with PyPy's +performance on some code of yours, please report an issue!

                +

                A major new feature is preliminary support for the Universal mode of HPy: a +new way of writing c-extension modules to totally encapsulate PyObject*. +The goal, as laid out in the HPy documentation and recent HPy blog post, +is to enable a migration path +for c-extension authors who wish their code to be performant on alternative +interpreters like GraalPython (written on top of the Java virtual machine), +RustPython, and PyPy. Thanks to Oracle and IBM for sponsoring work on HPy.

                +

                Support for the vmprof statistical profiler has been extended to ARM64 via a +built-in backend.

                +

                Several issues exposed in the 7.3.3 release were fixed. Many of them came from the +great work ongoing to ship PyPy-compatible binary packages in conda-forge. +A big shout out to them for taking this on.

                +

                Development of PyPy takes place on https://foss.heptapod.net/pypy/pypy. +We have seen an increase in the number of drive-by contributors who are able to +use gitlab + mercurial to create merge requests.

                +

                The CFFI backend has been updated to version 1.14.5 and the cppyy backend +to 1.14.2. We recommend using CFFI rather than C-extensions to interact with C, +and using cppyy for performant wrapping of C++ code for Python.

                +

                As always, we strongly recommend updating to the latest versions. Many fixes +are the direct result of end-user bug reports, so please continue reporting +issues as they crop up.

                +

                You can find links to download the v7.3.4 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our renovated blog site via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better. Since the +previous release, we have accepted contributions from 10 new contributors, +thanks for pitching in, and welcome to the project!

                +

                If you are a python library maintainer and use C-extensions, please consider +making a cffi / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +soon 3.8. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                This PyPy release supports:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32/64 bits, OpenBSD, FreeBSD)

                • +
                • big- and little-endian variants of PPC64 running Linux,

                • +
                • s390x running Linux

                • +
                • 64-bit ARM machines running Linux.

                • +
                +
                +

                PyPy does support ARM 32 bit processors, but does not release binaries.

                +

                What else is new?

                +

                For more information about the 7.3.4 release, see the full changelog.

                +

                Please update, and continue to help us make PyPy better.

                +

                Cheers, +The PyPy team

                +
                +
                +

                New HPy blog

                + +
                +

                Regular readers of this blog +already know +about HPy, a project which aims to develop a new C +API for Python to make it easier/faster to support C extensions on alternative +Python implementations, including PyPy.

                +

                The HPy team just published the +first post of the new HPy +blog, so if you are interested in its development, make sure to check it out!

                +
                +

                Mac meets Arm64

                + +
                + Looking for sponsorship + +

                Apple now ships Macs which are running on an arm64 variant machine with the +latest version of MacOS, Big Sur M1. We are getting requests for PyPy to +support this new architecture. Here is our position on this topic (or at least +mine, Armin Rigo's), and how you can help.

                + +

                Porting PyPy is harder than just re-running the compiler, because PyPy contains +a few big architecture-dependent "details", like the JIT compiler and the +foreign function interfaces (CFFI and ctypes).

                + +

                Fixing the JIT compiler should not be too much work: we already support arm64, +just the Linux one. But Apple made various details different (like the calling +conventions). A few other parts need to be fixed too, notably CFFI and ctypes, +again because of the calling conventions.

                + +

                Fixing that would be a reasonable amount of work. I would do it myself for a +small amount of money. However, the story doesn't finish here. Obviously, the +start of the story would be to get ssh access to a Big Sur M1 machine. (If at +this point you're thinking "sure, I can give you ssh access for three months", +then please read on.) The next part of the story is that we need a machine +available long term. It can be either a machine provided and maintained by a +third party, or alternatively a pot of money big enough to support the +acquisition of a machine and ongoing work of one of us.

                + +

                If we go with the provided-machine solution: What we need isn't a lot of +resources. Our CI requires maybe 10 GB of disk space, and a few hours of CPU +per run. It should fit into 8 GB of RAM. We normally do a run every night but +we can certainly lower the frequency a bit if that would help. However, we'd +ideally like some kind of assurance that you are invested into maintaining the +machine for the next 3-5 years (I guess, see below). We had far too many +machines that disappeared after a few months.

                + +

                If we go with the money-supported solution: it's likely that after 3-5 years +the whole Mac base will have switched to arm64, we'll drop x86-64 support for +Mac, and we'll be back to the situation of the past where there was only one +kind of Mac machine to care about. In the meantime, we are looking at 3-5 +years of lightweight extra maintenance. We have someone that has said he would +do it, but not for free.

                + +

                If either of these two solutions occurs, we'll still have, I quote, "probably +some changes in distutils-type stuff to make python happy", and then some +packaging/deployment changes to support the "universal2" architecture, i.e. +including both versions inside a single executable (which will not be just an +extra switch to clang, because the two versions need a different JIT backend +and so must be translated separately).

                + +

                So, now all the factors are on the table. We won't do the minimal "just the +JIT compiler fixes" if we don't have a plan that goes farther. Either we get +sufficient money, and maybe support, and then we can do it quickly; or PyPy +will just remain not natively available on M1 hardware for the next 3-5 years. +We are looking forward to supporting M1, and view resources contributed by +the community as a vote of confidence in assuring the future of PyPy on this +hardware. Contact us: pypy-dev@python.org, or our private mailing +list pypy-z@python.org.

                + +

                Thanks for reading!

                + +

                Armin Rigo

                +
                +
                +
                +
                + + Adam Sah wrote on 2020-12-31 14:16: +
                +
                +

                if you post a crowdsourcing link (e.g. gofundme, etc) I'd be happy to contribute, and now that it's hit the front page of HN, I'm sure lots of other people would join. M1 macs are pretty inexpensive.

                p.s. thanks!!! for all the work - I use pypy regularly.

                +
                +
                +
                +
                + + Joshua Herman wrote on 2020-12-31 16:47: +
                +
                +

                I have an M1 MacBook Air that I could give you SSH access to but it will come to me in mid January.

                +
                +
                +
                +
                + + Anonymous wrote on 2020-12-31 21:51: +
                +
                +

                ditto on the crowdsource

                +
                +
                +
                +
                + + Michael wrote on 2021-01-01 00:03: +
                +
                +

                You can contribute to PyPy on their Open Collective page:

                https://opencollective.com/pypy

                +
                +
                +
                +
                + + Adam Sah wrote on 2021-01-01 00:25: +
                +
                +

                done.

                +
                +
                +
                +
                + + Anonymous wrote on 2021-01-02 20:03: +
                +
                +

                M1 Macs for CI are available for free for open source developers. See: https://www.macstadium.com/opensource

                +
                +
                +
                +
                + + Armin Rigo wrote on 2021-01-02 20:29: +
                +
                +

                @Anonymous: like many others, MacStadium is conflating "open source" with "hobbyist" by adding this clause: "Open source project may not (...)receive funding from commercial companies or organizations (NGO, education, research or governmental). (...) Contributors who are paid to work on the project are not eligible." The point of my blog post was precisely that I won't do it for free.

                +
                +
                +
                +
                + + glyph wrote on 2021-01-04 05:39: +
                +
                +

                It seems like it might be worth reaching out to MacStadium about it regardless. They've got Golang, Rust, Node, NumFocus, and Monero listed on their support page https://www.macstadium.com/opensource-members which suggests to me that this language might just be a hamfistedly awkward attempt to avoid somebody at Facebook trying to get a free fleet of mac minis out of open sourcing their SDK or something.

                +
                +
                +
                + +

                PyPy 7.3.3 triple release: python 3.7, 3.6, and 2.7

                + +
                +

                 The PyPy team is proud to release the version 7.3.3 of PyPy, which includes +three different interpreters: +

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18 (updated from the +previous version)
                • +
                • PyPy3.6: which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.12 (updated from the +previous version).
                • +
                • PyPy3.7 beta: which is our second release of an interpreter supporting the +syntax and the features of Python 3.7, including the stdlib for CPython +3.7.9. We call this beta quality software, there may be issues about +compatibility with new and changed features in CPython 3.7. +Please let us know what is broken or missing. We have not implemented the +documented changes in the re module, and a few other pieces are also +missing. For more information, see the PyPy 3.7 wiki page
                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the 7.3 +releases, but read on to find out what is new.

                +

                Several issues found in the 7.3.2 release were fixed. Many of them came from the +great work by conda-forge to ship PyPy binary packages. A big shout out +to them for taking this on.

                +

                Development of PyPy has moved to https://foss.heptapod.net/pypy/pypy. +This was covered more extensively in this blog post. We have seen an +increase in the number of drive-by contributors who are able to use gitlab + +mercurial to create merge requests.

                +

                The CFFI backend has been updated to version 1.14.3. We recommend using CFFI +rather than c-extensions to interact with C, and using cppyy for performant +wrapping of C++ code for Python.

                +

                A new contributor took us up on the challenge to get windows 64-bit support. +The work is proceeding on the win64 branch, more help in coding or +sponsorship is welcome. In anticipation of merging this large change, we fixed +many test failures on windows.

                +

                As always, this release fixed several issues and bugs. We strongly recommend +updating. Many of the fixes are the direct result of end-user bug reports, so +please continue reporting issues as they crop up.

                +

                You can find links to download the v7.3.3 releases here:

                +
                + +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 2 new contributors, +thanks for pitching in.

                +

                If you are a python library maintainer and use c-extensions, please consider +making a cffi / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +
                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.6, and +3.7. It’s fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                This PyPy release supports:

                +
                +
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                • 64-bit ARM machines running Linux.
                • +
                +
                +

                PyPy does support ARM 32 bit processors, but does not release binaries.

                +

                 

                +

                +What else is new? +

                +For more information about the 7.3.3 release, see the full changelog.

                +Please update, and continue to help us make PyPy better.

                +Cheers,
                +The PyPy team +

                 

                +
                +
                +

                PyPy 7.3.2 triple release: python 2.7, 3.6, and 3.7

                + +
                +

                 

                +
                The PyPy team is proud to release version 7.3.2 of PyPy, which includes +three different interpreters: +
                +
                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.13
                • +
                • PyPy3.6: which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.9.
                • +
                • PyPy3.7 alpha: which is our first release of an interpreter supporting the +syntax and the features of Python 3.7, including the stdlib for CPython +3.7.9. We call this an alpha release since it is our first. It is based off PyPy 3.6 so +issues should be around compatibility and not stability. Please try it out +and let us know what is broken or missing. We have not implemented some of the +documented changes in the re module, and other pieces are also +missing. For more information, see the PyPy 3.7 wiki page
                • +
                +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the 7.3.0 (Dec +2019) and 7.3.1 (April 2020) releases, but read on to find out what is new.

                +

                Conda Forge now supports PyPy as a python interpreter. The support is quite +complete for linux and macOS. This is the result of a lot of +hard work and good will on the part of the Conda Forge team. A big shout out +to them for taking this on.

                +

                Development of PyPy has transitioned to https://foss.heptapod.net/pypy/pypy. +This move was covered more extensively in this blog post. We have seen an +increase in the number of drive-by contributors who are able to use gitlab + +mercurial to create merge requests.

                +

                The CFFI backend has been updated to version 1.14.2. We recommend using CFFI +rather than c-extensions to interact with C, and using cppyy for performant +wrapping of C++ code for Python.

                +

                NumPy has begun shipping wheels on PyPI for PyPy, currently for linux 64-bit +only. Wheels for PyPy windows will be available from the next NumPy release. Thanks to NumPy for their support.

                +

                A new contributor took us up on the challenge to get windows 64-bit support. +The work is proceeding on the win64 branch, more help in coding or +sponsorship is welcome.

                +

                As always, this release fixed several issues and bugs. We strongly recommend +updating. Many of the fixes are the direct result of end-user bug reports, so +please continue reporting issues as they crop up.

                +

                You can find links to download the v7.3.2 releases here:

                +
                + +
                +

                We would like to thank our donors for the continued support of the PyPy +project. Please help support us at Open Collective. If PyPy is not yet good enough for your needs, we are available for +direct consulting work.

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 8 new contributors, +thanks for pitching in.

                +

                If you are a python library maintainer and use c-extensions, please consider +making a cffi / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6, and 3.7. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                This PyPy release supports:

                +
                +
                  +
                • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                • +
                • big- and little-endian variants of PPC64 running Linux,
                • +
                • +s390x running Linux
                • +
                • 64-bit ARM machines running Linux.
                • +
                +
                +

                PyPy does support ARM 32 bit processors, but does not release binaries.

                +
                + +
                +

                +What else is new? +

                +For more information about the 7.3.2 release, see the full changelog.

                +Please update, and continue to help us make PyPy better.

                +Cheers,
                +The PyPy team +
                +

                 

                +

                 

                +
                +
                +
                +
                + + Marius Gedminas wrote on 2020-09-25 09:47: +
                +
                +

                The SHA256 checksum for pypy3.6-v7.3.2-aarch64.tar.bz2 is one character too short on the https://www.pypy.org/download.html. Was it accidentally truncated during a copy/paste?

                +
                +
                +
                +
                + + Anonymous wrote on 2020-09-25 14:03: +
                +
                +

                Better work

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2020-09-25 14:45: +
                +
                +

                Concerning sponsorship for the win64 branch, I am in no position to sponsor the entire thing but I would happily give a few euros if you opened a specific fund raiser for that. Note that the donation link (on the blog) is currently broken.

                +
                +
                +
                +
                + + mattip wrote on 2020-09-26 17:32: +
                +
                +

                Marius Gedminas: thanks. Indeed a copy-paste truncation. Should be fixed now, try to refresh (may take 30 minutes or so to propagate)

                +
                +
                +
                + +

                PyPy is on Open Collective

                + +
                +

                Hi all,

                + +

                PyPy is now a member of Open Collective, a fiscal host. We have been thinking about switching to this organization for a couple of years; we like it for various reasons, like the budget transparency and the lightweight touch. We can now officially announce our membership!

                + +

                With this, we are now again free to use PyPy for all financial issues, like receiving funds professionally, paying parts of sprint budgets as we like, and so on. We will shortly be reintroducing buttons that link to Open Collective from the PyPy web site.

                + +

                Although the old donation buttons were removed last year, we believe that there are still a few people that regularly send money to the SFC, the not-for-profit charity we were affiliated with. If you do, please stop doing it now (and, if you like to do so, please set up an equivalent donation to PyPy on Open Collective).

                + +

                And by the way, sorry for all of you who were getting mixed feelings from the previous blog post (co-written with the SFC). PyPy is committed to continue being Open Source just like before. This was never in question. What these two blog posts mean is only that we switched to a different organization for our internal finances.

                + +

                We're looking forward to how this new relationship will go!

                + +

                Armin Rigo, for the PyPy team

                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-42.html b/blog/index-42.html new file mode 100644 index 000000000..1b26299ea --- /dev/null +++ b/blog/index-42.html @@ -0,0 +1,2208 @@ + + + + + + +PyPy (old posts, page 42) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                M1 support for PyPy

                + +
                +

                The PyPy team is happy to announce that we can now target the macOS ARM64 +platform. Much of the work was executed by Maciej Fijałkowski (fijal) and +funded via a generous contribution to our OpenCollective. The work is based +on our existing support for aarch64 (arm64 on linux) with some twists +to support the differences between the CPUs and the operating system. There +are nightly builds for pypy3.8 and pypy3.9 (look for macos_arm64), and +the architecture will be part of our next release.

                +

                Please try it out and let us know how it is useful for you or how we could +improve.

                +

                We still need help improving our macOS support. We have an open issue to +help our packaging story. Help is welcome.

                +

                The PyPy team.

                +
                +

                Implementing a Toy Optimizer

                + +
                +

                In this blog post I want to show the complete code (in Python3) of how a very +simple optimizer for sequences of operations can work. These algorithms could +be part of a (really simple) compiler, or a JIT. The architecture of the code in +this blog post is very similar to that of the trace optimizer of the PyPy JIT: +After a trace is produced, it is optimized before being sent to the machine code +backend that produces binary instructions for the CPU architecture that PyPy is +running on.

                +

                To get started, the first thing we need to do is define how our operations are +stored. The +format that a compiler uses to store the program while it is being optimized +is usually called its intermediate representation (IR). Many production +compilers use IRs that are in the Static Single-Assignment Form (SSA), and +we will also use that. SSA form has the property that every variable is +assigned to exactly once, and every variable is defined before it is used. This +simplifies many things.

                +

                Let's make this concrete. If our input program is a complex expression, such +as a * (b + 17) + (b + 17) the intermediate representation of that (or at +least its text representation) would maybe be something like:

                +
                var1 = add(b, 17)
                +var2 = mul(a, var1)
                +var3 = add(b, 17)
                +var4 = add(var2, var3)
                +

                This sequence of instructions is inefficient. The operation add(b, 17) is +computed twice and we can save time by removing the second one and only +computing it once. In this post I want to show an optimizer that can do this +(and some related) optimizations.

                +

                Looking at the IR we notice that the input expression has been linearized +into a sequence of operations, and all the intermediate results have been given +unique variable names. The value that every variable is assigned is computed +by the right hand side, which is some operation consisting of an operator and an +arbitrary number of arguments. The arguments of an operation are either +themselves variables or constants.

                +

                I will not at all talk about the process of translating the input program +into the IR. Instead, I will assume we have some component that does this +translation already. The tests in this blog post will construct small +snippets of IR by hand. I also won't talk about what happens after the +optimization (usually the optimized IR is translated into machine code).

                +

                Implementing the Intermediate Representation

                +

                Let's start modelling the intermediate representation with Python classes. +First we define a base class of all values that can be used as arguments in +operations, and let's also add a class that represents constants:

                +
                import pytest
                +from typing import Optional, Any
                +
                +class Value:
                +    pass
                +
                +class Constant(Value):
                +    def __init__(self, value: Any):
                +        self.value = value
                +
                +    def __repr__(self):
                +        return f"Constant({self.value})"
                +
                +

                One consequence of the fact that every variable is assigned to only once is +that variables are in a one-to-one correspondence with the right-hand-side of +their unique assignments. That means that we don't need a class that represents +variables at all. Instead, it's sufficient to have a class that represents an +operation (the right-hand side), and that by definition is the same as the variable (left-hand side) that it defines:

                +
                class Operation(Value):
                +    def __init__(self, name: str, args: list[Value]):
                +        self.name = name
                +        self.args = args
                +
                +    def __repr__(self):
                +        return f"Operation({self.name}, {self.args})"
                +
                +    def arg(self, index: int):
                +        return self.args[index]
                +
                +

                Now we can instantiate these two classes to represent the example sequence of +operations above:

                +
                def test_construct_example():
                +    # first we need something to represent
                +    # "a" and "b". In our limited view, we don't
                +    # know where they come from, so we will define
                +    # them with a pseudo-operation called "getarg"
                +    # which takes a number n as an argument and
                +    # returns the n-th input argument. The proper
                +    # SSA way to do this would be phi-nodes.
                +
                +    a = Operation("getarg", [Constant(0)])
                +    b = Operation("getarg", [Constant(1)])
                +    # var1 = add(b, 17)
                +    var1 = Operation("add", [b, Constant(17)])
                +    # var2 = mul(a, var1)
                +    var2 = Operation("mul", [a, var1])
                +    # var3 = add(b, 17)
                +    var3 = Operation("add", [b, Constant(17)])
                +    # var4 = add(var2, var3)
                +    var4 = Operation("add", [var2, var3])
                +
                +    sequence = [a, b, var1, var2, var3, var4]
                +    # nothing to test really, it shouldn't crash
                +
                +

                Usually, complicated programs are represented as a control flow graph in a +compiler, which represents all the possible paths that control can take while +executing the program. Every node in the control flow graph is a basic +block. A basic block is a linear sequence of operations with no control flow +inside of it.

                +

                When optimizing a program, a compiler usually looks at the whole control flow +graph of a function. However, that is still too complicated! So let's +simplify further and look only at optimizations we can do when looking at +a single basic block and its sequence of instructions (they are called local +optimizations).

                +

                Let's define a class representing basic blocks and let's also add some +convenience functions for constructing sequences of operations, because the +code in test_construct_example is a bit annoying.

                +
                class Block(list):
                +    def opbuilder(opname):
                +        def wraparg(arg):
                +            if not isinstance(arg, Value):
                +                arg = Constant(arg)
                +            return arg
                +        def build(self, *args):
                +            # construct an Operation, wrap the
                +            # arguments in Constants if necessary
                +            op = Operation(opname,
                +                [wraparg(arg) for arg in args])
                +            # add it to self, the basic block
                +            self.append(op)
                +            return op
                +        return build
                +
                +    # a bunch of operations we support
                +    add = opbuilder("add")
                +    mul = opbuilder("mul")
                +    getarg = opbuilder("getarg")
                +    dummy = opbuilder("dummy")
                +    lshift = opbuilder("lshift")
                +
                +def test_convencience_block_construction():
                +    bb = Block()
                +    # a again with getarg, the following line
                +    # defines the Operation instance and
                +    # immediately adds it to the basic block bb
                +    a = bb.getarg(0)
                +    assert len(bb) == 1
                +    assert bb[0].name == "getarg"
                +
                +    # it's a Constant
                +    assert bb[0].args[0].value == 0
                +
                +    # b with getarg
                +    b = bb.getarg(1)
                +    # var1 = add(b, 17)
                +    var1 = bb.add(b, 17)
                +    # var2 = mul(a, var1)
                +    var2 = bb.mul(a, var1)
                +    # var3 = add(b, 17)
                +    var3 = bb.add(b, 17)
                +    # var4 = add(var2, var3)
                +    var4 = bb.add(var2, var3)
                +    assert len(bb) == 6
                +
                +

                That's a good bit of infrastructure to make the tests easy to write. One +thing we are lacking though is a way to print the basic blocks into a nicely +readable textual representation. Because in the current form, the repr of a +Block is very annoying, the output of pretty-printing bb in the test above +looks like this:

                +
                [Operation('getarg', [Constant(0)]),
                + Operation('getarg', [Constant(1)]),
                + Operation('add',
                +           [Operation('getarg',
                +                      [Constant(1)]),
                +                 Constant(17)]),
                + Operation('mul',
                +           [Operation('getarg',
                +                      [Constant(0)]),
                +                 Operation('add',
                +                           [Operation('getarg',
                +                                      [Constant(1)]),
                +                            Constant(17)])]),
                + Operation('add',
                +           [Operation('getarg',
                +                      [Constant(1)]),
                +            Constant(17)]),
                + Operation('add',
                +           [Operation('mul',
                +                       [Operation('getarg',
                +                                  [Constant(0)]),
                +                             Operation('add',
                +                                       [Operation('getarg',
                +                                                  [Constant(1)]),
                +                                        Constant(17)])]),
                +                 Operation('add',
                +                           [Operation('getarg',
                +                                           [Constant(1)]),
                +                                 Constant(17)])])]
                +
                +

                It's impossible to see what is going on here, because the Operations in the +basic block appear several times, once as elements of the list but then also as +arguments to operations further down in the list. So we need some code that +turns things back into a readable textual representation, so we have a chance +to debug.

                +
                def bb_to_str(bb: Block, varprefix: str = "var"):
                +    # the implementation is not too important,
                +    # look at the test below to see what the
                +    # result looks like
                +
                +    def arg_to_str(arg: Value):
                +        if isinstance(arg, Constant):
                +            return str(arg.value)
                +        else:
                +            # the key must exist, otherwise it's
                +            # not a valid SSA basic block:
                +            # the variable must be defined before
                +            # its first use
                +            return varnames[arg]
                +
                +    varnames = {}
                +    res = []
                +    for index, op in enumerate(bb):
                +        # give the operation a name used while
                +        # printing:
                +        var = f"{varprefix}{index}"
                +        varnames[op] = var
                +        arguments = ", ".join(
                +            arg_to_str(op.arg(i))
                +                for i in range(len(op.args))
                +        )
                +        strop = f"{var} = {op.name}({arguments})"
                +        res.append(strop)
                +    return "\n".join(res)
                +
                +def test_basicblock_to_str():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.add(5, 4)
                +    var2 = bb.add(var1, var0)
                +
                +    assert bb_to_str(bb) == """\
                +var0 = getarg(0)
                +var1 = add(5, 4)
                +var2 = add(var1, var0)"""
                +
                +    # with a different prefix for the invented
                +    # variable names:
                +    assert bb_to_str(bb, "x") == """\
                +x0 = getarg(0)
                +x1 = add(5, 4)
                +x2 = add(x1, x0)"""
                +
                +    # and our running example:
                +    bb = Block()
                +    a = bb.getarg(0)
                +    b = bb.getarg(1)
                +    var1 = bb.add(b, 17)
                +    var2 = bb.mul(a, var1)
                +    var3 = bb.add(b, 17)
                +    var4 = bb.add(var2, var3)
                +
                +    assert bb_to_str(bb, "v") == """\
                +v0 = getarg(0)
                +v1 = getarg(1)
                +v2 = add(v1, 17)
                +v3 = mul(v0, v2)
                +v4 = add(v1, 17)
                +v5 = add(v3, v4)"""
                +    # Note the re-numbering of the variables! We
                +    # don't attach names to Operations at all, so
                +    # the printing will just number them in
                +    # sequence, can sometimes be a source of
                +    # confusion.
                +
                +

                This is much better. Now we're done with the basic infrastructure, we can +define sequences of operations and print them in a readable way. Next we need a +central data structure that is used when actually optimizing basic blocks.

                +

                Storing Equivalences between Operations Using a Union-Find Data Structure

                +

                When optimizing a sequence of operations, we want to make it less costly to +execute. For that we typically want to remove operations (and sometimes +replace operations with less expensive ones). We can remove operations if +they do redundant computation, like in the case of the duplicate add(v1, 17) in +the example. So what we want to do is to turn the running input sequence:

                +
                v0 = getarg(0)
                +v1 = getarg(1)
                +v2 = add(v1, 17)
                +v3 = mul(v0, v2)
                +v4 = add(v1, 17)
                +v5 = add(v3, v4)
                +

                Into the following optimized output sequence:

                +
                optvar0 = getarg(0)
                +optvar1 = getarg(1)
                +optvar2 = add(optvar1, 17)
                +optvar3 = mul(optvar0, optvar2)
                +optvar4 = add(optvar3, optvar2)
                +

                We left out the second add (which defines v4), and then replaced the +usage of v4 with v2 in the final operation that defines v5.

                +

                What we effectively did was discover that v2 and v4 are equivalent and then +replace v4 with v2. In general, we might discover more such equivalences, +and we need a data structure to store them. A good data structure to store +these equivalences is Union Find (also called Disjoint-set data structure), +which stores a collection of disjoint sets. Disjoint means that no operation +can appear in more than one set. The sets in our concrete case are the sets of +operations that compute the same result.

                +

                When we start out, every operation is in its own singleton set, with no other +member. As we discover more equivalences, we will unify sets into larger sets +of operations that all compute the same result. So one operation the data +structure supports is union, to unify two sets, we'll call that +make_equal_to in the code below.

                +

                The other operation the data structure supports is find, which takes an +operation and returns a "representative" of the set of all equivalent +operations. Two operations are in the same set, if the representative that +find returns for them is the same.

                +

                The exact details of how the data structure works are only sort of important +(even though it's very cool, I promise!). It's OK to skip over the +implementation. We will add the data structure right into our Value, +Constant and Operation classes:

                +
                class Value:
                +    def find(self):
                +        raise NotImplementedError("abstract")
                +    def _set_forwarded(self, value):
                +        raise NotImplementedError("abstract")
                +
                +
                +class Operation(Value):
                +    def __init__(self, name: str, args: list[Value]):
                +        self.name = name
                +        self.args = args
                +        self.forwarded = None
                +
                +    def __repr__(self):
                +        return (
                +            f"Operation({self.name},"
                +            f"{self.args}, {self.forwarded})"
                +        )
                +
                +    def find(self) -> Value:
                +        # returns the "representative" value of
                +        # self, in the union-find sense
                +        op = self
                +        while isinstance(op, Operation):
                +            # could do path compression here too
                +            # but not essential
                +            next = op.forwarded
                +            if next is None:
                +                return op
                +            op = next
                +        return op
                +
                +    def arg(self, index):
                +        # change to above: return the
                +        # representative of argument 'index'
                +        return self.args[index].find()
                +
                +    def make_equal_to(self, value: Value):
                +        # this is "union" in the union-find sense,
                +        # but the direction is important! The
                +        # representative of the union of Operations
                +        # must be either a Constant or an operation
                +        # that we know for sure is not optimized
                +        # away.
                +
                +        self.find()._set_forwarded(value)
                +
                +    def _set_forwarded(self, value: Value):
                +        self.forwarded = value
                +
                +
                +class Constant(Value):
                +    def __init__(self, value: Any):
                +        self.value = value
                +
                +    def __repr__(self):
                +        return f"Constant({self.value})"
                +
                +    def find(self):
                +        return self
                +
                +    def _set_forwarded(self, value: Value):
                +        # if we found out that an Operation is
                +        # equal to a constant, it's a compiler bug
                +        # to find out that it's equal to another
                +        # constant
                +        assert isinstance(value, Constant) and \
                +            value.value == self.value
                +
                +def test_union_find():
                +    # construct three operations, and unify them
                +    # step by step
                +    bb = Block()
                +    a1 = bb.dummy(1)
                +    a2 = bb.dummy(2)
                +    a3 = bb.dummy(3)
                +
                +    # at the beginning, every op is its own
                +    # representative, that means every
                +    # operation is in a singleton set
                +    # {a1} {a2} {a3}
                +    assert a1.find() is a1
                +    assert a2.find() is a2
                +    assert a3.find() is a3
                +
                +    # now we unify a2 and a1, then the sets are
                +    # {a1, a2} {a3}
                +    a2.make_equal_to(a1)
                +    # they both return a1 as the representative
                +    assert a1.find() is a1
                +    assert a2.find() is a1
                +    # a3 is still different
                +    assert a3.find() is a3
                +
                +    # now they are all in the same set {a1, a2, a3}
                +    a3.make_equal_to(a2)
                +    assert a1.find() is a1
                +    assert a2.find() is a1
                +    assert a3.find() is a1
                +
                +    # now they are still all the same, and we
                +    # also learned that they are the same as the
                +    # constant 6
                +    # the single remaining set then is
                +    # {6, a1, a2, a3}
                +    c = Constant(6)
                +    a2.make_equal_to(c)
                +    assert a1.find() is c
                +    assert a2.find() is c
                +    assert a3.find() is c
                +
                +    # union with the same constant again is fine
                +    a2.make_equal_to(c)
                +
                +

                Constant Folding

                +

                Now comes the first actual optimization, a simple constant folding pass. It +will remove operations where all the arguments are constants and replace them +with the constant result.

                +

                Every pass has the same structure: we go over all operations in the basic +block in order and decide for each operation whether it can be removed. For the +constant folding pass, we can remove all the operations with constant +arguments (but we'll implement only the add case here).

                +

                I will show a buggy version of the constant folding pass first. It has a +problem that is related to why we need the union-find data structure. We will +fix it a bit further down.

                +
                def constfold_buggy(bb: Block) -> Block:
                +    opt_bb = Block()
                +
                +    for op in bb:
                +        # basic idea: go over the list and do
                +        # constant folding of add where possible
                +        if op.name == "add":
                +            arg0 = op.args[0]
                +            arg1 = op.args[1]
                +            if isinstance(arg0, Constant) and \
                +                    isinstance(arg1, Constant):
                +                # can constant-fold! that means we
                +                # learned a new equality, namely
                +                # that op is equal to a specific
                +                # constant
                +                value = arg0.value + arg1.value
                +                op.make_equal_to(Constant(value))
                +                # don't need to have the operation
                +                # in the optimized basic block
                +                continue
                +        # otherwise the operation is not
                +        # constant-foldable and we put into the
                +        # output list
                +        opt_bb.append(op)
                +    return opt_bb
                +
                +
                +def test_constfold_simple():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.add(5, 4)
                +    var2 = bb.add(var1, var0)
                +
                +    opt_bb = constfold_buggy(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = add(9, optvar0)"""
                +
                +@pytest.mark.xfail
                +def test_constfold_buggy_limitation():
                +    # this test fails! it shows the problem with
                +    # the above simple constfold_buggy pass
                +
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    # this is folded
                +    var1 = bb.add(5, 4)
                +    # we want this folded too, but it doesn't work
                +    var2 = bb.add(var1, 10)
                +    var3 = bb.add(var2, var0)
                +
                +    opt_bb = constfold_buggy(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = add(19, optvar0)"""
                +
                +

                Why does the test fail? The opt_bb printed output looks like this:

                +
                optvar0 = getarg(0)
                +optvar1 = add(9, 10)
                +optvar2 = add(optvar1, optvar0)
                +

                The problem is that when we optimize the second addition in constfold_buggy, +the argument of that operation is an Operation not a Constant, so +constant-folding is not applied to the second add. However, we have already +learned that the argument var1 to the operation var2 is equal to +Constant(9). This information is stored in the union-find data structure. +So what we are missing are suitable find calls in the constant folding pass, to +make use of the previously learned equalities.

                +

                Here's the fixed version:

                +
                def constfold(bb: Block) -> Block:
                +    opt_bb = Block()
                +
                +    for op in bb:
                +        # basic idea: go over the list and do
                +        # constant folding of add where possible
                +        if op.name == "add":
                +            # >>> changed
                +            arg0 = op.arg(0) # uses .find()
                +            arg1 = op.arg(1) # uses .find()
                +            # <<< end changes
                +            if isinstance(arg0, Constant) and \
                +                    isinstance(arg1, Constant):
                +                # can constant-fold! that means we
                +                # learned a new equality, namely
                +                # that op is equal to a specific
                +                # constant
                +                value = arg0.value + arg1.value
                +                op.make_equal_to(Constant(value))
                +                # don't need to have the operation
                +                # in the optimized basic block
                +                continue
                +        # otherwise the operation is not
                +        # constant-foldable and we put into the
                +        # output list
                +        opt_bb.append(op)
                +    return opt_bb
                +
                +
                +def test_constfold_two_ops():
                +    # now it works!
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.add(5, 4)
                +    var2 = bb.add(var1, 10)
                +    var3 = bb.add(var2, var0)
                +    opt_bb = constfold(bb)
                +
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = add(19, optvar0)"""
                +
                +

                Common Subexpression Elimination

                +

                The constfold pass only discovers equalities between Operations and +Constants. Let's do a second pass that also discovers equalities between +Operations and other Operations.

                +

                A simple optimization that has this property is common subexpression +elimination (CSE), which will finally optimize away the problem in the +introductory example code that we had above.

                +
                def cse(bb: Block) -> Block:
                +    # structure is the same, loop over the input,
                +    # add some but not all operations to the
                +    # output
                +
                +    opt_bb = Block()
                +
                +    for op in bb:
                +        # only do CSE for add here, but it
                +        # generalizes
                +        if op.name == "add":
                +            arg0 = op.arg(0)
                +            arg1 = op.arg(1)
                +            # Check whether we have emitted the
                +            # same operation already
                +            prev_op = find_prev_add_op(
                +                arg0, arg1, opt_bb)
                +            if prev_op is not None:
                +                # if yes, we can optimize op away
                +                # and replace it with the earlier
                +                # result, which is an Operation
                +                # that was already emitted to
                +                # opt_bb
                +                op.make_equal_to(prev_op)
                +                continue
                +        opt_bb.append(op)
                +    return opt_bb
                +
                +
                +def eq_value(val0, val1):
                +    if isinstance(val0, Constant) and \
                +            isinstance(val1, Constant):
                +        # constants compare by their value
                +        return val0.value == val1.value
                +    # everything else by identity
                +    return val0 is val1
                +
                +
                +def find_prev_add_op(arg0: Value, arg1: Value,
                +        opt_bb: Block) -> Optional[Operation]:
                +    # Really naive and quadratic implementation.
                +    # What we do is walk over the already emitted
                +    # operations and see whether we emitted an add
                +    # with the current arguments already. A real
                +    # implementation might use a hashmap of some
                +    # kind, or at least only look at a limited
                +    # window of instructions.
                +    for opt_op in opt_bb:
                +        if opt_op.name != "add":
                +            continue
                +        # It's important to call arg here,
                +        # for the same reason why we
                +        # needed it in constfold: we need to
                +        # make sure .find() is called
                +        if eq_value(arg0, opt_op.arg(0)) and \
                +                eq_value(arg1, opt_op.arg(1)):
                +            return opt_op
                +    return None
                +
                +
                +def test_cse():
                +    bb = Block()
                +    a = bb.getarg(0)
                +    b = bb.getarg(1)
                +    var1 = bb.add(b, 17)
                +    var2 = bb.mul(a, var1)
                +    var3 = bb.add(b, 17)
                +    var4 = bb.add(var2, var3)
                +
                +    opt_bb = cse(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = getarg(1)
                +optvar2 = add(optvar1, 17)
                +optvar3 = mul(optvar0, optvar2)
                +optvar4 = add(optvar3, optvar2)"""
                +
                +

                Strength Reduction

                +

                Now we have one pass that replaces Operations with Constants and one that +replaces Operations with previously existing Operations. Let's now do one +final pass that replaces Operations by newly invented Operations, a simple +strength reduction. This one will be simple.

                +
                def strength_reduce(bb: Block) -> Block:
                +    opt_bb = Block()
                +    for op in bb:
                +        if op.name == "add":
                +            arg0 = op.arg(0)
                +            arg1 = op.arg(1)
                +            if arg0 is arg1:
                +                # x + x turns into x << 1
                +                newop = opt_bb.lshift(arg0, 1)
                +                op.make_equal_to(newop)
                +                continue
                +        opt_bb.append(op)
                +    return opt_bb
                +
                +def test_strength_reduce():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.add(var0, var0)
                +
                +    opt_bb = strength_reduce(bb)
                +
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = lshift(optvar0, 1)"""
                +
                +

                Putting Things Together

                +

                Let's combine the passes into one single pass, so that we are going over all +the operations only exactly once, instead of having to look at every operation +once for all the different passes.

                +
                def optimize(bb: Block) -> Block:
                +    opt_bb = Block()
                +
                +    for op in bb:
                +        if op.name == "add":
                +            arg0 = op.arg(0)
                +            arg1 = op.arg(1)
                +
                +            # constant folding
                +            if isinstance(arg0, Constant) and \
                +                    isinstance(arg1, Constant):
                +                value = arg0.value + arg1.value
                +                op.make_equal_to(Constant(value))
                +                continue
                +
                +            # cse
                +            prev_op = find_prev_add_op(
                +                arg0, arg1, opt_bb)
                +            if prev_op is not None:
                +                op.make_equal_to(prev_op)
                +                continue
                +
                +            # strength reduce:
                +            # x + x turns into x << 1
                +            if arg0 is arg1:
                +                newop = opt_bb.lshift(arg0, 1)
                +                op.make_equal_to(newop)
                +                continue
                +
                +            # and while we are at it, let's do some
                +            # arithmetic simplification:
                +            # a + 0 => a
                +            if eq_value(arg0, Constant(0)):
                +                op.make_equal_to(arg1)
                +                continue
                +            if eq_value(arg1, Constant(0)):
                +                op.make_equal_to(arg0)
                +                continue
                +        opt_bb.append(op)
                +    return opt_bb
                +
                +
                +def test_single_pass():
                +    bb = Block()
                +    # constant folding
                +    var0 = bb.getarg(0)
                +    var1 = bb.add(5, 4)
                +    var2 = bb.add(var1, 10)
                +    var3 = bb.add(var2, var0)
                +
                +    opt_bb = optimize(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = add(19, optvar0)"""
                +
                +    # cse + strength reduction
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.getarg(1)
                +    var2 = bb.add(var0, var1)
                +    var3 = bb.add(var0, var1) # the same as var2
                +    var4 = bb.add(var2, 2)
                +    var5 = bb.add(var3, 2) # the same as var4
                +    var6 = bb.add(var4, var5)
                +
                +    opt_bb = optimize(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = getarg(1)
                +optvar2 = add(optvar0, optvar1)
                +optvar3 = add(optvar2, 2)
                +optvar4 = lshift(optvar3, 1)"""
                +
                +    # removing + 0
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.add(16, -16)
                +    var2 = bb.add(var0, var1)
                +    var3 = bb.add(0, var2)
                +    var4 = bb.add(var2, var3)
                +
                +    opt_bb = optimize(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = lshift(optvar0, 1)"""
                +
                +

                Conclusion

                +

                That's it for now. Why is this architecture cool? From a software engineering +point of view, sticking everything into a single function like in optimize +above is obviously not great, and if you wanted to do this for real you would +try to split the cases into different functions that are individually +digestible, or even use a DSL that makes the pattern matching much more +readable. But the advantage of the architecture is that it's quite efficient, +it makes it possible to pack a lot of good optimizations into a single pass +over a basic block.

                +

                Of course this works even better if you are in a tracing context, where +everything is put into a trace, which is basically one incredibly long basic +block. In a JIT context it's also quite important that the +optimizer itself runs quickly.

                +

                Various other optimizations are possible in this model. There is a +follow-up post that shows how to implement what is arguably PyPy's most +important optimization.

                +

                Some Further Pointers

                +

                This post is only a short introduction and is taking some shortcuts, so I wanted to +also give some (non-exhaustive) pointers to more general literature about the +topics it touches on.

                +

                The approach to CSE described here can be seen as value +numbering; in practice it's normally implemented with a hashmap though. Here's a +paper that describes various styles of implementing that, even beyond a +single basic block. The paper also partly takes the perspective of discovering +equivalence classes of operations that compute the same result.

                +

                A technique that leans even more fully into finding equivalences between +operations is using e-graphs and then applying equality saturation (this is +significantly more advanced than what I described here though). A cool modern +project that applies this technique is egg.

                +

                If you squint a bit, you can generally view a constant folding pass as a very +simple form of Partial Evaluation: every operation that has constant +arguments is constant-folded away, and the remaining ones are "residualized", +i.e. put into the output program. This point of view is not super important for +the current post, but will become important in the next one.

                +

                Acknowledgements: Thanks to Thorsten Ball for getting me to write +this and for his enthusiastic feedback. I also got great feedback from Max +Bernstein, Matti Picus and Per Vognsen. A conversation with Peng Wu that +we had many many years ago and that stuck with me made me keep thinking about +various ways to view compiler optimizations.

                +
                +
                +

                How is PyPy Tested?

                + +
                +

                How is PyPy Tested?

                +

                In this post I want to give an overview of how the PyPy project does and thinks +about testing. PyPy takes testing quite seriously and has done so from the +start of the project. Here I want to present the different styles of +tests that PyPy has, when we use them and how I think about them.

                +

                Background

                +

                To make the blog post self-contained, I am going to start with a small overview +about PyPy's architecture. If you already know what PyPy is and how it works, +you can skip this section.

                +

                PyPy means "Python in Python". It is an alternative implementation of the Python +language. Usually, when we speak of "Python", we can mean two different things. +On the one hand it means "Python as an abstract programming language". On the +other hand, the main implementation of that language is also often called +"Python". To more clearly distinguish the two, the implementation is often also +called "CPython", because it is an interpreter implemented in C code.

                +

                Now we can make the statement "PyPy is Python in Python" more precise: PyPy is +an interpreter for Python 3.9, implemented in RPython. RPython ("Restricted +Python") is a subset of Python 2, which is statically typed (using type +inference, not type annotations) and can be compiled +to C code. That means we can take our Python 3.9 interpreter, and compile it +into a C binary that can run Python 3.9 code. The final binary behaves pretty +similarly to CPython.

                +

                The main thing that makes PyPy interesting is that during the translation of our +interpreter to C, a number of components are automatically inserted into the +final binary. One component is a reasonably good garbage collector.

                +

                The more exciting component that is inserted into the binary is a just-in-time +compiler. The insertion of this component is not fully automatic, instead it is +guided by a small number of annotations in the source code of the interpreter. +The effect of inserting this JIT compiler into the binary is that the resulting +binary can run Python code significantly faster than CPython, in many cases. +How this works is not important for the rest of the post, if you want to see an +example of concretely doing that to a small interpreter you can look at this +video.

                +

                PyPy Testing History

                +

                A few historical notes on the PyPy project and its relationship to testing: The +PyPy project was started in 2004. At the time when the project was started, +Extreme Programming and Agile Software Development were up and coming. On the +methodology side, PyPy was heavily influenced by these, and started using +Test-Driven Development and pair programming right from the start.

                +

                Also technologically, PyPy has been influential on testing in the Python world. +Originally, PyPy had used the unittest testing framework, but pretty soon +the developers got frustrated with it. Holger Krekel, one of the original +developers who started PyPy, started the pytest testing framework soon +afterwards.

                +

                Interpreter-Level Tests

                +

                So, how are tests for PyPy written, concretely? The tests for the interpreter +are split into two different kinds, which we call "interpreter level tests" and +"application level tests". The former are tests that can be used to test the +objects and functions that are used in the implementation of the Python +interpreter. Since the interpreter is written in Python 2, those tests are also +written in Python 2, using pytest. They tend to be more on the unit test side of +things. They are in files with the pattern test_*.py.

                +

                Here is an example that tests the implementation of integers (very slightly +simplified):

                +
                class TestW_IntObject:
                +    ...
                +
                +    def test_hash(self):
                +        w_x = W_IntObject(42)
                +        w_result = w_x.descr_hash(self.space)
                +        assert isinstance(w_result, W_IntObject)
                +        assert w_result.intval == 42
                +
                +

                This test checks that if you take an object that represents integers in the +Python language (using the class W_IntObject, a "wrapped integer object") +with the value 42, computing the hash of that object returns another instance of +the same class, also with the value 42.

                +

                These tests can be run on top of any Python 2 implementation, either CPython or +PyPy. We can then test and debug the internals of the PyPy interpreter using +familiar tools like indeed pytest and the Python debuggers. They can be run, +because all the involved code like the tests and the class W_IntObject are +just completely regular Python 2 classes that behave in the regular way when +run on top of a Python interpreter.

                +

                In CPython, these tests don't really have an equivalent. They would correspond +to tests that are written in C and that can test the logic of all the C +functions of CPython that execute certain functionality, accessing the internals +of C structs in the process. ¹

                +

                Application-Level Tests

                +

                There is also a second class of tests for the interpreter. Those are tests that +don't run on the level of the implementation. Instead, they are executed by +the PyPy Python interpreter, thus running on the level of the applications run +by PyPy. Since the interpreter is running Python 3, the tests are also written +in Python 3. They are stored in files with the pattern apptest_*.py and +look like "regular" Python 3 tests. ²

                +

                Here's an example of how you could write a test equivalent to the one above:

                +
                def test_hash():
                +    assert hash(42) == 42
                +
                +

                This style of test looks more "natural" and is the preferred one in cases where +the test does not need to access the internals of the logic or the objects of +the interpreter.

                +

                Application level tests can be run in two different ways. On the one hand, we +can simply run them on CPython 3. This is very useful! Since we want PyPy to +behave like CPython, running the tests that we write on CPython is useful to +make sure that the tests themselves aren't wrong.

                +

                On the other hand, the main way to run these tests is on top of PyPy, itself +running on top of a Python 2 implementation. This makes it possible to run the +test without first bootstrapping PyPy to C. Since bootstrapping to C is a +relatively slow operation (can take up to an hour) it is crucially important to +be able to run tests without bootstrapping first. It also again makes it +possible to debug crashes in the interpreter using the regular Python 2 +debugger. Of course running tests in this way is unfortunately itself not super +fast, given that they run on a stack of two different interpreters.

                +

                Application-level tests correspond quite closely to CPython's test suite (which +is using the unittest framework). Of course in CPython it is not possible to run +the test suite without building the CPython binary using a C compiler. ³

                +

                So when do we write application-level tests, and when interpreter-level tests? +Interpreter-level tests are necessary to test internal data structures that +touch data and logic that is not directly exposed to the Python language. If +that is not necessary, we try to write application-level tests. App-level tests +are however by their nature always more on the integration test side of things. +To be able to run the test_hash function above, many parts of PyPy need to +work correctly, the parser, the bytecode compiler, the bytecode interpreter, the +hash builtin, calling the __hash__ special method, etc, etc.

                +

                This observation is also true for CPython! One could argue that CPython has no +unit tests at all, because in order to be able to even run the tests, most of +Python needs to be in working order already, so all the tests are really +implicitly integration tests.

                +

                The CPython Test Suite

                +

                We also use the CPython Test suite as a final check to see whether our +interpreter correctly implements all the features of the Python language. In +that sense it acts as some kind of compliance test suite that checks whether we +implement the language correctly. The test suite is not perfect for this. +Since it is written for CPython's purposes during its development, a +lot of the tests check really specific CPython implementation details. Examples +for these are tests that check that __del__ is called immediately after +objects go out of scope (which only happens if you use reference counting as a +garbage collection strategy, PyPy uses a different approach to garbage +collection). Other examples are checking +for exception error messages very explicitly. However, the CPython test suite +has gotten a lot better in these regards over time, by adding +support.gc_collect() calls to fix the former problem, and by marking some +very specific tests with the @impl_detail decorator. Thanks to all the +CPython developers who have worked on this!

                +

                In the process of re-implementing CPython's functionality and running CPython's +test suite, PyPy can often also be a good way to find bugs in CPython. While we +think about the corner cases of some Python feature we occasionally find +situations where CPython didn't get everything completely correct either, which +we then report back.

                +

                Testing for Performance Regressions

                +

                All the tests we described so far are checking behaviour. But one of PyPy's +important goals is to be a fast implementation not "just" a correct one. Some +aspects of performance can be tested by regular unit tests, either application- +or interpreter-level. In order to check whether some performance shortcut is +taken in the interpreter, we sometimes can write tests that monkeypatch the slow +default implementation to always error. Then, if the fast path is taken +properly, that slow default implementation is never reached.

                +

                But we also have additional tests that test the correct interaction with the JIT +explicitly. For that, we have a special style of test that checks that the JIT +will produce the correct machine code for a small snippet of Python code. To +make this kind of test somewhat more robust, we don't check the machine code +directly, but instead the architecture independent intermediate +representation that the JIT uses to produce machine code from.

                +

                As an example, here is a small test that loading the attribute of a constant +global instance can be completely constant folded away:

                +
                def test_load_attr(self):
                +    src = '''
                +        class A(object):
                +            pass
                +        a = A()
                +        a.x = 1
                +        def main(n):
                +            i = 0
                +            while i < n:
                +                i = i + a.x
                +            return i
                +    '''
                +    log = self.run(src, [1000])
                +    assert log.result == 1000
                +    loop, = log.loops_by_filename(self.filepath)
                +    assert loop.match("""
                +        i9 = int_lt(i5, i6)
                +        guard_true(i9, descr=...)
                +        guard_not_invalidated(descr=...)
                +        i10 = int_add(i5, 1)
                +        --TICK--
                +        jump(..., descr=...)
                +    """)
                +
                +

                The string passed to the loop.match function is a string representation of +the intermediate representation code that is generated for the while loop in +the main function given in the source. The important part of that +intermediate representation is that the i = i + a.x addition is optimized +into an int_add(x, 1) operation. The second argument for the addition is the +constant 1, because the JIT noted that the global a is a constant, and +the attribute x of that instance is always 1. The test thus checks that +this optimization still works.

                +

                Those tests are again more on the unit test side of things (and can thus +unfortunately be a bit brittle sometimes and break). The integration test +equivalent for performance is the PyPy Speed Center which tracks the +performance of micro- and macro-benchmarks over time and lets us see when big +performance regressions are happening. The speed center is not really an +automatic test and does not produce pass/fail outcomes. Instead, it requires +human judgement and intervention in order to interpret the performance changes. +Having a real pass/fail mechanism is something that would be great to have +but is probably quite tricky in practice.

                +

                Conclusion

                +

                This concludes my overview of some of the different styles of tests that we use +to develop the PyPy Python interpreter.

                +

                There is a whole other set of tests for the development of the RPython language, +the garbage collectors it provides as well as the code that does the automatic +JIT insertion, maybe I'll cover these in a future post.

                +

                Footnotes

                +

                ¹ CPython has the _testcapimodule.c and related modules, that are used to +unit-test the C-API. However, these are still driven from Python tests using +the unittest framework and wouldn't run without the Python interpreter +already working.

                +

                ² There is also a deprecated different way to write these tests, by putting +them in the test_*.py files that interpreter level tests are using and +then having a test class with the pattern class AppTest*. We haven't +converted all of them to the new style yet, even though the old style is +quite weird: since the test_*.py files are themselves parsed by +Python 2, the test methods in AppTest* classes need to be written in the +subset of Python 3 syntax that is also valid Python 2 syntax, leading to a lot +of confusion.

                +

                ³ Nit-picky side-note: C interpreters are a thing! But not that +widely used in practice, or only in very specific situations.

                +
                +
                +

                PyPy v7.3.9 security release

                + +
                +

                PyPy v7.3.9 security release

                +

                The PyPy team is proud to release version 7.3.9 of PyPy. This is a security +release to match the recent CPython release and updates the portable pypy +tarballs with bzip2 1.0.8, openssl 1.1.1n, and libexpat 2.4.7. Along +the way this release fixes some issues discovered after the 7.3.8 release and +updates sqlite3 to 3.38.2. It includes:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.13. This will be the last +release of PyPy3.7.

                • +
                • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.13.

                • +
                • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.12. We relate to this as +"beta" quality. We welcome testing of this version, if you discover +incompatibilities, please report them so we can gain confidence in the version.

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.8 in February 2022, +include:

                +
                +
                  +
                • Fixed some failing stdlib tests on PyPy3.9

                • +
                • Update the bundled libexpat to 2.4.6 and sqlite3 to 3.38.2

                • +
                +
                +

                We recommend updating. You can find links to download the v7.3.9 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better. Since the +7.3.7 release, we have accepted contributions from 6 new contributors, +thanks for pitching in, and welcome to the project!

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and +3.9. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                This PyPy release supports:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)

                • +
                • 64-bit ARM machines running Linux. A shoutout to Huawei for sponsoring +the VM running the tests.

                • +
                • s390x running Linux

                • +
                • big- and little-endian variants of PPC64 running Linux

                • +
                +
                +

                PyPy supports Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but +does not release binaries. Please reach out to us if you wish to sponsor +releases for those platforms.

                +

                Known Issues with PyPy3.9

                +
                  +
                • We slightly modified the concurrent.futures ProcessPoolExecutor to +start all the worker processes when the first task is received (like on +Python3.8) to avoid an apparent race condition when using fork and +threads (issue 3650).

                • +

                What else is new?

                +

                For more information about the 7.3.9 release, see the full changelog.

                +

                Please update, and continue to help us make PyPy better.

                +

                Cheers, +The PyPy team

                +
                +
                +

                PyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9

                + +
                +

                PyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9-beta

                +

                The PyPy team is proud to release version 7.3.8 of PyPy. It has been only a few +months since our last release, but we have some nice speedups and bugfixes we +wish to share. The release includes four different interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.12. This will be the last +release of PyPy3.7.

                • +
                • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.12. This is our third +release of this interpreter, and we are removing the "beta" tag.

                • +
                • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.10. As this is our first +release of this interpreter, we relate to this as "beta" quality. We +welcome testing of this version, if you discover incompatibilities, please +report them so we can gain confidence in the version.

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.7 in late October 2021, +include:

                +
                +
                  +
                • PyPy3.9 uses an RPython version of the PEG parser which brought with it a +cleanup of the lexer and parser in general

                • +
                • Fixed a regression in PyPy3.8 when JITting empty list comprehensions

                • +
                • Tweaked some issues around changing the file layout after packaging to make +the on-disk layout of PyPy3.8 more compatible with CPython. This requires +setuptools>=58.1.0

                • +
                • RPython now allows the target executable to have a . in its name, so +PyPy3.9 will produce a pypy3.9-c and libpypy3.9-c.so. Changing the +name of the shared object to be version-specific (it used to be +libpypy3-c.so) will allow it to live alongside other versions.

                • +
                • Building PyPy3.9+ accepts a --platlibdir argument like CPython.

                • +
                • Improvement in ssl's use of CFFI buffers to speed up recv and recvinto

                • +
                • Update the packaged OpenSSL to 1.1.1m

                • +
                +
                +

                We recommend updating. You can find links to download the v7.3.8 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better. Since the +previous release, we have accepted contributions from 6 new contributors, +thanks for pitching in, and welcome to the project!

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and +3.9. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                This PyPy release supports:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)

                • +
                • 64-bit ARM machines running Linux. A shoutout to Huawei for sponsoring +the VM running the tests.

                • +
                • s390x running Linux

                • +
                • big- and little-endian variants of PPC64 running Linux

                • +
                +
                +

                PyPy supports Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but +does not release binaries. Please reach out to us if you wish to sponsor +releases for those platforms.

                +

                Known Issues with PyPy3.9

                +
                  +
                • There is still a known speed regression around **kwargs handling

                • +
                • We slightly modified the concurrent.futures ProcessPoolExecutor to +start all the worker processes when the first task is received (like on +Python3.8) to avoid an apparent race condition when using fork and +threads (issue 3650).

                • +

                What else is new?

                +

                For more information about the 7.3.8 release, see the full changelog.

                +

                Please update, and continue to help us make PyPy better.

                +

                Cheers, +The PyPy team

                +
                +
                +

                Natural Language Processing for Icelandic with PyPy: A Case Study

                + +
                +

                Natural Language Processing for Icelandic with PyPy: A Case Study

                +

                Icelandic is one +of the smallest languages of the world, with about 370,000 speakers. It +is a language in the Germanic family, most similar to Norwegian, Danish +and Swedish, but closer to the original Old +Norse spoken throughout +Scandinavia until about the 14th century CE.

                +

                As with other small languages, there are worries that the language may +not +survive +in a digital world, where all kinds of fancy applications are developed +first - and perhaps only - for the major languages. Voice assistants, +chatbots, spelling and grammar checking utilities, machine translation, +etc., are increasingly becoming staples of our personal and professional +lives, but if they don’t exist for Icelandic, Icelanders will gravitate +towards English or other languages where such tools are readily +available.

                +

                Iceland is a technology-savvy country, with world-leading adoption +rates of the +Internet, +PCs and smart devices, and a thriving software industry. So the +government figured that it would be worthwhile to fund a 5-year +plan to build natural +language processing (NLP) resources and other infrastructure for the +Icelandic language. The project focuses on collecting data and +developing open source software for a range of core applications, such +as tokenization, vocabulary lookup, n-gram statistics, part-of-speech +tagging, named entity recognition, spelling and grammar checking, neural +language models and speech processing.

                +
                +

                My name is Vilhjálmur Þorsteinsson, and I’m the founder and CEO of a +software startup Miðeind in Reykjavík, +Iceland, that employs 10 software engineers and linguists and focuses on +NLP and AI for the Icelandic language. The company participates in the +government’s language technology program, and has contributed +significantly to the program’s core tools (e.g., a tokenizer and a +parser), spelling and grammar checking modules, and a neural machine +translation stack.

                +

                When it came to a choice of programming languages and development tools +for the government program, the requirements were for a major, well +supported, vendor-and-OS-agnostic FOSS platform with a large and diverse +community, including in the NLP space. The decision to select Python as +a foundational language for the project was a relatively easy one. That +said, there was a bit of trepidation around the well known fact that +CPython can be slow for inner-core tasks, such as tokenization and +parsing, that can see heavy workloads in production.

                +

                I first became aware of PyPy in early 2016 when I was developing a +crossword game Netskrafl in Python 2.7 +for Google App Engine. I had a utility program that compressed a +dictionary into a Directed Acyclic Word Graph and was taking 160 +seconds  to run on CPython 2.7, so I tried PyPy and to my amazement saw +a 4x speedup (down to 38 seconds), with literally no effort besides +downloading the PyPy runtime.

                +

                This led me to select PyPy as the default Python interpreter for my +company’s Python development efforts as well as for our production +websites and API servers, a role in which it remains to this day. We +have followed PyPy’s upgrades along the way, being just about to migrate +our minimally required language version from 3.6 to 3.7.

                +

                In NLP, speed and memory requirements can be quite important for +software usability. On the other hand, NLP logic and algorithms are +often complex and challenging to program, so programmer productivity and +code clarity are also critical success factors. A pragmatic approach +balances these factors, avoids premature optimization and seeks a +careful compromise between maximal run-time efficiency and minimal +programming and maintenance effort.

                +

                Turning to our use cases, our Icelandic text +tokenizer "Tokenizer" is fairly light, +runs tight loops and performs a large number of small, repetitive +operations. It runs very well on PyPy’s JIT and has not required further +optimization.

                +

                Our Icelandic parser Greynir +(known on PyPI as reynir) is, +if I may say so myself, a piece of work. It parses natural language +text according to a +hand-written context-free +grammar, +using an Earley-type +algorithm as enhanced +by Scott and +Johnstone. +The CFG contains almost 7,000 nonterminals and 6,000 terminals, and the +parser handles ambiguity as well as left, right and middle recursion. It +returns a packed parse forest for each input sentence, which is then +pruned by a scoring heuristic down to a single best result tree.

                +

                This parser was originally coded in pure Python and turned out to be +unusably slow when run on CPython - but usable on PyPy, where it was +3-4x faster. However, when we started applying it to heavier production +workloads, it  became apparent that it needed to be faster still. We +then proceeded to convert the innermost Earley parsing loop from Python +to tight +C++ +and to call it from PyPy via +CFFI, with callbacks for +token-terminal matching functions (“business logic”) that remained on +the Python side. This made the parser much faster (on the order of 100x +faster than the original on CPython) and quick enough for our production +use cases. Even after moving much of the heavy processing to C++ and using CFFI, PyPy still gives a significant speed boost over CPython.

                +

                Connecting C++ code with PyPy proved to be quite painless using CFFI, +although we had to figure out a few magic incantations in our build +module +to make it compile smoothly during setup from source on Windows and +MacOS in addition to Linux. Of course, we build binary PyPy and CPython +wheels for the most common targets so most users don’t have to worry +about setup requirements.

                +

                With the positive experience from the parser project, we proceeded to +take a similar approach for two other core NLP packages: our compressed +vocabulary package BinPackage +(known on PyPI as islenska) and our +trigrams database package Icegrams. +These packages both take large text input (3.1 million word forms with +inflection data in the vocabulary case; 100 million tokens in the +trigrams case) and compress it into packed binary structures. These +structures are then memory-mapped at run-time using +mmap and queried via +Python functions with a lookup time in the microseconds range. The +low-level data structure navigation is done in +C++, +called from Python via CFFI. The ex-ante preparation, packing, +bit-fiddling and data structure generation is fast enough with PyPy, so +we haven’t seen a need to optimize that part further.

                +

                To showcase our tools, we host public (and open source) websites such as +greynir.is for our parsing, named entity +recognition and query stack and +yfirlestur.is for our spell and grammar +checking stack. The server code on these sites is all Python running on +PyPy using Flask, +wrapped in gunicorn and hosted on +nginx. The underlying database is +PostgreSQL accessed via +SQLAlchemy and +psycopg2cffi. This setup +has served us well for 6 years and counting, being fast, reliable and +having helpful and supporting communities.

                +

                As can be inferred from the above, we are avid fans of PyPy and +commensurately thankful for the great work by the PyPy team over the +years. PyPy has enabled us to use Python for a larger part of our +toolset than CPython alone would have supported, and its smooth +integration with C/C++ through CFFI has helped us attain a better +tradeoff between performance and programmer productivity in our +projects. We wish for PyPy a great and bright future and also look +forward to exciting related developments on the horizon, such as +HPy.

                +
                +
                +

                Error Message Style Guides of Various Languages

                + +
                +

                Error Message Style Guides of Various Languages

                +

                PyPy has been trying to produce good SyntaxErrors and other errors for +a long time. CPython has also made an enormous push to improve its +SyntaxErrors in the last few releases. These improvements are great, but the process +feels somewhat arbitrary sometimes. To see what other languages are doing, I +asked people on Twitter whether they know of error message style guides for +other programming languages.

                +

                Wonderfully, people answered me with lots of helpful links (full list at the +end of the post), thank you everybody! All those sources are very interesting +and contain many great points, I recommend reading them directly! In this +post, I'll try to summarize some common themes or topics that I thought were +particularly interesting.

                +

                Language Use

                +

                Almost all guides stress the need for plain and simple English, as well as +conciseness and clarity [Flix, Racket, Rust, Flow]. Flow suggests to put coding +effort into making the grammar correct, for example in the case of plurals or +to distinguish between "a" and "an".

                +

                The suggested tone should be friendly and neutral, the messages should not +blame the Programmer [Flow]. Rust and Flix suggest to not use the term +'illegal' and use something like 'invalid' instead.

                +

                Flow suggests to avoid "compiler speak". For example terms like 'token' and +'identifier' should be avoided and terms that are more familiar to programmers +be used (eg "name" is better). The Racket guide goes further and has a list of +allowed technical terms and some prohibited terms.

                +

                Structure

                +

                Several guides (such as Flix and Flow) point out an 80/20 rule: 80% of the time an error message is +read, the developer knows that message well and knows exactly what to do. For +this use case it's important that the message is short. On the other hand, 20% +of the time this same message will have to be understood by a developer who +has never seen it before and is confused, and so the message needs to contain +enough information +to allow them to find out what is going on. So the error message needs to strike +a balance between brevity and clarity.

                +

                The Racket guide proposes to use the following general structure for errors: +'State the constraint that was violated ("expected a"), followed by what was +found instead.'

                +

                The Rust guide says to avoid "Did you mean?" and questions in general, and +wants the compiler to instead be explicit about why something was suggested. The +example the Rust guide gives is: 'Compare "did you mean: Foo" vs. "there is a +struct with a similar name: Foo".' Racket goes further and forbids +suggestions altogether because "Students will follow well‐meaning‐but‐wrong +advice uncritically, if only because they have no reason to doubt the +authoritative voice of the tool."

                +

                Formatting and Source Positions

                +

                The Rust guide suggests to put all identifiers into backticks (like in +Markdown), Flow formats the error messages using full Markdown.

                +

                The Clang, Flow and Rust guides point out the importance of using precise +source code spans to point to errors, which is especially important if the +compiler information is used in the context of an IDE to show a red squiggly +underline or some other highlighting. The spans should be as small as possible to point out the source of +the error [Flow].

                +

                Conclusion

                +

                I am quite impressed how advanced and well thought out the approaches are. I wonder whether it would make sense for +Python to adopt a (probably minimal, to get started) subset of these ideas as guidelines for its own errors.

                +

                Sources

                +
                +
                +

                PyPy v7.3.7: bugfix release of python 3.7 and 3.8

                + +
                +

                PyPy v7.3.7: bug-fix release of 3.7, 3.8

                +

                We are releasing a PyPy 7.3.7 to fix the recent 7.3.6 release's binary +incompatibility with the previous 7.3.x releases. We mistakenly added fields +to PyFrameObject and PyDateTime_CAPI that broke the promise of binary +compatibility, which means that c-extension wheels compiled for 7.3.5 will not +work with 7.3.6 and vice versa. Please do not use 7.3.6.

                +

                We have added a cursory test for binary API breakage to the +https://github.com/pypy/binary-testing repo which hopefully will prevent such +mistakes in the future.

                +

                Additionally, a few smaller bugs were fixed:

                +
                  +
                • Use uint for the request argument of fcntl.ioctl (issue 3568)

                • +
                • Fix incorrect tracing of while True body in 3.8 (issue 3577)

                • +
                • Properly close resources when using a concurrent.futures.ProcessPool +(issue 3317)

                • +
                • Fix the value of LIBDIR in _sysconfigdata in 3.8 (issue 3582)

                • +
                +

                You can find links to download the v7.3.7 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog site via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better.

                +

                If you are a python library maintainer and use C-extensions, please consider +making a CFFI / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +3.8. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                This PyPy release supports:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)

                • +
                • 64-bit ARM machines running Linux.

                • +
                • s390x running Linux

                • +
                +
                +

                PyPy does support ARM 32 bit and PPC64 processors, but does not release binaries.

                +
                +
                +

                PyPy v7.3.6: release of python 2.7, 3.7, and 3.8

                + +
                +

                PyPy v7.3.6: release of python 2.7, 3.7, and 3.8-beta

                +

                The PyPy team is proud to release version 7.3.6 of PyPy, which includes +three different interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.12.

                • +
                • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.12. Since this is our +first release of the interpreter, we relate to this as "beta" quality. We +welcome testing of this version, if you discover incompatibilities, please +report them so we can gain confidence in the version.

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.5 in May 2021, +include:

                +
                +
                  +
                • We have merged a backend for HPy, the better C-API interface. The backend +implements HPy version 0.0.3.

                • +
                • Translation of PyPy into a binary, known to be slow, is now about 40% +faster. On a modern machine, PyPy3.8 can translate in about 20 minutes.

                • +
                • PyPy Windows 64 is now available on conda-forge, along with nearly 700 +commonly used binary packages. This new offering joins the more than 1000 +conda packages for PyPy on Linux and macOS. Many thanks to the conda-forge +maintainers for pushing this forward over the past 18 months.

                • +
                • Speed improvements were made to io, sum, _ssl and more. These +were done in response to user feedback.

                • +
                • The 3.8 version of the release contains a beta-quality improvement to the +JIT to better support compiling huge Python functions by breaking them +up into smaller pieces.

                • +
                • The release of Python3.8 required a concerted effort. We were greatly +helped by @isidentical (Batuhan Taskaya) and other new contributors.

                • +
                • The 3.8 package now uses the same layout as CPython, and many of the +PyPy-specific changes to sysconfig, distutils.sysconfig, and +distutils.commands.install.py have been removed. The stdlib now +is located in <base>/lib/pypy3.8 on posix systems, and in +<base>/Lib on Windows. The include files on windows remain the same. +On posix they are in <base>/include/pypy3.8. Note we still use the +pypy prefix to prevent mixing the files with CPython (which uses +python).

                • +
                +
                +

                We recommend updating. You can find links to download the v7.3.6 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better. Since the +previous release, we have accepted contributions from 7 new contributors, +thanks for pitching in, and welcome to the project!

                +

                If you are a python library maintainer and use C-extensions, please consider +making a CFFI / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +soon 3.8. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                This PyPy release supports:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)

                • +
                • big- and little-endian variants of PPC64 running Linux,

                • +
                • s390x running Linux

                • +
                • 64-bit ARM machines running Linux.

                • +
                +
                +

                PyPy does support Windows 32-bit and ARM 32 bit processors, but does not +release binaries. Please reach out to us if you wish to sponsor releases for +those platforms.

                +

                What else is new?

                +

                For more information about the 7.3.6 release, see the full changelog.

                +

                Please update, and continue to help us make PyPy better.

                +

                Cheers, +The PyPy team

                +
                +
                +

                Better JIT Support for Auto-Generated Python Code

                + +
                +

                Performance Cliffs

                +

                A common bad property of many different JIT compilers is that of a "performance +cliff": A seemingly reasonable code change, leading to massively reduced +performance due to hitting some weird property of the JIT compiler that's not +easy to understand for the programmer (e.g. here's a blog post about the fix of +a performance cliff when running React on +V8). Hitting a performance cliff as a +programmer can be intensely frustrating and turn people off from using PyPy +altogether. Recently we've been working on trying to remove some of PyPy's +performance cliffs, and this post describes one such effort.

                +

                The problem showed up in an issue +where somebody found the performance +of their website using Tornado a lot +worse than what various benchmarks suggested. It took some careful digging to +figure out what caused the problem: The slow performance was caused by the huge +functions that the Tornado templating engine creates. These functions lead the +JIT to behave in unproductive ways. In this blog post I'll describe why the +problem occurs and how we fixed it.

                +

                Problem

                +

                After quite a bit of debugging we narrowed down the problem to the following +reproducer: If you render a big HTML template +(example) +using the Tornado templating engine, the template rendering is really not any +faster than CPython. A small template doesn't show this behavior, and other +parts of Tornado seem to perform well. So we looked into how the templating +engine works, and it turns out that the templates are compiled into Python +functions. This means that a big template can turn into a really enormous Python +function (Python version of the +example). +For some reason really enormous Python functions aren't handled particularly +well by the JIT, and in the next section I'll explain some of the background that's +necessary to understand why this happens.

                +

                Trace Limits and Inlining

                +

                To understand why the problem occurs, it's necessary to understand how PyPy's +trace limit and inlining work. The tracing JIT has a maximum trace length built +in, the reason for that is some limitation in the compact encoding of traces in +the JIT. Another reason is that we don't want to generate arbitrarily large chunks +of machine code. Usually, when we hit the trace limit, it is due to inlining. +While tracing, the JIT will inline many of the functions called from the +outermost one. This is usually good and improves performance greatly, however, +inlining can also lead to the trace being too long. If that happens, we +will mark a called function as uninlinable. The next time we trace the outer +function we won't inline it, leading to a shorter trace, which hopefully fits +the trace limit.

                +

                Diagram illustrating the interaction of the trace limit and inlining

                +

                In the diagram above we trace a function f, which calls a function g, which +is inlined into the trace. The trace ends up being too long, so the JIT +disables inlining of g. The next time we try to trace f the trace will +contain a call to g instead of inlining it. The trace ends up being not too +long, so we can turn it into machine code when tracing finishes.

                +

                Now we know enough to understand what the problem with automatically generated +code is: sometimes, the outermost function itself +doesn't fit the trace limit, without any inlining going on at all. This is +usually not the case for normal, hand-written Python functions. However, it can +happen for automatically generated Python code, such as the code that the +Tornado templating engine produces.

                +

                So, what happens when the JIT hits such a huge function? The function is traced +until the trace is too long. Then the trace limit stops further tracing. Since +nothing was inlined, we cannot make the trace shorter the next time by disabling +inlining. Therefore, this happens again and again, the next time we trace the +function we run into exactly the same problem. The net effect is that the +function is even slowed down: we spend time tracing it, then stop tracing and +throw the trace away. Therefore, that effort is never useful, so the resulting +execution can be slower than not using the JIT at all!

                +

                Solution

                +

                To get out of the endless cycle of useless retracing we first had the idea of +simply disabling all code generation for such huge functions, that produce too long +traces even if there is no inlining at all. However, that led to disappointing +performance in the example Tornado program, because important parts of the code +remain always interpreted.

                +

                Instead, our solution is now as follows: After we have hit the trace limit and +no inlining has happened so far, we mark the outermost function as a source of huge +traces. The next time we trace such a function, we do so in a special mode. In +that mode, hitting the trace limit behaves differently: Instead of stopping the +tracer and throwing away the trace produced so far, we will use the unfinished +trace to produce machine code. This trace corresponds to the first part of the +function, but stops at a basically arbitrary point in the middle of the +function.

                +

                The question is what should happen when execution +reaches the end of this unfinished trace. We want to be able to cover more of +the function with machine code and therefore need to extend the trace +from that point on. But we don't want to do that too +eagerly to prevent lots and lots of machine code being generated. To achieve +this behaviour we add a guard to the end of the unfinished trace, which will +always fail. This has the right behaviour: a failing guard will transfer control +to the interpreter, but if it fails often enough, we can patch it to jump to +more machine code, that starts from this position. In that way, we can slowly +explore the full gigantic function and add all those parts of the control flow +graph that are actually commonly executed at runtime.

                +

                Diagram showing what happens in the new jit when tracing a huge function

                +

                In the diagram we are trying to trace a huge function f, which leads to +hitting the trace limit. However, nothing was inlined into the trace, so +disabling inlining won't ensure a successful trace attempt the next time. +Instead, we mark f as "huge". This has the effect that when we trace it again +and are about to hit the trace limit, we end the trace at an arbitrary point by +inserting a guard that always fails.

                +

                Diagram showing what happens in the new jit when tracing a huge function until completion

                +

                If this guard failure is executed often enough, we might patch the guard and +add a jump to a further part of the function f. This can continue potentially +several times, until the trace really hits an end point (for example by +closing the loop and jumping back to trace 1, or by returning from f).

                +

                Evaluation

                +

                Since this is a performance cliff that we didn't observe in any of our +benchmarks ourselves, it's pointless to look at the +effect that this improvement has on existing benchmarks – there shouldn't and +indeed there isn't any.

                +

                Instead, we are going to look at a micro-benchmark that came out of the +original bug report, one that simply renders a big artificial Tornado template +200 times. The code of the micro-benchmark can be found +here.

                +

                All benchmarks were run 10 times in new processes. The means and standard +deviations of the benchmark runs are:

                + + + + + + + + + + + + + + + + + + + + + + + +
                Implementation | Time taken (lower is better)
                CPython 3.9.5 | 14.19 ± 0.35s
                PyPy3 without JIT | 59.48 ± 5.41s
                PyPy3 JIT old | 14.47 ± 0.35s
                PyPy3 JIT new | 4.89 ± 0.10s
                +

                What we can see is that while the old JIT is very helpful for this +micro-benchmark, it only brings the performance up to CPython levels, not +providing any extra benefit. The new JIT gives an almost 3x speedup.

                +

                Another interesting number we can look at is how often the JIT started a trace, +and for how many traces we produced actual machine code:

                + + + + + + + + + + + + + + + + + + + + + +
                Implementation | Traces Started | Traces sent to backend | Time spent in JIT
                PyPy3 JIT old | 216 | 24 | 0.65s
                PyPy3 JIT new | 30 | 25 | 0.06s
                +

                Here we can clearly see the problem: The old JIT would try tracing the +auto-generated templating code again and again, but would never actually produce +any machine code, wasting lots of time in the process. The new JIT still traces a +few times uselessly, but then eventually converges and stops emitting machine +code for all the paths through the auto-generated Python code.

                + + + +

                Tim Felgentreff pointed me to the fact that +Truffle also has a +mechanism +to slice huge methods into smaller compilation units (and I am sure other JITs +have such mechanisms as well).

                +

                Conclusion

                +

                In this post we've described a performance cliff in PyPy's JIT, that of really +big auto-generated functions which hit the trace limit without inlining, that we +still want to generate machine code for. We achieve this by chunking up the +trace into several smaller traces, which we compile piece by piece. This is not +a super common thing to be happening – otherwise we would have run into and +fixed it earlier – but it's still good to have a fix now.

                +

                The work +described in this post is a tiny bit experimental still, but we will release it as +part of the upcoming 3.8 beta release, to get some more experience with it. +Please grab a 3.8 release +candidate, +try it out and let us know your observations, good and bad!

                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-43.html b/blog/index-43.html new file mode 100644 index 000000000..e44e5b71e --- /dev/null +++ b/blog/index-43.html @@ -0,0 +1,2354 @@ + + + + + + +PyPy (old posts, page 43) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                PyPy v7.3.12 release

                + +
                +

                PyPy v7.3.12: release of python 2.7, 3.9, and 3.10.

                +

                The PyPy team is proud to release version 7.3.12 of PyPy. +This release includes a new string-to-int algorithm (also appearing in CPython +3.12) that is faster than the older one; support for symlinks in Windows; and +our first Python3.10 version.

                +

                The release includes three different interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.17.

                • +
                • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.12. This is our first +release of 3.10, but based on past experience we are quite confident in +its compatibility with upstream. Of course, we recommend testing your code +with this new version before putting it into production. Note it does +require at least cython 0.29.35 or cython 3.0.0b3

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows the 7.3.11 release on Dec 29, 2022.

                +

                We recommend updating. You can find links to download the v7.3.12 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with making +RPython's JIT even better. Since the previous release, we have accepted +contributions from one new contributor, thanks for pitching in, and welcome +to the project!

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.9 and +3.10. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                We provide binary builds for:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                • +
                • 64-bit ARM machines running Linux (aarch64).

                • +
                • Apple M1 arm64 machines (macos_arm64).

                • +
                • s390x running Linux

                • +
                +
                +

                PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                +

                What else is new?

                +

                For more information about the 7.3.12 release, see the full changelog.

                +

                Please update, and continue to help us make pypy better.

                +

                Cheers, +The PyPy Team

                +
                +
                +

                RPython-based emulator speeds up RISC-V simulation over 15x

                + +
                +

                In cooperation with RISC-V International, who funded a part of this project, +we recently created a workflow to +use RPython to take a Sail RISC-V model and automatically create a RISC-V ISA +emulator from it, which we call Pydrofoil. The simulator sped up booting a +linux emulator from 35 minutes (using the standard Sail-generated emulator in +C) to 2 minutes, a speedup of 17.5x. More details about the process are in the +RISC-V blog post.

                +

                A few take-aways from the project:

                +
                  +
                • While PyPy has shown it can speed up generic python code about 4x, the +technology behind PyPy can really shine in other areas.

                • +
                • RPython is malleable and can be molded to many tasks, the RPython meta-JIT is +very flexible.

                • +
                • A JIT is well-suited for the problem of emulation, because it can +perform dynamic binary translation.

                • +
                +

                PyPy can solve real world performance problems, even somewhat unusual ones. +Please get in touch and let us know how we can help you solve yours!

                +
                +

                Repeated string concatenation is quadratic in PyPy (and CPython)

                + +
                +

                This is a super brief blog post responding to an issue that we got on the PyPy +issue tracker. I am moving my response to the blog (with permission of the +submitter) to have a post to point to, since it's a problem that comes up with +some regularity. It's also documented on our page of differences between PyPy +and CPython but I thought an additional blog post might be good.

                +

                The issue pointed out that a small program that operates on strings is much +slower on PyPy compared to CPython. The program is a solution for 2016's +Advent of Code Day 16 and looks like this:

                +
                def dragon(a):
                +    b = a[::-1].replace('0','r').replace('1','0').replace('r','1')
                +    return a+'0'+b
                +
                +def diffstr(a):
                +    b = ""
                +    for i in range(0,len(a),2):
                +        b += ['0','1'][a[i] == a[i+1]]
                +    return b
                +
                +def iterdiff(a):
                +    b = a
                +    while(len(b) % 2 == 0):
                +        b = diffstr(b)
                +    return b
                +
                +size = 35651584
                +initstate = '10010000000110000'
                +while(len(initstate) < size):
                +    initstate = dragon(initstate)
                +initstate = initstate[:size]
                +print(iterdiff(initstate))
                +
                +

                The submitter pointed out that the program is fast on CPython (~8s on my +laptop) and slow (didn't finish) on PyPy.

                +

                The reason for the performance difference is that += on strings in a loop +has quadratic complexity in PyPy, which is what diffstr does. To see the +quadraticness, consider that to add a character at the end of the string, the +beginning of the string needs to be copied into a new chunk of memory. If the +loop runs n times, that means there are

                +

                1 + 2 + 3 + ... + n = n * (n + 1) // 2

                +

                character copies.

                +

                Repeated string concatenations are in principle also quadratic in CPython, but +CPython has an optimization that makes them sometimes not quadratic, which is +what makes this program not too slow in CPython.

                +

                In order to fix the problem on PyPy it's best to use a list for the string +parts, which has the right amortized O(1) complexity for .append calls, and +then use str.join after the loop:

                +
                def diffstr(a):
                +    b = []
                +    for i in range(0,len(a),2):
                +        b.append(['0','1'][a[i] == a[i+1]])
                +    return "".join(b)
                +
                +

                With this change the program becomes a little bit faster on CPython for me, and +on PyPy it stops being quadratic and runs in ~3.5s.

                +

                In general, it's best not to rely on the presence of this optimization in +CPython either. Sometimes, a small innocent-looking change will break CPython's +optimization. E.g. this useless change makes CPython also take ages:

                +
                def diffstr(a):
                +    b = ""
                +    for i in range(0,len(a),2):
                +        b += ['0','1'][a[i] == a[i+1]]
                +        c = b
                +    return b
                +
                +

                The reason why this change breaks the optimization in CPython is that it only +triggers if the reference count of b is 1, in which case it uses realloc +on the string. The change is unrealistic of course, but you could imagine a +related change that keeps an extra reference to b for a sensible reason.

                +

                Another situation in which the optimization doesn't work is discussed in this +StackOverflow question with an answer by Tim Peters.

                +

                It's unlikely that PyPy will fix this. We had a prototype of how to do it, but it +seems very little "production" code uses += on strings in a loop, and the fix +makes the strings implementation quite a bit more complex.

                +

                So, in summary, don't use repeated concatenations in a loop!

                +
                +

                PyPy v7.3.11 release

                + +
                +

                PyPy v7.3.11: release of python 2.7, 3.8, and 3.9

                +

                The PyPy team is proud to release version 7.3.11 of PyPy. As could be expected, +the first release of macOS arm64 impacted the macOS x86-64 build, so this is +a bug-fix release to restore the ability of macOS users to run PyPy on +macOS < 11.0. It also incorporates the latest CPython stdlib updates +released the day after 7.3.10 went out, and a few more bug fixes. The release +includes three different interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.16. Note we intend to drop +support for this version in an upcoming release as soon as we release +Python 3.10.

                • +
                • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.16.

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases and follows quickly on the heels of the 7.3.10 release on Dec 6.

                +

                We recommend updating. You can find links to download the v7.3.11 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with making +RPython's JIT even better. Since the previous release, we have accepted +contributions from one new contributor, thanks for pitching in, and welcome +to the project!

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. +In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and +3.9. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                We provide binary builds for:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                • +
                • 64-bit ARM machines running Linux (aarch64).

                • +
                • Apple M1 arm64 machines (macos_arm64).

                • +
                • s390x running Linux

                • +
                +
                +

                PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for Debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                +

                What else is new?

                +

                For more information about the 7.3.11 release, see the full changelog.

                +

                Please update, and continue to help us make pypy better.

                +

                Cheers, +The PyPy Team

                +
                +
                +

                Finding JIT Optimizer Bugs using SMT Solvers and Fuzzing

                + +
                +

                In this blog post I want to describe a recent bug finding technique that I've +added to the PyPy JIT testing infrastructure. This technique uses the Z3 +theorem prover to find bugs in the optimizer of PyPy's JIT, in particular its +integer operation optimizations. The approach is +based on things I have learned from John Regehr's blog (this post is a +good first one to read), Twitter, and on +his (et al) paper Alive2: Bounded Translation Validation for LLVM. The work +was triggered by a recent miscompilation bug my current bachelor student Nico +Rittinghaus found.

                +

                Background: Python Integers in the PyPy JIT

                +

                The optimizer of PyPy's JITs operates on traces, which are linear sequences of +instructions with guards. The instructions in the traces operate on different +machine-level data types, machine integers, doubles, pointers, bools, etc. In +this post we'll be mostly concerned with machine integers.

                +

                To give some wider context I'll explain a bit how Python ints in the user code +relate to the types that are used in traces when the PyPy Python implementation +is used. +When PyPy turns a regular Python 3 function into a trace, there is a lot of work +happening in the JIT frontend to try to observe and infer the types that the +Python function concretely uses at runtime. The traces are generated under these +typing assumptions. Therefore, code that uses ints in the Python code can +typically be translated into traces that operate on machine integers. In order +to make sure that the Python integer semantics are upheld, many of the +operations in the traces need to check that the integer results of some +operations still fit into a machine integer. If that is not the case (a rare +situation for most programs), the trace is left via a guard, execution falls +back to the interpreter, and there a big integer representation is chosen for +the too big value (the big integer representation is done via a pointer and +some storage on the heap).

                +

                All of this machinery is not going to be too relevant for the rest of the +post. For the post it's important to know that trace instructions operate on +machine integers and other low-level types, and some of the operations can +optionally check whether the +results still fit into a machine integer. These trace operations are improved by +the optimizer, which tries to transform the trace into one that behaves the +same, but is less costly to execute.

                +

                Background: Bounds Analysis in PyPy's JIT

                +

                The optimizer of PyPy's JIT has an analysis based on abstract interpretation +that tries to find out whether the integer values stored in a variable are +actually not using the full 64 bit (or 32 bit) range, but instead fit into some +smaller range. This means that for every integer variable x in a trace, the +JIT compiler tracks upper and lower bounds of the runtime value of that +variable: a range [a, b] such that for every concrete runtime value v +that gets stored in variable x, a <= v <= b must be true. +a and b start out +as the most general MININT and MAXINT, but sometimes there is extra +information that makes it possible to improve these known bounds, and that is +often useful to optimize the code.

                +

                A typical example is that the JIT knows that the length of a string is +non-negative, so for this kind of code: x = len(s) where s is a string, +x gets a range [0, MAXINT] assigned. With this information we could for +example remove a check x + 10 < 0 completely, because it can never be true.

                +

                The bounds information is useful for optimization, but the analysis of the +bounds is also a source of bugs in the JIT, because the reasoning is often +subtle and easy to get wrong in corner cases. We already use a number of testing +techniques to try to make sure that it is correct. A simple one is +property-based testing using Hypothesis on the operations on bounds. Even +though Hypothesis is fantastic, it unfortunately does not catch +absolutely all the bugs even if we'd like it to, as we'll see in the next +section.

                +

                Motivation: A JIT Miscompilation

                +

                I am currently supervising a Bachelor thesis by Nico Rittinghaus, who is +extending the integer analysis in the JIT. He'll probably write a separate blog +post about that soon. In the process of his work, the current bounds analysis +code got a lot of scrutiny, and we found out that one of the unit tests of the +bounds analysis was actually incorrect, and the example code in that unit test +was optimized incorrectly. This case of incorrect optimization is not a big deal +for regular Python code, because it involved a "wrapping integer addition +operation", i.e. one where overflowing results just wrap around to negative +values. All the additions and other arithmetic operations that the PyPy Python +frontend generates actually have +overflow checks (to be able to switch to a big integer representation if +needed). +However, it's still possible to trigger the problem with the +__pypy__.intop.int_add API which is a function that exposes wraparound +arithmetic on Python ints.

                +

                Here's the miscompilation. The JIT optimizes the following function:

                +
                import __pypy__
                +
                +def wrong(x):
                +    a = __pypy__.intop.int_add(x, 10)
                +    if a < 15:
                +        if x < 6:
                +            return 0
                +        return 1
                +    return 2
                +
                +

                Into the following code:

                +
                import __pypy__
                +
                +def wrong(x):
                +    a = __pypy__.intop.int_add(x, 10)
                +    if a < 15:
                +        return 0
                +    return 2
                +
                +

                Basically the faulty reasoning of the JIT looks like this: if int_add(x, 10) < 15 +then it must follow that x < 5, which is stronger than x < 6, so the +second if is always true. This sounds good, but is actually wrong +if the addition + 10 wrapped around. So if x == MAXINT, then +int_add(x, 10) == MININT + 9 < 15. But MAXINT < 5 is not +correct.

                +

                Note how the same reasoning with overflow-checking addition is correct! If x + +10 < 15 and the + didn't overflow, then indeed x < 6. And if your +mind bends starting to think about all this, you understand some of the +difficulty of getting the JIT correct in this area.

                +

                How could we have avoided this bug?

                +

                One exercise I try to do after finding bugs is to reflect on ways that the +bug could have been avoided. I think this is particularly important in the JIT, +where bugs are potentially really annoying to find and can cause very strange +behaviour in basically arbitrary Python code.

                +

                It's easy to always answer this question with "try to think more carefully +when working", but that approach cannot be relied on in complicated situations, +because humans don't concentrate perfectly for long stretches of time.

                +

                A situation-specific problem I identified was the bad design of the range analysis API. +A range is not just represented by two numbers, instead it's two numbers +and two bools that are supposed to represent that some operation did or did not +underflow/overflow. The meaning of these bools was quite hard to grasp and easy +to get wrong, so probably they should never have been introduced in the first +place (and my bugfix indeed removed them).

                +

                But in the rest of this blog post I want to talk about another, systematic +approach that can be applied to the problem of mis-optimizations of integer +operations, and that is done by applying an SMT solver to the problem.

                +

                An SMT solver (Satisfiability Modulo Theories) is a tool that can be used to +find out whether mathematical formulas are "satisfiable", i.e. whether +some chosen set of inputs exists that will make the formulas evaluate to true. SMT solvers are +commonly used in a wide range of CS applications including program correctness +proofs, program synthesis, etc. The most widely known one is probably Z3 by +Microsoft Research which has the nice advantage of coming with an easy-to-use +Python binding.

                +

                Going into this I basically knew next to nothing about SMT solvers (despite +having been embedded in a formal methods research group for years!) so it was an +interesting new world to learn about.

                +

                As briefly mentioned in the introduction, the approach I took followed a similar +(but much more properly executed) one applied to LLVM operations, called +Alive2. Krister Walfridsson has done similar work for GCC recently, +described on his blog.

                +

                Z3 Proof of Concept

                +

                The first thing I did was to try to get Z3 to find the above bug, by encoding the +input program into an SMT formula by hand and trying to get Z3 to prove the condition +that the JIT thinks is always true. The Z3 code for this looks as follows:

                +
                from z3 import BitVec, Implies, prove
                +x = BitVec('x', 64)
                +a = x + 10
                +cond1 = a < 15
                +cond2 = x < 6
                +prove(Implies(cond1, cond2))
                +
                +

                Here, x is defined to be a bit vector variable of width 64, which is a +datatype that can be used to represent bounded machine integers. Addition on +bit vectors performs wraparound arithmetic, like the __pypy__.intop.int_add +call in the original code. The JIT optimized the second condition away, so +essentially it was convinced that the first condition implies the second one. +The above snippet tries to get Z3 to confirm this.

                +

                When run, the above program prints:

                +
                counterexample
                +[x = 9223372036854775803]
                +

                Which shows the bug. As a small side-note, I thought it was cool that the +process of "proving" something in Z3 basically means trying to find an example +for the negation of the formula. If no counterexample can be found for the +negation, the original formula is true. If the original formula turns out to be +false (like here) we get a nice example that shows the problem to go with it.

                +

                It's not realistic to hand-translate all the hundreds of +unit-tests into Z3 formulas and then ask Z3 to prove the optimizations. Instead, +we want to have a program that does this for us.

                +

                SMT Checking of the JIT Optimizer

                +

                What we want from this program is the following: given an unoptimized trace and +its optimized version, we want to use Z3 to check whether the optimized trace +behaves identically to the unoptimized one. One question is what "behaves +identically" means. What we care about is the outputs of the trace being the +same values, no matter how they are computed. Also, for every guard we want to +make sure that it fails in identical ways in the optimized and unoptimized +versions. A guard is only allowed to be optimized away if it can never fail. +The code that comes after a guard can assume that the guard has not failed, +because otherwise execution would have left the trace. All of this should be +true regardless of the values of the input variables of the trace.

                +

                So in order to check that the two traces are behaving identically, we do the +following:

                +
                  +
                • We create Z3 variables for every input variable. We use the same input +variables both for the unoptimized as well as the optimized trace.

                • +
                • We align the two traces at the corresponding guards. Thankfully the optimizer +keeps track of which optimized guard corresponds to which unoptimized input +guard.

                • +
                • All the operations before a guard are translated into Z3 formulas, for both +versions of the trace.

                • +
                • For two corresponding guards, we ask Z3 to prove that the guard conditions are +identical.

                • +
                • For a guard that was optimized away we ask Z3 to prove that the condition is +always true.

                • +
                • After a guard, we tell Z3 that from now on it can assume that the guard +condition is true.

                • +
                • We repeat this, guard for guard, until we reach the end of the trace. There, +we ask Z3 to prove that the output variables in the unoptimized trace and the +optimized trace are identical (every trace can return one or many values).

                • +
                +

                I implemented this, it's not a lot of code, basically a couple of hundred lines +of (somewhat hacky) Python code. So far I only support integer +operations. Here are some parts of the code to give you a flavor of what this +looks like.

                +

                This is the code that translates operations into Z3 formulas:

                +
                def add_to_solver(self, ops, state):
                +    for op in ops:
                +        if op.type != 'v': # is it an operation with a result
                +            res = self.newvar(op)
                +        else: # or does it return void
                +            res = None
                +
                +       # ...
                +
                +        # convert arguments
                +        if op.numargs() == 1:
                +            arg0 = self.convertarg(op, 0)
                +        elif op.numargs() == 2:
                +            arg0 = self.convertarg(op, 0)
                +            arg1 = self.convertarg(op, 1)
                +
                +        # compute results
                +        if opname == "int_add":
                +            expr = arg0 + arg1
                +        elif opname == "int_sub":
                +            expr = arg0 - arg1
                +        elif opname == "int_mul":
                +            expr = arg0 * arg1
                +        elif opname == "int_and":
                +            expr = arg0 & arg1
                +        elif opname == "int_or":
                +            expr = arg0 | arg1
                +        elif opname == "int_xor":
                +            expr = arg0 ^ arg1
                +
                +        # ...  more operations, some shown below
                +
                +        self.solver.add(res == expr)
                +
                +

                New Z3 variables are defined by the helper function newvar, which adds the +operation to a dictionary box_to_z3 mapping boxes (=variables) to Z3 +variables. Due to the SSA property that traces have, a variable must be defined +before its first use.

                +

                Here's what newvar looks like (LONG_BIT is a constant that is either +64 or 32, depending on the target architecture):

                +
                def newvar(self, box, repr=None):
                +    # ... some logic around making the string representation
                +    # somewhat nicer omitted
                +    result = z3.BitVec(repr, LONG_BIT)
                +    self.box_to_z3[box] = result
                +    return result
                +
                +

                The convert method turns an operation argument (either a constant or a +variable) into a Z3 formula (either a constant bit vector or an already defined +Z3 variable). convertarg is a helper function that takes an operation, reads +its nth argument and converts it.

                +
                def convert(self, box):
                +    if isinstance(box, ConstInt):
                +        return z3.BitVecVal(box.getint(), LONG_BIT)
                +    return self.box_to_z3[box]
                +
                +def convertarg(self, box, arg):
                +    return self.convert(box.getarg(arg))
                +
                +

                The lookup of variables in box_to_z3 that convert does cannot fail, +because the variable must have been defined before use.

                +

                Comparisons return the bit vector 0 or bit vector 1, we use a helper function +cond to turn the Z3 truth value of the comparison into a bit vector:

                +
                def cond(self, z3expr):
                +    return z3.If(z3expr, TRUEBV, FALSEBV)
                +
                +
                +def add_to_solver(self, ops, state):
                +        # ... start as above
                +
                +        # more cases
                +        elif opname == "int_eq":
                +            expr = self.cond(arg0 == arg1)
                +        elif opname == "int_ne":
                +            expr = self.cond(arg0 != arg1)
                +        elif opname == "int_lt":
                +            expr = self.cond(arg0 < arg1)
                +        elif opname == "int_le":
                +            expr = self.cond(arg0 <= arg1)
                +        elif opname == "int_gt":
                +            expr = self.cond(arg0 > arg1)
                +        elif opname == "int_ge":
                +            expr = self.cond(arg0 >= arg1)
                +        elif opname == "int_is_true":
                +            expr = self.cond(arg0 != FALSEBV)
                +        elif opname == "uint_lt":
                +            expr = self.cond(z3.ULT(arg0, arg1))
                +        elif opname == "uint_le":
                +            expr = self.cond(z3.ULE(arg0, arg1))
                +        elif opname == "uint_gt":
                +            expr = self.cond(z3.UGT(arg0, arg1))
                +        elif opname == "uint_ge":
                +            expr = self.cond(z3.UGE(arg0, arg1))
                +        elif opname == "int_is_zero":
                +            expr = self.cond(arg0 == FALSEBV)
                +
                +        # ... rest as above
                +
                +

                So basically for every trace operation that operates on integers I had to give a +translation into Z3 formulas, which is mostly straightforward.

                +

                Guard operations get converted into a Z3 boolean by their own helper function, +which looks like this:

                +
                def guard_to_condition(self, guard, state):
                +    opname = guard.getopname()
                +    if opname == "guard_true":
                +        return self.convertarg(guard, 0) == TRUEBV
                +    elif opname == "guard_false":
                +        return self.convertarg(guard, 0) == FALSEBV
                +    elif opname == "guard_value":
                +        return self.convertarg(guard, 0) == self.convertarg(guard, 1)
                +
                +    # ... some more exist, shown below
                +
                +

                Some operations are a bit trickier. An important example in the context of +this blog post are integer operations that check for overflow. The overflow +operations return a result, but also a boolean indicating whether the operation overflowed +or not.

                +
                def add_to_solver(self, ops, state):
                +
                +        # ... more cases
                +
                +        elif opname == "int_add_ovf":
                +            expr = arg0 + arg1
                +            m = z3.SignExt(LONG_BIT, arg0) + z3.SignExt(LONG_BIT, arg1)
                +            state.no_ovf = m == z3.SignExt(LONG_BIT, expr)
                +        elif opname == "int_sub_ovf":
                +            expr = arg0 - arg1
                +            m = z3.SignExt(LONG_BIT, arg0) - z3.SignExt(LONG_BIT, arg1)
                +            state.no_ovf = m == z3.SignExt(LONG_BIT, expr)
                +        elif opname == "int_mul_ovf":
                +            expr = arg0 * arg1
                +            m = z3.SignExt(LONG_BIT, arg0) * z3.SignExt(LONG_BIT, arg1)
                +            state.no_ovf = m == z3.SignExt(LONG_BIT, expr)
                +
                +        # ...
                +
                +

                The boolean is computed by comparing the result of the bit vector operation with +the result of converting the input bit vectors into an abstract (arbitrary +precision) integer and the result back to bit vectors. Let's go through the +addition case step by step, the other cases work analogously.

                +

                The addition in the first elif that computes expr is an addition on bit +vectors, therefore it is performing wraparound arithmetic. +z3.SignExt(LONG_BIT, arg0) sign-extends arg0 from a bit vector of +LONG_BIT bits to a bit vector of 2 * LONG_BIT bits, which is wide enough to +stand in for an abstract, arbitrary precision integer here. The addition in +the second line therefore behaves like an addition between abstract integers, so it will +never overflow and just computes the correct result as an integer.

                +

                The condition to check for overflow is now: if the results of the two different +ways to do the addition are the same, then overflow did not occur. So in order +to compute state.no_ovf in the addition case the +code converts the result of the bit vector wraparound addition to +an abstract integer (using SignExt again), and then compares that to the integer +result.

                +

                This boolean can then be checked by the guard operations guard_no_overflow +and guard_overflow.

                +
                def guard_to_condition(self, guard, state):
                +
                +    # ... more cases
                +
                +    elif opname == "guard_no_overflow":
                +        assert state.no_ovf is not None
                +        return state.no_ovf
                +    elif opname == "guard_overflow":
                +        assert state.no_ovf is not None
                +        return z3.Not(state.no_ovf)
                +
                +    # ... more cases
                +
                +

                Finding the Bug, Again

                +

                Let's actually make all of this more concrete by applying it to the trace of our +original bug. The input trace and the incorrectly optimized trace for that look +like this (differences highlighted):

                +
                # input                       # optimized
                +[i0]                          [i0]
                +i1 = int_add(i0, 10)          i1 = int_add(i0, 10)
                +i2 = int_lt(i1, 15)           i2 = int_lt(i1, 15)
                +guard_true(i2)                guard_true(i2)
                +i3 = int_lt(i0, 6)            jump(0)
                +guard_true(i3)
                +jump(0)
                +
                +

                Note that the trace represents just one of the paths through the control flow +graph of the original function, which is typical for tracing JITs (the other +paths could incrementally get added later).

                +

                The first guards in both these traces correspond to each other, so the first +chunks to check are the first three operations (lines 1-4). Those operations +don't get changed by the optimizer at all.

                +

                These two identical traces get translated to the following Z3 formulas:

                +
                i1unoptimized == input_i0 + 10
                +i2unoptimized == If(i1unoptimized < 15, 1, 0)
                +i1optimized == input_i0 + 10
                +i2optimized == If(i1optimized < 15, 1, 0)
                +
                +

                To check that the two corresponding guards are the same, the solver is asked to +prove that (i2unoptimized == 1) == (i2optimized == 1). This is +correct, because the formulas for i2unoptimized and i2optimized are +completely identical.

                +

                After checking that the guards behave the same, we add the knowledge to the +solver that the guards passed. So the Z3 formulas become:

                +
                i1unoptimized == input_i0 + 10
                +i2unoptimized == If(i1unoptimized < 15, 1, 0)
                +i1optimized == input_i0 + 10
                +i2optimized == If(i1optimized < 15, 1, 0)
                +i2unoptimized == 1
                +i2optimized == 1
                +
                +

                Now we continue with the remaining operations of the two traces (lines 6-8).

                +

                We start by adding the int_lt operation in the unoptimized trace to the Z3 +formulas:

                +
                ...
                +i3unoptimized == If(input_i0 < 6, 1, 0)
                +
                +

                Because the second guard was optimized away, we need to ask Z3 to prove that +i3unoptimized == 1 is always true, which fails and gives the following +counterexample:

                +
                input_i0 = 9223372036854775800
                +i1unoptimized = 9223372036854775810
                +i2unoptimized = 1
                +i1optimized = 9223372036854775810
                +i2optimized = 1
                +i3unoptimized = 0
                +
                +

                Thus demonstrating the bug. The fact that the Z3-based equivalence check also +managed to find the original motivating bug without manually translating it to +a formula is a good confirmation that the approach works.

                +

                Second bug

                +

                So with this code I applied the Z3-based equivalence check to all our optimizer +unit tests. In addition to the bug we've been discussing the whole post, it also +found another buggy test! I had found it too by hand by staring at all the tests +in the process of writing all the Z3 infrastructure, but it was still a good +confirmation that the process worked. This bug was in the range analysis for +int_neg, integer negation. It failed to account for the fact that -MININT == MININT +and therefore did a mis-optimization along the following lines:

                +
                import __pypy__
                +
                +def wrong(x):
                +    a = __pypy__.intop.int_sub(0, x)
                +    if a < 0:
                +        if x > 0:
                +            return 0
                +        return 1
                +    return 2
                +
                +

                Which was wrongly optimized into:

                +
                import __pypy__
                +
                +def wrong(x):
                +    a = __pypy__.intop.int_sub(0, x)
                +    if a < 0:
                +        return 0
                +    return 2
                +
                +

                This is wrong precisely for x == MININT.

                +

                Generating Random Traces

                +

                These two bugs were the only two that the Z3 checker found for existing unit +tests. To try to find some more bugs I combined PyPy's existing random trace +generator with the Z3 optimization checker. The random trace generator has so +far been mostly used to find bugs in the machine code backends, particularly +also in the register allocator. So far we haven't used it with our optimizer, +but my experiments show that we should have!

                +

                I'm going to describe a little bit how the random trace generator works. It's +actually not that complicated, but there's one neat trick to it.

                +

                The basic idea is straightforward, it starts out with an empty trace with a +random number of input variables. Then it adds some number of operations to the +trace, either regular operations or guards. Every operation takes already +existing variables as input.

                +

                The neat trick is that our random trace generator keeps a concrete random +example value for every one of the input variables, and an example result for +every operation. In this way, it is possible to generate guards that are +consistent with the example values to ensure that running the trace to its end +is possible with at least one set of values.

                +

                Here's an example random trace that is generated, together with the random +example inputs and the results of every operation at the end of every line:

                +
                [i0, i1, i2, i3, i4, i5] # example values: 9, 11, -8, -95, 46, 57
                +i6 = int_add_ovf(i3, i0) # -86
                +guard_no_overflow()
                +i7 = int_sub(i2, -35/ci) # 27
                +i8 = uint_ge(i3, i5) # 1
                +guard_true(i8)
                +i9 = int_lt(i7, i8) # 0
                +i10 = int_mul_ovf(34/ci, i7) # 918
                +guard_no_overflow()
                +i11 = int_and(i10, 63/ci) # 22
                +i12 = int_rshift(i3, i11) # -1
                +i13 = int_is_zero(i7) # 0
                +i14 = int_is_true(i13) # 0
                +guard_false(i13)
                +i15 = int_lt(i8, i4) # 1
                +i16 = int_and(i6, i0) # 8
                +i17 = uint_ge(i6, -6/ci) # 0
                +finish()
                +

                Note how every guard generated is true for the example values.

                +

                I have been running this combination of random trace generation and Z3 checking +for many nights and it has found some bugs, which I'll describe in the next +section. It should probably be run for a lot longer, but still a useful +exercise already.

                +

                In this mode, I'm giving every Z3 call a time limit to make sure that the random +tests don't just take arbitrarily long. This means that asking Z3 to prove +something can have three outcomes, either it's proved, or Z3 finds a +counterexample, or Z3 times out.

                +

                Bugs Found

                +

                In addition to the two bugs I've already described, I'll briefly list the +additional bugs that were found by optimizing random traces and then trying to +prove the equivalence with Z3.

                +

                Most of the bugs were actually identified by optimizing random traces alone, not +by the Z3 component. They manifested as assert failures in the JIT compiler.

                +
                  +
                • The JIT concluded after 12 == int_mul(x, 12) that x == 1, which is +incorrect if overflow occurred (a counterexample is 0x8000000000000001).

                • +
                • An amusing bug, where from 0 == int_lshift(0x1000000000000000, x) with +0 <= x <= 15, the JIT concluded that 0x1000000000000000 == 0, +triggering an assert. This wrong conclusion was again caused by not taking the +possibility of overflow into account.

                • +
                • A corner case in an optimization for chained integer additions with a +constant, where in complex enough expressions, the wrong IR API was used +(which works correctly in simple cases). Again, this triggered an assert.

                • +
                +

                This shows that we should have been fuzzing our JIT optimizer already (not a +surprising observation in hindsight, fuzz all the things!).

                +

                Thankfully, there was also one further bug that really failed in the Z3 +verifier. It's a bug in common subexpression elimination / arithmetic +simplification, which again does not take overflow correctly into account.

                +

                The buggy trace looks like this (unfortunately it's not easily possible to show +this bug in Python code).

                +
                [a, b]
                +c = int_add(a, b)
                +r = int_sub_ovf(c, b)
                +guard_no_ovf()
                +finish(r)
                +
                +

                This was optimized to:

                +
                [a, b]
                +finish(a)
                +
                +

                Which is incorrect, because the guard can fail given the right inputs. +But the optimizer concluded that the subtraction is safe, because it's the +inverse of an earlier addition, not taking into account that this earlier +addition can have overflowed.

                +

                Note that a related optimization is actually correct. Given this code:

                +
                [a, b]
                +c = int_add_ovf(a, b)
                +guard_no_ovf()
                +r = int_sub(c, b)
                +finish(r)
                +
                +

                It can be optimized to:

                +
                [a, b]
                +c = int_add_ovf(a, b)
                +guard_no_ovf()
                +finish(a)
                +
                +

                Future Work and Conclusion

                +

                In the current form the Z3 checker is only a start, even though it has already +been concretely useful. There are various directions into which we could extend +it. In addition to generating random tests completely from scratch, we could also +start from the existing manually written unit-tests and randomly mutate those.

                +

                I also want to extend the Z3 checker with support for more operations, heap +operations in particular (but it's not quite clear to me how to model garbage +collection).

                +

                I also want to try to switch the code away from the Z3 API and use the more +general smtlib interface directly, in order to be able to use other SMT +checkers than Z3, e.g. CVC4.

                +

                But all in all this was a fun and not too hard way to find a bunch of bugs in +our optimizer! And the infrastructure is now in place, which means that we run +some random test cases every time we execute our tests. This is going to be +particularly useful when we do further work on the integer reasoning of the JIT +(like Nico is doing, for example). As of time of writing of this post, all the +bugs mentioned have been fixed and the Z3 code has landed on the default branch +and runs as part of PyPy's CI infrastructure.

                +

                Acknowledgements

                +

                Thanks to Saam Barati, Max Bernstein, Joshua Schmidt and Martin +Berger, for great feedback on drafts of this post!

                +
                +
                +

                PyPy v7.3.10 release

                + +
                +

                PyPy v7.3.10: release of python 2.7, 3.8, and 3.9

                +

                The PyPy team is proud to release version 7.3.10 of PyPy. We have some nice +speedups and bugfixes we wish to share. The release includes three different +interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.15.

                • +
                • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.15. We have gained +confidence in the stability of this version, and are removing the "beta" +label.

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.9 in March 2022 +include:

                +
                +
                  +
                • A release of Apple Silicon M1 arm64 versions. This work was sponsored by +an anonymous donor and is tested on our buildbots.

                • +
                • Many improvements to the basic interpreter to make it 15-20% faster

                • +
                • The conda-forge community has built over 1000 packages for PyPy3.8 and 3.9, +making it easier than ever to use PyPy.

                • +
                • Update the packaged OpenSSL to 1.1.1s, sqlite3 to 3.39.4, and apply +applicable security fixes from CPython 3.9.15 to PyPy2.7

                • +
                • Update the HPy backend in PyPy3.8 and PyPy3.9 to 0.0.4

                • +
                +
                +

                We recommend updating. You can find links to download the v7.3.10 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with making +RPython's JIT even better. Since the previous release, we have accepted +contributions from five new contributors, thanks for pitching in, and welcome +to the project!

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. +In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and +3.9. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                We provide binary builds for:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                • +
                • 64-bit ARM machines running Linux (aarch64).

                • +
                • Apple M1 arm64 machines (macos_arm64).

                • +
                • s390x running Linux

                • +
                +
                +

                PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                +

                What else is new?

                +

                For more information about the 7.3.10 release, see the full changelog.

                +

                Please update, and continue to help us make pypy better.

                +

                Cheers, +The PyPy Team

                +
                +
                +

                PyPy and conda-forge

                + +
                +

                You can use PyPy as your python interpreter in a conda environment. The +conda-forge team has graciously provided this service.

                +

                The conda-forge tips-and-tricks +page says:

                +
                +

                The conda-forge channel supports creating and installing packages into +environments using the PyPy interpreter. Many packages are already available. +You need to enable the conda-forge channel and use the pypy identifier when +creating your environment:

                +
                +
                  $ conda create -c conda-forge -n my-pypy-env pypy python=3.8
                +  $ conda activate my-pypy-env
                +
                + +
                +

                Currently supported python versions are 3.8 and 3.9. Support for pypy3.7 has +been dropped. While you can still create a python 3.7 environment, you +will not be getting updates as new package versions are released (including +pypy itself).

                +

                If you are using defaults as a low priority channel, then you need to use +strict channel priority as the metadata in defaults has not been patched yet +which allows cpython extension packages to be installed alongside pypy.

                +
                +
                  $ conda config --set channel_priority strict
                +
                + +

                The work required some out-of-the-box thinking on the part of conda-forge since +they needed to add the idea of a pypy identifier to the python version and +the whole conda team has been very supportive of the effort needed. Binary +packages are on offer for the usual platforms:

                +
                  +
                • +x86_64 windows, macos, linux
                • +
                • +ppc64le and aarch64 linux.
                • +
                +

                There are currently over 1000 packages available for download via the +conda-forge channel, and more are being added as the kind package maintainers +work around various differences between CPython and PyPy. Please let us know if +your favorite package is not supported.

                +
                +

                The PyPy Blog Turns 15 Years

                + +
                +

                Exactly 15 years ago today we wrote the first blog post on the PyPy blog! +Over the years, we have written 423 posts, from the shortest to the +longest. In 2021 we moved from blogger to our own domain.

                +

                The topics over the years varied widely, we published release announcements; +roadmaps; JIT, GC and STM updates; benchmarks; sprint, trip and +conference reports; technical deep dives; case studies; april fool's +jokes; research projects; other languages using RPython; finished PhD, +Bachelor and Master theses; pictures:

                + +a collage of photos taken at PyPy sprints +

                and diagrams:

                + +a collage of diagrams from previous blog posts +

                Quite a number of blog posts were very early iterations of papers that we +published later, here are a few that I can remember:

                + +

                Greatest Hits

                +

                In terms of visitors, the top five posts on the old blog were – on the new blog +we simply don't have stats (yet?):

                +
                  +
                1. Let's remove the global interpreter lock

                2. +
                3. Tutorial: Writing an Interpreter with PyPy, Part 1

                4. +
                5. PyPy's new JSON parser

                6. +
                7. PyPy gets funding from Mozilla for Python 3.5 support

                8. +
                9. How to make your code 80 times faster

                10. +
                +

                The number of posts per year developed like this:

                +/images/2022-pypy-posts-per-year.svg

                The most prolific authors are:

                +
                  +
                1. Maciej Fijałkowski

                2. +
                3. Carl Friedrich Bolz-Tereick

                4. +
                5. Armin Rigo

                6. +
                7. Antonio Cuni

                8. +
                9. Matti Picus

                10. +
                +

                Several blog posts have made it to the Hacker News front page, three of them to +number 1:

                +

                Personal Favourites

                +

                While looking through the posts, there were a few that stood out to me in some +way, so here's a subjective list of ones that I had fun looking at again:

                + +

                We'd like to thank our authors, guest authors, commenters, users and readers who +have stuck with us through one and a half decades! If there's any particular +topics you would like to read something about, or any guest posts you'd like to +write, let us know!

                +
                +
                +

                Allocation Removal in the Toy Optimizer

                + +
                +

                One of the workhorse optimizations of RPython's tracing JIT is allocation +removal, which removes short-lived object allocation from traces. Many Python +programs create a lot of objects that only live for a short time, and whose +lifespan is fully predictable (common examples are integer and float boxes, but +also tuples, frames, intermediate string results, etc). Allocation removal will +try (and very often succeed) to remove these allocations from traces. In +this blog post I want to show a toy version of how allocation removal is +implemented.

                +

                In the previous blog post of this series I showed the complete code for +writing a toy one-pass optimizer that does constant folding, common +subexpression elimination and strength reduction. In this +second post, I want to use allocation removal as a more advanced optimization +pass. The basic optimization framework is the same, we will use the same +datastructures for intermediate representation and also keep using the same +union find data structure to store equivalences between IR operations. Here's +the infrastructure code from the last post:

                +
                import pytest
                +from typing import Optional, Any
                +
                +
                +class Value:
                +    def find(self):
                +        raise NotImplementedError("abstract")
                +
                +    def _set_forwarded(self, value):
                +        raise NotImplementedError("abstract")
                +
                +
                +class Operation(Value):
                +    def __init__(
                +        self, name: str, args: list[Value]
                +    ):
                +        self.name = name
                +        self.args = args
                +        self.forwarded = None
                +        self.info = None
                +
                +    def __repr__(self):
                +        return (
                +            f"Operation({self.name}, "
                +            f"{self.args}, {self.forwarded}, "
                +            f"{self.info})"
                +        )
                +
                +    def find(self) -> Value:
                +        op = self
                +        while isinstance(op, Operation):
                +            next = op.forwarded
                +            if next is None:
                +                return op
                +            op = next
                +        return op
                +
                +    def arg(self, index):
                +        return self.args[index].find()
                +
                +    def make_equal_to(self, value: Value):
                +        self.find()._set_forwarded(value)
                +
                +    def _set_forwarded(self, value: Value):
                +        self.forwarded = value
                +
                +
                +class Constant(Value):
                +    def __init__(self, value: Any):
                +        self.value = value
                +
                +    def __repr__(self):
                +        return f"Constant({self.value})"
                +
                +    def find(self):
                +        return self
                +
                +    def _set_forwarded(self, value: Value):
                +        assert (
                +            isinstance(value, Constant)
                +            and value.value == self.value
                +        )
                +
                +class Block(list):
                +    def opbuilder(opname):
                +        def wraparg(arg):
                +            if not isinstance(arg, Value):
                +                arg = Constant(arg)
                +            return arg
                +        def build(self, *args):
                +            # construct an Operation, wrap the
                +            # arguments in Constants if necessary
                +            op = Operation(opname,
                +                [wraparg(arg) for arg in args])
                +            # add it to self, the basic block
                +            self.append(op)
                +            return op
                +        return build
                +
                +    # a bunch of operations we support
                +    add = opbuilder("add")
                +    mul = opbuilder("mul")
                +    getarg = opbuilder("getarg")
                +    dummy = opbuilder("dummy")
                +    lshift = opbuilder("lshift")
                +    # some new one for this post
                +    alloc = opbuilder("alloc")
                +    load = opbuilder("load")
                +    store = opbuilder("store")
                +    print = opbuilder("print")
                +
                +def bb_to_str(bb: Block, varprefix: str = "var"):
                +    def arg_to_str(arg: Value):
                +        if isinstance(arg, Constant):
                +            return str(arg.value)
                +        else:
                +            return varnames[arg]
                +
                +    varnames = {}
                +    res = []
                +    for index, op in enumerate(bb):
                +        var = f"{varprefix}{index}"
                +        varnames[op] = var
                +        arguments = ", ".join(
                +            arg_to_str(op.arg(i))
                +                for i in range(len(op.args))
                +        )
                +        strop = f"{var} = {op.name}({arguments})"
                +        res.append(strop)
                +    return "\n".join(res)
                +
                +

                There are two changes to the code from the last post: Operation instances +have a new .info field, which is set to None by default. We will learn +how the info field is used a bit further down. Also, we define some new +operations.

                +

                Interpreter

                +

                In this post we will mainly concern ourselves with optimizing +programs that allocate memory. We assume that our language is garbage collected +and memory safe. The new operations that we will optimize are alloc +(allocates some new object), store (stores a value into a fixed field of an +object), load (loads the value from a field in the object).

                +

                We are leaving out a lot of details of a "real" system here, usually an +alloc operation would get some extra information, for example the type of +the freshly allocated object or at least its size. load and store would +typically have some kind of field offset and maybe some information about the +field's type.

                +

                Here's a simple program that uses these operations:

                +
                var0 = getarg(0)
                +obj0 = alloc()
                +store(obj0, 0, var0)
                +var1 = load(obj0, 0)
                +print(var1)
                +

                The code allocates a new object obj0, stores var0 into field 0 of +the object, then loads the same field and prints the result of the load.

                +

                Before we get started in writing the optimizer for these operations, let's try +to understand the semantics of the new operations a bit better. To do this, we +can sketch a small interpreter for basic blocks, supporting only getarg, +alloc, store, load, print:

                +
                def test_interpret():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    obj = bb.alloc()
                +    sto = bb.store(obj, 0, var0)
                +    var1 = bb.load(obj, 0)
                +    bb.print(var1)
                +    assert interpret(bb, 17) == 17
                +
                +class Object:
                +    def __init__(self):
                +        self.contents: dict[int, Any] = {}
                +
                +    def store(self, idx : int, value : Any):
                +        self.contents[idx] = value
                +
                +    def load(self, idx : int):
                +        return self.contents[idx]
                +
                +def get_num(op, index=1):
                +    assert isinstance(op.arg(index), Constant)
                +    return op.arg(index).value
                +
                +def interpret(bb : Block, *args : tuple[Any]):
                +    def argval(op, i):
                +        arg = op.arg(i)
                +        if isinstance(arg, Constant):
                +            return arg.value
                +        else:
                +            assert isinstance(arg, Operation)
                +            return arg.info
                +
                +    for index, op in enumerate(bb):
                +        if op.name == "getarg":
                +            res = args[get_num(op, 0)]
                +        elif op.name == "alloc":
                +            res = Object()
                +        elif op.name == "load":
                +            fieldnum = get_num(op)
                +            res = argval(op, 0).load(fieldnum)
                +        elif op.name == "store":
                +            obj = argval(op, 0)
                +            fieldnum = get_num(op)
                +            fieldvalue = argval(op, 2)
                +            obj.store(fieldnum, fieldvalue)
                +            # no result, only side effect
                +            continue
                +        elif op.name == "print":
                +            res = argval(op, 0)
                +            print(res)
                +            return res
                +        else:
                +            raise NotImplementedError(
                +                f"{op.name} not supported")
                +        op.info = res
                +
                +

                The interpreter walks the operations of a block, executing each one in turn. It +uses the info field to store the result of each already executed +Operation. In this interpreter sketch we stop at the first print that +we execute and return its argument for the simple but bad reason that it makes +test_interpret easier to write.

                +

                Objects in the interpreter are represented using a class Object, which +stores the object's fields into a Python dictionary. As written above, this is a +simplification, in a real system the alloc operation might for example take +some kind of type as an argument, that describes which kinds of fields an +object has and how they are laid out in memory, which would allow more +efficient storage of the content. But we don't want to care about this level of +detail in the post, so using a dict in the interpreter is good enough.

                +

                Version 1: Naive Attempt

                +

                In many programs, some allocated objects don't live for very long and have a +completely predictable lifetime. They get allocated, used for a while, and then +there is no way to reference them any more, so the garbage collector will +reclaim them. The very first example block had such an allocation:

                +
                var0 = getarg(0)
                +obj0 = alloc()
                +store(obj0, 0, var0)
                +var1 = load(obj0, 0)
                +print(var1)
                +

                Here obj0 is written to, then read from, and then it's no longer used. We +want to optimize such programs to remove this alloc operation. The optimized +version of this program would look like this:

                +
                var0 = getarg(0)
                +print(var0)
                +

                The alloc, store and load operations have been completely removed. +This is a pretty important optimization for PyPy's JIT: Allocations, memory +reads and writes are quite costly and occur a lot in Python, so getting rid +of as many of them as possible is instrumental for performance.

                +

                Implementing the optimization is not a lot of code! However, understanding all +the corner cases of the +optimization and making sure that the resulting program behaves correctly is not +completely trivial. Therefore we will develop the optimization step by step, in +a test driven fashion: I will start each section with a new test that shows a +bug in the version of the optimization that we have so far.

                +

                Let's start in a really naive way. Here's the first test we would like to +pass, using the example program above:

                +
                def test_remove_unused_allocation():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    obj = bb.alloc()
                +    sto = bb.store(obj, 0, var0)
                +    var1 = bb.load(obj, 0)
                +    bb.print(var1)
                +    opt_bb = optimize_alloc_removal(bb)
                +    # the virtual object looks like this:
                +    #  obj
                +    # ┌──────────┐
                +    # │ 0: var0  │
                +    # └──────────┘
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = print(optvar0)"""
                +
                +

                We will define a class VirtualObject that is basically identical to +Object above. But it will not be used by the interpreter, instead we will +use it during optimization.

                +
                class VirtualObject:
                +    def __init__(self):
                +        self.contents: dict[int, Value] = {}
                +
                +    def store(self, idx, value):
                +        self.contents[idx] = value
                +
                +    def load(self, idx):
                +        return self.contents[idx]
                +
                +

                The structure of the optimizer is going to be like those in the first blog post. +The optimizer makes a single pass over all operations. It removes some and +emits others.

                +

                This first version of the allocation removal optimizer is going to be extremely +optimistic. It simply assumes that all the allocations in the program can be +optimized away. That is not realistic in practice. We will have to +refine this approach later, but it's a good way to start. That means whenever +the optimizer sees an alloc operation, it removes it and creates a +VirtualObject object which stores the information that is known during +optimization about the result of the alloc. Like in the interpreter, the +VirtualObject is stored in the .info field of the Operation instance +that represents the alloc.

                +

                When the optimizer sees a store operation, it will also remove it and +instead execute the store by calling the VirtualObject.store method. +Here is one important difference between the interpreter and the optimizer: In +the interpreter, the values that were stored into an Object (and thus +put into the object's .contents dictionary) were runtime values, for +example integers or other objects. In the optimizer however, the +fields of the VirtualObject store Value instances, either Constant +instances or Operation instances.

                +

                When the optimizer sees a load operation, it also removes it, and replaces +the load with the Operation (or Constant) that is stored in the +VirtualObject at that point:

                +
                def optimize_alloc_removal(bb):
                +    opt_bb = Block()
                +    for op in bb:
                +        if op.name == "alloc":
                +            op.info = VirtualObject()
                +            continue
                +        if op.name == "load":
                +            info = op.arg(0).info
                +            field = get_num(op)
                +            op.make_equal_to(info.load(field))
                +            continue
                +        if op.name == "store":
                +            info = op.arg(0).info
                +            field = get_num(op)
                +            info.store(field, op.arg(2))
                +            continue
                +        opt_bb.append(op)
                +    return opt_bb
                +
                +

                This is the first version of the optimization. It doesn't handle all kinds of +difficult cases, and we'll have to do something about its optimism. +But, already in this minimalistic form, we can write a slightly more complicated +test with two allocations, one object pointing to the other. It works correctly +too, both allocations are removed:

                +
                def test_remove_two_allocations():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    obj0 = bb.alloc()
                +    sto1 = bb.store(obj0, 0, var0)
                +    obj1 = bb.alloc()
                +    sto2 = bb.store(obj1, 0, obj0)
                +    var1 = bb.load(obj1, 0)
                +    var2 = bb.load(var1, 0)
                +    bb.print(var2)
                +    # the virtual objects look like this:
                +    #  obj1
                +    # ┌──────┐
                +    # │ 0: ╷ │
                +    # └────┼─┘
                +    #      │
                +    #      ▼
                +    #     obj0
                +    #   ┌─────────┐
                +    #   │ 0: var0 │
                +    #   └─────────┘
                +    # therefore
                +    # var1 is the same as obj0
                +    # var2 is the same as var0
                +    opt_bb = optimize_alloc_removal(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = print(optvar0)"""
                +
                +

                Version 2: Re-Materializing Allocations

                +

                To make it easier to talk about how the optimizer operates, let's introduce +some terminology. As already seen by the choice +of the class name VirtualObject, we will call an object virtual if the +optimizer has optimized away the alloc operation that creates the object. +Other objects are equivalently not virtual, for example those that have +existed before we enter the current code block.

                +

                The first problem that we need to fix is the assumption that every +allocation can be removed. So far we only looked at small programs where every +allocation could be removed, or equivalently, where every object is virtual. +Such a program creates virtual objects, stores into and loads from them, and +then forgets the objects. In this simple case removing the allocations is fine. +As we saw in the previous section, it's also fine to have a virtual object +reference another virtual, both allocations can be removed.

                +

                What are the cases where we can't remove an allocation? +The first version of the optimizer simply assumed that every allocation can be +removed. This can't work. We will replace this assumption with the following +simple heuristic:

                +

                If a reference to a virtual object a is stored into an object b +that is not virtual, then a will also stop being virtual. If an object a +that was virtual stops being virtual, we say that it escapes. ¹

                +

                The simplest test case for this happening looks like this:

                +
                def test_materialize():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    obj = bb.alloc()
                +    sto = bb.store(var0, 0, obj)
                +    opt_bb = optimize_alloc_removal(bb)
                +    #  obj is virtual, without any fields
                +    # ┌───────┐
                +    # │ empty │
                +    # └───────┘
                +    # then we store a reference to obj into
                +    # field 0 of var0. Since var0 is not virtual,
                +    # obj escapes, so we have to put it back
                +    # into the optimized basic block
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = alloc()
                +optvar2 = store(optvar0, 0, optvar1)"""
                +    # so far, fails like this:
                +    # the line:
                +    # info.store(field, op.arg(2))
                +    # produces an AttributeError because info
                +    # is None
                +
                +

                If the optimizer reaches a point where a virtual object escapes (like the +store operation in the test), the optimizer has already removed the alloc +operation that created the virtual object. If the object escapes, we don't want +to go back in the operations list and re-insert the alloc operation, that +sounds potentially very complicated. Instead, we re-insert the alloc +operation that will recreate the virtual object at the point of escape using a +helper function materialize.

                +
                def materialize(opt_bb, value: Operation) -> None:
                +    assert not isinstance(value, Constant)
                +    assert isinstance(value, Operation)
                +    info = value.info
                +    assert isinstance(info, VirtualObject)
                +    assert value.name == "alloc"
                +    # put the alloc operation back into the trace
                +    opt_bb.append(value)
                +
                +

                I've added a number of fairly strong assertions to materialize to encode our +current assumptions about the situations in which it expects to be called. We +will remove some of them later as we generalize the code.

                +

                Now that we have materialize we need to change optimize_alloc_removal to +recognize the case of storing a virtual object into a non-virtual one. We can +recognize Operation instances that produced a virtual object by looking at +their .info field. If it is None, the object is not virtual, otherwise +it is. If we store something into a virtual object, we leave the code as above. +If we store a virtual object into an object that is not virtual, we will first +materialize the virtual object, and then emit the store.

                +
                def optimize_alloc_removal(bb):
                +    opt_bb = Block()
                +    for op in bb:
                +        if op.name == "alloc":
                +            op.info = VirtualObject()
                +            continue
                +        if op.name == "load":
                +            info = op.arg(0).info
                +            field = get_num(op)
                +            op.make_equal_to(info.load(field))
                +            continue
                +        if op.name == "store":
                +            info = op.arg(0).info
                +            if info: # virtual
                +                field = get_num(op)
                +                info.store(field, op.arg(2))
                +                continue
                +            else: # not virtual
                +                # first materialize the
                +                # right hand side
                +                materialize(opt_bb, op.arg(2))
                +                # then emit the store via
                +                # the general path below
                +        opt_bb.append(op)
                +    return opt_bb
                +
                +

                This is the general idea, and it is enough to pass test_materialize. But of +course there are still a number of further problems that we now need to solve.

                +

                Version 3: Don't Materialize Twice

                +

                The first problem is the fact that after we materialize a virtual object, it is +no longer virtual. So if it escapes a second time, it should not be +materialized a second time. A test for that case could simply repeat the +store operation:

                +
                def test_dont_materialize_twice():
                +    # obj is again an empty virtual object,
                +    # and we store it into var0 *twice*.
                +    # this should only materialize it once
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    obj = bb.alloc()
                +    sto0 = bb.store(var0, 0, obj)
                +    sto1 = bb.store(var0, 0, obj)
                +    opt_bb = optimize_alloc_removal(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = alloc()
                +optvar2 = store(optvar0, 0, optvar1)
                +optvar3 = store(optvar0, 0, optvar1)"""
                +    # fails so far: the operations that we get
                +    # at the moment are:
                +    # optvar0 = getarg(0)
                +    # optvar1 = alloc()
                +    # optvar2 = store(optvar0, 0, optvar1)
                +    # optvar3 = alloc()
                +    # optvar4 = store(optvar0, 0, optvar3)
                +    # ie the object is materialized twice,
                +    # which is incorrect
                +
                +

                We solve the problem by setting the .info field of an object that we +materialize to None to mark it as no longer being virtual.

                +
                def materialize(opt_bb, value: Operation) -> None:
                +    assert not isinstance(value, Constant)
                +    assert isinstance(value, Operation)
                +    info = value.info
                +    if info is None:
                +        return # already materialized
                +    assert value.name == "alloc"
                +    # put the alloc operation back into the trace
                +    opt_bb.append(value)
                +    # but only once
                +    value.info = None
                +
                +# optimize_alloc_removal unchanged
                +
                +

                This fixes the problem, only one alloc is created. This fix also allows +another test case to pass, one where we store a non-virtual into another +non-virtual, code which we cannot optimize at all:

                +
                def test_materialize_non_virtuals():
                +    # in this example we store a non-virtual var1
                +    # into another non-virtual var0
                +    # this should just lead to no optimization at
                +    # all
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.getarg(1)
                +    sto = bb.store(var0, 0, var1)
                +    opt_bb = optimize_alloc_removal(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = getarg(1)
                +optvar2 = store(optvar0, 0, optvar1)"""
                +
                +

                Version 4: Materialization of Constants

                +

                Another straightforward extension is to support materializing constants. A +constant is never virtual, so materializing it should do nothing.

                +
                def test_materialization_constants():
                +    # in this example we store the constant 17
                +    # into the non-virtual var0
                +    # again, this will not be optimized
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    sto = bb.store(var0, 0, 17)
                +    opt_bb = optimize_alloc_removal(bb)
                +    # the previous line fails so far, triggering
                +    # the assert:
                +    # assert not isinstance(value, Constant)
                +    # in materialize
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = store(optvar0, 0, 17)"""
                +
                +

                To implement that case, we check for value being a constant and return +early:

                +
                def materialize(opt_bb, value: Operation) -> None:
                +    if isinstance(value, Constant):
                +        return
                +    assert isinstance(value, Operation)
                +    info = value.info
                +    if info is None:
                +        return # already materialized
                +    assert value.name == "alloc"
                +    # put the alloc operation back into the trace
                +    opt_bb.append(value)
                +    # but only once
                +    value.info = None
                +
                +# optimize_alloc_removal unchanged
                +
                +

                Version 5: Materializing Fields

                +

                Now we need to solve a more difficult problem. So far, the virtual objects that +we have materialized have all been empty, meaning they didn't have any fields +written to at the point of materialization. Let's write a test for materializing an object that does have fields:

                +
                def test_materialize_fields():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.getarg(1)
                +    obj = bb.alloc()
                +    contents0 = bb.store(obj, 0, 8)
                +    contents1 = bb.store(obj, 1, var1)
                +    sto = bb.store(var0, 0, obj)
                +
                +    # the virtual obj looks like this
                +    #  obj
                +    # ┌──────┬──────────┐
                +    # │ 0: 8 │ 1: var1  │
                +    # └──────┴──────────┘
                +    # then it needs to be materialized
                +    # this is the first example where a virtual
                +    # object that we want to materialize has any
                +    # content and is not just an empty object
                +    opt_bb = optimize_alloc_removal(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = getarg(1)
                +optvar2 = alloc()
                +optvar3 = store(optvar2, 0, 8)
                +optvar4 = store(optvar2, 1, optvar1)
                +optvar5 = store(optvar0, 0, optvar2)"""
                +    # fails so far! the operations we get
                +    # at the moment are:
                +    # optvar0 = getarg(0)
                +    # optvar1 = getarg(1)
                +    # optvar2 = alloc()
                +    # optvar3 = store(optvar0, 0, optvar2)
                +    # which is wrong, because the store operations
                +    # into optvar2 got lost
                +
                +

                To fix this problem, we need to re-create a store operation for every +element of the .contents dictionary of the virtual object we are +materializing. ²

                +
                def materialize(opt_bb, value: Operation) -> None:
                +    if isinstance(value, Constant):
                +        return
                +    assert isinstance(value, Operation)
                +    info = value.info
                +    if info is None:
                +        return # already materialized
                +    assert value.name == "alloc"
                +    # put the alloc operation back into the trace
                +    opt_bb.append(value)
                +    # put the content back
                +    for idx, val in info.contents.items():
                +        # re-create store operation
                +        opt_bb.store(value, idx, val)
                +    # only materialize once
                +    value.info = None
                +
                +# optimize_alloc_removal unchanged
                +
                +

                This is enough to pass the test.

                +

                Version 6: Recursive Materialization

                +

                In the above example, the fields of the virtual objects contained +only constants or non-virtual objects. However, we could have a situation where +a whole tree of virtual objects is built, and then the root of the tree escapes. +This makes it necessary to escape the whole tree. Let's write a test for a small +tree of two virtual objects:

                +
                def test_materialize_chained_objects():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    obj0 = bb.alloc()
                +    obj1 = bb.alloc()
                +    contents = bb.store(obj0, 0, obj1)
                +    const = bb.store(obj1, 0, 1337)
                +    sto = bb.store(var0, 0, obj0)
                +    #  obj0
                +    # ┌──────┐
                +    # │ 0: ╷ │
                +    # └────┼─┘
                +    #      │
                +    #      ▼
                +    #     obj1
                +    #   ┌─────────┐
                +    #   │ 0: 1337 │
                +    #   └─────────┘
                +    # now obj0 escapes
                +    opt_bb = optimize_alloc_removal(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = alloc()
                +optvar2 = alloc()
                +optvar3 = store(optvar2, 0, 1337)
                +optvar4 = store(optvar1, 0, optvar2)
                +optvar5 = store(optvar0, 0, optvar1)"""
                +    # fails in an annoying way! the resulting
                +    # basic block is not in proper SSA form
                +    # so printing it fails. The optimized
                +    # block would look like this:
                +    # optvar0 = getarg(0)
                +    # optvar1 = alloc()
                +    # optvar3 = store(optvar1, 0, optvar2)
                +    # optvar4 = store(optvar0, 0, optvar1)
                +    # where optvar2 is an ``alloc`` Operation
                +    # that is not itself in the output block
                +
                +

                To fix it, materialize needs to call itself recursively for all the field +values of the virtual object:

                +
                def materialize(opt_bb, value: Operation) -> None:
                +    if isinstance(value, Constant):
                +        return
                +    assert isinstance(value, Operation)
                +    info = value.info
                +    if info is None:
                +        return # already materialized
                +    assert value.name == "alloc"
                +    # put the alloc operation back into the trace
                +    opt_bb.append(value)
                +    # put the content back
                +    for idx, val in sorted(info.contents.items()):
                +        # materialize recursively
                +        materialize(opt_bb, val)
                +        opt_bb.store(value, idx, val)
                +    # only materialize once
                +    value.info = None
                +
                +# optimize_alloc_removal unchanged
                +
                +

                Getting there, the materialization logic is almost done. We need to fix a +subtle remaining problem though.

                +

                Version 7: Dealing with Object Cycles

                +

                The bug we need to fix in this section is a bit tricky, and does not immediately +occur in a lot of programs. In +fact, in PyPy a variant of it was hiding out in our optimizer +until we found it much later (despite us being aware of the general problem and +correctly dealing with it in other cases).

                +

                The problem is this: a virtual object can (directly or indirectly) point to +itself, and we must carefully deal with that case to avoid infinite recursion in +materialize. Here's the simplest test:

                +
                def test_object_graph_cycles():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.alloc()
                +    var2 = bb.store(var1, 0, var1)
                +    var3 = bb.store(var0, 1, var1)
                +    #   ┌────────┐
                +    #   ▼        │
                +    #  var1      │
                +    # ┌──────┐   │
                +    # │ 0: ╷ │   │
                +    # └────┼─┘   │
                +    #      │     │
                +    #      └─────┘
                +    # var1 points to itself, and then it is
                +    # escaped
                +    opt_bb = optimize_alloc_removal(bb)
                +    # the previous line fails with a
                +    # RecursionError
                +    # materialize calls itself, infinitely
                +
                +    # what we want is instead this output:
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = alloc()
                +optvar2 = store(optvar1, 0, optvar1)
                +optvar3 = store(optvar0, 1, optvar1)"""
                +
                +

                The fix is not a big change, but a little bit subtle nevertheless. +We have to change the +order in which things are done in materialize. Right after emitting the +alloc, we set the .info to None, to mark the object as not virtual. +Only afterwards do we re-create the stores and call materialize recursively. +If a recursive call reaches the same object, it's already marked as non-virtual, +so materialize won't recurse further:

                +
                def materialize(opt_bb, value: Operation) -> None:
                +    if isinstance(value, Constant):
                +        return
                +    assert isinstance(value, Operation)
                +    info = value.info
                +    if info is None:
                +        return # already materialized
                +    assert value.name == "alloc"
                +    # put the alloc operation back into the trace
                +    opt_bb.append(value)
                +    # only materialize once
                +    value.info = None
                +    # put the content back
                +    for idx, val in sorted(info.contents.items()):
                +        # materialize recursively
                +        materialize(opt_bb, val)
                +        opt_bb.store(value, idx, val)
                +
                +

                Version 8: Loading from non-virtual objects

                +

                Now materialize is done. We need to go back to optimize_alloc_removal and +improve it further. The last time we changed it, we added a case analysis to the +code dealing with store, distinguishing between storing to a virtual and to +a non-virtual object. We need to add an equivalent distinction to the load +case, because right now loading from a non-virtual crashes.

                +
                def test_load_non_virtual():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.load(var0, 0)
                +    bb.print(var1)
                +    # the next line fails in the line
                +    # op.make_equal_to(info.load(field))
                +    # because info is None
                +    opt_bb = optimize_alloc_removal(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = load(optvar0, 0)
                +optvar2 = print(optvar1)"""
                +
                +

                To fix it, we split the load code into two cases, leaving the virtual path +as before, and letting the load from a non-virtual fall through to the +general code at the end of the function.

                +
                def optimize_alloc_removal(bb):
                +    opt_bb = Block()
                +    for op in bb:
                +        if op.name == "alloc":
                +            op.info = VirtualObject()
                +            continue
                +        if op.name == "load":
                +            info = op.arg(0).info
                +            if info: # virtual
                +                field = get_num(op)
                +                op.make_equal_to(info.load(field))
                +                continue
                +            # otherwise not virtual, use the
                +            # general path below
                +        if op.name == "store":
                +            info = op.arg(0).info
                +            if info: # virtual
                +                field = get_num(op)
                +                info.store(field, op.arg(2))
                +                continue
                +            else: # not virtual
                +                # first materialize the
                +                # right hand side
                +                materialize(opt_bb, op.arg(2))
                +                # then emit the store via
                +                # the general path below
                +        opt_bb.append(op)
                +    return opt_bb
                +
                +

                Version 9 (Final): Materialize on Other Operations

                +

                We're almost at the end now. There's one final generalization left to do. We +started with the heuristic that storing a virtual into a non-virtual would +escape it. This should be generalized. Every time we pass a virtual into any +operation where it is not the first argument of a load or a store, +that should also escape it (imagine passing the virtual to some function call). +Let's test this as usual with our print operation:

                +
                def test_materialize_on_other_ops():
                +    # materialize not just on store
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.alloc()
                +    var2 = bb.print(var1)
                +    opt_bb = optimize_alloc_removal(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = alloc()
                +optvar2 = print(optvar1)"""
                +    # again, the resulting basic block is not in
                +    # valid SSA form
                +
                +

                To fix this, we will take the call to materialize out of the store code +path and instead put it into the generic code path at the end of the for +loop:

                +
                # materialize is unchanged
                +def materialize(opt_bb, value: Value) -> None:
                +    if isinstance(value, Constant):
                +        return
                +    assert isinstance(value, Operation)
                +    info = value.info
                +    if not info:
                +        # Already materialized
                +        return
                +    assert value.name == "alloc"
                +    opt_bb.append(value)
                +    value.info = None
                +    for idx, val in sorted(info.contents.items()):
                +        materialize(opt_bb, val)
                +        opt_bb.store(value, idx, val)
                +
                +def optimize_alloc_removal(bb):
                +    opt_bb = Block()
                +    for op in bb:
                +        if op.name == "alloc":
                +            op.info = VirtualObject()
                +            continue
                +        if op.name == "load":
                +            info = op.arg(0).info
                +            if info: # virtual
                +                field = get_num(op)
                +                op.make_equal_to(info.load(field))
                +                continue
                +        if op.name == "store":
                +            info = op.arg(0).info
                +            if info: # virtual
                +                field = get_num(op)
                +                info.store(field, op.arg(2))
                +                continue
                +        # materialize all the arguments of
                +        # operations that are put into the
                +        # output basic block
                +        for arg in op.args:
                +            materialize(opt_bb, arg.find())
                +        opt_bb.append(op)
                +    return opt_bb
                +
                +

                That's it, we're done. It's not a lot of code, but actually quite a powerful +optimization. In addition to removing allocations for objects that are only used +briefly and in predictable ways, it also has another effect. If an object is +allocated, used in a number of operations and then escapes further down in the +block, the operations in between can often be optimized away. This is +demonstrated by the next test (which already passes):

                +
                def test_sink_allocations():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.alloc()
                +    var2 = bb.store(var1, 0, 123)
                +    var3 = bb.store(var1, 1, 456)
                +    var4 = bb.load(var1, 0)
                +    var5 = bb.load(var1, 1)
                +    var6 = bb.add(var4, var5)
                +    var7 = bb.store(var1, 0, var6)
                +    var8 = bb.store(var0, 1, var1)
                +    opt_bb = optimize_alloc_removal(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = add(123, 456)
                +optvar2 = alloc()
                +optvar3 = store(optvar2, 0, optvar1)
                +optvar4 = store(optvar2, 1, 456)
                +optvar5 = store(optvar0, 1, optvar2)"""
                +
                +

                Note that the addition is not optimized away, because the code from this blog +post does not contain constant folding and the other optimizations from +the last one. Combining them would not be too hard though.

                +

                Conclusion

                +

                That's it! The core idea of PyPy's allocation removal optimization in one or +two screens of code. The real implementation has a number of refinements, +but the core ideas are all here.

                +

                I'm not going to show any benchmark numbers or anything like that here, if you +are interested in numbers you could look at the evaluation Section 6. +"Implementation and Evaluation" of the paper that describes the work.

                +

                There's a complementary optimization that improves load and store +operations for objects that are not virtual. I'll probably not write that +down as another post, but Max Bernstein and I developed that together on a +PyPy Twitch channel a few weeks ago, here's the recording:

                +

                Footnotes

                +

                ¹ This is how PyPy uses the terminology, not really used consistently by other +projects. The term "escape" is fairly standard throughout the escape +analysis literature. The term "virtual" was used originally in Armin Rigo's +Psyco but is e.g. also used by the paper Partial Escape Analysis and Scalar +Replacement for Java.

                +

                ² The order in which we put the store operations back is relying on +dictionary iteration order, which is insertion order. That's not a bad +ordering, we could also be explicit and sort the fields in some order (ideally +the order in which the object lays them out in memory).

                +
                +
                +

                Düsseldorf HPy/PyPy/GraalPy sprint September 19-23rd 2022

                + +
                +

                The programming language group of the Computer Science department of +Heinrich-Heine Universität Düsseldorf is happy to invite everybody to another +sprint in Düsseldorf, from the 19th to the 23rd of September 2022. This is a +fully public sprint, everyone and particularly newcomers are welcome to join +us! The goal is to bring together people from the HPy, PyPy, GraalPy and +CPython communities.

                +

                Topics and goals

                +
                  +
                • work on HPy APIs, discussions around next steps for the project

                • +
                • continuing new and ongoing ports to HPy, including Cython, NumPy, Pillow, Matplotlib

                • +
                • 3.10 support on PyPy and GraalPy

                • +
                • preparing the next PyPy release

                • +
                • discussions around ways to improve collaboration between the different Python +implementations

                • +

                What is a sprint?

                +

                The experience of the PyPy project has shown the benefits of regular +sprints. They are focused, one-week physical meetings where people pair-program +on new features and discuss future plans. Coming to one is a great way to get +started with a project!

                +

                Location

                +

                The sprint will take place in a seminar room of the computer science +department. It is in the building 25.12, room 02.50 (second floor) of the +university campus. For travel instructions see

                +
                +

                https://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/kontakt/service/lage-und-anreise

                +
                +

                We ask participants to wear masks during the indoor working hours.

                +
                +Photograph of the statue of Heinrich Heine in front of the University library on the campus in Düsseldorf +

                Wiegels, CC BY 3.0, via Wikimedia Commons

                +

                Exact times

                +

                Work days: starting September 19th (~morning), ending September 23rd (~afternoon). +We will do a to-be-planned social activity on Wednesday afternoon.

                +

                Registration

                +

                Please register by editing this file or by opening a pull request:

                +
                +

                https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/ddorf2022/people.txt

                +
                +

                or by sending a quick mail to the pypy-dev mailing list:

                +
                +

                http://mail.python.org/mailman/listinfo/pypy-dev

                +
                +
                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-44.html b/blog/index-44.html new file mode 100644 index 000000000..d137d3476 --- /dev/null +++ b/blog/index-44.html @@ -0,0 +1,687 @@ + + + + + + +PyPy (old posts, page 44) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                PyPy v7.3.15 release

                + +
                +

                PyPy v7.3.15: release of python 2.7, 3.9, and 3.10

                +

                The PyPy team is proud to release version 7.3.15 of PyPy.

                +

                This is primarily a bug-fix release, and includes work done to migrate PyPy to +Git and Github.

                +

                The release includes three different interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.

                • +
                • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13.

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows the 7.3.14 release on Dec 25, 2023.

                +

                We recommend updating. You can find links to download the v7.3.15 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with +making RPython's JIT even better.

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                We provide binary builds for:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                • +
                • 64-bit ARM machines running Linux (aarch64).

                • +
                • Apple M1 arm64 machines (macos_arm64).

                • +
                • s390x running Linux

                • +
                +
                +

                PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                +

                What else is new?

                +

                For more information about the 7.3.15 release, see the full changelog.

                +

                Please update, and continue to help us make pypy better.

                +

                Cheers, +The PyPy Team

                +
                +
                +

                PyPy has moved to Git, GitHub

                + +
                +

                PyPy has moved its canonical repo and issue tracker from +https://foss.heptapod.net/pypy/pypy to https://github.com/pypy/pypy. Obviously, +this means development will now be tracked in Git rather than Mercurial.

                +

                Motivation

                +

                We still feel Mercurial is a better version control system. The named branch +model and user interface are superior. But

                +
                  +
                • +

                  foss.heptapod.net is not well indexed in google/bing/duckduckgo + search, so people find it harder to search for issues in the project.

                  +
                • +
                • +

                  Since Heptapod has tightened its spam control, we get reports that + users create issues only to have them flagged as spam.

                  +
                • +
                • +

                  Open Source has become synonymous with GitHub, and we are too small to + change that.

                  +
                • +
                • +

                  Much of the current development comes as a reaction to fixing issues. + Tracking interlocking issues is easier if all the code is on the same + platform.

                  +
                • +
                • +

                  The FAQ + presents two arguments against the move. Github notes + solves much of point (1): the difficulty of discovering provenance of + commits, although not entirely. But the main problem is point (2), it turns + out that not moving to GitHub is an impediment to contribution and issue + reporting.

                  +
                • +
                • +

                  People who wish to continue to use Mercurial can use the same method below to + push to GitHub.

                  +
                • +
                • +

                  GitHub is more resource rich than foss.heptapod.net. We could add CI + jobs to replace some of our aging buildbot + infrastructure.

                  +
                • +
                +

                Method

                +

                The migration required two parts: migrating the code and then migrating the +issues and merge requests.

                +

                Code migration 1: code and notes

                +

                I used a fork of git-remote-hg to +create a local Git repo with all the changesets. Then I wanted to add a Git +note to each commit with the branch it came from. So I prepared a file with two +columns: the Git commit hash, and the corresponding branch from Mercurial. +Mercurial can describe each commit in two ways: either the commit hash or by a +number index. I used hg log to convert an index i to the Mercurial hash, +and then git-hg-helper from git-remote-hg to convert the Mercurial hash to +a Git hash:

                +
                $(cd pypy-git; git-hg-helper git-rev $(cd ../pypy-hg; hg log -r $i -T"{node}\n"))
                +
                + +

                Then I used hg log again to print the Mercurial branch for the index i:

                +
                $(cd pypy-hg; hg log -r $i -T'{branch}\n')
                +
                + +

                Putting these two together, I could loop over all the commits by their +numerical index to prepare the file. Then I iterated over each line in the +file, and added the Git note. Since the git note add command works on the +current HEAD, I needed to checkout each commit in turn and then add the note:

                +
                git checkout -q <hash> && git notes --ref refs/notes/branch add -m branch:<branch>
                +
                + +

                I could then use git push --all to push to GitHub.

                +

                Code migration 2: prepare the branches

                +

                PyPy has almost 500 open branches. The code migration created all the branch +HEADs, but git push --all did not push them. I needed to check them out and +push each one. So I created a file with all the branch names

                +
                cd pypy-hg; hg branches | cut -f1 -d" " > branches.txt
                +
                + +

                and then push each one to the GitHub repo

                +
                while read branch; do git checkout branches/$branch && git push origin branches/$branch; done < branches.txt
                +
                + +

                Note that the branches were named branches/XXX by the migration, not branch/XXX. This confuses the merge request migration, more about that later.

                +

                Issue and merge request migration

                +

                I used the solution from +node-gitlab-2-github which +worked almost perfectly. It is important to do the conversion on a private +repo otherwise every mention of a successfully mapped user name notifies +the user about the transfer. This can be quite annoying for a repo the size of +PyPy with 600 merge requests and over 4000 issues. Issues transferred without a +problem: the script properly retained the issue numbers. However the script +does not convert the Mercurial hashes to Git hashes, so the bare hashes in +comments show up without a link to the commit. Merge requests are more of a problem:

                +
                  +
                • The Mercurial named branch "disappears" once it is merged, so a merge request + to a merged branch does not find the target branch name in Git. The + conversion creates an issue instead with the label gitlab merge request.
                • +
                • For some reason, the branches created by git-remote-hg are called + branches/XXX and not branch/XXX as expected by GitLab. This messes up the + merge request/PR conversion. For some of the branches (open PRs and main + target branches) I manually created additional branches without the es. The + net result is that open merge requests became open PRs, merged merge requests + became issues, and closed-not-merged merge requests were not migrated.
                • +
                +

                Layered conversions

                +

                PyPy already migrated once from Bitbucket to Heptapod. Many of the issues +reflect the multiple transitions: they have lines like "Created originally on +Bitbucket by XXX" from the first transition, and an additional line "In +Heptapod" from this transition.

                +

                Credits

                +

                We would like to express our gratitude to the Octobus +team who support Heptapod. The transition from Bitbucket was quite an effort, +and they have generously hosted our development since then. We wish them all +the best, and still believe that Mercurial should have "won".

                +

                Next steps

                +

                While the repo at GitHub is live, there are still a few more things we need to +do:

                +
                  +
                • Documentation needs an update for the new repo and the build automation from + readthedocs must be adjusted.
                • +
                • The wiki should be copied from Heptapod.
                • +
                • buildbot.pypy.org should also look at the new repo. I hope the code is up to + the task of interacting with a Git repo.
                • +
                • speed.pypy.org tracks changes, it too needs to reference the new location
                • +
                • To keep tracking branches with Git notes on new commits, I activated a + github action by Julian to + add a Git branch note to each commit. Please see the README there for + directions on using Git notes.
                • +
                • Some of the merge requests were not migrated. If someone wants to, they could + migrate those once they figure out the branch naming problems.
                • +
                +

                Additionally, now is the time for all of you to prove the move is worthwhile:

                +
                  +
                • Star the repo, let others know how to find it,
                • +
                • Help fix some of the open issues or file new ones,
                • +
                • Take advantage of the more familiar workflow to get involved in the project,
                • +
                • Suggest ways to improve the migration: are there things I missed or could + have done better?
                • +
                +

                How will development change?

                +

                Heptapod did not allow personal forks, so we were generous with a commit bit to +the main repo. Additionally, we (well, me) have been using a +commit-directly-to-main workflow. We will now be adopting a more structured +workflow. Please fork the repo and submit a pull request for any changes. We +can now add some pre-merge CI to check that the PR at least passes the first +stage of translation. The live and active branches will be:

                +
                  +
                • +main: what was "default" in Mercurial, it is the Python2.7 interpreter and + the base of the RPython interpreter,
                • +
                • +py3.9: the Python3.9 interpreter, which also includes all RPython changes + from main. This is exactly like on Mercurial, and
                • +
                • +py3.10: the Python3.10 interpreter, which also includes all RPython changes + from main and all bugfixes from py3.9. This is exactly like on Mercurial.
                • +
                +

                Working between the repos

                +
                Finding commits
                +

                If you want to figure out how a Mercurial commit relates to a Git commit, you +can use git-hg-helper. You run it in the Git repo. It takes the full long +hash from one repo and gives you the corresponding hash of the other repo:

                +
                $ git-hg-helper git-rev d64027c4c2b903403ceeef2c301f5132454491df
                +4527e62ad94b0e940a5b0f9f20d29428672f93f7
                +$ git-hg-helper hg-rev 4527e62ad94b0e940a5b0f9f20d29428672f93f7
                +d64027c4c2b903403ceeef2c301f5132454491df
                +
                + +
                Finding branches
                +

                Branches migrated from Mercurial will have a branches prefix, not branch. +While GitLab uses branch for its prefix, the git-remote-hg script uses +branches. New work should be in a PR targeting main, py3.9 or py3.10.

                +

                Thanks for helping to make PyPy better.

                +

                Matti

                +

                Update

                +

                In the meantime we found out that unfortunately something went wrong in the +migration of the issues. The old issue +3655 got lost in the +migration. This means that after number 3655 the numbers are different between +github and heptapod, with heptapod being one larger. E.g. issue 3700 on +heptapod is issue 3699 on +github. We are investigating +options.

                +
                +

                PyPy v7.3.14 release

                + +
                +

                PyPy v7.3.14: release of python 2.7, 3.9, and 3.10

                +

                The PyPy team is proud to release version 7.3.14 of PyPy.

                +

                Highlights of this release are compatibility with HPy-0.9, cffi 1.16, +additional C-API interfaces, and more python3.10 fixes.

                +

                The release includes three different interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.

                • +
                • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13.

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.13 release on Sept 29, 2023.

                +

                We recommend updating. You can find links to download the v7.3.14 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. Since the last release we have contributions from three new +contributors. PyPy has many layers and we need help with all of them: bug +fixes, PyPy and RPython documentation improvements, or general help +with making RPython's JIT even better.

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                We provide binary builds for:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                • +
                • 64-bit ARM machines running Linux (aarch64).

                • +
                • Apple M1 arm64 machines (macos_arm64).

                • +
                • s390x running Linux

                • +
                +
                +

                PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                +

                What else is new?

                +

                For more information about the 7.3.14 release, see the full changelog.

                +

                Please update, and continue to help us make pypy better.

                +

                Cheers, +The PyPy Team

                +
                +
                +

                PyPy v7.3.13 release

                + +
                +

                PyPy v7.3.13: release of python 2.7, 3.9, and 3.10

                +

                The PyPy team is proud to release version 7.3.13 of PyPy. +This is primarily a security/bug-fix release. CPython released security +patches, and this release also improves the ability to use type +specifications via PyType_FromSpec and friends. There are also some +small speed-ups.

                +

                The release includes three different interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.

                • +
                • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13. Note it requires at +least cython 0.29.35 or cython 3.0.0b3.

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release; all APIs are compatible with the other 7.3 +releases. It follows the 7.3.12 release of June 16, 2023.

                +

                We recommend updating. You can find links to download the v7.3.13 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with making +RPython's JIT even better.

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                We provide binary builds for:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                • +
                • 64-bit ARM machines running Linux (aarch64).

                • +
                • Apple M1 arm64 machines (macos_arm64).

                • +
                • s390x running Linux

                • +
                +
                +

                PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for Debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                +

                What else is new?

                +

                For more information about the 7.3.13 release, see the full changelog.

                +

                Please update, and continue to help us make pypy better.

                +

                Cheers, +The PyPy Team

                +
                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-5.html b/blog/index-5.html new file mode 100644 index 000000000..c1cc06156 --- /dev/null +++ b/blog/index-5.html @@ -0,0 +1,1142 @@ + + + + + + +PyPy (old posts, page 5) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                Threads and GCs

                + +
                +

                Hi all,

                + +

                We can now compile a pypy-c that includes both thread support +and one of our semi-advanced garbage collectors. This means +that threaded Python programs can now run not only with a +better performance, but without the annoyances of the Boehm +garbage collector. (For example, Boehm doesn't like too much +seeing large numbers of __del__(), and our implementation of +ctypes uses them everywhere.)

                + +

                Magic translation command (example):

                + +

                +
                   translate.py --thread --gc=hybrid targetpypystandalone --faassen --allworkingmodules
                + +

                Note that multithreading in PyPy is based on a global +interpreter lock, as in CPython. I imagine that we will get +rid of the global interpreter lock at some point in the future +-- I can certainly see how this might be done in PyPy, unlike +in CPython -- but it will be a lot of work nevertheless. Given +our current priorities, it will probably not occur soon unless +someone steps in.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-29 00:04: +
                +
                +

                How could GIL be removed from PyPy?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-05-29 09:19: +
                +
                +

                By using fine-grained locking: locking every dictionary and list while it is used. This is what Jython does (or more precisely, what Jython asks the JVM to do for it). This certainly comes with a performance penalty, so it would only pay off if you actually have and can use multiple CPUs -- which is fine in PyPy: you would just translate different pypy-c's depending on the use case.

                This would be a pain to implement in CPython, in particular because of refcounting. Even if the Py_INCREF and Py_DECREF macros were made thread-safe, all C-level APIs that manipulate borrowed references might have to be redesigned.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-29 10:10: +
                +
                +

                Pyprocessing may serve multi-core cpu needs for the time being, as it's an almost drop-in replacement for the threading module.

                I think it uses ctypes, so it should work with pypy.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-05-29 18:36: +
                +
                +

                pyprocessing has it's own problems (not that threads has no problems at all :)

                1. Memory usage, you need basically n times more memory when n is number of processes

                2. you cannot pass arbitrary data between processes, just stuff that you can marshal/pickle which is a bit huge limitation.

                3. on the other hand, multiple processes provides you better control, although not via threading drop-in replacement.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-04 20:02: +
                +
                +

                The live demos seem to be down... :(

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-06-05 02:15: +
                +
                +

                Back online. Our test server is down as well, which makes it a bit hard to know stuff :(

                +
                +
                +
                +
                + + Connelly Barnes wrote on 2008-06-13 00:38: +
                +
                +

                In response to maciej, OSes that implement copy-on-write fork (Linux, but not Windows, unsure about Mac OS X), don't take n times more memory. Fine-grained locking and an OpenMP-like syntax would be potentially useful. Maybe you could get a student to prototype these for you. But I'm sure someone will find a way to parallelize Python eventually, or we'll all switch to some other language, as the number of cores goes to infinity.

                +
                +
                +
                +
                + + Connelly Barnes wrote on 2008-06-17 23:52: +
                +
                +

                In my previous comment, I was partly wrong: COW reduces memory usage, however, in CPython the refcounting will cause the interpreter to write to every area of memory, so the reduction may not be that significant. Also, IronPython supports fine-grained locks.

                +
                +
                +
                +
                + + nekto0n wrote on 2008-06-24 23:07: +
                +
                +

                Would it be better to lock not whole mutable object but just an element or slice(for lists) and not lock object for reading operations?
                It's a common method used in DBMS. A small and fast realisation(if it's possible to create) in PyPy whould be great =)

                +
                +
                +
                +
                + + Rushen Aly wrote on 2009-02-20 18:37: +
                +
                +

                Is there any calendar date for removal of GIL? or is it just a wish. Secondly, what is your speed aim compared with Java?
                Thanks...
                Rushen

                +
                +
                +
                + +

                Progresses on the CLI JIT backend front

                + +
                +

                In the last months, I've actively worked on the CLI backend for PyPy's +JIT generator, whose goal is to automatically generate JIT compilers +that produce .NET bytecode on the fly.

                +

                The CLI JIT backend is far from being complete and there is still a lot +of work to be done before it can handle PyPy's full Python +interpreter; nevertheless, yesterday I finally got the first .NET +executable that contains a JIT for a very simple toy language called +tlr, which implements an interpreter for a minimal register-based +virtual machine with only 8 operations.

                +

                To compile the tlr VM, follow these steps:

                +
                +
                  +
                1. +

                  get a fresh checkout of the oo-jit branch, i.e. the branch +where the CLI JIT development goes on:

                  +
                  +$ svn co https://codespeak.net/svn/pypy/branch/oo-jit
                  +
                  +
                2. +
                3. +

                  go to the oo-jit/pypy/jit/tl directory, and compile the tlr VM +with the CLI backend and JIT enabled:

                  +
                  +$ cd oo-jit/pypy/jit/tl/
                  +$ ../../translator/goal/translate.py -b cli --jit --batch targettlr
                  +
                  +
                4. +
                +
                +

                The goal of our test program is to compute the square of a given +number; since the only operations supported by the VM are addition and +negation, we compute the result by doing repetitive additions; I won't +describe the exact meaning of all the tlr bytecodes here, as they are +quite self-documenting:

                +
                +ALLOCATE,    3,   # make space for three registers
                +MOV_A_R,     0,   # i = a
                +MOV_A_R,     1,   # copy of 'a'
                +
                +SET_A,       0,
                +MOV_A_R,     2,   # res = 0
                +
                +# 10:
                +SET_A,       1,
                +NEG_A,
                +ADD_R_TO_A,  0,
                +MOV_A_R,     0,   # i--
                +
                +MOV_R_A,     2,
                +ADD_R_TO_A,  1,
                +MOV_A_R,     2,   # res += a
                +
                +MOV_R_A,     0,
                +JUMP_IF_A,  10,   # if i!=0: goto 10
                +
                +MOV_R_A,     2,
                +RETURN_A          # return res
                +
                +

                You can find the program also at the end of the tlr module; to get an +assembled version of the bytecode, ready to be interpreted, run this +command:

                +
                +$ python tlr.py assemble > square.tlr
                +
                +

                Now, we are ready to execute the code through the tlr VM; if you are +using Linux/Mono, you can simply execute the targettlr-cli script +that has been created for you; however, if you use Windows, you have +to manually fish the executable inside the targettlr-cli-data +directory:

                +
                +# Linux
                +$ ./targettlr-cli square.tlr 16
                +256
                +
                +# Windows
                +> targettlr-cli-data\main.exe square.tlr 16
                +256
                +
                +

                Cool, our program computed the result correctly! But how can we be +sure that it really JIT compiled our code instead of interpreting it? +To inspect the code that is generated by our JIT compiler, we simply +set the PYPYJITLOG environment variable to a filename, so that the +JIT will create a .NET assembly containing all the code that has been +generated by the JIT:

                +
                +$ PYPYJITLOG=generated.dll ./targettlr-cli square.tlr 16
                +256
                +$ file generated.dll
                +generated.dll: MS-DOS executable PE  for MS Windows (DLL) (console) Intel 80386 32-bit
                +
                +

                Now, we can inspect the DLL with any IL disassembler, such as +ildasm or monodis; here is an excerpt of the disassembled code +that shows how our square.tlr bytecode has been compiled to .NET +bytecode:

                +
                +.method public static  hidebysig default int32 invoke (object[] A_0, int32 A_1)  cil managed
                +{
                +    .maxstack 3
                +    .locals init (int32 V_0, int32 V_1, int32 V_2, int32 V_3, int32 V_4, int32 V_5)
                +
                +    ldc.i4 -1
                +    ldarg.1
                +    add
                +    stloc.1
                +    ldc.i4 0
                +    ldarg.1
                +    add
                +    stloc.2
                +    IL_0010:  ldloc.1
                +    ldc.i4.0
                +    cgt.un
                +    stloc.3
                +    ldloc.3
                +    brfalse IL_003b
                +
                +    ldc.i4 -1
                +    ldloc.1
                +    add
                +    stloc.s 4
                +    ldloc.2
                +    ldarg.1
                +    add
                +    stloc.s 5
                +    ldloc.s 5
                +    stloc.2
                +    ldloc.s 4
                +    stloc.1
                +    ldarg.1
                +    starg 1
                +
                +    nop
                +    nop
                +    br IL_0010
                +
                +    IL_003b:  ldloc.2
                +    stloc.0
                +    br IL_0042
                +
                +    ldloc.0
                +    ret
                +}
                +
                +

                If you know a bit of IL, you can see that the generated code is not +optimal, as there are some redundant operations like all those +stloc/ldloc pairs; however, while not optimal, it is still quite good +code, not much different from what you would get by writing the square +algorithm directly in e.g. C#.

                +

                As I said before, all of this is still work in progress and there is +still much to be done. Stay tuned :-).

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-28 20:11: +
                +
                +

                So the mono JIT would pick up that bytecode and further compile it to native code?

                Also, what would be needed for doing the same thing for the JVM?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-05-28 21:28: +
                +
                +

                Yes, that's exactly the idea; in fact, the program run by virtual machines generated this way are double jit-ed.

                Doing the same for the JVM won't be too hard, since most of the work we've done can be shared between the two JIT backends; unfortunately, at the moment the JVM backend is not as advanced as the CLI one, so before working on the JIT we would need more work on it. But indeed, having a JIT backend for the JVM is in our plans.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-29 10:13: +
                +
                +

                Great. Can't wait for advanced piggybacking :)

                +
                +
                +
                + +

                More windows support

                + +
                +

                Recently, thanks to Amaury Forgeot d'Arc and Michael Schneider, Windows became more of a first-class platform for PyPy's Python interpreter. Most RPython extension modules are now considered working (apart from some POSIX specific modules). Even CTypes now works on windows! +

                +Next step would be to have better buildbot support for all supported platforms (Windows, Linux and OS X), so we can control and react to regressions quickly. (Buildbot is maintained by JP Calderone) +

                +Cheers,
                +fijal

                +
                +

                S3-Workshop Potsdam 2008 Writeup

                + +
                +

                Here are some notes about the S3 Workshop in Potsdam that several +PyPyers and Spies (Armin, Carl Friedrich, Niko, Toon, Adrian) attended before +the Berlin sprint. We presented a paper about SPy there. Below are some mostly +random notes about my (Carl Friedrich's) impressions of the conference and some +talk notes. Before that I'd like to give thanks to the organizers who did a +great job. The workshop was well organized, and the social events were wonderful (a +very relaxing boat trip in the many lakes around Potsdam and a conference +dinner).

                +

                Video recordings of all the talks can be found on the program page.

                +
                +

                Invited Talks

                +

                "Late-bound Object Lambda Architectures" by Ian Piumarta was quite an inspiring +talk about VPRI's attempt at writing a flexible and understandable computing +system in 20K lines of code. The talk was lacking a bit in technical details, so +while it was inspiring I couldn't really say much about their implementation. +Apart from that, I disagree with some of their goals, but that's the topic of +another blog post.

                +

                "The Lively Kernel – A Self-supporting System on a Web Page" by Dan Ingalls. Dan +Ingalls is one of the inventors of the original Smalltalk and of Squeak. He was +talking about his latest work, the attempts of bringing a Squeak-like system to +a web browser using JavaScript and SVG. To get some feel for what exactly The +Lively Kernel is, it is easiest to just try it out (only works in Safari +and Firefox 3 above Beta 5 though). I guess in a sense the progress of the +Lively Kernel over Squeak is not that great but Dan seems to be having fun. Dan +is an incredibly enthusiastic, friendly and positive person, it was really great +meeting him. He even seemed to like some of the ideas in SPy.

                +

                "On Sustaining Self" by Richard P. Gabriel was a sort of deconstructivist +multi-media-show train wreck of a presentation that was a bit too weird for my +taste. There was a lot of music, there were sections in the presentation +where Richard discussed with an alter ego, whose part he had recorded in advance +and mangled with a sound editor. There was a large bit of a documentary +about Levittown. Even the introduction and the questions were weird, with Pascal +Costanza staring down the audience, without saying a word (nobody dared to ask +questions). I am not sure I saw the point of the presentation, apart from +getting the audience to think, which probably worked. It seems that there are +people (e.g. Christian Neukirchen) that liked the presentation, though.

                +
                +
                +

                Research Papers

                +

                "SBCL - A Sanely Bootstrappable Common Lisp" by Christophe Rhodes described the +bootstrapping process of SBCL (Steel Bank Common Lisp). SBCL can be bootstrapped +by a variety of Common Lisps, not just by itself. SBCL contains a complete +blueprint of the initial image instead of always getting the new image by +carefully mutating the old one. This bootstrapping approach is sort of similar +to that of PyPy.

                +

                "Reflection for the Masses" by Charlotte Herzeel, Pascal Costanza, and Theo +D'Hondt retraced some of the work of Brian Smith on reflection in Lisp. The +talk was not very good: it was way too long (40 min) and quite hard to understand +because Charlotte Herzeel was talking in a very low voice. The biggest mistake +in her talk was in my opinion that she spent too much time explaining a more or +less standard meta-circular interpreter for Lisp and then ran out of time +when she was trying to explain the modifications. I guess it would have been a +fair assumption that large parts of the audience know such interpreters, so +glossing over the details would have been fine. A bit of a pity, since the paper +seems interesting.

                +

                "Back to the Future in One Week - Implementing a Smalltalk VM in PyPy" +by Carl Friedrich Bolz, Adrian Kuhn, Adrian Lienhard, Nicholas D. Matsakis, +Oscar Nierstrasz, Lukas Renggli, Armin Rigo and Toon Verwaest, the paper with +the longest author list. We just made everybody an author who was at the sprint +in Bern. Our paper had more authors than all the other papers together :-). I +gave the presentation at the workshop, which went quite well, judging from the +feedback I got.

                +

                "Huemul - A Smalltalk Implementation" by Guillermo Adrián Molina. Huemul is a +Smalltalk implementation that doesn't contain an interpreter but directly +compiles all methods to assembler (and also saves the assembler in the image). +In addition, as much functionality (such as threading, GUI) as possible is +delegated to libraries instead of reimplementing them in Smalltalk +(as e.g. Squeak is doing). The approach seems to suffer from the usual problems +of manually writing a JIT, e.g. the VM seems to segfault pretty often. Also I +don't agree with some of the design decisions of the threading scheme, there is +no automatic locking of objects at all, instead the user code is responsible for +preventing concurrent accesses from messing up things (which even seems to lead +to segfaults in the default image).

                +

                "Are Bytecodes an Atavism?" by Theo D'Hondt argued that AST-based +interpreters can be as fast as bytecode-based interpreters, which he demonstrated by +writing two AST-interpreters, one for Pico and one for Scheme. Both of these +implementations seem to perform pretty well. Theo seems to share many +views with PyPy, for example that writing simple straightforward interpreters is +often preferable to writing complex (JIT-)compilers.

                +
                +
                +

                Berlin Sprint Finished

                + +
                +

                The Berlin sprint is finished, below some notes on what we worked on during +the last three days:

                +
                +
                  +
                • Camillo worked tirelessly on the gameboy emulator with some occasional input +by various people. He is making good progress, some test ROMs run now on the +translated emulator. However, the graphics are still not completely working +for unclear reasons. Since PyBoy is already taken as a project name, we +considered calling it PyGirl (another name proposition was "BoyBoy", but the +implementation is not circular enough for that).
                • +
                +
                +
                +
                  +
                • On Monday Armin and Samuele fixed the problem with our multimethods so that +the builtin shortcut works again (the builtin shortcut is an optimization +that speeds up all operations on builtin non-subclassed types quite a bit).
                • +
                • Antonio and Holger (who hasn't been on a sprint in a while, great to have you +back!) worked on writing a conftest file (the plugin mechanism of py.test) +that would allow us to run Django tests using py.test, which seems to be not +completely trivial. They also fixed some bugs in PyPy's Python interpreter, +e.g. related to dictionary subclassing.
                • +
                • Karl started adding sound support to the RPython SDL-bindings, which will be +needed both by the Gameboy emulator and eventually by the SPy VM.
                • +
                • Armin and Maciek continued the work that Maciek had started a while ago of +improving the speed of PyPy's IO operations. In the past, doing IO usually +involved copying lots of memory around, which should have improved now. Armin +and Maciek improved and then merged the first of the two branches that +contained IO improvements, which speeds up IO on non-moving GCs (mostly the +Boehm GC). Then they continued working on the hybrid-io branch which is +supposed to improve IO on the hybrid GC (which was partially designed exactly +for this).
                • +
                • Toon and Carl Friedrich finished cleaning up the SPy improvement branch and +fixed all warnings that occur when you translate SPy there. An obscure bug in +an optimization prevented them from getting working executables, which at +this moment blocks the merging of that branch.
                • +
                +
                +

                By now everybody is home again (except for Anto, who booked his return flight +two days too late, accidentally) and mostly resting. It was a good sprint, with +some interesting results and several new people joining. And it was definitely +the most unusual sprint location ever :-).

                +
                +

                Berlin Sprint Day 1 + 2

                + +
                +

                After having survived the S3-Workshop which took place in Potsdam on Thursday +and Friday (a blog-post about this will follow later) we are now sitting in the +c-base in Berlin, happily sprinting. Below are some notes on what progress we +made so far:

                +
                +
                  +
                • The Gameboy emulator in RPython that Camillo Bruni is working on for his +Bachelor project at Uni Bern does now translate. It took him (assisted by +various people) a while to figure out the translation errors (essentially +because he wrote nice Python code that passed bound methods around, which the +RTyper doesn't completely like). Now that is fixed and the Gameboy emulator +translates and runs a test ROM. You cannot really see anything yet, because +there is no graphics support in RPython.
                • +
                • To get graphics support in RPython Armin and Karl started writing SDL +bindings for RPython, which both the Gameboy emulator and the SPy VM need. +They have basic stuff working, probably enough to support the Gameboy +already.
                • +
                • Alexander, Armin, Maciek and Samuele discussed how to approach separate +compilation for RPython, which isn't easy because the RPython type analysis +is a whole-program analysis.
                • +
                • Stephan, Peter and Adrian (at least in the beginning) worked on making PyPy's +stackless module more complete. They added channel preferences which +change details of the scheduling semantics.
                • +
                • Toon, Carl Friedrich and Adrian (a tiny bit) worked on SPy. There is a branch +that Toon started a while ago which contains many improvements but is also +quite unclear in many respects. There was some progress in cleaning that up. +This involved implementing the Smalltalk process scheduler (Smalltalk really +is an OS). There is still quite some work left though. While doing so, we +discovered many funny facts about Squeak's implementation details (most of +which are exposed to the user) in the process. I guess we should collect them +and blog about them eventually.
                • +
                • Samuele and Maciek improved the ctypes version of pysqlite that Gerhard +Häring started.
                • +
                • Armin, Samuele and Maciek found an obscure bug in the interaction between the +builtin-type-shortcut that Armin recently implemented and our multimethod +implementation. It's not clear which of the two is to blame, and it +seems rather unclear how to fix the problem: Armin and Samuele have been stuck in a +discussion about how to approach a solution for a while and are hard to +talk to.
                • +
                • Stijn Timbermont, a Ph.D. student at the Vrije Universiteit Brussel who is +visiting the sprint for two days was first looking at how our GCs are +implemented to figure out whether he can use PyPy for some experiments. The +answer to that seems to be no. Today he was hacking on a Pico interpreter +(without knowing too much about Python) and is making some nice progress, it +seems.
                • +
                +
                +

                Will try to blog more as the sprint progresses.

                +
                +

                General performance improvements

                + +
                +

                Hi all,

                + +

                During the past two weeks we invested some more effort in the +baseline performance of pypy-c. Some of the tweaks we did +were just new ideas, and others were based on actual +profiling. The net outcome is that we now expect PyPy to be +in the worst case twice as slow as CPython on real +applications. Here are some small-to-medium-size benchmark +results. The number is the execution time, normalized to 1.0 +for CPython 2.4:

                + +

                +
                  +
                • +1.90 on templess (a simple templating language)
                • +
                • +1.49 on gadfly (pure Python SQL database)
                • +
                • +1.49 on translate.py (pypy's own translation toolchain)
                • +
                • +1.44 on mako (another templating system)
                • +
                • +1.21 on pystone
                • +
                • +0.78 on richards
                • +
                +

                (This is all without the JIT, as usual. The JIT is not ready yet.)

                + +

                You can build yourself a pypy-c with this kind of speed with +the magic command line (gcrootfinder is only for a 32-bit +Linux machine):

                + +

                +
                    pypy/translator/goal/translate.py --gc=hybrid --gcrootfinder=asmgcc targetpypystandalone --allworkingmodules --faassen
                + +

                The main improvements come from: + +

                +
                  +
                • A general shortcut for any operation between built-in objects: +for example, a subtraction of two integers or floats now dispatches +directly to the integer or float subtraction code, without looking up +the '__sub__' in the class.
                • +
                • A shortcut for getting attributes out of instances of user classes +when the '__getattribute__' special method is not overridden.
                • +
                • The so-called Hybrid Garbage Collector is now a +three-generations collector. + +More about our GCs... +
                • +
                • Some profiling showed bad performance in our implementation of +the built-in id() -- a trivial function to write in CPython, but a lot +more fun when you have a moving GC and your object's real address can +change.
                • +
                • The bytecode compiler's parser had a very slow linear search +algorithm that we replaced with a dictionary lookup.
                • +
                +

                These benchmarks are doing CPU-intensive operations. You can expect +a similar blog post soon about the I/O performance, as the +io-improvements branch gets closer to being merged +:-) The branch could also improve the speed of +string operations, as used e.g. by the templating systems.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-10 20:07: +
                +
                +

                We had the same problem with id() (called object_id()) in Rubinius. We currently hide an object's ID inside its metaclass (allocating one if there isn't one).

                Where did you guys store it?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-11 00:48: +
                +
                +

                The ID is stored in a special dictionary (a normal dictionary specialized to be allocated so that the GC won't see it) that is used in the GC as a mapping from addresses to integers. This dict is updated when necessary (usually when collecting).

                +
                +
                +
                +
                + + Unknown wrote on 2008-05-11 06:56: +
                +
                +

                Wow. That sure is nice.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-11 07:27: +
                +
                +

                My my, that must be a huge dictionary.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-11 09:12: +
                +
                +

                The dictionary is of course only filled for objects that were used in an id() call.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-05-11 09:19: +
                +
                +

                There are actually two dictionaries, at least when using one of the generational GCs: one for the first generation objects and one for the rest. The dictionary for the rest of the objects can probably get quite large, but it needs to be traversed once during each full collection only. It seems that full collections are rare enough: the full dictionary updating doesn't stand out in profiled runs.

                I didn't think about implementing id() at the language level, e.g. by extending the class of the object to add a field.
                We can't really do that in RPython. Moreover, that seems impractical for Python: if someone asks for the id() of an integer object, do all integers suddenly need to grow an 'id' field?

                +
                +
                +
                +
                + + Daivd wrote on 2008-05-11 09:41: +
                +
                +

                Great work!

                I have a few questions not answered by the FAQ that I hope someone will be able to answer.

                When might the JIT be ready enough? (no stress, just asking :)

                How much faster are CPython 2.5, 2.6 and 3.0? That seems to be relevant to the statement "we now expect PyPy to be in the worst case twice as slow than CPython".

                If I understand correctly, one of the purposes of PyPy is to make experimentation easier - so will making it compatible with 3.0 be fairly easy? Are there plans to do so?

                Is PyPy expected to one day become a serious "competitor" to CPython, in that you might want to run it in production? Is there a time set for when it will be ready for use by the general public (i.e me ;)?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-05-11 10:19: +
                +
                +

                So, answering questions one by one:

                JIT will be ready when it'll be ready, not earlier.

                CPython 2.5 is slightly faster for some operations. No real difference there. 2.6 was optimized for certain operations, but as well, don't expect a huge difference. I think you can expect pypy to be in range of 2x for any cpython. It is not even clear what 3.0 will look like, but certainly being ultra fast is not its primary goal.

                Regarding making pypy compatible with 3.0 - yes, that should be fairly easy although we don't have any immediate plans doing that.

                The final date for making pypy production ready is not set (and this is a gradual process), but as you can see here and here we're trying more and more to make it run existing applications.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-11 10:43: +
                +
                +

                Note that current benchmarks suggest that CPython 3.0 is yet much slower than CPython 2.x. It might be interesting to see whether this means that PyPy is much faster than CPython 3.0 running e.g. Pystone.
                Of course this fact would not be very surprising, esp. given that PyPy does not implement any CPy3k features.

                +
                +
                +
                +
                + + Luis wrote on 2008-10-12 22:43: +
                +
                +

                "JIT will be ready when it'll be ready, not earlier."

                Alright, alright... we know.
                But could you at least give us a very rough estimation for us, mere mortals? What does your heart tell you? :-)

                +
                +
                +
                +
                + + Spencer wrote on 2009-11-02 18:04: +
                +
                +

                What kind of computations are done in richards? I.e., what sort of applications can expect better performance in PyPy than in CPy?

                +
                +
                +
                + +

                Next Sprint: Berlin, May 17-22nd

                + +
                +

                Our next PyPy sprint will take place in the crashed c-base space station, Berlin, Germany, Earth, Solar System. This is a fully public sprint: newcomers (from all planets) are welcome. Suggestion of topics (other topics are welcome too):

                + +
                  +
                • work on PyPy's JIT generator: we are refactoring parts of the + compiling logic, in ways that may also allow generating better + machine code for loops (people or aliens with knowledge on + compilers and SSA, welcome) + +
                • +
                • work on the SPy VM, PyPy's Squeak implementation, particularly the + graphics capabilities + +
                • +
                • work on PyPy's GameBoy emulator, which also needs graphics support + +
                • +
                • trying some large pure-Python applications or libraries on PyPy and + fixing the resulting bugs. Possibilities are Zope 3, Django and + others. +
                • +
                +

                For more information, see the full announcement. +

                +
                +

                Google's Summer of Code

                + +
                +

                PyPy got one proposal accepted for Google's Summer of Code under the Python +Software Foundation's umbrella. We welcome Bruno Gola into the PyPy +community. He will work on supporting all Python 2.5 features in PyPy and will +also update PyPy's standard library to support the modules that were modified +or new in Python 2.5.

                +

                Right now PyPy supports only Python 2.4 fully (some Python 2.5 features have +already sneaked in, though).

                +
                +
                +
                +
                + + Anonymous wrote on 2008-04-22 18:47: +
                +
                +

                Hello,

                I'm very glad to work on PyPy project this summer (ok, this winter here in Brazil =))!

                I hope this project helps to bring more people to the PyPy project, both users and developers =)

                Thanks Google, PSF, PyPy and Carl for mentoring it!

                Bruno

                +
                +
                +
                + +

                Float operations for JIT

                + +
                +

                Recently, we taught the JIT x86 backend how to produce code for the x87 floating point coprocessor. This means that the JIT is able to nicely speed up float operations (this is not true for our Python interpreter yet - we did not integrate it yet). This is the first time we started going beyond what is feasible in psyco - it would take a lot of effort to make floats work on top of psyco, way more than it will take on PyPy. +

                +This work is in a very early stage and lives on a jit-hotpath branch, which includes all our recent experiments on JIT compiler generation, including tracing JIT experiments and a huge JIT refactoring. +

                +Because we don't encode the Python's semantics in our JIT (which is really a JIT generator), it is expected that our Python interpreter with a JIT will become fast "suddenly", when our JIT generator is good enough. If this point is reached, we would also get fast interpreters for Smalltalk or JavaScript with relatively low effort. +

                +Stay tuned. +

                + +Cheers,
                +fijal

                +
                +
                +
                +
                + + Michael Foord wrote on 2008-04-17 14:22: +
                +
                +

                Having a fast implementation of Ruby written in Python would be very cool. :-p

                +
                +
                +
                +
                + + René Dudfield wrote on 2008-04-18 07:29: +
                +
                +

                Super cool!

                Are you going to add SIMD stuff to the i386 backend?

                Which is the main backend at the moment? LLVM?

                cheers,

                +
                +
                +
                +
                + + jlg wrote on 2008-04-18 10:22: +
                +
                +

                It would be amazing to run SciPy on PyPy with the JIT when this will be ready.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-04-19 04:20: +
                +
                +

                I'm interested in the choice of x87 as well. My understanding was that Intel (at least) was keeping x87 floating point around because of binary applications but that for single element floating point the SSE single-element instructions were the preferred option on any processor which supports SSE. (Unfortunately since they've got such different styles of programming I can understand if it's just that "older chips have to be supported, and we've only got enough programming manpower for 1 implementation".)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-04-20 16:28: +
                +
                +

                x87 because it's simpler and better documented. Right now it would be ridiculously easy to reimplement it using SSE.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-04-21 11:30: +
                +
                +

                The main backend is the one for 386. We have no working LLVM JIT backend: although llvm advertizes supporting JIT compilation, what it really provides is a regular compiler packaged as a library that can be used at run-time. This is only suitable for some kinds of usages; for example, it couldn't be used to write a Java VM with good just-in-time optimizations (which need e.g. quick and lazy code generation and regeneration, polymorphic inline caches, etc.)

                +
                +
                +
                + +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-6.html b/blog/index-6.html new file mode 100644 index 000000000..90713cf86 --- /dev/null +++ b/blog/index-6.html @@ -0,0 +1,1265 @@ + + + + + + +PyPy (old posts, page 6) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                JIT in Prolog

                + +
                +

                Hi all,

                + +

                Some news from the JIT front. Progress on the JIT has been low-profile +in the past few months. No big results to announce yet, but we have +played with some new ideas, and they are now documented as a draft +research paper: Towards Just-In-Time Compilation and Specialisation of Prolog.

                + +

                Prolog? Yes. To understand this slightly unusual choice of programming +language, here is first some background about our JIT.

                + +

                PyPy contains not a JIT but a JIT generator, which means that we +only write an interpreter for a language (say, the complete Python +language), and we get a JIT "for free". More precisely, it's not for +free: we had to write the JIT generator, of course, as well as some +amount of subtle generic support code. The JIT generator preprocesses +the (complete Python) interpreter that we wrote and links the result +with the generic support code; the result is a (complete Python) JIT.

                + +

                The way that this works so far gives us a generated JIT that is very +similar to Psyco in the way +it works. +But Psyco has issues (and so the current PyPy JITs have the same issues): +it can sometimes produce too much machine code, +e.g. by failing to notice that two versions of the machine code are +close enough that they should really be one; and it can also sometimes +fail in the opposite way, by making a single sub-efficient version of +the machine code instead of several efficient specialized versions.

                + +

                A few months ago we chose to experiment with improving this +instead of finishing and polishing what we had so far. The choice was +mostly because we were (and still are) busy finishing and polishing +everything else in PyPy, so it was more fun to keep at least the JIT on +the experimental side. Besides, PyPy is now getting to a rather good +and complete state, and it is quite usable without the JIT already.

                + +

                Anyway, enough excuses. Why is this about Prolog?

                + +

                In PyPy, both the (complete Python) interpreter and the JIT support code +are in RPython. Now RPython is not +an extremely complicated language, but still, it is far from the top on a +minimalism scale. In general, this is good in practice (or at least I +think so): it gives +a reasonable balance because it is convenient to write interpreters +in RPython, while not being so bloated that it makes our translation +toolchain horribly complicated (e.g. writing garbage collectors for +RPython - or even JIT generators - is reasonable). Still, it is not the +best choice for early research-level experimentation.

                + +

                So what we did instead recently is hand-write, in Prolog, a JIT that +looks similar to what we would like to achieve for RPython with our JIT +generator. This gave much quicker turnaround times than we were used to +when we played around directly with RPython. We wrote tiny example +interpreters in Prolog (of course not a complete Python interpreter). +Self-inspection is trivial in Prolog, and generating Prolog code at +runtime is very easy too. Moreover, many other issues are also easier +in Prolog: for example, all data structures are immutable "terms". +Other languages than Prolog would have worked, too, but it happens to be +one that we (Carl Friedrich, Michael Leuschel and myself) are familiar +with -- not to mention that it's basically a nice small dynamic +language.

                + +

                Of course, all this is closely related to what we want to do in PyPy. +The fundamental issues are the same. Indeed, in PyPy, the major goals +of the JIT are to remove, first, the overhead of allocating objects all +the time (e.g. integers), and second, the overhead of dynamic dispatch +(e.g. finding out that it's integers we are adding). The equivalent +goals in Prolog are, first, to avoid creating short-lived terms, and +second, to remove the overhead of dispatch (typically, the dispatching +to multiple clauses). If you are familiar with Prolog you can find more +details about this in the paper. So far we already played with many possible solutions +in the Prolog JIT, and the paper describes the most mature one; we have +more experimentation in mind. The main point here is that these are +mostly language-independent techniques (anything that works both in +Prolog and in RPython has to be language-independent, right? :-)

                + +

                In summary, besides the nice goal of speeding up Prolog, we are trying +to focus our Prolog JIT on the issues and goals that have equivalents in +the PyPy JIT generator. So in the end we are pretty convinced that it +will give us something that we can backport to PyPy -- good ideas about +what works and what doesn't, as well as some concrete algorithms.

                +
                +
                +
                +
                + + Shalabh wrote on 2008-06-30 21:00: +
                +
                +

                What is the reason you would back-port the Prolog implementation to RPython, and not make Prolog itself the standard language for implementing the JIT?

                +
                +
                +
                +
                + + Michael Foord wrote on 2008-06-30 21:47: +
                +
                +

                That sounds like the great subject of a thesis for Carl. :-)

                Congratulations guys.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-06-30 22:46: +
                +
                +

                shalabh: because (hopefully) porting back to rpython is saner than porting all of our interpreter (including modules) to prolog.

                +
                +
                +
                +
                + + nekto0n wrote on 2008-07-01 00:22: +
                +
                +

                A bit unusual approach =)
                Hope it'll help...

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-01 00:37: +
                +
                +

                What about making PyPy useful?

                There's still a need for a python compiler, but so far, you can't run standard libraries (eg PyObjC) and you run slower than cPython. -- Even Javascript is faster than you (squirrelfish).

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-01 02:12: +
                +
                +

                One thing I've never quite understood: how will the JIT-generation transform interact with more traditional optimization schemes?

                Concrete example: say in a function I want to perform some algebraic reductions of math operations which will change a lot of the instructions. Since the JIT generation turns the interpreter into a JIT, presumably I have to write the optimization at the interpreter level.

                I can see how that could work for the simplest kind of optimizations (special cases should be specialized at runtime after they go green, if I understand the rainbow colour scheme.)

                I don't see yet how the more complex optimizations I'd write on static, fixed-type code will look in this context. IIUC at interpreter level I can only access the JIT's observations via tests like "if type(a) == FloatType" which should be filled after they're known-- but that's inside the function itself, and I don't see how to access that information from anything outside.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-07-01 16:31: +
                +
                +

                dsm: This is a two-level approach, corresponding to two levels of optimisations that are useful for dynamic languages like Python: the "high level" is the unboxing and dispatching removing that I describe in the post (which by itself can give something like a factor 50-100 speed-up in the best cases). Traditional "low level" optimisations can be performed on top of that, by optimising the generated code that comes out of the "high level" (and this could give another 2-4x speed-up, i.e. the same difference as between "gcc" and "gcc -O3").

                In this Prolog experiment we are only focusing on how to get the high level optimisations.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-02 00:20: +
                +
                +

                The references in the paper are not properly numbered -- any idea if it could be fixed?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-07-02 00:39: +
                +
                +

                Michel: Thanks for noticing, it should be fixed.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-02 09:34: +
                +
                +

                Could you possibly profit from generating a JIT compiler for Lua (www.lua.org) and compare it to Mike Pall's Lua-Jit (https://luajit.org/)?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-03 09:48: +
                +
                +

                While the paper was too difficult for me to understand fully, it was still an interesting read and I appreciate you posting it.

                +
                +
                +
                +
                + + Unknown wrote on 2008-09-26 02:04: +
                +
                +

                FYI: There is a project called Pyke which adds Prolog-like inferencing to Python. This integrates with Python allowing you to include Python code snippets in your rules.

                Don't know if this would be useful, but you can check it out at https://pyke.sourceforge.net.

                +
                +
                +
                +
                + + Unknown wrote on 2009-01-09 23:51: +
                +
                +

                Shalabh: It's also important to note 3 big benefits of implementing a language in the language itself, or a subset thereof ("turtles all the way down").

                (1) Debugging and testing tools for programs written in the language then (hopefully) also work for debugging and testing the language implementation with minimal (or no) modification. This also HUGELY lowers the bar for ordinary users of the language to find and fix implementation bugs. This isn't a fault of Prolog, but 99.99% of Python users won't touch a Prolog debugger with a 10-foot pole.

                (2) The largest pool of people most interested in improving the language is presumably the expert heavy users of the language. Forcing them to learn a new language and/or implement the language in a language outside their expertise is a large disadvantage.

                (3) The difference between language builtins and user code is reduced. Also, it forces certain powerful constructs to (at times) be exposed in the language when they might otherwise only be exposed in the implementation language. Also, with "turtles all the way down", performance improvements in the language itself also often apply to the language builtins, which increases the benefit of improvements, which is important in the cost/benefit analysis for undertaking the performance improvements in the first place. Having "turtles all the way down" makes some optimizations worthwhile that otherwise would be too much trouble to implement.

                +
                +
                +
                + +

                PyPy code swarm

                + +
                +

                Following the great success of code_swarm, I recently produced a +video that shows the commit history of the PyPy project.

                +

                The video shows the commits under the dist/ and branch/ +directories, which is where most of the development happens.

                +

                In the first part of the video, you can see clearly our sprint based +approach: the video starts in February 2003, when the first PyPy +sprint took place in Hildesheim: after a lot of initial activity, few +commits happened in the next two months, until the second PyPy sprint, +which took place in Gothenburg in late May 2003; around the minute +0:15, you can see the high commit rate due to the sprint.

                +

                The next two years follow more or less the same pattern: very high +activity during sprints, followed by long pauses between them; the +most interesting breaking point is located around the minute 01:55; +it's January 2005, and when the EU project starts, the number of +commits just explodes, as well as the number of people involved.

                +

                I also particularly appreciated minute 03:08 aka March 22, 2006: it's +the date of my first commit to dist/, and my nickname magically +appears; but of course I'm biased :-).

                +

                The soundtrack is NIN - Ghosts IV - 34: thanks to xoraxax for +having added the music and uploaded the video.

                +
                PyPy Codeswarm from solse@trashymail.com on Vimeo. +
                +
                +
                +
                + + nekto0n wrote on 2008-06-27 13:49: +
                +
                +

                Niiice =)

                +
                +
                +
                +
                + + akuhn wrote on 2008-06-28 11:33: +
                +
                +

                Question: in case of pair programming, whose name is shown? both names?

                +
                +
                +
                +
                + + Michael Hudson-Doyle wrote on 2008-06-29 23:50: +
                +
                +

                Cool. There was less of a drop off after the eu project ended than I expected!

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-30 11:29: +
                +
                +

                It was cool to see the sprint effects as well

                Cheers

                Bea

                +
                +
                +
                +
                + + Anonymous wrote on 2008-09-06 01:49: +
                +
                +

                The codeswarm seems to have moved:

                https://www.vimeo.com/1241231

                +
                +
                +
                + +

                Funding of some recent progress by Google's Open Source Programs

                + +
                +

                As readers of this blog already know, PyPy development has +recently focused on getting the code base to a more usable state. One +of the most important parts of this work was creating an +implementation of the ctypes module for PyPy, which +provides a realistic way to interface with external libraries. The +module is now fairly complete (if somewhat slow), and has generated a +great deal of community interest. One of the main reasons this work +progressed so well was that we received funding from Google's Open +Source Programs Office. This is +really fantastic for us, and we cannot thank Google and Guido enough for helping PyPy progress +more rapidly than we could have with volunteer-only time!

                +

                This funding opportunity arose from the PyPy US road trip at the end +of last year, which included a visit to Google. You +can check out the video +of the talk we gave during our visit. We wrapped up our day with +discussions about the possibility of Google funding some PyPy work and +soon after we were at work on the proposal for improvements we'd +submitted.

                +

                One nice side-effect of the funding is indeed that we can use some of +the money for funding travels of contributors to our sprint meetings. +The next scheduled Google funding proposal also aims at making our +Python interpreter more usable and compliant with CPython. This will be done by trying to +fully run Django on top of PyPy. With +more efforts like this one we're hoping that PyPy can start to be used +as a CPython replacement before the end of 2008.

                +

                Many thanks to the teams at merlinux and Open End for making this development possible, including +Carl Friedrich Bolz, Antonio Cuni, Holger Krekel, Maciek Fijalkowski +at merlinux, Samuele Pedroni and yours truly at Open End.

                +

                We always love to hear feedback from the community, and you can get +the latest word on our development and let us know your thoughts here in the comments.

                +

                Bea Düring, Open End AB

                + +

                PS: Thanks Carl Friedrich Bolz for drafting this post.

                +
                +
                +
                +
                + + Bill Mill wrote on 2008-06-26 14:26: +
                +
                +

                congratulations! that's awesome.

                +
                +
                +
                +
                + + Christopher Armstrong wrote on 2008-06-26 14:52: +
                +
                +

                Congratulations, guys!

                +
                +
                +
                +
                + + nekto0n wrote on 2008-06-26 14:55: +
                +
                +

                That's great! I like that this project is getting bigger, growing faster :)
                I wish I could help, but don't know where to start :-[

                +
                +
                +
                +
                + + Brandon Corfman wrote on 2008-06-26 15:18: +
                +
                +

                I've been hard on Guido in the past for not throwing more support behind PyPy, and I'm very glad now to hear that Guido (and Google) are demonstrating its importance. Thanks all.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-26 16:06: +
                +
                +

                Wow, I am actually more excited by hearing that pypy will be a partial cpython replacement this year than by the google money. Pypy is the most interesting project going on right now in the python world.

                +
                +
                +
                +
                + + Unknown wrote on 2008-06-26 16:27: +
                +
                +

                Wow, this should be quite interesting.

                JT
                https://www.Ultimate-Anonymity.com

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-26 20:50: +
                +
                +

                Congrats. I'm very glad to keep hearing about efforts to make PyPy usable with real-world applications and frameworks. The PyPy project is starting to send out positive signals, and this is something I've been waiting for.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-26 22:52: +
                +
                +

                "With more efforts like this one we're hoping that PyPy can start to be used as a CPython replacement before the end of 2008."

                Out of curiousity, are there good reasons for anyone to want to do that?

                +
                +
                +
                + +

                Pdb++ and rlcompleter_ng

                + +
                +

                When hacking on PyPy, I spend a lot of time inside pdb; thus, I tried +to create a more comfortable environment where I can pass my nights +:-).

                +

                As a result, I wrote two modules:

                +
                +
                  +
                • +pdb.py, which extends the default behaviour of pdb, by adding +some commands and some fancy features such as syntax highlight and +powerful tab completion; pdb.py is meant to be placed somewhere in +your PYTHONPATH, in order to override the default version of pdb.py +shipped with the stdlib;
                • +
                • +rlcompleter_ng.py, whose most important feature is the ability +to show coloured completions depending on the type of the objects.
                • +
                +
                +

                To find more information about those modules and how to install them, +have a look at their docstrings.

                +

                It's important to underline that these modules are not PyPy specific, +and they work perfectly also on top of CPython.

                + +
                +
                +
                +
                + + Brodie Rao wrote on 2008-06-22 20:57: +
                +
                +

                That's pretty impressive, but I think having to modify readline itself in order to do this is a little excessive. readline's completion capabilities are pretty limited. I wonder if there are any better alternatives that could be used with Python.

                I have something similar set up for my Python prompt: https://bitheap.org/hg/dotfiles/file/tip/.pythonrc.py -- it allows completion and indentation, it persists command history with readline, and it prints documentation if you try to evaluate certain objects like functions, classes, and methods. It also pretty-prints output, but I'm still trying to tweak it so it's aware of the terminal width.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-06-23 12:32: +
                +
                +

                yes, I agree that having to modify readline is not too nice. I tried hard to avoid this but with bad luck :-/.

                I suppose I could try to reimplement readline in Python, but I think it would be too much work; if you are aware of something already done, please let me know :-).

                +
                +
                +
                +
                + + Pachi wrote on 2008-06-23 12:58: +
                +
                +

                would this work be suitable to inclusion in the standard pdb module?. That would be awesome.
                Thanks!

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-06-23 16:25: +
                +
                +

                There is readline implementation on top of pyrepl in pypy already :) PyPy by default does not use readline, but just uses this.

                +
                +
                +
                +
                + + Paul wrote on 2008-06-24 22:00: +
                +
                +

                Nice job antonio. I'd clean the code up, conform to new-style classes and proper MRO handling. I'd also think about refactoring some of those names and find something better suited. Overall, awesome job man.

                +
                +
                +
                +
                + + Unknown wrote on 2008-06-26 10:48: +
                +
                +

                This looks great. You've taken a step futher than my own attempts here:
                https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/498182

                Two small comments though: it crashes on startup without the right config in ~/.pdbrc.py and once I got it started I see things like this when completing th tab:

                ^[[000;00m^[[00mtest^[[00m

                but syntax highlighting seems to work perfectly. Thanks!

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-06-27 11:57: +
                +
                +

                @Stephen: as described in the docs of rlcompleter_ng, to use colorized completion you need to use a patched version of readline, there is no chance to get it working without that.

                Could you describe in more details what problem did you encounter with ~/.pdbrc.py, so that I can fix it, please?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-10 20:26: +
                +
                +

                Antonio - I created a minor patch for rlcompleter_ng.py which will allow it to run on both Python 2 and 3.

                https://mikewatkins.ca/2008/12/10/colorized-interpreter/

                +
                +
                +
                +
                + + cool-RR wrote on 2011-05-05 18:39: +
                +
                +

                I was disappointed that pdb++ doesn't work on Windows. Apparently it uses the `termios` module which is not available on Windows.

                +
                +
                +
                + +

                Running Nevow on top of PyPy

                + +
                +

                Another episode of the "Running Real Applications on Top of PyPy" series: +

                +Today's topic: Divmod's Nevow. Nevow (pronounced as the French "nouveau", or "noo-voh") is a web application construction kit written in Python. Which means it's just another web framework, but this time built on top of Twisted. +While, due to some small problems we're not yet able to pass full Twisted test suite on top of pypy-c, Nevow seems to be simple enough to work perfectly (959 out of 960 unit tests passing, with the last one recognized as pointless and about to be deleted). Also, thanks to +exarkun, Nevow now no longer relies on ugly details like refcounting. +

                +As usual, translate pypy using: +

                +
                +translate.py --gc=hybrid --thread targetpypystandalone --faassen --allworkingmodules --oldstyle +
                +
                +Of course, obligatory to the series, screenshot:
                +This is Nevow's own test suite. +

                +Cheers,
                +fijal +
                +
                +
                +
                + + Donovan Preston wrote on 2008-06-26 21:53: +
                +
                +

                Awesome!

                +
                +
                +
                + +

                Next sprint: Vilnius/Post EuroPython, 10-12th of July

                + +
                +

                As happened in the last years, there will be a PyPy sprint just after +EuroPython. The sprint will take place in the same hotel as the +conference, from 10th to 12th of July.

                +

                This is a fully public sprint: newcomers are welcome, and on the first +day we will have a tutorial session for those new to PyPy development.

                +

                Some of the topics we would like to work on:

                +
                +
                  +
                • try out Python programs and fix them or fix PyPy or fix performance bottlenecks
                • +
                • some JIT improvement work
                • +
                • port the stackless transform to ootypesystem
                • +
                +
                +

                Of course, other topics are also welcome.

                +

                For more information, see the full announcement.

                +
                +

                German Introductory Podcast About Python and PyPy

                + +
                +

                During the Berlin Sprint Holger was interviewed by Tim Pritlove for Tim's +Podcast "Chaosradio Express". The whole thing is in German, so only +interesting to German-speakers. The PyPy episode can be found here. The +interview is touching on a lot of topics, starting with a fairly general intro +about what Python is and why it is interesting and then moving to explaining and +discussing PyPy. The bit about PyPy starts after about 45 minutes. There is also +a comment page about the episode.

                +
                +
                +
                +
                + + holger krekel wrote on 2008-06-15 18:54: +
                +
                +

                Thanks CF for linking - i found it actually a fun interview although i was caught a bit in surprise that it focused first a lot on Python-the-language and i didn't feel in evangelising mode.

                And what i again realized is that PyPy is not too well known or understood outside the Python world. Maybe it would help, also for getting some funding, if it were.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-16 10:19: +
                +
                +

                It seems a pity non-German speakers cannot benefit from this. Any chance of an English version?

                +
                +
                +
                +
                + + kriss wrote on 2008-06-16 13:49: +
                +
                +

                Great Podcast, I like your project - have to listen to the podcast a second time though. :-)

                Keep up the good work!

                +
                +
                +
                + +

                Running Pylons on top of PyPy

                + +
                +

                The next episode of the "Running Real Applications on Top of PyPy" series:

                +Yesterday, we spent some time with Philip Jenvey on tweaking Pylons and PyPy to cooperate with each other. While doing this we found some pretty obscure details, but in general things went well. +

                +After resolving some issues, we can now run all (72) Pylons tests on +top of pypy-c compiled with the following command: +

                +
                +translate.py --gc=hybrid --thread targetpypystandalone --faassen --allworkingmodules --oldstyle +
                +
                +and run some example application. Here is the obligatory screenshot (of course +it might be fake, as usual with screenshots). Note: I broke application on purpose to showcase cool debugger, default screen is just boring:
                +Please note that we run example application without DB access, since +we need some more work to get SQLAlchemy run on top of pypy-c together with +pysqlite-ctypes. Just one example of an obscure detail that sqlalchemy is +relying on in the test suite: +

                + class A(object):
                +   locals()[42] = 98 +
                +

                Update:This is only about new-style classes. +

                +This works on CPython and doesn't on PyPy.

                +Cheers,
                +fijal +
                +
                +
                +
                + + Anonymous wrote on 2008-06-11 00:46: +
                +
                +

                Very good to see this work! This is a good thing to be trying and hearing it makes me happy.

                We're busy working on making Zope 3 run on Jython, which should get make some of our C level dependencies optional. These make a port to PyPy harder as well. Zope 3 libraries have umpteen thousands of tests that can be run, so that should give one some coverage. The libraries come packaged separately too.

                The trickiest part would be those bits that depend on the ZODB. Porting the ZODB to PyPy should allow new possibilities, but it'll be hard too, I imagine.

                +
                +
                +
                +
                + + holger krekel wrote on 2008-06-11 12:31: +
                +
                +

                Hi Martijn,

                in fact having zope3 work with pypy would be very nice. i discussed a bit with Phillip and he suggested to first get zope.interface and zope.component to work, then zope.proxy/zope.security. IIRC my first try with zope.interface yielded 3 failures out of 231 tests. I had to hack the test runner a bit to not rely on GC details - i guess that your work for Jython might imply that as well. What is the best way to follow your Jython work, btw?

                best & cheers,
                holger

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-11 12:38: +
                +
                +

                The Jython project is a summer of code project. Georgy Berdyshev is the student and is sending messages to jython-dev.

                Here was a recent status report:

                https://sourceforge.net/mailarchive/forum.php?thread_name=ee8eb53d0806082009g5aec43dbn3da1f35b751cba70%40mail.gmail.com&forum_name=jython-dev

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-11 12:48: +
                +
                +

                I see that the hyperlink to Georgy's report just now got eaten by the comment software. Here it is again, hopefully working this time.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-11 13:38: +
                +
                +

                Georgy Berdyshev is lurking in the #pypy channel (gberdyshev or similar), FWIW.

                +
                +
                +
                +
                + + mike bayer wrote on 2008-06-11 21:50: +
                +
                +

                Let's see who entered that line:

                4361 pje # This proves SA can handle a class with non-string dict keys
                4361 pje locals()[42] = 99 # Don't remove this line!


                pje ? Yes. That PJE. Complete with "don't remove this!"....we'll have to see what mr. guru was up to with that one. This test is also only present in the 0.5 branch which hasn't had alpha releases yet.

                Would love to hear some other examples of "obscure details" the test suite is relying upon...my guess would be extremely few or none besides this one example.

                +
                +
                +
                +
                + + PJE wrote on 2008-06-12 01:10: +
                +
                +

                It tests that SQLAlchemy isn't depending on class dictionaries containing only string keys.

                Unfortunately, this makes the test then depend on the ability to have non-string keys in the class dictionary. ;-)

                The test is to ensure that SQLAlchemy will be able to map objects whose *classes* have AddOns defined.

                By the way, as of PEP 3115, the locals() of a class can be an arbitrary object, so making compile-time assumptions about what *can't* be done with a class' locals() is probably not a good idea.

                Also, as of every existing version of Python>=2.2, a metaclass may add non-dictionary keys to the class dictionary during class.__new__. So, it has never been a valid assumption that class __dict__ keys *must* be strings. If PyPy is relying on that, it is already broken, IMO.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-12 09:57: +
                +
                +

                PJE: What you say is mostly beside the point. PyPy has no problem at all with non-string keys in (old-style) class dicts. The point is more that locals() cannot be used to assign things to this dictionary, see the docs:

                "locals()
                Update and return a dictionary representing the current local symbol table. Warning: The contents of this dictionary should not be modified; changes may not affect the values of local variables used by the interpreter."

                +
                +
                +
                +
                + + PJE wrote on 2008-06-12 11:39: +
                +
                +

                Well, if you plan on supporting, say, Zope or Twisted, you'll need to support modifying class-body frame locals.

                There really isn't any point to optimizing them, not only due to PEP 3115, but also due to pre-3115 metaclasses. (And just the fact that most programs don't execute a lot of class suites in tight loops...)

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-12 17:34: +
                +
                +

                Zope does things like:

                frame = sys.getframe(1)
                frame.f_locals['foo'] = bar

                It does this to make zope.interface.implements() work, among other things. This allows you to the following:

                # IFoo is actually an instance, not a
                # class
                class IFoo(zope.interface.Interface):
                  pass

                class Myclass:
                  # stuffs information in the class
                  zope.interface.implements(IFoo)

                The martian library (which Grok uses) actually generates this into its directive construct.

                Some of this stuff could become class decorators in the future, I imagine, but we're stuck supporting this future for the forseeable future as well.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-12 17:40: +
                +
                +

                I didn't "generates this", but "generalizes this". I think PJE's PEAK library also has stuff for this ("class advisors").

                +
                +
                +
                + +

                List comprehension implementation details

                + +
                +

                List comprehensions are a nice feature in Python. They are, however, just +syntactic sugar for for loops. E.g. the following list comprehension:

                +
                +def f(l):
                +    return [i ** 2 for i in l if i % 3 == 0]
                +
                +

                is sugar for the following for loop:

                +
                +def f(l):
                +    result = []
                +    for i in l:
                +        if i % 3 == 0:
                +            result.append(i ** 2)
                +    return result
                +
                +

                The interesting bit about this is that list comprehensions are actually +implemented in almost exactly this way. If one disassembles the two functions +above one gets sort of similar bytecode for both (apart from some details, like +the fact that the append in the list comprehension is done with a special +LIST_APPEND bytecode).

                +

                Now, when doing this sort of expansion there are some classical problems: what +name should the intermediate list get that is being built? (I said classical +because this is indeed one of the problems of many macro systems). What CPython +does is give the list the name _[1] (and _[2]... with nested list +comprehensions). You can observe this behaviour with the following code:

                +
                +$ python
                +Python 2.5.2 (r252:60911, Apr 21 2008, 11:12:42)
                +[GCC 4.2.3 (Ubuntu 4.2.3-2ubuntu7)] on linux2
                +Type "help", "copyright", "credits" or "license" for more information.
                +>>> [dir() for i in [0]][0]
                +['_[1]', '__builtins__', '__doc__', '__name__', 'i']
                +>>> [[dir() for i in [0]][0] for j in [0]][0]
                +['_[1]', '_[2]', '__builtins__', '__doc__', '__name__', 'i', 'j']
                +
                +

                That is a sort of nice decision, since you can not reach that name by any +"normal" means. Of course you can confuse yourself in funny ways if you want:

                +
                +>>> [locals()['_[1]'].extend([i, i + 1]) for i in range(10)]
                +[0, 1, None, 1, 2, None, 2, 3, None, 3, 4, None, 4, 5, None, 5, 6, None, 6, 7, None, 7, 8, None, 8, 9, None, 9, 10, None]
                +
                +

                Now to the real reason why I am writing this blog post. PyPy's Python +interpreter implements list comprehensions in more or less exactly the same way, +with one tiny difference: the name of the variable:

                +
                +$ pypy-c-53594-generation-allworking
                +Python 2.4.1 (pypy 1.0.0 build 53594) on linux2
                +Type "help", "copyright", "credits" or "license" for more information.
                +``the globe is our pony, the cosmos our real horse''
                +>>>> [dir() for i in [0]][0]
                +['$list0', '__builtins__', '__doc__', '__name__', 'i']
                +
                + +

                Now, that shouldn't really matter for anybody, should it? Turns out it does. The +following way too clever code is apparently used a lot:

                +
                +__all__ = [__name for __name in locals().keys() if not __name.startswith('_')
                +               or __name == '_']
                +
                +

                In PyPy this will give you a "$list0" in __all__, which will prevent the +import of that module :-(. I guess I need to change the name to match CPython's.

                +

                Lesson learned: no detail is obscure enough to not have some code depending +on it. Mostly problems on this level of obscurity are the things we are fixing +in PyPy at the moment.

                +
                +
                +
                +
                + + Brandon Rhodes wrote on 2008-06-10 03:09: +
                +
                +

                In fairness, the clever code does not depend on the name looking as it actually does in CPython; the clever code merely expects that variables auto-created by Python internals will begin with an underscore. Which is far more reasonable than actually expecting the specific name "_[1]" (and, wow, you're right, that does look weird; you've shown me something I've never seen before about Python!) to turn up in the variable list.

                +
                +
                +
                +
                + + Unknown wrote on 2008-06-10 06:38: +
                +
                +

                Actually, that piece of code is looking to export only public identifiers, right? It's trying to exclude things prefixed with an underscore that are in the file scope.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-10 07:32: +
                +
                +

                I would have said "Lesson learned: when MIT hackers in the 1960's come up with some funny thing called GENSYM, it's not just because they're weird; it really does serve a purpose". But then I'm an asshole Lisp hacker. :-)

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-10 09:50: +
                +
                +

                anonymous: Using gensym for getting the symbol wouldn't have helped in this case at all. The gensymmed symbol would still have showed up in the locals() dictionary. So depending on whether the gensym implementation returns symbols that start with an underscore or not the same bug would have occured.

                +
                +
                +
                +
                + + TuringTest wrote on 2008-06-10 10:28: +
                +
                +

                Other languages have the capability/design/philosophy to make such implementation details totally unobservable.

                Haskell has list comprehensions which expand into normal code. These cannot expose implementation details or temporary names.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-10 13:26: +
                +
                +

                turingtest: I agree that that would be preferable, but it's sort of hard with the current interpreter design. Also, it's a pragmatic implementation in that the interpreter didn't have to change at all to add the list comps.

                +
                +
                +
                +
                + + arkanes wrote on 2008-06-10 15:48: +
                +
                +

                The code's not overly clever, it's ridiculous, because it exactly duplicates the effects of not having __all__ at all. From foo import * already won't import names prefaced with an underscore. Also from the google code search it looks like it's mostly used in Paste, most of the other hits are false positives.

                The "from foo import *" case (without __all__ defined) is a good enough reason to match the cpython naming, though, the useless code in Paste not withstanding.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-10 17:49: +
                +
                +

                carl: something like GENSYM would still help, since the symbol generated is not accessible from any package.

                That's difference between gensym and mktemp. However, I don't believe that python has the concept of uninterned symbols (someone who knows more about python could correct me).

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-11 12:17: +
                +
                +

                arkanes: no, the "from foo import *" case isn't really changed by the different choice of symbols because the new variable is really only visible within the list comprehension and deleted afterwards. It doesn't leak (as opposed to the iteration variable).

                +
                +
                +
                +
                + + Unknown wrote on 2008-06-12 01:43: +
                +
                +

                arkanes: This is not the same as not having __all__ defined. __all__ would skip the function _() which is used to mark and translate strings with gettext. In other words, it is emulating the default no __all__ behavior and adding in _()

                Carl: doesn't the "$list0" get imported without the all? If not what keeps it from causing a problem normally? Could you not just delete the $list0 variable after assigning it to the LHS?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-12 11:05: +
                +
                +

                chris: yes, deleting this variable is exactly what PyPy does (and CPython as well). That's what I was trying to say in my last post.

                The bug with the __all__ only occurs because locals is called within the list comprehension. After the list comprehension is done there is no problem.

                +
                +
                +
                + +

                Better Profiling Support for PyPy

                + +
                +

                As PyPy is getting more and more usable, we need better tools to use to work on certain applications running on top of PyPy. Out of this interest, I spent some time implementing the _lsprof module, which is a part of the standard library since Python2.5. It is necessary for the cProfile module, which can profile Python programs with high accuracy and a lot less overhead than the older, pure-python profile module. Together with the excellent +lsprofcalltree script, you can display this data using kcachegrind, which gives you great visualization possibilities for your profile data. +

                +Cheers,
                +fijal

                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-7.html b/blog/index-7.html new file mode 100644 index 000000000..989237a2a --- /dev/null +++ b/blog/index-7.html @@ -0,0 +1,877 @@ + + + + + + +PyPy (old posts, page 7) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                Pycon UK, Javascript and the GIL

                + +
                +

                Just got back from Pycon UK 2008 - here are some impressions.

                +Both the keynote speakers Mark Shuttleworth (Canonical) and +Ted Leung (Sun Microsystems) expressed their concerns about +Javascript becoming so fast and prominent that it could displace +Python in the future. They also highlighted the fact that +multi-core systems get cheaper and more popular also on +desktop computers or notebooks. They challenged the community +to advance Python implementations to exploit it. The question came up +what PyPy can do here. As it stands, PyPy still uses the good old +Global Interpreter Lock (GIL) but our approaches should indeed +lend themselves well to experimentation with free threading. +

                +During the 2-day conference we met many interesting people, most +notably the guys from Resolver, among them William Reade who is working on +IronClad -- which implements a fake python25.dll on top of +IronPython. He presented some good results for Numpy in his +lightning talk. This approach is surely something to follow +closely and potentially use for PyPy. +

                +We also had lunch and a couple of chats with Jacob Kaplan-Moss of +Django fame - he apparently plans to try using PyPy's sandboxing features +for one of his projects, cool! +

                +Conference itself was well organized for the 230 attending people - although +the venue might be a bit small for next year's EuroPython. Ah, and +we gave three well attended talks, find the slides here: +

                + +cheers,
                +Holger, Maciej, Anto (associated through merlinux, btw)
                +
                +
                +
                +
                + + Lucian wrote on 2008-09-17 19:36: +
                +
                +

                Is the work done on this (https://code.google.com/p/python-safethread/) useful conceptually?

                +
                +
                +
                +
                + + René Leonhardt wrote on 2008-09-17 21:27: +
                +
                +

                Is the new multiprocessing module going to offer improved multi-core performance?

                +
                +
                +
                +
                + + Colin Walters wrote on 2008-09-17 22:40: +
                +
                +

                Jython? About to hit 2.5, has a threading model for free from the JVM.

                +
                +
                +
                +
                + + Luis wrote on 2008-09-18 00:39: +
                +
                +

                I wonder how the new javascript improvements compare to pypy technically. For example, the tracing techniques of Mozilla's Tracemonkey look impressive, but I don't know if these techniques are conceptually close or not to pypy's. Is there anything you can learn from them (tracemonkey, chrome's v8, etc).

                Luis

                +
                +
                +
                +
                + + Miguel Filipe wrote on 2008-09-19 15:19: +
                +
                +

                ReneL:
                Yes, the new multiprocessing module will improve multi-core performance if you use it.
                That module allows a easy way to use multiple processes cooperatively in python. It tries to mimic the threading API.
                If you use multiple processes instead of threads you will avoid the Global Interpreter Lock.

                About new chalenges for PyPy, multicore isn't the major problem.. the absense of a powerfull JIT and GC is. Please keep working on a super-fast "VM+JIT" for python! (super linux performance is a must)

                +
                +
                +
                +
                + + holger krekel wrote on 2008-09-20 13:57: +
                +
                +

                colin: true, jython and also ironpython can make use of multiple threads. should have mentioned it. Doesn't mean that pypy-c shouldn't go for it, rather the opposite i'd think :)

                renel/multiprocessing module: i can imagine it helps with multi-core cpus. are there practical experiences using it yet?

                luis, miguel: there are considerable efforts on the PyPy/JIT front - particularly from Armin, Carl Friedrich and Antonio - would be worth a dedicated blog post to relate this to tracing JITs, V8, squirrelfish, etc. One thing i know is that we probably want to apply for funding to help completing the JIT.

                Miguel: We do have advanced GCs and are currently working on improving them.

                to all: thanks for your feedback!

                +
                +
                +
                + +

                Düsseldorf PyPy sprint 5-13th October, 2008

                + +
                +

                The PyPy team is happy to announce the next sprint, which will take place in +the Computer Science Department of the University of Düsseldorf, Germany. +Sprinting will start on the 6th of October and go on till the 12th. Please +arrive on the day before if you want to come.

                +

                Topics of the sprint will be aiming at a 1.1 release and to work on integrating PyPy better +with small devices. Other topics are also welcome!

                +

                We will try to find a hotel with group rates, so if you are interested, please +sign up soon! See the announcement for more details.

                +
                +

                pylib/py.test 0.9.2 released

                + +
                +

                PyPy and its 14638 automated tests use the py.test tool which is also used by many other projects. PyPy developers have actually driven and contributed a lot to its development. + +I just released version 0.9.2 of the py lib mainly fixing Windows issues and providing better packaging and integration with setuptools. It's usable completely independently from PyPy - "easy_install py" gives you the py.test command line. Of course you can run py.test on top of a translated PyPy version as well. Here is a quick summary of what the py lib provides besides py.test: +

                +
                  +
                • +py.execnet: ad-hoc code distribution to SSH, Socket and local sub processes
                • +
                • +py.magic.greenlet: micro-threads on standard CPython ("stackless-light") and PyPy
                • +
                • +py.path: path abstractions over local and subversion files
                • +
                • +py.code: dynamic code compile and traceback printing support
                • +
                • tested against Linux, Win32, OSX, works on python 2.3-2.6
                • +
                +Good general entry points for installation and documentation: + +have fun, holger krekel +
                +
                +
                +
                + + Anonymous wrote on 2008-09-02 18:30: +
                +
                +

                We use py.test in the development of the Translate Toolkit and Pootle - many thanks :)

                I use Fedora so here are Fedora RPMs for pylib:
                https://translate.sourceforge.net/releases/testing/fedora/pylib-0.9.2-1.fc9.noarch.rpm

                +
                +
                +
                +
                + + holger krekel wrote on 2008-09-23 15:44: +
                +
                +

                Hi Dwayne!

                thanks a lot. I added a link to your RPM from the download page. Let me know if there was anything that hindered packaging.

                holger

                +
                +
                +
                + +

                New translation option: --opt

                + +
                +

                Hi all,

                + +

                A few command-line options for translate.py have changed. +Most interesting is that optimization levels are selected with +the option --opt, or -O for short. This replaces --allopts, +which was also called --faassen in reference to a person who +is actually not involved in PyPy (so that was a bit of a +strange joke). Also, --allworkingmodules is the default +nowadays, and can be cancelled with --no-allworkingmodules. +Threads are also included in --allworkingmodules now.

                + +

                Examples:

                + +

                +
                  +
                • +translate.py (reasonable default, corresponds to --opt=2) +
                • +
                • +translate.py --opt=3 (best, maybe 10-20% faster) +
                • +
                • +translate.py --opt=1 (translation is faster and less RAM-hungry) +
                • +
                +

                For more information, see: + +

                + +
                +
                +
                +
                + + holger krekel wrote on 2008-08-19 16:48: +
                +
                +

                maybe for a bit of background: Martijn Faassen regularly asked at our talks "how fast is PyPy now?" - at times when PyPy was going from 2000 to 500 to 50 to ???? times slower than CPython (nowadays at 1-6 times, btw). so with "--faassen" we were trying to translate an "as-fast-as-possible" pypy. so now we are getting dead serious (also Martijn actually asked for removing his name from the commandline) and introduced a we-are-becoming-a-real-compiler-with-opt-levels "-O" option :)

                Martijn, to be clear: i really appreciate having you and your questions in our talks and in general - it also pushed me to get out py.test releases ... :) holger

                +
                +
                +
                +
                + + Anonymous wrote on 2008-08-22 02:56: +
                +
                +

                Congrats to PyPy on having a more sensible option! (though an option called 'opt' made me think it stood for 'option' first :).

                Thanks Holger for the background. "actually not involved in PyPy" depends on your interpretation of what the word "involved" means. Besides performance related questions, as Holger indicates I've asked other questions of the PyPy project too. I wasn't altogether successful at it (nor altogether unsuccessful), and I'm on an extended break from asking any questions right now.

                I didn't write any of the PyPy code and also had nothing to do with the design of PyPy. I indeed asked for the --faassen option to be removed earlier this year. It was amusing (and flattering), but it also lead to some confusion concerning credit that I certainly don't deserve - that goes to the PyPy developers and project managers.

                +
                +
                +
                + +

                Europython 2008 PyPy talks and sprint sum up

                + +
                +

                The EuroPython 2008 conference and sprints have finished - it certainly was +a very eventful and successful conference for PyPy. And many very interesting +non-PyPy talks as well. PyPy presentations are available online: PyPy status talk +PyPy for the rest of us, PyPy behind the scenes. Armin and Maciej also did a well-attended +talk about PyPy's garbage collection, but that was quite interactive, no slides. +

                +The talks were all well visited and we got good questions. However, we still +need to work on sorting out the "PyPy technology cloud" and how to present +it to different audiences. Anyway, we are happy to hear feedback or questions +about the talks! +

                +After the conference there was a three-day PyPy sprint. Despite +the fact that most PyPy core developers were zombies, +we made good progress. Particularly our newcomers did very well. +Here are some results: +

                +
                  +
                • itertools rewritten in RPython for performance by Jakub + Gustak and Andrew Durdin
                • + +
                • a new ctypes based dbm and hashlib module, both by Gasper Zejn + with support from Henrik Vendelbo, they also got ctypes to nicely work on OSX. (sorry for lack of proper letters in names :)
                • + +
                • implement builtin function call profiling by Stephan Diehl, Antonio and Armin.
                • + +
                • running + Pinax on top of pypy-c, by Henrik, Holger, Gasper.
                • + +
                • Jim Baker started a _rawffi.py for Jython using JNA aiming + to provide support to run PyPy's ctypes on top of Jython. + When Jython gets this to run, PyPy's JVM backend should be + able to use it. Talk about Code Reuse :)
                • + +
                • oldstyle classes are now the default; this makes + PyPy mimic CPython's 2.5 object model very closely.
                • + +
                • Andrew started a port of the Malbolge + interpreter written in Python to RPython (obviously the only missing + link for PyPy to take over the world).
                • + +
                • various cleanups (a new option "--lonepycfiles" helps with + saner imports, remove int-float comparison shortcuts, ...)
                • +
                +At the end of the sprint we also discussed initial plans for a 1.1 release which we'd like to make happen this year. So we are generally looking forward to a busy rest of 2008 and luckily this starts by many of us taking a good vacation first :)

                + +Cheers,
                +fijal & holger +
                +
                +
                +
                + + Armin Rigo wrote on 2008-07-16 00:20: +
                +
                +

                The option is not --lonepycfiles but --objspace-lonepycfiles, and using it makes imports *less* sane.

                +
                +
                +
                +
                + + holger krekel wrote on 2008-07-16 08:44: +
                +
                +

                oh, right. I meant to say that with the introduction (not the enabling) of the option imports are saner - in that pypy now by default ignores pyc files if there is no ".py" file. thanks for the attention.

                +
                +
                +
                + +

                Finding Bugs in PyPy with a Fuzzer

                + +
                +

                Last week I played a bit with Fusil, which is a fuzzing framework. The idea is +to feed the interpreter code that calls the functions of a module with random values +of various types as arguments in the hope that one hits an unchecked case. This is +done until a problem is hit, the most common problem being a segfault. Victor Stinner, +the author of Fusil, is a regular in the PyPy IRC channel and thankfully helped me +get started with Fusil. I used his project description for CPython as a starting +point and tweaked it a bit. The reason is that PyPy is harder to segfault and so +I tweaked Fusil to also count uncaught RPython-level exceptions as such a problem. +(RPython has full exception support, and if an RPython-exception escapes to the top +level, the Python interpreter aborts. One should not be able to exploit this, +but for a user it is bad enough, because such exceptions cannot be caught from +Python code.)

                +

                Using Fusil I found a number of cases where such exceptions happened (in some +pickle support-code, in the expat parser, in the os and in the termios +module) and also one or two segfaults (in the parser module, of all places). +I fixed all these problems so that by +now the fuzzer just runs for a very long time and only finds things that take +too long (so they count as a way to do a DoS attack) like +pow(12355123123L, 12351512123121L) or round(1, 1000000000) (the latter +should probably be fixed). This probably just means that the fuzzer is not good +enough, because there are certainly segfaults left in PyPy. However, the fact +that it is rather hard to find them validates our approach of using a +high-level memory-managed language for our interpreter. Victor tells me that it +is rather easy to find segfaults in CPython this way, he already found quite +some problems.

                +
                +
                +
                +
                + + Marius Gedminas wrote on 2008-07-13 20:42: +
                +
                +

                Nice post!

                I especially like your certainty that PyPy has segfaults left in it. :-)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-07-13 22:02: +
                +
                +

                What? Segfaults in PyPy? Shouldn't have any left by now :-)

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-07-13 22:04: +
                +
                +

                That previous comment was from me, accidentally logged in as Maciej, sorry. As usual, in PyPy confusion comes for free.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-07-14 07:17: +
                +
                +

                heh :) I was a bit surprised to see my comment which I did not write. Anyway, I agree with it :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2009-02-07 06:37: +
                +
                +

                You said you will love me wow gold the whole life, but WoW Gold you marry her. You said you will wow power leveling,come to marry me, but this will not be carried out forever.WoW Gold I am trying my best to forget you and do not love you anymore. wow leveling But I failed and I still love you. Maybe wow leveling she needs you more compared wow leveling with me. So I tell you that world of warcraft power leveling you should love world of warcraft power leveling her and take good world of warcraft leveling care of her. You said I was so kind.world of warcraft leveling Yes, because I love you,world of warcraft leveling I hope you will be power leveling happy forever.

                +
                +
                +
                + +

                PyPy's Python runs Pinax / Django

                + +
                +

                During the EP2008 sprint we got Pinax running on top of PyPy. At our play1 server we have it running on top of pypy-c. Not that you'll notice many differences to the original site but that's the point, isn't it? ... Well, in fact i am too lazy to customize our play1 version now - i rather spent a nice evening with the other sprint guys :) + +Pinax integrates numerous reusable Django apps to take care of the things that many sites have in common. Many thanks particularly to Henrik Vendelbo who sorted out various Pinax and PyPy issues, and wrote up a nice DjangoAndPyPy wiki page describing the installation process. + +greetings from Vilnius (Lithuania), Holger

                +
                +

                EP2008: PyPy meets Jython

                + +
                +

                One of the great events at EuroPython 2008 were our chats and meetings with the Jython and Sun people. The Jython people are currently pushing towards releasing Python version 2.5 and they are pursuing many interesting sub projects. Coincidentally, PyPy also has tons of interesting areas and results :) So we eventually got into brainstorming a number of possible technical collab ideas. Further below is a first list as i wrote it down from our 10 people PyPy / Jython 30 minute close up meeting yesterday. + +It felt great to be able to talk to the Jython people this way - kudos to Sun for their clear commitments and open ways to go about things! I sense a genuine interest in fair collaboration with non-java developer communities. Seems like they are serious about not focusing on "Java this", "Java that" anymore but rather focus on the JVM platform. Good! And about language +independent interest in ambitious technology. Even Better! I am excited to see how things go from here. + +So here is the list of technical collab ideas: +

                +
                  +
                • ctypes - try to create _rawffi module in Java for Jython, which will enable Jython to reuse our existing ctypes implementation (and have PyPy use the Jython-rawffi for its own for PyPy.JVM)
                • +
                • generally see to share work / (continue) collaborate regarding extension modules
                • +
                • Jython/PyPy (and eventually IronPython): document known differences to CPython, maybe in a PEP
                • +
                • Python Interpreter for Jython (in order to run CPython's .pyc files): re-use pypy's bytecode evaluator, implement a "Jython object space".
                • +
                • re-use rpython-extension modules for jython (e.g. SRE), by compiling them to Java and reusing as a native library.
                • +
                • collaborate on testing framework / benchmarking, have a common site to show test results
                • +
                • make py.test compatible with jython
                • +
                • come up with a set of "pure Python language" tests, which would gather and refactor tests from CPython, PyPy and Jython.
                • +
                • look into using java types / jython approaches for implementing free threading.
                • +
                • share knowledge regarding JIT / psyco +
                • +
                If you have any more ideas, comments or would like to join efforts, let us know! + +Cheers and thanks to Ted Leung, Frank Wierzbicki, Jim Baker and Tobias Ivarsson from Sun and Jython fame respectively, + +Holger +
                +
                +
                +
                + + cartman wrote on 2008-07-10 09:50: +
                +
                +

                This is great news, a common VM for all Python implementations would be real cool :)

                +
                +
                +
                +
                + + Michael Foord wrote on 2008-07-10 11:44: +
                +
                +

                It would be great to get the IronPython folks involved as well.

                For example, .NET has an FFI and with the right effort could take advantage of ctypes extensions as well.

                +
                +
                +
                +
                + + holger krekel wrote on 2008-07-10 13:52: +
                +
                +

                fuzzyman: do you have anyone particular person/group in mind?

                +
                +
                +
                + +

                PyPy at the EuroPython 2008

                + +
                +

                Greetings from Vilnius, Lithuania. There have already been +two PyPy talks, one given by Jacob Hallen, +PyPy for the rest of us, and a second +by Maciej Fijalkowski, PyPy status talk. One thing that +we forgot to mention is that PyPy's sandboxing feature +can also easily limit CPU and RAM usage as well as +any other possible resource (like network transfer). +For anyone who would like to join, there is a PyPy +sprint after the conference.

                +Cheers,
                +arigo & fijal

                +
                +
                +
                +
                + + Jonathan Ellis wrote on 2008-07-07 18:13: +
                +
                +

                Can you post PDFs of those slides? The text is not rendering for me in NeoOffice.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-07 22:08: +
                +
                +

                Is Maciej using some secret cool reST presentation tool?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-08 03:52: +
                +
                +

                You can convert to PDF online.

                This link should work for the next 24 hours. You can regenerate it on the same site after that.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-07-08 06:19: +
                +
                +

                I checked in pdf version. I use rst2beamer + hacks :) Ask Antonio Cuni for details.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-07-08 08:20: +
                +
                +

                yes, we are using rst2beamer:
                https://www.agapow.net/programming/python/rst2beamer

                to have some hints how to use it, look at this script I used to generate my pycon-italy talk:
                https://codespeak.net/svn/pypy/extradoc/talk/pycon-italy-2008/makepdf

                I also wrote some rst macros that allows you to put some paragraphs in those nice beamer's exampleblock and alertblock:

                https://codespeak.net/svn/pypy/extradoc/talk/pycon-italy-2008/beamerdefs.txt

                +
                +
                +
                + +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-8.html b/blog/index-8.html new file mode 100644 index 000000000..3ac4d7f92 --- /dev/null +++ b/blog/index-8.html @@ -0,0 +1,1957 @@ + + + + + + +PyPy (old posts, page 8) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                Porting the JIT to CLI (part 3)

                + +
                +

                In my two previous posts, we talked about the PyPy JIT generator, seeing +that it can produce huge speedups and how its backend-independent frontend +works.

                +

                In this post, we will look closer at the internals of the CLI JIT backend; in +particular, we will see how we work around some serious limitations of the +platform, and why these workarounds didn't have any serious impact on the +performances of our toy virtual machine.

                +
                +

                Graphs, blocks, links

                + + + + +

                One of the core aspects of the PyPy translator is the concept of a flow graph: a +flow graph is a data structure that represents the code we are operating on. +It is composed of a set of basic blocks, each block containing a sequence +of operations; blocks are connected together by links, and each link can +carry a variable number of arguments whose values are passed to the target +block. In case a block contains more than one outgoing link, the one to +follow is selected by looking at the value of a designated variable (the +exitswitch), thus making it possible to implement conditional jumps. For +a more complete description of the flow graph model, check the documentation.

                + +

                As we saw in the previous post, the generated JIT compiler makes heavy use of +flexswitches to generate efficient code, continuously intermixing +JIT-compile time and runtime.

                +

                In terms of graphs, we can think of a flexswitch as a special block whose +links change over time. In particular, adding a new case to the flexswitch is +equivalent to creating a link whose target is a new block where the just +generated code starts. Thus, the graph grows over time, as shown by +the following images:

                + + + + + + +

                In the images above, the block containing the flexswitch is colored in +cyan. In the first picture, there is only one block connected to the +flexswitch: this block contains the code to restart the JIT compilation. The +second picture shows the graph after the first case has been added: you can +clearly see that a new block has been created and attached to the flexswitch. +Finally, the third picture shows the graph after a while, with a lot of new +blocks attached.

                +
                +
                +

                Translate graphs to CLI

                +

                Conceptually, the goal of the CLI JIT backend is to express these graphs in +terms of CLI bytecode.

                +

                Translating a single block is easy, as it is just a list of sequential +operations, and it's straightforward to map each operation to the equivalent +CLI opcode or to a call to a helper method. Moreover, we need a way to +express links between the various basic blocks: if the links are known in +advance, rendering them is as easy as emitting a (potentially conditional) jump to +the target block. Thus, we won't discuss this part in detail, as it is quite +straightforward.

                +

                The hard part is how to implement flexswitches: at the time when we are +emitting the code, some of the blocks of this growable graph don't even exist: +how can we make a jump to a non existent block of code? For backends that +emit assembly code, it is rather easy: when they need to add a new case to the +flexswitch, they can just patch the existing code to insert a jump to a +newly allocated area of the memory, where the new code is being generated in.

                +

                For CLI this approach is not feasible, as the VM will never allow us to modify +existing code. Thus, we need to think of a different approach.

                +
                +
                +

                Graphs and methods

                +

                In .NET, the basic unit of compilation is the method: the only way to +execute some bytecode is to wrap it into a method. Moreover, it is not +possible to execute a method until it has been completed, and after this point +it is no longer possible to add new code.

                +

                Because of all these constraints we cannot simply map each graph to its own +method, since we saw that our graphs can grow after they have already been +executed a few times.

                +

                Hence, we need to distinguish between the two concepts:

                +
                +
                  +
                • a graph is the logical unit of code as seen by the JIT compiler: +concretely, the CLI JIT backend renders it as one or more methods;
                • +
                • a method is a collection of basic blocks; each method has a so-called +parent graph, i.e. the graph its blocks logically belong to.
                • +
                +
                +

                The first method of a graph is called the main method (which has +nothing to do with the Main static methods found in .exe files); the other +methods are called child methods.

                +

                When we want to add a new case to the flexswitch, we create a method +containing all the new code; then we wrap the method inside a delegate (the +.NET equivalent of a function pointer) and pass it to the flexswitch, so that +it can later invoke it.

                +
                +
                +

                The hard bit: non-local links

                +

                Using this approach, after a while the blocks of our original graph are +scattered over a lot of different methods; however, there are no constraints +on how these blocks can be linked together, so we can end up with links +between blocks which are not in the same method. In the following, we will +refer to them as non-local links.

                +

                If the non-local block we want to jump to happens to be at the beginning of +its containing method, it is enough to invoke the method; but, what if we want +to jump somewhere in the middle? What we really want is to produce a method +which has multiple entry-points; again, doing it in assembly would be +trivial, but the virtual machine does not provide any support for it, so we +need a work around.

                +

                Each method in a graph is assigned a unique 16-bit method id; each block in +a method is assigned a progressive 16-bit block number. From these two +numbers, we can compute the block id as an unsigned integer, by storing +the method id in the first (most significant) 16 bits and the block number in the second (least significant) 16 bits. +By construction, the block id is guaranteed to be unique in the graph.

                +

                The following picture shows a graph composed of three methods; the id of each +method is shown in red, while the block ids are shown in red (for the method +id part) and black (for the block number part). The graph contains three +non-local links; in particular, note the link between blocks 0x00020001 +and 0x00010001, which connects two blocks that reside in different methods.

                + + + +

                Every method contains a special dispatch block, (not shown in the picture above) whose goal is to jump to +the specified block number inside the method itself. The first argument of a +child method is always a block id; when the method starts, it immediately +jumps to the dispatch block, and thus to the desired block.

                +

                For example, suppose we have a method which contains 3 blocks numbered 0, 1, +2; here is what its dispatch block looks like; for simplicity it is shown as +C# code, but it is actually generated as IL bytecode:

                +
                +// dispatch block
                +int methodid = (blockid & 0xFFFF0000) >> 16;  // take the first 16 bits
                +int blocknum = blockid & 0x0000FFFF;          // take the second 16 bits
                +
                +if (methodid != MY_METHOD_ID) {
                +// jump_to_unknown block
                +...
                +}
                +
                +switch(blocknum) {
                +case 0:
                +goto block0;
                +case 1:
                +goto block1;
                +case 2:
                +goto block2;
                +default:
                +throw new Exception("Invalid block id");
                +}
                +
                +

                Whenever we want to jump to a non-local block, it is enough to store the block +id in the appropriate variable and jump to the dispatch block. If the block +resides in a different method, the jump_to_unknown block is entered; this +special block is implemented differently by the main method and the child +methods, as we will see soon.

                +

                Each time a new method is added to the graph, we build a delegate +for it, and store it in a special array +called method_map; since we assign the method id sequentially starting +from 0, we are sure that to fetch the method whose id is n we can simply +load the n-th element of the array.

                +

                The jump_to_unknown block of the main method uses this array to select the +right method, and calls it (FlexSwitchCase is the type of delegates for +all children methods):

                +
                +// jump_to_unknown block of the main method
                +FlexSwitchCase meth = method_map[methodid];
                +blockid = meth(blockid, ...); // execute the method
                +goto dispatch_block;
                +
                +

                Each child method returns a block id specifying the next block to jump to; +after its execution, we assign the return value to the blockid variable, +and jump again to the dispatch block, which will jump again to the appropriate +block.

                +

                Keeping this in mind, it is straightforward to implement the +jump_to_unknown block of children methods: it is enough to return the +target block id to the caller, and let its dispatch loop do the right thing. +If the caller is also a child method, it will return it again, until we reach +the dispatch loop of the main method, which will finally do the jump. In +theory, we could implement things differently and jump directly from a +child method to another one, but in that case the call stack could grow +indefinitely in case of a tight loop between two blocks residing in different +methods.

                +

                To implement the dispatch block we can exploit the switch opcode of the +CLI; if the .NET JIT is smart enough, it can render it using an indirect jump; +overall, jumping to a non-local block consists of an indirect function call +(by invoking the delegate) plus an indirect jump (by executing the switch +opcode); even if this is more costly than a simple direct jump, we will see in +the next section that this is not the main source of overhead when following a +non-local link.

                +

                Obviously, the slow dispatching logic is needed only when we want to jump to a +non-local block; if the target block happens to reside in the same method as +the current one, we can directly jump to it, completely removing the overhead.

                +

                Moreover, the dispatch blocks are emitted only if needed, i.e. if the parent +graph contains at least one flexswitch; graphs without flexswitches are +rendered in the obvious way, by making one method per graph.

                +
                +
                +

                The slow bit: passing arguments

                +

                Jumping to the correct block is not enough to follow a link: as we said +before, each link carries a set of arguments to be passed from the source to +the target block. As usual, passing arguments across local links is easy, as +we can just use local variables to hold their values; on the other hand, +non-local links make things more complex.

                +

                The only way to jump to a block is to invoke its containing method, so the +first solution that comes to mind is to specify its input arguments as +parameters of the method; however, each block has potentially a different +number (and different types) of input arguments than every other block, so we +need to think of something else.

                +

                An alternative solution could be to compute the union of the sets of input +arguments of all the blocks in the method, and use this set as a signature +for the method; this way, there would be enough space to specify the input +arguments for every block we might want to jump to, each block ignoring the +exceeding unused parameters.

                +

                Unfortunately, all the children methods must have the very same signature, +as they are all called from the same calling site in the dispatch block of the +main method. Since the union of the set of input arguments (and hence the +computed signature) varies from method to method, this solution cannot work.

                +

                We might think to determine the signature by computing the union of input +arguments of all blocks in the graph; this way, all the children methods +would have the same signature. But as we said above, the graph grows new +blocks at runtime, so we cannot determine in advance which set of input +arguments we will need.

                +

                To solve the problem we need a way to pass a variable number of arguments +without knowing in advance either their number or their types. Thus, we use +an instance of this class:

                +
                +public class InputArgs {
                +public int[] ints;
                +public float[] floats;
                +public object[] objs;
                +...
                +}
                +
                +

                Since the fields are arrays, they can grow as needed to contain any number of +arguments; arguments whose type is primitive are stored in the ints or +floats array, depending on their type; arguments whose type is a reference +type are stored in the objs array: it's up to each block to cast each +argument back to the needed type.

                +

                This solution imposes a huge overhead on both writing and reading arguments:

                +
                +
                  +
                • when writing, we need to make sure that the arrays are big enough to +contain all the arguments we need; if not, we need to allocate a bigger +array. Moreover, for each argument we store into the array the virtual +machine performs a bound-check, even if we know the index will never be +out of bounds (because we checked the size of the array in advance);
                • +
                • when reading, the same bound-check is performed for each argument read; +moreover, for each value read from the objs array we need to insert a +downcast.
                • +
                +
                +

                To mitigate the performance drop, we avoid allocating a new InputArgs +object each time we do a non-local jump; instead, we preallocate one at the +beginning of the main method, and reuse it all the time.

                +

                Our benchmarks show that passing arguments in arrays is about 10 times slower +than passing them as real parameters of a method. Unfortunately, we couldn't +come up with anything better.

                +
                +
                +

                Implement flexswitches

                +

                Now, we can exploit all this machinery to implement flexswitches, as this is +our ultimate goal. As described above, the point is to be able to add new +cases at runtime, each case represented as a delegate. Here is an excerpt +of the C# class that implements a flexswitch that switches over an integer +value:

                +
                +public class IntLowLevelFlexSwitch
                +{
                +public uint default_blockid = 0xFFFFFFFF;
                +public int numcases = 0;
                +public int[] values = new int[4];
                +public FlexSwitchCase[] cases = new FlexSwitchCase[4];
                +
                +public void add_case(int value, FlexSwitchCase c)
                +{
                +...
                +}
                +
                +public uint execute(int value, InputArgs args)
                +{
                +for(int i=0; i<numcases; i++)
                +if (values[i] == value) {
                + return cases[i](0, args);
                +}
                +return default_blockid;
                +}
                +}
                +
                +

                For each case, we store both the triggering value and the corresponding +delegate; the add_case method takes care to append value and c to +the values and cases arrays, respectively (and resize them if +necessary). The interesting bit is the execute method: it takes a value +and a set of input arguments to be passed across the link and jumps to the +right block by performing a linear search in the values array.

                +

                As shown by previous sections, the first argument of a FlexSwitchCase is +the block id to jump to; since when we go through a flexswitch we always want +to jump to the first block of the method, we pass the special value 0 as a +block id, which precisely means jump to the first block. This little +optimization lets us avoid explicitly storing the block id for the first +block of all the cases.

                +

                The value returned by execute is the next block id to jump to; if the +value is not found in the values array, we return the default_blockid, +whose value has been set before by the JIT compiler; default_blockid +usually points to a block containing code to restart the JIT compiler again; +when the JIT compiler restarts, it emits more code for the missing case, then +calls add_case on the flexswitch; from now on, the new blocks are wired +into the existing graph, and we finally managed to implement growable +graphs.

                +
                +
                +

                Performances

                +

                As we saw, implementing growable graphs for CLI is a pain, as the virtual machine +offers very little support, so we need an incredible amount of workarounds. +Moreover, the code generated is much worse than what an assembly backend could +produce, and the cost of following a non-local link is very high compared to +local links.

                +

                However, our first blog post showed that we still get very good +performances; how is it possible?

                +

                As usual in computer science, most of the time of a running program is +spent in a tiny fraction of the code; our benchmark is no exception, and the +vast majority of the time is spent in the inner loop that multiplies numbers; +the graph is built in such a way that all the blocks that are part of the +inner loop reside in the same method, so that all links inside are local (and +fast).

                +

                Flexswitches and non-local links play a key role to select the right +specialized implementation of the inner loop, but once it is selected they are +not executed anymore until we have finished the computation.

                +

                It is still unclear what things will look like when we compile the full +Python language instead of a toy one; depending on the code, it could be +possible to have non-local links inside the inner loop, thus making +performance much worse.

                +
                +
                +

                Alternative implementations

                +

                Before implementing the solution described here, we carefully studied a lot of +possible alternatives, but all of them either didn't work because of a +limitation of the virtual machine or they could work but with terrible +performances.

                +

                In particular, in theory it is possible to implement non-local links using +tail calls, by putting each block in its own method and doing a tail call +instead of a jump; this would also solve the problem of how to pass arguments, +as each method could have its own signature matching the input args of the +block. I would like to explain this solution in a more detailed way as I +think it's really elegant and nice, but since this post is already too long, +I'll stop here :-).

                +

                In theory, if the .NET JIT were smart enough it could inline and optimize away +the tail calls (or at least many of those) and give us very efficient code. +However, one benchmark I wrote shows that tail calls are up to 10 times +slower (!!!) than normal calls, thus making it impractical to use them for our +purposes.

                +
                +
                +

                Conclusion

                +

                Despite the complexity of the implementation, our results are extremely good; +the speedup we got is impressive, and it proves that PyPy's approach to JIT +compilers can work well also on top of object oriented virtual machines like +.NET or the JVM.

                +

                Generating bytecode for those machines at runtime is not a new idea; Jython, +IronPython, JRuby and other languages have been doing this for years. +However, Jython and IronPython do only a simple "static" translation, which +doesn't take advantage of the information gathered at runtime to generate +better, faster and specialized code. Recently, JRuby grew a new strategy to +JIT-compile only hotspots, taking advantage of some information gathered +while interpreting the code; this is still a "one-shot" compilation, where the +compiled code does not change over time.

                +

                To my knowledge, PyPy brings the first example of a +language which implements a true JIT compiler on top of the underlying JIT +compiler of the virtual machine, emitting bytecode that changes and adapts +over time. If someone knows other languages doing that, I would really +like to know more.

                +

                Being so innovative, the problem of this approach is that the current virtual +machines are not designed to support it in a native way, and this forces us to +put a lot of workarounds that slow down the generated code. The hope is that +in the future the virtual machines will grow features that help us to generate +such kind of code. The experimental Da Vinci VM seems to go in the right +direction, so it is possible that in the future I will try to write a JIT +backend for it.

                +

                At the moment, the CLI JIT backend is almost complete, and all the hardest +problems seem to be solved; the next step is to fix all the remaining bugs +and implement some minor features that are still missing, then try to apply it +to the full Python language and see what the outcome is.

                +
                +
                +
                +
                +
                + + Unknown wrote on 2008-12-07 22:33: +
                +
                +

                JikesRVM + LLVM

                https://osdir.com/ml/java.jikes.rvm.devel/2003-09/msg00059.html

                Don't know if it succeeded.

                +
                +
                +
                +
                + + Yosef wrote on 2008-12-08 08:15: +
                +
                +

                The comment about assembly-code patching is interesting. Do you mean assembly code backends can do runtime patching of previously generated code? I thought this is impossible, because operating systems mark executable pages as read-only. How is that dealt with?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-12-08 09:29: +
                +
                +

                Most executable pages are read only, but there is nothing that stops you from creating ones that are rw. You just pass different flags to mmap.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-12-08 19:11: +
                +
                +

                @Yosef
                about patching generated code, see fijal's comment. Btw, this is exactly the same approach used by psyco

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-11 08:22: +
                +
                +

                It's wrong to say IronPython only does static translation. The Call Site stuff happens and generates IL at run time, and generates different code depending on the types. In fact you may want to look at how they do it, becuase they regenerate the IL for a method multiple times, which may be another way of implementing Flex switches

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-12-12 09:02: +
                +
                +

                @Ben Young
                do you have a link that explains in more detail what you mean?
                As far as I know, DLR's callsites are just a way to do polymorphic inline caches, but nothing more. In particular, they don't do any specialization of the called code.

                You are right that we could do the same to implement flexswitches, though I think this is a minor optimization, as right now the real performance problem is how to pass arguments across non-local links.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-12 16:26: +
                +
                +

                Hi Antonio

                IronPython can create a method that looks like this

                void object add(object a, object b)
                {
                throw new Exception("I don't know how to add")
                }

                into

                void object add(object a, object b)
                {
                if(a is int && b is int)
                return (int)a + (int)b

                throw new Exception("I don't know how to add")
                }

                and can further add new tests at runtime. The code do do the adding is written directly into the method body and there's no futher call needed. This is runtime code generation, not just caching

                In your case, instead of having multiple methods implementing different blocks you could just rewrite the whole "master" method every time the flexswitch changes. That way there's no call overhead at all. That's what the DLR does. I think the main thing it's missing is promotion, so shared tests can't be moved up a level, and it doesn't do inlining.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-18 08:32: +
                +
                +

                I'm a beginner programmer, so please excuse my beginner questions :-)

                I just started learning Python as my first programming language. Several of my programmer friends have said I should learn Java instead, one reason being the difference in performance - specifically for doing natural language processing / AI stuff which is the area I am interested in.

                With PyPy, do you think it is likely that in the near future, Python's performance may be close to that of Java? I do plan on learning multiple languages, but it would be nice if I could stick with Python for as long as possible :-)

                +
                +
                +
                +
                + + Lucian wrote on 2008-12-20 12:56: +
                +
                +

                @Anonymous

                Probably. People have great hopes for PyPy, but you can never know how it will turn out, if at all.

                Right now, you can use things like numpy, psycho, shedskin, cython/pyrex and a few others to speed up you code, only needing to know a few things about C or C++. Google them.

                +
                +
                +
                +
                + + Luis wrote on 2008-12-20 14:38: +
                +
                +

                @Sin

                You don't need to know any c or c++ to use psyco or shedskin. Only python.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-05 03:07: +
                +
                +

                WoW shares many wow gold of its features with previously launched games. Essentially, you battle with wow gold cheap monsters and traverse the countryside, by yourself or as a buy cheap wow gold team, find challenging tasks, and go on to higher aoc gold levels as you gain skill and experience. In the course of your journey, you will be gaining new powers that are increased as your skill rating goes up. All the same, in terms of its features and quality, that is a ture stroy for this.WoW is far ahead of all other games of the genre the wow power leveling game undoubtedly is in a league of its own and cheapest wow gold playing it is another experience altogether.

                Even though WoW is a Cheap Wow Gold rather complicated game, the controls and interface are done in warhammer gold such a way that you don't feel the complexity. A good feature of the game is that it buy wow items does not put off people with lengthy manuals. The instructions bygamer cannot be simpler and the pop up tips can help you start playing the game World Of Warcraft Gold immediately. If on the other hand, you need a detailed manual, the instructions are there for you to access. Buy wow gold in this site,good for you, BUY WOW GOLD.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-11 04:00: +
                +
                +

                My friends and I like to buy Anarchy credits, because the Anarchy Online credits is very useful to upgrade equipment. Only your equipment becomes better, then you can win this game. In Anarchy gold, you can buy everything you want in this game. Tomorrow will be my birthday, so my friends promise to buy AO credits as gifts. I am so happy. They understand me so well, Anarchy online gold is my favorite.
                I like angels gold very much because it is very useful. In fact at first sight I have fallen in love with angels online gold. So no matter how much I have spent to buy angels gold, I never regret. Because of cheap angels online gold, I meet a lot of friends.

                +
                +
                +
                + +

                Porting the JIT to CLI (part 2)

                + +
                +

                In my previous post, we saw that PyPy JIT generator can produce huge +speedups when applied to the tlc toy language.

                +

                In this post we will dive a bit into the internals of PyPy JIT, to see how it +manages to do so. Note that this is a very high level overview of how the +JIT works, and applies to all backends. Then, in the third post of this +series, we will look closer at the CLI JIT backend, seeing how it works around +some .NET limitations and how the generated code looks like.

                + +

                PyPy JIT for dummies

                +

                As you surely know, the key idea of PyPy is that we are too lazy to write a +JIT of our own: so, instead of passing nights writing a JIT, we pass years +coding a JIT generator that writes the JIT for us :-).

                +

                I'm not going to explain how the JIT generator does its job, (perhaps this +will be the subject of another blog post), but how the generated JIT +works.

                +

                There are values that, if known at compile-time (i.e., when the JIT compiler +runs), let the JIT produce very efficient code. In a dynamic language, +types are the primary example: for instance, suppose you are a compiler and +you have to compile the following Python function:

                +
                +def mysum(a):
                +  return a + 1
                +
                +

                At compile time, you don't have any knowledge about the type of the parameter: +it could be an integer, a float, a user-defined object, etc. In this situation, +the only safe choice is to emit code which does the usual, slow, full lookup +to know how to perform the operations.

                +

                On the other hand, suppose that you knew in advance that the parameter is an +integer: this time, you could emit code that exploits this extra +knowledge, by performing directly a fast integer addition.

                +

                The idea behind PyPy JIT is that if you don't have enough knowledge to +generate efficient code, you stop compiling and wait until you know +exactly what you need. Concretely, you emit code that runs until the point +where you stopped the compilation, then it triggers a special procedure that +restarts the compiler. This time the JIT compiler knows everything +you need, because you can inspect the state of the running program.

                +

                Let's see an example: the first time the JIT compiles mysum, it produces +something like this pseudo-code:

                +
                +PyObject mysum_compiled(PyObject a)
                +{
                +  Type a_type = a.GetType();
                +  switch(a_type) {
                +      default: continue_compilation(a_type, <position>);
                +  }
                +}
                +
                +

                If you call mysum(41), the execution goes in the default branch of the +switch, thus calling continue_compilation: its job is to restart the JIT +compiler, which now can emit fast code because it knows the exact type of +a; then, it modifies the original mysum_compiled function, in +order to make it execute the newly generated code the next time it +encounters an integer at that point:

                +
                +PyObject mysum_compiled(PyObject a)
                +{
                +  Type a_type = a.GetType();
                +  switch(a_type) {
                +      PyInteger: return new PyInteger(a.value+1); // fast path!
                +      default: continue_compilation(a_type, <position>);
                +  }
                +}
                +
                +

                From now on, every time we call mysum with an integer argument, the JIT +compiler is not called anymore and the fast path is directly executed; if we +happen to call mysum with a float argument, the switch goes again in the +default branch, and the JIT compiler is started once more to produce fast +code also for this case. What happens in practice is that compile-time and +runtime are continuously intermixed, until the switches are stable enough and +the compiler is not needed anymore.

                +

                In PyPy jargon, this kind of "growable switch" is called a flexswitch, and +it's one of the most important concepts of our JIT generator.

                + +

                Promotion

                +

                How can the JIT generator know which values are useful to know to generate +efficient code and which aren't? Unfortunately it can't, or at least our JIT +generator is not smart enough at the moment.

                +

                To get the best from it, the developers of the VM need to instruct it by +annotating the variables on which we want the JIT to stop until it knows the +actual values; this is done by using particular hints, called promote +and promote_class; variables annotated with such hints are said to be +promoted. If something is promoted, a flexswitch is used to gain +information about it, as seen in the last section.

                +

                For an example, let's look at an excerpt from main dispatch loop of the tlc +virtual machine:

                +
                +elif opcode == ADD:
                +  a, b = stack.pop(), stack.pop()
                +  hint(a, promote_class=True)
                +  hint(b, promote_class=True)
                +  stack.append(b.add(a))
                +
                +

                This is the implementation of the ADD opcode: first, it pops two values from +the stack; then, it computes the result; finally, it pushes the result to the +stack again. In between, both the classes of a and b have been +promoted: this means that when the JIT emits the code for b.add(a), it +knows exactly what is happening: if it sees that both are instances of the +IntObj class, it inlines the method call and emits a fast integer addition +instead.

                + +

                Virtuals

                +

                The other important concept of the JIT is the presence of virtual +structures, virtual lists, and virtual dictionaries. Again, I'm not +going to explain in depth how they work, but only why they are so important for +generating highly efficient code.

                +

                The essence of virtuals is that you don't allocate objects until you really +need to do it, e.g. because they are being passed as an argument to some +external function. Instead, we store all the information we need as local +variables; e.g., in the case of a virtual structure, we create as many local +variables as the number of its fields: if the structure escapes the local +scope, we force it to a real object, by allocating memory on the heap and +initializing it from the current values of the local variables.

                +

                This technique allows the JIT to avoid the allocation of many temporary +objects that hold intermediate results; consider for example the following +Python loop:

                +
                +result = 0
                +for i in range(N):
                +  result += i
                +return result
                +
                +

                Without the JIT, at each iteration, a new int object is created and bound +to the result variable, while the previous one is discarded and not needed +anymore. By combining virtuals and promotion, the JIT can emit code that does +the whole computation locally, and allocates a real object only at the end, +when it escapes from the local scope because it is returned from the +function.

                + +

                Putting it all together

                +

                This is, essentially, how PyPy's generated JITs work. To summarize, our JITs +emit multiple versions of each chunk of code: each version is specialized +and optimized for one particular case.

                +

                The cost of selecting the right specialization to use (through flexswitches) +is almost always negligible compared to how much time you save by running the +fast version instead of the more-general-but-slow one. Moreover, each +specialized version knows the exact shape of the objects it's dealing with, so +they can be virtualized to make the generated code even more efficient.

                +

                At the end, the actual code generation is done by one of the JIT backends: +the backends exploit all the knowledge gathered by the previous steps to +produce highly efficient code, but this will be the subject of the next blog +post.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-07 12:46: +
                +
                +

                Wow... I love this approach. Keep up the great work and interesting posts!

                +
                +
                +
                +
                + + Luis wrote on 2008-11-07 18:12: +
                +
                +

                This is a very clear and didactic explanation. Thanks!

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-07 20:14: +
                +
                +

                can't wait for the next one

                +
                +
                +
                +
                + + kbob wrote on 2008-11-08 01:35: +
                +
                +

                What does the flexswitch compile into? I'm guessing it would look like

                t = type(obj);
                if (t == int)
                ...
                else if (t == float)
                ...
                else
                ....

                but maybe there's a better way (or maybe the answer is backend-dependent).

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-11-08 08:53: +
                +
                +

                @bob
                your guess is right, how to implement the flexswitch is backend-dependent. This is the hardest part for .NET, as the flexswitch needs to grow dynamically (i.e., you have to add more case after the .NET method has already been compiled). It will be subject of the next blog post.

                +
                +
                +
                +
                + + Unknown wrote on 2009-01-05 22:46: +
                +
                +

                It seems that an implication of the JIT way is that, by adopting a consistent habit of implementing type driven Generic Functions, the JIT could accomplish nearly all of the possible optimizations in a single pass. In other words, by definition, each type based variation of a Generic Function call can only be fired when data of that type is provided as a parameter.

                +
                +
                +
                + +

                Porting the JIT to CLI (part 1)

                + +
                +

                As the readers of this blog already know, I have been working on the CLI +JIT backend for some months: last Friday, it reached an important milestone, +as it is now able to produce huge speedups for a little dynamic language. To +know how huge the speedup is, read on :-).

                +

                The goal of the PyPy JIT generator is to take an interpreter and, with the help of +a few annotations, automatically generate a JIT compiler for it. In this post, +we will talk about the tlc virtual machine: while tlc is just a toy +language, it contains some features that make it an interesting target for our +JIT generator.

                +
                +

                The tlc virtual machine

                +

                tlc is executed by a stack based, dynamically typed virtual machine (for +those who know a bit about the Python VM: does it sound familiar? :-)).

                +

                There are three types of objects: integers, nil, and cons cells (i.e. +lisp-like pairs of objects).

                +

                As the VM is very simple, it provides only a few opcodes:

                +
                +
                  +
                • opcodes to manipulate the stack, like PUSH, POP, etc.
                • +
                • integer operations, like ADD, MUL, all the comparisons, etc.: +these operations can only be applied to integers;
                • +
                • list operations, like CONS, CAR, CDR: these operations can +only be applied to lists;
                • +
                • other operations, including jumps and conditional jumps.
                • +
                +
                +

                The VM is interesting for our purposes because it has a lot of similarities +with Python (though on a smaller scale, of course):

                +
                +
                  +
                1. it has to do type-checks at runtime before doing most of the operations;
                2. +
                3. every time you do an arithmetic operation, it has to unbox the operand, +do the computation, and then box the result again.
                4. +
                +
                +

                This means that even if you have a program which only uses integers, you are +paying a lot of overhead.

                +

                To know more about this toy VM, look at its source code: the interesting +bits are the classes used to represent objects, and the interp_eval +function, which contains the main loop of the virtual machine. As you can +see, the implementation is quite straightforward; all the hint calls you +see are the special annotations needed by the JIT generator to produce better +code.

                +
                +
                +

                Let's JIT it!

                +

                So, the whole point is to generate a JIT compiler from it, isn't it?

                +

                First, checkout a fresh copy of the oo-jit branch:

                +
                +$ svn co https://codespeak.net/svn/pypy/branch/oo-jit
                +
                +

                Then, go to the oo-jit/pypy/jit/tl directory, and compile the tlc VM +with the CLI backend and JIT enabled:

                +
                +$ cd oo-jit/pypy/jit/tl/
                +$ ../../translator/goal/translate.py -b cli --jit --batch targettlc
                +...
                +lot of texts
                +...
                +
                +

                If everything went OK, you now have a targettlc-cli executable, which +accepts two arguments: the name of the file containing the tlc program we +want to run, and an integer to be passed to it.

                +

                Luckily, in the same directory we have a factorial.tlc file that contains +the bytecode for a function that -- guess? -- computes the factorial of a +given integer; let's try it:

                +
                +$ ./targettlc-cli factorial.tlc 5
                +Non jitted:    120 (0.009371 seconds)
                +Warmup jitted: 120 (0.208954 seconds)
                +Warmed jitted: 120 (0.000323999999999991 seconds)
                +
                +

                Cool, it seems that the result was computed correctly :-). As you can see from +the output, we ran the program three times:

                +
                +
                  +
                1. by plain interpretation, without any jitting;
                2. +
                3. with the jit enabled: this run includes the time spent by doing the +compilation itself, plus the time spent by running the produced code;
                4. +
                5. again with the jit enabled, but this time the compilation has already +been done, so we are actually measuring how good the code we produced is.
                6. +
                +
                +

                So, it's time to run a benchmark: let's try to compute the factorial of a very +big number; the result will be 0, because obviously after a while we overflow, +but after all we are interested in the time spent, not in the result:

                +
                +$ ./targettlc-cli factorial.tlc 5000000
                +Non jitted:    0 (19.93247 seconds)
                +Warmup jitted: 0 (0.293229999999998 seconds)
                +Warmed jitted: 0 (0.0494239999999984 seconds)
                +
                +$ python -c 'print 19.93247/0.0494239999999984'
                +403.295362577
                +
                +

                And no, I didn't make any mistake in copying&pasting: the jitted version is +really 400 times faster than the non jitted one!

                +

                Warning: my laptop seems to be not very well suited for benchmarks, as the +results vary a lot from run to run; I've run the benchmarks a lot of times, +and I got speedup factors up to 500 times, so your results may be different.

                +
                +
                +

                More benchmarks

                +

                It's also interesting to compare the result with a manually written C# +version of the factorial, to see how good the code we produced is; to get +reasonable results, we need to compute a larger factorial, to let the code +run a bit longer:

                +
                +$ ./targettlc-cli --onlyjit factorial.tlc 100000000
                +Warmup jitted: 0 (0.980856 seconds)
                +Warmed jitted: 0 (0.769716 seconds)
                +
                +$ mono factorial.exe 100000000
                +C#:            0 (0.153777 seconds)
                +
                +$ python -c 'print 0.769716/0.153777'
                +5.00540392907
                +
                +

                We know that the generated code is far from being optimal, but probably the +factor of five is at least partially due to the fact that Mono's own JIT is optimized for +C#-like code, and our code has a completely different shape.

                +

                All the benchmarks above were run under Linux, with Mono 1.9.1. Here are the +results for the same benchmarks, but run with Microsoft CLR (on a different +machine, so the absolute values are not comparable):

                +
                +$ ./targettlc-cli factorial.tlc 5000000
                +Non jitted:    0 (15,640625 seconds)
                +Warmup jitted: 0 (0,4375 seconds)
                +Warmed jitted: 0 (0,03125 seconds)
                +
                +$ python -c 'print 15.640625/0.03125'
                +500.5
                +
                +$ ./targettlc-cli --onlyjit factorial.tlc 100000000
                +Warmup jitted: 0 (0,90625 seconds)
                +Warmed jitted: 0 (0,515625 seconds)
                +
                +$ ./factorial.exe 100000000
                +C#:            0 (0,34375 seconds)
                +
                +$ python -c 'print 0.515625/0.34375'
                +1.5
                +
                +

                The results are even better than before; this is probably thanks to CLR's JIT, +which does a better job than Mono when faced with something which is different +from the usual C#-like code.

                +
                +
                +

                Conclusions (for now)

                +

                This is a very important result, because it proves that PyPy's approach to JIT +compilers can be applied effectively also to OO virtual machines; the result +is even better than what I expected, because when generating code for .NET we +have much less freedom than when generating assembly code, and I had to play +some tricks to work around some .NET limitations.

                +

                Moreover, it worked on the first try :-). I tried to compile the tlc +virtual machine as soon as all the related JIT tests were passing, and +surprisingly everything worked just fine, even if it was the very first time I +was trying to apply some features of the JIT to something bigger than a test: +I think this is yet another proof that Test Driven Development just works!

                +

                Even if this is a major milestone, the CLI JIT backend is not yet completed: +as a consequence it still can't be used for the full PyPy, but all the +hardest problems should have been solved now.

                +

                Since a lot of readers asked for more technical details, especially about the +JIT, I will try to soon write a second blog post explaining how the CLI backend works +internally, with a brief look at the generated code to see what it looks like.

                +
                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-04 01:50: +
                +
                +

                If you are benchmarking on Linux then watch out for CPU speed scaling. For example on Ubuntu by default the ondemand governor is used which runs the CPU at lowest possible speed and until there is any CPU demand at which point it runs at fastest. The time to switch varies (eg sometimes it can be instantaneous, other times a second or two, and other times not at all).

                Make sure to use: cpufreq-set -g performance

                That will run at maximum CPU speed the whole time.

                +
                +
                +
                +
                + + Lucian wrote on 2008-11-04 02:27: +
                +
                +

                Woohoo! Can't wait for more (and the jvm counterpart).

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-04 08:46: +
                +
                +

                I agree that this result is very important. For me and many others too i guess, a very good JIT is the most important missing part in python and other dynamic languages. Speed *is* important,

                For integers, psycho also showed huge performance gains. But i think the real proof of pypy's approach would be to show similar results for floating point operations also...

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-04 11:09: +
                +
                +

                awesome!

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-04 11:47: +
                +
                +

                Keep it on!

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-11-04 16:50: +
                +
                +

                hi, thanks for your valuable comments.

                Some notes:

                - I know about cpufreq-set, but even setting the governor to performance doesn't help, the timings vary a lot between different runs. If someone knows a way to run reliable benchmarks, it would be very appreciated!

                - I have plans to experiment also the JIT on the JVM: since HotSpot usually does a better job than CLR's JIT, it's possible/likely that the JVM is a better platform for our purposes. Also, the experimental Da Vinci Machine contains features that could be very useful for us. Unfortunately the PyPy non-JIT JVM backend is not as advanced as the CLI one, and it lacks some features that are really needed for writing a JIT backend.

                - Float operations are already (mostly) supported by our JIT backends; I bet that if you add a FloatObj to the tlc interpreter, you will see huge speedups as well. However, the real point of PyPy's approach is that once finished it will optimize much more than ints and floats, including features that are currently not implemented by psyco (e.g. generators).

                +
                +
                +
                +
                + + Ορέστης wrote on 2008-11-04 21:21: +
                +
                +

                Brilliant post! Keep us updated!

                +
                +
                +
                + +

                One year PyPy Blog

                + +
                +

                Last Friday the PyPy Status Blog had its first anniversary. Yay! After not +really buying into any of this new-fangled "blog" stuff for a long time we just +bit the bullet and got started. Totally surprisingly it even worked. We posted +76 posts in the last year, more than one per week. By now we have more than 800 +subscribers (according to feedburner), which is quite cool for a rather niche +blog.

                +

                To make our blog even more interesting, I would like to ask for some feedback +via the comments:

                +
                +
                  +
                • Which posts did you like in particular?
                • +
                • What sort of posts would you be interested in getting more of?
                • +
                • Any other improvements we could make?
                • +
                +
                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-02 18:28: +
                +
                +

                For me the most interesting posts is about status of PyPy project. It will be great if you could post more frequently.

                +
                +
                +
                +
                + + Unknown wrote on 2008-11-02 20:40: +
                +
                +

                +1

                +
                +
                +
                +
                + + Michael Foord wrote on 2008-11-02 21:09: +
                +
                +

                It's been great to read about PyPy progress, congratulations for surviving a year and many thanks.

                I also like to hear the status updates - and wouldn't mind a bit more technical detail.

                In fact some deep dives into individual aspects of PyPy would be *great*, even if they're more effort to write...

                +
                +
                +
                +
                + + Eduardo O. Padoan wrote on 2008-11-02 21:45: +
                +
                +

                Greetings!
                What about quick Weekly Status Updates, with a summary of svn activity and stuff?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-03 02:41: +
                +
                +

                It's not just a first for you, it's a first for me. This is the first blog I have ever subscribed to. You can attribute that to the fact that this subject is geniunly interesting.

                The blog format has many benefits. For one, it amortizes the effort required to understand the project. This allows me to take my time, wiki whatever I need to, and savor the details. It takes time for me to learn the concepts but in due time, I can see myself eventually contributing to the project. The other benefit is I can see all the hot topics revolved around the various pypy projects. The whole partial evaulation, for example, was something new I learned about.

                I would agree that increasing the rate of posts would be nice. While I can't say for others, in my personal experience, it seems that logged projects tend to finish faster than unlogged projects.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-03 02:42: +
                +
                +

                It's not just a first for you, it's a first for me. This is the first blog I have ever subscribed to. You can attribute that to the fact that this subject is geniunly interesting.

                The blog format has many benefits. For one, it amortizes the effort required to understand the project. This allows me to take my time, wiki whatever I need to, and savor the details. It takes time for me to learn the concepts but in due time, I can see myself eventually contributing to the project. The other benefit is I can see all the hot topics revolved around the various pypy projects. The whole partial evaulation, for example, was something new I learned about.

                I would agree that increasing the rate of posts would be nice. While I can't say for others, in my personal experience, it seems that logged projects tend to finish faster than unlogged projects.

                +
                +
                +
                +
                + + Bill Mill wrote on 2008-11-03 03:09: +
                +
                +

                > Which posts did you like in particular?

                I just scanned a bunch of entries, and "List comprehension implementation details" jumped out at me as a really nice one. I like that it points out some of the deep details of python that are easy for me to not think about because I'm not implementing it.

                > What sort of posts would you be interested in getting more of?

                More technical details posts, I really like the one about the JIT and Prolog too.

                I post your articles to reddit too, and I think "we can now run big software X" and efficency milestones are successfuly at attracting a lot of attention (if that's what you want!)

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2008-11-03 03:11: +
                +
                +

                Thanks so much for doing this! It makes me very jealous over here in CPython land.

                I like to hear about specific new projects and ideas you guys are working on.

                +
                +
                +
                +
                + + nshepperd wrote on 2008-11-03 05:39: +
                +
                +

                For me the most interesting things were the technical details posts, like Bill Mill said. But I get excited any time there is a new blog post. :)

                +
                +
                +
                +
                + + Unknown wrote on 2008-11-03 06:35: +
                +
                +

                Being in the scientific computation area at the moment, I'm very eager to hear about progress in the JIT framework, esp. for 64 bit Linux.

                Yet most other posts are also interesting.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-03 11:01: +
                +
                +

                > Which posts did you like in particular?

                Anything about the JIT and its progress.

                Good luck!

                +
                +
                +
                +
                + + Unknown wrote on 2008-11-03 12:57: +
                +
                +

                Hi,

                I think the blog is pretty good. Weekly summaries would make it rock, though.

                And I am also especially interested in hearing about progress on JIT work. And about any use of LLVM.

                Best
                Anders

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-11-03 14:46: +
                +
                +

                Thanks for all the friendly comments!

                So more technical posts it will be :-). Those are mostly even fun to write, it's just usually quite a bit of work. I actually have a vague plan to give a basic introduction of the ideas behind the JIT (but will still take some time, I am busy with the lecture at the moment).

                About more summaries of what happens: it requires a lot of discipline (see the Python-dev summaries) and I am not sure we have that :-). It would need somebody dedicated to care for it, and that one won't be me at the moment.

                +
                +
                +
                +
                + + Luis wrote on 2008-11-03 15:03: +
                +
                +

                Personaly, I get very anxious when you say "it will be ready when it's ready". Aaarghhh! Please, at least lie a little bit :-).
                For example: "Pypy is now 1.8x slower than cpython, but after [feature here] it will be 10x faster".
                Well, just kidding. Congratulations for all the great work and keep it up!

                +
                +
                +
                +
                + + Damian Cugley wrote on 2008-11-03 16:03: +
                +
                +

                I am not especially in favour of weekly summaries, unless there is some interesting progress to report. Otherwise you end up with someone filling in progress reports because they feel obliged to, rather than to celebrate new features, and it becomes a chore.

                That said, PyPy has many subprojects; maybe having a round-robin system where we get a progress report from a different project every week would be interesting.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-03 23:05: +
                +
                +

                I'm a regular Python user that wishes often for something a little faster with the same flexibility. So generally, I read this because I think you guys are on a good track for JIT optimisation and other fun things.

                I guess I'm looking forward to the eventual series of posts that talks about how you can start using this on your system to replace your system Python, followed by you talking the regular Python core developers into working directly in PyPy instead. =)

                +
                +
                +
                +
                + + Paul D. Eden wrote on 2008-11-03 23:36: +
                +
                +

                For me the best parts are the tutorials and howtos relating to rpython, translating to c, etc.

                +
                +
                +
                +
                + + Konrad wrote on 2008-11-07 11:54: +
                +
                +

                I'm interested in status updates and longer descriptions on how elements of PyPy work. Sprint summaries are fine as long as they carry one of the above (they usually do, though :>)

                +
                +
                +
                +
                + + John Mudd wrote on 2008-11-10 13:27: +
                +
                +

                I'm interested in anything to do with multi-thread support, GIL elimination, general status, progress and future plans.

                +
                +
                +
                + +

                Sprint Discussions: JIT Generator Planning

                + +
                +

                Background

                +

                Finally, the JIT post :-). First some background: Despite our plans at the end +of the EU period, PyPy's Python interpreter didn't get a good and widely +applicable JIT in the last year. The reason for that was that we discovered that +although the basic idea to generate JIT compilers is good, the concrete +prototype made during the EU period is basically flawed. It could have +been pushed a bit farther, but would have run into deep troubles eventually. One +of the problems would have been performance instability: change a seemingly +unrelated bit in your source program, and the performance changes in unexpected +ways, which is clearly not desirable. Another problem with that old approach is +that too much assembler code is generated, leading to memory problems, and also +that the generated assembler is bad in various ways, e.g. it is hard in that +approach to do proper register allocation.

                +

                Therefore we decided that it would be worthless to pursue this direction much +further. Instead we tried to research approaches to fixing the inherent +problems. This research was largely done in Prolog and I eventually wrote my +Master's thesis about it. From the Prolog work we got some good insights into +what needs to be done and what sorts of techniques are needed. Also, it inspired +Armin to do some more exploration on a small Python prototype which used the +lessons learned from Prolog and also some additional ideas from tracing JITs. So +far, however, the prototype is neither in RPython, nor much integrated with +PyPy.

                +

                This research is not the only thing happening in the JIT-area. During the last +year, Antonio Cuni was working on bringing the JIT to pypy-cli. This +consisted mostly of writing a .NET backend for the old JIT-generator. Some +further work is being done since August by John Witulski, who is writing an +AMD64 backend for the JIT-generator for his Bachelor's thesis.

                + +

                Where to go from there

                +

                During the sprint we discussed in which directions we should continue now. We +plan to work quite a bit on the JIT in the coming months. Both Armin and Anto +are in Düsseldorf for four months, and they and I plan to mostly work on the +JIT (as well as giving a lecture on "Dynamic Programming Languages", trying to +ensnare some more students).

                +

                The first step will be to experiment a bit more with Armin's prototype. So far +it looks rather promising, but there are some unsolved issues that we need to +look into first. The first issue is to think a bit about how to efficiently do +profiling to compile only important code paths. The other large issue is the +so-called "virtualizables". Roughly speaking, they are the frame objects of the +interpreter from which the JIT is generated. They need special treatment, +because on the one hand it is important that they get optimized away to make the +code fast, since the frames are accessed all the time for the local variables; +on the other hand they should still be usable for introspection if code is +around that is trying to look into them.

                +

                When this is done, the prototype needs to be ported to RPython, which is a +non-trivial task, since it is rather dynamic so far (it is rather important that +the unresolved issues are done before the porting, because once the prototype is +in RPython, experimentation will be harder). The porting has the potential to be +tedious, but in a sense it is "just work", as opposed to unclear research.

                +

                At this point it will become important to think about the backend interface. The +interface that the old frontend used to produce assembler code won't be usable +for the new approach, so things need to be rearranged slightly. Afterwards the +backends will have more information and be invoked at a slightly higher level, +which should allow them to produce better code.

                +

                When all this is done, the JIT generator will be in a rather good state and it +should become possible (modulo a lot of details, of course), to use it on the +Python interpreter.

                +

                Conclusion

                +

                I am intentionally not attaching any time estimates to this blog post. So far +our time estimates have not been very accurate when it comes to the JIT, which +only led to disappointment when the JIT failed to materialize. We hope that we +will progress in interesting ways in the next four months, but who knows. Note +that we are really quite disappointed ourselves that it took so much longer than +we planned and hoped. The reason for this is mostly that this work really is +research and sometimes it is just hard to predict what sort of problems turn up. +Partial evaluation (the basis for our JIT generator) is a 30-year-old technique +that was always just promising and never really successful, so the fact that we +think we can solve its problems in a few years is very much hubris anyway :-). +On the positive side, we think that we now know these problems much better than +ever before and that we have a plan that has a chance to succeed.

                +

                Also we are still convinced that our approach has huge potential, despite the +difficulties. If we manage to pull it off, it should be significantly simpler to +support new language features in the JIT and also to get speedups on some rather +interesting bits of the language. Some ideas we are having include generating a +JIT for the regex engine or speeding up ctypes-bindings to be nearly as fast as an +extension module (or faster?). Also the JIT will be such that by construction +the JIT-generated code behaves identically to the original code, which isn't +always true for Psyco, for example.

                +
                +
                +
                +
                + + Luis wrote on 2008-10-14 19:20: +
                +
                +

                Thank you very much for this post Carl! I have a couple of questions:
                You said you were experimenting with some ideas from tracing JITs. I wonder how much the new javascript VMs are influencing your work in pypy. Were these techniques considered from the beginning or is this just because of the latest success of tracemonkey? And if so, are there ideas from Chrome's v8 or SquirrelFish that could be applied too?
                Do they change in any way your expectations regarding the potential of pypy concerning speed?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-10-15 05:42: +
                +
                +

                Hi,

                I guess you will know the work done about hybrid frames in VisualWorks, but since you mentioned the problem, anyone else could benefit from the following link:

                https://pages.cs.wisc.edu/~cymen/misc/interests/oopsla99-contexts.pdf

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-10-15 13:21: +
                +
                +

                @luis: we definitely read same papers (by Michael Franz and others) as tracemonkey authors. Work on tracing jit for pypy is older than release of tracemonkey (it's even older than first work on tracemonkey). Regarding chrome's v8 it seems the main optimization is implementation of hidden classes, which we kind of get for free combining jit and our existing optimization called shared dict.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-10-27 05:59: +
                +
                +

                From the sound of it, it sounds like it would be useful to have a 64-bit version of Psyco, otherwise there is no stopgap in the meantime...

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-01 13:50: +
                +
                +

                When will PyPy match Psyco's speed?

                +
                +
                +
                +
                + + Anonymous wrote on 2009-01-18 20:40: +
                +
                +

                Yes, especially on Linux, everything is moving to 64-bit. If there's no 64-bit Psyco, you can't get the benefits of 64-bit Python (memory) and use Psyco at the same time.

                +
                +
                +
                + +

                Sprint Discussions: C++ Library Bindings

                + +
                +

                At the beginning of this year, PyPy grew ctypes support, thanks to generous +support by Google. This made it possible to interface with C libraries from +our Python interpreter, something that was possible but rather tedious before. +What we are lacking so far is a way to interface to large C++ libraries (like +GUI libraries). During the sprint we had a brainstorming session about possible +approaches for fixing this shortcoming.

                +

                For CPython there are a number of approaches in common use:

                +
                + +
                +

                Those all have the property that they produce some code that is then compiled +with a compiler to produce a CPython extension. The produced code also uses +functions from CPython's C-API. This model is not simple to use for PyPy in its +current state. Since PyPy generates C code automatically, a fixed C-level API +does not exist (it is not unlikely that at one point in the future we might have +to provide one, but not yet). At the moment, PyPy very much has a "Don't call +us, we call you"-approach.

                +

                A very different approach is followed by the Reflex package, which is +developed at CERN (which has an incredible amount of C++ libraries). It is not +mainly intended for writing Python bindings for C++ libraries but instead +provides reflection capabilities for C++. The idea is that for every C++ shared +library, an additional shared library is produced, which allows together with +Reflex to introspect properties of C++ classes, methods, etc. at runtime. These +facilities are then used for writing a small generic CPython extension module, +that allows CPython to use any C++ library for which this reflection information +was generated.

                +

                This approach is a bit similar to the ctypes module, apart from the fact +that ctypes does not use any reflection information, but the user has to +specify the data structures that occur in the C code herself. This makes it +sometimes rather burdensome to write cross-platform library bindings.

                +

                For PyPy the approach seems rather fitting: We would need to implement only the +generic extension module and could then use any number of C++ libraries. Of +course some more evaluation is needed (e.g. to find out whether there are any +restrictions for the C++ code that the library can use and how bothersome it is +to get this reflection information for a large library) but so far it seems +promising.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-10-14 17:39: +
                +
                +

                I've done a fair amount of complicated Boost.Python wrapping, and also implemented a small replacement for it with most of the complexity removed. There are two main reasons why Boost.Python is so complicated:

                1. It supports arbitrarily complex memory and sharing semantics on the C++ classes (and is runtime polymorphic on how the memory of wrapped objects is managed).

                2. It supports arbitrary overloading of C++ functions.

                If you remove those two generality requirements (by requiring that wrapped C++ objects are also PyObjects and banning overloading), it's possible to write very lightweight C++ bindings. Therefore, I think it's critical to factor the C/C++ API design so that as much of it as possible is writable in application level python on top of a small core that does the final C++ dispatch.

                For example, if you wrap a C++ vector class with a bunch of overloads of operator+ in Boost.Python, each call to __add__ has to do a runtime search through all the overloads asking whether each one matches the arguments passed. Each such check does a runtime search through a table of converters. It would be a terrible shame if that overhead isn't stripped by the JIT, which means it has to be in python.

                I think a good test library for thinking about these issues is numpy, since it has some memory management complexity as well as internal overloading.

                I could go on, but it'd probably be better to do that via email. :)

                Geoffrey

                +
                +
                +
                +
                + + René Dudfield wrote on 2008-10-14 21:05: +
                +
                +

                Please add a C API :)

                Once that is done, it's lots easier to interface with the outside world.

                For a lot of C++ apis I find it easy enough to write a C api on top of it.

                In fact many C++ apis provide a C API. Since that makes it easier to work with different C++ compilers. As you probably know different C++ compilers mangle things differently.

                It is possible to look at C++ code at runtime. You just need to be able to interpret the C++ symbols. I know someone did a prototype of this for vc6 on windows. He parsed the symbols, and then created the functions at run time with ctypes. However the approach is not portable between platforms, compilers, or even different versions of compilers. Of course this didn't allow you to use many of the C++ features, but only some.

                If you look at how swig works, you will see it kind of generates a C API for many C++ things.


                For libraries, it is custom to provide a C API. It just makes things easier.

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-15 01:11: +
                +
                +

                you might want to look at PyRoot [1,2,3] which is using the Reflex library to automatically wrap (and pythonize) the C++ libraries/types for which a Reflex dictionary has been (beforehand) generated.

                theoretically any piece of C++ can be wrapped as Reflex is using gccxml[4] to extract information from a library and to generate the dictionary library.

                Using it in one of CERN's LHC experiment which makes heavy (ab)use of templates (Boost) I can say that we almost had basically no problem.
                Usually the only problems we got were either at the gccxml level (resolution of typedef, default template arguments,...) or at the gccxml-to-reflex level (mainly naming conventions problems interfering with the autoloading of types at runtime)

                Being a client of gccxml is rather annoying as the development is... opaque.

                I know the Reflex guys were investigating at some point to migrate to an LLVM version (with GCC as a frontend) to replace gccxml.

                [1] https://wlav.web.cern.ch/wlav/pyroot/
                [2] https://root.cern.ch/viewcvs/trunk/bindings/pyroot/
                [3] https://www.scipy.org/PyRoot
                [4] https://www.gccxml.org/

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-15 11:16: +
                +
                +

                There's been some (small) discussion in the SWIG project of making an alternative output method which creates a simple C API for a C++ project, and wraps that with ctypes (generating the python side of the ctypes bindings, too). So far, this is purely theoretical, but all the pieces needed to do it are present in the SWIG source code. If reflex doesn't work out, this might be a reasonable alternative approach.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-10-15 13:41: +
                +
                +

                Wow. A lot of very informative posts. We'll definitely look to evaluate more what you all posted. Also, in case you want to discuss more, mailing list is usually better place for discussions. Feel free to send new ideas or more detailed info there.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2008-10-16 00:44: +
                +
                +

                fyi check out Elsa ( https://www.cubewano.org/oink ). It is much better than Reflex.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-10-16 10:57: +
                +
                +

                illume: Adding a C-API is rather hard, and probably not on our todo list, unless somebody pays for it :-).

                anonymous: From a quick glance I am not sure Elsa would really help. Yes, you can use it to parse C++ headers and get information about them. But as far as I see it, you cannot use it to create shared libraries that can be used to dynamically construct classes and dynamically call methods on them. Besides, the idea is to have a solution that works on both CPython and PyPy. Reflex already has a way to bind C++ libraries to CPython, so we only need to do the PyPy part.

                Anyway, if anybody is interested in more detailed discussions, we should all move to pypy-dev.

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-19 09:40: +
                +
                +

                I also have done a fair amount of Boost.Python wrapping. I even created a code generator for it - Py++( www.language-binding.net).

                The problem you want to solve is very complex. Exposing C++ code, as is, is not enough. You will have to create "bridge" between different concepts.

                For example C++ and Python iterators. In C++, in order to get the iterator value, you have to dereference it. In Python, you just have it( value ).

                This is just a single example, I have others.

                If you continue with the project - I would like to be involved.

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-21 02:48: +
                +
                +

                Roman,

                I haven't looked at the code of Boost.Python since a long time, but the way "we" do the pythonization of the iteration over STL sequences is rather simple.

                when one writes:
                foos = std.vector('FooKlass')()
                # fill foos
                # iterate
                for foo in foos:
                print foo.value()

                what the PyROOT/Reflex layer is doing is looking at the dictionary for the std::vector(FooKlass), discovering that there is a pair of functions 'begin' and 'end' and it figures out one can create a python iterator from that pair.

                anyways, as Maciej pointed it out, we could try to move this discussion here[1] or there[2]...

                cheers,
                sebastien.

                [1] https://codespeak.net/pipermail/pypy-dev/2008q4/004847.html

                [2] https://codespeak.net/pipermail/pypy-dev/2008q4/004843.html

                +
                +
                +
                +
                + + Anonymous wrote on 2008-10-30 12:27: +
                +
                +

                Just to let you know, there is an upcoming paper in PythonPapers.org review on the topic of C++ wrapping in Python. Just watch out!

                +
                +
                +
                +
                + + ilkosta wrote on 2008-12-09 14:08: +
                +
                +

                maybe it's also worth evaluating the library xrtti, a Reflex comparable library but without CERN and ROOT dependencies.

                +
                +
                +
                + +

                Sprint Discussions: Release Planning

                + +
                + + + +

                One of the discussions that happened during the sprint was about how to approach +the next PyPy release. There hasn't been a release since the end of the EU +period, which is not an optimal situation. Therefore we plan to make a 1.1 +release at the beginning of next year, ideally before Pycon US. We'd also like +to move towards time-based releases. This will be greatly helped by the +new buildbot infrastructure, which allows us to decide when the +state of the codebase is stable enough to release.

                +

                Another goal of the release is to involve more people from the wider PyPy +community by having bugdays and generally asking for more support. This will be +particularly useful for bugs on platforms that no one in the core developer +group is using.

                +

                Feature-wise the release will mostly contain CPython 2.5 language support, +including some new extension modules (like ctypes, expat, sqlite). +In addition we plan to make it easier to actually install and use the PyPy +Python interpreter, which means some sort of proper installation procedure and +supporting distutils on top of PyPy. Another part of the release will be +support for fully sand-boxing an interpreter.

                +

                Additionally there were also a large number of improvements on several levels +since the last release, like optimizations, faster oldstyle-classes, better +GCs, correct finalization behaviour, lots and lots of bugfixes, better +threading support (still with the GIL), some work on improving memory +behaviour, ...

                +

                In contrast to our last release, we will focus mainly on PyPy's Python +Interpreter and more particularly its C-version. There are also various +experimental interpreters that PyPy contains, like for Prolog, Smalltalk, +JavaScript and Scheme. We also don't intend to put the LLVM and JavaScript +backends in the release, since they are essentially unmaintained and at least +partially broken. If anybody is particularly interested in one of these +components, please feel free to step up and take responsibility for them. +Another thing that the release won't contain is a JIT. I plan to make another +blog-post about this soon, stay tuned.

                +
                +
                +
                +
                + + Michael Foord wrote on 2008-10-12 17:58: +
                +
                +

                Looking forward to news on the JIT.

                +
                +
                +
                + +

                Düsseldorf Sprint Report Days 1-3

                + +
                +

                The Düsseldorf sprint is currently in full progress and this post will try to +summarize what progress has been made in the last days. We are (again) sprinting +at the STUPS group of the Düsseldorf University. You can find the sprint +announcement and the daily planning file.

                +

                Holger and Samuele put quite some effort over several days into setting up and +improving PyPy's testing infrastructure. PyPy has a variety of tests. On the one +hand, there are of course our own tests. But then we also have the CPython tests +that should be run on top of pypy-c. Up to now we used a custom-made pile of +hacks, held together by lots of duct-tape. It consisted of a variety of +different machines running different things with different reporting solutions. +Some of the old test-results can still be found on wyvern. Now we are moving +to a buildbot based solution together with a custom reporter to have a view +similar to the old one. Some details are not quite finished yet, but most of the +things are already working rather well (currently all the results displayed +are from the 2.5-merge branch).

                +

                Another large (and ongoing) topic of work is the 2.5 branch. It contains the +work done by our Summer-of-Code student, Bruno Gola, of adding CPython 2.5 +features to PyPy's Python interpreter. While Bruno implemented most language +features and imported the 2.5 stdlib into PyPy, a lot of details were still +missing. In the last days nearly everybody worked on fixing small issues and +failing stdlib tests. While doing that we tried to categorize some CPython tests +as implementation dependent so that we can skip them when running on PyPy.

                + +

                Memory Improvements

                +

                One goal of the sprint is to measure and to reduce the memory behaviour of our +Python interpreter. The idea is to make pypy-c a realistic option for use on +embedded devices. By memory behaviour we mean both the +dynamic memory usage (how many bytes does a dict or an instance take) as well as +the size of the executable and details of the GC strategy.

                +

                Alexander, Carl Friedrich and Antonio did some work on analyzing the static data +that a pypy-c executable contains. Our executables have the tendency to be +rather large, both due to a lot of code and due to a large amount of static +data. The analysis didn't give any really surprising results, the problem is +mostly that we have a lot of static data originating from a bit everywhere in +our program. Two big offenders are the unicodedata-module with about 750 KB +of static data and the multimethod-tables with about 150 KB of data.

                +

                Armin, Iko, Anto and Maciek worked on a new approach to malloc-removal. This is +(for PyPy) a crucial optimization of the translation toolchain that performs +escape analysis to find out which objects don't outlive the frame they were +allocated in. Since RPython is garbage-collected we usually have a lot of +allocations, so it is important to statically get rid of many of them. To +successfully do that, some inlining is needed to give the analysis more context. +This leads to the fact that we have rather aggressive inlining-settings to allow +as much malloc-removal as possible. The new approach tries to inline functions +only if this actually leads to the successful removal of a malloc operation. The +code is not finished quite yet, so it remains to be seen how successful it will +be.

                +

                Before the sprint Maciek had started to work on a mark-compact GC for PyPy. The +idea is that it is better for memory-constrained-environments because it does +not double the memory-requirements during collections. During the sprint Armin +and Maciek worked on cleaning up the code a bit and then merging the branch. +An interesting property of the mark-compact GC is that after a collection all +the memory that is not currently used by the program is returned to the +operating system. Right now the GC is not as fast as our more mature ones, but +it probably will be the basis for future tweaking.

                +

                A small thing that was done by Alexander and Carl Friedrich to make objects smaller is +to enable shared instance dictionaries also for instances of old-style +classes. Before it worked only for instances of new-style classes. Shared +instance dictionaries are a way to reduce the memory-usage of instances. In the +optimal case, it gives the same memory-savings that __slots__ are giving, +but without any behavioural changes. Conceptually it is very similar e.g. to +the notion of "map" in the Self project, or the hidden classes that Google Chrome's V8 +is using (click on the link, there are nice graphics). The +difference is that for them it is mostly a way to get faster attribute access, +and PyPy is so far only using it for memory savings (but that might change in +the future).

                +

                In parallel to all the other work, John Witulski worked tirelessly on advancing +the AMD64-JIT-backend. John has the implementation of this backend as the topic +of his Bachelor's thesis. He is progressing quite well (especially also +considering that this is his first sizeable Python project ever), just sometimes +being impaired by such annoyances as errors in the official Intel documentation. +By now the backend is supporting many integer operations and control flow.

                +
                +
                +
                +
                + + René Dudfield wrote on 2008-10-12 07:55: +
                +
                +

                Hello,

                sounds like some fun sprinting :)

                Have you considered mmap for some of those big memory users?

                Especially for unicode stuff, which mostly won't be used for many applications, it should be a good win -- both for load time, and memory use.

                Double plus extra combo win!!! for if you use multiple processes.

                cu,

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-21 18:05: +
                +
                +

                I'm not sure of what you mean.
                But modern operating systems (at least Linux) use on-demand loading of executables and libraries, so they never copy anything from an executable file to memory unless it is used.

                In fact, process startup is implemented internally by mmap()ing the executable and libraries into the address space of the new process.

                If you use multiple processes, it still works well - also data pages are shared, until some process writes to them.

                For MMU-less devices, the above does not apply (and ucLinux allows Linux to run on them).
                But in that case, I guess that no demand loading is available, and that mmap() copies the mapped data in memory - you need to explicitly swap in and out code segments (i.e. to use good old overlays), and no modern programming environment has direct support for them any more I guess.

                You can still emulate overlays with advanced usage of linker scripts however - you put some code in a section, create variables containing the begin and end offset of that section in the linker script, and copy data in memory from that section; but I think that making relocations to that code work flawlessly is impossible, you need to always refer to the buffer containing loaded data.

                +
                +
                +
                + +

                Prolog-JIT Master's-Thesis Finished

                + +
                +

                As we already blogged, in the last half-year or so, Michael Leuschel, Armin +and I did a lot of JIT generator work on a Prolog prototype. The idea was to +experiment more quickly with some techniques than what would have been possible +with RPython. These experiments were quite successful in themselves. With very +little code we managed to get a JIT that is not doing too badly when compared to +existing projects for Prolog.

                +

                This Prolog work was also the subject of my Master's thesis. I finished the +thesis about two weeks ago (and since then have been mostly sleeping and then +sprinting). The thesis should be self-contained when it comes to explaining the +JIT concepts but needs knowledge of Prolog to be understandable.

                +
                +

                PyPy/Python at the Maemo summit

                + +
                +

                Maciej and me visited the Maemo Summit in Berlin - +a community meetup around Nokia's Linux based +mobile platform. We spontaneously did a lightning +talk about a first running pypy-c on Maemo +and got nice feedback. + +

                +

                We also had a nice lunch with guys from the INDT in Brazil, including Marcio Marcedo and Marcelo Eduardo. It turns out that Python is used a lot on Maemo, for example the nice Canola UI is done with it. Will be interesting to see how this shapes up in relation to the iPhone and Android. + +

                +

                A lot of Nokia engineers were around and they announced that from October on they are going for weekly new releases of their SDK for the new Fremantle (Maemo-5) debian-based platform until the SDK becomes final - if we got this right. + +

                +

                Funnily enough, we met Marius Gedminas from the Programmers of Vilnius - he gave a lightning talk on his impressions as a community member. We think python programmers really should go much more to non-Python centric conferences. + +

                +

                The whole event took place at the C-Base - was a bit +crammed in some of the sessions with something like 200 people attending. +
                +cheers, Maciej and Holger

                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index-9.html b/blog/index-9.html new file mode 100644 index 000000000..4693470d3 --- /dev/null +++ b/blog/index-9.html @@ -0,0 +1,1421 @@ + + + + + + +PyPy (old posts, page 9) | PyPy + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                VM summit: nice to see friendly competition

                + +
                +

                +So Google has launched the unladen swallow project +with this first goal: +

                +
                +    Produce a version of Python at least 5x faster than CPython.
                +
                +

                +We discussed some details with Collin Winter, Jeffrey Yasskin and Thomas Wouters +during the VM summit yesterday. We were a bit confused about usage +of the term JIT, because as far as we understood, it's going to be upfront +compilation into LLVM. In the past we have looked into LLVM + – at one point PyPy extensively used it but it +wasn't clear how we could make good use of it. +They also consider changing to something else than LLVM. It's gonna be +interesting to see how this works out. +

                +

                +It's good to see friendly competition, and we think that we should take up +the challenge and see if we can produce faster pickling, run 2to3 and +Django faster than what they can come up with. We also talked +to IronPython and Jython developers and all agreed that some +common benchmarks would be good. And maybe do weekly +press releases about small speed increases? :) +

                +

                +The idea of the VM summit here in Chicago was to bring together implementors +of various virtual machine languages. There were members of the communities of +IronPython, CPython, GemStone's MagLev, Rubinius, Mozilla's TraceMonkey, Parrot, +Sun's Da Vinci Machine, Microsoft's DLR, Jython and JRuby. +Everybody got to talk 5-10 minutes on their current status and +challenges. It is clear that you cannot begin to cover the +complexities and architectures of the involved projects. +But that wasn't too much of a problem because the rest of +the day everybody freely and dynamically grouped on their +issues of choice. We established some more personal contacts, +was great to chat with people like Andreas Gal from the University of +California, Irvine, who have a very similar idea about the JIT +that we have. Actually, we could probably have mixed our +two presentations and nobody would have actually noticed :-). +

                +

                +At the end of the presentation part, John Rose presented his +slides. John is a Hotspot developer, and while not precisely a dynamic +language implementor, he has a lot of experience in virtual +machine implementation. It's very good to see the JVM being extended towards +supporting dynamic-language specific things, in order to be something +more than just a good platform for Java. We'll probably have +some extra meetup with him the next days. +

                +cheers,
                +holger and fijal +
                +
                +
                +
                + + Anonymous wrote on 2009-03-26 14:21: +
                +
                + So Google has launched the unladen swallow project with this first goal

                I'm not sure this is a Google project. It's hosted on Google code for sure, but anyone can do that. +
                +
                +
                +
                + + Anonymous wrote on 2009-03-27 01:16: +
                +
                +

                All three of the primary developers are Google employees.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-27 03:01: +
                +
                + We were a bit confused about usage of the term JIT, because as far as we understood, it's going to be upfront compilation into LLVM.

                The LLVM supports JIT, so compiling Python into LLVM bytecode will give JIT for free. +
                +
                +
                +
                + + Anonymous wrote on 2009-03-27 03:24: +
                +
                +

                Anonymous#1, this is extremely valuable note to take in an open source world, you know ;-)

                PyPy folks, keep up the good work!

                But... Sometimes I miss updates on this blog. Not in the sense that you slack on it, but in the sense that I miss some "technically intermediate" updates when there is no news on breakthroughs.

                One thing I miss most is the retro style articles on how some things that are "established" now got to be this way. The design by evolution things. Stuff that both educates and helps to get acquainted with the code one might like to hack one day.

                +
                +
                +
                +
                + + Luis wrote on 2009-03-28 02:51: +
                +
                +

                I don't understand: These days, Google's v8 claims to be 56x faster than common javascript, tracemonkey is in the same league, as well as nitro, etc. Way before, psyco sped up python (theoretically) up to c's speed for algorithmic code, and up to 4x for common code.

                Now Unladen Swallow aims to "only" 5x speed up. Isn't it too little, seeing what the above projects are getting nowadays?
                Or am I getting confused by their terminology? (what's exactly the meaning of 5x here?).

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-03-28 14:28: +
                +
                +

                The exact meaning of 5x is I *think* "5x on tests derived from google internal apps". It's a bit little, but note that the great speedups of JS engines are for simple algorithmic code (the one that psyco speedups great).

                It would be a good speedup for a lot of people though (of course we aim to speed up stuff according to JS engines ;)

                Cheers,
                fijal

                +
                +
                +
                +
                + + Luis wrote on 2009-03-28 14:34: +
                +
                +

                Maciej, this is a reply posted on the project's FAQ page:

                Comment by collinw, Today (8 hours ago)

                luismgz: translating Python to Javascript would be easy to implement for about 80% of the language, but you'd hit a wall in implementing that last 20%. Just ask the Jython, PyPy? and IronPython? teams how hard 100% compatibility is. They've done some really heroic work to implement every dark and musty corner of the language, and I think they'd be the first to tell you that it's easy to get something like the Fibonacci function working, but things like metaclasses are a different story. We hope to side-step that by reusing as much of CPython as possible.

                Psyco's claimed benefits of 100x speed-up on algorithmic code is rarely seen in real applications. It can certainly be used to optimize hotspots that fit Psyco's profile, but in examining the performance of some Google applications that use Psyco, we found that they see only a ~10% improvement in overall CPU usage. While that might be a valuable savings for some applications, it's not 100x, nor even the 2-4x low-end estimate that I've seen in the Psyco docs.

                Are our performance goals too modest? We don't think so. Our team is small -- only two full-time engineers -- and we want to allow for unexpected surprises along the way. We feel that 5x is a good goal for the time being, especially given that we may need to make changes to LLVM along the way. If things go astoundingly well, we may raise those numbers, but for now, we're comfortable with our stated goals.

                +
                +
                +
                +
                + + Luis wrote on 2009-05-28 23:35: +
                +
                +

                Maciej said: "It would be a good speedup for a lot of people though (of course we aim to speed up stuff according to JS engines ;)"

                Whatever happened to the "secret goal" of being faster than c...?

                +
                +
                +
                + +

                PyPy talk at OpenBossa 09

                + +
                +

                Yesterday i gave my PyPy status/mobile perspectives at OpenBossa, Nokia's developer conference for embedded platforms in Brazil. Found it a bit of a tough task to do that in 50 minutes. I had some 50, later more developers attending the talk and was happy with the questions and the feedback. Guess it's a good sign if the number of people grows during a talk :) It was the first time i tried to work more with pictures and actually used some deviantart photos from Marikaz to mark section transitions. I summarize/highlight some key points here in the post.

                +

                After intro and 2.5 compatibility status, i talked about our measurements of PyPy's Python on Nokia's N810 internet tablet. The best bit is that for almost all Python data structures PyPy has smaller memory representations than CPython. Particularly good are class instances which often score at 50% of CPython's sizes. Startup time is also often better and can be improved. On the bad side, PyPy's quite large base interpreter size and its bytecode execution is often worse. In the talk i also outline ideas for "perfect PYC files" for minimizing module import times and maximizing sharing across interpreter processes. I also briefly discussed the PyPy situation with extension modules and regarding C++ libs. Most of these ideas arose from sprint discussions last year. In the morning i also had some good talk with Stefan Seefeld about Boost Python and the new QT4 bindings. Maybe to use Boost Python is also a good opportunity - but PyPy does not currently have a C-level or C++ level API.

                +

                In subsequent lunch discussions people agreed that PyPy has three main interesting areas currently:

                +
                  +
                • the Python Just-In-Time Compiler
                • +
                • a virtualized, sandboxed Python interpreter
                • +
                • an efficient Python interpreter for small devices
                • +
                +

                I think our upcoming 1.1 release will be a good point in time for many people to look some more into PyPy. I hope we are crossing the chasm soon. It's been a while since the project started :) Getting some more sponsoring to sustain and increase our current efforts probably wouldn't hurt.

                +

                Now i am off to spend my last day in Recife / Brazil, fly back to Germany in the evening and then spend time on preparing for Pycon 2009. And I guess i am going to enjoy some naturally cold air - at least my two jogging sessions at Brazilian beaches, at a sustained 30 degrees celsius, were tough. I guess i shouldn't complain, though :)

                +

                Was great meeting all the Brazilian guys and the few women - just had breakfast with Kate Alhola, kernel hacker and working on the new "Fremantle" graphical platform. Many thanks go to Marcio Marcedo and the Python team at INDT who invited me here. Hope to come again next year and eventually talk more about the Zone VM :)

                +

                If you are interested in some more not so pypy-specific bits about the conference and what i experienced, you might head over to my tetamap blog.

                +

                holger

                +
                +
                +
                +
                + + Mikko Ohtamaa wrote on 2009-03-12 22:13: +
                +
                +

                Hi Holger,

                About start up times: We have researched them a lot when developing few applications on Nokia's PyS60.

                Our conclusion, for now, is that it's the imports and module body bytecode execution (loading modules, classes, creating functions) which take most of the time during start-up. Unfortunately there is no real way to speed up this process, except lazily trying to load all your code.

                We have experimented with an unexec()-like solution. Unexec() was an old Emacs trick where a.out binary code and data segments are dumped to disk. When the application is loaded for the next time, this dump is just blitted to memory and execution continues. Kind of application level hibernation. You wouldn't actually need to distribute .pyc files at all for embedded devices, you could just give out a target specific binary dump containing ready memory layout.

                Of course, it is not so straightforward on modern system with DLLs and other funny pointers. We got some tests working with CPython and PyS60 emulator - but there would be tons of work to make it actually usable (patching all system libs and DLL loads to be suspend friendly).

                Some discussion here:

                https://mail.python.org/pipermail/python-dev/2008-December/084466.html

                Please reply at mikko (at) redinnovation (dot) com if you are interested in hearing more.

                +
                +
                +
                +
                + + Mikko Ohtamaa wrote on 2009-03-12 22:14: +
                +
                +

                God I hate too small edit boxes.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-13 05:24: +
                +
                +

                Do you have a link for those Qt4 bindings?

                +
                +
                +
                +
                + + Paddy3118 wrote on 2009-03-13 07:35: +
                +
                +

                On too small edit boxes: I use: It's All Text! for Firefox, together with Vim.

                - Paddy.

                +
                +
                +
                +
                + + holger krekel wrote on 2009-03-15 20:19: +
                +
                +

                Hi Mikko!

                thanks a lot for your comment and the pointer to the python-dev thread!

                Like Martin von Loewis i'd be very interested to know more numbers regarding how the time for python imports is usually spent - i'd suspect the major bit comes from unmarshalling and the involved malloc/copying of data work. If that is true then what i presented in the talk as "perfect pyc files" is probably a good idea. It's basically what Martin suggested.

                I find the unexec ideas interesting, especially on platforms where fork does not exist. PyPy could probably have a very compact interpreter state representation if we perform garbage collection before writing to disk. When using moving GCs those objects would map very compactly into the oldest-generation memory and thus be mostly avoided by subsequent GC collects.

                Of course, there also is time consumed for linking DLLs - only forking is efficient in avoiding this overhead. But it doesn't exist on Symbian, right?

                If you have any more info on the exact numbers on import times, i'd be very curious. We might also have some numbers from PyPy - need to check.

                I am also available on holger.krekel at gmail com. You are also very welcome to post to pypy-dev (https://codespeak.net/mailman/listinfo/pypy-dev)

                cheers,
                holger

                +
                +
                +
                +
                + + holger krekel wrote on 2009-03-15 20:39: +
                +
                +

                anonymous: pypy does not have qt4 bindings yet.

                paddy3118: thanks! i'll check this out once i get firefox upgrade on the ubuntu machine i am currently using. (why are we in 2009 still having this concept of "installing" apps/plugins, let alone finding a matching one?)

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-16 00:57: +
                +
                +

                I was referring to this:
                "In the morning i also had some good talk with Stefan Seefeld about Boost Python and the new QT4 bindings."

                From that it sounds like there are new Qt4 bindings for CPython somewhere, using Boost. I have tried searching, but was not able to find anything.

                +
                +
                +
                +
                + + holger krekel wrote on 2009-03-16 08:56: +
                +
                +

                Anonymous, i also only found the 2005 announcement. I mailed Stefan to find out some more. Maybe it's just existing in some developers repository as of yet. I'll let you know if i find out something more actual.

                +
                +
                +
                +
                + + holger krekel wrote on 2009-03-16 12:04: +
                +
                +

                Anonymous: ok, seems like recent bindings use SIP, see https://www.riverbankcomputing.co.uk/software/pyqt/download

                not sure about the status of boost cpython based qt bindings.
                holger

                +
                +
                +
                +
                + + arman wrote on 2009-03-20 21:29: +
                +
                +

                I wonder if an unexec like functionality can be developed by developing a mechanism for pickling the current interpreter state (e.g. loaded modules).

                +
                +
                +
                + +

                Good news everyone!

                + +
                +

                +A quick update from the JIT front. As of yesterday, we're now able to translate +a highly-experimental Python interpreter that contains JIT. It mostly crashes +immediately, mostly due to some unsupported operations in the assembler backend, +but for a carefully crafted program, we're able to get massive speedups. +For something as complex as: +

                +
                +  i = 0
                +  while i < 10000000:
                +   i = i + 1
                +
                +

                +our JIT is about 20x faster than CPython. That's still about 3x slower than +Psyco, but looking at assembler code it's obvious that we can speed it up +a lot. This is very good news, since we don't encode python semantics at +all in the JIT. The JIT is automatically generated from the Python interpreter +source code. This means we should be able to expand it to handle more complex +python programs relatively quickly (interested assembler experts needed!). +

                +

                +This is actually the fifth incarnation of JIT that happened over the last +two years. It's by far simpler and more promising than any of the previous +approaches. Expect more details soon! +

                +Cheers,
                +fijal +
                +
                +
                +
                + + Anonymous wrote on 2009-03-10 16:14: +
                +
                +

                Very exciting news indeed.
                Congratulations!

                +
                +
                +
                +
                + + Zemantic dreams wrote on 2009-03-10 16:49: +
                +
                +

                This is exciting. The world is waiting.

                (I am still sad that Psyco development was discontinued and never ported to 64bit)

                +
                +
                +
                +
                + + nekto0n wrote on 2009-03-10 17:34: +
                +
                +

                Great news!
                Activity in blog shows that project is full of enthusiasm.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-10 18:08: +
                +
                +

                wow, that's really great =)

                +
                +
                +
                +
                + + Eric van Riet Paap wrote on 2009-03-10 18:33: +
                +
                +

                Congratulations! Very nice to read about these milestones.

                I did not follow llvm development but does anyone know if they made some arrangements by now that would enable the PyPy JIT generator to leverage their optimizers?

                +
                +
                +
                +
                + + Harold Fowler wrote on 2009-03-11 12:16: +
                +
                +

                Wow, you are right, that is good news.

                RT
                www.privacy.at.tc

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-11 16:57: +
                +
                +

                I'm wondering why something like this would be faster than CPython? New to the whole python scene so I'm really just curious.

                +
                +
                +
                +
                + + René Dudfield wrote on 2009-03-11 19:59: +
                +
                +

                nice one :)

                In the mean time... I wrote an optimized version of that program for CPython:

                i = 10000000

                CPython is 10000000x faster than the pypy jit!!!!!!

                +
                +
                +
                +
                + + Tim Wintle wrote on 2009-03-12 02:48: +
                +
                +

                Congratulations!

                This is very exciting.

                @Anonymous - it's because the standard python interpreter doesn't use a JIT, which makes dynamic languages quite slow.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-07 15:42: +
                +
                +

                I'm waiting for production solution!

                +
                +
                +
                + +

                JIT - a bit of look inside

                + +
                +

                +The previous post about our JIT explained a bit from the 1000 km +perspective how the tracing JIT would approach a language like Python. +

                +

                +I would like to step a bit inside and give a zoom to some of its features that +are already working. +While probably not the most innovative, I think it's very nice to look +at the way we work with the JIT and what tools we use. +

                +

                +The main cool thing is that you can work on and try the JIT (including trying +it on the Python interpreter!) without even generating a single bit of +assembler. How? Let's start with something very simple. Let's take +a simple interpreter for language X. +

                +

                +Language X has 3 opcodes: CO_INCREASE, CO_DECREASE and CO_JUMP_BACK_3. +CO_INCREASE increases the accumulator by one, CO_DECREASE decreases +it by one, CO_JUMP_BACK_3 jumps 3 opcodes back, if the accumulator is smaller +than 100 (this is only to keep some halting condition possible). +The interpreter for language X looks like this:: +

                +

                +

                +
                +    jitdriver = JitDriver(greens = ['i'], reds = ['res', 'a'])
                +    code = [CO_INCREASE, CO_INCREASE, CO_INCREASE,
                +            CO_JUMP_BACK_3, CO_INCREASE, CO_DECREASE]
                +            
                +    def add(res, a):
                +        return res + a
                +
                +    def sub(res, a):
                +        return res - a
                +
                +    def main_interpreter_loop(a):
                +        i = 0
                +        res = 0
                +        c = len(code)
                +        while i < c:
                +            jitdriver.jit_merge_point(res=res, i=i, a=a)
                +            elem = code[i]
                +            if elem == CO_INCREASE:
                +                res = add(res, a)
                +            elif elem == CO_DECREASE:
                +                res = sub(res, a)
                +            else:
                +                if res > 100:
                +                    pass
                +                else:
                +                    i = i - 3
                +                    jitdriver.can_enter_jit(res=res, i=i, a=a)
                +                    continue
                +            i = i + 1
                +        return res
                +
                +

                +All very simple code, except the jitdriver hints, which instruct JIT how to +behave (they are the equivalent of the ``add_to_position_key`` of the last blog +post). +

                +

                +Let's look how this code is processed. This will also give a glance +at how we work in this code. This particular piece can be found +on a branch in pypy/jit/metainterp/test/test_loop.py +and can be run with ./test_all.py jit/metainterp/test/test_loop.py -k test_example -s --view from pypy directory. The -s option lets you see the debugging output, while +--view will show you some graphs. So, let's look at graphs in order: +

                + + +And the same picture with a bit of zoom for the first block: + + +

                + +This is the call graph of an interpreter loop, nothing magic so far. This is an +intermediate representation of translation toolchain input. If you look around +you can follow how the opcodes are dispatched (with a chain of ifs) and helpers +called. Next graph is very boring, because it's a bit lower level representation +of the same thing (you exit with q or escape btw :). +

                +

                +When we exit the graph viewer, we can see the trace generated by interpreting +this graph with a given bytecode (variable code in paste above). It's something +like: +

                +
                +
                +        [compiler] ENTER
                +        [runner:cpu]    call__4 [(''), * GCREF hidden, 0] -> 0
                +        [runner:cpu]    int_eq [0, 0] -> True
                +        [runner:cpu]    int_add [9, 1] -> 10
                +        [runner:cpu]    int_add [0, 1] -> 1
                +        [runner:cpu]    int_lt [1, 6] -> True
                +        [runner:cpu]    call__4 [(''), * GCREF hidden, 1] -> 0
                +        [runner:cpu]    int_eq [0, 0] -> True
                +        [runner:cpu]    int_add [10, 1] -> 11
                +        [runner:cpu]    int_add [1, 1] -> 2
                +        [runner:cpu]    int_lt [2, 6] -> True
                +        [runner:cpu]    call__4 [(''), * GCREF hidden, 2] -> 0
                +        [runner:cpu]    int_eq [0, 0] -> True
                +        [runner:cpu]    int_add [11, 1] -> 12
                +        [runner:cpu]    int_add [2, 1] -> 3
                +        [runner:cpu]    int_lt [3, 6] -> True
                +        [runner:cpu]    call__4 [(''), * GCREF hidden, 3] -> 1
                +        [runner:cpu]    int_eq [1, 0] -> False
                +        [runner:cpu]    int_eq [1, 2] -> False
                +        [runner:cpu]    int_gt [12, 100] -> False
                +        [runner:cpu]    int_sub [3, 3] -> 0
                +        [compiler] LEAVE
                +
                +

                +It's entering JIT, doing some primitive operations for bytecode dispatching +and repeating the loop. Note that at the end of the interpreted loop +(not to be confused with the interpreter loop), we see int_sub [3, 3] +which resets the bytecode position to the beginning. At this time JIT +(instructed by can_enter_jit hint) notices that all green variables +are the same (here only i), +hence we can compile the efficient loop from this point. +

                + + +

                +The loop contains 3 additions and a check (for res > 100), exactly +the same as our interpreted program would do, but completely without +interpretation overhead! +

                +

                +As you might have noticed, there is no assembler involved so far. All of this +instruction execution is done directly, in pure python. In fact, the +code for executing instructions is located in jit/backend/llgraph +which directly interprets instructions. This is by far simpler (and easier +to debug) than x86 assembler. +

                +

                +And this is basically it: the very simple interpreter and a jit for it. +Of course we actually can generate assembler for that. Also the missing +piece is optimizing the generated graphs. While for this example, +by removing the interpretation overhead, we're done, with more complex +examples it's important to further optimize traces. Hopefully this and +how we actually generate assembler will be topics for next blog posts. +

                +Cheers,
                +fijal +
                +
                +
                +
                + + Brent Millare wrote on 2009-03-05 20:25: +
                +
                +

                Great article. I like how it is the simplest case that can explain the most basic work flow. You have code, the interpreter, and the generated code as part of the running JIT.

                +
                +
                +
                + +

                PyPy on Mobiles, at OpenBossa

                + +
                +

                Next week i am going to give a talk on PyPy at OpenBossa, a developer conference on embedded platforms. I've written up a bit more of my background and why i find it very interesting to go there on my blog. Probably will mostly follow up there or on twitter and not much here on the PyPy blog because it's not all about PyPy. To summarize how i see it: i think there is great potential for Python and PyPy on mobiles and am thrilled to hear about what's going on currently and to discuss opportunities.

                +cheers, holger +
                +

                Applying a Tracing JIT to an Interpreter

                + +
                +

                After I had failed once more to explain to someone on IRC what the idea behind +the current JIT generator work of PyPy is, I decided to just write a blog post to +explain it. Here it is :-). The post turned out to be a bit long, so please bear +with me.

                +

                The goal of the post is to give an understanding of how PyPy's JIT generator is +going to work. To do this, I will look at what happens when you write an +interpreter in Java and apply a completely normal tracing JIT to it (for this +reason all the code examples will be in some sort of pseudo-Java). The +resulting generated machine code is bad, so I will explain a way to fix the +occurring problem.

                +

                The techniques I describe here are conceptually similar to what we are doing in +PyPy. The details (as usual) are different. The reasons why I am trying to +explain things in this way is that I can start from tracing JITs, which are a +known existing technique.

                +

                To understand the following, it is helpful to already know a bit how a normal +tracing JIT works. I will give a reminder of how it is working, but there also +exist a couple of more thorough introductions on the web already. +I also will leave out a lot of details about the more detailed workings of +tracing JITs and only explain the things that are relevant to what I am trying +to get to here.

                +

                Tracing JITs

                +

                Tracing JITs are an idea explored by the Dynamo project in the context of +dynamic optimization of machine code at runtime. The techniques were then +successfully applied to Java VMs and are now being used by Mozilla's +TraceMonkey JavaScript VM. They are built on some basic assumptions:

                +
                +
                  +
                • programs spend most of their runtime in loops
                • +
                • several iterations of the same loop are likely to take similar code paths
                • +
                • the best way to gain information about the behaviour of a program is to +observe it
                • +
                +
                +

                The basic approach of a tracing JIT is to only generate machine code for +commonly executed loops and to interpret the rest of the program. The code for +those common loops however should be highly optimized, including aggressive +inlining.

                +

                The generation of loops works as follows: At first, everything is interpreted. +The interpreter does a bit of lightweight profiling to figure out which loops +are run often. When a common loop is identified, the interpreter enters a +special mode (called tracing mode). When in tracing mode, the interpreter +records a history (the trace) of all the operations it executes, in addition +to actually performing the operations. During tracing, the trace is repeatedly +checked whether the interpreter is at a position in the program that it had seen +earlier in the trace. If this happens, the trace recorded corresponds to a loop +in the program that the tracing interpreter is running. At this point, this loop +is turned into machine code by taking the trace and making machine code versions +of all the operations in it.

                +

                This process assumes that the path through the loop that was traced is a +"typical" example of possible paths (which is statistically likely). Of course +it is possible that later another path through the loop is taken, therefore the +machine code will contain guards, which check that the path is still the same. +If during execution of the machine code a guard fails, the machine code is left +and execution falls back to using interpretation (there are more complex +mechanisms in place to still produce more code for the cases of guard failures, +but they are of no importance for this post).

                +

                It is important to understand when the tracer considers a loop in the trace to +be closed. This happens when the position key is the same as at an earlier +point. The position key describes the position of the execution of the program, +e.g. usually contains things like the function currently being executed and the +program counter position of the tracing interpreter.

                +

                Let's look at a small example. Take the following code:

                +
                +int sum_1_to_n(int n) {
                +    int result = 0;
                +    while (n >= 0) {
                +        result += n;
                +        n -= 1;
                +    }
                +    return result;
                +}
                +
                +

                The tracing JIT will at one point trace the execution of the while loop in +sum_1_to_n. The trace might look as follows:

                +
                +guard_true(n >= 0);
                +result += n;
                +n -= 1;
                +<loop_back>
                +
                +

                This trace will then be turned into machine code. Note that the machine code +loop is by itself infinite and can only be left via a guard failure.

                +

                A slightly more complex example:

                +
                +int f(int a, int b) {
                +    if (b % 46 == 41)
                +        return a - b;
                +    else
                +        return a + b;
                +}
                +
                +int strange_sum(int n) {
                +    int result = 0;
                +    while (n >= 0) {
                +        result = f(result, n);
                +        n -= 1;
                +    }
                +    return result;
                +}
                +
                +

                The trace of the loop in strange_sum would maybe look like this:

                +
                +guard_true(n >= 0);
                +a = result;
                +b = n;
                +guard_false(b % 46 == 41);
                +result = a + b;
                +n -= 1;
                +<loop_back>
                +
                +

                This would then be turned into machine code. Note how f was inlined into the +loop and how the common else case was turned into machine code, while the +other one is implemented via a guard failure.

                +

                Applying a Tracing JIT to an Interpreter

                +

                In the rest of the post we will explore what happens when the program that is +being executed/compiled by the tracing JIT is itself a (bytecode) interpreter +for another language.

                +

                A stylized bytecode interpreter for a simple programming language could look as +follows:

                +
                +W_Object interpret(String bytecode, ...) {
                +    Stack<W_Object> stack = new Stack<W_Object>();
                +    int pc = 0;
                +    while (true) { // bytecode dispatch loop
                +        char instruction = bytecode.charAt(pc);
                +        pc += 1;
                +        switch (instruction) {
                +            case ADD:
                +                W_Object arg2 = stack.pop();
                +                W_Object arg1 = stack.pop();
                +                stack.push(do_addition(arg1, arg2));
                +                break;
                +            case SUB:
                +                W_Object arg2 = stack.pop();
                +                W_Object arg1 = stack.pop();
                +                stack.push(do_substraction(arg1, arg2));
                +                break;
                +            case RETURN:
                +                return stack.pop();
                +            case JUMP_BACKWARD:
                +                pc -= (int)bytecode.charAt(pc);
                +                break;
                +            case LOAD_INTEGER:
                +                int value = (int)bytecode.charAt(pc);
                +                pc += 1;
                +                stack.push(new W_Integer(value));
                +                break;
                +            case PRINT:
                +                do_print(stack.pop());
                +                break;
                +            case DUP:
                +                stack.push(stack.peek());
                +                break;
                +            case JUMP_IF_TRUE:
                +                ...
                +            ...
                +        }
                +    }
                +
                +

                If we apply a tracing JIT to this function, it will trace and compile the +execution of one bytecode, because after one bytecode the bytecode dispatch loop +is closed. E.g. it might trace and produce machine code for the execution of a +SUB. (Sidenote: this interpret function is an example where one of the +assumptions of a tracing JIT breaks down: two iterations of the bytecode dispatch +loop are rarely going to follow the same code path, because usually two +consecutive bytecodes encode different instructions).

                +

                The important bit to remember here is that the tracing JIT will produce a +machine code loop that corresponds to the bytecode dispatch loop in the +interpret function. Let's see how we can change that.

                +

                Improving the Generated Code

                +

                If we want to make use of the fact that the program that is being jitted is +itself an interpreter, we need to change the tracing JIT a bit. To be more +precise we add a way for the user of the tracing JIT to add information to the +position key that the tracing JIT uses to decide when a loop is closed. This is +done by a call to a magic function add_to_position_key. This allows the +program writer to influence the tracing JIT's behaviour.

                +

                The semantics of add_to_position_key is as follows: The method itself does +not do anything. It has an effect only when it is seen during tracing. If it is +seen during tracing, the tracer adds the argument of the call to the position +key that the tracer is using to find out whether a loop was closed or not.

                +

                In the example of the interpret function above, we would add a call to this +function into the while loop as follows:

                +
                +W_Object interpret(String bytecode, ...) {
                +    Stack stack = new Stack();
                +    int pc = 0;
                +    while (true) { // bytecode dispatch loop
                +        add_to_position_key(pc);
                +        add_to_position_key(bytecode);
                +        char instruction = bytecode.charAt(pc);
                +        pc += 1;
                +        switch (instruction) {
                +            case ADD:
                +    ...
                +
                +

                When the modified tracing JIT traces now the interpret function executing a +SUB, something interesting happens. When the bytecode loop is closed, the +modified tracing JIT does not consider the trace to be a loop, because the value of +pc has been increased by one, so the position key differs. Instead it +continues to trace, effectively unrolling the bytecode dispatch loop of +interpret.

                +

                The only way for a loop to be considered closed is if the pc variable has +the same value a second time. This can only happen after a JUMP_BACKWARD +instruction has been executed. A JUMP_BACKWARD instruction will only be in +the bytecode when the bytecode represents a loop. This means that the modified +tracing JIT will trace the interpret function and will only consider that +the trace represents a loop when the bytecode itself represents a loop! Thus, a +machine code loop will eventually be created that corresponds to the loop in the +bytecode.

                +

                Let's look at an example. If we have a bytecode that corresponds to the +following instructions:

                +
                +pc |   instruction
                +---+---------------------
                +0  |  LOAD_INTEGER 0
                +2  |  DUP
                +3  |  PRINT
                +4  |  LOAD_INTEGER 1
                +6  |  ADD
                +7  |  JUMP_BACKWARD 6
                +
                +

                This loop will print integers starting from 0 and going on from there. The +modified tracing JIT will unroll the bytecode dispatch until it sees the +JUMP_BACKWARD bytecode. After that bytecode the pc will be 2 again. Thus +the earlier position key is repeated, which means that the loop will be closed. +The produced machine code will do the equivalent of the following Java code:

                +
                +...
                +guard_true(pc == 2)
                +guard_true(bytecode == "... correct bytecode string ...")
                +while (true) {
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == DUP);
                +    stack.push(stack.peek());
                +
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == PRINT);
                +    do_print(stack.pop());
                +
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == LOAD_INTEGER)
                +    value = (int)bytecode.charAt(pc);
                +    pc += 1
                +    stack.push(W_Integer(value))
                +
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == ADD)
                +    arg2 = stack.pop()
                +    arg1 = stack.pop()
                +    stack.push(do_addition(arg1, arg2))
                +
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == JUMP_BACKWARD)
                +    pc -= (int)bytecode.charAt(pc);
                +}
                +
                +

                This is machine code that essentially does what the bytecode above did. Of +course the code still retains some remnants of the interpreter (like the program +counter manipulations, the stack handling, etc), which would have to be removed +by some clever enough optimization step. If this were done, the result would look a +lot more natural.

                +

                Summary

                +

                If a tracing JIT is enhanced by a way to influence its loop-closing behaviour we +can significantly improve its performance when the jitted program is itself an +interpreter. The result is that in such a case the produced machine code +will correspond to the functions that are being interpreted, not to the code of +the interpreter itself.

                +

                Now, what does all this have to do with PyPy? What we have been working on for a +while is a sort of tracing JIT for RPython which can be customized with a +function very similar to the add_to_position_key described above. This will +make it possible to make the tracing JIT generate code that corresponds to the +code that the interpreter interprets. For example, we would add a call to +add_to_position_key to SPy, PyPy's Smalltalk VM. Then the tracing JIT will +produce machine code for Smalltalk-level loops, with all the usual benefits of a +tracing JIT (like inlining of intermediate methods, constant-folding, ...). +This JIT differs from normal tracing JITs in that it also supports very powerful +constant-folding and allocation-removal optimizations. Those optimizations will +(hopefully) be the content of a later blog post.

                +

                The basics of this process have been working fine for quite a while. What the +work currently focuses on is to improve the optimizers to remove not only the +bytecode manipulation code, but also the stack handling, and a large number of +other inefficiencies.

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2009-03-03 03:04: +
                +
                +

                Wow, that's very cool! PyPy is an amazing novel project, but how did you guys ever think of this?

                +
                +
                +
                +
                + + Unknown wrote on 2009-03-03 13:47: +
                +
                +

                great explanaition.. thanks!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-03 15:01: +
                +
                +

                Very nice. You might want to have a look at Sullivan, et al (2003) Dynamic Native Optimization of Interpreters. They also identify the need to record not only the native PC but also the interpreter's virtual PC to identify useful trace heads. They provide three intrinsic functions (compared to your single add_to_position_key) to achieve this. Further, they provide three more intrinsics to support constant propagation.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-03-04 15:27: +
                +
                +

                Wow, that's extremely interesting! It's indeed very similar to what I describe in the blog post (apparently Armin knew of the paper, but I didn't). Of course they are severely hampered by the fact that the system is working on assembler level, so they don't really have enough information available to do really interesting optimizations.

                +
                +
                +
                + +

                The next Leysin Winter Sprint

                + +
                +

                PyPy Leysin Winter Sprint (14th-21st April 2009)

                + +

                The next PyPy sprint will be in Leysin, Switzerland, for the +sixth time. This sprint will take place immediately after +Easter. This is a fully public sprint: newcomers and topics +other than those proposed below are welcome.

                + + + + +
                + + + +
                  +
                • The overall idea of the sprint is to continue working on making PyPy ready +for general use. There are a few tasks left in there. In parallel, we +will continue the work on the JIT, if there is general interest. And as +usual, we are ready to add any other task -- please mention on the mailing +list what you would like to work on; the list of tasks is not really fixed.
                • +
                • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off for ski until Sunday, the 19th; afterwards, the +installations close. (There was quite a lot of snow this winter, so +there should be some left even though it's relatively late in the season.)
                • +
                +
                +

                For more information see the announcement.

                +
                +

                Wroclaw 2009 sprint progress report

                + +
                +

                Hello.

                + +We have just finished probably the smallest sprint ever +in PyPy history. For most of the time it was just me +and Armin pairing together.

                + +We also had a chance to work a bit with people from +the University, but there were definitely not enough +core developers to organize the work in a reasonable +manner. At some point we ended up having two pairs containing +four people each.

                + +Jakub and Bartosz (who were our gentle hosts) worked +on getting PyPy's sandbox integrated with django. +It's still just an example of what you can do (i.e. you +can do much more), but it's already interesting to look +at. The code can be found in user dir. This server (not yet online anywhere, sorry) +is able to run untrusted python code provided by a user inside +a fully configurable sandbox.

                + +We also implemented missing peepholer optimizations from +CPython, finding out that some peepholer tests were failing, +just because PyPy is optimizing better :-)

                + +The main part of the sprint was work on JIT (most notably the fifth +generation of the JIT), which was moved +from the obscure directory in Carl's user in svn (which contains +branches these days!) into a PyPy branch. It's still very much +work in progress and a lot of pen and paper or handwaving was +involved, but we were able to implement a lot of basics in record time. +

                +Right now we need a lot of rest after the exhausting sprint, +but after that, stay tuned for more information about +progressing JIT!

                + +Cheers,
                +fijal

                +
                +
                +
                +
                + + Michael Foord wrote on 2009-02-14 14:14: +
                +
                +

                Great to see both concrete work on the JIT and some practical applications for PyPy making progress. Keep up the good work.

                See you at PyCon.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-02-16 08:47: +
                +
                +

                Great to see the JIT evolving.

                Zejn

                +
                +
                +
                +
                + + Olle Jonsson wrote on 2009-02-21 01:51: +
                +
                +

                Huzzah! And yay for practical apps.

                +
                +
                +
                +
                + + stuaxo wrote on 2009-02-24 12:48: +
                +
                +

                Great to see progress being made on this :)

                +
                +
                +
                + +

                Wroclaw 2009 PyPy sprint and talk

                + +
                +

                The next PyPy sprint will be held in Wrocław, Poland 7-14th February 2009. This is a fully public +sprint and all newcomers are welcome. Preceding the sprint there +will be a talk at University of Technology in Wrocław held on the 22nd of January.

                + +

                For detailed info about the sprint, look here.

                + +

                The talk will be a general, high-level overview of the PyPy project. There is a very nice poster, made by Jakub Gustak and Bartosz Skowron (in Polish):

                + + + +Talk details: +
                  +
                • Location: Politechnika Wrocławska, budynek C-13, sala 0.31
                • +
                • +
                • +
                • Date: 22nd January 2009, 19:00
                • +
                • Language: very likely Polish, although the talk can be in English as well if any non-Polish speakers show up. +
                • +
                + +Cheers,
                +fijal +
                +

                Pycon 2009

                + +
                +

                Hello.

                +

                +Both of our PyPy talks have been accepted for Pycon US 2009. Although both +are somewhat related to PyPy, they're vastly different in +topics, attitude and target audience.

                +

                +The first one is a classic PyPy status talk - we'll mostly talk about +our achievements from the last year (readers of this blog are aware of most, +but not all :) as well as some general introduction and plans for the future. +

                +

                +The second one is about PyPy's sandboxing features. This is in my opinion +a very underestimated feature, also by us, because it's not really well +advertised or documented. The main purpose of the talk is to present +to the general public how this works and how to use it. Hopefully we will +get to work and publish about this a bit more ahead of Pycon already. +Unlike Zope's Restricted Python, it provides you with the full python +language, inside a fully +virtualized sandbox, controlled from an external process by a custom +security policy. Stay tuned for more :-) +

                +

                +See you at Pycon 2009! +

                +

                +Cheers,
                +fijal and holger +

                +
                +
                +
                +
                + + Alex wrote on 2008-12-24 07:17: +
                +
                +

                Can't wait to hear it, Fijal gave a fantastic talk last year and I'm excited for this year's as well. Really hoping it doesn't conflict with my panel :)

                +
                +
                +
                +
                + + Anonymous wrote on 2009-01-07 10:24: +
                +
                +

                hi,

                would you have somrthing to say about that ?:

                https://www.python-forum.org/pythonforum/viewtopic.php?f=1&t=10744&sid=304ad6507b0db8420ae1df9f6c1522cd

                thx

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-01-10 09:03: +
                +
                +

                Well, I'm not really up to discuss with some rants.

                cheers,
                fijal

                +
                +
                +
                + +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/blog/index.html b/blog/index.html new file mode 100644 index 000000000..faa82dbbf --- /dev/null +++ b/blog/index.html @@ -0,0 +1,4073 @@ + + + + + + +PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                +

                Guest Post: How PortaOne uses PyPy for high-performance processing, connecting over 1B of phone calls every month

                + +
                +

                The PyPy project is always happy to hear about industrial use and deployments +of PyPy. For the GC bug +finding +task earlier this year, we collaborated with PortaOne and we're super happy +that Serhii Titov, head of the QA department at PortaOne, was up to writing +this guest post to describe their use and experience with the project.

                +
                +

                What does PortaOne do?

                +

                We at PortaOne Inc. allow telecom operators to +launch new services (or provide existing services more efficiently) using our +VoIP platform (PortaSIP) and our real-time charging system (PortaBilling), +which provides additional features for cloud PBX, such as call transfer, +queues, interactive voice response (IVR) and more. At this moment our support +team manages several thousand servers with our software installed in 100 +countries, through which over 500 telecommunication service providers connect +millions of end users every day. The unique thing about PortaOne is that we +supply the source code of our product to our customers - something unheard of +in the telecom world! Thus we attract "telco innovators", who use our APIs to +build around the system and the source code to create unique tweaks of +functionality, which produces amazing products.

                +

                At the core of PortaSIP is the middle-ware component (the proper name for it is +"B2BUA", but that probably does not say much to anyone outside of experts in +VoIP), which implements the actual handling of SIP calls, messages, etc. and +all added features (for instance, trying to send a call via telco operators +through which the cost per minute is lower). It has to be fast (since even a +small delay in establishing a call is noticed by a customer), reliable +(everyone hates when a call drops or cannot be completed) and yet easily +expandable with new functionality. This is why we decided to use Python as +opposed to C/C++ or similar programming languages, which are often used in +telecom equipment.

                +

                The B2BUA component is a batch of similar Python processes that are looped +inside a +asyncore.dispatcher +wrapper. The load balancing between these Python processes is done by our +stateless SIP proxy server written in C++. All our sockets are served by this +B2BUA. We have our custom client-wrappers around pymysql, redis, +cassandra-driver and requests to communicate with external services. Some +of the Python processes use cffi +wrappers around C-code to improve their performance (examples: an Oracle DB +driver, a client to a radius server, a custom C logger).

                +

                The I/O operations that block the main thread of the Python processes are +processed in sub-threads. We have custom wrappers around threading.Thread +and also asyncore.dispatcher. The results of such operations are returned to +the main thread.

                +

                Improving our performance with PyPy

                +

                We started with CPython and then in 2014 switched to PyPy because it was +faster. Here's an exact quote from our first testing notes: "PyPy gives +significant performance boost, ~50%". Nowadays, after years of changes in all +the software involved, PyPy still gives us +50% boost compared to CPython.

                +

                Taking care of real time traffic for so many people around the globe is +something we're really proud of. I hope the PyPy team can be proud of it as +well, as the PyPy product is a part of this solution.

                +

                Finding a garbage collector bug: stage 1, the GC hooks

                +

                However our path with PyPy wasn't perfectly smooth. There were very rare cases +of crashes on PyPy that we weren't able to catch. That's because to make +coredump useful we needed to switch to PyPy with debug, but we cannot let it +run in that mode on a production system for an extended period of time, and we +did not have any STR (steps-to-reproduce) to make PyPy crash again in our lab. +That's why we kept (and still keep) both interpreters installed just in case, +and we would switch to CPython if we noticed it happening.

                +

                At the time of updating PyPy from 3.5 to 3.6 our QA started noticing those +crashes more often, but we still had no luck with STR or collecting proper +coredumps with debug symbols. Then it became even worse after our development +played with the Garbage Collector's +options to increase performance +of our middleware component. The crashes started to affect our regular +performance testing (controlled by QA manager Yevhenii Bovda). At that point it +was decided that we can no longer live like that and so we started an intense +investigation.

                +

                During the first stage of our investigation (following the best practice of +troubleshooting) we narrowed down the issue as much as we could. So, it was not +our code, it was definitely somewhere in PyPy. Eventually our SIP software +engineer Yevhenii Yatchenko found out +that this bug is connected with the use of our custom hooks in the +GC. Yevhenii created +ticket #4899 and within 2-3 days we +got a fix from a member of the PyPy team, in true open-source fashion.

                +

                Finding a garbage collector bug: stage 2, the real bug

                +

                Then came stage 2. In parallel with the previous ticket, Yevhenii created +#4900 that we still see failing +with coredumps quite often, and they are not connected to GC custom hooks. In a +nutshell, it took us dozens of back-and-forth emails, three Zoom sessions and +four versions of a patch to solve the issue. During the last iteration we got a +new set of options to try and a new version of the patch. Surprisingly, that +helped! What a relief! So, the next logical step was to remove all debug +options and run PyPy only with the patch. Unfortunately, it started to fail +again and we came to the obvious conclusion that what will help us is not a +patch, but one of the options we were testing out. At that point we found out that +PYPY_GC_MAX_PINNED=0 +is a necessary and sufficient condition to solve our issue. This points to +another bug in the garbage collector, somehow related to object pinning.

                +

                Here's our current state: we have to add PYPY_GC_MAX_PINNED=0, but we do not +face the crashes anymore.

                +

                Conclusion and next steps

                +

                Gratitude is extended to Carl for his invaluable assistance in resolving the +nasty bugs, because it seems we're the only ones who suffered from the last +one and we really did not want to fall back to CPython due to its performance +disadvantage.

                +

                Serhii Titov, head of the QA department at PortaOne Inc.

                +

                P.S. If you are a perfectionist and at this point you have mixed feelings and +you are still bothered by the question "But there might still be a bug in the +GC, what about that?" - Carl has some ideas about it and he will sort it out +(we will help with the testing/verification part).

                +
                +

                PyPy v7.3.17 release

                + +
                +

                PyPy v7.3.17: release of python 2.7 and 3.10

                +

                The PyPy team is proud to release version 7.3.17 of PyPy.

                +

                This release includes a new RISC-V JIT backend, an improved REPL based on +work by the CPython team, and better JIT optimizations of integer +operations. Special shout-outs to Logan Chien for the RISC-V backend +work, to Nico Rittinghaus for better integer optimization in the JIT, and +the CPython team that has worked on the repl.

                +

                The release includes two different interpreters:

                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.

                • +
                +

                The interpreters are based on much the same codebase, thus the dual +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows the 7.3.16 release on April 23, 2024.

                +

                We recommend updating. You can find links to download the releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with +making RPython's JIT even better.

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                RISC-V backend for the JIT

                +

                PyPy's JIT has added support for generating 64-bit RISC-V machine code at +runtime (RV64-IMAD, specifically). So far we are not releasing binaries for any +RISC-V platforms, but there are instructions on how to cross-compile binaries.

                +

                REPL Improvements

                +

                The biggest user-visible change of the release is new features in the repl of +PyPy3.10. CPython 3.13 has adopted and extended PyPy's pure-Python repl, adding +a number of features and fixing a number of bugs in the process. We have +backported and added the following features:

                +
                  +
                • Prompts and tracebacks use terminal colors, as well as terminal hyperlinks +for file names.

                • +
                • Bracketed paste enables pasting several lines of input into the terminal +without auto-indentation getting in the way.

                • +
                • A special interactive help browser (F1), history browser (F2), explicit paste +mode (F3).

                • +
                • Support for Ctrl-<left/right> to jump over whole words at a time.

                • +
                +

                See the CPython documentation for further details. Thanks to Łukasz Langa, +Pablo Galindo Salgado and the other CPython devs involved in this work.

                +

                Better JIT optimizations of integer operations

                +

                The optimizers of PyPy's JIT have become much better at reasoning about and +optimizing integer operations. This is done with a new "knownbits" abstract +domain. In many programs that do bit-manipulation of integers, some of the +bits of the integer variables of the program can be statically known. Here's a +simple example:

                +
                x = a | 1
                +...
                +if x & 1:
                +    ...
                +else:
                +    ...
                +
                +

                With the new abstract domain, the JIT can optimize the if-condition to +True, because it already knows that the lowest bit of x must be set. +This optimization applies to all Python-integers that fit into a machine word +(PyPy optimistically picks between two different representations for int, +depending on the size of the value). Unfortunately there is very little impact +of this change on almost all Python code, because intensive bit-manipulation is +rare in Python. However, the change leads to significant performance +improvements in Pydrofoil (the RPython-based RISC-V/ARM emulators that are +automatically generated from high-level Sail specifications of the respective +ISAs, and that use the RPython JIT to improve performance).

                +

                PyPy versions and speed.pypy.org

                +

                The keen-eyed will have noticed no mention of Python version 3.9 in the +releases above. Typically we will maintain only one version of Python3, but due +to PyPy3.9 support on conda-forge we maintained multiple versions from the +first release of PyPy3.10 in PyPy v7.3.12 (Dec 2022). Conda-forge is +sunsetting its PyPy support, which means we can drop PyPy3.9. Since that was +the major driver of benchmarks at https://speed.pypy.org, we revamped the site +to showcase PyPy3.9, PyPy3.10, and various versions of cpython on the home +page. For historical reasons, the "baseline" for comparison is still cpython +3.7.19.

                +

                We will keep the buildbots building PyPy3.9 until the end of August; these +builds will still be available on the nightly builds tab of the buildbot.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (PyPy and CPython performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                We provide binary builds for:

                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                • +
                • 64-bit ARM machines running Linux (aarch64) and macos (macos_arm64).

                • +
                +

                PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, Linux ARM +32 bit, RISC-V RV64IMAFD Linux, and s390x Linux but does not release binaries. +Please reach out to us if you wish to sponsor binary releases for those +platforms. Downstream packagers provide binary builds for debian, Fedora, +conda, OpenBSD, FreeBSD, Gentoo, and more.

                +

                What else is new?

                +

                For more information about the 7.3.17 release, see the full changelog.

                +

                Please update, and continue to help us make pypy better.

                +

                Cheers, +The PyPy Team

                +
                +
                +

                Conda-forge proposes sunsetting support for PyPy

                + +
                +

                Conda-forge has kindly been providing support for PyPy since 2019. The +conda-forge team has been very patient and generous with resources, but it +seems the uptake of PyPy has not justified the effort. Major packages still +are not available on PyPy, +others find it hard to update +versions. We don't +get much feedback at all about people using PyPy, and even less about PyPy on +conda-forge. The conda-forge team has proposed sunsetting +PyPy going +forward, which means current packages would remain but no new packages would be +built. If you have an opinion, you can comment on that PR, or on this blog post.

                +

                Since conda-forge supports PyPy3.9 but not PyPy3.10, we have continued +releasing PyPy3.9 even though we typically support only one version of PyPy3. +With the sunsetting proposal, we will not release any more updates to PyPy3.9. +I opened a poll about the +intention to drop PyPy3.9. If you have an opinion, please chime in.

                +
                +

                A Knownbits Abstract Domain for the Toy Optimizer, Correctly

                + +
                +

                After Max' introduction to abstract interpretation for the toy optimizer in the +last post, I want to present a more complicated abstract domain in this post. +This abstract domain reasons about the individual bits of a variable in a trace. +Every bit can be either "known zero", "known one" or "unknown". The abstract +domain is useful for optimizing integer operations, particularly the bitwise operations. +The abstract domain follows quite closely the tristate abstract domain of the +eBPF verifier in the Linux +Kernel, as +described by the paper +Sound, Precise, and Fast Abstract Interpretation with Tristate +Numbers by Harishankar Vishwanathan, Matan +Shachnai, Srinivas Narayana, and Santosh Nagarakatte.

                +

                The presentation in this post will still be in the context of the +toy optimizer. We'll spend a significant part of +the post convincing ourselves that the abstract domain transfer functions that +we're writing are really correct, using both property-based testing and +automated proofs (again using Z3).

                +

                PyPy has implemented and merged a more complicated version of the same abstract +domain for the "real" PyPy JIT. A more thorough explanation of that real world +implementation will follow.

                +

                I'd like to thank Max Bernstein and Armin Rigo for lots of great feedback on +drafts of this post. The PyPy implementation was mainly done by Nico +Rittinghaus and me.

                +

                Contents:

                + +

                Motivation

                +

                In many programs that do bit-manipulation of integers, some of the bits of the +integer variables of the program can be statically known. Here's a simple +example:

                +
                x = a | 1
                +...
                +if x & 1:
                +    ...
                +else:
                +    ...
                +
                + +

                After the assignment x = a | 1, we know that the lowest bit of x must be 1 +(the other bits are unknown) and an optimizer could remove the condition x & 1 by +constant-folding it to 1.

                +

                Another (more complicated) example is:

                +
                assert i & 0b111 == 0 # check that i is a multiple of 8
                +j = i + 16
                +assert j & 0b111 == 0
                +
                + +

                This kind of code could e.g. happen in a CPU +emulator, where i and j are +integers that represent emulated pointers, and the asserts are alignment +checks. The first assert implies that the lowest three bits of i must be 0. +Adding 16 to such a number produces a result where the lowest three bits are +again all 0, therefore the second assert is always true. So we would like a +compiler to remove the second assert.

                +

                Both of these optimizations are doable with the help of the knownbits +abstract domain that we'll discuss in the rest of the post.

                +

                The Knownbits Abstract Domain

                +

                An abstract value of the knownbits domain needs to be able to store, for every +bit of an integer variable in a program, whether it is known 0, known 1, or +unknown. To represent +three different states, we need 2 bits, which we will call one and unknown. +Here's the encoding:

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                oneunknownknownbit
                000
                101
                01?
                11illegal
                +

                The unknown bit is set if we don't know the value of the bit ("?"), the one +bit is set if the bit is known to be a 1. Since two bits are enough to encode +four different states, but we only need three, the combination of a set one +bit and a set unknown is not allowed.

                +

                We don't just want to encode a single bit, however. Instead, we want to do this +for all the bits of an integer variable. Therefore the instances of the abstract +domain get two integer fields ones and unknowns, where each pair of +corresponding bits encodes the knowledge about the corresponding bit of the +integer variable in the program.

                +

                We can start implementing a Python class that works like this:

                +
                from dataclasses import dataclass
                +
                +@dataclass(eq=False)
                +class KnownBits:
                +    ones : int
                +    unknowns : int
                +
                +    def __post_init__(self):
                +        if isinstance(self.ones, int):
                +            assert self.is_well_formed()
                +
                +    def is_well_formed(self):
                +        # a bit cannot be both 1 and unknown
                +        return self.ones & self.unknowns == 0
                +
                +    @staticmethod
                +    def from_constant(const : int):
                +        """ Construct a KnownBits corresponding to a constant, where all bits
                +        are known."""
                +        return KnownBits(const, 0)
                +
                +    def is_constant(self):
                +        """ Check if the KnownBits instance represents a constant. """
                +        # it's a constant if there are no unknowns
                +        return self.unknowns == 0
                +
                + +

                We can also add some convenience properties. Sometimes it is easier to work +with an integer where all the known bits are set, or one where the positions +of all the known zeros have a set bit:

                +
                class KnownBits:
                +    ...
                +
                +    @property
                +    def knowns(self):
                +        """ return an integer where the known bits are set. """
                +        # the knowns are just the unknowns, inverted
                +        return ~self.unknowns
                +
                +    @property
                +    def zeros(self):
                +        """ return an integer where the places that are known zeros have a bit
                +        set. """
                +        # it's a 0 if it is known, but not 1
                +        return self.knowns & ~self.ones
                +
                + +

                Also, for debugging and for writing tests we want a way to print the known bits +in a human-readable form, and also to have a way to construct a KnownBits +instance from a string. It's not important to understand the details of +__str__ or from_str for the rest of the post, so I'm putting them into a fold:

                +
                KnownBits from and to string conversions
                class KnownBits:
                +    ...
                +
                +    def __repr__(self):
                +        if self.is_constant():
                +            return f"KnownBits.from_constant({self.ones})"
                +        return f"KnownBits({self.ones}, {self.unknowns})"
                +
                +    def __str__(self):
                +        res = []
                +        ones, unknowns = self.ones, self.unknowns
                +        # construct the string representation right to left
                +        while 1:
                +            if not ones and not unknowns:
                +                break # we leave off the leading known 0s
                +            if ones == -1 and not unknowns:
                +                # -1 has all bits set in two's complement, so the leading
                +                # bits are all 1
                +                res.append('1')
                +                res.append("...")
                +                break
                +            if unknowns == -1:
                +                # -1 has all bits set in two's complement, so the leading bits
                +                # are all ?
                +                assert not ones
                +                res.append("?")
                +                res.append("...")
                +                break
                +            if unknowns & 1:
                +                res.append('?')
                +            elif ones & 1:
                +                res.append('1')
                +            else:
                +                res.append('0')
                +            ones >>= 1
                +            unknowns >>= 1
                +        if not res:
                +            res.append('0')
                +        res.reverse()
                +        return "".join(res)
                +
                +    @staticmethod
                +    def from_str(s):
                +        """ Construct a KnownBits instance from a string. String can start
                +        with ...1 to mean that all higher bits are 1, or ...? to mean that all
                +        higher bits are unknown. Otherwise it is assumed that the higher bits
                +        are all 0. """
                +        ones, unknowns = 0, 0
                +        startindex = 0
                +        if s.startswith("...?"):
                +            unknowns = -1
                +            startindex = 4
                +        elif s.startswith("...1"):
                +            ones = -1
                +            startindex = 4
                +        for index in range(startindex, len(s)):
                +            ones <<= 1
                +            unknowns <<= 1
                +            c = s[index]
                +            if c == '1':
                +                ones |= 1
                +            elif c == '?':
                +                unknowns |= 1
                +        return KnownBits(ones, unknowns)
                +
                +    @staticmethod
                +    def all_unknown():
                +        """ convenience constructor for the "all bits unknown" abstract value
                +        """
                +        return KnownBits.from_str("...?")
                +
                + + + +

                And here's a pytest-style unit test for str:

                +
                def test_str():
                +    assert str(KnownBits.from_constant(0)) == '0'
                +    assert str(KnownBits.from_constant(5)) == '101'
                +    assert str(KnownBits(5, 0b10)) == '1?1'
                +    assert str(KnownBits(~0b1111, 0b10)) == '...100?0'
                +    assert str(KnownBits(1, ~0b1)) == '...?1'
                +
                + +

                An instance of KnownBits represents a set of integers, namely those that match +the known bits stored in the instance. We can write a method contains that +takes a concrete int value and returns True if the value matches the +pattern of the known bits:

                +
                class KnownBits:
                +    ...
                +
                +    def contains(self, value : int):
                +        """ Check whether the KnownBits instance contains the concrete integer
                +        `value`. """
                +        # check whether value matches the bit pattern. in the places where we
                +        # know the bits, the value must agree with ones.
                +        return value & self.knowns == self.ones
                +
                + +

                and a test:

                +
                def test_contains():
                +    k1 = KnownBits.from_str('1?1')
                +    assert k1.contains(0b111)
                +    assert k1.contains(0b101)
                +    assert not k1.contains(0b110)
                +    assert not k1.contains(0b011)
                +
                +    k2 = KnownBits.from_str('...?1') # all odd numbers
                +    for i in range(-101, 100):
                +        assert k2.contains(i) == (i & 1)
                +
                + +

                Transfer Functions

                +

                Now that we have implemented the basics of the KnownBits class, we need to +start implementing the transfer functions. They are for computing what we know +about the results of an operation, given the knowledge we have about the bits +of the arguments.

                +

                We'll start with a simple unary operation, invert(x) (which is ~x in Python +and C syntax), which flips all the bits of an integer. If we know some bits of +the arguments, we can compute the corresponding bits of the result. The unknown +bits remain unknown.

                +

                Here's the code:

                +
                class KnownBits:
                +    ...
                +
                +    def abstract_invert(self):
                +        # self.zeros has bits set where the known 0s are in self
                +        return KnownBits(self.zeros, self.unknowns)
                +
                + +

                And a unit-test:

                +
                def test_invert():
                +    k1 = KnownBits.from_str('01?01?01?')
                +    k2 = k1.abstract_invert()
                +    assert str(k2) == '...10?10?10?'
                +
                +    k1 = KnownBits.from_str('...?')
                +    k2 = k1.abstract_invert()
                +    assert str(k2) == '...?'
                +
                + +

                Before we continue with further transfer functions, we'll think about +correctness of the transfer functions and build up some test infrastructure. To +test transfer functions, it's quite important to move beyond simple example-style +unit tests. The state-space for more complicated binary transfer functions is +extremely large and it's too easy to do something wrong in a corner case. +Therefore we'll look at property-based tests for KnownBits next.

                +

                Property-based Tests with Hypothesis

                +

                We want to do property-based tests of KnownBits, to try +to make it less likely that we'll get a corner-case in the implementation wrong. +We'll use Hypothesis for that.

                +

                I can't give a decent introduction to Hypothesis here, but want to give a few +hints about the API. Hypothesis is a way to run unit tests with randomly +generated input. It provides strategies to describe the data that the test +function expects. Hypothesis provides primitive strategies (for things like +integers, strings, floats, etc) and ways to build composite strategies out of +the primitive ones.

                +

                To be able to write the tests, we need to generate random KnownBits instances, +and we also want an int instance that is a member of the KnownBits instance. +We generate tuples of (KnownBits, int) together, to ensure this property. +We'll ask Hypothesis to generate us a random concrete int as the concrete +value, and then we'll also generate a second random int to use as the +unknown masks (i.e. which bits of the concrete int we don't know in the +KnownBits instance). Here's a function that takes two such ints and builds the +tuple:

                +
                def build_knownbits_and_contained_number(concrete_value : int, unknowns : int):
                +    # to construct a valid KnownBits instance, we need to mask off the unknown
                +    # bits
                +    ones = concrete_value & ~unknowns
                +    return KnownBits(ones, unknowns), concrete_value
                +
                + +

                We can turn this function into a hypothesis strategy to generate input data +using the strategies.builds function:

                +
                from hypothesis import strategies, given, settings
                +
                +ints = strategies.integers()
                +
                +random_knownbits_and_contained_number = strategies.builds(
                +    build_knownbits_and_contained_number,
                +    ints, ints
                +)
                +
                + +

                One important special case of KnownBits are the constants, which contain only +a single concrete value. We'll also generate some of those specifically, and +then combine the random_knownbits_and_contained_number strategy with it:

                +
                constant_knownbits = strategies.builds(
                +    lambda value: (KnownBits.from_constant(value), value),
                +    ints
                +)
                +
                +knownbits_and_contained_number = constant_knownbits | random_knownbits_and_contained_number
                +
                + +

                Now we can write the first property-based tests, for the KnownBits.contains +method:

                +
                @given(knownbits_and_contained_number)
                +def test_contains(t):
                +    k, n = t
                +    assert k.contains(n)
                +
                + +

                The @given decorator is used to tell Hypothesis which strategy to use to +generate random data for the test function. Hypothesis will run the test with a +number of random examples (100 by default). If it finds an error, it will try to +minimize the example needed that demonstrates the problem, to try to make it +easier to understand what is going wrong. It also saves all failing cases into +an example database and tries them again on subsequent runs.

                +

                This test is as much a check for whether we got the strategies right as it is +for the logic in KnownBits.contains. Here's an example output of random +concrete and abstract values that we are getting here:

                +
                110000011001101 ...?0???1
                +...1011011 ...1011011
                +...1001101110101000010010011111011 ...1001101110101000010010011111011
                +...1001101110101000010010011111011 ...100110111010100001?010?1??1??11
                +1000001101111101001011010011111101000011000111011001011111101 1000001101111101001011010011111101000011000111011001011111101
                +1000001101111101001011010011111101000011000111011001011111101 1000001101111101001011010011111101000011000111????01?11?????1
                +1111100000010 1111100000010
                +1111100000010 ...?11111?00000??
                +110110 110110
                +110110 ...?00?00????11??10
                +110110 ??0??0
                +...100010111011111 ...?100?10111??111?
                +...1000100000110001 ...?000?00000??000?
                +110000001110 ...?0?0??000?00?0?0000000?00???0000?????00???000?0?00?01?000?0??1??
                +110000001110 ??000000???0
                +1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000 1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000
                +...1011010010010100 ...1011010010010100
                +...1011111110110011 ...1011111110110011
                +101000011110110 101000011?10?1?
                +100101 ?00?0?
                +
                + +

                That looks suitably random, but we might want to bias our random numbers a +little bit towards common error values like small constants, powers of two, etc. +Like this:

                +
                INTEGER_WIDTH = 64
                +# some small integers
                +ints_special = set(range(100))
                +# powers of two
                +ints_special = ints_special.union(1 << i for i in range(INTEGER_WIDTH - 2))
                +# powers of two - 1
                +ints_special = ints_special.union((1 << i) - 1 for i in range(INTEGER_WIDTH - 2))
                +# negative versions of what we have so far
                +ints_special = ints_special.union(-x for x in ints_special)
                +# bit-flipped versions of what we have so far
                +ints_special = ints_special.union(~x for x in ints_special)
                +ints_special = list(ints_special)
                +# sort them (because hypothesis simplifies towards earlier elements in the list)
                +ints_special.sort(key=lambda element: (abs(element), element < 0))
                +
                +ints = strategies.sampled_from(ints_special) | strategies.integers()
                +
                + +

                Now we get data like this:

                +
                1110 1110
                +...10000000000000000001 ...10000??0??0000??00?1
                +1 ??0??0000??00?1
                +1 ?
                +...10101100 ...10101100
                +110000000011001010111011111111111111011110010001001100110001011 ...?0?101?
                +110000000011001010111011111111111111011110010001001100110001011 ??00000000??00?0?0???0??????????????0????00?000?00??00??000?0??
                +...1011111111111111111111111111 ...?11?11??
                +...1011111111111111111111111111 ...?0??????????????????????????
                +0 ...?0??????????????????????????
                +101101 101101
                +111111111111111111111111111111111111111111111 111111111111111111111111111111111111111111111
                +10111 10111
                +...101100 ...1?111011?0
                +101000 ?001010?0
                +101000 ?0?000
                +110010 110010
                +...100111 ...100111
                +1111011010010 1111011010010
                +...1000000000000000000000000000000000000 ...1000000000000000000000000000000000000
                +
                + +

                We can also write a test that checks that the somewhat tricky logic in +__str__ and from_str is correct, by making sure that the two functions +round-trip (ie converting a KnownBits to a string and then back to a +KnownBits instance produces the same abstract value).

                +
                @given(knownbits_and_contained_number)
                +def test_hypothesis_str_roundtrips(t1):
                +    k1, n1 = t1
                +    s = str(k1)
                +    k2 = KnownBits.from_str(s)
                +    assert k1.ones == k2.ones
                +    assert k1.unknowns == k2.unknowns
                +
                + +

                Now let's actually apply this infrastructure to test abstract_invert.

                +

                When are Transfer Functions Correct? How do we test them?

                +

                Abstract values, i.e. instances of KnownBits represent sets of concrete +values. We want the transfer functions to compute overapproximations of the +concrete values. So if we have an arbitrary abstract value k, with a concrete +number n that is a member of the abstract values (i.e. +k.contains(n) == True) then the result of the concrete operation op(n) +must be a member of the result of the abstract operation k.abstract_op() +(i.e. k.abstract_op().contains(op(n)) == True).

                +

                Checking the correctness/overapproximation property is a good match for +hypothesis. Here's what the test for abstract_invert looks like:

                +
                @given(knownbits_and_contained_number)
                +def test_hypothesis_invert(t):
                +    k1, n1 = t
                +    n2 = ~n1 # compute the real result
                +    k2 = k1.abstract_invert() # compute the abstract result
                +    assert k2.contains(n2) # the abstract result must contain the real result
                +
                + +

                This is the only condition needed for abstract_invert to be correct. If +abstract_invert fulfils this property for every combination of abstract and +concrete value then abstract_invert is correct. Note however, that this test +does not actually check whether abstract_invert gives us precise results. A +correct (but imprecise) implementation of abstract_invert would simply return +a completely unknown result, regardless of what is known about the input +KnownBits.

                +

                The "proper" CS term for this notion of correctness is called soundness. The +correctness condition on the transfer functions is called a Galois +connection. I won't go into any mathematical/technical details here, but +wanted to at least mention the terms. I found Martin +Kellogg's +slides +to be quite an approachable introduction to the Galois connection and how to +show soundness.

                +

                Implementing Binary Transfer Functions

                +

                Now we have infrastructure in place for testing transfer functions with random +inputs. With that we can start thinking about the more complicated case, that of +binary operations. Let's start with the simpler ones, and and or. For and, +we can know a 0 bit in the result if either of the input bits is known 0; +or we can know a 1 bit in the result if both input bits are known 1. +Otherwise the resulting bit is unknown. Let's look at all the combinations:

                +
                and
                +input1: 000111???
                +input2: 01?01?01? 
                +result: 00001?0??
                +
                + +
                class KnownBits:
                +    ...
                +
                +    def abstract_and(self, other):
                +        ones = self.ones & other.ones # known ones
                +        knowns = self.zeros | other.zeros | ones
                +        return KnownBits(ones, ~knowns)
                +
                + +

                Here's an example unit-test and a property-based test for and:

                +
                def test_and():
                +    # test all combinations of 0, 1, ? in one example
                +    k1 = KnownBits.from_str('01?01?01?')
                +    k2 = KnownBits.from_str('000111???')
                +    res = k1.abstract_and(k2)     # should be: 0...00001?0??
                +    assert str(res) ==   "1?0??"
                +
                +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                +def test_hypothesis_and(t1, t2):
                +    k1, n1 = t1
                +    k2, n2 = t2
                +    k3 = k1.abstract_and(k2)
                +    n3 = n1 & n2
                +    assert k3.contains(n3)
                +
                + +

                Implementing or is pretty similar. The result is known 1 where either of the +inputs is known 1. The result is known 0 where both inputs are known 0, and ? +otherwise.

                +
                or
                +input1: 000111???
                +input2: 01?01?01? 
                +result: 01?111?1?
                +
                + +
                class KnownBits:
                +    ...
                +
                +    def abstract_or(self, other):
                +        ones = self.ones | other.ones
                +        zeros = self.zeros & other.zeros
                +        knowns = ones | zeros
                +        return KnownBits(ones, ~knowns)
                +
                + +

                Here's an example unit-test and a property-based test for or:

                +
                def test_or():
                +    k1 = KnownBits.from_str('01?01?01?')
                +    k2 = KnownBits.from_str('000111???')
                +    res = k1.abstract_or(k2)     # should be:  0...01?111?1?
                +    assert str(res) ==   "1?111?1?"
                +
                +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                +def test_hypothesis_or(t1, t2):
                +    k1, n1 = t1
                +    k2, n2 = t2
                +    k3 = k1.abstract_or(k2)
                +    n3 = n1 | n2
                +    assert k3.contains(n3)
                +
                + +

                Implementing support for abstract_xor is relatively simple, and left as an +exercise :-).

                +

                Addition and Subtraction

                +

                invert, and, and or are relatively simple transfer functions to write, +because they compose over the individual bits of the integers. The arithmetic +functions add and sub are significantly harder, because of carries and +borrows. Coming up with the formulas for them and gaining an intuitive +understanding is quite tricky and involves carefully going through a few +examples with pen and paper. When implementing this in PyPy, Nico and I didn't +come up with the implementation ourselves, but instead took them from the +Tristate Numbers paper. Here's the code, +with example tests and hypothesis tests:

                +
                class KnownBits:
                +    ...
                +
                +    def abstract_add(self, other):
                +        sum_ones = self.ones + other.ones
                +        sum_unknowns = self.unknowns + other.unknowns
                +        all_carries = sum_ones + sum_unknowns
                +        ones_carries = all_carries ^ sum_ones
                +        unknowns = self.unknowns | other.unknowns | ones_carries
                +        ones = sum_ones & ~unknowns
                +        return KnownBits(ones, unknowns)
                +
                +    def abstract_sub(self, other):
                +        diff_ones = self.ones - other.ones
                +        val_borrows = (diff_ones + self.unknowns) ^ (diff_ones - other.unknowns)
                +        unknowns = self.unknowns | other.unknowns | val_borrows
                +        ones = diff_ones & ~unknowns
                +        return KnownBits(ones, unknowns)
                +
                +
                +def test_add():
                +    k1 = KnownBits.from_str('0?10?10?10')
                +    k2 = KnownBits.from_str('0???111000')
                +    res = k1.abstract_add(k2)
                +    assert str(res) ==   "?????01?10"
                +
                +def test_sub():
                +    k1 = KnownBits.from_str('0?10?10?10')
                +    k2 = KnownBits.from_str('0???111000')
                +    res = k1.abstract_sub(k2)
                +    assert str(res) ==   "...?11?10"
                +    k1 = KnownBits.from_str(    '...1?10?10?10')
                +    k2 = KnownBits.from_str('...10000???111000')
                +    res = k1.abstract_sub(k2)
                +    assert str(res) ==   "111?????11?10"
                +
                +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                +def test_hypothesis_add(t1, t2):
                +    k1, n1 = t1
                +    k2, n2 = t2
                +    k3 = k1.abstract_add(k2)
                +    n3 = n1 + n2
                +    assert k3.contains(n3)
                +
                +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                +def test_hypothesis_sub(t1, t2):
                +    k1, n1 = t1
                +    k2, n2 = t2
                +    k3 = k1.abstract_sub(k2)
                +    n3 = n1 - n2
                +    assert k3.contains(n3)
                +
                + +

                Now we are in a pretty good situation, and have implemented abstract versions +for a bunch of important arithmetic and binary functions. What's also surprising +is that the implementation of all of the transfer functions is quite efficient. +We didn't have to write loops over the individual bits at all, instead we found +closed form expressions using primitive operations on the underlying integers +ones and unknowns. This means that computing the results of abstract +operations is quite efficient, which is important when using the abstract domain +in the context of a JIT compiler.

                +

                Proving correctness of the transfer functions with Z3

                +

                As one can probably tell from my recent posts, I've been thinking about +compiler correctness a lot. Getting the transfer functions absolutely +correct is really crucial, because a bug in them would lead to miscompilation of +Python code when the abstract domain is added to the JIT. While the randomized +tests are great, it's still entirely possible for them to miss bugs. The state +space for the arguments of a binary transfer function is 3**64 * 3**64, and if +only a small part of that contains wrong behaviour it would be really unlikely +for us to find it with random tests by chance. Therefore I was reluctant to +merge the PyPy branch that contained the new abstract domain for a long time.

                +

                To increase our confidence in the correctness of the transfer functions further, +we can use Z3 to prove their correctness, which gives us much stronger +guarantees (not 100%, obviously). In this subsection I will show how to do that.

                +

                Here's an attempt to do this manually in the Python repl:

                +
                >>>> import z3
                +>>>> solver = z3.Solver()
                +>>>> # like last blog post, proof by failing to find counterexamples
                +>>>> def prove(cond): assert solver.check(z3.Not(cond)) == z3.unsat
                +>>>>
                +>>>> # let's set up a z3 bitvector variable for an arbitrary concrete value
                +>>>> n1 = z3.BitVec('concrete_value', 64)
                +>>>> n1
                +concrete_value
                +>>>> # due to operator overloading we can manipulate z3 formulas
                +>>>> n2 = ~n1
                +>>>> n2
                +~concrete_value
                +>>>> 
                +>>>> # now z3 bitvector variables for the ones and unknowns fields
                +>>>> ones = z3.BitVec('abstract_ones', 64)
                +>>>> unknowns = z3.BitVec('abstract_unknowns', 64)
                +>>>> # we construct a KnownBits instance with the z3 variables
                +>>>> k1 = KnownBits(ones, unknowns)
                +>>>> # due to operator overloading we can call the methods on k1:
                +>>>> k2 = k1.abstract_invert()
                +>>>> k2.ones
                +~abstract_unknowns & ~abstract_ones
                +>>>> k2.unknowns
                +abstract_unknowns
                +>>>> # here's the correctness condition that we want to prove:
                +>>>> k2.contains(n2)
                +~concrete_value & ~abstract_unknowns ==
                +~abstract_unknowns & ~abstract_ones
                +>>>> # let's try
                +>>>> prove(k2.contains(n2))
                +Traceback (most recent call last):
                +  File "<stdin>", line 1, in <module>
                +  File "<stdin>", line 1, in prove
                +AssertionError
                +>>>> # it doesn't work! let's look at the counterexample to see why:
                +>>>> solver.model()
                +[abstract_unknowns = 0,
                + abstract_ones = 0,
                + concrete_value = 1]
                +>>>> # we can build a KnownBits instance with the values in the
                +>>>> # counterexample:
                +>>>> ~1 # concrete result
                +-2
                +>>>> counter_example_k1 = KnownBits(0, 0)
                +>>>> counter_example_k1
                +KnownBits.from_constant(0)
                +>>>> counter_example_k2 = counter_example_k1.abstract_invert()
                +>>>> counter_example_k2
                +KnownBits.from_constant(-1)
                +>>>> # let's check the failing condition
                +>>>> counter_example_k2.contains(~1)
                +False
                +
                + +

                What is the problem here? We didn't tell Z3 that n1 was supposed to be a +member of k1. We can add this as a precondition to the solver, and then the +prove works:

                +
                >>>> solver.add(k1.contains(n1))
                +>>>> prove(k2.contains(n2)) # works!
                +
                + +

                This is super cool! It's really a proof about the actual implementation, because +we call the implementation methods directly, and due to the operator overloading +that Z3 does we can be sure that we are actually checking a formula that +corresponds to the Python code. This eliminates one source of errors in formal +methods.

                +

                Doing the proof manually on the Python REPL is kind of annoying though, and we +also would like to make sure that the proofs are re-done when we change the +code. What we would really like to do is to write the proofs as unit tests that +we can run while developing and in CI. Doing this is possible, and the unit +tests that really perform proofs look pleasingly similar to the +Hypothesis-based ones.

                +

                First we need to set up a bit of infrastructure:

                +
                INTEGER_WIDTH = 64
                +
                +def BitVec(name):
                +    return z3.BitVec(name, INTEGER_WIDTH)
                +
                +def BitVecVal(val):
                +    return z3.BitVecVal(val, INTEGER_WIDTH)
                +
                +def z3_setup_variables():
                +    # instantiate a solver
                +    solver = z3.Solver()
                +
                +    # a Z3 variable for the first concrete value
                +    n1 = BitVec("n1")
                +    # a KnownBits instance that uses Z3 variables as its ones and unknowns,
                +    # representing the first abstract value
                +    k1 = KnownBits(BitVec("n1_ones"), BitVec("n1_unkowns"))
                +    # add the precondition to the solver that the concrete value n1 must be a
                +    # member of the abstract value k1
                +    solver.add(k1.contains(n1))
                +
                +    # a Z3 variable for the second concrete value
                +    n2 = BitVec("n2")
                +    # a KnownBits instance for the second abstract value
                +    k2 = KnownBits(BitVec("n2_ones"), BitVec("n2_unkowns"))
                +    # add the precondition linking n2 and k2 to the solver
                +    solver.add(k2.contains(n2))
                +    return solver, k1, n1, k2, n2
                +
                +def prove(cond, solver):
                +    z3res = solver.check(z3.Not(cond))
                +    if z3res != z3.unsat:
                +        assert z3res == z3.sat # can't be timeout, we set no timeout
                +        # make the model with the counterexample global, to make inspecting the
                +        # bug easier when running pytest --pdb
                +        global model
                +        model = solver.model()
                +        print(f"n1={model.eval(n1)}, n2={model.eval(n2)}")
                +        counter_example_k1 = KnownBits(model.eval(k1.ones).as_signed_long(),
                +                                       model.eval(k1.unknowns).as_signed_long())
                +        counter_example_k2 = KnownBits(model.eval(k2.ones).as_signed_long(),
                +                                       model.eval(k2.unknowns).as_signed_long())
                +        print(f"k1={counter_example_k1}, k2={counter_example_k2}")
                +        print(f"but {cond=} evaluates to {model.eval(cond)}")
                +        raise ValueError(solver.model())
                +
                + +

                And then we can write proof-unit-tests like this:

                +
                def test_z3_abstract_invert():
                +    solver, k1, n1, _, _ = z3_setup_variables()
                +    k2 = k1.abstract_invert()
                +    n2 = ~n1
                +    prove(k2.contains(n2), solver)
                +
                +def test_z3_abstract_and():
                +    solver, k1, n1, k2, n2 = z3_setup_variables()
                +    k3 = k1.abstract_and(k2)
                +    n3 = n1 & n2
                +    prove(k3.contains(n3), solver)
                +
                +def test_z3_abstract_or():
                +    solver, k1, n1, k2, n2 = z3_setup_variables()
                +    k3 = k1.abstract_or(k2)
                +    n3 = n1 | n2
                +    prove(k3.contains(n3), solver)
                +
                +def test_z3_abstract_add():
                +    solver, k1, n1, k2, n2 = z3_setup_variables()
                +    k3 = k1.abstract_add(k2)
                +    n3 = n1 + n2
                +    prove(k3.contains(n3), solver)
                +
                +def test_z3_abstract_sub():
                +    solver, k1, n1, k2, n2 = z3_setup_variables()
                +    k3 = k1.abstract_sub(k2)
                +    n3 = n1 - n2
                +    prove(k3.contains(n3), solver)
                +
                + +

                It's possible to write a bit more Python-metaprogramming-magic and unify the +Hypothesis and Z3 tests into the same test definition.1

                +

                Cases where this style of Z3 proof doesn't work

                +

                Unfortunately the approach described in the previous section only works for a +very small number of cases. It breaks down as soon as the KnownBits methods +that we're calling contain any if conditions (including hidden ones like +the short-circuiting and and or in Python). Let's look at an example and +implement abstract_eq. eq is supposed to be an operation that compares two +integers and returns 0 or 1 if they are different or equal, respectively. +Implementing this in knownbits looks like this (with example and hypothesis +tests):

                +
                class KnownBits:
                +    ...
                +
                +    def abstract_eq(self, other):
                +        # the result is a 0, 1, or ?
                +
                +        # if they are both the same constant, they must be equal
                +        if self.is_constant() and other.is_constant() and self.ones == other.ones:
                +            return KnownBits.from_constant(1)
                +        # check whether we have known disagreeing bits, then we know the result
                +        # is 0
                +        if self._disagrees(other):
                +            return KnownBits.from_constant(0)
                +        return KnownBits(0, 1) # an unknown boolean
                +
                +    def _disagrees(self, other):
                +        # check whether the bits disagree in any place where both are known
                +        both_known = self.knowns & other.knowns
                +        return self.ones & both_known != other.ones & both_known
                +
                +def test_eq():
                +    k1 = KnownBits.from_str('...?')
                +    k2 = KnownBits.from_str('...?')
                +    assert str(k1.abstract_eq(k2)) == '?'
                +    k1 = KnownBits.from_constant(10)
                +    assert str(k1.abstract_eq(k1)) == '1'
                +    k1 = KnownBits.from_constant(10)
                +    k2 = KnownBits.from_constant(20)
                +    assert str(k1.abstract_eq(k2)) == '0'
                +
                +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                +def test_hypothesis_eq(t1, t2):
                +    k1, n1 = t1
                +    k2, n2 = t2
                +    k3 = k1.abstract_eq(k2)
                +    assert k3.contains(int(n1 == n2))
                +
                + +

                Trying to do the proof in the same style as before breaks:

                +
                >>>> k3 = k1.abstract_eq(k2)
                +Traceback (most recent call last):
                +  File "<stdin>", line 1, in <module>
                +  File "knownbits.py", line 246, in abstract_eq
                +    if self._disagrees(other):
                +  File "venv/site-packages/z3/z3.py", line 381, in __bool__
                +    raise Z3Exception("Symbolic expressions cannot be cast to concrete Boolean values.")
                +z3.z3types.Z3Exception: Symbolic expressions cannot be cast to concrete Boolean values.
                +
                + +

                We cannot call abstract_eq on a KnownBits with Z3 variables as fields, +because once we hit an if statement, the whole approach of relying on the +operator overloading breaks down. Z3 doesn't actually parse the Python code or +anything advanced like that, we rather build an expression only by running the +code and letting the Z3 formulas build up.

                +

                To still prove the correctness of abstract_eq we need to manually transform +the control flow logic of the function into a Z3 formula that uses the z3.If +expression, using a small helper function:

                +
                def z3_cond(b, trueval=1, falseval=0):
                +    return z3.If(b, BitVecVal(trueval), BitVecVal(falseval))
                +
                +def z3_abstract_eq(k1, k2):
                +    # follow the *logic* of abstract_eq, we can't call it due to the ifs in it
                +    case1cond = z3.And(k1.is_constant(), k2.is_constant(), k1.ones == k2.ones)
                +    case2cond = k1._disagrees(k2)
                +
                +    # ones is 1 in the first case, 0 otherwise
                +    ones = z3_cond(case1cond, 1, 0)
                +
                +    # in the first two cases, unknowns is 0, 1 otherwise
                +    unknowns = z3_cond(z3.Or(case1cond, case2cond), 0, 1)
                +    return KnownBits(ones, unknowns)
                +
                +def test_z3_abstract_eq_logic():
                +    solver, k1, n1, k2, n2 = z3_setup_variables()
                +    n3 = z3_cond(n1 == n2) # concrete result
                +    k3 = z3_abstract_eq(k1, k2)
                +    prove(k3.contains(n3), solver)
                +
                + +

                This proof works. It is a lot less satisfying than the previous ones though, +because we could have made an error in the manual transcription from Python code +to Z3 formulas (there are possibly more heavy-handed approaches where we do +this transformation more automatically using e.g. the ast module to analyze +the source code, but that's a much more complicated researchy project). To +lessen this problem somewhat we can factor out the parts of the logic that don't +have any conditions into small helper methods (like _disagrees in this +example) and use them in the manual conversion of the code to Z3 formulas.2

                +

                The final condition that Z3 checks, btw, is this one:

                +
                If(n1 == n2, 1, 0) &
                +~If(Or(And(n1_unkowns == 0,
                +           n2_unkowns == 0,
                +           n1_ones == n2_ones),
                +       n1_ones & ~n1_unkowns & ~n2_unkowns !=
                +       n2_ones & ~n1_unkowns & ~n2_unkowns),
                +    0, 1) ==
                +If(And(n1_unkowns == 0, n2_unkowns == 0, n1_ones == n2_ones),
                +   1, 0)
                +
                + +

                Making Statements about Precision

                +

                So far we have only used Z3 to prove statements about correctness, i.e. that +our abstract operations overapproximate what can happen with concrete values. +While proving this property is essential if we want to avoid miscompilation, +correctness alone is not a very strong constraint on the implementation of our +abstract transfer functions. We could simply return Knownbits.unknowns() for +every abstract_* method and the resulting overapproximation would be correct, +but useless in practice.

                +

                It's much harder to make statements about whether the transfer functions are +maximally precise. There are two aspects of precision I want to discuss in this +section, however.

                +

                The first aspect is that we would really like it if the transfer functions +compute the maximally precise results for singleton sets. If all abstract +arguments of an operation are constants, i.e. contain only a single concrete +element, then we know that the resulting set also has only a single element. We +can prove that all our transfer functions have this property:

                +
                def test_z3_prove_constant_folding():
                +    solver, k1, n1, k2, n2 = z3_setup_variables()
                +    k3 = k1.abstract_invert()
                +    prove(z3.Implies(k1.is_constant(),
                +                     k3.is_constant()), solver)
                +
                +    k3 = k1.abstract_and(k2)
                +    prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),
                +                     k3.is_constant()), solver)
                +
                +    k3 = k1.abstract_or(k2)
                +    prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),
                +                     k3.is_constant()), solver)
                +
                +    k3 = k1.abstract_sub(k2)
                +    prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),
                +                     k3.is_constant()), solver)
                +
                +    k3 = z3_abstract_eq(k1, k2)
                +    prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),
                +                     k3.is_constant()), solver)
                +
                + +

                Proving with Z3 that the transfer functions are maximally precise for +non-constant arguments seems to be relatively hard. I tried a few completely +rigorous approaches and failed. The paper Sound, Precise, and Fast Abstract +Interpretation with Tristate Numbers +contains an optimality proof for the transfer functions of addition and +subtraction, so we can be certain that they are as precise as is +possible.

                +

                I still want to show an approach for trying to find concrete examples of +abstract values that are less precise than they could be, using a combination +of Hypothesis and Z3. The idea is to use hypothesis to pick random abstract +values. Then we compute the abstract result using our transfer function. +Afterwards we can ask Z3 to find us an abstract result that is better than the +one our transfer function produced. If Z3 finds a better abstract result, we +have a concrete example of imprecision for our transfer function. Those tests +aren't strict proofs, because they rely on generating random abstract values, +but they can still be valuable (not for the transfer functions in this blog +post, which are all optimal).

                +

                Here is what the code looks like (this is a little bit bonus content, I'll not +explain the details and can only hope that the comments are somewhat helpful):

                +
                @given(random_knownbits_and_contained_number, random_knownbits_and_contained_number)
                +@settings(deadline=None)
                +def test_check_precision(t1, t2):
                +    k1, n1 = t1
                +    k2, n2 = t2
                +    # apply transfer function
                +    k3 = k1.abstract_add(k2)
                +    example_res = n1 + n2
                +
                +    # try to find a better version of k3 with Z3
                +    solver = z3.Solver()
                +    solver.set("timeout", 8000)
                +
                +    var1 = BitVec('v1')
                +    var2 = BitVec('v2')
                +
                +    ones = BitVec('ones')
                +    unknowns = BitVec('unknowns')
                +    better_k3 = KnownBits(ones, unknowns)
                +    print(k1, k2, k3)
                +
                +    # we're trying to find an example for a better k3, so we use check, without
                +    # negation:
                +    res = solver.check(z3.And(
                +        # better_k3 should be a valid knownbits instance
                +        better_k3.is_well_formed(),
                +        # it should be better than k3, ie there are known bits in better_k3
                +        # that we don't have in k3
                +        better_k3.knowns & ~k3.knowns != 0,
                +        # now encode the correctness condition for better_k3 with a ForAll:
                +        # for all concrete values var1 and var2, it must hold that if
                +        # var1 is in k1 and var2 is in k2 it follows that var1 + var2 is in
                +        # better_k3
                +        z3.ForAll(
                +        [var1, var2],
                +        z3.Implies(
                +            z3.And(k1.contains(var1), k2.contains(var2)),
                +            better_k3.contains(var1 + var2)))))
                +    # if this query is satisfiable, we have found a better result for the
                +    # abstract_add
                +    if res == z3.sat:
                +        model = solver.model()
                +        rk3 = KnownBits(model.eval(ones).as_signed_long(), model.eval(unknowns).as_signed_long())
                +        print("better", rk3)
                +        assert 0
                +    if res == z3.unknown:
                +        print("timeout")
                +
                + +

                It does not actually fail for abstract_add (nor the other abstract +functions). To see the test failing we can add some imprecision to the +implementation of abstract_add to see Hypothesis and Z3 find examples of +values that are not optimally precise (for example by setting some bits +of unknowns in the implementation of abstract_add unconditionally).

                +

                Using the Abstract Domain in the Toy Optimizer for Generalized Constant Folding

                +

                Now after all this work we can finally actually use the knownbits abstract +domain in the toy optimizer. The code for this follows Max' intro post about +abstract interpretation +quite closely.

                +

                For completeness' sake, in the fold there are the basic infrastructure classes +that make up the IR again (they are identical or at least extremely close to +the previous toy posts).

                +
                toy infrastructure
                class Value:
                +    def find(self):
                +        raise NotImplementedError("abstract")
                +
                +
                +@dataclass(eq=False)
                +class Operation(Value):
                +    name : str
                +    args : list[Value]
                +
                +    forwarded : Optional[Value] = None
                +
                +    def find(self) -> Value:
                +        op = self
                +        while isinstance(op, Operation):
                +            next = op.forwarded
                +            if next is None:
                +                return op
                +            op = next
                +        return op
                +
                +    def arg(self, index):
                +        return self.args[index].find()
                +
                +    def make_equal_to(self, value : Value):
                +        self.find().forwarded = value
                +
                +
                +@dataclass(eq=False)
                +class Constant(Value):
                +    value : object
                +
                +    def find(self):
                +        return self
                +
                +
                +class Block(list):
                +    def __getattr__(self, opname):
                +        def wraparg(arg):
                +            if not isinstance(arg, Value):
                +                arg = Constant(arg)
                +            return arg
                +        def make_op(*args):
                +            op = Operation(opname,
                +                [wraparg(arg) for arg in args])
                +            self.append(op)
                +            return op
                +        return make_op
                +
                +
                +def bb_to_str(l : Block, varprefix : str = "var"):
                +    def arg_to_str(arg : Value):
                +        if isinstance(arg, Constant):
                +            return str(arg.value)
                +        else:
                +            return varnames[arg]
                +
                +    varnames = {}
                +    res = []
                +    for index, op in enumerate(l):
                +        # give the operation a name used while
                +        # printing:
                +        var =  f"{varprefix}{index}"
                +        varnames[op] = var
                +        arguments = ", ".join(
                +            arg_to_str(op.arg(i))
                +                for i in range(len(op.args))
                +        )
                +        strop = f"{var} = {op.name}({arguments})"
                +        res.append(strop)
                +    return "\n".join(res)
                +
                + + + +

                Now we can write some first tests, the first one simply checking constant +folding:

                +
                def test_constfold_two_ops():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.int_add(5, 4)
                +    var2 = bb.int_add(var1, 10)
                +    var3 = bb.int_add(var2, var0)
                +
                +    opt_bb = simplify(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = int_add(19, optvar0)"""
                +
                + +

                Calling the transfer functions on constant KnownBits produces a constant +result, as we have seen. Therefore "regular" constant folding should hopefully +be achieved by optimizing with the KnownBits abstract domain too.

                +

                The next two tests are slightly more complicated and can't be optimized by +regular constant-folding. They follow the motivating examples from the start of +this blog post, a hundred years ago:

                +
                def test_constfold_via_knownbits():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.int_or(var0, 1)
                +    var2 = bb.int_and(var1, 1)
                +    var3 = bb.dummy(var2)
                +
                +    opt_bb = simplify(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = int_or(optvar0, 1)
                +optvar2 = dummy(1)"""
                +
                +def test_constfold_alignment_check():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.int_invert(0b111)
                +    # mask off the lowest three bits, thus var2 is aligned
                +    var2 = bb.int_and(var0, var1)
                +    # add 16 to aligned quantity
                +    var3 = bb.int_add(var2, 16)
                +    # check alignment of result
                +    var4 = bb.int_and(var3, 0b111)
                +    var5 = bb.int_eq(var4, 0)
                +    # var5 should be const-folded to 1
                +    var6 = bb.dummy(var5)
                +
                +    opt_bb = simplify(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = int_and(optvar0, -8)
                +optvar2 = int_add(optvar1, 16)
                +optvar3 = dummy(1)"""
                +
                + +

                Here is simplify to make these tests pass:

                +
                def unknown_transfer_functions(*abstract_args):
                +    return KnownBits.all_unknown()
                +
                +
                +def simplify(bb: Block) -> Block:
                +    abstract_values = {} # dict mapping Operation to KnownBits
                +
                +    def knownbits_of(val : Value):
                +        if isinstance(val, Constant):
                +            return KnownBits.from_constant(val.value)
                +        return abstract_values[val]
                +
                +    opt_bb = Block()
                +    for op in bb:
                +        # apply the transfer function on the abstract arguments
                +        name_without_prefix = op.name.removeprefix("int_")
                +        method_name = f"abstract_{name_without_prefix}"
                +        transfer_function = getattr(KnownBits, method_name, unknown_transfer_functions)
                +        abstract_args = [knownbits_of(arg.find()) for arg in op.args]
                +        abstract_res = abstract_values[op] = transfer_function(*abstract_args)
                +        # if the result is a constant, we optimize the operation away and make
                +        # it equal to the constant result
                +        if abstract_res.is_constant():
                +            op.make_equal_to(Constant(abstract_res.ones))
                +            continue
                +        # otherwise emit the op
                +        opt_bb.append(op)
                +    return opt_bb
                +
                + +

                The code follows the approach from the previous blog post very closely. The +only difference is that we apply the transfer function first, to be able to +detect whether the abstract domain can tell us that the result has to always be +a constant. This code makes all three tests pass.

                +

                Using the KnownBits Domain for Conditional Peephole Rewrites

                +

                So far we are only using the KnownBits domain to find out that certain +operations have to produce a constant. We can also use the KnownBits domain +to check whether certain operation rewrites are correct. Let's use one of the +examples from the Mining JIT traces for missing optimizations with +Z3 +post, where Z3 found the inefficiency (x << 4) & -0xf == x << 4 in PyPy JIT +traces. We don't have shift operations, but we want to generalize this optimization +anyway. The general form of this rewrite is that under some circumstances x & +y == x, and we can use the KnownBits domain to detect situations where this +must be true.

                +

                To understand when x & y == x is true, we can think about individual pairs of +bits a and b. If a == 0, then a & b == 0 & b == 0 == a. If b == 1 +then a & b == a & 1 == a. So if either a == 0 or b == 1 is true, +a & b == a follows. And if either of these conditions is true for all the +bits of x and y, we can know that x & y == x.

                +

                We can write a method on KnownBits to check for this condition:

                +
                class KnownBits:
                +    ...
                +
                +    def is_and_identity(self, other):
                +        """ Return True if n1 & n2 == n1 for any n1 in self and n2 in other.
                +        (or, equivalently, return True if n1 | n2 == n2)"""
                +        return self.zeros | other.ones == -1
                +
                + +

                Since my reasoning about this feels ripe for errors, let's check that our +understanding is correct with Z3:

                +
                def test_prove_is_and_identity():
                +    solver, k1, n1, k2, n2 = z3_setup_variables()
                +    prove(z3.Implies(k1.is_and_identity(k2), n1 & n2 == n1), solver)
                +
                + +

                Now let's use this in the toy optimizer. Here are two tests for this rewrite:

                +
                def test_remove_redundant_and():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.int_invert(0b1111)
                +    # mask off the lowest four bits
                +    var2 = bb.int_and(var0, var1)
                +        # applying the same mask again is redundant
                +    var3 = bb.int_and(var2, var1)
                +    var4 = bb.dummy(var3)
                +
                +    opt_bb = simplify(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = int_and(optvar0, -16)
                +optvar2 = dummy(optvar1)"""
                +
                +def test_remove_redundant_and_more_complex():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.getarg(1)
                +    # var2 has bit pattern ????
                +    var2 = bb.int_and(var0, 0b1111)
                +    # var3 has bit pattern ...?1111
                +    var3 = bb.int_or(var1, 0b1111)
                +    # var4 is just var2
                +    var4 = bb.int_and(var2, var3)
                +    var5 = bb.dummy(var4)
                +
                +    opt_bb = simplify(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = getarg(1)
                +optvar2 = int_and(optvar0, 15)
                +optvar3 = int_or(optvar1, 15)
                +optvar4 = dummy(optvar2)"""
                +
                + +

                The first test could also be made to pass by implementing a reassociation +optimization that turns (x & c1) & c2 into x & (c1 & c2) and then constant-folds the second and. But here we want to +use KnownBits and conditionally rewrite int_and to its first argument. So to make the tests pass, +we can change simplify like this:

                +
                def simplify(bb: Block) -> Block:
                +    abstract_values = {} # dict mapping Operation to KnownBits
                +
                +    def knownbits_of(val : Value):
                +        ...
                +
                +    opt_bb = Block()
                +    for op in bb:
                +        # apply the transfer function on the abstract arguments
                +        name_without_prefix = op.name.removeprefix("int_")
                +        method_name = f"abstract_{name_without_prefix}"
                +        transfer_function = getattr(KnownBits, method_name, unknown_transfer_functions)
                +        abstract_args = [knownbits_of(arg.find()) for arg in op.args]
                +        abstract_res = abstract_values[op] = transfer_function(*abstract_args)
                +        # if the result is a constant, we optimize the operation away and make
                +        # it equal to the constant result
                +        if abstract_res.is_constant():
                +            op.make_equal_to(Constant(abstract_res.ones))
                +            continue
                +        # <<<< new code
                +        # conditionally rewrite int_and(x, y) to x
                +        if op.name == "int_and":
                +            k1, k2 = abstract_args
                +            if k1.is_and_identity(k2):
                +                op.make_equal_to(op.arg(0))
                +                continue
                +        # >>>> end changes
                +        opt_bb.append(op)
                +    return opt_bb
                +
                + +

                And with that, the new tests pass as well. A real implementation would also +check the other argument order, but we leave that out for the sake of brevity.

                +

                This rewrite also generalizes the rewrites int_and(0, x) -> 0 and +int_and(-1, x) -> x, let's add a test for those:

                +
                def test_remove_and_simple():
                +    bb = Block()
                +    var0 = bb.getarg(0)
                +    var1 = bb.getarg(1)
                +    var2 = bb.int_and(0, var0) # == 0
                +    var3 = bb.int_invert(var2) # == -1
                +    var4 = bb.int_and(var1, var3) # == var1
                +    var5 = bb.dummy(var4)
                +
                +    opt_bb = simplify(bb)
                +    assert bb_to_str(opt_bb, "optvar") == """\
                +optvar0 = getarg(0)
                +optvar1 = getarg(1)
                +optvar2 = dummy(optvar1)"""
                +
                + +

                This test just passes. And that's it for this post!

                +

                Conclusion

                +

                In this post we've seen the implementation, testing and proofs about a 'known +bits' abstract domain, as well as its use in the toy optimizer to generalize +constant folding, and to implement conditional peephole rewrites.

                +

                In the next posts I'll write about the real implementation of a knownbits +domain in PyPy's JIT, its combination with the existing interval abstract +domain, how to deal with gaining information from conditions in the program, +and some loose ends.

                +

                Sources:

                + +
                +
                +
                  +
                1. +

                  There's a subtlety about the Z3 proofs that I'm sort of +glossing over here. Python integers are of arbitrary width, and the +KnownBits code is actually carefully written to work for integers of any +size. This property is tested by the Hypothesis tests, which don't limit +the sizes of the generated random integers. However, the Z3 proofs only +check bitvectors of a fixed bitwidth of 64. There are various ways to deal +with this situation. For most "real" compilers, the bitwidth of integers +would be fixed anyway. Then the components ones and unknowns of the +KnownBits class would use the number of bits the corresponding integer +variable has, and the Z3 proofs would use the same width. This is what we +do in the PyPy JIT. 

                  +
                2. +
                3. +

                  The less close connection between implementation and proof +for abstract_eq is one of the reasons why it makes sense to do +unit-testing in addition to proofs. For a more detailed explanation of +why both tests and proofs are good to +have, see Jeremy Siek's blog +post, +as well as the Knuth +quote

                  +
                4. +
                +
                +
                +

                Abstract interpretation in the Toy Optimizer

                + +
                +

                This is a cross-post +from Max Bernstein from his excellent blog where he writes about programming +languages, compilers, optimizations, virtual machines. He's looking for a +(dynamic language runtime or compiler related) job too.

                +
                +

                CF Bolz-Tereick wrote some excellent posts in which they introduce a small IR +and optimizer and extend it with allocation +removal. We also did a live stream together in which +we did some more heap optimizations.

                +

                In this blog post, I'm going to write a small abstract interpreter for the Toy +IR and then show how we can use it to do some simple optimizations. It assumes +that you are familiar with the little IR, which I have reproduced unchanged in +a GitHub Gist.

                +

                Abstract interpretation is a general framework for efficiently computing +properties that must be true for all possible executions of a program. It's a +widely used approach both in compiler optimizations as well as offline static +analysis for finding bugs. I'm writing this post to pave the way for CF's next +post on proving abstract interpreters correct for range analysis and known bits +analysis inside PyPy.

                +

                Before we begin, I want to note a couple of things:

                +
                  +
                • The Toy IR is in SSA form, which means that every variable is defined exactly + once. This means that abstract properties of each variable are easy to track.
                • +
                • The Toy IR represents a linear trace without control flow, meaning we won't + talk about meet/join or fixpoints. They only make sense if the IR has a + notion of conditional branches or back edges (loops).
                • +
                +

                Alright, let's get started.

                +

                Welcome to abstract interpretation

                +

                Abstract interpretation means a couple different things to different people. +There's rigorous mathematical formalism thanks to Patrick and Radhia Cousot, +our favorite power couple, and there's also sketchy hand-wavy stuff like what +will follow in this post. In the end, all people are trying to do is reason +about program behavior without running it.

                +

                In particular, abstract interpretation is an over-approximation of the +behavior of a program. Correctly implemented abstract interpreters never lie, +but they might be a little bit pessimistic. This is because instead of using +real values and running the program---which would produce a concrete result and +some real-world behavior---we "run" the program with a parallel universe of +abstract values. This abstract run gives us information about all possible +runs of the program.1

                +

                Abstract values always represent sets of concrete values. Instead of literally +storing a set (in the world of integers, for example, it could get pretty +big...there are a lot of integers), we group them into a finite number of named +subsets.2

                +

                Let's learn a little about abstract interpretation with an example program and +example abstract domain. Here's the example program:

                +
                v0 = 1
                +v1 = 2
                +v2 = add(v0, v1)
                +
                + +

                And our abstract domain is "is the number positive" (where "positive" means +nonnegative, but I wanted to keep the words distinct):

                +
                       top
                +    /       \
                +positive    negative
                +    \       /
                +      bottom
                +
                + +

                The special top value means "I don't know" and the special bottom value +means "empty set" or "unreachable". The positive and negative values +represent the sets of all positive and negative numbers, respectively.

                +

                We initialize all the variables v0, v1, and v2 to bottom and then walk +our IR, updating our knowledge as we go.

                +
                # here
                +v0:bottom = 1
                +v1:bottom = 2
                +v2:bottom = add(v0, v1)
                +
                + +

                In order to do that, we have to have transfer functions for each operation. +For constants, the transfer function is easy: determine if the constant is +positive or negative. For other operations, we have to define a function that +takes the abstract values of the operands and returns the abstract value of the +result.

                +

                In order to be correct, transfer functions for operations have to be compatible +with the behavior of their corresponding concrete implementations. You can +think of them having an implicit universal quantifier forall in front of +them.

                +

                Let's step through the constants at least:

                +
                v0:positive = 1
                +v1:positive = 2
                +# here
                +v2:bottom = add(v0, v1)
                +
                + +

                Now we need to figure out the transfer function for add. It's kind of tricky +right now because we haven't specified our abstract domain very well. I keep +saying "numbers", but what kinds of numbers? Integers? Real numbers? Floating +point? Some kind of fixed-width bit vector (int8, uint32, ...) like an +actual machine "integer"?

                +

                For this post, I am going to use the mathematical definition of integer, which +means that the values are not bounded in size and therefore do not overflow. +Actual hardware memory constraints aside, this is kind of like a Python int.

                +

                So let's look at what happens when we add two abstract numbers:

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                toppositivenegativebottom
                toptoptoptopbottom
                positivetoppositivetopbottom
                negativetoptopnegativebottom
                bottombottombottombottombottom
                +

                As an example, let's try to add two numbers a and b, where a is positive +and b is negative. We don't know anything about their values other than their +signs. They could be 5 and -3, where the result is 2, or they could be +1 and -100, where the result is -99. This is why we can't say anything +about the result of this operation and have to return top.

                +

                The short of this table is that we only really know the result of an addition +if both operands are positive or both operands are negative. Thankfully, in +this example, both operands are known positive. So we can learn something about +v2:

                +
                v0:positive = 1
                +v1:positive = 2
                +v2:positive = add(v0, v1)
                +# here
                +
                + +

                This may not seem useful in isolation, but analyzing more complex programs even +with this simple domain may be able to remove checks such as if (v2 < 0) { ... }.

                +

                Let's take a look at another example using a sample absval (absolute value) +IR operation:

                +
                v0 = getarg(0)
                +v1 = getarg(1)
                +v2 = absval(v0)
                +v3 = absval(v1)
                +v4 = add(v2, v3)
                +v5 = absval(v4)
                +
                + +

                Even though we have no constant/concrete values, we can still learn something +about the states of values throughout the program. Since we know that absval +always returns a positive number, we learn that v2, v3, and v4 are all +positive. This means that we can optimize out the absval operation on v5:

                +
                v0:top = getarg(0)
                +v1:top = getarg(1)
                +v2:positive = absval(v0)
                +v3:positive = absval(v1)
                +v4:positive = add(v2, v3)
                +v5:positive = v4
                +
                + +

                Other interesting lattices include:

                +
                  +
                • Constants (where the middle row is pretty wide)
                • +
                • Range analysis (bounds on min and max of a number)
                • +
                • Known bits (using a bitvector representation of a number, which bits are + always 0 or 1)
                • +
                +

                For the rest of this blog post, we are going to do a very limited version of +"known bits", called parity. This analysis only tracks the least significant +bit of a number, which indicates if it is even or odd.

                +

                Parity

                +

                The lattice is pretty similar to the positive/negative lattice:

                +
                    top
                +  /     \
                +even    odd
                +  \     /
                +   bottom
                +
                + +

                Let's define a data structure to represent this in Python code:

                +
                class Parity:
                +    def __init__(self, name):
                +        self.name = name
                +
                +    def __repr__(self):
                +        return self.name
                +
                + +

                And instantiate the members of the lattice:

                +
                TOP = Parity("top")
                +EVEN = Parity("even")
                +ODD = Parity("odd")
                +BOTTOM = Parity("bottom")
                +
                + +

                Now let's write a forward flow analysis of a basic block using this lattice. +We'll do that by assuming that a method on Parity is defined for each IR +operation. For example, Parity.add, Parity.lshift, etc.

                +
                def analyze(block: Block) -> None:
                +    parity = {v: BOTTOM for v in block}
                +
                +    def parity_of(value):
                +        if isinstance(value, Constant):
                +            return Parity.const(value)
                +        return parity[value]
                +
                +    for op in block:
                +        transfer = getattr(Parity, op.name)
                +        args = [parity_of(arg.find()) for arg in op.args]
                +        parity[op] = transfer(*args)
                +
                + +

                For every operation, we compute the abstract value---the parity---of the +arguments and then call the corresponding method on Parity to get the +abstract result.

                + +

                We need to special case Constants due to a quirk of how the Toy IR is +constructed: the constants don't appear in the instruction stream and instead +are free-floating.

                +

                Let's start by looking at the abstraction function for concrete +values---constants:

                +
                class Parity:
                +    # ...
                +    @staticmethod
                +    def const(value):
                +        if value.value % 2 == 0:
                +            return EVEN
                +        else:
                +            return ODD
                +
                + +

                Seems reasonable enough. Let's pause on operations for a moment and consider an +example program:

                +
                v0 = getarg(0)
                +v1 = getarg(1)
                +v2 = lshift(v0, 1)
                +v3 = lshift(v1, 1)
                +v4 = add(v2, v3)
                +v5 = dummy(v4)
                +
                + +

                This function (which is admittedly a little contrived) takes two inputs, shifts +them left by one bit, and adds the results. It then passes the sum into a dummy function, +which you can think of as "return" or "escape".

                +

                To do some abstract interpretation on this program, we'll need to implement the +transfer functions for lshift and add (dummy will just always return +TOP). We'll start with add. Remember that adding two even numbers returns +an even number, adding two odd numbers returns an even number, and mixing even +and odd returns an odd number.

                +
                class Parity:
                +    # ...
                +    def add(self, other):
                +        if self is BOTTOM or other is BOTTOM:
                +            return BOTTOM
                +        if self is TOP or other is TOP:
                +            return TOP
                +        if self is EVEN and other is EVEN:
                +            return EVEN
                +        if self is ODD and other is ODD:
                +            return EVEN
                +        return ODD
                +
                + +

                We also need to fill in the other cases where the operands are top or +bottom. In this case, they are both "contagious"; if either operand is +bottom, the result is as well. If neither is bottom but either operand is top, +the result is as well.

                +

                Now let's look at lshift. Shifting any number left by a non-zero number of +bits will always result in an even number, but we need to be careful about the +zero case! Shifting by zero doesn't change the number at all. Unfortunately, +since our lattice has no notion of zero, we have to over-approximate here:

                +
                class Parity:
                +    # ...
                +    def lshift(self, other):
                +        # self << other
                +        if other is ODD:
                +            return EVEN
                +        return TOP
                +
                + +

                This means that we will miss some opportunities to optimize, but it's a +tradeoff that's just part of the game. (We could also add more elements to our +lattice, but that's a topic for another day.)

                +

                Now, if we run our abstract interpretation, we'll collect some interesting +properties about the program. If we temporarily hack on the internals of +bb_to_str, we can print out parity information alongside the IR operations:

                +
                v0:top = getarg(0)
                +v1:top = getarg(1)
                +v2:even = lshift(v0, 1)
                +v3:even = lshift(v1, 1)
                +v4:even = add(v2, v3)
                +v5:top = dummy(v4)
                +
                + +

                This is pretty awesome, because we can see that v4, the result of the +addition, is always even. Maybe we can do something with that information.

                +

                Optimization

                +

                One way that a program might check if a number is odd is by checking the least +significant bit. This is a common pattern in C code, where you might see code +like y = x & 1. Let's introduce a bitand IR operation that acts like the +& operator in C/Python. Here is an example of use of it in our program:

                +
                v0 = getarg(0)
                +v1 = getarg(1)
                +v2 = lshift(v0, 1)
                +v3 = lshift(v1, 1)
                +v4 = add(v2, v3)
                +v5 = bitand(v4, 1)  # new!
                +v6 = dummy(v5)
                +
                + +

                We'll hold off on implementing the transfer function for it---that's left as an +exercise for the reader---and instead do something different.

                +

                Instead, we'll see if we can optimize operations of the form bitand(X, 1). If +we statically know the parity as a result of abstract interpretation, we can +replace the bitand with a constant 0 or 1.

                +

                We'll first modify the analyze function (and rename it) to return a new +Block containing optimized instructions:

                +
                def simplify(block: Block) -> Block:
                +    parity = {v: BOTTOM for v in block}
                +
                +    def parity_of(value):
                +        if isinstance(value, Constant):
                +            return Parity.const(value)
                +        return parity[value]
                +
                +    result = Block()
                +    for op in block:
                +        # TODO: Optimize op
                +        # Emit
                +        result.append(op)
                +        # Analyze
                +        transfer = getattr(Parity, op.name)
                +        args = [parity_of(arg.find()) for arg in op.args]
                +        parity[op] = transfer(*args)
                +    return result
                +
                + +

                We're approaching this the way that PyPy does things under the hood, which is +all in roughly a single pass. It tries to optimize an instruction away, and if +it can't, it copies it into the new block.

                +

                Now let's add in the bitand optimization. It's mostly some gross-looking +pattern matching that checks if the right hand side of a bitwise and +operation is 1 (TODO: the left hand side, too). CF had some neat ideas on how +to make this more ergonomic, which I might save for later.3

                +

                Then, if we know the parity, optimize the bitand into a constant.

                +
                def simplify(block: Block) -> Block:
                +    parity = {v: BOTTOM for v in block}
                +
                +    def parity_of(value):
                +        if isinstance(value, Constant):
                +            return Parity.const(value)
                +        return parity[value]
                +
                +    result = Block()
                +    for op in block:
                +        # Try to simplify
                +        if isinstance(op, Operation) and op.name == "bitand":
                +            arg = op.arg(0)
                +            mask = op.arg(1)
                +            if isinstance(mask, Constant) and mask.value == 1:
                +                if parity_of(arg) is EVEN:
                +                    op.make_equal_to(Constant(0))
                +                    continue
                +                elif parity_of(arg) is ODD:
                +                    op.make_equal_to(Constant(1))
                +                    continue
                +        # Emit
                +        result.append(op)
                +        # Analyze
                +        transfer = getattr(Parity, op.name)
                +        args = [parity_of(arg.find()) for arg in op.args]
                +        parity[op] = transfer(*args)
                +    return result
                +
                + +

                Remember: because we use union-find to rewrite instructions in the optimizer +(make_equal_to), later uses of the same instruction get the new +optimized version "for free" (find).

                +

                Let's see how it works on our IR:

                +
                v0 = getarg(0)
                +v1 = getarg(1)
                +v2 = lshift(v0, 1)
                +v3 = lshift(v1, 1)
                +v4 = add(v2, v3)
                +v6 = dummy(0)
                +
                + +

                Hey, neat! bitand disappeared and the argument to dummy is now the constant +0 because we know the lowest bit.

                +

                Wrapping up

                +

                Hopefully you have gained a little bit of an intuitive understanding of +abstract interpretation. Last year, being able to write some code made me more +comfortable with the math. Now being more comfortable with the math is helping +me write the code. It's a nice upward spiral.

                +

                The two abstract domains we used in this post are simple and not very useful in +practice but it's possible to get very far using slightly more complicated +abstract domains. Common domains include: constant propagation, type inference, +range analysis, effect inference, liveness, etc. For example, here is a +sample lattice for constant propagation:

                +
                + +

                It has multiple levels to indicate more and less precision. For example, you +might learn that a variable is either 1 or 2 and be able to encode that as +nonnegative instead of just going straight to top.

                +

                Check out some real-world abstract interpretation in open source projects:

                + +

                If you have some readable examples, please share them so I can add them.

                +

                Acknowledgements

                +

                Thank you to CF Bolz-Tereick for the toy optimizer and +helping edit this post!

                +
                +
                +
                  +
                1. +

                  In the words of abstract interpretation researchers Vincent Laviron +and Francesco Logozzo in their paper Refining Abstract +Interpretation-based Static Analyses with Hints (APLAS 2009):

                  +
                  +

                  The three main elements of an abstract interpretation are: (i) the +abstract elements ("which properties am I interested in?"); (ii) the +abstract transfer functions ("which is the abstract semantics of basic +statements?"); and (iii) the abstract operations ("how do I combine the +abstract elements?").

                  +
                  +

                  We don't have any of these "abstract operations" in this post because +there's no control flow but you can read about them elsewhere! 

                  +
                2. +
                3. +

                  These abstract values are arranged in a lattice, which is a +mathematical structure with some properties but the most important ones are +that it has a top, a bottom, a partial order, a meet operation, and values +can only move in one direction on the lattice.

                  +

                  Using abstract values from a lattice promises two things:

                  +
                    +
                  • The analysis will terminate
                  • +
                  • The analysis will be correct for any run of the program, not just one + sample run
                  • +
                  +

                  +
                4. +
                5. +

                  Something about __match_args__ and @property... 

                  +
                6. +
                +
                +
                +

                Mining JIT traces for missing optimizations with Z3

                + +
                +

                In my last post I've described how to use Z3 to find simple local peephole +optimization patterns +for the integer operations in PyPy's JIT. An example is int_and(x, 0) -> +0. In this post I want to scale up the problem of identifying possible +optimizations to much bigger instruction sequences, also using Z3. For that, I +am starting with the JIT traces of real benchmarks, after they have been +optimized by the optimizer of PyPy's JIT. Then we can ask Z3 to find +inefficient integer operations in those traces.

                +

                Starting from the optimized traces of real programs has some big +advantages over the "classical" superoptimization approach of generating and +then trying all possible sequences of instructions. It avoids the +combinatorial explosion that happens with the latter approach. Also, starting +from the traces of benchmarks or (even better) actual programs makes sure that +we actually care about the missing optimizations +that are found in this way. And because the traces are analyzed after they have +been optimized by PyPy's optimizer, we only get reports for missing +optimizations, that the JIT isn't able to do (yet).

                +

                The techniques and experiments I describe in this post are again the result of +a bunch of discussions with John Regehr at a conference a few weeks ago, as +well as reading his blog posts and papers. Thanks John! Also thanks to Max +Bernstein for super helpful feedback on the drafts +of this blog post (and for poking me to write things in general).

                +

                High-Level Approach

                +

                The approach that I took works as follows:

                +
                  +
                • Run benchmarks or other interesting programs and then dump the IR of the JIT + traces into a file. The traces have at that point been already optimized by + the PyPy JIT's optimizer.
                • +
                • For every trace, ignore all the operations on non-integer variables.
                • +
                • Translate every integer operation into a Z3 formula.
                • +
                • For every operation, use Z3 to find out whether the operation is redundant + (how that is done is described below).
                • +
                • If the operation is redundant, the trace is less efficient than it could have + been, because the optimizer could also have removed the operation. Report the + inefficiency.
                • +
                • Minimize the inefficient programs by removing as many operations as possible + to make the problem easier to understand.
                • +
                +

                In the post I will describe the details and show some pseudocode of the +approach. I'll also make the proper code public eventually (but it needs a +healthy dose of cleanups first).

                +

                Dumping PyPy Traces

                +

                PyPy will write its JIT traces into the file out if the environment variable +PYPYLOG is set as follows:

                +
                PYPYLOG=jit-log-opt:out pypy <program.py>
                +
                + +

                This environment variable works for PyPy, but also for other virtual machines +built with RPython.

                +

                (This is really a side point for the rest of the blog post, but since the +question came up I wanted to clarify it: Operations on integers in the Python +program that the JIT is running don't all correspond 1-to-1 with the int_... +operations in the traces. The int_... trace operations always operate on +machine words. The Python int type supports arbitrarily large integers. PyPy +will optimistically try to lower the operations on Python integers into machine +word operations, but adds the necessary guards into the trace to make sure that +overflow outside of the range of machine words is caught. In case one of these +guards fails the interpreter switches to a big integer heap-allocated +representation.)

                +

                Encoding Traces as Z3 formulas

                +

                The last blog post already contained the code to encode the results of +individual trace operations into Z3 formulas, so we don't need to repeat that +here. To encode traces of operations we introduce a Z3 variable for every +operation in the trace and then call the z3_expression function for every +single one of the operations in the trace.

                +

                For example, for the following trace:

                +
                [i1]
                +i2 = uint_rshift(i1, 32)
                +i3 = int_and(i2, 65535)
                +i4 = uint_rshift(i1, 48)
                +i5 = int_lshift(i4, 16)
                +i6 = int_or(i5, i3)
                +jump(i6, i2) # equal
                +
                + +

                We would get the Z3 formula:

                +
                z3.And(i2 == LShR(i1, 32),
                +       i3 == i2 & 65535,
                +       i4 == LShR(i1, 48),
                +       i5 == i4 << 16)
                +
                + +

                Usually we won't ask for the formula of the whole trace at once. Instead we go +through the trace operation by operation and try to find inefficiencies in the +current one we are looking at. Roughly like this (pseudo-)code:

                +
                def newvar(name):
                +    return z3.BitVec(name, INTEGER_WIDTH)
                +
                +def find_inefficiencies(trace):
                +    solver = z3.Solver()
                +    var_to_z3var = {}
                +    for input_argument in trace.inputargs:
                +        var_to_z3var[input_argument] = newz3var(input_argument)
                +    for op in trace:
                +        var_to_z3var[op] = z3resultvar = newz3var(op.resultvarname)
                +        arg0 = op.args[0]
                +        z3arg0 = var_to_z3var[arg0]
                +        if len(op.args) == 2:
                +            arg1 = op.args[1]
                +            z3arg1 = var_to_z3var[arg1]
                +        else:
                +            z3arg1 = None
                +        res, valid_if = z3_expression(op.name, z3arg0, z3arg1)
                +        # checking for inefficiencies, see the next sections
                +        ...
                +        if ...:
                +            return "inefficient", op
                +
                +        # not inefficient, assert op into the solver and continue with the next op
                +        solver.add(z3resultvar == res)
                +    return None # no inefficiency found
                +
                + +

                Identifying constant booleans with Z3

                +

                To get started finding inefficiencies in a trace, we can +first focus on boolean variables. For every operation in the trace that +returns a bool we can ask Z3 to prove that this variable must be always True or +always False. Most of the time, neither of these proofs will succeed. But if Z3 +manages to prove one of them, we know we have found an inefficiency: instead of +computing the boolean result (eg by executing a comparison) the JIT's optimizer +could have replaced the operation with the corresponding boolean constant.

                +

                Here's an example of an inefficiency found that way: if x < y and y < z are +both true, PyPy's JIT could conclude that x < z must also +be true. However, currently the JIT cannot make that conclusion because it +only reasons about the concrete ranges (lower and upper bounds) for every +integer variable, but it has no way to remember anything about relationships +between different variables. This kind of reasoning would quite often be useful +to remove list/string bounds checks. Here's a talk about how LLVM does +this (but it might be +too heavyweight for a JIT setting).

                +

                Here are some more examples found that way:

                +
                  +
                • +x - 1 == x is always False
                • +
                • +x - (x == -1) == -1 is always False. The pattern x - (x == -1) happens a + lot in PyPy's hash computations: To be compatible with the CPython hashes we + need to make sure that no object's hash is -1 (CPython uses -1 as an error + value on the C level).
                • +
                +

                Here's pseudo-code for how to implement checking boolean operations for +inefficiencies:

                +
                def find_inefficiencies(trace):
                +    ...
                +    for op in trace:
                +        ...
                +        res, valid_if = z3_expression(op.name, z3arg0, z3arg1)
                +        # check for boolean constant result
                +        if op.has_boolean_result():
                +            if prove(solver, res == 0):
                +                return "inefficient", op, 0
                +            if prove(solver, res == 1):
                +                return "inefficient", op, 1
                +        # checking for other inefficiencies, see the next sections
                +        ...
                +
                +        # not inefficient, add op to the solver and continue with the next op
                +        solver.add(z3resultvar == res)
                +    return None # no inefficiency found
                +
                + +

                Identifying redundant operations

                +

                A more interesting class of redundancy is to try to find two operations in a +trace that compute the same result. We can do that by asking Z3 to prove for +each pair of different operations in the trace that the result is +always the same. If a previous operation returns the same result, the JIT could +have re-used that result instead of re-computing it, saving time. Doing this +search for equivalent operations with Z3 is quadratic in the number of +operations, but since traces have a maximum length it is not too bad in +practice.

                +

                This is the real workhorse of my script so far, it's what finds most of the +inefficiencies. Here's a few examples:

                +
                  +
                • The very first and super useful example the script found is int_eq(b, 1) == + b if b is known to be a boolean (ie an integer 0 or 1). I have already + implemented this optimization in the JIT.
                • +
                • Similarly, int_and(b, 1) == b for booleans.
                • +
                • (x << 4) & -0xf == x << 4
                • +
                • +(((x >> 63) << 1) << 2) >> 3 == x >> 63. In general the JIT is quite bad at + optimizing repeated shifts (the infrastructure for doing better with that is + already in place, so this will be a relatively easy fix).
                • +
                • +(x & 0xffffffff) | ((x >> 32) << 32) == x. Having the JIT optimize this + would maybe require first recognizing that (x >> 32) << 32 can be expressed + as a mask: (x & 0xffffffff00000000), and then using (x & c1) | (x & c2) == + x & (c1 | c2) +
                • +
                • A commonly occurring pattern is variations of this one: + ((x & 1345) ^ 2048) - 2048 == x & 1345 (with different constants, of + course). xor is add without carry, and x & 1345 does not have the bit + 2048 set. Therefore the ^ 2048 is equivalent to + 2048, which the - + 2048 cancels. More generally, if a & b == 0, then a + b == a | b == a ^ b. + I don't understand at all why this appears so often in the traces, but I + see variations of it a lot. LLVM can optimize this, but GCC + can't, thanks to + Andrew Pinski for filing the + bug!
                • +
                +

                And here's some implementation pseudo-code again:

                +
                def find_inefficiencies(trace):
                +    ...
                +    for op in trace:
                +        ...
                +        res, valid_if = z3_expression(op.name, z3arg0, z3arg1)
                +        # check for boolean constant result
                +        ...
                +        # searching for redundant operations
                +        for previous_op in trace:
                +            if previous_op is op:
                +                break # done, reached the current op
                +            previous_op_z3var = var_to_z3var[previous_op]
                +            if prove(solver, previous_op_z3var == res):
                +                return "inefficient", op, previous_op
                +        ...
                +        # more code here later
                +        ...
                +
                +        # not inefficient, add op to the solver and continue with the next op
                +        solver.add(z3resultvar == res)
                +    return None # no inefficiency found
                +
                + +

                Synthesizing more complicated constants with exists-forall

                +

                To find out whether some integer operations always return a constant result, we +can't simply use the same trick as for those operations that return boolean +results, because enumerating 2⁶⁴ possible constants and checking them all +would take too long. Like in the last post, we can use z3.ForAll to find out +whether Z3 can synthesize a constant for the result of an operation for us. +If such a constant exists, the JIT could have removed the operation, +and replaced it with the constant that Z3 provides.

                +

                Here are a few examples of inefficiencies found this way:

                +
                  +
                • +(x ^ 1) ^ x == 1 (or, more generally: (x ^ y) ^ x == y)
                • +
                • if x | y == 0, it follows that x == 0 and y == 0 +
                • +
                • if x != MAXINT, then x + 1 > x +
                • +
                +

                Implementing this is actually slightly annoying. The solver.add calls for +non-inefficient ops add assertions to the solver, which are now confusing the +z3.ForAll query. We could remove all assertions from the solver, then do the +ForAll query, then add the assertions back. What I ended up doing instead was +instantiating a second solver object that I'm using for the ForAll queries, +that remains empty the whole time.

                +
                def find_inefficiencies(trace):
                +    solver = z3.Solver()
                +    empty_solver = z3.Solver()
                +    var_to_z3var = {}
                +    ...
                +    for op in trace:
                +        ...
                +        res, valid_if = z3_expression(op.name, z3arg0, z3arg1)
                +        # check for boolean constant result
                +        ...
                +        # searching for redundant operations
                +        ...
                +        # checking for constant results
                +        constvar = z3.BitVec('find_const', INTEGER_WIDTH)
                +        condition = z3.ForAll(
                +            var_to_z3var.values(),
                +            z3.Implies(
                +                *solver.assertions(),
                +                expr == constvar
                +            )
                +        )
                +        if empty_solver.check(condition) == z3.sat:
                +            model = empty_solver.model()
                +            const = model[constvar].as_signed_long()
                +            return "inefficient", op, const
                +
                +        # not inefficient, add op to the solver and continue with the next op
                +        solver.add(z3resultvar == res)
                +    return None # no inefficiency found
                +
                + +

                Minimization

                +

                Analyzing an inefficiency by hand in the context of a larger trace is quite +tedious. Therefore I've implemented a (super inefficient) script to try to make +the examples smaller. Here's how that works:

                +
                  +
                • First throw out all the operations that occur after the inefficient operation + in the trace.
                • +
                • Then we remove all "dead" operations, ie operations that don't have their + results used (all the operations that we can analyze with Z3 are without side + effects).
                • +
                • Now we try to remove every guard in the trace one by one and check + afterwards, whether the resulting trace still has an inefficiency.
                • +
                • We also try to replace every single operation with a new argument to the + trace, to see whether the inefficiency is still present.
                • +
                +

                The minimization process is sort of inefficient and I should probably be using + shrinkray or + C-Reduce instead. However, it + seems to work well in practice and the runtime isn't too bad.

                +

                Results

                +

                So far I am using the JIT traces of three programs: 1) Booting Linux on the +Pydrofoil RISC-V emulator, 2) booting Linux on the Pydrofoil ARM emulator, and 3) +running the PyPy bootstrap process on top of PyPy.

                +

                I picked these programs because most Python programs don't contain interesting +amounts of integer operations, and the traces of the emulators +contain a lot of them. I also used the bootstrap process because I still wanted +to try a big Python program and personally care about the runtime of this +program a lot.

                +

                The script identifies 94 +inefficiencies in the traces, a lot of them come from repeating +patterns. My next steps will be to manually inspect them all, categorize them, and +implement easy optimizations identified that way. I also want a way to sort the +examples by execution count in the benchmarks, to get a feeling for which of +them are most important.

                +

                I didn't investigate the full set of Python +benchmarks that PyPy uses yet, because I don't expect +them to contain interesting amounts of integer operations, but maybe I am wrong +about that? Will have to try eventually.

                +

                Conclusion

                +

                This was again much easier to do than I would have expected! Given that I had +the translation of trace ops to Z3 already in place, it was a matter of about a +day of programming to use this infrastructure to find the first problems and +minimize them.

                +

                Reusing the results of existing operations or replacing operations by constants +can be seen as "zero-instruction superoptimization". I'll probably be rather +busy for a while to add the missing optimizations identified by my simple +script. But later extensions to actually synthesize one or several operations +in the attempt to optimize the traces more and find more opportunities should +be possible.

                +

                Finding inefficiencies in traces with Z3 is significantly less +annoying and also less error-prone than just manually inspecting traces and +trying to spot optimization opportunities.

                +

                Random Notes and Sources

                +

                Again, John's blog posts:

                + +

                and papers:

                + +

                I remembered recently that I had seen the approach of optimizing the traces of +a tracing JIT with Z3 a long time ago, as part of the (now long dead, I think) +SPUR +project. +There's a workshop +paper +from 2010 about this. SPUR was trying to use Z3 built into the actual JIT (as +opposed to using Z3 only to find places where the regular optimizers could be +improved). In addition to bitvectors, SPUR also used the Z3 support for arrays +to model the C# heap and remove redundant stores. This is still another future +extension for all the Z3 work I've been doing in the context of the PyPy JIT.

                +
                +

                Finding Simple Rewrite Rules for the JIT with Z3

                + +
                +

                In June I was at the PLDI conference in +Copenhagen to present a paper +I co-authored with Max Bernstein. I also finally +met John Regehr, who I'd been talking to on social +media for ages but had never met. John has been working on compiler correctness +and better techniques for building compilers and optimizers for a very long +time. The blog post Finding JIT Optimizer Bugs using SMT Solvers and +Fuzzing +was heavily inspired by this work. We talked a lot about his and his group's +work on using Z3 for +superoptimization and for +finding missing optimizations. I have applied some of the things John told me +about to the traces of PyPy's JIT, and wanted to blog about that. However, my +draft felt quite hard to understand. Therefore I have now written this current +post, to at least try to provide a somewhat gentler on-ramp to the topic.

                +

                In this post we will use the Python-API to Z3 to find local peephole rewrite +rules for the operations in the intermediate representation of PyPy's tracing +JIT. The code for this is simple enough that we can go through all of it.

                +

                The PyPy JIT produces traces of machine level instructions, which are optimized +and then turned into machine code. The optimizer uses a number of approaches to +make the traces more efficient. For integer operations it applies a number of +arithmetic simplification rules, for example int_add(x, 0) -> x. When +implementing these rules in the JIT there are two problems: How do we know +that the rules are correct? And how do we know that we haven't forgotten any +rules? We'll try to answer both of these, but the first one in particular.

                +

                We'll be using Z3, a satisfiability modulo theories (SMT) solver which has good +bitvector support and most importantly an excellent Python API. We can use the +solver to reason about bitvectors, which are how we will model machine +integers.

                +

                To find rewrite rules, we will consider the binary operations (i.e. those +taking two arguments) in PyPy traces that take and produce integers. The +completely general form op(x, y) is not simplifiable on its own. But if +either x == y +or if one of the arguments is a constant, we can potentially simplify the +operation into a simpler form. The results are either the variable x, or a +(potentially different) constant. We'll ignore constant-folding where both +arguments of the binary operation are constants. The possible results for a +simplifiable binary operation are the variable x or another constant. This +leaves the following patterns as possibilities:

                +
                  +
                • op(x, x) == x
                • +
                • op(x, x) == c1
                • +
                • op(x, c1) == x
                • +
                • op(c1, x) == x
                • +
                • op(x, c1) == c2
                • +
                • op(c1, x) == c2
                • +
                +

                Our approach will be to take every single supported binary integer operation, +instantiate all of these patterns, and try to ask Z3 whether the resulting +simplification is valid for all values of x.

                +

                Quick intro to the Z3 Python-API

                +

                Here's a terminal session showing the use of the Z3 Python API:

                +
                >>>> import z3
                +>>>> # construct a Z3 bitvector variable of width 8, with name x:
                +>>>> x = z3.BitVec('x', 8)
                +>>>> # construct a more complicated formula by using operator overloading:
                +>>>> x + x
                +x + x
                +>>>> x + 1
                +x + 1
                +
                + +

                Z3 checks the "satisfiability" of a formula. This means that it tries to find +an example set of concrete values for the variables that occur in a formula, +such that the formula becomes true. Examples:

                +
                >>>> solver = z3.Solver()
                +>>>> solver.check(x * x == 3)
                +unsat
                +>>>> # meaning no x fulfils this property
                +>>>>
                +>>>> solver.check(x * x == 9)
                +sat
                +>>>> model = solver.model()
                +>>>> model
                +[x = 253]
                +>>>> model[x].as_signed_long()
                +-3
                +>>>> # 253 is the same as -3 in two's complement arithmetic with 8 bits
                +
                + +

                In order to use Z3 to prove something, we can ask Z3 to find counterexamples +for the statement, meaning concrete values that would make the negation of the +statement true:

                +
                >>>> solver.check(z3.Not(x ^ -1 == ~x))
                +unsat
                +
                + +

                The result unsat means that we just proved that x ^ -1 == ~x is true for +all x, because there is no value for x that makes not (x ^ -1 == ~x) +true (this works because -1 has all the bits set).

                +

                If we try to prove something incorrect in this way, the following happens:

                +
                >>>> solver.check(z3.Not(x ^ -1 == x))
                +sat
                +
                + +

                sat shows that x ^ -1 == x is (unsurprisingly) not always true, and we can +ask for a counterexample:

                +
                >>>> solver.model()
                +[x = 0]
                +
                + +

                This way of proving this works because the check calls try to solve an +(implicit) "exists" quantifier, over all the Z3 variables used in the formula. +check will either return z3.unsat, which means that no concrete values make +the formula true; or z3.sat, which means that you can get some concrete +values that make the formula true by calling solver.model().

                +

                In math terms we prove things using check by de-Morgan's rules for quantifiers:

                +

                $$ \lnot \exists x: \lnot f(x) \implies \forall x: f(x) $$

                +

                Now that we've seen the basics of using the Z3 API on a few small examples, +we'll use it in a bigger program.

                +

                Encoding the integer operations of RPython's JIT into Z3 formulas

                +

                Now we'll use the API to reason about the integer operations of the PyPy JIT +intermediate representation (IR). The binary integer operations are:

                +
                opnames2 = [
                +"int_add",
                +"int_sub",
                +"int_mul",
                +"int_and",
                +"int_or",
                +"int_xor",
                +"int_eq",
                +"int_ne",
                +"int_lt",
                +"int_le",
                +"int_gt",
                +"int_ge",
                +"uint_lt",
                +"uint_le",
                +"uint_gt",
                +"uint_ge",
                +"int_lshift",
                +"int_rshift",
                +"uint_rshift",
                +"uint_mul_high",
                +"int_pydiv",
                +"int_pymod",
                +]
                +
                + +

                There's not much special about the integer operations. Like in LLVM, most of +them are signedness-independent: int_add, int_sub, int_mul, ... work +correctly for unsigned integers but also for +two's-complement signed +integers. Exceptions for that are order comparisons like int_lt etc. for +which we have unsigned variants uint_lt etc. All operations that produce a +boolean result return a full-width integer 0 or 1 (the PyPy JIT supports +only word-sized integers in its intermediate representation)

                +

                In order to reason about the IR operations, some ground work:

                +
                import z3
                +
                +INTEGER_WIDTH = 64
                +solver = z3.Solver()
                +solver.set("timeout", 10000) # milliseconds, ie 10s
                +xvar = z3.BitVec('x', INTEGER_WIDTH)
                +constvar = z3.BitVec('const', INTEGER_WIDTH)
                +constvar2 = z3.BitVec('const2', INTEGER_WIDTH)
                +TRUEBV = z3.BitVecVal(1, INTEGER_WIDTH)
                +FALSEBV = z3.BitVecVal(0, INTEGER_WIDTH)
                +
                + +

                And here's a function to turn an integer IR operation of PyPy's JIT into Z3 +formulas:

                +
                def z3_expression(opname, arg0, arg1=None):
                +    """ computes a tuple of (result, valid_if) of Z3 formulas. `result` is the
                +    formula representing the result of the operation, given argument formulas
                +    arg0 and arg1. `valid_if` is a pre-condition that must be true for the
                +    result to be meaningful. """
                +    result = None
                +    valid_if = True # the precondition is mostly True, with few exceptions
                +    if opname == "int_add":
                +        result = arg0 + arg1
                +    elif opname == "int_sub":
                +        result = arg0 - arg1
                +    elif opname == "int_mul":
                +        result = arg0 * arg1
                +    elif opname == "int_and":
                +        result = arg0 & arg1
                +    elif opname == "int_or":
                +        result = arg0 | arg1
                +    elif opname == "int_xor":
                +        result = arg0 ^ arg1
                +    elif opname == "int_eq":
                +        result = cond(arg0 == arg1)
                +    elif opname == "int_ne":
                +        result = cond(arg0 != arg1)
                +    elif opname == "int_lt":
                +        result = cond(arg0 < arg1)
                +    elif opname == "int_le":
                +        result = cond(arg0 <= arg1)
                +    elif opname == "int_gt":
                +        result = cond(arg0 > arg1)
                +    elif opname == "int_ge":
                +        result = cond(arg0 >= arg1)
                +    elif opname == "uint_lt":
                +        result = cond(z3.ULT(arg0, arg1))
                +    elif opname == "uint_le":
                +        result = cond(z3.ULE(arg0, arg1))
                +    elif opname == "uint_gt":
                +        result = cond(z3.UGT(arg0, arg1))
                +    elif opname == "uint_ge":
                +        result = cond(z3.UGE(arg0, arg1))
                +    elif opname == "int_lshift":
                +        result = arg0 << arg1
                +        valid_if = z3.And(arg1 >= 0, arg1 < INTEGER_WIDTH)
                +    elif opname == "int_rshift":
                +        result = arg0 << arg1
                +        valid_if = z3.And(arg1 >= 0, arg1 < INTEGER_WIDTH)
                +    elif opname == "uint_rshift":
                +        result = z3.LShR(arg0, arg1)
                +        valid_if = z3.And(arg1 >= 0, arg1 < INTEGER_WIDTH)
                +    elif opname == "uint_mul_high":
                +        # zero-extend args to 2*INTEGER_WIDTH bit, then multiply and extract
                +        # highest INTEGER_WIDTH bits
                +        zarg0 = z3.ZeroExt(INTEGER_WIDTH, arg0)
                +        zarg1 = z3.ZeroExt(INTEGER_WIDTH, arg1)
                +        result = z3.Extract(INTEGER_WIDTH * 2 - 1, INTEGER_WIDTH, zarg0 * zarg1)
                +    elif opname == "int_pydiv":
                +        valid_if = arg1 != 0
                +        r = arg0 / arg1
                +        psubx = r * arg1 - arg0
                +        result = r + (z3.If(arg1 < 0, psubx, -psubx) >> (INTEGER_WIDTH - 1))
                +    elif opname == "int_pymod":
                +        valid_if = arg1 != 0
                +        r = arg0 % arg1
                +        result = r + (arg1 & z3.If(arg1 < 0, -r, r) >> (INTEGER_WIDTH - 1))
                +    elif opname == "int_is_true":
                +        result = cond(arg0 != FALSEBV)
                +    elif opname == "int_is_zero":
                +        result = cond(arg0 == FALSEBV)
                +    elif opname == "int_neg":
                +        result = -arg0
                +    elif opname == "int_invert":
                +        result = ~arg0
                +    else:
                +        assert 0, "unknown operation " + opname
                +    return result, valid_if
                +
                +def cond(z3expr):
                +    """ helper function to turn a Z3 boolean result z3expr into a 1 or 0
                +    bitvector, using z3.If """
                +    return z3.If(z3expr, TRUEBV, FALSEBV)
                +
                + +

                We map the semantics of a PyPy JIT operation to Z3 with the z3_expression +function. It takes the name of a JIT operation and its two (or one) arguments +into a pair of Z3 formulas, result and valid_if. The resulting formulas are +constructed with the operator overloading of Z3 variables/formulas.

                +

                The first element result of the result of z3_expression represents the result +of performing the operation. valid_if is a bool that represents a condition that +needs to be True in order for the result of the operation to be defined. E.g. +int_pydiv(a, b) is only valid if b != 0. Most operations are always valid, +so they return True as that condition (we'll ignore valid_if for a bit, but it +will become more relevant further down in the post).

                +

                We can define a helper function to prove things by finding counterexamples:

                +
                def prove(cond):
                +    """ Try to prove a condition cond by searching for counterexamples of its negation. """
                +    z3res = solver.check(z3.Not(cond))
                +    if z3res == z3.unsat:
                +        return True
                +    elif z3res == z3.unknown: # eg on timeout
                +        return False
                +    elif z3res == z3.sat:
                +        return False
                +    assert 0, "should be unreachable"
                +
                + +

                Finding rewrite rules

                +

                Now we can start finding our first rewrite rules, following the first pattern +op(x, x) -> x. We do this by iterating over all the supported binary +operation names, getting the z3 expression for op(x, x) and then asking Z3 to +prove op(x, x) == x.

                +
                for opname in opnames2:
                +    result, valid_if = z3_expression(opname, xvar, xvar)
                +    if prove(result == xvar):
                +        print(f"{opname}(x, x) -> x, {result}")
                +
                + +

                This yields the simplifications:

                +
                int_and(x, x) -> x
                +int_or(x, x) -> x
                +
                + +

                Synthesizing constants

                +

                Supporting the next patterns is harder: op(x, x) == c1, op(x, c1) == x, and +op(c1, x) == x. We don't know which constants to pick to try to get Z3 to +prove the equality. We could iterate over common constants like 0, 1, +MAXINT, etc, or even over all the 256 values for a bitvector of length 8. +However, we will instead ask Z3 to find the constants for us too.

                +

                This can be done by using quantifiers, in this case z3.ForAll. The query we +pose to Z3 is "does there exist a constant c1 such that for all x the +following is true: op(x, c1) == x?" Note that the constant c1 is not +necessarily unique, there could be many of them. We generate several matching +constants, and add that they must be different to the condition of the second +and further queries.

                +

                We can express this in a helper function:

                +
                def find_constant(z3expr, number_of_results=5):
                +    condition = z3.ForAll(
                +        [xvar],
                +        z3expr
                +    )
                +    for i in range(number_of_results):
                +        checkres = solver.check(condition)
                +        if checkres == z3.sat:
                +            # if a solver check succeeds, we can ask for a model, which is
                +            # concrete values for the variables constvar
                +            model = solver.model()
                +            const = model[constvar].as_signed_long()
                +            yield const
                +            # make sure we don't generate the same constant again on the
                +            # next call
                +            condition = z3.And(constvar != const, condition)
                +        else:
                +            # no (more) constants found
                +            break
                +
                + +

                We can use this new function for the three mentioned patterns:

                +
                # try to find constants for op(x, x) == c
                +for opname in opnames2:
                +    result, valid_if = z3_expression(opname, xvar, xvar)
                +    for const in find_constant(result == constvar):
                +        print(f"{opname}(x, x) -> {const}")
                +# try to find constants for op(x, c) == x and op(c, x) == x
                +for opname in opnames2:
                +    result, valid_if = z3_expression(opname, xvar, constvar)
                +    for const in find_constant(result == xvar):
                +        print(f"{opname}(x, {const}) -> x")
                +    result, valid_if = z3_expression(opname, constvar, xvar)
                +    for const in find_constant(result == xvar):
                +        print(f"{opname}({const}, x) -> x")
                +# this code is not quite correct, we'll correct it later
                +
                + +

                Together this yields the following new simplifications:

                +
                # careful, these are not all correct!
                +int_sub(x, x) -> 0
                +int_xor(x, x) -> 0
                +int_eq(x, x) -> 1
                +int_ne(x, x) -> 0
                +int_lt(x, x) -> 0
                +int_le(x, x) -> 1
                +int_gt(x, x) -> 0
                +int_ge(x, x) -> 1
                +uint_lt(x, x) -> 0
                +uint_le(x, x) -> 1
                +uint_gt(x, x) -> 0
                +uint_ge(x, x) -> 1
                +uint_rshift(x, x) -> 0
                +int_pymod(x, x) -> 0
                +int_add(x, 0) -> x
                +int_add(0, x) -> x
                +int_sub(x, 0) -> x
                +int_mul(x, 1) -> x
                +int_mul(1, x) -> x
                +int_and(x, -1) -> x
                +int_and(-1, x) -> x
                +int_or(x, 0) -> x
                +int_or(0, x) -> x
                +int_xor(x, 0) -> x
                +int_xor(0, x) -> x
                +int_lshift(x, 0) -> x
                +int_rshift(x, 0) -> x
                +uint_rshift(x, 0) -> x
                +int_pydiv(x, 1) -> x
                +int_pymod(x, 0) -> x
                +
                + +

                Most of these look good at first glance, but the last one reveals a problem: +we've been ignoring the valid_if expression up to now. We can stop doing that by +changing the code like this, which adds z3.And(valid_if, ...) to the argument of +the calls to find_constant:

                +
                # try to find constants for op(x, x) == c, op(x, c) == x and op(c, x) == x
                +for opname in opnames2:
                +    result, valid_if = z3_expression(opname, xvar, xvar)
                +    for const in find_constant(z3.And(valid_if, result == constvar)):
                +        print(f"{opname}(x, x) -> {const}")
                +# try to find constants for op(x, c) == x and op(c, x) == x
                +for opname in opnames2:
                +    result, valid_if = z3_expression(opname, xvar, constvar)
                +    for const in find_constant(z3.And(result == xvar, valid_if)):
                +        print(f"{opname}(x, {const}) -> x")
                +    result, valid_if = z3_expression(opname, constvar, xvar)
                +    for const in find_constant(z3.And(result == xvar, valid_if)):
                +        print(f"{opname}({const}, x) -> x")
                +
                + +

                And we get this list instead:

                +
                int_sub(x, x) -> 0
                +int_xor(x, x) -> 0
                +int_eq(x, x) -> 1
                +int_ne(x, x) -> 0
                +int_lt(x, x) -> 0
                +int_le(x, x) -> 1
                +int_gt(x, x) -> 0
                +int_ge(x, x) -> 1
                +uint_lt(x, x) -> 0
                +uint_le(x, x) -> 1
                +uint_gt(x, x) -> 0
                +uint_ge(x, x) -> 1
                +int_add(x, 0) -> x
                +int_add(0, x) -> x
                +int_sub(x, 0) -> x
                +int_mul(x, 1) -> x
                +int_mul(1, x) -> x
                +int_and(x, -1) -> x
                +int_and(-1, x) -> x
                +int_or(x, 0) -> x
                +int_or(0, x) -> x
                +int_xor(x, 0) -> x
                +int_xor(0, x) -> x
                +int_lshift(x, 0) -> x
                +int_rshift(x, 0) -> x
                +uint_rshift(x, 0) -> x
                +int_pydiv(x, 1) -> x
                +
                + +

                Synthesizing two constants

                +

                For the patterns op(x, c1) == c2 and op(c1, x) == c2 we need to synthesize +two constants. We can again write a helper method for that:

                +
                def find_2consts(z3expr, number_of_results=5):
                +    condition = z3.ForAll(
                +        [xvar],
                +        z3expr
                +    )
                +    for i in range(number_of_results):
                +        checkres = solver.check(condition)
                +        if checkres == z3.sat:
                +            model = solver.model()
                +            const = model[constvar].as_signed_long()
                +            const2 = model[constvar2].as_signed_long()
                +            yield const, const2
                +            condition = z3.And(z3.Or(constvar != const, constvar2 != const2), condition)
                +        else:
                +            return
                +
                + +

                And then use it like this:

                +
                for opname in opnames2:
                +    # try to find constants c1, c2 such that op(c1, x) -> c2
                +    result, valid_if = z3_expression(opname, constvar, xvar)
                +    consts = find_2consts(z3.And(valid_if, result == constvar2))
                +    for const, const2 in consts:
                +        print(f"{opname}({const}, x) -> {const2}")
                +    # try to find constants c1, c2 such that op(x, c1) -> c2
                +    result, valid_if = z3_expression(opname, xvar, constvar)
                +    consts = find_2consts(z3.And(valid_if, result == constvar2))
                +    for const, const2 in consts:
                +        print("%s(x, %s) -> %s" % (opname, const, const2))
                +
                + +

                Which yields some straightforward simplifications:

                +
                int_mul(0, x) -> 0
                +int_mul(x, 0) -> 0
                +int_and(0, x) -> 0
                +int_and(x, 0) -> 0
                +uint_lt(x, 0) -> 0
                +uint_le(0, x) -> 1
                +uint_gt(0, x) -> 0
                +uint_ge(x, 0) -> 1
                +int_lshift(0, x) -> 0
                +int_rshift(0, x) -> 0
                +uint_rshift(0, x) -> 0
                +uint_mul_high(0, x) -> 0
                +uint_mul_high(1, x) -> 0
                +uint_mul_high(x, 0) -> 0
                +uint_mul_high(x, 1) -> 0
                +int_pymod(x, 1) -> 0
                +int_pymod(x, -1) -> 0
                +
                + +

                A few require a bit more thinking:

                +
                int_or(-1, x) -> -1
                +int_or(x, -1) -> -1
                +
                + +

                These are true because in two's complement, -1 has all bits set.

                +

                The following ones require recognizing that -9223372036854775808 == -2**63 is +the most negative signed 64-bit integer, and 9223372036854775807 == 2 ** 63 - +1 is the most positive one:

                +
                int_lt(9223372036854775807, x) -> 0
                +int_lt(x, -9223372036854775808) -> 0
                +int_le(-9223372036854775808, x) -> 1
                +int_le(x, 9223372036854775807) -> 1
                +int_gt(-9223372036854775808, x) -> 0
                +int_gt(x, 9223372036854775807) -> 0
                +int_ge(9223372036854775807, x) -> 1
                +int_ge(x, -9223372036854775808) -> 1
                +
                + +

                The following ones are true because the bitpattern for -1 is the largest +unsigned number:

                +
                uint_lt(-1, x) -> 0
                +uint_le(x, -1) -> 1
                +uint_gt(x, -1) -> 0
                +uint_ge(-1, x) -> 1
                +
                + +

                Strength Reductions

                +

                All the patterns so far only had a variable or a constant on the target of the +rewrite. We can also use the machinery to do strength reductions where we +generate a single-argument operation op1(x) for input operations op(x, c1) +or op(c1, x). To achieve this, we try all combinations of binary and unary +operations. (We won't consider strength reductions where a binary operation +gets turned into a "cheaper" other binary operation here.)

                +
                opnames1 = [
                +"int_is_true",
                +"int_is_zero",
                +"int_neg",
                +"int_invert",
                +]
                +
                +for opname in opnames2:
                +    for opname1 in opnames1:
                +        result, valid_if = z3_expression(opname, xvar, constvar)
                +        # try to find a constant op(x, c) == g(x)
                +        result1, valid_if1 = z3_expression(opname1, xvar)
                +        consts = find_constant(z3.And(valid_if, valid_if1, result == result1))
                +        for const in consts:
                +            print(f"{opname}(x, {const}) -> {opname1}(x)")
                +
                +        # try to find a constant op(c, x) == g(x)
                +        result, valid_if = z3_expression(opname, constvar, xvar)
                +        result1, valid_if1 = z3_expression(opname1, xvar)
                +        consts = find_constant(z3.And(valid_if, valid_if1, result == result1))
                +        for const in consts:
                +            print(f"{opname}({const}, x) -> {opname1}(x)")
                +
                + +

                Which yields the following new simplifications:

                +
                int_sub(0, x) -> int_neg(x)
                +int_sub(-1, x) -> int_invert(x)
                +int_mul(x, -1) -> int_neg(x)
                +int_mul(-1, x) -> int_neg(x)
                +int_xor(x, -1) -> int_invert(x)
                +int_xor(-1, x) -> int_invert(x)
                +int_eq(x, 0) -> int_is_zero(x)
                +int_eq(0, x) -> int_is_zero(x)
                +int_ne(x, 0) -> int_is_true(x)
                +int_ne(0, x) -> int_is_true(x)
                +uint_lt(0, x) -> int_is_true(x)
                +uint_lt(x, 1) -> int_is_zero(x)
                +uint_le(1, x) -> int_is_true(x)
                +uint_le(x, 0) -> int_is_zero(x)
                +uint_gt(x, 0) -> int_is_true(x)
                +uint_gt(1, x) -> int_is_zero(x)
                +uint_ge(x, 1) -> int_is_true(x)
                +uint_ge(0, x) -> int_is_zero(x)
                +int_pydiv(x, -1) -> int_neg(x)
                +
                + +

                Conclusions

                +

                With not very much code we managed to generate a whole lot of local +simplifications for integer operations in the IR of PyPy's JIT. The rules +discovered that way are "simple", in the sense that they only require looking +at a single instruction, and not where the arguments of that instruction came +from. They also don't require any knowledge about the properties of the +arguments of the instructions (e.g. that they are positive).

                +

                The rewrites in this post have mostly been in PyPy's JIT already. But now we +mechanically confirmed that they are correct. I've also added the remaining +useful looking ones, in particular int_eq(x, 0) -> int_is_zero(x) etc.

                +

                If we wanted to scale this approach up, we would have to work much harder! +There are a bunch of problems that come with generalizing the approach to +looking at sequences of instructions:

                +
                  +
                • +

                  Combinatorial explosion: if we look at sequences of instructions, we very + quickly get a combinatorial explosion and it becomes intractable to try all + combinations.

                  +
                • +
                • +

                  Finding non-minimal patterns: Some complicated simplifications can be + instances of simpler ones. For example, because int_add(x, 0) -> x, it's + also true that int_add(int_sub(x, y), 0) -> int_sub(x, y). If we simply + generate all possible sequences, we will find the latter simplification rule, + which we would usually not care about.

                  +
                • +
                • +

                  Unclear usefulness: if we simply generate all rewrites up to a certain number + of instructions, we will get a lot of patterns that are useless in the sense + that they typically aren't found in realistic programs. It would be much + better to somehow focus on the patterns that real benchmarks are using.

                  +
                • +
                +

                In the next blog post I'll discuss an alternative approach to simply generating +all possible sequences of instructions, that tries to address these problems. +This works by analyzing the real traces of benchmarks and mining those for +inefficiencies, which only shows problems that occur in actual programs.

                +

                Sources

                +

                I've been re-reading a lot of blog posts from John's blog:

                + +

                but also papers:

                + +

                Another of my favorite blogs has been Philipp Zucker's +blog in the last year or two, lots of excellent +posts about/using Z3 on there.

                +
                +

                Profiling PyPy using the Firefox profiler user interface

                + +
                +

                Introduction

                +

                If you ever wanted to profile your Python code on PyPy, you probably came across VMProf — a statistical profiler for PyPy.

                +

                VMProf's console output can already give some insights into where your code spends time, +but it is far from showing all the information captured while profiling.

                +

                There have been some tools around to visualize VMProf's output. +Unfortunately the vmprof.com user interface is no longer available and vmprof-server is not as easy to use, you may want to take a look at a local viewer or converter. +Those so far could give you some general visualizations of your profile, but do not show any PyPy related context like PyPy's log output (PyPyLog, which is output when using the PYPYLOG environment variable to log JIT actions).

                +

                To bring all of those features together in one tool, you may take a look at the vmprof-firefox-converter.

                +

                Created in the context of my bachelor's thesis, the vmprof-firefox-converter is a tool for analyzing VMProf profiles with the Firefox profiler user interface. +Instead of building a new user interface from scratch, this allows us to reuse the user interface work Mozilla put into the Firefox profiler. +The Firefox profiler offers a timeline where you can zoom into profiles and work with different visualizations like a flame graph or a stack chart. +To understand why there is time spent inside a function, you can revisit the source code and even dive into the intermediate representation of functions executed by PyPy's just-in-time compiler. +Additionally, there is a visualization for PyPy's log output, to keep track whether PyPy spent time inside the interpreter, JIT or GC throughout the profiling time.

                +

                Profiling word count

                +

                In this blog post, I want to show an example of how to use the vmprof-firefox-converter for a simple Python program. +Based on Ben Hoyt's blog Performance comparison: counting words in Python, Go, C++, C, AWK, Forth, and Rust we will profile two python versions of a word counter running on PyPy. One being a bit more optimized. For this, VMProf will be used, but instead of just going with the console output, we will use the Firefox profiler user interface.

                +

                At first, we are going to look at a simple way of counting words with collections.Counter. +This will read one line from the standard input at a time and count the words with counter.update().

                +
                counts = collections.Counter()  # word -> number of occurrences
                +for line in sys.stdin:
                +    # lower-case the line and split it on whitespace
                +    words = line.lower().split()
                +    counts.update(words)
                +
                +# print every word with its count, most common first
                +for word, count in counts.most_common():
                +    print(word, count)
                +
                + +

                To start profiling, simply execute: +pypy -m vmprofconvert -run simple.py <kjvbible_x10.txt

                +

                This will run the above code with vmprof, automatically capture and convert the results and finally open the Firefox profiler.

                +

                The input file is the King James Version of the Bible concatenated ten times.

                +

                To get started, we take a look at the call stack.

                +

                +Here we see that most of the time is spent in native code (marked as blue) e.g., the counter.update() or split() C implementation.

                +

                Now let's proceed with the more optimized version. +This time we read 64 Kb of data from the standard input and count the words with counter.update().

                +
                counts = collections.Counter()  # word -> number of occurrences
                +remaining = ''  # text after the last LF of the previous chunk
                +while True:
                +    # read the next piece of input and prepend the carried-over text
                +    chunk = remaining + sys.stdin.read(64*1024)
                +    if not chunk:
                +        break
                +    last_lf = chunk.rfind('\n')  # process to last LF character
                +    if last_lf == -1:
                +        # no LF in this chunk: count all of it and carry nothing over
                +        remaining = ''
                +    else:
                +        # carry the text after the last LF over to the next iteration
                +        remaining = chunk[last_lf+1:]
                +        chunk = chunk[:last_lf]
                +    counts.update(chunk.lower().split())
                +
                +# print every word with its count, most common first
                +for word, count in counts.most_common():
                +    print(word, count)
                +
                + +

                As we did before, we are going to take a peek at the call stack.

                +

                +

                Now there is more time spent in native code, caused by larger chunks of text passed to counter.update().

                +

                This becomes even more clear by comparing the stack charts.

                +

                +

                Here, in the unoptimized case, we only read in one line at each loop iteration. +This results in small "spikes" in the stack chart.

                +

                But let's take an even closer look.

                +

                +

                Zoomed in, we see the call stack alternating between _count_elements() and (unfortunately unsymbolized) native calls coming from reading and splitting the input text (e.g., decode()).

                +

                Let us now take a look at the optimized case.

                +

                +

                And if we look closer at the same interval as before, we see some spikes, but slightly different.

                +

                +

                Even though we do not want to compare the (amount of) milliseconds directly, we clearly see that the spikes are wider, i.e. the time spent in those function calls is longer. +You may already know where this comes from. +We read a 64 Kb chunk of data from stdin and pass that to counter.update(), so both these tasks do more work and take longer. +Bigger chunks mean there is less alternating between reading and counting, so there is more time spent doing work than "doing" loop iterations.

                +

                Getting started

                +

                You can get the converter from GitHub.

                +

                Both VMProf and the vmprof-firefox-converter were created for profiling PyPy, but you can also use them with CPython.

                +

                This project is still somewhat experimental, so if you want to try it out, please let us know whether it worked for you.

                +
                +

                PyPy v7.3.16 release

                + +
                +

                PyPy v7.3.16: release of python 2.7, 3.9, and 3.10

                +

                The PyPy team is proud to release version 7.3.16 of PyPy.

                +

                This release includes security fixes from upstream CPython, and bugfixes to the +garbage collector, described in a gc bug-hunt blog post.

                +

                The release includes three different interpreters:

                +
                +
                  +
                • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                • +
                • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.19.

                • +
                • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.

                • +
                +
                +

                The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows the 7.3.15 release on Jan 15, 2024.

                +

                We recommend updating. You can find links to download the v7.3.16 releases here:

                +
                +

                https://pypy.org/download.html

                +
                +

                We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                +

                We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with +making RPython's JIT even better.

                +

                If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                +

                What is PyPy?

                +

                PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                +

                We also welcome developers of other dynamic languages to see what RPython +can do for them.

                +

                We provide binary builds for:

                +
                +
                  +
                • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                • +
                • 64-bit ARM machines running Linux (aarch64).

                • +
                • Apple M1 arm64 machines (macos_arm64).

                • +
                • s390x running Linux

                • +
                +
                +

                PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                +

                What else is new?

                +

                For more information about the 7.3.16 release, see the full changelog.

                +

                Please update, and continue to help us make pypy better.

                +

                Cheers, +The PyPy Team

                +
                +
                +

                Fixing a Bug in PyPy's Incremental GC

                + +
                +

                Introduction

                +

                Since last summer, I've been looking on and off into a weird and hard to +reproduce crash bug in PyPy. It was +manifesting only on CI, and it seemed to always happen in the AST rewriting +phase of pytest, the symptoms being that PyPy would crash +with a segfault. All my attempts to reproduce it locally failed, and my +attempts to try to understand the problem by dumping the involved ASTs led +nowhere.

                +

                A few weeks ago, we got two more +bug reports, the last one by +the authors of the nanobind binding +generator, with the same symptoms: crash in AST rewriting, only on CI. I +decided to make a more serious push to try to find the bug this time. +Ultimately the problem turned out to be several bugs in PyPy's garbage +collector (GC) that had been there since its inception in +2013. +Understanding the +situation turned out to be quite involved, additionally complicated by this +being the first time that I was working on this particular aspect of PyPy's GC. +Since the bug was so much work to find, I thought I'd write a blog post about +it.

                +

                The blog post consists of three parts: first a chronological description of +what I did to find the bug, then a technical explanation of what goes wrong, and finally some +reflections on the bug (followed by a bonus bug I also found in the process).

                +

                Finding the Bug

                +

                I started from the failing nanobind CI +runs +that ended with a segfault of the PyPy interpreter. This was only an +intermittent problem, not every run was failing. When I tried to just run the +test suite locally, I couldn't get it to fail. Therefore at first I tried to +learn more about what was happening by looking on the CI runners.

                +

                Running on CI

                +

                I forked the nanobind repo and hacked the CI script in order to get it to use a +PyPy build with full debug information and more assertions turned on. In order +to increase the probability of seeing the crash I added an otherwise unused +matrix +variable to the CI script that just contained 32 parameters. This means every +build is done 32 times (sorry Github for wasting your CPUs 😕). With that +amount of repetition, I got at least one job of every build that was crashing.

                +

                Then I added the -Xfaulthandler option to the PyPy command, which will use the +faulthandler module +to try to print a Python stacktrace if the VM segfaults, to confirm that PyPy was +indeed crashing in the AST +rewriting +phase +of pytest, which pytest uses for nicer +assertions. +I experimented with hacking our faulthandler implementation to also give me a +C-level callstack, but that didn't work as well as I hoped.

                +

                Then I tried to run gdb on CI to try to get it +to print a C callstack at the crash point. You can get gdb to execute commands +as if typed at the prompt with the -ex commandline option, I used something +like this:

                +
                gdb -ex "set confirm off" -ex "set pagination off" -ex \
                +    "set debuginfod enabled off" -ex run -ex where -ex quit \
                +    --args <command> <arguments>
                +
                + +

                But unfortunately the crash never occurred when running in gdb.

                +

                Afterwards I tried the next best thing, which was configuring the CI runner to +dump a core file and upload it as a build +artifact, which worked. Looking +at the cores locally only sort of worked, because I am running a different +version of Ubuntu than the CI runners. So I used +tmate to be able to log into the +CI runner after a crash and interactively used gdb there. Unfortunately what I +learned from that was that the bug was some kind of memory corruption, +which is always incredibly unpleasant to debug. Basically the header word of a +Python object had been corrupted somehow at the point of the crash, which means +that its vtable wasn't +usable any more.

                +

                (Sidenote: PyPy doesn't really use a vtable +pointer, +instead it uses half a word in the header for the vtable, and the other half +for flags that the GC needs to keep track of the state of the object. +Corrupting all this is still bad.)

                +

                Reproducing Locally

                +

                At that point it was clear that I had to push to reproduce the problem on my +laptop, to allow me to work on the problem more directly and not to always have +to go via the CI runner. Memory corruption bugs often have a lot of randomness +(depending on which part of memory gets modified, things might crash or more +likely just happily keep running). Therefore I decided to try to brute-force +reproducing the crash by simply running the tests many many times. Since the +crash happened in the AST rewriting phase of pytest, and that happens only if +no pyc +files +of the bytecode-compiled rewritten ASTs exist, I made sure to delete them +before every test run.

                +

                To repeat the test runs I used +multitime, which is a simple program +that runs a command repeatedly. It's meant for lightweight benchmarking +purposes, but it also halts the execution of the command if that command exits +with an error (and it sleeps a small random time between runs, which might help +with randomizing the situation, maybe). Here's a demo:

                +

                (Max pointed out +autoclave to me when reviewing +this post, which is a more dedicated tool for this job.)

                +

                Thankfully, running the tests repeatedly eventually led to a crash, solving my +"only happens on CI" problem. I then tried various variants to exclude possible +sources of errors. The first source of errors to exclude in PyPy bugs is the +just-in-time compiler, so I reran the tests with --jit off to see whether I +could still get it to crash, and thankfully I eventually could (JIT bugs are +often very annoying).

                +

                The next source of bugs to exclude were C-extensions. Since those were the tests +of nanobind, a framework for creating C-extension modules, I was a bit worried +that the bug might be in our emulation of CPython's C-API. But running PyPy +with the -v option (which will print all the imports as they happen) +confirmed that at the point of crash no C-extension had been imported yet.

                +

                Using rr +

                +

                I still couldn't get the bug to happen in GDB, so the tool I tried next was +rr, the "reverse debugger". rr can record the execution of a program and +later replay it arbitrarily often. This gives you a time-traveling debugger +that allows you to execute the program backwards in addition to forwards. +Eventually I managed to get the crash to happen when running the tests with +rr record --chaos (--chaos randomizes some decisions that rr takes, to try to +increase the chance of reproducing bugs).

                +

                Using rr well is quite hard, and I'm not very good at it. The main approach I +use with rr to debug memory corruption is to replay the crash, then set a +watchpoint +for the corrupted memory location, then use the command reverse-continue to +find the place in the code that mutated the memory location. reverse-continue +is like continue, except that it will execute the program backwards from the +current point. Here's a little demo of this:

                +

                Doing this for my bug revealed that the object that was being corrupted was +erroneously collected by the garbage collector. For some reason the GC had +wrongly decided that the object was no longer reachable and therefore put the +object into a freelist by writing a pointer to the next entry in the freelist +into the first word of the object, overwriting the object's header. The next +time the object was used things crashed.

                +

                Side-quest: wrong GC assertions

                +

                At this point in the process, I got massively side-tracked. PyPy's GC has a +number of debug modes that you can optionally turn on. Those slow down the +program execution a lot, but they should in theory help to understand why the +GC goes wrong. When I turned them on, I was getting a failing assertion really +early in the test execution, complaining about an invariant violation in the GC +logic. At first this made me very happy. I thought that this would help me fix +the bug more quickly.

                +

                Extremely frustratingly, after two days of work I concluded that the assertion +logic itself was wrong. I have fixed that in the meantime too, the details +of that are in the bonus section at the end of the post.

                +

                Using GDB scripting to find the real bug

                +

                After that disaster I went back to the earlier rr recording without GC assertions +and tried to understand in more detail why the GC decided to free an object +that was still being referenced. To be able to do that I used the GDB Python +scripting +API to +write some helper commands to understand the state of the GC heap (rr is an +extension of GDB, so the GDB scripting API works in rr too).

                +

                The first (small) helper command I wrote with the GDB scripting API was a way +to pretty-print the currently active GC flags of a random PyPy object, starting +just from the pointer. The more complex command I wrote was an object tracer, +which follows pointers to GC objects starting from a root object to explore the +object graph. The object tracer isn't complete, it doesn't deal with all the +complexities of PyPy's GC. But it was good enough to help me with my problem, I +found out that the corrupted object was stored in an array.

                +

                As an example, here's a function that uses the GDB API to walk one of the +helper data structures of the GC, a stack of pointers:

                +
                def walk_addr_stack(obj):
                +    """ walk an instance of the AddressStack class (which is a linked list of
                +    arrays of 1019 pointers).
                +
                +    the first of the arrays is only partially filled with used_in_last_chunk
                +    items, all the other chunks are full."""
                +    if obj.type.code == gdb.TYPE_CODE_PTR:
                +        obj = obj.dereference()
                +    used_in_last_chunk = lookup(obj, "used_in_last_chunk")
                +    chunk = lookup(obj, "inst_chunk").dereference()
                +    while 1:
                +        items = lookup(chunk, "items")
                +        for i in range(used_in_last_chunk):
                +            yield items[i]
                +        chunk = lookup(chunk, "next")
                +        if not chunk:
                +            break
                +        chunk = chunk.dereference()
                +        used_in_last_chunk = 1019
                +
                + +

                The full file of supporting code I wrote can be found in this +gist. This is +pretty rough throw-away code, however.

                +

                In the following recording I show a staged debugging session with some of the +extra commands I wrote with the Python API. The details aren't important, I +just wanted to give a bit of a flavor of what inspecting objects looks like:

                +

                The next step was to understand why the array content wasn't being correctly +traced by the GC, which I eventually managed with some conditional +breakpoints, +more watchpoints, and using reverse-continue. It turned out to be a bug that +occurs when the content of one array was memcopied into another array. The +technical details of why the array wasn't traced correctly are described in +detail in the next section.

                +

                Writing a unit test

                +

                To try to make sure I really understood the bug correctly I then wrote a GC +unit test that shows the problem. Like most of PyPy, our GC is written in +RPython, a (somewhat strange) subset/dialect of Python2, which can be compiled +to C code. However, since it is also valid Python2 code, it can be unit-tested +on top of a Python2 +implementation +(which is one of the reasons why we keep maintaining PyPy2).

                +

                In the GC unit tests you have a lot of control about what order things happen +in, e.g. how objects are allocated, when garbage collection phases happen, etc. +After some trying I managed to write a test that crashes with the same kind of +memory corruption that my original crash exhibited: an object that is still +reachable via an array is collected by the GC. To give you a flavor of what +this kind of test looks like, here's an (edited for clarity) version of the +test I eventually managed to write

                +
                def test_incrementality_bug_arraycopy(self):
                +    source = self.malloc(VAR, 8) # first array
                +    # the stackroots list emulates the C stack
                +    self.stackroots.append(source)
                +    target = self.malloc(VAR, 8) # second array
                +    self.stackroots.append(target)
                +    node = self.malloc(S) # unrelated object, will be collected
                +    node.x = 5
                +    # store reference into source array, calling the write barrier
                +    self.writearray(source, 0, node)
                +    val = self.gc.collect_step()
                +    source = self.stackroots[0] # reload arrays, they might have moved
                +    target = self.stackroots[1]
                +    # this GC step traces target
                +    val = self.gc.collect_step()
                +
                +    # emulate what a memcopy of arrays does
                +    res = self.gc.writebarrier_before_copy(source, target, 0, 0, 2)
                +    assert res
                +    target[0] = source[0] # copy two elements of the arrays
                +    target[1] = source[1]
                +    # now overwrite the reference to node in source
                +    self.writearray(source, 0, lltype.nullptr(S))
                +    # this GC step traces source
                +    self.gc.collect_step()
                +    # some more collection steps, crucially target isn't traced again
                +    # but node is deleted
                +    for i in range(3):
                +        self.gc.collect_step()
                +    # used to crash, node got collected
                +    assert target[0].x == 5
                +
                + +

                One of the good properties of testing our GC that way is that all the memory is +emulated. The crash in the last line of the test isn't a segfault at all, +instead you get a nice exception saying that you tried to access a freed chunk +of memory and you can then debug this with a python2 debugger.

                +

                Fixing the Bug

                +

                With the unit test in hand, fixing the test was relatively straightforward (the +diff in its simplest form is anyway only a single line +change). +After this first version of my fix, I +talked to Armin +Rigo who +helped me find a different case that was still wrong, in the same area of the +code.

                +

                I also got help from the developers at PortaOne +who are using PyPy on their servers and had seen some mysterious PyPy +crashes +recently, that looked related to the GC. They did test deployments of my fixes +in their various stages to their servers to try to see whether stability +improved for them. Unfortunately in the end it turned out that their crashes +are an unrelated GC bug related to object pinning, which we haven't resolved +yet.

                +

                Writing a GC fuzzer/property based test

                +

                Finding bugs in the GC is always extremely disconcerting, particularly since +this one managed to hide for so long (more than ten years!). Therefore I wanted +to use these bugs as motivation to try to find more problems in PyPy's GC. Given +the ridiculous effectiveness of fuzzing, I used +hypothesis to write a +property-based test. Every test performs a sequence of randomly chosen steps +from the following list:

                +
                  +
                • allocate an object
                • +
                • read a random field from a random object
                • +
                • write a random reference into a random object
                • +
                • drop a random stack reference
                • +
                • perform one GC step
                • +
                • allocate an array
                • +
                • read a random index from a random array
                • +
                • write to an array
                • +
                • memcopy between two arrays
                • +
                +

                This approach of doing a sequence of steps is pretty close to the stateful +testing approach of +hypothesis, but I just implemented it manually with the data +strategy.

                +

                Every one of those steps is always performed on both the tested GC, and on some +regular Python objects. The Python objects provide the "ground truth" of what +the heap should look like, so we can compare the state of the GC objects +with the state of the Python objects to find out whether the GC made a mistake.

                +

                In order to check whether the test is actually useful, I reverted my bug fixes +and made sure that the test re-finds both the spurious GC assertion error and the +problems with memcopying an array.

                +

                In addition, the test also found corner cases in my fix. There was a situation +that I hadn't accounted for, which the test eventually found. +I also plan on adding a bunch of other GC features as steps in the +test to stress them too (for example weakrefs, identity hashes, pinning, maybe +finalization).

                +

                At the point of publishing this post, the fixes got merged to the 2.7/3.9/3.10 +branches of PyPy, and will be part of the next release (v7.3.16).

                +

                The technical details of the bug

                +

                In order to understand the technical details of the bug, I need to give some +background explanations about PyPy's GC.

                +

                PyPy's incremental GC

                +

                PyPy uses an incremental generational mark-sweep GC. It's +generational +and therefore has minor collections (where only young objects get collected) +and major collections (collecting long-lived objects eventually, using a +mark-and-sweep +algorithm). Young objects are allocated in a nursery using a +bump-pointer allocator, which makes allocation quite efficient. They are moved +out of the nursery by minor collections. In order to find references from old +to young objects the GC uses a write barrier to detect writes into old objects.

                +

                The GC is also +incremental, +which means that its major collections aren't done all at once (which would +lead to long pauses). Instead, major collections are sliced up into small +steps, which are done directly after a minor collection (the GC isn't +concurrent though, which would mean that the GC does work in a separate +thread).

                +

                The incremental GC uses tri-color +marking +to reason about the reachable part of the heap during the marking phase, where +every old object can be:

                +
                  +
                • black: already marked, reachable, definitely survives the collection
                • +
                • grey: will survive, but still needs to be marked
                • +
                • white: potentially dead
                • +
                +

                The color of every object is encoded by setting flags +in the object header.

                +

                The GC maintains the invariant that black objects must never point to white +objects. At the start of a major collection cycle the stack roots are turned +gray. During the mark phase of a major collection cycle, the GC will trace gray +objects, until +none are left. To trace a gray object, all the objects it references have to be +marked grey if they are white so far. After a grey object is traced, it can be +marked black (because all the referenced objects are now either black or gray). +Eventually, there are no gray objects left. At that point (because no white +object can be reached from a black one) all the white objects are known to be +unreachable and can therefore be freed.

                +

                The GC is incremental because every collection step will only trace a limited +number of gray objects, before giving control back to the program. This leads to +a problem: if an already traced (black) object is changed between two marking +steps of the GC, the program can mutate that object and write a new reference +into one of its fields. This could lead to an invariant violation, if the +referenced object is white. Therefore, the GC uses the write barrier (which it +needs anyway to find references from old to young objects) to mark all black +objects that are modified gray, and then trace them again at one of the +later collection steps.

                +

                The special write barrier of memcopy

                +

                Arrays use a different kind of write barrier than normal objects. Since they +can be arbitrarily large, tracing them can take a long time. Therefore it's +potentially wasteful to trace them fully at a minor collection. To fix this, +the array write barrier keeps more granular information about which parts of +the array have been modified since the last collection step. Then only the +modified parts of the array need to be traced, not the whole array.

                +

                In addition, there is another optimization for arrays, which is that memcopy is +treated specially by the GC. If memcopy is implemented by simply writing a loop +that copies the content of one array to the other, that will invoke the write +barrier every single loop iteration for the write of every array element, +costing a lot of overhead. Here's some pseudo-code:

                +
                def arraycopy(source, dest, source_start, dest_start, length):
                +    for i in range(length):
                +        value = source[source_start + i]
                +        dest[dest_start + i] = value # <- write barrier inserted here
                +
                + +

                Therefore the GC has a special memcopy-specific +write barrier that will perform the GC logic once before the memcopy loop, and +then use a regular (typically SIMD-optimized) memcopy implementation from +libc. Roughly like this:

                +
                def arraycopy(source, dest, source_start, dest_start, length):
                +    gc_writebarrier_before_array_copy(source, dest, source_start, dest_start, length)
                +    raw_memcopy(cast_to_voidp(source) + source_start,
                +                cast_to_voidp(dest) + dest_start,
                +                sizeof(itemtype(source)) * length)
                +
                + +

                (this is really a rough sketch. The real +code +is much more complicated.)

                +

                The bug

                +

                The bugs turned out to be precisely in this memcopy write barrier. When we +implemented the current GC, we adapted our previous GC, which was a +generational mark-sweep GC but not incremental. We started with most of the +previous GC's code, including the write barriers. The regular write barriers +were adapted to the new incremental assumptions, in particular the need for the +write barrier to also turn black objects back to gray when they are modified +during a marking phase. This was simply not done at all for the memcopy write +barrier, at least in two of the code paths. Fixing this problem fixes the unit +tests and stops the crashes.

                +

                Reflections

                +

                The way the bug was introduced is really typical. A piece of code (the memcopy +write barrier) was written under a set of assumptions. Then those assumptions +changed later. Not all the code pieces that relied on these assumptions to be +correct were updated. It's pretty hard to prevent this in all situations.

                +

                I still think we could have done more to prevent the bug occurring. Writing a +property-based test for the GC would have been a good idea given the complexity +of the GC, and definitely something we did in other parts of our code at the +time (just using the random module mostly, we started using hypothesis +later).

                +

                It's a bit of a mystery to me why this bug managed to be undetected for so +long. Memcopy happens in a lot of pretty core operations of e.g. lists in +Python (list.extend, to name just one example). To speculate, I would suspect +that all the other preconditions for the bug occurring made it pretty rare:

                +
                  +
                • the content of an old list that is not yet marked needs to be copied into + another old list that is marked already
                • +
                • the source of the copy needs to also store an object that has no other + references
                • +
                • the source of the copy then needs to be overwritten with other data
                • +
                • then the next collection steps need to be happening at the right points
                • +
                • ...
                • +
                +

                Given the complexity of the GC logic I also wonder whether some lightweight +formal methods would have been a good idea. Formalizing some of the core +invariants in B or +TLA+ and then model +checking them up to some number +of +objects would have found this problem pretty quickly. There are also correctness +proofs for GC algorithms in some research papers, but I don't have a good +overview of the literature to point to any that are particularly good or bad. +Going such a more formal route might have fixed this and probably a whole bunch +of other bugs, but of course it's a pretty expensive (and tedious) approach.

                +

                While it was super annoying to track this down, it was definitely good to learn +a bit more about how to use rr and the GDB scripting interface.

                +

                Bonus Section: The Wrong Assertion

                +

                Some more technical information about the wrong assertion is in this section.

                +

                Background: pre-built objects

                +

                PyPy's VM-building bootstrapping process can "freeze" a bunch of heap objects +into the final binary. This allows the VM to start up quickly, because those +frozen objects are loaded by the OS as part of the binary.

                +

                Those frozen pre-built objects are parts of the 'roots' of the garbage +collector and need to be traced. However, tracing all the pre-built objects at +every collection would be very expensive, because there are a lot of them +(about 150,000 in a PyPy 3.10 binary). Tracing them all is also not necessary, +because most of them are never modified. Unmodified pre-built objects can only reference +other pre-built objects, which can never be deallocated anyway. Therefore we +have an optimization that uses the write barrier (which we need anyway to find +old-to-young pointers) to notice when a pre-built object gets modified for the +very first time. If that happens, it gets added to the set of pre-built objects +that gets counted as a root, and is traced as a root at collections +from then on.

                +

                The wrong assertion

                +

                The assertion that triggered when I turned on the GC debug mode was saying that +the GC found a reference from a black to a white object, violating its +invariant. Unmodified pre-built objects count as black, and they aren't roots, +because they can only ever reference other pre-built objects. However, when a +pre-built object gets modified for the first time, it becomes part of the root +set and will be marked gray. This logic works fine.

                +

                The wrong assertion triggers if a pre-built object is mutated for the very +first time in the middle of an incremental marking phase. While the pre-built +object gets added to the root set just fine, and will get traced before the +marking phase ends, this is encoded slightly differently for pre-built objects, +compared to "regular" old objects. Therefore, the invariant checking code +wrongly reported a black->white pointer in this situation.

                +

                To fix it I also wrote a unit test checking the problem, made sure that the GC +hypothesis test also found the bug, and then fixed the wrong assertion to take +the color encoding of pre-built objects into account.

                +

                The bug managed to be invisible because we don't tend to turn on the GC +assertions very often. We only do that when we find a GC bug, which is of +course also when we need it the most to be correct.

                +

                Acknowledgements

                +

                Thanks to Matti Picus, Max Bernstein, and Wouter van Heyst for giving me feedback on drafts of the +post. Thanks to Armin Rigo for reviewing the code and pointing out holes in my +thinking. Thanks to the original reporters of the various forms of the bug, +including Lily Foote, David Hewitt, and Wenzel Jakob.

                +
                +
                +
                +
                + +
                +
                +
                + +
                + + + + \ No newline at end of file diff --git a/categories/arm.html b/categories/arm.html new file mode 100644 index 000000000..62a0530c2 --- /dev/null +++ b/categories/arm.html @@ -0,0 +1,117 @@ + + + + + +Posts about arm | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/arm.xml b/categories/arm.xml new file mode 100644 index 000000000..2a843937c --- /dev/null +++ b/categories/arm.xml @@ -0,0 +1,409 @@ + +PyPy (Posts about arm)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy 2.0 alpha for ARMhttps://www.pypy.org/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> + +<p>Hello.</p> +<p>We're pleased to announce an alpha release of PyPy 2.0 for ARM. This is mostly +a technology preview, as we know the JIT is not yet stable enough for the +full release. However please try your stuff on ARM and report back.</p> +<p>This is the first release that supports a range of ARM devices - anything with +ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard, Chromebook, +Cubieboard, etc.) that supports VFPv3 should work. 
We provide builds with +support for both ARM EABI variants: hard-float and, for some older operating +systems, soft-float.</p> +<p>This release comes with a list of limitations, consider it alpha quality, +not suitable for production:</p> +<ul class="simple"> +<li>stackless support is missing.</li> +<li>assembler produced is not always correct, but we successfully managed to +run large parts of our extensive benchmark suite, so most stuff should work.</li> +</ul> +<p>You can download the PyPy 2.0 alpha ARM release here (including a deb for raspbian):</p> +<blockquote> +<a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></blockquote> +<p>Part of the work was sponsored by the <a class="reference external" href="https://www.raspberrypi.org/">Raspberry Pi foundation</a>.</p> +<div class="section" id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.3. It's fast due to its integrated tracing JIT compiler.</p> +<p>This release supports ARM machines running Linux 32bit. Both hard-float +<tt class="docutils literal">armhf</tt> and soft-float <tt class="docutils literal">armel</tt> builds are provided. <tt class="docutils literal">armhf</tt> builds are +created using the Raspberry Pi custom <a class="reference external" href="https://github.com/raspberrypi">cross-compilation toolchain</a> based on +gcc-arm-linux-gnueabihf and should work on ARMv6 and ARMv7 devices running at +least debian or ubuntu. <tt class="docutils literal">armel</tt> builds are built using the gcc-arm-linux-gnueabi +toolchain provided by ubuntu and currently target ARMv7. If there is interest +in other builds, such as gnueabi for ARMv6 or without requiring a VFP let us +know in the comments or in IRC.</p> +</div> +<div class="section" id="benchmarks"> +<h3>Benchmarks</h3> +<p>Everybody loves benchmarks. 
Here is a table of our benchmark suite +(for ARM we don't provide it yet on <a class="reference external" href="https://speed.pypy.org">https://speed.pypy.org</a>, +unfortunately).</p> +<p>This is a comparison of a Cortex A9 processor with 4M cache and a Xeon W3580 with +8M of L3 cache. The set of benchmarks is a subset of what we run for +<a class="reference external" href="https://speed.pypy.org">https://speed.pypy.org</a> that finishes in reasonable time. The ARM machine +was provided by Calxeda. +Columns are respectively:</p> +<ul class="simple"> +<li>benchmark name</li> +<li>PyPy speedup over CPython on ARM (Cortex A9)</li> +<li>PyPy speedup over CPython on x86 (Xeon)</li> +<li>speedup on Xeon vs Cortex A9, as measured on PyPy</li> +<li>speedup on Xeon vs Cortex A9, as measured on CPython</li> +<li>relative speedup (how much bigger the x86 speedup is over ARM speedup)</li> +</ul> +<table border="1" class="docutils"> +<colgroup> +<col width="16%"> +<col width="18%"> +<col width="18%"> +<col width="15%"> +<col width="18%"> +<col width="14%"> +</colgroup> +<tbody valign="top"> +<tr><td>Benchmark</td> +<td>PyPy vs CPython (arm)</td> +<td>PyPy vs CPython (x86)</td> +<td>x86 vs arm (pypy)</td> +<td>x86 vs arm (cpython)</td> +<td>relative speedup</td> +</tr> +<tr><td>ai</td> +<td>3.61</td> +<td>3.16</td> +<td>7.70</td> +<td>8.82</td> +<td>0.87</td> +</tr> +<tr><td>bm_mako</td> +<td>3.41</td> +<td>2.11</td> +<td>8.56</td> +<td>13.82</td> +<td>0.62</td> +</tr> +<tr><td>chaos</td> +<td>21.82</td> +<td>17.80</td> +<td>6.93</td> +<td>8.50</td> +<td>0.82</td> +</tr> +<tr><td>crypto_pyaes</td> +<td>22.53</td> +<td>19.48</td> +<td>6.53</td> +<td>7.56</td> +<td>0.86</td> +</tr> +<tr><td>django</td> +<td>13.43</td> +<td>11.16</td> +<td>7.90</td> +<td>9.51</td> +<td>0.83</td> +</tr> +<tr><td>eparse</td> +<td>1.43</td> +<td>1.17</td> +<td>6.61</td> +<td>8.12</td> +<td>0.81</td> +</tr> +<tr><td>fannkuch</td> +<td>6.22</td> +<td>5.36</td> +<td>6.18</td> +<td>7.16</td> +<td>0.86</td> 
+</tr> +<tr><td>float</td> +<td>5.22</td> +<td>6.00</td> +<td>9.68</td> +<td>8.43</td> +<td>1.15</td> +</tr> +<tr><td>go</td> +<td>4.72</td> +<td>3.34</td> +<td>5.91</td> +<td>8.37</td> +<td>0.71</td> +</tr> +<tr><td>hexiom2</td> +<td>8.70</td> +<td>7.00</td> +<td>7.69</td> +<td>9.56</td> +<td>0.80</td> +</tr> +<tr><td>html5lib</td> +<td>2.35</td> +<td>2.13</td> +<td>6.59</td> +<td>7.26</td> +<td>0.91</td> +</tr> +<tr><td>json_bench</td> +<td>1.12</td> +<td>0.93</td> +<td>7.19</td> +<td>8.68</td> +<td>0.83</td> +</tr> +<tr><td>meteor-contest</td> +<td>2.13</td> +<td>1.68</td> +<td>5.95</td> +<td>7.54</td> +<td>0.79</td> +</tr> +<tr><td>nbody_modified</td> +<td>8.19</td> +<td>7.78</td> +<td>6.08</td> +<td>6.40</td> +<td>0.95</td> +</tr> +<tr><td>pidigits</td> +<td>1.27</td> +<td>0.95</td> +<td>14.67</td> +<td>19.66</td> +<td>0.75</td> +</tr> +<tr><td>pyflate-fast</td> +<td>3.30</td> +<td>3.57</td> +<td>10.64</td> +<td>9.84</td> +<td>1.08</td> +</tr> +<tr><td>raytrace-simple</td> +<td>46.41</td> +<td>29.00</td> +<td>5.14</td> +<td>8.23</td> +<td>0.62</td> +</tr> +<tr><td>richards</td> +<td>31.48</td> +<td>28.51</td> +<td>6.95</td> +<td>7.68</td> +<td>0.91</td> +</tr> +<tr><td>slowspitfire</td> +<td>1.28</td> +<td>1.14</td> +<td>5.91</td> +<td>6.61</td> +<td>0.89</td> +</tr> +<tr><td>spambayes</td> +<td>1.93</td> +<td>1.27</td> +<td>4.15</td> +<td>6.30</td> +<td>0.66</td> +</tr> +<tr><td>sphinx</td> +<td>1.01</td> +<td>1.05</td> +<td>7.76</td> +<td>7.45</td> +<td>1.04</td> +</tr> +<tr><td>spitfire</td> +<td>1.55</td> +<td>1.58</td> +<td>5.62</td> +<td>5.49</td> +<td>1.02</td> +</tr> +<tr><td>spitfire_cstringio</td> +<td>9.61</td> +<td>5.74</td> +<td>5.43</td> +<td>9.09</td> +<td>0.60</td> +</tr> +<tr><td>sympy_expand</td> +<td>1.42</td> +<td>0.97</td> +<td>3.86</td> +<td>5.66</td> +<td>0.68</td> +</tr> +<tr><td>sympy_integrate</td> +<td>1.60</td> +<td>0.95</td> +<td>4.24</td> +<td>7.12</td> +<td>0.60</td> +</tr> +<tr><td>sympy_str</td> +<td>0.72</td> +<td>0.48</td> 
+<td>3.68</td> +<td>5.56</td> +<td>0.66</td> +</tr> +<tr><td>sympy_sum</td> +<td>1.99</td> +<td>1.19</td> +<td>3.83</td> +<td>6.38</td> +<td>0.60</td> +</tr> +<tr><td>telco</td> +<td>14.28</td> +<td>9.36</td> +<td>3.94</td> +<td>6.02</td> +<td>0.66</td> +</tr> +<tr><td>twisted_iteration</td> +<td>11.60</td> +<td>7.33</td> +<td>6.04</td> +<td>9.55</td> +<td>0.63</td> +</tr> +<tr><td>twisted_names</td> +<td>3.68</td> +<td>2.83</td> +<td>5.01</td> +<td>6.50</td> +<td>0.77</td> +</tr> +<tr><td>twisted_pb</td> +<td>4.94</td> +<td>3.02</td> +<td>5.10</td> +<td>8.34</td> +<td>0.61</td> +</tr> +</tbody> +</table> +<p>It seems that Cortex A9, while significantly slower than Xeon, has higher +slowdowns with a large interpreter (CPython) than a JIT compiler (PyPy). This +comes as a surprise to me, especially that our ARM assembler is not nearly +as polished as our x86 assembler. As for the causes, various people mentioned +branch predictor, but I would not like to speculate without actually knowing.</p> +</div> +<div class="section" id="how-to-use-pypy"> +<h3>How to use PyPy?</h3> +<p>We suggest using PyPy from a <a class="reference external" href="https://www.virtualenv.org/en/latest/">virtualenv</a>. Once you have a virtualenv +installed, you can follow instructions from <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-using-virtualenv">pypy documentation</a> on how +to proceed. 
This document also covers other <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-pypy">installation schemes</a>.</p> +<p>We would not recommend using PyPy on ARM in production just yet, +however the day of a stable PyPy ARM release is not far off.</p> +<p>Cheers,<br> +fijal, bivab, arigo and the whole PyPy team</p> +</div> +<br></div>armsponsorshttps://www.pypy.org/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.htmlTue, 07 May 2013 13:35:00 GMTAlmost There - PyPy's ARM Backendhttps://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.htmlDavid Schneider<div style="text-align: left;"> +In this post I want to give an update on the status of the ARM backend for PyPy's JIT and describe some of the issues and details of the backend.</div> +<div class="section" id="current-status"> +<br> +<h2> + + + + +Current Status</h2> +It has been more than a year that I have been working on the ARM backend. Now it is in a shape where we can measure meaningful numbers and also ask for some feedback. Since the <a class="reference external" href="https://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html">last post about the backend</a> we have added support for floating point operations as well as for PyPy's framework GC's. Another area of work was to keep up with the constant improvements done in the main development branch, such as out-of-line guards, labels, etc. It has been possible for about a year to cross-translate the PyPy Python interpreter and other interpreters such as <a class="reference external" href="https://bitbucket.org/cfbolz/pyrolog/">Pyrolog</a>, with a JIT, to run benchmarks on ARM. Up until now there remained some hard to track bugs that would cause the interpreter to crash with a segmentation fault in certain cases when running with the JIT on ARM. 
Lately it was possible to run all benchmarks without problems, but when running the translation toolchain itself it would crash. During the last PyPy sprint in <a class="reference external" href="https://www.pypy.org/posts/2011/12/leysin-winter-sprint-6862532189897876336.html">Leysin</a> Armin and I managed to fix several of these hard to track bugs in the ARM backend with the result that it is now possible to run the PyPy translator on ARM itself (at least until it runs out of memory), which is a kind of litmus test for the backend itself and used to crash before. Just to point it out, we are not able to complete a PyPy translation on ARM, because on the hardware we have currently available there is not enough memory. But up to the point we run out of memory the JIT does not hit any issues.<br> +<br></div> +<div class="section" id="implementation-details"> +<h2> + + + + +Implementation Details</h2> +The hardware requirements to run the JIT on ARM follow those for Ubuntu on ARM which targets ARMv7 with a VFP unit running in little endian mode. The JIT can be translated without floating point support, but there might be a few places that need to be fixed to fully work in this setting. We are targeting the ARM instruction set, because at least at the time we decided to use it, it seemed to be the best choice in terms of speed while having some size overhead compared to the Thumb2 instruction set. It appears that the Thumb2 instruction set should give comparable speed with better code density but has a few restrictions on the number of registers available and the use of conditional execution. 
Also the implementation is a bit easier using a fixed width instruction set and we can use the full set of registers in the generated code when using the ARM instruction set.<br> +<br></div> +<div class="section" id="the-calling-convention-on-arm"> +<h2> + + + + +The calling convention on ARM</h2> +The calling convention on ARM uses 4 of the general purpose registers to pass arguments to functions, further arguments are passed on the stack. The presence of a floating point unit is not required for ARM cores, for this reason there are different ways of handling floats with relation to the calling convention. There is a so called soft-float calling convention that is independent of the presence of a floating point unit. For this calling convention floating point arguments to functions are stored in the general purpose registers and on the stack. Passing floats around this way works with software and hardware floating point implementations. But in presence of a floating point unit it produces some overhead, because floating point numbers need to be moved from the floating point unit to the core registers to do a call and moved back to the floating point registers by the callee. The alternative calling convention is the so-called hard-float calling convention which requires the presence of a floating point unit but has the advantage of getting rid of the overhead of moving floating point values around when performing a call. Although it would be better in the long term to support the hard-float calling convention, we need to be able to interoperate with external code compiled for the operating system we are running on. For this reason at the moment we only support the soft-float to interoperate with external code. 
We implemented and tested the backend on a <a class="reference external" href="https://beagleboard.org/hardware-xM/">BeagleBoard-xM</a> with a <a class="reference external" href="https://www.arm.com/products/processors/cortex-a/cortex-a8.php">Cortex-A8</a> processor running <a class="reference external" href="https://wiki.ubuntu.com/ARM">Ubuntu 11.04 for ARM</a>.<br> +<br></div> +<div class="section" id="translating-for-arm"> +<h2> + + + + +Translating for ARM</h2> +The toolchain used to translate PyPy currently is based on a <a class="reference external" href="https://maemo.gitorious.org/scratchbox2/pages/Home">Scratchbox2</a>. Scratchbox2 is a cross-compiling environment. Development had stopped for a while, but it seems to have revived again. We run a 32-bit Python interpreter on the host system and perform all calls to the compiler using a Scratchbox2 based environment. A description on how to setup the cross translation toolchain can be found <a class="reference external" href="https://bitbucket.org/pypy/pypy/src/1f07ea8076c9/pypy/doc/arm.rst">here</a>.<br> +<br></div> +<div class="section" id="results"> +<h2> + + + + +Results</h2> +The current results on ARM, as shown in the graph below, show that the JIT currently gives a speedup of about 3.5 times compared to CPython on ARM. The benchmarks were run on the before mentioned BeagleBoard-xM with a 1GHz ARM Cortex-A8 processor and 512MB of memory. The operating system on the board is Ubuntu 11.04 for ARM. We measured the PyPy interpreter with the JIT enabled and disabled comparing each to CPython Python 2.7.1+ (r271:86832) for ARM. The graph shows the speedup or slowdown of both PyPy versions for the different benchmarks from our benchmark suite normalized to the runtime of CPython. 
The data used for the graph can be seen below.<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://2.bp.blogspot.com/-uckc9tOWgnM/TykHMuuGT9I/AAAAAAAAAKg/J8_fC6RS-QA/s1600/graph.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="258" src="https://2.bp.blogspot.com/-uckc9tOWgnM/TykHMuuGT9I/AAAAAAAAAKg/J8_fC6RS-QA/s400/graph.png" width="400"></a></div> +<br> +The speedup is less than the speedup of 5.2 times we currently get on x86 on our own benchmark suite (see <a class="reference external" href="https://speed.pypy.org/">https://speed.pypy.org</a> for details). There are several possible reasons for this. Comparing the results for the interpreter without the JIT on ARM and x86 suggests that the interpreter generated by PyPy, without the JIT, has a worse performance when compared to CPython than it does on x86. Also it is quite possible that the code we are generating with the JIT is not yet optimal. Also there are some architectural constraints that produce some overhead. 
One of these differences is the handling of constants, most ARM instructions only support 8 bit (that can be shifted) immediate values, larger constants need to be loaded into a register, something that is not necessary on x86.<br> +<br> +<table border="1" class="docutils"><colgroup></colgroup><colgroup><col width="40%"></colgroup><colgroup><col width="32%"></colgroup><colgroup><col width="28%"></colgroup><tbody valign="top"> +<tr><td>Benchmark</td><td>PyPy JIT</td><td>PyPy no JIT</td></tr> +<tr><td>ai</td><td>0.484439780047</td><td>3.72756749625</td></tr> +<tr><td>chaos</td><td>0.0807291691934</td><td>2.2908692212</td></tr> +<tr><td>crypto_pyaes</td><td>0.0711114832245</td><td>3.30112318509</td></tr> +<tr><td>django</td><td>0.0977743245519</td><td>2.56779947601</td></tr> +<tr><td>fannkuch</td><td>0.210423735698</td><td>2.49163632938</td></tr> +<tr><td>float</td><td>0.154275334675</td><td>2.12053281495</td></tr> +<tr><td>go</td><td>0.330483034202</td><td>5.84628320479</td></tr> +<tr><td>html5lib</td><td>0.629264389862</td><td>3.60333138526</td></tr> +<tr><td>meteor-contest</td><td>0.984747426912</td><td>2.93838610037</td></tr> +<tr><td>nbody_modified</td><td>0.236969593082</td><td>1.40027234936</td></tr> +<tr><td>pyflate-fast</td><td>0.367447191807</td><td>2.72472422146</td></tr> +<tr><td>raytrace-simple</td><td>0.0290527461437</td><td>1.97270054339</td></tr> +<tr><td>richards</td><td>0.034575573553</td><td>3.29767342015</td></tr> +<tr><td>slowspitfire</td><td>0.786642551908</td><td>3.7397367403</td></tr> +<tr><td>spambayes</td><td>0.660324379456</td><td>3.29059863111</td></tr> +<tr><td>spectral-norm</td><td>0.063610783731</td><td>4.01788986233</td></tr> +<tr><td>spitfire</td><td>0.43617131165</td><td>2.72050579076</td></tr> +<tr><td>spitfire_cstringio</td><td>0.255538702134</td><td>1.7418593111</td></tr> +<tr><td>telco</td><td>0.102918930413</td><td>3.86388866047</td></tr> +<tr><td>twisted_iteration</td><td>0.122723986805</td><td>4.33632475491</td></tr> 
+<tr><td>twisted_names</td><td>2.42367797135</td><td>2.99878698076</td></tr> +<tr><td>twisted_pb</td><td>1.30991837431</td><td>4.48877805486</td></tr> +<tr><td>twisted_tcp</td><td>0.927033354055</td><td>2.8161624665</td></tr> +<tr><td>waf</td><td>1.02059811932</td><td>1.03793427321</td></tr> +</tbody></table> +</div> +<br> +<br> +<div class="section" id="the-next-steps-and-call-for-help"> +<h2> + + + + +The next steps and call for help</h2> +Although there probably still are some remaining issues which have not surfaced yet, the JIT backend for ARM is working. Before we can merge the backend into the main development line there are some things that we would like to do first, in particular we are looking for a way to run all the PyPy tests to verify that things work on ARM before we can merge. Additionally there are some other longterm ideas. To do this we are looking for people willing to help, either by contributing to implement the open features or that can help us with hardware to test.<br> +<br> +The incomplete list of open topics:<br> +<ul class="simple"> +<li>We are looking for a better way to translate PyPy for ARM, than the one described above. I am not sure if there currently is hardware with enough memory to directly translate PyPy on an ARM based system, this would require between 1.5 and 2 Gig of memory. A fully <a class="reference external" href="https://wiki.qemu.org/Main_Page">QEMU</a> based approach could also work, instead of Scratchbox2 that uses QEMU under the hood.</li> +<li>Test the JIT on different hardware.</li> +<li>Experiment with the JIT settings to find the optimal thresholds for ARM.</li> +<li>Continuous integration: We are looking for a way to run the PyPy test suite to make sure everything works as expected on ARM, here QEMU also might provide an alternative.</li> +<li>A long term plan would be to port the backend to ARMv5 ISA and improve the support for systems without a floating point unit. 
This would require to implement the ISA and create different code paths and improve the instruction selection depending on the target architecture.</li> +<li>Review of the generated machine code the JIT generates on ARM to see if the instruction selection makes sense for ARM.</li> +<li>Build a version that runs on Android.</li> +<li>Improve the tools, i.e. integrate with <a class="reference external" href="https://bitbucket.org/pypy/jitviewer">jitviewer</a>.</li> +</ul> +So if you are interested or willing to help in any way contact us.</div>armjitpypyhttps://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.htmlWed, 01 Feb 2012 09:43:00 GMT \ No newline at end of file diff --git a/categories/casestudy.html b/categories/casestudy.html new file mode 100644 index 000000000..7d3706f14 --- /dev/null +++ b/categories/casestudy.html @@ -0,0 +1,120 @@ + + + + + +Posts about casestudy | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/casestudy.xml b/categories/casestudy.xml new file mode 100644 index 000000000..21fbd7a69 --- /dev/null +++ b/categories/casestudy.xml @@ -0,0 +1,264 @@ + +PyPy (Posts about casestudy)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssGuest Post: How PortaOne uses PyPy for high-performance processing, connecting over 1B of phone calls every monthhttps://www.pypy.org/posts/2024/08/portaone.htmlThe PyPy Team<p>The PyPy project is always happy to hear about industrial use and deployments +of PyPy. For the <a href="https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.html">GC bug +finding</a> +task earlier this year, we collaborated with PortaOne and we're super happy +that Serhii Titov, head of the QA department at PortaOne, was up to writing +this guest post to describe their use and experience with the project.</p> +<hr> +<h3 id="what-does-portaone-do">What does PortaOne do?</h3> +<p>We at <a href="https://www.portaone.com/">PortaOne Inc.</a> allow telecom operators to +launch new services (or provide existing services more efficiently) using our +VoIP platform (PortaSIP) and our real-time charging system (PortaBilling), +which provides additional features for cloud PBX, such as call transfer, +queues, interactive voice response (IVR) and more. At this moment our support +team manages several thousand servers with our software installed in 100 +countries, through which over 500 telecommunication service providers connect +millions of end users every day. The unique thing about PortaOne is that we +supply the source code of our product to our customers - something unheard of +in the telecom world! 
Thus we attract "telco innovators", who use our APIs to +build around the system and the source code to create unique tweaks of +functionality, which produces amazing products.</p> +<p>At the core of PortaSIP is the middle-ware component (the proper name for it is +"B2BUA", but that probably does not say much to anyone outside of experts in +VoIP), which implements the actual handling of SIP calls, messages, etc. and +all added features (for instance, trying to send a call via telco operators +through which the cost per minute is lower). It has to be fast (since even a +small delay in establishing a call is noticed by a customer), reliable +(everyone hates when a call drops or cannot be completed) and yet easily +expandable with new functionality. This is why we decided to use Python as +opposed to C/C++ or similar programming languages, which are often used in +telecom equipment.</p> +<p>The B2BUA component is a batch of similar Python processes that are looped +inside a +<a href="https://docs.python.org/3.10/library/asyncore.html"><code>asyncore.dispatcher</code></a> +wrapper. The load balancing between these Python processes is done by our +stateless SIP proxy server written in C++. All our sockets are served by this +B2BUA. We have our custom client-wrappers around <code>pymysql</code>, <code>redis</code>, +<code>cassandra-driver</code> and <code>requests</code> to communicate with external services. Some +of the Python processes use <a href="https://cffi.readthedocs.io/en/stable/"><code>cffi</code></a> +wrappers around C-code to improve their performance (examples: an Oracle DB +driver, a client to a radius server, a custom C logger).</p> +<p>The I/O operations that block the main thread of the Python processes are +processed in sub-threads. We have custom wrappers around <code>threading.Thread</code> +and also <code>asyncore.dispatcher</code>. 
The results of such operations are returned to +the main thread.</p> +<h3 id="improving-our-performance-with-pypy">Improving our performance with PyPy</h3> +<p>We started with CPython and then in 2014 switched to PyPy because it was +faster. Here's an exact quote from our first testing notes: "PyPy gives +significant performance boost, ~50%". Nowadays, after years of changes in all +the software involved, PyPy still gives us +50% boost compared to CPython.</p> +<p>Taking care of real time traffic for so many people around the globe is +something we're really proud of. I hope the PyPy team can be proud of it as +well, as the PyPy product is a part of this solution.</p> +<h3 id="finding-a-garbage-collector-bug-stage-1-the-gc-hooks">Finding a garbage collector bug: stage 1, the GC hooks</h3> +<p>However our path with PyPy wasn't perfectly smooth. There were very rare cases +of crashes on PyPy that we weren't able to catch. That's because to make +coredump useful we needed to switch to PyPy with debug, but we cannot let it +run in that mode on a production system for an extended period of time, and we +did not have any STR (steps-to-reproduce) to make PyPy crash again in our lab. +That's why we kept (and still keep) both interpreters installed just in case, +and we would switch to CPython if we noticed it happening.</p> +<p>At the time of updating PyPy from 3.5 to 3.6 our QA started noticing those +crashes more often, but we still had no luck with STR or collecting proper +coredumps with debug symbols. Then it became even worse after our development +played with the <a href="https://doc.pypy.org/en/latest/gc_info.html">Garbage Collector's +options</a> to increase performance +of our middleware component. The crashes started to affect our regular +performance testing (controlled by QA manager Yevhenii Bovda). 
At that point it +was decided that we can no longer live like that and so we started an intense +investigation.</p> +<p>During the first stage of our investigation (following the best practice of +troubleshooting) we narrowed down the issue as much as we could. So, it was not +our code, it was definitely somewhere in PyPy. Eventually our SIP software +engineer <a href="https://github.com/Yevhenii-Yatchenko">Yevhenii Yatchenko</a> found out +that this bug is connected with the use of our <a href="https://doc.pypy.org/en/latest/gc_info.html#gc-hooks">custom hooks in the +GC</a>. Yevhenii created +ticket <a href="https://github.com/pypy/pypy/issues/4899">#4899</a> and within 2-3 days we +got a fix from a <a href="https://github.com/cfbolz">member of the PyPy team</a>, in true open-source fashion.</p> +<h3 id="finding-a-garbage-collector-bug-stage-2-the-real-bug">Finding a garbage collector bug: stage 2, the real bug</h3> +<p>Then came stage 2. In parallel with the previous ticket, Yevhenii created +<a href="https://github.com/pypy/pypy/issues/4900">#4900</a> that we still see failing +with coredumps quite often, and they are not connected to GC custom hooks. In a +nutshell, it took us dozens of back-and-forth emails, three Zoom sessions and +four versions of a patch to solve the issue. During the last iteration we got a +new set of options to try and a new version of the patch. Surprisingly, that +helped! What a relief! So, the next logical step was to remove all debug +options and run PyPy only with the patch. Unfortunately, it started to fail +again and we came to the obvious conclusion that what will help us is not a +patch, but one of the options we were testing out. At that point we found out that +<a href="https://doc.pypy.org/en/latest/gc_info.html#environment-variables"><code>PYPY_GC_MAX_PINNED=0</code></a> +is a necessary and sufficient condition to solve our issue. 
This points to +another bug in the garbage collector, somehow related to object pinning.</p> +<p>Here's our current state: we have to add <code>PYPY_GC_MAX_PINNED=0</code>, but we do not +face the crashes anymore.</p> +<h3 id="conclusion-and-next-steps">Conclusion and next steps</h3> +<p>Gratitude is extended to Carl for his invaluable assistance in resolving the +nasty bugs, because it seems we're the only ones who suffered from the last +one and we really did not want to fall back to CPython due to its performance +disadvantage.</p> +<p>Serhii Titov, head of the QA department at PortaOne Inc.</p> +<p>P.S. If you are a perfectionist and at this point you have mixed feelings and +you are still bothered by the question "But there might still be a bug in the +GC, what about that?" - Carl has some ideas about it and he will sort it out +(we will help with the testing/verification part).</p>casestudyguestposthttps://www.pypy.org/posts/2024/08/portaone.htmlThu, 29 Aug 2024 09:00:00 GMTRPython-based emulator speeds up RISC-V simulation over 15xhttps://www.pypy.org/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.htmlCarl Friedrich Bolz-Tereick<p>In cooperation with <a class="reference external" href="https://riscv.org/">RISC-V International</a>, who funded a part of this project, +we recently created a workflow to +use RPython to take a <a class="reference external" href="https://github.com/riscv/sail-riscv#riscv-sail-model">Sail RISC-V</a> model and automatically create a RISC-V ISA +emulator from it, which we call <a class="reference external" href="https://docs.pydrofoil.org">Pydrofoil</a>. The simulator sped up booting a +linux emulator from 35 minutes (using the standard Sail-generated emulator in +C) to 2 minutes, a speedup of 17.5x. 
More details about the process are in the +<a class="reference external" href="https://riscv.org/blog/2023/05/how-to-speed-up-the-emulating-process-with-pydrofoil-carl-friedrich/">RISC-V blog post</a>.</p> +<p>A few take-aways from the project:</p> +<ul class="simple"> +<li><p>While PyPy has shown it can speed up generic python code <a class="reference external" href="https://speed.pypy.org">about 4x</a>, the +technology behind PyPy can really shine in other areas.</p></li> +<li><p>RPython is malleable and can be molded to many tasks, the RPython meta-JIT is +very flexible.</p></li> +<li><p>A JIT is well-suited for the problem of emulation, because it can +perform dynamic binary translation.</p></li> +</ul> +<p>PyPy can solve real world performance problems, even somewhat unusual ones. +Please <a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">get in touch</a> and let us know how we can help you solve yours!</p>casestudyperformancehttps://www.pypy.org/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.htmlTue, 16 May 2023 11:22:35 GMTNatural Language Processing for Icelandic with PyPy: A Case Studyhttps://www.pypy.org/posts/2022/02/nlp-icelandic-case-study.htmlVilhjálmur Þorsteinsson<section id="natural-language-processing-for-icelandic-with-pypy-a-case-study"> +<h2>Natural Language Processing for Icelandic with PyPy: A Case Study</h2> +<p><a class="reference external" href="https://en.wikipedia.org/wiki/Icelandic_language">Icelandic</a> is one +of the smallest languages of the world, with about 370.000 speakers. 
It +is a language in the Germanic family, most similar to Norwegian, Danish +and Swedish, but closer to the original <a class="reference external" href="https://en.wikipedia.org/wiki/Old_Norse">Old +Norse</a> spoken throughout +Scandinavia until about the 14th century CE.</p> +<p>As with other small languages, there are <a class="reference external" href="https://www.theguardian.com/world/2018/feb/26/icelandic-language-battles-threat-of-digital-extinction">worries that the language may +not +survive</a> +in a digital world, where all kinds of fancy applications are developed +first - and perhaps only - for the major languages. Voice assistants, +chatbots, spelling and grammar checking utilities, machine translation, +etc., are increasingly becoming staples of our personal and professional +lives, but if they don’t exist for Icelandic, Icelanders will gravitate +towards English or other languages where such tools are readily +available.</p> +<p>Iceland is a technology-savvy country, with <a class="reference external" href="https://ourworldindata.org/grapher/share-of-individuals-using-the-internet?tab=table">world-leading adoption +rates of the +Internet</a>, +PCs and smart devices, and a thriving software industry. So the +government figured that it would be worthwhile to fund a <a class="reference external" href="https://aclanthology.org/2020.lrec-1.418.pdf">5-year +plan</a> to build natural +language processing (NLP) resources and other infrastructure for the +Icelandic language. 
The project focuses on collecting data and +developing open source software for a range of core applications, such +as tokenization, vocabulary lookup, n-gram statistics, part-of-speech +tagging, named entity recognition, spelling and grammar checking, neural +language models and speech processing.</p> +<hr class="docutils"> +<p>My name is Vilhjálmur Þorsteinsson, and I’m the founder and CEO of a +software startup <a class="reference external" href="https://mideind.is/english.html">Miðeind</a> in Reykjavík, +Iceland, that employs 10 software engineers and linguists and focuses on +NLP and AI for the Icelandic language. The company participates in the +government’s language technology program, and has contributed +significantly to the program’s core tools (e.g., a tokenizer and a +parser), spelling and grammar checking modules, and a neural machine +translation stack.</p> +<p>When it came to a choice of programming languages and development tools +for the government program, the requirements were for a major, well +supported, vendor-and-OS-agnostic FOSS platform with a large and diverse +community, including in the NLP space. The decision to select Python as +a foundational language for the project was a relatively easy one. That +said, there was a bit of trepidation around the well known fact that +CPython can be slow for inner-core tasks, such as tokenization and +parsing, that can see heavy workloads in production.</p> +<p>I first became aware of PyPy in early 2016 when I was developing a +crossword game <a class="reference external" href="https://github.com/mideind/Netskrafl">Netskrafl</a> in Python 2.7 +for Google App Engine. 
I had a utility program that compressed a +dictionary into a Directed Acyclic Word Graph and was taking 160 +seconds  to run on CPython 2.7, so I tried PyPy and to my amazement saw +a 4x speedup (down to 38 seconds), with literally no effort besides +downloading the PyPy runtime.</p> +<p>This led me to select PyPy as the default Python interpreter for my +company’s Python development efforts as well as for our production +websites and API servers, a role in which it remains to this day. We +have followed PyPy’s upgrades along the way, being just about to migrate +our minimally required language version from 3.6 to 3.7.</p> +<p>In NLP, speed and memory requirements can be quite important for +software usability. On the other hand, NLP logic and algorithms are +often complex and challenging to program, so programmer productivity and +code clarity are also critical success factors. A pragmatic approach +balances these factors, avoids premature optimization and seeks a +careful compromise between maximal run-time efficiency and minimal +programming and maintenance effort.</p> +<p>Turning to our use cases, our Icelandic text +tokenizer <a class="reference external" href="https://github.com/mideind/Tokenizer">"Tokenizer"</a> is fairly light, +runs tight loops and performs a large number of small, repetitive +operations. It runs very well on PyPy’s JIT and has not required further +optimization.</p> +<p>Our Icelandic parser <a class="reference external" href="https://github.com/mideind/GreynirPackage">Greynir</a> +(known on PyPI as <a class="reference external" href="https://pypi.org/project/reynir/">reynir</a>) is, +if I may say so myself, a piece of work. 
It <a class="reference external" href="https://aclanthology.org/R19-1160.pdf">parses natural language +text</a> according to a +<a class="reference external" href="https://github.com/mideind/GreynirPackage/blob/master/src/reynir/Greynir.grammar">hand-written context-free +grammar</a>, +using an <a class="reference external" href="https://en.wikipedia.org/wiki/Earley_parser">Earley-type +algorithm</a> as <a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S0167642309000951">enhanced +by Scott and +Johnstone</a>. +The CFG contains almost 7,000 nonterminals and 6,000 terminals, and the +parser handles ambiguity as well as left, right and middle recursion. It +returns a packed parse forest for each input sentence, which is then +pruned by a scoring heuristic down to a single best result tree.</p> +<p>This parser was originally coded in pure Python and turned out to be +unusably slow when run on CPython - but usable on PyPy, where it was +3-4x faster. However, when we started applying it to heavier production +workloads, it  became apparent that it needed to be faster still. We +then proceeded to convert the innermost Earley parsing loop from Python +to <a class="reference external" href="https://github.com/mideind/GreynirPackage/blob/master/src/reynir/eparser.cpp">tight +C++</a> +and to call it from PyPy via +<a class="reference external" href="https://cffi.readthedocs.io/en/latest/">CFFI</a>, with callbacks for +token-terminal matching functions (“business logic”) that remained on +the Python side. This made the parser much faster (on the order of 100x +faster than the original on CPython) and quick enough for our production +use cases. 
Even after moving much of the heavy processing to C++ and using CFFI, PyPy still gives a significant speed boost over CPython.</p> +<p>Connecting C++ code with PyPy proved to be quite painless using CFFI, +although we had to figure out a few <a class="reference external" href="https://github.com/mideind/GreynirPackage/blob/master/src/reynir/eparser_build.py">magic incantations in our build +module</a> +to make it compile smoothly during setup from source on Windows and +MacOS in addition to Linux. Of course, we build binary PyPy and CPython +wheels for the most common targets so most users don’t have to worry +about setup requirements.</p> +<p>With the positive experience from the parser project, we proceeded to +take a similar approach for two other core NLP packages: our compressed +vocabulary package <a class="reference external" href="https://github.com/mideind/BinPackage">BinPackage</a> +(known on PyPI as <a class="reference external" href="https://pypi.org/project/islenska/">islenska</a>) and our +trigrams database package <a class="reference external" href="https://github.com/mideind/Icegrams">Icegrams</a>. +These packages both take large text input (3.1 million word forms with +inflection data in the vocabulary case; 100 million tokens in the +trigrams case) and compress it into packed binary structures. These +structures are then memory-mapped at run-time using +<a class="reference external" href="https://docs.python.org/3/library/mmap.html">mmap</a> and queried via +Python functions with a lookup time in the microseconds range. The +low-level data structure navigation is <a class="reference external" href="https://github.com/mideind/Icegrams/blob/master/src/icegrams/trie.cpp">done in +C++</a>, +called from Python via CFFI. 
The ex-ante preparation, packing, +bit-fiddling and data structure generation is fast enough with PyPy, so +we haven’t seen a need to optimize that part further.</p> +<p>To showcase our tools, we host public (and open source) websites such as +<a class="reference external" href="https://greynir.is/">greynir.is</a> for our parsing, named entity +recognition and query stack and +<a class="reference external" href="https://yfirlestur.is/">yfirlestur.is</a> for our spell and grammar +checking stack. The server code on these sites is all Python running on +PyPy using <a class="reference external" href="https://flask.palletsprojects.com/en/2.0.x/">Flask</a>, +wrapped in <a class="reference external" href="https://gunicorn.org/">gunicorn</a> and hosted on +<a class="reference external" href="https://www.nginx.com/">nginx</a>. The underlying database is +<a class="reference external" href="https://www.postgresql.org/">PostgreSQL</a> accessed via +<a class="reference external" href="https://www.sqlalchemy.org/">SQLAlchemy</a> and +<a class="reference external" href="https://pypi.org/project/psycopg2cffi/">psycopg2cffi</a>. This setup +has served us well for 6 years and counting, being fast, reliable and +having helpful and supporting communities.</p> +<p>As can be inferred from the above, we are avid fans of PyPy and +commensurately thankful for the great work by the PyPy team over the +years. PyPy has enabled us to use Python for a larger part of our +toolset than CPython alone would have supported, and its smooth +integration with C/C++ through CFFI has helped us attain a better +tradeoff between performance and programmer productivity in our +projects. 
We wish for PyPy a great and bright future and also look +forward to exciting related developments on the horizon, such as +<a class="reference external" href="https://hpyproject.org/">HPy</a>.</p> +</section>casestudyhttps://www.pypy.org/posts/2022/02/nlp-icelandic-case-study.htmlSun, 06 Feb 2022 15:00:00 GMT \ No newline at end of file diff --git a/categories/cli.html b/categories/cli.html new file mode 100644 index 000000000..359f3e385 --- /dev/null +++ b/categories/cli.html @@ -0,0 +1,114 @@ + + + + + +Posts about cli | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/cli.xml b/categories/cli.xml new file mode 100644 index 000000000..51caba695 --- /dev/null +++ b/categories/cli.xml @@ -0,0 +1,141 @@ + +PyPy (Posts about cli)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssFirst pypy-cli-jit benchmarkshttps://www.pypy.org/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.htmlAntonio Cuni<p>As the readers of this blog <a class="reference external" href="https://www.pypy.org/posts/2008/11/porting-jit-to-cli-part-1-8712941279840156635.html">already know</a>, I've been working on porting the +JIT to CLI/.NET for the last months. Now that it's finally possible to get a +working pypy-cli-jit, it's time to do some benchmarks.</p> +<p><strong>Warning:</strong> as usual, all of this has to be considered to be a alpha version: +don't be surprised if you get a crash when trying to run pypy-cli-jit. Of +course, things are improving very quickly so it should become more and more +stable as days pass.</p> +<p>For this time, I decided to run four benchmarks. Note that for all of them we +run the main function once in advance, to let the JIT recoginizing the hot +loops and emitting the corresponding code. Thus, the results reported do +<strong>not</strong> include the time spent by the JIT compiler itself, but give a good +measure of how good is the code generated by the JIT. At this point in time, +I know that the CLI JIT backend spends way too much time compiling stuff, but +this issue will be fixed soon.</p> +<blockquote> +<ul class="simple"> +<li><a class="reference external" href="https://paste.pocoo.org/show/145050/">f1.py</a>: this is the classic PyPy JIT benchmark. 
It is just a function +that does some computational intensive work with integers.</li> +<li><a class="reference external" href="https://paste.pocoo.org/show/143243/">floatdemo.py</a>: this is the same benchmark involving floating point +numbers that have already been described in a previous <a class="reference external" href="https://www.pypy.org/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.html">blog post</a>.</li> +<li><a class="reference external" href="https://paste.pocoo.org/show/145051/">oodemo.py</a>: this is just a microbenchmark doing object oriented stuff +such as method calls and attribute access.</li> +<li><a class="reference external" href="https://paste.pocoo.org/show/145052/">richards2.py</a>: a modified version of the classic richards.py, with a +warmup call before starting the real benchmark.</li> +</ul> +</blockquote> +<p>The benchmarks were run on a Windows machine with an Intel Pentium Dual Core +E5200 2.5GHz and 2GB RAM, both with .NET (CLR 2.0) and Mono 2.4.2.3.</p> +<p>Because of a known <a class="reference external" href="https://bugzilla.novell.com/show_bug.cgi?id=474718">mono bug</a>, if you use a version older than 2.1 you need +to pass the option <tt class="docutils literal"><span class="pre">-O=-branch</span></tt> to mono when running pypy-cli-jit, else it +will just loop forever.</p> +<p>For comparison, we also run the same benchmarks with IronPython 2.0.1 and +IronPython 2.6rc1. 
Note that IronPython 2.6rc1 does not work with mono.</p> +<p>So, here are the results (expressed in seconds) with Microsoft CLR:</p> +<blockquote> +<table border="1" class="docutils"> +<colgroup> +<col width="15%"> +<col width="20%"> +<col width="15%"> +<col width="12%"> +<col width="20%"> +<col width="18%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Benchmark</th> +<th class="head">pypy-cli-jit</th> +<th class="head">ipy 2.0.1</th> +<th class="head">ipy 2.6</th> +<th class="head">ipy2.01/ pypy</th> +<th class="head">ipy2.6/ pypy</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>f1</td> +<td>0.028</td> +<td>0.145</td> +<td>0.136</td> +<td>5.18x</td> +<td>4.85x</td> +</tr> +<tr><td>floatdemo</td> +<td>0.671</td> +<td>0.765</td> +<td>0.812</td> +<td>1.14x</td> +<td>1.21x</td> +</tr> +<tr><td>oodemo</td> +<td>1.25</td> +<td>4.278</td> +<td>3.816</td> +<td>3.42x</td> +<td>3.05x</td> +</tr> +<tr><td>richards2</td> +<td>1228</td> +<td>442</td> +<td>670</td> +<td>0.36x</td> +<td>0.54x</td> +</tr> +</tbody> +</table> +</blockquote> +<p>And with Mono:</p> +<blockquote> +<table border="1" class="docutils"> +<colgroup> +<col width="21%"> +<col width="29%"> +<col width="21%"> +<col width="29%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Benchmark</th> +<th class="head">pypy-cli-jit</th> +<th class="head">ipy 2.0.1</th> +<th class="head">ipy2.01/ pypy</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>f1</td> +<td>0.042</td> +<td>0.695</td> +<td>16.54x</td> +</tr> +<tr><td>floatdemo</td> +<td>0.781</td> +<td>1.218</td> +<td>1.55x</td> +</tr> +<tr><td>oodemo</td> +<td>1.703</td> +<td>9.501</td> +<td>5.31x</td> +</tr> +<tr><td>richards2</td> +<td>720</td> +<td>862</td> +<td>1.20x</td> +</tr> +</tbody> +</table> +</blockquote> +<p>These results are very interesting: under the CLR, we are between 5x faster +and 3x slower than IronPython 2.0.1, and between 4.8x faster and 1.8x slower +than IronPython 2.6. 
On the other hand, on mono we are consistently faster +than IronPython, up to 16x. Also, it is also interesting to note that +pypy-cli runs faster on CLR than mono for all benchmarks except richards2.</p> +<p>I've not investigated yet, but I think that the culprit is the terrible +behaviour of tail calls on CLR: as I already wrote in <a class="reference external" href="https://www.pypy.org/posts/2008/12/porting-jit-to-cli-part-3-3519327524638923621.html">another blog post</a>, +tail calls are ~10x slower than normal calls on CLR, while being only ~2x +slower than normal calls on mono. richads2 is probably the benchmark that +makes most use of tail calls, thus explaining why we have a much better result +on mono than CLR.</p> +<p>The next step is probably to find an alternative implementation that does not +use tail calls: this probably will also improve the time spent by the JIT +compiler itself, which is not reported in the numbers above but that so far it +is surely too high to be acceptable. Stay tuned.</p>clijitpypyhttps://www.pypy.org/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.htmlThu, 15 Oct 2009 13:36:00 GMT \ No newline at end of file diff --git a/categories/compiler.html b/categories/compiler.html new file mode 100644 index 000000000..8d3acf869 --- /dev/null +++ b/categories/compiler.html @@ -0,0 +1,114 @@ + + + + + +Posts about compiler | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/compiler.xml b/categories/compiler.xml new file mode 100644 index 000000000..19c3acc1e --- /dev/null +++ b/categories/compiler.xml @@ -0,0 +1,6 @@ + +PyPy (Posts about compiler)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy gets a new compilerhttps://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.htmlBenjamin Peterson<p>Today, I merged the parser-compiler branch, which I have been working on over the summer. It contained a total rewrite of both PyPy's Python parser and AST compiler. PyPy's old parser was (in)famous internally for being complicated and slow (with many algorithmic complexities greater than O(n)). The new parser is a simple as <a href="https://codespeak.net/viewvc/pypy/trunk/pypy/interpreter/pyparser/parser.py?view=markup">I could make it</a> LL(1) parser like CPython (though it doesn't share the hacks of CPython's parser).</p> + +<p>The new compiler is based on the <a href="https://doc.python.org/3.1/library/ast">Abstract Syntax Trees (AST) that CPython 2.5 introduced</a> instead of PyPy's old AST based on the <a href="https://doc.python.org/library/compiler">compiler package's</a>. This means that Python code running on PyPy will be able to use the same _ast interface as CPython. PyPy's _ast implementation supports AST features that CPython 2.6 added, including <a href="https://pythonic.pocoo.org/2008/3/29/ast-compilation-from-python">compiling modified AST to bytecode and executing it</a>. In this rewrite, some more obscure compiler features were added, too. For example, jumps in bytecode can now be greater than 65535 bytes! 
(That's like an if statement with 7000 lines of code in the body.)</p> + +<p>While the PyPy translation toolchain still has many obscure details and hacks, this merge completes the process of making the actual Python interpreter very clean. Hopefully, this will make adding new features much easier and make PyPy less frustrating to maintain as well as providing application level code with an improved AST interface!</p>compilerparserspeedhttps://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.htmlTue, 25 Aug 2009 16:05:00 GMT \ No newline at end of file diff --git a/categories/conda-forge.html b/categories/conda-forge.html new file mode 100644 index 000000000..bf6f1aafe --- /dev/null +++ b/categories/conda-forge.html @@ -0,0 +1,114 @@ + + + + + +Posts about conda-forge | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/conda-forge.xml b/categories/conda-forge.xml new file mode 100644 index 000000000..a0e0c1cf8 --- /dev/null +++ b/categories/conda-forge.xml @@ -0,0 +1,17 @@ + +PyPy (Posts about conda-forge)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssConda-forge proposes sunsetting support for PyPyhttps://www.pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.htmlmattip<p>Conda-forge has kindly been providing support for PyPy since 2019. The +conda-forge team has been very patient and generous with resources, but it +seems the uptake of PyPy has not justified the effort. Major packages still +are not <a href="https://conda-forge.org/status/migration/?name=pypy38">available on PyPy</a>, +others find it hard to <a href="https://github.com/conda-forge/numpy-feedstock/pull/310">update +versions</a>. We don't +get much feedback at all about people using PyPy, and even less about PyPy on +conda-forge. The conda-forge team has proposed <a href="https://github.com/conda-forge/conda-forge.github.io/pull/2259">sunsetting +PyPy</a> going +forward, which means current packages would remain but no new packages would be +built. If you have an opinion, you can comment on that PR, or on this blog post.</p> +<p>Since conda-forge supports PyPy3.9 but not PyPy3.10, we have continued +releasing PyPy3.9 even though we typically support only one version of PyPy3. +With the sunsetting proposal, we will not release any more updates to PyPy3.9. +I opened a <a href="https://github.com/orgs/pypy/discussions/4998">poll</a> about the +intention to drop PyPy3.9. 
If you have an opinion, please chime in.</p>conda-forgehttps://www.pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.htmlFri, 09 Aug 2024 06:27:41 GMT \ No newline at end of file diff --git a/categories/cpyext.html b/categories/cpyext.html new file mode 100644 index 000000000..46e8c20ea --- /dev/null +++ b/categories/cpyext.html @@ -0,0 +1,123 @@ + + + + + +Posts about cpyext | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/cpyext.xml b/categories/cpyext.xml new file mode 100644 index 000000000..2c04b6d3c --- /dev/null +++ b/categories/cpyext.xml @@ -0,0 +1,715 @@ + +PyPy (Posts about cpyext)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssLeysin 2020 Sprint Reporthttps://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlhodgestar<p>At the end of February ten of us gathered in Leysin, Switzerland to work on<br> +a variety of topics including <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3.7">PyPy Python 3.7</a> support and the PyPy<br> +migration to <a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>.<br> +<br> +</p><div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s1600/2020_leysin_sprint_attendees.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="180" src="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s320/2020_leysin_sprint_attendees.jpg" width="320"></a></div> +<br> +We had a fun and productive week. The snow was beautiful. There was skiing<br> +and lunch at the top of <a class="reference external" href="https://en.wikipedia.org/wiki/Berneuse">Berneuse</a>, cooking together, some late nights at<br> +the pub next door, some even later nights coding, and of course the<br> +obligatory cheese fondue outing.<br> +<br> +There were a few of us participating in a PyPy sprint for the first time<br> +and a few familiar faces who had attended many sprints. 
Many different<br> +projects were represented including PyPy, <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a>,<br> +<a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>, and <a class="reference external" href="https://github.com/dgrunwald/rust-cpython">rust-cpython</a>. The atmosphere was relaxed and welcoming, so if<br> +you're thinking of attending the next one -- please do!<br> +<br> +Topics worked on:<br> +<br> +<h2> +HPy</h2> +HPy is a new project to design and implement a better API for extending<br> +Python in C. If you're unfamiliar with it you can read more about it at<br> +<a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>.<br> +<br> +A lot of attention was devoted to the Big HPy Design Discussion which<br> +took up two full mornings. So much was decided that this will likely<br> +get its own detailed write-up, but bigger topics included:<br> +<ul class="simple"> +<li>the HPy GetAttr, SetAttr, GetItem and SetItem methods,</li> +<li>HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions<br> +that pass void* pointers to callbacks,</li> +<li>avoiding having va_args as part of the ABI,</li> +<li>exception handling,</li> +<li>support for creating custom types.</li> +</ul> +Quite a few things got worked on too:<br> +<ul class="simple"> +<li>implemented support for writing methods that take keyword arguments with<br> +HPy_METH_KEYWORDS,</li> +<li>implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,</li> +<li>started implementing support for adding custom types,</li> +<li>started implementing dumping JSON objects in ultrajson-hpy,</li> +<li>refactored the PyPy GIL to improve the interaction between HPy and<br> +PyPy's cpyext,</li> +<li>experimented with adding HPy support to rust-cpython.</li> +</ul> +And there was some discussion of the next steps of the HPy 
initiative<br> +including writing documentation, setting up websites and funding, and<br> +possibly organising another HPy gathering later in the year.<br> +<br> +<h2> +PyPy</h2> +<ul class="simple"> +<li>Georges gave a presentation on the Heptapod topic and branch workflows<br> +and showed everyone how to use hg-evolve.</li> +<li>Work was done on improving the PyPy CI buildbot post the move to<br> +heptapod, including a light-weight pre-merge CI and restricting<br> +when the full CI is run to only branch commits.</li> +<li>A lot of work was done improving the -D tests. </li> +</ul> +<br> +<h2> +Miscellaneous</h2> +<ul class="simple"> +<li>Armin demoed VRSketch and NaN Industries in VR, including an implementation<br> +of the Game of Life within NaN Industries!</li> +<li>Skiing!</li> +</ul> +<br> +<h2> +Aftermath</h2> +Immediately after the sprint large parts of Europe and the world were<br> +hit by the COVID-19 epidemic. It was good to spend time together before<br> +travelling ceased to be a sensible idea and many gatherings were cancelled.<br> +<br> +Keep safe out there everyone.<br> +<br> +The HPy &amp; PyPy Team &amp; Friends<br> +<br> +<i>In joke for those who attended the sprint: Please don't replace this blog post<br> +with its Swedish translation (or indeed a translation to any other language :).</i>cpyextCPythonGraalPythonHeptapodhpypypypypy3https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlTue, 17 Mar 2020 21:57:00 GMTInside cpyext: Why emulating CPython C API is so Hardhttps://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.htmlAntonio Cuni<br> +<div class="document" id="inside-cpyext-why-emulating-cpython-c-api-is-so-hard"> +<tt class="docutils literal">cpyext</tt> is PyPy's subsystem which provides a compatibility +layer to compile and run CPython C extensions inside PyPy. Often people ask +why a particular C extension doesn't work or is very slow on PyPy. 
Usually it is hard to answer without going into technical details. The goal of +this blog post is to explain some of these technical details, so that we can +simply link here instead of explaining again and again :).<br> +From a 10,000-foot view, <tt class="docutils literal">cpyext</tt> is PyPy's version of <tt class="docutils literal">"Python.h"</tt>. Every time +you compile an extension which uses that header file, you are using <tt class="docutils literal">cpyext</tt>. +This includes extensions explicitly written in C (such as <tt class="docutils literal">numpy</tt>) and +extensions which are generated from other compilers/preprocessors +(e.g. <tt class="docutils literal">Cython</tt>).<br> +At the time of writing, the current status is that most C extensions "just +work". Generally speaking, you can simply <tt class="docutils literal">pip install</tt> them, +provided they use the public, <a class="reference external" href="https://docs.python.org/2/c-api/index.html">official C API</a> instead of poking at private +implementation details. However, the performance of cpyext is generally +poor. A Python program which makes heavy use of <tt class="docutils literal">cpyext</tt> extensions +is likely to be slower on PyPy than on CPython.<br> +Note: in this blog post we are talking about Python 2.7 because it is still +the default version of PyPy: however most of the implementation of <tt class="docutils literal">cpyext</tt> is +shared with PyPy3, so everything applies to that as well.<br> +<div class="section" id="c-api-overview"> +<h1> +C API Overview</h1> +In CPython, which is written in C, Python objects are represented as <tt class="docutils literal">PyObject*</tt>, +i.e. (mostly) opaque pointers to some common "base struct".<br> +CPython uses a very simple memory management scheme: when you create an +object, you allocate a block of memory of the appropriate size on the heap. 
Depending on the details, you might end up calling different allocators, but +for the sake of simplicity, you can think that this ends up being a call to +<tt class="docutils literal">malloc()</tt>. The resulting block of memory is initialized and cast to +<tt class="docutils literal">PyObject*</tt>: this address never changes during the object lifetime, and the +C code can freely pass it around, store it inside containers, retrieve it +later, etc.<br> +Memory is managed using reference counting. When you create a new reference to +an object, or you discard a reference you own, you have to <a class="reference external" href="https://docs.python.org/2/c-api/refcounting.html#c.Py_INCREF">increment</a> or +<a class="reference external" href="https://docs.python.org/2/c-api/refcounting.html#c.Py_DECREF">decrement</a> the reference counter accordingly. When the reference counter goes to +0, it means that the object is no longer used and can safely be +destroyed. Again, we can simplify and say that this results in a call to +<tt class="docutils literal">free()</tt>, which finally releases the memory which was allocated by <tt class="docutils literal">malloc()</tt>.<br> +Generally speaking, the only way to operate on a <tt class="docutils literal">PyObject*</tt> is to call the +appropriate API functions. For example, to convert a given <tt class="docutils literal">PyObject*</tt> to a C +integer, you can use <a class="reference external" href="https://docs.python.org/2/c-api/int.html#c.PyInt_AsLong">PyInt_AsLong()</a>; to add two objects together, you can +call <a class="reference external" href="https://docs.python.org/2/c-api/number.html#c.PyNumber_Add">PyNumber_Add()</a>.<br> +Internally, PyPy uses a similar approach. 
All Python objects are subclasses of +the RPython <tt class="docutils literal">W_Root</tt> class, and they are operated by calling methods on the +<tt class="docutils literal">space</tt> singleton, which represents the interpreter.<br> +At first, it looks very easy to write a compatibility layer: just make +<tt class="docutils literal">PyObject*</tt> an alias for <tt class="docutils literal">W_Root</tt>, and write simple RPython functions +(which will be translated to C by the RPython compiler) which call the +<tt class="docutils literal">space</tt> accordingly:<br> +<pre class="code python literal-block"><span class="keyword">def</span> <span class="name function">PyInt_AsLong</span><span class="punctuation">(</span><span class="name">space</span><span class="punctuation">,</span> <span class="name">o</span><span class="punctuation">):</span> + <span class="keyword">return</span> <span class="name">space</span><span class="operator">.</span><span class="name">int_w</span><span class="punctuation">(</span><span class="name">o</span><span class="punctuation">)</span> + +<span class="keyword">def</span> <span class="name function">PyNumber_Add</span><span class="punctuation">(</span><span class="name">space</span><span class="punctuation">,</span> <span class="name">o1</span><span class="punctuation">,</span> <span class="name">o2</span><span class="punctuation">):</span> + <span class="keyword">return</span> <span class="name">space</span><span class="operator">.</span><span class="name">add</span><span class="punctuation">(</span><span class="name">o1</span><span class="punctuation">,</span> <span class="name">o2</span><span class="punctuation">)</span> +</pre> +Actually, the code above is not too far from the real +implementation. 
However, there are tons of gory details which make it much +harder than it looks, and much slower unless you pay a lot of attention +to performance.</div> +<div class="section" id="the-pypy-gc"> +<h1> +The PyPy GC</h1> +To understand some of <tt class="docutils literal">cpyext</tt> challenges, you need to have at least a rough +idea of how the PyPy GC works.<br> +Contrary to popular belief, the "Garbage Collector" is not only about +collecting garbage: instead, it is generally responsible for all memory +management, including allocation and deallocation.<br> +Whereas CPython uses a combination of malloc/free/refcounting to manage +memory, the PyPy GC uses a completely different approach. It is designed +assuming that a dynamic language like Python behaves the following way:<br> +<blockquote> +<ul class="simple"> +<li>You create, either directly or indirectly, lots of objects.</li> +<li>Most of these objects are temporary and very short-lived. Think e.g. of +doing <tt class="docutils literal">a + b + c</tt>: you need to allocate an object to hold the temporary +result of <tt class="docutils literal">a + b</tt>, then it dies very quickly because you no longer need it +when you do the final <tt class="docutils literal">+ c</tt> part.</li> +<li>Only a small fraction of the objects survive and stay around for a while.</li> +</ul> +</blockquote> +So, the strategy is: make allocation as fast as possible; make deallocation of +short-lived objects as fast as possible; find a way to handle the remaining +small set of objects which actually survive long enough to be important.<br> +This is done using a <strong>Generational GC</strong>: the basic idea is the following:<br> +<blockquote> +<ol class="arabic simple"> +<li>We have a nursery, where we allocate "young objects" very quickly.</li> +<li>When the nursery is full, we start what we call a "minor collection".<ul> +<li>We do a quick scan to determine the small set of objects which survived so +far</li> +<li>We 
<strong>move</strong> these objects out of the nursery, and we place them in the +area of memory which contains the "old objects". Since the address of the +objects changes, we fix all the references to them accordingly.</li> +</ul> +</li> +</ol> +<ol class="arabic simple" start="3"> +<li>now the nursery contains only objects which "died young". We can +discard all of them very quickly, reset the nursery, and use the same area +of memory to allocate new objects from now.</li> +</ol> +</blockquote> +In practice, this scheme works very well and it is one of the reasons why PyPy +is much faster than CPython. However, careful readers have surely noticed +that this is a problem for <tt class="docutils literal">cpyext</tt>. On one hand, we have PyPy objects which +can potentially move and change their underlying memory address; on the other +hand, we need a way to represent them as fixed-address <tt class="docutils literal">PyObject*</tt> when we +pass them to C extensions. We surely need a way to handle that.</div> +<div class="section" id="pyobject-in-pypy"> +<h1> +<tt class="docutils literal">PyObject*</tt> in PyPy</h1> +Another challenge is that sometimes, <tt class="docutils literal">PyObject*</tt> structs are not completely +opaque: there are parts of the public API which expose to the user specific +fields of some concrete C struct. For example the definition of <a class="reference external" href="https://docs.python.org/2/c-api/typeobj.html">PyTypeObject</a> +which exposes many of the <tt class="docutils literal">tp_*</tt> slots to the user. +Since the low-level layout of PyPy <tt class="docutils literal">W_Root</tt> objects is completely different +than the one used by CPython, we cannot simply pass RPython objects to C; we +need a way to handle the difference.<br> +So, we have two issues so far: objects can move, and incompatible +low-level layouts. <tt class="docutils literal">cpyext</tt> solves both by decoupling the RPython and the C +representations. 
We have two "views" of the same entity, depending on whether +we are in the PyPy world (the movable <tt class="docutils literal">W_Root</tt> subclass) or in the C world +(the non-movable <tt class="docutils literal">PyObject*</tt>).<br> +<tt class="docutils literal">PyObject*</tt> are created lazily, only when they are actually needed. The +vast majority of PyPy objects are never passed to any C extension, so we don't +pay any penalty in that case. However, the first time we pass a <tt class="docutils literal">W_Root</tt> to +C, we allocate and initialize its <tt class="docutils literal">PyObject*</tt> counterpart.<br> +The same idea applies also to objects which are created in C, e.g. by calling +<a class="reference external" href="https://docs.python.org/2/c-api/allocation.html#c.PyObject_New">PyObject_New()</a>. At first, only the <tt class="docutils literal">PyObject*</tt> exists and it is +exclusively managed by reference counting. As soon as we pass it to the PyPy +world (e.g. as a return value of a function call), we create its <tt class="docutils literal">W_Root</tt> +counterpart, which is managed by the GC as usual.<br> +Here we start to see why calling cpyext modules is more costly in PyPy than in +CPython. We need to pay some penalty for all the conversions between +<tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt>.<br> +Moreover, the first time we pass a <tt class="docutils literal">W_Root</tt> to C we also need to allocate +the memory for the <tt class="docutils literal">PyObject*</tt> using a slowish "CPython-style" memory +allocator. 
In practice, for all the objects which are passed to C we pay more +or less the same costs as CPython, thus effectively "undoing" the speedup +guaranteed by PyPy's Generational GC under normal circumstances.</div> +<div class="section" id="maintaining-the-link-between-w-root-and-pyobject"> +<h1> +Maintaining the link between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt></h1> +We now need a way to convert between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt> and +vice-versa; also, we need to ensure that the lifetimes of the two entities +are in sync. In particular:<br> +<blockquote> +<ol class="arabic simple"> +<li>as long as the <tt class="docutils literal">W_Root</tt> is kept alive by the GC, we want the +<tt class="docutils literal">PyObject*</tt> to live even if its refcount drops to 0;</li> +<li>as long as the <tt class="docutils literal">PyObject*</tt> has a refcount greater than 0, we want to +make sure that the GC does not collect the <tt class="docutils literal">W_Root</tt>.</li> +</ol> +</blockquote> +The <tt class="docutils literal">PyObject*</tt> ⇨ <tt class="docutils literal">W_Root</tt> link is maintained by the special field +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/parse/cpyext_object.h#lines-5">ob_pypy_link</a> which is added to all <tt class="docutils literal">PyObject*</tt>. On a 64 bit machine this +means that all <tt class="docutils literal">PyObject*</tt> have 8 bytes of overhead, but then the +conversion is very quick, just reading the field.<br> +For the other direction, we generally don't want to do the same: the +assumption is that the vast majority of <tt class="docutils literal">W_Root</tt> objects will never be +passed to C, and adding an overhead of 8 bytes to all of them is a +waste. 
Instead, in the general case the link is maintained by using a +dictionary, where <tt class="docutils literal">W_Root</tt> are the keys and <tt class="docutils literal">PyObject*</tt> the values.<br> +However, for a <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/pyobject.py#lines-66">few selected</a> <tt class="docutils literal">W_Root</tt> subclasses we <strong>do</strong> maintain a +direct link using the special <tt class="docutils literal">_cpy_ref</tt> field to improve performance. In +particular, we use it for <tt class="docutils literal">W_TypeObject</tt> (which is big anyway, so a 8 bytes +overhead is negligible) and <tt class="docutils literal">W_NoneObject</tt>. <tt class="docutils literal">None</tt> is passed around very +often, so we want to ensure that the conversion to <tt class="docutils literal">PyObject*</tt> is very +fast. Moreover it's a singleton, so the 8 bytes overhead is negligible as +well.<br> +This means that in theory, passing an arbitrary Python object to C is +potentially costly, because it involves doing a dictionary lookup. 
We assume +that this cost will eventually show up in the profiler: however, at the time +of writing there are other parts of <tt class="docutils literal">cpyext</tt> which are even more costly (as we +will show later), so the cost of the dict lookup is never evident in the +profiler.</div> +<div class="section" id="crossing-the-border-between-rpython-and-c"> +<h1> +Crossing the border between RPython and C</h1> +There are two other things we need to care about whenever we cross the border +between RPython and C, and vice-versa: exception handling and the GIL.<br> +In the C API, exceptions are raised by calling <a class="reference external" href="https://docs.python.org/2/c-api/exceptions.html#c.PyErr_SetString">PyErr_SetString()</a> (or one of +<a class="reference external" href="https://docs.python.org/2/c-api/exceptions.html#exception-handling">many other functions</a> which have a similar effect), which basically works by +creating an exception value and storing it in some global variable. The +function then signals that an exception has occurred by returning an error value, +usually <tt class="docutils literal">NULL</tt>.<br> +On the other hand, in the PyPy interpreter, exceptions are propagated by raising the +RPython-level <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/interpreter/error.py#lines-20">OperationError</a> exception, which wraps the actual app-level +exception values. To harmonize the two worlds, whenever we return from C to +RPython, we need to check whether a C API exception was raised and if so turn it +into an <tt class="docutils literal">OperationError</tt>.<br> +We won't dig into details of <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-205">how the GIL is handled in cpyext</a>. 
+For the purpose of this post, it is enough to know that whenever we enter +C land, we store the current thread id into a global variable which is +accessible also from C; conversely, whenever we go back from C to RPython, we +restore this value to 0.<br> +Similarly, we need to do the inverse operations whenever you need to cross the +border between C and RPython, e.g. by calling a Python callback from C code.<br> +All this complexity is automatically handled by the RPython function +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-1757">generic_cpy_call</a>. If you look at the code you see that it takes care of 4 +things:<br> +<blockquote> +<ol class="arabic simple"> +<li>Handling the GIL as explained above.</li> +<li>Handling exceptions, if they are raised.</li> +<li>Converting arguments from <tt class="docutils literal">W_Root</tt> to <tt class="docutils literal">PyObject*</tt>.</li> +<li>Converting the return value from <tt class="docutils literal">PyObject*</tt> to <tt class="docutils literal">W_Root</tt>.</li> +</ol> +</blockquote> +So, we can see that calling C from RPython introduces some overhead. +Can we measure it?<br> +Assuming that the conversion between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt> has a +reasonable cost (as explained by the previous section), the overhead +introduced by a single border-cross is still acceptable, especially if the +callee is doing some non-negligible amount of work.<br> +However this is not always the case. 
There are basically three problems that +make (or used to make) <tt class="docutils literal">cpyext</tt> super slow:<br> +<blockquote> +<ol class="arabic simple"> +<li>Paying the border-crossing cost for trivial operations which are called +very often, such as <tt class="docutils literal">Py_INCREF</tt>.</li> +<li>Crossing the border back and forth many times, even if it's not strictly +needed.</li> +<li>Paying an excessive cost for argument and return value conversions.</li> +</ol> +</blockquote> +The next sections explain in more detail each of these problems.</div> +<div class="section" id="avoiding-unnecessary-roundtrips"> +<h1> +Avoiding unnecessary roundtrips</h1> +Prior to the <a class="reference external" href="https://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html">2017 Cape Town Sprint</a>, <tt class="docutils literal">cpyext</tt> was horribly slow, and we were +well aware of it: the main reason was that we never really paid too much +attention to performance. As explained in the blog post, emulating all the +CPython quirks is basically a nightmare, so better to concentrate on +correctness first.<br> +However, we didn't really know <strong>why</strong> it was so slow. We had theories and +assumptions, usually pointing at the cost of conversions between <tt class="docutils literal">W_Root</tt> +and <tt class="docutils literal">PyObject*</tt>, but we never actually measured it.<br> +So, we decided to write a set of <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">cpyext microbenchmarks</a> to measure the +performance of various operations. 
The result was somewhat surprising: the +theory suggests that when you do a cpyext C call, you should pay the +border-crossing costs only once, but what the profiler told us was that we +were paying the cost of <tt class="docutils literal">generic_cpy_call</tt> several times more than what we expected.<br> +After a bit of investigation, we discovered this was ultimately caused by our +"correctness-first" approach. For simplicity of development and testing, when +we started <tt class="docutils literal">cpyext</tt> we wrote everything in RPython: thus, every single API call +made from C (like the omnipresent <a class="reference external" href="https://docs.python.org/2/c-api/arg.html#c.PyArg_ParseTuple">PyArg_ParseTuple()</a>, <a class="reference external" href="https://docs.python.org/2/c-api/int.html#c.PyInt_AsLong">PyInt_AsLong()</a>, etc.) +had to cross back the C-to-RPython border. This was especially daunting for +very simple and frequent operations like <tt class="docutils literal">Py_INCREF</tt> and <tt class="docutils literal">Py_DECREF</tt>, +which CPython implements as a single assembly instruction!<br> +Another source of slow down was the implementation of <tt class="docutils literal">PyTypeObject</tt> slots. +At the C level, these are function pointers which the interpreter calls to do +certain operations, e.g. 
<a class="reference external" href="https://docs.python.org/2/c-api/typeobj.html#c.PyTypeObject.tp_new">tp_new</a> to allocate a new instance of that type.<br> +As usual, we have some magic to implement slots in RPython; in particular, +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-362">_make_wrapper</a> does the opposite of <tt class="docutils literal">generic_cpy_call</tt>: it takes a +RPython function and wraps it into a C function which can be safely called +from C, handling the GIL, exceptions and argument conversions automatically.<br> +This was very handy during the development of cpyext, but it might result in +some bad nonsense; consider what happens when you call the following C +function:<br> +<pre class="code C literal-block"><span class="keyword">static</span> <span class="name">PyObject</span><span class="operator">*</span> <span class="name function">foo</span><span class="punctuation">(</span><span class="name">PyObject</span><span class="operator">*</span> <span class="name">self</span><span class="punctuation">,</span> <span class="name">PyObject</span><span class="operator">*</span> <span class="name">args</span><span class="punctuation">)</span> +<span class="punctuation">{</span> + <span class="name">PyObject</span><span class="operator">*</span> <span class="name">result</span> <span class="operator">=</span> <span class="name">PyInt_FromLong</span><span class="punctuation">(</span><span class="literal number integer">1234</span><span class="punctuation">);</span> + <span class="keyword">return</span> <span class="name">result</span><span class="punctuation">;</span> +<span class="punctuation">}</span> +</pre> +<ol class="arabic simple"> +<li>you are in RPython and do a cpyext call to <tt class="docutils literal">foo</tt>: <strong>RPython-to-C</strong>;</li> +<li><tt class="docutils literal">foo</tt> calls <tt class="docutils literal">PyInt_FromLong(1234)</tt>, which 
is implemented in RPython: +<strong>C-to-RPython</strong>;</li> +<li>the implementation of <tt class="docutils literal">PyInt_FromLong</tt> indirectly calls +<tt class="docutils literal">PyIntType.tp_new</tt>, which is a C function pointer: <strong>RPython-to-C</strong>;</li> +<li>however, <tt class="docutils literal">tp_new</tt> is just a wrapper around an RPython function, created +by <tt class="docutils literal">_make_wrapper</tt>: <strong>C-to-RPython</strong>;</li> +<li>finally, we create our RPython <tt class="docutils literal">W_IntObject(1234)</tt>; at some point +during the <strong>RPython-to-C</strong> crossing, its <tt class="docutils literal">PyObject*</tt> equivalent is +created;</li> +<li>after many layers of wrappers, we are again in <tt class="docutils literal">foo</tt>: after we do +<tt class="docutils literal">return result</tt>, during the <strong>C-to-RPython</strong> step we convert it from +<tt class="docutils literal">PyObject*</tt> to <tt class="docutils literal">W_IntObject(1234)</tt>.</li> +</ol> +Phew! After we realized this, it was not so surprising that <tt class="docutils literal">cpyext</tt> was very +slow :). And this was a simplified example, since we are not passing a +<tt class="docutils literal">PyObject*</tt> to the API call. When we do, we need to convert it back and +forth at every step. Actually, I am not even sure that what I described was +the exact sequence of steps which used to happen, but you get the general +idea.<br> +The solution is simple: rewrite as much as we can in C instead of RPython, +to avoid unnecessary roundtrips. 
This was the topic of most of the Cape Town +sprint and resulted in the <tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch, which was +eventually <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/cpyext_avoid-roundtrip">merged</a>.<br> +Of course, it is not possible to move <strong>everything</strong> to C: there are still +operations which need to be implemented in RPython. For example, think of +<tt class="docutils literal">PyList_Append</tt>: the logic to append an item to a list is complex and +involves list strategies, so we cannot replicate it in C. However, we +discovered that a large subset of the C API can benefit from this.<br> +Moreover, the C API is <strong>huge</strong>. While we invented this new way of writing +<tt class="docutils literal">cpyext</tt> code, we still need to +convert many of the functions to the new paradigm. Sometimes the rewrite is +not automatic +or straightforward. <tt class="docutils literal">cpyext</tt> is a delicate piece of software, so it happens often +that we make a mistake and end up staring at a segfault in gdb.<br> +However, the most important takeaway is that the performance improvements we got +from this optimization are impressive, as we will detail later.</div> +<div class="section" id="conversion-costs"> +<h1> +Conversion costs</h1> +The other potential big source of slowdown is the conversion of arguments +between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt>.<br> +As explained earlier, the first time you pass a <tt class="docutils literal">W_Root</tt> to C, you need to +allocate its <tt class="docutils literal">PyObject*</tt> counterpart. 
Suppose you have a <tt class="docutils literal">foo</tt> function +defined in C, which takes a single int argument:<br> +<pre class="code python literal-block"><span class="keyword">for</span> <span class="name">i</span> <span class="operator word">in</span> <span class="name builtin">range</span><span class="punctuation">(</span><span class="name">N</span><span class="punctuation">):</span> + <span class="name">foo</span><span class="punctuation">(</span><span class="name">i</span><span class="punctuation">)</span> +</pre> +To run this code, you need to create a different <tt class="docutils literal">PyObject*</tt> for each value +of <tt class="docutils literal">i</tt>: if implemented naively, it means calling <tt class="docutils literal">N</tt> times <tt class="docutils literal">malloc()</tt> +and <tt class="docutils literal">free()</tt>, which kills performance.<br> +CPython has the very same problem, which is solved by using a <a class="reference external" href="https://en.wikipedia.org/wiki/Free_list">free list</a> to +<a class="reference external" href="https://github.com/python/cpython/blob/2.7/Objects/intobject.c#L16">allocate ints</a>. So, what we did was to simply <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/commit/d8754ab9ba6371c83eaeb80cdf8cc13a37ee0c89">steal the code</a> from CPython +and do the exact same thing. This was also done in the +<tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch, and the benchmarks show that it worked +perfectly.<br> +Every type which is converted often to <tt class="docutils literal">PyObject*</tt> must have a very fast +allocator. 
At the moment of writing, PyPy uses free lists only for ints and +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/commit/35e2fb9903f2483940d7970bd83ce8c65aa1c1a3">tuples</a>: one of the next steps on our TODO list is certainly to use this +technique with more types, like <tt class="docutils literal">float</tt>.<br> +Conversely, we also need to optimize the conversion from <tt class="docutils literal">PyObject*</tt> to +<tt class="docutils literal">W_Root</tt>: this happens when an object is originally allocated in C and +returned to Python. Consider for example the following code:<br> +<pre class="code python literal-block"><span class="keyword namespace">import</span> <span class="name namespace">numpy</span> <span class="keyword namespace">as</span> <span class="name namespace">np</span> +<span class="name">myarray</span> <span class="operator">=</span> <span class="name">np</span><span class="operator">.</span><span class="name">random</span><span class="operator">.</span><span class="name">random</span><span class="punctuation">(</span><span class="name">N</span><span class="punctuation">)</span> +<span class="keyword">for</span> <span class="name">i</span> <span class="operator word">in</span> <span class="name builtin">range</span><span class="punctuation">(</span><span class="name builtin">len</span><span class="punctuation">(</span><span class="name">myarray</span><span class="punctuation">)):</span> + <span class="name">myarray</span><span class="punctuation">[</span><span class="name">i</span><span class="punctuation">]</span> +</pre> +At every iteration, we get an item out of the array: the return type is an +instance of <tt class="docutils literal">numpy.float64</tt> (a numpy scalar), i.e. a <tt class="docutils literal">PyObject*</tt>: this is +something which is implemented by numpy entirely in C, so completely +opaque to <tt class="docutils literal">cpyext</tt>. 
We don't have any control on how it is allocated, +managed, etc., and we can assume that allocation costs are the same as on +CPython.<br> +As soon as we return these <tt class="docutils literal">PyObject*</tt> to Python, we need to allocate +their <tt class="docutils literal">W_Root</tt> equivalent. If you do it in a small loop like in the example +above, you end up allocating all these <tt class="docutils literal">W_Root</tt> inside the nursery, which is +a good thing since allocation is super fast (see the section above about the +PyPy GC).<br> +However, we also need to keep track of the <tt class="docutils literal">W_Root</tt> to <tt class="docutils literal">PyObject*</tt> link. +Currently, we do this by putting all of them in a dictionary, but it is very +inefficient, especially because most of these objects die young and thus it +is wasted work to do that for them. Currently, this is one of the biggest +unresolved problems in <tt class="docutils literal">cpyext</tt>, and it is what causes the two microbenchmarks +<tt class="docutils literal">allocate_int</tt> and <tt class="docutils literal">allocate_tuple</tt> to be very slow.<br> +We are well aware of the problem, and we have a plan for how to fix it. The +explanation is too technical for the scope of this blog post as it requires a +deep knowledge of the GC internals to be understood, but the details are +<a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/planning/cpyext.txt#L27">here</a>.</div> +<div class="section" id="c-api-quirks"> +<h1> +C API quirks</h1> +Finally, there is another source of slowdown which is beyond our control. Some +parts of the CPython C API are badly designed and expose some of the +implementation details of CPython.<br> +The major example is reference counting. 
The <tt class="docutils literal">Py_INCREF</tt> / <tt class="docutils literal">Py_DECREF</tt> API +is designed in such a way which forces other implementations to emulate +refcounting even in presence of other GC management schemes, as explained +above.<br> +Another example is borrowed references. There are API functions which <strong>do +not</strong> incref an object before returning it, e.g. <a class="reference external" href="https://docs.python.org/2/c-api/list.html#c.PyList_GetItem">PyList_GetItem()</a>. This is +done for performance reasons because we can avoid a whole incref/decref pair, +if the caller needs to handle the returned item only temporarily: the item is +kept alive because it is in the list anyway.<br> +For PyPy, this is a challenge: thanks to <a class="reference external" href="https://www.pypy.org/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html">list strategies</a>, lists are often +represented in a compact way. For example, a list containing only integers is +stored as a C array of <tt class="docutils literal">long</tt>. How to implement <tt class="docutils literal">PyList_GetItem</tt>? We +cannot simply create a <tt class="docutils literal">PyObject*</tt> on the fly, because the caller will never +decref it and it will result in a memory leak.<br> +The current solution is very inefficient. The first time we do a +<tt class="docutils literal">PyList_GetItem</tt>, we <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/listobject.py#lines-28">convert</a> the <strong>whole</strong> list to a list of +<tt class="docutils literal">PyObject*</tt>. This is bad in two ways: the first is that we potentially pay a +lot of unneeded conversion cost in case we will never access the other items +of the list. 
The second is that by doing that we lose all the performance +benefit granted by the original list strategy, making it slower for the +rest of the pure-python code which will manipulate the list later.<br> +<tt class="docutils literal">PyList_GetItem</tt> is an example of a bad API because it assumes that the list +is implemented as an array of <tt class="docutils literal">PyObject*</tt>: after all, in order to return a +borrowed reference, we need a reference to borrow, don't we?<br> +Fortunately, (some) CPython developers are aware of these problems, and there +is an ongoing project to <a class="reference external" href="https://pythoncapi.readthedocs.io/">design a better C API</a> which aims to fix exactly +this kind of problem.<br> +Nonetheless, in the meantime we still need to implement the current +half-broken APIs. There is no easy solution for that, and it is likely that +we will always need to pay some performance penalty in order to implement them +correctly.<br> +However, what we could potentially do is to provide alternative functions +which do the same job but are more PyPy friendly: for example, we could think +of implementing <tt class="docutils literal">PyList_GetItemNonBorrowed</tt> or something like that: then, C +extensions could choose to use it (possibly hidden inside some macro and +<tt class="docutils literal">#ifdef</tt>) if they want to be fast on PyPy.</div> +<div class="section" id="current-performance"> +<h1> +Current performance</h1> +During the whole blog post we claimed <tt class="docutils literal">cpyext</tt> is slow. How +slow is it, exactly?<br> +We decided to concentrate on <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> for now. 
It should be evident +by now there are simply too many issues which can slow down a <tt class="docutils literal">cpyext</tt> +program, and microbenchmarks help us to concentrate on one (or few) at a +time.<br> +The microbenchmarks measure very simple things, like calling functions and +methods with the various calling conventions (no arguments, one argument, +multiple arguments); passing various types as arguments (to measure conversion +costs); allocating objects from C, and so on.<br> +Here are the results from the old PyPy 5.8 relative and normalized to CPython +2.7, the lower the better:<br> +<br> + + +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-5QV9jBfeXfo/W6UOCRA9YqI/AAAAAAAABX4/H2zgbv_XFQEHD4Lb2lj5Ve4Ob_YMuSXLwCLcBGAs/s1600/pypy58.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="480" src="https://4.bp.blogspot.com/-5QV9jBfeXfo/W6UOCRA9YqI/AAAAAAAABX4/H2zgbv_XFQEHD4Lb2lj5Ve4Ob_YMuSXLwCLcBGAs/s640/pypy58.png" width="640"></a></div> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://www.blogger.com/blogger.g?blogID=3971202189709462152" style="margin-left: 1em; margin-right: 1em;"></a></div> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://www.blogger.com/blogger.g?blogID=3971202189709462152" style="margin-left: 1em; margin-right: 1em;"></a></div> +<br> +PyPy was horribly slow everywhere, ranging from 2.5x to 10x slower. 
It is +particularly interesting to compare <tt class="docutils literal">simple.noargs</tt>, which measures the cost +of calling an empty function with no arguments, and <tt class="docutils literal">simple.onearg(i)</tt>, +which measures the cost calling an empty function passing an integer argument: +the latter is ~2x slower than the former, indicating that the conversion cost +of integers is huge.<br> +PyPy 5.8 was the last release before the famous Cape Town sprint, when we +started to look at cpyext performance seriously. Here are the performance data for +PyPy 6.0, the latest release at the time of writing:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-MRkRoxtCeOE/W6UOL5txl1I/AAAAAAAABX8/i0ZiOyS2MOgiSyxFAyMOkKcB6xqjSihBACLcBGAs/s1600/pypy60.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="480" src="https://1.bp.blogspot.com/-MRkRoxtCeOE/W6UOL5txl1I/AAAAAAAABX8/i0ZiOyS2MOgiSyxFAyMOkKcB6xqjSihBACLcBGAs/s640/pypy60.png" width="640"></a></div> +<br> +<br> +The results are amazing! PyPy is now massively faster than before, and for +most benchmarks it is even faster than CPython: yes, you read it correctly: +PyPy is faster than CPython at doing CPython's job, even considering all the +extra work it has to do to emulate the C API. 
This happens thanks to the JIT, +which produces speedups high enough to counterbalance the slowdown caused by +cpyext.<br> +There are two microbenchmarks which are still slower though: <tt class="docutils literal">allocate_int</tt> +and <tt class="docutils literal">allocate_tuple</tt>, for the reasons explained in the section about +<a class="reference internal" href="https://www.blogger.com/blogger.g?blogID=3971202189709462152#conversion-costs">Conversion costs</a>.</div> +<div class="section" id="next-steps"> +<h1> +Next steps</h1> +Despite the spectacular results we got so far, <tt class="docutils literal">cpyext</tt> is still slow enough to +kill performance in most real-world code which uses C extensions extensively +(e.g., the omnipresent numpy).<br> +Our current approach is something along these lines:<br> +<blockquote> +<ol class="arabic simple"> +<li>run a real-world small benchmark which exercises cpyext</li> +<li>measure and find the major bottleneck</li> +<li>write a corresponding microbenchmark</li> +<li>optimize it</li> +<li>repeat</li> +</ol> +</blockquote> +On one hand, this is a daunting task because the C API is huge and we need to +tackle functions one by one. On the other hand, not all the functions are +equally important, and it is enough to optimize a relatively small subset to +improve many different use cases.<br> +Where a year ago we announced we have a working answer to run c-extension in +PyPy, we now have a clear picture of what are the performance bottlenecks, and +we have developed some technical solutions to fix them. It is "only" a matter +of tackling them, one by one. It is worth noting that most of the work was +done during two sprints, for a total of 2-3 person-months of work.<br> +We think this work is important for the Python ecosystem. PyPy has established +a baseline for performance in pure python code, providing an answer for the +"Python is slow" detractors. 
The techniques used to make <tt class="docutils literal">cpyext</tt> performant +will let PyPy become an alternative for people who mix C extensions with +Python, which, it turns out, is just about everyone, in particular those using +the various scientific libraries. Today, many developers are forced to seek +performance by converting code from Python to a lower language. We feel there +is no reason to do this, but in order to prove it we must be able to run both +their python and their C extensions performantly, then we can begin to educate +them how to write JIT-friendly code in the first place.<br> +We envision a future in which you can run arbitrary Python programs on PyPy, +with the JIT speeding up the pure Python parts and the C parts running as fast +as today: the best of both worlds!</div> +</div>cpyextprofilingspeedhttps://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.htmlFri, 21 Sep 2018 16:32:00 GMT(Cape of) Good Hope for PyPyhttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlAntonio Cuni<div> +<br></div> +Hello from the other side of the world (for most of you)!<br> +<br> +With the excuse of coming to <a class="reference external" href="https://za.pycon.org/">PyCon ZA</a> during the last two weeks Armin, +Ronan, Antonio and sometimes Maciek had a very nice and productive sprint in +Cape Town, as pictures show :). 
We would like to say a big thank you to +Kiwi.com, which sponsored part of the travel costs via its awesome <a class="reference external" href="https://www.kiwi.com/sourcelift/">Sourcelift</a> +program to help Open Source projects.<br> +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: right; margin-left: 1em; text-align: right;"><tbody> +<tr><td style="text-align: center;"><a href="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s1600/DSC_2418.JPG" style="margin-left: auto; margin-right: auto;"><img border="0" height="225" src="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s400/DSC_2418.JPG" width="400"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">Armin, Anto and Ronan at Cape Point</td></tr> +</tbody></table> +<br> +Armin, Ronan and Anto spent most of the time hacking at cpyext, our CPython +C-API compatibility layer: during the last years, the focus was to make it +working and compatible with CPython, in order to run existing libraries such +as numpy and pandas. However, we never paid too much attention to performance, +so the net result is that with the latest released version of PyPy, C +extensions generally work but their speed ranges from "slow" to "horribly +slow".<br> +<br> +For example, these very simple <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> measure the speed of +calling (empty) C functions, i.e. the time you spend to "cross the border" +between RPython and C. 
<i>(Note: this includes the time spent doing the loop in regular Python code.)</i> These are the results on CPython, on PyPy 5.8, and on +our newest in-progress version:<br> +<br> +<pre class="literal-block">$ python bench.py # CPython +noargs : 0.41 secs +onearg(None): 0.44 secs +onearg(i) : 0.44 secs +varargs : 0.58 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy-5.8 bench.py # PyPy 5.8 +noargs : 1.01 secs +onearg(None): 1.31 secs +onearg(i) : 2.57 secs +varargs : 2.79 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy bench.py # cpyext-refactor-methodobject branch +noargs : 0.17 secs +onearg(None): 0.21 secs +onearg(i) : 0.22 secs +varargs : 0.47 secs +</pre> +<div> +<br></div> +<pre class="literal-block"></pre> +<pre class="literal-block"></pre> +So yes: before the sprint, we were ~2-6x slower than CPython. Now, we are +<strong>faster</strong> than it! +To reach this result, we did various improvements, such as: +<br> +<blockquote> +<ol class="arabic simple"> +<li>teach the JIT how to look (a bit) inside the cpyext module;</li> +<li>write specialized code for calling <tt class="docutils literal">METH_NOARGS</tt>, <tt class="docutils literal">METH_O</tt> and +<tt class="docutils literal">METH_VARARGS</tt> functions; previously, we always used a very general and +slow logic;</li> +<li>implement freelists to allocate the cpyext versions of <tt class="docutils literal">int</tt> and +<tt class="docutils literal">tuple</tt> objects, as CPython does;</li> +<li>the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/merge_requests/573">cpyext-avoid-roundtrip</a> branch: crossing the RPython/C border is +slowish, but the real problem was (and still is for many cases) we often +cross it many times for no good reason. So, depending on the actual API +call, you might end up in the C land, which calls back into the RPython +land, which goes to C, etc. etc. 
(ad libitum).</li> +</ol> +</blockquote> +The branch tries to fix such nonsense: so far, we fixed only some cases, which +are enough to speed up the benchmarks shown above. But most importantly, we +now have a clear path and an actual plan to improve cpyext more and +more. Ideally, we would like to reach a point in which cpyext-intensive +programs run at worst at the same speed as CPython.<br> +<br> +The other big topic of the sprint was Armin and Maciej doing a lot of work on the +<a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/branch/unicode-utf8">unicode-utf8</a> branch: the goal of the branch is to always use UTF-8 as the +internal representation of unicode strings. The advantages are various: +<br> +<blockquote> +<ul class="simple"> +<li>decoding a UTF-8 stream is super fast, as you just need to check that the +stream is valid;</li> +<li>encoding to UTF-8 is almost a no-op;</li> +<li>UTF-8 is always a more compact representation than the currently +used UCS-4. It's also almost always more compact than CPython 3.5 latin1/UCS2/UCS4 combo;</li> +<li>smaller representation means everything becomes quite a bit faster due to lower cache pressure.</li> +</ul> +</blockquote> +Before you ask: yes, this branch contains special logic to ensure that random +access of single unicode chars is still O(1), as it is on both CPython and the +current PyPy.<br> +We also plan to improve the speed of decoding even more by using modern processor features, like SSE and AVX. Preliminary results show that decoding can be done 100x faster than the current setup. +<br> +<br> +In summary, this was a long and profitable sprint, in which we achieved lots +of interesting results. 
However, what we liked even more was the privilege of +doing <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/a4307fb5912e">commits</a> from awesome places such as the top of Table Mountain:<br> +<br> +<blockquote class="twitter-tweet"> +<div dir="ltr" lang="en"> +Our sprint venue today <a href="https://twitter.com/hashtag/pypy?src=hash&amp;ref_src=twsrc%5Etfw">#pypy</a> <a href="https://t.co/o38IfTYmAV">pic.twitter.com/o38IfTYmAV</a></div> +— Ronan Lamy (@ronanlamy) <a href="https://twitter.com/ronanlamy/status/915575026107240449?ref_src=twsrc%5Etfw">4 ottobre 2017</a></blockquote> + + +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: left; margin-right: 1em; text-align: left;"><tbody> +<tr><td style="text-align: center;"><a href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" style="margin-left: auto; margin-right: auto;"><img border="0" height="360" src="https://bytebucket.org/pypy/extradoc/raw/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" width="640"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">The panorama we looked at instead of staring at cpyext code</td></tr> +</tbody></table>cpyextprofilingspeedsprintunicodehttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlWed, 18 Oct 2017 13:31:00 GMTUsing CPython extension modules with PyPy natively, or: PyPy can load .pyd files with CPyExt!https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlAlexander Schremmer<p>PyPy is now able to load +and run CPython extension modules (i.e. .pyd and .so files) natively by using the new CPyExt +subsystem. 
+Unlike the solution presented in <a class="reference external" href="https://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html">another blog post</a> (where extension modules like +numpy etc. were run on CPython and proxied through TCP), this solution does not require +a running CPython anymore. We do not achieve full binary compatibility +yet (like Ironclad), but recompiling the extension is generally enough.</p> +<p>The only prerequisite is that the necessary functions of the C API of CPython are already +implemented in PyPy. If you are a user or an author of a module and miss certain functions +in PyPy, we invite you to implement them. Up until now, a lot of people (including a lot of +new committers) have stepped up and implemented a few functions to get their favorite module +running. See the end of this post for a list of names.</p> +<p>Regarding speed, we tried the following: even though there is a bit of overhead when running +these modules, we could run the regular expression engine of CPython (<tt class="docutils literal"><span class="pre">_sre.so</span></tt>) and execute +the spambayes benchmark of the Unladen Swallow benchmark suite (cf. <a class="reference external" href="https://speed.pypy.org/">speed.pypy.org</a>) and +experience a speedup: +It became <em>two times faster</em> on pypy-c than with the built-in regular +expression engine of PyPy. From <a href="https://en.wikipedia.org/wiki/Amdahl%27s_law">Amdahl's Law</a> it follows that the <tt class="docutils literal"><span class="pre">_sre.so</span></tt> must run several +times faster than the built-in engine.</p> +<p>Currently pursued modules include PIL and others. Distutils support is nearly ready. 
+If you would like to participate or want information on how to use this new feature, come and join +our IRC channel <tt class="docutils literal"><span class="pre">#pypy</span></tt> on <a class="reference external" href="irc://irc.freenode.net/">freenode</a>.</p> +<p>Amaury Forgeot d'Arc and Alexander Schremmer</p> +<p>Further CPyExt Contributors:</p> +<ul><li>Alex Gaynor +</li><li>Benjamin Peterson +</li><li>Jean-Paul Calderone +</li><li>Maciej Fijalkowski +</li><li>Jan de Mooij +</li><li>Lucian Branescu Mihaila +</li><li>Andreas Stührk +</li><li>Zooko Wilcox-O Hearn</li></ul>cpyextCPythonextension modulesspeedhttps://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlFri, 09 Apr 2010 22:56:00 GMT \ No newline at end of file diff --git a/categories/cpython.html b/categories/cpython.html new file mode 100644 index 000000000..2f11a5209 --- /dev/null +++ b/categories/cpython.html @@ -0,0 +1,120 @@ + + + + + +Posts about CPython | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/cpython.xml b/categories/cpython.xml new file mode 100644 index 000000000..1697093c5 --- /dev/null +++ b/categories/cpython.xml @@ -0,0 +1,153 @@ + +PyPy (Posts about CPython)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssLeysin 2020 Sprint Reporthttps://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlhodgestar<p>At the end of February ten of us gathered in Leysin, Switzerland to work on<br> +a variety of topics including <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3.7">PyPy Python 3.7</a> support and the PyPy<br> +migration to <a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>.<br> +<br> +</p><div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s1600/2020_leysin_sprint_attendees.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="180" src="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s320/2020_leysin_sprint_attendees.jpg" width="320"></a></div> +<br> +We had a fun and productive week. The snow was beautiful. There was skiing<br> +and lunch at the top of <a class="reference external" href="https://en.wikipedia.org/wiki/Berneuse">Berneuse</a>, cooking together, some late nights at<br> +the pub next door, some even later nights coding, and of course the<br> +obligatory cheese fondue outing.<br> +<br> +There were a few of us participating in a PyPy sprint for the first time<br> +and a few familiar faces who had attended many sprints. 
Many different<br> +projects were represented including PyPy, <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a>,<br> +<a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>, and <a class="reference external" href="https://github.com/dgrunwald/rust-cpython">rust-cpython</a>. The atmosphere was relaxed and welcoming, so if<br> +you're thinking of attending the next one -- please do!<br> +<br> +Topics worked on:<br> +<br> +<h2> +HPy</h2> +HPy is a new project to design and implement a better API for extending<br> +Python in C. If you're unfamiliar with it you can read more about it at<br> +<a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>.<br> +<br> +A lot of attention was devoted to the Big HPy Design Discussion which<br> +took up two full mornings. So much was decided that this will likely<br> +get its own detailed write-up, but bigger topics included:<br> +<ul class="simple"> +<li>the HPy GetAttr, SetAttr, GetItem and SetItem methods,</li> +<li>HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions<br> +that pass void* pointers to callbacks,</li> +<li>avoiding having va_args as part of the ABI,</li> +<li>exception handling,</li> +<li>support for creating custom types.</li> +</ul> +Quite a few things got worked on too:<br> +<ul class="simple"> +<li>implemented support for writing methods that take keyword arguments with<br> +HPy_METH_KEYWORDS,</li> +<li>implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,</li> +<li>started implementing support for adding custom types,</li> +<li>started implementing dumping JSON objects in ultrajson-hpy,</li> +<li>refactored the PyPy GIL to improve the interaction between HPy and<br> +PyPy's cpyext,</li> +<li>experimented with adding HPy support to rust-cpython.</li> +</ul> +And there was some discussion of the next steps of the HPy 
initiative<br> +including writing documentation, setting up websites and funding, and<br> +possibly organising another HPy gathering later in the year.<br> +<br> +<h2> +PyPy</h2> +<ul class="simple"> +<li>Georges gave a presentation on the Heptapod topic and branch workflows<br> +and showed everyone how to use hg-evolve.</li> +<li>Work was done on improving the PyPy CI buildbot post the move to<br> +heptapod, including a light-weight pre-merge CI and restricting<br> +when the full CI is run to only branch commits.</li> +<li>A lot of work was done improving the -D tests. </li> +</ul> +<br> +<h2> +Miscellaneous</h2> +<ul class="simple"> +<li>Armin demoed VRSketch and NaN Industries in VR, including an implementation<br> +of the Game of Life within NaN Industries!</li> +<li>Skiing!</li> +</ul> +<br> +<h2> +Aftermath</h2> +Immediately after the sprint large parts of Europe and the world were<br> +hit by the COVID-19 epidemic. It was good to spend time together before<br> +travelling ceased to be a sensible idea and many gatherings were cancelled.<br> +<br> +Keep safe out there everyone.<br> +<br> +The HPy &amp; PyPy Team &amp; Friends<br> +<br> +<i>In joke for those who attended the sprint: Please don't replace this blog post<br> +with its Swedish translation (or indeed a translation to any other language :).</i>cpyextCPythonGraalPythonHeptapodhpypypypypy3https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlTue, 17 Mar 2020 21:57:00 GMTUsing CPython extension modules with PyPy natively, or: PyPy can load .pyd files with CPyExt!https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlAlexander Schremmer<p>PyPy is now able to load +and run CPython extension modules (i.e. .pyd and .so files) natively by using the new CPyExt +subsystem. 
+Unlike the solution presented in <a class="reference external" href="https://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html">another blog post</a> (where extension modules like +numpy etc. were run on CPython and proxied through TCP), this solution does not require +a running CPython anymore. We do not achieve full binary compatibility +yet (like Ironclad), but recompiling the extension is generally enough.</p> +<p>The only prerequisite is that the necessary functions of the C API of CPython are already +implemented in PyPy. If you are a user or an author of a module and miss certain functions +in PyPy, we invite you to implement them. Up until now, a lot of people (including a lot of +new committers) have stepped up and implemented a few functions to get their favorite module +running. See the end of this post for a list of names.</p> +<p>Regarding speed, we tried the following: even though there is a bit of overhead when running +these modules, we could run the regular expression engine of CPython (<tt class="docutils literal"><span class="pre">_sre.so</span></tt>) and execute +the spambayes benchmark of the Unladen Swallow benchmark suite (cf. <a class="reference external" href="https://speed.pypy.org/">speed.pypy.org</a>) and +experience a speedup: +It became <em>two times faster</em> on pypy-c than with the built-in regular +expression engine of PyPy. From <a href="https://en.wikipedia.org/wiki/Amdahl%27s_law">Amdahl's Law</a> it follows that the <tt class="docutils literal"><span class="pre">_sre.so</span></tt> must run several +times faster than the built-in engine.</p> +<p>Currently pursued modules include PIL and others. Distutils support is nearly ready. 
+If you would like to participate or want information on how to use this new feature, come and join +our IRC channel <tt class="docutils literal"><span class="pre">#pypy</span></tt> on <a class="reference external" href="irc://irc.freenode.net/">freenode</a>.</p> +<p>Amaury Forgeot d'Arc and Alexander Schremmer</p> +<p>Further CPyExt Contributors:</p> +<ul><li>Alex Gaynor +</li><li>Benjamin Peterson +</li><li>Jean-Paul Calderone +</li><li>Maciej Fijalkowski +</li><li>Jan de Mooij +</li><li>Lucian Branescu Mihaila +</li><li>Andreas Stührk +</li><li>Zooko Wilcox-O Hearn</li></ul>cpyextCPythonextension modulesspeedhttps://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlFri, 09 Apr 2010 22:56:00 GMTUsing CPython extension modules with PyPy, or: PyQt on PyPyhttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlAlexander Schremmer<div class="document" id="using-cpython-extension-modules-with-pypy-or-pyqt-on-pypy"> + +<p>If you have ever wanted to use CPython extension modules on PyPy, +we want to announce that there is a solution that should be compatible +to quite a bit of the available modules. It is neither new nor written +by us, but works nevertheless great with PyPy.</p> +<p>The trick is to use RPyC, a transparent, symmetric remote procedure +call library written in Python. The idea is to start a +CPython process that hosts the PyQt libraries +and connect to it via TCP to send RPC commands to it.</p> +<p>I tried to run PyQt applications +using it on PyPy and could get quite a bit of the functionality of these +working. 
Remaining problems include regular segfaults of CPython +because of PyQt-induced memory corruption and bugs because classes +like StandardButtons behave incorrectly when it comes to arithmetical operations.</p> +<p>Changes to RPyC needed to be done to support remote unbound <tt class="docutils literal"><span class="pre">__init__</span></tt> methods, +shallow call by value for list and dict types (PyQt4 methods want real lists and dicts +as parameters), and callbacks to methods (all remote method objects are wrapped into +small lambda functions to ease the call for PyQt4).</p> +<p>If you want to try RPyC to run the PyQt application of your choice, you just +need to follow these steps. Please report your experience here in the blog +comments or on our <a class="reference external" href="https://codespeak.net/mailman/listinfo/pypy-dev">mailing list</a>.</p> +<blockquote> +<ol class="arabic simple"> +<li>Download RPyC from the <a class="reference external" href="https://sourceforge.net/projects/rpyc/files/">RPyC download page</a>.</li> +<li>Download this <a class="reference external" href="https://codespeak.net/svn/user/xoraxax/rpyc-3.0.7-pyqt4-compat.patch">patch</a> and apply it to RPyC by running +<tt class="docutils literal"><span class="pre">patch</span> <span class="pre">-p1</span> <span class="pre">&lt;</span> <span class="pre">rpyc-3.0.7-pyqt4-compat.patch</span></tt> in the RPyC directory.</li> +<li>Install RPyC by running <tt class="docutils literal"><span class="pre">python</span> <span class="pre">setup.py</span> <span class="pre">install</span></tt> as root.</li> +<li>Run the file <tt class="docutils literal"><span class="pre">rpyc/servers/classic_server.py</span></tt> using CPython.</li> +<li>Execute your PyQt application on PyPy.</li> +</ol> +</blockquote> +<p>PyPy will automatically connect to CPython and use its PyQt libraries.</p> +<p>Note that this scheme works with nearly every extension library. 
Look +at <tt class="docutils literal"><span class="pre">pypy/lib/sip.py</span></tt> on how to add new libraries (you need to create +such a file for every proxied extension module).</p> +<p>Have fun with PyQt</p> +<p>Alexander Schremmer</p> +</div>CPythonextension modulesPyQt4RPyChttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlMon, 30 Nov 2009 11:19:00 GMT \ No newline at end of file diff --git a/categories/ep2008.html b/categories/ep2008.html new file mode 100644 index 000000000..f393ba560 --- /dev/null +++ b/categories/ep2008.html @@ -0,0 +1,114 @@ + + + + + +Posts about ep2008 | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/ep2008.xml b/categories/ep2008.xml new file mode 100644 index 000000000..4f93c3e20 --- /dev/null +++ b/categories/ep2008.xml @@ -0,0 +1,13 @@ + +PyPy (Posts about ep2008)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssEP2008: PyPy meets Jythonhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlholger krekel<p>One of the great events at EuroPython 2008 were our chats and meetings with the Jython and Sun people. The Jython people recently are pushing into releasing Python version 2.5 and they currently pursue many interesting sub projects. Coincidentally, PyPy also has tons of interesting areas and results :) So we eventually got into brainstorming a number of possible technical collab ideas. Further below is a first list as i wrote it down from our 10 people PyPy / Jython 30 minute close up meeting yesterday. + +It felt great to be able to talk to the Jython people this way - kudos to Sun for their clear commitments and open ways to go about things! I sense a genuine interest on fair collaboration with non-java developer communities. Seems like they are serious about not focusing on "Java this", "Java that" anymore but rather focus on the JVM platform. Good! And about language +independent interest in ambitious technology. Even Better! I am tensed to see how things go from here. 
+ +So here the list of technical collab ideas: +</p><ul><li>ctypes - try to create _rawffi module in Java for Jython, which will enable Jython to reuse our existing ctypes implementation (and have PyPy use the Jython-rawffi for its own for PyPy.JVM)</li><li> generally see to share work / (continue) collaborate regarding extension modules</li><li>Jython/PyPy (and eventually IronPython): document known differences to CPython, maybe in a PEP</li><li>Python Interpreter for Jython (in order to run CPython's .pyc files): re-use pypy's bytecode evaluator, implement a "Jython object space". </li><li>re-use rpython-extension modules for jython (e.g. SRE), by compiling them to Java and reusing as a native library.</li><li>collaborate on testing framework / benchmarking, have a common site to show test results</li><li>make py.test compatible with jython</li><li>come up with a set of "pure Python language" tests, which would gather and refactor tests from CPython, PyPy and Jython. </li><li>look into using java types / jython approaches for implementing free threading.</li><li>share knowledge regarding JIT / psyco +</li></ul>If you have any more ideas, comments or would like to join efforts, let us know! + +Cheers and thanks to <a href="https://www.sauria.com/blog/">Ted Leung</a>, <a href="https://fwierzbicki.blogspot.com/">Frank Wierzbiki</a>, <a href="https://www.zyasoft.com/pythoneering/">Jim Baker</a> and Tobias Ivarsson from Sun and Jython fame respectively, + +Holgerep2008jythonpypysunhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlThu, 10 Jul 2008 08:29:00 GMT \ No newline at end of file diff --git a/categories/extension-modules.html b/categories/extension-modules.html new file mode 100644 index 000000000..87fbf06d9 --- /dev/null +++ b/categories/extension-modules.html @@ -0,0 +1,120 @@ + + + + + +Posts about extension modules | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/extension-modules.xml b/categories/extension-modules.xml new file mode 100644 index 000000000..1c9cbe88d --- /dev/null +++ b/categories/extension-modules.xml @@ -0,0 +1,108 @@ + +PyPy (Posts about extension modules)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy and conda-forgehttps://www.pypy.org/posts/2022/11/pypy-and-conda-forge.htmlmattip<p>You can use PyPy as your python interpreter in a conda environment. The +conda-forge team has graciously provided this service.</p> +<p>The conda-forge <a href="https://conda-forge.org/docs/user/tipsandtricks.html#using-pypy-as-an-interpreter">tips-and-tricks</a> +page says:</p> +<blockquote> +<p>The conda-forge channel supports creating and installing packages into +environments using the PyPy interpreter. Many packages are already available. +You need to enable the conda-forge channel and use the pypy identifier when +creating your environment:</p> +</blockquote> +<div class="code"><pre class="code literal-block"> $ conda create -c conda-forge -n my-pypy-env pypy python=3.8 + $ conda activate my-pypy-env +</pre></div> + +<blockquote> +<p>Currently supported python versions are 3.8 and 3.9. Support for pypy3.7 has +been dropped. 
While you can still create a python 3.7 environment, you +will not be getting updates as new package versions are released (including +pypy itself).</p> +<p>if you are using defaults as a low priority channel, then you need to use +strict channel priority as the metadata in defaults has not been patched yet +which allows cpython extension packages to be installed alongside pypy.</p> +</blockquote> +<div class="code"><pre class="code literal-block"> $ conda config --set channel_priority strict +</pre></div> + +<p>The work required some out-of-the-box thinking on the part of conda-forge since +they needed to add the idea of a <code>pypy</code> identifier to the python version and +the whole conda team has been very supportive of the effort needed. Binary +packages are on offer for the usual platforms:</p> +<ul> +<li><code>x86_64</code> windows, macos, linux</li> +<li><code>ppc64le</code> and <code>aarch64</code> linux.</li> +</ul> +<p>There are <a href="https://conda-forge.org/status/#pypy38">currently over 1000 packages</a> available for download via the +conda-forge channel, and more are being added as the kind package maintainers +work around various differences between CPython and PyPy. Please let us know if +your favorite package is not supported.</p>extension moduleshttps://www.pypy.org/posts/2022/11/pypy-and-conda-forge.htmlSat, 05 Nov 2022 17:00:25 GMTUsing CPython extension modules with PyPy natively, or: PyPy can load .pyd files with CPyExt!https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlAlexander Schremmer<p>PyPy is now able to load +and run CPython extension modules (i.e. .pyd and .so files) natively by using the new CPyExt +subsystem. +Unlike the solution presented in <a class="reference external" href="https://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html">another blog post</a> (where extension modules like +numpy etc. 
were run on CPython and proxied through TCP), this solution does not require +a running CPython anymore. We do not achieve full binary compatibility +yet (like Ironclad), but recompiling the extension is generally enough.</p> +<p>The only prerequisite is that the necessary functions of the C API of CPython are already +implemented in PyPy. If you are a user or an author of a module and miss certain functions +in PyPy, we invite you to implement them. Up until now, a lot of people (including a lot of +new committers) have stepped up and implemented a few functions to get their favorite module +running. See the end of this post for a list of names.</p> +<p>Regarding speed, we tried the following: even though there is a bit of overhead when running +these modules, we could run the regular expression engine of CPython (<tt class="docutils literal"><span class="pre">_sre.so</span></tt>) and execute +the spambayes benchmark of the Unladen Swallow benchmark suite (cf. <a class="reference external" href="https://speed.pypy.org/">speed.pypy.org</a>) and +experience a speedup: +It became <em>two times faster</em> on pypy-c than with the built-in regular +expression engine of PyPy. From <a href="https://en.wikipedia.org/wiki/Amdahl%27s_law">Amdahl's Law</a> it follows that the <tt class="docutils literal"><span class="pre">_sre.so</span></tt> must run several +times faster than the built-in engine.</p> +<p>Currently pursued modules include PIL and others. Distutils support is nearly ready. 
+If you would like to participate or want information on how to use this new feature, come and join +our IRC channel <tt class="docutils literal"><span class="pre">#pypy</span></tt> on <a class="reference external" href="irc://irc.freenode.net/">freenode</a>.</p> +<p>Amaury Forgeot d'Arc and Alexander Schremmer</p> +<p>Further CPyExt Contributors:</p> +<ul><li>Alex Gaynor +</li><li>Benjamin Peterson +</li><li>Jean-Paul Calderone +</li><li>Maciej Fijalkowski +</li><li>Jan de Mooij +</li><li>Lucian Branescu Mihaila +</li><li>Andreas Stührk +</li><li>Zooko Wilcox-O Hearn</li></ul>cpyextCPythonextension modulesspeedhttps://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlFri, 09 Apr 2010 22:56:00 GMTUsing CPython extension modules with PyPy, or: PyQt on PyPyhttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlAlexander Schremmer<div class="document" id="using-cpython-extension-modules-with-pypy-or-pyqt-on-pypy"> + +<p>If you have ever wanted to use CPython extension modules on PyPy, +we want to announce that there is a solution that should be compatible +to quite a bit of the available modules. It is neither new nor written +by us, but works nevertheless great with PyPy.</p> +<p>The trick is to use RPyC, a transparent, symmetric remote procedure +call library written in Python. The idea is to start a +CPython process that hosts the PyQt libraries +and connect to it via TCP to send RPC commands to it.</p> +<p>I tried to run PyQt applications +using it on PyPy and could get quite a bit of the functionality of these +working. 
Remaining problems include regular segfaults of CPython +because of PyQt-induced memory corruption and bugs because classes +like StandardButtons behave incorrectly when it comes to arithmetical operations.</p> +<p>Changes to RPyC needed to be done to support remote unbound <tt class="docutils literal"><span class="pre">__init__</span></tt> methods, +shallow call by value for list and dict types (PyQt4 methods want real lists and dicts +as parameters), and callbacks to methods (all remote method objects are wrapped into +small lambda functions to ease the call for PyQt4).</p> +<p>If you want to try RPyC to run the PyQt application of your choice, you just +need to follow these steps. Please report your experience here in the blog +comments or on our <a class="reference external" href="https://codespeak.net/mailman/listinfo/pypy-dev">mailing list</a>.</p> +<blockquote> +<ol class="arabic simple"> +<li>Download RPyC from the <a class="reference external" href="https://sourceforge.net/projects/rpyc/files/">RPyC download page</a>.</li> +<li>Download this <a class="reference external" href="https://codespeak.net/svn/user/xoraxax/rpyc-3.0.7-pyqt4-compat.patch">patch</a> and apply it to RPyC by running +<tt class="docutils literal"><span class="pre">patch</span> <span class="pre">-p1</span> <span class="pre">&lt;</span> <span class="pre">rpyc-3.0.7-pyqt4-compat.patch</span></tt> in the RPyC directory.</li> +<li>Install RPyc by running <tt class="docutils literal"><span class="pre">python</span> <span class="pre">setup.py</span> <span class="pre">install</span></tt> as root.</li> +<li>Run the file <tt class="docutils literal"><span class="pre">rpyc/servers/classic_server.py</span></tt> using CPython.</li> +<li>Execute your PyQt application on PyPy.</li> +</ol> +</blockquote> +<p>PyPy will automatically connect to CPython and use its PyQt libraries.</p> +<p>Note that this scheme works with nearly every extension library. 
Look +at <tt class="docutils literal"><span class="pre">pypy/lib/sip.py</span></tt> on how to add new libraries (you need to create +such a file for every proxied extension module).</p> +<p>Have fun with PyQt</p> +<p>Alexander Schremmer</p> +</div>CPythonextension modulesPyQt4RPyChttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlMon, 30 Nov 2009 11:19:00 GMT \ No newline at end of file diff --git a/categories/gc.html b/categories/gc.html new file mode 100644 index 000000000..367582273 --- /dev/null +++ b/categories/gc.html @@ -0,0 +1,114 @@ + + + + + +Posts about gc | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/gc.xml b/categories/gc.xml new file mode 100644 index 000000000..ae7a58e86 --- /dev/null +++ b/categories/gc.xml @@ -0,0 +1,113 @@ + +PyPy (Posts about gc)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy for low-latency systemshttps://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.htmlAntonio Cuni<h1 class="title"> +PyPy for low-latency systems</h1> +Recently I have merged the gc-disable branch, introducing a couple of features +which are useful when you need to respond to certain events with the lowest +possible latency. This work has been kindly sponsored by <a class="reference external" href="https://www.gambitresearch.com/">Gambit Research</a> +(which, by the way, is a very cool and geeky place where to <a class="reference external" href="https://www.gambitresearch.com/jobs.html">work</a>, in case you +are interested). Note also that this is a very specialized use case, so these +features might not be useful for the average PyPy user, unless you have the +same problems as described here.<br> +<br> +The PyPy VM manages memory using a generational, moving Garbage Collector. +Periodically, the GC scans the whole heap to find unreachable objects and +frees the corresponding memory. Although at a first look this strategy might +sound expensive, in practice the total cost of memory management is far less +than e.g. on CPython, which is based on reference counting. While maybe +counter-intuitive, the main advantage of a non-refcount strategy is +that allocation is very fast (especially compared to malloc-based allocators), +and deallocation of objects which die young is basically for free. 
More +information about the PyPy GC is available <a class="reference external" href="https://pypy.readthedocs.io/en/latest/gc_info.html#incminimark">here</a>.<br> +<br> +As we said, the total cost of memory management is less on PyPy than on +CPython, and it's one of the reasons why PyPy is so fast. However, one big +disadvantage is that while on CPython the cost of memory management is spread +all over the execution of the program, on PyPy it is concentrated into GC +runs, causing observable pauses which interrupt the execution of the user +program.<br> +To avoid excessively long pauses, the PyPy GC has been using an <a class="reference external" href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">incremental +strategy</a> since 2013. The GC runs as a series of "steps", letting the user +program progress between each step.<br> +<br> +The following chart shows the behavior of a real-world, long-running process:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://3.bp.blogspot.com/-44yKwUVK3BE/XC4X9XL4BII/AAAAAAAABbE/XdTCIoyA-eYxvxIgJhFHaKnzxjhoWStHQCEwYBhgL/s1600/gc-timing.png" style="margin-right: 1em;"><img border="0" height="246" src="https://3.bp.blogspot.com/-44yKwUVK3BE/XC4X9XL4BII/AAAAAAAABbE/XdTCIoyA-eYxvxIgJhFHaKnzxjhoWStHQCEwYBhgL/s640/gc-timing.png" width="640"></a></div> +<br> +<br> +The orange line shows the total memory used by the program, which +increases linearly while the program progresses. 
Every ~5 minutes, the GC +kicks in and the memory usage drops from ~5.2GB to ~2.8GB (this ratio is controlled +by the <a class="reference external" href="https://pypy.readthedocs.io/en/latest/gc_info.html#environment-variables">PYPY_GC_MAJOR_COLLECT</a> env variable).<br> +The purple line shows aggregated data about the GC timing: the whole +collection takes ~1400 individual steps over the course of ~1 minute: each +point represents the <strong>maximum</strong> time a single step took during the past 10 +seconds. Most steps take ~10-20 ms, although we see a horrible peak of ~100 ms +towards the end. We have not investigated yet what it is caused by, but we +suspect it is related to the deallocation of raw objects.<br> +<br> +These multi-millisecond pauses are a problem for systems where it is important +to respond to certain events with a latency which is both low and consistent. +If the GC kicks in at the wrong time, it might cause unacceptable pauses during +the collection cycle.<br> +<br> +Let's look again at our real-world example. This is a system which +continuously monitors an external stream; when a certain event occurs, we want +to take an action. The following chart shows the maximum time it takes to +complete one of such actions, aggregated every minute:<br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-FO9uFHSqZzU/XC4YC8LZUpI/AAAAAAAABa8/B8ZOrEgbVJUHoO65wxvCMVpvciO_d_0TwCLcBGAs/s1600/normal-max.png" style="margin-right: 1em;"><img border="0" height="240" src="https://4.bp.blogspot.com/-FO9uFHSqZzU/XC4YC8LZUpI/AAAAAAAABa8/B8ZOrEgbVJUHoO65wxvCMVpvciO_d_0TwCLcBGAs/s640/normal-max.png" width="640"></a></div> +<br> +You can clearly see that the baseline response time is around ~20-30 +ms. However, we can also see periodic spikes around ~50-100 ms, with peaks up +to ~350-450 ms! 
After a bit of investigation, we concluded that most (although +not all) of the spikes were caused by the GC kicking in at the wrong time.<br> +<br> +The work I did in the <tt class="docutils literal"><span class="pre">gc-disable</span></tt> branch aims to fix this problem by +introducing <a class="reference external" href="https://pypy.readthedocs.io/en/latest/gc_info.html#semi-manual-gc-management">two new features</a> to the <tt class="docutils literal">gc</tt> module:<br> +<blockquote> +<ul class="simple"> +<li><tt class="docutils literal">gc.disable()</tt>, which previously only inhibited the execution of +finalizers without actually touching the GC, now disables the GC major +collections. After a call to it, you will see the memory usage grow +indefinitely.</li> +<li><tt class="docutils literal">gc.collect_step()</tt> is a new function which you can use to manually +execute a single incremental GC collection step.</li> +</ul> +</blockquote> +It is worth specifying that <tt class="docutils literal">gc.disable()</tt> disables <strong>only</strong> the major +collections, while minor collections still run. Moreover, thanks to the +JIT's virtuals, many objects with a short and predictable lifetime are not +allocated at all. The end result is that most objects with short lifetime are +still collected as usual, so the impact of <tt class="docutils literal">gc.disable()</tt> on memory growth +is not as bad as it could sound.<br> +<br> +Combining these two functions, it is possible to take control of the GC to +make sure it runs only when it is acceptable to do so. For an example of +usage, you can look at the implementation of a <a class="reference external" href="https://github.com/antocuni/pypytools/blob/master/pypytools/gc/custom.py">custom GC</a> inside <a class="reference external" href="https://pypi.org/project/pypytools/">pypytools</a>. 
+The peculiarity is that it also defines a "<tt class="docutils literal">with <span class="pre">nogc():"</span></tt> context manager +which you can use to mark performance-critical sections where the GC is not +allowed to run.<br> +<br> +The following chart compares the behavior of the default PyPy GC and the new +custom GC, after a careful placing of <tt class="docutils literal">nogc()</tt> sections:<br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-bGqs0WrOEBk/XC4YJN0uZfI/AAAAAAAABbA/4EXOASvy830IKBoTFtrnmY22Vyd_api-ACLcBGAs/s1600/nogc-max.png" style="margin-right: 1em;"><img border="0" height="242" src="https://1.bp.blogspot.com/-bGqs0WrOEBk/XC4YJN0uZfI/AAAAAAAABbA/4EXOASvy830IKBoTFtrnmY22Vyd_api-ACLcBGAs/s640/nogc-max.png" width="640"></a></div> +<br> +The yellow line is the same as before, while the purple line shows the new +system: almost all spikes have gone, and the baseline performance is about 10% +better. There is still one spike towards the end, but after some investigation +we concluded that it was <strong>not</strong> caused by the GC.<br> +<br> +Note that this does <strong>not</strong> mean that the whole program became magically +faster: we simply moved the GC pauses in some other place which is <strong>not</strong> +shown in the graph: in this specific use case this technique was useful +because it allowed us to shift the GC work in places where pauses are more +acceptable.<br> +<br> +All in all, a pretty big success, I think. 
These functionalities are already +available in the nightly builds of PyPy, and will be included in the next +release: take this as a New Year present :)<br> +<br> +Antonio Cuni and the PyPy teamgcsponsorshttps://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.htmlThu, 03 Jan 2019 14:21:00 GMT \ No newline at end of file diff --git a/categories/graalpython.html b/categories/graalpython.html new file mode 100644 index 000000000..f4677ce82 --- /dev/null +++ b/categories/graalpython.html @@ -0,0 +1,114 @@ + + + + + +Posts about GraalPython | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/graalpython.xml b/categories/graalpython.xml new file mode 100644 index 000000000..4ba96868b --- /dev/null +++ b/categories/graalpython.xml @@ -0,0 +1,84 @@ + +PyPy (Posts about GraalPython)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssLeysin 2020 Sprint Reporthttps://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlhodgestar<p>At the end of February ten of us gathered in Leysin, Switzerland to work on<br> +a variety of topics including <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3.7">PyPy Python 3.7</a> support and the PyPy<br> +migration to <a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>.<br> +<br> +</p><div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s1600/2020_leysin_sprint_attendees.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="180" src="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s320/2020_leysin_sprint_attendees.jpg" width="320"></a></div> +<br> +We had a fun and productive week. The snow was beautiful. There was skiing<br> +and lunch at the top of <a class="reference external" href="https://en.wikipedia.org/wiki/Berneuse">Berneuse</a>, cooking together, some late nights at<br> +the pub next door, some even later nights coding, and of course the<br> +obligatory cheese fondue outing.<br> +<br> +There were a few of us participating in a PyPy sprint for the first time<br> +and a few familiar faces who had attended many sprints. 
Many different<br> +projects were represented including PyPy, <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a>,<br> +<a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>, and <a class="reference external" href="https://github.com/dgrunwald/rust-cpython">rust-cpython</a>. The atmosphere was relaxed and welcoming, so if<br> +you're thinking of attending the next one -- please do!<br> +<br> +Topics worked on:<br> +<br> +<h2> +HPy</h2> +HPy is a new project to design and implement a better API for extending<br> +Python in C. If you're unfamiliar with it you can read more about it at<br> +<a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>.<br> +<br> +A lot of attention was devoted to the Big HPy Design Discussion which<br> +took up two full mornings. So much was decided that this will likely<br> +get its own detailed write-up, but bigger topics included:<br> +<ul class="simple"> +<li>the HPy GetAttr, SetAttr, GetItem and SetItem methods,</li> +<li>HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions<br> +that pass void* pointers to callbacks,</li> +<li>avoiding having va_args as part of the ABI,</li> +<li>exception handling,</li> +<li>support for creating custom types.</li> +</ul> +Quite a few things got worked on too:<br> +<ul class="simple"> +<li>implemented support for writing methods that take keyword arguments with<br> +HPy_METH_KEYWORDS,</li> +<li>implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,</li> +<li>started implementing support for adding custom types,</li> +<li>started implementing dumping JSON objects in ultrajson-hpy,</li> +<li>refactored the PyPy GIL to improve the interaction between HPy and<br> +PyPy's cpyext,</li> +<li>experimented with adding HPy support to rust-cpython.</li> +</ul> +And there was some discussion of the next steps of the HPy 
initiative<br> +including writing documentation, setting up websites and funding, and<br> +possibly organising another HPy gathering later in the year.<br> +<br> +<h2> +PyPy</h2> +<ul class="simple"> +<li>Georges gave a presentation on the Heptapod topic and branch workflows<br> +and showed everyone how to use hg-evolve.</li> +<li>Work was done on improving the PyPy CI buildbot post the move to<br> +heptapod, including a light-weight pre-merge CI and restricting<br> +when the full CI is run to only branch commits.</li> +<li>A lot of work was done improving the -D tests. </li> +</ul> +<br> +<h2> +Miscellaneous</h2> +<ul class="simple"> +<li>Armin demoed VRSketch and NaN Industries in VR, including an implementation<br> +of the Game of Life within NaN Industries!</li> +<li>Skiing!</li> +</ul> +<br> +<h2> +Aftermath</h2> +Immediately after the sprint large parts of Europe and the world were<br> +hit by the COVID-19 epidemic. It was good to spend time together before<br> +travelling ceased to be a sensible idea and many gatherings were cancelled.<br> +<br> +Keep safe out there everyone.<br> +<br> +The HPy &amp; PyPy Team &amp; Friends<br> +<br> +<i>In joke for those who attended the sprint: Please don't replace this blog post<br> +with its Swedish translation (or indeed a translation to any other language :).</i>cpyextCPythonGraalPythonHeptapodhpypypypypy3https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlTue, 17 Mar 2020 21:57:00 GMT \ No newline at end of file diff --git a/categories/guestpost.html b/categories/guestpost.html new file mode 100644 index 000000000..c97c8f16a --- /dev/null +++ b/categories/guestpost.html @@ -0,0 +1,114 @@ + + + + + +Posts about guestpost | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/guestpost.xml b/categories/guestpost.xml new file mode 100644 index 000000000..11924145f --- /dev/null +++ b/categories/guestpost.xml @@ -0,0 +1,104 @@ + +PyPy (Posts about guestpost)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssGuest Post: How PortaOne uses PyPy for high-performance processing, connecting over 1B of phone calls every monthhttps://www.pypy.org/posts/2024/08/portaone.htmlThe PyPy Team<p>The PyPy project is always happy to hear about industrial use and deployments +of PyPy. For the <a href="https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.html">GC bug +finding</a> +task earlier this year, we collaborated with PortaOne and we're super happy +that Serhii Titov, head of the QA department at PortaOne, was up to writing +this guest post to describe their use and experience with the project.</p> +<hr> +<h3 id="what-does-portaone-do">What does PortaOne do?</h3> +<p>We at <a href="https://www.portaone.com/">PortaOne Inc.</a> allow telecom operators to +launch new services (or provide existing services more efficiently) using our +VoIP platform (PortaSIP) and our real-time charging system (PortaBilling), +which provides additional features for cloud PBX, such as call transfer, +queues, interactive voice response (IVR) and more. At this moment our support +team manages several thousand servers with our software installed in 100 +countries, through which over 500 telecommunication service providers connect +millions of end users every day. The unique thing about PortaOne is that we +supply the source code of our product to our customers - something unheard of +in the telecom world! 
Thus we attract "telco innovators", who use our APIs to +build around the system and the source code to create unique tweaks of +functionality, which produces amazing products.</p> +<p>At the core of PortaSIP is the middle-ware component (the proper name for it is +"B2BUA", but that probably does not say much to anyone outside of experts in +VoIP), which implements the actual handling of SIP calls, messages, etc. and +all added features (for instance, trying to send a call via telco operators +through which the cost per minute is lower). It has to be fast (since even a +small delay in establishing a call is noticed by a customer), reliable +(everyone hates when a call drops or cannot be completed) and yet easily +expandable with new functionality. This is why we decided to use Python as +opposed to C/C++ or similar programming languages, which are often used in +telecom equipment.</p> +<p>The B2BUA component is a batch of similar Python processes that are looped +inside a +<a href="https://docs.python.org/3.10/library/asyncore.html"><code>asyncore.dispatcher</code></a> +wrapper. The load balancing between these Python processes is done by our +stateless SIP proxy server written in C++. All our sockets are served by this +B2BUA. We have our custom client-wrappers around <code>pymysql</code>, <code>redis</code>, +<code>cassandra-driver</code> and <code>requests</code> to communicate with external services. Some +of the Python processes use <a href="https://cffi.readthedocs.io/en/stable/"><code>cffi</code></a> +wrappers around C-code to improve their performance (examples: an Oracle DB +driver, a client to a radius server, a custom C logger).</p> +<p>The I/O operations that block the main thread of the Python processes are +processed in sub-threads. We have custom wrappers around <code>threading.Thread</code> +and also <code>asyncore.dispatcher</code>. 
The results of such operations are returned to +the main thread.</p> +<h3 id="improving-our-performance-with-pypy">Improving our performance with PyPy</h3> +<p>We started with CPython and then in 2014 switched to PyPy because it was +faster. Here's an exact quote from our first testing notes: "PyPy gives +significant performance boost, ~50%". Nowadays, after years of changes in all +the software involved, PyPy still gives us +50% boost compared to CPython.</p> +<p>Taking care of real time traffic for so many people around the globe is +something we're really proud of. I hope the PyPy team can be proud of it as +well, as the PyPy product is a part of this solution.</p> +<h3 id="finding-a-garbage-collector-bug-stage-1-the-gc-hooks">Finding a garbage collector bug: stage 1, the GC hooks</h3> +<p>However our path with PyPy wasn't perfectly smooth. There were very rare cases +of crashes on PyPy that we weren't able to catch. That's because to make +coredump useful we needed to switch to PyPy with debug, but we cannot let it +run in that mode on a production system for an extended period of time, and we +did not have any STR (steps-to-reproduce) to make PyPy crash again in our lab. +That's why we kept (and still keep) both interpreters installed just in case, +and we would switch to CPython if we noticed it happening.</p> +<p>At the time of updating PyPy from 3.5 to 3.6 our QA started noticing those +crashes more often, but we still had no luck with STR or collecting proper +coredumps with debug symbols. Then it became even worse after our development +played with the <a href="https://doc.pypy.org/en/latest/gc_info.html">Garbage Collector's +options</a> to increase performance +of our middleware component. The crashes started to affect our regular +performance testing (controlled by QA manager Yevhenii Bovda). 
At that point it +was decided that we can no longer live like that and so we started an intense +investigation.</p> +<p>During the first stage of our investigation (following the best practice of +troubleshooting) we narrowed down the issue as much as we could. So, it was not +our code, it was definitely somewhere in PyPy. Eventually our SIP software +engineer <a href="https://github.com/Yevhenii-Yatchenko">Yevhenii Yatchenko</a> found out +that this bug is connected with the use of our <a href="https://doc.pypy.org/en/latest/gc_info.html#gc-hooks">custom hooks in the +GC</a>. Yevhenii created +ticket <a href="https://github.com/pypy/pypy/issues/4899">#4899</a> and within 2-3 days we +got a fix from a <a href="https://github.com/cfbolz">member of the PyPy team</a>, in true open-source fashion.</p> +<h3 id="finding-a-garbage-collector-bug-stage-2-the-real-bug">Finding a garbage collector bug: stage 2, the real bug</h3> +<p>Then came stage 2. In parallel with the previous ticket, Yevhenii created +<a href="https://github.com/pypy/pypy/issues/4900">#4900</a> that we still see failing +with coredumps quite often, and they are not connected to GC custom hooks. In a +nutshell, it took us dozens of back and forward emails, three Zoom sessions and +four versions of a patch to solve the issue. During the last iteration we got a +new set of options to try and a new version of the patch. Surprisingly, that +helped! What a relief! So, the next logical step was to remove all debug +options and run PyPy only with the patch. Unfortunately, it started to fail +again and we came to the obvious conclusion that what will help us is not a +patch, but one of the options we were testing out. At that point we found out that +<a href="https://doc.pypy.org/en/latest/gc_info.html#environment-variables"><code>PYPY_GC_MAX_PINNED=0</code></a> +is a necessary and sufficient condition to solve our issue. 
This points to +another bug in the garbage collector, somehow related to object pinning.</p> +<p>Here's our current state: we have to add <code>PYPY_GC_MAX_PINNED=0</code>, but we do not +face the crashes anymore.</p> +<h3 id="conclusion-and-next-steps">Conclusion and next steps</h3> +<p>Gratitude is extended to Carl for his invaluable assistance in resolving the +nasty bugs, because it seems we're the only ones who suffered from the last +one and we really did not want to fall back to CPython due to its performance +disadvantage.</p> +<p>Serhii Titov, head of the QA department at PortaOne Inc.</p> +<p>P.S. If you are a perfectionist and at this point you have mixed feelings and +you are still bothered by the question "But there might still be a bug in the +GC, what about that?" - Carl has some ideas about it and he will sort it out +(we will help with the testing/verification part).</p>casestudyguestposthttps://www.pypy.org/posts/2024/08/portaone.htmlThu, 29 Aug 2024 09:00:00 GMT \ No newline at end of file diff --git a/categories/heptapod.html b/categories/heptapod.html new file mode 100644 index 000000000..bf8d10a09 --- /dev/null +++ b/categories/heptapod.html @@ -0,0 +1,114 @@ + + + + + +Posts about Heptapod | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/heptapod.xml b/categories/heptapod.xml new file mode 100644 index 000000000..4748a8173 --- /dev/null +++ b/categories/heptapod.xml @@ -0,0 +1,84 @@ + +PyPy (Posts about Heptapod)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssLeysin 2020 Sprint Reporthttps://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlhodgestar<p>At the end of February ten of us gathered in Leysin, Switzerland to work on<br> +a variety of topics including <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3.7">PyPy Python 3.7</a> support and the PyPy<br> +migration to <a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>.<br> +<br> +</p><div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s1600/2020_leysin_sprint_attendees.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="180" src="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s320/2020_leysin_sprint_attendees.jpg" width="320"></a></div> +<br> +We had a fun and productive week. The snow was beautiful. There was skiing<br> +and lunch at the top of <a class="reference external" href="https://en.wikipedia.org/wiki/Berneuse">Berneuse</a>, cooking together, some late nights at<br> +the pub next door, some even later nights coding, and of course the<br> +obligatory cheese fondue outing.<br> +<br> +There were a few of us participating in a PyPy sprint for the first time<br> +and a few familiar faces who had attended many sprints. 
Many different<br> +projects were represented including PyPy, <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a>,<br> +<a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>, and <a class="reference external" href="https://github.com/dgrunwald/rust-cpython">rust-cpython</a>. The atmosphere was relaxed and welcoming, so if<br> +you're thinking of attending the next one -- please do!<br> +<br> +Topics worked on:<br> +<br> +<h2> +HPy</h2> +HPy is a new project to design and implement a better API for extending<br> +Python in C. If you're unfamiliar with it you can read more about it at<br> +<a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>.<br> +<br> +A lot of attention was devoted to the Big HPy Design Discussion which<br> +took up two full mornings. So much was decided that this will likely<br> +get its own detailed write-up, but bigger topics included:<br> +<ul class="simple"> +<li>the HPy GetAttr, SetAttr, GetItem and SetItem methods,</li> +<li>HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions<br> +that pass void* pointers to callbacks,</li> +<li>avoiding having va_args as part of the ABI,</li> +<li>exception handling,</li> +<li>support for creating custom types.</li> +</ul> +Quite a few things got worked on too:<br> +<ul class="simple"> +<li>implemented support for writing methods that take keyword arguments with<br> +HPy_METH_KEYWORDS,</li> +<li>implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,</li> +<li>started implementing support for adding custom types,</li> +<li>started implementing dumping JSON objects in ultrajson-hpy,</li> +<li>refactored the PyPy GIL to improve the interaction between HPy and<br> +PyPy's cpyext,</li> +<li>experimented with adding HPy support to rust-cpython.</li> +</ul> +And there was some discussion of the next steps of the HPy 
initiative<br> +including writing documentation, setting up websites and funding, and<br> +possibly organising another HPy gathering later in the year.<br> +<br> +<h2> +PyPy</h2> +<ul class="simple"> +<li>Georges gave a presentation on the Heptapod topic and branch workflows<br> +and showed everyone how to use hg-evolve.</li> +<li>Work was done on improving the PyPy CI buildbot post the move to<br> +heptapod, including a light-weight pre-merge CI and restricting<br> +when the full CI is run to only branch commits.</li> +<li>A lot of work was done improving the -D tests. </li> +</ul> +<br> +<h2> +Miscellaneous</h2> +<ul class="simple"> +<li>Armin demoed VRSketch and NaN Industries in VR, including an implementation<br> +of the Game of Life within NaN Industries!</li> +<li>Skiing!</li> +</ul> +<br> +<h2> +Aftermath</h2> +Immediately after the sprint large parts of Europe and the world were<br> +hit by the COVID-19 epidemic. It was good to spend time together before<br> +travelling ceased to be a sensible idea and many gatherings were cancelled.<br> +<br> +Keep safe out there everyone.<br> +<br> +The HPy &amp; PyPy Team &amp; Friends<br> +<br> +<i>In joke for those who attended the sprint: Please don't replace this blog post<br> +with its Swedish translation (or indeed a translation to any other language :).</i>cpyextCPythonGraalPythonHeptapodhpypypypypy3https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlTue, 17 Mar 2020 21:57:00 GMT \ No newline at end of file diff --git a/categories/hpy.html b/categories/hpy.html new file mode 100644 index 000000000..f951a9701 --- /dev/null +++ b/categories/hpy.html @@ -0,0 +1,114 @@ + + + + + +Posts about hpy | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/hpy.xml b/categories/hpy.xml new file mode 100644 index 000000000..c9528f1d6 --- /dev/null +++ b/categories/hpy.xml @@ -0,0 +1,84 @@ + +PyPy (Posts about hpy)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssLeysin 2020 Sprint Reporthttps://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlhodgestar<p>At the end of February ten of us gathered in Leysin, Switzerland to work on<br> +a variety of topics including <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3.7">PyPy Python 3.7</a> support and the PyPy<br> +migration to <a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>.<br> +<br> +</p><div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s1600/2020_leysin_sprint_attendees.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="180" src="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s320/2020_leysin_sprint_attendees.jpg" width="320"></a></div> +<br> +We had a fun and productive week. The snow was beautiful. There was skiing<br> +and lunch at the top of <a class="reference external" href="https://en.wikipedia.org/wiki/Berneuse">Berneuse</a>, cooking together, some late nights at<br> +the pub next door, some even later nights coding, and of course the<br> +obligatory cheese fondue outing.<br> +<br> +There were a few of us participating in a PyPy sprint for the first time<br> +and a few familiar faces who had attended many sprints. 
Many different<br> +projects were represented including PyPy, <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a>,<br> +<a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>, and <a class="reference external" href="https://github.com/dgrunwald/rust-cpython">rust-cpython</a>. The atmosphere was relaxed and welcoming, so if<br> +you're thinking of attending the next one -- please do!<br> +<br> +Topics worked on:<br> +<br> +<h2> +HPy</h2> +HPy is a new project to design and implement a better API for extending<br> +Python in C. If you're unfamiliar with it you can read more about it at<br> +<a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>.<br> +<br> +A lot of attention was devoted to the Big HPy Design Discussion which<br> +took up two full mornings. So much was decided that this will likely<br> +get its own detailed write-up, but bigger topics included:<br> +<ul class="simple"> +<li>the HPy GetAttr, SetAttr, GetItem and SetItem methods,</li> +<li>HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions<br> +that pass void* pointers to callbacks,</li> +<li>avoiding having va_args as part of the ABI,</li> +<li>exception handling,</li> +<li>support for creating custom types.</li> +</ul> +Quite a few things got worked on too:<br> +<ul class="simple"> +<li>implemented support for writing methods that take keyword arguments with<br> +HPy_METH_KEYWORDS,</li> +<li>implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,</li> +<li>started implementing support for adding custom types,</li> +<li>started implementing dumping JSON objects in ultrajson-hpy,</li> +<li>refactored the PyPy GIL to improve the interaction between HPy and<br> +PyPy's cpyext,</li> +<li>experimented with adding HPy support to rust-cpython.</li> +</ul> +And there was some discussion of the next steps of the HPy 
initiative<br> +including writing documentation, setting up websites and funding, and<br> +possibly organising another HPy gathering later in the year.<br> +<br> +<h2> +PyPy</h2> +<ul class="simple"> +<li>Georges gave a presentation on the Heptapod topic and branch workflows<br> +and showed everyone how to use hg-evolve.</li> +<li>Work was done on improving the PyPy CI buildbot post the move to<br> +heptapod, including a light-weight pre-merge CI and restricting<br> +when the full CI is run to only branch commits.</li> +<li>A lot of work was done improving the -D tests. </li> +</ul> +<br> +<h2> +Miscellaneous</h2> +<ul class="simple"> +<li>Armin demoed VRSketch and NaN Industries in VR, including an implementation<br> +of the Game of Life within NaN Industries!</li> +<li>Skiing!</li> +</ul> +<br> +<h2> +Aftermath</h2> +Immediately after the sprint large parts of Europe and the world were<br> +hit by the COVID-19 epidemic. It was good to spend time together before<br> +travelling ceased to be a sensible idea and many gatherings were cancelled.<br> +<br> +Keep safe out there everyone.<br> +<br> +The HPy &amp; PyPy Team &amp; Friends<br> +<br> +<i>In joke for those who attended the sprint: Please don't replace this blog post<br> +with its Swedish translation (or indeed a translation to any other language :).</i>cpyextCPythonGraalPythonHeptapodhpypypypypy3https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlTue, 17 Mar 2020 21:57:00 GMT \ No newline at end of file diff --git a/categories/index.html b/categories/index.html new file mode 100644 index 000000000..20314a3f7 --- /dev/null +++ b/categories/index.html @@ -0,0 +1,151 @@ + + + + + + +Tags | PyPy + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/jit.html b/categories/jit.html new file mode 100644 index 000000000..7961c66ff --- /dev/null +++ b/categories/jit.html @@ -0,0 +1,177 @@ + + + + + +Posts about jit | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Posts about jit

                + + +
                +
                + + \ No newline at end of file diff --git a/categories/jit.xml b/categories/jit.xml new file mode 100644 index 000000000..8e2475bc7 --- /dev/null +++ b/categories/jit.xml @@ -0,0 +1,2245 @@ + +PyPy (Posts about jit)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssMining JIT traces for missing optimizations with Z3https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.htmlCF Bolz-Tereick<p>In my last post I've described <a href="https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html">how to use Z3 to find simple local peephole +optimization patterns</a> +for the integer operations in PyPy's JIT. An example is <code>int_and(x, 0) -&gt; +0</code>. In this post I want to scale up the problem of identifying possible +optimizations to much bigger instruction sequences, also using Z3. For that, I +am starting with the JIT traces of <strong>real benchmarks</strong>, after they have been +optimized by the optimizer of PyPy's JIT. Then we can ask Z3 to find +inefficient integer operations in those traces.</p> +<p>Starting from the optimized traces of real programs has some big +advantages over the "classical" superoptimization approach of generating and +then trying all possible sequences of instructions. It avoids the +combinatorial explosion that happens with the latter approach. Also, starting +from the traces of benchmarks or (even better) actual programs makes sure that +we actually care about the missing optimizations +that are found in this way. 
And because the traces are analyzed after they have +been optimized by PyPy's optimizer, we only get reports for <em>missing</em> +optimizations, that the JIT isn't able to do (yet).</p> +<p>The techniques and experiments I describe in this post are again the result of +a bunch of discussions with John Regehr at a conference a few weeks ago, as +well as reading his blog posts and papers. Thanks John! Also thanks to <a href="https://bernsteinbear.com/">Max +Bernstein</a> for super helpful feedback on the drafts +of this blog post (and for poking me to write things in general).</p> +<h3 id="high-level-approach">High-Level Approach</h3> +<p>The approach that I took works as follows:</p> +<ul> +<li>Run benchmarks or other interesting programs and then dump the IR of the JIT + traces into a file. The traces have at that point been already optimized by + the PyPy JIT's optimizer.</li> +<li>For every trace, ignore all the operations on non-integer variables.</li> +<li>Translate every integer operation into a Z3 formula.</li> +<li>For every operation, use Z3 to find out whether the operation is redundant + (how that is done is described below).</li> +<li>If the operation is redundant, the trace is less efficient than it could have + been, because the optimizer could also have removed the operation. Report the + inefficiency.</li> +<li>Minimize the inefficient programs by removing as many operations as possible + to make the problem easier to understand.</li> +</ul> +<p>In the post I will describe the details and show some pseudocode of the +approach. 
I'll also make the proper code public eventually (but it needs a +healthy dose of cleanups first).</p> +<h3 id="dumping-pypy-traces">Dumping PyPy Traces</h3> +<p>PyPy will write its JIT traces into the file <code>out</code> if the environment variable +<a href="https://doc.pypy.org/en/latest/man/pypy.1.html"><code>PYPYLOG</code></a> is set as follows:</p> +<div class="code"><pre class="code literal-block">PYPYLOG=jit-log-opt:out pypy &lt;program.py&gt; +</pre></div> + +<p>This environment variable works for PyPy, but also for other virtual machines +built with RPython.</p> +<p>(This is really a side point for the rest of the blog post, but since the +question came up I wanted to clarify it: Operations on integers in the Python +program that the JIT is running don't all correspond 1-to-1 with the <code>int_...</code> +operations in the traces. The <code>int_...</code> trace operations always operate on +machine words. The Python <code>int</code> type supports arbitrarily large integers. PyPy +will optimistically try to lower the operations on Python integers into machine +word operations, but adds the necessary guards into the trace to make sure that +overflow outside of the range of machine words is caught. In case one of these +guards fails the interpreter switches to a big integer heap-allocated +representation.)</p> +<h3 id="encoding-traces-as-z3-formulas">Encoding Traces as Z3 formulas</h3> +<p>The last blog post already contained the code to encode the results of +individual trace operations into Z3 formulas, so we don't need to repeat that +here. 
To encode traces of operations we introduce a Z3 variable for every +operation in the trace and then call the <code>z3_expression</code> function for every +single one of the operations in the trace.</p> +<p>For example, for the following trace:</p> +<div class="code"><pre class="code literal-block"><span class="k">[i1]</span> +<span class="na">i2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">uint_rshift(i1, 32)</span> +<span class="na">i3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_and(i2, 65535)</span> +<span class="na">i4</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">uint_rshift(i1, 48)</span> +<span class="na">i5</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_lshift(i4, 16)</span> +<span class="na">i6</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_or(i5, i3)</span> +<span class="na">jump(i6, i2) # equal</span> +</pre></div> + +<p>We would get the Z3 formula:</p> +<div class="code"><pre class="code literal-block">z3.And(i2 == LShR(i1, 32), + i3 == i2 &amp; 65535, + i4 == LShR(i1, 48), + i5 == i4 &lt;&lt; 16) +</pre></div> + +<p>Usually we won't ask for the formula of the whole trace at once. Instead we go +through the trace operation by operation and try to find inefficiencies in the +current one we are looking at. 
Roughly like this (pseudo-)code:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">newvar</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">var_to_z3var</span> <span class="o">=</span> <span class="p">{}</span> + <span class="k">for</span> <span class="n">input_argument</span> <span class="ow">in</span> <span class="n">trace</span><span class="o">.</span><span class="n">inputargs</span><span class="p">:</span> + <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">input_argument</span><span class="p">]</span> <span class="o">=</span> <span class="n">newz3var</span><span class="p">(</span><span class="n">input_argument</span><span class="p">)</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">z3resultvar</span> <span class="o">=</span> <span class="n">newz3var</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">resultvarname</span><span class="p">)</span> + <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span 
class="mi">0</span><span class="p">]</span> + <span class="n">z3arg0</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">arg0</span><span class="p">]</span> + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span> + <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> + <span class="n">z3arg1</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">arg1</span><span class="p">]</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">z3arg1</span> <span class="o">=</span> <span class="kc">None</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># checking for inefficiencies, see the next sections</span> + <span class="o">...</span> + <span class="k">if</span> <span class="o">...</span><span class="p">:</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span> + + <span class="c1"># not inefficient, assert op into the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span 
class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="identifying-constant-booleans-with-z3">Identifying constant booleans with Z3</h3> +<p>To get started finding inefficiencies in a trace, we can +first focus on boolean variables. For every operation in the trace that +returns a bool we can ask Z3 to prove that this variable must be always True or +always False. Most of the time, neither of these proofs will succeed. But if Z3 +manages to prove one of them, we know we have found an inefficiency: instead of +computing the boolean result (eg by executing a comparison) the JIT's optimizer +could have replaced the operation with the corresponding boolean constant.</p> +<p>Here's an example of an inefficiency found that way: if <code>x &lt; y</code> and <code>y &lt; z</code> are +both true, PyPy's JIT could conclude that <code>x &lt; z</code> must also +be true. However, currently the JIT cannot make that conclusion because it +only reasons about the concrete ranges (lower and upper bounds) for every +integer variable, but it has no way to remember anything about relationships +between different variables. This kind of reasoning would quite often be useful +to remove list/string bounds checks. Here's a <a href="https://www.youtube.com/watch?app=desktop&amp;v=1hm5ZVmBEvo">talk about how LLVM does +this</a> (but it might be +too heavyweight for a JIT setting).</p> +<p>Here are some more examples found that way:</p> +<ul> +<li><code>x - 1 == x</code> is always False</li> +<li><code>x - (x == -1) == -1</code> is always False. 
The pattern <code>x - (x == -1)</code> happens a + lot in PyPy's hash computations: To be compatible with the CPython hashes we + need to make sure that no object's hash is -1 (CPython uses -1 as an error + value on the C level).</li> +</ul> +<p>Here's pseudo-code for how to implement checking boolean operations for +inefficiencies:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">has_boolean_result</span><span class="p">():</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">res</span> <span class="o">==</span> <span class="mi">0</span><span class="p">):</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">res</span> <span class="o">==</span> <span class="mi">1</span><span class="p">):</span> + <span class="k">return</span> 
<span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="mi">1</span> + <span class="c1"># checking for other inefficiencies, see the next sections</span> + <span class="o">...</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="identifying-redundant-operations">Identifying redundant operations</h3> +<p>A more interesting class of redundancy is to try to find two operations in a +trace that compute the same result. We can do that by asking Z3 to prove for +each pair of different operations in the trace that the result is +always the same. If a previous operation returns the same result, the JIT could +have re-used that result instead of re-computing it, saving time. Doing this +search for equivalent operations with Z3 is quadratic in the number of +operations, but since traces have a maximum length it is not too bad in +practice.</p> +<p>This is the real workhorse of my script so far, it's what finds most of the +inefficiencies. Here's a few examples:</p> +<ul> +<li>The very first and super useful example the script found is <code>int_eq(b, 1) == + b</code> if <code>b</code> is known to be a boolean (ie an integer 0 or 1). I have already + implemented this optimization in the JIT.</li> +<li>Similarly, <code>int_and(b, 1) == b</code> for booleans.</li> +<li><code>(x &lt;&lt; 4) &amp; -0xf == x &lt;&lt; 4</code></li> +<li><code>(((x &gt;&gt; 63) &lt;&lt; 1) &lt;&lt; 2) &gt;&gt; 3 == x &gt;&gt; 63</code>. 
In general the JIT is quite bad at + optimizing repeated shifts (the infrastructure for doing better with that is + already in place, so this will be a relatively easy fix).</li> +<li><code>(x &amp; 0xffffffff) | ((x &gt;&gt; 32) &lt;&lt; 32) == x</code>. Having the JIT optimize this + would maybe require first recognizing that <code>(x &gt;&gt; 32) &lt;&lt; 32</code> can be expressed + as a mask: <code>(x &amp; 0xffffffff00000000)</code>, and then using <code>(x &amp; c1) | (x &amp; c2) == + x &amp; (c1 | c2)</code></li> +<li>A commonly occurring pattern is variations of this one: + <code>((x &amp; 1345) ^ 2048) - 2048 == x &amp; 1345</code> (with different constants, of + course). xor is add without carry, and <code>x &amp; 1345</code> does not have the bit + <code>2048</code> set. Therefore the <code>^ 2048</code> is equivalent to <code>+ 2048</code>, which the <code>- + 2048</code> cancels. More generally, if <code>a &amp; b == 0</code>, then <code>a + b == a | b == a ^ b</code>. + I don't understand at all why this appears so often in the traces, but I + see variations of it a lot. 
LLVM can optimize this, but <a href="https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115829">GCC + can't</a>, thanks to + <a href="https://hachyderm.io/@pinskia/112752641328799157">Andrew Pinski for filing the + bug</a>!</li> +</ul> +<p>And here's some implementation pseudo-code again:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="o">...</span> + <span class="c1"># searching for redundant operations</span> + <span class="k">for</span> <span class="n">previous_op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="k">if</span> <span class="n">previous_op</span> <span class="ow">is</span> <span class="n">op</span><span class="p">:</span> + <span class="k">break</span> <span class="c1"># done, reached the current op</span> + <span class="n">previous_op_z3var</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">previous_op</span><span class="p">]</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">previous_op_z3var</span> <span class="o">==</span> <span 
class="n">res</span><span class="p">):</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">previous_op</span> + <span class="o">...</span> + <span class="c1"># more code here later</span> + <span class="o">...</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="synthesizing-more-complicated-constants-with-exists-forall">Synthesizing more complicated constants with exists-forall</h3> +<p>To find out whether some integer operations always return a constant result, we +can't simply use the same trick as for those operations that return boolean +results, because enumerating 2⁶⁴ possible constants and checking them all +would take too long. Like in the last post, we can use <code>z3.ForAll</code> to find out +whether Z3 can synthesize a constant for the result of an operation for us. +If such a constant exists, the JIT could have removed the operation, +and replaced it with the constant that Z3 provides.</p> +<p>Here are a few examples of inefficiencies found this way:</p> +<ul> +<li><code>(x ^ 1) ^ x == 1</code> (or, more generally: <code>(x ^ y) ^ x == y</code>)</li> +<li>if <code>x | y == 0</code>, it follows that <code>x == 0</code> and <code>y == 0</code></li> +<li>if <code>x != MAXINT</code>, then <code>x + 1 &gt; x</code></li> +</ul> +<p>Implementing this is actually slightly annoying. The <code>solver.add</code> calls for +non-inefficient ops add assertions to the solver, which are now confusing the +<code>z3.ForAll</code> query. 
We could remove all assertions from the solver, then do the +<code>ForAll</code> query, then add the assertions back. What I ended up doing instead was +instantiating a second solver object that I'm using for the <code>ForAll</code> queries, +that remains empty the whole time.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">empty_solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">var_to_z3var</span> <span class="o">=</span> <span class="p">{}</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="o">...</span> + <span class="c1"># searching for redundant operations</span> + <span class="o">...</span> + <span class="c1"># checking for constant results</span> + <span class="n">constvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'find_const'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span 
class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="n">var_to_z3var</span><span class="o">.</span><span class="n">values</span><span class="p">(),</span> + <span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span> + <span class="o">*</span><span class="n">solver</span><span class="o">.</span><span class="n">assertions</span><span class="p">(),</span> + <span class="n">expr</span> <span class="o">==</span> <span class="n">constvar</span> + <span class="p">)</span> + <span class="p">)</span> + <span class="k">if</span> <span class="n">empty_solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">empty_solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">const</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> 
+</pre></div> + +<h3 id="minimization">Minimization</h3> +<p>Analyzing an inefficiency by hand in the context of a larger trace is quite +tedious. Therefore I've implemented a (super inefficient) script to try to make +the examples smaller. Here's how that works:</p> +<ul> +<li>First throw out all the operations that occur <em>after</em> the inefficient operation + in the trace.</li> +<li>Then we remove all "dead" operations, ie operations that don't have their + results used (all the operations that we can analyze with Z3 are without side + effects).</li> +<li>Now we try to remove every guard in the trace one by one and check + afterwards, whether the resulting trace still has an inefficiency.</li> +<li>We also try to replace every single operation with a new argument to the + trace, to see whether the inefficiency is still present.</li> +</ul> +<p>The minimization process is sort of inefficient and I should probably be using + <a href="https://github.com/DRMacIver/shrinkray">shrinkray</a> or + <a href="https://github.com/csmith-project/creduce">C-Reduce</a> instead. However, it + seems to work well in practice and the runtime isn't too bad.</p> +<h3 id="results">Results</h3> +<p>So far I am using the JIT traces of three programs: 1) Booting Linux on the +<a href="https://docs.pydrofoil.org">Pydrofoil</a> RISC-V emulator, 2) booting Linux on the Pydrofoil ARM emulator, and 3) +running the PyPy bootstrap process on top of PyPy.</p> +<p>I picked these programs because most Python programs don't contain interesting +amounts of integer operations, and the traces of the emulators +contain a lot of them. I also used the bootstrap process because I still wanted +to try a big Python program and personally care about the runtime of this +program a lot.</p> +<p>The script identifies 94 +inefficiencies in the traces, a lot of them come from repeating +patterns. 
My next steps will be to manually inspect them all, categorize them, and +implement easy optimizations identified that way. I also want a way to sort the +examples by execution count in the benchmarks, to get a feeling for which of +them are most important.</p> +<p>I didn't investigate the full set of <a href="https://speed.pypy.org">Python +benchmarks</a> that PyPy uses yet, because I don't expect +them to contain interesting amounts of integer operations, but maybe I am wrong +about that? Will have to try eventually.</p> +<h3 id="conclusion">Conclusion</h3> +<p>This was again much easier to do than I would have expected! Given that I had +the translation of trace ops to Z3 already in place, it was a matter of about a +day of programming to use this infrastructure to find the first problems and +minimize them.</p> +<p>Reusing the results of existing operations or replacing operations by constants +can be seen as "zero-instruction superoptimization". I'll probably be rather +busy for a while to add the missing optimizations identified by my simple +script. 
But later extensions to actually synthesize one or several operations +in the attempt to optimize the traces more and find more opportunities should +be possible.</p> +<p>Finding inefficiencies in traces with Z3 is significantly less +annoying and also less error-prone than just manually inspecting traces and +trying to spot optimization opportunities.</p> +<h3 id="random-notes-and-sources">Random Notes and Sources</h3> +<p>Again, John's blog posts:</p> +<ul> +<li><a href="https://blog.regehr.org/archives/1109">Let’s Work on an LLVM Superoptimizer</a></li> +<li><a href="https://blog.regehr.org/archives/1146">Early Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1252">A Few Synthesizing Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1636">Synthesizing Constants</a></li> +</ul> +<p>and papers:</p> +<ul> +<li><a href="https://arxiv.org/pdf/1711.04422">A Synthesizing Superoptimizer</a></li> +<li><a href="https://dl.acm.org/doi/pdf/10.1145/3649837">Hydra: Generalizing Peephole Optimizations with Program Synthesis</a></li> +</ul> +<p>I remembered recently that I had seen the approach of optimizing the traces of +a tracing JIT with Z3 a long time ago, as part of the (now long dead, I think) +<a href="https://web.archive.org/web/20160304055149/http://research.microsoft.com/en-us/projects/spur/">SPUR +project</a>. +There's a <a href="https://web.archive.org/web/20161029162737/http://csl.stanford.edu/~christos/pldi2010.fit/tillmann.provers4jit.pdf">workshop +paper</a> +from 2010 about this. SPUR was trying to use Z3 built into the actual JIT (as +opposed to using Z3 only to find places where the regular optimizers could be +improved). In addition to bitvectors, SPUR also used the Z3 support for arrays +to model the C# heap and remove redundant stores. 
This is still another future +extension for all the Z3 work I've been doing in the context of the PyPy JIT.</p>jitz3https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.htmlFri, 19 Jul 2024 17:01:09 GMTFinding Simple Rewrite Rules for the JIT with Z3https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.htmlCF Bolz-Tereick<p>In June I was at the <a href="https://pldi24.sigplan.org/">PLDI conference</a> in +Copenhagen to present a <a href="https://dl.acm.org/doi/10.1145/3652588.3663316">paper</a> +I co-authored with <a href="https://bernsteinbear.com/">Max Bernstein</a>. I also finally +met <a href="https://blog.regehr.org/">John Regehr</a>, who I'd been talking to on social +media for ages but had never met. John has been working on compiler correctness +and better techniques for building compilers and optimizers for a very long +time. The blog post <a href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html">Finding JIT Optimizer Bugs using SMT Solvers and +Fuzzing</a> +was heavily inspired by this work. We talked a lot about his and his group's +work on using Z3 for +<a href="https://en.wikipedia.org/wiki/Superoptimization">superoptimization</a> and for +finding missing optimizations. I have applied some of the things John told me +about to the traces of PyPy's JIT, and wanted to blog about that. However, my +draft felt quite hard to understand. Therefore I have now written this current +post, to at least try to provide a somewhat gentler on-ramp to the topic.</p> +<p>In <em>this</em> post we will use the Python-API to Z3 to find local peephole rewrite +rules for the operations in the intermediate representation of PyPy's tracing +JIT. The code for this is simple enough that we can go through all of it.</p> +<p>The PyPy JIT produces traces of machine level instructions, which are optimized +and then turned into machine code. The optimizer uses a number of approaches to +make the traces more efficient. 
For integer operations it applies a number of +arithmetic simplification rules, for example <code>int_add(x, 0) -&gt; x</code>. When +implementing these rules in the JIT there are <strong>two problems</strong>: How do we know +that the rules are correct? And how do we know that we haven't forgotten any +rules? We'll try to answer both of these, but the first one in particular.</p> +<p>We'll be using Z3, a satisfiability modulo theories (SMT) solver which has good +bitvector support and most importantly an excellent Python API. We can use the +solver to reason about bitvectors, which are how we will model machine +integers.</p> +<p>To find rewrite rules, we will consider the binary operations (i.e. those +taking two arguments) in PyPy traces that take and produce integers. The +completely general form <code>op(x, y)</code> is not simplifiable on its own. But if +either <code>x == y</code> +or if one of the arguments is a constant, we can potentially simplify the +operation into a simpler form. The results are either the variable <code>x</code>, or a +(potentially different) constant. We'll ignore constant-folding where both +arguments of the binary operation are constants. The possible results for a +simplifiable binary operation are the variable <code>x</code> or another constant. 
This +leaves the following patterns as possibilities:</p> +<ul> +<li><code>op(x, x) == x</code></li> +<li><code>op(x, x) == c1</code></li> +<li><code>op(x, c1) == x</code></li> +<li><code>op(c1, x) == x</code></li> +<li><code>op(x, c1) == c2</code></li> +<li><code>op(c1, x) == c2</code></li> +</ul> +<p>Our approach will be to take every single supported binary integer operation, +instantiate all of these patterns, and try to ask Z3 whether the resulting +simplification is valid for all values of <code>x</code>.</p> +<h3 id="quick-intro-to-the-z3-python-api">Quick intro to the Z3 Python-API</h3> +<p>Here's a terminal session showing the use of the Z3 Python API:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; import z3</span> +<span class="go">&gt;&gt;&gt;&gt; # construct a Z3 bitvector variable of width 8, with name x:</span> +<span class="go">&gt;&gt;&gt;&gt; x = z3.BitVec('x', 8)</span> +<span class="go">&gt;&gt;&gt;&gt; # construct a more complicated formula by using operator overloading:</span> +<span class="go">&gt;&gt;&gt;&gt; x + x</span> +<span class="go">x + x</span> +<span class="go">&gt;&gt;&gt;&gt; x + 1</span> +<span class="go">x + 1</span> +</pre></div> + +<p>Z3 checks the "satisfiability" of a formula. This means that it tries to find +an example set of concrete values for the variables that occur in a formula, +such that the formula becomes true. 
Examples:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver = z3.Solver()</span> +<span class="go">&gt;&gt;&gt;&gt; solver.check(x * x == 3)</span> +<span class="go">unsat</span> +<span class="go">&gt;&gt;&gt;&gt; # meaning no x fulfils this property</span> +<span class="go">&gt;&gt;&gt;&gt;</span> +<span class="go">&gt;&gt;&gt;&gt; solver.check(x * x == 9)</span> +<span class="go">sat</span> +<span class="go">&gt;&gt;&gt;&gt; model = solver.model()</span> +<span class="go">&gt;&gt;&gt;&gt; model</span> +<span class="go">[x = 253]</span> +<span class="go">&gt;&gt;&gt;&gt; model[x].as_signed_long()</span> +<span class="go">-3</span> +<span class="go">&gt;&gt;&gt;&gt; # 253 is the same as -3 in two's complement arithmetic with 8 bits</span> +</pre></div> + +<p>In order to use Z3 to prove something, we can ask Z3 to find counterexamples +for the statement, meaning concrete values that would make the negation of the +statement true:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.check(z3.Not(x ^ -1 == ~x))</span> +<span class="go">unsat</span> +</pre></div> + +<p>The result <code>unsat</code> means that we just proved that <code>x ^ -1 == ~x</code> is true for +all <code>x</code>, because there is no value for <code>x</code> that makes <code>not (x ^ -1 == ~x)</code> +true (this works because -1 has all the bits set).</p> +<p>If we try to prove something incorrect in this way, the following happens:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.check(z3.Not(x ^ -1 == x))</span> +<span class="go">sat</span> +</pre></div> + +<p><code>sat</code> shows that <code>x ^ -1 == x</code> is (unsurprisingly) not always true, and we can +ask for a counterexample:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.model()</span> +<span class="go">[x = 0]</span> +</pre></div> + +<p>This way of 
proving this works because the <code>check</code> calls try to solve an +(implicit) "exists" quantifier, over all the Z3 variables used in the formula. +<code>check</code> will either return <code>z3.unsat</code>, which means that no concrete values make +the formula true; or <code>z3.sat</code>, which means that you can get some concrete +values that make the formula true by calling <code>solver.model()</code>.</p> +<p>In math terms we prove things using <code>check</code> by de-Morgan's rules for quantifiers:</p> +<p>$$ \lnot \exists x: \lnot f(x) \implies \forall x: f(x) $$</p> +<p>Now that we've seen the basics of using the Z3 API on a few small examples, +we'll use it in a bigger program.</p> +<h3 id="encoding-the-integer-operations-of-rpythons-jit-into-z3-formulas">Encoding the integer operations of RPython's JIT into Z3 formulas</h3> +<p>Now we'll use the API to reason about the integer operations of the PyPy JIT +intermediate representation (IR). The binary integer operations are:</p> +<div class="code"><pre class="code literal-block"><span class="n">opnames2</span> <span class="o">=</span> <span class="p">[</span> +<span class="s2">"int_add"</span><span class="p">,</span> +<span class="s2">"int_sub"</span><span class="p">,</span> +<span class="s2">"int_mul"</span><span class="p">,</span> +<span class="s2">"int_and"</span><span class="p">,</span> +<span class="s2">"int_or"</span><span class="p">,</span> +<span class="s2">"int_xor"</span><span class="p">,</span> +<span class="s2">"int_eq"</span><span class="p">,</span> +<span class="s2">"int_ne"</span><span class="p">,</span> +<span class="s2">"int_lt"</span><span class="p">,</span> +<span class="s2">"int_le"</span><span class="p">,</span> +<span class="s2">"int_gt"</span><span class="p">,</span> +<span class="s2">"int_ge"</span><span class="p">,</span> +<span class="s2">"uint_lt"</span><span class="p">,</span> +<span class="s2">"uint_le"</span><span class="p">,</span> +<span class="s2">"uint_gt"</span><span 
class="p">,</span> +<span class="s2">"uint_ge"</span><span class="p">,</span> +<span class="s2">"int_lshift"</span><span class="p">,</span> +<span class="s2">"int_rshift"</span><span class="p">,</span> +<span class="s2">"uint_rshift"</span><span class="p">,</span> +<span class="s2">"uint_mul_high"</span><span class="p">,</span> +<span class="s2">"int_pydiv"</span><span class="p">,</span> +<span class="s2">"int_pymod"</span><span class="p">,</span> +<span class="p">]</span> +</pre></div> + +<p>There's not much special about the integer operations. Like in LLVM, most of +them are signedness-independent: <code>int_add</code>, <code>int_sub</code>, <code>int_mul</code>, ... work +correctly for unsigned integers but also for +<a href="https://en.wikipedia.org/wiki/Two%27s_complement">two's-complement</a> signed +integers. Exceptions for that are order comparisons like <code>int_lt</code> etc. for +which we have unsigned variants <code>uint_lt</code> etc. All operations that produce a +boolean result return a full-width integer <code>0</code> or <code>1</code> (the PyPy JIT supports +only word-sized integers in its intermediate representation)</p> +<p>In order to reason about the IR operations, some ground work:</p> +<div class="code"><pre class="code literal-block"><span class="kn">import</span> <span class="nn">z3</span> + +<span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> +<span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> +<span class="n">solver</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">,</span> <span class="mi">10000</span><span class="p">)</span> <span class="c1"># milliseconds, ie 10s</span> +<span class="n">xvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span 
class="p">(</span><span class="s1">'x'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">constvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'const'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">constvar2</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'const2'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">TRUEBV</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">FALSEBV</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +</pre></div> + +<p>And here's a function to turn an integer IR operation of PyPy's JIT into Z3 +formulas:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" computes a tuple of (result, valid_if) of Z3 formulas. `result` is the</span> +<span class="sd"> formula representing the result of the operation, given argument formulas</span> +<span class="sd"> arg0 and arg1. 
`valid_if` is a pre-condition that must be true for the</span> +<span class="sd"> result to be meaningful. """</span> + <span class="n">result</span> <span class="o">=</span> <span class="kc">None</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="kc">True</span> <span class="c1"># the precondition is mostly True, with few exceptions</span> + <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&amp;</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_or"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">|</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_xor"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span 
class="n">arg0</span> <span class="o">^</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_eq"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ne"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_le"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_gt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ge"</span><span 
class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_lt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_le"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_gt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_ge"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGE</span><span class="p">(</span><span class="n">arg0</span><span 
class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&lt;&lt;</span> <span class="n">arg1</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_rshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&gt;&gt;</span> <span class="n">arg1</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_rshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">LShR</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span 
class="p">(</span><span 
class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_mul_high"</span><span class="p">:</span> + <span class="c1"># zero-extend args to 2*INTEGER_WIDTH bit, then multiply and extract</span> + <span class="c1"># highest INTEGER_WIDTH bits</span> + <span class="n">zarg0</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ZeroExt</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> + <span class="n">zarg1</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ZeroExt</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Extract</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">*</span> <span class="mi">2</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">zarg0</span> <span class="o">*</span> <span class="n">zarg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_pydiv"</span><span class="p">:</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">arg1</span> <span class="o">!=</span> <span class="mi">0</span> + <span class="n">r</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">/</span> <span class="n">arg1</span> + <span 
class="n">psubx</span> <span class="o">=</span> <span class="n">r</span> <span class="o">*</span> <span class="n">arg1</span> <span class="o">-</span> <span class="n">arg0</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">r</span> <span class="o">+</span> <span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="n">psubx</span><span class="p">,</span> <span class="o">-</span><span class="n">psubx</span><span class="p">)</span> <span class="o">&gt;&gt;</span> <span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_pymod"</span><span class="p">:</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">arg1</span> <span class="o">!=</span> <span class="mi">0</span> + <span class="n">r</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">%</span> <span class="n">arg1</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">r</span> <span class="o">+</span> <span class="p">(</span><span class="n">arg1</span> <span class="o">&amp;</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="n">r</span><span class="p">,</span> <span class="n">r</span><span class="p">)</span> <span class="o">&gt;&gt;</span> <span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> 
<span class="s2">"int_is_true"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">FALSEBV</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_zero"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">FALSEBV</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_neg"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="o">-</span><span class="n">arg0</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_invert"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="o">~</span><span class="n">arg0</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">assert</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"unknown operation "</span> <span class="o">+</span> <span class="n">opname</span> + <span class="k">return</span> <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> + +<span class="k">def</span> <span class="nf">cond</span><span class="p">(</span><span class="n">z3expr</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" helper function to turn a Z3 boolean result z3expr into a 1 or 0</span> +<span class="sd"> bitvector, using z3.If """</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> 
<span class="n">TRUEBV</span><span class="p">,</span> <span class="n">FALSEBV</span><span class="p">)</span> +</pre></div> + +<p>We map the semantics of a PyPy JIT operation to Z3 with the <code>z3_expression</code> +function. It takes the name of a JIT operation and its two (or one) arguments +into a pair of Z3 formulas, <code>result</code> and <code>valid_if</code>. The resulting formulas are +constructed with the operator overloading of Z3 variables/formulas.</p> +<p>The first element <code>result</code> of the result of <code>z3_expression</code> represents the result +of performing the operation. <code>valid_if</code> is a bool that represents a condition that +needs to be <code>True</code> in order for the result of the operation to be defined. E.g. +<code>int_pydiv(a, b)</code> is only valid if <code>b != 0</code>. Most operations are always valid, +so they return <code>True</code> as that condition (we'll ignore <code>valid_if</code> for a bit, but it +will become more relevant further down in the post).</p> +<p>We can define a helper function to prove things by finding counterexamples:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Try to prove a condition cond by searching for counterexamples of its negation. 
"""</span> + <span class="n">z3res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> + <span class="k">if</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unsat</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">True</span> + <span class="k">elif</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unknown</span><span class="p">:</span> <span class="c1"># eg on timeout</span> + <span class="k">return</span> <span class="kc">False</span> + <span class="k">elif</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">False</span> + <span class="k">assert</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"should be unreachable"</span> +</pre></div> + +<h3 id="finding-rewrite-rules">Finding rewrite rules</h3> +<p>Now we can start finding our first rewrite rules, following the first pattern +<code>op(x, x) -&gt; x</code>. 
We do this by iterating over all the supported binary +operation names, getting the z3 expression for <code>op(x, x)</code> and then asking Z3 to +prove <code>op(x, x) == x</code>.</p> +<div class="code"><pre class="code literal-block"><span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; x, </span><span class="si">{</span><span class="n">result</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +</pre></div> + +<p>This yields the simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<h3 id="synthesizing-constants">Synthesizing constants</h3> +<p>Supporting 
the next patterns is harder: <code>op(x, x) == c1</code>, <code>op(x, c1) == x</code>, and +<code>op(c1, x) == x</code>. We don't know which constants to pick to try to get Z3 to +prove the equality. We could iterate over common constants like <code>0</code>, <code>1</code>, +<code>MAXINT</code>, etc, or even over all the 256 values for a bitvector of length 8. +However, we will instead ask Z3 to find the constants for us too.</p> +<p>This can be done by using quantifiers, in this case <code>z3.ForAll</code>. The query we +pose to Z3 is "does there exist a constant <code>c1</code> such that for all <code>x</code> the +following is true: <code>op(x, c1) == x</code>?" Note that the constant <code>c1</code> is not +necessarily unique, there could be many of them. We generate several matching +constants by adding, to the condition of the second and further queries, the +requirement that the constant must differ from all previously found ones.</p> +<p>We can express this in a helper function:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_constant</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">number_of_results</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">xvar</span><span class="p">],</span> + <span class="n">z3expr</span> + <span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">number_of_results</span><span class="p">):</span> + <span class="n">checkres</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> + <span 
class="k">if</span> <span class="n">checkres</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="c1"># if a solver check succeeds, we can ask for a model, which is</span> + <span class="c1"># concrete values for the variables constvar</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">yield</span> <span class="n">const</span> + <span class="c1"># make sure we don't generate the same constant again on the</span> + <span class="c1"># next call</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">constvar</span> <span class="o">!=</span> <span class="n">const</span><span class="p">,</span> <span class="n">condition</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="c1"># no (more) constants found</span> + <span class="k">break</span> +</pre></div> + +<p>We can use this new function for the three mentioned patterns:</p> +<div class="code"><pre class="code literal-block"><span class="c1"># try to find constants for op(x, x) == c</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span 
class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">constvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +<span class="c1"># try to find constants for op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; x"</span><span class="p">)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span 
class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; x"</span><span class="p">)</span> +<span class="c1"># this code is not quite correct, we'll correct it later</span> +</pre></div> + +<p>Together this yields the following new simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="cp"># careful, these are not all correct!</span> +<span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ne</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> 
</span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span 
class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span 
class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<p>Most of these look good at first glance, but the last one reveals a problem: +we've been ignoring the <code>valid_if</code> expression up to now. 
We can stop doing that by +changing the code like this, which adds <code>z3.And(valid_if, ...)</code> to the argument of +the calls to <code>find_constant</code>:</p> +<div class="code"><pre class="code literal-block"><span class="c1"># try to find constants for op(x, x) == c, op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +<span class="c1"># try to find constants for op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span 
class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">valid_if</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; x"</span><span class="p">)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">valid_if</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; x"</span><span class="p">)</span> +</pre></div> + 
+<p>And we get this list instead:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ne</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> 
</span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span 
class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span 
class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<h3 id="synthesizing-two-constants">Synthesizing two constants</h3> +<p>For the patterns <code>op(x, c1) == c2</code> and <code>op(c1, x) == c2</code> we need to synthesize +two constants. 
We can again write a helper method for that:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_2consts</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">number_of_results</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">xvar</span><span class="p">],</span> + <span class="n">z3expr</span> + <span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">number_of_results</span><span class="p">):</span> + <span class="n">checkres</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> + <span class="k">if</span> <span class="n">checkres</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="n">const2</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar2</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">yield</span> <span class="n">const</span><span 
class="p">,</span> <span class="n">const2</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Or</span><span class="p">(</span><span class="n">constvar</span> <span class="o">!=</span> <span class="n">const</span><span class="p">,</span> <span class="n">constvar2</span> <span class="o">!=</span> <span class="n">const2</span><span class="p">),</span> <span class="n">condition</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> +</pre></div> + +<p>And then use it like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="c1"># try to find constants c1, c2 such that op(c1, x) -&gt; c2</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_2consts</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar2</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span 
class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; </span><span class="si">{</span><span class="n">const2</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="c1"># try to find constants c1, c2 such that op(x, c1) -&gt; c2</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_2consts</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar2</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2">(x, </span><span class="si">%s</span><span class="s2">) -&gt; </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span><span class="p">))</span> +</pre></div> + +<p>Which yields some straightforward simplifications:</p> +<div class="code"><pre class="code literal-block"><span 
class="n">int_mul</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_and</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +</pre></div> + +<p>A few require a bit more thinking:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_or</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span> +</pre></div> + +<p>These are true because in two's complement, <code>-1</code> has all bits set.</p> +<p>The following ones require recognizing that <code>-9223372036854775808 == -2**63</code> is +the most negative signed 64-bit integer, and <code>9223372036854775807 == 2 ** 63 - +1</code> is the most positive one:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_lt</span><span class="p">(</span><span 
class="mi">9223372036854775807</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">9223372036854775807</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">9223372036854775807</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="mi">9223372036854775807</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> 
</span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +</pre></div> + +<p>The following ones are true because the bitpattern for <code>-1</code> is the largest +unsigned number:</p> +<div class="code"><pre class="code literal-block"><span class="n">uint_lt</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +</pre></div> + +<h3 id="strength-reductions">Strength Reductions</h3> +<p>All the patterns so far only had a variable or a constant on the target of the +rewrite. 
We can also use the machinery to do strength reductions where we +generate a single-argument operation <code>op1(x)</code> for input operations <code>op(x, c1)</code> +or <code>op(c1, x)</code>. To achieve this, we try all combinations of binary and unary +operations. (We won't consider strength reductions where a binary operation +gets turned into a "cheaper" other binary operation here.)</p> +<div class="code"><pre class="code literal-block"><span class="n">opnames1</span> <span class="o">=</span> <span class="p">[</span> +<span class="s2">"int_is_true"</span><span class="p">,</span> +<span class="s2">"int_is_zero"</span><span class="p">,</span> +<span class="s2">"int_neg"</span><span class="p">,</span> +<span class="s2">"int_invert"</span><span class="p">,</span> +<span class="p">]</span> + +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="k">for</span> <span class="n">opname1</span> <span class="ow">in</span> <span class="n">opnames1</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="c1"># try to find a constant op(x, c) == g(x)</span> + <span class="n">result1</span><span class="p">,</span> <span class="n">valid_if1</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname1</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span 
class="n">valid_if</span><span class="p">,</span> <span class="n">valid_if1</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">result1</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; </span><span class="si">{</span><span class="n">opname1</span><span class="si">}</span><span class="s2">(x)"</span><span class="p">)</span> + + <span class="c1"># try to find a constant op(c, x) == g(x)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">result1</span><span class="p">,</span> <span class="n">valid_if1</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname1</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">valid_if1</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">result1</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span> 
<span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; </span><span class="si">{</span><span class="n">opname1</span><span class="si">}</span><span class="s2">(x)"</span><span class="p">)</span> +</pre></div> + +<p>Which yields the following new simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_sub</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_mul</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> 
</span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_xor</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_eq</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_ne</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> 
+<span class="n">int_ne</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_le</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span 
class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +</pre></div> + +<h3 id="conclusions">Conclusions</h3> +<p>With not very much code we managed to generate a whole lot of local +simplifications for integer operations in the IR of PyPy's JIT. The rules +discovered that way are "simple", in the sense that they only require looking +at a single instruction, and not where the arguments of that instruction came +from. 
They also don't require any knowledge about the properties of the +arguments of the instructions (e.g. that they are positive).</p> +<p>The rewrites in this post have mostly been in PyPy's JIT already. But now we +mechanically confirmed that they are correct. I've also added the remaining +useful looking ones, in particular <code>int_eq(x, 0) -&gt; int_is_zero(x)</code> etc.</p> +<p>If we wanted to scale this approach up, we would have to work much harder! +There are a bunch of problems that come with generalizing the approach to +looking at sequences of instructions:</p> +<ul> +<li> +<p>Combinatorial explosion: if we look at sequences of instructions, we very + quickly get a combinatorial explosion and it becomes intractable to try all + combinations.</p> +</li> +<li> +<p>Finding non-minimal patterns: Some complicated simplifications can be + instances of simpler ones. For example, because <code>int_add(x, 0) -&gt; x</code>, it's + also true that <code>int_add(int_sub(x, y), 0) -&gt; int_sub(x, y)</code>. If we simply + generate all possible sequences, we will find the latter simplification rule, + which we would usually not care about.</p> +</li> +<li> +<p>Unclear usefulness: if we simply generate all rewrites up to a certain number + of instructions, we will get a lot of patterns that are useless in the sense + that they typically aren't found in realistic programs. It would be much + better to somehow focus on the patterns that real benchmarks are using.</p> +</li> +</ul> +<p>In the <a href="https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html">next blog post</a> I'll discuss an alternative approach to simply generating +all possible sequences of instructions, that tries to address these problems. 
+This works by analyzing the real traces of benchmarks and mining those for +inefficiencies, which only shows problems that occur in actual programs.</p> +<h3 id="sources">Sources</h3> +<p>I've been re-reading a lot of blog posts from John Regehr's blog:</p> +<ul> +<li><a href="https://blog.regehr.org/archives/1109">Let’s Work on an LLVM Superoptimizer</a></li> +<li><a href="https://blog.regehr.org/archives/1146">Early Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1252">A Few Synthesizing Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1636">Synthesizing Constants</a></li> +</ul> +<p>but also papers:</p> +<ul> +<li><a href="https://arxiv.org/pdf/1711.04422">A Synthesizing Superoptimizer</a></li> +<li><a href="https://dl.acm.org/doi/pdf/10.1145/3649837">Hydra: Generalizing Peephole Optimizations with Program Synthesis</a></li> +</ul> +<p>Another of my favorite blogs has been <a href="https://www.philipzucker.com/">Philip Zucker's +blog</a> in the last year or two, lots of excellent +posts about/using Z3 on there.</p>jitz3https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.htmlFri, 12 Jul 2024 19:14:09 GMTFinding JIT Optimizer Bugs using SMT Solvers and Fuzzinghttps://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.htmlCarl Friedrich Bolz-Tereick<p>In this blog post I want to describe a recent bug finding technique that I've +added to the PyPy JIT testing infrastructure. This technique uses the Z3 +theorem prover to find bugs in the optimizer of PyPy's JIT, in particular its +integer operation optimizations. 
The approach is +based on things I have learned from <a class="reference external" href="https://www.cs.utah.edu/~regehr/">John Regehr's</a> <a class="reference external" href="https://blog.regehr.org/">blog</a> (<a class="reference external" href="https://blog.regehr.org/archives/1122">this post</a> is a +good first one to read), <a class="reference external" href="https://twitter.com/johnregehr/">Twitter</a>, and on +his (et al) paper <a class="reference external" href="https://www.cs.utah.edu/~regehr/alive2-pldi21.pdf">Alive2: Bounded Translation Validation for LLVM</a>. The work +was triggered by a recent miscompilation bug my current bachelor student Nico +Rittinghaus found.</p> +<section id="background-python-integers-in-the-pypy-jit"> +<h2>Background: Python Integers in the PyPy JIT</h2> +<p>The optimizer of PyPy's JITs operates on traces, which are linear sequences of +instructions with guards. The instructions in the traces operate on different +machine-level data types, machine integers, doubles, pointers, bools, etc. In +this post we'll be mostly concerned with machine integers.</p> +<p>To give some wider context I'll explain a bit how Python ints in the user code +relate to the types that are used in traces when the PyPy Python implementation +is used. +When PyPy turns a regular Python 3 function into a trace, there is a lot of work +happening in the JIT frontend to try to observe and infer the types that the +Python function concretely uses at runtime. The traces are generated under these +typing assumptions. Therefore, code that uses <code class="docutils literal">ints</code> in the Python code can +typically be translated into traces that operate on machine integers. In order +to make sure that the Python integer semantics are upheld, many of the +operations in the traces need to check that the integer results of some +operations still fit into a machine integer. 
If that is not the case (a rare +situation for most programs), the trace is left via a guard, execution falls +back to the interpreter, and there a big integer representation is chosen for +the too big value (the big integer representation is done via a pointer and +some storage on the heap).</p> +<p>All of this machinery is not going to be too relevant for the rest of the +post. For the post it's important to know that trace instructions operate on +machine integers and other low-level types, and some of the operations can +optionally check whether the +results still fit into a machine integer. These trace operations are improved by +the optimizer, which tries to transform the trace into one that behaves the +same, but is less costly to execute.</p> +</section> +<section id="background-bounds-analysis-in-pypy-s-jit"> +<h2>Background: Bounds Analysis in PyPy's JIT</h2> +<p>The optimizer of PyPy's JIT has an analysis based on <a class="reference external" href="https://en.wikipedia.org/wiki/Abstract_interpretation">abstract interpretation</a> +that tries to find out whether the integer values stored in a variable are +actually not using the full 64 bit (or 32 bit) range, but instead fit into some +smaller range. This means that for every integer variable <code class="docutils literal">x</code> in a trace, the +JIT compiler tracks upper and lower bounds of the runtime value of that +variable: a range <code class="docutils literal">[a, b]</code> such that for every concrete runtime value <code class="docutils literal">v</code> +that gets stored in variable <code class="docutils literal">x</code>, <code class="docutils literal">a &lt;= v &lt;= b</code> must be true. 
+<code class="docutils literal">a</code> and <code class="docutils literal">b</code> start out +as the most general <code class="docutils literal">MININT</code> and <code class="docutils literal">MAXINT</code>, but sometimes there is extra +information that makes it possible to improve these known bounds, and that is +often useful to optimize the code.</p> +<p>A typical example is that the JIT knows that the length of a string is +non-negative, so for this kind of code: <code class="docutils literal">x = len(s)</code> where <code class="docutils literal">s</code> is a string, +<code class="docutils literal">x</code> gets a range <code class="docutils literal">[0, MAXINT]</code> assigned. With this information we could for +example remove a check <code class="docutils literal">x + 10 &lt; 0</code> completely, because it can never be true.</p> +<p>The bounds information is useful for optimization, but the analysis of the +bounds is also a source of bugs in the JIT, because the reasoning is often +subtle and easy to get wrong in corner cases. We already use a number of testing +techniques to try to make sure that it is correct. A simple one is +<a class="reference external" href="https://hypothesis.works/articles/what-is-property-based-testing/">property-based testing</a> using <a class="reference external" href="https://github.com/HypothesisWorks/hypothesis">Hypothesis</a> on the operations on bounds. Even +though Hypothesis is fantastic, it unfortunately does not catch +absolutely all the bugs even if we'd like it to, as we'll see in the next +section.</p> +</section> +<section id="motivation-a-jit-miscompilation"> +<h2>Motivation: A JIT Miscompilation</h2> +<p>I am currently supervising a Bachelor thesis by Nico Rittinghaus, who is +extending the integer analysis in the JIT. He'll probably write a separate blog +post about that soon. 
In the process of his work, the current bounds analysis +code got a lot of scrutiny, and we found out that one of the unit tests of the +bounds analysis was actually incorrect, and the example code in that unit test +was optimized incorrectly. This case of incorrect optimization is not a big deal +for regular Python code, because it involved a "wrapping integer addition +operation", i.e. one where overflowing results just wrap around to negative +values. All the additions and other arithmetic operations that the PyPy Python +frontend generates actually have +overflow checks (to be able to switch to a big integer representation if +needed). +However, it's still possible to trigger the problem with the +<code class="docutils literal">__pypy__.intop.int_add</code> API which is a function that exposes wraparound +arithmetic on Python ints.</p> +<p><a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3832">Here's the miscompilation</a>. The JIT optimizes the following function:</p> +<div class="code"><pre class="code python"><a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-1" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-2" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-2"></a> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-3" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a 
id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-4" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-5" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">15</span><span class="p">:</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-6" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-6"></a> <span class="k">if</span> <span class="n">x</span> <span class="o">&lt;</span> <span class="mi">6</span><span class="p">:</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-7" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-7"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-8" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-8"></a> <span class="k">return</span> <span class="mi">1</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-9" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-9" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-9"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>Into the following code:</p> +<div class="code"><pre class="code python"><a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-1" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-2" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-2"></a> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-3" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-4" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-5" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-5"></a> <span class="k">if</span> <span class="n">a</span> <span 
class="o">&lt;</span> <span class="mi">15</span><span class="p">:</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-6" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-6"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-7" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-7"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>Basically the faulty reasoning of the JIT looks like this: if <code class="docutils literal">int_add(x, 10) &lt; 15</code> +then it must follow that <code class="docutils literal">x &lt; 5</code>, which is stronger than <code class="docutils literal">x &lt; 6</code>, so the +second <code class="docutils literal">if</code> is always true. This sounds good, but is actually wrong +if the addition <code class="docutils literal">+ 10</code> wrapped around. So if <code class="docutils literal">x == MAXINT</code>, then +<code class="docutils literal">int_add(x, 10) == MININT + 9 &lt; 15</code>. But <code class="docutils literal">MAXINT &lt; 5</code> is not +correct.</p> +<p>Note how the same reasoning with overflow-checking addition is correct! If <code class="docutils literal">x + +10 &lt; 15</code> and the <code class="docutils literal">+</code> didn't overflow, then indeed <code class="docutils literal">x &lt; 6</code>. 
And if your +mind bends starting to think about all this, you understand some of the +difficulty of getting the JIT correct in this area.</p> +</section> +<section id="how-could-we-have-avoided-this-bug"> +<h2>How could we have avoided this bug?</h2> +<p>One <a class="reference external" href="https://twitter.com/cfbolz/status/1482649144099586051">exercise I try to do after finding bugs</a> is to reflect on ways that the +bug could have been avoided. I think this is particularly important in the JIT, +where bugs are potentially really annoying to find and can cause very strange +behaviour in basically arbitrary Python code.</p> +<p>It's easy to always answer this question with "try to think more carefully +when working", but that approach cannot be relied on in complicated situations, +because humans don't concentrate perfectly for long stretches of time.</p> +<p>A situation-specific problem I identified was the bad design of the range analysis API. +A range is not just represented by two numbers, instead it's two numbers +and two bools that are supposed to represent that some operation did or did not +underflow/overflow. The meaning of these bools was quite hard to grasp and easy +to get wrong, so probably they should never have been introduced in the first +place (and my bugfix indeed removed them).</p> +<p>But in the rest of this blog post I want to talk about another, systematic +approach that can be applied to the problem of mis-optimizations of integer +operations, and that is done by applying an SMT solver to the problem.</p> +<p>An SMT solver (<a class="reference external" href="https://en.wikipedia.org/wiki/Satisfiability_modulo_theories">Satisfiability Modulo Theories</a>) is a tool that can be used to +find out whether mathematical formulas are "satisfiable", i.e. whether +some chosen set of inputs exists that will make the formulas evaluate to true. 
SMT solvers are +commonly used in a wide range of CS applications including program correctness +proofs, program synthesis, etc. The most widely known one is probably <a class="reference external" href="https://github.com/Z3Prover">Z3</a> by +Microsoft Research which has the nice advantage of coming with an easy-to-use +Python binding.</p> +<p>Going into this I basically knew next to nothing about SMT solvers (despite +having been embedded in a formal methods research group for years!) so it was an +interesting new world to learn about.</p> +<p>As briefly mentioned in the introduction, the approach I took followed a similar +(but <em>much</em> more properly executed) one applied to LLVM operations, called +<a class="reference external" href="https://github.com/AliveToolkit/alive2/">Alive2</a>. Krister Walfridsson has done <a class="reference external" href="https://kristerw.github.io/2022/09/13/translation-validation/">similar work for GCC recently</a>, +described on his blog.</p> +</section> +<section id="z3-proof-of-concept"> +<h2>Z3 Proof of Concept</h2> +<p>The first thing I did was to try to get Z3 to find the above bug, by encoding the +input program into an SMT formula by hand and trying to get Z3 to prove the condition +that the JIT thinks is always true. 
The Z3 code for this looks as follows:</p> +<div class="code"><pre class="code python"><a id="rest_code_2fe5dd23f4ec46749496562618a462eb-1" name="rest_code_2fe5dd23f4ec46749496562618a462eb-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-1"></a><span class="kn">from</span> <span class="nn">z3</span> <span class="kn">import</span> <span class="n">BitVec</span><span class="p">,</span> <span class="n">Implies</span><span class="p">,</span> <span class="n">prove</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-2" name="rest_code_2fe5dd23f4ec46749496562618a462eb-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-2"></a><span class="n">x</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'x'</span><span class="p">,</span> <span class="mi">64</span><span class="p">)</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-3" name="rest_code_2fe5dd23f4ec46749496562618a462eb-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-3"></a><span class="n">a</span> <span class="o">=</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">10</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-4" name="rest_code_2fe5dd23f4ec46749496562618a462eb-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-4"></a><span class="n">cond1</span> <span class="o">=</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">15</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-5" name="rest_code_2fe5dd23f4ec46749496562618a462eb-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-5"></a><span class="n">cond2</span> <span class="o">=</span> <span 
class="n">x</span> <span class="o">&lt;</span> <span class="mi">6</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-6" name="rest_code_2fe5dd23f4ec46749496562618a462eb-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-6"></a><span class="n">prove</span><span class="p">(</span><span class="n">Implies</span><span class="p">(</span><span class="n">cond1</span><span class="p">,</span> <span class="n">cond2</span><span class="p">))</span> +</pre></div> +<p>Here, <code class="docutils literal">x</code> is defined to be a bit vector variable of width 64, which is a +datatype that can be used to represent bounded machine integers. Addition on +bit vectors performs wraparound arithmetic, like the <code class="docutils literal">__pypy__.intop.int_add</code> +call in the original code. The JIT optimized the second condition away, so +essentially it was convinced that the first condition implies the second one. +The above snippet tries to get Z3 to confirm this.</p> +<p>When run, the above program prints:</p> +<pre class="literal-block">counterexample +[x = 9223372036854775803]</pre> +<p>Which shows the bug. As a small side-note, I thought it was cool that the +process of "proving" something in Z3 basically means trying to find an example +for the negation of the formula. If no counterexample can be found for the +negation, the original formula is true. If the original formula turns out to be +false (like here) we get a nice example that shows the problem to go with it.</p> +<p>It's not realistic to hand-translate all the hundreds of +unit-tests into Z3 formulas and then ask Z3 to prove the optimizations. 
Instead, +we want to have a program that does this for us.</p> +</section> +<section id="smt-checking-of-the-jit-optimizer"> +<h2>SMT Checking of the JIT Optimizer</h2> +<p>What we want from this program is the following: given an unoptimized trace and +its optimized version, we want to use Z3 to check whether the optimized trace +behaves identically to the unoptimized one. One question is what "behaves +identically" means. What we care about is the outputs of the trace being the +same values, no matter how they are computed. Also, for every guard we want to +make sure that it fails in identical ways in the optimized and unoptimized +versions. A guard is only allowed to be optimized away if it can never fail. +The code that comes after a guard can assume that the guard has not failed, +because otherwise execution would have left the trace. All of this should be +true regardless of the values of the input variables of the trace.</p> +<p>So in order to check that the two traces are behaving identically, we do the +following:</p> +<ul class="simple"> +<li><p>We create Z3 variables for every input variable. We use the same input +variables for both the unoptimized and the optimized trace.</p></li> +<li><p>We align the two traces at the corresponding guards. Thankfully the optimizer +keeps track of which optimized guard corresponds to which unoptimized input +guard.</p></li> +<li><p>All the operations before a guard are translated into Z3 formulas, for both +versions of the trace.</p></li> +<li><p>For two corresponding guards, we ask Z3 to prove that the guard conditions are +identical.</p></li> +<li><p>For a guard that was optimized away we ask Z3 to prove that the condition is +always true.</p></li> +<li><p>After a guard, we tell Z3 that from now on it can assume that the guard +condition is true.</p></li> +<li><p>We repeat this, guard for guard, until we reach the end of the trace. 
There, +we ask Z3 to prove that the output variables in the unoptimized trace and the +optimized trace are identical (every trace can return one or many values).</p></li> +</ul> +<p>I implemented this, it's <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/blob/branch/default/rpython/jit/metainterp/optimizeopt/test/test_z3checktests.py">not a lot of code</a>, basically a couple of hundred lines +of (somewhat hacky) Python code. So far I only support integer +operations. Here are some parts of the code to give you a flavor of what this +looks like.</p> +<p>This is the code that translates operations into Z3 formulas:</p> +<div class="code"><pre class="code python"><a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-1" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-1"></a><span class="k">def</span> <span class="nf">add_to_solver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ops</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-2" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-2"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">ops</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-3" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-3"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">type</span> <span class="o">!=</span> <span class="s1">'v'</span><span class="p">:</span> <span class="c1"># is it an operation with a 
result</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-4" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-4"></a> <span class="n">res</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">newvar</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-5" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-5"></a> <span class="k">else</span><span class="p">:</span> <span class="c1"># or does it return void</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-6" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-6"></a> <span class="n">res</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-7" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-7"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-8" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-8"></a> <span class="c1"># ...</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-9" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-9"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-10" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-10" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-10"></a> <span class="c1"># convert arguments</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-11" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-11"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">numargs</span><span class="p">()</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-12" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-12" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-12"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-13" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-13" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-13"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">numargs</span><span class="p">()</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-14" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-14" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-14"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> 
<span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-15" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-15" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-15"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-16" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-16" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-16"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-17" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-17" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-17"></a> <span class="c1"># compute results</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-18" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-18" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-18"></a> <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-19" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-19" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-19"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-20" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-20" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-20"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-21" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-21" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-21"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-22" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-22" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-22"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-23" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-23" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-23"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-24" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-24" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-24"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-25" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-25" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-25"></a> <span 
class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&amp;</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-26" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-26" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-26"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_or"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-27" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-27" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-27"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">|</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-28" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-28" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-28"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_xor"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-29" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-29" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-29"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">^</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-30" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-30" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-30"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-31" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-31" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-31"></a> <span class="c1"># ... more operations, some shown below</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-32" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-32" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-32"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-33" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-33" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-33"></a> <span class="bp">self</span><span class="o">.</span><span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">res</span> <span class="o">==</span> <span class="n">expr</span><span class="p">)</span> +</pre></div> +<p>New Z3 variables are defined by the helper function <code class="docutils literal">newvar</code>, which adds the +operation to a dictionary <code class="docutils literal">box_to_z3</code> mapping boxes (=variables) to Z3 +variables. 
Due to the <a class="reference external" href="https://en.wikipedia.org/wiki/Static_single-assignment_form">SSA</a> property that traces have, a variable must be defined +before its first use.</p> +<p>Here's what <code class="docutils literal">newvar</code> looks like (<code class="docutils literal">LONG_BIT</code> is a constant that is either +<code class="docutils literal">64</code> or <code class="docutils literal">32</code>, depending on the target architecture):</p> +<div class="code"><pre class="code python"><a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-1" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-1"></a><span class="k">def</span> <span class="nf">newvar</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">box</span><span class="p">,</span> <span class="nb">repr</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-2" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-2"></a> <span class="c1"># ... 
some logic around making the string representation</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-3" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-3"></a> <span class="c1"># somewhat nicer omitted</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-4" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-4"></a> <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="nb">repr</span><span class="p">,</span> <span class="n">LONG_BIT</span><span class="p">)</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-5" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-5"></a> <span class="bp">self</span><span class="o">.</span><span class="n">box_to_z3</span><span class="p">[</span><span class="n">box</span><span class="p">]</span> <span class="o">=</span> <span class="n">result</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-6" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-6"></a> <span class="k">return</span> <span class="n">result</span> +</pre></div> +<p>The <code class="docutils literal">convert</code> method turns an operation argument (either a constant or a +variable) into a Z3 formula (either a constant bit vector or an already defined +Z3 variable). 
<code class="docutils literal">convertarg</code> is a helper function that takes an operation, reads +its nth argument and converts it.</p> +<div class="code"><pre class="code python"><a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-1" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-1"></a><span class="k">def</span> <span class="nf">convert</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">box</span><span class="p">):</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-2" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-2"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">box</span><span class="p">,</span> <span class="n">ConstInt</span><span class="p">):</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-3" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-3"></a> <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="n">box</span><span class="o">.</span><span class="n">getint</span><span class="p">(),</span> <span class="n">LONG_BIT</span><span class="p">)</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-4" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-4"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">box_to_z3</span><span class="p">[</span><span class="n">box</span><span class="p">]</span> +<a 
id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-5" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-5"></a> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-6" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-6"></a><span class="k">def</span> <span class="nf">convertarg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">box</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-7" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-7"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">box</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span> +</pre></div> +<p>The lookup of variables in <code class="docutils literal">box_to_z3</code> that <code class="docutils literal">convert</code> does cannot fail, +because the variable must have been defined before use.</p> +<p>Comparisons return the bit vector 0 or bit vector 1, we use a helper function +<code class="docutils literal">cond</code> to turn the Z3 truth value of the comparison into a bit vector:</p> +<div class="code"><pre class="code python"><a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-1" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-1"></a><span class="k">def</span> <span class="nf">cond</span><span class="p">(</span><span 
class="bp">self</span><span class="p">,</span> <span class="n">z3expr</span><span class="p">):</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-2" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-2"></a> <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">TRUEBV</span><span class="p">,</span> <span class="n">FALSEBV</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-3" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-3"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-4" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-4"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-5" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-5"></a><span class="k">def</span> <span class="nf">add_to_solver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ops</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-6" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-6"></a> <span class="c1"># ... 
start as above</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-7" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-7"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-8" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-8"></a> <span class="c1"># more cases</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-9" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-9"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_eq"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-10" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-10"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-11" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-11"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ne"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-12" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-12" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-12"></a> 
<span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-13" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-13" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-13"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-14" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-14" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-14"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-15" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-15" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-15"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_le"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-16" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-16" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-16"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;=</span> <span class="n">arg1</span><span 
class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-17" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-17" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-17"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_gt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-18" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-18" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-18"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-19" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-19" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-19"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ge"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-20" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-20" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-20"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;=</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-21" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-21" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-21"></a> 
<span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_true"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-22" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-22" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-22"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">FALSEBV</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-23" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-23" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-23"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_lt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-24" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-24" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-24"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-25" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-25" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-25"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_le"</span><span class="p">:</span> +<a 
id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-26" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-26" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-26"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-27" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-27" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-27"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_gt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-28" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-28" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-28"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-29" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-29" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-29"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_ge"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-30" 
name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-30" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-30"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-31" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-31" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-31"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_zero"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-32" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-32" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-32"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">FALSEBV</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-33" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-33" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-33"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-34" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-34" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-34"></a> <span class="c1"># ... 
rest as above</span> +</pre></div> +<p>So basically for every trace operation that operates on integers I had to give a +translation into Z3 formulas, which is mostly straightforward.</p> +<p>Guard operations get converted into a Z3 boolean by their own helper function, +which looks like this:</p> +<div class="code"><pre class="code python"><a id="rest_code_3de914924f164344a1267234ae4925f2-1" name="rest_code_3de914924f164344a1267234ae4925f2-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-1"></a><span class="k">def</span> <span class="nf">guard_to_condition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">guard</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-2" name="rest_code_3de914924f164344a1267234ae4925f2-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-2"></a> <span class="n">opname</span> <span class="o">=</span> <span class="n">guard</span><span class="o">.</span><span class="n">getopname</span><span class="p">()</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-3" name="rest_code_3de914924f164344a1267234ae4925f2-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-3"></a> <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_true"</span><span class="p">:</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-4" name="rest_code_3de914924f164344a1267234ae4925f2-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-4"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">guard</span><span 
class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="n">TRUEBV</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-5" name="rest_code_3de914924f164344a1267234ae4925f2-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-5"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_false"</span><span class="p">:</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-6" name="rest_code_3de914924f164344a1267234ae4925f2-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-6"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">guard</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="n">FALSEBV</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-7" name="rest_code_3de914924f164344a1267234ae4925f2-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-7"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_value"</span><span class="p">:</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-8" name="rest_code_3de914924f164344a1267234ae4925f2-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-8"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">guard</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span 
class="p">(</span><span class="n">guard</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-9" name="rest_code_3de914924f164344a1267234ae4925f2-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-9"></a> +<a id="rest_code_3de914924f164344a1267234ae4925f2-10" name="rest_code_3de914924f164344a1267234ae4925f2-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-10"></a> <span class="c1"># ... some more exist, shown below</span> +</pre></div> +<p>Some operations are a bit trickier. An important example in the context of +this blog post are integer operations that check for overflow. The overflow +operations return a result, but also a boolean whether the operation overflowed +or not.</p> +<div class="code"><pre class="code python"><a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-1" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-1"></a><span class="k">def</span> <span class="nf">add_to_solver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ops</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-2" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-2"></a> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-3" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-3"></a> <span class="c1"># ... 
more cases</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-4" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-4"></a> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-5" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-5"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add_ovf"</span><span class="p">:</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-6" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-6"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-7" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-7"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> <span class="o">+</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-8" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-8"></a> <span class="n">state</span><span 
class="o">.</span><span class="n">no_ovf</span> <span class="o">=</span> <span class="n">m</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">expr</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-9" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-9"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub_ovf"</span><span class="p">:</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-10" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-10"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-11" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-11"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> <span class="o">-</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-12" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-12" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-12"></a> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="o">=</span> <span class="n">m</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">expr</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-13" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-13" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-13"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul_ovf"</span><span class="p">:</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-14" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-14" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-14"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-15" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-15" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-15"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> <span class="o">*</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> +<a 
id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-16" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-16" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-16"></a> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="o">=</span> <span class="n">m</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">expr</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-17" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-17" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-17"></a> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-18" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-18" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-18"></a> <span class="c1"># ...</span> +</pre></div> +<p>The boolean is computed by comparing the result of the bit vector operation with +the result of performing the same operation on the input bit vectors after +sign-extending them to twice their width, where the operation cannot overflow. Let's go through the +addition case step by step; the other cases work analogously.</p> +<p>The addition in the first <code class="docutils literal">elif</code> that computes <code class="docutils literal">expr</code> is an addition on bit +vectors of <code class="docutils literal">LONG_BIT</code> bits, therefore it is performing wraparound arithmetic. +<code class="docutils literal">z3.SignExt(LONG_BIT, arg0)</code> sign-extends <code class="docutils literal">arg0</code> from a bit vector of +<code class="docutils literal">LONG_BIT</code> bits to a bit vector of <code class="docutils literal">2 * LONG_BIT</code> bits (<code class="docutils literal">SignExt</code> adds the given number of extra sign bits).
The addition in +the second line is therefore an addition between bit vectors that are wide enough to hold the sum of any two <code class="docutils literal">LONG_BIT</code>-bit values, so it will +never overflow and just compute the mathematically correct result.</p> +<p>The condition to check for overflow is now: if the results of the two different +ways to do the addition are the same, then overflow did not occur. So in order +to compute <code class="docutils literal">state.no_ovf</code> in the addition case the +code sign-extends the result of the bit vector wraparound addition to +<code class="docutils literal">2 * LONG_BIT</code> bits (using <code class="docutils literal">SignExt</code> again), and then compares that to the wide +result.</p> +<p>This boolean can then be checked by the guard operations <code class="docutils literal">guard_no_overflow</code> +and <code class="docutils literal">guard_overflow</code>.</p> +<div class="code"><pre class="code python"><a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-1" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-1"></a><span class="k">def</span> <span class="nf">guard_to_condition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">guard</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-2" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-2"></a> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-3" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-3"></a> <span class="c1"># ... 
more cases</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-4" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-4"></a> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-5" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-5"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_no_overflow"</span><span class="p">:</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-6" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-6"></a> <span class="k">assert</span> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-7" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-7"></a> <span class="k">return</span> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-8" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-8"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_overflow"</span><span class="p">:</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-9" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-9" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-9"></a> <span class="k">assert</span> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-10" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-10"></a> <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span><span class="p">)</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-11" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-11"></a> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-12" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-12" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-12"></a> <span class="c1"># ... more cases</span> +</pre></div> +</section> +<section id="finding-the-bug-again"> +<h2>Finding the Bug, Again</h2> +<p>Let's actually make all of this more concrete by applying it to the trace of our +original bug. 
The input trace and the incorrectly optimized trace for that look +like this (differences highlighted):</p> +<div class="code"><pre class="code python"><a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-1" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-1"></a><span class="c1"># input # optimized</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-2" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-2"></a><span class="p">[</span><span class="n">i0</span><span class="p">]</span> <span class="p">[</span><span class="n">i0</span><span class="p">]</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-3" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-3"></a><span class="n">i1</span> <span class="o">=</span> <span class="n">int_add</span><span class="p">(</span><span class="n">i0</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> <span class="n">i1</span> <span class="o">=</span> <span class="n">int_add</span><span class="p">(</span><span class="n">i0</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-4" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-4"></a><span class="n">i2</span> <span class="o">=</span> <span class="n">int_lt</span><span class="p">(</span><span class="n">i1</span><span class="p">,</span> <span class="mi">15</span><span class="p">)</span> <span class="n">i2</span> <span class="o">=</span> <span class="n">int_lt</span><span class="p">(</span><span 
class="n">i1</span><span class="p">,</span> <span class="mi">15</span><span class="p">)</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-5" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-5"></a><span class="n">guard_true</span><span class="p">(</span><span class="n">i2</span><span class="p">)</span> <span class="n">guard_true</span><span class="p">(</span><span class="n">i2</span><span class="p">)</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-6" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-6"></a><span class="hll"><span class="n">i3</span> <span class="o">=</span> <span class="n">int_lt</span><span class="p">(</span><span class="n">i0</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span> <span class="n">jump</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +</span><a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-7" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-7"></a><span class="hll"><span class="n">guard_true</span><span class="p">(</span><span class="n">i3</span><span class="p">)</span> +</span><a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-8" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-8"></a><span class="hll"><span class="n">jump</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +</span></pre></div> +<p>Note that the trace represents just one of the paths through the control flow +graph of the original function, which is typical for tracing JITs (the other +paths 
could incrementally get added later).</p> +<p>The first guards in both these traces correspond to each other, so the first +chunks to check are the first three operations (lines 1-4). Those operations +don't get changed by the optimizer at all.</p> +<p>These two identical traces get translated to the following Z3 formulas:</p> +<div class="code"><pre class="code text"><a id="rest_code_25c448b34dd145d1837209987991ae86-1" name="rest_code_25c448b34dd145d1837209987991ae86-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-1"></a>i1unoptimized == input_i0 + 10 +<a id="rest_code_25c448b34dd145d1837209987991ae86-2" name="rest_code_25c448b34dd145d1837209987991ae86-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-2"></a>i2unoptimized == If(i1unoptimized &lt; 15, 1, 0) +<a id="rest_code_25c448b34dd145d1837209987991ae86-3" name="rest_code_25c448b34dd145d1837209987991ae86-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-3"></a>i1optimized == input_i0 + 10 +<a id="rest_code_25c448b34dd145d1837209987991ae86-4" name="rest_code_25c448b34dd145d1837209987991ae86-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-4"></a>i2optimized == If(i1optimized &lt; 15, 1, 0) +</pre></div> +<p>To check that the two corresponding guards are the same, the solver is asked to +prove that <code class="docutils literal">(i2unoptimized == 1) == (i2optimized == 1)</code>. This is +correct, because the formulas for <code class="docutils literal">i2unoptimized</code> and <code class="docutils literal">i2optimized</code> are +completely identical.</p> +<p>After checking that the guards behave the same, we add the knowledge to the +solver that the guards passed. 
So the Z3 formulas become:</p> +<div class="code"><pre class="code text"><a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-1" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-1"></a>i1unoptimized == input_i0 + 10 +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-2" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-2"></a>i2unoptimized == If(i1unoptimized &lt; 15, 1, 0) +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-3" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-3"></a>i1optimized == input_i0 + 10 +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-4" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-4"></a>i2optimized == If(i1optimized &lt; 15, 1, 0) +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-5" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-5"></a>i2unoptimized == 1 +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-6" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-6"></a>i2optimized == 1 +</pre></div> +<p>Now we continue with the remaining operations of the two traces (lines 6-8).</p> +<p>We start by adding the <code class="docutils literal">int_lt</code> operation in the unoptimized trace to the Z3 +formulas:</p> +<div class="code"><pre class="code text"><a id="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-1" 
name="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_572cd48587b84ad4aea4ab9fb60d80fd-1"></a>... +<a id="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-2" name="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_572cd48587b84ad4aea4ab9fb60d80fd-2"></a>i3unoptimized == If(input_i0 &lt; 6, 1, 0) +</pre></div> +<p>Because the second guard was optimized away, we need to ask Z3 to prove that +<code class="docutils literal">i3unoptimized == 1</code> is always true, which fails and gives the following +counterexample:</p> +<div class="code"><pre class="code text"><a id="rest_code_dad63ba423ac4e599c421529bf5361a0-1" name="rest_code_dad63ba423ac4e599c421529bf5361a0-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-1"></a>input_i0 = 9223372036854775800 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-2" name="rest_code_dad63ba423ac4e599c421529bf5361a0-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-2"></a>i1unoptimized = 9223372036854775810 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-3" name="rest_code_dad63ba423ac4e599c421529bf5361a0-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-3"></a>i2unoptimized = 0 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-4" name="rest_code_dad63ba423ac4e599c421529bf5361a0-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-4"></a>i1optimized = 9223372036854775810 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-5" name="rest_code_dad63ba423ac4e599c421529bf5361a0-5" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-5"></a>i2optimized = 1 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-6" name="rest_code_dad63ba423ac4e599c421529bf5361a0-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-6"></a>i3unoptimized = 0 +</pre></div> +<p>Thus demonstrating the bug. The fact that the Z3-based equivalence check also +managed to find the original motivating bug without manually translating it to +a formula is a good confirmation that the approach works.</p> +</section> +<section id="second-bug"> +<h2>Second bug</h2> +<p>So with this code I applied the Z3-based equivalence check to all our optimizer +unit tests. In addition to the bug we've been discussing the whole post, it also +found another buggy test! I had found it too by hand by staring at all the tests +in the process of writing all the Z3 infrastructure, but it was still a good +confirmation that the process worked. This bug was in the range analysis for +<code class="docutils literal">int_neg</code>, integer negation. 
It failed to account that <code class="docutils literal"><span class="pre">-MININT</span> == MININT</code> +and therefore did a mis-optimization along the following lines:</p> +<div class="code"><pre class="code python"><a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-1" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-2" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-2"></a> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-3" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-4" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-5" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">0</span><span 
class="p">:</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-6" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-6"></a> <span class="k">if</span> <span class="n">x</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-7" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-7"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-8" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-8"></a> <span class="k">return</span> <span class="mi">1</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-9" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-9"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>Which was wrongly optimized into:</p> +<div class="code"><pre class="code python"><a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-1" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-2" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-2"></a> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-3" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-3" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-4" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-5" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">:</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-6" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-6"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-7" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-7"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>This is wrong precisely for <code class="docutils literal">x == MININT</code>.</p> +</section> +<section id="generating-random-traces"> +<h2>Generating Random Traces</h2> +<p>These two bugs were the only two that the Z3 checker found for existing unit 
+tests. To try to find some more bugs I combined PyPy's existing random trace +generator with the Z3 optimization checker. The random trace generator has so +far been mostly used to find bugs in the machine code backends, particularly +also in the register allocator. So far we haven't used it with our optimizer, +but my experiments show that we should have!</p> +<p>I'm going to describe a little bit how the random trace generator works. It's +actually not that complicated, but there's one neat trick to it.</p> +<p>The basic idea is straightforward, it starts out with an empty trace with a +random number of input variables. Then it adds some number of operations to the +trace, either regular operations or guards. Every operation takes already +existing variables as input.</p> +<p>The neat trick is that our random trace generator keeps a concrete random +example value for every one of the input variables, and an example result for +every operation. In this way, it is possible to generate guards that are +consistent with the example values to ensure that running the trace to its end +is possible with at least one set of values.</p> +<p>Here's an example random trace that is generated, together with the random +example inputs and the results of every operation at the end of every line:</p> +<pre class="literal-block">[i0, i1, i2, i3, i4, i5] # example values: 9, 11, -8, -95, 46, 57 +i6 = int_add_ovf(i3, i0) # -86 +guard_no_overflow() +i7 = int_sub(i2, -35/ci) # 27 +i8 = uint_ge(i3, i5) # 1 +guard_true(i8) +i9 = int_lt(i7, i8) # 0 +i10 = int_mul_ovf(34/ci, i7) # 918 +guard_no_overflow() +i11 = int_and(i10, 63/ci) # 22 +i12 = int_rshift(i3, i11) # -1 +i13 = int_is_zero(i7) # 0 +i14 = int_is_true(i13) # 0 +guard_false(i13) +i15 = int_lt(i8, i4) # 1 +i16 = int_and(i6, i0) # 8 +i17 = uint_ge(i6, -6/ci) # 0 +finish()</pre> +<p>Note how every guard generated is true for the example values.</p> +<p>I have been running this combination of random trace generation and Z3 checking 
+for many nights and it has found some bugs, which I'll describe in the next +section. It should probably be run for a lot longer, but it has been a useful +exercise already.</p> +<p>In this mode, I'm giving every Z3 call a time limit to make sure that the random +tests don't just take arbitrarily long. This means that asking Z3 to prove +something can have three outcomes, either it's proved, or Z3 finds a +counterexample, or Z3 times out.</p> +</section> +<section id="bugs-found"> +<h2>Bugs Found</h2> +<p>In addition to the two bugs I've already described, I'll briefly list the +additional bugs that were found by optimizing random traces and then trying to +prove the equivalence with Z3.</p> +<p>Most of the bugs were actually identified by optimizing random traces alone, not +by the Z3 component. They manifested as assert failures in the JIT compiler.</p> +<ul class="simple"> +<li><p>The JIT concluded after <code class="docutils literal">12 == int_mul(x, 12)</code> that <code class="docutils literal">x == 1</code>, which is +incorrect if overflow occurred (a counterexample is <code class="docutils literal">0x8000000000000001</code>).</p></li> +<li><p>An amusing bug, where from <code class="docutils literal">0 == int_lshift(0x1000000000000000, x)</code> with +<code class="docutils literal">0 &lt;= x &lt;= 15</code>, the JIT concluded that <code class="docutils literal">0x1000000000000000 == 0</code>, +triggering an assert. This wrong conclusion was again caused by not taking the +possibility of overflow into account.</p></li> +<li><p>A corner case in an optimization for chained integer additions with a +constant, where in complex enough expressions, the wrong IR API was used +(which works correctly in simple cases). 
Again, this triggered an assert.</p></li> +</ul> +<p>This shows that we should have been fuzzing our JIT optimizer already (not a +surprising observation in hindsight, fuzz all the things!).</p> +<p>Thankfully, there was also one further bug that really failed in the Z3 +verifier. It's a bug in common subexpression elimination / arithmetic +simplification, which again does not take overflow correctly into account.</p> +<p>The buggy trace looks like this (unfortunately it's not easily possible to show +this bug in Python code).</p> +<div class="code"><pre class="code text"><a id="rest_code_40493479399f42558ecf3121b6abb0ca-1" name="rest_code_40493479399f42558ecf3121b6abb0ca-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-1"></a>[a, b] +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-2" name="rest_code_40493479399f42558ecf3121b6abb0ca-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-2"></a>c = int_add(a, b) +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-3" name="rest_code_40493479399f42558ecf3121b6abb0ca-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-3"></a>r = int_sub_ovf(c, b) +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-4" name="rest_code_40493479399f42558ecf3121b6abb0ca-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-4"></a>guard_no_ovf() +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-5" name="rest_code_40493479399f42558ecf3121b6abb0ca-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-5"></a>finish(r) +</pre></div> +<p>This was optimized to:</p> +<div class="code"><pre class="code text"><a id="rest_code_30cdbc23b541425f891edc9180ced3c0-1" name="rest_code_30cdbc23b541425f891edc9180ced3c0-1" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_30cdbc23b541425f891edc9180ced3c0-1"></a>[a, b] +<a id="rest_code_30cdbc23b541425f891edc9180ced3c0-2" name="rest_code_30cdbc23b541425f891edc9180ced3c0-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_30cdbc23b541425f891edc9180ced3c0-2"></a>finish(a) +</pre></div> +<p>Which is incorrect, because the guard can fail given the right inputs. +But the optimizer concluded that the subtraction is safe, because it's the +inverse of an earlier addition, not taking into account that this earlier +addition can have overflowed.</p> +<p>Note that a related optimization is actually correct. Given this code:</p> +<div class="code"><pre class="code text"><a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-1" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-1"></a>[a, b] +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-2" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-2"></a>c = int_add_ovf(a, b) +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-3" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-3"></a>guard_no_ovf() +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-4" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-4"></a>r = int_sub(c, b) +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-5" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-5"></a>finish(r) 
+</pre></div> +<p>It can be optimized to:</p> +<div class="code"><pre class="code text"><a id="rest_code_231f1b026f874575959e48a29de9a78d-1" name="rest_code_231f1b026f874575959e48a29de9a78d-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-1"></a>[a, b] +<a id="rest_code_231f1b026f874575959e48a29de9a78d-2" name="rest_code_231f1b026f874575959e48a29de9a78d-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-2"></a>c = int_add_ovf(a, b) +<a id="rest_code_231f1b026f874575959e48a29de9a78d-3" name="rest_code_231f1b026f874575959e48a29de9a78d-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-3"></a>guard_no_ovf() +<a id="rest_code_231f1b026f874575959e48a29de9a78d-4" name="rest_code_231f1b026f874575959e48a29de9a78d-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-4"></a>finish(a) +</pre></div> +</section> +<section id="future-work-and-conclusion"> +<h2>Future Work and Conclusion</h2> +<p>In the current form the Z3 checker is only a start, even though it has already +been concretely useful. There are various directions into which we could extend +it. 
In addition to generating random tests completely from scratch, we could also +start from the existing manually written unit-tests and randomly mutate those.</p> +<p>I also want to extend the Z3 checker with support for more operations, heap +operations in particular (but it's not quite clear to me how to model garbage +collection).</p> +<p>I also want to try to switch the code away from the Z3 API and use the more +general <a class="reference external" href="https://smtlib.cs.uiowa.edu/">smtlib</a> interface directly, in order to be able to use other SMT +checkers than Z3, e.g. <a class="reference external" href="https://cvc4.github.io/">CVC4</a>.</p> +<p>But all in all this was a fun and not too hard way to find a bunch of bugs in +our optimizer! And the infrastructure is now in place, which means that we run +some random test cases every time we execute our tests. This is going to be +particularly useful when we do further work on the integer reasoning of the JIT +(like Nico is doing, for example). 
As of the time of writing this post, all the +bugs mentioned have been fixed and the Z3 code has landed on the default branch +and runs as part of PyPy's CI infrastructure.</p> +</section> +<section id="acknowledgements"> +<h2>Acknowledgements</h2> +<p>Thanks to <a class="reference external" href="http://saambarati.org/">Saam Barati</a>, <a class="reference external" href="https://bernsteinbear.com">Max Bernstein</a>, <a class="reference external" href="https://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/unser-team/team/schmidt">Joshua Schmidt</a> and <a class="reference external" href="https://martinfriedrichberger.net/">Martin +Berger</a>, for great feedback on drafts of this post!</p> +</section>jittestinghttps://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.htmlSun, 11 Dec 2022 18:00:00 GMTHow to make your code 80 times fasterhttps://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.htmlAntonio Cuni<div class="document" id="how-to-make-your-code-80-times-faster"> +I often hear people who are happy because PyPy makes their code 2 times faster +or so. Here is a short personal story which shows PyPy can go well beyond +that.<br> +<br> +<strong>DISCLAIMER</strong>: this is not a silver bullet or a general recipe: it worked in +this particular case, it might not work so well in other cases. But I think it +is still an interesting technique. Moreover, the various steps and +implementations are shown in the same order as I tried them during the +development, so it is a real-life example of how to proceed when optimizing +for PyPy.<br> +<br> +Some months ago I <a class="reference external" href="https://github.com/antocuni/evolvingcopter">played a bit</a> with evolutionary algorithms: the ambitious +plan was to automatically evolve a logic which could control a (simulated) +quadcopter, i.e. 
a <a class="reference external" href="https://en.wikipedia.org/wiki/PID_controller">PID controller</a> (<strong>spoiler</strong>: it doesn't fly).<br> +<br> +The idea is to have an initial population of random creatures: at each +generation, the ones with the best fitness survive and reproduce with small, +random variations.<br> +<br> +However, for the scope of this post, the actual task at hand is not so +important, so let's jump straight to the code. To drive the quadcopter, a +<tt class="docutils literal">Creature</tt> has a <tt class="docutils literal">run_step</tt> method which runs at each <tt class="docutils literal">delta_t</tt> (<a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/creature.py">full +code</a>):<br> +<pre class="code python literal-block"><span class="keyword">class</span> <span class="name class">Creature</span><span class="punctuation">(</span><span class="name builtin">object</span><span class="punctuation">):</span> + <span class="name">INPUTS</span> <span class="operator">=</span> <span class="literal number integer">2</span> <span class="comment single"># z_setpoint, current z position</span> + <span class="name">OUTPUTS</span> <span class="operator">=</span> <span class="literal number integer">1</span> <span class="comment single"># PWM for all 4 motors</span> + <span class="name">STATE_VARS</span> <span class="operator">=</span> <span class="literal number integer">1</span> + <span class="operator">...</span> + + <span class="keyword">def</span> <span class="name function">run_step</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="name">inputs</span><span class="punctuation">):</span> + <span class="comment single"># state: [state_vars ... inputs]</span> + <span class="comment single"># out_values: [state_vars, ... 
outputs]</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">[</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">:]</span> <span class="operator">=</span> <span class="name">inputs</span> + <span class="name">out_values</span> <span class="operator">=</span> <span class="name">np</span><span class="operator">.</span><span class="name">dot</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="punctuation">,</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">)</span> <span class="operator">+</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">constant</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">[:</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">]</span> <span class="operator">=</span> <span class="name">out_values</span><span class="punctuation">[:</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">]</span> + <span class="name">outputs</span> <span class="operator">=</span> <span class="name">out_values</span><span class="punctuation">[</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">:]</span> + <span class="keyword">return</span> <span class="name">outputs</span> +</pre> +<ul class="simple"> +<li><tt class="docutils literal">inputs</tt> is a numpy array containing the 
desired setpoint and the current +position on the Z axis;</li> +<li><tt class="docutils literal">outputs</tt> is a numpy array containing the thrust to give to the motors. To +start easy, all the 4 motors are constrained to have the same thrust, so +that the quadcopter only travels up and down the Z axis;</li> +<li><tt class="docutils literal">self.state</tt> contains arbitrary values of unknown size which are passed from +one step to the next;</li> +<li><tt class="docutils literal">self.matrix</tt> and <tt class="docutils literal">self.constant</tt> contains the actual logic. By putting +the "right" values there, in theory we could get a perfectly tuned PID +controller. These are randomly mutated between generations.</li> +</ul> +<tt class="docutils literal">run_step</tt> is called at 100Hz (in the virtual time frame of the simulation). At each +generation, we test 500 creatures for a total of 12 virtual seconds each. So, +we have a total of 600,000 executions of <tt class="docutils literal">run_step</tt> at each generation.<br> +<br> +At first, I simply tried to run this code on CPython; here is the result:<br> +<pre class="code literal-block">$ python -m ev.main +Generation 1: ... [population = 500] [12.06 secs] +Generation 2: ... [population = 500] [6.13 secs] +Generation 3: ... [population = 500] [6.11 secs] +Generation 4: ... [population = 500] [6.09 secs] +Generation 5: ... [population = 500] [6.18 secs] +Generation 6: ... [population = 500] [6.26 secs] +</pre> +Which means ~6.15 seconds/generation, excluding the first.<br> +<br> +Then I tried with PyPy 5.9:<br> +<pre class="code literal-block">$ pypy -m ev.main +Generation 1: ... [population = 500] [63.90 secs] +Generation 2: ... [population = 500] [33.92 secs] +Generation 3: ... [population = 500] [34.21 secs] +Generation 4: ... [population = 500] [33.75 secs] +</pre> +Ouch! We are ~5.5x slower than CPython. This was kind of expected: numpy is +based on cpyext, which is infamously slow. 
(Actually, <a class="reference external" href="https://pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html">we are working on +that</a> and on the <tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch we are already faster than +CPython, but this will be the subject of another blog post.)<br> +<br> +So, let's try to avoid cpyext. The first obvious step is to use <a class="reference external" href="https://doc.pypy.org/en/latest/faq.html#what-about-numpy-numpypy-micronumpy">numpypy</a> +instead of numpy (actually, there is a <a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/pypycompat.py">hack</a> to use just the micronumpy +part). Let's see if the speed improves:<br> +<pre class="code literal-block">$ pypy -m ev.main # using numpypy +Generation 1: ... [population = 500] [5.60 secs] +Generation 2: ... [population = 500] [2.90 secs] +Generation 3: ... [population = 500] [2.78 secs] +Generation 4: ... [population = 500] [2.69 secs] +Generation 5: ... [population = 500] [2.72 secs] +Generation 6: ... [population = 500] [2.73 secs] +</pre> +So, ~2.7 seconds on average: this is 12x faster than PyPy+numpy, and more than +2x faster than the original CPython. At this point, most people would be happy +and go tweeting how PyPy is great.<br> +<br> +In general, when talking of CPython vs PyPy, I am rarely satified of a 2x +speedup: I know that PyPy can do much better than this, especially if you +write code which is specifically optimized for the JIT. For a real-life +example, have a look at <a class="reference external" href="https://capnpy.readthedocs.io/en/latest/benchmarks.html">capnpy benchmarks</a>, in which the PyPy version is +~15x faster than the heavily optimized CPython+Cython version (both have been +written by me, and I tried hard to write the fastest code for both +implementations).<br> +<br> +So, let's try to do better. 
As usual, the first thing to do is to profile and +see where we spend most of the time. Here is the <a class="reference external" href="https://vmprof.com/#/449ca8ee-3ab2-49d4-b6f0-9099987e9000">vmprof profile</a>. We spend a +lot of time inside the internals of numpypy, and allocating tons of temporary +arrays to store the results of the various operations.<br> +<br> +Also, let's look at the <a class="reference external" href="https://vmprof.com/#/28fd6e8f-f103-4bf4-a76a-4b65dbd637f4/traces">jit traces</a> and search for the function <tt class="docutils literal">run</tt>: +this is loop in which we spend most of the time, and it is composed of 1796 +operations. The operations emitted for the line <tt class="docutils literal"><span class="pre">np.dot(...)</span> + +self.constant</tt> are listed between lines 1217 and 1456. Here is the excerpt +which calls <tt class="docutils literal"><span class="pre">np.dot(...)</span></tt>; most of the ops are cheap, but at line 1232 we +see a call to the RPython function <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/blob/release-pypy3.5-v5.10.0/pypy/module/micronumpy/ndarray.py#L1160">descr_dot</a>; by looking at the +implementation we see that it creates a new <tt class="docutils literal">W_NDimArray</tt> to store the +result, which means it has to do a <tt class="docutils literal">malloc()</tt>:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-_h6BuLTtEO8/Wfb6BXDg93I/AAAAAAAABNY/BY2XBg4ZtwokB9f1mWSmzI9gn_qanb81QCLcBGAs/s1600/2017-10-trace1.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="450" src="https://4.bp.blogspot.com/-_h6BuLTtEO8/Wfb6BXDg93I/AAAAAAAABNY/BY2XBg4ZtwokB9f1mWSmzI9gn_qanb81QCLcBGAs/s640/2017-10-trace1.png" width="640"></a></div> +<br> +The implementation of the <tt class="docutils literal">+ self.constant</tt> part is also interesting: +contrary the former, the call to <tt class="docutils 
literal">W_NDimArray.descr_add</tt> has been inlined by +the JIT, so we have a better picture of what's happening; in particular, we +can see the call to <tt class="docutils literal">__0_alloc_with_del____</tt> which allocates the +<tt class="docutils literal">W_NDimArray</tt> for the result, and the <tt class="docutils literal">raw_malloc</tt> which allocates the +actual array. Then we have a long list of 149 simple operations which set the +fields of the resulting array, construct an iterator, and finally do a +<tt class="docutils literal">call_assembler</tt>: this is the actual logic to do the addition, which was +JITted independently; <tt class="docutils literal">call_assembler</tt> is one of the operations to do +JIT-to-JIT calls:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-vmo0pWharIU/Wfb3VfwHjxI/AAAAAAAABNE/a6Em09qZizwGiWJeTbGzKfHQH70dB7RKgCEwYBhgL/s1600/2017-10-trace2.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="640" src="https://1.bp.blogspot.com/-vmo0pWharIU/Wfb3VfwHjxI/AAAAAAAABNE/a6Em09qZizwGiWJeTbGzKfHQH70dB7RKgCEwYBhgL/s640/2017-10-trace2.png" width="625"></a></div> +<br> +All of this is very suboptimal: in this particular case, we know that the +shape of <tt class="docutils literal">self.matrix</tt> is always <tt class="docutils literal">(2, 3)</tt>: so, we are doing an incredible +amount of work, including calling <tt class="docutils literal">malloc()</tt> twice for the temporary arrays, just to +call two functions which ultimately do a total of 6 multiplications +and 6 additions. Note also that this is not a fault of the JIT: CPython+numpy +has to do the same amount of work, just hidden inside C calls.<br> +<br> +One possible solution to this nonsense is a well known compiler optimization: +loop unrolling. 
From the compiler point of view, unrolling the loop is always +risky because if the matrix is too big you might end up emitting a huge blob +of code, possibly uselss if the shape of the matrices change frequently: this +is the main reason why the PyPy JIT does not even try to do it in this case.<br> +<br> +However, we <strong>know</strong> that the matrix is small, and always of the same +shape. So, let's unroll the loop manually:<br> +<pre class="code python literal-block"><span class="keyword">class</span> <span class="name class">SpecializedCreature</span><span class="punctuation">(</span><span class="name">Creature</span><span class="punctuation">):</span> + + <span class="keyword">def</span> <span class="name function magic">__init__</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="operator">*</span><span class="name">args</span><span class="punctuation">,</span> <span class="operator">**</span><span class="name">kwargs</span><span class="punctuation">):</span> + <span class="name">Creature</span><span class="operator">.</span><span class="name function magic">__init__</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="operator">*</span><span class="name">args</span><span class="punctuation">,</span> <span class="operator">**</span><span class="name">kwargs</span><span class="punctuation">)</span> + <span class="comment single"># store the data in a plain Python list</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span> <span class="operator">=</span> <span class="name builtin">list</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="operator">.</span><span class="name">ravel</span><span class="punctuation">())</span> <span 
class="operator">+</span> <span class="name builtin">list</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">constant</span><span class="punctuation">)</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span> <span class="operator">=</span> <span class="punctuation">[</span><span class="literal number float">0.0</span><span class="punctuation">]</span> + <span class="keyword">assert</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="operator">.</span><span class="name">shape</span> <span class="operator">==</span> <span class="punctuation">(</span><span class="literal number integer">2</span><span class="punctuation">,</span> <span class="literal number integer">3</span><span class="punctuation">)</span> + <span class="keyword">assert</span> <span class="name builtin">len</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span><span class="punctuation">)</span> <span class="operator">==</span> <span class="literal number integer">8</span> + + <span class="keyword">def</span> <span class="name function">run_step</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="name">inputs</span><span class="punctuation">):</span> + <span class="comment single"># state: [state_vars ... inputs]</span> + <span class="comment single"># out_values: [state_vars, ... 
outputs]</span> + <span class="name">k0</span><span class="punctuation">,</span> <span class="name">k1</span><span class="punctuation">,</span> <span class="name">k2</span><span class="punctuation">,</span> <span class="name">q0</span><span class="punctuation">,</span> <span class="name">q1</span><span class="punctuation">,</span> <span class="name">q2</span><span class="punctuation">,</span> <span class="name">c0</span><span class="punctuation">,</span> <span class="name">c1</span> <span class="operator">=</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span> + <span class="name">s0</span> <span class="operator">=</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span><span class="punctuation">[</span><span class="literal number integer">0</span><span class="punctuation">]</span> + <span class="name">z_sp</span><span class="punctuation">,</span> <span class="name">z</span> <span class="operator">=</span> <span class="name">inputs</span> + <span class="comment single">#</span> + <span class="comment single"># compute the output</span> + <span class="name">out0</span> <span class="operator">=</span> <span class="name">s0</span><span class="operator">*</span><span class="name">k0</span> <span class="operator">+</span> <span class="name">z_sp</span><span class="operator">*</span><span class="name">k1</span> <span class="operator">+</span> <span class="name">z</span><span class="operator">*</span><span class="name">k2</span> <span class="operator">+</span> <span class="name">c0</span> + <span class="name">out1</span> <span class="operator">=</span> <span class="name">s0</span><span class="operator">*</span><span class="name">q0</span> <span class="operator">+</span> <span class="name">z_sp</span><span class="operator">*</span><span class="name">q1</span> <span class="operator">+</span> <span class="name">z</span><span 
class="operator">*</span><span class="name">q2</span> <span class="operator">+</span> <span class="name">c1</span> + <span class="comment single">#</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span><span class="punctuation">[</span><span class="literal number integer">0</span><span class="punctuation">]</span> <span class="operator">=</span> <span class="name">out0</span> + <span class="name">outputs</span> <span class="operator">=</span> <span class="punctuation">[</span><span class="name">out1</span><span class="punctuation">]</span> + <span class="keyword">return</span> <span class="name">outputs</span> +</pre> +In the <a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/creature.py#L100">actual code</a> there is also a sanity check which asserts that the +computed output is the very same as the one returned by <tt class="docutils literal">Creature.run_step</tt>.<br> +<br> +So, let's try to see how it performs. First, with CPython:<br> +<pre class="code literal-block">$ python -m ev.main +Generation 1: ... [population = 500] [7.61 secs] +Generation 2: ... [population = 500] [3.96 secs] +Generation 3: ... [population = 500] [3.79 secs] +Generation 4: ... [population = 500] [3.74 secs] +Generation 5: ... [population = 500] [3.84 secs] +Generation 6: ... [population = 500] [3.69 secs] +</pre> +This looks good: 60% faster than the original CPython+numpy +implementation. Let's try on PyPy:<br> +<pre class="code literal-block">Generation 1: ... [population = 500] [0.39 secs] +Generation 2: ... [population = 500] [0.10 secs] +Generation 3: ... [population = 500] [0.11 secs] +Generation 4: ... [population = 500] [0.09 secs] +Generation 5: ... [population = 500] [0.08 secs] +Generation 6: ... [population = 500] [0.12 secs] +Generation 7: ... [population = 500] [0.09 secs] +Generation 8: ... [population = 500] [0.08 secs] +Generation 9: ... 
[population = 500] [0.08 secs] +Generation 10: ... [population = 500] [0.08 secs] +Generation 11: ... [population = 500] [0.08 secs] +Generation 12: ... [population = 500] [0.07 secs] +Generation 13: ... [population = 500] [0.07 secs] +Generation 14: ... [population = 500] [0.08 secs] +Generation 15: ... [population = 500] [0.07 secs] +</pre> +Yes, it's not an error. After a couple of generations, it stabilizes at around +~0.07-0.08 seconds per generation. This is around <strong>80 (eighty) times faster</strong> +than the original CPython+numpy implementation, and around 35-40x faster than +the naive PyPy+numpypy one.<br> +<br> +Let's look at the <a class="reference external" href="https://vmprof.com/#/402af746-2966-4403-a61d-93015abac033/traces">trace</a> again: it no longer contains expensive calls, and +certainly no more temporary <tt class="docutils literal">malloc()</tt> s. The core of the logic is between +lines 386-416, where we can see that it does fast C-level multiplications and +additions: <tt class="docutils literal">float_mul</tt> and <tt class="docutils literal">float_add</tt> are translated straight into +<tt class="docutils literal">mulsd</tt> and <tt class="docutils literal">addsd</tt> x86 instructions.<br> +<br> +As I said before, this is a very particular example, and the techniques +described here do not always apply: it is not realistic to expect an 80x +speedup on arbitrary code, unfortunately. However, it clearly shows the potential of PyPy when +it comes to high-speed computing. 
And most importantly, it's not a toy +benchmark which was designed specifically to have good performance on PyPy: +it's a real world example, albeit small.<br> +<br> +You might be also interested in the talk I gave at last EuroPython, in which I +talk about a similar topic: "The Joy of PyPy JIT: abstractions for free" +(<a class="reference external" href="https://ep2017.europython.eu/conference/talks/the-joy-of-pypy-jit-abstractions-for-free">abstract</a>, <a class="reference external" href="https://speakerdeck.com/antocuni/the-joy-of-pypy-jit-abstractions-for-free">slides</a> and <a class="reference external" href="https://www.youtube.com/watch?v=NQfpHQII2cU">video</a>).<br> +<br> +<div class="section" id="how-to-reproduce-the-results"> +<h3> +How to reproduce the results</h3> +<pre class="code literal-block">$ git clone https://github.com/antocuni/evolvingcopter +$ cd evolvingcopter +$ {python,pypy} -m ev.main --no-specialized --no-numpypy +$ {python,pypy} -m ev.main --no-specialized +$ {python,pypy} -m ev.main +</pre> +</div> +</div>jitprofilingspeedhttps://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.htmlMon, 30 Oct 2017 10:15:00 GMTAlmost There - PyPy's ARM Backendhttps://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.htmlDavid Schneider<div style="text-align: left;"> +In this post I want to give an update on the status of the ARM backend for PyPy's JIT and describe some of the issues and details of the backend.</div> +<div class="section" id="current-status"> +<br> +<h2> + + + + +Current Status</h2> +It has been a more than a year that I have been working on the ARM backend. Now it is in a shape, that we can measure meaningful numbers and also ask for some feedback. 
Since the <a class="reference external" href="https://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html">last post about the backend</a> we have added support for floating point operations as well as for PyPy's framework GCs. Another area of work was to keep up with the constant improvements done in the main development branch, such as out-of-line guards, labels, etc. It has been possible for about a year to cross-translate the PyPy Python interpreter and other interpreters such as <a class="reference external" href="https://bitbucket.org/cfbolz/pyrolog/">Pyrolog</a>, with a JIT, to run benchmarks on ARM. Up until now there remained some hard to track bugs that would cause the interpreter to crash with a segmentation fault in certain cases when running with the JIT on ARM. Lately it was possible to run all benchmarks without problems, but when running the translation toolchain itself it would crash. During the last PyPy sprint in <a class="reference external" href="https://www.pypy.org/posts/2011/12/leysin-winter-sprint-6862532189897876336.html">Leysin</a> Armin and I managed to fix several of these hard to track bugs in the ARM backend with the result that it is now possible to run the PyPy translator on ARM itself (at least until it runs out of memory), which is a kind of litmus test for the backend itself and used to crash before. Just to point it out, we are not able to complete a PyPy translation on ARM, because on the hardware we have currently available there is not enough memory. But up to the point we run out of memory the JIT does not hit any issues.<br> +<br></div> +<div class="section" id="implementation-details"> +<h2> + + + + +Implementation Details</h2> +The hardware requirements to run the JIT on ARM follow those for Ubuntu on ARM which targets ARMv7 with a VFP unit running in little endian mode. 
The JIT can be translated without floating point support, but there might be a few places that need to be fixed to fully work in this setting. We are targeting the ARM instruction set, because at least at the time we decided to use it seemed to be the best choice in terms of speed while having some size overhead compared to the Thumb2 instruction set. It appears that the Thumb2 instruction set should give comparable speed with better code density but has a few restriction on the number of registers available and the use of conditional execution. Also the implementation is a bit easier using a fixed width instruction set and we can use the full set of registers in the generated code when using the ARM instruction set.<br> +<br></div> +<div class="section" id="the-calling-convention-on-arm"> +<h2> + + + + +The calling convention on ARM</h2> +The calling convention on ARM uses 4 of the general purpose registers to pass arguments to functions, further arguments are passed on the stack. The presence of a floating point unit is not required for ARM cores, for this reason there are different ways of handling floats with relation to the calling convention. There is a so called soft-float calling convention that is independent of the presence of a floating point unit. For this calling convention floating point arguments to functions are stored in the general purpose registers and on the stack. Passing floats around this way works with software and hardware floating point implementations. But in presence of a floating point unit it produces some overhead, because floating point numbers need to be moved from the floating point unit to the core registers to do a call and moved back to the floating point registers by the callee. The alternative calling convention is the so-called hard-float calling convention which requires the presence of a floating point unit but has the advantage of getting rid of the overhead of moving floating point values around when performing a call. 
Although it would be better in the long term to support the hard-float calling convention, we need to be able to interoperate with external code compiled for the operating system we are running on. For this reason at the moment we only support the soft-float to interoperate with external code. We implemented and tested the backend on a <a class="reference external" href="https://beagleboard.org/hardware-xM/">BeagleBoard-xM</a> with a <a class="reference external" href="https://www.arm.com/products/processors/cortex-a/cortex-a8.php">Cortex-A8</a> processor running <a class="reference external" href="https://wiki.ubuntu.com/ARM">Ubuntu 11.04 for ARM</a>.<br> +<br></div> +<div class="section" id="translating-for-arm"> +<h2> + + + + +Translating for ARM</h2> +The toolchain used to translate PyPy currently is based on a <a class="reference external" href="https://maemo.gitorious.org/scratchbox2/pages/Home">Scratchbox2</a>. Scratchbox2 is a cross-compiling environment. Development had stopped for a while, but it seems to have revived again. We run a 32-bit Python interpreter on the host system and perform all calls to the compiler using a Scratchbox2 based environment. A description on how to setup the cross translation toolchain can be found <a class="reference external" href="https://bitbucket.org/pypy/pypy/src/1f07ea8076c9/pypy/doc/arm.rst">here</a>.<br> +<br></div> +<div class="section" id="results"> +<h2> + + + + +Results</h2> +The current results on ARM, as shown in the graph below, show that the JIT currently gives a speedup of about 3.5 times compared to CPython on ARM. The benchmarks were run on the before mentioned BeagleBoard-xM with a 1GHz ARM Cortex-A8 processor and 512MB of memory. The operating system on the board is Ubuntu 11.04 for ARM. We measured the PyPy interpreter with the JIT enabled and disabled comparing each to CPython Python 2.7.1+ (r271:86832) for ARM. 
The graph shows the speedup or slowdown of both PyPy versions for the different benchmarks from our benchmark suite normalized to the runtime of CPython. The data used for the graph can be seen below.<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://2.bp.blogspot.com/-uckc9tOWgnM/TykHMuuGT9I/AAAAAAAAAKg/J8_fC6RS-QA/s1600/graph.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="258" src="https://2.bp.blogspot.com/-uckc9tOWgnM/TykHMuuGT9I/AAAAAAAAAKg/J8_fC6RS-QA/s400/graph.png" width="400"></a></div> +<br> +The speedup is less than the speedup of 5.2 times we currently get on x86 on our own benchmark suite (see <a class="reference external" href="https://speed.pypy.org/">https://speed.pypy.org</a> for details). There are several possible reasons for this. Comparing the results for the interpreter without the JIT on ARM and x86 suggests that the interpreter generated by PyPy, without the JIT, has a worse performance when compared to CPython than it does on x86. Also it is quite possible that the code we are generating with the JIT is not yet optimal. Also there are some architectural constraints that produce some overhead. 
One of these differences is the handling of constants, most ARM instructions only support 8 bit (that can be shifted) immediate values, larger constants need to be loaded into a register, something that is not necessary on x86.<br> +<br> +<table border="1" class="docutils"><colgroup></colgroup><colgroup><col width="40%"></colgroup><colgroup><col width="32%"></colgroup><colgroup><col width="28%"></colgroup><tbody valign="top"> +<tr><td>Benchmark</td><td>PyPy JIT</td><td>PyPy no JIT</td></tr> +<tr><td>ai</td><td>0.484439780047</td><td>3.72756749625</td></tr> +<tr><td>chaos</td><td>0.0807291691934</td><td>2.2908692212</td></tr> +<tr><td>crypto_pyaes</td><td>0.0711114832245</td><td>3.30112318509</td></tr> +<tr><td>django</td><td>0.0977743245519</td><td>2.56779947601</td></tr> +<tr><td>fannkuch</td><td>0.210423735698</td><td>2.49163632938</td></tr> +<tr><td>float</td><td>0.154275334675</td><td>2.12053281495</td></tr> +<tr><td>go</td><td>0.330483034202</td><td>5.84628320479</td></tr> +<tr><td>html5lib</td><td>0.629264389862</td><td>3.60333138526</td></tr> +<tr><td>meteor-contest</td><td>0.984747426912</td><td>2.93838610037</td></tr> +<tr><td>nbody_modified</td><td>0.236969593082</td><td>1.40027234936</td></tr> +<tr><td>pyflate-fast</td><td>0.367447191807</td><td>2.72472422146</td></tr> +<tr><td>raytrace-simple</td><td>0.0290527461437</td><td>1.97270054339</td></tr> +<tr><td>richards</td><td>0.034575573553</td><td>3.29767342015</td></tr> +<tr><td>slowspitfire</td><td>0.786642551908</td><td>3.7397367403</td></tr> +<tr><td>spambayes</td><td>0.660324379456</td><td>3.29059863111</td></tr> +<tr><td>spectral-norm</td><td>0.063610783731</td><td>4.01788986233</td></tr> +<tr><td>spitfire</td><td>0.43617131165</td><td>2.72050579076</td></tr> +<tr><td>spitfire_cstringio</td><td>0.255538702134</td><td>1.7418593111</td></tr> +<tr><td>telco</td><td>0.102918930413</td><td>3.86388866047</td></tr> +<tr><td>twisted_iteration</td><td>0.122723986805</td><td>4.33632475491</td></tr> 
+<tr><td>twisted_names</td><td>2.42367797135</td><td>2.99878698076</td></tr> +<tr><td>twisted_pb</td><td>1.30991837431</td><td>4.48877805486</td></tr> +<tr><td>twisted_tcp</td><td>0.927033354055</td><td>2.8161624665</td></tr> +<tr><td>waf</td><td>1.02059811932</td><td>1.03793427321</td></tr> +</tbody></table> +</div> +<br> +<br> +<div class="section" id="the-next-steps-and-call-for-help"> +<h2> + + + + +The next steps and call for help</h2> +Although there probably still are some remaining issues which have not surfaced yet, the JIT backend for ARM is working. Before we can merge the backend into the main development line there are some things that we would like to do first, in particular it we are looking for a way to run the all PyPy tests to verify that things work on ARM before we can merge. Additionally there are some other longterm ideas. To do this we are looking for people willing to help, either by contributing to implement the open features or that can help us with hardware to test.<br> +<br> +The incomplete list of open topics:<br> +<ul class="simple"> +<li>We are looking for a better way to translate PyPy for ARM, than the one describe above. I am not sure if there currently is hardware with enough memory to directly translate PyPy on an ARM based system, this would require between 1.5 or 2 Gig of memory. A fully <a class="reference external" href="https://wiki.qemu.org/Main_Page">QEMU</a> based approach could also work, instead of Scratchbox2 that uses QEMU under the hood.</li> +<li>Test the JIT on different hardware.</li> +<li>Experiment with the JIT settings to find the optimal thresholds for ARM.</li> +<li>Continuous integration: We are looking for a way to run the PyPy test suite to make sure everything works as expected on ARM, here QEMU also might provide an alternative.</li> +<li>A long term plan would be to port the backend to ARMv5 ISA and improve the support for systems without a floating point unit. 
This would require to implement the ISA and create different code paths and improve the instruction selection depending on the target architecture.</li> +<li>Review of the generated machine code the JIT generates on ARM to see if the instruction selection makes sense for ARM.</li> +<li>Build a version that runs on Android.</li> +<li>Improve the tools, i.e. integrate with <a class="reference external" href="https://bitbucket.org/pypy/jitviewer">jitviewer</a>.</li> +</ul> +So if you are interested or willing to help in any way contact us.</div>armjitpypyhttps://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.htmlWed, 01 Feb 2012 09:43:00 GMTBenchmarking twistedhttps://www.pypy.org/posts/2010/03/hello-5058108566628405592.htmlMaciej Fijalkowski<p>Hello.</p> +<p>I recently did some benchmarking of <a class="reference external" href="https://twistedmatrix.com">twisted</a> on top of PyPy. For the very +impatient: <b>PyPy is up to 285% faster than CPython</b>. For more patient people, +there is a full explanation of what I did and how I performed measurements, +so they can judge for themselves.</p> +<p>The benchmarks are living in <a class="reference external" href="https://code.launchpad.net/~exarkun/+junk/twisted-benchmarks">twisted-benchmarks</a> and were mostly written +by <a class="reference external" href="https://jcalderone.livejournal.com/">Jean Paul Calderone</a>. Even though he called them "initial exploratory +investigation into a potential direction for future development resulting +in performance oriented metrics guiding the process of optimization and +avoidance of complexity regressions", they're still much much better than +average benchmarks found out there.</p> +<p>The methodology was to run each benchmark for +quite some time (about 1 minute), measuring number of requests each 5s. 
+Then I looked at <a class="reference external" href="https://codespeak.net/svn/user/fijal/txt/twisted-data.txt">dump</a> of data and substracted some time it took +for JIT-capable interpreters to warm up (up to 15s), averaging +everything after that. Averages of requests per second are in the table below (the higher the better):</p> +<table border="1" class="docutils"> +<colgroup> +<col width="19%"> +<col width="16%"> +<col width="31%"> +<col width="34%"> +</colgroup> +<tbody valign="top"> +<tr><td>benchname</td> +<td>CPython</td> +<td>Unladen swallow</td> +<td>PyPy</td> +</tr> +<tr><td>names</td> +<td style="background-color: red;">10930</td> +<td>11940 (9% faster)</td> +<td style="background-color: #0C0;">15429 (40% faster)</td> +</tr> +<tr><td>pb</td> +<td style="background-color: red;">1705</td> +<td>2280 (34% faster)</td> +<td style="background-color: #0C0;">3029 (78% faster)</td> +</tr> +<tr><td>iterations</td> +<td style="background-color: red;">75569</td> +<td>94554 (25% faster)</td> +<td style="background-color: #0C0;">291066 (285% faster)</td> +</tr> +<tr><td>accept</td> +<td>2176</td> +<td>2166 (same speed)</td> +<td>2290 (5% faster)</td> +</tr> +<tr><td>web</td> +<td>879</td> +<td>854 (3% slower)</td> +<td style="background-color: #0C0;">1040 (18% faster)</td> +</tr> +<tr><td>tcp</td> +<td>105M</td> +<td style="background-color: #0C0;">119M (7% faster)</td> +<td style="background-color: red;">60M (46% slower)</td> +</tr> +</tbody> +</table> +<p>To reproduce, run each benchmark with:</p> +<blockquote> +benchname.py -n 12 -d 5</blockquote> +<p><em>WARNING</em>: running tcp-based benchmarks that open new connection for each +request (web &amp; accept) can exhaust number of some kernel structures, +limit <strong>n</strong> or wait until next run if you see drops in request per second.</p> +<p>The first obvious thing is that various benchmarks are more or less amenable +to speedups by JIT compilation. Accept and tcp getting smallest speedups, if at +all. 
This is understandable, since JIT is mostly about reducing interpretation +and frame overhead, which is probably not large when it comes to accepting +connections. However, if you actually loop around, doing something, JIT +can give you a lot of speedup.</p> +<p>The other obvious thing is that <b>PyPy is the fastest python interpreter +here</b>, almost across-the board (Jython and IronPython won't run twisted), +except for raw tcp throughput. However, speedups can vary and I expect +this to improve after the release, as there are points, where PyPy can +be improved. Regarding raw tcp throughput - this can be a problem for +some applications and we're looking forward to improve this particular +bit.</p> +<p>The main reason to use twisted for this comparison is a lot of support from +twisted team and JP Calderone in particular, especially when it comes to +providing benchmarks. If some open source project wants to be looked at +by PyPy team, please <b>provide a reasonable set of benchmarks and infrastructure</b>.</p> +<p>If, however, you're a closed source project fighting with performance problems +of Python, we're providing <b>contracting for investigating opportunities</b>, how +PyPy and not only PyPy, can speed up your project.</p> +<p>Cheers,<br> +fijal</p> +<br> +<p>Benchmark descriptions:</p> +<ul class="simple"> +<li><em>names</em> - simple DNS server</li> +<li><em>web</em> - simple http hello world server</li> +<li><em>pb</em> - perspective broker, RPC mechanism for twisted</li> +<li><em>iterations</em> - empty twisted loop</li> +<li><em>accept</em> - number of tcp connections accepted per second</li> +<li><em>tcp</em> - raw socket transfer throughput</li> +</ul> +<p>Used interpreters:</p> +<ul class="simple"> +<li>CPython 2.6.2 - as packaged by ubuntu</li> +<li>Unladen swallow svn trunk, revision 1109</li> +<li>PyPy svn trunk, revision 71439</li> +</ul> +<p>Twisted version used: svn trunk, revision 28580</p> +<p>Machine: unfortunately 32bit virtual-machine 
under qemu, running ubuntu karmic, +on top of Quad core intel Q9550 with 6M cache. Courtesy of Michael Schneider.</p>jithttps://www.pypy.org/posts/2010/03/hello-5058108566628405592.htmlMon, 01 Mar 2010 15:05:00 GMTSome benchmarkinghttps://www.pypy.org/posts/2009/11/some-benchmarking-9211261260383281459.htmlMaciej Fijalkowski<p>Hello. +</p><p> +Recently, thanks to the surprisingly helpful Unhelpful, also known as Andrew Mahone, +we have a decent, if slightly arbitrary, set of performances graphs. +It contains a couple of benchmarks already +seen on this blog as well as some taken from <a href="https://shootout.alioth.debian.org/">The Great Computer +Language Benchmarks Game</a>. These benchmarks don't even try to represent "real applications" +as they're mostly small algorithmic benchmarks. Interpreters used: +</p> +<ol> +<li> +PyPy trunk, revision 69331 with --translation-backendopt-storesink, which is +now on by default +</li> +<li> +Unladen swallow trunk, r900 +</li> +<li>CPython 2.6.2 release</li> +</ol> +<p> +Here are the graphs; the benchmarks and the runner script are <a href="https://www.looking-glass.us/~chshrcat/python-benchmarks/">available</a> +</p> + +<a href="https://1.bp.blogspot.com/_5R1EBmwBBTs/SwRteBYi01I/AAAAAAAAAOU/BU3h_VUfmH0/s1600/result.png"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5405565815286322002" src="https://1.bp.blogspot.com/_5R1EBmwBBTs/SwRteBYi01I/AAAAAAAAAOU/BU3h_VUfmH0/s400/result.png" style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; cursor: hand; width: 400px; height: 300px;"></a> + +And zoomed in for all benchmarks except binary-trees and fannkuch. 
+<a href="https://1.bp.blogspot.com/_5R1EBmwBBTs/SwRtnxYPJII/AAAAAAAAAOc/JAvE6pYaEjI/s1600/result2.png"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5405565982788756610" src="https://1.bp.blogspot.com/_5R1EBmwBBTs/SwRtnxYPJII/AAAAAAAAAOc/JAvE6pYaEjI/s400/result2.png" style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; cursor: hand; width: 400px; height: 300px;"></a> + +<p> +As we can see, PyPy is generally somewhere between the same speed +as CPython to 50x faster (f1int). The places where we're the same +speed as CPython are places where we know we have problems - for example generators are +not sped up by the JIT and they require some work (although not as much by far +as generators &amp; Psyco :-). The glaring inefficiency is in the regex-dna benchmark. +This one clearly demonstrates that our regular expression engine is really, +really, bad and urgently requires attention. +</p> +<p> +The cool thing here is, that although these benchmarks might not represent +typical python applications, they're not uninteresting. They show +that algorithmic code does not need to be far slower in Python than in C, +so using PyPy one need not worry about algorithmic code being dramatically +slow. As many readers would agree, that kills yet another usage of C in our +lives :-) +</p> +Cheers,<br> +fijaljithttps://www.pypy.org/posts/2009/11/some-benchmarking-9211261260383281459.htmlWed, 18 Nov 2009 21:53:00 GMTLogging and nice graphshttps://www.pypy.org/posts/2009/11/hi-all-this-week-i-worked-on-improving-6515977421244851229.htmlArmin Rigo<p>Hi all,</p> + +<p>This week I worked on improving the system we use for logging. Well, it was not really a "system" but rather a pile of hacks to measure in custom ways timings and counts and display them. So now, we have a system :-)</p> + +<p>The system in question was integrated in the code for the GC and the JIT, which are two independent components as far as the source is concerned. 
However, we can now display a unified view. Here is for example pypy-c-jit running pystone for (only) 5000 iterations:</p> + +<a href="https://codespeak.net/~arigo/raw/pystone.png"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5399212353093417154" src="https://3.bp.blogspot.com/_Sg3NUJ-JhgU/Su3bB2UIuMI/AAAAAAAAAAM/2-Vf5zry_4Q/s320/pystone.png" style="cursor: pointer; cursor: hand; width: 320px; height: 51px;"></a> + +<p>The top long bar represents time. The bottom shows two summaries of the total time taken by the various components, and also plays the role of a legend to understand the colors at the top. Shades of red are the GC, shades of green are the JIT.</p> + +<p>Here is another picture, this time on pypy-c-jit running 10 iterations of richards:</p> + +<a href="https://codespeak.net/~arigo/raw/richards.png"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5399212511216555922" src="https://4.bp.blogspot.com/_Sg3NUJ-JhgU/Su3bLDXoV5I/AAAAAAAAAAU/VPxEP_hqrFk/s320/richards.png" style="cursor: pointer; cursor: hand; width: 320px; height: 19px;"></a> + +<p>We have to look more closely at various examples, but a few things immediately show up. One thing is that the GC is put under large pressure by the jit-tracing, jit-optimize and (to a lesser extent) the jit-backend components. So large in fact that the GC takes at least 60-70% of the time there. We will have to do something about it at some point. The other thing is that on richards (and it's likely generally the case), the jit-blackhole component takes a lot of time. "Blackholing" is the operation of recovering from a guard failure in the generated assembler, and falling back to the interpreter. So this is also something we will need to improve.</p> + +<p>That's it! 
The images were generated with the following commands:</p> + +<pre>PYPYLOG=/tmp/log pypy-c-jit richards.py +python pypy/tool/logparser.py draw-time /tmp/log --mainwidth=8000 --output=filename.png</pre> + +<i><b>EDIT:</b> nowadays the command-line has changed to:</i><pre>python rpython/tool/logparser.py draw-time /tmp/log --mainwidth=8000 filename.png</pre>jithttps://www.pypy.org/posts/2009/11/hi-all-this-week-i-worked-on-improving-6515977421244851229.htmlSun, 01 Nov 2009 18:59:00 GMTFirst pypy-cli-jit benchmarkshttps://www.pypy.org/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.htmlAntonio Cuni<p>As the readers of this blog <a class="reference external" href="https://www.pypy.org/posts/2008/11/porting-jit-to-cli-part-1-8712941279840156635.html">already know</a>, I've been working on porting the +JIT to CLI/.NET for the last months. Now that it's finally possible to get a +working pypy-cli-jit, it's time to do some benchmarks.</p> +<p><strong>Warning:</strong> as usual, all of this has to be considered to be an alpha version: +don't be surprised if you get a crash when trying to run pypy-cli-jit. Of +course, things are improving very quickly so it should become more and more +stable as days pass.</p> +<p>For this time, I decided to run four benchmarks. Note that for all of them we +run the main function once in advance, to let the JIT recognize the hot +loops and emit the corresponding code. Thus, the results reported do +<strong>not</strong> include the time spent by the JIT compiler itself, but give a good +measure of how good is the code generated by the JIT. At this point in time, +I know that the CLI JIT backend spends way too much time compiling stuff, but +this issue will be fixed soon.</p> +<blockquote> +<ul class="simple"> +<li><a class="reference external" href="https://paste.pocoo.org/show/145050/">f1.py</a>: this is the classic PyPy JIT benchmark. 
It is just a function +that does some computational intensive work with integers.</li> +<li><a class="reference external" href="https://paste.pocoo.org/show/143243/">floatdemo.py</a>: this is the same benchmark involving floating point +numbers that have already been described in a previous <a class="reference external" href="https://www.pypy.org/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.html">blog post</a>.</li> +<li><a class="reference external" href="https://paste.pocoo.org/show/145051/">oodemo.py</a>: this is just a microbenchmark doing object oriented stuff +such as method calls and attribute access.</li> +<li><a class="reference external" href="https://paste.pocoo.org/show/145052/">richards2.py</a>: a modified version of the classic richards.py, with a +warmup call before starting the real benchmark.</li> +</ul> +</blockquote> +<p>The benchmarks were run on a Windows machine with an Intel Pentium Dual Core +E5200 2.5GHz and 2GB RAM, both with .NET (CLR 2.0) and Mono 2.4.2.3.</p> +<p>Because of a known <a class="reference external" href="https://bugzilla.novell.com/show_bug.cgi?id=474718">mono bug</a>, if you use a version older than 2.1 you need +to pass the option <tt class="docutils literal"><span class="pre">-O=-branch</span></tt> to mono when running pypy-cli-jit, else it +will just loop forever.</p> +<p>For comparison, we also run the same benchmarks with IronPython 2.0.1 and +IronPython 2.6rc1. 
Note that IronPython 2.6rc1 does not work with mono.</p> +<p>So, here are the results (expressed in seconds) with Microsoft CLR:</p> +<blockquote> +<table border="1" class="docutils"> +<colgroup> +<col width="15%"> +<col width="20%"> +<col width="15%"> +<col width="12%"> +<col width="20%"> +<col width="18%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Benchmark</th> +<th class="head">pypy-cli-jit</th> +<th class="head">ipy 2.0.1</th> +<th class="head">ipy 2.6</th> +<th class="head">ipy2.01/ pypy</th> +<th class="head">ipy2.6/ pypy</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>f1</td> +<td>0.028</td> +<td>0.145</td> +<td>0.136</td> +<td>5.18x</td> +<td>4.85x</td> +</tr> +<tr><td>floatdemo</td> +<td>0.671</td> +<td>0.765</td> +<td>0.812</td> +<td>1.14x</td> +<td>1.21x</td> +</tr> +<tr><td>oodemo</td> +<td>1.25</td> +<td>4.278</td> +<td>3.816</td> +<td>3.42x</td> +<td>3.05x</td> +</tr> +<tr><td>richards2</td> +<td>1228</td> +<td>442</td> +<td>670</td> +<td>0.36x</td> +<td>0.54x</td> +</tr> +</tbody> +</table> +</blockquote> +<p>And with Mono:</p> +<blockquote> +<table border="1" class="docutils"> +<colgroup> +<col width="21%"> +<col width="29%"> +<col width="21%"> +<col width="29%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Benchmark</th> +<th class="head">pypy-cli-jit</th> +<th class="head">ipy 2.0.1</th> +<th class="head">ipy2.01/ pypy</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>f1</td> +<td>0.042</td> +<td>0.695</td> +<td>16.54x</td> +</tr> +<tr><td>floatdemo</td> +<td>0.781</td> +<td>1.218</td> +<td>1.55x</td> +</tr> +<tr><td>oodemo</td> +<td>1.703</td> +<td>9.501</td> +<td>5.31x</td> +</tr> +<tr><td>richards2</td> +<td>720</td> +<td>862</td> +<td>1.20x</td> +</tr> +</tbody> +</table> +</blockquote> +<p>These results are very interesting: under the CLR, we are between 5x faster +and 3x slower than IronPython 2.0.1, and between 4.8x faster and 1.8x slower +than IronPython 2.6. 
On the other hand, on mono we are consistently faster +than IronPython, up to 16x. Also, it is interesting to note that +pypy-cli runs faster on CLR than mono for all benchmarks except richards2.</p> +<p>I've not investigated yet, but I think that the culprit is the terrible +behaviour of tail calls on CLR: as I already wrote in <a class="reference external" href="https://www.pypy.org/posts/2008/12/porting-jit-to-cli-part-3-3519327524638923621.html">another blog post</a>, +tail calls are ~10x slower than normal calls on CLR, while being only ~2x +slower than normal calls on mono. richards2 is probably the benchmark that +makes most use of tail calls, thus explaining why we have a much better result +on mono than CLR.</p> +<p>The next step is probably to find an alternative implementation that does not +use tail calls: this probably will also improve the time spent by the JIT +compiler itself, which is not reported in the numbers above but that so far it +is surely too high to be acceptable. Stay tuned.</p>clijitpypyhttps://www.pypy.org/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.htmlThu, 15 Oct 2009 13:36:00 GMTPyPy's JIT now supports floatshttps://www.pypy.org/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.htmlMaciej Fijalkowski<p> +Hello. +</p> + +<p> +We've just merged branch which adds float support to x86 backend. +This means that floating point operations are now super fast +in PyPy's JIT. Let's have a look at example, provided by +<a href="https://lazypython.blogspot.com/">Alex Gaynor</a> +and stolen from <a href="https://factor-language.blogspot.com/2009/08/performance-comparison-between-factor.html">Factor blog</a>. +</p> + +<p> +The original version of the <a href="https://paste.pocoo.org/raw/142952/">benchmark</a>, was definitely tuned for the performance needs of CPython. 
+</p><p> +For running this on PyPy, I changed to a bit <a href="https://paste.pocoo.org/show/143243/">simpler version of the program</a>, +and I'll explain a few changes that I did, which reflect the current +limitations of PyPy's JIT. They're not very deep and they might be +already gone while you're reading it: +</p> + +<ul> +<li>Usage of <tt>__slots__</tt>. This is a bit ridiculous, but we spend quite a bit +  of time to speed up normal instances of new-style classes which are +  very fast, yet ones with <tt>__slots__</tt> are slower. To be fixed soon.</li> + +<li>Usage of reduce. This one is even more obscure, but reduce is not +  perceived as a thing producing loops in a program. Moving to +  a pure-Python version of reduce fixes the problem.</li> + +<li>Using <tt>x ** 2</tt> vs <tt>x * x</tt>. In PyPy, reading a local variable is a +  no-op when JITted (the same as reading local variable in C). However +  multiplication is a simpler operation than the power operation.</li> +</ul> + +<p> +I also included the original <a href="https://paste.factorcode.org/paste?id=838">Java benchmark</a>. 
Please +note that original java version is similar to my modified one +(not the one specifically tuned for CPython) +</p> + +The performance figures below (for <tt>n = 1 000 000</tt>), average of 10 runs: + +<ul> +<li>CPython 2.6: <b>7.56s</b></li> +<li>CPython &amp; psyco 2.6: <b>4.44s</b></li> +<li>PyPy: <b>1.63s</b></li> +<li>Java (JVM 1.6, client mode): <b>0.77s</b></li> +</ul> + +<p> +and while JVM is much faster, it's very good that we can even compare :-) +</p> + +Cheers<br> +fijaljithttps://www.pypy.org/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.htmlTue, 06 Oct 2009 14:47:00 GMT \ No newline at end of file diff --git a/categories/jython.html b/categories/jython.html new file mode 100644 index 000000000..e81f3ab71 --- /dev/null +++ b/categories/jython.html @@ -0,0 +1,114 @@ + + + + + +Posts about jython | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/jython.xml b/categories/jython.xml new file mode 100644 index 000000000..927af96c6 --- /dev/null +++ b/categories/jython.xml @@ -0,0 +1,13 @@ + +PyPy (Posts about jython)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssEP2008: PyPy meets Jythonhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlholger krekel<p>One of the great events at EuroPython 2008 were our chats and meetings with the Jython and Sun people. The Jython people recently are pushing into releasing Python version 2.5 and they currently pursue many interesting sub projects. Coincidentally, PyPy also has tons of interesting areas and results :) So we eventually got into brainstorming a number of possible technical collab ideas. Further below is a first list as i wrote it down from our 10 people PyPy / Jython 30 minute close up meeting yesterday. + +It felt great to be able to talk to the Jython people this way - kudos to Sun for their clear commitments and open ways to go about things! I sense a genuine interest on fair collaboration with non-java developer communities. Seems like they are serious about not focusing on "Java this", "Java that" anymore but rather focus on the JVM platform. Good! And about language +independent interest in ambitious technology. Even Better! I am tensed to see how things go from here. 
+ +So here the list of technical collab ideas: +</p><ul><li>ctypes - try to create _rawffi module in Java for Jython, which will enable Jython to reuse our existing ctypes implementation (and have PyPy use the Jython-rawffi for its own for PyPy.JVM)</li><li> generally see to share work / (continue) collaborate regarding extension modules</li><li>Jython/PyPy (and eventually IronPython): document known differences to CPython, maybe in a PEP</li><li>Python Interpreter for Jython (in order to run CPython's .pyc files): re-use pypy's bytecode evaluator, implement a "Jython object space". </li><li>re-use rpython-extension modules for jython (e.g. SRE), by compiling them to Java and reusing as a native library.</li><li>collaborate on testing framework / benchmarking, have a common site to show test results</li><li>make py.test compatible with jython</li><li>come up with a set of "pure Python language" tests, which would gather and refactor tests from CPython, PyPy and Jython. </li><li>look into using java types / jython approaches for implementing free threading.</li><li>share knowledge regarding JIT / psyco +</li></ul>If you have any more ideas, comments or would like to join efforts, let us know! + +Cheers and thanks to <a href="https://www.sauria.com/blog/">Ted Leung</a>, <a href="https://fwierzbicki.blogspot.com/">Frank Wierzbiki</a>, <a href="https://www.zyasoft.com/pythoneering/">Jim Baker</a> and Tobias Ivarsson from Sun and Jython fame respectively, + +Holgerep2008jythonpypysunhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlThu, 10 Jul 2008 08:29:00 GMT \ No newline at end of file diff --git a/categories/kcachegrind.html b/categories/kcachegrind.html new file mode 100644 index 000000000..672d993ec --- /dev/null +++ b/categories/kcachegrind.html @@ -0,0 +1,114 @@ + + + + + +Posts about kcachegrind | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/kcachegrind.xml b/categories/kcachegrind.xml new file mode 100644 index 000000000..51123947e --- /dev/null +++ b/categories/kcachegrind.xml @@ -0,0 +1,11 @@ + +PyPy (Posts about kcachegrind)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssProfiling for fun with valgrindhttps://www.pypy.org/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.htmlMaciej Fijalkowski<p>Recently I've been doing a lot of profiling on the PyPy executables to find speed bottlenecks. <a href="https://en.wikipedia.org/wiki/Valgrind">Valgrind</a> (the original <a href="https://valgrind.org/">page</a> seems to be down) is an extremely nice tool for doing this. It has several built-in tools that give you different types of profiles. The callgrind mode provides you with a lot of information including relative call costs. The cachegrind tool gives you less information, but what it gives you (e.g. cache misses) is much more accurate. The obvious choice would be to have a way to combine the results of two profiling runs to have both. In the last days I wrote a script that does this. It's available <a href="https://codespeak.net/svn/user/fijal/pygrind">at my user's svn</a> and has a pretty intuitive command line interface. The combining calculation are not perfect yet, total costs of functions can still be a bit bogus (they can sum up to whatever) but at least the relative figures are good. This means that we can stop looking at two different types of graphs now. 
+ +An awesome tool for analyzing the profile data is <a href="https://kcachegrind.sourceforge.net/cgi-bin/show.cgi">kcachegrind.</a> + +<a href="https://4.bp.blogspot.com/_5R1EBmwBBTs/R2JjKRYuTTI/AAAAAAAAAAM/LX5ktu_FcIE/s1600-h/kcachegrind.png"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5143782752527469874" src="https://4.bp.blogspot.com/_5R1EBmwBBTs/R2JjKRYuTTI/AAAAAAAAAAM/LX5ktu_FcIE/s320/kcachegrind.png" style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer;"></a> + +Which also proves that my 12'' display is to small at least for some things :-). + + +<b>Update:</b> pygrind is available under the MIT license.</p>kcachegrindprofilingvalgrindhttps://www.pypy.org/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.htmlFri, 14 Dec 2007 11:02:00 GMT \ No newline at end of file diff --git a/categories/meta.html b/categories/meta.html new file mode 100644 index 000000000..835f13ae8 --- /dev/null +++ b/categories/meta.html @@ -0,0 +1,114 @@ + + + + + +Posts about meta | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/meta.xml b/categories/meta.xml new file mode 100644 index 000000000..934a866fd --- /dev/null +++ b/categories/meta.xml @@ -0,0 +1,87 @@ + +PyPy (Posts about meta)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssThe PyPy Blog Turns 15 Yearshttps://www.pypy.org/posts/2022/10/blog-15-years.htmlCarl Friedrich Bolz-Tereick<p>Exactly 15 years ago today we wrote the <a class="reference external" href="https://www.pypy.org/posts/2007/10/first-post-8150793557471983289.html">first blog post on the PyPy blog</a>! +Over the years, we have written 423 posts, from the <a class="reference external" href="https://www.pypy.org/posts/2007/12/faster-than-c-8057790636822502084.html">shortest</a> to the +<a class="reference external" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html">longest</a>. In 2021 we <a class="reference external" href="https://www.pypy.org/posts/2021/03/pypys-blog-has-moved.html">moved</a> from <a class="reference external" href="https://morepypy.blogspot.com">blogger</a> to our own domain.</p> +<p>The topics over the years varied widely, we published <a class="reference external" href="https://www.pypy.org/posts/2013/05/pypy-20-einstein-sandwich-635158782365435530.html">release</a> <a class="reference external" href="https://www.pypy.org/posts/2017/06/pypy-v58-released-739876359584854017.html">announcements</a>; +<a class="reference external" href="https://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.html">roadmaps</a>; <a class="reference external" href="https://www.pypy.org/posts/2010/06/blackhole-interpreter-2752965445510091289.html">JIT</a>, <a class="reference external" href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">GC</a> and <a class="reference external" 
href="https://www.pypy.org/posts/2013/10/update-on-stm-7145890443443707910.html">STM</a> <a class="reference external" href="https://www.pypy.org/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.html">updates</a>; <a class="reference external" href="https://www.pypy.org/posts/2018/06/repeating-matrix-multiplication-8641748742577945875.html">benchmarks</a>; <a class="reference external" href="https://www.pypy.org/posts/2008/10/dsseldorf-sprint-report-days-1-3-5256639868851086032.html">sprint</a>, <a class="reference external" href="https://www.pypy.org/posts/2007/11/pypy-road-show-1-new-york-and-ibm-7837076523877011699.html">trip</a> and +<a class="reference external" href="https://www.pypy.org/posts/2009/07/ecoop-2009-8415055006373020774.html">conference</a> <a class="reference external" href="https://www.pypy.org/posts/2012/04/pycon-2012-wrap-up-559575896040055505.html">reports</a>; <a class="reference external" href="https://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html">technical</a> <a class="reference external" href="https://www.pypy.org/posts/2010/11/efficiently-implementing-python-objects-3838329944323946932.html">deep</a> <a class="reference external" href="https://www.pypy.org/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.html">dives</a>; <a class="reference external" href="https://www.pypy.org/posts/2022/02/nlp-icelandic-case-study.html">case studies</a>; <a class="reference external" href="https://www.pypy.org/posts/2008/04/trying-to-get-pypy-to-run-on-python-30-5082015544752137606.html">april</a> <a class="reference external" href="https://www.pypy.org/posts/2008/04/other-aprils-fools-ideas-955926452383759016.html">fool's +jokes</a>; <a class="reference external" href="https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html">research</a> projects; <a class="reference external" 
href="https://www.pypy.org/posts/2013/02/announcing-topaz-rpython-powered-ruby-6662407703061538341.html">other</a> <a class="reference external" href="https://www.pypy.org/posts/2012/07/hello-everyone-6869934374873967346.html">languages</a> <a class="reference external" href="https://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.html">using</a> RPython; finished <a class="reference external" href="https://www.pypy.org/posts/2010/10/phd-thesis-about-pypys-cli-jit-backend-969267841095296323.html">PhD</a> +<a class="reference external" href="https://www.pypy.org/posts/2019/04/an-rpython-jit-for-lpegs-4779548053359386284.html">Bachelor</a> and <a class="reference external" href="https://www.pypy.org/posts/2008/10/prolog-jit-masters-thesis-finished-5462132148241449867.html">Master</a>, theses; pictures:</p> +<a class="reference external image-reference" href="https://www.pypy.org/images/2022-pypy-pictures-collage.jpg"> +<img alt="a collage of photos taken at PyPy sprints" src="https://www.pypy.org/images/2022-pypy-pictures-collage-small.jpg"> +</a> +<p>and diagrams:</p> +<a class="reference external image-reference" href="https://www.pypy.org/images/2022-pypy-diagrams-collage.png"> +<img alt="a collage of diagrams from previous blog posts" src="https://www.pypy.org/images/2022-pypy-diagrams-collage-small.png"> +</a> +<p>Quite a number of blog posts were very early iterations of papers that we +published later, here are a few that I can remember:</p> +<ul class="simple"> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2009/03/applying-tracing-jit-to-interpreter-3287844903778799266.html">Applying a Tracing JIT to an Interpreter</a> became <a class="reference external" href="https://dl.acm.org/doi/10.1145/1565824.1565827">Tracing the meta-level: +PyPy's tracing JIT compiler</a> at ICOOOLPS 2009, by far our most successful +paper.</p></li> +<li><p><a class="reference external" 
href="https://www.pypy.org/posts/2010/09/escape-analysis-in-pypys-jit-1780048403046080197.html">Escape Analysis in PyPy's JIT</a> became <a class="reference external" href="https://dl.acm.org/doi/10.1145/1929501.1929508">Allocation removal by partial +evaluation in a tracing JIT</a> at PEPM 2010.</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_21-6524148550848694588.html">Controlling the Tracing of an Interpreter With Hints</a> was a draft of the +paper <a class="reference external" href="https://dl.acm.org/doi/10.1145/2069172.2069181">Runtime feedback in a meta-tracing JIT for efficient dynamic +languages</a> at ICOOOLPS 2011</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2010/09/using-escape-analysis-across-loop-2887031293132023676.html">Using Escape Analysis Across Loop Boundaries for Specialization</a> was the +nucleus of <a class="reference external" href="https://dl.acm.org/doi/10.1145/2384577.2384586">Loop-aware optimizations in PyPy's tracing JIT</a> at DLS 2012.</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html">List Strategies</a> was eventually turned into the paper <a class="reference external" href="https://dl.acm.org/doi/10.1145/2509136.2509531">Storage strategies +for collections in dynamically typed languages</a> at OOPSLA 2013.</p></li> +</ul> +<section id="greatest-hits"> +<h2>Greatest Hits</h2> +<p>In terms of visitors, the top five posts on the old blog were – on the new blog +we simply don't have stats (yet?):</p> +<ol class="arabic simple"> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.html">Let's remove the global interpreter lock</a></p></li> +<li><p><a class="reference external" 
href="https://www.pypy.org/posts/2011/04/tutorial-writing-interpreter-with-pypy-3785910476193156295.html">Tutorial: Writing an Interpreter with PyPy, Part 1</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2019/10/pypys-new-json-parser-492911724084305501.html">PyPy's new JSON parser</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.html">PyPy gets funding from Mozilla for Python 3.5 support</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.html">How to make your code 80 times faster</a></p></li> +</ol> +<p>The number of posts per year developed like this:</p> +<img alt="/images/2022-pypy-posts-per-year.svg" src="https://www.pypy.org/images/2022-pypy-posts-per-year.svg"> +<p>The most prolific authors are:</p> +<ol class="arabic simple"> +<li><p><a class="reference external" href="https://www.pypy.org/authors/maciej-fijalkowski.html">Maciej Fijałkowski</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/authors/carl-friedrich-bolz-tereick.html">Carl Friedrich Bolz-Tereick</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/authors/armin-rigo.html">Armin Rigo</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/authors/antonio-cuni.html">Antonio Cuni</a></p></li> +<li><p><a class="reference external" href="https://www.pypy.org/authors/mattip.html">Matti Picus</a></p></li> +</ol> +<p>Several blog posts have made it to the Hacker News front page, three of them to +number 1:</p> +<ul class="simple"> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html">PyPy-STM: first “interesting” release</a> (<a class="reference external" 
href="https://news.ycombinator.com/item?id=7991404">discussion</a>)</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.html">Let's Remove the Global Interpreter Lock</a> (<a class="reference external" href="https://news.ycombinator.com/item?id=15008636">discussion</a>)</p></li> +<li><p><a class="reference external" href="https://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.html">Inside cpyext: Why emulating CPython C API is so Hard</a> (<a class="reference external" href="https://news.ycombinator.com/item?id=18040664">discussion</a>)</p></li> +</ul> +</section> +<section id="personal-favourites"> +<h2>Personal Favourites</h2> +<p>While looking through the posts, there were a few that stood out to me in some +way, so here's a subjective list of ones that I had fun looking at again:</p> +<ul class="simple"> +<li><p>2008: <a class="reference external" href="https://www.pypy.org/posts/2008/10/sprint-discussions-jit-generator-3301578822967655604.html">Sprint Discussions: JIT Generator Planning</a></p></li> +<li><p>2009: <a class="reference external" href="https://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.html">PyPy gets a new compiler</a></p></li> +<li><p>2010: <a class="reference external" href="https://www.pypy.org/posts/2010/12/oh-and-btw-pypy-gets-funding-through-3568486750776147382.html">Oh, and btw: PyPy gets funding through "Eurostars"</a></p></li> +<li><p>2011: <a class="reference external" href="https://www.pypy.org/posts/2011/07/realtime-image-processing-in-python-6985924592886873374.html">Realtime image processing in Python</a></p></li> +<li><p>2012: <a class="reference external" href="https://www.pypy.org/posts/2012/06/architecture-of-cppyy-9077100041707701102.html">Architecture of Cppyy</a></p></li> +<li><p>2013: <a class="reference external" 
href="https://www.pypy.org/posts/2013/02/10-years-of-pypy-634401291726575821.html">10 years of PyPy</a></p></li> +<li><p>2014: <a class="reference external" href="https://www.pypy.org/posts/2014/11/pypy-io-improvements-1042070332447047674.html">PyPy IO Improvements</a></p></li> +<li><p>2015: <a class="reference external" href="https://www.pypy.org/posts/2015/10/automatic-simd-vectorization-support-in-639063580401330508.html">Automatic SIMD vectorization support in PyPy</a></p></li> +<li><p>2016: <a class="reference external" href="https://www.pypy.org/posts/2016/04/pypy-enterprise-edition-3688275697656890948.html">PyPy Enterprise Edition</a></p></li> +<li><p>2017: <a class="reference external" href="https://www.pypy.org/posts/2017/03/async-http-benchmarks-on-pypy3-1092124994927894138.html">Async HTTP benchmarks on PyPy3</a></p></li> +<li><p>2018: <a class="reference external" href="https://www.pypy.org/posts/2018/04/improving-syntaxerror-in-pypy-5733639208090522433.html">Improving SyntaxError in PyPy</a></p></li> +<li><p>2018: <a class="reference external" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#incentives-of-oss-compared-to-academia">The First 15 Years of PyPy — a Personal Retrospective</a></p></li> +<li><p>2019: <a class="reference external" href="https://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.html">PyPy for low-latency systems</a></p></li> +<li><p>2020: <a class="reference external" href="https://www.pypy.org/posts/2020/02/pypy-and-cffi-have-moved-to-heptapod-5791595152472747032.html">PyPy and CFFI have moved to Heptapod</a></p></li> +<li><p>2021: <a class="reference external" href="https://www.pypy.org/posts/2021/04/ways-pypy-graphviz.html">Some Ways that PyPy uses Graphviz</a></p></li> +</ul> +<p>We'd like to thank our authors, guest authors, commenters, users and readers who +have stuck with us through one and a half decades! 
If there are any particular +topics you would like to read something about, or any guest posts you'd like to +write, let us know!</p> +</section>metahttps://www.pypy.org/posts/2022/10/blog-15-years.htmlSun, 30 Oct 2022 12:00:00 GMT \ No newline at end of file diff --git a/categories/numpy.html b/categories/numpy.html new file mode 100644 index 000000000..049a07a3a --- /dev/null +++ b/categories/numpy.html @@ -0,0 +1,183 @@ + + + + + +Posts about numpy | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Posts about numpy

                + + +
                +
                + + \ No newline at end of file diff --git a/categories/numpy.xml b/categories/numpy.xml new file mode 100644 index 000000000..cce755eb4 --- /dev/null +++ b/categories/numpy.xml @@ -0,0 +1,167 @@ + +PyPy (Posts about numpy)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssNumPy on PyPy - Status Updatehttps://www.pypy.org/posts/2014/04/numpy-on-pypy-status-update-1103134247318103282.htmlBrian Kearns<p>Work on NumPy on PyPy continued in March, though at a lighter pace than the previous few months. Progress was made on both compatibility and speed fronts. Several behavioral issues reported to the bug tracker were resolved. The most significant of these was probably the correction of casting to built-in Python types. Previously, int/long conversions of numpy scalars such as inf/nan/1e100 would return bogus results. Now, they raise or return values, as appropriate.<br> +<br> +On the speed front, enhancements to the PyPy JIT were made to support virtualizing the raw_store/raw_load memory operations used in numpy arrays. Further work remains here in virtualizing the alloc_raw_storage when possible. This will allow scalars to have storages but still be virtualized when possible in loops.<br> +<br> +Aside from continued work on compatibility/speed of existing code, we also hope to begin implementing the C-level components of other numpy modules such as mtrand, nditer, linalg, and so on. Several approaches could be taken to get C-level code in these modules working, ranging from reimplementing in RPython to interfacing with existing code with CFFI, if possible. 
The appropriate approach depends on many factors and will probably vary from module to module.<br><br>To try out PyPy + NumPy, grab a <a href="https://buildbot.pypy.org/nightly/trunk/">nightly PyPy</a> and install our <a href="https://bitbucket.org/pypy/numpy">NumPy fork</a>. Feel free to report comments/issues to IRC, our mailing list, or bug tracker. Thanks to the contributors to the <a href="https://pypy.org/numpydonate.html">NumPy on PyPy</a> proposal for supporting this work.</p>numpyhttps://www.pypy.org/posts/2014/04/numpy-on-pypy-status-update-1103134247318103282.htmlTue, 15 Apr 2014 20:08:00 GMTNumPy on PyPy - Progress in Februaryhttps://www.pypy.org/posts/2014/03/numpy-status-update-february-1245769841736493525.htmlBrian Kearns<p>More progress was made on the NumPy front in the past month. On the compatibility front, we now pass ~130 more tests from NumPy's suite since the end of January. Currently, we pass 2336 tests out of 3265 tests run, with many of the failures representing portions of NumPy that we don't plan to implement in the near future (object dtypes, unicode, etc). There are still some failures that do represent issues, such as special indexing cases and failures to respect subclassed ndarrays in return values, which we do plan to resolve. There are also some unimplemented components and ufuncs remaining which we hope to implement, such as nditer and mtrand. Overall, the most common array functionality should be working.<br> +<br> +Additionally, I began to take a look at some of the loops generated by our code. One widely used loop is dot, and we were running about 5x slower than NumPy's C version. I was able to optimize the dot loop and also the general array iterator to get us to ~1.5x NumPy C time on dot operations of various sizes. Further progress in this area could be made by using CFFI to tie into BLAS libraries, when available. 
Also, work remains in examining traces generated for our other loops and checking for potential optimizations.<br> +<br> +To try out PyPy + NumPy, grab a <a href="https://buildbot.pypy.org/nightly/trunk/">nightly PyPy</a> and install our <a href="https://bitbucket.org/pypy/numpy">NumPy fork</a>. Feel free to report comments/issues to IRC, our mailing list, or bug tracker. Thanks to the contributors to the <a href="https://pypy.org/numpydonate.html">NumPy on PyPy</a> proposal for supporting this work.<br> +<br> +Cheers,<br> +Brian</p>numpyhttps://www.pypy.org/posts/2014/03/numpy-status-update-february-1245769841736493525.htmlFri, 07 Mar 2014 05:05:00 GMTNumPy Status Update - December/Januaryhttps://www.pypy.org/posts/2014/02/numpy-status-update-decemberjanuary-4292961614234099787.htmlBrian Kearns<p>Work continued on the NumPy + PyPy front steadily in December and more lightly in January. The continued focus was compatibility, targeting incorrect or unimplemented features that appeared in multiple NumPy test suite failures. We now pass ~2/3 of the NumPy test suite. The biggest improvements were made in these areas:<br> +<br> +- Bugs in conversions of arrays/scalars to/from native types<br> +- Fix cases where we would choose incorrect dtypes when initializing or computing results<br> +- Improve handling of subclasses of ndarray through computations<br> +- Support some optional arguments for array methods that are used in the pure-python part of NumPy<br> +- Support additional attributes in arrays, array.flags, and dtypes<br> +- Fix some indexing corner cases that arise in NumPy testing<br> +- Implemented part of numpy.fft (cffti and cfftf)<br> +<br> +Looking forward, we plan to continue improving the correctness of the existing implemented NumPy functionality, while also beginning to look at performance. The initial focus for performance will be to look at areas where we are significantly worse than CPython+NumPy. 
Those interested in trying these improvements out will need a <a href="https://buildbot.pypy.org/nightly/trunk/">PyPy nightly</a>, and an install of the <a href="https://bitbucket.org/pypy/numpy">PyPy NumPy fork</a>. Thanks again to the <a href="https://pypy.org/numpydonate.html">NumPy on PyPy</a> donors for funding this work.</p>numpyhttps://www.pypy.org/posts/2014/02/numpy-status-update-decemberjanuary-4292961614234099787.htmlThu, 06 Feb 2014 19:06:00 GMTNumPy Status Update - Novemberhttps://www.pypy.org/posts/2013/12/numpy-status-update-november-364321959153372759.htmlBrian Kearns<p>Since the PyPy 2.2 release last month, more progress has been made on the NumPy compatibility front. Initial work has been directed by running the NumPy test suite and targeting failures that appear most frequently, along with fixing the few bugs reported on the bug tracker.<br> +<br> +Improvements were made in these areas:<br> +- Many missing/broken scalar functionalities were added/fixed. The scalar API should match up more closely with arrays now.<br> +- Some missing dtype functionality was added (newbyteorder, hasobject, descr, etc)<br> +- Support for optional arguments (axis, order) was added to some ndarray functions<br> +- Fixed some corner cases for string/record types<br> +<br> +Most of these improvements went onto trunk after 2.2 was split, so if you're interested in trying them out or running into problems on 2.2, try the +<a href="https://buildbot.pypy.org/nightly/trunk/">nightly</a>.<br> +<br> +Thanks again to the <a href="https://pypy.org/numpydonate.html">NumPy on PyPy</a> donors who make this continued progress possible.<br> +<br> +Cheers,<br> +Brian</p>numpyhttps://www.pypy.org/posts/2013/12/numpy-status-update-november-364321959153372759.htmlMon, 09 Dec 2013 23:05:00 GMTPyPy Leysin Winter Sprint (11-19st January 2014)https://www.pypy.org/posts/2013/11/pypy-leysin-winter-sprint-11-19st-8860782754173653661.htmlArmin Rigo<p>The next PyPy sprint will be in Leysin, 
Switzerland, for the ninth time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.</p> +<h3>Goals and topics of the sprint</h3> +<ul class="simple"> +<li>Py3k: work towards supporting Python 3 in PyPy</li> +<li>NumPyPy: work towards supporting the numpy module in PyPy</li> +<li>STM: work towards supporting Software Transactional Memory</li> +<li>And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off for skiing.</li> +</ul> +<br> +<h3>Exact times</h3> +<p>For a change, and as an attempt to simplify things, I specified the +dates as 11-19 January 2014, where 11 and 19 are travel days. We will +work full days between the 12 and the 18. You are of course allowed to +show up for a part of that time only, too.</p> +<h3>Location &amp; Accommodation</h3> +<p>Leysin, Switzerland, "same place as before". Let me refresh your +memory: both the sprint venue and the lodging will be in a very spacious +pair of chalets built specifically for bed &amp; breakfast: +<a class="reference external" href="https://www.ermina.ch/">https://www.ermina.ch/</a>. The place has a good ADSL Internet connection +with wireless installed. You can of course arrange your own lodging +anywhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue), but I definitely recommend +lodging there too -- you won't find a better view anywhere else (though +you probably won't get much worse ones easily, either :-)</p> +<p>Please <em>confirm</em> that you are coming so that we can adjust the +reservations as appropriate. The rate so far has been around 60 CHF a +night all included in 2-person rooms, with breakfast. 
There are larger +rooms too (less expensive per person) and maybe the possibility to get a +single room if you really want to.</p> +<p>Please register by Mercurial:</p> +<pre class="literal-block"> +https://bitbucket.org/pypy/extradoc/ +https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2014 +</pre> +<p>or on the pypy-dev mailing list if you do not yet have check-in rights:</p> +<blockquote> +<a class="reference external" href="https://mail.python.org/mailman/listinfo/pypy-dev">https://mail.python.org/mailman/listinfo/pypy-dev</a></blockquote> +<p>You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around -- bring a EU-format power strip if you +have one.</p>numpyhttps://www.pypy.org/posts/2013/11/pypy-leysin-winter-sprint-11-19st-8860782754173653661.htmlSat, 30 Nov 2013 08:57:00 GMTNumPy status updatehttps://www.pypy.org/posts/2013/11/numpy-status-update-1609808546418002632.htmlRomain Guillebert<span style="font-family: inherit;">Here is what has been happening with NumPy in PyPy in October thanks to the people who donated to the </span><a href="https://pypy.org/numpydonate.html" style="font-family: inherit;" target="_blank">NumPyPy proposal</a><span style="font-family: inherit;">:</span><br> +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">The biggest change is that we shifted to using an <a href="https://bitbucket.org/pypy/numpy" target="_blank">external fork of numpy</a> rather than a minimal numpypy module. 
The idea is that we will be able to </span>reuse<span style="font-family: inherit;"> most of the upstream pure-python numpy components, replacing the C modules with appropriate RPython micronumpy pieces at the correct places in the module namespace.</span><br> +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">The numpy fork should work just as well as the old numpypy for functionality that existed previously, and also include much new functionality from the pure-python numpy pieces that simply hadn't been imported yet in numpypy. However, this new functionality will not have been "hand picked" to only include pieces that work, so you may run into functionality that relies on unimplemented components (which should fail with user-level exceptions).</span><br> +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">This setup also allows us to run the entire numpy test suite, which will help in directing future compatibility development. The recent PyPy release includes these changes, so download it and let us know how it works! 
And if you want to live on the edge, the nightly includes even more numpy progress made in November.</span><br> +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">To install the fork, download the latest release, and then install numpy eith</span>er separately with a virtualenv: <tt class="docutils literal">pip install git+https://bitbucket.org/pypy/numpy.git</tt>; or directly: <tt class="docutils literal">git clone https://bitbucket.org/pypy/numpy.git; cd numpy; pypy setup.py install</tt>.<br> + +<br><i><b>EDIT:</b> if you install numpy as root, you may need to also import it once as root before it works: <tt class="docutils literal">sudo pypy -c 'import numpy'</tt></i><br> + +<span style="font-family: inherit;"><br> +</span> <span style="font-family: inherit;">Along with this change, progress was made in fixing internal micronumpy bugs and increasing compatibility:</span><br> +<ul><li><span style="font-family: inherit;"><span style="font-family: inherit;">Fixed a bug with strings in record dtypes</span></span></li> +<li><span style="font-family: inherit;"><span style="font-family: inherit;">Fixed a bug wh</span><span style="background-color: white; font-family: inherit;">ere the multiplication of an ndarray with a Python int or float resulted in loss of the array's dtype</span></span></li> +<li><span style="font-family: inherit;"><span style="background-color: white; font-family: inherit;">Fixed several segfaults encountered in the numpy test suite (suite should run now without segfaulting)</span></span></li> +</ul><span style="font-family: inherit;"><span style="background-color: white;"><br> +</span></span> <span style="font-family: inherit;"><span style="background-color: white;">We also began working on __array_prepare__ and __array_wrap__, which are necessary pieces for a working matplotlib module.</span></span><br> +<span style="font-family: inherit;"><span style="background-color: white;"><br> +</span></span> Cheers,<br> 
+Romain and Briannumpyhttps://www.pypy.org/posts/2013/11/numpy-status-update-1609808546418002632.htmlFri, 15 Nov 2013 19:30:00 GMTNumpy Status Updatehttps://www.pypy.org/posts/2013/09/numpy-status-update-5160363918470470887.htmlRomain Guillebert<p>Hi everyone<br> +<br> +Thanks to the people who donated money to the <a href="https://pypy.org/numpydonate.html" target="_blank">numpy proposal</a>, here is what I've been working on recently :<br> +<br> +- Fixed conversion from a numpy complex number to a python complex number<br> +- Implement the rint ufunc<br> +- Make numpy.character usable as a dtype<br> +- Fix ndarray(dtype=str).fill()<br> +- Various fixes on boolean and fancy indexing<br> +<br> +Cheers<br> +Romain</p>numpyhttps://www.pypy.org/posts/2013/09/numpy-status-update-5160363918470470887.htmlWed, 25 Sep 2013 17:49:00 GMTNumPy road forwardhttps://www.pypy.org/posts/2013/08/numpy-road-forward-4210065750776753500.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> +<p>Hello everyone.</p> +<p>This is the roadmap for numpy effort in PyPy as discussed on the London sprint. +First, the highest on our priority list is to finish the low-level part +of the numpy module. What +we'll do is to finish the RPython part of numpy and provide a pip installable +numpypy repository that includes the pure python part of Numpy. This would +contain the original Numpy with a few minor changes.</p> +<p>Second, we need to work on the JIT support that will make NumPy on PyPy +faster. 
In detail:</p> +<ul class="simple"> +<li>reenable the lazy loop evaluation</li> +<li>optimize bridges, which is depending on optimizer refactorings</li> +<li>SSE support</li> +</ul> +<p>On the compatibility front, there were some independent attempts into +making the following stuff working:</p> +<ul class="simple"> +<li>f2py</li> +<li>C API (in fact, PyArray_* API is partly present in the nightly builds of +PyPy)</li> +<li>matplotlib (both using PyArray_* API and embedding CPython runtime in PyPy)</li> +<li>scipy</li> +</ul> +<p>In order to make all of the above happen faster, it would be helpful to raise +more funds. You can donate to <a class="reference external" href="https://pypy.org/numpydonate.html">PyPy's NumPy project</a> on our website. Note +that PyPy is a member of SFC which is a 501(c)(3) US non-profit, so donations +from US companies can be tax-deducted.</p> +<p>Cheers,<br> +fijal, arigo, ronan, rguillebert, anto and others</p> +<br></div>numpyhttps://www.pypy.org/posts/2013/08/numpy-road-forward-4210065750776753500.htmlTue, 27 Aug 2013 11:20:00 GMTNumPyPy Status Updatehttps://www.pypy.org/posts/2013/08/numpypy-status-update-3401163348519734658.htmlRomain Guillebert<p>Hello everyone<br> +<br> +As expected, nditer is a lot of work. 
I'm going to pause my work on it for now and focus on simpler and more important things, here is a list of what I implemented :<br> +</p><ul> +<li>Fixed a bug on 32 bit that made int32(123).dtype == dtype("int32") fail</li> +<li>Fixed a bug on the pickling of array slices</li> +<li>The external loop flag is implemented on the nditer class</li> +<li>The c_index, f_index and multi_index flags are also implemented</li> +<li>Add dtype("double") and dtype("str")</li> +<li>C-style iteration is available for nditer</li> +</ul> +Cheers<br> +Romain Guillebertnumpyhttps://www.pypy.org/posts/2013/08/numpypy-status-update-3401163348519734658.htmlThu, 08 Aug 2013 19:01:00 GMTNumPyPy status updatehttps://www.pypy.org/posts/2013/06/numpypy-status-update-3846626188716521472.htmlRomain Guillebert<p>Hello everyone,<br> +<br> +May was the first month I was paid to work on NumPyPy (thanks to all who donated!), here is what I worked on during this period :<br> +<br> +</p><ul> +<li>It is now possible to use subarrays.</li> +<li>It is now possible to pickle ndarrays (including those using subarrays), dtypes and scalars, the pickling protocol is the same as numpy's.</li> +</ul> +<div> +<br></div> +<div> +For June, I plan to work on the nditer class, it seems that there's enough work for an entire month.</div> +<br> +Cheers<br> +Romain Guillebertnumpyhttps://www.pypy.org/posts/2013/06/numpypy-status-update-3846626188716521472.htmlMon, 03 Jun 2013 14:09:00 GMT \ No newline at end of file diff --git a/categories/parser.html b/categories/parser.html new file mode 100644 index 000000000..6f310b5d6 --- /dev/null +++ b/categories/parser.html @@ -0,0 +1,114 @@ + + + + + +Posts about parser | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/parser.xml b/categories/parser.xml new file mode 100644 index 000000000..0671649f2 --- /dev/null +++ b/categories/parser.xml @@ -0,0 +1,6 @@ + +PyPy (Posts about parser)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy gets a new compilerhttps://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.htmlBenjamin Peterson<p>Today, I merged the parser-compiler branch, which I have been working on over the summer. It contained a total rewrite of both PyPy's Python parser and AST compiler. PyPy's old parser was (in)famous internally for being complicated and slow (with many algorithmic complexities greater than O(n)). The new parser is a simple as <a href="https://codespeak.net/viewvc/pypy/trunk/pypy/interpreter/pyparser/parser.py?view=markup">I could make it</a> LL(1) parser like CPython (though it doesn't share the hacks of CPython's parser).</p> + +<p>The new compiler is based on the <a href="https://doc.python.org/3.1/library/ast">Abstract Syntax Trees (AST) that CPython 2.5 introduced</a> instead of PyPy's old AST based on the <a href="https://doc.python.org/library/compiler">compiler package's</a>. This means that Python code running on PyPy will be able to use the same _ast interface as CPython. PyPy's _ast implementation supports AST features that CPython 2.6 added, including <a href="https://pythonic.pocoo.org/2008/3/29/ast-compilation-from-python">compiling modified AST to bytecode and executing it</a>. In this rewrite, some more obscure compiler features were added, too. For example, jumps in bytecode can now be greater than 65535 bytes! 
(That's like an if statement with 7000 lines of code in the body.)</p> + +<p>While the PyPy translation toolchain still has many obscure details and hacks, this merge completes the process of making the actual Python interpreter very clean. Hopefully, this will make adding new features much easier and make PyPy less frustrating to maintain as well as providing application level code with an improved AST interface!</p>compilerparserspeedhttps://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.htmlTue, 25 Aug 2009 16:05:00 GMT \ No newline at end of file diff --git a/categories/performance.html b/categories/performance.html new file mode 100644 index 000000000..044c35db9 --- /dev/null +++ b/categories/performance.html @@ -0,0 +1,117 @@ + + + + + +Posts about performance | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/performance.xml b/categories/performance.xml new file mode 100644 index 000000000..08bc703cb --- /dev/null +++ b/categories/performance.xml @@ -0,0 +1,92 @@ + +PyPy (Posts about performance)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssRPython-based emulator speeds up RISC-V simulation over 15xhttps://www.pypy.org/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.htmlCarl Friedrich Bolz-Tereick<p>In cooperation with <a class="reference external" href="https://riscv.org/">RISC-V International</a>, who funded a part of this project, +we recently created a workflow to +use RPython to take a <a class="reference external" href="https://github.com/riscv/sail-riscv#riscv-sail-model">Sail RISC-V</a> model and automatically create a RISC-V ISA +emulator from it, which we call <a class="reference external" href="https://docs.pydrofoil.org">Pydrofoil</a>. The simulator sped up booting a +linux emulator from 35 minutes (using the standard Sail-generated emulator in +C) to 2 minutes, a speedup of 17.5x. 
More details about the process are in the +<a class="reference external" href="https://riscv.org/blog/2023/05/how-to-speed-up-the-emulating-process-with-pydrofoil-carl-friedrich/">RISC-V blog post</a>.</p> +<p>A few take-aways from the project:</p> +<ul class="simple"> +<li><p>While PyPy has shown it can speed up generic python code <a class="reference external" href="https://speed.pypy.org">about 4x</a>, the +technology behind PyPy can really shine in other areas.</p></li> +<li><p>RPython is malleable and can be molded to many tasks, the RPython meta-JIT is +very flexible.</p></li> +<li><p>A JIT is well-suited for the problem of emulation, because it can +perform dynamic binary translation.</p></li> +</ul> +<p>PyPy can solve real world performance problems, even somewhat unusual ones. +Please <a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">get in touch</a> and let us know how we can help you solve yours!</p>casestudyperformancehttps://www.pypy.org/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.htmlTue, 16 May 2023 11:22:35 GMTRepeated string concatenation is quadratic in PyPy (and CPython)https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.htmlCarl Friedrich Bolz-Tereick<p>This is a super brief blog post responding to an <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3885">issue</a> that we got on the PyPy +issue tracker. I am moving my response to the blog (with permission of the +submitter) to have a post to point to, since it's a problem that comes up with +some regularity. It's also documented on our page of <a class="reference external" href="https://doc.pypy.org/en/latest/cpython_differences.html?highlight=join#performance-differences">differences between PyPy +and CPython</a> but I thought an additional blog post might be good.</p> +<p>The issue pointed out that a small program that operates on strings is much +slower on PyPy compared to CPython. 
The program is a solution for 2016's +Advent of Code <a class="reference external" href="https://adventofcode.com/2016/day/16">Day 16</a> and looks like this:</p> +<div class="code"><pre class="code python"><a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-1" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-1" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-1"></a><span class="k">def</span> <span class="nf">dragon</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-2" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-2" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-2"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">a</span><span class="p">[::</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'0'</span><span class="p">,</span><span class="s1">'r'</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'1'</span><span class="p">,</span><span class="s1">'0'</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'r'</span><span class="p">,</span><span class="s1">'1'</span><span class="p">)</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-3" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-3" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-3"></a> <span class="k">return</span> <span class="n">a</span><span class="o">+</span><span class="s1">'0'</span><span class="o">+</span><span class="n">b</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-4" 
name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-4" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-4"></a> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-5" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-5" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-5"></a><span class="k">def</span> <span class="nf">diffstr</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-6" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-6" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-6"></a> <span class="n">b</span> <span class="o">=</span> <span class="s2">""</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-7" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-7" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-7"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="nb">len</span><span class="p">(</span><span class="n">a</span><span class="p">),</span><span class="mi">2</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-8" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-8" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-8"></a> <span class="n">b</span> <span class="o">+=</span> <span class="p">[</span><span class="s1">'0'</span><span class="p">,</span><span class="s1">'1'</span><span class="p">][</span><span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">a</span><span 
class="p">[</span><span class="n">i</span><span class="o">+</span><span class="mi">1</span><span class="p">]]</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-9" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-9" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-9"></a> <span class="k">return</span> <span class="n">b</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-10" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-10" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-10"></a> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-11" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-11" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-11"></a><span class="k">def</span> <span class="nf">iterdiff</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-12" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-12" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-12"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">a</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-13" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-13" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-13"></a> <span class="k">while</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">b</span><span class="p">)</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">):</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-14" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-14" 
href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-14"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">diffstr</span><span class="p">(</span><span class="n">b</span><span class="p">)</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-15" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-15" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-15"></a> <span class="k">return</span> <span class="n">b</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-16" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-16" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-16"></a> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-17" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-17" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-17"></a><span class="n">size</span> <span class="o">=</span> <span class="mi">35651584</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-18" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-18" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-18"></a><span class="n">initstate</span> <span class="o">=</span> <span class="s1">'10010000000110000'</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-19" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-19" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-19"></a><span class="k">while</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">initstate</span><span class="p">)</span> <span class="o">&lt;</span> <span class="n">size</span><span class="p">):</span> +<a 
id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-20" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-20" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-20"></a> <span class="n">initstate</span> <span class="o">=</span> <span class="n">dragon</span><span class="p">(</span><span class="n">initstate</span><span class="p">)</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-21" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-21" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-21"></a><span class="n">initstate</span> <span class="o">=</span> <span class="n">initstate</span><span class="p">[:</span><span class="n">size</span><span class="p">]</span> +<a id="rest_code_2392824f2ba74fb299c7ea44c8e4838c-22" name="rest_code_2392824f2ba74fb299c7ea44c8e4838c-22" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_2392824f2ba74fb299c7ea44c8e4838c-22"></a><span class="nb">print</span><span class="p">(</span><span class="n">iterdiff</span><span class="p">(</span><span class="n">initstate</span><span class="p">))</span> +</pre></div> +<p>The submitter pointed out, that the program is fast on CPython (~8s on my +laptop) and slow (didn't finish) on PyPy.</p> +<p>The reason for the performance difference is that <code class="docutils literal">+=</code> on strings in a loop +has quadratic complexity in PyPy, which is what <code class="docutils literal">diffstr</code> does. To see the +quadraticness, consider that to add a character at the end of the string, the +beginning of the string needs to be copied into a new chunk of memory. If the +loop runs <code class="docutils literal">n</code> times, that means there are</p> +<p><code class="docutils literal">1 + 2 + 3 + ... 
+ n = n * (n + 1) // 2</code></p> +<p>character copies.</p> +<p>Repeated string concatenations are in principle also quadratic in CPython, but +CPython has an <a class="reference external" href="https://docs.python.org/2/whatsnew/2.4.html#optimizations">optimization</a> that makes them sometimes not quadratic, which is +what makes this program not too slow in CPython.</p> +<p>In order to fix the problem on PyPy it's best to use a list for the string +parts, which has the right amortized O(1) complexity for <code class="docutils literal">.append</code> calls, and +then use <code class="docutils literal">str.join</code> after the loop:</p> +<div class="code"><pre class="code python"><a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-1" name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-1" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-1"></a><span class="k">def</span> <span class="nf">diffstr</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-2" name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-2" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-2"></a> <span class="n">b</span> <span class="o">=</span> <span class="p">[]</span> +<a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-3" name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-3" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-3"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="nb">len</span><span class="p">(</span><span class="n">a</span><span class="p">),</span><span class="mi">2</span><span class="p">):</span> +<a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-4" 
name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-4" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-4"></a> <span class="n">b</span><span class="o">.</span><span class="n">append</span><span class="p">([</span><span class="s1">'0'</span><span class="p">,</span><span class="s1">'1'</span><span class="p">][</span><span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="o">+</span><span class="mi">1</span><span class="p">]])</span> +<a id="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-5" name="rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-5" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_ad4f7a3ef35a44588b2a5efc3fee9a33-5"></a> <span class="k">return</span> <span class="s2">""</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">b</span><span class="p">)</span> +</pre></div> +<p>With this change the program becomes a little bit faster on CPython for me, and +on PyPy it stops being quadratic and runs in ~3.5s.</p> +<p>In general, it's best not to rely on the presence of this optimization in +CPython either. Sometimes, a small innocent looking changes will break CPython's +optimization. E.g. 
this useless change makes CPython also take ages:</p> +<div class="code"><pre class="code python"><a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-1" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-1" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-1"></a><span class="k">def</span> <span class="nf">diffstr</span><span class="p">(</span><span class="n">a</span><span class="p">):</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-2" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-2" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-2"></a> <span class="n">b</span> <span class="o">=</span> <span class="s2">""</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-3" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-3" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-3"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="nb">len</span><span class="p">(</span><span class="n">a</span><span class="p">),</span><span class="mi">2</span><span class="p">):</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-4" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-4" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-4"></a> <span class="n">b</span> <span class="o">+=</span> <span class="p">[</span><span class="s1">'0'</span><span class="p">,</span><span class="s1">'1'</span><span class="p">][</span><span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">a</span><span class="p">[</span><span class="n">i</span><span class="o">+</span><span 
class="mi">1</span><span class="p">]]</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-5" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-5" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-5"></a> <span class="n">c</span> <span class="o">=</span> <span class="n">b</span> +<a id="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-6" name="rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-6" href="https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html#rest_code_634bfd4fb6b14908b50e7dd140ed4b3a-6"></a> <span class="k">return</span> <span class="n">b</span> +</pre></div> +<p>The reason why this change breaks the optimization in CPython is that it only +triggers if the reference count of <code class="docutils literal">b</code> is 1, in which case it uses <code class="docutils literal">realloc</code> +on the string. The change is unrealistic of course, but you could imagine a +related that keeps an extra reference to <code class="docutils literal">b</code> for a sensible reason.</p> +<p>Another situation in which the optimization doesn't work is discussed in this +<a class="reference external" href="https://stackoverflow.com/a/44487738">StackOverflow question</a> with an answer by Tim Peters.</p> +<p>It's unlikely that PyPy will fix this. 
We had a prototype how to do it, but it +seems very little "production" code uses <cite>+=</cite> on strings in a loop, and the fix +makes the strings implementation quite a bit more complex.</p> +<p>So, in summary, don't use repeated concatenations in a loop!</p>performancehttps://www.pypy.org/posts/2023/01/string-concatenation-quadratic.htmlWed, 04 Jan 2023 09:00:00 GMT \ No newline at end of file diff --git a/categories/profiling.html b/categories/profiling.html new file mode 100644 index 000000000..ec3bf148a --- /dev/null +++ b/categories/profiling.html @@ -0,0 +1,123 @@ + + + + + +Posts about profiling | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/profiling.xml b/categories/profiling.xml new file mode 100644 index 000000000..90ed0af7c --- /dev/null +++ b/categories/profiling.xml @@ -0,0 +1,843 @@ + +PyPy (Posts about profiling)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssInside cpyext: Why emulating CPython C API is so Hardhttps://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.htmlAntonio Cuni<br> +<div class="document" id="inside-cpyext-why-emulating-cpython-c-api-is-so-hard"> +<tt class="docutils literal">cpyext</tt> is PyPy's subsystem which provides a compatibility +layer to compile and run CPython C extensions inside PyPy. Often people ask +why a particular C extension doesn't work or is very slow on PyPy. +Usually it is hard to answer without going into technical details. The goal of +this blog post is to explain some of these technical details, so that we can +simply link here instead of explaining again and again :).<br> +From a 10.000 foot view, <tt class="docutils literal">cpyext</tt> is PyPy's version of <tt class="docutils literal">"Python.h"</tt>. Every time +you compile an extension which uses that header file, you are using <tt class="docutils literal">cpyext</tt>. +This includes extension explicitly written in C (such as <tt class="docutils literal">numpy</tt>) and +extensions which are generated from other compilers/preprocessors +(e.g. <tt class="docutils literal">Cython</tt>).<br> +At the time of writing, the current status is that most C extensions "just +work". Generally speaking, you can simply <tt class="docutils literal">pip install</tt> them, +provided they use the public, <a class="reference external" href="https://docs.python.org/2/c-api/index.html">official C API</a> instead of poking at private +implementation details. 
However, the performance of cpyext is generally +poor. A Python program which makes heavy use of <tt class="docutils literal">cpyext</tt> extensions +is likely to be slower on PyPy than on CPython.<br> +Note: in this blog post we are talking about Python 2.7 because it is still +the default version of PyPy: however most of the implementation of <tt class="docutils literal">cpyext</tt> is +shared with PyPy3, so everything applies to that as well.<br> +<div class="section" id="c-api-overview"> +<h1> +C API Overview</h1> +In CPython, which is written in C, Python objects are represented as <tt class="docutils literal">PyObject*</tt>, +i.e. (mostly) opaque pointers to some common "base struct".<br> +CPython uses a very simple memory management scheme: when you create an +object, you allocate a block of memory of the appropriate size on the heap. +Depending on the details, you might end up calling different allocators, but +for the sake of simplicity, you can think that this ends up being a call to +<tt class="docutils literal">malloc()</tt>. The resulting block of memory is initialized and cast to +<tt class="docutils literal">PyObject*</tt>: this address never changes during the object lifetime, and the +C code can freely pass it around, store it inside containers, retrieve it +later, etc.<br> +Memory is managed using reference counting. When you create a new reference to +an object, or you discard a reference you own, you have to <a class="reference external" href="https://docs.python.org/2/c-api/refcounting.html#c.Py_INCREF">increment</a> or +<a class="reference external" href="https://docs.python.org/2/c-api/refcounting.html#c.Py_DECREF">decrement</a> the reference counter accordingly. When the reference counter goes to +0, it means that the object is no longer used and can safely be +destroyed. 
Again, we can simplify and say that this results in a call to +<tt class="docutils literal">free()</tt>, which finally releases the memory which was allocated by <tt class="docutils literal">malloc()</tt>.<br> +Generally speaking, the only way to operate on a <tt class="docutils literal">PyObject*</tt> is to call the +appropriate API functions. For example, to convert a given <tt class="docutils literal">PyObject*</tt> to a C +integer, you can use <a class="reference external" href="https://docs.python.org/2/c-api/int.html#c.PyInt_AsLong">PyInt_AsLong()</a>; to add two objects together, you can +call <a class="reference external" href="https://docs.python.org/2/c-api/number.html#c.PyNumber_Add">PyNumber_Add()</a>.<br> +Internally, PyPy uses a similar approach. All Python objects are subclasses of +the RPython <tt class="docutils literal">W_Root</tt> class, and they are operated by calling methods on the +<tt class="docutils literal">space</tt> singleton, which represents the interpreter.<br> +At first, it looks very easy to write a compatibility layer: just make +<tt class="docutils literal">PyObject*</tt> an alias for <tt class="docutils literal">W_Root</tt>, and write simple RPython functions +(which will be translated to C by the RPython compiler) which call the +<tt class="docutils literal">space</tt> accordingly:<br> +<pre class="code python literal-block"><span class="keyword">def</span> <span class="name function">PyInt_AsLong</span><span class="punctuation">(</span><span class="name">space</span><span class="punctuation">,</span> <span class="name">o</span><span class="punctuation">):</span> + <span class="keyword">return</span> <span class="name">space</span><span class="operator">.</span><span class="name">int_w</span><span class="punctuation">(</span><span class="name">o</span><span class="punctuation">)</span> + +<span class="keyword">def</span> <span class="name function">PyNumber_Add</span><span class="punctuation">(</span><span 
class="name">space</span><span class="punctuation">,</span> <span class="name">o1</span><span class="punctuation">,</span> <span class="name">o2</span><span class="punctuation">):</span> + <span class="keyword">return</span> <span class="name">space</span><span class="operator">.</span><span class="name">add</span><span class="punctuation">(</span><span class="name">o1</span><span class="punctuation">,</span> <span class="name">o2</span><span class="punctuation">)</span> +</pre> +Actually, the code above is not too far from the real +implementation. However, there are tons of gory details which make it much +harder than it looks, and much slower unless you pay a lot of attention +to performance.</div> +<div class="section" id="the-pypy-gc"> +<h1> +The PyPy GC</h1> +To understand some of <tt class="docutils literal">cpyext</tt> challenges, you need to have at least a rough +idea of how the PyPy GC works.<br> +Contrarily to the popular belief, the "Garbage Collector" is not only about +collecting garbage: instead, it is generally responsible for all memory +management, including allocation and deallocation.<br> +Whereas CPython uses a combination of malloc/free/refcounting to manage +memory, the PyPy GC uses a completely different approach. It is designed +assuming that a dynamic language like Python behaves the following way:<br> +<blockquote> +<ul class="simple"> +<li>You create, either directly or indirectly, lots of objects.</li> +<li>Most of these objects are temporary and very short-lived. Think e.g. 
of +doing <tt class="docutils literal">a + b + c</tt>: you need to allocate an object to hold the temporary +result of <tt class="docutils literal">a + b</tt>, then it dies very quickly because you no longer need it +when you do the final <tt class="docutils literal">+ c</tt> part.</li> +<li>Only a small fraction of the objects survive and stay around for a while.</li> +</ul> +</blockquote> +So, the strategy is: make allocation as fast as possible; make deallocation of +short-lived objects as fast as possible; find a way to handle the remaining +small set of objects which actually survive long enough to be important.<br> +This is done using a <strong>Generational GC</strong>: the basic idea is the following:<br> +<blockquote> +<ol class="arabic simple"> +<li>We have a nursery, where we allocate "young objects" very quickly.</li> +<li>When the nursery is full, we start what we call a "minor collection".<ul> +<li>We do a quick scan to determine the small set of objects which survived so +far</li> +<li>We <strong>move</strong> these objects out of the nursery, and we place them in the +area of memory which contains the "old objects". Since the address of the +objects changes, we fix all the references to them accordingly.</li> +</ul> +</li> +</ol> +<ol class="arabic simple" start="3"> +<li>Now the nursery contains only objects which "died young". We can +discard all of them very quickly, reset the nursery, and use the same area +of memory to allocate new objects from now on.</li> +</ol> +</blockquote> +In practice, this scheme works very well and it is one of the reasons why PyPy +is much faster than CPython. However, careful readers have surely noticed +that this is a problem for <tt class="docutils literal">cpyext</tt>. On one hand, we have PyPy objects which +can potentially move and change their underlying memory address; on the other +hand, we need a way to represent them as fixed-address <tt class="docutils literal">PyObject*</tt> when we +pass them to C extensions. 
We surely need a way to handle that.</div> +<div class="section" id="pyobject-in-pypy"> +<h1> +<tt class="docutils literal">PyObject*</tt> in PyPy</h1> +Another challenge is that sometimes, <tt class="docutils literal">PyObject*</tt> structs are not completely +opaque: there are parts of the public API which expose to the user specific +fields of some concrete C struct. For example the definition of <a class="reference external" href="https://docs.python.org/2/c-api/typeobj.html">PyTypeObject</a> +which exposes many of the <tt class="docutils literal">tp_*</tt> slots to the user. +Since the low-level layout of PyPy <tt class="docutils literal">W_Root</tt> objects is completely different +than the one used by CPython, we cannot simply pass RPython objects to C; we +need a way to handle the difference.<br> +So, we have two issues so far: objects can move, and incompatible +low-level layouts. <tt class="docutils literal">cpyext</tt> solves both by decoupling the RPython and the C +representations. We have two "views" of the same entity, depending on whether +we are in the PyPy world (the movable <tt class="docutils literal">W_Root</tt> subclass) or in the C world +(the non-movable <tt class="docutils literal">PyObject*</tt>).<br> +<tt class="docutils literal">PyObject*</tt> are created lazily, only when they are actually needed. The +vast majority of PyPy objects are never passed to any C extension, so we don't +pay any penalty in that case. However, the first time we pass a <tt class="docutils literal">W_Root</tt> to +C, we allocate and initialize its <tt class="docutils literal">PyObject*</tt> counterpart.<br> +The same idea applies also to objects which are created in C, e.g. by calling +<a class="reference external" href="https://docs.python.org/2/c-api/allocation.html#c.PyObject_New">PyObject_New()</a>. At first, only the <tt class="docutils literal">PyObject*</tt> exists and it is +exclusively managed by reference counting. 
As soon as we pass it to the PyPy +world (e.g. as a return value of a function call), we create its <tt class="docutils literal">W_Root</tt> +counterpart, which is managed by the GC as usual.<br> +Here we start to see why calling cpyext modules is more costly in PyPy than in +CPython. We need to pay some penalty for all the conversions between +<tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt>.<br> +Moreover, the first time we pass a <tt class="docutils literal">W_Root</tt> to C we also need to allocate +the memory for the <tt class="docutils literal">PyObject*</tt> using a slowish "CPython-style" memory +allocator. In practice, for all the objects which are passed to C we pay more +or less the same costs as CPython, thus effectively "undoing" the speedup +guaranteed by PyPy's Generational GC under normal circumstances.</div> +<div class="section" id="maintaining-the-link-between-w-root-and-pyobject"> +<h1> +Maintaining the link between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt></h1> +We now need a way to convert between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt> and +vice-versa; also, we need to ensure that the lifetimes of the two entities +are in sync. 
In particular:<br> +<blockquote> +<ol class="arabic simple"> +<li>as long as the <tt class="docutils literal">W_Root</tt> is kept alive by the GC, we want the +<tt class="docutils literal">PyObject*</tt> to live even if its refcount drops to 0;</li> +<li>as long as the <tt class="docutils literal">PyObject*</tt> has a refcount greater than 0, we want to +make sure that the GC does not collect the <tt class="docutils literal">W_Root</tt>.</li> +</ol> +</blockquote> +The <tt class="docutils literal">PyObject*</tt> ⇨ <tt class="docutils literal">W_Root</tt> link is maintained by the special field +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/parse/cpyext_object.h#lines-5">ob_pypy_link</a> which is added to all <tt class="docutils literal">PyObject*</tt>. On a 64 bit machine this +means that all <tt class="docutils literal">PyObject*</tt> have 8 bytes of overhead, but then the +conversion is very quick, just reading the field.<br> +For the other direction, we generally don't want to do the same: the +assumption is that the vast majority of <tt class="docutils literal">W_Root</tt> objects will never be +passed to C, and adding an overhead of 8 bytes to all of them is a +waste. Instead, in the general case the link is maintained by using a +dictionary, where <tt class="docutils literal">W_Root</tt> are the keys and <tt class="docutils literal">PyObject*</tt> the values.<br> +However, for a <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/pyobject.py#lines-66">few selected</a> <tt class="docutils literal">W_Root</tt> subclasses we <strong>do</strong> maintain a +direct link using the special <tt class="docutils literal">_cpy_ref</tt> field to improve performance. 
In +particular, we use it for <tt class="docutils literal">W_TypeObject</tt> (which is big anyway, so a 8 bytes +overhead is negligible) and <tt class="docutils literal">W_NoneObject</tt>. <tt class="docutils literal">None</tt> is passed around very +often, so we want to ensure that the conversion to <tt class="docutils literal">PyObject*</tt> is very +fast. Moreover it's a singleton, so the 8 bytes overhead is negligible as +well.<br> +This means that in theory, passing an arbitrary Python object to C is +potentially costly, because it involves doing a dictionary lookup. We assume +that this cost will eventually show up in the profiler: however, at the time +of writing there are other parts of <tt class="docutils literal">cpyext</tt> which are even more costly (as we +will show later), so the cost of the dict lookup is never evident in the +profiler.</div> +<div class="section" id="crossing-the-border-between-rpython-and-c"> +<h1> +Crossing the border between RPython and C</h1> +There are two other things we need to care about whenever we cross the border +between RPython and C, and vice-versa: exception handling and the GIL.<br> +In the C API, exceptions are raised by calling <a class="reference external" href="https://docs.python.org/2/c-api/exceptions.html#c.PyErr_SetString">PyErr_SetString()</a> (or one of +<a class="reference external" href="https://docs.python.org/2/c-api/exceptions.html#exception-handling">many other functions</a> which have a similar effect), which basically works by +creating an exception value and storing it in some global variable. 
The +function then signals that an exception has occurred by returning an error value, +usually <tt class="docutils literal">NULL</tt>.<br> +On the other hand, in the PyPy interpreter, exceptions are propagated by raising the +RPython-level <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/interpreter/error.py#lines-20">OperationError</a> exception, which wraps the actual app-level +exception values. To harmonize the two worlds, whenever we return from C to +RPython, we need to check whether a C API exception was raised and if so turn it +into an <tt class="docutils literal">OperationError</tt>.<br> +We won't dig into details of <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-205">how the GIL is handled in cpyext</a>. +For the purpose of this post, it is enough to know that whenever we enter +C land, we store the current thread id into a global variable which is +accessible also from C; conversely, whenever we go back from C to RPython, we +restore this value to 0.<br> +Similarly, we need to do the inverse operations whenever we need to cross the +border between C and RPython, e.g. by calling a Python callback from C code.<br> +All this complexity is automatically handled by the RPython function +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-1757">generic_cpy_call</a>. 
If you look at the code you see that it takes care of 4 +things:<br> +<blockquote> +<ol class="arabic simple"> +<li>Handling the GIL as explained above.</li> +<li>Handling exceptions, if they are raised.</li> +<li>Converting arguments from <tt class="docutils literal">W_Root</tt> to <tt class="docutils literal">PyObject*</tt>.</li> +<li>Converting the return value from <tt class="docutils literal">PyObject*</tt> to <tt class="docutils literal">W_Root</tt>.</li> +</ol> +</blockquote> +So, we can see that calling C from RPython introduces some overhead. +Can we measure it?<br> +Assuming that the conversion between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt> has a +reasonable cost (as explained by the previous section), the overhead +introduced by a single border-cross is still acceptable, especially if the +callee is doing some non-negligible amount of work.<br> +However this is not always the case. There are basically three problems that +make (or used to make) <tt class="docutils literal">cpyext</tt> super slow:<br> +<blockquote> +<ol class="arabic simple"> +<li>Paying the border-crossing cost for trivial operations which are called +very often, such as <tt class="docutils literal">Py_INCREF</tt>.</li> +<li>Crossing the border back and forth many times, even if it's not strictly +needed.</li> +<li>Paying an excessive cost for argument and return value conversions.</li> +</ol> +</blockquote> +The next sections explain in more detail each of these problems.</div> +<div class="section" id="avoiding-unnecessary-roundtrips"> +<h1> +Avoiding unnecessary roundtrips</h1> +Prior to the <a class="reference external" href="https://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html">2017 Cape Town Sprint</a>, <tt class="docutils literal">cpyext</tt> was horribly slow, and we were +well aware of it: the main reason was that we never really paid too much +attention to performance. 
As explained in the blog post, emulating all the +CPython quirks is basically a nightmare, so better to concentrate on +correctness first.<br> +However, we didn't really know <strong>why</strong> it was so slow. We had theories and +assumptions, usually pointing at the cost of conversions between <tt class="docutils literal">W_Root</tt> +and <tt class="docutils literal">PyObject*</tt>, but we never actually measured it.<br> +So, we decided to write a set of <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">cpyext microbenchmarks</a> to measure the +performance of various operations. The result was somewhat surprising: the +theory suggests that when you do a cpyext C call, you should pay the +border-crossing costs only once, but what the profiler told us was that we +were paying the cost of <tt class="docutils literal">generic_cpy_call</tt> several times more than what we expected.<br> +After a bit of investigation, we discovered this was ultimately caused by our +"correctness-first" approach. For simplicity of development and testing, when +we started <tt class="docutils literal">cpyext</tt> we wrote everything in RPython: thus, every single API call +made from C (like the omnipresent <a class="reference external" href="https://docs.python.org/2/c-api/arg.html#c.PyArg_ParseTuple">PyArg_ParseTuple()</a>, <a class="reference external" href="https://docs.python.org/2/c-api/int.html#c.PyInt_AsLong">PyInt_AsLong()</a>, etc.) +had to cross back the C-to-RPython border. This was especially daunting for +very simple and frequent operations like <tt class="docutils literal">Py_INCREF</tt> and <tt class="docutils literal">Py_DECREF</tt>, +which CPython implements as a single assembly instruction!<br> +Another source of slow down was the implementation of <tt class="docutils literal">PyTypeObject</tt> slots. +At the C level, these are function pointers which the interpreter calls to do +certain operations, e.g. 
<a class="reference external" href="https://docs.python.org/2/c-api/typeobj.html#c.PyTypeObject.tp_new">tp_new</a> to allocate a new instance of that type.<br> +As usual, we have some magic to implement slots in RPython; in particular, +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-362">_make_wrapper</a> does the opposite of <tt class="docutils literal">generic_cpy_call</tt>: it takes a +RPython function and wraps it into a C function which can be safely called +from C, handling the GIL, exceptions and argument conversions automatically.<br> +This was very handy during the development of cpyext, but it might result in +some bad nonsense; consider what happens when you call the following C +function:<br> +<pre class="code C literal-block"><span class="keyword">static</span> <span class="name">PyObject</span><span class="operator">*</span> <span class="name function">foo</span><span class="punctuation">(</span><span class="name">PyObject</span><span class="operator">*</span> <span class="name">self</span><span class="punctuation">,</span> <span class="name">PyObject</span><span class="operator">*</span> <span class="name">args</span><span class="punctuation">)</span> +<span class="punctuation">{</span> + <span class="name">PyObject</span><span class="operator">*</span> <span class="name">result</span> <span class="operator">=</span> <span class="name">PyInt_FromLong</span><span class="punctuation">(</span><span class="literal number integer">1234</span><span class="punctuation">);</span> + <span class="keyword">return</span> <span class="name">result</span><span class="punctuation">;</span> +<span class="punctuation">}</span> +</pre> +<ol class="arabic simple"> +<li>you are in RPython and do a cpyext call to <tt class="docutils literal">foo</tt>: <strong>RPython-to-C</strong>;</li> +<li><tt class="docutils literal">foo</tt> calls <tt class="docutils literal">PyInt_FromLong(1234)</tt>, which 
is implemented in RPython: +<strong>C-to-RPython</strong>;</li> +<li>the implementation of <tt class="docutils literal">PyInt_FromLong</tt> indirectly calls +<tt class="docutils literal">PyIntType.tp_new</tt>, which is a C function pointer: <strong>RPython-to-C</strong>;</li> +<li>however, <tt class="docutils literal">tp_new</tt> is just a wrapper around an RPython function, created +by <tt class="docutils literal">_make_wrapper</tt>: <strong>C-to-RPython</strong>;</li> +<li>finally, we create our RPython <tt class="docutils literal">W_IntObject(1234)</tt>; at some point +during the <strong>RPython-to-C</strong> crossing, its <tt class="docutils literal">PyObject*</tt> equivalent is +created;</li> +<li>after many layers of wrappers, we are again in <tt class="docutils literal">foo</tt>: after we do +<tt class="docutils literal">return result</tt>, during the <strong>C-to-RPython</strong> step we convert it from +<tt class="docutils literal">PyObject*</tt> to <tt class="docutils literal">W_IntObject(1234)</tt>.</li> +</ol> +Phew! After we realized this, it was not so surprising that <tt class="docutils literal">cpyext</tt> was very +slow :). And this was a simplified example, since we are not passing a +<tt class="docutils literal">PyObject*</tt> to the API call. When we do, we need to convert it back and +forth at every step. Actually, I am not even sure that what I described was +the exact sequence of steps which used to happen, but you get the general +idea.<br> +The solution is simple: rewrite as much as we can in C instead of RPython, +to avoid unnecessary roundtrips. 
This was the topic of most of the Cape Town +sprint and resulted in the <tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch, which was +eventually <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/cpyext_avoid-roundtrip">merged</a>.<br> +Of course, it is not possible to move <strong>everything</strong> to C: there are still +operations which need to be implemented in RPython. For example, think of +<tt class="docutils literal">PyList_Append</tt>: the logic to append an item to a list is complex and +involves list strategies, so we cannot replicate it in C. However, we +discovered that a large subset of the C API can benefit from this.<br> +Moreover, the C API is <strong>huge</strong>. While we invented this new way of writing +<tt class="docutils literal">cpyext</tt> code, we still need to +convert many of the functions to the new paradigm. Sometimes the rewrite is +not automatic +or straightforward. <tt class="docutils literal">cpyext</tt> is a delicate piece of software, so it happens often +that we make a mistake and end up staring at a segfault in gdb.<br> +However, the most important takeaway is that the performance improvements we got +from this optimization are impressive, as we will detail later.</div> +<div class="section" id="conversion-costs"> +<h1> +Conversion costs</h1> +The other potential big source of slowdown is the conversion of arguments +between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt>.<br> +As explained earlier, the first time you pass a <tt class="docutils literal">W_Root</tt> to C, you need to +allocate its <tt class="docutils literal">PyObject*</tt> counterpart. 
Suppose you have a <tt class="docutils literal">foo</tt> function +defined in C, which takes a single int argument:<br> +<pre class="code python literal-block"><span class="keyword">for</span> <span class="name">i</span> <span class="operator word">in</span> <span class="name builtin">range</span><span class="punctuation">(</span><span class="name">N</span><span class="punctuation">):</span> + <span class="name">foo</span><span class="punctuation">(</span><span class="name">i</span><span class="punctuation">)</span> +</pre> +To run this code, you need to create a different <tt class="docutils literal">PyObject*</tt> for each value +of <tt class="docutils literal">i</tt>: if implemented naively, it means calling <tt class="docutils literal">N</tt> times <tt class="docutils literal">malloc()</tt> +and <tt class="docutils literal">free()</tt>, which kills performance.<br> +CPython has the very same problem, which is solved by using a <a class="reference external" href="https://en.wikipedia.org/wiki/Free_list">free list</a> to +<a class="reference external" href="https://github.com/python/cpython/blob/2.7/Objects/intobject.c#L16">allocate ints</a>. So, what we did was to simply <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/commit/d8754ab9ba6371c83eaeb80cdf8cc13a37ee0c89">steal the code</a> from CPython +and do the exact same thing. This was also done in the +<tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch, and the benchmarks show that it worked +perfectly.<br> +Every type which is converted often to <tt class="docutils literal">PyObject*</tt> must have a very fast +allocator. 
At the moment of writing, PyPy uses free lists only for ints and +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/commit/35e2fb9903f2483940d7970bd83ce8c65aa1c1a3">tuples</a>: one of the next steps on our TODO list is certainly to use this +technique with more types, like <tt class="docutils literal">float</tt>.<br> +Conversely, we also need to optimize the conversion from <tt class="docutils literal">PyObject*</tt> to +<tt class="docutils literal">W_Root</tt>: this happens when an object is originally allocated in C and +returned to Python. Consider for example the following code:<br> +<pre class="code python literal-block"><span class="keyword namespace">import</span> <span class="name namespace">numpy</span> <span class="keyword namespace">as</span> <span class="name namespace">np</span> +<span class="name">myarray</span> <span class="operator">=</span> <span class="name">np</span><span class="operator">.</span><span class="name">random</span><span class="operator">.</span><span class="name">random</span><span class="punctuation">(</span><span class="name">N</span><span class="punctuation">)</span> +<span class="keyword">for</span> <span class="name">i</span> <span class="operator word">in</span> <span class="name builtin">range</span><span class="punctuation">(</span><span class="name builtin">len</span><span class="punctuation">(</span><span class="name">myarray</span><span class="punctuation">)):</span> + <span class="name">myarray</span><span class="punctuation">[</span><span class="name">i</span><span class="punctuation">]</span> +</pre> +At every iteration, we get an item out of the array: the return type is an +instance of <tt class="docutils literal">numpy.float64</tt> (a numpy scalar), i.e. a <tt class="docutils literal">PyObject*</tt>: this is +something which is implemented by numpy entirely in C, so completely +opaque to <tt class="docutils literal">cpyext</tt>. 
We don't have any control over how it is allocated, +managed, etc., and we can assume that allocation costs are the same as on +CPython.<br> +As soon as we return these <tt class="docutils literal">PyObject*</tt> to Python, we need to allocate +their <tt class="docutils literal">W_Root</tt> equivalent. If you do it in a small loop like in the example +above, you end up allocating all these <tt class="docutils literal">W_Root</tt> inside the nursery, which is +a good thing since allocation is super fast (see the section above about the +PyPy GC).<br> +However, we also need to keep track of the <tt class="docutils literal">W_Root</tt> to <tt class="docutils literal">PyObject*</tt> link. +Currently, we do this by putting all of them in a dictionary, but it is very +inefficient, especially because most of these objects die young and thus it +is wasted work to do that for them. Currently, this is one of the biggest +unresolved problems in <tt class="docutils literal">cpyext</tt>, and it is what causes the two microbenchmarks +<tt class="docutils literal">allocate_int</tt> and <tt class="docutils literal">allocate_tuple</tt> to be very slow.<br> +We are well aware of the problem, and we have a plan for how to fix it. The +explanation is too technical for the scope of this blog post as it requires a +deep knowledge of the GC internals to be understood, but the details are +<a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/planning/cpyext.txt#L27">here</a>.</div> +<div class="section" id="c-api-quirks"> +<h1> +C API quirks</h1> +Finally, there is another source of slowdown which is beyond our control. Some +parts of the CPython C API are badly designed and expose some of the +implementation details of CPython.<br> +The major example is reference counting. 
The <tt class="docutils literal">Py_INCREF</tt> / <tt class="docutils literal">Py_DECREF</tt> API +is designed in a way that forces other implementations to emulate +refcounting even in the presence of other GC management schemes, as explained +above.<br> +Another example is borrowed references. There are API functions which <strong>do +not</strong> incref an object before returning it, e.g. <a class="reference external" href="https://docs.python.org/2/c-api/list.html#c.PyList_GetItem">PyList_GetItem()</a>. This is +done for performance reasons because we can avoid a whole incref/decref pair, +if the caller needs to handle the returned item only temporarily: the item is +kept alive because it is in the list anyway.<br> +For PyPy, this is a challenge: thanks to <a class="reference external" href="https://www.pypy.org/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html">list strategies</a>, lists are often +represented in a compact way. For example, a list containing only integers is +stored as a C array of <tt class="docutils literal">long</tt>. How to implement <tt class="docutils literal">PyList_GetItem</tt>? We +cannot simply create a <tt class="docutils literal">PyObject*</tt> on the fly, because the caller will never +decref it and it will result in a memory leak.<br> +The current solution is very inefficient. The first time we do a +<tt class="docutils literal">PyList_GetItem</tt>, we <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/listobject.py#lines-28">convert</a> the <strong>whole</strong> list to a list of +<tt class="docutils literal">PyObject*</tt>. This is bad in two ways: the first is that we potentially pay a +lot of unneeded conversion cost in case we will never access the other items +of the list. 
The second is that by doing that we lose all the performance +benefit granted by the original list strategy, making it slower for the +rest of the pure-python code which will manipulate the list later.<br> +<tt class="docutils literal">PyList_GetItem</tt> is an example of a bad API because it assumes that the list +is implemented as an array of <tt class="docutils literal">PyObject*</tt>: after all, in order to return a +borrowed reference, we need a reference to borrow, don't we?<br> +Fortunately, (some) CPython developers are aware of these problems, and there +is an ongoing project to <a class="reference external" href="https://pythoncapi.readthedocs.io/">design a better C API</a> which aims to fix exactly +this kind of problem.<br> +Nonetheless, in the meantime we still need to implement the current +half-broken APIs. There is no easy solution for that, and it is likely that +we will always need to pay some performance penalty in order to implement them +correctly.<br> +However, what we could potentially do is to provide alternative functions +which do the same job but are more PyPy friendly: for example, we could think +of implementing <tt class="docutils literal">PyList_GetItemNonBorrowed</tt> or something like that: then, C +extensions could choose to use it (possibly hidden inside some macro and +<tt class="docutils literal">#ifdef</tt>) if they want to be fast on PyPy.</div> +<div class="section" id="current-performance"> +<h1> +Current performance</h1> +During the whole blog post we claimed <tt class="docutils literal">cpyext</tt> is slow. How +slow is it, exactly?<br> +We decided to concentrate on <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> for now. 
It should be evident +by now there are simply too many issues which can slow down a <tt class="docutils literal">cpyext</tt> +program, and microbenchmarks help us to concentrate on one (or few) at a +time.<br> +The microbenchmarks measure very simple things, like calling functions and +methods with the various calling conventions (no arguments, one arguments, +multiple arguments); passing various types as arguments (to measure conversion +costs); allocating objects from C, and so on.<br> +Here are the results from the old PyPy 5.8 relative and normalized to CPython +2.7, the lower the better:<br> +<br> + + +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-5QV9jBfeXfo/W6UOCRA9YqI/AAAAAAAABX4/H2zgbv_XFQEHD4Lb2lj5Ve4Ob_YMuSXLwCLcBGAs/s1600/pypy58.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="480" src="https://4.bp.blogspot.com/-5QV9jBfeXfo/W6UOCRA9YqI/AAAAAAAABX4/H2zgbv_XFQEHD4Lb2lj5Ve4Ob_YMuSXLwCLcBGAs/s640/pypy58.png" width="640"></a></div> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://www.blogger.com/blogger.g?blogID=3971202189709462152" style="margin-left: 1em; margin-right: 1em;"></a></div> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://www.blogger.com/blogger.g?blogID=3971202189709462152" style="margin-left: 1em; margin-right: 1em;"></a></div> +<br> +PyPy was horribly slow everywhere, ranging from 2.5x to 10x slower. 
It is +particularly interesting to compare <tt class="docutils literal">simple.noargs</tt>, which measures the cost +of calling an empty function with no arguments, and <tt class="docutils literal">simple.onearg(i)</tt>, +which measures the cost of calling an empty function passing an integer argument: +the latter is ~2x slower than the former, indicating that the conversion cost +of integers is huge.<br> +PyPy 5.8 was the last release before the famous Cape Town sprint, when we +started to look at cpyext performance seriously. Here are the performance data for +PyPy 6.0, the latest release at the time of writing:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-MRkRoxtCeOE/W6UOL5txl1I/AAAAAAAABX8/i0ZiOyS2MOgiSyxFAyMOkKcB6xqjSihBACLcBGAs/s1600/pypy60.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="480" src="https://1.bp.blogspot.com/-MRkRoxtCeOE/W6UOL5txl1I/AAAAAAAABX8/i0ZiOyS2MOgiSyxFAyMOkKcB6xqjSihBACLcBGAs/s640/pypy60.png" width="640"></a></div> +<br> +<br> +The results are amazing! PyPy is now massively faster than before, and for +most benchmarks it is even faster than CPython: yes, you read it correctly: +PyPy is faster than CPython at doing CPython's job, even considering all the +extra work it has to do to emulate the C API. 
This happens thanks to the JIT, +which produces speedups high enough to counterbalance the slowdown caused by +cpyext.<br> +There are two microbenchmarks which are still slower though: <tt class="docutils literal">allocate_int</tt> +and <tt class="docutils literal">allocate_tuple</tt>, for the reasons explained in the section about +<a class="reference internal" href="https://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.html#conversion-costs">Conversion costs</a>.</div> +<div class="section" id="next-steps"> +<h1> +Next steps</h1> +Despite the spectacular results we got so far, <tt class="docutils literal">cpyext</tt> is still slow enough to +kill performance in most real-world code which uses C extensions extensively +(e.g., the omnipresent numpy).<br> +Our current approach is something along these lines:<br> +<blockquote> +<ol class="arabic simple"> +<li>run a real-world small benchmark which exercises cpyext</li> +<li>measure and find the major bottleneck</li> +<li>write a corresponding microbenchmark</li> +<li>optimize it</li> +<li>repeat</li> +</ol> +</blockquote> +On one hand, this is a daunting task because the C API is huge and we need to +tackle functions one by one. On the other hand, not all the functions are +equally important, and it is enough to optimize a relatively small subset to +improve many different use cases.<br> +Where a year ago we announced we have a working answer to run c-extension in +PyPy, we now have a clear picture of what are the performance bottlenecks, and +we have developed some technical solutions to fix them. It is "only" a matter +of tackling them, one by one. It is worth noting that most of the work was +done during two sprints, for a total of 2-3 person-months of work.<br> +We think this work is important for the Python ecosystem. PyPy has established +a baseline for performance in pure python code, providing an answer for the +"Python is slow" detractors. 
The techniques used to make <tt class="docutils literal">cpyext</tt> performant +will let PyPy become an alternative for people who mix C extensions with +Python, which, it turns out, is just about everyone, in particular those using +the various scientific libraries. Today, many developers are forced to seek +performance by converting code from Python to a lower language. We feel there +is no reason to do this, but in order to prove it we must be able to run both +their python and their C extensions performantly, then we can begin to educate +them how to write JIT-friendly code in the first place.<br> +We envision a future in which you can run arbitrary Python programs on PyPy, +with the JIT speeding up the pure Python parts and the C parts running as fast +as today: the best of both worlds!</div> +</div>cpyextprofilingspeedhttps://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.htmlFri, 21 Sep 2018 16:32:00 GMTHow to make your code 80 times fasterhttps://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.htmlAntonio Cuni<div class="document" id="how-to-make-your-code-80-times-faster"> +I often hear people who are happy because PyPy makes their code 2 times faster +or so. Here is a short personal story which shows PyPy can go well beyond +that.<br> +<br> +<strong>DISCLAIMER</strong>: this is not a silver bullet or a general recipe: it worked in +this particular case, it might not work so well in other cases. But I think it +is still an interesting technique. 
Moreover, the various steps and +implementations are showed in the same order as I tried them during the +development, so it is a real-life example of how to proceed when optimizing +for PyPy.<br> +<br> +Some months ago I <a class="reference external" href="https://github.com/antocuni/evolvingcopter">played a bit</a> with evolutionary algorithms: the ambitious +plan was to automatically evolve a logic which could control a (simulated) +quadcopter, i.e. a <a class="reference external" href="https://en.wikipedia.org/wiki/PID_controller">PID controller</a> (<strong>spoiler</strong>: it doesn't fly).<br> +<br> +The idea is to have an initial population of random creatures: at each +generation, the ones with the best fitness survive and reproduce with small, +random variations.<br> +<br> +However, for the scope of this post, the actual task at hand is not so +important, so let's jump straight to the code. To drive the quadcopter, a +<tt class="docutils literal">Creature</tt> has a <tt class="docutils literal">run_step</tt> method which runs at each <tt class="docutils literal">delta_t</tt> (<a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/creature.py">full +code</a>):<br> +<pre class="code python literal-block"><span class="keyword">class</span> <span class="name class">Creature</span><span class="punctuation">(</span><span class="name builtin">object</span><span class="punctuation">):</span> + <span class="name">INPUTS</span> <span class="operator">=</span> <span class="literal number integer">2</span> <span class="comment single"># z_setpoint, current z position</span> + <span class="name">OUTPUTS</span> <span class="operator">=</span> <span class="literal number integer">1</span> <span class="comment single"># PWM for all 4 motors</span> + <span class="name">STATE_VARS</span> <span class="operator">=</span> <span class="literal number integer">1</span> + <span class="operator">...</span> + + <span 
class="keyword">def</span> <span class="name function">run_step</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="name">inputs</span><span class="punctuation">):</span> + <span class="comment single"># state: [state_vars ... inputs]</span> + <span class="comment single"># out_values: [state_vars, ... outputs]</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">[</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">:]</span> <span class="operator">=</span> <span class="name">inputs</span> + <span class="name">out_values</span> <span class="operator">=</span> <span class="name">np</span><span class="operator">.</span><span class="name">dot</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="punctuation">,</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">)</span> <span class="operator">+</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">constant</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">[:</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">]</span> <span class="operator">=</span> <span class="name">out_values</span><span class="punctuation">[:</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">]</span> + <span class="name">outputs</span> <span class="operator">=</span> <span 
class="name">out_values</span><span class="punctuation">[</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">:]</span> + <span class="keyword">return</span> <span class="name">outputs</span> +</pre> +<ul class="simple"> +<li><tt class="docutils literal">inputs</tt> is a numpy array containing the desired setpoint and the current +position on the Z axis;</li> +<li><tt class="docutils literal">outputs</tt> is a numpy array containing the thrust to give to the motors. To +start easy, all the 4 motors are constrained to have the same thrust, so +that the quadcopter only travels up and down the Z axis;</li> +<li><tt class="docutils literal">self.state</tt> contains arbitrary values of unknown size which are passed from +one step to the next;</li> +<li><tt class="docutils literal">self.matrix</tt> and <tt class="docutils literal">self.constant</tt> contains the actual logic. By putting +the "right" values there, in theory we could get a perfectly tuned PID +controller. These are randomly mutated between generations.</li> +</ul> +<tt class="docutils literal">run_step</tt> is called at 100Hz (in the virtual time frame of the simulation). At each +generation, we test 500 creatures for a total of 12 virtual seconds each. So, +we have a total of 600,000 executions of <tt class="docutils literal">run_step</tt> at each generation.<br> +<br> +At first, I simply tried to run this code on CPython; here is the result:<br> +<pre class="code literal-block">$ python -m ev.main +Generation 1: ... [population = 500] [12.06 secs] +Generation 2: ... [population = 500] [6.13 secs] +Generation 3: ... [population = 500] [6.11 secs] +Generation 4: ... [population = 500] [6.09 secs] +Generation 5: ... [population = 500] [6.18 secs] +Generation 6: ... 
[population = 500] [6.26 secs] +</pre> +Which means ~6.15 seconds/generation, excluding the first.<br> +<br> +Then I tried with PyPy 5.9:<br> +<pre class="code literal-block">$ pypy -m ev.main +Generation 1: ... [population = 500] [63.90 secs] +Generation 2: ... [population = 500] [33.92 secs] +Generation 3: ... [population = 500] [34.21 secs] +Generation 4: ... [population = 500] [33.75 secs] +</pre> +Ouch! We are ~5.5x slower than CPython. This was kind of expected: numpy is +based on cpyext, which is infamously slow. (Actually, <a class="reference external" href="https://pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html">we are working on +that</a> and on the <tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch we are already faster than +CPython, but this will be the subject of another blog post.)<br> +<br> +So, let's try to avoid cpyext. The first obvious step is to use <a class="reference external" href="https://doc.pypy.org/en/latest/faq.html#what-about-numpy-numpypy-micronumpy">numpypy</a> +instead of numpy (actually, there is a <a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/pypycompat.py">hack</a> to use just the micronumpy +part). Let's see if the speed improves:<br> +<pre class="code literal-block">$ pypy -m ev.main # using numpypy +Generation 1: ... [population = 500] [5.60 secs] +Generation 2: ... [population = 500] [2.90 secs] +Generation 3: ... [population = 500] [2.78 secs] +Generation 4: ... [population = 500] [2.69 secs] +Generation 5: ... [population = 500] [2.72 secs] +Generation 6: ... [population = 500] [2.73 secs] +</pre> +So, ~2.7 seconds on average: this is 12x faster than PyPy+numpy, and more than +2x faster than the original CPython. 
At this point, most people would be happy +and go tweeting how PyPy is great.<br> +<br> +In general, when talking of CPython vs PyPy, I am rarely satisfied with a 2x +speedup: I know that PyPy can do much better than this, especially if you +write code which is specifically optimized for the JIT. For a real-life +example, have a look at <a class="reference external" href="https://capnpy.readthedocs.io/en/latest/benchmarks.html">capnpy benchmarks</a>, in which the PyPy version is +~15x faster than the heavily optimized CPython+Cython version (both have been +written by me, and I tried hard to write the fastest code for both +implementations).<br> +<br> +So, let's try to do better. As usual, the first thing to do is to profile and +see where we spend most of the time. Here is the <a class="reference external" href="https://vmprof.com/#/449ca8ee-3ab2-49d4-b6f0-9099987e9000">vmprof profile</a>. We spend a +lot of time inside the internals of numpypy, and allocating tons of temporary +arrays to store the results of the various operations.<br> +<br> +Also, let's look at the <a class="reference external" href="https://vmprof.com/#/28fd6e8f-f103-4bf4-a76a-4b65dbd637f4/traces">jit traces</a> and search for the function <tt class="docutils literal">run</tt>: +this is the loop in which we spend most of the time, and it is composed of 1796 +operations. The operations emitted for the line <tt class="docutils literal"><span class="pre">np.dot(...)</span> + +self.constant</tt> are listed between lines 1217 and 1456. 
Here is the excerpt +which calls <tt class="docutils literal"><span class="pre">np.dot(...)</span></tt>; most of the ops are cheap, but at line 1232 we +see a call to the RPython function <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/blob/release-pypy3.5-v5.10.0/pypy/module/micronumpy/ndarray.py#L1160">descr_dot</a>; by looking at the +implementation we see that it creates a new <tt class="docutils literal">W_NDimArray</tt> to store the +result, which means it has to do a <tt class="docutils literal">malloc()</tt>:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-_h6BuLTtEO8/Wfb6BXDg93I/AAAAAAAABNY/BY2XBg4ZtwokB9f1mWSmzI9gn_qanb81QCLcBGAs/s1600/2017-10-trace1.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="450" src="https://4.bp.blogspot.com/-_h6BuLTtEO8/Wfb6BXDg93I/AAAAAAAABNY/BY2XBg4ZtwokB9f1mWSmzI9gn_qanb81QCLcBGAs/s640/2017-10-trace1.png" width="640"></a></div> +<br> +The implementation of the <tt class="docutils literal">+ self.constant</tt> part is also interesting: +contrary to the former, the call to <tt class="docutils literal">W_NDimArray.descr_add</tt> has been inlined by +the JIT, so we have a better picture of what's happening; in particular, we +can see the call to <tt class="docutils literal">__0_alloc_with_del____</tt> which allocates the +<tt class="docutils literal">W_NDimArray</tt> for the result, and the <tt class="docutils literal">raw_malloc</tt> which allocates the +actual array. 
Then we have a long list of 149 simple operations which set the +fields of the resulting array, construct an iterator, and finally do a +<tt class="docutils literal">call_assembler</tt>: this is the actual logic to do the addition, which was +JITted independently; <tt class="docutils literal">call_assembler</tt> is one of the operations to do +JIT-to-JIT calls:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-vmo0pWharIU/Wfb3VfwHjxI/AAAAAAAABNE/a6Em09qZizwGiWJeTbGzKfHQH70dB7RKgCEwYBhgL/s1600/2017-10-trace2.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="640" src="https://1.bp.blogspot.com/-vmo0pWharIU/Wfb3VfwHjxI/AAAAAAAABNE/a6Em09qZizwGiWJeTbGzKfHQH70dB7RKgCEwYBhgL/s640/2017-10-trace2.png" width="625"></a></div> +<br> +All of this is very suboptimal: in this particular case, we know that the +shape of <tt class="docutils literal">self.matrix</tt> is always <tt class="docutils literal">(2, 3)</tt>: so, we are doing an incredible +amount of work, including calling <tt class="docutils literal">malloc()</tt> twice for the temporary arrays, just to +call two functions which ultimately do a total of 6 multiplications +and 6 additions. Note also that this is not a fault of the JIT: CPython+numpy +has to do the same amount of work, just hidden inside C calls.<br> +<br> +One possible solution to this nonsense is a well known compiler optimization: +loop unrolling. From the compiler point of view, unrolling the loop is always +risky because if the matrix is too big you might end up emitting a huge blob +of code, possibly useless if the shape of the matrices changes frequently: this +is the main reason why the PyPy JIT does not even try to do it in this case.<br> +<br> +However, we <strong>know</strong> that the matrix is small, and always of the same +shape. 
So, let's unroll the loop manually:<br> +<pre class="code python literal-block"><span class="keyword">class</span> <span class="name class">SpecializedCreature</span><span class="punctuation">(</span><span class="name">Creature</span><span class="punctuation">):</span> + + <span class="keyword">def</span> <span class="name function magic">__init__</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="operator">*</span><span class="name">args</span><span class="punctuation">,</span> <span class="operator">**</span><span class="name">kwargs</span><span class="punctuation">):</span> + <span class="name">Creature</span><span class="operator">.</span><span class="name function magic">__init__</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="operator">*</span><span class="name">args</span><span class="punctuation">,</span> <span class="operator">**</span><span class="name">kwargs</span><span class="punctuation">)</span> + <span class="comment single"># store the data in a plain Python list</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span> <span class="operator">=</span> <span class="name builtin">list</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="operator">.</span><span class="name">ravel</span><span class="punctuation">())</span> <span class="operator">+</span> <span class="name builtin">list</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">constant</span><span class="punctuation">)</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span> <span class="operator">=</span> <span 
class="punctuation">[</span><span class="literal number float">0.0</span><span class="punctuation">]</span> + <span class="keyword">assert</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="operator">.</span><span class="name">shape</span> <span class="operator">==</span> <span class="punctuation">(</span><span class="literal number integer">2</span><span class="punctuation">,</span> <span class="literal number integer">3</span><span class="punctuation">)</span> + <span class="keyword">assert</span> <span class="name builtin">len</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span><span class="punctuation">)</span> <span class="operator">==</span> <span class="literal number integer">8</span> + + <span class="keyword">def</span> <span class="name function">run_step</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="name">inputs</span><span class="punctuation">):</span> + <span class="comment single"># state: [state_vars ... inputs]</span> + <span class="comment single"># out_values: [state_vars, ... 
outputs]</span> + <span class="name">k0</span><span class="punctuation">,</span> <span class="name">k1</span><span class="punctuation">,</span> <span class="name">k2</span><span class="punctuation">,</span> <span class="name">q0</span><span class="punctuation">,</span> <span class="name">q1</span><span class="punctuation">,</span> <span class="name">q2</span><span class="punctuation">,</span> <span class="name">c0</span><span class="punctuation">,</span> <span class="name">c1</span> <span class="operator">=</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span> + <span class="name">s0</span> <span class="operator">=</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span><span class="punctuation">[</span><span class="literal number integer">0</span><span class="punctuation">]</span> + <span class="name">z_sp</span><span class="punctuation">,</span> <span class="name">z</span> <span class="operator">=</span> <span class="name">inputs</span> + <span class="comment single">#</span> + <span class="comment single"># compute the output</span> + <span class="name">out0</span> <span class="operator">=</span> <span class="name">s0</span><span class="operator">*</span><span class="name">k0</span> <span class="operator">+</span> <span class="name">z_sp</span><span class="operator">*</span><span class="name">k1</span> <span class="operator">+</span> <span class="name">z</span><span class="operator">*</span><span class="name">k2</span> <span class="operator">+</span> <span class="name">c0</span> + <span class="name">out1</span> <span class="operator">=</span> <span class="name">s0</span><span class="operator">*</span><span class="name">q0</span> <span class="operator">+</span> <span class="name">z_sp</span><span class="operator">*</span><span class="name">q1</span> <span class="operator">+</span> <span class="name">z</span><span 
class="operator">*</span><span class="name">q2</span> <span class="operator">+</span> <span class="name">c1</span> + <span class="comment single">#</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span><span class="punctuation">[</span><span class="literal number integer">0</span><span class="punctuation">]</span> <span class="operator">=</span> <span class="name">out0</span> + <span class="name">outputs</span> <span class="operator">=</span> <span class="punctuation">[</span><span class="name">out1</span><span class="punctuation">]</span> + <span class="keyword">return</span> <span class="name">outputs</span> +</pre> +In the <a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/creature.py#L100">actual code</a> there is also a sanity check which asserts that the +computed output is the very same as the one returned by <tt class="docutils literal">Creature.run_step</tt>.<br> +<br> +So, let's try to see how it performs. First, with CPython:<br> +<pre class="code literal-block">$ python -m ev.main +Generation 1: ... [population = 500] [7.61 secs] +Generation 2: ... [population = 500] [3.96 secs] +Generation 3: ... [population = 500] [3.79 secs] +Generation 4: ... [population = 500] [3.74 secs] +Generation 5: ... [population = 500] [3.84 secs] +Generation 6: ... [population = 500] [3.69 secs] +</pre> +This looks good: 60% faster than the original CPython+numpy +implementation. Let's try on PyPy:<br> +<pre class="code literal-block">Generation 1: ... [population = 500] [0.39 secs] +Generation 2: ... [population = 500] [0.10 secs] +Generation 3: ... [population = 500] [0.11 secs] +Generation 4: ... [population = 500] [0.09 secs] +Generation 5: ... [population = 500] [0.08 secs] +Generation 6: ... [population = 500] [0.12 secs] +Generation 7: ... [population = 500] [0.09 secs] +Generation 8: ... [population = 500] [0.08 secs] +Generation 9: ... 
[population = 500] [0.08 secs] +Generation 10: ... [population = 500] [0.08 secs] +Generation 11: ... [population = 500] [0.08 secs] +Generation 12: ... [population = 500] [0.07 secs] +Generation 13: ... [population = 500] [0.07 secs] +Generation 14: ... [population = 500] [0.08 secs] +Generation 15: ... [population = 500] [0.07 secs] +</pre> +Yes, it's not an error. After a couple of generations, it stabilizes at around +~0.07-0.08 seconds per generation. This is around <strong>80 (eighty) times faster</strong> +than the original CPython+numpy implementation, and around 35-40x faster than +the naive PyPy+numpypy one.<br> +<br> +Let's look at the <a class="reference external" href="https://vmprof.com/#/402af746-2966-4403-a61d-93015abac033/traces">trace</a> again: it no longer contains expensive calls, and +certainly no more temporary <tt class="docutils literal">malloc()</tt> s. The core of the logic is between +lines 386-416, where we can see that it does fast C-level multiplications and +additions: <tt class="docutils literal">float_mul</tt> and <tt class="docutils literal">float_add</tt> are translated straight into +<tt class="docutils literal">mulsd</tt> and <tt class="docutils literal">addsd</tt> x86 instructions.<br> +<br> +As I said before, this is a very particular example, and the techniques +described here do not always apply: it is not realistic to expect an 80x +speedup on arbitrary code, unfortunately. However, it clearly shows the potential of PyPy when +it comes to high-speed computing. 
And most importantly, it's not a toy +benchmark which was designed specifically to have good performance on PyPy: +it's a real world example, albeit small.<br> +<br> +You might be also interested in the talk I gave at last EuroPython, in which I +talk about a similar topic: "The Joy of PyPy JIT: abstractions for free" +(<a class="reference external" href="https://ep2017.europython.eu/conference/talks/the-joy-of-pypy-jit-abstractions-for-free">abstract</a>, <a class="reference external" href="https://speakerdeck.com/antocuni/the-joy-of-pypy-jit-abstractions-for-free">slides</a> and <a class="reference external" href="https://www.youtube.com/watch?v=NQfpHQII2cU">video</a>).<br> +<br> +<div class="section" id="how-to-reproduce-the-results"> +<h3> +How to reproduce the results</h3> +<pre class="code literal-block">$ git clone https://github.com/antocuni/evolvingcopter +$ cd evolvingcopter +$ {python,pypy} -m ev.main --no-specialized --no-numpypy +$ {python,pypy} -m ev.main --no-specialized +$ {python,pypy} -m ev.main +</pre> +</div> +</div>jitprofilingspeedhttps://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.htmlMon, 30 Oct 2017 10:15:00 GMT(Cape of) Good Hope for PyPyhttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlAntonio Cuni<div> +<br></div> +Hello from the other side of the world (for most of you)!<br> +<br> +With the excuse of coming to <a class="reference external" href="https://za.pycon.org/">PyCon ZA</a> during the last two weeks Armin, +Ronan, Antonio and sometimes Maciek had a very nice and productive sprint in +Cape Town, as pictures show :). 
We would like to say a big thank you to +Kiwi.com, which sponsored part of the travel costs via its awesome <a class="reference external" href="https://www.kiwi.com/sourcelift/">Sourcelift</a> +program to help Open Source projects.<br> +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: right; margin-left: 1em; text-align: right;"><tbody> +<tr><td style="text-align: center;"><a href="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s1600/DSC_2418.JPG" style="margin-left: auto; margin-right: auto;"><img border="0" height="225" src="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s400/DSC_2418.JPG" width="400"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">Armin, Anto and Ronan at Cape Point</td></tr> +</tbody></table> +<br> +Armin, Ronan and Anto spent most of the time hacking at cpyext, our CPython +C-API compatibility layer: during the last years, the focus was to make it +working and compatible with CPython, in order to run existing libraries such +as numpy and pandas. However, we never paid too much attention to performance, +so the net result is that with the latest released version of PyPy, C +extensions generally work but their speed ranges from "slow" to "horribly +slow".<br> +<br> +For example, these very simple <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> measure the speed of +calling (empty) C functions, i.e. the time you spend to "cross the border" +between RPython and C. 
<i>(Note: this includes the time spent doing the loop in regular Python code.)</i> These are the results on CPython, on PyPy 5.8, and on +our newest in-progress version:<br> +<br> +<pre class="literal-block">$ python bench.py # CPython +noargs : 0.41 secs +onearg(None): 0.44 secs +onearg(i) : 0.44 secs +varargs : 0.58 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy-5.8 bench.py # PyPy 5.8 +noargs : 1.01 secs +onearg(None): 1.31 secs +onearg(i) : 2.57 secs +varargs : 2.79 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy bench.py # cpyext-refactor-methodobject branch +noargs : 0.17 secs +onearg(None): 0.21 secs +onearg(i) : 0.22 secs +varargs : 0.47 secs +</pre> +<div> +<br></div> +<pre class="literal-block"></pre> +<pre class="literal-block"></pre> +So yes: before the sprint, we were ~2-6x slower than CPython. Now, we are +<strong>faster</strong> than it! +To reach this result, we did various improvements, such as: +<br> +<blockquote> +<ol class="arabic simple"> +<li>teach the JIT how to look (a bit) inside the cpyext module;</li> +<li>write specialized code for calling <tt class="docutils literal">METH_NOARGS</tt>, <tt class="docutils literal">METH_O</tt> and +<tt class="docutils literal">METH_VARARGS</tt> functions; previously, we always used a very general and +slow logic;</li> +<li>implement freelists to allocate the cpyext versions of <tt class="docutils literal">int</tt> and +<tt class="docutils literal">tuple</tt> objects, as CPython does;</li> +<li>the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/merge_requests/573">cpyext-avoid-roundtrip</a> branch: crossing the RPython/C border is +slowish, but the real problem was (and still is for many cases) we often +cross it many times for no good reason. So, depending on the actual API +call, you might end up in the C land, which calls back into the RPython +land, which goes to C, etc. etc. 
(ad libitum).</li> +</ol> +</blockquote> +The branch tries to fix such nonsense: so far, we fixed only some cases, which +are enough to speed up the benchmarks shown above. But most importantly, we +now have a clear path and an actual plan to improve cpyext more and +more. Ideally, we would like to reach a point in which cpyext-intensive +programs run at worst at the same speed as CPython.<br> +<br> +The other big topic of the sprint was Armin and Maciej doing a lot of work on the +<a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/branch/unicode-utf8">unicode-utf8</a> branch: the goal of the branch is to always use UTF-8 as the +internal representation of unicode strings. The advantages are various: +<br> +<blockquote> +<ul class="simple"> +<li>decoding a UTF-8 stream is super fast, as you just need to check that the +stream is valid;</li> +<li>encoding to UTF-8 is almost a no-op;</li> +<li>UTF-8 is always a more compact representation than the currently +used UCS-4. It's also almost always more compact than the CPython 3.5 latin1/UCS2/UCS4 combo;</li> +<li>smaller representation means everything becomes quite a bit faster due to lower cache pressure.</li> +</ul> +</blockquote> +Before you ask: yes, this branch contains special logic to ensure that random +access of single unicode chars is still O(1), as it is on both CPython and the +current PyPy.<br> +We also plan to improve the speed of decoding even more by using modern processor features, like SSE and AVX. Preliminary results show that decoding can be done 100x faster than the current setup. +<br> +<br> +In summary, this was a long and profitable sprint, in which we achieved lots +of interesting results. 
However, what we liked even more was the privilege of +doing <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/a4307fb5912e">commits</a> from awesome places such as the top of Table Mountain:<br> +<br> +<blockquote class="twitter-tweet"> +<div dir="ltr" lang="en"> +Our sprint venue today <a href="https://twitter.com/hashtag/pypy?src=hash&amp;ref_src=twsrc%5Etfw">#pypy</a> <a href="https://t.co/o38IfTYmAV">pic.twitter.com/o38IfTYmAV</a></div> +— Ronan Lamy (@ronanlamy) <a href="https://twitter.com/ronanlamy/status/915575026107240449?ref_src=twsrc%5Etfw">4 ottobre 2017</a></blockquote> + + +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: left; margin-right: 1em; text-align: left;"><tbody> +<tr><td style="text-align: center;"><a href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" style="margin-left: auto; margin-right: auto;"><img border="0" height="360" src="https://bytebucket.org/pypy/extradoc/raw/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" width="640"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">The panorama we looked at instead of staring at cpyext code</td></tr> +</tbody></table>cpyextprofilingspeedsprintunicodehttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlWed, 18 Oct 2017 13:31:00 GMTProfiling for fun with valgrindhttps://www.pypy.org/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.htmlMaciej Fijalkowski<p>Recently I've been doing a lot of profiling on the PyPy executables to find speed bottlenecks. <a href="https://en.wikipedia.org/wiki/Valgrind">Valgrind</a> (the original <a href="https://valgrind.org/">page</a> seems to be down) is an extremely nice tool for doing this. It has several built-in tools that give you different types of profiles. 
The callgrind mode provides you with a lot of information including relative call costs. The cachegrind tool gives you less information, but what it gives you (e.g. cache misses) is much more accurate. The obvious choice would be to have a way to combine the results of two profiling runs to have both. In the last days I wrote a script that does this. It's available <a href="https://codespeak.net/svn/user/fijal/pygrind">at my user's svn</a> and has a pretty intuitive command line interface. The combining calculation are not perfect yet, total costs of functions can still be a bit bogus (they can sum up to whatever) but at least the relative figures are good. This means that we can stop looking at two different types of graphs now. + +An awesome tool for analyzing the profile data is <a href="https://kcachegrind.sourceforge.net/cgi-bin/show.cgi">kcachegrind.</a> + +<a href="https://4.bp.blogspot.com/_5R1EBmwBBTs/R2JjKRYuTTI/AAAAAAAAAAM/LX5ktu_FcIE/s1600-h/kcachegrind.png"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5143782752527469874" src="https://4.bp.blogspot.com/_5R1EBmwBBTs/R2JjKRYuTTI/AAAAAAAAAAM/LX5ktu_FcIE/s320/kcachegrind.png" style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer;"></a> + +Which also proves that my 12'' display is to small at least for some things :-). + + +<b>Update:</b> pygrind is available under the MIT license.</p>kcachegrindprofilingvalgrindhttps://www.pypy.org/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.htmlFri, 14 Dec 2007 11:02:00 GMT \ No newline at end of file diff --git a/categories/pypy.html b/categories/pypy.html new file mode 100644 index 000000000..4ec2fd6d6 --- /dev/null +++ b/categories/pypy.html @@ -0,0 +1,129 @@ + + + + + +Posts about pypy | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/pypy.xml b/categories/pypy.xml new file mode 100644 index 000000000..843368a46 --- /dev/null +++ b/categories/pypy.xml @@ -0,0 +1,440 @@ + +PyPy (Posts about pypy)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssA new chapter for PyPyhttps://www.pypy.org/posts/2020/08/a-new-chapter-for-pypy-8388322709667328389.htmlhodgestar<p><i>PyPy winds down its membership in the Software Freedom Conservancy</i></p> + +<h1>Conservancy and PyPy's great work together</h1> + +<p><a href="https://pypy.org/">PyPy</a> joined <a href="https://sfconservancy.org/">Conservancy</a> in +the <a href="https://sfconservancy.org/blog/2011/jan/02/oct-dec-2010/">second half of 2010</a>, shortly after the release of +PyPy 1.2, the first version to contain a fully functional JIT. <a href="https://lwn.net/Articles/550427/">In 2013</a>, PyPy +started supporting ARM, bringing its just-in-time speediness to many more devices and began working toward supporting NumPy to help +scientists crunch their numbers faster. Together, PyPy and Conservancy ran successful fundraising drives and facilitated payment +and oversight for <a href="https://sfconservancy.org/blog/2016/dec/01/pypy-2016/">contractors and code sprints</a>.</p> + +<p>Conservancy supported PyPy's impressive growth as it expanded support for +different hardware platforms, greatly improved the performance of C extensions, +and added support for Python 3 as the language itself evolved.</p> + +<h1>The road ahead</h1> + +<p>Conservancy provides a fiscal and organizational home for projects that find the +freedoms and guardrails that come along with a charitable home advantageous for +their community goals. 
While this framework was a great fit for the early PyPy +community, times change and all good things must come to an end.</p> + +<p>PyPy will remain a free and open source project, but the community's structure +and organizational underpinnings will be changing and the PyPy community will be +exploring options outside of the charitable realm for its next phase of growth +("charitable" in the legal sense -- PyPy will remain a community project).</p> + +<p>During the last year PyPy and Conservancy have worked together to properly +utilise the generous donations made by stalwart PyPy enthusiasts over the years +and to wrap up PyPy's remaining charitable obligations. PyPy is grateful for +the Conservancy's help in shepherding the project toward its next chapter.</p> + +<h1>Thank yous</h1><p>From Conservancy: <br></p><p style="text-align: left;"></p><blockquote>"We are happy that Conservancy was able to help PyPy bring important software +for the public good during a critical time in its history. We wish the +community well and look forward to seeing it develop and succeed in new ways." <br></blockquote><blockquote>— Karen Sandler, Conservancy's Executive Director</blockquote><p></p><p>From PyPy:</p><p></p><div style="text-align: left;"><div style="text-align: left;"><blockquote><p>"PyPy would like to thank Conservancy for their decade long support in +building the community and wishes Conservancy continued success in their +journey promoting, improving, developing and defending free and open source +software." <br></p></blockquote><blockquote><p style="text-align: left;">— Simon Cross &amp; Carl Friedrich Bolz-Tereick, on behalf of PyPy.</p></blockquote></div></div><p></p><blockquote> +</blockquote> + +<h1>About</h1> + +<p><a class="reference external" href="https://pypy.org/">PyPy</a> is a multi-layer python interpreter with a built-in JIT compiler that runs +Python quickly across different computing environments. 
+<a class="reference external" href="https://sfconservancy.org/">Software Freedom Conservancy</a> (Conservancy) is a charity that provides a home +to over forty free and open source software projects.</p>pypyhttps://www.pypy.org/posts/2020/08/a-new-chapter-for-pypy-8388322709667328389.htmlWed, 12 Aug 2020 19:00:00 GMTLeysin 2020 Sprint Reporthttps://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlhodgestar<p>At the end of February ten of us gathered in Leysin, Switzerland to work on<br> +a variety of topics including <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3.7">PyPy Python 3.7</a> support and the PyPy<br> +migration to <a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>.<br> +<br> +</p><div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s1600/2020_leysin_sprint_attendees.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="180" src="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s320/2020_leysin_sprint_attendees.jpg" width="320"></a></div> +<br> +We had a fun and productive week. The snow was beautiful. There was skiing<br> +and lunch at the top of <a class="reference external" href="https://en.wikipedia.org/wiki/Berneuse">Berneuse</a>, cooking together, some late nights at<br> +the pub next door, some even later nights coding, and of course the<br> +obligatory cheese fondue outing.<br> +<br> +There were a few of us participating in a PyPy sprint for the first time<br> +and a few familiar faces who had attended many sprints. 
Many different<br> +projects were represented including PyPy, <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a>,<br> +<a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>, and <a class="reference external" href="https://github.com/dgrunwald/rust-cpython">rust-cpython</a>. The atmosphere was relaxed and welcoming, so if<br> +you're thinking of attending the next one -- please do!<br> +<br> +Topics worked on:<br> +<br> +<h2> +HPy</h2> +HPy is a new project to design and implement a better API for extending<br> +Python in C. If you're unfamiliar with it you can read more about it at<br> +<a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>.<br> +<br> +A lot of attention was devoted to the Big HPy Design Discussion which<br> +took up two full mornings. So much was decided that this will likely<br> +get its own detailed write-up, but bigger topics included:<br> +<ul class="simple"> +<li>the HPy GetAttr, SetAttr, GetItem and SetItem methods,</li> +<li>HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions<br> +that pass void* pointers to callbacks,</li> +<li>avoiding having va_args as part of the ABI,</li> +<li>exception handling,</li> +<li>support for creating custom types.</li> +</ul> +Quite a few things got worked on too:<br> +<ul class="simple"> +<li>implemented support for writing methods that take keyword arguments with<br> +HPy_METH_KEYWORDS,</li> +<li>implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,</li> +<li>started implementing support for adding custom types,</li> +<li>started implementing dumping JSON objects in ultrajson-hpy,</li> +<li>refactored the PyPy GIL to improve the interaction between HPy and<br> +PyPy's cpyext,</li> +<li>experimented with adding HPy support to rust-cpython.</li> +</ul> +And there was some discussion of the next steps of the HPy 
initiative<br> +including writing documentation, setting up websites and funding, and<br> +possibly organising another HPy gathering later in the year.<br> +<br> +<h2> +PyPy</h2> +<ul class="simple"> +<li>Georges gave a presentation on the Heptapod topic and branch workflows<br> +and showed everyone how to use hg-evolve.</li> +<li>Work was done on improving the PyPy CI buildbot post the move to<br> +heptapod, including a light-weight pre-merge CI and restricting<br> +when the full CI is run to only branch commits.</li> +<li>A lot of work was done improving the -D tests. </li> +</ul> +<br> +<h2> +Miscellaneous</h2> +<ul class="simple"> +<li>Armin demoed VRSketch and NaN Industries in VR, including an implementation<br> +of the Game of Life within NaN Industries!</li> +<li>Skiing!</li> +</ul> +<br> +<h2> +Aftermath</h2> +Immediately after the sprint large parts of Europe and the world were<br> +hit by the COVID-19 epidemic. It was good to spend time together before<br> +travelling ceased to be a sensible idea and many gatherings were cancelled.<br> +<br> +Keep safe out there everyone.<br> +<br> +The HPy &amp; PyPy Team &amp; Friends<br> +<br> +<i>In joke for those who attended the sprint: Please don't replace this blog post<br> +with its Swedish translation (or indeed a translation to any other language :).</i>cpyextCPythonGraalPythonHeptapodhpypypypypy3https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlTue, 17 Mar 2020 21:57:00 GMTAlmost There - PyPy's ARM Backendhttps://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.htmlDavid Schneider<div style="text-align: left;"> +In this post I want to give an update on the status of the ARM backend for PyPy's JIT and describe some of the issues and details of the backend.</div> +<div class="section" id="current-status"> +<br> +<h2> + + + + +Current Status</h2> +It has been more than a year that I have been working on the ARM backend. 
Now it is in a shape where we can measure meaningful numbers and also ask for some feedback. Since the <a class="reference external" href="https://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html">last post about the backend</a> we have added support for floating point operations as well as for PyPy's framework GC's. Another area of work was to keep up with the constant improvements done in the main development branch, such as out-of-line guards, labels, etc. It has been possible for about a year to cross-translate the PyPy Python interpreter and other interpreters such as <a class="reference external" href="https://bitbucket.org/cfbolz/pyrolog/">Pyrolog</a>, with a JIT, to run benchmarks on ARM. Up until now there remained some hard to track bugs that would cause the interpreter to crash with a segmentation fault in certain cases when running with the JIT on ARM. Lately it was possible to run all benchmarks without problems, but when running the translation toolchain itself it would crash. During the last PyPy sprint in <a class="reference external" href="https://www.pypy.org/posts/2011/12/leysin-winter-sprint-6862532189897876336.html">Leysin</a> Armin and I managed to fix several of these hard to track bugs in the ARM backend with the result that it is now possible to run the PyPy translator on ARM itself (at least until it runs out of memory), which is a kind of litmus test for the backend itself and used to crash before. Just to point it out, we are not able to complete a PyPy translation on ARM, because on the hardware we have currently available there is not enough memory. But up to the point we run out of memory the JIT does not hit any issues.<br> +<br></div> +<div class="section" id="implementation-details"> +<h2> + + + + +Implementation Details</h2> +The hardware requirements to run the JIT on ARM follow those for Ubuntu on ARM which targets ARMv7 with a VFP unit running in little endian mode. 
The JIT can be translated without floating point support, but there might be a few places that need to be fixed to fully work in this setting. We are targeting the ARM instruction set, because at least at the time we decided to use it, it seemed to be the best choice in terms of speed while having some size overhead compared to the Thumb2 instruction set. It appears that the Thumb2 instruction set should give comparable speed with better code density but has a few restrictions on the number of registers available and the use of conditional execution. Also the implementation is a bit easier using a fixed width instruction set and we can use the full set of registers in the generated code when using the ARM instruction set.<br> +<br></div> +<div class="section" id="the-calling-convention-on-arm"> +<h2> + + + + +The calling convention on ARM</h2> +The calling convention on ARM uses 4 of the general purpose registers to pass arguments to functions, further arguments are passed on the stack. The presence of a floating point unit is not required for ARM cores, for this reason there are different ways of handling floats with relation to the calling convention. There is a so called soft-float calling convention that is independent of the presence of a floating point unit. For this calling convention floating point arguments to functions are stored in the general purpose registers and on the stack. Passing floats around this way works with software and hardware floating point implementations. But in presence of a floating point unit it produces some overhead, because floating point numbers need to be moved from the floating point unit to the core registers to do a call and moved back to the floating point registers by the callee. The alternative calling convention is the so-called hard-float calling convention which requires the presence of a floating point unit but has the advantage of getting rid of the overhead of moving floating point values around when performing a call. 
Although it would be better in the long term to support the hard-float calling convention, we need to be able to interoperate with external code compiled for the operating system we are running on. For this reason at the moment we only support the soft-float to interoperate with external code. We implemented and tested the backend on a <a class="reference external" href="https://beagleboard.org/hardware-xM/">BeagleBoard-xM</a> with a <a class="reference external" href="https://www.arm.com/products/processors/cortex-a/cortex-a8.php">Cortex-A8</a> processor running <a class="reference external" href="https://wiki.ubuntu.com/ARM">Ubuntu 11.04 for ARM</a>.<br> +<br></div> +<div class="section" id="translating-for-arm"> +<h2> + + + + +Translating for ARM</h2> +The toolchain used to translate PyPy currently is based on a <a class="reference external" href="https://maemo.gitorious.org/scratchbox2/pages/Home">Scratchbox2</a>. Scratchbox2 is a cross-compiling environment. Development had stopped for a while, but it seems to have revived again. We run a 32-bit Python interpreter on the host system and perform all calls to the compiler using a Scratchbox2 based environment. A description on how to setup the cross translation toolchain can be found <a class="reference external" href="https://bitbucket.org/pypy/pypy/src/1f07ea8076c9/pypy/doc/arm.rst">here</a>.<br> +<br></div> +<div class="section" id="results"> +<h2> + + + + +Results</h2> +The current results on ARM, as shown in the graph below, show that the JIT currently gives a speedup of about 3.5 times compared to CPython on ARM. The benchmarks were run on the before mentioned BeagleBoard-xM with a 1GHz ARM Cortex-A8 processor and 512MB of memory. The operating system on the board is Ubuntu 11.04 for ARM. We measured the PyPy interpreter with the JIT enabled and disabled comparing each to CPython Python 2.7.1+ (r271:86832) for ARM. 
The graph shows the speedup or slowdown of both PyPy versions for the different benchmarks from our benchmark suite normalized to the runtime of CPython. The data used for the graph can be seen below.<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://2.bp.blogspot.com/-uckc9tOWgnM/TykHMuuGT9I/AAAAAAAAAKg/J8_fC6RS-QA/s1600/graph.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="258" src="https://2.bp.blogspot.com/-uckc9tOWgnM/TykHMuuGT9I/AAAAAAAAAKg/J8_fC6RS-QA/s400/graph.png" width="400"></a></div> +<br> +The speedup is less than the speedup of 5.2 times we currently get on x86 on our own benchmark suite (see <a class="reference external" href="https://speed.pypy.org/">https://speed.pypy.org</a> for details). There are several possible reasons for this. Comparing the results for the interpreter without the JIT on ARM and x86 suggests that the interpreter generated by PyPy, without the JIT, has a worse performance when compared to CPython than it does on x86. Also it is quite possible that the code we are generating with the JIT is not yet optimal. Also there are some architectural constraints that produce some overhead. 
One of these differences is the handling of constants, most ARM instructions only support 8 bit (that can be shifted) immediate values, larger constants need to be loaded into a register, something that is not necessary on x86.<br> +<br> +<table border="1" class="docutils"><colgroup></colgroup><colgroup><col width="40%"></colgroup><colgroup><col width="32%"></colgroup><colgroup><col width="28%"></colgroup><tbody valign="top"> +<tr><td>Benchmark</td><td>PyPy JIT</td><td>PyPy no JIT</td></tr> +<tr><td>ai</td><td>0.484439780047</td><td>3.72756749625</td></tr> +<tr><td>chaos</td><td>0.0807291691934</td><td>2.2908692212</td></tr> +<tr><td>crypto_pyaes</td><td>0.0711114832245</td><td>3.30112318509</td></tr> +<tr><td>django</td><td>0.0977743245519</td><td>2.56779947601</td></tr> +<tr><td>fannkuch</td><td>0.210423735698</td><td>2.49163632938</td></tr> +<tr><td>float</td><td>0.154275334675</td><td>2.12053281495</td></tr> +<tr><td>go</td><td>0.330483034202</td><td>5.84628320479</td></tr> +<tr><td>html5lib</td><td>0.629264389862</td><td>3.60333138526</td></tr> +<tr><td>meteor-contest</td><td>0.984747426912</td><td>2.93838610037</td></tr> +<tr><td>nbody_modified</td><td>0.236969593082</td><td>1.40027234936</td></tr> +<tr><td>pyflate-fast</td><td>0.367447191807</td><td>2.72472422146</td></tr> +<tr><td>raytrace-simple</td><td>0.0290527461437</td><td>1.97270054339</td></tr> +<tr><td>richards</td><td>0.034575573553</td><td>3.29767342015</td></tr> +<tr><td>slowspitfire</td><td>0.786642551908</td><td>3.7397367403</td></tr> +<tr><td>spambayes</td><td>0.660324379456</td><td>3.29059863111</td></tr> +<tr><td>spectral-norm</td><td>0.063610783731</td><td>4.01788986233</td></tr> +<tr><td>spitfire</td><td>0.43617131165</td><td>2.72050579076</td></tr> +<tr><td>spitfire_cstringio</td><td>0.255538702134</td><td>1.7418593111</td></tr> +<tr><td>telco</td><td>0.102918930413</td><td>3.86388866047</td></tr> +<tr><td>twisted_iteration</td><td>0.122723986805</td><td>4.33632475491</td></tr> 
+<tr><td>twisted_names</td><td>2.42367797135</td><td>2.99878698076</td></tr> +<tr><td>twisted_pb</td><td>1.30991837431</td><td>4.48877805486</td></tr> +<tr><td>twisted_tcp</td><td>0.927033354055</td><td>2.8161624665</td></tr> +<tr><td>waf</td><td>1.02059811932</td><td>1.03793427321</td></tr> +</tbody></table> +</div> +<br> +<br> +<div class="section" id="the-next-steps-and-call-for-help"> +<h2> + + + + +The next steps and call for help</h2> +Although there probably still are some remaining issues which have not surfaced yet, the JIT backend for ARM is working. Before we can merge the backend into the main development line there are some things that we would like to do first, in particular we are looking for a way to run all the PyPy tests to verify that things work on ARM before we can merge. Additionally there are some other longterm ideas. To do this we are looking for people willing to help, either by contributing to implement the open features or that can help us with hardware to test.<br> +<br> +The incomplete list of open topics:<br> +<ul class="simple"> +<li>We are looking for a better way to translate PyPy for ARM, than the one described above. I am not sure if there currently is hardware with enough memory to directly translate PyPy on an ARM based system, this would require between 1.5 and 2 Gig of memory. A fully <a class="reference external" href="https://wiki.qemu.org/Main_Page">QEMU</a> based approach could also work, instead of Scratchbox2 that uses QEMU under the hood.</li> +<li>Test the JIT on different hardware.</li> +<li>Experiment with the JIT settings to find the optimal thresholds for ARM.</li> +<li>Continuous integration: We are looking for a way to run the PyPy test suite to make sure everything works as expected on ARM, here QEMU also might provide an alternative.</li> +<li>A long term plan would be to port the backend to ARMv5 ISA and improve the support for systems without a floating point unit. 
This would require to implement the ISA and create different code paths and improve the instruction selection depending on the target architecture.</li> +<li>Review of the generated machine code the JIT generates on ARM to see if the instruction selection makes sense for ARM.</li> +<li>Build a version that runs on Android.</li> +<li>Improve the tools, i.e. integrate with <a class="reference external" href="https://bitbucket.org/pypy/jitviewer">jitviewer</a>.</li> +</ul> +So if you are interested or willing to help in any way contact us.</div>armjitpypyhttps://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.htmlWed, 01 Feb 2012 09:43:00 GMTFirst pypy-cli-jit benchmarkshttps://www.pypy.org/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.htmlAntonio Cuni<p>As the readers of this blog <a class="reference external" href="https://www.pypy.org/posts/2008/11/porting-jit-to-cli-part-1-8712941279840156635.html">already know</a>, I've been working on porting the +JIT to CLI/.NET for the last months. Now that it's finally possible to get a +working pypy-cli-jit, it's time to do some benchmarks.</p> +<p><strong>Warning:</strong> as usual, all of this has to be considered to be an alpha version: +don't be surprised if you get a crash when trying to run pypy-cli-jit. Of +course, things are improving very quickly so it should become more and more +stable as days pass.</p> +<p>For this time, I decided to run four benchmarks. Note that for all of them we +run the main function once in advance, to let the JIT recognize the hot +loops and emit the corresponding code. Thus, the results reported do +<strong>not</strong> include the time spent by the JIT compiler itself, but give a good +measure of how good is the code generated by the JIT. 
At this point in time, +I know that the CLI JIT backend spends way too much time compiling stuff, but +this issue will be fixed soon.</p> +<blockquote> +<ul class="simple"> +<li><a class="reference external" href="https://paste.pocoo.org/show/145050/">f1.py</a>: this is the classic PyPy JIT benchmark. It is just a function +that does some computational intensive work with integers.</li> +<li><a class="reference external" href="https://paste.pocoo.org/show/143243/">floatdemo.py</a>: this is the same benchmark involving floating point +numbers that have already been described in a previous <a class="reference external" href="https://www.pypy.org/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.html">blog post</a>.</li> +<li><a class="reference external" href="https://paste.pocoo.org/show/145051/">oodemo.py</a>: this is just a microbenchmark doing object oriented stuff +such as method calls and attribute access.</li> +<li><a class="reference external" href="https://paste.pocoo.org/show/145052/">richards2.py</a>: a modified version of the classic richards.py, with a +warmup call before starting the real benchmark.</li> +</ul> +</blockquote> +<p>The benchmarks were run on a Windows machine with an Intel Pentium Dual Core +E5200 2.5GHz and 2GB RAM, both with .NET (CLR 2.0) and Mono 2.4.2.3.</p> +<p>Because of a known <a class="reference external" href="https://bugzilla.novell.com/show_bug.cgi?id=474718">mono bug</a>, if you use a version older than 2.1 you need +to pass the option <tt class="docutils literal"><span class="pre">-O=-branch</span></tt> to mono when running pypy-cli-jit, else it +will just loop forever.</p> +<p>For comparison, we also run the same benchmarks with IronPython 2.0.1 and +IronPython 2.6rc1. 
Note that IronPython 2.6rc1 does not work with mono.</p> +<p>So, here are the results (expressed in seconds) with Microsoft CLR:</p> +<blockquote> +<table border="1" class="docutils"> +<colgroup> +<col width="15%"> +<col width="20%"> +<col width="15%"> +<col width="12%"> +<col width="20%"> +<col width="18%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Benchmark</th> +<th class="head">pypy-cli-jit</th> +<th class="head">ipy 2.0.1</th> +<th class="head">ipy 2.6</th> +<th class="head">ipy2.01/ pypy</th> +<th class="head">ipy2.6/ pypy</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>f1</td> +<td>0.028</td> +<td>0.145</td> +<td>0.136</td> +<td>5.18x</td> +<td>4.85x</td> +</tr> +<tr><td>floatdemo</td> +<td>0.671</td> +<td>0.765</td> +<td>0.812</td> +<td>1.14x</td> +<td>1.21x</td> +</tr> +<tr><td>oodemo</td> +<td>1.25</td> +<td>4.278</td> +<td>3.816</td> +<td>3.42x</td> +<td>3.05x</td> +</tr> +<tr><td>richards2</td> +<td>1228</td> +<td>442</td> +<td>670</td> +<td>0.36x</td> +<td>0.54x</td> +</tr> +</tbody> +</table> +</blockquote> +<p>And with Mono:</p> +<blockquote> +<table border="1" class="docutils"> +<colgroup> +<col width="21%"> +<col width="29%"> +<col width="21%"> +<col width="29%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Benchmark</th> +<th class="head">pypy-cli-jit</th> +<th class="head">ipy 2.0.1</th> +<th class="head">ipy2.01/ pypy</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>f1</td> +<td>0.042</td> +<td>0.695</td> +<td>16.54x</td> +</tr> +<tr><td>floatdemo</td> +<td>0.781</td> +<td>1.218</td> +<td>1.55x</td> +</tr> +<tr><td>oodemo</td> +<td>1.703</td> +<td>9.501</td> +<td>5.31x</td> +</tr> +<tr><td>richards2</td> +<td>720</td> +<td>862</td> +<td>1.20x</td> +</tr> +</tbody> +</table> +</blockquote> +<p>These results are very interesting: under the CLR, we are between 5x faster +and 3x slower than IronPython 2.0.1, and between 4.8x faster and 1.8x slower +than IronPython 2.6. 
On the other hand, on mono we are consistently faster +than IronPython, up to 16x. It is also interesting to note that +pypy-cli runs faster on CLR than mono for all benchmarks except richards2.</p> +<p>I've not investigated yet, but I think that the culprit is the terrible +behaviour of tail calls on CLR: as I already wrote in <a class="reference external" href="https://www.pypy.org/posts/2008/12/porting-jit-to-cli-part-3-3519327524638923621.html">another blog post</a>, +tail calls are ~10x slower than normal calls on CLR, while being only ~2x +slower than normal calls on mono. richards2 is probably the benchmark that +makes most use of tail calls, thus explaining why we have a much better result +on mono than CLR.</p> +<p>The next step is probably to find an alternative implementation that does not +use tail calls: this probably will also improve the time spent by the JIT +compiler itself, which is not reported in the numbers above but that so far it +is surely too high to be acceptable. Stay tuned.</p>clijitpypyhttps://www.pypy.org/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.htmlThu, 15 Oct 2009 13:36:00 GMTRoadmap for JIThttps://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.htmlMaciej Fijalkowski<p>Hello. +</p> +<p> +First a disclaimer. This post is more about plans for future than current +status. We usually try to write about things that we have done, because +it's much much easier to promise things than to actually make it happen, +but I think it's important enough to have some sort of roadmap. +</p> +<p> +In recent months we came to the point where the 5th generation of +JIT prototype was working as <a href="https://www.pypy.org/posts/2009/03/good-news-everyone-421421336094214242.html">nice</a> +or even a bit nicer than 1st one back in 2007. Someone might ask "so why +did you spend all this time without going forward?". 
And indeed, we spent +a lot of time moving sideways, but as posted, we also spent a lot of time +doing <a href="https://www.pypy.org/posts/2009/04/beta-for-110-released-4604559533184706699.html">some other things</a>, which are important as well. +The main advantage of the current JIT incarnation is that it is much much simpler than +the first one. Even I can comprehend it, which is much of an improvement :-) +</p> +<p> +So, the prototype is working and gives very nice speedups in range of 20-30x +over CPython. We're pretty confident this prototype will work and will +produce fast python interpreter eventually. So we decided that now we'll +work towards changing prototype into something stable and solid. This +might sound easy, but in fact it's not. Having stable assembler backend +and optimizations that keep semantics is not as easy as it might sound. +</p> +<p> +The current roadmap, as I see it, looks as follows: +</p> +<ul> +<li> Provide a JIT that does not speedup things, but produce assembler without + optimizations turned on, that is correct and able to run CPython's library + tests on a nightly basis. +</li> +<li> + Introduce simple optimizations, that should make above JIT a bit faster than + CPython. With optimizations disabled JIT is producing incredibly dumb + assembler, which is slower than corresponding C code, even with removal + of interpretation overhead (which is not very surprising). +</li> +<li> + Backport optimizations from JIT prototype, one by one, keeping an eye + on how they perform and making sure they don't break anything. +</li> +<li> + Create new optimizations, like speeding up attribute access. +</li> +<li> + Profit. +</li> +</ul> +<p> +This way, we can hopefully provide a working JIT, which gives fast python +interpreter, which is a bit harder than just a nice prototype. +</p> +<p> +Tell us what you think about this plan. 
+</p> +Cheers,<br> +fijal &amp; others.jitpypyroadmapspeedhttps://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.htmlTue, 21 Apr 2009 19:38:00 GMTEP2008: PyPy meets Jythonhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlholger krekel<p>One of the great events at EuroPython 2008 were our chats and meetings with the Jython and Sun people. The Jython people recently are pushing into releasing Python version 2.5 and they currently pursue many interesting sub projects. Coincidentally, PyPy also has tons of interesting areas and results :) So we eventually got into brainstorming a number of possible technical collab ideas. Further below is a first list as i wrote it down from our 10 people PyPy / Jython 30 minute close up meeting yesterday. + +It felt great to be able to talk to the Jython people this way - kudos to Sun for their clear commitments and open ways to go about things! I sense a genuine interest on fair collaboration with non-java developer communities. Seems like they are serious about not focusing on "Java this", "Java that" anymore but rather focus on the JVM platform. Good! And about language +independent interest in ambitious technology. Even Better! I am tensed to see how things go from here. + +So here the list of technical collab ideas: +</p><ul><li>ctypes - try to create _rawffi module in Java for Jython, which will enable Jython to reuse our existing ctypes implementation (and have PyPy use the Jython-rawffi for its own for PyPy.JVM)</li><li> generally see to share work / (continue) collaborate regarding extension modules</li><li>Jython/PyPy (and eventually IronPython): document known differences to CPython, maybe in a PEP</li><li>Python Interpreter for Jython (in order to run CPython's .pyc files): re-use pypy's bytecode evaluator, implement a "Jython object space". </li><li>re-use rpython-extension modules for jython (e.g. 
SRE), by compiling them to Java and reusing as a native library.</li><li>collaborate on testing framework / benchmarking, have a common site to show test results</li><li>make py.test compatible with jython</li><li>come up with a set of "pure Python language" tests, which would gather and refactor tests from CPython, PyPy and Jython. </li><li>look into using java types / jython approaches for implementing free threading.</li><li>share knowledge regarding JIT / psyco +</li></ul>If you have any more ideas, comments or would like to join efforts, let us know! + +Cheers and thanks to <a href="https://www.sauria.com/blog/">Ted Leung</a>, <a href="https://fwierzbicki.blogspot.com/">Frank Wierzbiki</a>, <a href="https://www.zyasoft.com/pythoneering/">Jim Baker</a> and Tobias Ivarsson from Sun and Jython fame respectively, + +Holgerep2008jythonpypysunhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlThu, 10 Jul 2008 08:29:00 GMT \ No newline at end of file diff --git a/categories/pypy3.html b/categories/pypy3.html new file mode 100644 index 000000000..e94a3d5e5 --- /dev/null +++ b/categories/pypy3.html @@ -0,0 +1,159 @@ + + + + + +Posts about pypy3 | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Posts about pypy3

                + + +
                +
                + + \ No newline at end of file diff --git a/categories/pypy3.xml b/categories/pypy3.xml new file mode 100644 index 000000000..57134b555 --- /dev/null +++ b/categories/pypy3.xml @@ -0,0 +1,326 @@ + +PyPy (Posts about pypy3)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssLeysin 2020 Sprint Reporthttps://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlhodgestar<p>At the end of February ten of us gathered in Leysin, Switzerland to work on<br> +a variety of topics including <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3.7">PyPy Python 3.7</a> support and the PyPy<br> +migration to <a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>.<br> +<br> +</p><div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s1600/2020_leysin_sprint_attendees.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="180" src="https://1.bp.blogspot.com/-PIs_hVhn3RY/XnFDceuihNI/AAAAAAAAbRg/LKMOMWxeFw4jhcwqy8jx7iKzKE01fbfxQCEwYBhgL/s320/2020_leysin_sprint_attendees.jpg" width="320"></a></div> +<br> +We had a fun and productive week. The snow was beautiful. There was skiing<br> +and lunch at the top of <a class="reference external" href="https://en.wikipedia.org/wiki/Berneuse">Berneuse</a>, cooking together, some late nights at<br> +the pub next door, some even later nights coding, and of course the<br> +obligatory cheese fondue outing.<br> +<br> +There were a few of us participating in a PyPy sprint for the first time<br> +and a few familiar faces who had attended many sprints. 
Many different<br> +projects were represented including PyPy, <a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>, <a class="reference external" href="https://github.com/graalvm/graalpython">GraalPython</a>,<br> +<a class="reference external" href="https://foss.heptapod.net/pypy/">Heptapod</a>, and <a class="reference external" href="https://github.com/dgrunwald/rust-cpython">rust-cpython</a>. The atmosphere was relaxed and welcoming, so if<br> +you're thinking of attending the next one -- please do!<br> +<br> +Topics worked on:<br> +<br> +<h2> +HPy</h2> +HPy is a new project to design and implement a better API for extending<br> +Python in C. If you're unfamiliar with it you can read more about it at<br> +<a class="reference external" href="https://github.com/pyhandle/hpy/">HPy</a>.<br> +<br> +A lot of attention was devoted to the Big HPy Design Discussion which<br> +took up two full mornings. So much was decided that this will likely<br> +get its own detailed write-up, but bigger topics included:<br> +<ul class="simple"> +<li>the HPy GetAttr, SetAttr, GetItem and SetItem methods,</li> +<li>HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions<br> +that pass void* pointers to callbacks,</li> +<li>avoiding having va_args as part of the ABI,</li> +<li>exception handling,</li> +<li>support for creating custom types.</li> +</ul> +Quite a few things got worked on too:<br> +<ul class="simple"> +<li>implemented support for writing methods that take keyword arguments with<br> +HPy_METH_KEYWORDS,</li> +<li>implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,</li> +<li>started implementing support for adding custom types,</li> +<li>started implementing dumping JSON objects in ultrajson-hpy,</li> +<li>refactored the PyPy GIL to improve the interaction between HPy and<br> +PyPy's cpyext,</li> +<li>experimented with adding HPy support to rust-cpython.</li> +</ul> +And there was some discussion of the next steps of the HPy 
initiative<br> +including writing documentation, setting up websites and funding, and<br> +possibly organising another HPy gathering later in the year.<br> +<br> +<h2> +PyPy</h2> +<ul class="simple"> +<li>Georges gave a presentation on the Heptapod topic and branch workflows<br> +and showed everyone how to use hg-evolve.</li> +<li>Work was done on improving the PyPy CI buildbot post the move to<br> +heptapod, including a light-weight pre-merge CI and restricting<br> +when the full CI is run to only branch commits.</li> +<li>A lot of work was done improving the -D tests. </li> +</ul> +<br> +<h2> +Miscellaneous</h2> +<ul class="simple"> +<li>Armin demoed VRSketch and NaN Industries in VR, including an implementation<br> +of the Game of Life within NaN Industries!</li> +<li>Skiing!</li> +</ul> +<br> +<h2> +Aftermath</h2> +Immediately after the sprint large parts of Europe and the world were<br> +hit by the COVID-19 epidemic. It was good to spend time together before<br> +travelling ceased to be a sensible idea and many gatherings were cancelled.<br> +<br> +Keep safe out there everyone.<br> +<br> +The HPy &amp; PyPy Team &amp; Friends<br> +<br> +<i>In joke for those who attended the sprint: Please don't replace this blog post<br> +with its Swedish translation (or indeed a translation to any other language :).</i>cpyextCPythonGraalPythonHeptapodhpypypypypy3https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.htmlTue, 17 Mar 2020 21:57:00 GMTPyPy3 2.3.1 - Fulcrumhttps://www.pypy.org/posts/2014/06/pypy3-231-fulcrum-3765964217640322884.htmlPhilip Jenvey<p>We're pleased to announce the first stable release of PyPy3. 
PyPy3<br> +targets Python 3 (3.2.5) compatibility.</p><p>We would like to thank all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a><br> +for supporting the work that went into this.</p><p>You can download the PyPy3 2.3.1 release here:</p><blockquote><a class="reference external" href="https://pypy.org/download.html#pypy3-2-3-1">https://pypy.org/download.html#pypy3-2-3-1</a></blockquote><div class="section" id="highlights"><h1>Highlights</h1><ul class="simple"><li>The first stable release of PyPy3: support for Python 3!</li> +<li>The stdlib has been updated to Python 3.2.5</li> +<li>Additional support for the u'unicode' syntax (<a class="reference external" href="https://legacy.python.org/dev/peps/pep-0414/">PEP 414</a>) from Python 3.3</li> +<li>Updates from the default branch, such as incremental GC and various JIT<br> +improvements</li> +<li>Resolved some notable JIT performance regressions from PyPy2:</li> +</ul><blockquote><ul class="simple"><li>Re-enabled the previously disabled collection (list/dict/set) strategies</li> +<li>Resolved performance of iteration over range objects</li> +<li>Resolved handling of Python 3's exception __context__ unnecessarily forcing<br> +frame object overhead</li> +</ul></blockquote></div><div class="section" id="what-is-pypy"><h1>What is PyPy?</h1><p>PyPy is a very compliant Python interpreter, almost a drop-in replacement for<br> +CPython 2.7.6 or 3.2.5. 
It's fast due to its integrated tracing JIT compiler.</p><p>This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows,<br> +and OpenBSD,<br> +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.</p><p>While we support 32 bit python on Windows, work on the native Windows 64<br> +bit python is still stalling, we would welcome a volunteer<br> +to <a class="reference external" href="https://doc.pypy.org/en/latest/windows.html#what-is-missing-for-a-full-64-bit-translation">handle that</a>.</p></div><div class="section" id="how-to-use-pypy"><h1>How to use PyPy?</h1><p>We suggest using PyPy from a <a class="reference external" href="https://www.virtualenv.org/en/latest/">virtualenv</a>. Once you have a virtualenv<br> +installed, you can follow instructions from <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-using-virtualenv">pypy documentation</a> on how<br> +to proceed. This document also covers other <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-pypy">installation schemes</a>.</p><p>Cheers,<br> +the PyPy team</p></div>pypy3https://www.pypy.org/posts/2014/06/pypy3-231-fulcrum-3765964217640322884.htmlFri, 20 Jun 2014 21:31:00 GMTPy3k status update #13https://www.pypy.org/posts/2014/02/py3k-status-update-13-4630607029125647100.htmlPhilip Jenvey<p>This is the 13th status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which we<br> +can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a>.</p><p>We're just finishing up a cleanup of int/long types. 
This work helps the py3k<br> +branch unify these types into the Python 3 int and restore <a class="reference external" href="https://www.pypy.org/posts/2013/11/py3k-status-update-12-5307085693947812769.html">JIT compilation of<br> +machine sized integers</a>.</p><p>This cleanup also removes <a class="reference external" href="https://doc.pypy.org/en/latest/objspace.html#multimethods">multimethods</a> from these types. PyPy has<br> +historically used a clever implementation of multimethod dispatch for declaring<br> +methods of the __builtin__ types in RPython.</p><p>This multimethod scheme provides some convenient features for doing this,<br> +however we've come to the conclusion that it may be more trouble than it's<br> +worth. A major problem of multimethods is that they generate a large amount of<br> +stub methods which burden the already lengthy and memory hungry RPython<br> +translation process. Also, their implementation and behavior can be somewhat<br> +complicated/obscure.</p><p>The alternative to multimethods involves doing the work of the type checking<br> +and dispatching rules in a more verbose, manual way. It's a little more work in<br> +the end but less magical.</p><p>Recently, Manuel Jacob finished a large cleanup effort of the<br> +unicode/string/bytearray types that also removed their multimethods. This work<br> +also benefits the py3k branch: it'll help with future <a class="reference external" href="https://www.python.org/dev/peps/pep-0393/">PEP 393</a> (or <a class="reference external" href="https://lucumr.pocoo.org/2014/1/9/ucs-vs-utf8/">PEP 393<br> +alternative</a>) work. 
This effort was partly sponsored by Google's Summer of<br> +Code: thanks Manuel and Google!</p><p>Now there's only a couple major pieces left in the multimethod removal (the<br> +float/complex types and special marshaling code) and a few minor pieces that<br> +should be relatively easy.</p><p>In conclusion, there's been some good progress made on py3k and multimethod<br> +removal this winter, albeit a bit slower than we would have liked.</p><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2014/02/py3k-status-update-13-4630607029125647100.htmlTue, 18 Feb 2014 02:33:00 GMTPy3k status update #12https://www.pypy.org/posts/2013/11/py3k-status-update-12-5307085693947812769.htmlPhilip Jenvey<p>This is the 12th status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which we<br> +can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a>.</p><p>Here's an update on the recent progress:</p><ul class="simple"><li>Thank you to everyone who has provided initial feedback on the PyPy3 2.1 beta<br> +1 release. We've gotten a number of bug reports, most of which have been<br> +fixed.</li> +<li>As usual, we're continually keeping up with changes from the default<br> +branch. 
Oftentimes these merges come at a cost (conflicts and or<br> +reintegration of py3k changes) but occasionally we get goodies for free, such<br> +as the <a class="reference external" href="https://www.pypy.org/posts/2013/10/making-coveragepy-faster-under-pypy-935409618297062344.html">recent JIT optimizations</a> and <a class="reference external" href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">incremental garbage collection</a>.</li> +<li>We've been focusing on re-optimizing Python 2 int sized (machine sized)<br> +integers:</li> +</ul><p>We have a couple of known, notable speed regressions in the PyPy3 beta release<br> +vs regular PyPy. The major one being with Python 2.x int sized (or machine<br> +sized) integers.</p><p>Python 3 drops the distinction between int and long types. CPython 3.x<br> +accomplishes this by removing the old int type entirely and renaming the long<br> +type to int. Initially, we've done the same for PyPy3 for the sake of<br> +simplicity and getting everything working.</p><p>However PyPy's JIT is capable of heavily optimizing these machine sized integer<br> +operations, so this came with a regression in performance in this area.</p><p>We're now in the process of solving this. 
Part of this work also involves some<br> +house cleaning on these numeric types which also benefits the default branch.</p><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2013/11/py3k-status-update-12-5307085693947812769.htmlTue, 12 Nov 2013 23:16:00 GMTPy3k status update #11https://www.pypy.org/posts/2013/06/py3k-status-update-11-133025715908408072.htmlPhilip Jenvey<p>This is the 11th status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which we<br> +can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a>.</p><p>Here's some highlights of the progress made since the previous update:</p><ul class="simple"><li>PyPy py3k now matches CPython 3's hash code for<br> +int/float/complex/Decimal/Fraction</li> +<li>Various outstanding unicode identifier related issues were<br> +resolved. E.g. test_importlib/pep263/ucn/unicode all now fully pass. 
Various<br> +usage of identifiers (in particular type and module names) have been fixed to<br> +handle non-ascii names -- mostly around display of reprs and exception<br> +messages.</li> +<li>The unicodedata database has been upgraded to 6.0.0.</li> +<li>Windows support has greatly improved, though it could still use some more<br> +help (but so does the default branch to a certain degree).</li> +<li>Probably the last of the parsing related bugs/features have been taken care<br> +of.</li> +<li>Of course various other smaller miscellaneous fixes</li> +</ul><p>This leaves the branch w/ only about 5 outstanding failures of the stdlib test<br> +suite:</p><ul><li><p class="first">test_float</p><p>1 failing test about containment of floats in collections.</p></li> +<li><p class="first">test_memoryview</p><p>Various failures: requires some bytes/str changes among other things (Manuel<br> +Jacob's has some progress on this on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/compare/py3k-memoryview..py3k">py3k-memoryview branch</a>)</p></li> +<li><p class="first">test_multiprocessing</p><p>1 or more tests deadlock on some platforms</p></li> +<li><p class="first">test_sys and test_threading</p><p>2 failing tests for the New GIL's new API</p></li> +</ul><p>Probably the biggest feature left to tackle is the New GIL.</p><p>We're now pretty close to pushing an initial release. 
We had planned for one<br> +around PyCon, but having missed that we've put some more effort into the branch<br> +to provide a more fully-fledged initial release.</p><p>Thanks to the following for their contributions: Manuel Jacob, Amaury Forgeot<br> +d'Arc, Karl Ramm, Jason Chu and Christian Hudon.</p><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2013/06/py3k-status-update-11-133025715908408072.htmlWed, 12 Jun 2013 19:17:00 GMTPy3k status update #10https://www.pypy.org/posts/2013/03/py3k-status-update-10-6681398990092286007.htmlPhilip Jenvey<p>This is the tenth status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which we<br> +can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a>.</p><p>There's been significant progress since the last update: the <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3k">linux x86-32<br> +buildbot</a> now passes 289 out of approximately 354 modules (with 39 skips) of<br> +CPython's regression test suite.</p><p>That means there's only 26 test module failures left! 
The list of major items<br> +remaining for 3.2 compatibility are now short enough to list here, with their<br> +related tests:</p><ul class="simple"><li>Tokenizer support for non-ascii identifiers</li> +</ul><blockquote><ul class="simple"><li>test_importlib</li> +<li>test_pep263</li> +</ul></blockquote><ul class="simple"><li>memoryview (Manuel Jacob's tackling this on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/compare/py3k-memoryview..py3k">py3k-memoryview branch</a>)</li> +</ul><blockquote><ul class="simple"><li>test_memoryview</li> +</ul></blockquote><ul class="simple"><li>multiprocessing module currently deadlocks</li> +</ul><blockquote><ul class="simple"><li>test_multiprocessing</li> +</ul></blockquote><ul class="simple"><li>Buggy handling of the new extended unpacking syntax by the compiler:</li> +</ul><blockquote><ul class="simple"><li>test_unpack_ex</li> +</ul></blockquote><ul class="simple"><li>The new Global Interpreter Lock and new thread signal handling</li> +</ul><blockquote><ul class="simple"><li>test_threading</li> +<li>test_threadsignals</li> +<li>test_sys</li> +</ul></blockquote><ul class="simple"><li>Upgrade unicodedata to 6.0.0 (requires updates to the actual unicodedata<br> +generation script)</li> +</ul><blockquote><ul class="simple"><li>test_ucn</li> +<li>test_unicode</li> +<li>test_unicodedata</li> +</ul></blockquote><ul class="simple"><li><a class="reference external" href="https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html">CPyExt</a></li> +</ul><blockquote><ul class="simple"><li>test_capi (currently crashes)</li> +</ul></blockquote><ul class="simple"><li>Update int's hash code to match to CPython (float's is already updated on the<br> +<a class="reference external" href="https://bitbucket.org/pypy/pypy/compare/py3k-newhash..py3k">py3k-newhash branch</a>. 
note that PyPy 2.x doesn't even totally match<br> +CPython's hashing)</li> +</ul><blockquote><ul class="simple"><li>test_decimal</li> +<li>test_fractions</li> +<li>test_numeric_tower</li> +</ul></blockquote><ul class="simple"><li>Miscellaneous:</li> +</ul><blockquote><ul class="simple"><li>test_complex</li> +<li>test_float</li> +<li>test_peepholer</li> +<li>test_range</li> +<li>test_sqlite (a new cffi based version seems to be coming)</li> +<li>test_ssl</li> +<li>test_struct</li> +<li>test_subprocess</li> +<li>test_sys_settrace</li> +<li>test_time</li> +</ul></blockquote><p>Additionally there are still a number of failures in PyPy's internal test<br> +suite. These tests are usually run against untranslated versions of PyPy during<br> +development. However we've now begun running them against a fully translated<br> +version of PyPy on the buildbot too (thanks to Amaury for setting this<br> +up). This further ensures that our tests and implementation are sane.</p><p>We're getting closer to producing an initial alpha release. Before that happens<br> +we'd like to see:</p><ul class="simple"><li>further test fixes</li> +<li>the results of test runs on other major platforms (e.g. 
linux x86-64 and osx<br> +seem to have some additional failures as of now)</li> +<li>some basic real world testing</li> +</ul><p>Finally I'd like to thank Manuel Jacob for his various contributions over the<br> +past month, including fixing the array and ctypes modules among other things,<br> +and also Amaury Forgeot d'Arc for his ongoing excellent contributions.</p><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2013/03/py3k-status-update-10-6681398990092286007.htmlTue, 05 Mar 2013 20:00:00 GMTPy3k status update #9https://www.pypy.org/posts/2013/01/py3k-status-update-9-98332471264591773.htmlPhilip Jenvey<p>This is the ninth status update about our work on the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3k">py3k branch</a>, which<br> +we can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k<br> +proposal</a>.</p><p>Just a very short update on December's work: we're now passing about 223 of<br> +approximately 355 modules of CPython's regression test suite, up from passing<br> +194 last month.</p><p>Some brief highlights:</p><ul class="simple"><li>More encoding related issues were addressed. e.g. 
now most if not all the<br> +multibytecodec test modules pass.</li> +<li>Fixed some path handling issues (<tt class="docutils literal">test_os</tt>, <tt class="docutils literal">test_ntpath</tt> and<br> +<tt class="docutils literal">test_posixpath</tt> now pass)</li> +<li>We now pass <tt class="docutils literal">test_class</tt>, <tt class="docutils literal">test_descr</tt> and almost <tt class="docutils literal">test_builtin</tt> (among<br> +other things): these are notable as they are fairly extensive test suites of<br> +core aspects of the language.</li> +<li>Amaury Forgeot d'Arc continued making progress on <a class="reference external" href="https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html">CPyExt</a> (thanks again!)</li> +</ul><p>cheers,<br> +Phil</p>pypy3https://www.pypy.org/posts/2013/01/py3k-status-update-9-98332471264591773.htmlThu, 10 Jan 2013 06:04:00 GMTPy3k status update #8https://www.pypy.org/posts/2012/12/py3k-status-update-8-3932232806458251730.htmlPhilip Jenvey<p>This is the eighth status update about our work on the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22py3k%22%29">py3k branch</a>, which<br> +we can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k<br> +proposal</a>.</p><p>Just a short update on November's work: we're now passing about 194 of<br> +approximately 355 modules of CPython's regression test suite, up from passing<br> +160 last month. 
Many test modules only fail a small number of individual tests<br> +now.</p><p>We'd like to thank Amaury Forgeot d'Arc for his contributions, in particular he<br> +has made significant progress on updating <a class="reference external" href="https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html">CPyExt</a> for Python 3 this month.</p><p>Some other highlights:</p><ul class="simple"><li><tt class="docutils literal">test_marshal</tt> now passes, and there's been significant progress on<br> +pickling (thanks <a class="reference external" href="https://twitter.com/Joushou">Kenny Levinsen</a> and Amaury for implementing<br> +<tt class="docutils literal"><span class="pre">int.{to,from}_bytes</span></tt>)</li> +<li>We now have a <tt class="docutils literal">_posixsubprocess</tt> module</li> +<li>More encoding related fixes, which affects many failing tests</li> +<li><tt class="docutils literal">_sre</tt> was updated and now <tt class="docutils literal">test_re</tt> almost passes</li> +<li>Exception behavior is almost complete per the Python 3 specs, what's mostly<br> +missing now are the new <tt class="docutils literal">__context__</tt> and <tt class="docutils literal">__traceback__</tt> attributes (<a class="reference external" href="https://www.python.org/dev/peps/pep-3134/">PEP<br> +3134</a>)</li> +<li>Fixed some crashes and deadlocks occurring during the regression tests</li> +<li>We merged the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22unicode-strategies%22%29">unicode-strategies</a> branch both to default and to py3k: now we<br> +have versions of lists, dictionaries and sets specialized for unicode<br> +elements, as we already had for strings.</li> +<li>However, for string-specialized containers are still faster in some cases<br> +because there are shortcuts which have not been implemented for unicode yet<br> +(e.g., constructing a set of strings from a list of strings). 
The plan is to<br> +completely kill the shortcuts and improve the JIT to produce the fast<br> +version automatically for both the string and unicode versions, to have a<br> +more maintainable codebase without sacrificing the speed. The <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/all/tip/branch%28%22autoreds%22%29">autoreds</a><br> +branch (already merged) was a first step in this direction.</li> +</ul><p>cheers,<br> +Philip&amp;Antonio</p>pypy3https://www.pypy.org/posts/2012/12/py3k-status-update-8-3932232806458251730.htmlTue, 04 Dec 2012 22:30:00 GMTPy3k status update #7https://www.pypy.org/posts/2012/11/py3k-status-update-7-6182140595418083307.htmlPhilip Jenvey<p>This is the seventh status update about our work on the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3k">py3k branch</a>, which<br> +we can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k<br> +proposal</a>.</p><p>The biggest news is that this month Philip started to work on py3k in parallel<br> +to Antonio. 
As such, there was an increased amount of activity.</p><p>The <a class="reference external" href="https://buildbot.pypy.org/summary?branch=py3k">py3k buildbots</a> now fully translate the branch every night and run the<br> +Python standard library tests.</p><p>We currently pass 160 out of approximately 355 modules of CPython's standard<br> +test suite, fail 144 and skip approximately 51.</p><p>Some highlights:</p><ul class="simple"><li>dictviews (the objects returned by dict.keys/values/items) have been greatly<br> +improved, and now they fully support set operators</li> +<li>a lot of tests have been fixed wrt complex numbers (and in particular the<br> +<tt class="docutils literal">__complex__</tt> method)</li> +<li>_csv has been fixed and now it correctly handles unicode instead of bytes</li> +<li>more parser fixes, py3k list comprehension semantics; now you can no longer<br> +access the list comprehension variable after it finishes</li> +<li>2to3'd most of the lib_pypy modules (pypy's custom standard lib<br> +replacements/additions)</li> +<li>py3-enabled pyrepl: this means that finally readline works at the command<br> +prompt, as well as builtins.input(). 
<tt class="docutils literal">pdb</tt> seems to work, as well as<br> +<a class="reference external" href="https://pypi.python.org/pypi/fancycompleter">fancycompleter</a> to get colorful TAB completions :-)</li> +<li>py3 round</li> +<li>further tightening/cleanup of the unicode handling (more usage of<br> +surrogateescape, surrogatepass among other things)</li> +<li>as well as keeping up with some big changes happening on the default branch<br> +and of course various other fixes.</li> +</ul><p>Finally, we would like to thank Amaury Forgeot d'Arc for his significant<br> +contributions.</p><p>cheers,<br> +Philip&amp;Antonio</p>pypy3https://www.pypy.org/posts/2012/11/py3k-status-update-7-6182140595418083307.htmlFri, 02 Nov 2012 15:47:00 GMTPy3k status update #6https://www.pypy.org/posts/2012/09/py3k-status-update-6-4049281716377789914.htmlAntonio Cuni<p>This is the sixth status update about our work on the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3k">py3k branch</a>, which we<br> +can work on thanks to all of the people who <a class="reference external" href="https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html">donated</a> to the <a class="reference external" href="https://pypy.org/py3donate.html">py3k proposal</a>.</p><p>The coolest news is not about what we did in the past weeks, but what we will<br> +do in the next: I am pleased to announce that <a class="reference external" href="https://twitter.com/pjenvey">Philip Jenvey</a> has been<br> +selected by the PyPy community to be funded for his upcoming work on py3k,<br> +thanks to your generous donations. He will start to work on it shortly, and he<br> +will surely help the branch to make faster progress. 
I am also particularly<br> +happy about this because Philip is the first non-core developer who is getting<br> +paid with donations: he demonstrated over the past months to be able to work<br> +effectively on PyPy, and so we were happy to approve his application for the<br> +job. This means that anyone can potentially be selected in the future, the<br> +only strict requirement is to have a deep interest in working on PyPy and to<br> +prove to be able to do so by contributing to the project.</p><p>Back to the status of the branch. Most of the work since the last status<br> +update has been done in the area of, guess what? Unicode strings. As usual,<br> +this is one of the most important changes between Python 2 and Python 3, so<br> +it's not surprising. The biggest news is that now PyPy internally supports<br> +unicode identifiers (such as names of variables, functions, attributes, etc.),<br> +whereas earlier it supported only ASCII bytes strings. The change is still<br> +barely visible from the outside, because the parser still rejects non-ASCII<br> +identifiers, however you can see it with a bit of creativity:</p><pre class="literal-block">&gt;&gt;&gt;&gt; def foo(x): pass +&gt;&gt;&gt;&gt; foo(**{'àèìòù': 42}) +Traceback (most recent call last): + File "&lt;console&gt;", line 1, in &lt;module&gt; +TypeError: foo() got an unexpected keyword argument 'àèìòù' +</pre><p>Before the latest changes, you used to get question marks instead of the<br> +proper name for the keyword argument. Although this might seem like a small<br> +detail, it is a big step towards a proper working Python 3 interpreter and it<br> +required a couple of days of headaches. 
A spin-off of this work is that now<br> +RPython has better built-in support for unicode (also in the default branch):<br> +for example, it now supports unicode string formatting (using the percent<br> +operator) and the methods <tt class="docutils literal"><span class="pre">.encode/.decode('utf-8')</span></tt>.</p><p>Other than that there is the usual list of smaller issues and bugs that got<br> +fixed, including (but not limited to):</p><blockquote><ul class="simple"><li>teach the compiler when to emit the new opcode <tt class="docutils literal">DELETE_DEREF</tt> (and<br> +implement it!)</li> +<li>detect when we use spaces and TABs inconsistently in the source code, as<br> +CPython does</li> +<li>fix yet another bug related to the new lexically scoped exceptions (this<br> +is the last one, hopefully)</li> +<li>port some of the changes that we did to the standard CPython 2.7 tests to<br> +3.2, to mark those which are implementation details and should not be run on<br> +PyPy</li> +</ul></blockquote><p>Finally, I would like to thank Amaury Forgeot d'Arc and Ariel Ben-Yehuda for<br> +their work on the branch; among other things, Amaury recently worked on<br> +<tt class="docutils literal">cpyext</tt> and on the PyPy <tt class="docutils literal">_cffi_backend</tt>, while Ariel submitted a patch to<br> +implement <a class="reference external" href="https://www.python.org/dev/peps/pep-3138/">PEP 3138</a>.</p>pypy3https://www.pypy.org/posts/2012/09/py3k-status-update-6-4049281716377789914.htmlWed, 26 Sep 2012 09:50:00 GMT \ No newline at end of file diff --git a/categories/pyqt4.html b/categories/pyqt4.html new file mode 100644 index 000000000..6168dba89 --- /dev/null +++ b/categories/pyqt4.html @@ -0,0 +1,114 @@ + + + + + +Posts about PyQt4 | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/pyqt4.xml b/categories/pyqt4.xml new file mode 100644 index 000000000..5d506dd03 --- /dev/null +++ b/categories/pyqt4.xml @@ -0,0 +1,40 @@ + +PyPy (Posts about PyQt4)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssUsing CPython extension modules with PyPy, or: PyQt on PyPyhttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlAlexander Schremmer<div class="document" id="using-cpython-extension-modules-with-pypy-or-pyqt-on-pypy"> + +<p>If you have ever wanted to use CPython extension modules on PyPy, +we want to announce that there is a solution that should be compatible +to quite a bit of the available modules. It is neither new nor written +by us, but works nevertheless great with PyPy.</p> +<p>The trick is to use RPyC, a transparent, symmetric remote procedure +call library written in Python. The idea is to start a +CPython process that hosts the PyQt libraries +and connect to it via TCP to send RPC commands to it.</p> +<p>I tried to run PyQt applications +using it on PyPy and could get quite a bit of the functionality of these +working. Remaining problems include regular segfaults of CPython +because of PyQt-induced memory corruption and bugs because classes +like StandardButtons behave incorrectly when it comes to arithmetical operations.</p> +<p>Changes to RPyC needed to be done to support remote unbound <tt class="docutils literal"><span class="pre">__init__</span></tt> methods, +shallow call by value for list and dict types (PyQt4 methods want real lists and dicts +as parameters), and callbacks to methods (all remote method objects are wrapped into +small lambda functions to ease the call for PyQt4).</p> +<p>If you want to try RPyC to run the PyQt application of your choice, you just +need to follow these steps. 
Please report your experience here in the blog +comments or on our <a class="reference external" href="https://codespeak.net/mailman/listinfo/pypy-dev">mailing list</a>.</p> +<blockquote> +<ol class="arabic simple"> +<li>Download RPyC from the <a class="reference external" href="https://sourceforge.net/projects/rpyc/files/">RPyC download page</a>.</li> +<li>Download this <a class="reference external" href="https://codespeak.net/svn/user/xoraxax/rpyc-3.0.7-pyqt4-compat.patch">patch</a> and apply it to RPyC by running +<tt class="docutils literal"><span class="pre">patch</span> <span class="pre">-p1</span> <span class="pre">&lt;</span> <span class="pre">rpyc-3.0.7-pyqt4-compat.patch</span></tt> in the RPyC directory.</li> +<li>Install RPyc by running <tt class="docutils literal"><span class="pre">python</span> <span class="pre">setup.py</span> <span class="pre">install</span></tt> as root.</li> +<li>Run the file <tt class="docutils literal"><span class="pre">rpyc/servers/classic_server.py</span></tt> using CPython.</li> +<li>Execute your PyQt application on PyPy.</li> +</ol> +</blockquote> +<p>PyPy will automatically connect to CPython and use its PyQt libraries.</p> +<p>Note that this scheme works with nearly every extension library. Look +at <tt class="docutils literal"><span class="pre">pypy/lib/sip.py</span></tt> on how to add new libraries (you need to create +such a file for every proxied extension module).</p> +<p>Have fun with PyQt</p> +<p>Alexander Schremmer</p> +</div>CPythonextension modulesPyQt4RPyChttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlMon, 30 Nov 2009 11:19:00 GMT \ No newline at end of file diff --git a/categories/release.html b/categories/release.html new file mode 100644 index 000000000..34808c615 --- /dev/null +++ b/categories/release.html @@ -0,0 +1,297 @@ + + + + + +Posts about release | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Posts about release

                + + +
                +
                + + \ No newline at end of file diff --git a/categories/release.xml b/categories/release.xml new file mode 100644 index 000000000..e43d64cd1 --- /dev/null +++ b/categories/release.xml @@ -0,0 +1,818 @@ + +PyPy (Posts about release)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy v7.3.17 releasehttps://www.pypy.org/posts/2024/08/pypy-v7317-release.htmlmattip<section id="pypy-v7-3-17-release-of-python-2-7-and-3-10"> +<h2>PyPy v7.3.17: release of python 2.7 and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.17 of PyPy.</p> +<p>This release includes a new <a class="reference internal" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#risc-v-jit-backend">RISC-V JIT backend</a>, an <a class="reference internal" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#improved-repl">improved REPL</a> based on +work by the CPython team, and <a class="reference internal" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#better-jit-optimizations">better JIT optimizations</a> of integer +operations. 
Special shout-outs to <a class="reference external" href="https://github.com/loganchien">Logan Chien</a> for the <a class="reference external" href="https://github.com/pypy/pypy/pull/5002">RISC-V backend +work</a>, to <a class="reference external" href="https://github.com/nirit100">Nico Rittinghaus</a> for better integer optimization in the JIT, and +the CPython team that has worked on the repl.</p> +<p>The release includes two different interpreters:</p> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.</p></li> +</ul> +<p>The interpreters are based on much the same codebase, thus the dual +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.16 release on April 23, 2024.</p> +<p>We recommend updating. You can find links to download the releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. 
PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2024/08/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2024/08/project-ideas.html">help</a> with +making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="risc-v-backend-for-the-jit"> +<span id="risc-v-jit-backend"></span><h3>RISC-V backend for the JIT</h3> +<p>PyPy's JIT has added support for generating 64-bit RISC-V machine code at +runtime (RV64-IMAD, specifically). So far we are not releasing binaries for any +RISC-V platforms, but there are <a class="reference external" href="https://rpython.readthedocs.io/en/latest/riscv.html">instructions</a> on how to cross-compile binaries.</p> +</section> +<section id="repl-improvements"> +<span id="improved-repl"></span><h3>REPL Improvements</h3> +<p>The biggest user-visible change of the release is new features in the repl of +PyPy3.10. CPython 3.13 has adopted and extended PyPy's pure-Python repl, adding +a number of features and fixing a number of bugs in the process. 
We have +backported and added the following features:</p> +<ul class="simple"> +<li><p>Prompts and tracebacks use terminal colors, as well as <a class="reference external" href="https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda">terminal hyperlinks</a> +for file names.</p></li> +<li><p><a class="reference external" href="https://en.wikipedia.org/wiki/Bracketed-paste">Bracketed paste</a> enables pasting several lines of input into the terminal +without auto-indentation getting in the way.</p></li> +<li><p>A special interactive help browser (F1), history browser (F2), explicit paste +mode (F3).</p></li> +<li><p>Support for Ctrl-&lt;left/right&gt; to jump over whole words at a time.</p></li> +</ul> +<p>See the <a class="reference external" href="https://docs.python.org/3.13/whatsnew/3.13.html#a-better-interactive-interpreter">CPython documentation for further details</a>. Thanks to Łukasz Langa, +Pablo Galindo Salgado and the other CPython devs involved in this work.</p> +</section> +<section id="better-jit-optimizations-of-integer-operations"> +<span id="better-jit-optimizations"></span><h3>Better JIT optimizations of integer operations</h3> +<p>The optimizers of PyPy's JIT have become much better at reasoning about and +optimizing integer operations. This is done with a new <a class="reference external" href="https://pypy.org/posts/2024/08/toy-knownbits.html">"knownbits" abstract +domain</a>. In many programs that do bit-manipulation of integers, some of the +bits of the integer variables of the program can be statically known. 
Here's a +simple example:</p> +<div class="code"><pre class="code python"><a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-1" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-1" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-1"></a><span class="n">x</span> <span class="o">=</span> <span class="n">a</span> <span class="o">|</span> <span class="mi">1</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-2" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-2" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-2"></a><span class="o">...</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-3" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-3" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-3"></a><span class="k">if</span> <span class="n">x</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-4" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-4" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-4"></a> <span class="o">...</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-5" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-5" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-5"></a><span class="k">else</span><span class="p">:</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-6" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-6" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-6"></a> <span class="o">...</span> +</pre></div> +<p>With the new abstract domain, the JIT can optimize the <code class="docutils literal">if</code>-condition to +<code class="docutils literal">True</code>, because it already knows that the 
lowest bit of <code class="docutils literal">x</code> must be set. +This optimization applies to all Python-integers that fit into a machine word +(PyPy optimistically picks between two different representations for <code class="docutils literal">int</code>, +depending on the size of the value). Unfortunately there is very little impact +of this change on almost all Python code, because intensive bit-manipulation is +rare in Python. However, the change leads to significant performance +improvements in <a class="reference external" href="https://docs.pydrofoil.org/en/latest/">Pydrofoil</a> (the RPython-based RISC-V/ARM emulators that are +automatically generated from high-level <a class="reference external" href="https://github.com/rems-project/sail/">Sail</a> specifications of the respective +ISAs, and that use the RPython JIT to improve performance).</p> +</section> +<section id="pypy-versions-and-speed-pypy-org"> +<h3>PyPy versions and speed.pypy.org</h3> +<p>The keen-eyed will have noticed no mention of Python version 3.9 in the +releases above. Typically we will maintain only one version of Python3, but due +to PyPy3.9 support on conda-forge we maintained multiple versions from the +first release of PyPy3.10 in PyPy v7.3.12 (Dec 2022). Conda-forge is +<a class="reference external" href="https://pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.html">sunsetting its PyPy support</a>, which means we can drop PyPy3.9. Since that was +the major driver of benchmarks at <a class="reference external" href="https://speed.pypy.org">https://speed.pypy.org</a>, we revamped the site +to showcase PyPy3.9, PyPy3.10, and various versions of cpython on the home +page. 
For historical reasons, the "baseline" for comparison is still cpython +3.7.19.</p> +<p>We will keep the buildbots building PyPy3.9 until the end of August; these +builds will still be available on the <a class="reference external" href="https://buildbot.pypy.org/nightly/">nightly builds</a> tab of the buildbot.</p> +</section> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>) and macOS (<code class="docutils literal">macos_arm64</code>).</p></li> +</ul> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, Linux ARM +32 bit, RISC-V RV64IMAFD Linux, and s390x Linux but does not release binaries. +Please reach out to us if you wish to sponsor binary releases for those +platforms. 
Downstream packagers provide binary builds for debian, Fedora, +conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.17 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.17.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2024/08/pypy-v7317-release.htmlWed, 28 Aug 2024 12:22:08 GMTPyPy v7.3.16 releasehttps://www.pypy.org/posts/2024/04/pypy-v7316-release.htmlmattip<section id="pypy-v7-3-16-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.16: release of python 2.7, 3.9, and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.16 of PyPy.</p> +<p>This release includes security fixes from upstream CPython, and bugfixes to the +garbage collector, described in a <a class="reference external" href="https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.html">gc bug-hunt blog post</a>.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.19.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.15 release on Jan 15, 2024</p> +<p>We recommend updating. 
You can find links to download the v7.3.16 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2024/04/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2024/04/project-ideas.html">help</a> with +making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.16 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.16.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2024/04/pypy-v7316-release.htmlTue, 23 Apr 2024 12:22:08 GMTPyPy v7.3.15 releasehttps://www.pypy.org/posts/2024/01/pypy-v7315-release.htmlmattip<section id="pypy-v7-3-15-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.15: release of python 2.7, 3.9, and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.15 of PyPy.</p> +<p>This is primarily a bug-fix release, and includes work done to migrate PyPy to +Git and Github.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.14 release on Dec 25, 2023</p> +<p>We recommend updating. 
You can find links to download the v7.3.15 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2024/01/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2024/01/project-ideas.html">help</a> with +making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.15 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.15.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2024/01/pypy-v7315-release.htmlMon, 15 Jan 2024 12:22:08 GMTPyPy v7.3.14 releasehttps://www.pypy.org/posts/2023/12/pypy-v7314-release.htmlmattip<section id="pypy-v7-3-14-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.14: release of python 2.7, 3.9, and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.14 of PyPy.</p> +<p>Highlights of this release are compatibility with <a class="reference external" href="https://hpyproject.org/blog/posts/2023/10/hpy-0.9.0-fourth-public-release/">HPy-0.9</a>, cffi 1.16, +additional C-API interfaces, and more python3.10 fixes.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.13 release on Sept 29, 2023.</p> +<p>We recommend updating. 
You can find links to download the v7.3.14 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. Since the last release we have contributions from three new +contributors. PyPy has many layers and we need help with all of them: bug +fixes, <a class="reference external" href="https://www.pypy.org/posts/2023/12/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2023/12/project-ideas.html">help</a> +with making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.14 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.14.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2023/12/pypy-v7314-release.htmlMon, 25 Dec 2023 04:22:08 GMTPyPy v7.3.13 releasehttps://www.pypy.org/posts/2023/09/pypy-v7313-release.htmlmattip<section id="pypy-v7-3-13-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.13: release of python 2.7, 3.9, and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.13 of PyPy. +This is primarily a security/bug-fix release. CPython released security +patches, and this release also improves the ability to use type +specifications via <code class="docutils literal">PyType_FromSpec</code> and friends. There are also some +small speed-ups.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13. Note it requires at +least cython 0.29.35 or cython 3.0.0b3.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. 
It follows after 7.3.12 release on June 16, 2023.</p> +<p>We recommend updating. You can find links to download the v7.3.13 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2023/09/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2023/09/project-ideas.html">help</a> with making +RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.13 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.13.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2023/09/pypy-v7313-release.htmlFri, 29 Sep 2023 04:22:08 GMTPyPy v7.3.12 releasehttps://www.pypy.org/posts/2023/06/pypy-v7312-release.htmlmattip<section id="pypy-v7-3-12-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.12: release of python 2.7, 3.9, and 3.10.</h2> +<p>The PyPy team is proud to release version 7.3.12 of PyPy. +This release includes a new string-to-int algorithm (also appearing in CPython +3.12) that is faster than the older one; support for symlinks in Windows; and +our first Python3.10 version.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.17.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.12. This is our first +release of 3.10, but based on past experience we are quite confident in +its compatibility with upstream. Of course, we recommend testing your code +with this new version before putting it into production. 
Note it does +require at least cython 0.29.35 or cython 3.0.0b3</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.11 release on Dec 29, 2022</p> +<p>We recommend updating. You can find links to download the v7.3.12 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2023/06/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2023/06/project-ideas.html">help</a> with making +RPython's JIT even better. 
Since the previous release, we have accepted +contributions from one new contributor, thanks for pitching in, and welcome +to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.9 and +3.10. It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy support Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. 
Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.12 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.12.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2023/06/pypy-v7312-release.htmlFri, 16 Jun 2023 04:22:08 GMTPyPy v7.3.11 releasehttps://www.pypy.org/posts/2022/12/pypy-v7311-release.htmlThe PyPy Team<section id="pypy-v7-3-11-release-of-python-2-7-3-8-and-3-9"> +<h2>PyPy v7.3.11: release of python 2.7, 3.8, and 3.9</h2> +<p>The PyPy team is proud to release version 7.3.11 of PyPy. As could be expected, +the first release of macOS arm64 impacted the macOS x86-64 build, so this is +a bug release to restore the ability of macOS users to run PyPy on +<code class="docutils literal">macOS &lt; 11.0</code>. It also incorporates the latest CPython stdlib updates +released the day after 7.3.10 went out, and a few more bug fixes. The release +includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.16. 
Note we intend to drop +support for this version in an upcoming release as soon as we release +Python 3.10.</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.16.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases and follows quickly on the heels of the 7.3.10 release on Dec 6.</p> +<p>We recommend updating. You can find links to download the v7.3.11 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2022/12/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2022/12/project-ideas.html">help</a> with making +RPython's JIT even better. 
Since the previous release, we have accepted +contributions from one new contributor, thanks for pitching in, and welcome +to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. +In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and +3.9. It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy support Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. 
Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.11 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.11.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2022/12/pypy-v7311-release.htmlThu, 29 Dec 2022 13:22:08 GMTPyPy v7.3.10 releasehttps://www.pypy.org/posts/2022/12/pypy-v7310-release.htmlThe PyPy Team<section id="pypy-v7-3-10-release-of-python-2-7-3-8-and-3-9"> +<h2>PyPy v7.3.10: release of python 2.7, 3.8, and 3.9</h2> +<p>The PyPy team is proud to release version 7.3.10 of PyPy. We have some nice +speedups and bugfixes we wish to share. The release includes three different +interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.15.</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.15. We have gained +confidence in the stability of this version, and are removing the "beta" +label.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. 
Highlights of the release, since the release of 7.3.9 in March 2022 +include:</p> +<blockquote> +<ul class="simple"> +<li><p>A release of Apple Silicon M1 arm64 versions. This work <a class="reference external" href="https://www.pypy.org/posts/2022/07/m1-support-for-pypy.html">was sponsored</a> by +an anonymous donor and is tested on our buildbots.</p></li> +<li><p>Many improvements to the basic interpreter to make it 15-20% faster</p></li> +<li><p>The conda-forge community <a class="reference external" href="https://www.pypy.org/posts/2022/11/pypy-and-conda-forge.html">has built</a> over 1000 packages for PyPy3.8 and 3.9, +making it easier than ever to use PyPy.</p></li> +<li><p>Update the packaged OpenSSL to 1.1.1s, sqlite3 to 3.39.4, and apply +applicable security fixes from CPython 3.9.15 to PyPy2.7</p></li> +<li><p>Update the <a class="reference external" href="https://hpyproject.org/">HPy</a> backend in PyPy3.8 and PyPy3.9 to 0.0.4</p></li> +</ul> +</blockquote> +<p>We recommend updating. You can find links to download the v7.3.10 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. 
PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2022/12/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2022/12/project-ideas.html">help</a> with making +RPython's JIT even better. Since the previous release, we have accepted +contributions from five new contributors, thanks for pitching in, and welcome +to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. +In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and +3.9. 
It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy support Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.10 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.10.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2022/12/pypy-v7310-release.htmlTue, 06 Dec 2022 13:22:08 GMTPyPy v7.3.9 security releasehttps://www.pypy.org/posts/2022/03/pypy-v738-release.htmlThe PyPy Team<section id="pypy-v7-3-9-security-release"> +<h2>PyPy v7.3.9 security release</h2> +<p>The PyPy team is proud to release version 7.3.9 of PyPy. 
This is a security +release to match the recent <a class="reference external" href="https://discuss.python.org/t/py-day-is-coming-a-joint-security-release-spree-for-python-3-7-3-8-3-9-and-3-10-on-march-14th">CPython release</a> and updates the portable pypy +tarballs with <code class="docutils literal">bzip2 1.0.8</code>, <code class="docutils literal">openssl1.1.1n</code>, and <code class="docutils literal">libexpat 2.4.7</code>. Along +the way this release fixes some issues discovered after the 7.3.8 release and +updates <code class="docutils literal">sqlite3</code> to 3.38.2. It includes:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.13. This will be the last +release of PyPy3.7.</p></li> +<li><p>PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.13.</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.12. We relate to this as +"beta" quality. We welcome testing of this version, if you discover +incompatibilities, please report them so we can gain confidence in the version.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. 
Highlights of the release, since the release of 7.3.8 in February 2022, +include:</p> +<blockquote> +<ul class="simple"> +<li><p>Fixed some failing stdlib tests on PyPy3.9</p></li> +<li><p>Update the bundled libexpat to 2.4.6 and sqlite3 to 3.38.2</p></li> +</ul> +</blockquote> +<p>We recommend updating. You can find links to download the v7.3.9 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: <a class="reference external" href="https://www.pypy.org/posts/2022/03/index.html">PyPy</a> +and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, tweaking popular modules to run +on PyPy, or general <a class="reference external" href="https://www.pypy.org/posts/2022/03/project-ideas.html">help</a> with making RPython's JIT even better. 
Since the +7.3.7 release, we have accepted contributions from 6 new contributors, +thanks for pitching in, and welcome to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. +In any case both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and +3.9. It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>This PyPy release supports:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux. A shoutout to Huawei for sponsoring +the VM running the tests.</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +<li><p>big- and little-endian variants of <strong>PPC64</strong> running Linux,</p></li> +</ul> +</blockquote> +<p>PyPy support Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but +does not release binaries. 
Please reach out to us if you wish to sponsor +releases for those platforms.</p> +</section> +<section id="known-issues-with-pypy3-9"> +<h3>Known Issues with PyPy3.9</h3> +<ul class="simple"> +<li><p>We slightly modified the concurrent future's <code class="docutils literal">ProcessPoolExecutor</code> to +start all the worker processes when the first task is received (like on +Python3.8) to avoid an apparent race condition when using <code class="docutils literal">fork</code> and +threads (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3650">3650</a>).</p></li> +</ul> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.9 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.9.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make PyPy better.</p> +<p>Cheers, +The PyPy team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2022/03/pypy-v738-release.htmlWed, 30 Mar 2022 05:53:45 GMTPyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9https://www.pypy.org/posts/2022/02/pypy-v738-release.htmlThe PyPy Team<section id="pypy-v7-3-8-release-of-python-2-7-3-7-3-8-and-3-9-beta"> +<h2>PyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9-beta</h2> +<p>The PyPy team is proud to release version 7.3.8 of PyPy. It has been only a few +months since our last release, but we have some nice speedups and bugfixes we +wish to share. The release includes four different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.12. 
This will be the last +release of PyPy3.7.</p></li> +<li><p>PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.12. This is our third +release of this interpreter, and we are removing the "beta" tag.</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.10. As this is our first +release of this interpreter, we relate to this as "beta" quality. We +welcome testing of this version, if you discover incompatibilities, please +report them so we can gain confidence in the version.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.7 in late October 2021, +include:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy3.9 uses an RPython version of the PEG parser which brought with it a +cleanup of the lexer and parser in general</p></li> +<li><p>Fixed a regression in PyPy3.8 when JITting empty list comprehensions</p></li> +<li><p>Tweaked some issues around changing the file layout after packaging to make +the on-disk layout of PyPy3.8 more compatible with CPython. This requires +<code class="docutils literal"><span class="pre">setuptools&gt;=58.1.0</span></code></p></li> +<li><p>RPython now allows the target executable to have a <code class="docutils literal">.</code> in its name, so +PyPy3.9 will produce a <code class="docutils literal"><span class="pre">pypy3.9-c</span></code> and <code class="docutils literal"><span class="pre">libpypy3.9-c.so</span></code>. 
Changing the +name of the shared object to be version-specific (it used to be +<code class="docutils literal"><span class="pre">libpypy3-c.so</span></code>) will allow it to live alongside other versions.</p></li> +<li><p>Building PyPy3.9+ accepts a <code class="docutils literal"><span class="pre">--platlibdir</span></code> argument like CPython.</p></li> +<li><p>Improvement in ssl's use of CFFI buffers to speed up <code class="docutils literal">recv</code> and <code class="docutils literal">recvinto</code></p></li> +<li><p>Update the packaged OpenSSL to 1.1.1m</p></li> +</ul> +</blockquote> +<p>We recommend updating. You can find links to download the v7.3.8 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: <a class="reference external" href="https://www.pypy.org/posts/2022/02/index.html">PyPy</a> +and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, tweaking popular modules to run +on PyPy, or general <a class="reference external" href="https://www.pypy.org/posts/2022/02/project-ideas.html">help</a> with making RPython's JIT even better. 
Since the +previous release, we have accepted contributions from 6 new contributors, +thanks for pitching in, and welcome to the project!</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. +In any case both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and +3.9. It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>This PyPy release supports:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux. A shoutout to Huawei for sponsoring +the VM running the tests.</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +<li><p>big- and little-endian variants of <strong>PPC64</strong> running Linux,</p></li> +</ul> +</blockquote> +<p>PyPy support Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but +does not release binaries. 
Please reach out to us if you wish to sponsor +releases for those platforms.</p> +</section> +<section id="known-issues-with-pypy3-9"> +<h3>Known Issues with PyPy3.9</h3> +<ul class="simple"> +<li><p>There is still a known <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3649">speed regression</a> around <code class="docutils literal">**kwargs</code> handling</p></li> +<li><p>We slightly modified the concurrent future's <code class="docutils literal">ProcessExcecutorPool</code> to +start all the worker processes when the first task is received (like on +Python3.8) to avoid an apparent race condition when using <code class="docutils literal">fork</code> and +threads (issue <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3650">3650</a>).</p></li> +</ul> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.8 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.8.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make PyPy better.</p> +<p>Cheers, +The PyPy team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2022/02/pypy-v738-release.htmlSun, 20 Feb 2022 05:53:45 GMT \ No newline at end of file diff --git a/categories/releasecffi.html b/categories/releasecffi.html new file mode 100644 index 000000000..e18c7abc5 --- /dev/null +++ b/categories/releasecffi.html @@ -0,0 +1,120 @@ + + + + + +Posts about releasecffi | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/releasecffi.xml b/categories/releasecffi.xml new file mode 100644 index 000000000..0e755655f --- /dev/null +++ b/categories/releasecffi.xml @@ -0,0 +1,96 @@ + +PyPy (Posts about releasecffi)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssCFFI release 0.3https://www.pypy.org/posts/2012/08/cffi-release-03-4740491796308953732.htmlArmin Rigo<p>Hi everybody,</p> +<p>We released <a class="reference external" href="https://cffi.readthedocs.org">CFFI 0.3</a>. This is the first release that supports more +than CPython 2.x <tt class="docutils literal"><span class="pre">:-)</span></tt></p> +<ul class="simple"> +<li>CPython 2.6, 2.7, and <strong>3.x</strong> are supported (3.3 definitely, but maybe 3.2 or earlier too)</li> +<li><strong>PyPy trunk</strong> is supported.</li> +</ul> +<p>In more details, the main news are:</p> +<ul class="simple"> +<li>support for PyPy. You need to get a trunk version of PyPy, which +comes with the built-in module <tt class="docutils literal">_cffi_backend</tt> to use with the CFFI +release. For testing, you can download the <a class="reference external" href="https://buildbot.pypy.org/nightly/trunk/">Linux 32/64 versions of +PyPy trunk</a>. The OS/X and Windows versions of <tt class="docutils literal">_cffi_backend</tt> +are not tested at all so far, so probably don't work yet.</li> +<li>support for Python 3. It is unknown which exact version is +required; probably 3.2 or even earlier, but we need 3.3 to run the +tests. The 3.x version is not a separate source; it runs out of the same sources. 
Thanks Amaury for starting this port.</li> +<li>the main change in the API is that you need to use <tt class="docutils literal">ffi.string(cdata)</tt> +instead of <tt class="docutils literal">str(cdata)</tt> or <tt class="docutils literal">unicode(cdata)</tt>. The motivation for this +change was the Python 3 compatibility. If your Python 2 code used to +contain <tt class="docutils literal"><span class="pre">str(&lt;cdata</span> 'char <span class="pre">*'&gt;)</span></tt>, it would interpret the memory content +as a null-terminated string; but on Python 3 it would just return a +different string, namely <tt class="docutils literal">"&lt;cdata 'char <span class="pre">*'&gt;"</span></tt>, and proceed without even +a crash, which is bad. So ffi.string() solves it by always returning +the memory content as an 8-bit string (which is a str in Python 2 and +a bytes in Python 3).</li> +<li>other minor API changes are documented at +<a class="reference external" href="https://cffi.readthedocs.org/">https://cffi.readthedocs.org/</a> (grep for <tt class="docutils literal">version 0.3</tt>).</li> +</ul> +<p>Upcoming work, to be done before release 1.0:</p> +<ul class="simple"> +<li>expose to the user the module <tt class="docutils literal">cffi.model</tt> in a possibly refactored +way, for people that don't like (or for some reason can't easily use) +strings containing snippets of C declarations. We are thinking about +refactoring it in such a way that it has a ctypes-compatible +interface, to ease porting existing code from ctypes to cffi. Note +that this would concern only the C type and function declarations, not +all the rest of ctypes.</li> +<li>CFFI 1.0 will also have a corresponding PyPy release. We are thinking +about calling it PyPy 2.0 and including the whole of CFFI (instead of +just the <tt class="docutils literal">_cffi_backend</tt> module like now). 
In other words it will +support CFFI out of the box --- we want to push forward usage of CFFI +in PyPy <tt class="docutils literal"><span class="pre">:-)</span></tt></li> +</ul> +<p>Cheers,</p> +<p>Armin Rigo and Maciej Fijałkowski</p>releasecffihttps://www.pypy.org/posts/2012/08/cffi-release-03-4740491796308953732.htmlMon, 13 Aug 2012 19:59:00 GMTCFFI release 0.2.1https://www.pypy.org/posts/2012/07/cffi-release-02-4800000428934604295.htmlArmin Rigo<p>Hi everybody,</p><p>We released <a class="reference" href="https://cffi.readthedocs.org">CFFI 0.2.1</a> (expected to be 1.0 soon). CFFI is a way to call C from Python.</p><p><b>EDIT:</b> Win32 was broken in 0.2. Fixed.</p><p>This release is only for CPython 2.6 or 2.7. PyPy support is coming in<br> +the <tt class="docutils literal"><span class="pre">ffi-backend</span></tt> branch, but not finished yet. CPython 3.x would be<br> +easy but requires the help of someone.</p><p>The package is available <a class="reference" href="https://foss.heptapod.net/cffi/cffi">on bitbucket</a> as well as <a class="reference" href="https://cffi.readthedocs.org">documented</a>. You<br> +can also install it straight from the <a href="https://pypi.python.org/pypi/cffi">python package index</a>: <tt class="docutils literal"><span class="pre">pip install cffi</span></tt></p><ul class="simple"><li>Contains numerous small changes and support for more C-isms.</li> +<li>The biggest news is the support for <a class="reference" href="https://cffi.readthedocs.org/en/latest/index.html#distributing-modules-using-cffi">installing packages</a> that use<br> +<tt class="docutils literal"><span class="pre">ffi.verify()</span></tt> on machines without a C compiler. 
Arguably, this<br> +lifts the last serious restriction for people to use CFFI.</li> +<li>Partial list of smaller changes:<ul><li>mappings between 'wchar_t' and Python unicodes</li> +<li>the introduction of ffi.NULL</li> +<li>a possibly clearer API for <tt class="docutils literal"><span class="pre">ffi.new()</span></tt>: e.g. to allocate a single <tt class="docutils literal"><span class="pre">int</span></tt> and obtain a pointer to it, use <tt class="docutils literal"><span class="pre">ffi.new("int *")</span></tt> instead of the old<br> +<tt class="docutils literal"><span class="pre">ffi.new("int")</span></tt></li> +<li>and of course a plethora of smaller bug fixes</li> +</ul></li> +<li>CFFI uses <tt class="docutils literal"><span class="pre">pkg-config</span></tt> to install itself if available. This helps<br> +locate <tt class="docutils literal"><span class="pre">libffi</span></tt> on modern Linuxes. Mac OS/X support is available too<br> +(see the detailed <a class="reference" href="https://cffi.readthedocs.org/en/latest/index.html#macos-10-6">installation instructions</a>). Win32 should work out<br> +of the box. Win64 has not been really tested yet.</li> +</ul><p>Cheers,<br> +Armin Rigo and Maciej Fijałkowski</p>releasecffihttps://www.pypy.org/posts/2012/07/cffi-release-02-4800000428934604295.htmlThu, 26 Jul 2012 17:45:00 GMTRelease 0.1 of CFFIhttps://www.pypy.org/posts/2012/06/release-01-of-cffi-4760622823232463868.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"><p>Hi.</p><p>We're pleased to announce the first public release, 0.1 of CFFI, a way to call C from Python.<br> +(This release does not support PyPy yet --- but we announce it here as it is planned for the<br> +next release :-)</p><p>The package is available <a class="reference external" href="https://foss.heptapod.net/cffi/cffi">on bitbucket</a> as well as <a class="reference external" href="https://cffi.readthedocs.org">documented</a>. 
You can also install it<br> +straight from the python package index (pip).</p><p>The aim of this project is to provide a convenient and reliable way of calling C code from Python.<br> +The interface is based on <a class="reference external" href="https://luajit.org/ext_ffi.html">LuaJIT's FFI</a> and follows a few principles:</p><ul class="simple"><li>The goal is to call C code from Python. You should be able to do so<br> +without learning a 3rd language: every alternative requires you to learn<br> +their own language (<a class="reference external" href="https://www.cython.org">Cython</a>, <a class="reference external" href="https://www.swig.org/">SWIG</a>) or API (<a class="reference external" href="https://docs.python.org/library/ctypes.html">ctypes</a>). So we tried to<br> +assume that you know Python and C and minimize the extra bits of API that<br> +you need to learn.</li> +<li>Keep all the Python-related logic in Python so that you don't need to<br> +write much C code (unlike <a class="reference external" href="https://docs.python.org/extending/extending.html">CPython native C extensions</a>).</li> +<li>Work either at the level of the ABI (Application Binary Interface)<br> +or the API (Application Programming Interface). Usually, C<br> +libraries have a specified C API but often not an ABI (e.g. they may<br> +document a "struct" as having at least these fields, but maybe more).<br> +(<a class="reference external" href="https://docs.python.org/library/ctypes.html">ctypes</a> works at the ABI level, whereas Cython or <a class="reference external" href="https://docs.python.org/extending/extending.html">native C extensions</a><br> +work at the API level.)</li> +<li>We try to be complete. 
For now some C99 constructs are not supported,<br> +but all C89 should be, including macros (and including macro "abuses",<br> +which you can manually wrap in saner-looking C functions).</li> +<li>We attempt to support both PyPy and CPython (although PyPy support is not<br> +complete yet) with a reasonable path for other Python implementations like<br> +IronPython and Jython.</li> +<li>Note that this project is <strong>not</strong> about embedding executable C code in<br> +Python, unlike <a class="reference external" href="https://www.scipy.org/Weave">Weave</a>. This is about calling existing C libraries<br> +from Python.</li> +</ul><div class="section" id="status-of-the-project"><h3>Status of the project</h3><p>Consider this as a beta release. Creating CPython extensions is fully supported and the API should<br> +be relatively stable; however, minor adjustments of the API are possible.</p><p>PyPy support is not yet done and this is a goal for the next release. There are vague plans to make this the<br> +preferred way to call C from Python that can reliably work between PyPy and CPython.</p><p>Right now CFFI's verify() requires a C compiler and header files to be available at run-time.<br> +This limitation will be lifted in the near future and it'll contain a way to cache the resulting binary.</p><p>Cheers,<br> +<br> +Armin Rigo and Maciej Fijałkowski</p></div></div>releasecffihttps://www.pypy.org/posts/2012/06/release-01-of-cffi-4760622823232463868.htmlMon, 18 Jun 2012 12:59:00 GMT \ No newline at end of file diff --git a/categories/releaserevdb.html b/categories/releaserevdb.html new file mode 100644 index 000000000..1662e683c --- /dev/null +++ b/categories/releaserevdb.html @@ -0,0 +1,114 @@ + + + + + +Posts about releaserevdb | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/releaserevdb.xml b/categories/releaserevdb.xml new file mode 100644 index 000000000..027355d25 --- /dev/null +++ b/categories/releaserevdb.xml @@ -0,0 +1,24 @@ + +PyPy (Posts about releaserevdb)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssRevDB released, v5.4.1https://www.pypy.org/posts/2016/09/revdb-released-v541-6719768292347391304.htmlArmin Rigo<p>Hi all,</p> + +<p> +The first beta version of <a href="https://bitbucket.org/pypy/revdb/">RevDB</a> is out! <a href="https://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html">Remember</a> that RevDB is a reverse debugger for Python. The idea is that it is a debugger that can run forward and backward in time, letting you more easily understand your subtle bug in your big Python program.</p> + +<p> +RevDB should work on almost any Python program. Even if you are normally only using CPython, trying to reproduce the bug with RevDB is similar to trying to run the program on a regular PyPy---usually it just works, <a href="https://pypy.org/compat.html">even if not quite always</a>. + +</p><p> +News from the alpha version in the <a href="https://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html">previous blog post</a> include notably support for: +</p><ul> +<li>Threads. +</li><li>CPyExt, the compatibility layer of PyPy that can run CPython C extension modules. +</li></ul> +as well as many other improvements. + +<p> +You need to build it yourself for now. It is tested on 64-bit Linux. 32-bit Linux, OS/X, and other POSIX platforms should all either work out of the box or be just a few fixes away (contributions welcome). 
Win32 support is a lot more involved but not impossible.</p> + +<p> +See <a href="https://bitbucket.org/pypy/revdb/">https://bitbucket.org/pypy/revdb/</a> for more information!</p> + +<p>Armin</p>releaserevdbhttps://www.pypy.org/posts/2016/09/revdb-released-v541-6719768292347391304.htmlSat, 10 Sep 2016 09:30:00 GMT \ No newline at end of file diff --git a/categories/releasestm.html b/categories/releasestm.html new file mode 100644 index 000000000..b1ca65396 --- /dev/null +++ b/categories/releasestm.html @@ -0,0 +1,114 @@ + + + + + +Posts about releasestm | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/releasestm.xml b/categories/releasestm.xml new file mode 100644 index 000000000..b9dc394c6 --- /dev/null +++ b/categories/releasestm.xml @@ -0,0 +1,182 @@ + +PyPy (Posts about releasestm)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy-STM: first "interesting" releasehttps://www.pypy.org/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.htmlArmin Rigo<p>Hi all,</p> + +<p>PyPy-STM is now reaching a point where we can say it's good enough to be +a GIL-less Python. (We don't guarantee there are no more bugs, so please +report them :-) The first official STM release:</p> + +<ul> +<li><a href="https://bitbucket.org/pypy/pypy/downloads/pypy-stm-2.3-r2-linux64.tar.bz2">pypy-stm-2.3-r2-linux64</a> +<br><i>(UPDATE: this is release r2, fixing a systematic segfault at start-up on some systems)</i></li> +</ul> + +<p>This corresponds roughly to PyPy 2.3 (not 2.3.1). It requires 64-bit +Linux. More precisely, this release is built for Ubuntu 12.04 to 14.04; +you can also <a href="https://pypy.org/download.html#building-from-source">rebuild it +from source</a> by getting the branch <strong>stmgc-c7</strong>. You need +clang to compile, and you need a <a href="https://bitbucket.org/pypy/stmgc/src/default/c7/llvmfix/">patched +version of llvm</a>.</p> + +<p>This version's performance can reasonably be compared with a regular +PyPy, where both include the JIT. Thanks for following the meandering progress of PyPy-STM over the past three years --- we're finally getting somewhere really interesting! We cannot thank enough all contributors to the <a href="https://pypy.org/tmdonate.html">previous PyPy-STM money pot</a> that made this possible. 
And, although this blog post is focused on the results from that period of time, I have of course to remind you that we're running a <a href="https://pypy.org/tmdonate2.html">second call for donation</a> for future work, which I will briefly mention again later.</p> + +<p>A recap of what we did to get there: <a href="https://www.pypy.org/posts/2014/02/rewrites-of-stm-core-model-again-633249729751034512.html">around the start of the year</a> we found a new model, a "redo-log"-based STM which uses a couple of hardware tricks to not require chasing pointers, giving it (in this context) exceptionally cheap read barriers. This idea <a href="https://www.pypy.org/posts/2014/03/hi-all-here-is-one-of-first-full-pypys-8725931424559481728.html">was developed</a> over the following months and (relatively) easily <a href="https://www.pypy.org/posts/2014/04/stm-results-and-second-call-for-1767845182888902777.html">integrated with the JIT compiler</a>. The most recent improvements on the Garbage Collection side are closing the gap with a regular PyPy (there is still a bit more to do there). There is some <a href="https://pypy.readthedocs.org/en/latest/stm.html">preliminary user documentation</a>.</p> + +<p>Today, the result of this is a PyPy-STM that is capable of running pure Python code on multiple threads in parallel, as we will show in the benchmarks that follow. A quick warning: this is only about pure Python code. We didn't try so far to optimize the case where most of the time is spent in external libraries, or even manipulating "raw" memory like <code>array.array</code> or numpy arrays. To some extent there is no point because the approach of CPython works well for this case, i.e. releasing the GIL around the long-running operations in C. 
Of course it would be nice if such cases worked as well in PyPy-STM --- which they do to some extent; but checking and optimizing that is future work.</p> + +<p>As a starting point for our benchmarks, when running code that +only uses one thread, we get a slow-down between 1.2 and 3: at worst, +three times as slow; at best only 20% slower than a regular +PyPy. This worst case has been brought down --it used to be 10x-- by +recent work on "card marking", a useful GC technique that is also +present in the regular PyPy (and about which I don't find any blog post; +maybe we should write one :-) The main remaining issue is fork(), or +any function that creates subprocesses: it works, but is very slow. To +remind you of this fact, it prints a line to stderr when used.</p> + +<p>Now the real main part: when you run multithreaded code, it scales very nicely with two +threads, and less-than-linearly but still not badly with three or four +threads. Here is an artificial example:</p> + +<pre> total = 0 + lst1 = ["foo"] + for i in range(100000000): + lst1.append(i) + total += lst1.pop()</pre> + +<p>We run this code N times, once in each of N threads +(<a href="https://bitbucket.org/pypy/benchmarks/raw/default/multithread/minibench1.py">full +benchmark</a>). 
Run times, best of three:</p> + +<table border="1" cellpadding="5"> +<tbody> +<tr><td>Number of threads</td> + <td>Regular PyPy (head)</td> + <td>PyPy-STM</td></tr> +<tr><td>N = 1</td> + <td>real <strong>0.92s</strong> <br> +user+sys 0.92s</td> + <td>real <strong>1.34s</strong> <br> +user+sys 1.34s</td></tr> +<tr><td>N = 2</td> + <td>real <strong>1.77s</strong> <br> +user+sys 1.74s</td> + <td>real <strong>1.39s</strong> <br> +user+sys 2.47s</td></tr> +<tr><td>N = 3</td> + <td>real <strong>2.57s</strong> <br> +user+sys 2.56s</td> + <td>real <strong>1.58s</strong> <br> +user+sys 4.106s</td></tr> +<tr><td>N = 4</td> + <td>real <strong>3.38s</strong> <br> +user+sys 3.38s</td> + <td>real <strong>1.64s</strong> <br> +user+sys 5.35s</td></tr> +</tbody></table> + +<p>(The "real" time is the wall clock time. The "user+sys" time is the +recorded CPU time, which can be larger than the wall clock time if +multiple CPUs run in parallel. This was run on a 4x2 cores machine. +For direct comparison, avoid loops that are so trivial +that the JIT can remove <b>all</b> allocations from them: right now +PyPy-STM does not handle this case well. It has to force a dummy allocation +in such loops, which makes minor collections occur much more frequently.)</p> + +<p>Four threads is the limit so far: only four threads can be executed in +parallel. Similarly, the memory usage is limited to 2.5 GB of GC +objects. These two limitations are not hard to increase, but at least +increasing the memory limit requires fighting against more LLVM bugs. +(Include here snark remarks about LLVM.)</p> + +<p>Here are some measurements from more real-world benchmarks. This time, +the amount of work is fixed and we parallelize it on T threads. The first benchmark is just running <a href="https://pypy.org/download.html#building-from-source">translate.py</a> on a trunk PyPy. 
The last +three benchmarks are <a href="https://bitbucket.org/pypy/benchmarks/src/default/multithread/">here</a>.</p> + +<table border="1" cellpadding="5"> +<tbody> +<tr><td>Benchmark</td> + <td>PyPy 2.3</td> + <td bgcolor="#A0A0A0">(PyPy head)</td> + <td>PyPy-STM, T=1</td> + <td>T=2</td> + <td>T=3</td> + <td>T=4</td></tr> +<tr><td><code>translate.py --no-allworkingmodules</code><br> +(annotation step)</td> + <td>184s</td> + <td bgcolor="#A0A0A0">(170s)</td> + <td>386s (2.10x)</td> + <td colspan="3">n/a</td></tr> +<tr><td>multithread-richards<br> +5000 iterations</td> + <td>24.2s</td> + <td bgcolor="#A0A0A0">(16.8s)</td> + <td>52.5s (2.17x)</td> + <td>37.4s (1.55x)</td> + <td>25.9s (1.07x)</td> + <td>32.7s (1.35x)</td></tr> +<tr><td>mandelbrot<br> +divided in 16-18 bands</td> + <td>22.9s</td> + <td bgcolor="#A0A0A0">(18.2s)</td> + <td>27.5s (1.20x)</td> + <td>14.4s (0.63x)</td> + <td>10.3s (0.45x)</td> + <td>8.71s (0.38x)</td></tr> +<tr><td>btree</td> + <td>2.26s</td> + <td bgcolor="#A0A0A0">(2.00s)</td> + <td>2.01s (0.89x)</td> + <td>2.22s (0.98x)</td> + <td>2.14s (0.95x)</td> + <td>2.42s (1.07x)</td></tr> +</tbody></table> + +<p>This shows various cases that can occur:</p> + +<ul><li>The mandelbrot example runs with minimal overhead and very good parallelization. +It's dividing the plane to compute in bands, and each of the T threads receives the +same number of bands. + +</li><li>Richards, a classical benchmark for PyPy (tweaked to run the iterations +in multiple threads), is hard to beat on regular PyPy: +we suspect that the difference is due to the fact that a lot of +paths through the loops don't allocate, triggering the issue already +explained above. Moreover, the speed of Richards was again improved +dramatically recently, in trunk. + +</li><li>The translation benchmark measures the time <code>translate.py</code> +takes to run the first phase only, "annotation" (for now it consumes too much memory +to run <code>translate.py</code> to the end). 
Moreover the timing starts only after the large number of +subprocesses spawned at the beginning (mostly gcc). This benchmark is not parallel, but we +include it for reference here. The slow-down factor of 2.1x is still too much, but +we have some idea about the reasons: most likely, again the Garbage Collector, missing the regular PyPy's +very fast small-object allocator for old objects. Also, <code>translate.py</code> +is an example of application that could, with +reasonable efforts, be made largely parallel in the future using <i>atomic blocks.</i> + +</li><li>Atomic blocks are also present in the btree benchmark. I'm not completely sure +but it seems that, in this case, the atomic blocks create too many +conflicts between the threads for actual parallelization: the base time is very good, +but running more threads does not help at all. +</li></ul> + +<p>As a summary, PyPy-STM looks already useful to run CPU-bound multithreaded +applications. We are certainly still going to fight slow-downs, but it +seems that there are cases where 2 threads are enough to outperform a regular +PyPy, by a large margin. Please try it out on your own small examples!</p> + +<p>And, at the same time, please don't attempt to retrofit threads inside +an existing large program just to benefit from PyPy-STM! +Our goal is not to send everyone down the obscure route of multithreaded +programming and its dark traps. We are going finally to shift our main +focus on the <a href="https://pypy.org/tmdonate2.html">phase 2 of our +research</a> (donations welcome): how to enable a better way of writing multi-core programs. +The starting point is to fix and test atomic blocks. 
Then we will have to +debug common causes of conflicts and fix them or work around them; and +try to see how common frameworks like Twisted can be adapted.</p> + +<p>Lots of work ahead, but lots of work behind too :-)</p> + +<p>Armin (thanks Remi as well for the work).</p>releasestmhttps://www.pypy.org/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.htmlSat, 05 Jul 2014 09:37:00 GMT \ No newline at end of file diff --git a/categories/revdb.html b/categories/revdb.html new file mode 100644 index 000000000..af31a3486 --- /dev/null +++ b/categories/revdb.html @@ -0,0 +1,114 @@ + + + + + +Posts about revdb | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/revdb.xml b/categories/revdb.xml new file mode 100644 index 000000000..cdbeb2111 --- /dev/null +++ b/categories/revdb.xml @@ -0,0 +1,457 @@ + +PyPy (Posts about revdb)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssReverse debugging for Pythonhttps://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.htmlArmin Rigo<div class="section" id="revpdb"> +<h3>RevPDB</h3> +<p>A "reverse debugger" is a debugger where you can go forward and +backward in time. It is an uncommon feature, at least in the open +source world, but I have no idea why. I have used <a class="reference external" href="https://undo.io/">undodb-gdb</a> and +<a class="reference external" href="https://rr-project.org/">rr</a>, which are reverse debuggers for C code, and I can only say that +they saved me many, many days of poking around blindly in gdb.</p> +<p>The PyPy team is pleased to give you "RevPDB", a reverse-debugger +similar to <tt class="docutils literal">rr</tt> but for Python.</p> +<p>An example is worth a thousand words. Let's say your big Python +program has a bug that shows up inconsistently. You have nailed it +down to something like:</p> +<ul class="simple"> +<li>start <tt class="docutils literal">x.py</tt>, which does stuff (maybe involving processing files, +answering some web requests that you simulate from another terminal, +etc.);</li> +<li>sometimes, after a few minutes, your program's state becomes +inconsistent and you get a failing assert or another exception.</li> +</ul> +<p>This is the case where RevPDB is useful.</p> +<p>RevPDB is available only on 64-bit Linux and OS/X right now, but should +not be too hard to port to other OSes. It is very much <em>alpha-level!</em> +(It is a debugger full of bugs. Sorry about that.) 
I believe it is +still useful---it helped me in one <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/bd220c268bc9">real use case</a> already.</p> +</div> +<div class="section" id="how-to-get-revpdb"> +<h3>How to get RevPDB</h3> +<p>The following demo was done with an alpha version for 64-bit Linux, +compiled for Arch Linux. I won't provide the binary; it should be +easy enough to retranslate (much faster than a regular PyPy because it +contains neither a JIT nor a custom GC). Grab the <a class="reference external" href="https://pypy.org/download.html#building-from-source">PyPy sources</a> from +Mercurial, and then:</p> +<pre class="literal-block"> +hg update reverse-debugger +# or "hg update ff376ccacb36" for exactly this demo +cd pypy/goal +../../rpython/bin/rpython -O2 --revdb targetpypystandalone.py \ + --withoutmod-cpyext --withoutmod-micronumpy +</pre> +<p>and possibly rename the final <tt class="docutils literal"><span class="pre">pypy-c</span></tt> to <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> to avoid +confusion.</p> +<p>Other platforms than 64-bit Linux and OS/X need some fixes before they work.</p> +</div> +<div class="section" id="demo"> +<h3>Demo</h3> +<p>For this demo, we're going to use this <tt class="docutils literal">x.py</tt> as the "big program":</p> +<pre class="literal-block"> +import os + +class Foo(object): + value = 5 + +lst1 = [Foo() for i in range(100)] +lst1[50].value += 1 +for x in lst1: + x.value += 1 + +for x in lst1: + if x.value != 6: + print 'oops!' + os._exit(1) +</pre> +<p>Of course, it is clear what occurs in this small example: the check +fails on item 50. For this demo, the check has been written with +<tt class="docutils literal">os._exit(1)</tt>, because this exits immediately the program. 
If it +was written with an <tt class="docutils literal">assert</tt>, then its failure would execute things +in the <tt class="docutils literal">traceback</tt> module afterwards, to print the traceback; it +would be a minor mess just to find the exact point of the failing +<tt class="docutils literal">assert</tt>. (This and other issues are supposed to be fixed in the +future, but for now it is alpha-level.)</p> +<p>Anyway, with a regular <tt class="docutils literal">assert</tt> and a regular post-mortem <tt class="docutils literal">pdb</tt>, +we could observe that <tt class="docutils literal">x.value</tt> is indeed 7 instead of 6 when the +assert fails. Imagine that the program is much bigger: how would we +find the exact chain of events that caused this value 7 to show up on +this particular <tt class="docutils literal">Foo</tt> object? This is what RevPDB is for.</p> +<p><strike>First, we need for now to disable Address Space Layout Randomization +(ASLR), otherwise replaying will not work. This is done once with the +following command line, which changes the state until the next +reboot:</strike></p> +<pre class="literal-block"> +echo 0 | sudo tee /proc/sys/kernel/randomize_va_space +</pre> +<p><strong>UPDATE:</strong> the above is no longer necessary from revision ff376ccacb36.</p> +<p>Run <tt class="docutils literal">x.py</tt> with RevPDB's version of PyPy instead of the regular +interpreter (CPython or PyPy):</p> +<pre class="literal-block"> +PYPYRDB=log.rdb ./pypy-revdb x.py +</pre> +<p>This <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> executable is like a slow PyPy executable, running +(for now) without a JIT. This produces a file <tt class="docutils literal">log.rdb</tt> which +contains a complete log of this execution. (If the bug we are +tracking occurs rarely, we need to re-run it several times until we +get the failure. 
But once we got the failure, then we're done with +this step.)</p> +<p>Start:</p> +<pre class="literal-block"> +rpython/translator/revdb/revdb.py log.rdb +</pre> +<p>We get a pdb-style debugger. This <tt class="docutils literal">revdb.py</tt> is a normal Python +program, which you run with an unmodified Python; internally, it looks +inside the log for the path to <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> and runs it as needed (as +one forking subprocess, in a special mode).</p> +<p>Initially, we are at the start of the program---not at the end, like +we'd get in a regular debugger:</p> +<pre class="literal-block"> +File "&lt;builtin&gt;/app_main.py", line 787 in setup_bootstrap_path: +(1)$ +</pre> +<p>The list of commands is available with <tt class="docutils literal">help</tt>.</p> +<p>Go to the end with <tt class="docutils literal">continue</tt> (or <tt class="docutils literal">c</tt>):</p> +<pre class="literal-block"> +(1)$ continue +File "/tmp/x.py", line 14 in &lt;module&gt;: +... + lst1 = [Foo() for i in range(100)] + lst1[50].value += 1 + for x in lst1: + x.value += 1 + + for x in lst1: + if x.value != 6: + print 'oops!' +&gt; os._exit(1) +(19727)$ +</pre> +<p>We are now at the beginning of the last executed line. The number +19727 is the "time", measured in number of lines executed. We can go +backward with the <tt class="docutils literal">bstep</tt> command (backward step, or <tt class="docutils literal">bs</tt>), line +by line, and forward again with the <tt class="docutils literal">step</tt> command. There are also +commands <tt class="docutils literal">bnext</tt>, <tt class="docutils literal">bcontinue</tt> and <tt class="docutils literal">bfinish</tt> and their forward +equivalents. There is also "<tt class="docutils literal">go TIME</tt>" to jump directly to the specified +time. 
(Right now the debugger only stops at "line start" +events, not at function entry or exit, which makes some cases a bit +surprising: for example, a <tt class="docutils literal">step</tt> from the return statement of +function <tt class="docutils literal">foo()</tt> will jump directly to the caller's caller, if the +caller's current line was <tt class="docutils literal">return foo() + 2</tt>, because no "line +start" event occurs in the caller after <tt class="docutils literal">foo()</tt> returns to it.)</p> +<p>We can print Python expressions and statements using the <tt class="docutils literal">p</tt> +command:</p> +<pre class="literal-block"> +(19727)$ p x +$0 = &lt;__main__.Foo object at 0xfffffffffffeab3e&gt; +(19727)$ p x.value +$1 = 7 +(19727)$ p x.value + 1 +8 +</pre> +<p>The "<tt class="docutils literal">$NUM =</tt>" prefix is only shown when we print an object that +really exists in the debugged program; that's why the last line does +not contain it. Once a <tt class="docutils literal">$NUM</tt> has been printed, then we can use +it in further expressions---even at a different point in time. It +becomes an anchor that always refers to the same object:</p> +<pre class="literal-block"> +(19727)$ bstep + +File "/tmp/x.py", line 13 in &lt;module&gt;: +... + + lst1 = [Foo() for i in range(100)] + lst1[50].value += 1 + for x in lst1: + x.value += 1 + + for x in lst1: + if x.value != 6: +&gt; print 'oops!' + os._exit(1) +(19726)$ p $0.value +$1 = 7 +</pre> +<p>In this case, we want to know when this value 7 was put in this +attribute. This is the job of a watchpoint:</p> +<pre class="literal-block"> +(19726)$ watch $0.value +Watchpoint 1 added +updating watchpoint value: $0.value =&gt; 7 +</pre> +<p>This watchpoint means that <tt class="docutils literal">$0.value</tt> will be evaluated at each line. 
+When the <tt class="docutils literal">repr()</tt> of this expression changes, the watchpoint activates +and execution stops:</p> +<pre class="literal-block"> +(19726)$ bcontinue +[searching 19629..19726] +[searching 19338..19629] + +updating watchpoint value: $0.value =&gt; 6 +Reverse-hit watchpoint 1: $0.value +File "/tmp/x.py", line 9 in &lt;module&gt;: + import os + + class Foo(object): + value = 5 + + lst1 = [Foo() for i in range(100)] + lst1[50].value += 1 + for x in lst1: +&gt; x.value += 1 + + for x in lst1: + if x.value != 6: + print 'oops!' + os._exit(1) +(19524)$ +</pre> +<p>Note that using the <tt class="docutils literal">$NUM</tt> syntax is essential in watchpoints. You +can't say "<tt class="docutils literal">watch x.value</tt>", because the variable <tt class="docutils literal">x</tt> will go out +of scope very soon when we move forward or backward in time. In fact +the watchpoint expression is always evaluated inside an environment +that contains the builtins but not the current locals and globals. +But it also contains all the <tt class="docutils literal">$NUM</tt>, which can be used to refer to +known objects. It is thus common to watch <tt class="docutils literal">$0.attribute</tt> if <tt class="docutils literal">$0</tt> +is an object, or to watch <tt class="docutils literal"><span class="pre">len($1)</span></tt> if <tt class="docutils literal">$1</tt> is some list. 
The +watch expression can also be a simple boolean: for example, "<tt class="docutils literal">watch +$2 in $3</tt>" where <tt class="docutils literal">$3</tt> is some dict and <tt class="docutils literal">$2</tt> is some object that +you find now in the dict; you would use this to find out the time when +<tt class="docutils literal">$2</tt> was put inside <tt class="docutils literal">$3</tt>, or removed from it.</p> +<p>Use "<tt class="docutils literal">info watchpoints</tt>" and "<tt class="docutils literal">delete &lt;watchpointnum&gt;</tt>" to manage +watchpoints.</p> +<p>There are also regular breakpoints, which you set with "<tt class="docutils literal">b +FUNCNAME</tt>". It breaks whenever there is a call to a function that +happens to have the given name. (It might be annoying to use for a +function like <tt class="docutils literal">__init__()</tt> which has many homonyms. There is no +support for breaking on a fully-qualified name or at a given line +number for now.)</p> +<p>In our demo, we stop at the line <tt class="docutils literal">x.value += 1</tt>, which is where the +value was changed from 6 to 7. Use <tt class="docutils literal">bcontinue</tt> again to stop at the +line <tt class="docutils literal"><span class="pre">lst1[50].value</span> += 1</tt>, which is where the value was changed from +5 to 6. Now we know how this <tt class="docutils literal">value</tt> attribute ends up being 7.</p> +<pre class="literal-block"> +(19524)$ bcontinue +[searching 19427..19524] +[searching 19136..19427] + +updating watchpoint value: $0.value =&gt; 5 +Reverse-hit watchpoint 1: $0.value +File "/tmp/x.py", line 7 in &lt;module&gt;: + import os + + class Foo(object): + value = 5 + + lst1 = [Foo() for i in range(100)] +&gt; lst1[50].value += 1 + for x in lst1: + x.value += 1 + + for x in lst1: + if x.value != 6: +... +(19422)$ +</pre> +<p>Try to use <tt class="docutils literal">bcontinue</tt> yet another time. 
It will stop now just before +<tt class="docutils literal">$0</tt> is created. At that point in time, <tt class="docutils literal">$0</tt> refers to +an object that does not exist yet, so the watchpoint now evaluates to +an error message (but it continues to work as before, with that error +message as the string it currently evaluates to).</p> +<pre class="literal-block"> +(19422)$ bcontinue +[searching 19325..19422] + +updating watchpoint value: $0.value =&gt; RuntimeError: + '$0' refers to an object created later in time +Reverse-hit watchpoint 1: $0.value +File "/tmp/x.py", line 6 in &lt;module&gt;: + import os + + class Foo(object): + value = 5 + +&gt; lst1 = [Foo() for i in range(100)] + lst1[50].value += 1 + for x in lst1: + x.value += 1 + + for x in lst1: +... +(19371)$ +</pre> +<p>In big programs, the workflow is similar, just more complex. Usually +it works this way: we find interesting points in time with some +combination of watchpoints and some direct commands to move around. +We write down on a piece of (real or virtual) paper these points in +history, including most importantly their time, so that we can +construct an ordered understanding of what is going on.</p> +<p>The current <tt class="docutils literal">revdb</tt> can be annoying and sometimes even crash; but +the history you reconstruct can be kept. All the times and +expressions printed are still valid when you restart <tt class="docutils literal">revdb</tt>. The +only thing "lost" is the <tt class="docutils literal">$NUM</tt> objects, which you need to print +again. (Maybe instead of <tt class="docutils literal">$0</tt>, <tt class="docutils literal">$1</tt>, ... we should use <tt class="docutils literal">$&lt;big +number&gt;</tt>, where the big number identifies uniquely the object by its +creation time. These numbers would continue to be valid even after +<tt class="docutils literal">revdb</tt> is restarted. 
They are more annoying to use than just +<tt class="docutils literal">$0</tt> though.)</p> +<p><b>Screencast:</b> Here's a (slightly typo-y) screencast of cfbolz using the reverse debugger: +</p> +</div> +<div class="section" id="current-issues"> +<h3>Current issues</h3> +<p>General issues:</p> +<ul class="simple"> +<li>If you are using <tt class="docutils literal">revdb</tt> on a log that took more than a few +minutes to record, then it can be painfully slow. This is because +<tt class="docutils literal">revdb</tt> needs to replay again big parts of the log for some +operations.</li> +<li>The <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> is currently missing the following modules:<ul> +<li><tt class="docutils literal">thread</tt> (implementing multithreading is possible, but not done +yet);</li> +<li><tt class="docutils literal">cpyext</tt> (the CPython C API compatibility layer);</li> +<li><tt class="docutils literal">micronumpy</tt> (minor issue only);</li> +<li><tt class="docutils literal">_continuation</tt> (for greenlets).</li> +</ul> +</li> +<li>Does not contain a JIT, and does not use our fast garbage +collectors. You can expect <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> to be maybe 3 times +slower than CPython.</li> +<li>Only works on Linux and OS/X. There is no fundamental reason for +this restriction, but it is some work to fix.</li> +<li>Replaying a program uses a <em>lot</em> more memory; maybe 15x as much as +during the recording. This is because it creates many forks. 
If +you have a program that consumes 10% of your RAM or more, you will +need to reduce <tt class="docutils literal">MAX_SUBPROCESSES</tt> in <tt class="docutils literal">process.py</tt>.</li> +</ul> +<p>Replaying also comes with a bunch of user interface issues:</p> +<ul class="simple"> +<li><tt class="docutils literal">Attempted to do I/O or access raw memory</tt>: we get this whenever +trying to <tt class="docutils literal">print</tt> some expression that cannot be evaluated with +only the GC memory---or which can, but then the <tt class="docutils literal">__repr__()</tt> +method of the result cannot. We need to reset the state with +<tt class="docutils literal">bstep</tt> + <tt class="docutils literal">step</tt> before we can print anything else. However, +if only the <tt class="docutils literal">__repr__()</tt> crashes, you still see the <tt class="docutils literal">$NUM =</tt> +prefix, and you can use that <tt class="docutils literal">$NUM</tt> afterwards.</li> +<li><tt class="docutils literal">id()</tt> is globally unique, returning a reproducible 64-bit number, +so sometimes using <tt class="docutils literal">id(x)</tt> is a workaround for when using <tt class="docutils literal">x</tt> +doesn't work because of <tt class="docutils literal">Attempted to do I/O</tt> issues (e.g. <tt class="docutils literal">p +[id(x) for x in somelist]</tt>).</li> +<li>as explained in the demo, next/bnext/finish/bfinish might jump +around a bit non-predictably.</li> +<li>similarly, breaks on watchpoints can stop at apparently unexpected +places (when going backward, try to do "step" once). The issue is +that it can only stop at the beginning of every line. In the +extreme example, if a line is <tt class="docutils literal"><span class="pre">foo(somelist.pop(getindex()))</span></tt>, +then <tt class="docutils literal">somelist</tt> is modified in the middle. 
Immediately before +this modification occurs, we are in <tt class="docutils literal">getindex()</tt>, and +immediately afterwards we are in <tt class="docutils literal">foo()</tt>. The watchpoint will +stop the program at the end of <tt class="docutils literal">getindex()</tt> if running backward, +and at the start of <tt class="docutils literal">foo()</tt> if running forward, but never +actually on the line doing the change.</li> +<li>watchpoint expressions <em>must not</em> have any side-effect at all. If +they do, the replaying will get out of sync and <tt class="docutils literal">revdb.py</tt> will +complain about that. Regular <tt class="docutils literal">p</tt> expressions and statements can +have side-effects; these effects are discarded as soon as you move +in time again.</li> +<li>sometimes even "<tt class="docutils literal">p import foo</tt>" will fail with <tt class="docutils literal">Attempted to do +I/O</tt>. Use instead "<tt class="docutils literal">p import sys; foo = <span class="pre">sys.modules['foo']</span></tt>".</li> +<li>use <tt class="docutils literal">help</tt> to see all commands. <tt class="docutils literal">backtrace</tt> can be useful. +There is no <tt class="docutils literal">up</tt> command; you have to move in time instead, +e.g. using <tt class="docutils literal">bfinish</tt> to go back to the point where the current +function was called.</li> +</ul> +</div> +<div class="section" id="how-revpdb-is-done"> +<h3>How RevPDB is done</h3> +<p>If I had to pick the main advantage of PyPy over CPython, it is that +we have got with the RPython translation toolchain a real place for +experimentation. Every now and then, we build inside RPython some +feature that gives us an optionally tweaked version of the PyPy +interpreter---tweaked in a way that would be hard to do with CPython, +because it would require systematic changes everywhere. The most +obvious and successful examples are the GC and the JIT. 
But there +have been many other experiments along the same lines, from the +so-called <a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/tip/eu-report/D07.1_Massive_Parallelism_and_Translation_Aspects-2007-02-28.pdf">stackless transformation</a> in the early days, to the STM +version of PyPy.</p> +<p>RevPDB works in a similar way. It is a version of PyPy in which some +operations are systematically replaced with other operations.</p> +<p>To keep the log file at a reasonable size, we duplicate the content of +all GC objects during replaying---by repeating the same actions on +them, without writing anything in the log file. So that means that in +the <tt class="docutils literal"><span class="pre">pypy-revdb</span></tt> binary, the operations that do arithmetic or +read/write GC-managed memory are not modified. Most operations are +like that. However, the other operations, the ones that involve +either non-GC memory or calls to external C functions, are tweaked. +Each of these operations is replaced with code that works in two +modes, based on a global flag:</p> +<ul class="simple"> +<li>in "recording" mode, we log the result of the operation (but not the +arguments);</li> +<li>in "replaying" mode, we don't really do the operation at all, but +instead just fetch the result from the log.</li> +</ul> +<p>Hopefully, all remaining unmodified operations (arithmetic and GC +load/store) are completely deterministic. So during replaying, every +integer or non-GC pointer variable will have exactly the same value as +it had during recording. Interestingly, it means that if the +recording process had a big array in non-GC memory, then in the +replaying process, the array is not allocated at all; it is just +represented by the same address, but there is nothing there. When we +record "read item 123 from the array", we record the result of the +read (but not the "123"). 
When we replay, we're seeing again the same +"read item 123 from the array" operation. At that point, we don't +read anything; we just return the result from the log. Similarly, +when recording a "write" to the array, we record nothing (this write +operation has no result); so that when replaying, we redo nothing.</p> +<p>Note how that differs from anything managed by GC memory: GC objects +(including GC arrays) are really allocated, writes really occur, and +reads are redone. We don't touch the log in this case.</p> +</div> +<div class="section" id="other-reverse-debuggers-for-python"> +<h3>Other reverse debuggers for Python</h3> +<p>There are already some Python experiments about <a class="reference external" href="https://en.wikipedia.org/wiki/Debugger#Reverse_debugging">reverse debugging</a>. +This is also known as "omniscient debugging". However, I claim that +the result they get to is not very useful (for the purpose presented +here). How they work is typically by recording changes to some +objects, like lists and dictionaries, in addition to recording the +history of where your program passed through. However, the problem of +Python is that lists and dictionaries are not the end of the story. +There are many, many, many types of objects written in C which are +mutable---in fact, the immutable ones are the exception. You can try +to systematically record all changes, but it is a huge task and easy +to forget a detail.</p> +<p>In other words it is a typical use case for tweaking the RPython +translation toolchain, rather than tweaking the CPython (or PyPy) +interpreter directly. 
The result that we get here with RevPDB is more +similar to <a class="reference external" href="https://rr-project.org/">rr</a> anyway, in that only a relatively small number of +external events are recorded---not every single change to every single +list and dictionary.</p> +<p>Some links:</p> +<ul class="simple"> +<li>epdb: <a class="reference external" href="https://github.com/native-human/epdb">https://github.com/native-human/epdb</a></li> +<li>pode: <a class="reference external" href="https://github.com/rodsenra/pode">https://github.com/rodsenra/pode</a></li> +</ul> +<p>For C:</p> +<ul class="simple"> +<li>rr: <a class="reference external" href="https://rr-project.org/">https://rr-project.org/</a></li> +<li>undodb-gdb: <a class="reference external" href="https://undo.io/">https://undo.io/</a></li> +</ul> +</div> +<div class="section" id="future-work"> +<h3>Future work</h3> +<p>As mentioned above, it is alpha-level, and only works on Linux and OS/X. +So the plans for the immediate future are to fix the various +issues described above, and port to more operating systems. The core of the system +is in the C file and headers in <tt class="docutils literal"><span class="pre">rpython/translator/revdb/src-revdb</span></tt>.</p> +<p>For interested people, there is also the <a class="reference external" href="https://bitbucket.org/pypy/duhton/">Duhton</a> interpreter and its +<tt class="docutils literal"><span class="pre">reverse-debugger</span></tt> branch, which is where I prototyped the RPython +concept before moving to PyPy. The basics should work for any +interpreter written in RPython, but they require some specific code to +interface with the language; in the case of PyPy, it is in +<tt class="docutils literal">pypy/interpreter/reverse_debugging.py</tt>.</p> +<p>In parallel, there are various user interface improvements that people +could be interested in, like a more "pdb++" experience. 
(And the script +at <tt class="docutils literal">rpython/translator/revdb/revdb.py</tt> should be moved out into some +more "official" place, and the <tt class="docutils literal"><span class="pre">reverse-debugger</span></tt> branch should be +merged back to default.)</p> +<p>I would certainly welcome any help!</p> +<p>-+- Armin</p> +</div>revdbhttps://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.htmlFri, 08 Jul 2016 11:39:00 GMT \ No newline at end of file diff --git a/categories/roadmap.html b/categories/roadmap.html new file mode 100644 index 000000000..7894b3cd6 --- /dev/null +++ b/categories/roadmap.html @@ -0,0 +1,117 @@ + + + + + +Posts about roadmap | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/roadmap.xml b/categories/roadmap.xml new file mode 100644 index 000000000..eec6d4036 --- /dev/null +++ b/categories/roadmap.xml @@ -0,0 +1,816 @@ + +PyPy (Posts about roadmap)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssThe First 15 Years of PyPy — a Personal Retrospectivehttps://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.htmlCarl Friedrich Bolz-Tereick<p>A few weeks ago I (=Carl Friedrich Bolz-Tereick) gave a <a class="reference external" href="https://conf.researchr.org/event/ecoop-issta-2018/icooolps-2018-papers-tbd-15-years-of-pypy-a-retrospective">keynote</a> at ICOOOLPS in +Amsterdam with the above title. I was very happy to have been given that +opportunity, since a number of our papers have been published at ICOOOLPS, +including the very first one I published when I'd just started my PhD. I decided +to turn the talk manuscript into a (longish) blog post, to make it available to a wider audience. +Note that this blog post describes my personal recollections and research, it is +thus necessarily incomplete and coloured by my own experiences.</p> +<p>PyPy has turned 15 years old this year, so I decided that that's a good reason +to dig into and talk about the history of the project so far. I'm going to do +that using the lens of how performance developed over time, which is from +something like 2000x slower than CPython, to roughly 7x faster. In this post +I am going to present the history of the project, and also talk about some +lessons that we learned.</p> +<p>The post does not make too many assumptions about any prior knowledge of what +PyPy is, so if this is your first interaction with it, welcome! 
I have tried to +sprinkle links to earlier blog posts and papers into the writing, in case you +want to dive deeper into some of the topics.</p> +<p>As a disclaimer, in this post I am going to mostly focus on ideas, and not +explain who had or implemented them. A huge amount of people contributed to the +design, the implementation, the funding and the organization of PyPy over the +years, and it would be impossible to do them all justice.</p> +<div class="contents topic" id="contents"> +<b>Contents</b> +<ul class="simple"> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#starting-the-project" id="id17">2003: Starting the Project</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#implementing-the-interpreter" id="id18">2003: Implementing the Interpreter</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#early-organizational-ideas" id="id19">Early organizational ideas</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#eu-funding" id="id20">2004-2007: EU-Funding</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#bootstrapping-pypy" id="id21">2005: Bootstrapping PyPy</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#rpython-s-modularity-problems" id="id22">RPython's Modularity Problems</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#the-meta-jit" id="id23">2006: The Meta-JIT</a></p></li> +<li><p><a class="reference internal" 
href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#the-first-jit-generator" id="id24">The First JIT Generator</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#promote" id="id25">Promote</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#virtuals" id="id26">Virtuals</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#jit-status-2007" id="id27">JIT Status 2007</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#rsqueak-and-other-languages" id="id28">2007: RSqueak and other languages</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#four-more-jit-generators" id="id29">2008-2009: Four More JIT Generators</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#meta-tracing" id="id30">2009: Meta-Tracing</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#why-did-we-abandon-partial-evaluation" id="id31">Why did we Abandon Partial Evaluation?</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#the-pyjit-eurostars-project" id="id32">2009-2011: The PyJIT Eurostars Project</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#tracing-jit-improvements" id="id33">Tracing JIT improvements</a></p></li> +<li><p><a 
class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#speed-pypy-org" id="id34">2010: speed.pypy.org</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#continuous-integration" id="id35">Continuous Integration</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#implementing-python-objects-with-maps" id="id36">2010: Implementing Python Objects with Maps</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#container-storage-strategies" id="id37">2011: Container Storage Strategies</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#deep-changes-in-the-runtime-are-necessary" id="id38">Deep Changes in the Runtime are Necessary</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#jit-status-2011" id="id39">JIT Status 2011</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#engineering-and-incremental-progress" id="id40">2012-2017: Engineering and Incremental Progress</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#cpyext" id="id41">CPyExt</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#python-3" id="id42">Python 3</a></p></li> +<li><p><a class="reference internal" 
href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#incentives-of-oss-compared-to-academia" id="id43">Incentives of OSS compared to Academia</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#meta-tracing-really-works" id="id44">Meta-Tracing really works!</a></p></li> +<li><p><a class="reference internal" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#acknowledgements" id="id45">Acknowledgements</a></p></li> +</ul> +</div> +<div class="section" id="starting-the-project"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id17">2003: Starting the Project</a></h1> +<p>On the technical level PyPy is a Python interpreter written in Python, which is +where the name comes from. It also has an automatically generated JIT compiler, +but I'm going to introduce that gradually over the rest of the blog post, so +let's not worry about it too much yet. On the social level PyPy is an +interesting mixture of an open source project, that sometimes had research done +in it.</p> +<p>The project got started in late 2002 and early 2003. To set the stage, at that +point Python was a significantly less popular language than it is today. <a class="reference external" href="https://www.python.org/download/releases/2.2/">Python +2.2</a> was the version at the time, Python didn't even have a <span class="docutils literal">bool</span> type yet.</p> +<p>In fall 2002 the PyPy project was started by a number of Python programmers on a +mailing list who said +something like (I am exaggerating somewhat) "Python is the greatest most +wonderful most perfect language ever, we should use it for absolutely +everything. Well, what aren't we using it for? The Python virtual machine itself +is written in C, that's bad. 
Let's start a project to fix that."</p> +<p>Originally that project was called "minimal python", or "ptn", later gradually +renamed to PyPy. Here's the <a class="reference external" href="https://mail.python.org/pipermail/python-list/2003-January/235289.html">mailing list post</a> to announce the project more +formally:</p> +<pre class="literal-block">Minimal Python Discussion, Coding and Sprint +-------------------------------------------- + +We announce a mailinglist dedicated to developing +a "Minimal Python" version. Minimal means that +we want to have a very small C-core and as much +as possible (re)implemented in python itself. This +includes (parts of) the VM-Code.</pre> +<p>Why would that kind of project be useful? Originally it wasn't necessarily meant +to be useful as a real implementation at all, it was more meant as a kind of +executable explanation of how Python works, free of the low level details of +CPython. But pretty soon there were then also plans for how the virtual machine +(VM) could be bootstrapped to be runnable without an existing Python +implementation, but I'll get to that further down.</p> +</div> + + +<div class="section" id="implementing-the-interpreter"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id18">2003: Implementing the Interpreter</a></h1> +<p>In early 2003 a group of Python people met in Hildesheim (Germany) for the first +of many week long development sprints, organized by Holger Krekel. During that +week a group of people showed up and started working on the core interpreter. +In May 2003 a second sprint was organized by Laura Creighton and Jacob Halén in +Gothenburg (Sweden). And already at that sprint enough of the Python bytecodes +and data structures were implemented to make it possible to run a program that +computed how much money everybody had to pay for the food bills of the week. 
And +everybody who's tried that for a large group of people knows that that’s an +amazingly complex mathematical problem.</p> +<p>In the next two years, the project continued as a open source project with +various contributors working on it in their free time, and meeting for the +occasional sprint. In that time, the rest of the core interpreter and the core +data types were implemented.</p> +<p>There's not going to be any other code in this post, but to give a bit of a +flavor of what the Python interpreter at that time looked like, here's the +implementation of the <span class="docutils literal">DUP_TOP</span> bytecode after these first sprints. As you can +see, it's in Python, obviously, and it has high level constructs such as method +calls to do the stack manipulations:</p> +<pre class="code python literal-block"><code><span class="keyword">def</span> <span class="name function">DUP_TOP</span><span class="punctuation">(</span><span class="name">f</span><span class="punctuation">):</span> + <span class="name">w_1</span> <span class="operator">=</span> <span class="name">f</span><span class="operator">.</span><span class="name">valuestack</span><span class="operator">.</span><span class="name">top</span><span class="punctuation">()</span> + <span class="name">f</span><span class="operator">.</span><span class="name">valuestack</span><span class="operator">.</span><span class="name">push</span><span class="punctuation">(</span><span class="name">w_1</span><span class="punctuation">)</span></code></pre> +<p>Here's the early code for integer addition:</p> +<pre class="code python literal-block"><code><span class="keyword">def</span> <span class="name function">int_int_add</span><span class="punctuation">(</span><span class="name">space</span><span class="punctuation">,</span> <span class="name">w_int1</span><span class="punctuation">,</span> <span class="name">w_int2</span><span class="punctuation">):</span> + <span class="name">x</span> <span 
class="operator">=</span> <span class="name">w_int1</span><span class="operator">.</span><span class="name">intval</span> + <span class="name">y</span> <span class="operator">=</span> <span class="name">w_int2</span><span class="operator">.</span><span class="name">intval</span> + <span class="keyword">try</span><span class="punctuation">:</span> + <span class="name">z</span> <span class="operator">=</span> <span class="name">x</span> <span class="operator">+</span> <span class="name">y</span> + <span class="keyword">except</span> <span class="name exception">OverflowError</span><span class="punctuation">:</span> + <span class="keyword">raise</span> <span class="name">FailedToImplement</span><span class="punctuation">(</span><span class="name">space</span><span class="operator">.</span><span class="name">w_OverflowError</span><span class="punctuation">,</span> + <span class="name">space</span><span class="operator">.</span><span class="name">wrap</span><span class="punctuation">(</span><span class="literal string double">"integer addition"</span><span class="punctuation">))</span> + <span class="keyword">return</span> <span class="name">W_IntObject</span><span class="punctuation">(</span><span class="name">space</span><span class="punctuation">,</span> <span class="name">z</span><span class="punctuation">)</span></code></pre> +<p>(the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/blob/branch/default/pypy/interpreter/pyopcode.py#L582">current</a> <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/blob/branch/default/pypy/objspace/std/intobject.py#L551">implementations</a> look slightly but not fundamentally different.)</p> +</div> + + +<div class="section" id="early-organizational-ideas"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id19">Early organizational ideas</a></h1> +<p>Some of the early organizational ideas of the project were as 
follows. Since the +project was started on a sprint and people really liked that style of working +PyPy continued to be developed on various subsequent <a class="reference external" href="https://www.youtube.com/watch?v=ed-zAxZtGlY">sprints</a>.</p> +<p>From early on there was a very heavy emphasis on testing. All the parts of the +interpreter that were implemented had a very careful set of unit tests to make +sure that they worked correctly. From early on, there was a continuous +integration infrastructure, which grew over time (nowadays it is very natural +for people to have automated tests, and the concept of green/red builds: but +embracing this workflow in the early 2000s was not really mainstream yet, and +it is probably one of the reasons behind PyPy's success).</p> +<p>At the sprints there was also an emphasis on doing pair programming to make +sure that everybody understood the codebase +equally. There was also a heavy emphasis on writing good code and on regularly +doing refactorings to make sure that the codebase remained nice, clean and +understandable. Those ideas followed from the early thoughts that PyPy would be +a sort of readable explanation of the language.</p> +<p>There was also a pretty fundamental design decision made at the time. That was +that the project should stay out of language design completely. Instead it would +follow CPython's lead and behave exactly like that implementation in all cases. 
+The project therefore committed to being almost quirk-to-quirk compatible and to +implement even the more obscure (and partially unnecessary) corner cases of +CPython.</p> +<p>All of these principles continue pretty much still today (There are a few places +where we had to deviate from being completely compatible, they are documented +<a class="reference external" href="https://doc.pypy.org/en/latest/cpython_differences.html">here</a>).</p> +</div> + + +<div class="section" id="eu-funding"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id20">2004-2007: EU-Funding</a></h1> +<p>While all this coding was going on it became clear pretty soon that the goals +that various participants had for the project would be very hard to achieve with +just open source volunteers working on the project in their spare time. +Particularly also the sprints became expensive given that those were just +volunteers doing this as a kind of weird hobby. Therefore a couple of people of +the project got together to apply for an EU grant in the <a class="reference external" href="https://en.wikipedia.org/wiki/Framework_Programmes_for_Research_and_Technological_Development#FP6_and_FP7">framework programme 6</a> +to solve these money problems. In mid-2004 that application proved to be +successful. And so the project got a grant of 1.3 million Euro for +two years to be able to employ some of the core developers and to make it +possible for them to work on the project full time. The EU grant went to seven +small-to-medium companies and <a class="reference external" href="https://hhu.de">Uni Düsseldorf</a>. The budget also contained money to +fund sprints, both for the employed core devs as well as other open source +contributors.</p> + +<p>The EU project started in December 2004 and that was a fairly heavy change in +pace for the project. 
Suddenly a lot of people were working full time on it, and +the pace and the pressure picked up quite a lot. Originally it had been a +leisurely project people worked on for fun. But afterwards people discovered +that doing this kind of work full time becomes slightly less fun, particularly +also if you have to fulfill the ambitious technical goals that the EU proposal +contained. And the proposal indeed contained a bit of everything to increase its +chance of acceptance, such as <a class="reference external" href="https://en.wikipedia.org/wiki/Aspect-oriented_programming">aspect oriented programming</a>, semantic web, logic +programming, constraint programming, and so on. Unfortunately it +turned out that those things then have to be implemented, which can be called +the first thing we learned: if you promise something to the EU, you'll have to +actually go do it (After the funding ended, a lot of these features were +actually removed from the project again, at a <a class="reference external" href="https://www.pypy.org/posts/2007/11/sprint-pictures.html">cleanup sprint</a>).</p> +</div> + + +<div class="section" id="bootstrapping-pypy"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id21">2005: Bootstrapping PyPy</a></h1> +<p>So what were the actually useful things done as part of the EU project?</p> +<p>One of the most important goals that the EU project was meant to solve was the +question of how to turn PyPy into an actually useful VM for Python. The +bootstrapping plans were taken quite directly from <a class="reference external" href="https://wiki.squeak.org/squeak">Squeak</a>, which is a Smalltalk +VM written in a subset of Smalltalk called Slang, which can then be bootstrapped +to C code. 
The plan for PyPy was to do something similar, to define a restricted +subset of Python called RPython, restricted in such a way that it should be +possible to statically compile RPython programs to C code. Then the Python +interpreter should only use that subset, of course.</p> +<p>The main difference from the Squeak approach is that Slang, the subset of Squeak +used there, is actually quite a low level language. In a way, you could almost +describe it as C with Smalltalk syntax. RPython was really meant to be a +much higher level language, much closer to Python, with full support for single +inheritance classes, and most of Python's built-in data structures.</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-LpUpuvIQNAM/W5UX365L1HI/AAAAAAAAlE0/JB3Co6ICsLwxQDHkqFDyXsxvsCeCAK4BACLcBGAs/s1600/translation.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="640" src="https://2.bp.blogspot.com/-LpUpuvIQNAM/W5UX365L1HI/AAAAAAAAlE0/JB3Co6ICsLwxQDHkqFDyXsxvsCeCAK4BACLcBGAs/s640/translation.png" width="628"></a></div> +<p>(BTW, you don’t have to understand any of the illustrations in this blog post, +they are taken from talks and project reports we did over the years so they are +of archaeological interest only and I don’t understand most of them myself.)</p> +<p>From 2005 on, work on the RPython type inference engine and C backend started in +earnest, which was sort of co-developed with the RPython language definition and +the PyPy Python interpreter. This is also roughly the time that I joined the +project as a volunteer.</p> +<p>And at the second sprint I went to, in July 2005, two and a half years after the +project got started, we managed to <a class="reference external" href="https://mail.python.org/pipermail/pypy-dev/2005-July/002239.html">bootstrap</a> the PyPy interpreter to C for the +first time. When we ran the compiled program, it of course immediately +segfaulted. 
The reason for that was that the C backend had turned characters +into signed chars in C, while the rest of the infrastructure assumed that they +were unsigned chars. After we fixed that, the second attempt worked and we +managed to run an incredibly complex program, something like <span class="docutils literal">6 * 7</span>. That +first bootstrapped version was really really slow, a couple of hundred times +slower than CPython.</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-w3Rzz7ngdz0/W5UX_THbfYI/AAAAAAAAlFA/kK33VIR3G-AlNq9CRuOdXNWbjTII6vGKwCPcBGAYYCw/s1600/champagne.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="300" src="https://2.bp.blogspot.com/-w3Rzz7ngdz0/W5UX_THbfYI/AAAAAAAAlFA/kK33VIR3G-AlNq9CRuOdXNWbjTII6vGKwCPcBGAYYCw/s400/champagne.png" width="400"></a></div> +<p>The bootstrapping process of RPython has a number of nice benefits, a big one +being that a number of the properties of the generated virtual machine don't +have to be expressed in the interpreter. The biggest example of this is garbage +collection. RPython is a garbage collected language, and the interpreter does +not have to care much about GC in most cases. When the C source code is +generated, a GC is automatically inserted. This is a source of great +flexibility. Over time we experimented with a number of different GC +approaches, from reference counting to <a class="reference external" href="https://www.hboehm.info/gc/">Boehm</a> to our current incremental +generational collector. As an aside, for a long time we were also working on +other backends to the RPython language and hoped to be able to target Java and +.NET as well. 
Eventually we abandoned this strand of work, however.</p> +</div> + + +<div class="section" id="rpython-s-modularity-problems"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id22">RPython's Modularity Problems</a></h1> +<p>Now we come to the first thing I would say we learned in the project, which is +that the quality of tools we thought of as internal things still matters a lot. +One of the biggest technical mistakes we've made in the project was that we +designed RPython without any kind of story for modularity. There is no concept +of modules in the language or any other way to break up programs into smaller +components. We always thought that it would be ok for RPython to be a little bit +crappy. It was meant to be this sort of internal language with not too many +external users. And of course that turned out to be completely wrong later.</p> +<p>That lack of modularity led to various problems that persist until today. The +biggest one is that there is no separate compilation for RPython programs at +all! You always need to compile all the parts of your VM together, which leads +to infamously bad compilation times.</p> +<p>Also by not considering the modularity question we were never forced to fix +some internal structuring issues of the RPython compiler itself. +Various layers of the compiler keep very badly defined and porous interfaces between +them. This was made possible by being able to work with all the program information in one heap, +making the compiler less approachable and maintainable than it maybe could be.</p> +<p>Of course this mistake just got more and more costly to fix over time, +and so it means that so far nobody has actually done it. 
+Not thinking more carefully about RPython's design, particularly its +modularity story, is in my opinion the biggest technical mistake the project +made.</p> +</div> + + +<div class="section" id="the-meta-jit"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id23">2006: The Meta-JIT</a></h1> +<p>After successfully bootstrapping the VM we did some fairly straightforward +optimizations on the interpreter and the C backend and managed to reduce the +slowdown versus CPython to something like 2-5 times slower. That's great! But of +course not actually useful in practice. So where do we go from here?</p> +<p>One of the not so secret goals of Armin Rigo, one of the PyPy founders, was to +use PyPy together with some advanced <a class="reference external" href="https://en.wikipedia.org/wiki/Partial_evaluation">partial evaluation</a> magic sauce to +somehow automatically generate a JIT compiler from the interpreter. The goal was +something like, "you write your interpreter in RPython, add a few annotations +and then we give you a JIT for free for the language that that interpreter +implements."</p> +<p>Where did the wish for that approach come from, why not just write a JIT for +Python manually in the first place? Armin had actually done just that before he +co-founded PyPy, in a project called <a class="reference external" href="https://psyco.sourceforge.net/">Psyco</a>. Psyco was an extension module for +CPython that contained a method-based JIT compiler for Python code. And Psyco +proved to be an amazingly frustrating compiler to write. There were two main +reasons for that. The first reason was that Python is actually quite a complex +language underneath its apparent simplicity. The second reason for the +frustration was that Python was and is very much an alive language, that gains +new features in the language core in every version. 
So every time a new Python +version came out, Armin had to do fundamental changes and rewrites to Psyco, and +he was getting pretty frustrated with it. So he hoped that that effort could be +diminished by not writing the JIT for PyPy by hand at all. Instead, the goal was +to generate a method-based JIT from the interpreter automatically. By taking the +interpreter, and applying a kind of advanced transformation to it, that would +turn it into a method-based JIT. And all that would still be translated into a +C-based VM, of course.</p> +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-gCI1qhaNKIE/W5UWuEJHcsI/AAAAAAAAlEo/ctU2bNj03iEzcHkqDcJH5LuKznuppNegwCLcBGAs/s1600/page21.jpg" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://2.bp.blogspot.com/-gCI1qhaNKIE/W5UWuEJHcsI/AAAAAAAAlEo/ctU2bNj03iEzcHkqDcJH5LuKznuppNegwCLcBGAs/s640/page21.jpg" width="600"></a></div> +<p>Slide from Psyco presentation at EuroPython 2002</p> +</div> + + +<div class="section" id="the-first-jit-generator"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id24">The First JIT Generator</a></h1> +<p>From early 2006 on until the end of the EU project a lot of work went into +writing such a JIT generator. The idea was to base it on runtime partial +evaluation. Partial evaluation is an old idea in computer science. It's supposed +to be a way to automatically turn interpreters for a language into a compiler +for that same language. 
Since PyPy was trying to generate a JIT compiler, which +is in any case necessary to get good performance for a dynamic language like +Python, the partial evaluation was going to happen at runtime.</p> +<p>There are various ways to look at partial evaluation, but if you've never heard +of it before, a simple way to view it is that it will compile a Python function +by gluing together the implementations of the bytecodes of that function and +optimizing the result.</p> +<p>The main new ideas of PyPy's partial-evaluation based JIT generator as opposed +to earlier partial-evaluation approaches are the ideas of "promote" and the idea +of "virtuals". Both of these techniques had already been present (in a slightly +less general form) in Psyco, and the goal was to keep using them in PyPy. Both +of these techniques also still remain in use today in PyPy. I'm +going on a slight technical diversion now, to give a high level explanation of +what those ideas are for.</p> +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-e3T0At96nbI/W5UYIgiaTZI/AAAAAAAAlE8/Fn-f4C4FpH03CAPr17RiUPKNoQKyf2UugCLcBGAs/s1600/redgreen.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="640" src="https://2.bp.blogspot.com/-e3T0At96nbI/W5UYIgiaTZI/AAAAAAAAlE8/Fn-f4C4FpH03CAPr17RiUPKNoQKyf2UugCLcBGAs/s640/redgreen.png" width="567"></a></div> +</div> + + +<div class="section" id="promote"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id25">Promote</a></h1> +<p>One important ingredient of any JIT compiler is the ability to do runtime +feedback. Runtime feedback is most commonly used to know something about which +concrete types are used by a program in practice. Promote is basically a way to +easily introduce runtime feedback into the JIT produced by the JIT generator. 
+It's an <a class="reference external" href="https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_15.html">annotation</a> the implementer of a language can use to express their wish +that specialization should happen at <em>this</em> point. This mechanism can be used to +express <a class="reference external" href="https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_21.html">all kinds of</a> runtime feedback, moving values from the interpreter +into the compiler, whether they be types or other things.</p> +</div> + + +<div class="section" id="virtuals"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id26">Virtuals</a></h1> +<p>Virtuals are a very aggressive form of <a class="reference external" href="https://www.ssw.uni-linz.ac.at/Research/Papers/Stadler14/Stadler2014-CGO-PEA.pdf">partial escape analysis</a>. A dynamic +language often puts a lot of pressure on the garbage collector, since most +primitive types (like integers, floats and strings) are boxed in the heap, and +new boxes are allocated all the time.</p> +<p>With the help of virtuals a very significant portion of all allocations in the +generated machine code can be completely removed. 
Even if they can't be removed, +often the allocation can be delayed or moved into an error path, or even +into a <a class="reference external" href="https://bibliography.selflanguage.org/_static/dynamic-deoptimization.pdf">deoptimization</a> path, and thus disappear from the generated machine code +completely.</p> +<p>This optimization really is the super-power of PyPy's optimizer, since it +doesn't work only for primitive boxes but for any kind of object allocated on +the heap with a predictable lifetime.</p> +<p>As an aside, while this kind of partial escape analysis is sort of new for +object-oriented languages, it has actually existed in Prolog-based partial +evaluation systems since the 80s, because it's just extremely natural there.</p> +</div> + + +<div class="section" id="jit-status-2007"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id27">JIT Status 2007</a></h1> +<p>So, back to our history. We're now in 2007, at the end of the EU project (you +can find the EU-reports we wrote during the projects <a class="reference external" href="https://doc.pypy.org/en/latest/index-report.html">here</a>). The EU project +successfully finished, we survived the final review with the EU. So, what's the +2007 status of the JIT generator? It works kind of, it can be applied to PyPy. It +produces a VM with a JIT that will turn Python code into machine code at runtime +and run it. However, that machine code is not particularly fast. Also, it tends +to generate many megabytes of machine code even for small Python programs. While +it's always faster than PyPy without JIT, it's only sometimes faster than +CPython, and most of the time Psyco still beats it. On the one hand, this is +still an amazing achievement! It's arguably the biggest application of partial +evaluation at this point in time! 
On the other hand, it was still quite +disappointing in practice, particularly since some of us had believed at the +time that it should have been possible to reach and then surpass the speed of +Psyco with this approach.</p> +</div> + + +<div class="section" id="rsqueak-and-other-languages"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id28">2007: RSqueak and other languages</a></h1> +<p>After the EU project ended we did all kinds of things. Like sleep for a month +for example, and have the cleanup sprint that I already mentioned. We also had a +slightly unusual sprint in Bern, with members of the <a class="reference external" href="https://scg.unibe.ch/">Software Composition +Group</a> of Oscar Nierstrasz. As I wrote above, PyPy had been heavily influenced +by Squeak Smalltalk, and that group is a heavy user of Squeak, so we wanted to +see how to collaborate with them. At the beginning of the sprint, we decided +together that the goal of that week should be to try to write a Squeak virtual +machine in RPython, and at the end of the week we'd gotten surprisingly far with +that goal. 
Basically most of the bytecodes and the Smalltalk object system +worked, we had written an image loader and could run some benchmarks (during the +sprint we also regularly updated a <a class="reference external" href="https://pypysqueak.blogspot.com/">blog</a>, the success of which led us to <a class="reference external" href="https://www.pypy.org/posts/2007/10/first-post.html">start</a> +the PyPy blog).</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://4.bp.blogspot.com/-n0Xj6fdNu-g/W5UZE-Z0O8I/AAAAAAAAlFM/A61pBvOV-zkIrYZKDTagNbFrm6HxyFbuwCLcBGAs/s1600/bern.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://4.bp.blogspot.com/-n0Xj6fdNu-g/W5UZE-Z0O8I/AAAAAAAAlFM/A61pBvOV-zkIrYZKDTagNbFrm6HxyFbuwCLcBGAs/s640/bern.png" width="600"></a></div> +<p>The development of the Squeak interpreter was very interesting for the project, +because it was the first real step that moved RPython from being an +implementation detail of PyPy to be a more interesting project in its own right. +Basically a language to write interpreters in, with the eventual promise to get +a JIT for that language almost for free. That Squeak implementation is now +called <a class="reference external" href="https://github.com/hpi-swa/RSqueak">RSqueak</a> ("Research Squeak").</p> +<p>I'll not go into more details about any of the other language implementations in +RPython in this post, but over the years we've had a large variety +of them done by various people and groups, most of them as research vehicles, +but also some as real language implementations. 
Some very cool research results +came out of these efforts, here's a slightly outdated <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">list of some of them</a>.</p> +<p>The use of RPython for other languages complicated the PyPy narrative a lot, and +in a way we never managed to recover the simplicity of the original project +description "PyPy is Python in Python". Because now it's something like "we have +this somewhat strange language, a subset of Python, that's called RPython, and +it's good to write interpreters in. And if you do that, we'll give you a JIT for +almost free. And also, we used that language to write a Python implementation, +called PyPy.". It just doesn't roll off the tongue as nicely.</p> +</div> + + +<div class="section" id="four-more-jit-generators"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id29">2008-2009: Four More JIT Generators</a></h1> +<p>Back to the JIT. After writing the first JIT generator as part of the EU +project, with somewhat mixed results, we actually wrote several more JIT +generator prototypes with different architectures to try to solve some of the +problems of the first approach. To give an impression of these prototypes, +here’s a list of them.</p> +<ul class="simple"> +<li><p>The second JIT generator we started working on in 2008 behaved exactly like +the first one, but had a meta-interpreter based architecture, to make it more +flexible and easier to experiment with. The meta-interpreter was called +the "rainbow interpreter", and in general the JIT is an area where we went +somewhat overboard with borderline silly terminology, with notable +occurrences of "timeshifter", "blackhole interpreter" etc.</p></li> +<li><p>The third JIT generator was an experiment based on the second one which +changed +compilation strategy. 
While the previous two had compiled many control flow +paths of the currently compiled function eagerly, that third JIT was sort of +maximally lazy and stopped compilation at every control flow split to avoid +guessing which path would actually be useful later when executing the code. +This was an attempt to reduce the problem of the first JIT generating way too +much machine code. Only later, when execution went down one of the not yet +compiled paths would it continue compiling more code. This gives an effect +similar to that of <a class="reference external" href="https://arxiv.org/abs/1411.0352">lazy basic block versioning</a>.</p></li> +<li><p>The fourth JIT generator was a pretty strange prototype, a <a class="reference external" href="https://pdfs.semanticscholar.org/db2d/0542c7791ee6f29a9f35e3181a186866f881.pdf">runtime partial +evaluator for Prolog</a>, to experiment with various specialization trade-offs. It +had an approach that we gave a not at all humble name, called "perfect +specialization".</p></li> +<li><p>The fifth JIT generator is the one that we are still using today. Instead of +generating a method-based JIT compiler from our interpreter we switched to +generating a tracing JIT compiler. Tracing JIT compilers were sort of the +latest fashion at the time, at least for a little while.</p></li> +</ul> +</div> + + +<div class="section" id="meta-tracing"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id30">2009: Meta-Tracing</a></h1> +<p>So, how did that tracing JIT generator work? A <a class="reference external" href="https://en.wikipedia.org/wiki/Tracing_just-in-time_compilation">tracing JIT</a> generates code by +observing and logging the execution of the running program. This yields a +straight-line trace of operations, which are then optimized and compiled into +machine code. 
Of course most tracing systems mostly focus on tracing loops.</p> +<p>As we discovered, it's actually quite simple to <a class="reference external" href="https://www.pypy.org/posts/2009/03/applying-tracing-jit-to-interpreter.html">apply a tracing JIT to a generic +interpreter</a>, by not tracing the execution of the user program directly, but by +instead tracing the execution of the interpreter while it is running the user +program (here's the <a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/default/talk/icooolps2009/bolz-tracing-jit-final.pdf">paper</a> we wrote about this approach).</p> +<p>So that's what we implemented. Of course we kept the two successful parts of the +first JIT, <a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/default/talk/icooolps2011/bolz-hints-final.pdf">promote</a> and <a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/default/talk/pepm2011/escape-tracing.pdf">virtuals</a> (both links go to the papers about these +features in the meta-tracing context).</p> +<div class="separator" style="clear: both; text-align: center;"><a href="https://3.bp.blogspot.com/-LeGqU7U6UfI/W5UZNLCjCAI/AAAAAAAAlFQ/_yhheMGCTu82WB8bp1wjVfhCeu_ppdw_gCLcBGAs/s1600/metajit.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://3.bp.blogspot.com/-LeGqU7U6UfI/W5UZNLCjCAI/AAAAAAAAlFQ/_yhheMGCTu82WB8bp1wjVfhCeu_ppdw_gCLcBGAs/s640/metajit.png" width="600"></a></div> +</div> + + +<div class="section" id="why-did-we-abandon-partial-evaluation"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id31">Why did we Abandon Partial Evaluation?</a></h1> +<p>So one question I get sometimes asked when telling this story is, why did +we think that tracing would work better than partial evaluation (PE)? 
One of the +hardest parts of compilers in general and partial evaluation based systems in +particular is the decision when and how much to inline, how much to specialize, +as well as the decision when to split control flow paths. In the PE based JIT +generator we never managed to control that question. Either the JIT would +inline too much, leading to useless compilation of all kinds of unlikely error +cases. Or it wouldn't inline enough, preventing necessary optimizations.</p> +<p>Meta tracing solves this problem with a hammer, it doesn't make particularly +complex inlining decisions at all. It instead decides what to inline by +precisely following what a real execution through the program is doing. Its +inlining decisions are therefore very understandable and predictable, and it +basically only has one heuristic based on whether the called function contains a +loop or not: If the called function contains a loop, we'll never inline it, if +it doesn't we always try to inline it. That predictability is actually what was +the most helpful, since it makes it possible for interpreter authors to +understand why the JIT did what it did and to actually influence its inlining +decisions by changing the annotations in the interpreter source. It turns out +that simple is better than complex.</p> +</div> + + +<div class="section" id="the-pyjit-eurostars-project"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id32">2009-2011: The PyJIT Eurostars Project</a></h1> +<p>While we were writing all these JIT prototypes, PyPy had sort of reverted back +to being a volunteer-driven open source project (although some of us, like +Antonio Cuni and I, had started working for universities and other project +members had other sources of funding). But again, while we did the work it +became clear that to get an actually working fast PyPy with generated JIT we +would need actual funding again for the project. 
So we applied to the EU again, +this time for a much smaller project with less money, in the <a class="reference external" href="https://www.pypy.org/posts/2010/12/oh-and-btw-pypy-gets-funding-through.html">Eurostars</a> +framework. We got a grant for three participants, <a class="reference external" href="https://merlinux.eu/">merlinux</a>, <a class="reference external" href="https://www.openend.se/">OpenEnd</a> and Uni +Düsseldorf, on the order of a bit more than half a million euro. That money was +specifically for JIT development and JIT testing infrastructure.</p> +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-CmyDXLRX85w/W5UZaPZSrzI/AAAAAAAAlFU/VcOEpPg95cUW7h8xssvJsGbiQAar8wsMACLcBGAs/s1600/eurostars.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="640" src="https://2.bp.blogspot.com/-CmyDXLRX85w/W5UZaPZSrzI/AAAAAAAAlFU/VcOEpPg95cUW7h8xssvJsGbiQAar8wsMACLcBGAs/s640/eurostars.png" width="494"></a></div> +</div> + + +<div class="section" id="tracing-jit-improvements"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id33">Tracing JIT improvements</a></h1> +<p>When writing the grant we had sat together at a sprint and discussed extensively +and decided that we would not switch JIT generation approaches any more. We all +liked the tracing approach well enough and thought it was promising. So instead +we agreed to try in earnest to make the tracing JIT really practical. 
So in the +Eurostars project we started with implementing sort of fairly standard JIT +compiler optimizations for the meta-tracing JIT, such as:</p> +<ul class="simple"> +<li><p>constant folding</p></li> +<li><p>dead code elimination</p></li> +<li><p><a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/default/talk/dls2012/dls04-ardo.pdf">loop invariant code motion</a> (using <a class="reference external" href="https://lua-users.org/lists/lua-l/2009-11/msg00089.html">LuaJIT's approach</a>)</p></li> +<li><p>better heap optimizations</p></li> +<li><p>faster deoptimization (which is actually a bit of a mess in the +meta-approach)</p></li> +<li><p>and dealing more efficiently with Python frames objects and the +features of Python's debugging facilities</p></li> +</ul> +</div> + + +<div class="section" id="speed-pypy-org"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id34">2010: speed.pypy.org</a></h1> +<p>In 2010, to make sure that we wouldn't accidentally introduce speed regressions +while working on the JIT, we implemented infrastructure to build PyPy and run +our benchmarks nightly. Then, the <a class="reference external" href="https://speed.pypy.org">https://speed.pypy.org</a> website was implemented +by Miquel Torres, a volunteer. The website shows the changes in benchmark +performance compared to the previous <em>n</em> days. 
It didn't sound too important at +first, but this was (and is) a fantastic tool, and an amazing motivator over the +next years, to keep continually improving performance.</p> +<div class="separator" style="clear: both; text-align: center;"><a href="https://4.bp.blogspot.com/-IVkE9-xguTs/W5UZgDbKiCI/AAAAAAAAlFc/pylFf_taalIHiqkR9IAKFR36cfJCaopPwCLcBGAs/s1600/speed.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://4.bp.blogspot.com/-IVkE9-xguTs/W5UZgDbKiCI/AAAAAAAAlFc/pylFf_taalIHiqkR9IAKFR36cfJCaopPwCLcBGAs/s640/speed.png" width="600"></a></div> +</div> + + +<div class="section" id="continuous-integration"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id35">Continuous Integration</a></h1> +<p>This actually leads me to something else that I'd say we learned, which is that +continuous integration is really awesome, and completely transformative to have +for a project. This is not a particularly surprising insight nowadays in the +open source community, it's easy to set up continuous integration on Github +using Travis or some other CI service. But I still see a lot of research +projects that don't have tests, that don't use CI, so I wanted to mention it +anyway. As I mentioned earlier in the post, PyPy has a quite serious testing +culture, with unit tests written for new code, regression tests for all bugs, +and integration tests using the CPython test suite. Those tests are <a class="reference external" href="https://buildbot.pypy.org/">run +nightly</a> on a number of architectures and operating systems.</p> +<p>Having all this kind of careful testing is of course necessary, since PyPy is +really trying to be a Python implementation that people actually use, not just +write papers about. But having all this infrastructure also had other benefits, +for example it allows us to trust newcomers to the project very quickly. 
+Basically after your first patch gets accepted, you immediately get commit +rights to the PyPy repository. If you screw up, the tests (or the code reviews) +are probably going to catch it, and that reduction to the barrier to +contributing is just super great.</p> +<p>This concludes my advertisement for testing in this post.</p> +</div> + + +<div class="section" id="implementing-python-objects-with-maps"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id36">2010: Implementing Python Objects with Maps</a></h1> +<p>So, what else did we do in the Eurostars project, apart from adding traditional +compiler optimizations to the tracing JIT and setting up CI infrastructure? +Another strand of work, that went on sort of concurrently to the JIT generator +improvements, were deep rewrites in the Python runtime, and the Python data +structures. I am going to write about two exemplary ones here, maps and storage strategies.</p> +<p>The first such rewrite is fairly standard. Python instances are similar to +Javascript objects, in that you can add arbitrary attributes to them at runtime. +Originally Python instances were backed by a dictionary in PyPy, but of course +in practice most instances of the same class have the same set of attribute +names. Therefore we went and implemented <a class="reference external" href="https://www.pypy.org/posts/2010/11/efficiently-implementing-python-objects.html">Self style maps</a>, which are often +called <a class="reference external" href="https://richardartoul.github.io/jekyll/update/2015/04/26/hidden-classes.html">hidden classes</a> in the JS world to represent instances instead. 
This +has two big benefits, it allows you to generate much better machine code for +instance attribute access and makes instances use a lot less memory.</p> +<div class="separator" style="clear: both; text-align: center;"><a href="https://3.bp.blogspot.com/-Mp_vxpQsG5M/TN6__c74O3I/AAAAAAAAAMo/3RcifDuyVWk_PXcxKQJGbTqTMCEjIyPcACPcBGAYYCw/s1600/instancemap.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://3.bp.blogspot.com/-Mp_vxpQsG5M/TN6__c74O3I/AAAAAAAAAMo/3RcifDuyVWk_PXcxKQJGbTqTMCEjIyPcACPcBGAYYCw/s640/instancemap.png" width="600"></a></div> +</div> + + +<div class="section" id="container-storage-strategies"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id37">2011: Container Storage Strategies</a></h1> +<p>Another important change in the PyPy runtime was rewriting the Python container +data structures, such as lists, dictionaries and sets. A fairly straightforward +observation about how those are used is that in a significant percentage of +cases they contain type-homogeneous data. As an example it's quite common to +have lists of only integers, or lists of only strings. So we changed the list, +dict and set implementations to use something we called <a class="reference external" href="https://www.pypy.org/posts/2011/10/more-compact-lists-with-list-strategies.html">storage strategies</a>. With +storage strategies these data structures use a more efficient representations if +they contain only primitives of the same type, such as ints, floats, strings. +This makes it possible to store the values without boxing them in the underlying +data structure. Therefore read and write access are much faster for such type +homogeneous containers. Of course when later another data type gets added to +such a list, the existing elements need to all be boxed at that point, which is +expensive. 
But we did a <a class="reference external" href="https://tratt.net/laurie/research/pubs/html/bolz_diekmann_tratt__storage_strategies_for_collections_in_dynamically_typed_languages/">study</a> and found out that that happens quite rarely in +practice. A lot of that work was done by Lukas Diekmann.</p> +<div class="separator" style="clear: both; text-align: center;"><a href="https://4.bp.blogspot.com/-hFXLNQ0Ry0I/TpQohnZHRpI/AAAAAAAAAYY/Yko9C1h1cU08jgighb9RKG3nEEp1ReA8wCPcBGAYYCw/s1600/with_strategies.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://4.bp.blogspot.com/-hFXLNQ0Ry0I/TpQohnZHRpI/AAAAAAAAAYY/Yko9C1h1cU08jgighb9RKG3nEEp1ReA8wCPcBGAYYCw/s640/with_strategies.png" width="600"></a></div> +</div> + + +<div class="section" id="deep-changes-in-the-runtime-are-necessary"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id38">Deep Changes in the Runtime are Necessary</a></h1> +<p>These two are just two examples for a number of fairly fundamental changes in +the PyPy runtime and PyPy data structures, probably the two most important ones, +but we did many others. That leads me to another thing we learned. If you want +to generate good code for a complex dynamic language such as Python, it's +actually not enough at all to have a good code generator and good compiler +optimizations. That's not going to help you, if your runtime data-structures +aren't in a shape where it's possible to generate efficient machine code to +access them.</p> +<p>Maybe this is well known in the VM and research community. However it's the main +mistake that in my opinion every other Python JIT effort has made in the last 10 +years, where most projects said something along the lines of "we're not +changing the existing CPython data structures at all, we'll just let LLVM +inline enough C code of the runtime and then it will optimize all the overhead +away". 
That never works very well.</p> +</div> + + +<div class="section" id="jit-status-2011"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id39">JIT Status 2011</a></h1> +<p>So, here we are at the end of the Eurostars project, what's the status of the JIT? Well, it +seems this meta-tracing stuff really works! We finally started actually +believing in it, when we reached the point in 2010 where self-hosting PyPy was +actually <a class="reference external" href="https://www.pypy.org/posts/2010/11/snake-which-bites-its-tail-pypy-jitting.html">faster</a> than bootstrapping the VM on CPython. Speeding up the +bootstrapping process is something that Psyco never managed at all, so we +considered this a quite important achievement. At the end of +Eurostars, we were about 4x faster than CPython on our set of benchmarks.</p> +</div> + + +<div class="section" id="engineering-and-incremental-progress"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id40">2012-2017: Engineering and Incremental Progress</a></h1> +<p>2012 the Eurostars project was finished and PyPy reverted yet another time back +to be an open source project. From then on, we've had a more diverse set of +sources of funding: we received some crowd funding via the <a class="reference external" href="https://sfconservancy.org/">Software Freedom +Conservancy</a> and contracts of various sizes from companies to implement various +specific features, often handled by <a class="reference external" href="https://baroquesoftware.com/">Baroque Software</a>. Over the next couple of +years +we revamped various parts of the VM. We improved the GC in <a class="reference external" href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy.html">major</a> ways. 
We +optimized the implementation of the JIT compiler to improve <a class="reference external" href="https://www.pypy.org/posts/2015/10/pypy-memory-and-warmup-improvements-2.html">warmup</a> <a class="reference external" href="https://www.pypy.org/posts/2016/04/warmup-improvements-more-efficient.html">times</a>. We +implemented backends for various CPU architectures (including <a class="reference external" href="https://www.pypy.org/posts/2015/10/powerpc-backend-for-jit.html">PowerPC</a> and +<a class="reference external" href="https://www.pypy.org/posts/2016/04/pypy-enterprise-edition.html">s390x</a>). We tried to reduce the number of performance cliffs and make the JIT +useful in a broader set of cases.</p> +<p>Another strand of work was to push quite significantly to be more +compatible with CPython, particularly the Python 3 line as well as extension +module support. Other compatibility improvements we did was making sure that +virtualenv <a class="reference external" href="https://www.pypy.org/posts/2010/08/using-virtualenv-with-pypy.html">works with PyPy</a>, better support for distutils and setuptools and +similar improvements. The continually improving performance as well better +compatibility with the ecosystem tools led to the <a class="reference external" href="https://www.pypy.org/posts/2014/10/couchbase-contribution-to-pypy.html">first few</a> <a class="reference external" href="https://baroquesoftware.com/blog#interview-with-roberto_de_ioris">users</a> of PyPy in +<a class="reference external" href="https://baroquesoftware.com/blog#magnetic">industry</a>.</p> +</div> +<div class="section" id="cpyext"> + + +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id41">CPyExt</a></h1> +<p>Another very important strand of work that took a lot of effort in recent years +was CPyExt. 
One of the main blockers of PyPy adoption had always been the fact +that a lot of people need specific C-extension modules at least in some parts of +their program, and telling them to reimplement everything in Python is just not +a practical solution. Therefore we worked on CPyExt, an emulation layer to make +it possible to run <a class="reference external" href="https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with.html">CPython C-extension modules</a> in PyPy. Doing that was a very +<a class="reference external" href="https://www.youtube.com/watch?v=qH0eeh-4XE8">painful process</a>, since the CPython extension API leaks a lot of CPython +implementation details, so we had to painstakingly emulate all of these details +to make it possible to run extensions. That this works at all remains completely +amazing to me! But nowadays CPyExt is even getting quite good, a lot of the big +numerical libraries such as Numpy and <a class="reference external" href="https://www.pypy.org/posts/2017/10/pypy-v59-released-now-supports-pandas.html">Pandas</a> are now supported (for a while +we had worked hard on a reimplementation of Numpy called NumPyPy, but +eventually realized that it would never be complete and useful enough). +However, calling CPyExt modules from PyPy can still be very slow, +which makes it impractical for some applications; +that's why we are <a class="reference external" href="https://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from.html">working</a> on it.</p> +<p>Not thinking about C-extension module emulation earlier in the project history +was a pretty bad strategic mistake. It had been clear for a long time that +getting people to just stop using all their C-extension modules was never going +to work, despite our efforts to give them alternatives, such as <a class="reference external" href="https://cffi.readthedocs.io/en/latest/">cffi</a>. 
So we +should have thought of a story for all the existing C-extension modules earlier +in the project. Not starting CPyExt earlier was mostly a failure of our +imagination (and maybe a too high pain threshold): We didn't believe this kind +of emulation was going to be practical, until somebody <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/commit/0c718ff5a3c1b583179325ab27b0d3b17fa11c0c">went and tried it</a>.</p> +</div> + + +<div class="section" id="python-3"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id42">Python 3</a></h1> +<p>Another main +focus of the last couple of years has been to catch up with the CPython 3 line. +Originally we had ignored Python 3 for a little bit too long, and were trailing +several versions behind. In 2016 and 2017 we had a <a class="reference external" href="https://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for.html">grant</a> from the Mozilla open +source support program of $200'000 to be able to catch up with Python 3.5. This +work is now basically done, and we are starting to target CPython 3.6 and will +have to look into 3.7 in the near future.</p> +</div> + + +<div class="section" id="incentives-of-oss-compared-to-academia"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id43">Incentives of OSS compared to Academia</a></h1> +<p>So, what can be learned from those more recent years? One thing we can observe +is that a lot of the engineering work we did in that time is not really science +as such. A lot of the VM techniques we implemented are kind of well known, and +catching up with new Python features is also not particularly deep researchy +work. Of course this kind of work is obviously super necessary if you want +people to use your VM, but it would be very hard to try to get research funding +for it. 
PyPy managed quite well over its history to balance phases of more +research oriented work, and more product oriented ones. But getting this balance +somewhat right is not easy, and definitely also involves a lot of luck. And, as +has been discussed a lot, it's actually very hard to find funding for open +source work, both within and outside of academia.</p> +</div> +<div class="section" id="meta-tracing-really-works"> +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id44">Meta-Tracing really works!</a></h1> +<p>Let me end with what, in my opinion, is the main positive technical result of PyPy the +project. Which is that the whole idea of using a meta-tracing JIT can really +work! Currently PyPy is about 7 times faster than CPython on a broad set of +benchmarks. Also, one of the very early motivations for using a meta-jitting +approach in PyPy, which was to not have to adapt the JIT to new versions of +CPython proved to work: indeed we didn't have to change anything in the JIT +infrastructure to support Python 3.</p> +<p>RPython has also worked and improved performance for a number of other +languages. Some of these interpreters had wildly different architectures. +AST-based interpreters, bytecode based, CPU emulators, really inefficient +high-level ones that allocate continuation objects all the time, and so on. This +shows that RPython also gives you a lot of freedom in deciding how you want to +structure the interpreter and that it can be applied to languages of quite +different paradigms.</p> +<p>I'll end with a list of the people that have contributed code to PyPy over its +<a class="reference external" href="https://www.openhub.net/p/pypy">history</a>, more than 350 of them. I'd like to thank all of them and the various +roles they played. 
To the next 15 years!</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-Qj9c-uIdhBw/W5UhBd-v07I/AAAAAAAAlFs/hSm6It8N_ngJLyM3tjH0ToNC_6SuvnCaQCLcBGAs/s1600/contributors2.pdf.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://2.bp.blogspot.com/-Qj9c-uIdhBw/W5UhBd-v07I/AAAAAAAAlFs/hSm6It8N_ngJLyM3tjH0ToNC_6SuvnCaQCLcBGAs/s1600/contributors2.pdf.png" width="600"></a></div> + +</div> +<div class="section" id="acknowledgements"> + +<h1><a class="toc-backref" href="https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html#id45">Acknowledgements</a></h1> +<p>A lot of people helped me with this blog post. Tim Felgentreff made me give the +keynote, which lead me to start collecting the material. Samuele Pedroni +gave essential early input when I just started planning the talk, and also gave +feedback on the blog post. Maciej Fijałkowski gave me feedback on the post, in +particular important insight about the more recent years of the project. Armin +Rigo discussed the talk slides with me, and provided details about the early +expectations about the first JIT's hoped-for performance. Antonio Cuni gave +substantial feedback and many very helpful suggestions for the blog post. +Michael Hudson-Doyle also fixed a number of mistakes in the post and rightfully +complained about the lack of mention of the GC. Christian Tismer provided +access to his copy of early Python-de mailing list posts. Matti Picus pointed +out a number of things I had forgotten and fixed a huge number of typos and +awkward English, including my absolute inability to put commas correctly. 
+All remaining errors are of course my own.</p> +</div> + +<p><b>update</b>: fixed confusing wording in the maps section.</p>roadmaphttps://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.htmlSun, 09 Sep 2018 14:50:00 GMTRoadmap for JIThttps://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.htmlMaciej Fijalkowski<p>Hello. +</p> +<p> +First a disclaimer. This post is more about plans for future than current +status. We usually try to write about things that we have done, because +it's much much easier to promise things than to actually make it happen, +but I think it's important enough to have some sort of roadmap. +</p> +<p> +In recent months we came to the point where the 5th generation of +JIT prototype was working as <a href="https://www.pypy.org/posts/2009/03/good-news-everyone-421421336094214242.html">nice</a> +or even a bit nicer than 1st one back in 2007. Someone might ask "so why +did you spend all this time without going forward?". And indeed, we spend +a lot of time moving sideways, but as posted, we also spent a lot of time +doing <a href="https://www.pypy.org/posts/2009/04/beta-for-110-released-4604559533184706699.html">some other things</a>, which are important as well. +The main advantage of current JIT incarnation is much much simpler than +the first one. Even I can comprehend it, which is much of an improvement :-) +</p> +<p> +So, the prototype is working and gives very nice speedups in range of 20-30x +over CPython. We're pretty confident this prototype will work and will +produce fast python interpreter eventually. So we decided that now we'll +work towards changing prototype into something stable and solid. This +might sound easy, but in fact it's not. Having stable assembler backend +and optimizations that keep semantics is not as easy as it might sound. 
+</p> +<p> +The current roadmap, as I see it, looks as follows: +</p> +<ul> +<li> Provide a JIT that does not speed things up, but produces assembler without + optimizations turned on, that is correct and able to run CPython's library + tests on a nightly basis. +</li> +<li> + Introduce simple optimizations, that should make the above JIT a bit faster than + CPython. With optimizations disabled JIT is producing incredibly dumb + assembler, which is slower than corresponding C code, even with removal + of interpretation overhead (which is not very surprising). +</li> +<li> + Backport optimizations from JIT prototype, one by one, keeping an eye + on how they perform and making sure they don't break anything. +</li> +<li> + Create new optimizations, like speeding up attribute access. +</li> +<li> + Profit. +</li> +</ul> +<p> +This way, we can hopefully provide a working JIT, which gives fast python +interpreter, which is a bit harder than just a nice prototype. +</p> +<p> +Tell us what you think about this plan. +</p> +Cheers,<br> +fijal &amp; others.jitpypyroadmapspeedhttps://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.htmlTue, 21 Apr 2009 19:38:00 GMT \ No newline at end of file diff --git a/categories/rpyc.html b/categories/rpyc.html new file mode 100644 index 000000000..5196572d5 --- /dev/null +++ b/categories/rpyc.html @@ -0,0 +1,114 @@ + + + + + +Posts about RPyC | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/rpyc.xml b/categories/rpyc.xml new file mode 100644 index 000000000..72e8c3a71 --- /dev/null +++ b/categories/rpyc.xml @@ -0,0 +1,40 @@ + +PyPy (Posts about RPyC)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssUsing CPython extension modules with PyPy, or: PyQt on PyPyhttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlAlexander Schremmer<div class="document" id="using-cpython-extension-modules-with-pypy-or-pyqt-on-pypy"> + +<p>If you have ever wanted to use CPython extension modules on PyPy, +we want to announce that there is a solution that should be compatible +to quite a bit of the available modules. It is neither new nor written +by us, but works nevertheless great with PyPy.</p> +<p>The trick is to use RPyC, a transparent, symmetric remote procedure +call library written in Python. The idea is to start a +CPython process that hosts the PyQt libraries +and connect to it via TCP to send RPC commands to it.</p> +<p>I tried to run PyQt applications +using it on PyPy and could get quite a bit of the functionality of these +working. Remaining problems include regular segfaults of CPython +because of PyQt-induced memory corruption and bugs because classes +like StandardButtons behave incorrectly when it comes to arithmetical operations.</p> +<p>Changes to RPyC needed to be done to support remote unbound <tt class="docutils literal"><span class="pre">__init__</span></tt> methods, +shallow call by value for list and dict types (PyQt4 methods want real lists and dicts +as parameters), and callbacks to methods (all remote method objects are wrapped into +small lambda functions to ease the call for PyQt4).</p> +<p>If you want to try RPyC to run the PyQt application of your choice, you just +need to follow these steps. 
Please report your experience here in the blog +comments or on our <a class="reference external" href="https://codespeak.net/mailman/listinfo/pypy-dev">mailing list</a>.</p> +<blockquote> +<ol class="arabic simple"> +<li>Download RPyC from the <a class="reference external" href="https://sourceforge.net/projects/rpyc/files/">RPyC download page</a>.</li> +<li>Download this <a class="reference external" href="https://codespeak.net/svn/user/xoraxax/rpyc-3.0.7-pyqt4-compat.patch">patch</a> and apply it to RPyC by running +<tt class="docutils literal"><span class="pre">patch</span> <span class="pre">-p1</span> <span class="pre">&lt;</span> <span class="pre">rpyc-3.0.7-pyqt4-compat.patch</span></tt> in the RPyC directory.</li> +<li>Install RPyc by running <tt class="docutils literal"><span class="pre">python</span> <span class="pre">setup.py</span> <span class="pre">install</span></tt> as root.</li> +<li>Run the file <tt class="docutils literal"><span class="pre">rpyc/servers/classic_server.py</span></tt> using CPython.</li> +<li>Execute your PyQt application on PyPy.</li> +</ol> +</blockquote> +<p>PyPy will automatically connect to CPython and use its PyQt libraries.</p> +<p>Note that this scheme works with nearly every extension library. Look +at <tt class="docutils literal"><span class="pre">pypy/lib/sip.py</span></tt> on how to add new libraries (you need to create +such a file for every proxied extension module).</p> +<p>Have fun with PyQt</p> +<p>Alexander Schremmer</p> +</div>CPythonextension modulesPyQt4RPyChttps://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.htmlMon, 30 Nov 2009 11:19:00 GMT \ No newline at end of file diff --git a/categories/smalltalk.html b/categories/smalltalk.html new file mode 100644 index 000000000..df7187fc0 --- /dev/null +++ b/categories/smalltalk.html @@ -0,0 +1,114 @@ + + + + + +Posts about Smalltalk | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/smalltalk.xml b/categories/smalltalk.xml new file mode 100644 index 000000000..4b600fe52 --- /dev/null +++ b/categories/smalltalk.xml @@ -0,0 +1,122 @@ + +PyPy (Posts about Smalltalk)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssA Field Test of Software Transactional Memory Using the RSqueak Smalltalk VMhttps://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.htmlCarl Friedrich Bolz-Tereick<h2> +Extending the Smalltalk RSqueakVM with STM</h2> +<p>by Conrad Calmez, Hubert Hesse, Patrick Rein and Malte Swart supervised by Tim Felgentreff and Tobias Pape</p> +<h2> +Introduction</h2> +<p>After pypy-stm we can announce that through the <a href="https://bitbucket.org/pypy/lang-smalltalk">RSqueakVM</a> (which used to be called <em>SPyVM</em>) a second VM implementation supports software transactional memory. RSqueakVM is a Smalltalk implementation based on the RPython toolchain. We have added STM support based on the <a href="https://www.pypy.org/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html">STM tools from RPython (rstm)</a>. The benchmarks indicate that linear scale up is possible, however in some situations the STM overhead limits speedup.</p> +<p>The work was done as a master's project at the <a href="https://www.hpi.uni-potsdam.de/hirschfeld/">Software Architechture Group</a> of Professor Robert Hirschfeld at at the <a href="https://hpi.de/">Hasso Plattner Institut</a> at the <a href="https://www.uni-potsdam.de/">University of Potsdam</a>. We - four students - worked about one and a half days per week for four months on the topic. The RSqueakVM was <a href="https://pypysqueak.blogspot.de/2007/10/first-day-discussions.html">originally developped during a sprint at the University of Bern</a>. 
When we started the project we were new to the topic of building VMs / interpreters.</p> +<p>We would like to thank Armin, Remi and the #pypy IRC channel who supported us over the course of our project. We would also like to thank Toni Mattis and Eric Seckler, who have provided us with an <a href="https://bitbucket.org/amintos/lang-smalltalk">initial code base</a>.</p> +<h2 id="introduction-to-rsqueakvm"> +Introduction to RSqueakVM</h2> +<p>As the original Smalltalk implementation, the RSqueakVM executes a given Squeak Smalltalk image, containing the Smalltalk code and a snapshot of formerly created objects and active execution contexts. These execution contexts are scheduled inside the image (greenlets) and not mapped to OS threads. Thereby the non-STM RSqueakVM runs on only one OS thread.</p> +<h2 id="changes-to-rsqueakvm"> +Changes to RSqueakVM</h2> +<p>The core adjustments to support STM were inside the VM and transparent from the view of a Smalltalk user. Additionally we added Smalltalk code to influence the behavior of the STM. As the RSqueakVM has run in one OS thread so far, we added the capability to start OS threads. Essentially, we added an additional way to launch a new Smalltalk execution context (thread). But in contrast to the original one this one creates a new native OS thread, not a Smalltalk internal green thread.</p> + +<p>STM (with automatic transaction boundaries) already solves the problem of concurrent access on one value as this is protected by the STM transactions (to be more precise one instruction). But there are cases where the application relies on the fact that a bigger group of changes is executed either completely or not at all (atomic). Without further information transaction borders could be in the middle of such a set of atomic statements. rstm allows aggregating multiple statements into one higher level transaction. 
To let the application mark the beginning and the end of these atomic blocks (high-level transactions), we added two more STM specific extensions to Smalltalk.</p> + +<h2 id="benchmarks"> +Benchmarks</h2> +<p>RSqueak was executed in a single OS thread so far. rstm enables us to execute the VM using several OS threads. Using OS threads we expected a speed-up in benchmarks which use multiple threads. We measured this speed-up by using two benchmarks: a simple parallel summation where each thread sums up a predefined interval and an implementation of Mandelbrot where each thread computes a range of predefined lines.</p> + +<p>To assess the speed-up, we used one RSqueakVM compiled with rstm enabled, but once running the benchmarks with OS threads and once with Smalltalk green threads. The workload always remained the same and only the number of threads increased. To assess the overhead imposed by the STM transformation we also ran the green threads version on an unmodified RSqueakVM. All VMs were translated with the JIT optimization and all benchmarks were run once before the measurement to warm up the JIT. As the JIT optimization is working it is likely to be adopted by VM creators (the baseline RSqueakVM did that) so that results with this optimization are more relevant in practice than those without it. We measured the execution time by getting the system time in Squeak. 
The results are:</p> +<h4> +Parallel Sum Ten Million</h4> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://1.bp.blogspot.com/-7J05whp07m8/U-iEdb3Ce0I/AAAAAAAAAVw/91sD_1KEiGc/s1600/parallelSum10MioChart.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://1.bp.blogspot.com/-7J05whp07m8/U-iEdb3Ce0I/AAAAAAAAAVw/91sD_1KEiGc/s320/parallelSum10MioChart.png"></a></div> + +</td></tr> +<tr><td class="tr-caption" style="text-align: center;"><span style="font-size: small; text-align: start;">Benchmark Parallel Sum 10,000,000</span></td></tr> +</tbody></table> +<table><thead> +<tr> <th>Thread Count</th> <th>RSqueak green threads</th> <th>RSqueak/STM green threads</th> <th>RSqueak/STM OS threads</th> <th>Slow down from RSqueak green threads to RSqueak/STM green threads</th> <th>Speed up from RSqueak/STM green threads to RSQueak/STM OS Threads</th> </tr> +</thead> <tbody> +<tr> <td>1</td> <td>168.0 ms</td> <td>240.0 ms</td> <td>290.9 ms</td> <td>0.70</td> <td>0.83</td> </tr> +<tr> <td>2</td> <td>167.0 ms</td> <td>244.0 ms</td> <td>246.1 ms</td> <td>0.68</td> <td>0.99</td> </tr> +<tr> <td>4</td> <td>167.8 ms</td> <td>240.7 ms</td> <td>366.7 ms</td> <td>0.70</td> <td>0.66</td> </tr> +<tr> <td>8</td> <td>168.1 ms</td> <td>241.1 ms</td> <td>757.0 ms</td> <td>0.70</td> <td>0.32</td> </tr> +<tr> <td>16</td> <td>168.5 ms</td> <td>244.5 ms</td> <td>1460.0 ms</td> <td>0.69</td> <td>0.17</td> </tr> +</tbody> </table> +<br> + +<h4> +Parallel Sum One Billion</h4> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"> + +<div class="separator" style="clear: both; text-align: center;"><a 
href="https://3.bp.blogspot.com/-wN-Bad8Pnd8/U-iE43ZtHcI/AAAAAAAAAV4/dii8NU0rseE/s1600/parallelSum1BioChart.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://3.bp.blogspot.com/-wN-Bad8Pnd8/U-iE43ZtHcI/AAAAAAAAAV4/dii8NU0rseE/s320/parallelSum1BioChart.png"></a></div> + +</td></tr> +<tr><td class="tr-caption" style="text-align: center;">Benchmark Parallel Sum 1,000,000,000</td></tr> +</tbody></table> +<br> +<table><thead> +<tr><th>Thread Count</th><th>RSqueak green threads</th><th>RSqueak/STM green threads</th><th>RSqueak/STM OS threads</th><th>Slow down from RSqueak green threads to RSqueak/STM green threads</th><th>Speed up from RSqueak/STM green threads to RSQueak/STM OS Threads</th></tr> +</thead><tbody> +<tr> <td>1</td> <td>16831.0 ms</td> <td>24111.0 ms</td> <td>23346.0 ms</td> <td>0.70</td> <td>1.03</td> </tr> +<tr> <td>2</td> <td>17059.9 ms</td> <td>24229.4 ms</td> <td>16102.1 ms</td> <td>0.70</td> <td>1.50</td> </tr> +<tr> <td>4</td> <td>16959.9 ms</td> <td>24365.6 ms</td> <td>12099.5 ms</td> <td>0.70</td> <td>2.01</td> </tr> +<tr> <td>8</td> <td>16758.4 ms</td> <td>24228.1 ms</td> <td>14076.9 ms</td> <td>0.69</td> <td>1.72</td> </tr> +<tr> <td>16</td> <td>16748.7 ms</td> <td>24266.6 ms</td> <td>55502.9 ms</td> <td>0.69</td> <td>0.44</td> </tr> +</tbody></table> + +<br> + +<h4> +Mandelbrot Iterative</h4> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-_wLcNRFGkQc/U-iFOB3wDmI/AAAAAAAAAWA/He1oxb0hEpc/s1600/mandelbrotChart.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://2.bp.blogspot.com/-_wLcNRFGkQc/U-iFOB3wDmI/AAAAAAAAAWA/He1oxb0hEpc/s320/mandelbrotChart.png"></a></div> + +</td></tr> +<tr><td class="tr-caption" style="text-align: center;">Benchmark 
Mandelbrot</td></tr> +</tbody></table> +<table><thead> +<tr> <th>Thread Count</th> <th>RSqueak green threads</th> <th>RSqueak/STM green threads</th> <th>RSqueak/STM OS threads</th> <th>Slow down from RSqueak green threads to RSqueak/STM green threads</th> <th>Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads</th> </tr> +</thead> <tbody> +<tr> <td>1</td> <td>724.0 ms</td> <td>983.0 ms</td> <td>1565.5 ms</td> <td>0.74</td> <td>0.63</td> </tr> +<tr> <td>2</td> <td>780.5 ms</td> <td>973.5 ms</td> <td>5555.0 ms</td> <td>0.80</td> <td>0.18</td> </tr> +<tr> <td>4</td> <td>781.0 ms</td> <td>982.5 ms</td> <td>20107.5 ms</td> <td>0.79</td> <td>0.05</td> </tr> +<tr> <td>8</td> <td>779.5 ms</td> <td>980.0 ms</td> <td>113067.0 ms</td> <td>0.80</td> <td>0.01</td></tr> +</tbody></table> + +<br> + +<h2> +Discussion of benchmark results</h2> +<p>First of all, the ParallelSum benchmarks show that the parallelism is actually paying off, at least for sufficiently large embarrassingly parallel problems. Thus RSqueak can also benefit from rstm.</p> +<p>On the other hand, our Mandelbrot implementation shows the limits of our current rstm integration. We implemented two versions of the algorithm one using one low-level array and one using two nested collections. In both versions, one job only calculates a distinct range of rows and both lead to a slowdown. The summary of the state of rstm transactions shows that there are a lot of inevitable transactions (transactions which must be completed). One reason might be the interactions between the VM and its low-level extensions, so called plugins. We have to investigate this further.</p> +<h2 id="limitations"> +Limitations</h2> +<p>Although the current VM setup is working well enough to support our benchmarks, the VM still has limitations. First of all, as it is based on rstm, it has the current limitation of only running on 64-bit Linux.</p> +<p>Besides this, we also have two major limitations regarding the VM itself. 
First, the atomic interface exposed in Smalltalk is currently not working when the VM is compiled using the just-in-time compiler transformation. Simple examples such as concurrent parallel sum work fine while more complex benchmarks such as <a href="https://benchmarksgame.alioth.debian.org/u32/performance.php?test=chameneosredux#about">chameneos</a> fail. The reasons for this are currently beyond our understanding. Second, Smalltalk supports green threads, which are threads which are managed by the VM and are not mapped to OS threads. We currently support starting new Smalltalk threads as OS threads instead of starting them as green threads. However, existing threads in a Smalltalk image are not migrated to OS threads, but remain running as green threads.</p> +<h2 id="future-work-for-stm-in-rsqueak"> +Future work for STM in RSqueak</h2> +The work we presented showed interesting problems; we propose the following problem statements for further analysis:<br> +<ul> +<li><strong>Inevitable transactions</strong> in benchmarks. This looks like it could limit other applications too so it should be solved.</li> +<li><strong>Collection implementation aware of STM</strong>: The current implementation of collections can cause a lot of STM collisions due to their internal memory structure. We believe it could bear potential for performance improvements, if we replace these collections in an STM enabled interpreter with implementations with fewer STM collisions. As already proposed by Remi Meier, bags, sets and lists are of particular interest.</li> +<li>Finally, we exposed <strong>STM through language features</strong> such as the atomic method, which is provided through the VM. Originally, it was possible to model STM transaction barriers implicitly by using clever locks, now it's exposed via the atomic keyword. 
From a language design point of view, the question arises whether this is a good solution and what features an stm-enabled interpreter must provide to the user in general? Of particular interest are for example, access to the transaction length and hints for transaction borders and their performance impact.</li> +</ul> +<ul></ul> +<h2 id="details-for-the-technically-inclined"> +Details for the technically inclined</h2> +<ul> +<li><a href="https://bitbucket.org/pypy/lang-smalltalk/diff/spyvm/interpreter.py?diff1=7a217be69118&amp;diff2=a772ee2447d96041e7db6550e160e90251d0dd85&amp;at=stmgc-c7#Lspyvm/interpreter.pyT233">Adjustments to the interpreter loop were minimal</a>.</li> +<li>STM works on bytecode granularity, which means there is an implicit transaction border after every bytecode executed. Possible alternatives: only break transactions after certain bytecodes, break transactions on one abstraction layer above, e.g. object methods (setter, getter).</li> +<li>rstm calls were exposed using primitives (a way to expose native code in Smalltalk), this was mainly used for atomic.</li> +<li>Starting and stopping OS threads is exposed via primitives as well. Threads are started from within the interpreter.</li> +<li>For Smalltalk enabled STM code we currently have different image versions. However another way to add, load and replace code to the Smalltalk code base is required to make a switch between STM and non-STM code simple.</li> +</ul> +<ul></ul> +<h2 id="details-on-the-project-setup"> +Details on the project setup</h2> +<p>From a non-technical perspective, a problem we encountered was the huge roundtrip times (on our machines up to 600s, 900s with JIT enabled). This led to a tendency of bigger code changes ("Before we compile, let's also add this"), lost flow ("What were we doing before?") and different compiled interpreters in parallel testing ("How is this version different from the others?") As a consequence it was harder to test and correct errors. 
While this is not as much of a problem for other RPython VMs, RSqueakVM needs to execute the entire image, which makes running it untranslated even slower.</p> +<h2 id="summary"> +Summary</h2> +<p>The benchmarks show that speed up is possible, but also that the STM overhead in some situations can eat up the speedup. The resulting STM-enabled VM still has some limitations: As rstm is currently only running on 64-bit Linux the RSqueakVM is doing so as well. Even though it is possible for us now to create new threads that map to OS threads within the VM, the migration of existing Smalltalk threads keeps being problematic.</p> +<p>We showed that an existing VM code base can benefit from STM in terms of scaling up. Further it was relatively easy to enable STM support. This may also be valuable to VM developers considering to get STM support for their VMs.</p>SmalltalkSqueakstmhttps://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.htmlSat, 09 Aug 2014 13:15:00 GMT \ No newline at end of file diff --git a/categories/speed.html b/categories/speed.html new file mode 100644 index 000000000..6c50e4af2 --- /dev/null +++ b/categories/speed.html @@ -0,0 +1,129 @@ + + + + + +Posts about speed | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/speed.xml b/categories/speed.xml new file mode 100644 index 000000000..5eb1e2c1b --- /dev/null +++ b/categories/speed.xml @@ -0,0 +1,928 @@ + +PyPy (Posts about speed)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssInside cpyext: Why emulating CPython C API is so Hardhttps://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.htmlAntonio Cuni<br> +<div class="document" id="inside-cpyext-why-emulating-cpython-c-api-is-so-hard"> +<tt class="docutils literal">cpyext</tt> is PyPy's subsystem which provides a compatibility +layer to compile and run CPython C extensions inside PyPy. Often people ask +why a particular C extension doesn't work or is very slow on PyPy. +Usually it is hard to answer without going into technical details. The goal of +this blog post is to explain some of these technical details, so that we can +simply link here instead of explaining again and again :).<br> +From a 10.000 foot view, <tt class="docutils literal">cpyext</tt> is PyPy's version of <tt class="docutils literal">"Python.h"</tt>. Every time +you compile an extension which uses that header file, you are using <tt class="docutils literal">cpyext</tt>. +This includes extension explicitly written in C (such as <tt class="docutils literal">numpy</tt>) and +extensions which are generated from other compilers/preprocessors +(e.g. <tt class="docutils literal">Cython</tt>).<br> +At the time of writing, the current status is that most C extensions "just +work". Generally speaking, you can simply <tt class="docutils literal">pip install</tt> them, +provided they use the public, <a class="reference external" href="https://docs.python.org/2/c-api/index.html">official C API</a> instead of poking at private +implementation details. 
However, the performance of cpyext is generally +poor. A Python program which makes heavy use of <tt class="docutils literal">cpyext</tt> extensions +is likely to be slower on PyPy than on CPython.<br> +Note: in this blog post we are talking about Python 2.7 because it is still +the default version of PyPy: however most of the implementation of <tt class="docutils literal">cpyext</tt> is +shared with PyPy3, so everything applies to that as well.<br> +<div class="section" id="c-api-overview"> +<h1> +C API Overview</h1> +In CPython, which is written in C, Python objects are represented as <tt class="docutils literal">PyObject*</tt>, +i.e. (mostly) opaque pointers to some common "base struct".<br> +CPython uses a very simple memory management scheme: when you create an +object, you allocate a block of memory of the appropriate size on the heap. +Depending on the details, you might end up calling different allocators, but +for the sake of simplicity, you can think that this ends up being a call to +<tt class="docutils literal">malloc()</tt>. The resulting block of memory is initialized and cast to +<tt class="docutils literal">PyObject*</tt>: this address never changes during the object lifetime, and the +C code can freely pass it around, store it inside containers, retrieve it +later, etc.<br> +Memory is managed using reference counting. When you create a new reference to +an object, or you discard a reference you own, you have to <a class="reference external" href="https://docs.python.org/2/c-api/refcounting.html#c.Py_INCREF">increment</a> or +<a class="reference external" href="https://docs.python.org/2/c-api/refcounting.html#c.Py_DECREF">decrement</a> the reference counter accordingly. When the reference counter goes to +0, it means that the object is no longer used and can safely be +destroyed. 
Again, we can simplify and say that this results in a call to +<tt class="docutils literal">free()</tt>, which finally releases the memory which was allocated by <tt class="docutils literal">malloc()</tt>.<br> +Generally speaking, the only way to operate on a <tt class="docutils literal">PyObject*</tt> is to call the +appropriate API functions. For example, to convert a given <tt class="docutils literal">PyObject*</tt> to a C +integer, you can use <a class="reference external" href="https://docs.python.org/2/c-api/int.html#c.PyInt_AsLong">PyInt_AsLong()</a>; to add two objects together, you can +call <a class="reference external" href="https://docs.python.org/2/c-api/number.html#c.PyNumber_Add">PyNumber_Add()</a>.<br> +Internally, PyPy uses a similar approach. All Python objects are subclasses of +the RPython <tt class="docutils literal">W_Root</tt> class, and they are operated by calling methods on the +<tt class="docutils literal">space</tt> singleton, which represents the interpreter.<br> +At first, it looks very easy to write a compatibility layer: just make +<tt class="docutils literal">PyObject*</tt> an alias for <tt class="docutils literal">W_Root</tt>, and write simple RPython functions +(which will be translated to C by the RPython compiler) which call the +<tt class="docutils literal">space</tt> accordingly:<br> +<pre class="code python literal-block"><span class="keyword">def</span> <span class="name function">PyInt_AsLong</span><span class="punctuation">(</span><span class="name">space</span><span class="punctuation">,</span> <span class="name">o</span><span class="punctuation">):</span> + <span class="keyword">return</span> <span class="name">space</span><span class="operator">.</span><span class="name">int_w</span><span class="punctuation">(</span><span class="name">o</span><span class="punctuation">)</span> + +<span class="keyword">def</span> <span class="name function">PyNumber_Add</span><span class="punctuation">(</span><span 
class="name">space</span><span class="punctuation">,</span> <span class="name">o1</span><span class="punctuation">,</span> <span class="name">o2</span><span class="punctuation">):</span> + <span class="keyword">return</span> <span class="name">space</span><span class="operator">.</span><span class="name">add</span><span class="punctuation">(</span><span class="name">o1</span><span class="punctuation">,</span> <span class="name">o2</span><span class="punctuation">)</span> +</pre> +Actually, the code above is not too far from the real +implementation. However, there are tons of gory details which make it much +harder than it looks, and much slower unless you pay a lot of attention +to performance.</div> +<div class="section" id="the-pypy-gc"> +<h1> +The PyPy GC</h1> +To understand some of <tt class="docutils literal">cpyext</tt> challenges, you need to have at least a rough +idea of how the PyPy GC works.<br> +Contrarily to the popular belief, the "Garbage Collector" is not only about +collecting garbage: instead, it is generally responsible for all memory +management, including allocation and deallocation.<br> +Whereas CPython uses a combination of malloc/free/refcounting to manage +memory, the PyPy GC uses a completely different approach. It is designed +assuming that a dynamic language like Python behaves the following way:<br> +<blockquote> +<ul class="simple"> +<li>You create, either directly or indirectly, lots of objects.</li> +<li>Most of these objects are temporary and very short-lived. Think e.g. 
of +doing <tt class="docutils literal">a + b + c</tt>: you need to allocate an object to hold the temporary +result of <tt class="docutils literal">a + b</tt>, then it dies very quickly because you no longer need it +when you do the final <tt class="docutils literal">+ c</tt> part.</li> +<li>Only small fraction of the objects survive and stay around for a while.</li> +</ul> +</blockquote> +So, the strategy is: make allocation as fast as possible; make deallocation of +short-lived objects as fast as possible; find a way to handle the remaining +small set of objects which actually survive long enough to be important.<br> +This is done using a <strong>Generational GC</strong>: the basic idea is the following:<br> +<blockquote> +<ol class="arabic simple"> +<li>We have a nursery, where we allocate "young objects" very quickly.</li> +<li>When the nursery is full, we start what we call a "minor collection".<ul> +<li>We do a quick scan to determine the small set of objects which survived so +far</li> +<li>We <strong>move</strong> these objects out of the nursery, and we place them in the +area of memory which contains the "old objects". Since the address of the +objects changes, we fix all the references to them accordingly.</li> +</ul> +</li> +</ol> +<ol class="arabic simple" start="4"> +<li>now the nursery contains only objects which "died young". We can +discard all of them very quickly, reset the nursery, and use the same area +of memory to allocate new objects from now.</li> +</ol> +</blockquote> +In practice, this scheme works very well and it is one of the reasons why PyPy +is much faster than CPython. However, careful readers have surely noticed +that this is a problem for <tt class="docutils literal">cpyext</tt>. On one hand, we have PyPy objects which +can potentially move and change their underlying memory address; on the other +hand, we need a way to represent them as fixed-address <tt class="docutils literal">PyObject*</tt> when we +pass them to C extensions. 
We surely need a way to handle that.</div> +<div class="section" id="pyobject-in-pypy"> +<h1> +<tt class="docutils literal">PyObject*</tt> in PyPy</h1> +Another challenge is that sometimes, <tt class="docutils literal">PyObject*</tt> structs are not completely +opaque: there are parts of the public API which expose to the user specific +fields of some concrete C struct. For example the definition of <a class="reference external" href="https://docs.python.org/2/c-api/typeobj.html">PyTypeObject</a> +which exposes many of the <tt class="docutils literal">tp_*</tt> slots to the user. +Since the low-level layout of PyPy <tt class="docutils literal">W_Root</tt> objects is completely different +than the one used by CPython, we cannot simply pass RPython objects to C; we +need a way to handle the difference.<br> +So, we have two issues so far: objects can move, and incompatible +low-level layouts. <tt class="docutils literal">cpyext</tt> solves both by decoupling the RPython and the C +representations. We have two "views" of the same entity, depending on whether +we are in the PyPy world (the movable <tt class="docutils literal">W_Root</tt> subclass) or in the C world +(the non-movable <tt class="docutils literal">PyObject*</tt>).<br> +<tt class="docutils literal">PyObject*</tt> are created lazily, only when they are actually needed. The +vast majority of PyPy objects are never passed to any C extension, so we don't +pay any penalty in that case. However, the first time we pass a <tt class="docutils literal">W_Root</tt> to +C, we allocate and initialize its <tt class="docutils literal">PyObject*</tt> counterpart.<br> +The same idea applies also to objects which are created in C, e.g. by calling +<a class="reference external" href="https://docs.python.org/2/c-api/allocation.html#c.PyObject_New">PyObject_New()</a>. At first, only the <tt class="docutils literal">PyObject*</tt> exists and it is +exclusively managed by reference counting. 
As soon as we pass it to the PyPy +world (e.g. as a return value of a function call), we create its <tt class="docutils literal">W_Root</tt> +counterpart, which is managed by the GC as usual.<br> +Here we start to see why calling cpyext modules is more costly in PyPy than in +CPython. We need to pay some penalty for all the conversions between +<tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt>.<br> +Moreover, the first time we pass a <tt class="docutils literal">W_Root</tt> to C we also need to allocate +the memory for the <tt class="docutils literal">PyObject*</tt> using a slowish "CPython-style" memory +allocator. In practice, for all the objects which are passed to C we pay more +or less the same costs as CPython, thus effectively "undoing" the speedup +guaranteed by PyPy's Generational GC under normal circumstances.</div> +<div class="section" id="maintaining-the-link-between-w-root-and-pyobject"> +<h1> +Maintaining the link between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt></h1> +We now need a way to convert between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt> and +vice-versa; also, we need to ensure that the lifetimes of the two entities +are in sync. 
In particular:<br> +<blockquote> +<ol class="arabic simple"> +<li>as long as the <tt class="docutils literal">W_Root</tt> is kept alive by the GC, we want the +<tt class="docutils literal">PyObject*</tt> to live even if its refcount drops to 0;</li> +<li>as long as the <tt class="docutils literal">PyObject*</tt> has a refcount greater than 0, we want to +make sure that the GC does not collect the <tt class="docutils literal">W_Root</tt>.</li> +</ol> +</blockquote> +The <tt class="docutils literal">PyObject*</tt> ⇨ <tt class="docutils literal">W_Root</tt> link is maintained by the special field +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/parse/cpyext_object.h#lines-5">ob_pypy_link</a> which is added to all <tt class="docutils literal">PyObject*</tt>. On a 64 bit machine this +means that all <tt class="docutils literal">PyObject*</tt> have 8 bytes of overhead, but then the +conversion is very quick, just reading the field.<br> +For the other direction, we generally don't want to do the same: the +assumption is that the vast majority of <tt class="docutils literal">W_Root</tt> objects will never be +passed to C, and adding an overhead of 8 bytes to all of them is a +waste. Instead, in the general case the link is maintained by using a +dictionary, where <tt class="docutils literal">W_Root</tt> are the keys and <tt class="docutils literal">PyObject*</tt> the values.<br> +However, for a <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/pyobject.py#lines-66">few selected</a> <tt class="docutils literal">W_Root</tt> subclasses we <strong>do</strong> maintain a +direct link using the special <tt class="docutils literal">_cpy_ref</tt> field to improve performance. 
In +particular, we use it for <tt class="docutils literal">W_TypeObject</tt> (which is big anyway, so a 8 bytes +overhead is negligible) and <tt class="docutils literal">W_NoneObject</tt>. <tt class="docutils literal">None</tt> is passed around very +often, so we want to ensure that the conversion to <tt class="docutils literal">PyObject*</tt> is very +fast. Moreover it's a singleton, so the 8 bytes overhead is negligible as +well.<br> +This means that in theory, passing an arbitrary Python object to C is +potentially costly, because it involves doing a dictionary lookup. We assume +that this cost will eventually show up in the profiler: however, at the time +of writing there are other parts of <tt class="docutils literal">cpyext</tt> which are even more costly (as we +will show later), so the cost of the dict lookup is never evident in the +profiler.</div> +<div class="section" id="crossing-the-border-between-rpython-and-c"> +<h1> +Crossing the border between RPython and C</h1> +There are two other things we need to care about whenever we cross the border +between RPython and C, and vice-versa: exception handling and the GIL.<br> +In the C API, exceptions are raised by calling <a class="reference external" href="https://docs.python.org/2/c-api/exceptions.html#c.PyErr_SetString">PyErr_SetString()</a> (or one of +<a class="reference external" href="https://docs.python.org/2/c-api/exceptions.html#exception-handling">many other functions</a> which have a similar effect), which basically works by +creating an exception value and storing it in some global variable. 
The +function then signals that an exception has occurred by returning an error value, +usually <tt class="docutils literal">NULL</tt>.<br> +On the other hand, in the PyPy interpreter, exceptions are propagated by raising the +RPython-level <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/interpreter/error.py#lines-20">OperationError</a> exception, which wraps the actual app-level +exception values. To harmonize the two worlds, whenever we return from C to +RPython, we need to check whether a C API exception was raised and if so turn it +into an <tt class="docutils literal">OperationError</tt>.<br> +We won't dig into details of <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-205">how the GIL is handled in cpyext</a>. +For the purpose of this post, it is enough to know that whenever we enter +C land, we store the current thread id into a global variable which is +accessible also from C; conversely, whenever we go back from C to RPython, we +restore this value to 0.<br> +Similarly, we need to do the inverse operations whenever you need to cross the +border between C and RPython, e.g. by calling a Python callback from C code.<br> +All this complexity is automatically handled by the RPython function +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-1757">generic_cpy_call</a>. 
If you look at the code you see that it takes care of 4 +things:<br> +<blockquote> +<ol class="arabic simple"> +<li>Handling the GIL as explained above.</li> +<li>Handling exceptions, if they are raised.</li> +<li>Converting arguments from <tt class="docutils literal">W_Root</tt> to <tt class="docutils literal">PyObject*</tt>.</li> +<li>Converting the return value from <tt class="docutils literal">PyObject*</tt> to <tt class="docutils literal">W_Root</tt>.</li> +</ol> +</blockquote> +So, we can see that calling C from RPython introduces some overhead. +Can we measure it?<br> +Assuming that the conversion between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt> has a +reasonable cost (as explained by the previous section), the overhead +introduced by a single border-cross is still acceptable, especially if the +callee is doing some non-negligible amount of work.<br> +However this is not always the case. There are basically three problems that +make (or used to make) <tt class="docutils literal">cpyext</tt> super slow:<br> +<blockquote> +<ol class="arabic simple"> +<li>Paying the border-crossing cost for trivial operations which are called +very often, such as <tt class="docutils literal">Py_INCREF</tt>.</li> +<li>Crossing the border back and forth many times, even if it's not strictly +needed.</li> +<li>Paying an excessive cost for argument and return value conversions.</li> +</ol> +</blockquote> +The next sections explain in more detail each of these problems.</div> +<div class="section" id="avoiding-unnecessary-roundtrips"> +<h1> +Avoiding unnecessary roundtrips</h1> +Prior to the <a class="reference external" href="https://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html">2017 Cape Town Sprint</a>, <tt class="docutils literal">cpyext</tt> was horribly slow, and we were +well aware of it: the main reason was that we never really paid too much +attention to performance. 
As explained in the blog post, emulating all the +CPython quirks is basically a nightmare, so better to concentrate on +correctness first.<br> +However, we didn't really know <strong>why</strong> it was so slow. We had theories and +assumptions, usually pointing at the cost of conversions between <tt class="docutils literal">W_Root</tt> +and <tt class="docutils literal">PyObject*</tt>, but we never actually measured it.<br> +So, we decided to write a set of <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">cpyext microbenchmarks</a> to measure the +performance of various operations. The result was somewhat surprising: the +theory suggests that when you do a cpyext C call, you should pay the +border-crossing costs only once, but what the profiler told us was that we +were paying the cost of <tt class="docutils literal">generic_cpy_call</tt> several times more than what we expected.<br> +After a bit of investigation, we discovered this was ultimately caused by our +"correctness-first" approach. For simplicity of development and testing, when +we started <tt class="docutils literal">cpyext</tt> we wrote everything in RPython: thus, every single API call +made from C (like the omnipresent <a class="reference external" href="https://docs.python.org/2/c-api/arg.html#c.PyArg_ParseTuple">PyArg_ParseTuple()</a>, <a class="reference external" href="https://docs.python.org/2/c-api/int.html#c.PyInt_AsLong">PyInt_AsLong()</a>, etc.) +had to cross back the C-to-RPython border. This was especially daunting for +very simple and frequent operations like <tt class="docutils literal">Py_INCREF</tt> and <tt class="docutils literal">Py_DECREF</tt>, +which CPython implements as a single assembly instruction!<br> +Another source of slow down was the implementation of <tt class="docutils literal">PyTypeObject</tt> slots. +At the C level, these are function pointers which the interpreter calls to do +certain operations, e.g. 
<a class="reference external" href="https://docs.python.org/2/c-api/typeobj.html#c.PyTypeObject.tp_new">tp_new</a> to allocate a new instance of that type.<br> +As usual, we have some magic to implement slots in RPython; in particular, +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/api.py#lines-362">_make_wrapper</a> does the opposite of <tt class="docutils literal">generic_cpy_call</tt>: it takes a +RPython function and wraps it into a C function which can be safely called +from C, handling the GIL, exceptions and argument conversions automatically.<br> +This was very handy during the development of cpyext, but it might result in +some bad nonsense; consider what happens when you call the following C +function:<br> +<pre class="code C literal-block"><span class="keyword">static</span> <span class="name">PyObject</span><span class="operator">*</span> <span class="name function">foo</span><span class="punctuation">(</span><span class="name">PyObject</span><span class="operator">*</span> <span class="name">self</span><span class="punctuation">,</span> <span class="name">PyObject</span><span class="operator">*</span> <span class="name">args</span><span class="punctuation">)</span> +<span class="punctuation">{</span> + <span class="name">PyObject</span><span class="operator">*</span> <span class="name">result</span> <span class="operator">=</span> <span class="name">PyInt_FromLong</span><span class="punctuation">(</span><span class="literal number integer">1234</span><span class="punctuation">);</span> + <span class="keyword">return</span> <span class="name">result</span><span class="punctuation">;</span> +<span class="punctuation">}</span> +</pre> +<ol class="arabic simple"> +<li>you are in RPython and do a cpyext call to <tt class="docutils literal">foo</tt>: <strong>RPython-to-C</strong>;</li> +<li><tt class="docutils literal">foo</tt> calls <tt class="docutils literal">PyInt_FromLong(1234)</tt>, which 
is implemented in RPython: +<strong>C-to-RPython</strong>;</li> +<li>the implementation of <tt class="docutils literal">PyInt_FromLong</tt> indirectly calls +<tt class="docutils literal">PyIntType.tp_new</tt>, which is a C function pointer: <strong>RPython-to-C</strong>;</li> +<li>however, <tt class="docutils literal">tp_new</tt> is just a wrapper around an RPython function, created +by <tt class="docutils literal">_make_wrapper</tt>: <strong>C-to-RPython</strong>;</li> +<li>finally, we create our RPython <tt class="docutils literal">W_IntObject(1234)</tt>; at some point +during the <strong>RPython-to-C</strong> crossing, its <tt class="docutils literal">PyObject*</tt> equivalent is +created;</li> +<li>after many layers of wrappers, we are again in <tt class="docutils literal">foo</tt>: after we do +<tt class="docutils literal">return result</tt>, during the <strong>C-to-RPython</strong> step we convert it from +<tt class="docutils literal">PyObject*</tt> to <tt class="docutils literal">W_IntObject(1234)</tt>.</li> +</ol> +Phew! After we realized this, it was not so surprising that <tt class="docutils literal">cpyext</tt> was very +slow :). And this was a simplified example, since we are not passing a +<tt class="docutils literal">PyObject*</tt> to the API call. When we do, we need to convert it back and +forth at every step. Actually, I am not even sure that what I described was +the exact sequence of steps which used to happen, but you get the general +idea.<br> +The solution is simple: rewrite as much as we can in C instead of RPython, +to avoid unnecessary roundtrips. 
This was the topic of most of the Cape Town +sprint and resulted in the <tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch, which was +eventually <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/cpyext_avoid-roundtrip">merged</a>.<br> +Of course, it is not possible to move <strong>everything</strong> to C: there are still +operations which need to be implemented in RPython. For example, think of +<tt class="docutils literal">PyList_Append</tt>: the logic to append an item to a list is complex and +involves list strategies, so we cannot replicate it in C. However, we +discovered that a large subset of the C API can benefit from this.<br> +Moreover, the C API is <strong>huge</strong>. While we invented this new way of writing +<tt class="docutils literal">cpyext</tt> code, we still need to +convert many of the functions to the new paradigm. Sometimes the rewrite is +not automatic +or straightforward. <tt class="docutils literal">cpyext</tt> is a delicate piece of software, so it happens often +that we make a mistake and end up staring at a segfault in gdb.<br> +However, the most important takeaway is that the performance improvements we got +from this optimization are impressive, as we will detail later.</div> +<div class="section" id="conversion-costs"> +<h1> +Conversion costs</h1> +The other potential big source of slowdown is the conversion of arguments +between <tt class="docutils literal">W_Root</tt> and <tt class="docutils literal">PyObject*</tt>.<br> +As explained earlier, the first time you pass a <tt class="docutils literal">W_Root</tt> to C, you need to +allocate its <tt class="docutils literal">PyObject*</tt> counterpart. 
Suppose you have a <tt class="docutils literal">foo</tt> function +defined in C, which takes a single int argument:<br> +<pre class="code python literal-block"><span class="keyword">for</span> <span class="name">i</span> <span class="operator word">in</span> <span class="name builtin">range</span><span class="punctuation">(</span><span class="name">N</span><span class="punctuation">):</span> + <span class="name">foo</span><span class="punctuation">(</span><span class="name">i</span><span class="punctuation">)</span> +</pre> +To run this code, you need to create a different <tt class="docutils literal">PyObject*</tt> for each value +of <tt class="docutils literal">i</tt>: if implemented naively, it means calling <tt class="docutils literal">N</tt> times <tt class="docutils literal">malloc()</tt> +and <tt class="docutils literal">free()</tt>, which kills performance.<br> +CPython has the very same problem, which is solved by using a <a class="reference external" href="https://en.wikipedia.org/wiki/Free_list">free list</a> to +<a class="reference external" href="https://github.com/python/cpython/blob/2.7/Objects/intobject.c#L16">allocate ints</a>. So, what we did was to simply <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/commit/d8754ab9ba6371c83eaeb80cdf8cc13a37ee0c89">steal the code</a> from CPython +and do the exact same thing. This was also done in the +<tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch, and the benchmarks show that it worked +perfectly.<br> +Every type which is converted often to <tt class="docutils literal">PyObject*</tt> must have a very fast +allocator. 
At the moment of writing, PyPy uses free lists only for ints and +<a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/commit/35e2fb9903f2483940d7970bd83ce8c65aa1c1a3">tuples</a>: one of the next steps on our TODO list is certainly to use this +technique with more types, like <tt class="docutils literal">float</tt>.<br> +Conversely, we also need to optimize the conversion from <tt class="docutils literal">PyObject*</tt> to +<tt class="docutils literal">W_Root</tt>: this happens when an object is originally allocated in C and +returned to Python. Consider for example the following code:<br> +<pre class="code python literal-block"><span class="keyword namespace">import</span> <span class="name namespace">numpy</span> <span class="keyword namespace">as</span> <span class="name namespace">np</span> +<span class="name">myarray</span> <span class="operator">=</span> <span class="name">np</span><span class="operator">.</span><span class="name">random</span><span class="operator">.</span><span class="name">random</span><span class="punctuation">(</span><span class="name">N</span><span class="punctuation">)</span> +<span class="keyword">for</span> <span class="name">i</span> <span class="operator word">in</span> <span class="name builtin">range</span><span class="punctuation">(</span><span class="name builtin">len</span><span class="punctuation">(</span><span class="name">myarray</span><span class="punctuation">)):</span> + <span class="name">myarray</span><span class="punctuation">[</span><span class="name">i</span><span class="punctuation">]</span> +</pre> +At every iteration, we get an item out of the array: the return type is an +instance of <tt class="docutils literal">numpy.float64</tt> (a numpy scalar), i.e. a <tt class="docutils literal">PyObject*</tt>: this is +something which is implemented by numpy entirely in C, so completely +opaque to <tt class="docutils literal">cpyext</tt>. 
We don't have any control on how it is allocated, +managed, etc., and we can assume that allocation costs are the same as on +CPython.<br> +As soon as we return these <tt class="docutils literal">PyObject*</tt> to Python, we need to allocate +their <tt class="docutils literal">W_Root</tt> equivalent. If you do it in a small loop like in the example +above, you end up allocating all these <tt class="docutils literal">W_Root</tt> inside the nursery, which is +a good thing since allocation is super fast (see the section above about the +PyPy GC).<br> +However, we also need to keep track of the <tt class="docutils literal">W_Root</tt> to <tt class="docutils literal">PyObject*</tt> link. +Currently, we do this by putting all of them in a dictionary, but it is very +inefficient, especially because most of these objects die young and thus it +is wasted work to do that for them. Currently, this is one of the biggest +unresolved problems in <tt class="docutils literal">cpyext</tt>, and it is what causes the two microbenchmarks +<tt class="docutils literal">allocate_int</tt> and <tt class="docutils literal">allocate_tuple</tt> to be very slow.<br> +We are well aware of the problem, and we have a plan for how to fix it. The +explanation is too technical for the scope of this blog post as it requires a +deep knowledge of the GC internals to be understood, but the details are +<a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/planning/cpyext.txt#L27">here</a>.</div> +<div class="section" id="c-api-quirks"> +<h1> +C API quirks</h1> +Finally, there is another source of slowdown which is beyond our control. Some +parts of the CPython C API are badly designed and expose some of the +implementation details of CPython.<br> +The major example is reference counting. 
The <tt class="docutils literal">Py_INCREF</tt> / <tt class="docutils literal">Py_DECREF</tt> API +is designed in a way that forces other implementations to emulate +refcounting even in the presence of other GC management schemes, as explained +above.<br> +Another example is borrowed references. There are API functions which <strong>do +not</strong> incref an object before returning it, e.g. <a class="reference external" href="https://docs.python.org/2/c-api/list.html#c.PyList_GetItem">PyList_GetItem()</a>. This is +done for performance reasons because we can avoid a whole incref/decref pair, +if the caller needs to handle the returned item only temporarily: the item is +kept alive because it is in the list anyway.<br> +For PyPy, this is a challenge: thanks to <a class="reference external" href="https://www.pypy.org/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html">list strategies</a>, lists are often +represented in a compact way. For example, a list containing only integers is +stored as a C array of <tt class="docutils literal">long</tt>. How to implement <tt class="docutils literal">PyList_GetItem</tt>? We +cannot simply create a <tt class="docutils literal">PyObject*</tt> on the fly, because the caller will never +decref it and it will result in a memory leak.<br> +The current solution is very inefficient. The first time we do a +<tt class="docutils literal">PyList_GetItem</tt>, we <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/py3.6/pypy/module/cpyext/listobject.py#lines-28">convert</a> the <strong>whole</strong> list to a list of +<tt class="docutils literal">PyObject*</tt>. This is bad in two ways: the first is that we potentially pay a +lot of unneeded conversion cost in case we will never access the other items +of the list. 
The second is that by doing that we lose all the performance +benefit granted by the original list strategy, making it slower for the +rest of the pure-python code which will manipulate the list later.<br> +<tt class="docutils literal">PyList_GetItem</tt> is an example of a bad API because it assumes that the list +is implemented as an array of <tt class="docutils literal">PyObject*</tt>: after all, in order to return a +borrowed reference, we need a reference to borrow, don't we?<br> +Fortunately, (some) CPython developers are aware of these problems, and there +is an ongoing project to <a class="reference external" href="https://pythoncapi.readthedocs.io/">design a better C API</a> which aims to fix exactly +this kind of problem.<br> +Nonetheless, in the meantime we still need to implement the current +half-broken APIs. There is no easy solution for that, and it is likely that +we will always need to pay some performance penalty in order to implement them +correctly.<br> +However, what we could potentially do is to provide alternative functions +which do the same job but are more PyPy friendly: for example, we could think +of implementing <tt class="docutils literal">PyList_GetItemNonBorrowed</tt> or something like that: then, C +extensions could choose to use it (possibly hidden inside some macro and +<tt class="docutils literal">#ifdef</tt>) if they want to be fast on PyPy.</div> +<div class="section" id="current-performance"> +<h1> +Current performance</h1> +During the whole blog post we claimed <tt class="docutils literal">cpyext</tt> is slow. How +slow it is, exactly?<br> +We decided to concentrate on <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> for now. 
It should be evident +by now there are simply too many issues which can slow down a <tt class="docutils literal">cpyext</tt> +program, and microbenchmarks help us to concentrate on one (or a few) at a +time.<br> +The microbenchmarks measure very simple things, like calling functions and +methods with the various calling conventions (no arguments, one argument, +multiple arguments); passing various types as arguments (to measure conversion +costs); allocating objects from C, and so on.<br> +Here are the results from the old PyPy 5.8 relative and normalized to CPython +2.7, the lower the better:<br> +<br> + + +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-5QV9jBfeXfo/W6UOCRA9YqI/AAAAAAAABX4/H2zgbv_XFQEHD4Lb2lj5Ve4Ob_YMuSXLwCLcBGAs/s1600/pypy58.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="480" src="https://4.bp.blogspot.com/-5QV9jBfeXfo/W6UOCRA9YqI/AAAAAAAABX4/H2zgbv_XFQEHD4Lb2lj5Ve4Ob_YMuSXLwCLcBGAs/s640/pypy58.png" width="640"></a></div> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://www.blogger.com/blogger.g?blogID=3971202189709462152" style="margin-left: 1em; margin-right: 1em;"></a></div> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://www.blogger.com/blogger.g?blogID=3971202189709462152" style="margin-left: 1em; margin-right: 1em;"></a></div> +<br> +PyPy was horribly slow everywhere, ranging from 2.5x to 10x slower. 
It is +particularly interesting to compare <tt class="docutils literal">simple.noargs</tt>, which measures the cost +of calling an empty function with no arguments, and <tt class="docutils literal">simple.onearg(i)</tt>, +which measures the cost of calling an empty function passing an integer argument: +the latter is ~2x slower than the former, indicating that the conversion cost +of integers is huge.<br> +PyPy 5.8 was the last release before the famous Cape Town sprint, when we +started to look at cpyext performance seriously. Here are the performance data for +PyPy 6.0, the latest release at the time of writing:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-MRkRoxtCeOE/W6UOL5txl1I/AAAAAAAABX8/i0ZiOyS2MOgiSyxFAyMOkKcB6xqjSihBACLcBGAs/s1600/pypy60.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="480" src="https://1.bp.blogspot.com/-MRkRoxtCeOE/W6UOL5txl1I/AAAAAAAABX8/i0ZiOyS2MOgiSyxFAyMOkKcB6xqjSihBACLcBGAs/s640/pypy60.png" width="640"></a></div> +<br> +<br> +The results are amazing! PyPy is now massively faster than before, and for +most benchmarks it is even faster than CPython: yes, you read it correctly: +PyPy is faster than CPython at doing CPython's job, even considering all the +extra work it has to do to emulate the C API. 
This happens thanks to the JIT, +which produces speedups high enough to counterbalance the slowdown caused by +cpyext.<br> +There are two microbenchmarks which are still slower though: <tt class="docutils literal">allocate_int</tt> +and <tt class="docutils literal">allocate_tuple</tt>, for the reasons explained in the section about +<a class="reference internal" href="https://www.blogger.com/blogger.g?blogID=3971202189709462152#conversion-costs">Conversion costs</a>.</div> +<div class="section" id="next-steps"> +<h1> +Next steps</h1> +Despite the spectacular results we got so far, <tt class="docutils literal">cpyext</tt> is still slow enough to +kill performance in most real-world code which uses C extensions extensively +(e.g., the omnipresent numpy).<br> +Our current approach is something along these lines:<br> +<blockquote> +<ol class="arabic simple"> +<li>run a real-world small benchmark which exercises cpyext</li> +<li>measure and find the major bottleneck</li> +<li>write a corresponding microbenchmark</li> +<li>optimize it</li> +<li>repeat</li> +</ol> +</blockquote> +On one hand, this is a daunting task because the C API is huge and we need to +tackle functions one by one. On the other hand, not all the functions are +equally important, and it is enough to optimize a relatively small subset to +improve many different use cases.<br> +Where a year ago we announced we have a working answer to run c-extension in +PyPy, we now have a clear picture of what are the performance bottlenecks, and +we have developed some technical solutions to fix them. It is "only" a matter +of tackling them, one by one. It is worth noting that most of the work was +done during two sprints, for a total of 2-3 person-months of work.<br> +We think this work is important for the Python ecosystem. PyPy has established +a baseline for performance in pure python code, providing an answer for the +"Python is slow" detractors. 
The techniques used to make <tt class="docutils literal">cpyext</tt> performant +will let PyPy become an alternative for people who mix C extensions with +Python, which, it turns out, is just about everyone, in particular those using +the various scientific libraries. Today, many developers are forced to seek +performance by converting code from Python to a lower language. We feel there +is no reason to do this, but in order to prove it we must be able to run both +their python and their C extensions performantly, then we can begin to educate +them how to write JIT-friendly code in the first place.<br> +We envision a future in which you can run arbitrary Python programs on PyPy, +with the JIT speeding up the pure Python parts and the C parts running as fast +as today: the best of both worlds!</div> +</div>cpyextprofilingspeedhttps://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.htmlFri, 21 Sep 2018 16:32:00 GMTHow to make your code 80 times fasterhttps://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.htmlAntonio Cuni<div class="document" id="how-to-make-your-code-80-times-faster"> +I often hear people who are happy because PyPy makes their code 2 times faster +or so. Here is a short personal story which shows PyPy can go well beyond +that.<br> +<br> +<strong>DISCLAIMER</strong>: this is not a silver bullet or a general recipe: it worked in +this particular case, it might not work so well in other cases. But I think it +is still an interesting technique. 
Moreover, the various steps and +implementations are showed in the same order as I tried them during the +development, so it is a real-life example of how to proceed when optimizing +for PyPy.<br> +<br> +Some months ago I <a class="reference external" href="https://github.com/antocuni/evolvingcopter">played a bit</a> with evolutionary algorithms: the ambitious +plan was to automatically evolve a logic which could control a (simulated) +quadcopter, i.e. a <a class="reference external" href="https://en.wikipedia.org/wiki/PID_controller">PID controller</a> (<strong>spoiler</strong>: it doesn't fly).<br> +<br> +The idea is to have an initial population of random creatures: at each +generation, the ones with the best fitness survive and reproduce with small, +random variations.<br> +<br> +However, for the scope of this post, the actual task at hand is not so +important, so let's jump straight to the code. To drive the quadcopter, a +<tt class="docutils literal">Creature</tt> has a <tt class="docutils literal">run_step</tt> method which runs at each <tt class="docutils literal">delta_t</tt> (<a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/creature.py">full +code</a>):<br> +<pre class="code python literal-block"><span class="keyword">class</span> <span class="name class">Creature</span><span class="punctuation">(</span><span class="name builtin">object</span><span class="punctuation">):</span> + <span class="name">INPUTS</span> <span class="operator">=</span> <span class="literal number integer">2</span> <span class="comment single"># z_setpoint, current z position</span> + <span class="name">OUTPUTS</span> <span class="operator">=</span> <span class="literal number integer">1</span> <span class="comment single"># PWM for all 4 motors</span> + <span class="name">STATE_VARS</span> <span class="operator">=</span> <span class="literal number integer">1</span> + <span class="operator">...</span> + + <span 
class="keyword">def</span> <span class="name function">run_step</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="name">inputs</span><span class="punctuation">):</span> + <span class="comment single"># state: [state_vars ... inputs]</span> + <span class="comment single"># out_values: [state_vars, ... outputs]</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">[</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">:]</span> <span class="operator">=</span> <span class="name">inputs</span> + <span class="name">out_values</span> <span class="operator">=</span> <span class="name">np</span><span class="operator">.</span><span class="name">dot</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="punctuation">,</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">)</span> <span class="operator">+</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">constant</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">state</span><span class="punctuation">[:</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">]</span> <span class="operator">=</span> <span class="name">out_values</span><span class="punctuation">[:</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">]</span> + <span class="name">outputs</span> <span class="operator">=</span> <span 
class="name">out_values</span><span class="punctuation">[</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">STATE_VARS</span><span class="punctuation">:]</span> + <span class="keyword">return</span> <span class="name">outputs</span> +</pre> +<ul class="simple"> +<li><tt class="docutils literal">inputs</tt> is a numpy array containing the desired setpoint and the current +position on the Z axis;</li> +<li><tt class="docutils literal">outputs</tt> is a numpy array containing the thrust to give to the motors. To +start easy, all the 4 motors are constrained to have the same thrust, so +that the quadcopter only travels up and down the Z axis;</li> +<li><tt class="docutils literal">self.state</tt> contains arbitrary values of unknown size which are passed from +one step to the next;</li> +<li><tt class="docutils literal">self.matrix</tt> and <tt class="docutils literal">self.constant</tt> contains the actual logic. By putting +the "right" values there, in theory we could get a perfectly tuned PID +controller. These are randomly mutated between generations.</li> +</ul> +<tt class="docutils literal">run_step</tt> is called at 100Hz (in the virtual time frame of the simulation). At each +generation, we test 500 creatures for a total of 12 virtual seconds each. So, +we have a total of 600,000 executions of <tt class="docutils literal">run_step</tt> at each generation.<br> +<br> +At first, I simply tried to run this code on CPython; here is the result:<br> +<pre class="code literal-block">$ python -m ev.main +Generation 1: ... [population = 500] [12.06 secs] +Generation 2: ... [population = 500] [6.13 secs] +Generation 3: ... [population = 500] [6.11 secs] +Generation 4: ... [population = 500] [6.09 secs] +Generation 5: ... [population = 500] [6.18 secs] +Generation 6: ... 
[population = 500] [6.26 secs] +</pre> +Which means ~6.15 seconds/generation, excluding the first.<br> +<br> +Then I tried with PyPy 5.9:<br> +<pre class="code literal-block">$ pypy -m ev.main +Generation 1: ... [population = 500] [63.90 secs] +Generation 2: ... [population = 500] [33.92 secs] +Generation 3: ... [population = 500] [34.21 secs] +Generation 4: ... [population = 500] [33.75 secs] +</pre> +Ouch! We are ~5.5x slower than CPython. This was kind of expected: numpy is +based on cpyext, which is infamously slow. (Actually, <a class="reference external" href="https://pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html">we are working on +that</a> and on the <tt class="docutils literal"><span class="pre">cpyext-avoid-roundtrip</span></tt> branch we are already faster than +CPython, but this will be the subject of another blog post.)<br> +<br> +So, let's try to avoid cpyext. The first obvious step is to use <a class="reference external" href="https://doc.pypy.org/en/latest/faq.html#what-about-numpy-numpypy-micronumpy">numpypy</a> +instead of numpy (actually, there is a <a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/pypycompat.py">hack</a> to use just the micronumpy +part). Let's see if the speed improves:<br> +<pre class="code literal-block">$ pypy -m ev.main # using numpypy +Generation 1: ... [population = 500] [5.60 secs] +Generation 2: ... [population = 500] [2.90 secs] +Generation 3: ... [population = 500] [2.78 secs] +Generation 4: ... [population = 500] [2.69 secs] +Generation 5: ... [population = 500] [2.72 secs] +Generation 6: ... [population = 500] [2.73 secs] +</pre> +So, ~2.7 seconds on average: this is 12x faster than PyPy+numpy, and more than +2x faster than the original CPython. 
At this point, most people would be happy +and go tweeting how PyPy is great.<br> +<br> +In general, when talking of CPython vs PyPy, I am rarely satisfied with a 2x +speedup: I know that PyPy can do much better than this, especially if you +write code which is specifically optimized for the JIT. For a real-life +example, have a look at <a class="reference external" href="https://capnpy.readthedocs.io/en/latest/benchmarks.html">capnpy benchmarks</a>, in which the PyPy version is +~15x faster than the heavily optimized CPython+Cython version (both have been +written by me, and I tried hard to write the fastest code for both +implementations).<br> +<br> +So, let's try to do better. As usual, the first thing to do is to profile and +see where we spend most of the time. Here is the <a class="reference external" href="https://vmprof.com/#/449ca8ee-3ab2-49d4-b6f0-9099987e9000">vmprof profile</a>. We spend a +lot of time inside the internals of numpypy, and allocating tons of temporary +arrays to store the results of the various operations.<br> +<br> +Also, let's look at the <a class="reference external" href="https://vmprof.com/#/28fd6e8f-f103-4bf4-a76a-4b65dbd637f4/traces">jit traces</a> and search for the function <tt class="docutils literal">run</tt>: +this is the loop in which we spend most of the time, and it is composed of 1796 +operations. The operations emitted for the line <tt class="docutils literal"><span class="pre">np.dot(...)</span> + +self.constant</tt> are listed between lines 1217 and 1456. 
Here is the excerpt +which calls <tt class="docutils literal"><span class="pre">np.dot(...)</span></tt>; most of the ops are cheap, but at line 1232 we +see a call to the RPython function <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/blob/release-pypy3.5-v5.10.0/pypy/module/micronumpy/ndarray.py#L1160">descr_dot</a>; by looking at the +implementation we see that it creates a new <tt class="docutils literal">W_NDimArray</tt> to store the +result, which means it has to do a <tt class="docutils literal">malloc()</tt>:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-_h6BuLTtEO8/Wfb6BXDg93I/AAAAAAAABNY/BY2XBg4ZtwokB9f1mWSmzI9gn_qanb81QCLcBGAs/s1600/2017-10-trace1.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="450" src="https://4.bp.blogspot.com/-_h6BuLTtEO8/Wfb6BXDg93I/AAAAAAAABNY/BY2XBg4ZtwokB9f1mWSmzI9gn_qanb81QCLcBGAs/s640/2017-10-trace1.png" width="640"></a></div> +<br> +The implementation of the <tt class="docutils literal">+ self.constant</tt> part is also interesting: +contrary to the former, the call to <tt class="docutils literal">W_NDimArray.descr_add</tt> has been inlined by +the JIT, so we have a better picture of what's happening; in particular, we +can see the call to <tt class="docutils literal">__0_alloc_with_del____</tt> which allocates the +<tt class="docutils literal">W_NDimArray</tt> for the result, and the <tt class="docutils literal">raw_malloc</tt> which allocates the +actual array. 
Then we have a long list of 149 simple operations which set the +fields of the resulting array, construct an iterator, and finally do a +<tt class="docutils literal">call_assembler</tt>: this is the actual logic to do the addition, which was +JITted independently; <tt class="docutils literal">call_assembler</tt> is one of the operations to do +JIT-to-JIT calls:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-vmo0pWharIU/Wfb3VfwHjxI/AAAAAAAABNE/a6Em09qZizwGiWJeTbGzKfHQH70dB7RKgCEwYBhgL/s1600/2017-10-trace2.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" height="640" src="https://1.bp.blogspot.com/-vmo0pWharIU/Wfb3VfwHjxI/AAAAAAAABNE/a6Em09qZizwGiWJeTbGzKfHQH70dB7RKgCEwYBhgL/s640/2017-10-trace2.png" width="625"></a></div> +<br> +All of this is very suboptimal: in this particular case, we know that the +shape of <tt class="docutils literal">self.matrix</tt> is always <tt class="docutils literal">(2, 3)</tt>: so, we are doing an incredible +amount of work, including calling <tt class="docutils literal">malloc()</tt> twice for the temporary arrays, just to +call two functions which ultimately do a total of 6 multiplications +and 6 additions. Note also that this is not a fault of the JIT: CPython+numpy +has to do the same amount of work, just hidden inside C calls.<br> +<br> +One possible solution to this nonsense is a well known compiler optimization: +loop unrolling. From the compiler point of view, unrolling the loop is always +risky because if the matrix is too big you might end up emitting a huge blob +of code, possibly useless if the shape of the matrices changes frequently: this +is the main reason why the PyPy JIT does not even try to do it in this case.<br> +<br> +However, we <strong>know</strong> that the matrix is small, and always of the same +shape. 
So, let's unroll the loop manually:<br> +<pre class="code python literal-block"><span class="keyword">class</span> <span class="name class">SpecializedCreature</span><span class="punctuation">(</span><span class="name">Creature</span><span class="punctuation">):</span> + + <span class="keyword">def</span> <span class="name function magic">__init__</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="operator">*</span><span class="name">args</span><span class="punctuation">,</span> <span class="operator">**</span><span class="name">kwargs</span><span class="punctuation">):</span> + <span class="name">Creature</span><span class="operator">.</span><span class="name function magic">__init__</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="operator">*</span><span class="name">args</span><span class="punctuation">,</span> <span class="operator">**</span><span class="name">kwargs</span><span class="punctuation">)</span> + <span class="comment single"># store the data in a plain Python list</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span> <span class="operator">=</span> <span class="name builtin">list</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="operator">.</span><span class="name">ravel</span><span class="punctuation">())</span> <span class="operator">+</span> <span class="name builtin">list</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">constant</span><span class="punctuation">)</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span> <span class="operator">=</span> <span 
class="punctuation">[</span><span class="literal number float">0.0</span><span class="punctuation">]</span> + <span class="keyword">assert</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">matrix</span><span class="operator">.</span><span class="name">shape</span> <span class="operator">==</span> <span class="punctuation">(</span><span class="literal number integer">2</span><span class="punctuation">,</span> <span class="literal number integer">3</span><span class="punctuation">)</span> + <span class="keyword">assert</span> <span class="name builtin">len</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span><span class="punctuation">)</span> <span class="operator">==</span> <span class="literal number integer">8</span> + + <span class="keyword">def</span> <span class="name function">run_step</span><span class="punctuation">(</span><span class="name builtin pseudo">self</span><span class="punctuation">,</span> <span class="name">inputs</span><span class="punctuation">):</span> + <span class="comment single"># state: [state_vars ... inputs]</span> + <span class="comment single"># out_values: [state_vars, ... 
outputs]</span> + <span class="name">k0</span><span class="punctuation">,</span> <span class="name">k1</span><span class="punctuation">,</span> <span class="name">k2</span><span class="punctuation">,</span> <span class="name">q0</span><span class="punctuation">,</span> <span class="name">q1</span><span class="punctuation">,</span> <span class="name">q2</span><span class="punctuation">,</span> <span class="name">c0</span><span class="punctuation">,</span> <span class="name">c1</span> <span class="operator">=</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data</span> + <span class="name">s0</span> <span class="operator">=</span> <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span><span class="punctuation">[</span><span class="literal number integer">0</span><span class="punctuation">]</span> + <span class="name">z_sp</span><span class="punctuation">,</span> <span class="name">z</span> <span class="operator">=</span> <span class="name">inputs</span> + <span class="comment single">#</span> + <span class="comment single"># compute the output</span> + <span class="name">out0</span> <span class="operator">=</span> <span class="name">s0</span><span class="operator">*</span><span class="name">k0</span> <span class="operator">+</span> <span class="name">z_sp</span><span class="operator">*</span><span class="name">k1</span> <span class="operator">+</span> <span class="name">z</span><span class="operator">*</span><span class="name">k2</span> <span class="operator">+</span> <span class="name">c0</span> + <span class="name">out1</span> <span class="operator">=</span> <span class="name">s0</span><span class="operator">*</span><span class="name">q0</span> <span class="operator">+</span> <span class="name">z_sp</span><span class="operator">*</span><span class="name">q1</span> <span class="operator">+</span> <span class="name">z</span><span 
class="operator">*</span><span class="name">q2</span> <span class="operator">+</span> <span class="name">c1</span> + <span class="comment single">#</span> + <span class="name builtin pseudo">self</span><span class="operator">.</span><span class="name">data_state</span><span class="punctuation">[</span><span class="literal number integer">0</span><span class="punctuation">]</span> <span class="operator">=</span> <span class="name">out0</span> + <span class="name">outputs</span> <span class="operator">=</span> <span class="punctuation">[</span><span class="name">out1</span><span class="punctuation">]</span> + <span class="keyword">return</span> <span class="name">outputs</span> +</pre> +In the <a class="reference external" href="https://github.com/antocuni/evolvingcopter/blob/master/ev/creature.py#L100">actual code</a> there is also a sanity check which asserts that the +computed output is the very same as the one returned by <tt class="docutils literal">Creature.run_step</tt>.<br> +<br> +So, let's try to see how it performs. First, with CPython:<br> +<pre class="code literal-block">$ python -m ev.main +Generation 1: ... [population = 500] [7.61 secs] +Generation 2: ... [population = 500] [3.96 secs] +Generation 3: ... [population = 500] [3.79 secs] +Generation 4: ... [population = 500] [3.74 secs] +Generation 5: ... [population = 500] [3.84 secs] +Generation 6: ... [population = 500] [3.69 secs] +</pre> +This looks good: 60% faster than the original CPython+numpy +implementation. Let's try on PyPy:<br> +<pre class="code literal-block">Generation 1: ... [population = 500] [0.39 secs] +Generation 2: ... [population = 500] [0.10 secs] +Generation 3: ... [population = 500] [0.11 secs] +Generation 4: ... [population = 500] [0.09 secs] +Generation 5: ... [population = 500] [0.08 secs] +Generation 6: ... [population = 500] [0.12 secs] +Generation 7: ... [population = 500] [0.09 secs] +Generation 8: ... [population = 500] [0.08 secs] +Generation 9: ... 
[population = 500] [0.08 secs] +Generation 10: ... [population = 500] [0.08 secs] +Generation 11: ... [population = 500] [0.08 secs] +Generation 12: ... [population = 500] [0.07 secs] +Generation 13: ... [population = 500] [0.07 secs] +Generation 14: ... [population = 500] [0.08 secs] +Generation 15: ... [population = 500] [0.07 secs] +</pre> +Yes, it's not an error. After a couple of generations, it stabilizes at around +~0.07-0.08 seconds per generation. This is around <strong>80 (eighty) times faster</strong> +than the original CPython+numpy implementation, and around 35-40x faster than +the naive PyPy+numpypy one.<br> +<br> +Let's look at the <a class="reference external" href="https://vmprof.com/#/402af746-2966-4403-a61d-93015abac033/traces">trace</a> again: it no longer contains expensive calls, and +certainly no more temporary <tt class="docutils literal">malloc()</tt> s. The core of the logic is between +lines 386-416, where we can see that it does fast C-level multiplications and +additions: <tt class="docutils literal">float_mul</tt> and <tt class="docutils literal">float_add</tt> are translated straight into +<tt class="docutils literal">mulsd</tt> and <tt class="docutils literal">addsd</tt> x86 instructions.<br> +<br> +As I said before, this is a very particular example, and the techniques +described here do not always apply: it is not realistic to expect an 80x +speedup on arbitrary code, unfortunately. However, it clearly shows the potential of PyPy when +it comes to high-speed computing. 
And most importantly, it's not a toy +benchmark which was designed specifically to have good performance on PyPy: +it's a real world example, albeit small.<br> +<br> +You might be also interested in the talk I gave at last EuroPython, in which I +talk about a similar topic: "The Joy of PyPy JIT: abstractions for free" +(<a class="reference external" href="https://ep2017.europython.eu/conference/talks/the-joy-of-pypy-jit-abstractions-for-free">abstract</a>, <a class="reference external" href="https://speakerdeck.com/antocuni/the-joy-of-pypy-jit-abstractions-for-free">slides</a> and <a class="reference external" href="https://www.youtube.com/watch?v=NQfpHQII2cU">video</a>).<br> +<br> +<div class="section" id="how-to-reproduce-the-results"> +<h3> +How to reproduce the results</h3> +<pre class="code literal-block">$ git clone https://github.com/antocuni/evolvingcopter +$ cd evolvingcopter +$ {python,pypy} -m ev.main --no-specialized --no-numpypy +$ {python,pypy} -m ev.main --no-specialized +$ {python,pypy} -m ev.main +</pre> +</div> +</div>jitprofilingspeedhttps://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.htmlMon, 30 Oct 2017 10:15:00 GMT(Cape of) Good Hope for PyPyhttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlAntonio Cuni<div> +<br></div> +Hello from the other side of the world (for most of you)!<br> +<br> +With the excuse of coming to <a class="reference external" href="https://za.pycon.org/">PyCon ZA</a> during the last two weeks Armin, +Ronan, Antonio and sometimes Maciek had a very nice and productive sprint in +Cape Town, as pictures show :). 
We would like to say a big thank you to +Kiwi.com, which sponsored part of the travel costs via its awesome <a class="reference external" href="https://www.kiwi.com/sourcelift/">Sourcelift</a> +program to help Open Source projects.<br> +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: right; margin-left: 1em; text-align: right;"><tbody> +<tr><td style="text-align: center;"><a href="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s1600/DSC_2418.JPG" style="margin-left: auto; margin-right: auto;"><img border="0" height="225" src="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s400/DSC_2418.JPG" width="400"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">Armin, Anto and Ronan at Cape Point</td></tr> +</tbody></table> +<br> +Armin, Ronan and Anto spent most of the time hacking at cpyext, our CPython +C-API compatibility layer: during the last years, the focus was to make it +working and compatible with CPython, in order to run existing libraries such +as numpy and pandas. However, we never paid too much attention to performance, +so the net result is that with the latest released version of PyPy, C +extensions generally work but their speed ranges from "slow" to "horribly +slow".<br> +<br> +For example, these very simple <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> measure the speed of +calling (empty) C functions, i.e. the time you spend to "cross the border" +between RPython and C. 
<i>(Note: this includes the time spent doing the loop in regular Python code.)</i> These are the results on CPython, on PyPy 5.8, and on +our newest in-progress version:<br> +<br> +<pre class="literal-block">$ python bench.py # CPython +noargs : 0.41 secs +onearg(None): 0.44 secs +onearg(i) : 0.44 secs +varargs : 0.58 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy-5.8 bench.py # PyPy 5.8 +noargs : 1.01 secs +onearg(None): 1.31 secs +onearg(i) : 2.57 secs +varargs : 2.79 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy bench.py # cpyext-refactor-methodobject branch +noargs : 0.17 secs +onearg(None): 0.21 secs +onearg(i) : 0.22 secs +varargs : 0.47 secs +</pre> +<div> +<br></div> +<pre class="literal-block"></pre> +<pre class="literal-block"></pre> +So yes: before the sprint, we were ~2-6x slower than CPython. Now, we are +<strong>faster</strong> than it! +To reach this result, we did various improvements, such as: +<br> +<blockquote> +<ol class="arabic simple"> +<li>teach the JIT how to look (a bit) inside the cpyext module;</li> +<li>write specialized code for calling <tt class="docutils literal">METH_NOARGS</tt>, <tt class="docutils literal">METH_O</tt> and +<tt class="docutils literal">METH_VARARGS</tt> functions; previously, we always used a very general and +slow logic;</li> +<li>implement freelists to allocate the cpyext versions of <tt class="docutils literal">int</tt> and +<tt class="docutils literal">tuple</tt> objects, as CPython does;</li> +<li>the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/merge_requests/573">cpyext-avoid-roundtrip</a> branch: crossing the RPython/C border is +slowish, but the real problem was (and still is for many cases) we often +cross it many times for no good reason. So, depending on the actual API +call, you might end up in the C land, which calls back into the RPython +land, which goes to C, etc. etc. 
(ad libitum).</li> +</ol> +</blockquote> +The branch tries to fix such nonsense: so far, we fixed only some cases, which +are enough to speed up the benchmarks shown above. But most importantly, we +now have a clear path and an actual plan to improve cpyext more and +more. Ideally, we would like to reach a point in which cpyext-intensive +programs run at worst at the same speed of CPython.<br> +<br> +The other big topic of the sprint was Armin and Maciej doing a lot of work on the +<a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/branch/unicode-utf8">unicode-utf8</a> branch: the goal of the branch is to always use UTF-8 as the +internal representation of unicode strings. The advantages are various: +<br> +<blockquote> +<ul class="simple"> +<li>decoding a UTF-8 stream is super fast, as you just need to check that the +stream is valid;</li> +<li>encoding to UTF-8 is almost a no-op;</li> +<li>UTF-8 is always more compact representation than the currently +used UCS-4. It's also almost always more compact than CPython 3.5 latin1/UCS2/UCS4 combo;</li> +<li>smaller representation means everything becomes quite a bit faster due to lower cache pressure.</li> +</ul> +</blockquote> +Before you ask: yes, this branch contains special logic to ensure that random +access of single unicode chars is still O(1), as it is on both CPython and the +current PyPy.<br> +We also plan to improve the speed of decoding even more by using modern processor features, like SSE and AVX. Preliminary results show that decoding can be done 100x faster than the current setup. +<br> +<br> +In summary, this was a long and profitable sprint, in which we achieved lots +of interesting results. 
However, what we liked even more was the privilege of +doing <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/a4307fb5912e">commits</a> from awesome places such as the top of Table Mountain:<br> +<br> +<blockquote class="twitter-tweet"> +<div dir="ltr" lang="en"> +Our sprint venue today <a href="https://twitter.com/hashtag/pypy?src=hash&amp;ref_src=twsrc%5Etfw">#pypy</a> <a href="https://t.co/o38IfTYmAV">pic.twitter.com/o38IfTYmAV</a></div> +— Ronan Lamy (@ronanlamy) <a href="https://twitter.com/ronanlamy/status/915575026107240449?ref_src=twsrc%5Etfw">4 ottobre 2017</a></blockquote> + + +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: left; margin-right: 1em; text-align: left;"><tbody> +<tr><td style="text-align: center;"><a href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" style="margin-left: auto; margin-right: auto;"><img border="0" height="360" src="https://bytebucket.org/pypy/extradoc/raw/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" width="640"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">The panorama we looked at instead of staring at cpyext code</td></tr> +</tbody></table>cpyextprofilingspeedsprintunicodehttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlWed, 18 Oct 2017 13:31:00 GMTUsing CPython extension modules with PyPy natively, or: PyPy can load .pyd files with CPyExt!https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlAlexander Schremmer<p>PyPy is now able to load +and run CPython extension modules (i.e. .pyd and .so files) natively by using the new CPyExt +subsystem. 
+Unlike the solution presented in <a class="reference external" href="https://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html">another blog post</a> (where extension modules like +numpy etc. were run on CPython and proxied through TCP), this solution does not require +a running CPython anymore. We do not achieve full binary compatibility +yet (like Ironclad), but recompiling the extension is generally enough.</p> +<p>The only prerequisite is that the necessary functions of the C API of CPython are already +implemented in PyPy. If you are a user or an author of a module and miss certain functions +in PyPy, we invite you to implement them. Up until now, a lot of people (including a lot of +new committers) have stepped up and implemented a few functions to get their favorite module +running. See the end of this post for a list of names.</p> +<p>Regarding speed, we tried the following: even though there is a bit of overhead when running +these modules, we could run the regular expression engine of CPython (<tt class="docutils literal"><span class="pre">_sre.so</span></tt>) and execute +the spambayes benchmark of the Unladen Swallow benchmark suite (cf. <a class="reference external" href="https://speed.pypy.org/">speed.pypy.org</a>) and +experience a speedup: +It became <em>two times faster</em> on pypy-c than with the built-in regular +expression engine of PyPy. From <a href="https://en.wikipedia.org/wiki/Amdahl%27s_law">Amdahl's Law</a> it follows that the <tt class="docutils literal"><span class="pre">_sre.so</span></tt> must run several +times faster than the built-in engine.</p> +<p>Currently pursued modules include PIL and others. Distutils support is nearly ready. 
+If you would like to participate or want information on how to use this new feature, come and join +our IRC channel <tt class="docutils literal"><span class="pre">#pypy</span></tt> on <a class="reference external" href="irc://irc.freenode.net/">freenode</a>.</p> +<p>Amaury Forgeot d'Arc and Alexander Schremmer</p> +<p>Further CPyExt Contributors:</p> +<ul><li>Alex Gaynor +</li><li>Benjamin Peterson +</li><li>Jean-Paul Calderone +</li><li>Maciej Fijalkowski +</li><li>Jan de Mooij +</li><li>Lucian Branescu Mihaila +</li><li>Andreas Stührk +</li><li>Zooko Wilcox-O Hearn</li></ul>cpyextCPythonextension modulesspeedhttps://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.htmlFri, 09 Apr 2010 22:56:00 GMTPyPy gets a new compilerhttps://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.htmlBenjamin Peterson<p>Today, I merged the parser-compiler branch, which I have been working on over the summer. It contained a total rewrite of both PyPy's Python parser and AST compiler. PyPy's old parser was (in)famous internally for being complicated and slow (with many algorithmic complexities greater than O(n)). The new parser is a simple as <a href="https://codespeak.net/viewvc/pypy/trunk/pypy/interpreter/pyparser/parser.py?view=markup">I could make it</a> LL(1) parser like CPython (though it doesn't share the hacks of CPython's parser).</p> + +<p>The new compiler is based on the <a href="https://doc.python.org/3.1/library/ast">Abstract Syntax Trees (AST) that CPython 2.5 introduced</a> instead of PyPy's old AST based on the <a href="https://doc.python.org/library/compiler">compiler package's</a>. This means that Python code running on PyPy will be able to use the same _ast interface as CPython. PyPy's _ast implementation supports AST features that CPython 2.6 added, including <a href="https://pythonic.pocoo.org/2008/3/29/ast-compilation-from-python">compiling modified AST to bytecode and executing it</a>. 
In this rewrite, some more obscure compiler features were added, too. For example, jumps in bytecode can now be greater than 65535 bytes! (That's like an if statement with 7000 lines of code in the body.)</p> + +<p>While the PyPy translation toolchain still has many obscure details and hacks, this merge completes the process of making the actual Python interpreter very clean. Hopefully, this will make adding new features much easier and make PyPy less frustrating to maintain as well as providing application level code with an improved AST interface!</p>compilerparserspeedhttps://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.htmlTue, 25 Aug 2009 16:05:00 GMTRoadmap for JIThttps://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.htmlMaciej Fijalkowski<p>Hello. +</p> +<p> +First a disclaimer. This post is more about plans for future than current +status. We usually try to write about things that we have done, because +it's much much easier to promise things than to actually make it happen, +but I think it's important enough to have some sort of roadmap. +</p> +<p> +In recent months we came to the point where the 5th generation of +JIT prototype was working as <a href="https://www.pypy.org/posts/2009/03/good-news-everyone-421421336094214242.html">nice</a> +or even a bit nicer than 1st one back in 2007. Someone might ask "so why +did you spend all this time without going forward?". And indeed, we spend +a lot of time moving sideways, but as posted, we also spent a lot of time +doing <a href="https://www.pypy.org/posts/2009/04/beta-for-110-released-4604559533184706699.html">some other things</a>, which are important as well. +The main advantage of current JIT incarnation is much much simpler than +the first one. Even I can comprehend it, which is much of an improvement :-) +</p> +<p> +So, the prototype is working and gives very nice speedups in range of 20-30x +over CPython. 
We're pretty confident this prototype will work and will +produce fast python interpreter eventually. So we decided that now we'll +work towards changing prototype into something stable and solid. This +might sound easy, but in fact it's not. Having stable assembler backend +and optimizations that keep semantics is not as easy as it might sound. +</p> +<p> +The current roadmap, as I see it, looks like as following: +</p> +<ul> +<li> Provide a JIT that does not speedup things, but produce assembler without + optimizations turned on, that is correct and able to run CPython's library + tests on a nightly basis. +</li> +<li> + Introduce simple optimizations, that should make above JIT a bit faster than + CPython. With optimizations disabled JIT is producing incredibly dumb + assembler, which is slower than correspoding C code, even with removal + of interpretation overhead (which is not very surprising). +</li> +<li> + Backport optimizations from JIT prototype, one by one, keeping an eye + on how they perform and making sure they don't break anything. +</li> +<li> + Create new optimizations, like speeding up attribute access. +</li> +<li> + Profit. +</li> +</ul> +<p> +This way, we can hopefully provide a working JIT, which gives fast python +interpreter, which is a bit harder than just a nice prototype. +</p> +<p> +Tell us what you think about this plan. +</p> +Cheers,<br> +fijal &amp; others.jitpypyroadmapspeedhttps://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.htmlTue, 21 Apr 2009 19:38:00 GMT \ No newline at end of file diff --git a/categories/sponsors.html b/categories/sponsors.html new file mode 100644 index 000000000..4d232bb84 --- /dev/null +++ b/categories/sponsors.html @@ -0,0 +1,132 @@ + + + + + +Posts about sponsors | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/sponsors.xml b/categories/sponsors.xml new file mode 100644 index 000000000..b79e708d7 --- /dev/null +++ b/categories/sponsors.xml @@ -0,0 +1,649 @@ + +PyPy (Posts about sponsors)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssPyPy for low-latency systemshttps://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.htmlAntonio Cuni<h1 class="title"> +PyPy for low-latency systems</h1> +Recently I have merged the gc-disable branch, introducing a couple of features +which are useful when you need to respond to certain events with the lowest +possible latency. This work has been kindly sponsored by <a class="reference external" href="https://www.gambitresearch.com/">Gambit Research</a> +(which, by the way, is a very cool and geeky place where to <a class="reference external" href="https://www.gambitresearch.com/jobs.html">work</a>, in case you +are interested). Note also that this is a very specialized use case, so these +features might not be useful for the average PyPy user, unless you have the +same problems as described here.<br> +<br> +The PyPy VM manages memory using a generational, moving Garbage Collector. +Periodically, the GC scans the whole heap to find unreachable objects and +frees the corresponding memory. Although at a first look this strategy might +sound expensive, in practice the total cost of memory management is far less +than e.g. on CPython, which is based on reference counting. While maybe +counter-intuitive, the main advantage of a non-refcount strategy is +that allocation is very fast (especially compared to malloc-based allocators), +and deallocation of objects which die young is basically for free. 
More +information about the PyPy GC is available <a class="reference external" href="https://pypy.readthedocs.io/en/latest/gc_info.html#incminimark">here</a>.<br> +<br> +As we said, the total cost of memory management is less on PyPy than on +CPython, and it's one of the reasons why PyPy is so fast. However, one big +disadvantage is that while on CPython the cost of memory management is spread +all over the execution of the program, on PyPy it is concentrated into GC +runs, causing observable pauses which interrupt the execution of the user +program.<br> +To avoid excessively long pauses, the PyPy GC has been using an <a class="reference external" href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">incremental +strategy</a> since 2013. The GC runs as a series of "steps", letting the user +program progress between each step.<br> +<br> +The following chart shows the behavior of a real-world, long-running process:<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://3.bp.blogspot.com/-44yKwUVK3BE/XC4X9XL4BII/AAAAAAAABbE/XdTCIoyA-eYxvxIgJhFHaKnzxjhoWStHQCEwYBhgL/s1600/gc-timing.png" style="margin-right: 1em;"><img border="0" height="246" src="https://3.bp.blogspot.com/-44yKwUVK3BE/XC4X9XL4BII/AAAAAAAABbE/XdTCIoyA-eYxvxIgJhFHaKnzxjhoWStHQCEwYBhgL/s640/gc-timing.png" width="640"></a></div> +<br> +<br> +The orange line shows the total memory used by the program, which +increases linearly while the program progresses. 
Every ~5 minutes, the GC +kicks in and the memory usage drops from ~5.2GB to ~2.8GB (this ratio is controlled +by the <a class="reference external" href="https://pypy.readthedocs.io/en/latest/gc_info.html#environment-variables">PYPY_GC_MAJOR_COLLECT</a> env variable).<br> +The purple line shows aggregated data about the GC timing: the whole +collection takes ~1400 individual steps over the course of ~1 minute: each +point represents the <strong>maximum</strong> time a single step took during the past 10 +seconds. Most steps take ~10-20 ms, although we see a horrible peak of ~100 ms +towards the end. We have not investigated yet what it is caused by, but we +suspect it is related to the deallocation of raw objects.<br> +<br> +These multi-millisecond pauses are a problem for systems where it is important +to respond to certain events with a latency which is both low and consistent. +If the GC kicks in at the wrong time, it might cause unacceptable pauses during +the collection cycle.<br> +<br> +Let's look again at our real-world example. This is a system which +continuously monitors an external stream; when a certain event occurs, we want +to take an action. The following chart shows the maximum time it takes to +complete one of such actions, aggregated every minute:<br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://4.bp.blogspot.com/-FO9uFHSqZzU/XC4YC8LZUpI/AAAAAAAABa8/B8ZOrEgbVJUHoO65wxvCMVpvciO_d_0TwCLcBGAs/s1600/normal-max.png" style="margin-right: 1em;"><img border="0" height="240" src="https://4.bp.blogspot.com/-FO9uFHSqZzU/XC4YC8LZUpI/AAAAAAAABa8/B8ZOrEgbVJUHoO65wxvCMVpvciO_d_0TwCLcBGAs/s640/normal-max.png" width="640"></a></div> +<br> +You can clearly see that the baseline response time is around ~20-30 +ms. However, we can also see periodic spikes around ~50-100 ms, with peaks up +to ~350-450 ms! 
After a bit of investigation, we concluded that most (although +not all) of the spikes were caused by the GC kicking in at the wrong time.<br> +<br> +The work I did in the <tt class="docutils literal"><span class="pre">gc-disable</span></tt> branch aims to fix this problem by +introducing <a class="reference external" href="https://pypy.readthedocs.io/en/latest/gc_info.html#semi-manual-gc-management">two new features</a> to the <tt class="docutils literal">gc</tt> module:<br> +<blockquote> +<ul class="simple"> +<li><tt class="docutils literal">gc.disable()</tt>, which previously only inhibited the execution of +finalizers without actually touching the GC, now disables the GC major +collections. After a call to it, you will see the memory usage grow +indefinitely.</li> +<li><tt class="docutils literal">gc.collect_step()</tt> is a new function which you can use to manually +execute a single incremental GC collection step.</li> +</ul> +</blockquote> +It is worth specifying that <tt class="docutils literal">gc.disable()</tt> disables <strong>only</strong> the major +collections, while minor collections still run. Moreover, thanks to the +JIT's virtuals, many objects with a short and predictable lifetime are not +allocated at all. The end result is that most objects with short lifetime are +still collected as usual, so the impact of <tt class="docutils literal">gc.disable()</tt> on memory growth +is not as bad as it could sound.<br> +<br> +Combining these two functions, it is possible to take control of the GC to +make sure it runs only when it is acceptable to do so. For an example of +usage, you can look at the implementation of a <a class="reference external" href="https://github.com/antocuni/pypytools/blob/master/pypytools/gc/custom.py">custom GC</a> inside <a class="reference external" href="https://pypi.org/project/pypytools/">pypytools</a>. 
+The peculiarity is that it also defines a "<tt class="docutils literal">with <span class="pre">nogc():"</span></tt> context manager +which you can use to mark performance-critical sections where the GC is not +allowed to run.<br> +<br> +The following chart compares the behavior of the default PyPy GC and the new +custom GC, after a careful placing of <tt class="docutils literal">nogc()</tt> sections:<br> +<br> +<div class="separator" style="clear: both; text-align: center;"> +<a href="https://1.bp.blogspot.com/-bGqs0WrOEBk/XC4YJN0uZfI/AAAAAAAABbA/4EXOASvy830IKBoTFtrnmY22Vyd_api-ACLcBGAs/s1600/nogc-max.png" style="margin-right: 1em;"><img border="0" height="242" src="https://1.bp.blogspot.com/-bGqs0WrOEBk/XC4YJN0uZfI/AAAAAAAABbA/4EXOASvy830IKBoTFtrnmY22Vyd_api-ACLcBGAs/s640/nogc-max.png" width="640"></a></div> +<br> +The yellow line is the same as before, while the purple line shows the new +system: almost all spikes have gone, and the baseline performance is about 10% +better. There is still one spike towards the end, but after some investigation +we concluded that it was <strong>not</strong> caused by the GC.<br> +<br> +Note that this does <strong>not</strong> mean that the whole program became magically +faster: we simply moved the GC pauses in some other place which is <strong>not</strong> +shown in the graph: in this specific use case this technique was useful +because it allowed us to shift the GC work in places where pauses are more +acceptable.<br> +<br> +All in all, a pretty big success, I think. 
These functionalities are already +available in the nightly builds of PyPy, and will be included in the next +release: take this as a New Year present :)<br> +<br> +Antonio Cuni and the PyPy teamgcsponsorshttps://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.htmlThu, 03 Jan 2019 14:21:00 GMTPyPy v5.8 releasedhttps://www.pypy.org/posts/2017/06/pypy-v58-released-739876359584854017.htmlmattip<div dir="ltr" style="text-align: left;"> +The PyPy team is proud to release both PyPy2.7 v5.8 (an interpreter supporting +Python 2.7 syntax), and a beta-quality PyPy3.5 v5.8 (an interpreter for Python +3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release. Note that PyPy3.5 supports Linux 64bit only for now.<br> +<br> +This new PyPy2.7 release includes the upstream stdlib version 2.7.13, and +PyPy3.5 includes the upstream stdlib version 3.5.3.<br> +<br> +We fixed critical bugs in the <a class="reference external" href="https://doc.pypy.org/en/latest/config/translation.gcrootfinder.html">shadowstack</a> rootfinder garbage collector +strategy that crashed multithreaded programs and very rarely showed up +even in single threaded programs.<br> +<br> +We added native PyPy support to profile frames in the <a class="reference external" href="https://vmprof.readthedocs.io/">vmprof</a> statistical +profiler.<br> +<br> +The <code class="docutils literal"><span class="pre">struct</span></code> module functions <code class="docutils literal"><span class="pre">pack*</span></code> and <code class="docutils literal"><span class="pre">unpack*</span></code> are now much faster, +especially on raw buffers and bytearrays. Microbenchmarks show a 2x to 10x +speedup. 
Thanks to <a class="reference external" href="https://gambitresearch.com/">Gambit Research</a> for sponsoring this work.<br> +<br> +This release adds (but disables by default) link-time optimization and +<a class="reference external" href="https://pythonfiles.wordpress.com/2017/05/12/enabling-profile-guided-optimizations-for-pypy">profile guided optimization</a> of the base interpreter, which may make +unjitted code run faster. To use these, translate with appropriate +<a class="reference external" href="https://doc.pypy.org/en/latest/config/commandline.html#general-translation-options">options</a>. Be aware of <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/2572/link-time-optimization-lto-disabled">issues with gcc toolchains</a>, though.<br> +<br> +Please let us know if your use case is slow, we have ideas how to make things +faster but need real-world examples (not micro-benchmarks) of problematic code.<br> +<br> +Work sponsored by a Mozilla <a class="reference external" href="https://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.html">grant</a> continues on PyPy3.5; numerous fixes from +CPython were ported to PyPy and PEP 489 was fully implemented. Of course the +bug fixes and performance enhancements mentioned above are part of both PyPy +2.7 and PyPy 3.5.<br> +<br> +<a class="reference external" href="https://cffi.readthedocs.io/en/latest/whatsnew.html">CFFI</a>, which is part of the PyPy release, has been updated to an unreleased 1.10.1, +improving an already great package for interfacing with C.<br> +<br> +Anyone using <a href="https://docs.scipy.org/doc/numpy-dev/release.html">NumPy 1.13.0</a>, must upgrade PyPy to this release since we implemented some previously missing C-API functionality. Many other c-extension modules now work with PyPy, let us know if yours does not.<br> +<br> +As always, this release fixed many issues and bugs raised by the +growing community of PyPy users. 
We strongly recommend updating.<br> +<br> +You can download the v5.8 release here:<br> +<blockquote> +<div> +<a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></div> +</blockquote> +We would like to thank our donors and contributors, and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: <a class="reference external" href="https://doc.pypy.org/en/latest/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org/">RPython</a> documentation +improvements, tweaking popular <a class="reference external" href="https://doc.pypy.org/en/latest/project-ideas.html#make-more-python-modules-pypy-friendly">modules</a> to run on PyPy, or general <a class="reference external" href="https://doc.pypy.org/en/latest/project-ideas.html">help</a> +with making RPython’s JIT even better.<br> +<br> +<h2 style="text-align: center;"> +What is PyPy?</h2> +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. 
It’s fast (<a class="reference external" href="https://speed.pypy.org/">PyPy and CPython 2.7.x</a> performance comparison) due to its integrated tracing JIT compiler.<br> +We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython can do for them.<br> +The PyPy 2.7 release supports:<br> +<blockquote> +<div> +<ul class="simple"> +<li><b>x86</b> machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)</li> +<li>newer <b>ARM</b> hardware (ARMv6 or ARMv7, with VFPv3) running Linux,</li> +<li>big- and little-endian variants of <b>PPC64</b> running Linux,</li> +<li><b>s390x</b> running Linux </li> +</ul> +</div> +</blockquote> +<br> +<h2 style="text-align: center;"> +What else is new?</h2> +<div style="text-align: left;"> +PyPy 5.7 was released in March, 2017.</div> +<div style="text-align: left;"> +There are many incremental improvements to RPython and PyPy, the complete listing is <a href="https://doc.pypy.org/en/latest/release-v5.8.0.html">here.</a></div> +<div style="text-align: left;"> +  </div> +Please update, and continue to help us make PyPy better.<br> +<br> +Cheers, The PyPy team<br> +<br></div>releasesponsorshttps://www.pypy.org/posts/2017/06/pypy-v58-released-739876359584854017.htmlThu, 08 Jun 2017 23:20:00 GMTPyPy gets funding from Mozilla for Python 3.5 supporthttps://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.htmlArmin Rigo<p>"Python 2.x versus Python 3.x": this is by now an old question. In the eyes of some people Python 2 is here to stay, and in the eyes of others Python has long been 3 only.</p> + +<p>PyPy's own position is that PyPy will support Python 2.7 forever---the RPython language in which PyPy is written is a subset of 2.7, and we have no plan to upgrade that. But at the same time, we want to support 3.x. 
This is particularly true now: a relatively recent development is that Python 3.5 seems to attract more and more people. The "switch" to Python 3.x might be starting to happen.</p> + +<p>Correspondingly, PyPy has been searching for a while for a way to support a larger-scale development effort. The goal is to support not just any old version of Python 3.x, but Python 3.5, as this seems to be the version that people are switching to. PyPy is close to supporting all of Python 3.3 now; but the list of what is new in Python <a href="https://docs.python.org/3/whatsnew/3.4.html">3.4</a> and <a href="https://docs.python.org/3/whatsnew/3.5.html">3.5</a> is far, far longer than anyone imagines. The long-term goal is also to get a version of "PyPy3" that is as good as "PyPy2" is, including its performance and its cpyext layer (CPython C API interoperability), for example.</p> + +<p>So, the end result: <a href="https://blog.mozilla.org/blog/2016/08/04/mozilla-awards-585000-to-nine-open-source-projects-in-q2-2016/">Mozilla recently decided to award $200,000</a> to <a href="https://baroquesoftware.com/">Baroque Software</a> to work on PyPy as part of its Mozilla Open Source Support (MOSS) initiative. This money will be used to implement the Python 3.5 features in PyPy. Within the next year, we plan to use the money to pay four core PyPy developers half-time to work on the missing features and on some of the big performance and cpyext issues. This should speed up the progress of catching up with Python 3.x significantly. 
We are extremely thankful to Mozilla for supporting us in this way, and will keep you updated on the progress via this blog.</p>sponsorshttps://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.htmlTue, 09 Aug 2016 16:38:00 GMTCouchbase contribution to PyPyhttps://www.pypy.org/posts/2014/10/couchbase-contribution-to-pypy-2360892117372790069.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> +<p>Hello everyone!</p> +<p>We always offer to put on the blog info about our sponsors who donate substantial amounts of money. So far most people decided to stay anonymous, so this is the first blog post describing our sponsor and his relationship to PyPy, hopefully not the last. We'll also publish a full blog post about the PSF-matched fundraiser soon. This is a guest post by Brent Woodruff from Couchbase.</p> +<p> +</p><div class="separator" style="clear: both; text-align: center;"><a href="https://www.couchbase.com/images/logo.svg" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://www.couchbase.com/images/logo.svg" width="300px"></a></div> + +<p> +Couchbase is a leading NoSQL document database that provides a flexible data model, high performance, scalability, and high availability. Couchbase is a commercially supported open source project. Visit us at <a href="https://www.couchbase.com">https://www.couchbase.com</a> and <a href="https://github.com/couchbase">https://github.com/couchbase</a>. +</p> +<p> +Couchbase Inc. donated $2000.00, and employees of Couchbase personally contributed a disclosed additional $230.00, towards Pypy progress during the September funding drive. These funds will see a match from the Python Software Foundation. +</p><p> +Pypy is primarily used by Couchbase employees to perform product analysis and troubleshooting using internally developed tools. 
Every customer of Couchbase benefits from the use of Pypy; both due to the rapid development provided by Python, and the speed of the resulting tools provided by the Pypy JIT interpreter. +</p><p> +“PyPy is great - it gave us a 4x speedup in our CPU-intensive internal application over CPython” +-Dave Rigby and Daniel Owen, Couchbase Engineers +</p> +<p> +Additionally, Couchbase has a preliminary <a href="https://github.com/couchbaselabs/couchbase-python-cffi">CFFI based Couchbase client</a> available for Pypy users. +</p> + +<br></div>sponsorshttps://www.pypy.org/posts/2014/10/couchbase-contribution-to-pypy-2360892117372790069.htmlTue, 14 Oct 2014 17:40:00 GMTpygame_cffi: pygame on PyPyhttps://www.pypy.org/posts/2014/03/pygamecffi-pygame-on-pypy-8679802461301121984.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> +<p>The Raspberry Pi aims to be a low-cost educational tool that anyone can use to learn about electronics and programming. Python and <a class="reference external" href="https://pygame.org/news.html">pygame</a> are included in the Pi's programming toolkit. And since last year, thanks in part to sponsorship from the <a class="reference external" href="https://www.raspberrypi.org/">Raspberry Pi Foundation</a>, PyPy also works on the Pi (read more <a class="reference external" href="https://www.pypy.org/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.html">here</a>).</p> +<p>With PyPy working on the Pi, game logic written in Python stands to gain an awesome performance boost. However, the original pygame is a Python C extension. This means it performs poorly on PyPy and negates any speedup in the Python parts of the game code.</p> +<p>One solution to making pygame games run faster on PyPy, and eventually on the Raspberry Pi, comes in the form of <a class="reference external" href="https://github.com/CTPUG/pygame_cffi">pygame_cffi</a>. 
pygame_cffi uses <a class="reference external" href="https://cffi.readthedocs.org/">CFFI</a> to wrap the underlying SDL library instead of a C extension. A few months ago, the Raspberry Pi Foundation sponsored a <a class="reference external" href="https://www.pypy.org/posts/2013/12/pygame-cffi-8991437796535033699.html">Cape Town Python User Group hackathon</a> to build a proof-of-concept pygame using CFFI. This hackathon was a success and it produced an early working version of pygame_cffi.</p> +<p>So for the last 5 weeks Raspberry Pi has been funding work on pygame_cffi. The goal was a complete implementation of the core modules. We also wanted benchmarks to illuminate performance differences between pygame_cffi on PyPy and pygame on CPython. We are happy to report that those goals were met. So without further ado, here's a rundown of what works.</p> +<div class="section" id="current-functionality"> +<h3>Current functionality</h3> +<ul class="simple"> +<li><a class="reference external" href="https://www.pygame.org/docs/ref/surface.html">Surfaces</a> support all the usual flags for SDL and OpenGL rendering (more about OpenGL <a class="reference internal" href="https://www.pypy.org/posts/2014/03/pygamecffi-pygame-on-pypy-8679802461301121984.html#pyopenglperformance">below</a>).</li> +<li>The graphics-related modules <a class="reference external" href="https://www.pygame.org/docs/ref/color.html">color</a>, <a class="reference external" href="https://www.pygame.org/docs/ref/display.html">display</a>, <a class="reference external" href="https://www.pygame.org/docs/ref/font.html">font</a> and <a class="reference external" href="https://www.pygame.org/docs/ref/image.html">image</a>, and parts of <a class="reference external" href="https://www.pygame.org/docs/ref/draw.html">draw</a> and <a class="reference external" href="https://www.pygame.org/docs/ref/transform.html">transform</a> are mostly complete.</li> +<li><a class="reference external" 
href="https://www.pygame.org/docs/ref/event.html">Events</a>! No <a class="reference external" href="https://www.pygame.org/docs/ref/fastevent.html">fastevent</a> module yet, though.</li> +<li>Mouse and keyboard functionality, as provided by the <a class="reference external" href="https://www.pygame.org/docs/ref/mouse.html">mouse</a> and <a class="reference external" href="https://www.pygame.org/docs/ref/key.html">key</a> modules, is complete.</li> +<li>Sound functionality, as provided by the <a class="reference external" href="https://www.pygame.org/docs/ref/mixer.html">mixer</a> and <a class="reference external" href="https://www.pygame.org/docs/ref/music.html">music</a> modules, is complete.</li> +<li>Miscellaneous modules, <a class="reference external" href="https://www.pygame.org/docs/ref/cursors.html">cursors</a>, <a class="reference external" href="https://www.pygame.org/docs/ref/rect.html">rect</a>, <a class="reference external" href="https://www.pygame.org/docs/ref/sprite.html">sprite</a> and <a class="reference external" href="https://www.pygame.org/docs/ref/time.html">time</a> are also complete.</li> +</ul> + +Invention screenshot: + +<div class="separator" style="clear: both; text-align: center;"><a href="https://3.bp.blogspot.com/-1ZVah86dW3s/UzL9ZhiDiKI/AAAAAAAABvI/kMO9Pnmq9FY/s1600/invention_screenshot.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://3.bp.blogspot.com/-1ZVah86dW3s/UzL9ZhiDiKI/AAAAAAAABvI/kMO9Pnmq9FY/s320/invention_screenshot.png"></a></div> + +Mutable mamba screenshot: + +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-JZzDhMwp43s/UzL9g4lktwI/AAAAAAAABvQ/WuCvtbCA3Lc/s1600/mutable_mamba_screenshot.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://2.bp.blogspot.com/-JZzDhMwp43s/UzL9g4lktwI/AAAAAAAABvQ/WuCvtbCA3Lc/s320/mutable_mamba_screenshot.png"></a></div> + +<p>With the above-mentioned functionality in place we could get 
10+ of the pygame examples to work, and a number of <a class="reference external" href="https://pyweek.org/">PyWeek</a> games. At the time of writing, if a game doesn't work it is most likely due to an unimplemented <a class="reference external" href="https://www.pygame.org/docs/ref/transform.html">transform</a> or <a class="reference external" href="https://www.pygame.org/docs/ref/draw.html">draw</a> function. That will be remedied soon.</p> +</div> +<div class="section" id="performance"> +<h3>Performance</h3> +<p>In terms of performance, pygame_cffi on PyPy is showing a lot of promise. It beats pygame on CPython by a significant margin in our events processing and collision detection benchmarks, while blit and fill benchmarks perform similarly. The pygame examples we checked also perform better.</p> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://3.bp.blogspot.com/-tSV6v3J5rwc/UzL-4CbkqCI/AAAAAAAABwQ/NFDuq4biNqY/s1600/collision_increase.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://3.bp.blogspot.com/-tSV6v3J5rwc/UzL-4CbkqCI/AAAAAAAABwQ/NFDuq4biNqY/s400/collision_increase.png"></a></div> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://1.bp.blogspot.com/-HJCdpeVHbj0/UzL-0e5eGMI/AAAAAAAABwI/3eKRVRpP45s/s1600/examples_bench.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://1.bp.blogspot.com/-HJCdpeVHbj0/UzL-0e5eGMI/AAAAAAAABwI/3eKRVRpP45s/s400/examples_bench.png"></a></div> + +<p>However, there is still work to be done to identify and eliminate bottlenecks. On the Raspberry Pi performance is markedly worse compared to pygame (barring collision detection). The PyWeek games we tested also performed slightly worse. 
Fortunately there is room for improvement in various places.</p> + +Invention &amp; Mutable Mamba (x86) + +<div class="separator" style="clear: both; text-align: center;"><a href="https://4.bp.blogspot.com/-jYdr73oj154/UzL-u4aAwWI/AAAAAAAABwA/cv_vNSFtb0Q/s1600/pyweek_games_bench.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://4.bp.blogspot.com/-jYdr73oj154/UzL-u4aAwWI/AAAAAAAABwA/cv_vNSFtb0Q/s400/pyweek_games_bench.png"></a></div> + +Standard pygame examples (Raspberry Pi) + +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-gd9KEHANb_I/UzL-oKCx5BI/AAAAAAAABv4/frssbcGhI9A/s1600/examples_bench_rasp.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://2.bp.blogspot.com/-gd9KEHANb_I/UzL-oKCx5BI/AAAAAAAABv4/frssbcGhI9A/s400/examples_bench_rasp.png"></a></div> + +<p>Here's a summary of some of the benchmarks. Relative speed refers to the frame rate obtained in pygame_cffi on PyPy relative to pygame on CPython.</p> +<table border="1" class="docutils"> +<colgroup> +<col width="76%"> +<col width="24%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Benchmark</th> +<th class="head">Relative speed (pypy speedup)</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>Events (x86)</td> +<td>1.41</td> +</tr> +<tr><td>Events (Pi)</td> +<td>0.58</td> +</tr> +<tr><td>N<sup>2</sup> collision detection on 100 sprites (x86)</td> +<td>4.14</td> +</tr> +<tr><td>N<sup>2</sup> collision detection on 100 sprites (Pi)</td> +<td>1.01</td> +</tr> +<tr><td>Blit 100 surfaces (x86)</td> +<td>1.06</td> +</tr> +<tr><td>Blit 100 surfaces (Pi)</td> +<td>0.60</td> +</tr> +<tr><td>Invention (x86)</td> +<td>0.95</td> +</tr> +<tr><td>Mutable Mamba (x86)</td> +<td>0.72</td> +</tr> +<tr><td>stars example (x86)</td> +<td>1.95</td> +</tr> +<tr><td>stars example (Pi)</td> +<td>0.84</td> +</tr> +</tbody> +</table> +<div class="section" id="opengl"> +<h2>OpenGL</h2> +<p 
id="pyopenglperformance">Some not-so-great news is that <a class="reference external" href="https://pyopengl.sourceforge.net/">PyOpenGL</a> performs poorly on PyPy since PyOpenGL uses ctypes. This translates into a nasty reduction in frame rate for games that use OpenGL surfaces. It might be worthwhile creating a CFFI-powered version of PyOpenGL as well.</p> +</div> +</div> +<div class="section" id="where-to-now"> +<h3>Where to now?</h3> +<p>Work on pygame_cffi is ongoing. Here are some things that are in the pipeline:</p> +<ul class="simple"> +<li>Get pygame_cffi on PyPy to a place where it is consistently faster than pygame on CPython.</li> +<li>Implement the remaining modules and functions, starting with <a class="reference external" href="https://www.pygame.org/docs/ref/draw.html">draw</a> and <a class="reference external" href="https://www.pygame.org/docs/ref/transform.html">transform</a>.</li> +<li>Improve test coverage.</li> +<li>Reduce the time it takes for CFFI to parse the cdef. This makes the initial pygame import slow.</li> +</ul> +<p>If you want to contribute you can find pygame_cffi <a class="reference external" href="https://github.com/CTPUG/pygame_cffi">on Github</a>. +Feel free to find us on #pypy on freenode or post issues on github.</p> +<p>Cheers,<br> +Rizmari Versfeld</p> +</div> +<br></div>sponsorshttps://www.pypy.org/posts/2014/03/pygamecffi-pygame-on-pypy-8679802461301121984.htmlWed, 26 Mar 2014 16:28:00 GMTPyPy 2.0 alpha for ARMhttps://www.pypy.org/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> + +<p>Hello.</p> +<p>We're pleased to announce an alpha release of PyPy 2.0 for ARM. This is mostly +a technology preview, as we know the JIT is not yet stable enough for the +full release. 
However please try your stuff on ARM and report back.</p> +<p>This is the first release that supports a range of ARM devices - anything with +ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard, Chromebook, +Cubieboard, etc.) that supports VFPv3 should work. We provide builds with +support for both ARM EABI variants: hard-float and some older operating +systems soft-float.</p> +<p>This release comes with a list of limitations, consider it alpha quality, +not suitable for production:</p> +<ul class="simple"> +<li>stackless support is missing.</li> +<li>assembler produced is not always correct, but we successfully managed to +run large parts of our extensive benchmark suite, so most stuff should work.</li> +</ul> +<p>You can download the PyPy 2.0 alpha ARM release here (including a deb for raspbian):</p> +<blockquote> +<a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></blockquote> +<p>Part of the work was sponsored by the <a class="reference external" href="https://www.raspberrypi.org/">Raspberry Pi foundation</a>.</p> +<div class="section" id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.3. It's fast due to its integrated tracing JIT compiler.</p> +<p>This release supports ARM machines running Linux 32bit. Both hard-float +<tt class="docutils literal">armhf</tt> and soft-float <tt class="docutils literal">armel</tt> builds are provided. <tt class="docutils literal">armhf</tt> builds are +created using the Raspberry Pi custom <a class="reference external" href="https://github.com/raspberrypi">cross-compilation toolchain</a> based on +gcc-arm-linux-gnueabihf and should work on ARMv6 and ARMv7 devices running at +least debian or ubuntu. <tt class="docutils literal">armel</tt> builds are built using gcc-arm-linux-gnueabi +toolchain provided by ubuntu and currently target ARMv7. 
If there is interest +in other builds, such as gnueabi for ARMv6 or without requiring a VFP let us +know in the comments or in IRC.</p> +</div> +<div class="section" id="benchmarks"> +<h3>Benchmarks</h3> +<p>Everybody loves benchmarks. Here is a table of our benchmark suite +(for ARM we don't provide it yet on <a class="reference external" href="https://speed.pypy.org">https://speed.pypy.org</a>, +unfortunately).</p> +<p>This is a comparison of Cortex A9 processor with 4M cache and Xeon W3580 with +8M of L3 cache. The set of benchmarks is a subset of what we run for +<a class="reference external" href="https://speed.pypy.org">https://speed.pypy.org</a> that finishes in reasonable time. The ARM machine +was provided by Calxeda. +Columns are respectively:</p> +<ul class="simple"> +<li>benchmark name</li> +<li>PyPy speedup over CPython on ARM (Cortex A9)</li> +<li>PyPy speedup over CPython on x86 (Xeon)</li> +<li>speedup on Xeon vs Cortex A9, as measured on CPython</li> +<li>speedup on Xeon vs Cortex A9, as measured on PyPy</li> +<li>relative speedup (how much bigger the x86 speedup is over ARM speedup)</li> +</ul> +<table border="1" class="docutils"> +<colgroup> +<col width="16%"> +<col width="18%"> +<col width="18%"> +<col width="15%"> +<col width="18%"> +<col width="14%"> +</colgroup> +<tbody valign="top"> +<tr><td>Benchmark</td> +<td>PyPy vs CPython (arm)</td> +<td>PyPy vs CPython (x86)</td> +<td>x86 vs arm (pypy)</td> +<td>x86 vs arm (cpython)</td> +<td>relative speedup</td> +</tr> +<tr><td>ai</td> +<td>3.61</td> +<td>3.16</td> +<td>7.70</td> +<td>8.82</td> +<td>0.87</td> +</tr> +<tr><td>bm_mako</td> +<td>3.41</td> +<td>2.11</td> +<td>8.56</td> +<td>13.82</td> +<td>0.62</td> +</tr> +<tr><td>chaos</td> +<td>21.82</td> +<td>17.80</td> +<td>6.93</td> +<td>8.50</td> +<td>0.82</td> +</tr> +<tr><td>crypto_pyaes</td> +<td>22.53</td> +<td>19.48</td> +<td>6.53</td> +<td>7.56</td> +<td>0.86</td> +</tr> +<tr><td>django</td> +<td>13.43</td> +<td>11.16</td> +<td>7.90</td> 
+<td>9.51</td> +<td>0.83</td> +</tr> +<tr><td>eparse</td> +<td>1.43</td> +<td>1.17</td> +<td>6.61</td> +<td>8.12</td> +<td>0.81</td> +</tr> +<tr><td>fannkuch</td> +<td>6.22</td> +<td>5.36</td> +<td>6.18</td> +<td>7.16</td> +<td>0.86</td> +</tr> +<tr><td>float</td> +<td>5.22</td> +<td>6.00</td> +<td>9.68</td> +<td>8.43</td> +<td>1.15</td> +</tr> +<tr><td>go</td> +<td>4.72</td> +<td>3.34</td> +<td>5.91</td> +<td>8.37</td> +<td>0.71</td> +</tr> +<tr><td>hexiom2</td> +<td>8.70</td> +<td>7.00</td> +<td>7.69</td> +<td>9.56</td> +<td>0.80</td> +</tr> +<tr><td>html5lib</td> +<td>2.35</td> +<td>2.13</td> +<td>6.59</td> +<td>7.26</td> +<td>0.91</td> +</tr> +<tr><td>json_bench</td> +<td>1.12</td> +<td>0.93</td> +<td>7.19</td> +<td>8.68</td> +<td>0.83</td> +</tr> +<tr><td>meteor-contest</td> +<td>2.13</td> +<td>1.68</td> +<td>5.95</td> +<td>7.54</td> +<td>0.79</td> +</tr> +<tr><td>nbody_modified</td> +<td>8.19</td> +<td>7.78</td> +<td>6.08</td> +<td>6.40</td> +<td>0.95</td> +</tr> +<tr><td>pidigits</td> +<td>1.27</td> +<td>0.95</td> +<td>14.67</td> +<td>19.66</td> +<td>0.75</td> +</tr> +<tr><td>pyflate-fast</td> +<td>3.30</td> +<td>3.57</td> +<td>10.64</td> +<td>9.84</td> +<td>1.08</td> +</tr> +<tr><td>raytrace-simple</td> +<td>46.41</td> +<td>29.00</td> +<td>5.14</td> +<td>8.23</td> +<td>0.62</td> +</tr> +<tr><td>richards</td> +<td>31.48</td> +<td>28.51</td> +<td>6.95</td> +<td>7.68</td> +<td>0.91</td> +</tr> +<tr><td>slowspitfire</td> +<td>1.28</td> +<td>1.14</td> +<td>5.91</td> +<td>6.61</td> +<td>0.89</td> +</tr> +<tr><td>spambayes</td> +<td>1.93</td> +<td>1.27</td> +<td>4.15</td> +<td>6.30</td> +<td>0.66</td> +</tr> +<tr><td>sphinx</td> +<td>1.01</td> +<td>1.05</td> +<td>7.76</td> +<td>7.45</td> +<td>1.04</td> +</tr> +<tr><td>spitfire</td> +<td>1.55</td> +<td>1.58</td> +<td>5.62</td> +<td>5.49</td> +<td>1.02</td> +</tr> +<tr><td>spitfire_cstringio</td> +<td>9.61</td> +<td>5.74</td> +<td>5.43</td> +<td>9.09</td> +<td>0.60</td> +</tr> +<tr><td>sympy_expand</td> 
+<td>1.42</td> +<td>0.97</td> +<td>3.86</td> +<td>5.66</td> +<td>0.68</td> +</tr> +<tr><td>sympy_integrate</td> +<td>1.60</td> +<td>0.95</td> +<td>4.24</td> +<td>7.12</td> +<td>0.60</td> +</tr> +<tr><td>sympy_str</td> +<td>0.72</td> +<td>0.48</td> +<td>3.68</td> +<td>5.56</td> +<td>0.66</td> +</tr> +<tr><td>sympy_sum</td> +<td>1.99</td> +<td>1.19</td> +<td>3.83</td> +<td>6.38</td> +<td>0.60</td> +</tr> +<tr><td>telco</td> +<td>14.28</td> +<td>9.36</td> +<td>3.94</td> +<td>6.02</td> +<td>0.66</td> +</tr> +<tr><td>twisted_iteration</td> +<td>11.60</td> +<td>7.33</td> +<td>6.04</td> +<td>9.55</td> +<td>0.63</td> +</tr> +<tr><td>twisted_names</td> +<td>3.68</td> +<td>2.83</td> +<td>5.01</td> +<td>6.50</td> +<td>0.77</td> +</tr> +<tr><td>twisted_pb</td> +<td>4.94</td> +<td>3.02</td> +<td>5.10</td> +<td>8.34</td> +<td>0.61</td> +</tr> +</tbody> +</table> +<p>It seems that Cortex A9, while significantly slower than Xeon, has higher +slowdowns with a large interpreter (CPython) than a JIT compiler (PyPy). This +comes as a surprise to me, especially that our ARM assembler is not nearly +as polished as our x86 assembler. As for the causes, various people mentioned +branch predictor, but I would not like to speculate without actually knowing.</p> +</div> +<div class="section" id="how-to-use-pypy"> +<h3>How to use PyPy?</h3> +<p>We suggest using PyPy from a <a class="reference external" href="https://www.virtualenv.org/en/latest/">virtualenv</a>. Once you have a virtualenv +installed, you can follow instructions from <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-using-virtualenv">pypy documentation</a> on how +to proceed. 
This document also covers other <a class="reference external" href="https://doc.pypy.org/en/latest/getting-started.html#installing-pypy">installation schemes</a>.</p> +<p>We would not recommend using PyPy on ARM in production just yet, +however the day of a stable PyPy ARM release is not far off.</p> +<p>Cheers,<br> +fijal, bivab, arigo and the whole PyPy team</p> +</div> +<br></div>armsponsorshttps://www.pypy.org/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.htmlTue, 07 May 2013 13:35:00 GMTA thank you to the PSFhttps://www.pypy.org/posts/2011/03/thank-you-to-psf-5934275567667314914.htmlMaciej Fijalkowski<p>This year's PyCon was an incredible time; several members of the PyPy team were +there, and we'll be blogging more about our experiences in the coming days. +However, we quickly wanted to extend a thank you to the <a class="reference external" href="https://www.python.org/psf/">Python Software +Foundation (PSF)</a>.</p> +<p>As you may have heard, on Friday morning at PyCon Jesse Noller handed the PyPy +team a check for $10,000, on behalf of the PSF. This was in recognition of our +success over the past few years in bringing PyPy from a research project +to a fast, compliant, production-ready Python implementation, and to allow us +to continue our work on making it faster and more up-to-date with upstream +version changes.</p> +<p>Beyond the large check, we're grateful for the endorsement this represents, +not only of our work on PyPy, but also of all alternative Python VMs. 
+The PSF has shifted its focus from representing just CPython to representing +the Python Language, regardless of its implementation, something we are very +appreciative of.</p> +<a href="https://3.bp.blogspot.com/-yLUKuyRgjdg/TYfklB5Jg4I/AAAAAAAABKM/_5Rv2thqzA0/s1600/pycon_cheque.jpg"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5586685187590816642" src="https://3.bp.blogspot.com/-yLUKuyRgjdg/TYfklB5Jg4I/AAAAAAAABKM/_5Rv2thqzA0/s320/pycon_cheque.jpg" style="display: block; margin: 0px auto 10px; text-align: center; cursor: pointer; cursor: hand; width: 320px; height: 269px;"></a> +<p>From left to right, PyPy people present at PyCon 2011: Maciej Fijałkowski, Armin Rigo, Alex Gaynor, Laura Creighton and Jacob Hallén</p> + +<p>Thank you, PSF.</p>sponsorshttps://www.pypy.org/posts/2011/03/thank-you-to-psf-5934275567667314914.htmlMon, 21 Mar 2011 23:50:00 GMT \ No newline at end of file diff --git a/categories/sprint.html b/categories/sprint.html new file mode 100644 index 000000000..75443c107 --- /dev/null +++ b/categories/sprint.html @@ -0,0 +1,120 @@ + + + + + +Posts about sprint | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/sprint.xml b/categories/sprint.xml new file mode 100644 index 000000000..4e45f0b3d --- /dev/null +++ b/categories/sprint.xml @@ -0,0 +1,192 @@ + +PyPy (Posts about sprint)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rss(Cape of) Good Hope for PyPyhttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlAntonio Cuni<div> +<br></div> +Hello from the other side of the world (for most of you)!<br> +<br> +With the excuse of coming to <a class="reference external" href="https://za.pycon.org/">PyCon ZA</a> during the last two weeks Armin, +Ronan, Antonio and sometimes Maciek had a very nice and productive sprint in +Cape Town, as pictures show :). We would like to say a big thank you to +Kiwi.com, which sponsored part of the travel costs via its awesome <a class="reference external" href="https://www.kiwi.com/sourcelift/">Sourcelift</a> +program to help Open Source projects.<br> +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: right; margin-left: 1em; text-align: right;"><tbody> +<tr><td style="text-align: center;"><a href="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s1600/DSC_2418.JPG" style="margin-left: auto; margin-right: auto;"><img border="0" height="225" src="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s400/DSC_2418.JPG" width="400"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">Armin, Anto and Ronan at Cape Point</td></tr> +</tbody></table> +<br> +Armin, Ronan and Anto spent most of the time hacking at cpyext, our CPython +C-API compatibility layer: during the last years, the focus was to make it +working and compatible with 
CPython, in order to run existing libraries such +as numpy and pandas. However, we never paid too much attention to performance, +so the net result is that with the latest released version of PyPy, C +extensions generally work but their speed ranges from "slow" to "horribly +slow".<br> +<br> +For example, these very simple <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> measure the speed of +calling (empty) C functions, i.e. the time you spend to "cross the border" +between RPython and C. <i>(Note: this includes the time spent doing the loop in regular Python code.)</i> These are the results on CPython, on PyPy 5.8, and on +our newest in-progress version:<br> +<br> +<pre class="literal-block">$ python bench.py # CPython +noargs : 0.41 secs +onearg(None): 0.44 secs +onearg(i) : 0.44 secs +varargs : 0.58 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy-5.8 bench.py # PyPy 5.8 +noargs : 1.01 secs +onearg(None): 1.31 secs +onearg(i) : 2.57 secs +varargs : 2.79 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy bench.py # cpyext-refactor-methodobject branch +noargs : 0.17 secs +onearg(None): 0.21 secs +onearg(i) : 0.22 secs +varargs : 0.47 secs +</pre> +<div> +<br></div> +<pre class="literal-block"></pre> +<pre class="literal-block"></pre> +So yes: before the sprint, we were ~2-6x slower than CPython. Now, we are +<strong>faster</strong> than it! 
+To reach this result, we did various improvements, such as: +<br> +<blockquote> +<ol class="arabic simple"> +<li>teach the JIT how to look (a bit) inside the cpyext module;</li> +<li>write specialized code for calling <tt class="docutils literal">METH_NOARGS</tt>, <tt class="docutils literal">METH_O</tt> and +<tt class="docutils literal">METH_VARARGS</tt> functions; previously, we always used a very general and +slow logic;</li> +<li>implement freelists to allocate the cpyext versions of <tt class="docutils literal">int</tt> and +<tt class="docutils literal">tuple</tt> objects, as CPython does;</li> +<li>the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/merge_requests/573">cpyext-avoid-roundtrip</a> branch: crossing the RPython/C border is +slowish, but the real problem was (and still is for many cases) we often +cross it many times for no good reason. So, depending on the actual API +call, you might end up in the C land, which calls back into the RPython +land, which goes to C, etc. etc. (ad libitum).</li> +</ol> +</blockquote> +The branch tries to fix such nonsense: so far, we fixed only some cases, which +are enough to speed up the benchmarks shown above. But most importantly, we +now have a clear path and an actual plan to improve cpyext more and +more. Ideally, we would like to reach a point in which cpyext-intensive +programs run at worst at the same speed of CPython.<br> +<br> +The other big topic of the sprint was Armin and Maciej doing a lot of work on the +<a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/branch/unicode-utf8">unicode-utf8</a> branch: the goal of the branch is to always use UTF-8 as the +internal representation of unicode strings. 
The advantages are various: +<br> +<blockquote> +<ul class="simple"> +<li>decoding a UTF-8 stream is super fast, as you just need to check that the +stream is valid;</li> +<li>encoding to UTF-8 is almost a no-op;</li> +<li>UTF-8 is always a more compact representation than the currently +used UCS-4. It's also almost always more compact than the CPython 3.5 latin1/UCS2/UCS4 combo;</li> +<li>smaller representation means everything becomes quite a bit faster due to lower cache pressure.</li> +</ul> +</blockquote> +Before you ask: yes, this branch contains special logic to ensure that random +access of single unicode chars is still O(1), as it is on both CPython and the +current PyPy.<br> +We also plan to improve the speed of decoding even more by using modern processor features, like SSE and AVX. Preliminary results show that decoding can be done 100x faster than the current setup. +<br> +<br> +In summary, this was a long and profitable sprint, in which we achieved lots +of interesting results. However, what we liked even more was the privilege of +doing <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/a4307fb5912e">commits</a> from awesome places such as the top of Table Mountain:<br> +<br> +<blockquote class="twitter-tweet"> +<div dir="ltr" lang="en"> +Our sprint venue today <a href="https://twitter.com/hashtag/pypy?src=hash&amp;ref_src=twsrc%5Etfw">#pypy</a> <a href="https://t.co/o38IfTYmAV">pic.twitter.com/o38IfTYmAV</a></div> +— Ronan Lamy (@ronanlamy) <a href="https://twitter.com/ronanlamy/status/915575026107240449?ref_src=twsrc%5Etfw">4 ottobre 2017</a></blockquote> + + +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: left; margin-right: 1em; text-align: left;"><tbody> +<tr><td style="text-align: center;"><a href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" style="margin-left: auto; margin-right: auto;"><img border="0" 
height="360" src="https://bytebucket.org/pypy/extradoc/raw/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" width="640"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">The panorama we looked at instead of staring at cpyext code</td></tr> +</tbody></table>cpyextprofilingspeedsprintunicodehttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlWed, 18 Oct 2017 13:31:00 GMTPyPy London Sprint (August 26 - September 1 2013)https://www.pypy.org/posts/2013/07/pypy-london-sprint-august-26-september-5156945690440578388.htmlCarl Friedrich Bolz-Tereick<p>The next PyPy sprint will be in London, United Kingdom for the first +time. This is a fully public sprint. PyPy sprints are a very good way +to get into PyPy development and no prior PyPy knowledge is necessary.</p> +<h2>Goals and topics of the sprint</h2> +<p>For newcomers:</p> +<ul class="simple"> +<li>bring your application/library and we'll help you port it to PyPy, +benchmark and profile</li> +<li>come and write your favorite missing numpy function</li> +<li>help us work on developer tools like jitviewer</li> +</ul> +<p>We'll also work on:</p> +<ul class="simple"> +<li>refactoring the JIT optimizations</li> +<li>STM and STM-related topics</li> +<li>anything else attendees are interested in</li> +</ul> +<h2>Exact times</h2> +<p>The work days should be August 26 - September 1 2013 (Monday-Sunday). +The official plans are for people to arrive on the 26th, and +to leave on the 2nd. There will be a break day in the middle. +We'll typically start at 10:00 in the morning.</p> +<h2>Location</h2> +<p>The sprint will happen within a room of <a class="reference external" href="https://www.kcl.ac.uk/">King's College's</a> <a class="reference external" href="https://www.kcl.ac.uk/campuslife/campuses/strand/StrandCampusLocation.aspx">Strand +Campus</a> in <a class="reference external" href="https://goo.gl/maps/Qz0zz">Central London, UK</a>. 
There are some travel instructions <a class="reference external" href="https://www.kcl.ac.uk/campuslife/campuses/directions/strand.aspx">how to +get there</a>. We are being hosted by <a class="reference external" href="https://tratt.net/laurie">Laurence Tratt</a> and the <a class="reference external" href="https://soft-dev.org">Software +Development Team</a>.</p> +<h2>Demo Session</h2> +<p>If you don't want to come to the full sprint, but still want to chat a +bit, we are planning to have a demo session on Tuesday August 27. We +will announce this separately on the blog. If you are interested, please +leave a comment.</p> +<h2>Registration</h2> +<p>If you want to attend, please register by adding yourself to the +"people.txt" file in Mercurial:</p> +<pre class="literal-block"> +https://bitbucket.org/pypy/extradoc/ +https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/london-2013 +</pre> +<p>or on the pypy-dev mailing list if you do not yet have check-in rights:</p> +<pre class="literal-block"> +https://mail.python.org/mailman/listinfo/pypy-dev +</pre> +<p>Remember that you may need a (insert country here)-to-UK power adapter. +Please note that UK is not within the Schengen zone, so non-EU and +non-Switzerland citizens may require specific visa. Please check <a class="reference external" href="https://www.ukba.homeoffice.gov.uk/visas-immigration/do-you-need-a-visa/">travel +regulations</a>. Also, the UK uses pound sterling (GBP).</p>sprinthttps://www.pypy.org/posts/2013/07/pypy-london-sprint-august-26-september-5156945690440578388.htmlFri, 19 Jul 2013 14:58:00 GMTPyPy Winter Sprint Reporthttps://www.pypy.org/posts/2011/02/pypy-winter-sprint-report-4155886720346408516.htmlMichael Foord<p>A few weeks ago I had the great fortune to attend the PyPy winter sprint in Leysin Switzerland. I've wanted to contribute to PyPy for a long time and I thought diving into a sprint might be a good way to get familiar with some of the code. 
What I wasn't expecting was to be using RPython to implement new methods on built-in Python objects on the first day. The main thing I took away from the sprint was just how easy it is to get involved in developing PyPy (well, some bits of it at least and being surrounded by core developers helps). I wrote up a very short description of how to get started <a href="https://bitbucket.org/pypy/pypy/wiki/How%20to%20run%20lib-python%20tests">here</a>, but I'll do a longer blog post with examples on <a href="https://www.voidspace.org.uk/python/weblog/">my own blog</a> soon(ish).<br> +<br> +The sprint was kicked off by Armin merging the "fast-forward" branch of PyPy onto trunk. "fast-forward" brings PyPy from Python 2.5 compatibility to Python 2.7. Along with this it brought a large number of test failures, as the sterling work done by Benjamin Peterson and Amaury Forgeot d'Arc was not complete. This immediately set the primary sprint goal to reduce the number of test failures.<br> +<br> +We made a great deal of progress on this front, and you can see how close PyPy is now from the <a href="https://buildbot.pypy.org/summary?branch=%3Ctrunk%3E">buildbots</a>.<br> +<br> +Jacob Hallén and I started working through the list of tests with failures alphabetically. We made short work of test_asyncore and moved onto test_bytes where I was stuck for the rest of the sprint. I spent much of the remaining days working with Laura Creighton on the pypy bytearray implementation to make it more compatible with Python 2.7. This meant adding new methods, changing some of the Python protocol method implementations and even changing the way that bytearray is constructed. All in all great fun and a great introduction to working with RPython.<br> +<br> +A big part of the compatibility with Python 2.7 work was done by Laura and Armin who basically rewrote the math module from scratch. This was needed to incorporate all the improvements made (mostly by Mark Dickinson) in CPython in 2.7. 
That involved a lot of head-scratching about such subtleties as whether -0.0 should be considered almost equal to 0.0 and other fun problems.<br> +<span id="goog_788025148"></span><span id="goog_788025149"></span><br> +<br> +</p><table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"><span class="Apple-style-span" style="margin-left: auto; margin-right: auto;"><img border="0" height="239" src="https://4.bp.blogspot.com/-mtUgzR-TwUA/TVkXkIjqmXI/AAAAAAAAAVc/bbynq2Dwmg8/s320/first-meal.jpg" width="320"></span></td></tr> +<tr><td class="tr-caption" style="text-align: center;"><a href="https://www.flickr.com/photos/mfoord/sets/72157625889973066/">The first meal together, before everyone had arrived</a></td></tr> +</tbody></table> +If you add on top of this the wonderful people, the beautiful scenery, the Swiss cheese fondues, managing to not kill myself with a days skiing and traditional pypy card games, I can heartily recommend pypy sprints as a close approximation of geek nirvana.<br> +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"><span class="Apple-style-span" style="margin-left: auto; margin-right: auto;"><img border="0" height="239" src="https://4.bp.blogspot.com/-qP95S6g9X9k/TVkYJKNYTQI/AAAAAAAAAVg/Pm3q36yMiLY/s320/mountains.jpg" width="320"></span></td></tr> +<tr><td class="tr-caption" style="text-align: center;"><a href="https://www.flickr.com/photos/mfoord/sets/72157625889973066/">View of the mountains from the sprint</a></td></tr> +</tbody></table> +<br> +Working on 2.7 compatibility wasn't the only work that happened during the sprint. Other activities included:<br> +<ul> +<li>Antonio Cuni worked on the "jittypes" branch. 
This is a reimplementation of the core of the PyPy ctypes code to make it jittable. The goal is that for common cases the jit should be able to turn ctypes calls from Python into direct C level calls. This work was not completed but very close and is great for the future of integrating C libraries with PyPy. As ctypes is also available in CPython and IronPython, and hopefully will be available in Jython soon, integrating C code with Python through ctypes is the most "implementation portable" technique.</li> +<li>David Schneider continued his work on the <a href="https://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html">JIT backend for ARM</a>. PyPy has been cross-compilable to ARM for a long time, but bringing the JIT to ARM will provide a *fast* PyPy for ARM, which includes platforms like Android. Again David didn't complete this work but did complete the float support.</li> +<li>Håkan Ardo was present for two days and continued his crazy-clever work on JIT optimisations, some of which are described in the <a href="https://www.pypy.org/posts/2011/01/loop-invariant-code-motion-1998392217676829154.html">Loop invariant code motion</a> blog entry.</li> +<li>Holger Krekel worked on updating the PyPy test suite to the latest version of py.test and also worked with me on the interminable bytearray changes for part of the sprint.</li> +<li>No one was sure what  Maciej Fijałkowski worked on but he seemed to be quite busy.</li> +</ul> +I think that was most of the work done during the actual sprint. There was also a great deal of healthy discussion about the future of PyPy. 
Expect lots more interesting and exciting developments over the coming year.<br> +<br>sprinthttps://www.pypy.org/posts/2011/02/pypy-winter-sprint-report-4155886720346408516.htmlMon, 14 Feb 2011 12:05:00 GMT \ No newline at end of file diff --git a/categories/sprints.html b/categories/sprints.html new file mode 100644 index 000000000..1f16d00bf --- /dev/null +++ b/categories/sprints.html @@ -0,0 +1,114 @@ + + + + + +Posts about sprints | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/sprints.xml b/categories/sprints.xml new file mode 100644 index 000000000..69e588b89 --- /dev/null +++ b/categories/sprints.xml @@ -0,0 +1,59 @@ + +PyPy (Posts about sprints)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssDüsseldorf HPy/PyPy/GraalPy sprint September 19-23rd 2022https://www.pypy.org/posts/2022/07/ddorf-sprint-sep-2022.htmlCarl Friedrich Bolz-Tereick<p>The programming language group of the Computer Science department of +Heinrich-Heine Universität Düsseldorf is happy to invite everybody to another +sprint in Düsseldorf, from the 19th to the 23rd of September 2022. This is a +fully public sprint, everyone and particularly newcomers are welcome to join +us! The goal is to bring together people from the <a class="reference external" href="https://hpyproject.org/">HPy</a>, PyPy, <a class="reference external" href="https://www.graalvm.org/python/">GraalPy</a> and +CPython communities.</p> +<section id="topics-and-goals"> +<h2>Topics and goals</h2> +<ul class="simple"> +<li><p>work on HPy APIs, discussions around next steps for the project</p></li> +<li><p>continuing new and ongoing ports to HPy, including Cython, NumPy, Pillow, Matplotlib</p></li> +<li><p>3.10 support on PyPy and GraalPy</p></li> +<li><p>preparing the next PyPy release</p></li> +<li><p>discussions around ways to improve collaboration between the different Python +implementations</p></li> +</ul> +</section> +<section id="what-is-a-sprint"> +<h2>What is a sprint?</h2> +<p>The experience of the PyPy project has shown the benefits of regular +sprints. They are focussed one week physical meetings where people pair-program +on new features and discuss future plans. 
Coming to one is a great way to get +started with a project!</p> +</section> +<section id="location"> +<h2>Location</h2> +<p>The sprint will take place in a seminar room of the computer science +department. It is in the building 25.12, room 02.50 (second floor) of the +university campus. For travel instructions see</p> +<blockquote> +<p><a class="reference external" href="https://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/kontakt/service/lage-und-anreise">https://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/kontakt/service/lage-und-anreise</a></p> +</blockquote> +<p>We ask participants to wear masks during the indoor working hours.</p> +<figure> +<a class="reference external image-reference" href="https://commons.wikimedia.org/wiki/File:Universitaets-_und_Landesbibliothek_Duesseldorf_in_Duesseldorf-Bilk,_von_Nordwesten.jpg"> +<img alt="Photograph of the statue of Heinrich Heine in front of the University library on the campus in Düsseldorf" src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/00/Universitaets-_und_Landesbibliothek_Duesseldorf_in_Duesseldorf-Bilk%2C_von_Nordwesten.jpg/640px-Universitaets-_und_Landesbibliothek_Duesseldorf_in_Duesseldorf-Bilk%2C_von_Nordwesten.jpg"> +</a> +<figcaption> +<p>Wiegels, CC BY 3.0, via Wikimedia Commons</p> +</figcaption> +</figure> +</section> +<section id="exact-times"> +<h2>Exact times</h2> +<p>Work days: starting September 19th (~morning), ending September 23rd (~afternoon). 
+We will do a to-be-planned social activity on Wednesday afternoon.</p> +</section> +<section id="registration"> +<h2>Registration</h2> +<p>Please register by editing this file or by opening a <a class="reference external" href="https://doc.pypy.org/en/latest/coding-guide.html">pull request</a>:</p> +<blockquote> +<p><a class="reference external" href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/ddorf2022/people.txt">https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/ddorf2022/people.txt</a></p> +</blockquote> +<p>or by sending a quick mail to the pypy-dev mailing list:</p> +<blockquote> +<p><a class="reference external" href="http://mail.python.org/mailman/listinfo/pypy-dev">http://mail.python.org/mailman/listinfo/pypy-dev</a></p> +</blockquote> +</section>sprintshttps://www.pypy.org/posts/2022/07/ddorf-sprint-sep-2022.htmlFri, 29 Jul 2022 12:00:00 GMT \ No newline at end of file diff --git a/categories/squeak.html b/categories/squeak.html new file mode 100644 index 000000000..d5ef6cbf3 --- /dev/null +++ b/categories/squeak.html @@ -0,0 +1,114 @@ + + + + + +Posts about Squeak | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/squeak.xml b/categories/squeak.xml new file mode 100644 index 000000000..5a9c45d37 --- /dev/null +++ b/categories/squeak.xml @@ -0,0 +1,122 @@ + +PyPy (Posts about Squeak)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssA Field Test of Software Transactional Memory Using the RSqueak Smalltalk VMhttps://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.htmlCarl Friedrich Bolz-Tereick<h2> +Extending the Smalltalk RSqueakVM with STM</h2> +<p>by Conrad Calmez, Hubert Hesse, Patrick Rein and Malte Swart supervised by Tim Felgentreff and Tobias Pape</p> +<h2> +Introduction</h2> +<p>After pypy-stm we can announce that through the <a href="https://bitbucket.org/pypy/lang-smalltalk">RSqueakVM</a> (which used to be called <em>SPyVM</em>) a second VM implementation supports software transactional memory. RSqueakVM is a Smalltalk implementation based on the RPython toolchain. We have added STM support based on the <a href="https://www.pypy.org/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html">STM tools from RPython (rstm)</a>. The benchmarks indicate that linear scale up is possible, however in some situations the STM overhead limits speedup.</p> +<p>The work was done as a master's project at the <a href="https://www.hpi.uni-potsdam.de/hirschfeld/">Software Architechture Group</a> of Professor Robert Hirschfeld at at the <a href="https://hpi.de/">Hasso Plattner Institut</a> at the <a href="https://www.uni-potsdam.de/">University of Potsdam</a>. We - four students - worked about one and a half days per week for four months on the topic. The RSqueakVM was <a href="https://pypysqueak.blogspot.de/2007/10/first-day-discussions.html">originally developped during a sprint at the University of Bern</a>. 
When we started the project we were new to the topic of building VMs / interpreters.</p> +<p>We would like to thank Armin, Remi and the #pypy IRC channel who supported us over the course of our project. We would also like to thank Toni Mattis and Eric Seckler, who have provided us with an <a href="https://bitbucket.org/amintos/lang-smalltalk">initial code base</a>.</p> +<h2 id="introduction-to-rsqueakvm"> +Introduction to RSqueakVM</h2> +<p>As the original Smalltalk implementation, the RSqueakVM executes a given Squeak Smalltalk image, containing the Smalltalk code and a snapshot of formerly created objects and active execution contexts. These execution contexts are scheduled inside the image (greenlets) and not mapped to OS threads. Thereby the non-STM RSqueakVM runs on only one OS thread.</p> +<h2 id="changes-to-rsqueakvm"> +Changes to RSqueakVM</h2> +<p>The core adjustments to support STM were inside the VM and transparent from the view of a Smalltalk user. Additionally we added Smalltalk code to influence the behavior of the STM. As the RSqueakVM has run in one OS thread so far, we added the capability to start OS threads. Essentially, we added an additional way to launch a new Smalltalk execution context (thread). But in contrast to the original one this one creates a new native OS thread, not a Smalltalk internal green thread.</p> + +<p>STM (with automatic transaction boundaries) already solves the problem of concurrent access on one value as this is protected by the STM transactions (to be more precise one instruction). But there are cases where the application relies on the fact that a bigger group of changes is executed either completely or not at all (atomic). Without further information transaction borders could be in the middle of such a set of atomic statements. rstm allows to aggregate multiple statements into one higher level transaction. 
To let the application mark the beginning and the end of these atomic blocks (high-level transactions), we added two more STM specific extensions to Smalltalk.</p> + +<h2 id="benchmarks"> +Benchmarks</h2> +<p>RSqueak was executed in a single OS thread so far. rstm enables us to execute the VM using several OS threads. Using OS threads we expected a speed-up in benchmarks which use multiple threads. We measured this speed-up by using two benchmarks: a simple parallel summation where each thread sums up a predefined interval and an implementation of Mandelbrot where each thread computes a range of predefined lines.</p> + +<p>To assess the speed-up, we used one RSqueakVM compiled with rstm enabled, but once running the benchmarks with OS threads and once with Smalltalk green threads. The workload always remained the same and only the number of threads increased. To assess the overhead imposed by the STM transformation we also ran the green threads version on an unmodified RSqueakVM. All VMs were translated with the JIT optimization and all benchmarks were run once before the measurement to warm up the JIT. As the JIT optimization is working it is likely to be adopted by VM creators (the baseline RSqueakVM did that) so that results with this optimization are more relevant in practice than those without it. We measured the execution time by getting the system time in Squeak. 
The results are:</p> +<h4> +Parallel Sum Ten Million</h4> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://1.bp.blogspot.com/-7J05whp07m8/U-iEdb3Ce0I/AAAAAAAAAVw/91sD_1KEiGc/s1600/parallelSum10MioChart.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://1.bp.blogspot.com/-7J05whp07m8/U-iEdb3Ce0I/AAAAAAAAAVw/91sD_1KEiGc/s320/parallelSum10MioChart.png"></a></div> + +</td></tr> +<tr><td class="tr-caption" style="text-align: center;"><span style="font-size: small; text-align: start;">Benchmark Parallel Sum 10,000,000</span></td></tr> +</tbody></table> +<table><thead> +<tr> <th>Thread Count</th> <th>RSqueak green threads</th> <th>RSqueak/STM green threads</th> <th>RSqueak/STM OS threads</th> <th>Slow down from RSqueak green threads to RSqueak/STM green threads</th> <th>Speed up from RSqueak/STM green threads to RSQueak/STM OS Threads</th> </tr> +</thead> <tbody> +<tr> <td>1</td> <td>168.0 ms</td> <td>240.0 ms</td> <td>290.9 ms</td> <td>0.70</td> <td>0.83</td> </tr> +<tr> <td>2</td> <td>167.0 ms</td> <td>244.0 ms</td> <td>246.1 ms</td> <td>0.68</td> <td>0.99</td> </tr> +<tr> <td>4</td> <td>167.8 ms</td> <td>240.7 ms</td> <td>366.7 ms</td> <td>0.70</td> <td>0.66</td> </tr> +<tr> <td>8</td> <td>168.1 ms</td> <td>241.1 ms</td> <td>757.0 ms</td> <td>0.70</td> <td>0.32</td> </tr> +<tr> <td>16</td> <td>168.5 ms</td> <td>244.5 ms</td> <td>1460.0 ms</td> <td>0.69</td> <td>0.17</td> </tr> +</tbody> </table> +<br> + +<h4> +Parallel Sum One Billion</h4> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"> + +<div class="separator" style="clear: both; text-align: center;"><a 
href="https://3.bp.blogspot.com/-wN-Bad8Pnd8/U-iE43ZtHcI/AAAAAAAAAV4/dii8NU0rseE/s1600/parallelSum1BioChart.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://3.bp.blogspot.com/-wN-Bad8Pnd8/U-iE43ZtHcI/AAAAAAAAAV4/dii8NU0rseE/s320/parallelSum1BioChart.png"></a></div> + +</td></tr> +<tr><td class="tr-caption" style="text-align: center;">Benchmark Parallel Sum 1,000,000,000</td></tr> +</tbody></table> +<br> +<table><thead> +<tr><th>Thread Count</th><th>RSqueak green threads</th><th>RSqueak/STM green threads</th><th>RSqueak/STM OS threads</th><th>Slow down from RSqueak green threads to RSqueak/STM green threads</th><th>Speed up from RSqueak/STM green threads to RSQueak/STM OS Threads</th></tr> +</thead><tbody> +<tr> <td>1</td> <td>16831.0 ms</td> <td>24111.0 ms</td> <td>23346.0 ms</td> <td>0.70</td> <td>1.03</td> </tr> +<tr> <td>2</td> <td>17059.9 ms</td> <td>24229.4 ms</td> <td>16102.1 ms</td> <td>0.70</td> <td>1.50</td> </tr> +<tr> <td>4</td> <td>16959.9 ms</td> <td>24365.6 ms</td> <td>12099.5 ms</td> <td>0.70</td> <td>2.01</td> </tr> +<tr> <td>8</td> <td>16758.4 ms</td> <td>24228.1 ms</td> <td>14076.9 ms</td> <td>0.69</td> <td>1.72</td> </tr> +<tr> <td>16</td> <td>16748.7 ms</td> <td>24266.6 ms</td> <td>55502.9 ms</td> <td>0.69</td> <td>0.44</td> </tr> +</tbody></table> + +<br> + +<h4> +Mandelbrot Iterative</h4> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-_wLcNRFGkQc/U-iFOB3wDmI/AAAAAAAAAWA/He1oxb0hEpc/s1600/mandelbrotChart.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://2.bp.blogspot.com/-_wLcNRFGkQc/U-iFOB3wDmI/AAAAAAAAAWA/He1oxb0hEpc/s320/mandelbrotChart.png"></a></div> + +</td></tr> +<tr><td class="tr-caption" style="text-align: center;">Benchmark 
Mandelbrot</td></tr> +</tbody></table> +<table><thead> +<tr> <th>Thread Count</th> <th>RSqueak green threads</th> <th>RSqueak/STM green threads</th> <th>RSqueak/STM OS threads</th> <th>Slow down from RSqueak green threads to RSqueak/STM green threads</th> <th>Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads</th> </tr> +</thead> <tbody> +<tr> <td>1</td> <td>724.0 ms</td> <td>983.0 ms</td> <td>1565.5 ms</td> <td>0.74</td> <td>0.63</td> </tr> +<tr> <td>2</td> <td>780.5 ms</td> <td>973.5 ms</td> <td>5555.0 ms</td> <td>0.80</td> <td>0.18</td> </tr> +<tr> <td>4</td> <td>781.0 ms</td> <td>982.5 ms</td> <td>20107.5 ms</td> <td>0.79</td> <td>0.05</td> </tr> +<tr> <td>8</td> <td>779.5 ms</td> <td>980.0 ms</td> <td>113067.0 ms</td> <td>0.80</td> <td>0.01</td></tr> +</tbody></table> + +<br> + +<h2> +Discussion of benchmark results</h2> +<p>First of all, the ParallelSum benchmarks show that the parallelism is actually paying off, at least for sufficiently large embarrassingly parallel problems. Thus RSqueak can also benefit from rstm.</p> +<p>On the other hand, our Mandelbrot implementation shows the limits of our current rstm integration. We implemented two versions of the algorithm one using one low-level array and one using two nested collections. In both versions, one job only calculates a distinct range of rows and both lead to a slowdown. The summary of the state of rstm transactions shows that there are a lot of inevitable transactions (transactions which must be completed). One reason might be the interactions between the VM and its low-level extensions, so called plugins. We have to investigate this further.</p> +<h2 id="limitations"> +Limitations</h2> +<p>Although the current VM setup is working well enough to support our benchmarks, the VM still has limitations. First of all, as it is based on rstm, it has the current limitation of only running on 64-bit Linux.</p> +<p>Besides this, we also have two major limitations regarding the VM itself. 
First, the atomic interface exposed in Smalltalk is currently not working when the VM is compiled using the just-in-time compiler transformation. Simple examples such as concurrent parallel sum work fine while more complex benchmarks such as <a href="https://benchmarksgame.alioth.debian.org/u32/performance.php?test=chameneosredux#about">chameneos</a> fail. The reasons for this are currently beyond our understanding. Second, Smalltalk supports green threads, which are threads which are managed by the VM and are not mapped to OS threads. We currently support starting new Smalltalk threads as OS threads instead of starting them as green threads. However, existing threads in a Smalltalk image are not migrated to OS threads, but remain running as green threads.</p> +<h2 id="future-work-for-stm-in-rsqueak"> +Future work for STM in RSqueak</h2> +The work we presented showed interesting problems; we propose the following problem statements for further analysis:<br> +<ul> +<li><strong>Inevitable transactions</strong> in benchmarks. This looks like it could limit other applications too so it should be solved.</li> +<li><strong>Collection implementation aware of STM</strong>: The current implementation of collections can cause a lot of STM collisions due to their internal memory structure. We believe it could bear potential for performance improvements, if we replace these collections in an STM enabled interpreter with implementations with fewer STM collisions. As already proposed by Remi Meier, bags, sets and lists are of particular interest.</li> +<li>Finally, we exposed <strong>STM through language features</strong> such as the atomic method, which is provided through the VM. Originally, it was possible to model STM transactions barriers implicitly by using clever locks, now it's exposed via the atomic keyword. 
From a language design point of view, the question arises whether this is a good solution and what features an stm-enabled interpreter must provide to the user in general? Of particular interest are, for example, access to the transaction length and hints for transaction borders and their performance impact.</li> +</ul> +<ul></ul> +<h2 id="details-for-the-technically-inclined"> +Details for the technically inclined</h2> +<ul> +<li><a href="https://bitbucket.org/pypy/lang-smalltalk/diff/spyvm/interpreter.py?diff1=7a217be69118&amp;diff2=a772ee2447d96041e7db6550e160e90251d0dd85&amp;at=stmgc-c7#Lspyvm/interpreter.pyT233">Adjustments to the interpreter loop were minimal</a>.</li> +<li>STM works on bytecode granularity, which means there is an implicit transaction border after every bytecode executed. Possible alternatives: only break transactions after certain bytecodes, break transactions on one abstraction layer above, e.g. object methods (setter, getter).</li> +<li>rstm calls were exposed using primitives (a way to expose native code in Smalltalk), this was mainly used for atomic.</li> +<li>Starting and stopping OS threads is exposed via primitives as well. Threads are started from within the interpreter.</li> +<li>For Smalltalk enabled STM code we currently have different image versions. However another way to add, load and replace code to the Smalltalk code base is required to make a switch between STM and non-STM code simple.</li> +</ul> +<ul></ul> +<h2 id="details-on-the-project-setup"> +Details on the project setup</h2> +<p>From a non-technical perspective, a problem we encountered was the huge roundtrip times (on our machines up to 600s, 900s with JIT enabled). This led to a tendency of bigger code changes ("Before we compile, let's also add this"), lost flow ("What were we doing before?") and different compiled interpreters in parallel testing ("How is this version different from the others?") As a consequence it was harder to test and correct errors. 
While this is not as much of a problem for other RPython VMs, RSqueakVM needs to execute the entire image, which makes running it untranslated even slower.</p> +<h2 id="summary"> +Summary</h2> +<p>The benchmarks show that speed up is possible, but also that the STM overhead in some situations can eat up the speedup. The resulting STM-enabled VM still has some limitations: As rstm is currently only running on 64-bit Linux the RSqueakVM is doing so as well. Even though it is possible for us now to create new threads that map to OS threads within the VM, the migration of existing Smalltalk threads keeps being problematic.</p> +<p>We showed that an existing VM code base can benefit from STM in terms of scaling up. Further it was relatively easy to enable STM support. This may also be valuable to VM developers considering to get STM support for their VMs.</p>SmalltalkSqueakstmhttps://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.htmlSat, 09 Aug 2014 13:15:00 GMT \ No newline at end of file diff --git a/categories/stm.html b/categories/stm.html new file mode 100644 index 000000000..d9a6ae478 --- /dev/null +++ b/categories/stm.html @@ -0,0 +1,153 @@ + + + + + +Posts about stm | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Posts about stm

                + + +
                +
                + + \ No newline at end of file diff --git a/categories/stm.xml b/categories/stm.xml new file mode 100644 index 000000000..daae2af67 --- /dev/null +++ b/categories/stm.xml @@ -0,0 +1,895 @@ + +PyPy (Posts about stm)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssA Field Test of Software Transactional Memory Using the RSqueak Smalltalk VMhttps://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.htmlCarl Friedrich Bolz-Tereick<h2> +Extending the Smalltalk RSqueakVM with STM</h2> +<p>by Conrad Calmez, Hubert Hesse, Patrick Rein and Malte Swart supervised by Tim Felgentreff and Tobias Pape</p> +<h2> +Introduction</h2> +<p>After pypy-stm we can announce that through the <a href="https://bitbucket.org/pypy/lang-smalltalk">RSqueakVM</a> (which used to be called <em>SPyVM</em>) a second VM implementation supports software transactional memory. RSqueakVM is a Smalltalk implementation based on the RPython toolchain. We have added STM support based on the <a href="https://www.pypy.org/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html">STM tools from RPython (rstm)</a>. The benchmarks indicate that linear scale up is possible, however in some situations the STM overhead limits speedup.</p> +<p>The work was done as a master's project at the <a href="https://www.hpi.uni-potsdam.de/hirschfeld/">Software Architechture Group</a> of Professor Robert Hirschfeld at at the <a href="https://hpi.de/">Hasso Plattner Institut</a> at the <a href="https://www.uni-potsdam.de/">University of Potsdam</a>. We - four students - worked about one and a half days per week for four months on the topic. The RSqueakVM was <a href="https://pypysqueak.blogspot.de/2007/10/first-day-discussions.html">originally developped during a sprint at the University of Bern</a>. 
When we started the project we were new to the topic of building VMs / interpreters.</p> +<p>We would like to thank Armin, Remi and the #pypy IRC channel who supported us over the course of our project. We would also like to thank Toni Mattis and Eric Seckler, who have provided us with an <a href="https://bitbucket.org/amintos/lang-smalltalk">initial code base</a>.</p> +<h2 id="introduction-to-rsqueakvm"> +Introduction to RSqueakVM</h2> +<p>As the original Smalltalk implementation, the RSqueakVM executes a given Squeak Smalltalk image, containing the Smalltalk code and a snapshot of formerly created objects and active execution contexts. These execution contexts are scheduled inside the image (greenlets) and not mapped to OS threads. Thereby the non-STM RSqueakVM runs on only one OS thread.</p> +<h2 id="changes-to-rsqueakvm"> +Changes to RSqueakVM</h2> +<p>The core adjustments to support STM were inside the VM and transparent from the view of a Smalltalk user. Additionally we added Smalltalk code to influence the behavior of the STM. As the RSqueakVM has run in one OS thread so far, we added the capability to start OS threads. Essentially, we added an additional way to launch a new Smalltalk execution context (thread). But in contrast to the original one this one creates a new native OS thread, not a Smalltalk internal green thread.</p> + +<p>STM (with automatic transaction boundaries) already solves the problem of concurrent access on one value as this is protected by the STM transactions (to be more precise one instruction). But there are cases where the application relies on the fact that a bigger group of changes is executed either completely or not at all (atomic). Without further information transaction borders could be in the middle of such a set of atomic statements. rstm allows aggregating multiple statements into one higher level transaction. 
To let the application mark the beginning and the end of these atomic blocks (high-level transactions), we added two more STM specific extensions to Smalltalk.</p> + +<h2 id="benchmarks"> +Benchmarks</h2> +<p>RSqueak was executed in a single OS thread so far. rstm enables us to execute the VM using several OS threads. Using OS threads we expected a speed-up in benchmarks which use multiple threads. We measured this speed-up by using two benchmarks: a simple parallel summation where each thread sums up a predefined interval and an implementation of Mandelbrot where each thread computes a range of predefined lines.</p> + +<p>To assess the speed-up, we used one RSqueakVM compiled with rstm enabled, but once running the benchmarks with OS threads and once with Smalltalk green threads. The workload always remained the same and only the number of threads increased. To assess the overhead imposed by the STM transformation we also ran the green threads version on an unmodified RSqueakVM. All VMs were translated with the JIT optimization and all benchmarks were run once before the measurement to warm up the JIT. As the JIT optimization is working it is likely to be adopted by VM creators (the baseline RSqueakVM did that) so that results with this optimization are more relevant in practice than those without it. We measured the execution time by getting the system time in Squeak. 
The results are:</p> +<h4> +Parallel Sum Ten Million</h4> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://1.bp.blogspot.com/-7J05whp07m8/U-iEdb3Ce0I/AAAAAAAAAVw/91sD_1KEiGc/s1600/parallelSum10MioChart.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://1.bp.blogspot.com/-7J05whp07m8/U-iEdb3Ce0I/AAAAAAAAAVw/91sD_1KEiGc/s320/parallelSum10MioChart.png"></a></div> + +</td></tr> +<tr><td class="tr-caption" style="text-align: center;"><span style="font-size: small; text-align: start;">Benchmark Parallel Sum 10,000,000</span></td></tr> +</tbody></table> +<table><thead> +<tr> <th>Thread Count</th> <th>RSqueak green threads</th> <th>RSqueak/STM green threads</th> <th>RSqueak/STM OS threads</th> <th>Slow down from RSqueak green threads to RSqueak/STM green threads</th> <th>Speed up from RSqueak/STM green threads to RSQueak/STM OS Threads</th> </tr> +</thead> <tbody> +<tr> <td>1</td> <td>168.0 ms</td> <td>240.0 ms</td> <td>290.9 ms</td> <td>0.70</td> <td>0.83</td> </tr> +<tr> <td>2</td> <td>167.0 ms</td> <td>244.0 ms</td> <td>246.1 ms</td> <td>0.68</td> <td>0.99</td> </tr> +<tr> <td>4</td> <td>167.8 ms</td> <td>240.7 ms</td> <td>366.7 ms</td> <td>0.70</td> <td>0.66</td> </tr> +<tr> <td>8</td> <td>168.1 ms</td> <td>241.1 ms</td> <td>757.0 ms</td> <td>0.70</td> <td>0.32</td> </tr> +<tr> <td>16</td> <td>168.5 ms</td> <td>244.5 ms</td> <td>1460.0 ms</td> <td>0.69</td> <td>0.17</td> </tr> +</tbody> </table> +<br> + +<h4> +Parallel Sum One Billion</h4> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"> + +<div class="separator" style="clear: both; text-align: center;"><a 
href="https://3.bp.blogspot.com/-wN-Bad8Pnd8/U-iE43ZtHcI/AAAAAAAAAV4/dii8NU0rseE/s1600/parallelSum1BioChart.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://3.bp.blogspot.com/-wN-Bad8Pnd8/U-iE43ZtHcI/AAAAAAAAAV4/dii8NU0rseE/s320/parallelSum1BioChart.png"></a></div> + +</td></tr> +<tr><td class="tr-caption" style="text-align: center;">Benchmark Parallel Sum 1,000,000,000</td></tr> +</tbody></table> +<br> +<table><thead> +<tr><th>Thread Count</th><th>RSqueak green threads</th><th>RSqueak/STM green threads</th><th>RSqueak/STM OS threads</th><th>Slow down from RSqueak green threads to RSqueak/STM green threads</th><th>Speed up from RSqueak/STM green threads to RSQueak/STM OS Threads</th></tr> +</thead><tbody> +<tr> <td>1</td> <td>16831.0 ms</td> <td>24111.0 ms</td> <td>23346.0 ms</td> <td>0.70</td> <td>1.03</td> </tr> +<tr> <td>2</td> <td>17059.9 ms</td> <td>24229.4 ms</td> <td>16102.1 ms</td> <td>0.70</td> <td>1.50</td> </tr> +<tr> <td>4</td> <td>16959.9 ms</td> <td>24365.6 ms</td> <td>12099.5 ms</td> <td>0.70</td> <td>2.01</td> </tr> +<tr> <td>8</td> <td>16758.4 ms</td> <td>24228.1 ms</td> <td>14076.9 ms</td> <td>0.69</td> <td>1.72</td> </tr> +<tr> <td>16</td> <td>16748.7 ms</td> <td>24266.6 ms</td> <td>55502.9 ms</td> <td>0.69</td> <td>0.44</td> </tr> +</tbody></table> + +<br> + +<h4> +Mandelbrot Iterative</h4> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"><tbody> +<tr><td style="text-align: center;"> + +<div class="separator" style="clear: both; text-align: center;"><a href="https://2.bp.blogspot.com/-_wLcNRFGkQc/U-iFOB3wDmI/AAAAAAAAAWA/He1oxb0hEpc/s1600/mandelbrotChart.png" style="margin-left: 1em; margin-right: 1em;"><img border="0" src="https://2.bp.blogspot.com/-_wLcNRFGkQc/U-iFOB3wDmI/AAAAAAAAAWA/He1oxb0hEpc/s320/mandelbrotChart.png"></a></div> + +</td></tr> +<tr><td class="tr-caption" style="text-align: center;">Benchmark 
Mandelbrot</td></tr> +</tbody></table> +<table><thead> +<tr> <th>Thread Count</th> <th>RSqueak green threads</th> <th>RSqueak/STM green threads</th> <th>RSqueak/STM OS threads</th> <th>Slow down from RSqueak green threads to RSqueak/STM green threads</th> <th>Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads</th> </tr> +</thead> <tbody> +<tr> <td>1</td> <td>724.0 ms</td> <td>983.0 ms</td> <td>1565.5 ms</td> <td>0.74</td> <td>0.63</td> </tr> +<tr> <td>2</td> <td>780.5 ms</td> <td>973.5 ms</td> <td>5555.0 ms</td> <td>0.80</td> <td>0.18</td> </tr> +<tr> <td>4</td> <td>781.0 ms</td> <td>982.5 ms</td> <td>20107.5 ms</td> <td>0.79</td> <td>0.05</td> </tr> +<tr> <td>8</td> <td>779.5 ms</td> <td>980.0 ms</td> <td>113067.0 ms</td> <td>0.80</td> <td>0.01</td></tr> +</tbody></table> + +<br> + +<h2> +Discussion of benchmark results</h2> +<p>First of all, the ParallelSum benchmarks show that the parallelism is actually paying off, at least for sufficiently large embarrassingly parallel problems. Thus RSqueak can also benefit from rstm.</p> +<p>On the other hand, our Mandelbrot implementation shows the limits of our current rstm integration. We implemented two versions of the algorithm one using one low-level array and one using two nested collections. In both versions, one job only calculates a distinct range of rows and both lead to a slowdown. The summary of the state of rstm transactions shows that there are a lot of inevitable transactions (transactions which must be completed). One reason might be the interactions between the VM and its low-level extensions, so called plugins. We have to investigate this further.</p> +<h2 id="limitations"> +Limitations</h2> +<p>Although the current VM setup is working well enough to support our benchmarks, the VM still has limitations. First of all, as it is based on rstm, it has the current limitation of only running on 64-bit Linux.</p> +<p>Besides this, we also have two major limitations regarding the VM itself. 
First, the atomic interface exposed in Smalltalk is currently not working when the VM is compiled using the just-in-time compiler transformation. Simple examples such as concurrent parallel sum work fine while more complex benchmarks such as <a href="https://benchmarksgame.alioth.debian.org/u32/performance.php?test=chameneosredux#about">chameneos</a> fail. The reasons for this are currently beyond our understanding. Second, Smalltalk supports green threads, which are threads which are managed by the VM and are not mapped to OS threads. We currently support starting new Smalltalk threads as OS threads instead of starting them as green threads. However, existing threads in a Smalltalk image are not migrated to OS threads, but remain running as green threads.</p> +<h2 id="future-work-for-stm-in-rsqueak"> +Future work for STM in RSqueak</h2> +The work we presented showed interesting problems; we propose the following problem statements for further analysis:<br> +<ul> +<li><strong>Inevitable transactions</strong> in benchmarks. This looks like it could limit other applications too so it should be solved.</li> +<li><strong>Collection implementation aware of STM</strong>: The current implementation of collections can cause a lot of STM collisions due to their internal memory structure. We believe it could bear potential for performance improvements, if we replace these collections in an STM enabled interpreter with implementations with fewer STM collisions. As already proposed by Remi Meier, bags, sets and lists are of particular interest.</li> +<li>Finally, we exposed <strong>STM through language features</strong> such as the atomic method, which is provided through the VM. Originally, it was possible to model STM transaction barriers implicitly by using clever locks; now it's exposed via the atomic keyword. 
From a language design point of view, the question arises whether this is a good solution and what features an stm-enabled interpreter must provide to the user in general. Of particular interest are, for example, access to the transaction length and hints for transaction borders and their performance impact.</li> +</ul> +<ul></ul> +<h2 id="details-for-the-technically-inclined"> +Details for the technically inclined</h2> +<ul> +<li><a href="https://bitbucket.org/pypy/lang-smalltalk/diff/spyvm/interpreter.py?diff1=7a217be69118&amp;diff2=a772ee2447d96041e7db6550e160e90251d0dd85&amp;at=stmgc-c7#Lspyvm/interpreter.pyT233">Adjustments to the interpreter loop were minimal</a>.</li> +<li>STM works on bytecode granularity; that means there is an implicit transaction border after every bytecode executed. Possible alternatives: only break transactions after certain bytecodes, break transactions on one abstraction layer above, e.g. object methods (setter, getter).</li> +<li>rstm calls were exposed using primitives (a way to expose native code in Smalltalk), this was mainly used for atomic.</li> +<li>Starting and stopping OS threads is exposed via primitives as well. Threads are started from within the interpreter.</li> +<li>For Smalltalk enabled STM code we currently have different image versions. However another way to add, load and replace code to the Smalltalk code base is required to make a switch between STM and non-STM code simple.</li> +</ul> +<ul></ul> +<h2 id="details-on-the-project-setup"> +Details on the project setup</h2> +<p>From a non-technical perspective, a problem we encountered was the huge roundtrip times (on our machines up to 600s, 900s with JIT enabled). This led to a tendency of bigger code changes ("Before we compile, let's also add this"), lost flow ("What were we doing before?") and different compiled interpreters in parallel testing ("How is this version different from the others?"). As a consequence it was harder to test and correct errors. 
While this is not as much of a problem for other RPython VMs, RSqueakVM needs to execute the entire image, which makes running it untranslated even slower.</p> +<h2 id="summary"> +Summary</h2> +<p>The benchmarks show that speed up is possible, but also that the STM overhead in some situations can eat up the speedup. The resulting STM-enabled VM still has some limitations: As rstm is currently only running on 64-bit Linux the RSqueakVM is doing so as well. Even though it is possible for us now to create new threads that map to OS threads within the VM, the migration of existing Smalltalk threads keeps being problematic.</p> +<p>We showed that an existing VM code base can benefit from STM in terms of scaling up. Further it was relatively easy to enable STM support. This may also be valuable to VM developers considering adding STM support for their VMs.</p>SmalltalkSqueakstmhttps://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.htmlSat, 09 Aug 2014 13:15:00 GMTSTM results and Second Call for Donationshttps://www.pypy.org/posts/2014/04/stm-results-and-second-call-for-1767845182888902777.htmlArmin Rigo<p>Hi all,</p> + +<p>We now have a preliminary version of <a href="https://pypy.readthedocs.org/en/latest/stm.html#current-status">PyPy-STM +with the JIT</a>, from the new <a href="https://pypy.readthedocs.org/en/latest/stm.html">STM documentation +page.</a> This PyPy-STM is still not quite useful, failing to top the +performance of a regular PyPy by a small margin on most benchmarks, but +it's definitely getting there :-) The overheads with the JIT are still +a bit too high. (I've been tracking an obscure bug for days. +It turned out to be a simple buffer overflow. But if anybody has +a clue about why a hardware watchpoint in gdb, set on one of the garbled +memory locations, fails to trigger but the memory ends up being modified +anyway... and, it turns out, by just a regular pointer write... 
ideas +welcome.)</p> + +<p>But I go off-topic :-) The main point of this post is to announce the +<a href="https://pypy.org/tmdonate2.html">2nd Call for Donation about +STM</a>. We achieved most of the goals laid out in the first call. We +even largely overachieved them in terms of raw performance, even if +there are many cases that are unreasonably slow for now. So, after the +successful research, we are launching a second proposal about the +development part of the project:</p> + +<ol><li><p>Polish PyPy-STM to get a consistently reasonable speed, 25%-40% +slower than a regular JITted PyPy when running single-threaded code. Of +course it is supposed to scale nicely as long as there are no +user-visible conflicts.</p> + +</li><li><p>Focus on developing the Python-facing interface: both internal things +(e.g. do dictionaries need to be more TM-friendly in general?) as well +as directly visible things (e.g. some profiler-like interface to explore +common conflicts in a program).</p> + +</li><li><p>Regular multithreaded code should benefit out of the box, but the +final goal is to explore and tweak some existing non-multithreaded +frameworks and improve their TM-friendliness. So existing programs +using Twisted or Stackless, for example, should run on multiple cores +without any major change.</p></li></ol> + +<p>See the <a href="https://pypy.org/tmdonate2.html">full call</a> for more +details! I'd like to thank Remi Meier for getting involved. And a big +thank you to everybody who contributed money on the first call. It +took more time than anticipated, but it's there in good but rough shape. 
+Now it needs a lot of polishing :-)</p> + +<p>Armin</p>stmhttps://www.pypy.org/posts/2014/04/stm-results-and-second-call-for-1767845182888902777.htmlWed, 09 Apr 2014 09:33:00 GMTSTMGC-C7 with PyPyhttps://www.pypy.org/posts/2014/03/hi-all-here-is-one-of-first-full-pypys-8725931424559481728.htmlArmin Rigo<p>Hi all,</p> + +<p>Here is one of the first full PyPy's +(edit: it was r69967+, but the general list of versions is currently <a href="https://cobra.cs.uni-duesseldorf.de/~buildmaster/misc/">here</a>) +compiled with the new <a href="https://www.pypy.org/posts/2014/02/rewrites-of-stm-core-model-again-633249729751034512.html">StmGC-c7 +library</a>. It has no JIT so far, but it runs some small +single-threaded benchmarks by taking around 40% more time than a +corresponding non-STM, no-JIT version of PyPy. It scales --- up to two +threads only, which is the hard-coded maximum so far in the c7 code. +But the scaling looks perfect in these small benchmarks without +conflict: starting two threads each running a copy of the benchmark +takes almost exactly the same amount of total time, simply using two +cores.</p> + +<p>Feel free to try it! It is not actually useful so far, because it is +limited to two cores and CPython is something like 2.5x faster. One of +the important next steps is to re-enable the JIT. Based on our <a href="https://foss.heptapod.net/pypy/pypy/-/tree/branch//stmgc-c7/TODO">current +understanding</a> of the "40%" figure, we can probably reduce it with +enough efforts; but also, the JIT should be able to easily produce +machine code that suffers a bit less than the interpreter from these +effects. 
This seems to mean that we're looking at 20%-ish slow-downs +for the future PyPy-STM-JIT.</p> + +<p>Interesting times :-)</p> + +<p>For reference, this is what you get by downloading <a href="https://cobra.cs.uni-duesseldorf.de/~buildmaster/misc/pypy-c-r69967+-stm-1d0b870195e7.tbz2">the +PyPy binary linked above</a>: a Linux 64 binary (Ubuntu 12.04) that +should behave mostly like a regular PyPy. (One main missing feature is +that destructors are never called.) It uses two cores, but obviously +only if the Python program you run is multithreaded. The only new +built-in feature is <code>with __pypy__.thread.atomic:</code> this gives +you a way to enforce that a block of code runs "atomically", which means +without any operation from any other thread randomly interleaved.</p> + +<p>If you want to translate it yourself, you need a trunk version of clang +with <a href="https://bitbucket.org/pypy/stmgc/raw/default/c7/llvmfix">three patches</a> applied. That's the number of bugs that we couldn't +find workarounds for, not the total number of bugs we found by (ab)using +the <a href="https://clang.llvm.org/docs/LanguageExtensions.html#target-specific-extensions">address_space</a> feature...</p> + +<p>Stay tuned for more!</p> + +<p>Armin &amp; Remi</p>stmhttps://www.pypy.org/posts/2014/03/hi-all-here-is-one-of-first-full-pypys-8725931424559481728.htmlSat, 15 Mar 2014 17:00:00 GMTRewrites of the STM core model -- againhttps://www.pypy.org/posts/2014/02/rewrites-of-stm-core-model-again-633249729751034512.htmlArmin Rigo<p>Hi all,</p> + +<p>A quick note about the Software Transactional Memory (STM) front.</p> + +<p>Since the <a href="https://www.pypy.org/posts/2013/10/update-on-stm-7145890443443707910.html">previous +post</a>, we believe we progressed a lot by discovering an alternative +core model for software transactions. Why do I say "believe"? It's +because it means <i>again</i> that we have to rewrite from scratch the C +library handling STM. 
This is currently work in progress. Once this is +done, we should be able to adapt the existing pypy-stm to run on top of +it without much rewriting effort; in fact it should simplify the +difficult issues we ran into for the JIT. So while this is basically +yet another restart similar to <a href="https://www.pypy.org/posts/2013/06/stm-on-drawing-board-1028082727566254104.html">last +June's</a>, the difference is that the work that we have already put in the PyPy +part (as opposed to the C library) remains.</p> + +<p>You can read about the basic ideas of this new C library <a href="https://bitbucket.org/pypy/stmgc/raw/c7/c7/README.txt">here</a>. +It is still STM-only, not HTM, but because it doesn't constantly move +objects around in memory, it would be easier to adapt to an HTM version. +There are even potential ideas about a hybrid TM, like using HTM but +only to speed up the commits. It is based on a <a href="https://bpaste.net/show/177186/">Linux-only</a> system call, <a href="https://man7.org/linux/man-pages/man2/remap_file_pages.2.html">remap_file_pages()</a> +(poll: who heard about it before? :-). As previously, the work is done +by Remi Meier and myself.</p> + +<p>Currently, the C library is incomplete, but early experiments show good +results in running <a href="https://www.pypy.org/posts/2013/07/software-transactional-memory-lisp-7777576128992250197.html">duhton</a>, +the interpreter for a minimal language created for the purpose of +testing STM. Good results means we brought down the slow-downs from +60-80% (previous version) to around 15% (current version). This number +measures the slow-down from the non-STM-enabled to the STM-enabled +version, on one CPU core; of course, the idea is that the STM version +scales up when using more than one core.</p> + +<p>This means that we are looking forward to a result that is much better +than originally predicted. 
The pypy-stm has chances to run at a +one-thread speed that is only "n%" slower than the regular pypy-jit, for +a value of "n" that is optimistically 15 --- but more likely some number +around 25 or 50. This is seriously better than the original estimate, +which was "between 2x and 5x". It would mean that using pypy-stm is +quite worthwhile even with just two cores.</p> + +<p>More updates later...</p> + +<p>Armin</p>stmhttps://www.pypy.org/posts/2014/02/rewrites-of-stm-core-model-again-633249729751034512.htmlSun, 09 Feb 2014 22:16:00 GMTUpdate on STMhttps://www.pypy.org/posts/2013/10/update-on-stm-7145890443443707910.htmlArmin Rigo<p>Hi all,</p> +<p>The sprint in London was a lot of fun and very fruitful. In the last +update on STM, Armin was working on improving and specializing the +automatic barrier placement. There is still a lot to do in that area, +but that work is merged now. Specializing and improving barrier placement +is still to be done for the JIT.</p> +<p>But that is not all. Right after the sprint, we were able to squash +the last obvious bugs in the STM-JIT combination. However, the performance +was nowhere near what we want. So until now, we fixed some of the most +obvious issues. Many come from RPython erring on the side of caution +and e.g. making a transaction inevitable even if that is not strictly +necessary, thereby limiting parallelism. Another problem came from +increasing counters every time a guard fails, which caused transactions +to conflict on these counter updates. Since these counters do not have +to be completely accurate, we update them non-transactionally now with +a chance of small errors.</p> +<p>There are still many such performance issues of various complexity left +to tackle: we are nowhere near done. So stay tuned or contribute :)</p> + +<h2>Performance</h2> +<p>Now, since the JIT is all about performance, we want to at least +show you some numbers that are indicative of things to come. 
+Our set of STM benchmarks is very small unfortunately +(something you can help us out with), so this is +not representative of real-world performance. We tried to +minimize the effect of JIT warm-up in the benchmark results.</p> +<p>The machine these benchmarks were executed on has 4 physical +cores with Hyper-Threading (8 hardware threads).</p> +<p><strong>Raytracer</strong> from <a class="reference external" href="https://bitbucket.org/Raemi/stm-benchmarks/src">stm-benchmarks</a>: +Render times in seconds for a 1024x1024 image:</p> +<table border="1" class="docutils"> +<colgroup> +<col width="23%"> +<col width="39%"> +<col width="38%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Interpreter</th> +<th class="head">Base time: 1 thread</th> +<th class="head">8 threads (speedup)</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>PyPy-2.1</td> +<td>2.47</td> +<td>2.56 (0.96x)</td> +</tr> +<tr><td>CPython</td> +<td>81.1</td> +<td>73.4 (1.1x)</td> +</tr> +<tr><td>PyPy-STM</td> +<td>50.2</td> +<td>10.8 (4.6x)</td> +</tr> +</tbody> +</table> +<p>For comparison, disabling the JIT gives 148s on PyPy-2.1 and 87s on +PyPy-STM (with 8 threads).</p> +<p><strong>Richards</strong> from <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/branch/stmgc-c4">PyPy repository on the stmgc-c4 +branch</a>: +Average time per iteration in milliseconds:</p> +<table border="1" class="docutils"> +<colgroup> +<col width="23%"> +<col width="39%"> +<col width="38%"> +</colgroup> +<thead valign="bottom"> +<tr><th class="head">Interpreter</th> +<th class="head">Base time: 1 thread</th> +<th class="head">8 threads (speedup)</th> +</tr> +</thead> +<tbody valign="top"> +<tr><td>PyPy-2.1</td> +<td>15.6</td> +<td>15.4 (1.01x)</td> +</tr> +<tr><td>CPython</td> +<td>239</td> +<td>237 (1.01x)</td> +</tr> +<tr><td>PyPy-STM</td> +<td>371</td> +<td>116 (3.2x)</td> +</tr> +</tbody> +</table> +<p>For comparison, disabling the JIT gives 492ms on PyPy-2.1 and 538ms on 
+PyPy-STM.</p> + +<h2>Try it!</h2> +<p>All this can be found in the <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/branch/stmgc-c4">PyPy repository on the stmgc-c4 +branch</a>. +Try it for yourself, but keep in mind that this is still experimental +with a lot of things yet to come. Only Linux x64 is supported right +now, but contributions are welcome.</p> +<p>You can download a prebuilt binary from here: +<a class="reference external" href="https://bitbucket.org/pypy/pypy/downloads/pypy-oct13-stm.tar.bz2">https://bitbucket.org/pypy/pypy/downloads/pypy-oct13-stm.tar.bz2</a> +(Linux x64 Ubuntu &gt;= 12.04). This was made at revision bafcb0cdff48.</p> + +<h2>Summary</h2> +<p>What the numbers tell us is that PyPy-STM is, as expected, +the only of the three interpreters where multithreading gives a large +improvement in speed. What they also tell us is that, obviously, the +result is not good enough <em>yet:</em> it still takes longer on a 8-threaded +PyPy-STM than on a regular single-threaded PyPy-2.1. However, as you +should know by now, we are good at promising speed and delivering it... +years later <tt class="docutils literal"><span class="pre">:-)</span></tt></p> +<p>But it has been two years already since PyPy-STM started, and this is +our first preview of the JIT integration. Expect major improvements +soon: with STM, the JIT generates code that is completely suboptimal in +many cases (barriers, allocation, and more). Once we improve this, the +performance of the STM-JITted code should come much closer to PyPy 2.1.</p> +<p>Cheers</p> +<p>Remi &amp; Armin</p>stmhttps://www.pypy.org/posts/2013/10/update-on-stm-7145890443443707910.htmlWed, 16 Oct 2013 17:01:00 GMTUpdate on STMhttps://www.pypy.org/posts/2013/08/update-on-stm-8705514488940872802.htmlArmin Rigo<p>Hi all,</p> + +<p>A quick update on Software Transactional Memory. 
We are +working on two fronts.</p> + +<p>On the one hand, the integration of the "c4" C library with PyPy is done +and works well, but is still subject to improvements. The "PyPy-STM" +executable (without the JIT) +seems to be stable, as far as it has been tested. It runs a simple +benchmark like Richards with a 3.2x slow-down over a regular JIT-less +PyPy.</p> + +<p>The main factor of this slow-down: the numerous "barriers" in +the code --- checks that are needed a bit everywhere to verify that a +pointer to an object points to a recent enough version, and if not, to +go to the most recent version. These barriers are inserted automatically +during the translation; there is no need for us to manually put 42 million +barriers in the source code of PyPy. But this automatic insertion uses a +primitive algorithm right now, which usually ends up putting more barriers than the +theoretical optimum. I (Armin) am trying to improve that --- and progressing: +last week the slow-down was around 4.5x. This is done in the branch +<a href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/stmgc-static-barrier">stmgc-static-barrier</a>.</p> + +<p>On the other hand, Remi is progressing on the JIT integration in +the branch <a href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/stmgc-c4">stmgc-c4</a>. +This has been working in simple cases since a couple of weeks by now, but the +resulting "PyPy-JIT-STM" often crashes. This is because while the +basics are not really hard, we keep hitting new issues that must be +resolved.</p> + +<p>The basics are that whenever the JIT is about to generate +assembler corresponding to a load or a store in a GC object, it must +first generate a bit of extra assembler that corresponds to the barrier +that we need. This works fine by now (but could benefit from the same +kind of optimizations described above, to reduce the number of barriers). +The additional issues are all more subtle. 
I will describe the current +one as an example: it is how to write constant pointers inside the assembler.</p> + +<p>Remember that the STM library classifies objects as either +"public" or "protected/private". A "protected/private" object +is one which has not been seen by another thread so far. +This is essential as an optimization, because we know that no +other thread will access our protected or private objects in parallel, +and thus we are free to modify their content in place. By contrast, +public objects are frozen, and to do any change, we first need to +build a different (protected) copy of the object. See this +<a href="https://www.pypy.org/posts/2013/06/stm-on-drawing-board-1028082727566254104.html">blog +post</a> for more details.</p> + +<p>So far so good, but the JIT will sometimes (actually often) hard-code +constant pointers into the assembler it produces. For example, this is the +case when the Python code being JITted creates an instance of a known class; +the corresponding assembler produced by the JIT will reserve the memory for +the instance and then write the constant type pointer in it. This type +pointer is a GC object (in the simple model, it's the Python class object; +in PyPy it's actually the "map" object, which is +<a href="https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_21-6524148550848694588.html">a different story</a>).</p> + +<p>The problem right now is that this constant pointer may point to a +protected object. This is a problem because the same piece of assembler +can later be executed by a different thread. If it does, then this +different thread will create instances whose type pointer is bogus: looking +like a protected object, but actually protected by a different thread. +Any attempt to use this type pointer to change anything on the class +itself will likely crash: the threads will all think they can safely change it +in-place. 
To fix this, we need to make sure we only write pointers to +public objects in the assembler. This is a bit involved because we need +to ensure that there <i>is</i> a public version of the object to start with.</p> + +<p>When this is done, we will likely hit the next problem, and the next one; +but at some point it should converge (hopefully!) and we'll give you our first +PyPy-JIT-STM ready to try. Stay tuned :-)</p> + +<p>A bientôt,</p> + +<p>Armin.</p>stmhttps://www.pypy.org/posts/2013/08/update-on-stm-8705514488940872802.htmlSun, 18 Aug 2013 18:54:00 GMTSoftware Transactional Memory lisp experimentshttps://www.pypy.org/posts/2013/07/software-transactional-memory-lisp-7777576128992250197.htmlMaciej Fijalkowski<div dir="ltr" style="text-align: left;"> +<p>As covered in <a class="reference external" href="https://www.pypy.org/posts/2013/06/stm-on-drawing-board-1028082727566254104.html">the previous blog post</a>, the STM subproject of PyPy has been +back on the drawing board. The result of this experiment is an STM-aware +garbage collector written in C. This is finished by now; thanks to Armin's +and Remi's work, we have a fully functional garbage collector and an STM system +that can be used from any C program with enough effort. Using it is more than +a little mundane, since you have to insert write and read barriers by hand +everywhere in your code that reads or writes to garbage collector controlled +memory. In the PyPy integration, this manual work is done automatically +by the STM transformation in the interpreter.</p> +<p>However, to experiment some more, we created a minimal +<a class="reference external" href="https://bitbucket.org/arigo/duhton">lisp-like/scheme-like interpreter</a> +(called Duhton), that follows closely CPython's implementation strategy. +For anyone familiar with CPython's source code, it should be pretty +readable. 
This interpreter works like a normal and very basic lisp variant, +however it comes with a <tt class="docutils literal">transaction</tt> builtin, that lets you spawn transactions +using the STM system. We implemented a few demos that let you play with the +transaction system. All the demos are running without conflicts, which means +there are no conflicting writes to global memory and hence the demos are very +amenable to parallelization. They exercise:</p> +<ul class="simple"> +<li>arithmetics - <tt class="docutils literal">demo/many_sqare_roots.duh</tt></li> +<li>read-only access to globals - <tt class="docutils literal">demo/trees.duh</tt></li> +<li>read-write access to local objects - <tt class="docutils literal">demo/trees2.duh</tt></li> +</ul> +<p>With the latter ones being very similar to the classic gcbench. STM-aware +Duhton can be found in <a class="reference external" href="https://bitbucket.org/pypy/stmgc">the stmgc repo</a>, while the STM-less Duhton, +that uses refcounting, can be found in <a class="reference external" href="https://bitbucket.org/arigo/duhton">the duhton repo</a> under the <tt class="docutils literal">base</tt> +branch.</p> +<p>Below are some benchmarks. Note that this is a little comparing apples to +oranges since the single-threaded duhton uses refcounting GC vs generational +GC for STM version. Future pypy benchmarks will compare more apples to apples. +Moreover none of the benchmarks has any conflicts. 
Time is the total time +that the benchmark took (not the CPU time) and there was very little variation +in the consecutive runs (definitely below 5%).</p> +<table border="1" class="docutils"> +<colgroup> +<col width="16%"> +<col width="30%"> +<col width="23%"> +<col width="16%"> +<col width="16%"> +</colgroup> +<tbody valign="top"> +<tr><td>benchmark</td> +<td>1 thread (refcount)</td> +<td>1 thread (stm)</td> +<td>2 threads</td> +<td>4 threads</td> +</tr> +<tr><td>square</td> +<td>1.9s</td> +<td>3.5s</td> +<td>1.8s</td> +<td>0.9s</td> +</tr> +<tr><td>trees</td> +<td>0.6s</td> +<td>1.0s</td> +<td>0.54s</td> +<td>0.28s</td> +</tr> +<tr><td>trees2</td> +<td>1.4s</td> +<td>2.2s</td> +<td>1.1s</td> +<td>0.57s</td> +</tr> +</tbody> +</table> +<p>As you can see, the slowdown for STM vs single thread is significant +(1.8x, 1.7x, 1.6x respectively), but still lower than 2x. However the speedup +from running on multiple threads parallelizes the problem almost perfectly.</p> +<p>While a significant milestone, we hope the next blog post will cover +STM-enabled pypy that's fully working with JIT work ongoing.</p> +<p>Cheers,<br> +fijal on behalf of Remi Meier and Armin Rigo</p><br> +<br></div>stmhttps://www.pypy.org/posts/2013/07/software-transactional-memory-lisp-7777576128992250197.htmlFri, 12 Jul 2013 10:07:00 GMTSTM on the drawing boardhttps://www.pypy.org/posts/2013/06/stm-on-drawing-board-1028082727566254104.htmlArmin Rigo<p>Hi all!</p> + +<p>This is an update about the Software Transactional Memory subproject of +PyPy. I have some good news of progress. Also, +<a href="https://bitbucket.org/Raemi">Remi Meier</a> will +likely help me this summer. He did various +investigations with PyPy-STM for his Master's Thesis and contributed back +a lot of ideas and some code. Welcome again Remi!</p> + +<p>I am also sorry that it seems to advance so slowly. Beyond the usual +excuses --- I was busy with other things, e.g. 
releasing PyPy 2.0 --- I +would like to reassure people: I'm again working on it, and the financial +contributions are still there and reserved for STM (almost half the money is +left, a big thank you again if you contributed!).</p> + +<p>The real reason for the apparent slowness, though, is that it is really +a research project. It's possible to either have hard deadlines, or to +follow various tracks and keep improving the basics, but not both at the +same time.</p> + +<p>During the past month where I have worked again on STM, I worked still on +the second option; and I believe it was worth every second of it. Let me try +to convince you :-)</p> + +<p>The main blocker was that the STM subsystem, written in C, and the +Garbage Collection (GC) subsystem, written in RPython, were getting +harder and harder to coordinate. So what I did instead is to give up +using RPython in favor of using only C for both. C is a good language +for some things, which includes low-level programming where we must take +care of delicate multithreading issues; RPython is not a good fit in +that case, and wasn't designed to be.</p> + +<p>I started a fresh <a href="https://bitbucket.org/pypy/stmgc">Mercurial repo</a> +which is basically a stand-alone C library. This library (in heavy development +right now!) gives any C +program some functions to allocate and track GC-managed objects, and +gives an actual STM+GC combination on these objects. It's possible +(though rather verbose) to use it directly in C programs, like in a +small example interpreter. Of course the eventual purpose is to link it +with PyPy during translation to C, with all the verbose calls +automatically generated.</p> + +<p>Since I started this, bringing the GC closer to the STM, I kept finding +new ways that the two might interact to improve the performance, maybe +radically. 
Here is a summary of the current ideas.</p> + +<p>When we run +multiple threads, there are two common cases: one is to access (read and write) +objects that have only been seen by the current thread; the other is to read +objects seen by all threads, like in Python the modules/functions/classes, +but not to write to them. Of course, writing to the same object from +multiple threads occurs too, and it is handled correctly (that's the whole +point), but it is a relatively rare case.</p> + +<p>So each object is classified as "public" or "protected" (or "private", +when they belong to the current transaction). Newly created objects, once +they are no longer private, remain protected until +they are read by a different thread. Now, the point is to use very +different mechanisms for public and for protected objects. Public +objects are visible by all threads, but read-only in memory; to change +them, a copy must be made, and the changes are written to the copy (the +"redolog" approach to STM). Protected objects, on the other hand, are +modified in-place, with (if necessary) a copy of them being made +for the sole purpose of a possible abort of the transaction (the "undolog" +approach).</p> + +<p>This is combined with a generational GC similar to PyPy's --- but here, +each thread gets its own nursery and does its own "minor collections", +independently of the others.</p> + +<p>So objects are by default protected; when another thread tries to follow a +pointer to them, then it is that other thread's job to carefully "steal" +the object and turn it public (possibly making a copy of it if needed, +e.g. if it was still a young object living in the original nursery).</p> + +<p>The same object can exist temporarily in multiple versions: any number +of public copies; at most one active protected copy; and optionally one +private copy per thread (this is the copy as currently seen by the +transaction in progress on that thread). 
The GC cleans up the +unnecessary copies.</p> + +<p>These ideas are variants and extensions of the same basic idea +of keeping multiple copies with revision numbers to track them. +Moreover, "read barriers" and "write barriers" are used by the C program +calling into this library in order to be sure that it is accessing the +right version of the object. In the currently investigated variant +I believe it should be possible to have rather cheap +read barriers, which would definitely be a major speed improvement over +the previous variants. Actually, as far as I know, it would be a major +improvement over most of the other existing STMs: in them, the typical read barrier +involves following chains of pointers, and checking some dictionary to see if this +thread has a modified local copy of the object. The difference with a +read barrier that can resolve most cases in a few CPU cycles should be +huge.</p> + +<p>So, this is research :-) It is progressing, and at some point I'll be +satisfied with it and stop rewriting everything; and then the actual +integration into PyPy should be straightforward (there is already code +to detect where the read and write barriers need to be inserted, where +transactions can be split, etc.). Then there is support for the +JIT to be written, and so on. But more about it later.</p> + +<p>The purpose of this post was to give you some glimpses into what I'm +working on right now. As usual, no plan for release yet. But you can +look forward to seeing the C library progress. I'll probably also start +soon some sample interpreter in C, to test the waters (likely a +revival of <a href="https://bitbucket.org/arigo/duhton">duhton</a>). 
+If you know nothing about Python but all about the C-level +multithreading issues, now is a good time to get involved :-)</p> + +<p>Thanks for reading!</p> + +<p>Armin</p>stmhttps://www.pypy.org/posts/2013/06/stm-on-drawing-board-1028082727566254104.htmlWed, 05 Jun 2013 15:31:00 GMTMulticore Programming in PyPy and CPythonhttps://www.pypy.org/posts/2012/08/multicore-programming-in-pypy-and-6595343388141556320.htmlArmin Rigo<p>Hi all,</p> +<p>This is a short "position paper" kind of post about my view (Armin +Rigo's) on the future of multicore programming in high-level languages. +It is a summary of the +keynote presentation at EuroPython. As I learned by talking with people +afterwards, I am not a good enough speaker to manage to convey a deeper +message in a 20-minutes talk. I will try instead to convey it in a +250-lines post...</p> +<p>This is about three points:</p> +<ol class="arabic simple"> +<li>We often hear about people wanting a version of Python running without +the Global Interpreter Lock (GIL): a "GIL-less Python". But what we +programmers really need is not just a GIL-less Python --- we need a +higher-level way to write multithreaded programs than using directly +threads and locks. One way is Automatic Mutual Exclusion (AME), which +would give us an "AME Python".</li> +<li>A good enough Software Transactional Memory (STM) system can be used +as an internal tool to do that. +This is what we are building into an "AME PyPy".</li> +<li>The picture is darker for CPython, though there is a way too. The +problem is that when we say STM, we think about either GCC 4.7's STM +support, or Hardware Transactional Memory (HTM). However, both +solutions are enough for a "GIL-less CPython", but not +for "AME CPython", due to capacity limitations. 
For the latter, we +need somehow to add some large-scale STM into the compiler.</li> +</ol> +<p>Let me explain these points in more detail.</p> +<div class="section"> +<h3><a id="gil-less-versus-ame" name="gil-less-versus-ame">GIL-less versus AME</a></h3> +<p>The first point is in favor of the so-called Automatic Mutual Exclusion +approach. The issue with using threads (in any language with or without +a GIL) is that threads are fundamentally non-deterministic. In other +words, the programs' behaviors are not reproducible at all, and worse, +we cannot even reason about it --- it becomes quickly messy. We would +have to consider all possible combinations of code paths and timings, +and we cannot hope to write tests that cover all combinations. This +fact is often documented as one of the main blockers towards writing +successful multithreaded applications.</p> +<p>We need to solve this issue with a higher-level solution. Such +solutions exist theoretically, and Automatic Mutual Exclusion (AME) is +one of them. The idea of AME is that we divide the execution of each +thread into a number of "atomic blocks". Each block is well-delimited +and typically large. Each block runs atomically, as if it acquired a +GIL for its whole duration. The trick is that internally we use +Transactional Memory, which is a technique that lets the system run the +atomic blocks from each thread in parallel, while giving the programmer +the illusion that the blocks have been run in some global serialized +order.</p> +<p>This doesn't magically solve all possible issues, but it helps a lot: it +is far easier to reason in terms of a random ordering of large atomic +blocks than in terms of a random ordering of lines of code --- not to +mention the mess that multithreaded C is, where even a random ordering +of instructions is not a sufficient model any more.</p> +<p>What do such atomic blocks look like? 
For example, a program might +contain a loop over all keys of a dictionary, performing some +"mostly-independent" work on each value. This is a typical example: +each atomic block is one iteration through the loop. By using the +technique described here, we can run the iterations in parallel +(e.g. using a thread pool) but using AME to ensure that they appear to +run serially.</p> +<p>In Python, we don't care about the order in which the loop iterations +are done, because we are anyway iterating over the keys of a dictionary. +So we get exactly the same effect as before: the iterations still run in +some random order, but --- and that's the important point --- they +appear to run in a +global serialized order. In other words, we introduced parallelism, but +only under the hood: from the programmer's point of view, his program +still appears to run completely serially. Parallelisation as a +theoretically invisible optimization... more about the "theoretically" +in the next paragraph.</p> +<p>Note that randomness of order is not fundamental: there are techniques +building on top of AME that can be used to force the order of the +atomic blocks, if needed.</p> +</div> +<div class="section"> +<h3><a id="pypy-and-stm-ame" name="pypy-and-stm-ame">PyPy and STM/AME</a></h3> +<p>Talking more precisely about PyPy: the current prototype <tt class="docutils literal"><span class="pre">pypy-stm</span></tt> is +doing precisely this. 
In <tt class="docutils literal"><span class="pre">pypy-stm</span></tt>, the length of the atomic blocks is +selected in one of two ways: either explicitly or automatically.</p> +<p>The automatic selection gives blocks corresponding to some small number +of bytecodes, in which case we have merely a GIL-less Python: multiple +threads will appear to run serially, with the execution randomly +switching from one thread to another at bytecode boundaries, just like +in CPython.</p> +<p>The explicit selection is closer to what was described in the previous +section: someone --- the programmer or the author of some library that +the programmer uses --- will explicitly put <tt class="docutils literal"><span class="pre">with</span> <span class="pre">thread.atomic:</span></tt> in +the source, which delimitates an atomic block. For example, we can use +it to build a library that can be used to iterate over the keys of a +dictionary: instead of iterating over the dictionary directly, we would +use some custom utility which gives the elements "in parallel". It +would give them by using internally a pool of threads, but enclosing +every handling of an element into such a <tt class="docutils literal"><span class="pre">with</span> <span class="pre">thread.atomic</span></tt> block.</p> +<p>This gives the nice illusion of a global serialized order, and thus +gives us a well-behaving model of the program's behavior.</p> +<p>Restating this differently, +the <em>only</em> semantical difference between <tt class="docutils literal"><span class="pre">pypy-stm</span></tt> and +a regular PyPy or CPython is that it has <tt class="docutils literal"><span class="pre">thread.atomic</span></tt>, which is a +context manager that gives the illusion of forcing the GIL to not be +released during the execution of the corresponding block of code. 
Apart +from this addition, they are apparently identical.</p> +<p>Of course they are only semantically identical if we ignore performance: +<tt class="docutils literal"><span class="pre">pypy-stm</span></tt> uses multiple threads and can potentially benefit from that +on multicore machines. The drawback is: when does it benefit, and how +much? The answer to this question is not immediate. The programmer +will usually have to detect and locate places that cause too many +"conflicts" in the Transactional Memory sense. A conflict occurs when +two atomic blocks write to the same location, or when <tt class="docutils literal"><span class="pre">A</span></tt> reads it, +<tt class="docutils literal"><span class="pre">B</span></tt> writes it, but <tt class="docutils literal"><span class="pre">B</span></tt> finishes first and commits. A conflict +causes the execution of one atomic block to be aborted and restarted, +due to another block committing. Although the process is transparent, +if it occurs more than occasionally, then it has a negative impact on +performance.</p> +<p>There is no out-of-the-box perfect solution for solving all conflicts. +What we will need is more tools to detect them and deal with them, data +structures that are made aware of the risks of "internal" conflicts when +externally there shouldn't be one, and so on. There is some work ahead.</p> +<p>The point here is that from the point of view of the final programmer, +we get conflicts that we should resolve --- but at any point, our +program is <em>correct</em>, even if it may not be yet as efficient as it could +be. This is the opposite of regular multithreading, where programs are +efficient but not as correct as they could be. In other words, as we +all know, we only have resources to do the easy 80% of the work and not +the remaining hard 20%. So in this model we get a program that has 80% +of the theoretical maximum of performance and it's fine. 
In the regular +multithreading model we would instead only manage to remove 80% of the +bugs, and we are left with obscure rare crashes.</p> +</div> +<div class="section"> +<h3><a id="cpython-and-htm" name="cpython-and-htm">CPython and HTM</a></h3> +<p>Couldn't we do the same for CPython? The problem here is that +<tt class="docutils literal"><span class="pre">pypy-stm</span></tt> is implemented as a transformation step during translation, +which is not directly possible in CPython. Here are our options:</p> +<ul class="simple"> +<li>We could review and change the C code everywhere in CPython.</li> +<li>We use GCC 4.7, which supports some form of STM.</li> +<li>We wait until Intel's next generation of CPUs comes out ("Haswell") +and use HTM.</li> +<li>We write our own C code transformation within a compiler (e.g. LLVM).</li> +</ul> +<p>I will personally file the first solution in the "thanks but no thanks" +category. If anything, it will give us another fork of CPython that +will painfully struggle to keep not more than 3-4 versions behind, and +then eventually die. It is very unlikely to be ever merged into the +CPython trunk, because it would need changes <em>everywhere</em>. Not to +mention that these changes would be very experimental: tomorrow we might +figure out that different changes would have been better, and have to +start from scratch again.</p> +<p>Let us turn instead to the next two solutions. Both of these solutions +are geared toward small-scale transactions, but not long-running ones. +For example, I have no clue how to give GCC rules about performing I/O +in a transaction --- this seems not supported at all; and moreover +looking at the STM library that is available so far to be linked with +the compiled program, it assumes short transactions only. By contrast, +when I say "long transaction" I mean transactions that can run for 0.1 +seconds or more. 
To give you an idea, in 0.1 seconds a PyPy program +allocates and frees on the order of ~50MB of memory.</p> +<p>Intel's Hardware Transactional Memory solution is both more flexible and +comes with a stricter limit. In one word, the transaction boundaries +are given by a pair of special CPU instructions that make the CPU enter +or leave "transactional" mode. If the transaction aborts, the CPU +cancels any change, rolls back to the "enter" instruction and causes +this instruction to return an error code instead of re-entering +transactional mode (a bit like a <tt class="docutils literal"><span class="pre">fork()</span></tt>). The software then detects +the error code. Typically, if transactions are rarely cancelled, it is +fine to fall back to a GIL-like solution just to redo these cancelled +transactions.</p> +<p>About the implementation: this is done by recording all the changes that +a transaction wants to do to the main memory, and keeping them invisible +to other CPUs. This is "easily" achieved by keeping them inside this +CPU's local cache; rolling back is then just a matter of discarding a +part of this cache without committing it to memory. From this point of +view, <a class="reference" href="https://arstechnica.com/business/2012/02/transactional-memory-going-mainstream-with-intel-haswell/">there is a lot to bet</a> that we are actually talking about the +regular per-core Level 1 and Level 2 caches --- so any transaction that +cannot fully store its read and written data in the 64+256KB of the L1+L2 +caches will abort.</p> +<p>So what does it mean? A Python interpreter overflows the L1 cache of +the CPU very quickly: just creating new Python function frames takes a +lot of memory (on the order of magnitude of 1/100 of the whole L1 +cache). Adding a 256KB L2 cache into the picture helps, particularly +because it is highly associative and thus avoids a lot of fake conflicts. 
+However, as long as the HTM support is limited to L1+L2 caches, +it is not going to be enough to run an "AME Python" with any sort of +medium-to-long transaction. It can +run a "GIL-less Python", though: just running a few hundred or even +thousand bytecodes at a time should fit in the L1+L2 caches, for most +bytecodes.</p> +<p>I would vaguely guess that it will take on the order of 10 years until +CPU cache sizes grow enough for a CPU in HTM mode to actually be able to +run 0.1-second transactions. (Of course in 10 years' time a lot of other +things may occur too, including the whole Transactional Memory model +being displaced by something else.)</p> +</div> +<div class="section"> +<h3><a id="write-your-own-stm-for-c" name="write-your-own-stm-for-c">Write your own STM for C</a></h3> +<p>Let's discuss now the last option: if neither GCC 4.7 nor HTM are +sufficient for an "AME CPython", then we might want to +write our own C compiler patch (as either extra work on GCC 4.7, or an +extra pass to LLVM, for example).</p> +<p>We would have to deal with the fact that we get low-level information, +and somehow need to preserve interesting high-level bits through the +compiler up to the point at which our pass runs: for example, whether +the field we read is immutable or not. (This is important because some +common objects are immutable, e.g. PyIntObject. Immutable reads don't +need to be recorded, whereas reads of mutable data must be protected +against other threads modifying them.) We can also have custom code to +handle the reference counters: e.g. not consider it a conflict if +multiple transactions have changed the same reference counter, but just +resolve it automatically at commit time. We are also free to handle I/O +in the way we want.</p> +<p>More generally, the advantage of this approach over both the current GCC +4.7 and over HTM is that we control the whole process. While this still +looks like a lot of work, it looks doable. 
It would be possible to come +up with a minimal patch of CPython that can be accepted into core +without too much troubles (e.g. to mark immutable fields and tweak the +refcounting macros), and keep all the cleverness inside the compiler +extension.</p> +</div> +<div class="section"> +<h3><a id="conclusion" name="conclusion">Conclusion</a></h3> +<p>I would assume that a programming model specific to PyPy and not +applicable to CPython has little chances to catch on, as long as PyPy is +not the main Python interpreter (which looks unlikely to change anytime +soon). Thus as long as only PyPy has AME, it looks like it will not +become the main model of multicore usage in Python. However, I can +conclude with a more positive note than during the EuroPython +conference: it is a lot of work, but there is a more-or-less reasonable +way forward to have an AME version of CPython too.</p> +<p>In the meantime, <tt class="docutils literal"><span class="pre">pypy-stm</span></tt> is around the corner, and together with +tools developed on top of it, it might become really useful and used. I +hope that in the next few years this work will trigger enough motivation +for CPython to follow the ideas.</p> +</div>stmhttps://www.pypy.org/posts/2012/08/multicore-programming-in-pypy-and-6595343388141556320.htmlThu, 09 Aug 2012 09:27:00 GMTSTM with threadshttps://www.pypy.org/posts/2012/06/stm-with-threads-7818875111634541910.htmlArmin Rigo<p>Hi all,</p><p>A quick update. The first version of pypy-stm <a class="reference" href="https://www.pypy.org/posts/2012/05/stm-update-back-to-threads-6622746581767639355.html">based on regular<br> +threads</a> is ready. Still having no JIT and a 4-or-5-times performance<br> +hit, it is not particularly fast, but I am happy that it turns out not<br> +to be much slower than the previous thread-less attempts. 
It is at<br> +least fast enough to run faster (in real time) than an equivalent no-STM<br> +PyPy, if fed with an eight-threaded program on an eight-core machine<br> +(provided, of course, you don't mind it eating all 8 cores' CPU power<br> +instead of just one :-).</p><p>You can download and play around with <a class="reference" href="https://cobra.cs.uni-duesseldorf.de/~buildmaster/misc/pypy-stm-38eb1fbc3c8d.bz2">this binary</a> for Linux 64. It<br> +was made from the <a class="reference" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/stm-thread">stm-thread</a> branch of the PyPy repository (<tt class="docutils literal"><span class="pre">translate.py --stm -O2 targetpypystandalone.py</span></tt>). (Be sure<br> +to put it where it can find its stdlib, e.g. by putting it inside the<br> +directory from the official <a class="reference" href="https://bitbucket.org/pypy/pypy/downloads/pypy-1.9-linux64.tar.bz2">1.9 release</a>.)</p><p>This binary supports the <tt class="docutils literal"><span class="pre">thread</span></tt> module and runs without the GIL.<br> +So, despite the factor-of-4 slow-down issue, it should be the <em>fourth</em><br> +complete Python interpreter in which we can reasonably claim to have<br> +resolved the problem of the GIL. (The first one was Greg Stein's Python<br> +1.4, re-explored <a class="reference" href="https://dabeaz.blogspot.ch/2011/08/inside-look-at-gil-removal-patch-of.html">here</a>; the second one is <a class="reference" href="https://jython.org/">Jython</a>; the third one is<br> +<a class="reference" href="https://ironpython.net/">IronPython</a>.) Unlike the previous three, it is also the first one to<br> +offer full GIL semantics to the programmer, and additionally<br> +<tt class="docutils literal"><span class="pre">thread.atomic</span></tt> (see below). 
I should also add that we're likely to<br> +see in the next year a 5th such interpreter, too, based on Hardware<br> +Transactional Memory (same approach as with STM, but using e.g.<br> +<a class="reference" href="https://software.intel.com/en-us/blogs/2012/02/07/transactional-synchronization-in-haswell/">Intel's HTM</a>).</p><p>The binary I linked to above supports all built-in modules from PyPy,<br> +apart from <tt class="docutils literal"><span class="pre">signal</span></tt>, still being worked on (which can be a bit<br> +annoying because standard library modules like <tt class="docutils literal"><span class="pre">subprocess</span></tt> depend on<br> +it). The <tt class="docutils literal"><span class="pre">sys.get/setcheckinterval()</span></tt> functions can be used to tweak<br> +the frequency of the automatic commits. Additionally, it offers<br> +<tt class="docutils literal"><span class="pre">thread.atomic</span></tt>, described in the <a class="reference" href="https://www.pypy.org/posts/2012/05/stm-update-back-to-threads-6622746581767639355.html">previous blog post</a> as a way to<br> +create longer atomic sections (with the observable effect of preventing<br> +the "GIL" to be released during that time). 
A complete<br> +<tt class="docutils literal"><span class="pre">transaction.py</span></tt> module based on it is available <a class="reference" href="https://foss.heptapod.net/pypy/pypy/-/tree/branch/stm-thread/lib_pypy/transaction.py">from the sources</a>.</p><p>The main missing features are:</p><ul class="simple"><li>the <tt class="docutils literal"><span class="pre">signal</span></tt> module;</li> +<li>the Garbage Collector, which does not do major collections so far, only<br> +minor ones;</li> +<li>and finally, the JIT, which needs some amount of integration to generate<br> +the correctly-tweaked assembler.</li> +</ul><p>Have fun!</p><p>Armin.</p>stmhttps://www.pypy.org/posts/2012/06/stm-with-threads-7818875111634541910.htmlSun, 10 Jun 2012 19:02:00 GMT \ No newline at end of file diff --git a/categories/sun.html b/categories/sun.html new file mode 100644 index 000000000..f78d6dc67 --- /dev/null +++ b/categories/sun.html @@ -0,0 +1,114 @@ + + + + + +Posts about sun | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/sun.xml b/categories/sun.xml new file mode 100644 index 000000000..538b45192 --- /dev/null +++ b/categories/sun.xml @@ -0,0 +1,13 @@ + +PyPy (Posts about sun)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssEP2008: PyPy meets Jythonhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlholger krekel<p>One of the great events at EuroPython 2008 were our chats and meetings with the Jython and Sun people. The Jython people recently are pushing into releasing Python version 2.5 and they currently pursue many interesting sub projects. Coincidentally, PyPy also has tons of interesting areas and results :) So we eventually got into brainstorming a number of possible technical collab ideas. Further below is a first list as i wrote it down from our 10 people PyPy / Jython 30 minute close up meeting yesterday. + +It felt great to be able to talk to the Jython people this way - kudos to Sun for their clear commitments and open ways to go about things! I sense a genuine interest on fair collaboration with non-java developer communities. Seems like they are serious about not focusing on "Java this", "Java that" anymore but rather focus on the JVM platform. Good! And about language +independent interest in ambitious technology. Even Better! I am tensed to see how things go from here. 
+ +So here the list of technical collab ideas: +</p><ul><li>ctypes - try to create _rawffi module in Java for Jython, which will enable Jython to reuse our existing ctypes implementation (and have PyPy use the Jython-rawffi for its own for PyPy.JVM)</li><li> generally see to share work / (continue) collaborate regarding extension modules</li><li>Jython/PyPy (and eventually IronPython): document known differences to CPython, maybe in a PEP</li><li>Python Interpreter for Jython (in order to run CPython's .pyc files): re-use pypy's bytecode evaluator, implement a "Jython object space". </li><li>re-use rpython-extension modules for jython (e.g. SRE), by compiling them to Java and reusing as a native library.</li><li>collaborate on testing framework / benchmarking, have a common site to show test results</li><li>make py.test compatible with jython</li><li>come up with a set of "pure Python language" tests, which would gather and refactor tests from CPython, PyPy and Jython. </li><li>look into using java types / jython approaches for implementing free threading.</li><li>share knowledge regarding JIT / psyco +</li></ul>If you have any more ideas, comments or would like to join efforts, let us know! + +Cheers and thanks to <a href="https://www.sauria.com/blog/">Ted Leung</a>, <a href="https://fwierzbicki.blogspot.com/">Frank Wierzbiki</a>, <a href="https://www.zyasoft.com/pythoneering/">Jim Baker</a> and Tobias Ivarsson from Sun and Jython fame respectively, + +Holgerep2008jythonpypysunhttps://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.htmlThu, 10 Jul 2008 08:29:00 GMT \ No newline at end of file diff --git a/categories/testing.html b/categories/testing.html new file mode 100644 index 000000000..0de60e3f0 --- /dev/null +++ b/categories/testing.html @@ -0,0 +1,114 @@ + + + + + +Posts about testing | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/testing.xml b/categories/testing.xml new file mode 100644 index 000000000..3ba3b2d5e --- /dev/null +++ b/categories/testing.xml @@ -0,0 +1,597 @@ + +PyPy (Posts about testing)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssFinding JIT Optimizer Bugs using SMT Solvers and Fuzzinghttps://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.htmlCarl Friedrich Bolz-Tereick<p>In this blog post I want to describe a recent bug finding technique that I've +added to the PyPy JIT testing infrastructure. This technique uses the Z3 +theorem prover to find bugs in the optimizer of PyPy's JIT, in particular its +integer operation optimizations. The approach is +based on things I have learned from <a class="reference external" href="https://www.cs.utah.edu/~regehr/">John Regehr's</a> <a class="reference external" href="https://blog.regehr.org/">blog</a> (<a class="reference external" href="https://blog.regehr.org/archives/1122">this post</a> is a +good first one to read), <a class="reference external" href="https://twitter.com/johnregehr/">Twitter</a>, and on +his (et al) paper <a class="reference external" href="https://www.cs.utah.edu/~regehr/alive2-pldi21.pdf">Alive2: Bounded Translation Validation for LLVM</a>. The work +was triggered by a recent miscompilation bug my current bachelor student Nico +Rittinghaus found.</p> +<section id="background-python-integers-in-the-pypy-jit"> +<h2>Background: Python Integers in the PyPy JIT</h2> +<p>The optimizer of PyPy's JITs operates on traces, which are linear sequences of +instructions with guards. The instructions in the traces operate on different +machine-level data types, machine integers, doubles, pointers, bools, etc. 
In +this post we'll be mostly concerned with machine integers.</p> +<p>To give some wider context I'll explain a bit how Python ints in the user code +relate to the types that are used in traces when the PyPy Python implementation +is used. +When PyPy turns a regular Python 3 function into a trace, there is a lot of work +happening in the JIT frontend to try to observe and infer the types that the +Python function concretely uses at runtime. The traces are generated under these +typing assumptions. Therefore, code that uses <code class="docutils literal">ints</code> in the Python code can +typically be translated into traces that operate on machine integers. In order +to make sure that the Python integer semantics are upheld, many of the +operations in the traces need to check that the integer results of some +operations still fit into a machine integer. If that is not the case (a rare +situation for most programs), the trace is left via a guard, execution falls +back to the interpreter, and there a big integer representation is chosen for +the too big value (the big integer representation is done via a pointer and +some storage on the heap).</p> +<p>All of this machinery is not going to be too relevant for the rest of the +post. For the post it's important to know that trace instructions operate on +machine integers and other low-level types, and some of the operations can +optionally check whether the +results still fit into a machine integer. 
These trace operations are improved by +the optimizer, which tries to transform the trace into one that behaves the +same, but is less costly to execute.</p> +</section> +<section id="background-bounds-analysis-in-pypy-s-jit"> +<h2>Background: Bounds Analysis in PyPy's JIT</h2> +<p>The optimizer of PyPy's JIT has an analysis based on <a class="reference external" href="https://en.wikipedia.org/wiki/Abstract_interpretation">abstract interpretation</a> +that tries to find out whether the integer values stored in a variable are +actually not using the full 64 bit (or 32 bit) range, but instead fit into some +smaller range. This means that for every integer variable <code class="docutils literal">x</code> in a trace, the +JIT compiler tracks upper and lower bounds of the runtime value of that +variable: a range <code class="docutils literal">[a, b]</code> such that for every concrete runtime value <code class="docutils literal">v</code> +that gets stored in variable <code class="docutils literal">x</code>, <code class="docutils literal">a &lt;= v &lt;= b</code> must be true. +<code class="docutils literal">a</code> and <code class="docutils literal">b</code> start out +as the most general <code class="docutils literal">MININT</code> and <code class="docutils literal">MAXINT</code>, but sometimes there is extra +information that makes it possible to improve these known bounds, and that is +often useful to optimize the code.</p> +<p>A typical example is that the JIT knows that the length of a string is +non-negative, so for this kind of code: <code class="docutils literal">x = len(s)</code> where <code class="docutils literal">s</code> is a string, +<code class="docutils literal">x</code> gets a range <code class="docutils literal">[0, MAXINT]</code> assigned. 
With this information we could for +example remove a check <code class="docutils literal">x + 10 &lt; 0</code> completely, because it can never be true.</p> +<p>The bounds information is useful for optimization, but the analysis of the +bounds is also a source of bugs in the JIT, because the reasoning is often +subtle and easy to get wrong in corner cases. We already use a number of testing +techniques to try to make sure that it is correct. A simple one is +<a class="reference external" href="https://hypothesis.works/articles/what-is-property-based-testing/">property-based testing</a> using <a class="reference external" href="https://github.com/HypothesisWorks/hypothesis">Hypothesis</a> on the operations on bounds. Even +though Hypothesis is fantastic, it unfortunately does not catch +absolutely all the bugs even if we'd like it to, as we'll see in the next +section.</p> +</section> +<section id="motivation-a-jit-miscompilation"> +<h2>Motivation: A JIT Miscompilation</h2> +<p>I am currently supervising a Bachelor thesis by Nico Rittinghaus, who is +extending the integer analysis in the JIT. He'll probably write a separate blog +post about that soon. In the process of his work, the current bounds analysis +code got a lot of scrutiny, and we found out that one of the unit tests of the +bounds analysis was actually incorrect, and the example code in that unit test +was optimized incorrectly. This case of incorrect optimization is not a big deal +for regular Python code, because it involved a "wrapping integer addition +operation", i.e. one where overflowing results just wrap around to negative +values. All the additions and other arithmetic operations that the PyPy Python +frontend generates actually have +overflow checks (to be able to switch to a big integer representation if +needed). 
+However, it's still possible to trigger the problem with the +<code class="docutils literal">__pypy__.intop.int_add</code> API which is a function that exposes wraparound +arithmetic on Python ints.</p> +<p><a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/issues/3832">Here's the miscompilation</a>. The JIT optimizes the following function:</p> +<div class="code"><pre class="code python"><a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-1" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-2" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-2"></a> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-3" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-4" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-5" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-5" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">15</span><span class="p">:</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-6" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-6"></a> <span class="k">if</span> <span class="n">x</span> <span class="o">&lt;</span> <span class="mi">6</span><span class="p">:</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-7" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-7"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-8" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-8"></a> <span class="k">return</span> <span class="mi">1</span> +<a id="rest_code_fe430f89c3ac44bd87113cd210a97ff1-9" name="rest_code_fe430f89c3ac44bd87113cd210a97ff1-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_fe430f89c3ac44bd87113cd210a97ff1-9"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>Into the following code:</p> +<div class="code"><pre class="code python"><a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-1" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-2" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-2" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-2"></a> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-3" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-4" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-5" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">15</span><span class="p">:</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-6" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-6"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-7" name="rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_4ffb3edd0ebd4f739819d99c60b8f91d-7"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>Basically the 
faulty reasoning of the JIT looks like this: if <code class="docutils literal">int_add(x, 10) &lt; 15</code> +then it must follow that <code class="docutils literal">x &lt; 5</code>, which is stronger than <code class="docutils literal">x &lt; 6</code>, so the +second <code class="docutils literal">if</code> is always true. This sounds good, but is actually wrong +if the addition <code class="docutils literal">+ 10</code> wrapped around. So if <code class="docutils literal">x == MAXINT</code>, then +<code class="docutils literal">int_add(x, 10) == MININT + 9 &lt; 15</code>. But <code class="docutils literal">MAXINT &lt; 5</code> is not +correct.</p> +<p>Note how the same reasoning with overflow-checking addition is correct! If <code class="docutils literal">x + +10 &lt; 15</code> and the <code class="docutils literal">+</code> didn't overflow, then indeed <code class="docutils literal">x &lt; 6</code>. And if your +mind bends starting to think about all this, you understand some of the +difficulty of getting the JIT correct in this area.</p> +</section> +<section id="how-could-we-have-avoided-this-bug"> +<h2>How could we have avoided this bug?</h2> +<p>One <a class="reference external" href="https://twitter.com/cfbolz/status/1482649144099586051">exercise I try to do after finding bugs</a> is to reflect on ways that the +bug could have been avoided. I think this is particularly important in the JIT, +where bugs are potentially really annoying to find and can cause very strange +behaviour in basically arbitrary Python code.</p> +<p>It's easy to always answer this question with "try to think more carefully +when working", but that approach cannot be relied on in complicated situations, +because humans don't concentrate perfectly for long stretches of time.</p> +<p>A situation-specific problem I identified was the bad design of the range analysis API. 
+A range is not just represented by two numbers, instead it's two numbers +and two bools that are supposed to represent that some operation did or did not +underflow/overflow. The meaning of these bools was quite hard to grasp and easy +to get wrong, so probably they should never have been introduced in the first +place (and my bugfix indeed removed them).</p> +<p>But in the rest of this blog post I want to talk about another, systematic +approach that can be applied to the problem of mis-optimizations of integer +operations, and that is done by applying an SMT solver to the problem.</p> +<p>An SMT solver (<a class="reference external" href="https://en.wikipedia.org/wiki/Satisfiability_modulo_theories">Satisfiability Modulo Theories</a>) is a tool that can be used to +find out whether mathematical formulas are "satisfiable", i.e. whether +some chosen set of inputs exists that will make the formulas evaluate to true. SMT solvers are +commonly used in a wide range of CS applications including program correctness +proofs, program synthesis, etc. The most widely known one is probably <a class="reference external" href="https://github.com/Z3Prover">Z3</a> by +Microsoft Research which has the nice advantage of coming with an easy-to-use +Python binding.</p> +<p>Going into this I basically knew next to nothing about SMT solvers (despite +having been embedded in a formal methods research group for years!) so it was an +interesting new world to learn about.</p> +<p>As briefly mentioned in the introduction, the approach I took followed a similar +(but <em>much</em> more properly executed) one applied to LLVM operations, called +<a class="reference external" href="https://github.com/AliveToolkit/alive2/">Alive2</a>. 
Krister Walfridsson has done <a class="reference external" href="https://kristerw.github.io/2022/09/13/translation-validation/">similar work for GCC recently</a>, +described on his blog.</p> +</section> +<section id="z3-proof-of-concept"> +<h2>Z3 Proof of Concept</h2> +<p>The first thing I did was to try to get Z3 to find the above bug, by encoding the +input program into an SMT formula by hand and trying to get Z3 to prove the condition +that the JIT thinks is always true. The Z3 code for this looks as follows:</p> +<div class="code"><pre class="code python"><a id="rest_code_2fe5dd23f4ec46749496562618a462eb-1" name="rest_code_2fe5dd23f4ec46749496562618a462eb-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-1"></a><span class="kn">from</span> <span class="nn">z3</span> <span class="kn">import</span> <span class="n">BitVec</span><span class="p">,</span> <span class="n">Implies</span><span class="p">,</span> <span class="n">prove</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-2" name="rest_code_2fe5dd23f4ec46749496562618a462eb-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-2"></a><span class="n">x</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'x'</span><span class="p">,</span> <span class="mi">64</span><span class="p">)</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-3" name="rest_code_2fe5dd23f4ec46749496562618a462eb-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-3"></a><span class="n">a</span> <span class="o">=</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">10</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-4" name="rest_code_2fe5dd23f4ec46749496562618a462eb-4" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-4"></a><span class="n">cond1</span> <span class="o">=</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">15</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-5" name="rest_code_2fe5dd23f4ec46749496562618a462eb-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-5"></a><span class="n">cond2</span> <span class="o">=</span> <span class="n">x</span> <span class="o">&lt;</span> <span class="mi">6</span> +<a id="rest_code_2fe5dd23f4ec46749496562618a462eb-6" name="rest_code_2fe5dd23f4ec46749496562618a462eb-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_2fe5dd23f4ec46749496562618a462eb-6"></a><span class="n">prove</span><span class="p">(</span><span class="n">Implies</span><span class="p">(</span><span class="n">cond1</span><span class="p">,</span> <span class="n">cond2</span><span class="p">))</span> +</pre></div> +<p>Here, <code class="docutils literal">x</code> is defined to be a bit vector variable of width 64, which is a +datatype that can be used to represent bounded machine integers. Addition on +bit vectors performs wraparound arithmetic, like the <code class="docutils literal">__pypy__.intop.int_add</code> +call in the original code. The JIT optimized the second condition away, so +essentially it was convinced that the first condition implies the second one. +The above snippet tries to get Z3 to confirm this.</p> +<p>When run, the above program prints:</p> +<pre class="literal-block">counterexample +[x = 9223372036854775803]</pre> +<p>Which shows the bug. As a small side-note, I thought it was cool that the +process of "proving" something in Z3 basically means trying to find an example +for the negation of the formula. If no counterexample can be found for the +negation, the original formula is true. 
If the original formula turns out to be +false (like here) we get a nice example that shows the problem to go with it.</p> +<p>It's not realistic to hand-translate all the hundreds of +unit-tests into Z3 formulas and then ask Z3 to prove the optimizations. Instead, +we want to have a program that does this for us.</p> +</section> +<section id="smt-checking-of-the-jit-optimizer"> +<h2>SMT Checking of the JIT Optimizer</h2> +<p>What we want from this program is the following: given an unoptimized trace and +its optimized version, we want to use Z3 to check whether the optimized trace +behaves identically to the unoptimized one. One question is what "behaves +identically" means. What we care about is the outputs of the trace being the +same values, no matter how they are computed. Also, for every guard we want to +make sure that it fails in identical ways in the optimized and unoptimized +versions. A guard is only allowed to be optimized away if it can never fail. +The code that comes after a guard can assume that the guard has not failed, +because otherwise execution would have left the trace. All of this should be +true regardless of the values of the input variables of the trace.</p> +<p>So in order to check that the two traces are behaving identically, we do the +following:</p> +<ul class="simple"> +<li><p>We create Z3 variables for every input variable. We use the same input +variables both for the unoptimized as well as the optimized trace.</p></li> +<li><p>We align the two traces at the corresponding guards. 
Thankfully the optimizer +keeps track of which optimized guard corresponds to which unoptimized input +guard.</p></li> +<li><p>All the operations before a guard are translated into Z3 formulas, for both +versions of the trace.</p></li> +<li><p>For two corresponding guards, we ask Z3 to prove that the guard conditions are +identical.</p></li> +<li><p>For a guard that was optimized away we ask Z3 to prove that the condition is +always true.</p></li> +<li><p>After a guard, we tell Z3 that from now on it can assume that the guard +condition is true.</p></li> +<li><p>We repeat this, guard for guard, until we reach the end of the trace. There, +we ask Z3 to prove that the output variables in the unoptimized trace and the +optimized trace are identical (every trace can return one or many values).</p></li> +</ul> +<p>I implemented this, it's <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/blob/branch/default/rpython/jit/metainterp/optimizeopt/test/test_z3checktests.py">not a lot of code</a>, basically a couple of hundred lines +of (somewhat hacky) Python code. So far I only support integer +operations. 
Here are some parts of the code to give you a flavor of what this +looks like.</p> +<p>This is the code that translates operations into Z3 formulas:</p> +<div class="code"><pre class="code python"><a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-1" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-1"></a><span class="k">def</span> <span class="nf">add_to_solver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ops</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-2" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-2"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">ops</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-3" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-3"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">type</span> <span class="o">!=</span> <span class="s1">'v'</span><span class="p">:</span> <span class="c1"># is it an operation with a result</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-4" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-4"></a> <span class="n">res</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">newvar</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-5" 
name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-5"></a> <span class="k">else</span><span class="p">:</span> <span class="c1"># or does it return void</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-6" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-6"></a> <span class="n">res</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-7" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-7"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-8" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-8"></a> <span class="c1"># ...</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-9" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-9"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-10" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-10"></a> <span class="c1"># convert arguments</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-11" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-11"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">numargs</span><span class="p">()</span> <span class="o">==</span> <span 
class="mi">1</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-12" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-12" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-12"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-13" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-13" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-13"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">numargs</span><span class="p">()</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-14" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-14" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-14"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-15" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-15" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-15"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<a 
id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-16" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-16" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-16"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-17" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-17" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-17"></a> <span class="c1"># compute results</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-18" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-18" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-18"></a> <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-19" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-19" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-19"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-20" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-20" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-20"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-21" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-21" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-21"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> +<a 
id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-22" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-22" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-22"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-23" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-23" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-23"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-24" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-24" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-24"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-25" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-25" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-25"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&amp;</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-26" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-26" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-26"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_or"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-27" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-27" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-27"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">|</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-28" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-28" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-28"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_xor"</span><span class="p">:</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-29" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-29" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-29"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">^</span> <span class="n">arg1</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-30" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-30" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-30"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-31" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-31" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-31"></a> <span class="c1"># ... 
more operations, some shown below</span> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-32" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-32" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-32"></a> +<a id="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-33" name="rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-33" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_7b1cf3642ca8434fa8f8705f8d783cf1-33"></a> <span class="bp">self</span><span class="o">.</span><span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">res</span> <span class="o">==</span> <span class="n">expr</span><span class="p">)</span> +</pre></div> +<p>New Z3 variables are defined by the helper function <code class="docutils literal">newvar</code>, which adds the +operation to a dictionary <code class="docutils literal">box_to_z3</code> mapping boxes (=variables) to Z3 +variables. 
Due to the <a class="reference external" href="https://en.wikipedia.org/wiki/Static_single-assignment_form">SSA</a> property that traces have, a variable must be defined +before its first use.</p> +<p>Here's what <code class="docutils literal">newvar</code> looks like (<code class="docutils literal">LONG_BIT</code> is a constant that is either +<code class="docutils literal">64</code> or <code class="docutils literal">32</code>, depending on the target architecture):</p> +<div class="code"><pre class="code python"><a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-1" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-1"></a><span class="k">def</span> <span class="nf">newvar</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">box</span><span class="p">,</span> <span class="nb">repr</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-2" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-2"></a> <span class="c1"># ... 
some logic around making the string representation</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-3" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-3"></a> <span class="c1"># somewhat nicer omitted</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-4" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-4"></a> <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="nb">repr</span><span class="p">,</span> <span class="n">LONG_BIT</span><span class="p">)</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-5" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-5"></a> <span class="bp">self</span><span class="o">.</span><span class="n">box_to_z3</span><span class="p">[</span><span class="n">box</span><span class="p">]</span> <span class="o">=</span> <span class="n">result</span> +<a id="rest_code_36cab9b8d68941ecafeac4cb42b72541-6" name="rest_code_36cab9b8d68941ecafeac4cb42b72541-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_36cab9b8d68941ecafeac4cb42b72541-6"></a> <span class="k">return</span> <span class="n">result</span> +</pre></div> +<p>The <code class="docutils literal">convert</code> method turns an operation argument (either a constant or a +variable) into a Z3 formula (either a constant bit vector or an already defined +Z3 variable). 
<code class="docutils literal">convertarg</code> is a helper function that takes an operation, reads +its nth argument and converts it.</p> +<div class="code"><pre class="code python"><a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-1" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-1"></a><span class="k">def</span> <span class="nf">convert</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">box</span><span class="p">):</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-2" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-2"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">box</span><span class="p">,</span> <span class="n">ConstInt</span><span class="p">):</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-3" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-3"></a> <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="n">box</span><span class="o">.</span><span class="n">getint</span><span class="p">(),</span> <span class="n">LONG_BIT</span><span class="p">)</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-4" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-4"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">box_to_z3</span><span class="p">[</span><span class="n">box</span><span class="p">]</span> +<a 
id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-5" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-5"></a> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-6" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-6"></a><span class="k">def</span> <span class="nf">convertarg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">box</span><span class="p">,</span> <span class="n">arg</span><span class="p">):</span> +<a id="rest_code_70b9c80263b2495bab6ea46cbe5febbc-7" name="rest_code_70b9c80263b2495bab6ea46cbe5febbc-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_70b9c80263b2495bab6ea46cbe5febbc-7"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">box</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span> +</pre></div> +<p>The lookup of variables in <code class="docutils literal">box_to_z3</code> that <code class="docutils literal">convert</code> does cannot fail, +because the variable must have been defined before use.</p> +<p>Comparisons return the bit vector 0 or bit vector 1; we use a helper function +<code class="docutils literal">cond</code> to turn the Z3 truth value of the comparison into a bit vector:</p> +<div class="code"><pre class="code python"><a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-1" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-1"></a><span class="k">def</span> <span class="nf">cond</span><span class="p">(</span><span
class="bp">self</span><span class="p">,</span> <span class="n">z3expr</span><span class="p">):</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-2" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-2"></a> <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">TRUEBV</span><span class="p">,</span> <span class="n">FALSEBV</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-3" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-3"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-4" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-4"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-5" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-5"></a><span class="k">def</span> <span class="nf">add_to_solver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ops</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-6" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-6"></a> <span class="c1"># ... 
start as above</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-7" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-7"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-8" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-8"></a> <span class="c1"># more cases</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-9" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-9"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_eq"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-10" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-10"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-11" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-11"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ne"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-12" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-12" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-12"></a> 
<span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-13" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-13" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-13"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-14" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-14" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-14"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-15" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-15" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-15"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_le"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-16" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-16" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-16"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;=</span> <span class="n">arg1</span><span 
class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-17" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-17" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-17"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_gt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-18" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-18" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-18"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-19" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-19" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-19"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ge"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-20" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-20" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-20"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;=</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-21" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-21" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-21"></a> 
<span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_true"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-22" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-22" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-22"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">FALSEBV</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-23" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-23" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-23"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_lt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-24" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-24" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-24"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-25" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-25" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-25"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_le"</span><span class="p">:</span> +<a 
id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-26" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-26" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-26"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-27" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-27" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-27"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_gt"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-28" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-28" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-28"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-29" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-29" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-29"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_ge"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-30" 
name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-30" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-30"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-31" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-31" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-31"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_zero"</span><span class="p">:</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-32" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-32" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-32"></a> <span class="n">expr</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">FALSEBV</span><span class="p">)</span> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-33" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-33" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-33"></a> +<a id="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-34" name="rest_code_af8f5f62807f4670b7bb2c8ec574b55d-34" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_af8f5f62807f4670b7bb2c8ec574b55d-34"></a> <span class="c1"># ... 
rest as above</span> +</pre></div> +<p>So basically for every trace operation that operates on integers I had to give a +translation into Z3 formulas, which is mostly straightforward.</p> +<p>Guard operations get converted into a Z3 boolean by their own helper function, +which looks like this:</p> +<div class="code"><pre class="code python"><a id="rest_code_3de914924f164344a1267234ae4925f2-1" name="rest_code_3de914924f164344a1267234ae4925f2-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-1"></a><span class="k">def</span> <span class="nf">guard_to_condition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">guard</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-2" name="rest_code_3de914924f164344a1267234ae4925f2-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-2"></a> <span class="n">opname</span> <span class="o">=</span> <span class="n">guard</span><span class="o">.</span><span class="n">getopname</span><span class="p">()</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-3" name="rest_code_3de914924f164344a1267234ae4925f2-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-3"></a> <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_true"</span><span class="p">:</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-4" name="rest_code_3de914924f164344a1267234ae4925f2-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-4"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">guard</span><span 
class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="n">TRUEBV</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-5" name="rest_code_3de914924f164344a1267234ae4925f2-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-5"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_false"</span><span class="p">:</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-6" name="rest_code_3de914924f164344a1267234ae4925f2-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-6"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">guard</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="n">FALSEBV</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-7" name="rest_code_3de914924f164344a1267234ae4925f2-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-7"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_value"</span><span class="p">:</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-8" name="rest_code_3de914924f164344a1267234ae4925f2-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-8"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span class="p">(</span><span class="n">guard</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">convertarg</span><span 
class="p">(</span><span class="n">guard</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<a id="rest_code_3de914924f164344a1267234ae4925f2-9" name="rest_code_3de914924f164344a1267234ae4925f2-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-9"></a> +<a id="rest_code_3de914924f164344a1267234ae4925f2-10" name="rest_code_3de914924f164344a1267234ae4925f2-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_3de914924f164344a1267234ae4925f2-10"></a> <span class="c1"># ... some more exist, shown below</span> +</pre></div> +<p>Some operations are a bit trickier. An important example in the context of +this blog post is the family of integer operations that check for overflow. The overflow +operations return a result, but also a boolean indicating whether the operation overflowed +or not.</p> +<div class="code"><pre class="code python"><a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-1" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-1"></a><span class="k">def</span> <span class="nf">add_to_solver</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ops</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-2" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-2"></a> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-3" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-3"></a> <span class="c1"># ...
more cases</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-4" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-4"></a> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-5" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-5"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add_ovf"</span><span class="p">:</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-6" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-6"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-7" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-7"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> <span class="o">+</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-8" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-8"></a> <span class="n">state</span><span 
class="o">.</span><span class="n">no_ovf</span> <span class="o">=</span> <span class="n">m</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">expr</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-9" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-9"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub_ovf"</span><span class="p">:</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-10" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-10"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-11" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-11"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> <span class="o">-</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-12" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-12" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-12"></a> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="o">=</span> <span class="n">m</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">expr</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-13" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-13" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-13"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul_ovf"</span><span class="p">:</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-14" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-14" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-14"></a> <span class="n">expr</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-15" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-15" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-15"></a> <span class="n">m</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> <span class="o">*</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> +<a 
id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-16" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-16" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-16"></a> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="o">=</span> <span class="n">m</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">SignExt</span><span class="p">(</span><span class="n">LONG_BIT</span><span class="p">,</span> <span class="n">expr</span><span class="p">)</span> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-17" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-17" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-17"></a> +<a id="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-18" name="rest_code_51a2bf22ac6042edb7137eeab86ff8c4-18" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_51a2bf22ac6042edb7137eeab86ff8c4-18"></a> <span class="c1"># ...</span> +</pre></div> +<p>The boolean is computed by comparing the result of the bit vector operation with +the result of converting the input bit vectors into an abstract (arbitrary +precision) integer and the result back to bit vectors. Let's go through the +addition case step by step, the other cases work analogously.</p> +<p>The addition in the first <code class="docutils literal">elif</code> that computes <code class="docutils literal">expr</code> is an addition on bit +vectors, therefore it is performing wraparound arithmetic. +<code class="docutils literal">z3.SignExt(LONG_BIT, arg0)</code> sign-extends <code class="docutils literal">arg0</code> from a bit vector of +<code class="docutils literal">LONG_BIT</code> bits to an abstract, arbitrary precision integer. 
The addition in +the second line is therefore an addition between abstract integers, so it will +never overflow and just compute the correct result as an integer.</p> +<p>The condition to check for overflow is now: if the results of the two different +ways to do the addition are the same, then overflow did not occur. So in order +to compute <code class="docutils literal">state.no_ovf</code> in the addition case the +code converts the result of the bit vector wraparound addition to +an abstract integer (using <code class="docutils literal">SignExt</code> again), and then compares that to the integer +result.</p> +<p>This boolean can then be checked by the guard operations <code class="docutils literal">guard_no_overflow</code> +and <code class="docutils literal">guard_overflow</code>.</p> +<div class="code"><pre class="code python"><a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-1" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-1"></a><span class="k">def</span> <span class="nf">guard_to_condition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">guard</span><span class="p">,</span> <span class="n">state</span><span class="p">):</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-2" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-2"></a> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-3" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-3"></a> <span class="c1"># ... 
more cases</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-4" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-4"></a> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-5" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-5"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_no_overflow"</span><span class="p">:</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-6" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-6"></a> <span class="k">assert</span> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-7" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-7"></a> <span class="k">return</span> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-8" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-8"></a> <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"guard_overflow"</span><span class="p">:</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-9" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-9" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-9"></a> <span class="k">assert</span> <span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-10" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-10" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-10"></a> <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">state</span><span class="o">.</span><span class="n">no_ovf</span><span class="p">)</span> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-11" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-11" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-11"></a> +<a id="rest_code_71e8db552ee64a1abcb47ebbdb1df319-12" name="rest_code_71e8db552ee64a1abcb47ebbdb1df319-12" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_71e8db552ee64a1abcb47ebbdb1df319-12"></a> <span class="c1"># ... more cases</span> +</pre></div> +</section> +<section id="finding-the-bug-again"> +<h2>Finding the Bug, Again</h2> +<p>Let's actually make all of this more concrete by applying it to the trace of our +original bug. 
The input trace and the incorrectly optimized trace for that look +like this (differences highlighted):</p> +<div class="code"><pre class="code python"><a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-1" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-1"></a><span class="c1"># input # optimized</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-2" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-2"></a><span class="p">[</span><span class="n">i0</span><span class="p">]</span> <span class="p">[</span><span class="n">i0</span><span class="p">]</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-3" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-3"></a><span class="n">i1</span> <span class="o">=</span> <span class="n">int_add</span><span class="p">(</span><span class="n">i0</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> <span class="n">i1</span> <span class="o">=</span> <span class="n">int_add</span><span class="p">(</span><span class="n">i0</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-4" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-4"></a><span class="n">i2</span> <span class="o">=</span> <span class="n">int_lt</span><span class="p">(</span><span class="n">i1</span><span class="p">,</span> <span class="mi">15</span><span class="p">)</span> <span class="n">i2</span> <span class="o">=</span> <span class="n">int_lt</span><span class="p">(</span><span 
class="n">i1</span><span class="p">,</span> <span class="mi">15</span><span class="p">)</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-5" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-5"></a><span class="n">guard_true</span><span class="p">(</span><span class="n">i2</span><span class="p">)</span> <span class="n">guard_true</span><span class="p">(</span><span class="n">i2</span><span class="p">)</span> +<a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-6" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-6"></a><span class="hll"><span class="n">i3</span> <span class="o">=</span> <span class="n">int_lt</span><span class="p">(</span><span class="n">i0</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span> <span class="n">jump</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +</span><a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-7" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-7"></a><span class="hll"><span class="n">guard_true</span><span class="p">(</span><span class="n">i3</span><span class="p">)</span> +</span><a id="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-8" name="rest_code_b7b84df3112e4bbf8acd0ef739239ca0-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_b7b84df3112e4bbf8acd0ef739239ca0-8"></a><span class="hll"><span class="n">jump</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +</span></pre></div> +<p>Note that the trace represents just one of the paths through the control flow +graph of the original function, which is typical for tracing JITs (the other +paths 
could incrementally get added later).</p> +<p>The first guards in both these traces correspond to each other, so the first +chunks to check are the first three operations (lines 1-4). Those operations +don't get changed by the optimizer at all.</p> +<p>These two identical traces get translated to the following Z3 formulas:</p> +<div class="code"><pre class="code text"><a id="rest_code_25c448b34dd145d1837209987991ae86-1" name="rest_code_25c448b34dd145d1837209987991ae86-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-1"></a>i1unoptimized == input_i0 + 10 +<a id="rest_code_25c448b34dd145d1837209987991ae86-2" name="rest_code_25c448b34dd145d1837209987991ae86-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-2"></a>i2unoptimized == If(i1unoptimized &lt; 15, 1, 0) +<a id="rest_code_25c448b34dd145d1837209987991ae86-3" name="rest_code_25c448b34dd145d1837209987991ae86-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-3"></a>i1optimized == input_i0 + 10 +<a id="rest_code_25c448b34dd145d1837209987991ae86-4" name="rest_code_25c448b34dd145d1837209987991ae86-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_25c448b34dd145d1837209987991ae86-4"></a>i2optimized == If(i1optimized &lt; 15, 1, 0) +</pre></div> +<p>To check that the two corresponding guards are the same, the solver is asked to +prove that <code class="docutils literal">(i2unoptimized == 1) == (i2optimized == 1)</code>. This is +correct, because the formulas for <code class="docutils literal">i2unoptimized</code> and <code class="docutils literal">i2optimized</code> are +completely identical.</p> +<p>After checking that the guards behave the same, we add the knowledge to the +solver that the guards passed. 
So the Z3 formulas become:</p> +<div class="code"><pre class="code text"><a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-1" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-1"></a>i1unoptimized == input_i0 + 10 +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-2" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-2"></a>i2unoptimized == If(i1unoptimized &lt; 15, 1, 0) +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-3" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-3"></a>i1optimized == input_i0 + 10 +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-4" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-4"></a>i2optimized == If(i1optimized &lt; 15, 1, 0) +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-5" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-5"></a>i2unoptimized == 1 +<a id="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-6" name="rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_bd0fcf12b5514a38b91ef86a0afa4a3c-6"></a>i2optimized == 1 +</pre></div> +<p>Now we continue with the remaining operations of the two traces (lines 6-8).</p> +<p>We start by adding the <code class="docutils literal">int_lt</code> operation in the unoptimized trace to the Z3 +formulas:</p> +<div class="code"><pre class="code text"><a id="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-1" 
name="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_572cd48587b84ad4aea4ab9fb60d80fd-1"></a>... +<a id="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-2" name="rest_code_572cd48587b84ad4aea4ab9fb60d80fd-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_572cd48587b84ad4aea4ab9fb60d80fd-2"></a>i3unoptimized == If(input_i0 &lt; 6, 1, 0) +</pre></div> +<p>Because the second guard was optimized away, we need to ask Z3 to prove that +<code class="docutils literal">i3unoptimized == 1</code> is always true, which fails and gives the following +counterexample:</p> +<div class="code"><pre class="code text"><a id="rest_code_dad63ba423ac4e599c421529bf5361a0-1" name="rest_code_dad63ba423ac4e599c421529bf5361a0-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-1"></a>input_i0 = 9223372036854775800 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-2" name="rest_code_dad63ba423ac4e599c421529bf5361a0-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-2"></a>i1unoptimized = 9223372036854775810 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-3" name="rest_code_dad63ba423ac4e599c421529bf5361a0-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-3"></a>i2unoptimized = 1 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-4" name="rest_code_dad63ba423ac4e599c421529bf5361a0-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-4"></a>i1optimized = 9223372036854775810 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-5" name="rest_code_dad63ba423ac4e599c421529bf5361a0-5" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-5"></a>i2optimized = 1 +<a id="rest_code_dad63ba423ac4e599c421529bf5361a0-6" name="rest_code_dad63ba423ac4e599c421529bf5361a0-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_dad63ba423ac4e599c421529bf5361a0-6"></a>i3unoptimized = 0 +</pre></div> +<p>Thus demonstrating the bug. The fact that the Z3-based equivalence check also +managed to find the original motivating bug without manually translating it to +a formula is a good confirmation that the approach works.</p> +</section> +<section id="second-bug"> +<h2>Second bug</h2> +<p>So with this code I applied the Z3-based equivalence check to all our optimizer +unit tests. In addition to the bug we've been discussing the whole post, it also +found another buggy test! I had found it too by hand by staring at all the tests +in the process of writing all the Z3 infrastructure, but it was still a good +confirmation that the process worked. This bug was in the range analysis for +<code class="docutils literal">int_neg</code>, integer negation. 
It failed to account that <code class="docutils literal"><span class="pre">-MININT</span> == MININT</code> +and therefore did a mis-optimization along the following lines:</p> +<div class="code"><pre class="code python"><a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-1" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-2" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-2"></a> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-3" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-4" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-5" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">0</span><span 
class="p">:</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-6" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-6"></a> <span class="k">if</span> <span class="n">x</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-7" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-7"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-8" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-8" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-8"></a> <span class="k">return</span> <span class="mi">1</span> +<a id="rest_code_486f2b8abd90465a8220a1becde3f0bd-9" name="rest_code_486f2b8abd90465a8220a1becde3f0bd-9" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_486f2b8abd90465a8220a1becde3f0bd-9"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>Which was wrongly optimized into:</p> +<div class="code"><pre class="code python"><a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-1" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-1"></a><span class="kn">import</span> <span class="nn">__pypy__</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-2" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-2"></a> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-3" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-3" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-3"></a><span class="k">def</span> <span class="nf">wrong</span><span class="p">(</span><span class="n">x</span><span class="p">):</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-4" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-4"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">__pypy__</span><span class="o">.</span><span class="n">intop</span><span class="o">.</span><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-5" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-5"></a> <span class="k">if</span> <span class="n">a</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">:</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-6" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-6" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-6"></a> <span class="k">return</span> <span class="mi">0</span> +<a id="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-7" name="rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-7" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_a6cf538b3ecd4a0ebaf7b8c0cc7c7007-7"></a> <span class="k">return</span> <span class="mi">2</span> +</pre></div> +<p>This is wrong precisely for <code class="docutils literal">x == MININT</code>.</p> +</section> +<section id="generating-random-traces"> +<h2>Generating Random Traces</h2> +<p>These two bugs were the only two that the Z3 checker found for existing unit 
+tests. To try to find some more bugs I combined PyPy's existing random trace +generator with the Z3 optimization checker. The random trace generator has so +far been mostly used to find bugs in the machine code backends, particularly +also in the register allocator. So far we haven't used it with our optimizer, +but my experiments show that we should have!</p> +<p>I'm going to describe a little bit how the random trace generator works. It's +actually not that complicated, but there's one neat trick to it.</p> +<p>The basic idea is straightforward, it starts out with an empty trace with a +random number of input variables. Then it adds some number of operations to the +trace, either regular operations or guards. Every operation takes already +existing variables as input.</p> +<p>The neat trick is that our random trace generator keeps a concrete random +example value for every one of the input variables, and an example result for +every operation. In this way, it is possible to generate guards that are +consistent with the example values to ensure that running the trace to its end +is possible with at least one set of values.</p> +<p>Here's an example random trace that is generated, together with the random +example inputs and the results of every operation at the end of every line:</p> +<pre class="literal-block">[i0, i1, i2, i3, i4, i5] # example values: 9, 11, -8, -95, 46, 57 +i6 = int_add_ovf(i3, i0) # -86 +guard_no_overflow() +i7 = int_sub(i2, -35/ci) # 27 +i8 = uint_ge(i3, i5) # 1 +guard_true(i8) +i9 = int_lt(i7, i8) # 0 +i10 = int_mul_ovf(34/ci, i7) # 918 +guard_no_overflow() +i11 = int_and(i10, 63/ci) # 22 +i12 = int_rshift(i3, i11) # -1 +i13 = int_is_zero(i7) # 0 +i14 = int_is_true(i13) # 0 +guard_false(i13) +i15 = int_lt(i8, i4) # 1 +i16 = int_and(i6, i0) # 8 +i17 = uint_ge(i6, -6/ci) # 0 +finish()</pre> +<p>Note how every guard generated is true for the example values.</p> +<p>I have been running this combination of random trace generation and Z3 checking 
+for many nights and it has found some bugs, which I'll describe in the next +section. It should probably be run for a lot longer, but still a useful +exercise already.</p> +<p>In this mode, I'm giving every Z3 call a time limit to make sure that the random +tests don't just take arbitrarily long. This means that asking Z3 to prove +something can have three outcomes, either it's proved, or Z3 finds a +counterexample, or Z3 times out.</p> +</section> +<section id="bugs-found"> +<h2>Bugs Found</h2> +<p>In addition to the two bugs I've already described, I'll briefly list the +additional bugs that were found by optimizing random traces and then trying to +prove the equivalence with Z3.</p> +<p>Most of the bugs were actually identified by optimizing random traces alone, not +by the Z3 component. They manifested as assert failures in the JIT compiler.</p> +<ul class="simple"> +<li><p>The JIT concluded after <code class="docutils literal">12 == int_mul(x, 12)</code> that <code class="docutils literal">x == 1</code>, which is +incorrect if overflow occurred (a counterexample is <code class="docutils literal">0x8000000000000001</code>).</p></li> +<li><p>An amusing bug, where from <code class="docutils literal">0 == int_lshift(0x1000000000000000, x)</code> with +<code class="docutils literal">0 &lt;= x &lt;= 15</code>, the JIT concluded that <code class="docutils literal">0x1000000000000000 == 0</code>, +triggering an assert. This wrong conclusion was again caused by not taking the +possibility of overflow into account.</p></li> +<li><p>A corner case in an optimization for chained integer additions with a +constant, where in complex enough expressions, the wrong IR API was used +(which works correctly in simple cases). 
Again, this triggered an assert.</p></li> +</ul> +<p>This shows that we should have been fuzzing our JIT optimizer already (not a +surprising observation in hindsight, fuzz all the things!).</p> +<p>Thankfully, there was also one further bug that really failed in the Z3 +verifier. It's a bug in common subexpression elimination / arithmetic +simplification, which again does not take overflow correctly into account.</p> +<p>The buggy trace looks like this (unfortunately it's not easily possible to show +this bug in Python code).</p> +<div class="code"><pre class="code text"><a id="rest_code_40493479399f42558ecf3121b6abb0ca-1" name="rest_code_40493479399f42558ecf3121b6abb0ca-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-1"></a>[a, b] +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-2" name="rest_code_40493479399f42558ecf3121b6abb0ca-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-2"></a>c = int_add(a, b) +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-3" name="rest_code_40493479399f42558ecf3121b6abb0ca-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-3"></a>r = int_sub_ovf(c, b) +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-4" name="rest_code_40493479399f42558ecf3121b6abb0ca-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-4"></a>guard_no_ovf() +<a id="rest_code_40493479399f42558ecf3121b6abb0ca-5" name="rest_code_40493479399f42558ecf3121b6abb0ca-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_40493479399f42558ecf3121b6abb0ca-5"></a>finish(r) +</pre></div> +<p>This was optimized to:</p> +<div class="code"><pre class="code text"><a id="rest_code_30cdbc23b541425f891edc9180ced3c0-1" name="rest_code_30cdbc23b541425f891edc9180ced3c0-1" 
href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_30cdbc23b541425f891edc9180ced3c0-1"></a>[a, b] +<a id="rest_code_30cdbc23b541425f891edc9180ced3c0-2" name="rest_code_30cdbc23b541425f891edc9180ced3c0-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_30cdbc23b541425f891edc9180ced3c0-2"></a>finish(a) +</pre></div> +<p>Which is incorrect, because the guard can fail given the right inputs. +But the optimizer concluded that the subtraction is safe, because it's the +inverse of an earlier addition, not taking into account that this earlier +addition can have overflowed.</p> +<p>Note that a related optimization is actually correct. Given this code:</p> +<div class="code"><pre class="code text"><a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-1" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-1"></a>[a, b] +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-2" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-2"></a>c = int_add_ovf(a, b) +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-3" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-3"></a>guard_no_ovf() +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-4" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-4"></a>r = int_sub(c, b) +<a id="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-5" name="rest_code_6037a89ec2e141f3a6fb830fe938b2f4-5" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_6037a89ec2e141f3a6fb830fe938b2f4-5"></a>finish(r) 
+</pre></div> +<p>It can be optimized to:</p> +<div class="code"><pre class="code text"><a id="rest_code_231f1b026f874575959e48a29de9a78d-1" name="rest_code_231f1b026f874575959e48a29de9a78d-1" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-1"></a>[a, b] +<a id="rest_code_231f1b026f874575959e48a29de9a78d-2" name="rest_code_231f1b026f874575959e48a29de9a78d-2" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-2"></a>c = int_add_ovf(a, b) +<a id="rest_code_231f1b026f874575959e48a29de9a78d-3" name="rest_code_231f1b026f874575959e48a29de9a78d-3" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-3"></a>guard_no_ovf() +<a id="rest_code_231f1b026f874575959e48a29de9a78d-4" name="rest_code_231f1b026f874575959e48a29de9a78d-4" href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html#rest_code_231f1b026f874575959e48a29de9a78d-4"></a>finish(a) +</pre></div> +</section> +<section id="future-work-and-conclusion"> +<h2>Future Work and Conclusion</h2> +<p>In the current form the Z3 checker is only a start, even though it has already +been concretely useful. There are various directions into which we could extend +it. 
In addition to generating random tests completely from scratch, we could also +start from the existing manually written unit-tests and randomly mutate those.</p> +<p>I also want to extend the Z3 checker with support for more operations, heap +operations in particular (but it's not quite clear to me how to model garbage +collection).</p> +<p>I also want to try to switch the code away from the Z3 API and use the more +general <a class="reference external" href="https://smtlib.cs.uiowa.edu/">smtlib</a> interface directly, in order to be able to use other SMT +checkers than Z3, e.g. <a class="reference external" href="https://cvc4.github.io/">CVC4</a>.</p> +<p>But all in all this was a fun and not too hard way to find a bunch of bugs in +our optimizer! And the infrastructure is now in place, which means that we run +some random test cases every time we execute our tests. This is going to be +particularly useful when we do further work on the integer reasoning of the JIT +(like Nico is doing, for example). 
As of time of writing of this post, all the +bugs mentioned have been fixed and the Z3 code has landed on the default branch +and runs as part of PyPy's CI infrastructure.</p> +</section> +<section id="acknowledgements"> +<h2>Acknowledgements</h2> +<p>Thanks to <a class="reference external" href="http://saambarati.org/">Saam Barati</a>, <a class="reference external" href="https://bernsteinbear.com">Max Bernstein</a>, <a class="reference external" href="https://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/unser-team/team/schmidt">Joshua Schmidt</a> and <a class="reference external" href="https://martinfriedrichberger.net/">Martin +Berger</a>, for great feedback on drafts of this post!</p> +</section>jittestinghttps://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.htmlSun, 11 Dec 2022 18:00:00 GMT \ No newline at end of file diff --git a/categories/toy-optimizer.html b/categories/toy-optimizer.html new file mode 100644 index 000000000..a0b3f88c7 --- /dev/null +++ b/categories/toy-optimizer.html @@ -0,0 +1,123 @@ + + + + + +Posts about toy-optimizer | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/toy-optimizer.xml b/categories/toy-optimizer.xml new file mode 100644 index 000000000..fce4b8585 --- /dev/null +++ b/categories/toy-optimizer.xml @@ -0,0 +1,3643 @@ + +PyPy (Posts about toy-optimizer)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssA Knownbits Abstract Domain for the Toy Optimizer, Correctlyhttps://www.pypy.org/posts/2024/08/toy-knownbits.htmlCF Bolz-Tereick<p>After <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/">Max' introduction to abstract interpretation for the toy optimizer</a> in the +last post, I want to present a more complicated abstract domain in this post. +This abstract domain reasons about the individual bits of a variable in a trace. +Every bit can be either "known zero", "known one" or "unknown". The abstract +domain is useful for optimizing integer operations, particularly the bitwise operations. +The abstract domain follows quite closely the <a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">tristate abstract domain of the +eBPF verifier in the Linux +Kernel</a>, as +described by the paper +<a href="https://arxiv.org/abs/2105.05398">Sound, Precise, and Fast Abstract Interpretation with Tristate +Numbers</a> by Harishankar Vishwanathan, Matan +Shachnai, Srinivas Narayana, and Santosh Nagarakatte.</p> +<p>The presentation in this post will still be in the context of the +<a href="https://www.pypy.org/categories/toy-optimizer">toy optimizer</a>. We'll spend a significant part of +the post convincing ourselves that the abstract domain transfer functions that +we're writing are really correct, using both property-based testing and +automated proofs (again using Z3).</p> +<p>PyPy has implemented and merged a more complicated version of the same abstract +domain for the "real" PyPy JIT. 
A more thorough explanation of that real world +implementation will follow.</p> +<p>I'd like to thank Max Bernstein and Armin Rigo for lots of great feedback on +drafts of this post. The PyPy implementation was mainly done by Nico +Rittinghaus and me.</p> +<p><strong>Contents:</strong></p> +<div class="toc"> +<ul> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#motivation">Motivation</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#the-knownbits-abstract-domain">The Knownbits Abstract Domain</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#transfer-functions">Transfer Functions</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#property-based-tests-with-hypothesis">Property-based Tests with Hypothesis</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#when-are-transfer-functions-correct-how-do-we-test-them">When are Transfer Functions Correct? How do we test them?</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#implementing-binary-transfer-functions">Implementing Binary Transfer Functions</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#addition-and-subtraction">Addition and Subtraction</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#proving-correctness-of-the-transfer-functions-with-z3">Proving correctness of the transfer functions with Z3</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#cases-where-this-style-of-z3-proof-doesnt-work">Cases where this style of Z3 proof doesn't work</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#making-statements-about-precision">Making Statements about Precision</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#using-the-abstract-domain-in-the-toy-optimizer-for-generalized-constant-folding">Using the Abstract Domain in the Toy Optimizer 
for Generalized Constant Folding</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#using-the-knownbits-domain-for-conditional-peephole-rewrites">Using the KnownBits Domain for Conditional Peephole Rewrites</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#conclusion">Conclusion</a></li> +</ul> +</div> +<h3 id="motivation">Motivation</h3> +<p>In many programs that do bit-manipulation of integers, some of the bits of the +integer variables of the program can be statically known. Here's a simple +example:</p> +<div class="code"><pre class="code literal-block"><span class="nv">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nv">a</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="mi">1</span> +... +<span class="k">if</span><span class="w"> </span><span class="nv">x</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mi">1</span>: +<span class="w"> </span>... +<span class="k">else</span>: +<span class="w"> </span>... +</pre></div> + +<p>After the assignment <code>x = a | 1</code>, we know that the lowest bit of <code>x</code> must be <code>1</code> +(the other bits are unknown) and an optimizer could remove the condition <code>x &amp; 1</code> by +constant-folding it to <code>1</code>.</p> +<p>Another (more complicated) example is:</p> +<div class="code"><pre class="code literal-block">assert i &amp; 0b111 == 0 # check that i is a multiple of 8 +j = i + 16 +assert j &amp; 0b111 == 0 +</pre></div> + +<p>This kind of code could e.g. happen in a <a href="https://docs.pydrofoil.org/en/latest/">CPU +emulator</a>, where <code>i</code> and <code>j</code> are +integers that represent emulated pointers, and the <code>assert</code>s are alignment +checks. The first assert implies that the lowest three bits of i must be <code>0</code>. 
+Adding 16 to such a number produces a result where the lowest three bits are +again all <code>0</code>, therefore the second assert is always true. So we would like a +compiler to remove the second assert.</p> +<p>Both of these optimizations are doable with the help of the knownbits +abstract domain that we'll discuss in the rest of the post.</p> +<h3 id="the-knownbits-abstract-domain">The Knownbits Abstract Domain</h3> +<p>An abstract value of the knownbits domain needs to be able to store, for every +bit of an integer variable in a program, whether it is known 0, known 1, or +unknown. To represent +three different states, we need 2 bits, which we will call <code>one</code> and <code>unknown</code>. +Here's the encoding:</p> +<table> +<thead> +<tr> +<th>one</th> +<th>unknown</th> +<th align="right">knownbit</th> +</tr> +</thead> +<tbody> +<tr> +<td>0</td> +<td>0</td> +<td align="right">0</td> +</tr> +<tr> +<td>1</td> +<td>0</td> +<td align="right">1</td> +</tr> +<tr> +<td>0</td> +<td>1</td> +<td align="right">?</td> +</tr> +<tr> +<td>1</td> +<td>1</td> +<td align="right">illegal</td> +</tr> +</tbody> +</table> +<p>The <code>unknown</code> bit is set if we don't know the value of the bit ("?"), the <code>one</code> +bit is set if the bit is known to be a <code>1</code>. Since two bits are enough to encode +four different states, but we only need three, the combination of a set <code>one</code> +bit and a set <code>unknown</code> is not allowed.</p> +<p>We don't just want to encode a single bit, however. Instead, we want to do this +for all the bits of an integer variable. 
Therefore the instances of the abstract +domain get two integer fields <code>ones</code> and <code>unknowns</code>, where each pair of +corresponding bits encodes the knowledge about the corresponding bit of the +integer variable in the program.</p> +<p>We can start implementing a Python class that works like this:</p> +<div class="code"><pre class="code literal-block"><span class="kn">from</span> <span class="nn">dataclasses</span> <span class="kn">import</span> <span class="n">dataclass</span> + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="n">ones</span> <span class="p">:</span> <span class="nb">int</span> + <span class="n">unknowns</span> <span class="p">:</span> <span class="nb">int</span> + + <span class="k">def</span> <span class="nf">__post_init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> + <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_well_formed</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">is_well_formed</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="c1"># a bit cannot be both 1 and unknown</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="mi">0</span> + + <span class="nd">@staticmethod</span> + <span 
class="k">def</span> <span class="nf">from_constant</span><span class="p">(</span><span class="n">const</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Construct a KnownBits corresponding to a constant, where all bits</span> +<span class="sd"> are known."""</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">const</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">is_constant</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Check if the KnownBits instance represents a constant. """</span> + <span class="c1"># it's a constant if there are no unknowns</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="mi">0</span> +</pre></div> + +<p>We can also add some convenience properties. Sometimes it is easier to work +with an integer where all the <em>known</em> bits are set, or one where the positions +of all the known zeros have a set bit:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">knowns</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" return an integer where the known bits are set. 
"""</span> + <span class="c1"># the knowns are just the unknowns, inverted</span> + <span class="k">return</span> <span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">zeros</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" return an integer where the places that are known zeros have a bit</span> +<span class="sd"> set. """</span> + <span class="c1"># it's a 0 if it is known, but not 1</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span> +</pre></div> + +<p>Also, for debugging and for writing tests we want a way to print the known bits +in a human-readable form, and also to have a way to construct a <code>KnownBits</code> +instance from a string. 
It's not important to understand the details of +<code>__str__</code> or <code>from_str</code> for the rest of the post, so I'm putting them into a fold:</p> +<details> +<summary><code>KnownBits</code> from and to string conversions</summary> + + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="k">return</span> <span class="sa">f</span><span class="s2">"KnownBits.from_constant(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="si">}</span><span class="s2">)"</span> + <span class="k">return</span> <span class="sa">f</span><span class="s2">"KnownBits(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="si">}</span><span class="s2">)"</span> + + <span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> + <span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> + <span class="c1"># construct the string representation right to left</span> + <span class="k">while</span> <span 
class="mi">1</span><span class="p">:</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">ones</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">unknowns</span><span class="p">:</span> + <span class="k">break</span> <span class="c1"># we leave off the leading known 0s</span> + <span class="k">if</span> <span class="n">ones</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">unknowns</span><span class="p">:</span> + <span class="c1"># -1 has all bits set in two's complement, so the leading</span> + <span class="c1"># bits are all 1</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'1'</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"..."</span><span class="p">)</span> + <span class="k">break</span> + <span class="k">if</span> <span class="n">unknowns</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span><span class="p">:</span> + <span class="c1"># -1 has all bits set in two's complement, so the leading bits</span> + <span class="c1"># are all ?</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">ones</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"?"</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"..."</span><span class="p">)</span> + <span class="k">break</span> + <span class="k">if</span> <span class="n">unknowns</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span 
class="p">(</span><span class="s1">'?'</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">ones</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'1'</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'0'</span><span class="p">)</span> + <span class="n">ones</span> <span class="o">&gt;&gt;=</span> <span class="mi">1</span> + <span class="n">unknowns</span> <span class="o">&gt;&gt;=</span> <span class="mi">1</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">res</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'0'</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">reverse</span><span class="p">()</span> + <span class="k">return</span> <span class="s2">""</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> + + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">from_str</span><span class="p">(</span><span class="n">s</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Construct a KnownBits instance that from a string. String can start</span> +<span class="sd"> with ...1 to mean that all higher bits are 1, or ...? to mean that all</span> +<span class="sd"> higher bits are unknown. Otherwise it is assumed that the higher bits</span> +<span class="sd"> are all 0. 
"""</span> + <span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">s</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"...?"</span><span class="p">):</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">4</span> + <span class="k">elif</span> <span class="n">s</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"...1"</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">4</span> + <span class="k">for</span> <span class="n">index</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">startindex</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">s</span><span class="p">)):</span> + <span class="n">ones</span> <span class="o">&lt;&lt;=</span> <span class="mi">1</span> + <span class="n">unknowns</span> <span class="o">&lt;&lt;=</span> <span class="mi">1</span> + <span class="n">c</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">index</span><span class="p">]</span> + <span class="k">if</span> <span class="n">c</span> <span class="o">==</span> <span class="s1">'1'</span><span class="p">:</span> + <span class="n">ones</span> <span class="o">|=</span> <span class="mi">1</span> + <span class="k">elif</span> <span class="n">c</span> <span class="o">==</span> <span 
class="s1">'?'</span><span class="p">:</span> + <span class="n">unknowns</span> <span class="o">|=</span> <span class="mi">1</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">all_unknown</span><span class="p">():</span> +<span class="w"> </span><span class="sd">""" convenience constructor for the "all bits unknown" abstract value</span> +<span class="sd"> """</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s2">"...?"</span><span class="p">)</span> +</pre></div> + + + +</details> + +<p>And here's a <a href="https://pytest.org">pytest</a>-style unit test for <code>str</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_str</span><span class="p">():</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'0'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">5</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'101'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mb">0b10</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'1?1'</span> + <span 
class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="o">~</span><span class="mb">0b1111</span><span class="p">,</span> <span class="mb">0b10</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'...100?0'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">~</span><span class="mb">0b1</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'...?1'</span> +</pre></div> + +<p>An instance of <code>KnownBits</code> represents a set of integers, namely those that match +the known bits stored in the instance. We can write a method <code>contains</code> that +takes a concrete <code>int</code> value and returns <code>True</code> if the value matches the +pattern of the known bits:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">contains</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Check whether the KnownBits instance contains the concrete integer</span> +<span class="sd"> `value`. """</span> + <span class="c1"># check whether value matches the bit pattern. 
in the places where we</span> + <span class="c1"># know the bits, the value must agree with ones.</span> + <span class="k">return</span> <span class="n">value</span> <span class="o">&amp;</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> +</pre></div> + +<p>and a test:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_contains</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'1?1'</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b111</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b101</span><span class="p">)</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b110</span><span class="p">)</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b011</span><span class="p">)</span> + + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?1'</span><span class="p">)</span> <span class="c1"># all odd numbers</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span 
class="o">-</span><span class="mi">101</span><span class="p">,</span> <span class="mi">100</span><span class="p">):</span> + <span class="k">assert</span> <span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="o">==</span> <span class="p">(</span><span class="n">i</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">)</span> +</pre></div> + +<h3 id="transfer-functions">Transfer Functions</h3> +<p>Now that we have implemented the basics of the <code>KnownBits</code> class, we need to +start implementing the transfer functions. They are for computing what we know +about the <em>results</em> of an operation, given the knowledge we have about the bits +of the arguments.</p> +<p>We'll start with a simple unary operation, <code>invert(x)</code> (which is <code>~x</code> in Python +and C syntax), which flips all the bits of at integer. If we know some bits of +the arguments, we can compute the corresponding bits of the result. 
The unknown +bits remain unknown.</p> +<p>Here's the code:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_invert</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="c1"># self.zeros has bits set where the known 0s are in self</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">zeros</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> +</pre></div> + +<p>And a unit-test:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_invert</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="o">==</span> <span class="s1">'...10?10?10?'</span> + + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="k">assert</span> <span class="nb">str</span><span 
class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="o">==</span> <span class="s1">'...?'</span> +</pre></div> + +<p>Before we continue with further transfer functions, we'll think about +correctness of the transfer functions and build up some test infrastructure. To +test transfer functions, it's quite important to move beyond simple example-style +unit tests. The state-space for more complicated binary transfer functions is +extremely large and it's too easy to do something wrong in a corner case. +Therefore we'll look at property-based tests for <code>KnownBits</code> next.</p> +<h3 id="property-based-tests-with-hypothesis">Property-based Tests with Hypothesis</h3> +<p>We want to do property-based tests of <code>KnownBits</code>, to try to +make it less likely that we'll get a corner-case in the implementation wrong. +We'll use <a href="https://hypothesis.readthedocs.io/en/latest/">Hypothesis</a> for that.</p> +<p>I can't give a decent introduction to Hypothesis here, but want to give a few +hints about the API. Hypothesis is a way to run unit tests with randomly +generated input. It provides <em>strategies</em> to describe the data that the test +functions expect. Hypothesis provides primitive strategies (for things like +integers, strings, floats, etc) and ways to build composite strategies out of +the primitive ones.</p> +<p>To be able to write the tests, we need to generate random <code>KnownBits</code> instances, +and we also want an <code>int</code> instance that is a member of the <code>KnownBits</code> instance. +We generate tuples of <code>(KnownBits, int)</code> together, to ensure this property. +We'll ask Hypothesis to generate us a random concrete <code>int</code> as the concrete +value, and then we'll also generate a second random <code>int</code> to use as the +<code>unknowns</code> mask (i.e. which bits of the concrete int we don't know in the +<code>KnownBits</code> instance). 
Here's a function that takes two such ints and builds the +tuple:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">build_knownbits_and_contained_number</span><span class="p">(</span><span class="n">concrete_value</span> <span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">unknowns</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> + <span class="c1"># to construct a valid KnownBits instance, we need to mask off the unknown</span> + <span class="c1"># bits</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">concrete_value</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">),</span> <span class="n">concrete_value</span> +</pre></div> + +<p>We can turn this function into a hypothesis strategy to generate input data +using the <code>strategies.builds</code> function:</p> +<div class="code"><pre class="code literal-block"><span class="kn">from</span> <span class="nn">hypothesis</span> <span class="kn">import</span> <span class="n">strategies</span><span class="p">,</span> <span class="n">given</span><span class="p">,</span> <span class="n">settings</span> + +<span class="n">ints</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">integers</span><span class="p">()</span> + +<span class="n">random_knownbits_and_contained_number</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">builds</span><span class="p">(</span> + <span class="n">build_knownbits_and_contained_number</span><span class="p">,</span> + <span class="n">ints</span><span class="p">,</span> <span class="n">ints</span> +<span 
class="p">)</span> +</pre></div> + +<p>One important special case of <code>KnownBits</code> are the constants, which contain only +a single concrete value. We'll also generate some of those specifically, and +then combine the <code>random_knownbits_and_contained_number</code> strategy with it:</p> +<div class="code"><pre class="code literal-block"><span class="n">constant_knownbits</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">builds</span><span class="p">(</span> + <span class="k">lambda</span> <span class="n">value</span><span class="p">:</span> <span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="n">value</span><span class="p">),</span> <span class="n">value</span><span class="p">),</span> + <span class="n">ints</span> +<span class="p">)</span> + +<span class="n">knownbits_and_contained_number</span> <span class="o">=</span> <span class="n">constant_knownbits</span> <span class="o">|</span> <span class="n">random_knownbits_and_contained_number</span> +</pre></div> + +<p>Now we can write the first property-based tests, for the <code>KnownBits.contains</code> +method:</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_contains</span><span class="p">(</span><span class="n">t</span><span class="p">):</span> + <span class="n">k</span><span class="p">,</span> <span class="n">n</span> <span class="o">=</span> <span class="n">t</span> + <span class="k">assert</span> <span class="n">k</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> +</pre></div> + +<p>The <code>@given</code> decorator is used to tell Hypothesis which strategy to use to +generate 
random data for the test function. Hypothesis will run the test with a +number of random examples (100 by default). If it finds an error, it will try to +minimize the example needed that demonstrates the problem, to try to make it +easier to understand what is going wrong. It also saves all failing cases into +an example database and tries them again on subsequent runs.</p> +<p>This test is as much a check for whether we got the strategies right as it is +for the logic in <code>KnownBits.contains</code>. Here's an example output of random +concrete and abstract values that we are getting here:</p> +<div class="code"><pre class="code literal-block"><span class="mf">110000011001101</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">???</span><span class="mf">1</span> +<span class="mf">...1011011</span><span class="w"> </span><span class="mf">...1011011</span> +<span class="mf">...1001101110101000010010011111011</span><span class="w"> </span><span class="mf">...1001101110101000010010011111011</span> +<span class="mf">...1001101110101000010010011111011</span><span class="w"> </span><span class="mf">...100110111010100001</span><span class="err">?</span><span class="mf">010</span><span class="err">?</span><span class="mf">1</span><span class="err">??</span><span class="mf">1</span><span class="err">??</span><span class="mf">11</span> +<span class="mf">1000001101111101001011010011111101000011000111011001011111101</span><span class="w"> </span><span class="mf">1000001101111101001011010011111101000011000111011001011111101</span> +<span class="mf">1000001101111101001011010011111101000011000111011001011111101</span><span class="w"> </span><span class="mf">1000001101111101001011010011111101000011000111</span><span class="err">????</span><span class="mf">01</span><span class="err">?</span><span class="mf">11</span><span class="err">?????</span><span class="mf">1</span> +<span 
class="mf">1111100000010</span><span class="w"> </span><span class="mf">1111100000010</span> +<span class="mf">1111100000010</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">11111</span><span class="err">?</span><span class="mf">00000</span><span class="err">??</span> +<span class="mf">110110</span><span class="w"> </span><span class="mf">110110</span> +<span class="mf">110110</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">00</span><span class="err">????</span><span class="mf">11</span><span class="err">??</span><span class="mf">10</span> +<span class="mf">110110</span><span class="w"> </span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0</span> +<span class="mf">...100010111011111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">100</span><span class="err">?</span><span class="mf">10111</span><span class="err">??</span><span class="mf">111</span><span class="err">?</span> +<span class="mf">...1000100000110001</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">00000</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span> +<span class="mf">110000001110</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0000000</span><span class="err">?</span><span class="mf">00</span><span class="err">???</span><span class="mf">0000</span><span class="err">?????</span><span 
class="mf">00</span><span class="err">???</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">01</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span><span class="mf">1</span><span class="err">??</span> +<span class="mf">110000001110</span><span class="w"> </span><span class="err">??</span><span class="mf">000000</span><span class="err">???</span><span class="mf">0</span> +<span class="mf">1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000</span><span class="w"> </span><span class="mf">1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000</span> +<span class="mf">...1011010010010100</span><span class="w"> </span><span class="mf">...1011010010010100</span> +<span class="mf">...1011111110110011</span><span class="w"> </span><span class="mf">...1011111110110011</span> +<span class="mf">101000011110110</span><span class="w"> </span><span class="mf">101000011</span><span class="err">?</span><span class="mf">10</span><span class="err">?</span><span class="mf">1</span><span class="err">?</span> +<span class="mf">100101</span><span class="w"> </span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span> +</pre></div> + +<p>That looks suitably random, but we might want to bias our random numbers a +little bit towards common error values like small constants, powers of two, etc. 
+Like this:</p> +<div class="code"><pre class="code literal-block"><span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> +<span class="c1"># some small integers</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">))</span> +<span class="c1"># powers of two</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">2</span><span class="p">))</span> +<span class="c1"># powers of two - 1</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">((</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="n">i</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">2</span><span class="p">))</span> +<span class="c1"># negative versions of what we have so far</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="o">-</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">ints_special</span><span 
class="p">)</span> +<span class="c1"># bit-flipped versions of what we have so far</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="o">~</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">ints_special</span><span class="p">)</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">ints_special</span><span class="p">)</span> +<span class="c1"># sort them (because hypothesis simplifies towards earlier elements in the list)</span> +<span class="n">ints_special</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">element</span><span class="p">:</span> <span class="p">(</span><span class="nb">abs</span><span class="p">(</span><span class="n">element</span><span class="p">),</span> <span class="n">element</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">))</span> + +<span class="n">ints</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">sampled_from</span><span class="p">(</span><span class="n">ints_special</span><span class="p">)</span> <span class="o">|</span> <span class="n">strategies</span><span class="o">.</span><span class="n">integers</span><span class="p">()</span> +</pre></div> + +<p>Now we get data like this:</p> +<div class="code"><pre class="code literal-block"><span class="mf">1110</span><span class="w"> </span><span class="mf">1110</span> +<span class="mf">...10000000000000000001</span><span class="w"> </span><span class="mf">...10000</span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0000</span><span 
class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">1</span> +<span class="mf">1</span><span class="w"> </span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0000</span><span class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">1</span> +<span class="mf">1</span><span class="w"> </span><span class="err">?</span> +<span class="mf">...10101100</span><span class="w"> </span><span class="mf">...10101100</span> +<span class="mf">110000000011001010111011111111111111011110010001001100110001011</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">101</span><span class="err">?</span> +<span class="mf">110000000011001010111011111111111111011110010001001100110001011</span><span class="w"> </span><span class="err">??</span><span class="mf">00000000</span><span class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0</span><span class="err">???</span><span class="mf">0</span><span class="err">??????????????</span><span class="mf">0</span><span class="err">????</span><span class="mf">00</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">00</span><span class="err">??</span><span class="mf">00</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span> +<span class="mf">...1011111111111111111111111111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">11</span><span class="err">?</span><span class="mf">11</span><span class="err">??</span> +<span class="mf">...1011111111111111111111111111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span 
class="err">??????????????????????????</span> +<span class="mf">0</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">??????????????????????????</span> +<span class="mf">101101</span><span class="w"> </span><span class="mf">101101</span> +<span class="mf">111111111111111111111111111111111111111111111</span><span class="w"> </span><span class="mf">111111111111111111111111111111111111111111111</span> +<span class="mf">10111</span><span class="w"> </span><span class="mf">10111</span> +<span class="mf">...101100</span><span class="w"> </span><span class="mf">...1</span><span class="err">?</span><span class="mf">111011</span><span class="err">?</span><span class="mf">0</span> +<span class="mf">101000</span><span class="w"> </span><span class="err">?</span><span class="mf">001010</span><span class="err">?</span><span class="mf">0</span> +<span class="mf">101000</span><span class="w"> </span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">000</span> +<span class="mf">110010</span><span class="w"> </span><span class="mf">110010</span> +<span class="mf">...100111</span><span class="w"> </span><span class="mf">...100111</span> +<span class="mf">1111011010010</span><span class="w"> </span><span class="mf">1111011010010</span> +<span class="mf">...1000000000000000000000000000000000000</span><span class="w"> </span><span class="mf">...1000000000000000000000000000000000000</span> +</pre></div> + +<p>We can also write a test that checks that the somewhat tricky logic in +<code>__str__</code> and <code>from_str</code> is correct, by making sure that the two functions +round-trip (ie converting a <code>KnownBits</code> to a string and then back to a +<code>KnownBits</code> instance produces the same abstract value).</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span 
class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_str_roundtrips</span><span class="p">(</span><span class="n">t1</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">s</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="n">k2</span><span class="o">.</span><span class="n">ones</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span> +</pre></div> + +<p>Now let's actually apply this infrastructure to test <code>abstract_invert</code>.</p> +<h3 id="when-are-transfer-functions-correct-how-do-we-test-them">When are Transfer Functions Correct? How do we test them?</h3> +<p>Abstract values, i.e. instances of <code>KnownBits</code> represent <em>sets</em> of concrete +values. We want the transfer functions to compute <em>overapproximations</em> of the +concrete values. So if we have an arbitrary abstract value <code>k</code>, with a concrete +number <code>n</code> that is a member of the abstract values (i.e. +<code>k.contains(n) == True</code>) then the result of the concrete operation <code>op(n)</code> +<strong>must</strong> be a member of the result of the abstract operation <code>k.abstract_op()</code> +(i.e. 
<code>k.abstract_op().contains(op(n)) == True</code>).</p> +<p>Checking the correctness/overapproximation property is a good match for +hypothesis. Here's what the test for <code>abstract_invert</code> looks like:</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_invert</span><span class="p">(</span><span class="n">t</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t</span> + <span class="n">n2</span> <span class="o">=</span> <span class="o">~</span><span class="n">n1</span> <span class="c1"># compute the real result</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> <span class="c1"># compute the abstract result</span> + <span class="k">assert</span> <span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span> <span class="c1"># the abstract result must contain the real result</span> +</pre></div> + +<p>This is the <em>only</em> condition needed for <code>abstract_invert</code> to be correct. If +<code>abstract_invert</code> fulfils this property for every combination of abstract and +concrete value then <code>abstract_invert</code> is correct. Note however, that this test +does not actually check whether <code>abstract_invert</code> gives us precise results. A +correct (but imprecise) implementation of <code>abstract_invert</code> would simply return +a completely unknown result, regardless of what is known about the input +<code>KnownBits</code>.</p> +<p>The "proper" CS term for this notion of correctness is called <em>soundness</em>. 
The +correctness condition on the transfer functions is called a <em>Galois +connection</em>. I won't go into any mathematical/technical details here, but +wanted to at least mention the terms. I found <a href="https://web.njit.edu/~mjk76/">Martin +Kellogg</a>'s +<a href="https://web.njit.edu/~mjk76/teaching/cs684-sp24/assets/lecture-12.pdf#34">slides</a> +to be quite an approachable introduction to the Galois connection and how to +show soundness.</p> +<h3 id="implementing-binary-transfer-functions">Implementing Binary Transfer Functions</h3> +<p>Now we have infrastructure in place for testing transfer functions with random +inputs. With that we can start thinking about the more complicated case, that of +binary operations. Let's start with the simpler ones, <code>and</code> and <code>or</code>. For <code>and</code>, +we can know a <code>0</code> bit in the result if either of the input bits are known <code>0</code>; +or we can know a <code>1</code> bit in the result if both input bits are known <code>1</code>. +Otherwise the resulting bit is unknown. Let's look at all the combinations:</p> +<div class="code"><pre class="code literal-block">and +input1: 000111??? +input2: 01?01?01? +result: 00001?0?? 
+</pre></div> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_and</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="c1"># known ones</span> + <span class="n">knowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">ones</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="o">~</span><span class="n">knowns</span><span class="p">)</span> +</pre></div> + +<p>Here's an example unit-test and a property-based test for <code>and</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_and</span><span class="p">():</span> + <span class="c1"># test all combinations of 0, 1, ? 
in one example</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'000111???'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="c1"># should be: 0...00001?0??</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"1?0??"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_and</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span 
class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>To implement <code>or</code> is pretty similar. The result is known <code>1</code> where either of the +inputs is <code>1</code>. The result is known <code>0</code> where both inputs are known <code>0</code>, and <code>?</code> +otherwise.</p> +<div class="code"><pre class="code literal-block">or +input1: 000111??? +input2: 01?01?01? +result: 01?111?1? +</pre></div> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_or</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">zeros</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">zeros</span> + <span class="n">knowns</span> <span class="o">=</span> <span class="n">ones</span> <span class="o">|</span> <span class="n">zeros</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="o">~</span><span class="n">knowns</span><span class="p">)</span> +</pre></div> + +<p>Here's an example unit-test and a property-based test for <code>or</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_or</span><span class="p">():</span> + <span class="n">k1</span> <span 
class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'000111???'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="c1"># should be: 0...01?111?1?</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"1?111?1?"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_or</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">|</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span 
class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>Implementing support for <code>abstract_xor</code> is relatively simple, and left as an +exercise :-).</p> +<h3 id="addition-and-subtraction">Addition and Subtraction</h3> +<p><code>invert</code>, <code>and</code>, and <code>or</code> are relatively simple transfer functions to write, +because they compose over the individual bits of the integers. The arithmetic +functions <code>add</code> and <code>sub</code> are significantly harder, because of carries and +borrows. Coming up with the formulas for them and gaining an intuitive +understanding is quite tricky and involves carefully going through a few +examples with pen and paper. When implementing this in PyPy, Nico and I didn't +come up with the implementation ourselves, but instead took them from the +<a href="https://arxiv.org/abs/2105.05398">Tristate Numbers</a> paper. Here's the code, +with example tests and hypothesis tests:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">sum_ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">+</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">sum_unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">+</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span> + <span class="n">all_carries</span> <span class="o">=</span> <span class="n">sum_ones</span> <span class="o">+</span> <span 
class="n">sum_unknowns</span> + <span class="n">ones_carries</span> <span class="o">=</span> <span class="n">all_carries</span> <span class="o">^</span> <span class="n">sum_ones</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">ones_carries</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">sum_ones</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">abstract_sub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">diff_ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">-</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">val_borrows</span> <span class="o">=</span> <span class="p">(</span><span class="n">diff_ones</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> <span class="o">^</span> <span class="p">(</span><span class="n">diff_ones</span> <span class="o">-</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">other</span><span 
class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">val_borrows</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">diff_ones</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + +<span class="k">def</span> <span class="nf">test_add</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0???111000'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"?????01?10"</span> + +<span class="k">def</span> <span class="nf">test_sub</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0???111000'</span><span class="p">)</span> + <span class="n">res</span> 
<span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"...?11?10"</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span> <span class="s1">'...1?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...10000???111000'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"111?????11?10"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_add</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span 
class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_sub</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">-</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>Now we are in a pretty good situation, and have implemented abstract versions +for a bunch of important arithmetic and binary functions. What's also surprising +is that the implementation of all of the transfer functions is quite efficient. 
+We didn't have to write loops over the individual bits at all, instead we found +closed form expressions using primitive operations on the underlying integers +<code>ones</code> and <code>unknowns</code>. This means that computing the results of abstract +operations is quite efficient, which is important when using the abstract domain +in the context of a JIT compiler.</p> +<h3 id="proving-correctness-of-the-transfer-functions-with-z3">Proving correctness of the transfer functions with Z3</h3> +<p>As one can probably tell from my recent posts, I've been thinking about +compiler correctness a lot. Getting the transfer functions absolutely +correct is really crucial, because a bug in them would lead to miscompilation of +Python code when the abstract domain is added to the JIT. While the randomized +tests are great, it's still entirely possible for them to miss bugs. The state +space for the arguments of a binary transfer function is <code>3**64 * 3**64</code>, and if +only a small part of that contains wrong behaviour it would be really unlikely +for us to find it with random tests by chance. Therefore I was reluctant to +merge the PyPy branch that contained the new abstract domain for a long time.</p> +<p>To increase our confidence in the correctness of the transfer functions further, +we can use Z3 to <em>prove</em> their correctness, which gives us much stronger +guarantees (not 100%, obviously). 
In this subsection I will show how to do that.</p> +<p>Here's an attempt to do this manually in the Python repl:</p> +<div class="code"><pre class="code literal-block"><span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="kn">import</span><span class="w"> </span><span class="nn">z3</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">solver</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># like last blog post, proof by failing to find counterexamples</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="k">def</span><span class="w"> </span><span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">):</span><span class="w"> </span><span class="k">assert</span><span class="w"> </span><span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">unsat</span> +<span class="o">&gt;&gt;&gt;&gt;</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's set up a z3 bitvector variable for an arbitrary concrete value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'concrete_value'</span><span class="p">,</span><span class="w"> </span><span 
class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n1</span> +<span class="n">concrete_value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># due to operator overloading we can manipulate z3 formulas</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">~</span><span class="n">n1</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n2</span> +<span class="o">~</span><span class="n">concrete_value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># now z3 bitvector variables for the ones and unknowns fields</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">ones</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'abstract_ones'</span><span class="p">,</span><span class="w"> </span><span class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">unknowns</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'abstract_unknowns'</span><span class="p">,</span><span class="w"> </span><span class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># we construct a KnownBits instance with the z3 variables</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k1</span><span class="w"> </span><span 
class="o">=</span><span class="w"> </span><span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span><span class="w"> </span><span class="n">unknowns</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># due to operator overloading we can call the methods on k1:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">ones</span> +<span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_ones</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span> +<span class="n">abstract_unknowns</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># here's the correctness condition that we want to prove:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span> +<span class="o">~</span><span class="n">concrete_value</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">==</span> +<span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_ones</span> 
+<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's try</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">prove</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">))</span> +<span class="n">Traceback</span><span class="w"> </span><span class="p">(</span><span class="n">most</span><span class="w"> </span><span class="n">recent</span><span class="w"> </span><span class="n">call</span><span class="w"> </span><span class="n">last</span><span class="p">):</span> +<span class="w"> </span><span class="n">File</span><span class="w"> </span><span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="ow">in</span><span class="w"> </span><span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span> +<span class="w"> </span><span class="n">File</span><span class="w"> </span><span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="ow">in</span><span class="w"> </span><span class="n">prove</span> +<span class="n">AssertionError</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># it doesn't work! 
let's look at the counterexample to see why:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> +<span class="p">[</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span> +<span class="w"> </span><span class="n">abstract_ones</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span> +<span class="w"> </span><span class="n">concrete_value</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="p">]</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># we can build a KnownBits instance with the values in the</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># counterexample:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="o">~</span><span class="mi">1</span><span class="w"> </span><span class="c1"># concrete result</span> +<span class="o">-</span><span class="mi">2</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k1</span> +<span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span><span 
class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">counter_example_k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span> +<span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's check the failing condition</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="o">~</span><span class="mi">1</span><span class="p">)</span> +<span class="kc">False</span> +</pre></div> + +<p>What is the problem here? We didn't tell Z3 that <code>n1</code> was supposed to be a +member of <code>k1</code>. We can add this as a precondition to the solver, and then the +proof works:</p> +<div class="code"><pre class="code literal-block">&gt;&gt;&gt;&gt; solver.add(k1.contains(n1)) +&gt;&gt;&gt;&gt; prove(k2.contains(n2)) # works! +</pre></div> + +<p>This is super cool! It's really a proof about the actual implementation, because +we call the implementation methods directly, and due to the operator overloading +that Z3 does we can be sure that we are actually checking a formula that +corresponds to the Python code. This eliminates one source of errors in formal +methods.</p> +<p>Doing the proof manually on the Python REPL is kind of annoying though, and we +also would like to make sure that the proofs are re-done when we change the +code. What we would really like to do is to write the proofs as unit tests that +we can run while developing and in CI. 
Doing this is possible, and the unit +tests that really perform proofs look pleasingly similar to the +Hypothesis-based ones.</p> +<p>First we need to set up a bit of infrastructure:</p> +<div class="code"><pre class="code literal-block"><span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> + +<span class="k">def</span> <span class="nf">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">BitVecVal</span><span class="p">(</span><span class="n">val</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">z3_setup_variables</span><span class="p">():</span> + <span class="c1"># instantiate a solver</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + + <span class="c1"># a Z3 variable for the first concrete value</span> + <span class="n">n1</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n1"</span><span class="p">)</span> + <span class="c1"># a KnownBits instances that uses Z3 variables as its ones and unknowns,</span> + <span class="c1"># representing the first abstract value</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">BitVec</span><span class="p">(</span><span 
class="s2">"n1_ones"</span><span class="p">),</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n1_unkowns"</span><span class="p">))</span> + <span class="c1"># add the precondition to the solver that the concrete value n1 must be a</span> + <span class="c1"># member of the abstract value k1</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n1</span><span class="p">))</span> + + <span class="c1"># a Z3 variable for the second concrete value</span> + <span class="n">n2</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2"</span><span class="p">)</span> + <span class="c1"># a KnownBits instances for the second abstract value</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2_ones"</span><span class="p">),</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2_unkowns"</span><span class="p">))</span> + <span class="c1"># add the precondition linking n2 and k2 to the solver</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">))</span> + <span class="k">return</span> <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> + +<span class="k">def</span> <span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span 
class="n">solver</span><span class="p">):</span> + <span class="n">z3res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> + <span class="k">if</span> <span class="n">z3res</span> <span class="o">!=</span> <span class="n">z3</span><span class="o">.</span><span class="n">unsat</span><span class="p">:</span> + <span class="k">assert</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span> <span class="c1"># can't be timeout, we set no timeout</span> + <span class="c1"># make the model with the counterexample global, to make inspecting the</span> + <span class="c1"># bug easier when running pytest --pdb</span> + <span class="k">global</span> <span class="n">model</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"n1=</span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">n1</span><span class="p">)</span><span class="si">}</span><span class="s2">, n2=</span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="n">counter_example_k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span 
class="n">k1</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> + <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="n">counter_example_k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> + <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"k1=</span><span class="si">{</span><span class="n">counter_example_k1</span><span class="si">}</span><span class="s2">, k2=</span><span class="si">{</span><span class="n">counter_example_k2</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"but </span><span class="si">{</span><span class="n">cond</span><span class="si">=}</span><span class="s2"> evaluates to </span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span><span class="si">}</span><span 
class="s2">"</span><span class="p">)</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">())</span> +</pre></div> + +<p>And then we can write proof-unit-tests like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_z3_abstract_invert</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="n">n2</span> <span class="o">=</span> <span class="o">~</span><span class="n">n1</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_and</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span 
class="o">=</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_or</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">|</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_add</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span 
class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_sub</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">-</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>It's possible to write a bit more Python-metaprogramming-magic and unify the +Hypothesis and Z3 tests into the same test definition.<sup id="fnref:proof_bitwidths"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fn:proof_bitwidths">1</a></sup></p> +<h3 id="cases-where-this-style-of-z3-proof-doesnt-work">Cases where this style of Z3 proof doesn't work</h3> +<p>Unfortunately the approach described in the previous section only works for a +very small number of cases. 
It breaks down as soon as the <code>KnownBits</code> methods +that we're calling contain any <code>if</code> conditions (including hidden ones like +the short-circuiting <code>and</code> and <code>or</code> in Python). Let's look at an example and +implement <code>abstract_eq</code>. <code>eq</code> is supposed to be an operation that compares two +integers and returns <code>0</code> or <code>1</code> if they are different or equal, respectively. +Implementing this in knownbits looks like this (with example and hypothesis +tests):</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_eq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># the result is a 0, 1, or ?</span> + + <span class="c1"># if they are both the same constant, they must be equal</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()</span> <span class="ow">and</span> <span class="n">other</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span><span class="p">:</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="c1"># check whether we have known disagreeing bits, then we know the result</span> + <span class="c1"># is 0</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span 
class="n">_disagrees</span><span class="p">(</span><span class="n">other</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># an unknown boolean</span> + + <span class="k">def</span> <span class="nf">_disagrees</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># check whether the bits disagree in any place where both are known</span> + <span class="n">both_known</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">knowns</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">both_known</span> <span class="o">!=</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">both_known</span> + +<span class="k">def</span> <span class="nf">test_eq</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="k">assert</span> <span 
class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'?'</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'1'</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">20</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'0'</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_eq</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span 
class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">n1</span> <span class="o">==</span> <span class="n">n2</span><span class="p">))</span> +</pre></div> + +<p>Trying to do the proof in the same style as before breaks:</p> +<div class="code"><pre class="code literal-block"><span class="o">&gt;&gt;&gt;&gt;</span> <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> +<span class="n">Traceback</span> <span class="p">(</span><span class="n">most</span> <span class="n">recent</span> <span class="n">call</span> <span class="n">last</span><span class="p">):</span> + <span class="n">File</span> <span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">1</span><span class="p">,</span> <span class="ow">in</span> <span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span> + <span class="n">File</span> <span class="s2">"knownbits.py"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">246</span><span class="p">,</span> <span class="ow">in</span> <span class="n">abstract_eq</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_disagrees</span><span class="p">(</span><span class="n">other</span><span class="p">):</span> + <span class="n">File</span> <span 
class="s2">"venv/site-packages/z3/z3.py"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">381</span><span class="p">,</span> <span class="ow">in</span> <span class="fm">__bool__</span> + <span class="k">raise</span> <span class="n">Z3Exception</span><span class="p">(</span><span class="s2">"Symbolic expressions cannot be cast to concrete Boolean values."</span><span class="p">)</span> +<span class="n">z3</span><span class="o">.</span><span class="n">z3types</span><span class="o">.</span><span class="n">Z3Exception</span><span class="p">:</span> <span class="n">Symbolic</span> <span class="n">expressions</span> <span class="n">cannot</span> <span class="n">be</span> <span class="n">cast</span> <span class="n">to</span> <span class="n">concrete</span> <span class="n">Boolean</span> <span class="n">values</span><span class="o">.</span> +</pre></div> + +<p>We cannot call <code>abstract_eq</code> on a <code>KnownBits</code> with Z3 variables as fields, +because once we hit an <code>if</code> statement, the whole approach of relying on the +operator overloading breaks down. 
Z3 doesn't actually parse the Python code or +anything advanced like that, we rather build an expression only by running the +code and letting the Z3 formulas build up.</p> +<p>To still prove the correctness of <code>abstract_eq</code> we need to manually transform +the control flow logic of the function into a Z3 formula that uses the <code>z3.If</code> +expression, using a small helper function:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">z3_cond</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="n">trueval</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">falseval</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="n">BitVecVal</span><span class="p">(</span><span class="n">trueval</span><span class="p">),</span> <span class="n">BitVecVal</span><span class="p">(</span><span class="n">falseval</span><span class="p">))</span> + +<span class="k">def</span> <span class="nf">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">):</span> + <span class="c1"># follow the *logic* of abstract_eq, we can't call it due to the ifs in it</span> + <span class="n">case1cond</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k1</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span 
class="n">k2</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span> + <span class="n">case2cond</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">_disagrees</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + + <span class="c1"># ones is 1 in the first case, 0 otherwise</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">case1cond</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> + + <span class="c1"># in the first two cases, unknowns is 0, 1 otherwise</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Or</span><span class="p">(</span><span class="n">case1cond</span><span class="p">,</span> <span class="n">case2cond</span><span class="p">),</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_eq_logic</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">n1</span> <span class="o">==</span> <span class="n">n2</span><span class="p">)</span> <span 
class="c1"># concrete result</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>This proof works. It is a lot less satisfying than the previous ones though, +because we could have made an error in the manual transcription from Python code +to Z3 formulas (there are possibly more heavy-handed approaches where we do +this transformation more automatically using e.g. the <code>ast</code> module to analyze +the source code, but that's a much more complicated researchy project). To +lessen this problem somewhat we can factor out the parts of the logic that don't +have any conditions into small helper methods (like <code>_disagrees</code> in this +example) and use them in the manual conversion of the code to Z3 formulas.<sup id="fnref:tests_vs_proofs"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fn:tests_vs_proofs">2</a></sup></p> +<p>The final condition that Z3 checks, btw, is this one:</p> +<div class="code"><pre class="code literal-block">If(n1 == n2, 1, 0) &amp; +~If(Or(And(n1_unkowns == 0, + n2_unkowns == 0, + n1_ones == n2_ones), + n1_ones &amp; ~n1_unkowns &amp; ~n2_unkowns != + n2_ones &amp; ~n1_unkowns &amp; ~n2_unkowns), + 0, 1) == +If(And(n1_unkowns == 0, n2_unkowns == 0, n1_ones == n2_ones), + 1, 0) +</pre></div> + +<h3 id="making-statements-about-precision">Making Statements about Precision</h3> +<p>So far we have only used Z3 to prove statements about correctness, i.e. that +our abstract operations overapproximate what can happen with concrete values. 
+While proving this property is essential if we want to avoid miscompilation, +correctness alone is not a very strong constraint on the implementation of our +abstract transfer functions. We could simply return <code>KnownBits.unknowns()</code> for +every <code>abstract_*</code> method and the resulting overapproximation would be correct, +but useless in practice.</p> +<p>It's much harder to make statements about whether the transfer functions are +maximally precise. There are two aspects of precision I want to discuss in this +section, however.</p> +<p>The first aspect is that we would really like it if the transfer functions +compute the maximally precise results for singleton sets. If all abstract +arguments of an operation are constants, i.e. contain only a single concrete +element, then we know that the resulting set also has only a single element. We +can prove that all our transfer functions have this property:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_z3_prove_constant_folding</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span 
class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span 
class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>Proving with Z3 that the transfer functions are maximally precise for +non-constant arguments seems to be relatively hard. I tried a few completely +rigorous approaches and failed. 
The paper <a href="https://arxiv.org/pdf/2105.05398">Sound, Precise, and Fast Abstract +Interpretation with Tristate Numbers</a> +contains an optimality proof for the transfer functions of addition and +subtraction, so we can be certain that they are as precise as is +possible.</p> +<p>I still want to show an approach for trying to find concrete examples of +abstract values that are less precise than they could be, using a combination +of Hypothesis and Z3. The idea is to use Hypothesis to pick random abstract +values. Then we compute the abstract result using our transfer function. +Afterwards we can ask Z3 to find us an abstract result that is better than the +one our transfer function produced. If Z3 finds a better abstract result, we +have a concrete example of imprecision for our transfer function. Those tests +aren't strict proofs, because they rely on generating random abstract values, +but they can still be valuable (not for the transfer functions in this blog +post, which are all optimal).</p> +<p>Here is what the code looks like (this is a little bit of bonus content; I won't +explain the details and can only hope that the comments are somewhat helpful):</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">random_knownbits_and_contained_number</span><span class="p">,</span> <span class="n">random_knownbits_and_contained_number</span><span class="p">)</span> +<span class="nd">@settings</span><span class="p">(</span><span class="n">deadline</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_check_precision</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span 
class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="c1"># apply transfer function</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">example_res</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + + <span class="c1"># try to find a better version of k3 with Z3</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">solver</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">,</span> <span class="mi">8000</span><span class="p">)</span> + + <span class="n">var1</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'v1'</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'v2'</span><span class="p">)</span> + + <span class="n">ones</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'ones'</span><span class="p">)</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'unknowns'</span><span class="p">)</span> + <span class="n">better_k3</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + <span class="nb">print</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span 
class="n">k3</span><span class="p">)</span> + + <span class="c1"># we're trying to find an example for a better k3, so we use check, without</span> + <span class="c1"># negation:</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span> + <span class="c1"># better_k3 should be a valid knownbits instance</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">is_well_formed</span><span class="p">(),</span> + <span class="c1"># it should be better than k3, ie there are known bits in better_k3</span> + <span class="c1"># that we don't have in k3</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">k3</span><span class="o">.</span><span class="n">knowns</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">,</span> + <span class="c1"># now encode the correctness condition for better_k3 with a ForAll:</span> + <span class="c1"># for all concrete values var1 and var2, it must hold that if</span> + <span class="c1"># var1 is in k1 and var2 is in k2 it follows that var1 + var2 is in</span> + <span class="c1"># better_k3</span> + <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">var1</span><span class="p">,</span> <span class="n">var2</span><span class="p">],</span> + <span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span> + <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var1</span><span class="p">),</span> 
<span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var2</span><span class="p">)),</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var1</span> <span class="o">+</span> <span class="n">var2</span><span class="p">)))))</span> + <span class="c1"># if this query is satisfiable, we have found a better result for the</span> + <span class="c1"># abstract_add</span> + <span class="k">if</span> <span class="n">res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">rk3</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"better"</span><span class="p">,</span> <span class="n">rk3</span><span class="p">)</span> + <span class="k">assert</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unknown</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">)</span> +</pre></div> + 
+<p>It does not actually fail for <code>abstract_add</code> (nor the other abstract +functions). To see the test failing we can add some imprecision to the +implementation of <code>abstract_add</code> to see Hypothesis and Z3 find examples of +values that are not optimally precise (for example by setting some bits +of <code>unknowns</code> in the implementation of <code>abstract_add</code> unconditionally).</p> +<h3 id="using-the-abstract-domain-in-the-toy-optimizer-for-generalized-constant-folding">Using the Abstract Domain in the Toy Optimizer for Generalized Constant Folding</h3> +<p>Now after all this work we can finally actually use the knownbits abstract +domain in the toy optimizer. The code for this follows <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/">Max' intro post about +abstract interpretation</a> +quite closely.</p> +<p>For completeness' sake, in the fold there are the basic infrastructure classes +that make up the IR again (they are identical or at least extremely close to +the previous toy posts).</p> +<details> +<summary>toy infrastructure</summary> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> + + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> + <span class="n">name</span> <span class="p">:</span> <span class="nb">str</span> + <span class="n">args</span> <span class="p">:</span> <span 
class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> + + <span class="n">forwarded</span> <span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> + + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Value</span><span class="p">:</span> + <span class="n">op</span> <span class="o">=</span> <span class="bp">self</span> + <span class="k">while</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">):</span> + <span class="nb">next</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">forwarded</span> + <span class="k">if</span> <span class="nb">next</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> + <span class="k">return</span> <span class="n">op</span> + <span class="n">op</span> <span class="o">=</span> <span class="nb">next</span> + <span class="k">return</span> <span class="n">op</span> + + <span class="k">def</span> <span class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">make_equal_to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> 
+ <span class="bp">self</span><span class="o">.</span><span class="n">find</span><span class="p">()</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="n">value</span> + + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> + <span class="n">value</span> <span class="p">:</span> <span class="nb">object</span> + + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span> + + +<span class="k">class</span> <span class="nc">Block</span><span class="p">(</span><span class="nb">list</span><span class="p">):</span> + <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">opname</span><span class="p">):</span> + <span class="k">def</span> <span class="nf">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">):</span> + <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Value</span><span class="p">):</span> + <span class="n">arg</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> + <span class="k">return</span> <span class="n">arg</span> + <span class="k">def</span> <span class="nf">make_op</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">):</span> + <span class="n">op</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span 
class="n">opname</span><span class="p">,</span> + <span class="p">[</span><span class="n">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span> + <span class="bp">self</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">op</span> + <span class="k">return</span> <span class="n">make_op</span> + + +<span class="k">def</span> <span class="nf">bb_to_str</span><span class="p">(</span><span class="n">l</span> <span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="n">varprefix</span> <span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"var"</span><span class="p">):</span> + <span class="k">def</span> <span class="nf">arg_to_str</span><span class="p">(</span><span class="n">arg</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="nb">str</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> <span class="n">varnames</span><span class="p">[</span><span class="n">arg</span><span class="p">]</span> + + <span class="n">varnames</span> <span class="o">=</span> <span class="p">{}</span> + <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> + <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span 
class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">l</span><span class="p">):</span> + <span class="c1"># give the operation a name used while</span> + <span class="c1"># printing:</span> + <span class="n">var</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">varprefix</span><span class="si">}{</span><span class="n">index</span><span class="si">}</span><span class="s2">"</span> + <span class="n">varnames</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">var</span> + <span class="n">arguments</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> + <span class="n">arg_to_str</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">))</span> + <span class="p">)</span> + <span class="n">strop</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">var</span><span class="si">}</span><span class="s2"> = </span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">arguments</span><span class="si">}</span><span class="s2">)"</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">strop</span><span class="p">)</span> + <span 
class="k">return</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +</pre></div> + + + +</details> + +<p>Now we can write some first tests, the first one simply checking constant +folding:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_constfold_two_ops</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span 
class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_add(19, optvar0)"""</span> +</pre></div> + +<p>Calling the transfer functions on constant <code>KnownBits</code> produces a constant +result, as we have seen. Therefore "regular" constant folding should hopefully +be achieved by optimizing with the <code>KnownBits</code> abstract domain too.</p> +<p>The next two tests are slightly more complicated and can't be optimized by +regular constant-folding. They follow the motivating examples from the start of +this blog post, a hundred years ago:</p> +<div class="code"><pre class="code literal-block"><span class="n">def</span><span class="w"> </span><span class="n">test_constfold_via_knownbits</span><span class="p">():</span> +<span class="w"> </span><span class="n">bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Block</span><span class="p">()</span> +<span class="w"> </span><span class="n">var0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="n">var1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_or</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span> +<span class="w"> </span><span class="n">var2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span> +<span class="w"> </span><span 
class="n">var3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> + +<span class="w"> </span><span class="n">opt_bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<span class="w"> </span><span class="nb">assert</span><span class="w"> </span><span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span><span class="w"> </span><span class="s2">"optvar"</span><span class="p">)</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_or(optvar0, 1)</span> +<span class="s2">optvar2 = dummy(1)"""</span> + +<span class="n">def</span><span class="w"> </span><span class="n">test_constfold_alignment_check</span><span class="p">():</span> +<span class="w"> </span><span class="n">bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Block</span><span class="p">()</span> +<span class="w"> </span><span class="n">var0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="n">var1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="mi">0</span><span class="n">b111</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># mask off the lowest three bits, thus var2 is 
aligned</span> +<span class="w"> </span><span class="n">var2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span><span class="w"> </span><span class="n">var1</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># add 16 to aligned quantity</span> +<span class="w"> </span><span class="n">var3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span><span class="w"> </span><span class="mi">16</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># check alignment of result</span> +<span class="w"> </span><span class="n">var4</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var3</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="n">b111</span><span class="p">)</span> +<span class="w"> </span><span class="n">var5</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_eq</span><span class="p">(</span><span class="n">var4</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># var5 should be const-folded to 1</span> +<span class="w"> </span><span class="n">var6</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var5</span><span class="p">)</span> + +<span class="w"> 
</span><span class="n">opt_bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<span class="w"> </span><span class="nb">assert</span><span class="w"> </span><span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span><span class="w"> </span><span class="s2">"optvar"</span><span class="p">)</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_and(optvar0, -8)</span> +<span class="s2">optvar2 = int_add(optvar1, 16)</span> +<span class="s2">optvar3 = dummy(1)"""</span> +</pre></div> + +<p>Here is <code>simplify</code> to make these tests pass:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">unknown_transfer_functions</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">all_unknown</span><span class="p">()</span> + + +<span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">abstract_values</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># dict mapping Operation to KnownBits</span> + + <span class="k">def</span> <span class="nf">knownbits_of</span><span class="p">(</span><span class="n">val</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span 
class="n">val</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="n">val</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">val</span><span class="p">]</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> + <span class="c1"># apply the transfer function on the abstract arguments</span> + <span class="n">name_without_prefix</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="o">.</span><span class="n">removeprefix</span><span class="p">(</span><span class="s2">"int_"</span><span class="p">)</span> + <span class="n">method_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"abstract_</span><span class="si">{</span><span class="n">name_without_prefix</span><span class="si">}</span><span class="s2">"</span> + <span class="n">transfer_function</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">,</span> <span class="n">method_name</span><span class="p">,</span> <span class="n">unknown_transfer_functions</span><span class="p">)</span> + <span class="n">abstract_args</span> <span class="o">=</span> <span class="p">[</span><span class="n">knownbits_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span 
class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">abstract_res</span> <span class="o">=</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer_function</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">)</span> + <span class="c1"># if the result is a constant, we optimize the operation away and make</span> + <span class="c1"># it equal to the constant result</span> + <span class="k">if</span> <span class="n">abstract_res</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">abstract_res</span><span class="o">.</span><span class="n">ones</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># otherwise emit the op</span> + <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> + +<p>The code follows the approach from the previous blog post very closely. The +only difference is that we apply the transfer function <em>first</em>, to be able to +detect whether the abstract domain can tell us that the result has to always be +a constant. This code makes all three tests pass.</p> +<h3 id="using-the-knownbits-domain-for-conditional-peephole-rewrites">Using the <code>KnownBits</code> Domain for Conditional Peephole Rewrites</h3> +<p>So far we are only using the <code>KnownBits</code> domain to find out that certain +operations have to produce a constant. 
We can also use the <code>KnownBits</code> domain +to check whether certain operation rewrites are correct. Let's use one of the +examples from the <a href="https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html">Mining JIT traces for missing optimizations with +Z3</a> +post, where Z3 found the inefficiency <code>(x &lt;&lt; 4) &amp; -0xf == x &lt;&lt; 4</code> in PyPy JIT +traces. We don't have shift operations, but we want to generalize this optimization +anyway. The general form of this rewrite is that under some circumstances <code>x &amp; +y == x</code>, and we can use the <code>KnownBits</code> domain to detect situations where this +must be true.</p> +<p>To understand <em>when</em> <code>x &amp; y == x</code> is true, we can think about individual pairs of +bits <code>a</code> and <code>b</code>. If <code>a == 0</code>, then <code>a &amp; b == 0 &amp; b == 0 == a</code>. If <code>b == 1</code> +then <code>a &amp; b == a &amp; 1 == a</code>. So if either <code>a == 0</code> or <code>b == 1</code> is true, +<code>a &amp; b == a</code> follows. 
And if either of these conditions is true for <em>all</em> the +bits of <code>x</code> and <code>y</code>, we can know that <code>x &amp; y == x</code>.</p> +<p>We can write a method on <code>KnownBits</code> to check for this condition:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">is_and_identity</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Return True if n1 &amp; n2 == n1 for any n1 in self and n2 in other.</span> +<span class="sd"> (or, equivalently, return True if n1 | n2 == n2)"""</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span> +</pre></div> + +<p>Since my reasoning about this feels ripe for errors, let's check that our +understanding is correct with Z3:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_prove_is_and_identity</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_and_identity</span><span class="p">(</span><span 
class="n">k2</span><span class="p">),</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> <span class="o">==</span> <span class="n">n1</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>Now let's use this in the toy optimizer. Here are two tests for this rewrite:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_remove_redundant_and</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># mask off the lowest four bits</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> + <span class="c1"># applying the same mask is now redundant</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var3</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span 
class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_and(optvar0, -16)</span> +<span class="s2">optvar2 = dummy(optvar1)"""</span> + +<span class="k">def</span> <span class="nf">test_remove_redundant_and_more_complex</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="c1"># var2 has bit pattern ????</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># var3 has bit pattern ...?1111</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_or</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># var4 is just var2</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span 
class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> + <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var4</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = getarg(1)</span> +<span class="s2">optvar2 = int_and(optvar0, 15)</span> +<span class="s2">optvar3 = int_or(optvar1, 15)</span> +<span class="s2">optvar4 = dummy(optvar2)"""</span> +</pre></div> + +<p>The first test could also be made to pass by implementing a reassociation +optimization that turns <code>(x &amp; c1) &amp; c2</code> into <code>x &amp; (c1 &amp; c2)</code> and then constant-folds the second <code>and</code>. But here we want to +use <code>KnownBits</code> and conditionally rewrite <code>int_and</code> to its first argument. 
So to make the tests pass, +we can change <code>simplify</code> like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">abstract_values</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># dict mapping Operation to KnownBits</span> + + <span class="k">def</span> <span class="nf">knownbits_of</span><span class="p">(</span><span class="n">val</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="o">...</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> + <span class="c1"># apply the transfer function on the abstract arguments</span> + <span class="n">name_without_prefix</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="o">.</span><span class="n">removeprefix</span><span class="p">(</span><span class="s2">"int_"</span><span class="p">)</span> + <span class="n">method_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"abstract_</span><span class="si">{</span><span class="n">name_without_prefix</span><span class="si">}</span><span class="s2">"</span> + <span class="n">transfer_function</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">,</span> <span class="n">method_name</span><span class="p">,</span> <span class="n">unknown_transfer_functions</span><span class="p">)</span> + <span class="n">abstract_args</span> <span 
class="o">=</span> <span class="p">[</span><span class="n">knownbits_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">abstract_res</span> <span class="o">=</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer_function</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">)</span> + <span class="c1"># if the result is a constant, we optimize the operation away and make</span> + <span class="c1"># it equal to the constant result</span> + <span class="k">if</span> <span class="n">abstract_res</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">abstract_res</span><span class="o">.</span><span class="n">ones</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># &lt;&lt;&lt;&lt; new code</span> + <span class="c1"># conditionally rewrite int_and(x, y) to x</span> + <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">k2</span> <span class="o">=</span> <span class="n">abstract_args</span> + <span class="k">if</span> <span class="n">k1</span><span class="o">.</span><span class="n">is_and_identity</span><span class="p">(</span><span class="n">k2</span><span class="p">):</span> + 
<span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># &gt;&gt;&gt;&gt; end changes</span> + <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> + +<p>And with that, the new tests pass as well. A real implementation would also +check the other argument order, but we leave that out for the sake of brevity.</p> +<p>This rewrite also generalizes the <a href="https://pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html">rewrites</a> <code>int_and(0, x) -&gt; 0</code> and +<code>int_and(-1, x) -&gt; x</code>, let's add a test for those:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_remove_and_simple</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> <span class="c1"># == 0</span> + <span class="n">var3</span> <span 
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> <span class="c1"># == -1</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> <span class="c1"># == var1</span> + <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var4</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = getarg(1)</span> +<span class="s2">optvar2 = dummy(optvar1)"""</span> +</pre></div> + +<p>This test just passes. 
And that's it for this post!</p> +<h3 id="conclusion">Conclusion</h3> +<p>In this post we've seen the implementation, testing and proofs about a 'known +bits' abstract domain, as well as its use in the toy optimizer to generalize +constant folding, and to implement conditional peephole rewrites.</p> +<p>In the next posts I'll write about the real implementation of a knownbits +domain in PyPy's JIT, its combination with the existing interval abstract +domain, how to deal with gaining information from conditions in the program, +and some loose ends.</p> +<p>Sources:</p> +<ul> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/Support/KnownBits.cpp">Known bits in LLVM</a></li> +<li><a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">Tristate numbers for known bits in Linux eBPF</a></li> +<li><a href="https://arxiv.org/abs/2105.05398">Sound, Precise, and Fast Abstract Interpretation with Tristate Numbers</a></li> +<li><a href="https://people.cs.rutgers.edu/~sn349/papers/agni-cav2023.pdf">Verifying the Verifier: eBPF Range Analysis Verification</a></li> +<li><a href="https://dougallj.wordpress.com/2020/01/13/bit-twiddling-addition-with-unknown-bits/">Bit-Twiddling: Addition with Unknown + Bits</a> + is a super readable blog post by Dougall J. I've taken the <code>ones</code> and + <code>unknowns</code> naming from this post, which I find significantly clearer than + <code>value</code> and <code>mask</code>, which the Linux kernel uses.</li> +<li><a href="https://bitmath.blogspot.com/">Bits, Math and Performance(?)</a>, a fantastic + blog by <a href="https://mastodon.gamedev.place/@harold">Harold Aptroot</a>. There are a + lot of relevant posts about known bits, range analysis etc. 
Harold is also + the author of <a href="http://haroldbot.nl/">Haroldbot</a>, a website that can be used + for bitvector calculations, and also checks bitvector identities.</li> +<li><a href="https://cea.hal.science/cea-01795779/document">Sharpening Constraint Programming approaches for Bit-Vector Theory</a></li> +<li><a href="https://users.cs.utah.edu/~regehr/papers/lctes06_2/fp019-regehr.pdf">Deriving Abstract Transfer Functions for Analyzing Embedded Software</a></li> +<li><a href="https://arxiv.org/abs/2105.00493">Synthesizing Abstract Transformers</a></li> +</ul> +<div class="footnote"> +<hr> +<ol> +<li id="fn:proof_bitwidths"> +<p>There's a subtlety about the Z3 proofs that I'm sort of +glossing over here. Python integers are of arbitrary width, and the +<code>KnownBits</code> code is actually carefully written to work for integers of any +size. This property is tested by the Hypothesis tests, which don't limit +the sizes of the generated random integers. However, the Z3 proofs only +check bitvectors of a fixed bitwidth of 64. There are various ways to deal +with this situation. For most "real" compilers, the bitwidth of integers +would be fixed anyway. Then the components <code>ones</code> and <code>unknowns</code> of the +<code>KnownBits</code> class would use the number of bits the corresponding integer +variable has, and the Z3 proofs would use the same width. This is what we +do in the PyPy JIT. <a class="footnote-backref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fnref:proof_bitwidths" title="Jump back to footnote 1 in the text">↩</a></p> +</li> +<li id="fn:tests_vs_proofs"> +<p>The less close connection between implementation and proof +for <code>abstract_eq</code> is one of the reasons why it makes sense to do +unit-testing <em>in addition</em> to proofs. 
For a more detailed explanation of +why both tests and proofs are good to +have, see <a href="https://siek.blogspot.com/2024/06/data-structures-and-algorithms-correctly.html#correct-software-via-write-test-and-prove:~:text=We%20recognize%20that%20once%20step,detect%20most%20of%20the%20bugs">Jeremy Siek's blog +post</a>, +as well as the <a href="https://www-cs-faculty.stanford.edu/~knuth/faq.html#:~:text=What's%20the%20exact%20citation%20of%20your%20oft%2Dcited%20comment%20about%20bugs?">Knuth +quote</a>. <a class="footnote-backref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fnref:tests_vs_proofs" title="Jump back to footnote 2 in the text">↩</a></p> +</li> +</ol> +</div>toy-optimizerz3https://www.pypy.org/posts/2024/08/toy-knownbits.htmlSat, 03 Aug 2024 14:00:00 GMTAbstract interpretation in the Toy Optimizerhttps://www.pypy.org/posts/2024/07/toy-abstract-interpretation.htmlMax Bernstein<p>This is a <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/" rel="canonical">cross-post</a> +from Max Bernstein from his excellent blog where he writes about programming +languages, compilers, optimizations, virtual machines. He's looking for a +(dynamic language runtime or compiler related) job too.</p> +<hr> +<p>CF Bolz-Tereick wrote some excellent posts in which they <a href="https://pypy.org/posts/2022/07/toy-optimizer.html">introduce a small IR +and optimizer</a> and <a href="https://pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html">extend it with allocation +removal</a>. We also did a live stream together in which +we did <a href="https://www.youtube.com/watch?v=w-UHg0yOPSE">some more heap optimizations</a>.</p> +<p>In this blog post, I'm going to write a small abstract interpreter for the Toy +IR and then show how we can use it to do some simple optimizations. 
It assumes +that you are familiar with the little IR, which I have reproduced unchanged in +<a href="https://gist.github.com/tekknolagi/4425b28d5267e7bae8b0d7ef8fb4a671">a GitHub Gist</a>.</p> +<p>Abstract interpretation is a general framework for efficiently computing +properties that must be true for all possible executions of a program. It's a +widely used approach both in compiler optimizations as well as offline static +analysis for finding bugs. I'm writing this post to pave the way for CF's next +post on proving abstract interpreters correct for range analysis and known bits +analysis inside PyPy.</p> +<p>Before we begin, I want to note a couple of things:</p> +<ul> +<li>The Toy IR is in SSA form, which means that every variable is defined exactly + once. This means that abstract properties of each variable are easy to track.</li> +<li>The Toy IR represents a linear trace without control flow, meaning we won't + talk about meet/join or fixpoints. They only make sense if the IR has a + notion of conditional branches or back edges (loops).</li> +</ul> +<p>Alright, let's get started.</p> +<h3 id="welcome-to-abstract-interpretation">Welcome to abstract interpretation</h3> +<p>Abstract interpretation means a couple different things to different people. +There's rigorous mathematical formalism thanks to Patrick and Radhia Cousot, +our favorite power couple, and there's also sketchy hand-wavy stuff like what +will follow in this post. In the end, all people are trying to do is reason +about program behavior without running it.</p> +<p>In particular, abstract interpretation is an <em>over-approximation</em> of the +behavior of a program. Correctly implemented abstract interpreters never lie, +but they might be a little bit pessimistic. This is because instead of using +real values and running the program---which would produce a concrete result and +some real-world behavior---we "run" the program with a parallel universe of +<em>abstract</em> values. 
This abstract run gives us information about all possible +runs of the program.<sup id="fnref:logozzo"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fn:logozzo">1</a></sup></p> +<p>Abstract values always represent sets of concrete values. Instead of literally +storing a set (in the world of integers, for example, it could get pretty +big...there are a lot of integers), we group them into a finite number of named +subsets.<sup id="fnref:lattices"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fn:lattices">2</a></sup></p> +<p>Let's learn a little about abstract interpretation with an example program and +example abstract domain. Here's the example program:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span> <span class="o">=</span> <span class="mi">2</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +</pre></div> + +<p>And our abstract domain is "is the number positive" (where "positive" means +nonnegative, but I wanted to keep the words distinct):</p> +<div class="code"><pre class="code literal-block"> top + / \ +positive negative + \ / + bottom +</pre></div> + +<p>The special <em>top</em> value means "I don't know" and the special <em>bottom</em> value +means "empty set" or "unreachable". 
The <em>positive</em> and <em>negative</em> values +represent the sets of all positive and negative numbers, respectively.</p> +<p>We initialize all the variables <code>v0</code>, <code>v1</code>, and <code>v2</code> to <em>bottom</em> and then walk +our IR, updating our knowledge as we go.</p> +<div class="code"><pre class="code literal-block"><span class="c1"># here</span> +<span class="n">v0</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="mi">2</span> +<span class="n">v2</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +</pre></div> + +<p>In order to do that, we have to have <em>transfer functions</em> for each operation. +For constants, the transfer function is easy: determine if the constant is +positive or negative. For other operations, we have to define a function that +takes the abstract values of the operands and returns the abstract value of the +result.</p> +<p>In order to be correct, transfer functions for operations have to be compatible +with the behavior of their corresponding concrete implementations. 
You can +think of them having an implicit universal quantifier <em>forall</em> in front of +them.</p> +<p>Let's step through the constants at least:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">2</span> +<span class="c1"># here</span> +<span class="n">v2</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +</pre></div> + +<p>Now we need to figure out the transfer function for <code>add</code>. It's kind of tricky +right now because we haven't specified our abstract domain very well. I keep +saying "numbers", but what kinds of numbers? Integers? Real numbers? Floating +point? Some kind of fixed-width bit vector (<code>int8</code>, <code>uint32</code>, ...) like an +actual machine "integer"?</p> +<p>For this post, I am going to use the mathematical definition of integer, which +means that the values are not bounded in size and therefore do not overflow. 
+Actual hardware memory constraints aside, this is kind of like a Python <code>int</code>.</p> +<p>So let's look at what happens when we add two abstract numbers:</p> +<table> +<thead> +<tr> +<th></th> +<th>top</th> +<th>positive</th> +<th>negative</th> +<th>bottom</th> +</tr> +</thead> +<tbody> +<tr> +<td><strong>top</strong></td> +<td>top</td> +<td>top</td> +<td>top</td> +<td>bottom</td> +</tr> +<tr> +<td><strong>positive</strong></td> +<td>top</td> +<td>positive</td> +<td>top</td> +<td>bottom</td> +</tr> +<tr> +<td><strong>negative</strong></td> +<td>top</td> +<td>top</td> +<td>negative</td> +<td>bottom</td> +</tr> +<tr> +<td><strong>bottom</strong></td> +<td>bottom</td> +<td>bottom</td> +<td>bottom</td> +<td>bottom</td> +</tr> +</tbody> +</table> +<p>As an example, let's try to add two numbers <code>a</code> and <code>b</code>, where <code>a</code> is positive +and <code>b</code> is negative. We don't know anything about their values other than their +signs. They could be <code>5</code> and <code>-3</code>, where the result is <code>2</code>, or they could be +<code>1</code> and <code>-100</code>, where the result is <code>-99</code>. This is why we can't say anything +about the result of this operation and have to return <em>top</em>.</p> +<p>The short of this table is that we only really know the result of an addition +if both operands are positive or both operands are negative. Thankfully, in +this example, both operands are known positive. 
So we can learn something about +<code>v2</code>:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">2</span> +<span class="n">v2</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +<span class="c1"># here</span> +</pre></div> + +<p>This may not seem useful in isolation, but analyzing more complex programs even +with this simple domain may be able to remove checks such as <code>if (v2 &lt; 0) { ... }</code>.</p> +<p>Let's take a look at another example using a sample <code>absval</code> (absolute value) +IR operation:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v0</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v4</span><span 
class="p">)</span> +</pre></div> + +<p>Even though we have no constant/concrete values, we can still learn something +about the states of values throughout the program. Since we know that <code>absval</code> +always returns a positive number, we learn that <code>v2</code>, <code>v3</code>, and <code>v4</code> are all +positive. This means that we can optimize out the <code>absval</code> operation on <code>v5</code>:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v0</span><span class="p">)</span> +<span class="n">v3</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v1</span><span class="p">)</span> +<span class="n">v4</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">v4</span> +</pre></div> + +<p>Other interesting lattices include:</p> +<ul> +<li>Constants (where the middle row is pretty wide)</li> +<li>Range analysis (bounds on min and max of a number)</li> +<li>Known bits (using a bitvector representation of a number, which bits are + always 0 or 1)</li> +</ul> +<p>For the 
rest of this blog post, we are going to do a very limited version of +"known bits", called <em>parity</em>. This analysis only tracks the least significant +bit of a number, which indicates if it is even or odd.</p> +<h3 id="parity">Parity</h3> +<p>The lattice is pretty similar to the positive/negative lattice:</p> +<div class="code"><pre class="code literal-block"> top + / \ +even odd + \ / + bottom +</pre></div> + +<p>Let's define a data structure to represent this in Python code:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> + + <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> +</pre></div> + +<p>And instantiate the members of the lattice:</p> +<div class="code"><pre class="code literal-block"><span class="n">TOP</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"top"</span><span class="p">)</span> +<span class="n">EVEN</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"even"</span><span class="p">)</span> +<span class="n">ODD</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"odd"</span><span class="p">)</span> +<span class="n">BOTTOM</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"bottom"</span><span class="p">)</span> +</pre></div> + +<p>Now 
let's write a forward flow analysis of a basic block using this lattice. +We'll do that by assuming that a method on <code>Parity</code> is defined for each IR +operation. For example, <code>Parity.add</code>, <code>Parity.lshift</code>, etc.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">analyze</span><span class="p">(</span><span class="n">block</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> + <span class="n">parity</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">BOTTOM</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">block</span><span class="p">}</span> + + <span class="k">def</span> <span class="nf">parity_of</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Parity</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">parity</span><span class="p">[</span><span class="n">value</span><span class="p">]</span> + + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">block</span><span class="p">:</span> + <span class="n">transfer</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">Parity</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> + <span class="n">args</span> <span class="o">=</span> <span 
class="p">[</span><span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">parity</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> +</pre></div> + +<p>For every operation, we compute the abstract value---the parity---of the +arguments and then call the corresponding method on <code>Parity</code> to get the +abstract result.</p> +<!-- TODO maybe learn more about different IRs and how they do constants. +apparently pypy/llvm are free-floating; cinder is not --> +<p>We need to special case <code>Constant</code>s due to a quirk of how the Toy IR is +constructed: the constants don't appear in the instruction stream and instead +are free-floating.</p> +<p>Let's start by looking at the abstraction function for concrete +values---constants:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="c1"># ...</span> + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">const</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="n">value</span><span class="o">.</span><span class="n">value</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> <span class="n">ODD</span> 
+</pre></div> + +<p>Seems reasonable enough. Let's pause on operations for a moment and consider an +example program:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="n">v4</span><span class="p">)</span> +</pre></div> + +<p>This function (which is admittedly a little contrived) takes two inputs, shifts +them left by one bit, adds the result, and then checks the least significant +bit of the addition result. It then passes that result into a <code>dummy</code> function, +which you can think of as "return" or "escape".</p> +<p>To do some abstract interpretation on this program, we'll need to implement the +transfer functions for <code>lshift</code> and <code>add</code> (<code>dummy</code> will just always return +<code>TOP</code>). We'll start with <code>add</code>. 
Remember that adding two even numbers returns +an even number, adding two odd numbers returns an even number, and mixing even +and odd returns an odd number.</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="c1"># ...</span> + <span class="k">def</span> <span class="nf">add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">BOTTOM</span> <span class="ow">or</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">BOTTOM</span><span class="p">:</span> + <span class="k">return</span> <span class="n">BOTTOM</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">TOP</span> <span class="ow">or</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">TOP</span><span class="p">:</span> + <span class="k">return</span> <span class="n">TOP</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">EVEN</span> <span class="ow">and</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">EVEN</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">ODD</span> <span class="ow">and</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">ODD</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">return</span> <span class="n">ODD</span> +</pre></div> + +<p>We also need to fill in the other cases where the operands are <em>top</em> or +<em>bottom</em>. 
In this case, they are both "contagious"; if either operand is +bottom, the result is as well. If neither is bottom but either operand is top, +the result is as well.</p> +<p>Now let's look at <code>lshift</code>. Shifting any number left by a non-zero number of +bits will always result in an even number, but we need to be careful about the +zero case! Shifting by zero doesn't change the number at all. Unfortunately, +since our lattice has no notion of zero, we have to over-approximate here:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="c1"># ...</span> + <span class="k">def</span> <span class="nf">lshift</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># self &lt;&lt; other</span> + <span class="k">if</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">ODD</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">return</span> <span class="n">TOP</span> +</pre></div> + +<p>This means that we will miss some opportunities to optimize, but it's a +tradeoff that's just part of the game. (We could also add more elements to our +lattice, but that's a topic for another day.)</p> +<p>Now, if we run our abstract interpretation, we'll collect some interesting +properties about the program. 
If we temporarily hack on the internals of +<code>bb_to_str</code>, we can print out parity information alongside the IR operations:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span><span class="p">:</span><span class="n">even</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span><span class="p">:</span><span class="n">even</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span><span class="p">:</span><span class="n">even</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="n">v4</span><span class="p">)</span> +</pre></div> + +<p>This is pretty awesome, because we can see that <code>v4</code>, the result of the +addition, is <em>always</em> even. Maybe we can do something with that information.</p> +<h3 id="optimization">Optimization</h3> +<p>One way that a program might check if a number is odd is by checking the least +significant bit. This is a common pattern in C code, where you might see code +like <code>y = x &amp; 1</code>. 
Let's introduce a <code>bitand</code> IR operation that acts like the +<code>&amp;</code> operator in C/Python. Here is an example of use of it in our program:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span> <span class="o">=</span> <span class="n">bitand</span><span class="p">(</span><span class="n">v4</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># new!</span> +<span class="n">v6</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="n">v5</span><span class="p">)</span> +</pre></div> + +<p>We'll hold off on implementing the transfer function for it---that's left as an +exercise for the reader---and instead do something different.</p> +<p>Instead, we'll see if we can optimize operations of the form <code>bitand(X, 1)</code>. 
If +we statically know the parity as a result of abstract interpretation, we can +replace the <code>bitand</code> with a constant <code>0</code> or <code>1</code>.</p> +<p>We'll first modify the <code>analyze</code> function (and rename it) to return a new +<code>Block</code> containing optimized instructions:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">block</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">parity</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">BOTTOM</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">block</span><span class="p">}</span> + + <span class="k">def</span> <span class="nf">parity_of</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Parity</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">parity</span><span class="p">[</span><span class="n">value</span><span class="p">]</span> + + <span class="n">result</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">block</span><span class="p">:</span> + <span class="c1"># TODO: Optimize op</span> + <span class="c1"># Emit</span> + <span class="n">result</span><span class="o">.</span><span 
class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="c1"># Analyze</span> + <span class="n">transfer</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">Parity</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> + <span class="n">args</span> <span class="o">=</span> <span class="p">[</span><span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">parity</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> + <span class="k">return</span> <span class="n">result</span> +</pre></div> + +<p>We're approaching this the way that PyPy does things under the hood, which is +all in roughly a single pass. It tries to optimize an instruction away, and if +it can't, it copies it into the new block.</p> +<p>Now let's add in the <code>bitand</code> optimization. It's mostly some gross-looking +pattern matching that checks if the right hand side of a bitwise <code>and</code> +operation is <code>1</code> (TODO: the left hand side, too). 
CF had some neat ideas on how +to make this more ergonomic, which I might save for later.<sup id="fnref:match-args"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fn:match-args">3</a></sup></p> +<p>Then, if we know the parity, optimize the <code>bitand</code> into a constant.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">block</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">parity</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">BOTTOM</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">block</span><span class="p">}</span> + + <span class="k">def</span> <span class="nf">parity_of</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Parity</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">parity</span><span class="p">[</span><span class="n">value</span><span class="p">]</span> + + <span class="n">result</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">block</span><span class="p">:</span> + <span class="c1"># Try to simplify</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span 
class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> <span class="ow">and</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"bitand"</span><span class="p">:</span> + <span class="n">arg</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">mask</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">mask</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> <span class="n">mask</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> + <span class="k">if</span> <span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="ow">is</span> <span class="n">EVEN</span><span class="p">:</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> + <span class="k">continue</span> + <span class="k">elif</span> <span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="ow">is</span> <span class="n">ODD</span><span class="p">:</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span> + <span 
class="k">continue</span> + <span class="c1"># Emit</span> + <span class="n">result</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="c1"># Analyze</span> + <span class="n">transfer</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">Parity</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> + <span class="n">args</span> <span class="o">=</span> <span class="p">[</span><span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">parity</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> + <span class="k">return</span> <span class="n">result</span> +</pre></div> + +<p>Remember: because we use union-find to rewrite instructions in the optimizer +(<code>make_equal_to</code>), later uses of the same instruction get the new +optimized version "for free" (<code>find</code>).</p> +<p>Let's see how it works on our IR:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">lshift</span><span 
class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v6</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +</pre></div> + +<p>Hey, neat! <code>bitand</code> disappeared and the argument to <code>dummy</code> is now the constant +<code>0</code> because we know the lowest bit.</p> +<h3 id="wrapping-up">Wrapping up</h3> +<p>Hopefully you have gained a little bit of an intuitive understanding of +abstract interpretation. Last year, being able to write some code made me more +comfortable with the math. Now being more comfortable with the math is helping +me write the code. It's a nice upward spiral.</p> +<p>The two abstract domains we used in this post are simple and not very useful in +practice but it's possible to get very far using slightly more complicated +abstract domains. Common domains include: constant propagation, type inference, +range analysis, effect inference, liveness, etc. 
For example, here is a +sample lattice for constant propagation:</p> +<figure style="display: block; margin: 0 auto;"> +<!-- +digraph G { + rankdir="BT"; + top [shape=Msquare]; + bottom [shape=Msquare]; + + bottom -> "-inf"; + bottom -> "-2"; + bottom -> "-1"; + bottom -> 0; + bottom -> 1; + bottom -> 2; + bottom -> "+inf"; + + "-inf" -> negative; + "-2" -> negative; + "-1" -> negative; + 0 -> top; + 1 -> nonnegative; + 2 -> nonnegative; + "+inf" -> nonnegative; + + negative -> nonzero; + nonnegative -> nonzero; + nonzero->top; + + {rank=same; "-inf"; "-2"; "-1"; 0; 1; 2; "+inf"} + {rank=same; nonnegative; negative;} +} +--> + <object class="svg" type="image/svg+xml" data="https://www.pypy.org/images/2024-complex-lattice.svg"> + </object> +</figure> + +<p>It has multiple levels to indicate more and less precision. For example, you +might learn that a variable is either <code>1</code> or <code>2</code> and be able to encode that as +<code>nonnegative</code> instead of just going straight to <code>top</code>.</p> +<p>Check out some real-world abstract interpretation in open source projects:</p> +<ul> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/Support/KnownBits.cpp">Known bits in LLVM</a></li> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/IR/ConstantRange.cpp">Constant range in LLVM</a></li> +<li>But I am told that the ranges don't form a lattice (see <a href="https://dl.acm.org/doi/10.1145/2651360">Interval Analysis and Machine Arithmetic: Why Signedness Ignorance Is Bliss</a>)</li> +<li><a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">Tristate numbers for known bits in Linux eBPF</a></li> +<li><a href="https://github.com/torvalds/linux/blob/28bbe4ea686a023929d907cc168430b61094811c/kernel/bpf/verifier.c#L13335">Range analysis in Linux eBPF</a></li> +<li><a href="https://github.com/bminor/binutils-gdb/blob/master/gdb/prologue-value.c">GDB prologue analysis</a> + of assembly to understand the 
stack and find frame pointers without using + DWARF (<a href="https://sourceware.org/gdb/wiki/Internals/Prologue%20Analysis">some + docs</a>)</li> +</ul> +<p>If you have some readable examples, please share them so I can add.</p> +<h3 id="acknowledgements">Acknowledgements</h3> +<p>Thank you to <a href="https://cfbolz.de/">CF Bolz-Tereick</a> for the toy optimizer and +helping edit this post!</p> +<div class="footnote"> +<hr> +<ol> +<li id="fn:logozzo"> +<p>In the words of abstract interpretation researchers Vincent Laviron +and Francesco Logozzo in their paper <em>Refining Abstract +Interpretation-based Static Analyses with Hints</em> (APLAS 2009):</p> +<blockquote> +<p>The three main elements of an abstract interpretation are: (i) the +abstract elements ("which properties am I interested in?"); (ii) the +abstract transfer functions ("which is the abstract semantics of basic +statements?"); and (iii) the abstract operations ("how do I combine the +abstract elements?").</p> +</blockquote> +<p>We don't have any of these "abstract operations" in this post because +there's no control flow but you can read about them elsewhere! 
<a class="footnote-backref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fnref:logozzo" title="Jump back to footnote 1 in the text">↩</a></p> +</li> +<li id="fn:lattices"> +<p>These abstract values are arranged in a <em>lattice</em>, which is a +mathematical structure with some properties but the most important ones are +that it has a top, a bottom, a partial order, a meet operation, and values +can only move in one direction on the lattice.</p> +<p>Using abstract values from a lattice promises two things:</p> +<ul> +<li>The analysis will terminate</li> +<li>The analysis will be correct for <em>any</em> run of the program, not just one + sample run</li> +</ul> +<p><a class="footnote-backref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fnref:lattices" title="Jump back to footnote 2 in the text">↩</a></p> +</li> +<li id="fn:match-args"> +<p>Something about <code>__match_args__</code> and <code>@property</code>... <a class="footnote-backref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fnref:match-args" title="Jump back to footnote 3 in the text">↩</a></p> +</li> +</ol> +</div>toy-optimizerhttps://www.pypy.org/posts/2024/07/toy-abstract-interpretation.htmlWed, 24 Jul 2024 14:48:00 GMTAllocation Removal in the Toy Optimizerhttps://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.htmlCarl Friedrich Bolz-Tereick<p>One of the workhorse optimizations of RPython's tracing JIT is <a class="reference external" href="https://dl.acm.org/doi/10.1145/1929501.1929508">allocation +removal</a>, which removes short-lived object allocations from traces. Many Python +programs create a lot of objects that only live for a short time, and whose +lifespan is fully predictable (common examples are integer and float boxes, but +also tuples, frames, intermediate string results, etc). Allocation removal will +try (and very often succeed) to remove these allocations from traces. 
In +this blog post I want to show a toy version of how allocation removal is +implemented.</p> +<p>In the <a class="reference external" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html">previous</a> blog post of this series I showed the complete code for +writing a toy one-pass optimizer that does constant folding, common +subexpression elimination and strength reduction. In this +second post, I want to use allocation removal as a more advanced optimization +pass. The basic optimization framework is the same, we will use the same +datastructures for intermediate representation and also keep using the same +union find data structure to store equivalences between IR operations. Here's +the infrastructure code from the last post:</p> +<div class="code"><pre class="code python"><a id="rest_code_1adc0460707d4986a0ff9334f2124306-1" name="rest_code_1adc0460707d4986a0ff9334f2124306-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-1"></a><span class="kn">import</span> <span class="nn">pytest</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-2" name="rest_code_1adc0460707d4986a0ff9334f2124306-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-2"></a><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Any</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-3" name="rest_code_1adc0460707d4986a0ff9334f2124306-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-3"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-4" name="rest_code_1adc0460707d4986a0ff9334f2124306-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-4"></a> +<a 
id="rest_code_1adc0460707d4986a0ff9334f2124306-5" name="rest_code_1adc0460707d4986a0ff9334f2124306-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-5"></a><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-6" name="rest_code_1adc0460707d4986a0ff9334f2124306-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-6"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-7" name="rest_code_1adc0460707d4986a0ff9334f2124306-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-7"></a> <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-8" name="rest_code_1adc0460707d4986a0ff9334f2124306-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-8"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-9" name="rest_code_1adc0460707d4986a0ff9334f2124306-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-9"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-10" name="rest_code_1adc0460707d4986a0ff9334f2124306-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-10"></a> <span 
class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-11" name="rest_code_1adc0460707d4986a0ff9334f2124306-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-11"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-12" name="rest_code_1adc0460707d4986a0ff9334f2124306-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-12"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-13" name="rest_code_1adc0460707d4986a0ff9334f2124306-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-13"></a><span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-14" name="rest_code_1adc0460707d4986a0ff9334f2124306-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-14"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-15" name="rest_code_1adc0460707d4986a0ff9334f2124306-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-15"></a> <span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">args</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-16" name="rest_code_1adc0460707d4986a0ff9334f2124306-16" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-16"></a> <span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-17" name="rest_code_1adc0460707d4986a0ff9334f2124306-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-17"></a> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-18" name="rest_code_1adc0460707d4986a0ff9334f2124306-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-18"></a> <span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">args</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-19" name="rest_code_1adc0460707d4986a0ff9334f2124306-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-19"></a> <span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-20" name="rest_code_1adc0460707d4986a0ff9334f2124306-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-20"></a> <span class="bp">self</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-21" name="rest_code_1adc0460707d4986a0ff9334f2124306-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-21"></a><span class="hll"> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-22" 
name="rest_code_1adc0460707d4986a0ff9334f2124306-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-22"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-23" name="rest_code_1adc0460707d4986a0ff9334f2124306-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-23"></a> <span class="k">return</span> <span class="p">(</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-24" name="rest_code_1adc0460707d4986a0ff9334f2124306-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-24"></a> <span class="sa">f</span><span class="s2">"Operation(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">, "</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-25" name="rest_code_1adc0460707d4986a0ff9334f2124306-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-25"></a> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span><span class="si">}</span><span class="s2">, "</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-26" name="rest_code_1adc0460707d4986a0ff9334f2124306-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-26"></a> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span 
class="bp">self</span><span class="o">.</span><span class="n">info</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-27" name="rest_code_1adc0460707d4986a0ff9334f2124306-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-27"></a> <span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-28" name="rest_code_1adc0460707d4986a0ff9334f2124306-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-28"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-29" name="rest_code_1adc0460707d4986a0ff9334f2124306-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-29"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Value</span><span class="p">:</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-30" name="rest_code_1adc0460707d4986a0ff9334f2124306-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-30"></a> <span class="n">op</span> <span class="o">=</span> <span class="bp">self</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-31" name="rest_code_1adc0460707d4986a0ff9334f2124306-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-31"></a> <span class="k">while</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-32" name="rest_code_1adc0460707d4986a0ff9334f2124306-32" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-32"></a> <span class="nb">next</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">forwarded</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-33" name="rest_code_1adc0460707d4986a0ff9334f2124306-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-33"></a> <span class="k">if</span> <span class="nb">next</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-34" name="rest_code_1adc0460707d4986a0ff9334f2124306-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-34"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-35" name="rest_code_1adc0460707d4986a0ff9334f2124306-35" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-35"></a> <span class="n">op</span> <span class="o">=</span> <span class="nb">next</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-36" name="rest_code_1adc0460707d4986a0ff9334f2124306-36" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-36"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-37" name="rest_code_1adc0460707d4986a0ff9334f2124306-37" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-37"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-38" name="rest_code_1adc0460707d4986a0ff9334f2124306-38" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-38"></a> <span class="k">def</span> <span class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-39" name="rest_code_1adc0460707d4986a0ff9334f2124306-39" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-39"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-40" name="rest_code_1adc0460707d4986a0ff9334f2124306-40" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-40"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-41" name="rest_code_1adc0460707d4986a0ff9334f2124306-41" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-41"></a> <span class="k">def</span> <span class="nf">make_equal_to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-42" name="rest_code_1adc0460707d4986a0ff9334f2124306-42" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-42"></a> <span class="bp">self</span><span class="o">.</span><span class="n">find</span><span class="p">()</span><span class="o">.</span><span class="n">_set_forwarded</span><span class="p">(</span><span 
class="n">value</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-43" name="rest_code_1adc0460707d4986a0ff9334f2124306-43" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-43"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-44" name="rest_code_1adc0460707d4986a0ff9334f2124306-44" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-44"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-45" name="rest_code_1adc0460707d4986a0ff9334f2124306-45" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-45"></a> <span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-46" name="rest_code_1adc0460707d4986a0ff9334f2124306-46" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-46"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-47" name="rest_code_1adc0460707d4986a0ff9334f2124306-47" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-47"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-48" name="rest_code_1adc0460707d4986a0ff9334f2124306-48" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-48"></a><span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span 
class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-49" name="rest_code_1adc0460707d4986a0ff9334f2124306-49" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-49"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-50" name="rest_code_1adc0460707d4986a0ff9334f2124306-50" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-50"></a> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-51" name="rest_code_1adc0460707d4986a0ff9334f2124306-51" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-51"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-52" name="rest_code_1adc0460707d4986a0ff9334f2124306-52" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-52"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-53" name="rest_code_1adc0460707d4986a0ff9334f2124306-53" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-53"></a> <span class="k">return</span> <span class="sa">f</span><span class="s2">"Constant(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">)"</span> +<a 
id="rest_code_1adc0460707d4986a0ff9334f2124306-54" name="rest_code_1adc0460707d4986a0ff9334f2124306-54" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-54"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-55" name="rest_code_1adc0460707d4986a0ff9334f2124306-55" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-55"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-56" name="rest_code_1adc0460707d4986a0ff9334f2124306-56" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-56"></a> <span class="k">return</span> <span class="bp">self</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-57" name="rest_code_1adc0460707d4986a0ff9334f2124306-57" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-57"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-58" name="rest_code_1adc0460707d4986a0ff9334f2124306-58" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-58"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-59" name="rest_code_1adc0460707d4986a0ff9334f2124306-59" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-59"></a> <span class="k">assert</span> <span class="p">(</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-60" 
name="rest_code_1adc0460707d4986a0ff9334f2124306-60" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-60"></a> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-61" name="rest_code_1adc0460707d4986a0ff9334f2124306-61" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-61"></a> <span class="ow">and</span> <span class="n">value</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-62" name="rest_code_1adc0460707d4986a0ff9334f2124306-62" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-62"></a> <span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-63" name="rest_code_1adc0460707d4986a0ff9334f2124306-63" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-63"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-64" name="rest_code_1adc0460707d4986a0ff9334f2124306-64" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-64"></a><span class="k">class</span> <span class="nc">Block</span><span class="p">(</span><span class="nb">list</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-65" name="rest_code_1adc0460707d4986a0ff9334f2124306-65" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-65"></a> <span class="k">def</span> <span class="nf">opbuilder</span><span 
class="p">(</span><span class="n">opname</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-66" name="rest_code_1adc0460707d4986a0ff9334f2124306-66" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-66"></a> <span class="k">def</span> <span class="nf">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-67" name="rest_code_1adc0460707d4986a0ff9334f2124306-67" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-67"></a> <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-68" name="rest_code_1adc0460707d4986a0ff9334f2124306-68" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-68"></a> <span class="n">arg</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-69" name="rest_code_1adc0460707d4986a0ff9334f2124306-69" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-69"></a> <span class="k">return</span> <span class="n">arg</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-70" name="rest_code_1adc0460707d4986a0ff9334f2124306-70" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-70"></a> <span class="k">def</span> <span class="nf">build</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span 
class="n">args</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-71" name="rest_code_1adc0460707d4986a0ff9334f2124306-71" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-71"></a> <span class="c1"># construct an Operation, wrap the</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-72" name="rest_code_1adc0460707d4986a0ff9334f2124306-72" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-72"></a> <span class="c1"># arguments in Constants if necessary</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-73" name="rest_code_1adc0460707d4986a0ff9334f2124306-73" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-73"></a> <span class="n">op</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-74" name="rest_code_1adc0460707d4986a0ff9334f2124306-74" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-74"></a> <span class="p">[</span><span class="n">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-75" name="rest_code_1adc0460707d4986a0ff9334f2124306-75" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-75"></a> <span class="c1"># add it to self, the basic block</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-76" name="rest_code_1adc0460707d4986a0ff9334f2124306-76" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-76"></a> <span class="bp">self</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-77" name="rest_code_1adc0460707d4986a0ff9334f2124306-77" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-77"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-78" name="rest_code_1adc0460707d4986a0ff9334f2124306-78" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-78"></a> <span class="k">return</span> <span class="n">build</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-79" name="rest_code_1adc0460707d4986a0ff9334f2124306-79" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-79"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-80" name="rest_code_1adc0460707d4986a0ff9334f2124306-80" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-80"></a> <span class="c1"># a bunch of operations we support</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-81" name="rest_code_1adc0460707d4986a0ff9334f2124306-81" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-81"></a> <span class="n">add</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"add"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-82" name="rest_code_1adc0460707d4986a0ff9334f2124306-82" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-82"></a> <span class="n">mul</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"mul"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-83" name="rest_code_1adc0460707d4986a0ff9334f2124306-83" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-83"></a> <span class="n">getarg</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"getarg"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-84" name="rest_code_1adc0460707d4986a0ff9334f2124306-84" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-84"></a> <span class="n">dummy</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"dummy"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-85" name="rest_code_1adc0460707d4986a0ff9334f2124306-85" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-85"></a> <span class="n">lshift</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"lshift"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-86" name="rest_code_1adc0460707d4986a0ff9334f2124306-86" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-86"></a> <span class="c1"># some new one for this post</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-87" name="rest_code_1adc0460707d4986a0ff9334f2124306-87" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-87"></a> <span class="n">alloc</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"alloc"</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-88" name="rest_code_1adc0460707d4986a0ff9334f2124306-88" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-88"></a><span class="hll"> <span class="n">load</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"load"</span><span class="p">)</span> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-89" name="rest_code_1adc0460707d4986a0ff9334f2124306-89" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-89"></a><span class="hll"> <span class="n">store</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"store"</span><span class="p">)</span> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-90" name="rest_code_1adc0460707d4986a0ff9334f2124306-90" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-90"></a><span class="hll"> <span class="nb">print</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"print"</span><span class="p">)</span> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-91" name="rest_code_1adc0460707d4986a0ff9334f2124306-91" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-91"></a><span class="hll"> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-92" name="rest_code_1adc0460707d4986a0ff9334f2124306-92" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-92"></a><span class="hll"><span class="k">def</span> <span class="nf">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="n">varprefix</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"var"</span><span class="p">):</span> +</span><a id="rest_code_1adc0460707d4986a0ff9334f2124306-93" name="rest_code_1adc0460707d4986a0ff9334f2124306-93" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-93"></a> <span class="k">def</span> <span class="nf">arg_to_str</span><span class="p">(</span><span class="n">arg</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-94" name="rest_code_1adc0460707d4986a0ff9334f2124306-94" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-94"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-95" name="rest_code_1adc0460707d4986a0ff9334f2124306-95" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-95"></a> <span class="k">return</span> <span class="nb">str</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-96" name="rest_code_1adc0460707d4986a0ff9334f2124306-96" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-96"></a> <span class="k">else</span><span class="p">:</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-97" name="rest_code_1adc0460707d4986a0ff9334f2124306-97" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-97"></a> <span class="k">return</span> <span class="n">varnames</span><span class="p">[</span><span class="n">arg</span><span class="p">]</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-98" name="rest_code_1adc0460707d4986a0ff9334f2124306-98" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-98"></a> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-99" name="rest_code_1adc0460707d4986a0ff9334f2124306-99" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-99"></a> <span class="n">varnames</span> <span class="o">=</span> <span class="p">{}</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-100" name="rest_code_1adc0460707d4986a0ff9334f2124306-100" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-100"></a> <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-101" name="rest_code_1adc0460707d4986a0ff9334f2124306-101" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-101"></a> <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-102" 
name="rest_code_1adc0460707d4986a0ff9334f2124306-102" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-102"></a> <span class="n">var</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">varprefix</span><span class="si">}{</span><span class="n">index</span><span class="si">}</span><span class="s2">"</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-103" name="rest_code_1adc0460707d4986a0ff9334f2124306-103" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-103"></a> <span class="n">varnames</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">var</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-104" name="rest_code_1adc0460707d4986a0ff9334f2124306-104" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-104"></a> <span class="n">arguments</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-105" name="rest_code_1adc0460707d4986a0ff9334f2124306-105" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-105"></a> <span class="n">arg_to_str</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-106" name="rest_code_1adc0460707d4986a0ff9334f2124306-106" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-106"></a> <span class="k">for</span> <span class="n">i</span> 
<span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">))</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-107" name="rest_code_1adc0460707d4986a0ff9334f2124306-107" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-107"></a> <span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-108" name="rest_code_1adc0460707d4986a0ff9334f2124306-108" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-108"></a> <span class="n">strop</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">var</span><span class="si">}</span><span class="s2"> = </span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">arguments</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-109" name="rest_code_1adc0460707d4986a0ff9334f2124306-109" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-109"></a> <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">strop</span><span class="p">)</span> +<a id="rest_code_1adc0460707d4986a0ff9334f2124306-110" name="rest_code_1adc0460707d4986a0ff9334f2124306-110" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1adc0460707d4986a0ff9334f2124306-110"></a> <span class="k">return</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span 
class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +</pre></div> +<p>There are two changes to the code from the last post: <code class="docutils literal">Operation</code> instances +have a new <code class="docutils literal">.info</code> field, which is set to <code class="docutils literal">None</code> by default. We will learn +how the info field is used a bit further down. Also, we define some new +operations.</p> +<section id="interpreter"> +<h2>Interpreter</h2> +<p>In this post we will mainly concern ourselves with optimizing +programs that allocate memory. We assume that our language is garbage collected +and memory safe. The new operations that we will optimize are <code class="docutils literal">alloc</code> +(allocates some new object), <code class="docutils literal">store</code> (stores a value into a fixed field of an +object), <code class="docutils literal">load</code> (loads the value from a field in the object).</p> +<p>We are leaving out a lot of details of a "real" system here, usually an +<code class="docutils literal">alloc</code> operation would get some extra information, for example the type of +the freshly allocated object or at least its size. 
<code class="docutils literal">load</code> and <code class="docutils literal">store</code> would +typically have some kind of field offset and maybe some information about the +field's type.</p> +<p>Here's a simple program that uses these operations:</p> +<pre class="literal-block">var0 = getarg(0) +obj0 = alloc() +store(obj0, 0, var0) +var1 = load(obj0, 0) +print(var1)</pre> +<p>The code allocates a new object <code class="docutils literal">obj0</code>, stores <code class="docutils literal">var0</code> into field <code class="docutils literal">0</code> of +the object, then loads the same field and prints the result of the load.</p> +<p>Before we get started in writing the optimizer for these operations, let's try +to understand the semantics of the new operations a bit better. To do this, we +can sketch a small interpreter for basic blocks, supporting only <code class="docutils literal">getarg</code>, +<code class="docutils literal">alloc</code>, <code class="docutils literal">store</code>, <code class="docutils literal">load</code>, <code class="docutils literal">print</code>:</p> +<div class="code"><pre class="code python"><a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-1" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-1"></a><span class="k">def</span> <span class="nf">test_interpret</span><span class="p">():</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-2" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-3" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-3"
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-4" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-4"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-5" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-5"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-6" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-6"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-7" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-7"></a> <span 
class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var1</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-8" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-8"></a> <span class="k">assert</span> <span class="n">interpret</span><span class="p">(</span><span class="n">bb</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> <span class="o">==</span> <span class="mi">17</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-9" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-9"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-10" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-10"></a><span class="k">class</span> <span class="nc">Object</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-11" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-11"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-12" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-12"></a> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">int</span><span 
class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-13" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-13"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-14" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-14"></a> <span class="k">def</span> <span class="nf">store</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">idx</span> <span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="n">Any</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-15" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-15"></a> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-16" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-16"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-17" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-17"></a> <span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span class="bp">self</span><span 
class="p">,</span> <span class="n">idx</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-18" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-18"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-19" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-19"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-20" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-20"></a><span class="k">def</span> <span class="nf">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-21" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-21"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">index</span><span class="p">),</span> <span class="n">Constant</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-22" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-22" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-22"></a> <span class="k">return</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">index</span><span class="p">)</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-23" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-23"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-24" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-24"></a><span class="k">def</span> <span class="nf">interpret</span><span class="p">(</span><span class="n">bb</span> <span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span> <span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="n">Any</span><span class="p">]):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-25" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-25"></a> <span class="k">def</span> <span class="nf">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">i</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-26" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-26"></a> <span class="n">arg</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span 
class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-27" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-27"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-28" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-28"></a> <span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-29" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-29"></a> <span class="k">else</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-30" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-30"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-31" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-31"></a> <span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-32" 
name="rest_code_ec539155f5ee4081b1310b37de07b1b6-32" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-32"></a> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-33" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-33"></a> <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-34" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-34"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"getarg"</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-35" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-35" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-35"></a> <span class="n">res</span> <span class="o">=</span> <span class="n">args</span><span class="p">[</span><span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)]</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-36" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-36" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-36"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span 
class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-37" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-37" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-37"></a> <span class="n">res</span> <span class="o">=</span> <span class="n">Object</span><span class="p">()</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-38" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-38" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-38"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-39" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-39" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-39"></a> <span class="n">fieldnum</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-40" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-40" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-40"></a> <span class="n">res</span> <span class="o">=</span> <span class="n">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">fieldnum</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-41" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-41" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-41"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-42" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-42" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-42"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-43" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-43" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-43"></a> <span class="n">fieldnum</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-44" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-44" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-44"></a> <span class="n">fieldvalue</span> <span class="o">=</span> <span class="n">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-45" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-45" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-45"></a> <span class="n">obj</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">fieldnum</span><span 
class="p">,</span> <span class="n">fieldvalue</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-46" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-46" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-46"></a> <span class="c1"># no result, only side effect</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-47" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-47" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-47"></a> <span class="k">continue</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-48" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-48" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-48"></a> <span class="k">elif</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"print"</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-49" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-49" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-49"></a> <span class="n">res</span> <span class="o">=</span> <span class="n">argval</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-50" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-50" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-50"></a> <span class="nb">print</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-51" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-51" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-51"></a> <span class="k">return</span> <span class="n">res</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-52" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-52" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-52"></a> <span class="k">else</span><span class="p">:</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-53" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-53" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-53"></a> <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-54" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-54" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-54"></a> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2"> not supported"</span><span class="p">)</span> +<a id="rest_code_ec539155f5ee4081b1310b37de07b1b6-55" name="rest_code_ec539155f5ee4081b1310b37de07b1b6-55" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ec539155f5ee4081b1310b37de07b1b6-55"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">res</span> +</pre></div> +<p>The interpreter walks the operations of a block, executing each one in turn. It +uses the <code class="docutils literal">info</code> field to store the result of each already executed +<code class="docutils literal">Operation</code>. 
In this interpreter sketch we stop at the first <code class="docutils literal">print</code> that +we execute and return its argument for the simple but bad reason that it makes +<code class="docutils literal">test_interpret</code> easier to write.</p> +<p>Objects in the interpreter are represented using a class <code class="docutils literal">Object</code>, which +stores the object's fields into a Python dictionary. As written above, this is a +simplification: in a real system the <code class="docutils literal">alloc</code> operation might for example take +some kind of type as an argument, that describes which kinds of fields an +object has and how they are laid out in memory, which would allow more +efficient storage of the content. But we don't want to care about this level of +detail in the post, so using a dict in the interpreter is good enough.</p> +</section> +<section id="version-1-naive-attempt"> +<h2>Version 1: Naive Attempt</h2> +<p>In many programs, some allocated objects don't live for very long and have a +completely predictable lifetime. They get allocated, used for a while, and then +there is no way to reference them any more, so the garbage collector will +reclaim them. The very first example block had such an allocation:</p> +<pre class="literal-block">var0 = getarg(0) +obj0 = alloc() +store(obj0, 0, var0) +var1 = load(obj0, 0) +print(var1)</pre> +<p>Here <code class="docutils literal">obj0</code> is written to, then read from, and then it's no longer used. We +want to optimize such programs to remove this <code class="docutils literal">alloc</code> operation. The optimized +version of this program would look like this:</p> +<pre class="literal-block">var0 = getarg(0) +print(var0)</pre> +<p>The <code class="docutils literal">alloc</code>, <code class="docutils literal">store</code> and <code class="docutils literal">load</code> operations have been completely removed.
+This is a pretty important optimization for PyPy's JIT: Allocations, memory +reads and writes are quite costly and occur <em>a lot</em> in Python, so getting rid +of as many of them as possible is instrumental for performance.</p> +<p>Implementing the optimization is not a lot of code! However, understanding all +the corner cases of the +optimization and making sure that the resulting program behaves correctly is not +completely trivial. Therefore we will develop the optimization step by step, in +a test driven fashion: I will start each section with a new test that shows a +bug in the version of the optimization that we have so far.</p> +<p>Let's start in a really naive way. Here's the first test we would like to +pass, using the example program above:</p> +<div class="code"><pre class="code python"><a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-1" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-1"></a><span class="k">def</span> <span class="nf">test_remove_unused_allocation</span><span class="p">():</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-2" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-3" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-4"
name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-4"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-5" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-5"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-6" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-6"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-7" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-7"></a> <span class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var1</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-8" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-8"></a> <span 
class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-9" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-9"></a> <span class="c1"># the virtual object looks like this:</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-10" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-10"></a> <span class="c1"># obj</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-11" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-11"></a> <span class="c1"># ┌──────────┐</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-12" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-12"></a> <span class="c1"># │ 0: var0 │</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-13" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-13"></a> <span class="c1"># └──────────┘</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-14" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-14"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span 
class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-15" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-15"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_7861ee93b7b24a2c9694eb266d65ded5-16" name="rest_code_7861ee93b7b24a2c9694eb266d65ded5-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7861ee93b7b24a2c9694eb266d65ded5-16"></a><span class="s2">optvar1 = print(optvar0)"""</span> +</pre></div> +<p>We will define a class <code class="docutils literal">VirtualObject</code> that is basically identical to +<code class="docutils literal">Object</code> above. But it will not be used by the interpreter, instead we will +use it during optimization.</p> +<div class="code"><pre class="code python"><a id="rest_code_ac03f32ad0a0449495b514e97e81c430-1" name="rest_code_ac03f32ad0a0449495b514e97e81c430-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-1"></a><span class="k">class</span> <span class="nc">VirtualObject</span><span class="p">:</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-2" name="rest_code_ac03f32ad0a0449495b514e97e81c430-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-2"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-3" name="rest_code_ac03f32ad0a0449495b514e97e81c430-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-3"></a> <span class="bp">self</span><span class="o">.</span><span 
class="n">contents</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Value</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-4" name="rest_code_ac03f32ad0a0449495b514e97e81c430-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-4"></a> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-5" name="rest_code_ac03f32ad0a0449495b514e97e81c430-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-5"></a> <span class="k">def</span> <span class="nf">store</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-6" name="rest_code_ac03f32ad0a0449495b514e97e81c430-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-6"></a> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-7" name="rest_code_ac03f32ad0a0449495b514e97e81c430-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-7"></a> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-8" name="rest_code_ac03f32ad0a0449495b514e97e81c430-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-8"></a> <span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span 
class="bp">self</span><span class="p">,</span> <span class="n">idx</span><span class="p">):</span> +<a id="rest_code_ac03f32ad0a0449495b514e97e81c430-9" name="rest_code_ac03f32ad0a0449495b514e97e81c430-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_ac03f32ad0a0449495b514e97e81c430-9"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">contents</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> +</pre></div> +<p>The structure of the optimizer is going to be like that of the optimizers in the first blog post. +The optimizer makes a single pass over all operations. It removes some and +emits others.</p> +<p>This first version of the allocation removal optimizer is going to be extremely +optimistic. It simply assumes that <em>all</em> the allocations in the program can be +optimized away. That is not realistic in practice. We will have to +refine this approach later, but it's a good way to start. That means whenever +the optimizer sees an <code class="docutils literal">alloc</code> operation, it removes it and creates a +<code class="docutils literal">VirtualObject</code> object which stores the information that is known during +optimization about the result of the <code class="docutils literal">alloc</code>. Like in the interpreter, the +<code class="docutils literal">VirtualObject</code> is stored in the <code class="docutils literal">.info</code> field of the <code class="docutils literal">Operation</code> instance +that represents the <code class="docutils literal">alloc</code>.</p> +<p>When the optimizer sees a <code class="docutils literal">store</code> operation, it will also remove it and +instead execute the store by calling the <code class="docutils literal">VirtualObject.store</code> method. 
+Here is one important difference between the interpreter and the optimizer: In +the interpreter, the values that were stored into an <code class="docutils literal">Object</code> (and thus +put into the object's <code class="docutils literal">.contents</code> dictionary) were runtime values, for +example integers or other objects. In the optimizer however, the +fields of the <code class="docutils literal">VirtualObject</code> store <code class="docutils literal">Value</code> instances, either <code class="docutils literal">Constant</code> +instances or <code class="docutils literal">Operation</code> instances.</p> +<p>When the optimizer sees a <code class="docutils literal">load</code> operation, it <em>also</em> removes it, and replaces +the <code class="docutils literal">load</code> with the <code class="docutils literal">Operation</code> (or <code class="docutils literal">Constant</code>) that is stored in the +<code class="docutils literal">VirtualObject</code> at that point:</p> +<div class="code"><pre class="code python"><a id="rest_code_e1d77e0a46db40298289e70f374a23cf-1" name="rest_code_e1d77e0a46db40298289e70f374a23cf-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-1"></a><span class="k">def</span> <span class="nf">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-2" name="rest_code_e1d77e0a46db40298289e70f374a23cf-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-3" name="rest_code_e1d77e0a46db40298289e70f374a23cf-3" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-3"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-4" name="rest_code_e1d77e0a46db40298289e70f374a23cf-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-4"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span class="p">:</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-5" name="rest_code_e1d77e0a46db40298289e70f374a23cf-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-5"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">VirtualObject</span><span class="p">()</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-6" name="rest_code_e1d77e0a46db40298289e70f374a23cf-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-6"></a> <span class="k">continue</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-7" name="rest_code_e1d77e0a46db40298289e70f374a23cf-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-8" name="rest_code_e1d77e0a46db40298289e70f374a23cf-8" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-8"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-9" name="rest_code_e1d77e0a46db40298289e70f374a23cf-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-9"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-10" name="rest_code_e1d77e0a46db40298289e70f374a23cf-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-10"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">field</span><span class="p">))</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-11" name="rest_code_e1d77e0a46db40298289e70f374a23cf-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-11"></a> <span class="k">continue</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-12" name="rest_code_e1d77e0a46db40298289e70f374a23cf-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-12"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a 
id="rest_code_e1d77e0a46db40298289e70f374a23cf-13" name="rest_code_e1d77e0a46db40298289e70f374a23cf-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-13"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-14" name="rest_code_e1d77e0a46db40298289e70f374a23cf-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-14"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-15" name="rest_code_e1d77e0a46db40298289e70f374a23cf-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-15"></a> <span class="n">info</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-16" name="rest_code_e1d77e0a46db40298289e70f374a23cf-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-16"></a> <span class="k">continue</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-17" name="rest_code_e1d77e0a46db40298289e70f374a23cf-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-17"></a> <span class="n">opt_bb</span><span class="o">.</span><span 
class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_e1d77e0a46db40298289e70f374a23cf-18" name="rest_code_e1d77e0a46db40298289e70f374a23cf-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e1d77e0a46db40298289e70f374a23cf-18"></a> <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> +<p>This is the first version of the optimization. It doesn't handle all kinds of +difficult cases, and we'll have to do something about its optimism. +But, already in this minimalistic form, we can write a slightly more complicated +test with two allocations, one object pointing to the other. It works correctly +too, both allocations are removed:</p> +<div class="code"><pre class="code python"><a id="rest_code_c4a730568f38466fa02866676b4b8737-1" name="rest_code_c4a730568f38466fa02866676b4b8737-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-1"></a><span class="k">def</span> <span class="nf">test_remove_two_allocations</span><span class="p">():</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-2" name="rest_code_c4a730568f38466fa02866676b4b8737-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-3" name="rest_code_c4a730568f38466fa02866676b4b8737-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-4" 
name="rest_code_c4a730568f38466fa02866676b4b8737-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-4"></a> <span class="n">obj0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-5" name="rest_code_c4a730568f38466fa02866676b4b8737-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-5"></a> <span class="n">sto1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-6" name="rest_code_c4a730568f38466fa02866676b4b8737-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-6"></a> <span class="n">obj1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-7" name="rest_code_c4a730568f38466fa02866676b4b8737-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-7"></a> <span class="n">sto2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-8" name="rest_code_c4a730568f38466fa02866676b4b8737-8" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-8"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">obj1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-9" name="rest_code_c4a730568f38466fa02866676b4b8737-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-9"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-10" name="rest_code_c4a730568f38466fa02866676b4b8737-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-10"></a> <span class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-11" name="rest_code_c4a730568f38466fa02866676b4b8737-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-11"></a> <span class="c1"># the virtual objects look like this:</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-12" name="rest_code_c4a730568f38466fa02866676b4b8737-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-12"></a> <span class="c1"># obj1</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-13" name="rest_code_c4a730568f38466fa02866676b4b8737-13" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-13"></a> <span class="c1"># ┌──────┐</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-14" name="rest_code_c4a730568f38466fa02866676b4b8737-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-14"></a> <span class="c1"># │ 0: ╷ │</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-15" name="rest_code_c4a730568f38466fa02866676b4b8737-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-15"></a> <span class="c1"># └────┼─┘</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-16" name="rest_code_c4a730568f38466fa02866676b4b8737-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-16"></a> <span class="c1"># │</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-17" name="rest_code_c4a730568f38466fa02866676b4b8737-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-17"></a> <span class="c1"># ▼</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-18" name="rest_code_c4a730568f38466fa02866676b4b8737-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-18"></a> <span class="c1"># obj0</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-19" name="rest_code_c4a730568f38466fa02866676b4b8737-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-19"></a> <span class="c1"># ┌─────────┐</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-20" name="rest_code_c4a730568f38466fa02866676b4b8737-20" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-20"></a> <span class="c1"># │ 0: var0 │</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-21" name="rest_code_c4a730568f38466fa02866676b4b8737-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-21"></a> <span class="c1"># └─────────┘</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-22" name="rest_code_c4a730568f38466fa02866676b4b8737-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-22"></a> <span class="c1"># therefore</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-23" name="rest_code_c4a730568f38466fa02866676b4b8737-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-23"></a> <span class="c1"># var1 is the same as obj0</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-24" name="rest_code_c4a730568f38466fa02866676b4b8737-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-24"></a> <span class="c1"># var2 is the same as var0</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-25" name="rest_code_c4a730568f38466fa02866676b4b8737-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-25"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-26" name="rest_code_c4a730568f38466fa02866676b4b8737-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-26"></a> <span class="k">assert</span> <span 
class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-27" name="rest_code_c4a730568f38466fa02866676b4b8737-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-27"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_c4a730568f38466fa02866676b4b8737-28" name="rest_code_c4a730568f38466fa02866676b4b8737-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c4a730568f38466fa02866676b4b8737-28"></a><span class="s2">optvar1 = print(optvar0)"""</span> +</pre></div> +</section> +<section id="version-2-re-materializing-allocations"> +<h2>Version 2: Re-Materializing Allocations</h2> +<p>To make it easier to talk about how the optimizer operates, let's introduce +some terminology. As already seen by the choice +of the class name <code class="docutils literal">VirtualObject</code>, we will call an object <strong>virtual</strong> if the +optimizer has optimized away the <code class="docutils literal">alloc</code> operation that creates the object. +Other objects are equivalently <strong>not virtual</strong>, for example those that have +existed before we enter the current code block.</p> +<p>The first problem that we need to fix is the assumption that every +allocation can be removed. So far we only looked at small programs where every +allocation could be removed, or equivalently, where every object is virtual. +A program that creates virtual objects, stores into and loads from them, and +then forgets the objects. In this simple case removing the allocations is fine. 
+As we saw in the previous section, it's also fine to have a virtual object +reference another virtual object; both allocations can be removed.</p> +<p>What are the cases where we <em>can't</em> remove an allocation? +The first version of the optimizer simply assumed that every allocation can be +removed. This can't work. We will replace this assumption with the following +simple heuristic:</p> +<p>If a reference to a virtual object <code class="docutils literal">a</code> is stored into an object <code class="docutils literal">b</code> +that is not virtual, then <code class="docutils literal">a</code> will also stop being virtual. If an object <code class="docutils literal">a</code> +that was virtual stops being virtual, we say that it <strong>escapes</strong>. <a class="reference internal" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#target-4">¹</a></p> +<p>The simplest test case for this happening looks like this:</p> +<div class="code"><pre class="code python"><a id="rest_code_0c257544406048c9853180429a9d35a8-1" name="rest_code_0c257544406048c9853180429a9d35a8-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-1"></a><span class="k">def</span> <span class="nf">test_materialize</span><span class="p">():</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-2" name="rest_code_0c257544406048c9853180429a9d35a8-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-3" name="rest_code_0c257544406048c9853180429a9d35a8-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span 
class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-4" name="rest_code_0c257544406048c9853180429a9d35a8-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-4"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-5" name="rest_code_0c257544406048c9853180429a9d35a8-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-5"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-6" name="rest_code_0c257544406048c9853180429a9d35a8-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-6"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-7" name="rest_code_0c257544406048c9853180429a9d35a8-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-7"></a> <span class="c1"># obj is virtual, without any fields</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-8" name="rest_code_0c257544406048c9853180429a9d35a8-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-8"></a> 
<span class="c1"># ┌───────┐</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-9" name="rest_code_0c257544406048c9853180429a9d35a8-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-9"></a> <span class="c1"># │ empty │</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-10" name="rest_code_0c257544406048c9853180429a9d35a8-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-10"></a> <span class="c1"># └───────┘</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-11" name="rest_code_0c257544406048c9853180429a9d35a8-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-11"></a> <span class="c1"># then we store a reference to obj into</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-12" name="rest_code_0c257544406048c9853180429a9d35a8-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-12"></a> <span class="c1"># field 0 of var0. 
Since var0 is not virtual,</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-13" name="rest_code_0c257544406048c9853180429a9d35a8-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-13"></a> <span class="c1"># obj escapes, so we have to put it back</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-14" name="rest_code_0c257544406048c9853180429a9d35a8-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-14"></a> <span class="c1"># into the optimized basic block</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-15" name="rest_code_0c257544406048c9853180429a9d35a8-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-15"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-16" name="rest_code_0c257544406048c9853180429a9d35a8-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-16"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-17" name="rest_code_0c257544406048c9853180429a9d35a8-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-17"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-18" name="rest_code_0c257544406048c9853180429a9d35a8-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-18"></a><span class="s2">optvar2 = store(optvar0, 0, 
optvar1)"""</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-19" name="rest_code_0c257544406048c9853180429a9d35a8-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-19"></a> <span class="c1"># so far, fails like this:</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-20" name="rest_code_0c257544406048c9853180429a9d35a8-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-20"></a> <span class="c1"># the line:</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-21" name="rest_code_0c257544406048c9853180429a9d35a8-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-21"></a> <span class="c1"># info.store(field, op.arg(2))</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-22" name="rest_code_0c257544406048c9853180429a9d35a8-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-22"></a> <span class="c1"># produces an AttributeError because info</span> +<a id="rest_code_0c257544406048c9853180429a9d35a8-23" name="rest_code_0c257544406048c9853180429a9d35a8-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_0c257544406048c9853180429a9d35a8-23"></a> <span class="c1"># is None</span> +</pre></div> +<p>If the optimizer reaches a point where a virtual object escapes (like the +<code class="docutils literal">store</code> operation in the test), the optimizer has already removed the <code class="docutils literal">alloc</code> +operation that created the virtual object. If the object escapes, we don't want +to go back in the operations list and re-insert the <code class="docutils literal">alloc</code> operation, that +sounds potentially very complicated. 
Instead, we re-insert the <code class="docutils literal">alloc</code> +operation that will recreate the virtual object at the point of escape using a +helper function <code class="docutils literal">materialize</code>.</p> +<div class="code"><pre class="code python"><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-1" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-1"></a><span class="hll"><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-2" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-2"></a><span class="hll"> <span class="k">assert</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-3" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-3"></a><span class="hll"> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-4" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-4" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-4"></a><span class="hll"> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-5" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-5"></a><span class="hll"> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">info</span><span class="p">,</span> <span class="n">VirtualObject</span><span class="p">)</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-6" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-6"></a><span class="hll"> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-7" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-7"></a><span class="hll"> <span class="c1"># put the alloc operation back into the trace</span> +</span><a id="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-8" name="rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_e9d7ee3ad1a7422f81a27a1c3a1b1466-8"></a><span class="hll"> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +</span></pre></div> +<p>I've added a number of fairly strong assertions to 
<code class="docutils literal">materialize</code> to encode our +current assumptions about the situations in which it expects to be called. We +will remove some of them later as we generalize the code.</p> +<p>Now that we have <code class="docutils literal">materialize</code> we need to change <code class="docutils literal">optimize_alloc_removal</code> to +recognize the case of storing a virtual object into a non-virtual one. We can +recognize <code class="docutils literal">Operation</code> instances that produced a virtual object by looking at +their <code class="docutils literal">.info</code> field. If it is <code class="docutils literal">None</code>, the object is not virtual, otherwise +it is. If we store something into a virtual object, we leave the code as above. +If we store a virtual object into an object that is not virtual, we will first +materialize the virtual object, and then emit the store.</p> +<div class="code"><pre class="code python"><a id="rest_code_5fe65fac17ce4da58592318a00455537-1" name="rest_code_5fe65fac17ce4da58592318a00455537-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-1"></a><span class="k">def</span> <span class="nf">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-2" name="rest_code_5fe65fac17ce4da58592318a00455537-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-3" name="rest_code_5fe65fac17ce4da58592318a00455537-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-3"></a> <span class="k">for</span> <span class="n">op</span> <span 
class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-4" name="rest_code_5fe65fac17ce4da58592318a00455537-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-4"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span class="p">:</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-5" name="rest_code_5fe65fac17ce4da58592318a00455537-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-5"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">VirtualObject</span><span class="p">()</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-6" name="rest_code_5fe65fac17ce4da58592318a00455537-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-6"></a> <span class="k">continue</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-7" name="rest_code_5fe65fac17ce4da58592318a00455537-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-8" name="rest_code_5fe65fac17ce4da58592318a00455537-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-8"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span 
class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-9" name="rest_code_5fe65fac17ce4da58592318a00455537-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-9"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-10" name="rest_code_5fe65fac17ce4da58592318a00455537-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-10"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">field</span><span class="p">))</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-11" name="rest_code_5fe65fac17ce4da58592318a00455537-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-11"></a> <span class="k">continue</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-12" name="rest_code_5fe65fac17ce4da58592318a00455537-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-12"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-13" name="rest_code_5fe65fac17ce4da58592318a00455537-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-13"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span 
class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-14" name="rest_code_5fe65fac17ce4da58592318a00455537-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-14"></a><span class="hll"> <span class="k">if</span> <span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-15" name="rest_code_5fe65fac17ce4da58592318a00455537-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-15"></a><span class="hll"> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-16" name="rest_code_5fe65fac17ce4da58592318a00455537-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-16"></a><span class="hll"> <span class="n">info</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-17" name="rest_code_5fe65fac17ce4da58592318a00455537-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-17"></a><span class="hll"> <span class="k">continue</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-18" name="rest_code_5fe65fac17ce4da58592318a00455537-18" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-18"></a><span class="hll"> <span class="k">else</span><span class="p">:</span> <span class="c1"># not virtual</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-19" name="rest_code_5fe65fac17ce4da58592318a00455537-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-19"></a><span class="hll"> <span class="c1"># first materialize the</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-20" name="rest_code_5fe65fac17ce4da58592318a00455537-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-20"></a><span class="hll"> <span class="c1"># right hand side</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-21" name="rest_code_5fe65fac17ce4da58592318a00455537-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-21"></a><span class="hll"> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-22" name="rest_code_5fe65fac17ce4da58592318a00455537-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-22"></a><span class="hll"> <span class="c1"># then emit the store via</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-23" name="rest_code_5fe65fac17ce4da58592318a00455537-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-23"></a><span class="hll"> <span class="c1"># the general path 
below</span> +</span><a id="rest_code_5fe65fac17ce4da58592318a00455537-24" name="rest_code_5fe65fac17ce4da58592318a00455537-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-24"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_5fe65fac17ce4da58592318a00455537-25" name="rest_code_5fe65fac17ce4da58592318a00455537-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5fe65fac17ce4da58592318a00455537-25"></a> <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> +<p>This is the general idea, and it is enough to pass <code class="docutils literal">test_materialize</code>. But of +course there are still a number of further problems that we now need to solve.</p> +</section> +<section id="version-3-don-t-materialize-twice"> +<h2>Version 3: Don't Materialize Twice</h2> +<p>The first problem is the fact that after we materialize a virtual object, it is +no longer virtual. So if it escapes a second time, it should <em>not</em> be +materialized a second time. 
A test for that case could simply repeat the +<code class="docutils literal">store</code> operation:</p> +<div class="code"><pre class="code python"><a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-1" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-1"></a><span class="k">def</span> <span class="nf">test_dont_materialize_twice</span><span class="p">():</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-2" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-2"></a> <span class="c1"># obj is again an empty virtual object,</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-3" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-3"></a> <span class="c1"># and we store it into var0 *twice*.</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-4" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-4"></a> <span class="c1"># this should only materialize it once</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-5" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-5"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-6" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-6"></a> <span class="n">var0</span> <span 
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-7" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-7"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-8" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-8"></a> <span class="n">sto0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-9" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-9"></a> <span class="n">sto1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-10" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-10"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span 
class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-11" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-11"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-12" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-12"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-13" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-13"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-14" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-14"></a><span class="s2">optvar2 = store(optvar0, 0, optvar1)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-15" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-15"></a><span class="s2">optvar3 = store(optvar0, 0, optvar1)"""</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-16" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-16"></a> <span 
class="c1"># fails so far: the operations that we get</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-17" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-17"></a> <span class="c1"># at the moment are:</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-18" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-18"></a> <span class="c1"># optvar0 = getarg(0)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-19" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-19"></a> <span class="c1"># optvar1 = alloc()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-20" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-20"></a> <span class="c1"># optvar2 = store(optvar0, 0, optvar1)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-21" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-21"></a> <span class="c1"># optvar3 = alloc()</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-22" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-22"></a> <span class="c1"># optvar4 = store(optvar0, 0, optvar3)</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-23" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-23" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-23"></a> <span class="c1"># ie the object is materialized twice,</span> +<a id="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-24" name="rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_078d6f2e53ce48fa9bfa0bdb049a4bae-24"></a> <span class="c1"># which is incorrect</span> +</pre></div> +<p>We solve the problem by setting the <code class="docutils literal">.info</code> field of an object that we +materialize to <code class="docutils literal">None</code> to mark it as no longer being virtual.</p> +<div class="code"><pre class="code python"><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-1" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-2" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-2"></a> <span class="k">assert</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-3" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-3" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-3"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-4" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-4"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-5" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-5"></a><span class="hll"> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-6" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-6"></a><span class="hll"> <span class="k">return</span> <span class="c1"># already materialized</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-7" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-7"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-8" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-8" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-8"></a><span class="hll"> <span class="c1"># put the alloc operation back into the trace</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-9" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-9"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-10" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-10"></a><span class="hll"> <span class="c1"># but only once</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-11" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-11"></a><span class="hll"> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +</span><a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-12" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-12"></a> +<a id="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-13" name="rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_5a0ccaa034e54ffe9f2eea09a15e056c-13"></a><span class="c1"># optimize_alloc_removal unchanged</span> +</pre></div> +<p>This fixes the problem: only one <code class="docutils literal">alloc</code> is created. 
This fix also allows +another test case to pass, one where we store a non-virtual into another +non-virtual, code which we cannot optimize at all:</p> +<div class="code"><pre class="code python"><a id="rest_code_205bbe2ab59241609c95782a0781cd2c-1" name="rest_code_205bbe2ab59241609c95782a0781cd2c-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-1"></a><span class="k">def</span> <span class="nf">test_materialize_non_virtuals</span><span class="p">():</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-2" name="rest_code_205bbe2ab59241609c95782a0781cd2c-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-2"></a> <span class="c1"># in this example we store a non-virtual var1</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-3" name="rest_code_205bbe2ab59241609c95782a0781cd2c-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-3"></a> <span class="c1"># into another non-virtual var0</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-4" name="rest_code_205bbe2ab59241609c95782a0781cd2c-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-4"></a> <span class="c1"># this should just lead to no optimization at</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-5" name="rest_code_205bbe2ab59241609c95782a0781cd2c-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-5"></a> <span class="c1"># all</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-6" name="rest_code_205bbe2ab59241609c95782a0781cd2c-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-6"></a> <span class="n">bb</span> <span class="o">=</span> <span 
class="n">Block</span><span class="p">()</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-7" name="rest_code_205bbe2ab59241609c95782a0781cd2c-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-7"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-8" name="rest_code_205bbe2ab59241609c95782a0781cd2c-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-8"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-9" name="rest_code_205bbe2ab59241609c95782a0781cd2c-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-9"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-10" name="rest_code_205bbe2ab59241609c95782a0781cd2c-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-10"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-11" name="rest_code_205bbe2ab59241609c95782a0781cd2c-11" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-11"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-12" name="rest_code_205bbe2ab59241609c95782a0781cd2c-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-12"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-13" name="rest_code_205bbe2ab59241609c95782a0781cd2c-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-13"></a><span class="s2">optvar1 = getarg(1)</span> +<a id="rest_code_205bbe2ab59241609c95782a0781cd2c-14" name="rest_code_205bbe2ab59241609c95782a0781cd2c-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_205bbe2ab59241609c95782a0781cd2c-14"></a><span class="s2">optvar2 = store(optvar0, 0, optvar1)"""</span> +</pre></div> +</section> +<section id="version-4-materialization-of-constants"> +<h2>Version 4: Materialization of Constants</h2> +<p>Another straightforward extension is to support materializing constants. 
A +constant is never virtual, so materializing it should do nothing.</p> +<div class="code"><pre class="code python"><a id="rest_code_b709144ffac344d1ba11ab5b097883f0-1" name="rest_code_b709144ffac344d1ba11ab5b097883f0-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-1"></a><span class="k">def</span> <span class="nf">test_materialization_constants</span><span class="p">():</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-2" name="rest_code_b709144ffac344d1ba11ab5b097883f0-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-2"></a> <span class="c1"># in this example we store the constant 17</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-3" name="rest_code_b709144ffac344d1ba11ab5b097883f0-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-3"></a> <span class="c1"># into the non-virtual var0</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-4" name="rest_code_b709144ffac344d1ba11ab5b097883f0-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-4"></a> <span class="c1"># again, this will not be optimized</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-5" name="rest_code_b709144ffac344d1ba11ab5b097883f0-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-5"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-6" name="rest_code_b709144ffac344d1ba11ab5b097883f0-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-6"></a> <span class="n">var0</span> <span class="o">=</span> <span 
class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-7" name="rest_code_b709144ffac344d1ba11ab5b097883f0-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-7"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-8" name="rest_code_b709144ffac344d1ba11ab5b097883f0-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-8"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-9" name="rest_code_b709144ffac344d1ba11ab5b097883f0-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-9"></a> <span class="c1"># the previous line fails so far, triggering</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-10" name="rest_code_b709144ffac344d1ba11ab5b097883f0-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-10"></a> <span class="c1"># the assert:</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-11" name="rest_code_b709144ffac344d1ba11ab5b097883f0-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-11"></a> <span class="c1"># assert not isinstance(value, Constant)</span> +<a 
id="rest_code_b709144ffac344d1ba11ab5b097883f0-12" name="rest_code_b709144ffac344d1ba11ab5b097883f0-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-12"></a> <span class="c1"># in materialize</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-13" name="rest_code_b709144ffac344d1ba11ab5b097883f0-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-13"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-14" name="rest_code_b709144ffac344d1ba11ab5b097883f0-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-14"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_b709144ffac344d1ba11ab5b097883f0-15" name="rest_code_b709144ffac344d1ba11ab5b097883f0-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_b709144ffac344d1ba11ab5b097883f0-15"></a><span class="s2">optvar1 = store(optvar0, 0, 17)"""</span> +</pre></div> +<p>To implement that case, we check for <code class="docutils literal">value</code> being a constant and return +early:</p> +<div class="code"><pre class="code python"><a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-1" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span 
class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-2" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-2"></a><span class="hll"> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +</span><a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-3" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-3"></a><span class="hll"> <span class="k">return</span> +</span><a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-4" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-4"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-5" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-5"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-6" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-6"></a> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> 
<span class="kc">None</span><span class="p">:</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-7" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-7"></a> <span class="k">return</span> <span class="c1"># already materialized</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-8" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-8"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-9" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-9"></a> <span class="c1"># put the alloc operation back into the trace</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-10" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-10"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-11" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-11"></a> <span class="c1"># but only once</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-12" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-12"></a> <span 
class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-13" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-13"></a> +<a id="rest_code_c7e8dca37bae45e3aae461ef85046c6f-14" name="rest_code_c7e8dca37bae45e3aae461ef85046c6f-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_c7e8dca37bae45e3aae461ef85046c6f-14"></a><span class="c1"># optimize_alloc_removal unchanged</span> +</pre></div> +</section> +<section id="version-5-materializing-fields"> +<h2>Version 5: Materializing Fields</h2> +<p>Now we need to solve a more difficult problem. So far, the virtual objects that +we have materialized have all been empty, meaning they didn't have any fields +written to at the point of materialization. Let's write a test for a virtual object that does have fields written to it when it is materialized:</p> +<div class="code"><pre class="code python"><a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-1" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-1"></a><span class="k">def</span> <span class="nf">test_materialize_fields</span><span class="p">():</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-2" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-3" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-3"></a> <span 
class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-4" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-4"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-5" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-5"></a> <span class="n">obj</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-6" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-6"></a> <span class="n">contents0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-7" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-7"></a> <span class="n">contents1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span 
class="n">obj</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-8" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-8"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-9" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-9"></a> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-10" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-10"></a> <span class="c1"># the virtual obj looks like this</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-11" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-11"></a> <span class="c1"># obj</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-12" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-12"></a> <span class="c1"># ┌──────┬──────────┐</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-13" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-13" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-13"></a> <span class="c1"># │ 0: 8 │ 1: var1 │</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-14" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-14"></a> <span class="c1"># └──────┴──────────┘</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-15" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-15"></a> <span class="c1"># then it needs to be materialized</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-16" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-16"></a> <span class="c1"># this is the first example where a virtual</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-17" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-17"></a> <span class="c1"># object that we want to materialize has any</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-18" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-18"></a> <span class="c1"># content and is not just an empty object</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-19" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-19"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span 
class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-20" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-20"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-21" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-21"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-22" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-22"></a><span class="s2">optvar1 = getarg(1)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-23" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-23"></a><span class="s2">optvar2 = alloc()</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-24" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-24"></a><span class="s2">optvar3 = store(optvar2, 0, 8)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-25" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-25" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-25"></a><span class="s2">optvar4 = store(optvar2, 1, optvar1)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-26" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-26"></a><span class="s2">optvar5 = store(optvar0, 0, optvar2)"""</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-27" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-27"></a> <span class="c1"># fails so far! the operations we get</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-28" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-28"></a> <span class="c1"># at the moment are:</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-29" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-29"></a> <span class="c1"># optvar0 = getarg(0)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-30" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-30"></a> <span class="c1"># optvar1 = getarg(1)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-31" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-31"></a> <span class="c1"># optvar2 = alloc()</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-32" 
name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-32" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-32"></a> <span class="c1"># optvar3 = store(optvar0, 0, optvar2)</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-33" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-33"></a> <span class="c1"># which is wrong, because the store operations</span> +<a id="rest_code_fbc3fcddd68741a38bb88dc5981923f9-34" name="rest_code_fbc3fcddd68741a38bb88dc5981923f9-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_fbc3fcddd68741a38bb88dc5981923f9-34"></a> <span class="c1"># into optvar2 got lost</span> +</pre></div> +<p>To fix this problem, we need to re-create a <code class="docutils literal">store</code> operation for every +element of the <code class="docutils literal">.contents</code> dictionary of the virtual object we are +materializing. 
<a class="reference internal" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#target-5">²</a></p> +<div class="code"><pre class="code python"><a id="rest_code_3903547c0616440380ad3221ad822e36-1" name="rest_code_3903547c0616440380ad3221ad822e36-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-2" name="rest_code_3903547c0616440380ad3221ad822e36-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-2"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-3" name="rest_code_3903547c0616440380ad3221ad822e36-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-3"></a> <span class="k">return</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-4" name="rest_code_3903547c0616440380ad3221ad822e36-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-4"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-5" name="rest_code_3903547c0616440380ad3221ad822e36-5" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-5"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-6" name="rest_code_3903547c0616440380ad3221ad822e36-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-6"></a> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-7" name="rest_code_3903547c0616440380ad3221ad822e36-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-7"></a> <span class="k">return</span> <span class="c1"># already materialized</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-8" name="rest_code_3903547c0616440380ad3221ad822e36-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-8"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-9" name="rest_code_3903547c0616440380ad3221ad822e36-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-9"></a> <span class="c1"># put the alloc operation back into the trace</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-10" name="rest_code_3903547c0616440380ad3221ad822e36-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-10"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span 
class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-11" name="rest_code_3903547c0616440380ad3221ad822e36-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-11"></a><span class="hll"> <span class="c1"># put the content back</span> +</span><a id="rest_code_3903547c0616440380ad3221ad822e36-12" name="rest_code_3903547c0616440380ad3221ad822e36-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-12"></a><span class="hll"> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="n">info</span><span class="o">.</span><span class="n">contents</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> +</span><a id="rest_code_3903547c0616440380ad3221ad822e36-13" name="rest_code_3903547c0616440380ad3221ad822e36-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-13"></a><span class="hll"> <span class="c1"># re-create store operation</span> +</span><a id="rest_code_3903547c0616440380ad3221ad822e36-14" name="rest_code_3903547c0616440380ad3221ad822e36-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-14"></a><span class="hll"> <span class="n">opt_bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +</span><a id="rest_code_3903547c0616440380ad3221ad822e36-15" name="rest_code_3903547c0616440380ad3221ad822e36-15" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-15"></a> <span class="c1"># only materialize once</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-16" name="rest_code_3903547c0616440380ad3221ad822e36-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-16"></a> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_3903547c0616440380ad3221ad822e36-17" name="rest_code_3903547c0616440380ad3221ad822e36-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-17"></a> +<a id="rest_code_3903547c0616440380ad3221ad822e36-18" name="rest_code_3903547c0616440380ad3221ad822e36-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3903547c0616440380ad3221ad822e36-18"></a><span class="c1"># optimize_alloc_removal unchanged</span> +</pre></div> +<p>This is enough to pass the test.</p> +</section> +<section id="version-6-recursive-materialization"> +<h2>Version 6: Recursive Materialization</h2> +<p>In the above example, the fields of the virtual objects contained +only constants or non-virtual objects. However, we could have a situation where +a whole tree of virtual objects is built, and then the root of the tree escapes. +This makes it necessary to materialize the whole tree. 
Let's write a test for a small +tree of two virtual objects:</p> +<div class="code"><pre class="code python"><a id="rest_code_f4090cb2279842e1b178f2d23ac47659-1" name="rest_code_f4090cb2279842e1b178f2d23ac47659-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-1"></a><span class="k">def</span> <span class="nf">test_materialize_chained_objects</span><span class="p">():</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-2" name="rest_code_f4090cb2279842e1b178f2d23ac47659-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-3" name="rest_code_f4090cb2279842e1b178f2d23ac47659-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-4" name="rest_code_f4090cb2279842e1b178f2d23ac47659-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-4"></a> <span class="n">obj0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-5" name="rest_code_f4090cb2279842e1b178f2d23ac47659-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-5"></a> <span class="n">obj1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span 
class="p">()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-6" name="rest_code_f4090cb2279842e1b178f2d23ac47659-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-6"></a> <span class="n">contents</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj1</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-7" name="rest_code_f4090cb2279842e1b178f2d23ac47659-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-7"></a> <span class="n">const</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">obj1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1337</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-8" name="rest_code_f4090cb2279842e1b178f2d23ac47659-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-8"></a> <span class="n">sto</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">obj0</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-9" name="rest_code_f4090cb2279842e1b178f2d23ac47659-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-9"></a> <span class="c1"># obj0</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-10" 
name="rest_code_f4090cb2279842e1b178f2d23ac47659-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-10"></a> <span class="c1"># ┌──────┐</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-11" name="rest_code_f4090cb2279842e1b178f2d23ac47659-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-11"></a> <span class="c1"># │ 0: ╷ │</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-12" name="rest_code_f4090cb2279842e1b178f2d23ac47659-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-12"></a> <span class="c1"># └────┼─┘</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-13" name="rest_code_f4090cb2279842e1b178f2d23ac47659-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-13"></a> <span class="c1"># │</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-14" name="rest_code_f4090cb2279842e1b178f2d23ac47659-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-14"></a> <span class="c1"># ▼</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-15" name="rest_code_f4090cb2279842e1b178f2d23ac47659-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-15"></a> <span class="c1"># obj1</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-16" name="rest_code_f4090cb2279842e1b178f2d23ac47659-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-16"></a> <span class="c1"># ┌─────────┐</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-17" name="rest_code_f4090cb2279842e1b178f2d23ac47659-17" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-17"></a> <span class="c1"># │ 0: 1337 │</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-18" name="rest_code_f4090cb2279842e1b178f2d23ac47659-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-18"></a> <span class="c1"># └─────────┘</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-19" name="rest_code_f4090cb2279842e1b178f2d23ac47659-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-19"></a> <span class="c1"># now obj0 escapes</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-20" name="rest_code_f4090cb2279842e1b178f2d23ac47659-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-20"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-21" name="rest_code_f4090cb2279842e1b178f2d23ac47659-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-21"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-22" name="rest_code_f4090cb2279842e1b178f2d23ac47659-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-22"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-23" 
name="rest_code_f4090cb2279842e1b178f2d23ac47659-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-23"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-24" name="rest_code_f4090cb2279842e1b178f2d23ac47659-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-24"></a><span class="s2">optvar2 = alloc()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-25" name="rest_code_f4090cb2279842e1b178f2d23ac47659-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-25"></a><span class="s2">optvar3 = store(optvar2, 0, 1337)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-26" name="rest_code_f4090cb2279842e1b178f2d23ac47659-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-26"></a><span class="s2">optvar4 = store(optvar1, 0, optvar2)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-27" name="rest_code_f4090cb2279842e1b178f2d23ac47659-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-27"></a><span class="s2">optvar5 = store(optvar0, 0, optvar1)"""</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-28" name="rest_code_f4090cb2279842e1b178f2d23ac47659-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-28"></a> <span class="c1"># fails in an annoying way! 
the resulting</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-29" name="rest_code_f4090cb2279842e1b178f2d23ac47659-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-29"></a> <span class="c1"># basic block is not in proper SSA form</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-30" name="rest_code_f4090cb2279842e1b178f2d23ac47659-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-30"></a> <span class="c1"># so printing it fails. The optimized</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-31" name="rest_code_f4090cb2279842e1b178f2d23ac47659-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-31"></a> <span class="c1"># block would look like this:</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-32" name="rest_code_f4090cb2279842e1b178f2d23ac47659-32" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-32"></a> <span class="c1"># optvar0 = getarg(0)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-33" name="rest_code_f4090cb2279842e1b178f2d23ac47659-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-33"></a> <span class="c1"># optvar1 = alloc()</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-34" name="rest_code_f4090cb2279842e1b178f2d23ac47659-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-34"></a> <span class="c1"># optvar3 = store(optvar1, 0, optvar2)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-35" name="rest_code_f4090cb2279842e1b178f2d23ac47659-35" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-35"></a> <span class="c1"># optvar4 = store(optvar0, 0, optvar1)</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-36" name="rest_code_f4090cb2279842e1b178f2d23ac47659-36" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-36"></a> <span class="c1"># where optvar2 is an ``alloc`` Operation</span> +<a id="rest_code_f4090cb2279842e1b178f2d23ac47659-37" name="rest_code_f4090cb2279842e1b178f2d23ac47659-37" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_f4090cb2279842e1b178f2d23ac47659-37"></a> <span class="c1"># that is not itself in the output block</span> +</pre></div> +<p>To fix it, <code class="docutils literal">materialize</code> needs to call itself recursively for all the field +values of the virtual object:</p> +<div class="code"><pre class="code python"><a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-1" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-2" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-2"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span 
class="p">):</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-3" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-3"></a> <span class="k">return</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-4" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-4"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-5" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-5"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-6" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-6"></a> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-7" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-7"></a> <span class="k">return</span> <span class="c1"># already materialized</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-8" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-8" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-8"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-9" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-9"></a> <span class="c1"># put the alloc operation back into the trace</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-10" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-10"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-11" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-11"></a> <span class="c1"># put the content back</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-12" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-12"></a> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">contents</span><span class="o">.</span><span class="n">items</span><span class="p">()):</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-13" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-13" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-13"></a><span class="hll"> <span class="c1"># materialize recursively</span> +</span><a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-14" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-14"></a><span class="hll"> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +</span><a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-15" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-15"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-16" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-16"></a> <span class="c1"># only materialize once</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-17" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-17"></a> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-18" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-18" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-18"></a> +<a id="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-19" name="rest_code_89139e94fb0a484da178cf4cdcdc7e6a-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_89139e94fb0a484da178cf4cdcdc7e6a-19"></a><span class="c1"># optimize_alloc_removal unchanged</span> +</pre></div> +<p>Getting there, the materialization logic is almost done. We need to fix a +subtle remaining problem though.</p> +</section> +<section id="version-7-dealing-with-object-cycles"> +<h2>Version 7: Dealing with Object Cycles</h2> +<p>The bug we need to fix in this section is a bit tricky, and does not immediately +occur in a lot of programs. In +fact, in PyPy a variant of it was hiding out in our optimizer +until we found it much later (despite us being aware of the general problem and +correctly dealing with it in other cases).</p> +<p>The problem is this: a virtual object can (directly or indirectly) point to +itself, and we must carefully deal with that case to avoid infinite recursion in +<code class="docutils literal">materialize</code>. 
Here's the simplest test:</p> +<div class="code"><pre class="code python"><a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-1" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-1"></a><span class="k">def</span> <span class="nf">test_object_graph_cycles</span><span class="p">():</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-2" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-3" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-4" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-4"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-5" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-5"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span 
class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-6" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-6"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-7" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-7"></a> <span class="c1"># ┌────────┐</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-8" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-8"></a> <span class="c1"># ▼ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-9" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-9"></a> <span class="c1"># obj0 │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-10" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-10"></a> <span class="c1"># ┌──────┐ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-11" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-11" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-11"></a> <span class="c1"># │ 0: ╷ │ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-12" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-12"></a> <span class="c1"># └────┼─┘ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-13" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-13"></a> <span class="c1"># │ │</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-14" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-14"></a> <span class="c1"># └─────┘</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-15" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-15"></a> <span class="c1"># obj0 points to itself, and then it is</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-16" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-16"></a> <span class="c1"># escaped</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-17" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-17"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a 
id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-18" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-18"></a> <span class="c1"># the previous line fails with a</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-19" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-19"></a> <span class="c1"># RecursionError</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-20" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-20"></a> <span class="c1"># materialize calls itself, infinitely</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-21" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-21"></a> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-22" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-22"></a> <span class="c1"># what we want is instead this output:</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-23" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-23"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-24" 
name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-24"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-25" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-25"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-26" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-26"></a><span class="s2">optvar2 = store(optvar1, 0, optvar1)</span> +<a id="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-27" name="rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_14cfb68b67424d1ea9623fffbe8cd9e3-27"></a><span class="s2">optvar3 = store(optvar0, 1, optvar1)"""</span> +</pre></div> +<p>The fix is not a big change, but a little bit subtle nevertheless. +We have to change the +order in which things are done in <code class="docutils literal">materialize</code>. Right after emitting the +<code class="docutils literal">alloc</code>, we set the <code class="docutils literal">.info</code> to <code class="docutils literal">None</code>, to mark the object as not virtual. +Only <em>afterwards</em> do we re-create the stores and call <code class="docutils literal">materialize</code> recursively. 
+If a recursive call reaches the same object, it's already marked as non-virtual, +so <code class="docutils literal">materialize</code> won't recurse further:</p> +<div class="code"><pre class="code python"><a id="rest_code_7ed667c4854348719d115ecce0edcf63-1" name="rest_code_7ed667c4854348719d115ecce0edcf63-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-1"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Operation</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-2" name="rest_code_7ed667c4854348719d115ecce0edcf63-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-2"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-3" name="rest_code_7ed667c4854348719d115ecce0edcf63-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-3"></a> <span class="k">return</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-4" name="rest_code_7ed667c4854348719d115ecce0edcf63-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-4"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-5" 
name="rest_code_7ed667c4854348719d115ecce0edcf63-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-5"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-6" name="rest_code_7ed667c4854348719d115ecce0edcf63-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-6"></a> <span class="k">if</span> <span class="n">info</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-7" name="rest_code_7ed667c4854348719d115ecce0edcf63-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-7"></a> <span class="k">return</span> <span class="c1"># already materialized</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-8" name="rest_code_7ed667c4854348719d115ecce0edcf63-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-8"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-9" name="rest_code_7ed667c4854348719d115ecce0edcf63-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-9"></a> <span class="c1"># put the alloc operation back into the trace</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-10" name="rest_code_7ed667c4854348719d115ecce0edcf63-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-10"></a> <span class="n">opt_bb</span><span 
class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_7ed667c4854348719d115ecce0edcf63-11" name="rest_code_7ed667c4854348719d115ecce0edcf63-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-11"></a><span class="hll"> <span class="c1"># only materialize once</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-12" name="rest_code_7ed667c4854348719d115ecce0edcf63-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-12"></a><span class="hll"> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-13" name="rest_code_7ed667c4854348719d115ecce0edcf63-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-13"></a><span class="hll"> <span class="c1"># put the content back</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-14" name="rest_code_7ed667c4854348719d115ecce0edcf63-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-14"></a><span class="hll"> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">contents</span><span class="o">.</span><span class="n">items</span><span class="p">()):</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-15" name="rest_code_7ed667c4854348719d115ecce0edcf63-15" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-15"></a><span class="hll"> <span class="c1"># materialize recursively</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-16" name="rest_code_7ed667c4854348719d115ecce0edcf63-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-16"></a><span class="hll"> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +</span><a id="rest_code_7ed667c4854348719d115ecce0edcf63-17" name="rest_code_7ed667c4854348719d115ecce0edcf63-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_7ed667c4854348719d115ecce0edcf63-17"></a><span class="hll"> <span class="n">opt_bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +</span></pre></div> +</section> +<section id="version-8-loading-from-non-virtual-objects"> +<h2>Version 8: Loading from non-virtual objects</h2> +<p>Now materialize is done. We need to go back to <code class="docutils literal">optimize_alloc_removal</code> and +improve it further. The last time we changed it, we added a case analysis to the +code dealing with <code class="docutils literal">store</code>, distinguishing between storing to a virtual and to +a non-virtual object. 
We need to add an equivalent distinction to the <code class="docutils literal">load</code> +case, because right now loading from a non-virtual crashes.</p> +<div class="code"><pre class="code python"><a id="rest_code_3b225716a45245fb930b2e4ec0343836-1" name="rest_code_3b225716a45245fb930b2e4ec0343836-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-1"></a><span class="k">def</span> <span class="nf">test_load_non_virtual</span><span class="p">():</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-2" name="rest_code_3b225716a45245fb930b2e4ec0343836-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-3" name="rest_code_3b225716a45245fb930b2e4ec0343836-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-4" name="rest_code_3b225716a45245fb930b2e4ec0343836-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-4"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-5" name="rest_code_3b225716a45245fb930b2e4ec0343836-5" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-5"></a> <span class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var1</span><span class="p">)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-6" name="rest_code_3b225716a45245fb930b2e4ec0343836-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-6"></a> <span class="c1"># the next line fails in the line</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-7" name="rest_code_3b225716a45245fb930b2e4ec0343836-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-7"></a> <span class="c1"># op.make_equal_to(info.load(field))</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-8" name="rest_code_3b225716a45245fb930b2e4ec0343836-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-8"></a> <span class="c1"># because info is None</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-9" name="rest_code_3b225716a45245fb930b2e4ec0343836-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-9"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-10" name="rest_code_3b225716a45245fb930b2e4ec0343836-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-10"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span 
class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-11" name="rest_code_3b225716a45245fb930b2e4ec0343836-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-11"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-12" name="rest_code_3b225716a45245fb930b2e4ec0343836-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-12"></a><span class="s2">optvar1 = load(optvar0, 0)</span> +<a id="rest_code_3b225716a45245fb930b2e4ec0343836-13" name="rest_code_3b225716a45245fb930b2e4ec0343836-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_3b225716a45245fb930b2e4ec0343836-13"></a><span class="s2">optvar2 = print(optvar1)"""</span> +</pre></div> +<p>To fix it, we split the <code class="docutils literal">load</code> code into two cases, leaving the virtual path +as before, and letting the <code class="docutils literal">load</code> from a non-virtual fall through to the +general code at the end of the function.</p> +<div class="code"><pre class="code python"><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-1" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-1"></a><span class="k">def</span> <span class="nf">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-2" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span 
class="p">()</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-3" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-3"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-4" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-4"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span class="p">:</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-5" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-5"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">VirtualObject</span><span class="p">()</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-6" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-6"></a> <span class="k">continue</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-7" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-8" 
name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-8"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-9" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-9"></a><span class="hll"> <span class="k">if</span> <span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-10" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-10"></a><span class="hll"> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-11" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-11"></a><span class="hll"> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">field</span><span class="p">))</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-12" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-12" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-12"></a><span class="hll"> <span class="k">continue</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-13" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-13"></a><span class="hll"> <span class="c1"># otherwise not virtual, use the</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-14" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-14"></a><span class="hll"> <span class="c1"># general path below</span> +</span><a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-15" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-15"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-16" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-16"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-17" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-17"></a> <span class="k">if</span> 
<span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-18" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-18"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-19" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-19"></a> <span class="n">info</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-20" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-20"></a> <span class="k">continue</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-21" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-21"></a> <span class="k">else</span><span class="p">:</span> <span class="c1"># not virtual</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-22" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-22"></a> <span class="c1"># first materialize the</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-23" 
name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-23"></a> <span class="c1"># right hand side</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-24" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-24"></a> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-25" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-25"></a> <span class="c1"># then emit the store via</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-26" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-26"></a> <span class="c1"># the general path below</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-27" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-27" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-27"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-28" name="rest_code_1b477872bb23416f9c4122fbbbbfb0c0-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1b477872bb23416f9c4122fbbbbfb0c0-28"></a> <span class="k">return</span> <span class="n">opt_bb</span> 
+</pre></div> +</section> +<section id="version-9-final-materialize-on-other-operations"> +<h2>Version 9 (Final): Materialize on Other Operations</h2> +<p>We're almost at the end now. There's one final generalization left to do. We +started with the heuristic that storing a virtual into a non-virtual would +escape it. This should be generalized. Passing a virtual into any +operation where it is not the first argument of a <code class="docutils literal">load</code> or a <code class="docutils literal">store</code> +should also escape it (imagine passing the virtual to some function call). +Let's test this as usual with our <code class="docutils literal">print</code> operation:</p> +<div class="code"><pre class="code python"><a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-1" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-1"></a><span class="k">def</span> <span class="nf">test_materialize_on_other_ops</span><span class="p">():</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-2" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-2"></a> <span class="c1"># materialize not just on store</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-3" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-3"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-4" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-4"></a> <span class="n">var0</span> <span 
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-5" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-5"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-6" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-6"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">print</span><span class="p">(</span><span class="n">var1</span><span class="p">)</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-7" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-7"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-8" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-8"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-9" 
name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-9"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-10" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-10"></a><span class="s2">optvar1 = alloc()</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-11" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-11"></a><span class="s2">optvar2 = print(optvar1)"""</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-12" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-12"></a> <span class="c1"># again, the resulting basic block is not in</span> +<a id="rest_code_50e1d8b837bf447b84874ecb13f34fc3-13" name="rest_code_50e1d8b837bf447b84874ecb13f34fc3-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_50e1d8b837bf447b84874ecb13f34fc3-13"></a> <span class="c1"># valid SSA form</span> +</pre></div> +<p>To fix this, we will take the call to <code class="docutils literal">materialize</code> out of the <code class="docutils literal">store</code> code +path and instead put it into the generic code path at the end of the <code class="docutils literal">for</code> +loop:</p> +<div class="code"><pre class="code python"><a id="rest_code_1265bdac21584a538123beb08104bbb3-1" name="rest_code_1265bdac21584a538123beb08104bbb3-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-1"></a><span class="c1"># materialize is unchanged</span> 
+<a id="rest_code_1265bdac21584a538123beb08104bbb3-2" name="rest_code_1265bdac21584a538123beb08104bbb3-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-2"></a><span class="k">def</span> <span class="nf">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-3" name="rest_code_1265bdac21584a538123beb08104bbb3-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-3"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-4" name="rest_code_1265bdac21584a538123beb08104bbb3-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-4"></a> <span class="k">return</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-5" name="rest_code_1265bdac21584a538123beb08104bbb3-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-5"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-6" name="rest_code_1265bdac21584a538123beb08104bbb3-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-6"></a> <span class="n">info</span> <span class="o">=</span> <span 
class="n">value</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-7" name="rest_code_1265bdac21584a538123beb08104bbb3-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-7"></a> <span class="k">if</span> <span class="ow">not</span> <span class="n">info</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-8" name="rest_code_1265bdac21584a538123beb08104bbb3-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-8"></a> <span class="c1"># Already materialized</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-9" name="rest_code_1265bdac21584a538123beb08104bbb3-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-9"></a> <span class="k">return</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-10" name="rest_code_1265bdac21584a538123beb08104bbb3-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-10"></a> <span class="k">assert</span> <span class="n">value</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-11" name="rest_code_1265bdac21584a538123beb08104bbb3-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-11"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-12" name="rest_code_1265bdac21584a538123beb08104bbb3-12" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-12"></a> <span class="n">value</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-13" name="rest_code_1265bdac21584a538123beb08104bbb3-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-13"></a> <span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">contents</span><span class="o">.</span><span class="n">items</span><span class="p">()):</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-14" name="rest_code_1265bdac21584a538123beb08104bbb3-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-14"></a> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-15" name="rest_code_1265bdac21584a538123beb08104bbb3-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-15"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">idx</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-16" name="rest_code_1265bdac21584a538123beb08104bbb3-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-16"></a> +<a 
id="rest_code_1265bdac21584a538123beb08104bbb3-17" name="rest_code_1265bdac21584a538123beb08104bbb3-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-17"></a><span class="k">def</span> <span class="nf">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-18" name="rest_code_1265bdac21584a538123beb08104bbb3-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-18"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-19" name="rest_code_1265bdac21584a538123beb08104bbb3-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-19"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-20" name="rest_code_1265bdac21584a538123beb08104bbb3-20" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-20"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"alloc"</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-21" name="rest_code_1265bdac21584a538123beb08104bbb3-21" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-21"></a> <span class="n">op</span><span class="o">.</span><span class="n">info</span> <span class="o">=</span> <span class="n">VirtualObject</span><span class="p">()</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-22" 
name="rest_code_1265bdac21584a538123beb08104bbb3-22" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-22"></a> <span class="k">continue</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-23" name="rest_code_1265bdac21584a538123beb08104bbb3-23" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-23"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"load"</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-24" name="rest_code_1265bdac21584a538123beb08104bbb3-24" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-24"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-25" name="rest_code_1265bdac21584a538123beb08104bbb3-25" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-25"></a> <span class="k">if</span> <span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-26" name="rest_code_1265bdac21584a538123beb08104bbb3-26" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-26"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-27" name="rest_code_1265bdac21584a538123beb08104bbb3-27" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-27"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">field</span><span class="p">))</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-28" name="rest_code_1265bdac21584a538123beb08104bbb3-28" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-28"></a> <span class="k">continue</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-29" name="rest_code_1265bdac21584a538123beb08104bbb3-29" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-29"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"store"</span><span class="p">:</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-30" name="rest_code_1265bdac21584a538123beb08104bbb3-30" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-30"></a> <span class="n">info</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">info</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-31" name="rest_code_1265bdac21584a538123beb08104bbb3-31" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-31"></a> <span class="k">if</span> <span class="n">info</span><span class="p">:</span> <span class="c1"># virtual</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-32" 
name="rest_code_1265bdac21584a538123beb08104bbb3-32" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-32"></a> <span class="n">field</span> <span class="o">=</span> <span class="n">get_num</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-33" name="rest_code_1265bdac21584a538123beb08104bbb3-33" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-33"></a> <span class="n">info</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-34" name="rest_code_1265bdac21584a538123beb08104bbb3-34" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-34"></a> <span class="k">continue</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-35" name="rest_code_1265bdac21584a538123beb08104bbb3-35" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-35"></a><span class="hll"> <span class="c1"># materialize all the arguments of</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-36" name="rest_code_1265bdac21584a538123beb08104bbb3-36" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-36"></a><span class="hll"> <span class="c1"># operations that are put into the</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-37" name="rest_code_1265bdac21584a538123beb08104bbb3-37" 
href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-37"></a><span class="hll"> <span class="c1"># output basic block</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-38" name="rest_code_1265bdac21584a538123beb08104bbb3-38" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-38"></a><span class="hll"> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">:</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-39" name="rest_code_1265bdac21584a538123beb08104bbb3-39" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-39"></a><span class="hll"> <span class="n">materialize</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> +</span><a id="rest_code_1265bdac21584a538123beb08104bbb3-40" name="rest_code_1265bdac21584a538123beb08104bbb3-40" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-40"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_1265bdac21584a538123beb08104bbb3-41" name="rest_code_1265bdac21584a538123beb08104bbb3-41" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_1265bdac21584a538123beb08104bbb3-41"></a> <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> +<p>That's it, we're done. It's not a lot of code, but actually quite a powerful +optimization. 
In addition to removing allocations for objects that are only used +briefly and in predictable ways, it also has another effect. If an object is +allocated, used in a number of operations and then escapes further down in the +block, the operations in between can often be optimized away. This is +demonstrated by the next test (which already passes):</p> +<div class="code"><pre class="code python"><a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-1" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-1" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-1"></a><span class="k">def</span> <span class="nf">test_sink_allocations</span><span class="p">():</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-2" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-2" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-2"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-3" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-3" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-3"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-4" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-4" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-4"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">alloc</span><span class="p">()</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-5" 
name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-5" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-5"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">123</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-6" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-6" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-6"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">456</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-7" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-7" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-7"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-8" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-8" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-8"></a> <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">1</span><span 
class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-9" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-9" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-9"></a> <span class="n">var6</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var4</span><span class="p">,</span> <span class="n">var5</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-10" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-10" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-10"></a> <span class="n">var7</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">var6</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-11" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-11" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-11"></a> <span class="n">var8</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-12" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-12" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-12"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize_alloc_removal</span><span class="p">(</span><span class="n">bb</span><span 
class="p">)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-13" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-13" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-13"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-14" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-14" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-14"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-15" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-15" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-15"></a><span class="s2">optvar1 = add(123, 456)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-16" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-16" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-16"></a><span class="s2">optvar2 = alloc()</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-17" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-17" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-17"></a><span class="s2">optvar3 = store(optvar2, 0, optvar1)</span> +<a id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-18" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-18" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-18"></a><span class="s2">optvar4 = store(optvar2, 1, 456)</span> +<a 
id="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-19" name="rest_code_dcf3f980aee84678b8a7e06a810cf4a6-19" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html#rest_code_dcf3f980aee84678b8a7e06a810cf4a6-19"></a><span class="s2">optvar5 = store(optvar0, 1, optvar2)"""</span> +</pre></div> +<p>Note that the addition is not optimized away, because the code from this blog +post does not contain constant folding and the other optimizations from +the last one. Combining them would not be too hard though.</p> +</section> +<section id="conclusion"> +<h2>Conclusion</h2> +<p>That's it! The core idea of PyPy's allocation removal optimization in one or +two screens of code. The real implementation has a number of refinements, +but the core ideas are all here.</p> +<p>I'm not going to show any benchmark numbers or anything like that here; if you +are interested in numbers, you could look at Section 6, +"Implementation and Evaluation", of the <a class="reference external" href="https://www3.hhu.de/stups/downloads/pdf/BoCuFiLePeRi2011.pdf">paper</a> that describes the work.</p> +<p>There's a complementary optimization that improves <code class="docutils literal">load</code> and <code class="docutils literal">store</code> +operations for objects that are <em>not</em> virtual. 
I'll probably not write that +down as another post, but <a class="reference external" href="https://bernsteinbear.com/">Max Bernstein</a> and I developed that together on the +<a class="reference external" href="https://www.twitch.tv/pypyproject">PyPy Twitch channel</a> a few weeks ago; here's the recording:</p> +<iframe width="560" height="315" src="https://www.youtube-nocookie.com/embed/w-UHg0yOPSE" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></section> +<section id="footnotes"> +<h2>Footnotes</h2> +<p id="target-4">¹ This is how PyPy uses the terminology, not really used consistently by other +projects. The term "escape" is fairly standard throughout the <a class="reference external" href="https://en.wikipedia.org/wiki/Escape_analysis">escape +analysis</a> literature. The term "virtual" was used originally in <a class="reference external" href="https://dl.acm.org/doi/abs/10.1145/1014007.1014010">Armin Rigo's +Psyco</a> but is e.g. also used by the paper <a class="reference external" href="https://www.ssw.uni-linz.ac.at/Research/Papers/Stadler14/Stadler2014-CGO-PEA.pdf">Partial Escape Analysis and Scalar +Replacement for Java</a>.</p> +<p id="target-5">² The order in which we put the <cite>store</cite> operations back is relying on +dictionary iteration order, which is insertion order. That's not a bad +ordering, but we could also be explicit and sort the fields in some order (ideally +the order in which the object lays them out in memory).</p> +</section>toy-optimizerhttps://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.htmlTue, 25 Oct 2022 07:55:00 GMTImplementing a Toy Optimizerhttps://www.pypy.org/posts/2022/07/toy-optimizer.htmlCarl Friedrich Bolz-Tereick<p>In this blog post I want to show the complete code (in Python3) of how a very +simple optimizer for sequences of operations can work. 
These algorithms could +be part of a (really simple) compiler, or a JIT. The architecture of the code in +this blog post is very similar to that of the trace optimizer of the PyPy JIT: +After a trace is produced, it is optimized before being sent to the machine code +backend that produces binary instructions for the CPU architecture that PyPy is +running on.</p> +<p>To get started, the first thing we need to do is define how our operations are +stored. The +format that a compiler uses to store the program while it is being optimized +is usually called its <a class="reference external" href="https://en.wikipedia.org/wiki/Intermediate_representation">intermediate representation</a> (IR). Many production +compilers use IRs that are in the <a class="reference external" href="https://en.wikipedia.org/wiki/Static_single-assignment_form">Static Single-Assignment Form</a> (SSA), and +we will also use that. SSA form has the property that every variable is +assigned to exactly once, and every variable is defined before it is used. This +simplifies many things.</p> +<p>Let's make this concrete. If our input program is a complex expression, such +as <code class="docutils literal">a * (b + 17) + (b + 17)</code>, the intermediate representation of that (or at +least its text representation) would maybe be something like:</p> +<pre class="literal-block">var1 = add(b, 17) +var2 = mul(a, var1) +var3 = add(b, 17) +var4 = add(var2, var3)</pre> +<p>This sequence of instructions is inefficient. The operation <code class="docutils literal">add(b, 17)</code> is +computed twice and we can save time by removing the second one and only +computing it once. In this post I want to show an optimizer that can do this +optimization (and some related ones).</p> +<p>Looking at the IR we notice that the input expression has been linearized +into a sequence of operations, and all the intermediate results have been given +unique variable names. 
The value that every variable is assigned is computed +by the right-hand side, which is some operation consisting of an operator and an +arbitrary number of arguments. The arguments of an operation are either +themselves variables or constants.</p> +<p>I will not at all talk about the process of translating the input program +into the IR. Instead, I will assume we have some component that does this +translation already. The tests in this blog post will construct small +snippets of IR by hand. I also won't talk about what happens after the +optimization (usually the optimized IR is translated into machine code).</p> +<section id="implementing-the-intermediate-representation"> +<h2>Implementing the Intermediate Representation</h2> +<p>Let's start modelling the intermediate representation with Python classes. +First we define a base class of all values that can be used as arguments in +operations, and let's also add a class that represents constants:</p> +<div class="code"><pre class="code python"><a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-1" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-1"></a><span class="kn">import</span> <span class="nn">pytest</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-2" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-2"></a><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Any</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-3" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-3"></a> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-4" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-4" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-4"></a><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-5" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-5"></a> <span class="k">pass</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-6" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-6"></a> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-7" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-7"></a><span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-8" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-8"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-9" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-9"></a> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-10" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-10" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-10"></a> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-11" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-11"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-12" name="rest_code_69e9b0ec902f4309ab1c8f4987b5f274-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_69e9b0ec902f4309ab1c8f4987b5f274-12"></a> <span class="k">return</span> <span class="sa">f</span><span class="s2">"Constant(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">)"</span> +</pre></div> +<p>One consequence of the fact that every variable is assigned to only once is +that variables are in a one-to-one correspondence with the right-hand-side of +their unique assignments. That means that we don't need a class that represents +variables at all. 
Instead, it's sufficient to have a class that represents an +operation (the right-hand side), and that by definition is the same as the variable (left-hand side) that it defines:</p> +<div class="code"><pre class="code python"><a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-1" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-1"></a><span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-2" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-2"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">args</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]):</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-3" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-3"></a> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-4" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-4"></a> <span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">args</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-5" 
name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-5"></a> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-6" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-6"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-7" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-7"></a> <span class="k">return</span> <span class="sa">f</span><span class="s2">"Operation(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-8" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-8"></a> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-9" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-9"></a> <span class="k">def</span> <span class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<a id="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-10" name="rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-10" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_40d95908ff0c4e0fbb15a2f1c054d2f4-10"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span> +</pre></div> +<p>Now we can instantiate these two classes to represent the example sequence of +operations above:</p> +<div class="code"><pre class="code python"><a id="rest_code_31b4664131db44af997a1af90a539c87-1" name="rest_code_31b4664131db44af997a1af90a539c87-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-1"></a><span class="k">def</span> <span class="nf">test_construct_example</span><span class="p">():</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-2" name="rest_code_31b4664131db44af997a1af90a539c87-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-2"></a> <span class="c1"># first we need something to represent</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-3" name="rest_code_31b4664131db44af997a1af90a539c87-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-3"></a> <span class="c1"># "a" and "b". 
In our limited view, we don't</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-4" name="rest_code_31b4664131db44af997a1af90a539c87-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-4"></a> <span class="c1"># know where they come from, so we will define</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-5" name="rest_code_31b4664131db44af997a1af90a539c87-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-5"></a> <span class="c1"># them with a pseudo-operation called "getarg"</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-6" name="rest_code_31b4664131db44af997a1af90a539c87-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-6"></a> <span class="c1"># which takes a number n as an argument and</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-7" name="rest_code_31b4664131db44af997a1af90a539c87-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-7"></a> <span class="c1"># returns the n-th input argument. 
The proper</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-8" name="rest_code_31b4664131db44af997a1af90a539c87-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-8"></a> <span class="c1"># SSA way to do this would be phi-nodes.</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-9" name="rest_code_31b4664131db44af997a1af90a539c87-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-9"></a> +<a id="rest_code_31b4664131db44af997a1af90a539c87-10" name="rest_code_31b4664131db44af997a1af90a539c87-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-10"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"getarg"</span><span class="p">,</span> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-11" name="rest_code_31b4664131db44af997a1af90a539c87-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-11"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"getarg"</span><span class="p">,</span> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-12" name="rest_code_31b4664131db44af997a1af90a539c87-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-12"></a> <span class="c1"># var1 = add(b, 17)</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-13" name="rest_code_31b4664131db44af997a1af90a539c87-13" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-13"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"add"</span><span class="p">,</span> <span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-14" name="rest_code_31b4664131db44af997a1af90a539c87-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-14"></a> <span class="c1"># var2 = mul(a, var1)</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-15" name="rest_code_31b4664131db44af997a1af90a539c87-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-15"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"mul"</span><span class="p">,</span> <span class="p">[</span><span class="n">a</span><span class="p">,</span> <span class="n">var1</span><span class="p">])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-16" name="rest_code_31b4664131db44af997a1af90a539c87-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-16"></a> <span class="c1"># var3 = add(b, 17)</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-17" name="rest_code_31b4664131db44af997a1af90a539c87-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-17"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"add"</span><span class="p">,</span> <span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">Constant</span><span 
class="p">(</span><span class="mi">17</span><span class="p">)])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-18" name="rest_code_31b4664131db44af997a1af90a539c87-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-18"></a> <span class="c1"># var4 = add(var2, var3)</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-19" name="rest_code_31b4664131db44af997a1af90a539c87-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-19"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="s2">"add"</span><span class="p">,</span> <span class="p">[</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">])</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-20" name="rest_code_31b4664131db44af997a1af90a539c87-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-20"></a> +<a id="rest_code_31b4664131db44af997a1af90a539c87-21" name="rest_code_31b4664131db44af997a1af90a539c87-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-21"></a> <span class="n">sequence</span> <span class="o">=</span> <span class="p">[</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">var1</span><span class="p">,</span> <span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">,</span> <span class="n">var4</span><span class="p">]</span> +<a id="rest_code_31b4664131db44af997a1af90a539c87-22" name="rest_code_31b4664131db44af997a1af90a539c87-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_31b4664131db44af997a1af90a539c87-22"></a> <span class="c1"># nothing to test really, it shouldn't crash</span> +</pre></div> +<p>Usually, 
complicated programs are represented as a <a class="reference external" href="https://en.wikipedia.org/wiki/Control-flow_graph">control flow graph</a> in a +compiler, which represents all the possible paths that control can take while +executing the program. Every node in the control flow graph is a <a class="reference external" href="https://en.wikipedia.org/wiki/Basic_block">basic +block</a>. A basic block is a linear sequence of operations with no control flow +inside of it.</p> +<p>When optimizing a program, a compiler usually looks at the whole control flow +graph of a function. However, that is still too complicated! So let's +simplify further and look only at optimizations we can do when looking at +a single basic block and its sequence of instructions (they are called local +optimizations).</p> +<p>Let's define a class representing basic blocks and let's also add some +convenience functions for constructing sequences of operations, because the +code in <code class="docutils literal">test_construct_example</code> is a bit annoying.</p> +<div class="code"><pre class="code python"><a id="rest_code_cadeff25d2194d8a8f26c581650641c7-1" name="rest_code_cadeff25d2194d8a8f26c581650641c7-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-1"></a><span class="k">class</span> <span class="nc">Block</span><span class="p">(</span><span class="nb">list</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-2" name="rest_code_cadeff25d2194d8a8f26c581650641c7-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-2"></a> <span class="k">def</span> <span class="nf">opbuilder</span><span class="p">(</span><span class="n">opname</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-3" name="rest_code_cadeff25d2194d8a8f26c581650641c7-3" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-3"></a> <span class="k">def</span> <span class="nf">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-4" name="rest_code_cadeff25d2194d8a8f26c581650641c7-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-4"></a> <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-5" name="rest_code_cadeff25d2194d8a8f26c581650641c7-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-5"></a> <span class="n">arg</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-6" name="rest_code_cadeff25d2194d8a8f26c581650641c7-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-6"></a> <span class="k">return</span> <span class="n">arg</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-7" name="rest_code_cadeff25d2194d8a8f26c581650641c7-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-7"></a> <span class="k">def</span> <span class="nf">build</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-8" name="rest_code_cadeff25d2194d8a8f26c581650641c7-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-8"></a> <span class="c1"># 
construct an Operation, wrap the</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-9" name="rest_code_cadeff25d2194d8a8f26c581650641c7-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-9"></a> <span class="c1"># arguments in Constants if necessary</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-10" name="rest_code_cadeff25d2194d8a8f26c581650641c7-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-10"></a> <span class="n">op</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-11" name="rest_code_cadeff25d2194d8a8f26c581650641c7-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-11"></a> <span class="p">[</span><span class="n">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-12" name="rest_code_cadeff25d2194d8a8f26c581650641c7-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-12"></a> <span class="c1"># add it to self, the basic block</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-13" name="rest_code_cadeff25d2194d8a8f26c581650641c7-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-13"></a> <span class="bp">self</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-14" name="rest_code_cadeff25d2194d8a8f26c581650641c7-14" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-14"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-15" name="rest_code_cadeff25d2194d8a8f26c581650641c7-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-15"></a> <span class="k">return</span> <span class="n">build</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-16" name="rest_code_cadeff25d2194d8a8f26c581650641c7-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-16"></a> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-17" name="rest_code_cadeff25d2194d8a8f26c581650641c7-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-17"></a> <span class="c1"># a bunch of operations we support</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-18" name="rest_code_cadeff25d2194d8a8f26c581650641c7-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-18"></a> <span class="n">add</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"add"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-19" name="rest_code_cadeff25d2194d8a8f26c581650641c7-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-19"></a> <span class="n">mul</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"mul"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-20" name="rest_code_cadeff25d2194d8a8f26c581650641c7-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-20"></a> <span class="n">getarg</span> <span class="o">=</span> <span 
class="n">opbuilder</span><span class="p">(</span><span class="s2">"getarg"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-21" name="rest_code_cadeff25d2194d8a8f26c581650641c7-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-21"></a> <span class="n">dummy</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"dummy"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-22" name="rest_code_cadeff25d2194d8a8f26c581650641c7-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-22"></a> <span class="n">lshift</span> <span class="o">=</span> <span class="n">opbuilder</span><span class="p">(</span><span class="s2">"lshift"</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-23" name="rest_code_cadeff25d2194d8a8f26c581650641c7-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-23"></a> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-24" name="rest_code_cadeff25d2194d8a8f26c581650641c7-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-24"></a><span class="k">def</span> <span class="nf">test_convencience_block_construction</span><span class="p">():</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-25" name="rest_code_cadeff25d2194d8a8f26c581650641c7-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-25"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-26" name="rest_code_cadeff25d2194d8a8f26c581650641c7-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-26"></a> <span class="c1"># a 
again with getarg, the following line</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-27" name="rest_code_cadeff25d2194d8a8f26c581650641c7-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-27"></a> <span class="c1"># defines the Operation instance and</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-28" name="rest_code_cadeff25d2194d8a8f26c581650641c7-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-28"></a> <span class="c1"># immediately adds it to the basic block bb</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-29" name="rest_code_cadeff25d2194d8a8f26c581650641c7-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-29"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-30" name="rest_code_cadeff25d2194d8a8f26c581650641c7-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-30"></a> <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-31" name="rest_code_cadeff25d2194d8a8f26c581650641c7-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-31"></a> <span class="k">assert</span> <span class="n">bb</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"getarg"</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-32" 
name="rest_code_cadeff25d2194d8a8f26c581650641c7-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-32"></a> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-33" name="rest_code_cadeff25d2194d8a8f26c581650641c7-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-33"></a> <span class="c1"># it's a Constant</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-34" name="rest_code_cadeff25d2194d8a8f26c581650641c7-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-34"></a> <span class="k">assert</span> <span class="n">bb</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="mi">0</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-35" name="rest_code_cadeff25d2194d8a8f26c581650641c7-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-35"></a> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-36" name="rest_code_cadeff25d2194d8a8f26c581650641c7-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-36"></a> <span class="c1"># b with getarg</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-37" name="rest_code_cadeff25d2194d8a8f26c581650641c7-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-37"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-38" 
name="rest_code_cadeff25d2194d8a8f26c581650641c7-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-38"></a> <span class="c1"># var1 = add(b, 17)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-39" name="rest_code_cadeff25d2194d8a8f26c581650641c7-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-39"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-40" name="rest_code_cadeff25d2194d8a8f26c581650641c7-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-40"></a> <span class="c1"># var2 = mul(a, var1)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-41" name="rest_code_cadeff25d2194d8a8f26c581650641c7-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-41"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">mul</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-42" name="rest_code_cadeff25d2194d8a8f26c581650641c7-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-42"></a> <span class="c1"># var3 = add(b, 17)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-43" name="rest_code_cadeff25d2194d8a8f26c581650641c7-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-43"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span 
class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-44" name="rest_code_cadeff25d2194d8a8f26c581650641c7-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-44"></a> <span class="c1"># var4 = add(var2, var3)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-45" name="rest_code_cadeff25d2194d8a8f26c581650641c7-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-45"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> +<a id="rest_code_cadeff25d2194d8a8f26c581650641c7-46" name="rest_code_cadeff25d2194d8a8f26c581650641c7-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_cadeff25d2194d8a8f26c581650641c7-46"></a> <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> <span class="o">==</span> <span class="mi">6</span> +</pre></div> +<p>That's a good bit of infrastructure to make the tests easy to write. One +thing we are lacking though is a way to print the basic blocks into a nicely +readable textual representation. 
In the current form, the <code class="docutils literal">repr</code> of a +Block is very annoying: the output of pretty-printing <code class="docutils literal">bb</code> in the test above +looks like this:</p> +<div class="code"><pre class="code python"><a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-1" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-1"></a><span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-2" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-2"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-3" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-3"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-4" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-4"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-5" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-5" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-5"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-6" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-6"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-7" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-7"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'mul'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-8" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-8"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-9" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-9"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-10" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-10"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-11" 
name="rest_code_3b3ea7bc40a549f49465cd29e353728b-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-11"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-12" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-12"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-13" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-13"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)])]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-14" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-14"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-15" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-15"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-16" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-16"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span 
class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-17" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-17"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-18" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-18"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-19" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-19"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'mul'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-20" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-20"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-21" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-21"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-22" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-22"></a> <span 
class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-23" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-23"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-24" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-24"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-25" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-25"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)])]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-26" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-26"></a> <span class="n">Operation</span><span class="p">(</span><span class="s1">'add'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-27" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-27"></a> <span class="p">[</span><span class="n">Operation</span><span class="p">(</span><span class="s1">'getarg'</span><span class="p">,</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-28" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-28" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-28"></a> <span class="p">[</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)]),</span> +<a id="rest_code_3b3ea7bc40a549f49465cd29e353728b-29" name="rest_code_3b3ea7bc40a549f49465cd29e353728b-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_3b3ea7bc40a549f49465cd29e353728b-29"></a> <span class="n">Constant</span><span class="p">(</span><span class="mi">17</span><span class="p">)])])]</span> +</pre></div> +<p>It's impossible to see what is going on here, because the <code class="docutils literal">Operations</code> in the +basic block appear several times, once as elements of the list but then also as +arguments to operations further down in the list. So we need some code that +turns things back into a readable textual representation, so we have a chance +to debug.</p> +<div class="code"><pre class="code python"><a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-1" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-1"></a><span class="k">def</span> <span class="nf">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="n">varprefix</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"var"</span><span class="p">):</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-2" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-2"></a> <span class="c1"># the implementation is not too important,</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-3" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-3" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-3"></a> <span class="c1"># look at the test below to see what the</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-4" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-4"></a> <span class="c1"># result looks like</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-5" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-5"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-6" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-6"></a> <span class="k">def</span> <span class="nf">arg_to_str</span><span class="p">(</span><span class="n">arg</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-7" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-7"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-8" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-8"></a> <span class="k">return</span> <span class="nb">str</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-9" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-9" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-9"></a> <span class="k">else</span><span class="p">:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-10" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-10"></a> <span class="c1"># the key must exist, otherwise it's</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-11" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-11"></a> <span class="c1"># not a valid SSA basic block:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-12" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-12"></a> <span class="c1"># the variable must be defined before</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-13" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-13"></a> <span class="c1"># its first use</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-14" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-14"></a> <span class="k">return</span> <span class="n">varnames</span><span class="p">[</span><span class="n">arg</span><span class="p">]</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-15" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-15"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-16" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-16" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-16"></a> <span class="n">varnames</span> <span class="o">=</span> <span class="p">{}</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-17" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-17"></a> <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-18" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-18"></a> <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">bb</span><span class="p">):</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-19" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-19"></a> <span class="c1"># give the operation a name used while</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-20" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-20"></a> <span class="c1"># printing:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-21" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-21"></a> <span class="n">var</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">varprefix</span><span class="si">}{</span><span class="n">index</span><span class="si">}</span><span class="s2">"</span> +<a 
id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-22" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-22"></a> <span class="n">varnames</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">var</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-23" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-23"></a> <span class="n">arguments</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-24" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-24"></a> <span class="n">arg_to_str</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-25" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-25"></a> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">))</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-26" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-26"></a> <span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-27" 
name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-27"></a> <span class="n">strop</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">var</span><span class="si">}</span><span class="s2"> = </span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">arguments</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-28" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-28"></a> <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">strop</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-29" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-29"></a> <span class="k">return</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-30" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-30"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-31" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-31"></a><span class="k">def</span> <span class="nf">test_basicblock_to_str</span><span class="p">():</span> 
+<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-32" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-32"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-33" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-33"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-34" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-34"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-35" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-35"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-36" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-36"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-37" 
name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-37"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-38" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-38"></a><span class="s2">var0 = getarg(0)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-39" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-39"></a><span class="s2">var1 = add(5, 4)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-40" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-40"></a><span class="s2">var2 = add(var1, var0)"""</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-41" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-41"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-42" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-42"></a> <span class="c1"># with a different prefix for the invented</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-43" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-43"></a> <span class="c1"># variable names:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-44" 
name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-44"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">,</span> <span class="s2">"x"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-45" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-45"></a><span class="s2">x0 = getarg(0)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-46" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-46"></a><span class="s2">x1 = add(5, 4)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-47" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-47"></a><span class="s2">x2 = add(x1, x0)"""</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-48" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-48"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-49" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-49"></a> <span class="c1"># and our running example:</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-50" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-50"></a> <span class="n">bb</span> <span class="o">=</span> <span 
class="n">Block</span><span class="p">()</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-51" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-51"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-52" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-52" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-52"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-53" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-53"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-54" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-54"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">mul</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-55" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-55" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-55"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-56" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-56" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-56"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-57" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-57" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-57"></a> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-58" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-58" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-58"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">bb</span><span class="p">,</span> <span class="s2">"v"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-59" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-59" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-59"></a><span class="s2">v0 = getarg(0)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-60" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-60" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-60"></a><span class="s2">v1 = getarg(1)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-61" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-61" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-61"></a><span class="s2">v2 = add(v1, 17)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-62" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-62" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-62"></a><span class="s2">v3 = mul(v0, v2)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-63" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-63" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-63"></a><span class="s2">v4 = add(v1, 17)</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-64" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-64" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-64"></a><span class="s2">v5 = add(v3, v4)"""</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-65" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-65" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-65"></a> <span class="c1"># Note the re-numbering of the variables! 
We</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-66" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-66" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-66"></a> <span class="c1"># don't attach names to Operations at all, so</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-67" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-67" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-67"></a> <span class="c1"># the printing will just number them in</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-68" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-68" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-68"></a> <span class="c1"># sequence, which can sometimes be a source of</span> +<a id="rest_code_9248a9c16ce744bb86b23599baa5ddcb-69" name="rest_code_9248a9c16ce744bb86b23599baa5ddcb-69" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_9248a9c16ce744bb86b23599baa5ddcb-69"></a> <span class="c1"># confusion.</span> +</pre></div> +<p>This is much better. Now that we're done with the basic infrastructure, we can +define sequences of operations and print them in a readable way. Next we need a +central data structure that is used when actually optimizing basic blocks.</p> +</section> +<section id="storing-equivalences-between-operations-using-a-union-find-data-structure"> +<h2>Storing Equivalences between Operations Using a Union-Find Data Structure</h2> +<p>When optimizing a sequence of operations, we want to make it less costly to +execute. For that we typically want to remove operations (and sometimes +replace operations with less expensive ones). We can remove operations if +they do redundant computation, like in the case of the duplicate <code class="docutils literal">add(v1, 17)</code> in +the example.
So what we want to do is to turn the running input sequence:</p> +<pre class="literal-block">v0 = getarg(0) +v1 = getarg(1) +v2 = add(v1, 17) +v3 = mul(v0, v2) +v4 = add(v1, 17) +v5 = add(v3, v4)</pre> +<p>Into the following optimized output sequence:</p> +<pre class="literal-block">optvar0 = getarg(0) +optvar1 = getarg(1) +optvar2 = add(optvar1, 17) +optvar3 = mul(optvar0, optvar2) +optvar4 = add(optvar3, optvar2)</pre> +<p>We left out the second <code class="docutils literal">add</code> (which defines <code class="docutils literal">v4</code>), and then replaced the +usage of <code class="docutils literal">v4</code> with <code class="docutils literal">v2</code> in the final operation that defines <code class="docutils literal">v5</code>.</p> +<p>What we effectively did was discover that <code class="docutils literal">v2</code> and <code class="docutils literal">v4</code> are equivalent and then +replace <code class="docutils literal">v4</code> with <code class="docutils literal">v2</code>. In general, we might discover more such equivalences, +and we need a data structure to store them. A good data structure to store +these equivalences is <a class="reference external" href="https://en.wikipedia.org/wiki/Disjoint-set_data_structure">Union Find</a> (also called Disjoint-set data structure), +which stores a collection of disjoint sets. Disjoint means that no operation +can appear in more than one set. The sets in our concrete case are the sets of +operations that compute the same result.</p> +<p>When we start out, every operation is in its own singleton set, with no other +member. As we discover more equivalences, we will unify sets into larger sets +of operations that all compute the same result.
So one operation the data +structure supports is <code class="docutils literal">union</code>, which unifies two sets; we'll call that +<code class="docutils literal">make_equal_to</code> in the code below.</p> +<p>The other operation the data structure supports is <code class="docutils literal">find</code>, which takes an +operation and returns a "representative" of the set of all equivalent +operations. Two operations are in the same set if the representative that +<code class="docutils literal">find</code> returns for them is the same.</p> +<p>The exact details of how the data structure works are only sort of important +(even though it's very cool, I promise!). It's OK to skip over the +implementation. We will add the data structure right into our <code class="docutils literal">Value</code>, +<code class="docutils literal">Constant</code> and <code class="docutils literal">Operation</code> classes:</p> +<div class="code"><pre class="code python"><a id="rest_code_a15a9155215648a298765668d60a43a2-1" name="rest_code_a15a9155215648a298765668d60a43a2-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-1"></a><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-2" name="rest_code_a15a9155215648a298765668d60a43a2-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-2"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-3" name="rest_code_a15a9155215648a298765668d60a43a2-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-3"></a> <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-4" 
name="rest_code_a15a9155215648a298765668d60a43a2-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-4"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-5" name="rest_code_a15a9155215648a298765668d60a43a2-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-5"></a> <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-6" name="rest_code_a15a9155215648a298765668d60a43a2-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-6"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-7" name="rest_code_a15a9155215648a298765668d60a43a2-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-7"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-8" name="rest_code_a15a9155215648a298765668d60a43a2-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-8"></a><span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-9" name="rest_code_a15a9155215648a298765668d60a43a2-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-9"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span 
class="n">args</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-10" name="rest_code_a15a9155215648a298765668d60a43a2-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-10"></a> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-11" name="rest_code_a15a9155215648a298765668d60a43a2-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-11"></a> <span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">args</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-12" name="rest_code_a15a9155215648a298765668d60a43a2-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-12"></a> <span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="kc">None</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-13" name="rest_code_a15a9155215648a298765668d60a43a2-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-13"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-14" name="rest_code_a15a9155215648a298765668d60a43a2-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-14"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-15" name="rest_code_a15a9155215648a298765668d60a43a2-15" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-15"></a> <span class="k">return</span> <span class="p">(</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-16" name="rest_code_a15a9155215648a298765668d60a43a2-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-16"></a> <span class="sa">f</span><span class="s2">"Operation(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">,"</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-17" name="rest_code_a15a9155215648a298765668d60a43a2-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-17"></a> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-18" name="rest_code_a15a9155215648a298765668d60a43a2-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-18"></a> <span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-19" name="rest_code_a15a9155215648a298765668d60a43a2-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-19"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-20" name="rest_code_a15a9155215648a298765668d60a43a2-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-20"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span 
class="p">)</span> <span class="o">-&gt;</span> <span class="n">Value</span><span class="p">:</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-21" name="rest_code_a15a9155215648a298765668d60a43a2-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-21"></a> <span class="c1"># returns the "representative" value of</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-22" name="rest_code_a15a9155215648a298765668d60a43a2-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-22"></a> <span class="c1"># self, in the union-find sense</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-23" name="rest_code_a15a9155215648a298765668d60a43a2-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-23"></a> <span class="n">op</span> <span class="o">=</span> <span class="bp">self</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-24" name="rest_code_a15a9155215648a298765668d60a43a2-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-24"></a> <span class="k">while</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-25" name="rest_code_a15a9155215648a298765668d60a43a2-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-25"></a> <span class="c1"># could do path compression here too</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-26" name="rest_code_a15a9155215648a298765668d60a43a2-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-26"></a> <span class="c1"># but not essential</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-27" 
name="rest_code_a15a9155215648a298765668d60a43a2-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-27"></a> <span class="nb">next</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">forwarded</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-28" name="rest_code_a15a9155215648a298765668d60a43a2-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-28"></a> <span class="k">if</span> <span class="nb">next</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-29" name="rest_code_a15a9155215648a298765668d60a43a2-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-29"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-30" name="rest_code_a15a9155215648a298765668d60a43a2-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-30"></a> <span class="n">op</span> <span class="o">=</span> <span class="nb">next</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-31" name="rest_code_a15a9155215648a298765668d60a43a2-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-31"></a> <span class="k">return</span> <span class="n">op</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-32" name="rest_code_a15a9155215648a298765668d60a43a2-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-32"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-33" name="rest_code_a15a9155215648a298765668d60a43a2-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-33"></a> <span class="k">def</span> <span 
class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-34" name="rest_code_a15a9155215648a298765668d60a43a2-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-34"></a> <span class="c1"># change to above: return the</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-35" name="rest_code_a15a9155215648a298765668d60a43a2-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-35"></a> <span class="c1"># representative of argument 'index'</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-36" name="rest_code_a15a9155215648a298765668d60a43a2-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-36"></a> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-37" name="rest_code_a15a9155215648a298765668d60a43a2-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-37"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-38" name="rest_code_a15a9155215648a298765668d60a43a2-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-38"></a> <span class="k">def</span> <span class="nf">make_equal_to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-39" name="rest_code_a15a9155215648a298765668d60a43a2-39" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-39"></a> <span class="c1"># this is "union" in the union-find sense,</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-40" name="rest_code_a15a9155215648a298765668d60a43a2-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-40"></a> <span class="c1"># but the direction is important! The</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-41" name="rest_code_a15a9155215648a298765668d60a43a2-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-41"></a> <span class="c1"># representative of the union of Operations</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-42" name="rest_code_a15a9155215648a298765668d60a43a2-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-42"></a> <span class="c1"># must be either a Constant or an operation</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-43" name="rest_code_a15a9155215648a298765668d60a43a2-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-43"></a> <span class="c1"># that we know for sure is not optimized</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-44" name="rest_code_a15a9155215648a298765668d60a43a2-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-44"></a> <span class="c1"># away.</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-45" name="rest_code_a15a9155215648a298765668d60a43a2-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-45"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-46" name="rest_code_a15a9155215648a298765668d60a43a2-46" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-46"></a> <span class="bp">self</span><span class="o">.</span><span class="n">find</span><span class="p">()</span><span class="o">.</span><span class="n">_set_forwarded</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-47" name="rest_code_a15a9155215648a298765668d60a43a2-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-47"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-48" name="rest_code_a15a9155215648a298765668d60a43a2-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-48"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-49" name="rest_code_a15a9155215648a298765668d60a43a2-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-49"></a> <span class="bp">self</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-50" name="rest_code_a15a9155215648a298765668d60a43a2-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-50"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-51" name="rest_code_a15a9155215648a298765668d60a43a2-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-51"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-52" name="rest_code_a15a9155215648a298765668d60a43a2-52" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-52"></a><span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-53" name="rest_code_a15a9155215648a298765668d60a43a2-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-53"></a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-54" name="rest_code_a15a9155215648a298765668d60a43a2-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-54"></a> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">value</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-55" name="rest_code_a15a9155215648a298765668d60a43a2-55" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-55"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-56" name="rest_code_a15a9155215648a298765668d60a43a2-56" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-56"></a> <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-57" name="rest_code_a15a9155215648a298765668d60a43a2-57" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-57"></a> <span class="k">return</span> <span class="sa">f</span><span class="s2">"Constant(</span><span class="si">{</span><span 
class="bp">self</span><span class="o">.</span><span class="n">value</span><span class="si">}</span><span class="s2">)"</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-58" name="rest_code_a15a9155215648a298765668d60a43a2-58" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-58"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-59" name="rest_code_a15a9155215648a298765668d60a43a2-59" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-59"></a> <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-60" name="rest_code_a15a9155215648a298765668d60a43a2-60" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-60"></a> <span class="k">return</span> <span class="bp">self</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-61" name="rest_code_a15a9155215648a298765668d60a43a2-61" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-61"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-62" name="rest_code_a15a9155215648a298765668d60a43a2-62" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-62"></a> <span class="k">def</span> <span class="nf">_set_forwarded</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Value</span><span class="p">):</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-63" name="rest_code_a15a9155215648a298765668d60a43a2-63" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-63"></a> <span class="c1"># if we found out that an Operation is</span> +<a 
id="rest_code_a15a9155215648a298765668d60a43a2-64" name="rest_code_a15a9155215648a298765668d60a43a2-64" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-64"></a> <span class="c1"># equal to a constant, it's a compiler bug</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-65" name="rest_code_a15a9155215648a298765668d60a43a2-65" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-65"></a> <span class="c1"># to find out that it's equal to another</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-66" name="rest_code_a15a9155215648a298765668d60a43a2-66" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-66"></a> <span class="c1"># constant</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-67" name="rest_code_a15a9155215648a298765668d60a43a2-67" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-67"></a> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_a15a9155215648a298765668d60a43a2-68" name="rest_code_a15a9155215648a298765668d60a43a2-68" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-68"></a> <span class="n">value</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-69" name="rest_code_a15a9155215648a298765668d60a43a2-69" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-69"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-70" 
name="rest_code_a15a9155215648a298765668d60a43a2-70" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-70"></a><span class="k">def</span> <span class="nf">test_union_find</span><span class="p">():</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-71" name="rest_code_a15a9155215648a298765668d60a43a2-71" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-71"></a> <span class="c1"># construct three operations, and unify them</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-72" name="rest_code_a15a9155215648a298765668d60a43a2-72" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-72"></a> <span class="c1"># step by step</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-73" name="rest_code_a15a9155215648a298765668d60a43a2-73" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-73"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-74" name="rest_code_a15a9155215648a298765668d60a43a2-74" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-74"></a> <span class="n">a1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-75" name="rest_code_a15a9155215648a298765668d60a43a2-75" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-75"></a> <span class="n">a2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> +<a 
id="rest_code_a15a9155215648a298765668d60a43a2-76" name="rest_code_a15a9155215648a298765668d60a43a2-76" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-76"></a> <span class="n">a3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-77" name="rest_code_a15a9155215648a298765668d60a43a2-77" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-77"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-78" name="rest_code_a15a9155215648a298765668d60a43a2-78" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-78"></a> <span class="c1"># at the beginning, every op is its own</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-79" name="rest_code_a15a9155215648a298765668d60a43a2-79" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-79"></a> <span class="c1"># representative, that means every</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-80" name="rest_code_a15a9155215648a298765668d60a43a2-80" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-80"></a> <span class="c1"># operation is in a singleton set</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-81" name="rest_code_a15a9155215648a298765668d60a43a2-81" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-81"></a> <span class="c1"># {a1} {a2} {a3}</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-82" name="rest_code_a15a9155215648a298765668d60a43a2-82" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-82"></a> <span class="k">assert</span> <span 
class="n">a1</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-83" name="rest_code_a15a9155215648a298765668d60a43a2-83" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-83"></a> <span class="k">assert</span> <span class="n">a2</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a2</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-84" name="rest_code_a15a9155215648a298765668d60a43a2-84" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-84"></a> <span class="k">assert</span> <span class="n">a3</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a3</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-85" name="rest_code_a15a9155215648a298765668d60a43a2-85" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-85"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-86" name="rest_code_a15a9155215648a298765668d60a43a2-86" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-86"></a> <span class="c1"># now we unify a2 and a1, then the sets are</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-87" name="rest_code_a15a9155215648a298765668d60a43a2-87" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-87"></a> <span class="c1"># {a1, a2} {a3}</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-88" name="rest_code_a15a9155215648a298765668d60a43a2-88" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-88"></a> <span class="n">a2</span><span 
class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">a1</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-89" name="rest_code_a15a9155215648a298765668d60a43a2-89" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-89"></a> <span class="c1"># they both return a1 as the representative</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-90" name="rest_code_a15a9155215648a298765668d60a43a2-90" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-90"></a> <span class="k">assert</span> <span class="n">a1</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-91" name="rest_code_a15a9155215648a298765668d60a43a2-91" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-91"></a> <span class="k">assert</span> <span class="n">a2</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-92" name="rest_code_a15a9155215648a298765668d60a43a2-92" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-92"></a> <span class="c1"># a3 is still different</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-93" name="rest_code_a15a9155215648a298765668d60a43a2-93" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-93"></a> <span class="k">assert</span> <span class="n">a3</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a3</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-94" 
name="rest_code_a15a9155215648a298765668d60a43a2-94" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-94"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-95" name="rest_code_a15a9155215648a298765668d60a43a2-95" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-95"></a> <span class="c1"># now they are all in the same set {a1, a2, a3}</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-96" name="rest_code_a15a9155215648a298765668d60a43a2-96" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-96"></a> <span class="n">a3</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">a2</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-97" name="rest_code_a15a9155215648a298765668d60a43a2-97" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-97"></a> <span class="k">assert</span> <span class="n">a1</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-98" name="rest_code_a15a9155215648a298765668d60a43a2-98" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-98"></a> <span class="k">assert</span> <span class="n">a2</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-99" name="rest_code_a15a9155215648a298765668d60a43a2-99" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-99"></a> <span class="k">assert</span> <span class="n">a3</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> 
<span class="ow">is</span> <span class="n">a1</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-100" name="rest_code_a15a9155215648a298765668d60a43a2-100" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-100"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-101" name="rest_code_a15a9155215648a298765668d60a43a2-101" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-101"></a> <span class="c1"># now they are still all the same, and we</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-102" name="rest_code_a15a9155215648a298765668d60a43a2-102" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-102"></a> <span class="c1"># also learned that they are the same as the</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-103" name="rest_code_a15a9155215648a298765668d60a43a2-103" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-103"></a> <span class="c1"># constant 6</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-104" name="rest_code_a15a9155215648a298765668d60a43a2-104" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-104"></a> <span class="c1"># the single remaining set then is</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-105" name="rest_code_a15a9155215648a298765668d60a43a2-105" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-105"></a> <span class="c1"># {6, a1, a2, a3}</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-106" name="rest_code_a15a9155215648a298765668d60a43a2-106" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-106"></a> <span class="n">c</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span 
class="mi">6</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-107" name="rest_code_a15a9155215648a298765668d60a43a2-107" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-107"></a> <span class="n">a2</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-108" name="rest_code_a15a9155215648a298765668d60a43a2-108" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-108"></a> <span class="k">assert</span> <span class="n">a1</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">c</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-109" name="rest_code_a15a9155215648a298765668d60a43a2-109" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-109"></a> <span class="k">assert</span> <span class="n">a2</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">c</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-110" name="rest_code_a15a9155215648a298765668d60a43a2-110" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-110"></a> <span class="k">assert</span> <span class="n">a3</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> <span class="ow">is</span> <span class="n">c</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-111" name="rest_code_a15a9155215648a298765668d60a43a2-111" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-111"></a> +<a id="rest_code_a15a9155215648a298765668d60a43a2-112" name="rest_code_a15a9155215648a298765668d60a43a2-112" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-112"></a> <span class="c1"># union with the same constant again is fine</span> +<a id="rest_code_a15a9155215648a298765668d60a43a2-113" name="rest_code_a15a9155215648a298765668d60a43a2-113" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_a15a9155215648a298765668d60a43a2-113"></a> <span class="n">a2</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> +</pre></div> +</section> +<section id="constant-folding"> +<h2>Constant Folding</h2> +<p>Now comes the first actual optimization, a simple <a class="reference external" href="https://en.wikipedia.org/wiki/Constant_folding">constant folding</a> pass. It +will remove operations where all the arguments are constants and replace them +with the constant result.</p> +<p>Every pass has the same structure: we go over all operations in the basic +block in order and decide for each operation whether it can be removed. For the +constant folding pass, we can remove all the operations with constant +arguments (but we'll implement only the <code class="docutils literal">add</code> case here).</p> +<p>I will show a buggy version of the <a class="reference external" href="https://en.wikipedia.org/wiki/Constant_folding">constant folding</a> pass first. It has a +problem that is related to why we need the union-find data structure. 
We will +fix it a bit further down.</p> +<div class="code"><pre class="code python"><a id="rest_code_daa85120c4a44379affe0c40e571c659-1" name="rest_code_daa85120c4a44379affe0c40e571c659-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-1"></a><span class="k">def</span> <span class="nf">constfold_buggy</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-2" name="rest_code_daa85120c4a44379affe0c40e571c659-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-3" name="rest_code_daa85120c4a44379affe0c40e571c659-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-3"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-4" name="rest_code_daa85120c4a44379affe0c40e571c659-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-4"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-5" name="rest_code_daa85120c4a44379affe0c40e571c659-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-5"></a> <span class="c1"># basic idea: go over the list and do</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-6" name="rest_code_daa85120c4a44379affe0c40e571c659-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-6"></a> <span class="c1"># constant folding of 
add where possible</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-7" name="rest_code_daa85120c4a44379affe0c40e571c659-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-8" name="rest_code_daa85120c4a44379affe0c40e571c659-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-8"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-9" name="rest_code_daa85120c4a44379affe0c40e571c659-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-9"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-10" name="rest_code_daa85120c4a44379affe0c40e571c659-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-10"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_daa85120c4a44379affe0c40e571c659-11" name="rest_code_daa85120c4a44379affe0c40e571c659-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-11"></a> <span class="nb">isinstance</span><span class="p">(</span><span 
class="n">arg1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-12" name="rest_code_daa85120c4a44379affe0c40e571c659-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-12"></a> <span class="c1"># can constant-fold! that means we</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-13" name="rest_code_daa85120c4a44379affe0c40e571c659-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-13"></a> <span class="c1"># learned a new equality, namely</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-14" name="rest_code_daa85120c4a44379affe0c40e571c659-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-14"></a> <span class="c1"># that op is equal to a specific</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-15" name="rest_code_daa85120c4a44379affe0c40e571c659-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-15"></a> <span class="c1"># constant</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-16" name="rest_code_daa85120c4a44379affe0c40e571c659-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-16"></a> <span class="n">value</span> <span class="o">=</span> <span class="n">arg0</span><span class="o">.</span><span class="n">value</span> <span class="o">+</span> <span class="n">arg1</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-17" name="rest_code_daa85120c4a44379affe0c40e571c659-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-17"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span 
class="n">Constant</span><span class="p">(</span><span class="n">value</span><span class="p">))</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-18" name="rest_code_daa85120c4a44379affe0c40e571c659-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-18"></a> <span class="c1"># don't need to have the operation</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-19" name="rest_code_daa85120c4a44379affe0c40e571c659-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-19"></a> <span class="c1"># in the optimized basic block</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-20" name="rest_code_daa85120c4a44379affe0c40e571c659-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-20"></a> <span class="k">continue</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-21" name="rest_code_daa85120c4a44379affe0c40e571c659-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-21"></a> <span class="c1"># otherwise the operation is not</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-22" name="rest_code_daa85120c4a44379affe0c40e571c659-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-22"></a> <span class="c1"># constant-foldable and we put into the</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-23" name="rest_code_daa85120c4a44379affe0c40e571c659-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-23"></a> <span class="c1"># output list</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-24" name="rest_code_daa85120c4a44379affe0c40e571c659-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-24"></a> <span class="n">opt_bb</span><span 
class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-25" name="rest_code_daa85120c4a44379affe0c40e571c659-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-25"></a> <span class="k">return</span> <span class="n">opt_bb</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-26" name="rest_code_daa85120c4a44379affe0c40e571c659-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-26"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-27" name="rest_code_daa85120c4a44379affe0c40e571c659-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-27"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-28" name="rest_code_daa85120c4a44379affe0c40e571c659-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-28"></a><span class="k">def</span> <span class="nf">test_constfold_simple</span><span class="p">():</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-29" name="rest_code_daa85120c4a44379affe0c40e571c659-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-29"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-30" name="rest_code_daa85120c4a44379affe0c40e571c659-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-30"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-31" 
name="rest_code_daa85120c4a44379affe0c40e571c659-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-31"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-32" name="rest_code_daa85120c4a44379affe0c40e571c659-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-32"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-33" name="rest_code_daa85120c4a44379affe0c40e571c659-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-33"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-34" name="rest_code_daa85120c4a44379affe0c40e571c659-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-34"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">constfold_buggy</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-35" name="rest_code_daa85120c4a44379affe0c40e571c659-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-35"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a 
id="rest_code_daa85120c4a44379affe0c40e571c659-36" name="rest_code_daa85120c4a44379affe0c40e571c659-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-36"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-37" name="rest_code_daa85120c4a44379affe0c40e571c659-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-37"></a><span class="s2">optvar1 = add(9, optvar0)"""</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-38" name="rest_code_daa85120c4a44379affe0c40e571c659-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-38"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-39" name="rest_code_daa85120c4a44379affe0c40e571c659-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-39"></a><span class="nd">@pytest</span><span class="o">.</span><span class="n">mark</span><span class="o">.</span><span class="n">xfail</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-40" name="rest_code_daa85120c4a44379affe0c40e571c659-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-40"></a><span class="k">def</span> <span class="nf">test_constfold_buggy_limitation</span><span class="p">():</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-41" name="rest_code_daa85120c4a44379affe0c40e571c659-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-41"></a> <span class="c1"># this test fails! 
it shows the problem with</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-42" name="rest_code_daa85120c4a44379affe0c40e571c659-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-42"></a> <span class="c1"># the above simple constfold_buggy pass</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-43" name="rest_code_daa85120c4a44379affe0c40e571c659-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-43"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-44" name="rest_code_daa85120c4a44379affe0c40e571c659-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-44"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-45" name="rest_code_daa85120c4a44379affe0c40e571c659-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-45"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-46" name="rest_code_daa85120c4a44379affe0c40e571c659-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-46"></a> <span class="c1"># this is folded</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-47" name="rest_code_daa85120c4a44379affe0c40e571c659-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-47"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span 
class="mi">4</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-48" name="rest_code_daa85120c4a44379affe0c40e571c659-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-48"></a> <span class="c1"># we want this folded too, but it doesn't work</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-49" name="rest_code_daa85120c4a44379affe0c40e571c659-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-49"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-50" name="rest_code_daa85120c4a44379affe0c40e571c659-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-50"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-51" name="rest_code_daa85120c4a44379affe0c40e571c659-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-51"></a> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-52" name="rest_code_daa85120c4a44379affe0c40e571c659-52" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-52"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">constfold_buggy</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-53" 
name="rest_code_daa85120c4a44379affe0c40e571c659-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-53"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-54" name="rest_code_daa85120c4a44379affe0c40e571c659-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-54"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_daa85120c4a44379affe0c40e571c659-55" name="rest_code_daa85120c4a44379affe0c40e571c659-55" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_daa85120c4a44379affe0c40e571c659-55"></a><span class="s2">optvar1 = add(19, optvar0)"""</span> +</pre></div> +<p>Why does the test fail? The <code class="docutils literal">opt_bb</code> printed output looks like this:</p> +<pre class="literal-block">optvar0 = getarg(0) +optvar1 = add(9, 10) +optvar2 = add(optvar1, optvar0)</pre> +<p>The problem is that when we optimize the second addition in <code class="docutils literal">constfold_buggy</code>, +the argument of that operation is an <code class="docutils literal">Operation</code>, not a <code class="docutils literal">Constant</code>, so +constant-folding is not applied to the second add. However, we have already +learned that the argument <code class="docutils literal">var1</code> to the operation <code class="docutils literal">var2</code> is equal to +<code class="docutils literal">Constant(9)</code>. This information is stored in the union-find data structure. 
+So what we are missing are suitable find calls in the constant folding pass, to +make use of the previously learned equalities.</p> +<p>Here's the fixed version:</p> +<div class="code"><pre class="code python"><a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-1" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-1"></a><span class="k">def</span> <span class="nf">constfold</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-2" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-3" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-3"></a> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-4" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-4"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-5" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-5"></a> <span class="c1"># basic idea: go over the list and do</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-6" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-6" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-6"></a> <span class="c1"># constant folding of add where possible</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-7" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-7"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-8" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-8"></a> <span class="c1"># &gt;&gt;&gt; changed</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-9" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-9"></a><span class="hll"> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="c1"># uses .find()</span> +</span><a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-10" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-10"></a><span class="hll"> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> <span class="c1"># uses .find()</span> +</span><a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-11" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-11" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-11"></a> <span class="c1"># &lt;&lt;&lt; end changes</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-12" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-12"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-13" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-13"></a> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-14" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-14"></a> <span class="c1"># can constant-fold! 
that means we</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-15" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-15"></a> <span class="c1"># learned a new equality, namely</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-16" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-16"></a> <span class="c1"># that op is equal to a specific</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-17" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-17"></a> <span class="c1"># constant</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-18" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-18"></a> <span class="n">value</span> <span class="o">=</span> <span class="n">arg0</span><span class="o">.</span><span class="n">value</span> <span class="o">+</span> <span class="n">arg1</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-19" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-19"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">value</span><span class="p">))</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-20" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-20"></a> <span class="c1"># don't need to have the 
operation</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-21" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-21"></a> <span class="c1"># in the optimized basic block</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-22" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-22"></a> <span class="k">continue</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-23" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-23"></a> <span class="c1"># otherwise the operation is not</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-24" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-24"></a> <span class="c1"># constant-foldable and we put into the</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-25" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-25"></a> <span class="c1"># output list</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-26" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-26"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-27" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-27"></a> <span class="k">return</span> <span 
class="n">opt_bb</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-28" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-28"></a> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-29" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-29"></a> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-30" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-30"></a><span class="k">def</span> <span class="nf">test_constfold_two_ops</span><span class="p">():</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-31" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-31"></a> <span class="c1"># now it works!</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-32" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-32"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-33" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-33"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-34" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-34" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-34"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-35" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-35"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-36" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-36"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-37" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-37"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">constfold</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-38" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-38"></a> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-39" 
name="rest_code_b07e310d695e4fd184cd77ed64be36f5-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-39"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-40" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-40"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_b07e310d695e4fd184cd77ed64be36f5-41" name="rest_code_b07e310d695e4fd184cd77ed64be36f5-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_b07e310d695e4fd184cd77ed64be36f5-41"></a><span class="s2">optvar1 = add(19, optvar0)"""</span> +</pre></div> +</section> +<section id="common-subexpression-elimination"> +<h2>Common Subexpression Elimination</h2> +<p>The <code class="docutils literal">constfold</code> pass only discovers equalities between <code class="docutils literal">Operations</code> and +<code class="docutils literal">Constants</code>. 
Let's do a second pass that also discovers equalities between +<code class="docutils literal">Operations</code> and other <code class="docutils literal">Operations</code>.</p> +<p>A simple optimization that does exactly that is <a class="reference external" href="https://en.wikipedia.org/wiki/Common_subexpression_elimination">common subexpression +elimination</a> (CSE), which will finally optimize away the problem in the +introductory example code that we had above.</p> +<div class="code"><pre class="code python"><a id="rest_code_5a167cf0fa6e448499556f57339456ca-1" name="rest_code_5a167cf0fa6e448499556f57339456ca-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-1"></a><span class="k">def</span> <span class="nf">cse</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-2" name="rest_code_5a167cf0fa6e448499556f57339456ca-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-2"></a> <span class="c1"># structure is the same, loop over the input,</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-3" name="rest_code_5a167cf0fa6e448499556f57339456ca-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-3"></a> <span class="c1"># add some but not all operations to the</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-4" name="rest_code_5a167cf0fa6e448499556f57339456ca-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-4"></a> <span class="c1"># output</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-5" name="rest_code_5a167cf0fa6e448499556f57339456ca-5" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-5"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-6" name="rest_code_5a167cf0fa6e448499556f57339456ca-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-6"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-7" name="rest_code_5a167cf0fa6e448499556f57339456ca-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-7"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-8" name="rest_code_5a167cf0fa6e448499556f57339456ca-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-8"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-9" name="rest_code_5a167cf0fa6e448499556f57339456ca-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-9"></a> <span class="c1"># only do CSE for add here, but it</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-10" name="rest_code_5a167cf0fa6e448499556f57339456ca-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-10"></a> <span class="c1"># generalizes</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-11" name="rest_code_5a167cf0fa6e448499556f57339456ca-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-11"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-12" 
name="rest_code_5a167cf0fa6e448499556f57339456ca-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-12"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-13" name="rest_code_5a167cf0fa6e448499556f57339456ca-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-13"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-14" name="rest_code_5a167cf0fa6e448499556f57339456ca-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-14"></a> <span class="c1"># Check whether we have emitted the</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-15" name="rest_code_5a167cf0fa6e448499556f57339456ca-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-15"></a> <span class="c1"># same operation already</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-16" name="rest_code_5a167cf0fa6e448499556f57339456ca-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-16"></a> <span class="n">prev_op</span> <span class="o">=</span> <span class="n">find_prev_add_op</span><span class="p">(</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-17" name="rest_code_5a167cf0fa6e448499556f57339456ca-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-17"></a> <span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span 
class="p">,</span> <span class="n">opt_bb</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-18" name="rest_code_5a167cf0fa6e448499556f57339456ca-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-18"></a> <span class="k">if</span> <span class="n">prev_op</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-19" name="rest_code_5a167cf0fa6e448499556f57339456ca-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-19"></a> <span class="c1"># if yes, we can optimize op away</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-20" name="rest_code_5a167cf0fa6e448499556f57339456ca-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-20"></a> <span class="c1"># and replace it with the earlier</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-21" name="rest_code_5a167cf0fa6e448499556f57339456ca-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-21"></a> <span class="c1"># result, which is an Operation</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-22" name="rest_code_5a167cf0fa6e448499556f57339456ca-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-22"></a> <span class="c1"># that was already emitted to</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-23" name="rest_code_5a167cf0fa6e448499556f57339456ca-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-23"></a> <span class="c1"># opt_bb</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-24" name="rest_code_5a167cf0fa6e448499556f57339456ca-24" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-24"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">prev_op</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-25" name="rest_code_5a167cf0fa6e448499556f57339456ca-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-25"></a> <span class="k">continue</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-26" name="rest_code_5a167cf0fa6e448499556f57339456ca-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-26"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-27" name="rest_code_5a167cf0fa6e448499556f57339456ca-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-27"></a> <span class="k">return</span> <span class="n">opt_bb</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-28" name="rest_code_5a167cf0fa6e448499556f57339456ca-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-28"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-29" name="rest_code_5a167cf0fa6e448499556f57339456ca-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-29"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-30" name="rest_code_5a167cf0fa6e448499556f57339456ca-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-30"></a><span class="k">def</span> <span class="nf">eq_value</span><span class="p">(</span><span class="n">val0</span><span class="p">,</span> <span 
class="n">val1</span><span class="p">):</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-31" name="rest_code_5a167cf0fa6e448499556f57339456ca-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-31"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">val0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-32" name="rest_code_5a167cf0fa6e448499556f57339456ca-32" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-32"></a> <span class="nb">isinstance</span><span class="p">(</span><span class="n">val1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-33" name="rest_code_5a167cf0fa6e448499556f57339456ca-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-33"></a> <span class="c1"># constants compare by their value</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-34" name="rest_code_5a167cf0fa6e448499556f57339456ca-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-34"></a> <span class="k">return</span> <span class="n">val0</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="n">val1</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-35" name="rest_code_5a167cf0fa6e448499556f57339456ca-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-35"></a> <span class="c1"># everything else by identity</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-36" name="rest_code_5a167cf0fa6e448499556f57339456ca-36" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-36"></a> <span class="k">return</span> <span class="n">val0</span> <span class="ow">is</span> <span class="n">val1</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-37" name="rest_code_5a167cf0fa6e448499556f57339456ca-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-37"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-38" name="rest_code_5a167cf0fa6e448499556f57339456ca-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-38"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-39" name="rest_code_5a167cf0fa6e448499556f57339456ca-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-39"></a><span class="k">def</span> <span class="nf">find_prev_add_op</span><span class="p">(</span><span class="n">arg0</span><span class="p">:</span> <span class="n">Value</span><span class="p">,</span> <span class="n">arg1</span><span class="p">:</span> <span class="n">Value</span><span class="p">,</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-40" name="rest_code_5a167cf0fa6e448499556f57339456ca-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-40"></a> <span class="n">opt_bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Operation</span><span class="p">]:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-41" name="rest_code_5a167cf0fa6e448499556f57339456ca-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-41"></a> <span class="c1"># Really naive and quadratic implementation.</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-42" 
name="rest_code_5a167cf0fa6e448499556f57339456ca-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-42"></a> <span class="c1"># What we do is walk over the already emitted</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-43" name="rest_code_5a167cf0fa6e448499556f57339456ca-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-43"></a> <span class="c1"># operations and see whether we emitted an add</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-44" name="rest_code_5a167cf0fa6e448499556f57339456ca-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-44"></a> <span class="c1"># with the current arguments already. A real</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-45" name="rest_code_5a167cf0fa6e448499556f57339456ca-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-45"></a> <span class="c1"># implementation might use a hashmap of some</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-46" name="rest_code_5a167cf0fa6e448499556f57339456ca-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-46"></a> <span class="c1"># kind, or at least only look at a limited</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-47" name="rest_code_5a167cf0fa6e448499556f57339456ca-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-47"></a> <span class="c1"># window of instructions.</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-48" name="rest_code_5a167cf0fa6e448499556f57339456ca-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-48"></a> <span class="k">for</span> <span class="n">opt_op</span> <span class="ow">in</span> <span class="n">opt_bb</span><span 
class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-49" name="rest_code_5a167cf0fa6e448499556f57339456ca-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-49"></a> <span class="k">if</span> <span class="n">opt_op</span><span class="o">.</span><span class="n">name</span> <span class="o">!=</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-50" name="rest_code_5a167cf0fa6e448499556f57339456ca-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-50"></a> <span class="k">continue</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-51" name="rest_code_5a167cf0fa6e448499556f57339456ca-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-51"></a> <span class="c1"># It's important to call arg here,</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-52" name="rest_code_5a167cf0fa6e448499556f57339456ca-52" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-52"></a> <span class="c1"># for the same reason why we</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-53" name="rest_code_5a167cf0fa6e448499556f57339456ca-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-53"></a> <span class="c1"># needed it in constfold: we need to</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-54" name="rest_code_5a167cf0fa6e448499556f57339456ca-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-54"></a> <span class="c1"># make sure .find() is called</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-55" name="rest_code_5a167cf0fa6e448499556f57339456ca-55" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-55"></a> <span class="k">if</span> <span class="n">eq_value</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">opt_op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="ow">and</span> \ +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-56" name="rest_code_5a167cf0fa6e448499556f57339456ca-56" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-56"></a> <span class="n">eq_value</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">opt_op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)):</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-57" name="rest_code_5a167cf0fa6e448499556f57339456ca-57" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-57"></a> <span class="k">return</span> <span class="n">opt_op</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-58" name="rest_code_5a167cf0fa6e448499556f57339456ca-58" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-58"></a> <span class="k">return</span> <span class="kc">None</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-59" name="rest_code_5a167cf0fa6e448499556f57339456ca-59" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-59"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-60" name="rest_code_5a167cf0fa6e448499556f57339456ca-60" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-60"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-61" 
name="rest_code_5a167cf0fa6e448499556f57339456ca-61" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-61"></a><span class="k">def</span> <span class="nf">test_cse</span><span class="p">():</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-62" name="rest_code_5a167cf0fa6e448499556f57339456ca-62" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-62"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-63" name="rest_code_5a167cf0fa6e448499556f57339456ca-63" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-63"></a> <span class="n">a</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-64" name="rest_code_5a167cf0fa6e448499556f57339456ca-64" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-64"></a> <span class="n">b</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-65" name="rest_code_5a167cf0fa6e448499556f57339456ca-65" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-65"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-66" name="rest_code_5a167cf0fa6e448499556f57339456ca-66" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-66"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">mul</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-67" name="rest_code_5a167cf0fa6e448499556f57339456ca-67" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-67"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="mi">17</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-68" name="rest_code_5a167cf0fa6e448499556f57339456ca-68" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-68"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-69" name="rest_code_5a167cf0fa6e448499556f57339456ca-69" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-69"></a> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-70" name="rest_code_5a167cf0fa6e448499556f57339456ca-70" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-70"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">cse</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-71" 
name="rest_code_5a167cf0fa6e448499556f57339456ca-71" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-71"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-72" name="rest_code_5a167cf0fa6e448499556f57339456ca-72" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-72"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-73" name="rest_code_5a167cf0fa6e448499556f57339456ca-73" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-73"></a><span class="s2">optvar1 = getarg(1)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-74" name="rest_code_5a167cf0fa6e448499556f57339456ca-74" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-74"></a><span class="s2">optvar2 = add(optvar1, 17)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-75" name="rest_code_5a167cf0fa6e448499556f57339456ca-75" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-75"></a><span class="s2">optvar3 = mul(optvar0, optvar2)</span> +<a id="rest_code_5a167cf0fa6e448499556f57339456ca-76" name="rest_code_5a167cf0fa6e448499556f57339456ca-76" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_5a167cf0fa6e448499556f57339456ca-76"></a><span class="s2">optvar4 = add(optvar3, optvar2)"""</span> +</pre></div> +</section> +<section id="strength-reduction"> +<h2>Strength Reduction</h2> +<p>Now we have one pass that replaces <code class="docutils literal">Operations</code> with <code class="docutils 
literal">Constants</code> and one that +replaces <code class="docutils literal">Operations</code> with previously existing <code class="docutils literal">Operations</code>. Let's now do one +final pass that replaces <code class="docutils literal">Operations</code> by newly invented <code class="docutils literal">Operations</code>, a simple +<a class="reference external" href="https://en.wikipedia.org/wiki/Strength_reduction">strength reduction</a>. This one will be simple.</p> +<div class="code"><pre class="code python"><a id="rest_code_0f38ef580c61466493f9ead527062ee0-1" name="rest_code_0f38ef580c61466493f9ead527062ee0-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-1"></a><span class="k">def</span> <span class="nf">strength_reduce</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-2" name="rest_code_0f38ef580c61466493f9ead527062ee0-2" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-3" name="rest_code_0f38ef580c61466493f9ead527062ee0-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-3"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-4" name="rest_code_0f38ef580c61466493f9ead527062ee0-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-4"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span 
class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-5" name="rest_code_0f38ef580c61466493f9ead527062ee0-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-5"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-6" name="rest_code_0f38ef580c61466493f9ead527062ee0-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-6"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-7" name="rest_code_0f38ef580c61466493f9ead527062ee0-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-7"></a> <span class="k">if</span> <span class="n">arg0</span> <span class="ow">is</span> <span class="n">arg1</span><span class="p">:</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-8" name="rest_code_0f38ef580c61466493f9ead527062ee0-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-8"></a> <span class="c1"># x + x turns into x &lt;&lt; 1</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-9" name="rest_code_0f38ef580c61466493f9ead527062ee0-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-9"></a> <span class="n">newop</span> <span class="o">=</span> <span class="n">opt_bb</span><span class="o">.</span><span class="n">lshift</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span 
class="mi">1</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-10" name="rest_code_0f38ef580c61466493f9ead527062ee0-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-10"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">newop</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-11" name="rest_code_0f38ef580c61466493f9ead527062ee0-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-11"></a> <span class="k">continue</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-12" name="rest_code_0f38ef580c61466493f9ead527062ee0-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-12"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-13" name="rest_code_0f38ef580c61466493f9ead527062ee0-13" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-13"></a> <span class="k">return</span> <span class="n">opt_bb</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-14" name="rest_code_0f38ef580c61466493f9ead527062ee0-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-14"></a> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-15" name="rest_code_0f38ef580c61466493f9ead527062ee0-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-15"></a><span class="k">def</span> <span class="nf">test_strength_reduce</span><span class="p">():</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-16" name="rest_code_0f38ef580c61466493f9ead527062ee0-16" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-16"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-17" name="rest_code_0f38ef580c61466493f9ead527062ee0-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-17"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-18" name="rest_code_0f38ef580c61466493f9ead527062ee0-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-18"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-19" name="rest_code_0f38ef580c61466493f9ead527062ee0-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-19"></a> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-20" name="rest_code_0f38ef580c61466493f9ead527062ee0-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-20"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">strength_reduce</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-21" name="rest_code_0f38ef580c61466493f9ead527062ee0-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-21"></a> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-22" 
name="rest_code_0f38ef580c61466493f9ead527062ee0-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-22"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-23" name="rest_code_0f38ef580c61466493f9ead527062ee0-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-23"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_0f38ef580c61466493f9ead527062ee0-24" name="rest_code_0f38ef580c61466493f9ead527062ee0-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_0f38ef580c61466493f9ead527062ee0-24"></a><span class="s2">optvar1 = lshift(optvar0, 1)"""</span> +</pre></div> +</section> +<section id="putting-things-together"> +<h2>Putting Things Together</h2> +<p>Let's combine the passes into one single pass, so that we go over all +the operations exactly once, instead of having to look at every operation +once per pass.</p> +<div class="code"><pre class="code python"><a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-1" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-1" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-1"></a><span class="k">def</span> <span class="nf">optimize</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-2" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-2"
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-2"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-3" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-3" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-3"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-4" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-4" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-4"></a> <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-5" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-5" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-5"></a> <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"add"</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-6" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-6" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-6"></a> <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-7" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-7" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-7"></a> <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span 
class="mi">1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-8" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-8" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-8"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-9" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-9" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-9"></a> <span class="c1"># constant folding</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-10" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-10" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-10"></a> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> \ +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-11" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-11" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-11"></a> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-12" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-12" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-12"></a> <span class="n">value</span> <span class="o">=</span> <span class="n">arg0</span><span class="o">.</span><span class="n">value</span> <span class="o">+</span> <span class="n">arg1</span><span class="o">.</span><span class="n">value</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-13" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-13" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-13"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">value</span><span class="p">))</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-14" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-14" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-14"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-15" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-15" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-15"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-16" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-16" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-16"></a> <span class="c1"># cse</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-17" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-17" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-17"></a> <span class="n">prev_op</span> <span class="o">=</span> <span class="n">find_prev_add_op</span><span class="p">(</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-18" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-18" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-18"></a> <span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">,</span> <span class="n">opt_bb</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-19" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-19" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-19"></a> <span 
class="k">if</span> <span class="n">prev_op</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-20" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-20" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-20"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">prev_op</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-21" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-21" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-21"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-22" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-22" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-22"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-23" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-23" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-23"></a> <span class="c1"># strength reduce:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-24" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-24" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-24"></a> <span class="c1"># x + x turns into x &lt;&lt; 1</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-25" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-25" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-25"></a> <span class="k">if</span> <span class="n">arg0</span> <span class="ow">is</span> <span class="n">arg1</span><span class="p">:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-26" 
name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-26" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-26"></a> <span class="n">newop</span> <span class="o">=</span> <span class="n">opt_bb</span><span class="o">.</span><span class="n">lshift</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-27" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-27" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-27"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">newop</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-28" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-28" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-28"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-29" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-29" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-29"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-30" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-30" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-30"></a> <span class="c1"># and while we are at it, let's do some</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-31" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-31" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-31"></a> <span class="c1"># arithmetic simplification:</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-32" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-32" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-32"></a> <span class="c1"># a + 0 =&gt; a</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-33" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-33" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-33"></a> <span class="k">if</span> <span class="n">eq_value</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)):</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-34" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-34" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-34"></a> <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">arg1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-35" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-35" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-35"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-36" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-36" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-36"></a> <span class="k">if</span> <span class="n">eq_value</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)):</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-37" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-37" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-37"></a> <span class="n">op</span><span class="o">.</span><span 
class="n">make_equal_to</span><span class="p">(</span><span class="n">arg0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-38" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-38" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-38"></a> <span class="k">continue</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-39" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-39" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-39"></a> <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-40" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-40" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-40"></a> <span class="k">return</span> <span class="n">opt_bb</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-41" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-41" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-41"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-42" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-42" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-42"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-43" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-43" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-43"></a><span class="k">def</span> <span class="nf">test_single_pass</span><span class="p">():</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-44" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-44" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-44"></a> <span 
class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-45" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-45" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-45"></a> <span class="c1"># constant folding</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-46" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-46" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-46"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-47" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-47" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-47"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-48" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-48" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-48"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-49" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-49" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-49"></a> <span class="n">var3</span> <span 
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-50" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-50" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-50"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-51" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-51" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-51"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-52" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-52" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-52"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-53" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-53" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-53"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-54" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-54" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-54"></a><span class="s2">optvar1 = add(19, optvar0)"""</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-55" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-55" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-55"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-56" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-56" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-56"></a> <span class="c1"># cse + strength reduction</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-57" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-57" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-57"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-58" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-58" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-58"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-59" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-59" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-59"></a> <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-60" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-60" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-60"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span 
class="n">var1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-61" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-61" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-61"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> <span class="c1"># the same as var2</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-62" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-62" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-62"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-63" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-63" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-63"></a> <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var3</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span> <span class="c1"># the same as var4</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-64" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-64" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-64"></a> <span class="n">var6</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var4</span><span class="p">,</span> <span 
class="n">var5</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-65" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-65" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-65"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-66" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-66" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-66"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-67" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-67" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-67"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-68" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-68" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-68"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-69" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-69" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-69"></a><span class="s2">optvar1 = getarg(1)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-70" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-70" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-70"></a><span class="s2">optvar2 = add(optvar0, optvar1)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-71" 
name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-71" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-71"></a><span class="s2">optvar3 = add(optvar2, 2)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-72" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-72" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-72"></a><span class="s2">optvar4 = lshift(optvar3, 1)"""</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-73" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-73" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-73"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-74" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-74" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-74"></a> <span class="c1"># removing + 0</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-75" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-75" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-75"></a> <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-76" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-76" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-76"></a> <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-77" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-77" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-77"></a> <span class="n">var1</span> <span class="o">=</span> <span 
class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">16</span><span class="p">,</span> <span class="o">-</span><span class="mi">16</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-78" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-78" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-78"></a> <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-79" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-79" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-79"></a> <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">var2</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-80" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-80" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-80"></a> <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-81" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-81" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-81"></a> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-82" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-82" 
href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-82"></a> <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">optimize</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-83" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-83" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-83"></a> <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-84" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-84" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-84"></a><span class="s2">optvar0 = getarg(0)</span> +<a id="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-85" name="rest_code_bfff4cedf6c14a58be32aea404b8c6e5-85" href="https://www.pypy.org/posts/2022/07/toy-optimizer.html#rest_code_bfff4cedf6c14a58be32aea404b8c6e5-85"></a><span class="s2">optvar1 = lshift(optvar0, 1)"""</span> +</pre></div> +</section> +<section id="conclusion"> +<h2>Conclusion</h2> +<p>That's it for now. Why is this architecture cool? From a software engineering +point of view, sticking everything into a single function like in <code class="docutils literal">optimize</code> +above is obviously not great, and if you wanted to do this for real you would +try to split the cases into different functions that are individually +digestible, or even use a DSL that makes the pattern matching much more +readable. 
But the advantage of the architecture is that it's quite efficient, +it makes it possible to pack a lot of good optimizations into a single pass +over a basic block.</p> +<p>Of course this works even better if you are in a tracing context, where +everything is put into a trace, which is basically one incredibly long basic +block. In a JIT context it's also quite important that the +optimizer itself runs quickly.</p> +<p>Various other optimizations are possible in this model. There is a +<a class="reference external" href="https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html">follow-up post</a> that shows how to implement what is arguably PyPy's <a class="reference external" href="https://www.pypy.org/posts/2010/09/escape-analysis-in-pypys-jit-1780048403046080197.html">most +important optimization</a>.</p> +</section> +<section id="some-further-pointers"> +<h2>Some Further Pointers</h2> +<p>This post is only a short introduction and is taking some shortcuts, so I wanted to +also give some (non-exhaustive) pointers to more general literature about the +touched topics.</p> +<p>The approach to CSE described here can usually be seen as <a class="reference external" href="https://en.wikipedia.org/wiki/Value_numbering">value +numbering</a>, it's normally really implemented with a hashmap though. Here's a +<a class="reference external" href="https://www.cs.tufts.edu/~nr/cs257/archive/keith-cooper/value-numbering.pdf">paper</a> that describes various styles of implementing that, even beyond a +single basic block. The paper also partly takes the perspective of discovering +equivalence classes of operations that compute the same result.</p> +<p>A technique that leans even more fully into finding equivalences between +operations is using e-graphs and then applying <a class="reference external" href="https://en.wikipedia.org/wiki/E-graph#Equality_saturation">equality saturation</a> (this is +significantly more advanced than what I described here though). 
A cool modern +project that applies this technique is <a class="reference external" href="https://egraphs-good.github.io/">egg</a>.</p> +<p>If you squint a bit, you can generally view a constant folding pass as a very +simple form of <a class="reference external" href="https://en.wikipedia.org/wiki/Partial_evaluation">Partial Evaluation</a>: every operation that has constant +arguments is constant-folded away, and the remaining ones are "residualized", +i.e. put into the output program. This point of view is not super important for +the current post, but will become important in the next one.</p> +<p><strong>Acknowledgements:</strong> Thanks to <a class="reference external" href="https://thorstenball.com/">Thorsten Ball</a> for <a class="reference external" href="https://twitter.com/cfbolz/status/1547231548017106944">getting me</a> to write +this and for his enthusiastic feedback. I also got great feedback from <a class="reference external" href="https://bernsteinbear.com/">Max +Bernstein</a>, Matti Picus and Per Vognsen. A conversation with <a class="reference external" href="https://pengwu.substack.com/">Peng Wu</a> that +we had many many years ago and that stuck with me made me keep thinking about +various ways to view compiler optimizations.</p> +</section>toy-optimizerhttps://www.pypy.org/posts/2022/07/toy-optimizer.htmlTue, 19 Jul 2022 12:00:00 GMT \ No newline at end of file diff --git a/categories/unicode.html b/categories/unicode.html new file mode 100644 index 000000000..3d84a88e4 --- /dev/null +++ b/categories/unicode.html @@ -0,0 +1,114 @@ + + + + + +Posts about unicode | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/unicode.xml b/categories/unicode.xml new file mode 100644 index 000000000..fbdaa5523 --- /dev/null +++ b/categories/unicode.xml @@ -0,0 +1,115 @@ + +PyPy (Posts about unicode)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:11 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rss(Cape of) Good Hope for PyPyhttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlAntonio Cuni<div> +<br></div> +Hello from the other side of the world (for most of you)!<br> +<br> +With the excuse of coming to <a class="reference external" href="https://za.pycon.org/">PyCon ZA</a> during the last two weeks Armin, +Ronan, Antonio and sometimes Maciek had a very nice and productive sprint in +Cape Town, as pictures show :). We would like to say a big thank you to +Kiwi.com, which sponsored part of the travel costs via its awesome <a class="reference external" href="https://www.kiwi.com/sourcelift/">Sourcelift</a> +program to help Open Source projects.<br> +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: right; margin-left: 1em; text-align: right;"><tbody> +<tr><td style="text-align: center;"><a href="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s1600/DSC_2418.JPG" style="margin-left: auto; margin-right: auto;"><img border="0" height="225" src="https://3.bp.blogspot.com/-9YVNucPN1wE/WeaWmTUFB-I/AAAAAAAABMQ/HeVMqS-ya2IYJuk0iZZODlULqpKaf5XcgCLcBGAs/s400/DSC_2418.JPG" width="400"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">Armin, Anto and Ronan at Cape Point</td></tr> +</tbody></table> +<br> +Armin, Ronan and Anto spent most of the time hacking at cpyext, our CPython +C-API compatibility layer: during the last years, the focus was to make it +working and compatible with 
CPython, in order to run existing libraries such +as numpy and pandas. However, we never paid too much attention to performance, +so the net result is that with the latest released version of PyPy, C +extensions generally work but their speed ranges from "slow" to "horribly +slow".<br> +<br> +For example, these very simple <a class="reference external" href="https://github.com/antocuni/cpyext-benchmarks">microbenchmarks</a> measure the speed of +calling (empty) C functions, i.e. the time you spend to "cross the border" +between RPython and C. <i>(Note: this includes the time spent doing the loop in regular Python code.)</i> These are the results on CPython, on PyPy 5.8, and on +our newest in-progress version:<br> +<br> +<pre class="literal-block">$ python bench.py # CPython +noargs : 0.41 secs +onearg(None): 0.44 secs +onearg(i) : 0.44 secs +varargs : 0.58 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy-5.8 bench.py # PyPy 5.8 +noargs : 1.01 secs +onearg(None): 1.31 secs +onearg(i) : 2.57 secs +varargs : 2.79 secs +</pre> +<div> +<br></div> +<pre class="literal-block">$ pypy bench.py # cpyext-refactor-methodobject branch +noargs : 0.17 secs +onearg(None): 0.21 secs +onearg(i) : 0.22 secs +varargs : 0.47 secs +</pre> +<div> +<br></div> +<pre class="literal-block"></pre> +<pre class="literal-block"></pre> +So yes: before the sprint, we were ~2-6x slower than CPython. Now, we are +<strong>faster</strong> than it! 
+To reach this result, we made various improvements, such as: +<br> +<blockquote> +<ol class="arabic simple"> +<li>teach the JIT how to look (a bit) inside the cpyext module;</li> +<li>write specialized code for calling <tt class="docutils literal">METH_NOARGS</tt>, <tt class="docutils literal">METH_O</tt> and +<tt class="docutils literal">METH_VARARGS</tt> functions; previously, we always used a very general and +slow logic;</li> +<li>implement freelists to allocate the cpyext versions of <tt class="docutils literal">int</tt> and +<tt class="docutils literal">tuple</tt> objects, as CPython does;</li> +<li>the <a class="reference external" href="https://foss.heptapod.net/pypy/pypy/-/merge_requests/573">cpyext-avoid-roundtrip</a> branch: crossing the RPython/C border is +slowish, but the real problem was (and still is in many cases) that we often +cross it many times for no good reason. So, depending on the actual API +call, you might end up in the C land, which calls back into the RPython +land, which goes to C, etc. etc. (ad libitum).</li> +</ol> +</blockquote> +The branch tries to fix such nonsense: so far, we fixed only some cases, which +are enough to speed up the benchmarks shown above. But most importantly, we +now have a clear path and an actual plan to improve cpyext more and +more. Ideally, we would like to reach a point in which cpyext-intensive +programs run at worst at the same speed as CPython.<br> +<br> +The other big topic of the sprint was Armin and Maciej doing a lot of work on the +<a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/branch/unicode-utf8">unicode-utf8</a> branch: the goal of the branch is to always use UTF-8 as the +internal representation of unicode strings. 
The advantages are various: +<br> +<blockquote> +<ul class="simple"> +<li>decoding a UTF-8 stream is super fast, as you just need to check that the +stream is valid;</li> +<li>encoding to UTF-8 is almost a no-op;</li> +<li>UTF-8 is always a more compact representation than the currently +used UCS-4. It's also almost always more compact than the CPython 3.5 latin1/UCS2/UCS4 combo;</li> +<li>smaller representation means everything becomes quite a bit faster due to lower cache pressure.</li> +</ul> +</blockquote> +Before you ask: yes, this branch contains special logic to ensure that random +access of single unicode chars is still O(1), as it is on both CPython and the +current PyPy.<br> +We also plan to improve the speed of decoding even more by using modern processor features, like SSE and AVX. Preliminary results show that decoding can be done 100x faster than the current setup. +<br> +<br> +In summary, this was a long and profitable sprint, in which we achieved lots +of interesting results. However, what we liked even more was the privilege of +doing <a class="reference external" href="https://bitbucket.org/pypy/pypy/commits/a4307fb5912e">commits</a> from awesome places such as the top of Table Mountain:<br> +<br> +<blockquote class="twitter-tweet"> +<div dir="ltr" lang="en"> +Our sprint venue today <a href="https://twitter.com/hashtag/pypy?src=hash&amp;ref_src=twsrc%5Etfw">#pypy</a> <a href="https://t.co/o38IfTYmAV">pic.twitter.com/o38IfTYmAV</a></div> +— Ronan Lamy (@ronanlamy) <a href="https://twitter.com/ronanlamy/status/915575026107240449?ref_src=twsrc%5Etfw">4 ottobre 2017</a></blockquote> + + +<br> +<table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="float: left; margin-right: 1em; text-align: left;"><tbody> +<tr><td style="text-align: center;"><a href="https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" style="margin-left: auto; margin-right: auto;"><img border="0" 
height="360" src="https://bytebucket.org/pypy/extradoc/raw/extradoc/sprintinfo/cape-town-2017/2017-10-04-155524.jpg" width="640"></a></td></tr> +<tr><td class="tr-caption" style="text-align: center;">The panorama we looked at instead of staring at cpyext code</td></tr> +</tbody></table>cpyextprofilingspeedsprintunicodehttps://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.htmlWed, 18 Oct 2017 13:31:00 GMT \ No newline at end of file diff --git a/categories/valgrind.html b/categories/valgrind.html new file mode 100644 index 000000000..643155789 --- /dev/null +++ b/categories/valgrind.html @@ -0,0 +1,114 @@ + + + + + +Posts about valgrind | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/valgrind.xml b/categories/valgrind.xml new file mode 100644 index 000000000..7e85237f6 --- /dev/null +++ b/categories/valgrind.xml @@ -0,0 +1,11 @@ + +PyPy (Posts about valgrind)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:13 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssProfiling for fun with valgrindhttps://www.pypy.org/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.htmlMaciej Fijalkowski<p>Recently I've been doing a lot of profiling on the PyPy executables to find speed bottlenecks. <a href="https://en.wikipedia.org/wiki/Valgrind">Valgrind</a> (the original <a href="https://valgrind.org/">page</a> seems to be down) is an extremely nice tool for doing this. It has several built-in tools that give you different types of profiles. The callgrind mode provides you with a lot of information including relative call costs. The cachegrind tool gives you less information, but what it gives you (e.g. cache misses) is much more accurate. The obvious choice would be to have a way to combine the results of two profiling runs to have both. In the last days I wrote a script that does this. It's available <a href="https://codespeak.net/svn/user/fijal/pygrind">at my user's svn</a> and has a pretty intuitive command line interface. The combining calculation are not perfect yet, total costs of functions can still be a bit bogus (they can sum up to whatever) but at least the relative figures are good. This means that we can stop looking at two different types of graphs now. 
+ +An awesome tool for analyzing the profile data is <a href="https://kcachegrind.sourceforge.net/cgi-bin/show.cgi">kcachegrind.</a> + +<a href="https://4.bp.blogspot.com/_5R1EBmwBBTs/R2JjKRYuTTI/AAAAAAAAAAM/LX5ktu_FcIE/s1600-h/kcachegrind.png"><img alt="" border="0" id="BLOGGER_PHOTO_ID_5143782752527469874" src="https://4.bp.blogspot.com/_5R1EBmwBBTs/R2JjKRYuTTI/AAAAAAAAAAM/LX5ktu_FcIE/s320/kcachegrind.png" style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer;"></a> + +Which also proves that my 12'' display is to small at least for some things :-). + + +<b>Update:</b> pygrind is available under the MIT license.</p>kcachegrindprofilingvalgrindhttps://www.pypy.org/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.htmlFri, 14 Dec 2007 11:02:00 GMT \ No newline at end of file diff --git a/categories/z3.html b/categories/z3.html new file mode 100644 index 000000000..e3184a572 --- /dev/null +++ b/categories/z3.html @@ -0,0 +1,120 @@ + + + + + +Posts about z3 | PyPy + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/categories/z3.xml b/categories/z3.xml new file mode 100644 index 000000000..349e2aca0 --- /dev/null +++ b/categories/z3.xml @@ -0,0 +1,2344 @@ + +PyPy (Posts about z3)https://www.pypy.org/enContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:14 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssA Knownbits Abstract Domain for the Toy Optimizer, Correctlyhttps://www.pypy.org/posts/2024/08/toy-knownbits.htmlCF Bolz-Tereick<p>After <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/">Max' introduction to abstract interpretation for the toy optimizer</a> in the +last post, I want to present a more complicated abstract domain in this post. +This abstract domain reasons about the individual bits of a variable in a trace. +Every bit can be either "known zero", "known one" or "unknown". The abstract +domain is useful for optimizing integer operations, particularly the bitwise operations. +The abstract domain follows quite closely the <a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">tristate abstract domain of the +eBPF verifier in the Linux +Kernel</a>, as +described by the paper +<a href="https://arxiv.org/abs/2105.05398">Sound, Precise, and Fast Abstract Interpretation with Tristate +Numbers</a> by Harishankar Vishwanathan, Matan +Shachnai, Srinivas Narayana, and Santosh Nagarakatte.</p> +<p>The presentation in this post will still be in the context of the +<a href="https://www.pypy.org/categories/toy-optimizer">toy optimizer</a>. We'll spend a significant part of +the post convincing ourselves that the abstract domain transfer functions that +we're writing are really correct, using both property-based testing and +automated proofs (again using Z3).</p> +<p>PyPy has implemented and merged a more complicated version of the same abstract +domain for the "real" PyPy JIT. 
A more thorough explanation of that real world +implementation will follow.</p> +<p>I'd like to thank Max Bernstein and Armin Rigo for lots of great feedback on +drafts of this post. The PyPy implementation was mainly done by Nico +Rittinghaus and me.</p> +<p><strong>Contents:</strong></p> +<div class="toc"> +<ul> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#motivation">Motivation</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#the-knownbits-abstract-domain">The Knownbits Abstract Domain</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#transfer-functions">Transfer Functions</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#property-based-tests-with-hypothesis">Property-based Tests with Hypothesis</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#when-are-transfer-functions-correct-how-do-we-test-them">When are Transfer Functions Correct? How do we test them?</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#implementing-binary-transfer-functions">Implementing Binary Transfer Functions</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#addition-and-subtraction">Addition and Subtraction</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#proving-correctness-of-the-transfer-functions-with-z3">Proving correctness of the transfer functions with Z3</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#cases-where-this-style-of-z3-proof-doesnt-work">Cases where this style of Z3 proof doesn't work</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#making-statements-about-precision">Making Statements about Precision</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#using-the-abstract-domain-in-the-toy-optimizer-for-generalized-constant-folding">Using the Abstract Domain in the Toy Optimizer 
for Generalized Constant Folding</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#using-the-knownbits-domain-for-conditional-peephole-rewrites">Using the KnownBits Domain for Conditional Peephole Rewrites</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#conclusion">Conclusion</a></li> +</ul> +</div> +<h3 id="motivation">Motivation</h3> +<p>In many programs that do bit-manipulation of integers, some of the bits of the +integer variables of the program can be statically known. Here's a simple +example:</p> +<div class="code"><pre class="code literal-block"><span class="nv">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nv">a</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="mi">1</span> +... +<span class="k">if</span><span class="w"> </span><span class="nv">x</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mi">1</span>: +<span class="w"> </span>... +<span class="k">else</span>: +<span class="w"> </span>... +</pre></div> + +<p>After the assignment <code>x = a | 1</code>, we know that the lowest bit of <code>x</code> must be <code>1</code> +(the other bits are unknown) and an optimizer could remove the condition <code>x &amp; 1</code> by +constant-folding it to <code>1</code>.</p> +<p>Another (more complicated) example is:</p> +<div class="code"><pre class="code literal-block">assert i &amp; 0b111 == 0 # check that i is a multiple of 8 +j = i + 16 +assert j &amp; 0b111 == 0 +</pre></div> + +<p>This kind of code could e.g. happen in a <a href="https://docs.pydrofoil.org/en/latest/">CPU +emulator</a>, where <code>i</code> and <code>j</code> are +integers that represent emulated pointers, and the <code>assert</code>s are alignment +checks. The first assert implies that the lowest three bits of i must be <code>0</code>. 
+Adding 16 to such a number produces a result where the lowest three bits are +again all <code>0</code>, therefore the second assert is always true. So we would like a +compiler to remove the second assert.</p> +<p>Both of these optimizations are doable with the help of the knownbits +abstract domain that we'll discuss in the rest of the post.</p> +<h3 id="the-knownbits-abstract-domain">The Knownbits Abstract Domain</h3> +<p>An abstract value of the knownbits domain needs to be able to store, for every +bit of an integer variable in a program, whether it is known 0, known 1, or +unknown. To represent +three different states, we need 2 bits, which we will call <code>one</code> and <code>unknown</code>. +Here's the encoding:</p> +<table> +<thead> +<tr> +<th>one</th> +<th>unknown</th> +<th align="right">knownbit</th> +</tr> +</thead> +<tbody> +<tr> +<td>0</td> +<td>0</td> +<td align="right">0</td> +</tr> +<tr> +<td>1</td> +<td>0</td> +<td align="right">1</td> +</tr> +<tr> +<td>0</td> +<td>1</td> +<td align="right">?</td> +</tr> +<tr> +<td>1</td> +<td>1</td> +<td align="right">illegal</td> +</tr> +</tbody> +</table> +<p>The <code>unknown</code> bit is set if we don't know the value of the bit ("?"), the <code>one</code> +bit is set if the bit is known to be a <code>1</code>. Since two bits are enough to encode +four different states, but we only need three, the combination of a set <code>one</code> +bit and a set <code>unknown</code> is not allowed.</p> +<p>We don't just want to encode a single bit, however. Instead, we want to do this +for all the bits of an integer variable. 
Therefore the instances of the abstract +domain get two integer fields <code>ones</code> and <code>unknowns</code>, where each pair of +corresponding bits encodes the knowledge about the corresponding bit of the +integer variable in the program.</p> +<p>We can start implementing a Python class that works like this:</p> +<div class="code"><pre class="code literal-block"><span class="kn">from</span> <span class="nn">dataclasses</span> <span class="kn">import</span> <span class="n">dataclass</span> + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="n">ones</span> <span class="p">:</span> <span class="nb">int</span> + <span class="n">unknowns</span> <span class="p">:</span> <span class="nb">int</span> + + <span class="k">def</span> <span class="nf">__post_init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> + <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_well_formed</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">is_well_formed</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="c1"># a bit cannot be both 1 and unknown</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="mi">0</span> + + <span class="nd">@staticmethod</span> + <span 
class="k">def</span> <span class="nf">from_constant</span><span class="p">(</span><span class="n">const</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Construct a KnownBits corresponding to a constant, where all bits</span> +<span class="sd"> are known."""</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">const</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">is_constant</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Check if the KnownBits instance represents a constant. """</span> + <span class="c1"># it's a constant if there are no unknowns</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="mi">0</span> +</pre></div> + +<p>We can also add some convenience properties. Sometimes it is easier to work +with an integer where all the <em>known</em> bits are set, or one where the positions +of all the known zeros have a set bit:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">knowns</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" return an integer where the known bits are set. 
"""</span> + <span class="c1"># the knowns are just the unknowns, inverted</span> + <span class="k">return</span> <span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">zeros</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" return an integer where the places that are known zeros have a bit</span> +<span class="sd"> set. """</span> + <span class="c1"># it's a 0 if it is known, but not 1</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span> +</pre></div> + +<p>Also, for debugging and for writing tests we want a way to print the known bits +in a human-readable form, and also to have a way to construct a <code>KnownBits</code> +instance from a string. 
It's not important to understand the details of +<code>__str__</code> or <code>from_str</code> for the rest of the post, so I'm putting them into a fold:</p> +<details> +<summary><code>KnownBits</code> from and to string conversions</summary> + + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="k">return</span> <span class="sa">f</span><span class="s2">"KnownBits.from_constant(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="si">}</span><span class="s2">)"</span> + <span class="k">return</span> <span class="sa">f</span><span class="s2">"KnownBits(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="si">}</span><span class="s2">)"</span> + + <span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> + <span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> + <span class="c1"># construct the string representation right to left</span> + <span class="k">while</span> <span 
class="mi">1</span><span class="p">:</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">ones</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">unknowns</span><span class="p">:</span> + <span class="k">break</span> <span class="c1"># we leave off the leading known 0s</span> + <span class="k">if</span> <span class="n">ones</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">unknowns</span><span class="p">:</span> + <span class="c1"># -1 has all bits set in two's complement, so the leading</span> + <span class="c1"># bits are all 1</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'1'</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"..."</span><span class="p">)</span> + <span class="k">break</span> + <span class="k">if</span> <span class="n">unknowns</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span><span class="p">:</span> + <span class="c1"># -1 has all bits set in two's complement, so the leading bits</span> + <span class="c1"># are all ?</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">ones</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"?"</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"..."</span><span class="p">)</span> + <span class="k">break</span> + <span class="k">if</span> <span class="n">unknowns</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span 
class="p">(</span><span class="s1">'?'</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">ones</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'1'</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'0'</span><span class="p">)</span> + <span class="n">ones</span> <span class="o">&gt;&gt;=</span> <span class="mi">1</span> + <span class="n">unknowns</span> <span class="o">&gt;&gt;=</span> <span class="mi">1</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">res</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'0'</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">reverse</span><span class="p">()</span> + <span class="k">return</span> <span class="s2">""</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> + + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">from_str</span><span class="p">(</span><span class="n">s</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Construct a KnownBits instance that from a string. String can start</span> +<span class="sd"> with ...1 to mean that all higher bits are 1, or ...? to mean that all</span> +<span class="sd"> higher bits are unknown. Otherwise it is assumed that the higher bits</span> +<span class="sd"> are all 0. 
"""</span> + <span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">s</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"...?"</span><span class="p">):</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">4</span> + <span class="k">elif</span> <span class="n">s</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"...1"</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">4</span> + <span class="k">for</span> <span class="n">index</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">startindex</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">s</span><span class="p">)):</span> + <span class="n">ones</span> <span class="o">&lt;&lt;=</span> <span class="mi">1</span> + <span class="n">unknowns</span> <span class="o">&lt;&lt;=</span> <span class="mi">1</span> + <span class="n">c</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">index</span><span class="p">]</span> + <span class="k">if</span> <span class="n">c</span> <span class="o">==</span> <span class="s1">'1'</span><span class="p">:</span> + <span class="n">ones</span> <span class="o">|=</span> <span class="mi">1</span> + <span class="k">elif</span> <span class="n">c</span> <span class="o">==</span> <span 
class="s1">'?'</span><span class="p">:</span> + <span class="n">unknowns</span> <span class="o">|=</span> <span class="mi">1</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">all_unknown</span><span class="p">():</span> +<span class="w"> </span><span class="sd">""" convenience constructor for the "all bits unknown" abstract value</span> +<span class="sd"> """</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s2">"...?"</span><span class="p">)</span> +</pre></div> + + + +</details> + +<p>And here's a <a href="https://pytest.org">pytest</a>-style unit test for <code>str</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_str</span><span class="p">():</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'0'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">5</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'101'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mb">0b10</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'1?1'</span> + <span 
class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="o">~</span><span class="mb">0b1111</span><span class="p">,</span> <span class="mb">0b10</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'...100?0'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">~</span><span class="mb">0b1</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'...?1'</span> +</pre></div> + +<p>An instance of <code>KnownBits</code> represents a set of integers, namely those that match +the known bits stored in the instance. We can write a method <code>contains</code> that +takes a concrete <code>int</code> value and returns <code>True</code> if the value matches the +pattern of the known bits:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">contains</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Check whether the KnownBits instance contains the concrete integer</span> +<span class="sd"> `value`. """</span> + <span class="c1"># check whether value matches the bit pattern. 
in the places where we</span> + <span class="c1"># know the bits, the value must agree with ones.</span> + <span class="k">return</span> <span class="n">value</span> <span class="o">&amp;</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> +</pre></div> + +<p>and a test:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_contains</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'1?1'</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b111</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b101</span><span class="p">)</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b110</span><span class="p">)</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b011</span><span class="p">)</span> + + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?1'</span><span class="p">)</span> <span class="c1"># all odd numbers</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span 
class="o">-</span><span class="mi">101</span><span class="p">,</span> <span class="mi">100</span><span class="p">):</span> + <span class="k">assert</span> <span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="o">==</span> <span class="p">(</span><span class="n">i</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">)</span> +</pre></div> + +<h3 id="transfer-functions">Transfer Functions</h3> +<p>Now that we have implemented the basics of the <code>KnownBits</code> class, we need to +start implementing the transfer functions. They are for computing what we know +about the <em>results</em> of an operation, given the knowledge we have about the bits +of the arguments.</p> +<p>We'll start with a simple unary operation, <code>invert(x)</code> (which is <code>~x</code> in Python +and C syntax), which flips all the bits of an integer. If we know some bits of +the arguments, we can compute the corresponding bits of the result. 
The unknown +bits remain unknown.</p> +<p>Here's the code:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_invert</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="c1"># self.zeros has bits set where the known 0s are in self</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">zeros</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> +</pre></div> + +<p>And a unit-test:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_invert</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="o">==</span> <span class="s1">'...10?10?10?'</span> + + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="k">assert</span> <span class="nb">str</span><span 
class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="o">==</span> <span class="s1">'...?'</span> +</pre></div> + +<p>Before we continue with further transfer functions, we'll think about +correctness of the transfer functions and build up some test infrastructure. To +test transfer functions, it's quite important to move beyond simple example-style +unit tests. The state-space for more complicated binary transfer functions is +extremely large and it's too easy to do something wrong in a corner case. +Therefore we'll look at property-based tests for <code>KnownBits</code> next.</p> +<h3 id="property-based-tests-with-hypothesis">Property-based Tests with Hypothesis</h3> +<p>We want to do property-based tests of <code>KnownBits</code>, to try to +make it less likely that we'll get a corner-case in the implementation wrong. +We'll use <a href="https://hypothesis.readthedocs.io/en/latest/">Hypothesis</a> for that.</p> +<p>I can't give a decent introduction to Hypothesis here, but want to give a few +hints about the API. Hypothesis is a way to run unit tests with randomly +generated input. It provides <em>strategies</em> to describe the data that the test +functions expect. Hypothesis provides primitive strategies (for things like +integers, strings, floats, etc) and ways to build composite strategies out of +the primitive ones.</p> +<p>To be able to write the tests, we need to generate random <code>KnownBits</code> instances, +and we also want an <code>int</code> instance that is a member of the <code>KnownBits</code> instance. +We generate tuples of <code>(KnownBits, int)</code> together, to ensure this property. +We'll ask Hypothesis to generate us a random concrete <code>int</code> as the concrete +value, and then we'll also generate a second random <code>int</code> to use as the +<code>unknowns</code> mask (i.e. which bits of the concrete int we don't know in the +<code>KnownBits</code> instance). 
Here's a function that takes two such ints and builds the +tuple:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">build_knownbits_and_contained_number</span><span class="p">(</span><span class="n">concrete_value</span> <span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">unknowns</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> + <span class="c1"># to construct a valid KnownBits instance, we need to mask off the unknown</span> + <span class="c1"># bits</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">concrete_value</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">),</span> <span class="n">concrete_value</span> +</pre></div> + +<p>We can turn this function into a hypothesis strategy to generate input data +using the <code>strategies.builds</code> function:</p> +<div class="code"><pre class="code literal-block"><span class="kn">from</span> <span class="nn">hypothesis</span> <span class="kn">import</span> <span class="n">strategies</span><span class="p">,</span> <span class="n">given</span><span class="p">,</span> <span class="n">settings</span> + +<span class="n">ints</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">integers</span><span class="p">()</span> + +<span class="n">random_knownbits_and_contained_number</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">builds</span><span class="p">(</span> + <span class="n">build_knownbits_and_contained_number</span><span class="p">,</span> + <span class="n">ints</span><span class="p">,</span> <span class="n">ints</span> +<span 
class="p">)</span> +</pre></div> + +<p>One important special case of <code>KnownBits</code> are the constants, which contain only +a single concrete value. We'll also generate some of those specifically, and +then combine the <code>random_knownbits_and_contained_number</code> strategy with it:</p> +<div class="code"><pre class="code literal-block"><span class="n">constant_knownbits</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">builds</span><span class="p">(</span> + <span class="k">lambda</span> <span class="n">value</span><span class="p">:</span> <span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="n">value</span><span class="p">),</span> <span class="n">value</span><span class="p">),</span> + <span class="n">ints</span> +<span class="p">)</span> + +<span class="n">knownbits_and_contained_number</span> <span class="o">=</span> <span class="n">constant_knownbits</span> <span class="o">|</span> <span class="n">random_knownbits_and_contained_number</span> +</pre></div> + +<p>Now we can write the first property-based test, for the <code>KnownBits.contains</code> +method:</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_contains</span><span class="p">(</span><span class="n">t</span><span class="p">):</span> + <span class="n">k</span><span class="p">,</span> <span class="n">n</span> <span class="o">=</span> <span class="n">t</span> + <span class="k">assert</span> <span class="n">k</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> +</pre></div> + +<p>The <code>@given</code> decorator is used to tell Hypothesis which strategy to use to +generate 
random data for the test function. Hypothesis will run the test with a +number of random examples (100 by default). If it finds an error, it will try to +minimize the example needed that demonstrates the problem, to try to make it +easier to understand what is going wrong. It also saves all failing cases into +an example database and tries them again on subsequent runs.</p> +<p>This test is as much a check for whether we got the strategies right as it is +for the logic in <code>KnownBits.contains</code>. Here's an example output of random +concrete and abstract values that we are getting here:</p> +<div class="code"><pre class="code literal-block"><span class="mf">110000011001101</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">???</span><span class="mf">1</span> +<span class="mf">...1011011</span><span class="w"> </span><span class="mf">...1011011</span> +<span class="mf">...1001101110101000010010011111011</span><span class="w"> </span><span class="mf">...1001101110101000010010011111011</span> +<span class="mf">...1001101110101000010010011111011</span><span class="w"> </span><span class="mf">...100110111010100001</span><span class="err">?</span><span class="mf">010</span><span class="err">?</span><span class="mf">1</span><span class="err">??</span><span class="mf">1</span><span class="err">??</span><span class="mf">11</span> +<span class="mf">1000001101111101001011010011111101000011000111011001011111101</span><span class="w"> </span><span class="mf">1000001101111101001011010011111101000011000111011001011111101</span> +<span class="mf">1000001101111101001011010011111101000011000111011001011111101</span><span class="w"> </span><span class="mf">1000001101111101001011010011111101000011000111</span><span class="err">????</span><span class="mf">01</span><span class="err">?</span><span class="mf">11</span><span class="err">?????</span><span class="mf">1</span> +<span 
class="mf">1111100000010</span><span class="w"> </span><span class="mf">1111100000010</span> +<span class="mf">1111100000010</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">11111</span><span class="err">?</span><span class="mf">00000</span><span class="err">??</span> +<span class="mf">110110</span><span class="w"> </span><span class="mf">110110</span> +<span class="mf">110110</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">00</span><span class="err">????</span><span class="mf">11</span><span class="err">??</span><span class="mf">10</span> +<span class="mf">110110</span><span class="w"> </span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0</span> +<span class="mf">...100010111011111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">100</span><span class="err">?</span><span class="mf">10111</span><span class="err">??</span><span class="mf">111</span><span class="err">?</span> +<span class="mf">...1000100000110001</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">00000</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span> +<span class="mf">110000001110</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0000000</span><span class="err">?</span><span class="mf">00</span><span class="err">???</span><span class="mf">0000</span><span class="err">?????</span><span 
class="mf">00</span><span class="err">???</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">01</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span><span class="mf">1</span><span class="err">??</span> +<span class="mf">110000001110</span><span class="w"> </span><span class="err">??</span><span class="mf">000000</span><span class="err">???</span><span class="mf">0</span> +<span class="mf">1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000</span><span class="w"> </span><span class="mf">1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000</span> +<span class="mf">...1011010010010100</span><span class="w"> </span><span class="mf">...1011010010010100</span> +<span class="mf">...1011111110110011</span><span class="w"> </span><span class="mf">...1011111110110011</span> +<span class="mf">101000011110110</span><span class="w"> </span><span class="mf">101000011</span><span class="err">?</span><span class="mf">10</span><span class="err">?</span><span class="mf">1</span><span class="err">?</span> +<span class="mf">100101</span><span class="w"> </span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span> +</pre></div> + +<p>That looks suitably random, but we might want to bias our random numbers a +little bit towards common error values like small constants, powers of two, etc. 
+Like this:</p> +<div class="code"><pre class="code literal-block"><span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> +<span class="c1"># some small integers</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">))</span> +<span class="c1"># powers of two</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">2</span><span class="p">))</span> +<span class="c1"># powers of two - 1</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">((</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="n">i</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">2</span><span class="p">))</span> +<span class="c1"># negative versions of what we have so far</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="o">-</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">ints_special</span><span 
class="p">)</span> +<span class="c1"># bit-flipped versions of what we have so far</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="o">~</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">ints_special</span><span class="p">)</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">ints_special</span><span class="p">)</span> +<span class="c1"># sort them (because hypothesis simplifies towards earlier elements in the list)</span> +<span class="n">ints_special</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">element</span><span class="p">:</span> <span class="p">(</span><span class="nb">abs</span><span class="p">(</span><span class="n">element</span><span class="p">),</span> <span class="n">element</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">))</span> + +<span class="n">ints</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">sampled_from</span><span class="p">(</span><span class="n">ints_special</span><span class="p">)</span> <span class="o">|</span> <span class="n">strategies</span><span class="o">.</span><span class="n">integers</span><span class="p">()</span> +</pre></div> + +<p>Now we get data like this:</p> +<div class="code"><pre class="code literal-block"><span class="mf">1110</span><span class="w"> </span><span class="mf">1110</span> +<span class="mf">...10000000000000000001</span><span class="w"> </span><span class="mf">...10000</span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0000</span><span 
class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">1</span> +<span class="mf">1</span><span class="w"> </span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0000</span><span class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">1</span> +<span class="mf">1</span><span class="w"> </span><span class="err">?</span> +<span class="mf">...10101100</span><span class="w"> </span><span class="mf">...10101100</span> +<span class="mf">110000000011001010111011111111111111011110010001001100110001011</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">101</span><span class="err">?</span> +<span class="mf">110000000011001010111011111111111111011110010001001100110001011</span><span class="w"> </span><span class="err">??</span><span class="mf">00000000</span><span class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0</span><span class="err">???</span><span class="mf">0</span><span class="err">??????????????</span><span class="mf">0</span><span class="err">????</span><span class="mf">00</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">00</span><span class="err">??</span><span class="mf">00</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span> +<span class="mf">...1011111111111111111111111111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">11</span><span class="err">?</span><span class="mf">11</span><span class="err">??</span> +<span class="mf">...1011111111111111111111111111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span 
class="err">??????????????????????????</span> +<span class="mf">0</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">??????????????????????????</span> +<span class="mf">101101</span><span class="w"> </span><span class="mf">101101</span> +<span class="mf">111111111111111111111111111111111111111111111</span><span class="w"> </span><span class="mf">111111111111111111111111111111111111111111111</span> +<span class="mf">10111</span><span class="w"> </span><span class="mf">10111</span> +<span class="mf">...101100</span><span class="w"> </span><span class="mf">...1</span><span class="err">?</span><span class="mf">111011</span><span class="err">?</span><span class="mf">0</span> +<span class="mf">101000</span><span class="w"> </span><span class="err">?</span><span class="mf">001010</span><span class="err">?</span><span class="mf">0</span> +<span class="mf">101000</span><span class="w"> </span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">000</span> +<span class="mf">110010</span><span class="w"> </span><span class="mf">110010</span> +<span class="mf">...100111</span><span class="w"> </span><span class="mf">...100111</span> +<span class="mf">1111011010010</span><span class="w"> </span><span class="mf">1111011010010</span> +<span class="mf">...1000000000000000000000000000000000000</span><span class="w"> </span><span class="mf">...1000000000000000000000000000000000000</span> +</pre></div> + +<p>We can also write a test that checks that the somewhat tricky logic in +<code>__str__</code> and <code>from_str</code> is correct, by making sure that the two functions +round-trip (ie converting a <code>KnownBits</code> to a string and then back to a +<code>KnownBits</code> instance produces the same abstract value).</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span 
class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_str_roundtrips</span><span class="p">(</span><span class="n">t1</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">s</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="n">k2</span><span class="o">.</span><span class="n">ones</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span> +</pre></div> + +<p>Now let's actually apply this infrastructure to test <code>abstract_invert</code>.</p> +<h3 id="when-are-transfer-functions-correct-how-do-we-test-them">When are Transfer Functions Correct? How do we test them?</h3> +<p>Abstract values, i.e. instances of <code>KnownBits</code> represent <em>sets</em> of concrete +values. We want the transfer functions to compute <em>overapproximations</em> of the +concrete values. So if we have an arbitrary abstract value <code>k</code>, with a concrete +number <code>n</code> that is a member of the abstract values (i.e. +<code>k.contains(n) == True</code>) then the result of the concrete operation <code>op(n)</code> +<strong>must</strong> be a member of the result of the abstract operation <code>k.abstract_op()</code> +(i.e. 
<code>k.abstract_op().contains(op(n)) == True</code>).</p> +<p>Checking the correctness/overapproximation property is a good match for +hypothesis. Here's what the test for <code>abstract_invert</code> looks like:</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_invert</span><span class="p">(</span><span class="n">t1</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">n2</span> <span class="o">=</span> <span class="o">~</span><span class="n">n1</span> <span class="c1"># compute the real result</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> <span class="c1"># compute the abstract result</span> + <span class="k">assert</span> <span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span> <span class="c1"># the abstract result must contain the real result</span> +</pre></div> + +<p>This is the <em>only</em> condition needed for <code>abstract_invert</code> to be correct. If +<code>abstract_invert</code> fulfils this property for every combination of abstract and +concrete value then <code>abstract_invert</code> is correct. Note however, that this test +does not actually check whether <code>abstract_invert</code> gives us precise results. A +correct (but imprecise) implementation of <code>abstract_invert</code> would simply return +a completely unknown result, regardless of what is known about the input +<code>KnownBits</code>.</p> +<p>The "proper" CS term for this notion of correctness is <em>soundness</em>. 
The +correctness condition on the transfer functions is called a <em>Galois +connection</em>. I won't go into any mathematical/technical details here, but +wanted to at least mention the terms. I found <a href="https://web.njit.edu/~mjk76/">Martin +Kellogg</a>'s +<a href="https://web.njit.edu/~mjk76/teaching/cs684-sp24/assets/lecture-12.pdf#34">slides</a> +to be quite an approachable introduction to the Galois connection and how to +show soundness.</p> +<h3 id="implementing-binary-transfer-functions">Implementing Binary Transfer Functions</h3> +<p>Now we have infrastructure in place for testing transfer functions with random +inputs. With that we can start thinking about the more complicated case, that of +binary operations. Let's start with the simpler ones, <code>and</code> and <code>or</code>. For <code>and</code>, +we can know a <code>0</code> bit in the result if either of the input bits are known <code>0</code>; +or we can know a <code>1</code> bit in the result if both input bits are known <code>1</code>. +Otherwise the resulting bit is unknown. Let's look at all the combinations:</p> +<div class="code"><pre class="code literal-block">and +input1: 000111??? +input2: 01?01?01? +result: 00001?0?? 
+</pre></div> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_and</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="c1"># known ones</span> + <span class="n">knowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">ones</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="o">~</span><span class="n">knowns</span><span class="p">)</span> +</pre></div> + +<p>Here's an example unit-test and a property-based test for <code>and</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_and</span><span class="p">():</span> + <span class="c1"># test all combinations of 0, 1, ? 
in one example</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'000111???'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="c1"># should be: 0...00001?0??</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"1?0??"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_and</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span 
class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>To implement <code>or</code> is pretty similar. The result is known <code>1</code> where either of the +inputs is <code>1</code>. The result is known <code>0</code> where both inputs are known <code>0</code>, and <code>?</code> +otherwise.</p> +<div class="code"><pre class="code literal-block">or +input1: 000111??? +input2: 01?01?01? +result: 01?111?1? +</pre></div> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_or</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">zeros</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">zeros</span> + <span class="n">knowns</span> <span class="o">=</span> <span class="n">ones</span> <span class="o">|</span> <span class="n">zeros</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="o">~</span><span class="n">knowns</span><span class="p">)</span> +</pre></div> + +<p>Here's an example unit-test and a property-based test for <code>or</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_or</span><span class="p">():</span> + <span class="n">k1</span> <span 
class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'000111???'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="c1"># should be: 0...01?111?1?</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"1?111?1?"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_or</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">|</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span 
class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>Implementing support for <code>abstract_xor</code> is relatively simple, and left as an +exercise :-).</p> +<h3 id="addition-and-subtraction">Addition and Subtraction</h3> +<p><code>invert</code>, <code>and</code>, and <code>or</code> are relatively simple transfer functions to write, +because they compose over the individual bits of the integers. The arithmetic +functions <code>add</code> and <code>sub</code> are significantly harder, because of carries and +borrows. Coming up with the formulas for them and gaining an intuitive +understanding is quite tricky and involves carefully going through a few +examples with pen and paper. When implementing this in PyPy, Nico and I didn't +come up with the implementation ourselves, but instead took them from the +<a href="https://arxiv.org/abs/2105.05398">Tristate Numbers</a> paper. Here's the code, +with example tests and hypothesis tests:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">sum_ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">+</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">sum_unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">+</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span> + <span class="n">all_carries</span> <span class="o">=</span> <span class="n">sum_ones</span> <span class="o">+</span> <span 
class="n">sum_unknowns</span> + <span class="n">ones_carries</span> <span class="o">=</span> <span class="n">all_carries</span> <span class="o">^</span> <span class="n">sum_ones</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">ones_carries</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">sum_ones</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">abstract_sub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">diff_ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">-</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">val_borrows</span> <span class="o">=</span> <span class="p">(</span><span class="n">diff_ones</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> <span class="o">^</span> <span class="p">(</span><span class="n">diff_ones</span> <span class="o">-</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">other</span><span 
class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">val_borrows</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">diff_ones</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + +<span class="k">def</span> <span class="nf">test_add</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0???111000'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"?????01?10"</span> + +<span class="k">def</span> <span class="nf">test_sub</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0???111000'</span><span class="p">)</span> + <span class="n">res</span> 
<span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"...?11?10"</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span> <span class="s1">'...1?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...10000???111000'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"111?????11?10"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_add</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span 
class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_sub</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">-</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>Now we are in a pretty good situation, and have implemented abstract versions +for a bunch of important arithmetic and binary functions. What's also surprising +is that the implementation of all of the transfer functions is quite efficient. 
+We didn't have to write loops over the individual bits at all, instead we found +closed form expressions using primitive operations on the underlying integers +<code>ones</code> and <code>unknowns</code>. This means that computing the results of abstract +operations is quite efficient, which is important when using the abstract domain +in the context of a JIT compiler.</p> +<h3 id="proving-correctness-of-the-transfer-functions-with-z3">Proving correctness of the transfer functions with Z3</h3> +<p>As one can probably tell from my recent posts, I've been thinking about +compiler correctness a lot. Getting the transfer functions absolutely +correct is really crucial, because a bug in them would lead to miscompilation of +Python code when the abstract domain is added to the JIT. While the randomized +tests are great, it's still entirely possible for them to miss bugs. The state +space for the arguments of a binary transfer function is <code>3**64 * 3**64</code>, and if +only a small part of that contains wrong behaviour it would be really unlikely +for us to find it with random tests by chance. Therefore I was reluctant to +merge the PyPy branch that contained the new abstract domain for a long time.</p> +<p>To increase our confidence in the correctness of the transfer functions further, +we can use Z3 to <em>prove</em> their correctness, which gives us much stronger +guarantees (not 100%, obviously). 
In this subsection I will show how to do that.</p> +<p>Here's an attempt to do this manually in the Python repl:</p> +<div class="code"><pre class="code literal-block"><span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="kn">import</span><span class="w"> </span><span class="nn">z3</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">solver</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># like last blog post, proof by failing to find counterexamples</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="k">def</span><span class="w"> </span><span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">):</span><span class="w"> </span><span class="k">assert</span><span class="w"> </span><span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">unsat</span> +<span class="o">&gt;&gt;&gt;&gt;</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's set up a z3 bitvector variable for an arbitrary concrete value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'concrete_value'</span><span class="p">,</span><span class="w"> </span><span 
class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n1</span> +<span class="n">concrete_value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># due to operator overloading we can manipulate z3 formulas</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">~</span><span class="n">n1</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n2</span> +<span class="o">~</span><span class="n">concrete_value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># now z3 bitvector variables for the ones and unknowns fields</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">ones</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'abstract_ones'</span><span class="p">,</span><span class="w"> </span><span class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">unknowns</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'abstract_unknowns'</span><span class="p">,</span><span class="w"> </span><span class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># we construct a KnownBits instance with the z3 variables</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k1</span><span class="w"> </span><span 
class="o">=</span><span class="w"> </span><span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span><span class="w"> </span><span class="n">unknowns</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># due to operator overloading we can call the methods on k1:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">ones</span> +<span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_ones</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span> +<span class="n">abstract_unknowns</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># here's the correctness condition that we want to prove:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span> +<span class="o">~</span><span class="n">concrete_value</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">==</span> +<span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_ones</span> 
+<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's try</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">prove</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">))</span> +<span class="n">Traceback</span><span class="w"> </span><span class="p">(</span><span class="n">most</span><span class="w"> </span><span class="n">recent</span><span class="w"> </span><span class="n">call</span><span class="w"> </span><span class="n">last</span><span class="p">):</span> +<span class="w"> </span><span class="n">File</span><span class="w"> </span><span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="ow">in</span><span class="w"> </span><span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span> +<span class="w"> </span><span class="n">File</span><span class="w"> </span><span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="ow">in</span><span class="w"> </span><span class="n">prove</span> +<span class="n">AssertionError</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># it doesn't work! 
let's look at the counterexample to see why:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> +<span class="p">[</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span> +<span class="w"> </span><span class="n">abstract_ones</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span> +<span class="w"> </span><span class="n">concrete_value</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="p">]</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># we can build a KnownBits instance with the values in the</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># counterexample:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="o">~</span><span class="mi">1</span><span class="w"> </span><span class="c1"># concrete result</span> +<span class="o">-</span><span class="mi">2</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k1</span> +<span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span><span 
class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">counter_example_k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span> +<span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's check the failing condition</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="o">~</span><span class="mi">1</span><span class="p">)</span> +<span class="kc">False</span> +</pre></div> + +<p>What is the problem here? We didn't tell Z3 that <code>n1</code> was supposed to be a +member of <code>k1</code>. We can add this as a precondition to the solver, and then the +proof works:</p> +<div class="code"><pre class="code literal-block">&gt;&gt;&gt;&gt; solver.add(k1.contains(n1)) +&gt;&gt;&gt;&gt; prove(k2.contains(n2)) # works! +</pre></div> + +<p>This is super cool! It's really a proof about the actual implementation, because +we call the implementation methods directly, and due to the operator overloading +that Z3 does we can be sure that we are actually checking a formula that +corresponds to the Python code. This eliminates one source of errors in formal +methods.</p> +<p>Doing the proof manually on the Python REPL is kind of annoying though, and we +also would like to make sure that the proofs are re-done when we change the +code. What we would really like to do is to write the proofs as unit tests that +we can run while developing and in CI. 
Doing this is possible, and the unit +tests that really perform proofs look pleasingly similar to the +Hypothesis-based ones.</p> +<p>First we need to set up a bit of infrastructure:</p> +<div class="code"><pre class="code literal-block"><span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> + +<span class="k">def</span> <span class="nf">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">BitVecVal</span><span class="p">(</span><span class="n">val</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">z3_setup_variables</span><span class="p">():</span> + <span class="c1"># instantiate a solver</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + + <span class="c1"># a Z3 variable for the first concrete value</span> + <span class="n">n1</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n1"</span><span class="p">)</span> + <span class="c1"># a KnownBits instance that uses Z3 variables as its ones and unknowns,</span> + <span class="c1"># representing the first abstract value</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">BitVec</span><span class="p">(</span><span 
class="s2">"n1_ones"</span><span class="p">),</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n1_unkowns"</span><span class="p">))</span> + <span class="c1"># add the precondition to the solver that the concrete value n1 must be a</span> + <span class="c1"># member of the abstract value k1</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n1</span><span class="p">))</span> + + <span class="c1"># a Z3 variable for the second concrete value</span> + <span class="n">n2</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2"</span><span class="p">)</span> + <span class="c1"># a KnownBits instance for the second abstract value</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2_ones"</span><span class="p">),</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2_unkowns"</span><span class="p">))</span> + <span class="c1"># add the precondition linking n2 and k2 to the solver</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">))</span> + <span class="k">return</span> <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> + +<span class="k">def</span> <span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span 
class="n">solver</span><span class="p">):</span> + <span class="n">z3res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> + <span class="k">if</span> <span class="n">z3res</span> <span class="o">!=</span> <span class="n">z3</span><span class="o">.</span><span class="n">unsat</span><span class="p">:</span> + <span class="k">assert</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span> <span class="c1"># can't be timeout, we set no timeout</span> + <span class="c1"># make the model with the counterexample global, to make inspecting the</span> + <span class="c1"># bug easier when running pytest --pdb</span> + <span class="k">global</span> <span class="n">model</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"n1=</span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">n1</span><span class="p">)</span><span class="si">}</span><span class="s2">, n2=</span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="n">counter_example_k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span 
class="n">k1</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> + <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="n">counter_example_k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> + <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"k1=</span><span class="si">{</span><span class="n">counter_example_k1</span><span class="si">}</span><span class="s2">, k2=</span><span class="si">{</span><span class="n">counter_example_k2</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"but </span><span class="si">{</span><span class="n">cond</span><span class="si">=}</span><span class="s2"> evaluates to </span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span><span class="si">}</span><span 
class="s2">"</span><span class="p">)</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">())</span> +</pre></div> + +<p>And then we can write proof-unit-tests like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_z3_abstract_invert</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="n">n2</span> <span class="o">=</span> <span class="o">~</span><span class="n">n1</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_and</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span 
class="o">=</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_or</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">|</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_add</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span 
class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_sub</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">-</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>It's possible to write a bit more Python-metaprogramming-magic and unify the +Hypothesis and Z3 tests into the same test definition.<sup id="fnref:proof_bitwidths"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fn:proof_bitwidths">1</a></sup></p> +<h3 id="cases-where-this-style-of-z3-proof-doesnt-work">Cases where this style of Z3 proof doesn't work</h3> +<p>Unfortunately the approach described in the previous section only works for a +very small number of cases. 
It breaks down as soon as the <code>KnownBits</code> methods +that we're calling contain any <code>if</code> conditions (including hidden ones like +the short-circuiting <code>and</code> and <code>or</code> in Python). Let's look at an example and +implement <code>abstract_eq</code>. <code>eq</code> is supposed to be an operation that compares two +integers and returns <code>0</code> or <code>1</code> if they are different or equal, respectively. +Implementing this in knownbits looks like this (with example and hypothesis +tests):</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_eq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># the result is a 0, 1, or ?</span> + + <span class="c1"># if they are both the same constant, they must be equal</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()</span> <span class="ow">and</span> <span class="n">other</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span><span class="p">:</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="c1"># check whether we have known disagreeing bits, then we know the result</span> + <span class="c1"># is 0</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span 
class="n">_disagrees</span><span class="p">(</span><span class="n">other</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># an unknown boolean</span> + + <span class="k">def</span> <span class="nf">_disagrees</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># check whether the bits disagree in any place where both are known</span> + <span class="n">both_known</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">knowns</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">both_known</span> <span class="o">!=</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">both_known</span> + +<span class="k">def</span> <span class="nf">test_eq</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="k">assert</span> <span 
class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'?'</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'1'</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">20</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'0'</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_eq</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span 
class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">n1</span> <span class="o">==</span> <span class="n">n2</span><span class="p">))</span> +</pre></div> + +<p>Trying to do the proof in the same style as before breaks:</p> +<div class="code"><pre class="code literal-block"><span class="o">&gt;&gt;&gt;&gt;</span> <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> +<span class="n">Traceback</span> <span class="p">(</span><span class="n">most</span> <span class="n">recent</span> <span class="n">call</span> <span class="n">last</span><span class="p">):</span> + <span class="n">File</span> <span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">1</span><span class="p">,</span> <span class="ow">in</span> <span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span> + <span class="n">File</span> <span class="s2">"knownbits.py"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">246</span><span class="p">,</span> <span class="ow">in</span> <span class="n">abstract_eq</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_disagrees</span><span class="p">(</span><span class="n">other</span><span class="p">):</span> + <span class="n">File</span> <span 
class="s2">"venv/site-packages/z3/z3.py"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">381</span><span class="p">,</span> <span class="ow">in</span> <span class="fm">__bool__</span> + <span class="k">raise</span> <span class="n">Z3Exception</span><span class="p">(</span><span class="s2">"Symbolic expressions cannot be cast to concrete Boolean values."</span><span class="p">)</span> +<span class="n">z3</span><span class="o">.</span><span class="n">z3types</span><span class="o">.</span><span class="n">Z3Exception</span><span class="p">:</span> <span class="n">Symbolic</span> <span class="n">expressions</span> <span class="n">cannot</span> <span class="n">be</span> <span class="n">cast</span> <span class="n">to</span> <span class="n">concrete</span> <span class="n">Boolean</span> <span class="n">values</span><span class="o">.</span> +</pre></div> + +<p>We cannot call <code>abstract_eq</code> on a <code>KnownBits</code> with Z3 variables as fields, +because once we hit an <code>if</code> statement, the whole approach of relying on the +operator overloading breaks down. 
Z3 doesn't actually parse the Python code or +anything advanced like that; rather, we build an expression only by running the +code and letting the Z3 formulas build up.</p> +<p>To still prove the correctness of <code>abstract_eq</code> we need to manually transform +the control flow logic of the function into a Z3 formula that uses the <code>z3.If</code> +expression, using a small helper function:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">z3_cond</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="n">trueval</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">falseval</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="n">BitVecVal</span><span class="p">(</span><span class="n">trueval</span><span class="p">),</span> <span class="n">BitVecVal</span><span class="p">(</span><span class="n">falseval</span><span class="p">))</span> + +<span class="k">def</span> <span class="nf">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">):</span> + <span class="c1"># follow the *logic* of abstract_eq, we can't call it due to the ifs in it</span> + <span class="n">case1cond</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k1</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span 
class="n">k2</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span> + <span class="n">case2cond</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">_disagrees</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + + <span class="c1"># ones is 1 in the first case, 0 otherwise</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">case1cond</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> + + <span class="c1"># in the first two cases, unknowns is 0, 1 otherwise</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Or</span><span class="p">(</span><span class="n">case1cond</span><span class="p">,</span> <span class="n">case2cond</span><span class="p">),</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_eq_logic</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">n1</span> <span class="o">==</span> <span class="n">n2</span><span class="p">)</span> <span 
class="c1"># concrete result</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>This proof works. It is a lot less satisfying than the previous ones though, +because we could have made an error in the manual transcription from Python code +to Z3 formulas (there are possibly more heavy-handed approaches where we do +this transformation more automatically using e.g. the <code>ast</code> module to analyze +the source code, but that's a much more complicated researchy project). To +lessen this problem somewhat we can factor out the parts of the logic that don't +have any conditions into small helper methods (like <code>_disagrees</code> in this +example) and use them in the manual conversion of the code to Z3 formulas.<sup id="fnref:tests_vs_proofs"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fn:tests_vs_proofs">2</a></sup></p> +<p>The final condition that Z3 checks, btw, is this one:</p> +<div class="code"><pre class="code literal-block">If(n1 == n2, 1, 0) &amp; +~If(Or(And(n1_unkowns == 0, + n2_unkowns == 0, + n1_ones == n2_ones), + n1_ones &amp; ~n1_unkowns &amp; ~n2_unkowns != + n2_ones &amp; ~n1_unkowns &amp; ~n2_unkowns), + 0, 1) == +If(And(n1_unkowns == 0, n2_unkowns == 0, n1_ones == n2_ones), + 1, 0) +</pre></div> + +<h3 id="making-statements-about-precision">Making Statements about Precision</h3> +<p>So far we have only used Z3 to prove statements about correctness, i.e. that +our abstract operations overapproximate what can happen with concrete values. 
+While proving this property is essential if we want to avoid miscompilation, +correctness alone is not a very strong constraint on the implementation of our +abstract transfer functions. We could simply return <code>KnownBits.unknowns()</code> for +every <code>abstract_*</code> method and the resulting overapproximation would be correct, +but useless in practice.</p> +<p>It's much harder to make statements about whether the transfer functions are +maximally precise. There are two aspects of precision I want to discuss in this +section, however.</p> +<p>The first aspect is that we would really like it if the transfer functions +compute the maximally precise results for singleton sets. If all abstract +arguments of an operation are constants, i.e. contain only a single concrete +element, then we know that the resulting set also has only a single element. We +can prove that all our transfer functions have this property:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_z3_prove_constant_folding</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span 
class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span 
class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>Proving with Z3 that the transfer functions are maximally precise for +non-constant arguments seems to be relatively hard. I tried a few completely +rigorous approaches and failed. 
The paper <a href="https://arxiv.org/pdf/2105.05398">Sound, Precise, and Fast Abstract +Interpretation with Tristate Numbers</a> +contains an optimality proof for the transfer functions of addition and +subtraction, so we can be certain that they are as precise as is +possible.</p> +<p>I still want to show an approach for trying to find concrete examples of +abstract values that are less precise than they could be, using a combination +of Hypothesis and Z3. The idea is to use Hypothesis to pick random abstract +values. Then we compute the abstract result using our transfer function. +Afterwards we can ask Z3 to find us an abstract result that is better than the +one our transfer function produced. If Z3 finds a better abstract result, we +have a concrete example of imprecision for our transfer function. Those tests +aren't strict proofs, because they rely on generating random abstract values, +but they can still be valuable (not for the transfer functions in this blog +post, which are all optimal).</p> +<p>Here is what the code looks like (this is a little bit of bonus content; I won't +explain the details and can only hope that the comments are somewhat helpful):</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">random_knownbits_and_contained_number</span><span class="p">,</span> <span class="n">random_knownbits_and_contained_number</span><span class="p">)</span> +<span class="nd">@settings</span><span class="p">(</span><span class="n">deadline</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_check_precision</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span 
class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="c1"># apply transfer function</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">example_res</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + + <span class="c1"># try to find a better version of k3 with Z3</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">solver</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">,</span> <span class="mi">8000</span><span class="p">)</span> + + <span class="n">var1</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'v1'</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'v2'</span><span class="p">)</span> + + <span class="n">ones</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'ones'</span><span class="p">)</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'unknowns'</span><span class="p">)</span> + <span class="n">better_k3</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + <span class="nb">print</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span 
class="n">k3</span><span class="p">)</span> + + <span class="c1"># we're trying to find an example for a better k3, so we use check, without</span> + <span class="c1"># negation:</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span> + <span class="c1"># better_k3 should be a valid knownbits instance</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">is_well_formed</span><span class="p">(),</span> + <span class="c1"># it should be better than k3, ie there are known bits in better_k3</span> + <span class="c1"># that we don't have in k3</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">k3</span><span class="o">.</span><span class="n">knowns</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">,</span> + <span class="c1"># now encode the correctness condition for better_k3 with a ForAll:</span> + <span class="c1"># for all concrete values var1 and var2, it must hold that if</span> + <span class="c1"># var1 is in k1 and var2 is in k2 it follows that var1 + var2 is in</span> + <span class="c1"># better_k3</span> + <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">var1</span><span class="p">,</span> <span class="n">var2</span><span class="p">],</span> + <span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span> + <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var1</span><span class="p">),</span> 
<span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var2</span><span class="p">)),</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var1</span> <span class="o">+</span> <span class="n">var2</span><span class="p">)))))</span> + <span class="c1"># if this query is satisfiable, we have found a better result for the</span> + <span class="c1"># abstract_add</span> + <span class="k">if</span> <span class="n">res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">rk3</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"better"</span><span class="p">,</span> <span class="n">rk3</span><span class="p">)</span> + <span class="k">assert</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unknown</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">)</span> +</pre></div> + 
+<p>It does not actually fail for <code>abstract_add</code> (nor the other abstract +functions). To see the test failing we can add some imprecision to the +implementation of <code>abstract_add</code> to see Hypothesis and Z3 find examples of +values that are not optimally precise (for example by setting some bits +of <code>unknowns</code> in the implementation of <code>abstract_add</code> unconditionally).</p> +<h3 id="using-the-abstract-domain-in-the-toy-optimizer-for-generalized-constant-folding">Using the Abstract Domain in the Toy Optimizer for Generalized Constant Folding</h3> +<p>Now after all this work we can finally actually use the knownbits abstract +domain in the toy optimizer. The code for this follows <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/">Max' intro post about +abstract interpretation</a> +quite closely.</p> +<p>For completeness' sake, in the fold there are the basic infrastructure classes +that make up the IR again (they are identical or at least extremely close to +the previous toy posts).</p> +<details> +<summary>toy infrastructure</summary> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> + + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> + <span class="n">name</span> <span class="p">:</span> <span class="nb">str</span> + <span class="n">args</span> <span class="p">:</span> <span 
class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> + + <span class="n">forwarded</span> <span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> + + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Value</span><span class="p">:</span> + <span class="n">op</span> <span class="o">=</span> <span class="bp">self</span> + <span class="k">while</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">):</span> + <span class="nb">next</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">forwarded</span> + <span class="k">if</span> <span class="nb">next</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> + <span class="k">return</span> <span class="n">op</span> + <span class="n">op</span> <span class="o">=</span> <span class="nb">next</span> + <span class="k">return</span> <span class="n">op</span> + + <span class="k">def</span> <span class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">make_equal_to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> 
+ <span class="bp">self</span><span class="o">.</span><span class="n">find</span><span class="p">()</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="n">value</span> + + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> + <span class="n">value</span> <span class="p">:</span> <span class="nb">object</span> + + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span> + + +<span class="k">class</span> <span class="nc">Block</span><span class="p">(</span><span class="nb">list</span><span class="p">):</span> + <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">opname</span><span class="p">):</span> + <span class="k">def</span> <span class="nf">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">):</span> + <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Value</span><span class="p">):</span> + <span class="n">arg</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> + <span class="k">return</span> <span class="n">arg</span> + <span class="k">def</span> <span class="nf">make_op</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">):</span> + <span class="n">op</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span 
class="n">opname</span><span class="p">,</span> + <span class="p">[</span><span class="n">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span> + <span class="bp">self</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">op</span> + <span class="k">return</span> <span class="n">make_op</span> + + +<span class="k">def</span> <span class="nf">bb_to_str</span><span class="p">(</span><span class="n">l</span> <span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="n">varprefix</span> <span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"var"</span><span class="p">):</span> + <span class="k">def</span> <span class="nf">arg_to_str</span><span class="p">(</span><span class="n">arg</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="nb">str</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> <span class="n">varnames</span><span class="p">[</span><span class="n">arg</span><span class="p">]</span> + + <span class="n">varnames</span> <span class="o">=</span> <span class="p">{}</span> + <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> + <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span 
class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">l</span><span class="p">):</span> + <span class="c1"># give the operation a name used while</span> + <span class="c1"># printing:</span> + <span class="n">var</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">varprefix</span><span class="si">}{</span><span class="n">index</span><span class="si">}</span><span class="s2">"</span> + <span class="n">varnames</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">var</span> + <span class="n">arguments</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> + <span class="n">arg_to_str</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">))</span> + <span class="p">)</span> + <span class="n">strop</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">var</span><span class="si">}</span><span class="s2"> = </span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">arguments</span><span class="si">}</span><span class="s2">)"</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">strop</span><span class="p">)</span> + <span 
class="k">return</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +</pre></div> + + + +</details> + +<p>Now we can write some first tests, the first one simply checking constant +folding:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_constfold_two_ops</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span 
class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_add(19, optvar0)"""</span> +</pre></div> + +<p>Calling the transfer functions on constant <code>KnownBits</code> produces constant +results, as we have seen. Therefore "regular" constant folding should hopefully +be achieved by optimizing with the <code>KnownBits</code> abstract domain too.</p> +<p>The next two tests are slightly more complicated and can't be optimized by +regular constant-folding. They follow the motivating examples from the start of +this blog post, a hundred years ago:</p> +<div class="code"><pre class="code literal-block"><span class="n">def</span><span class="w"> </span><span class="n">test_constfold_via_knownbits</span><span class="p">():</span> +<span class="w"> </span><span class="n">bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Block</span><span class="p">()</span> +<span class="w"> </span><span class="n">var0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="n">var1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_or</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span> +<span class="w"> </span><span class="n">var2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span> +<span class="w"> </span><span 
class="n">var3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> + +<span class="w"> </span><span class="n">opt_bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<span class="w"> </span><span class="nb">assert</span><span class="w"> </span><span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span><span class="w"> </span><span class="s2">"optvar"</span><span class="p">)</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_or(optvar0, 1)</span> +<span class="s2">optvar2 = dummy(1)"""</span> + +<span class="n">def</span><span class="w"> </span><span class="n">test_constfold_alignment_check</span><span class="p">():</span> +<span class="w"> </span><span class="n">bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Block</span><span class="p">()</span> +<span class="w"> </span><span class="n">var0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="n">var1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="mi">0</span><span class="n">b111</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># mask off the lowest three bits, thus var2 is 
aligned</span> +<span class="w"> </span><span class="n">var2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span><span class="w"> </span><span class="n">var1</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># add 16 to aligned quantity</span> +<span class="w"> </span><span class="n">var3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span><span class="w"> </span><span class="mi">16</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># check alignment of result</span> +<span class="w"> </span><span class="n">var4</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var3</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="n">b111</span><span class="p">)</span> +<span class="w"> </span><span class="n">var5</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_eq</span><span class="p">(</span><span class="n">var4</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># var5 should be const-folded to 1</span> +<span class="w"> </span><span class="n">var6</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var5</span><span class="p">)</span> + +<span class="w"> 
</span><span class="n">opt_bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<span class="w"> </span><span class="nb">assert</span><span class="w"> </span><span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span><span class="w"> </span><span class="s2">"optvar"</span><span class="p">)</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_and(optvar0, -8)</span> +<span class="s2">optvar2 = int_add(optvar1, 16)</span> +<span class="s2">optvar3 = dummy(1)"""</span> +</pre></div> + +<p>Here is <code>simplify</code> to make these tests pass:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">unknown_transfer_functions</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">all_unknown</span><span class="p">()</span> + + +<span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">abstract_values</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># dict mapping Operation to KnownBits</span> + + <span class="k">def</span> <span class="nf">knownbits_of</span><span class="p">(</span><span class="n">val</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span 
class="n">val</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="n">val</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">val</span><span class="p">]</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> + <span class="c1"># apply the transfer function on the abstract arguments</span> + <span class="n">name_without_prefix</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="o">.</span><span class="n">removeprefix</span><span class="p">(</span><span class="s2">"int_"</span><span class="p">)</span> + <span class="n">method_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"abstract_</span><span class="si">{</span><span class="n">name_without_prefix</span><span class="si">}</span><span class="s2">"</span> + <span class="n">transfer_function</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">,</span> <span class="n">method_name</span><span class="p">,</span> <span class="n">unknown_transfer_functions</span><span class="p">)</span> + <span class="n">abstract_args</span> <span class="o">=</span> <span class="p">[</span><span class="n">knownbits_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span 
class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">abstract_res</span> <span class="o">=</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer_function</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">)</span> + <span class="c1"># if the result is a constant, we optimize the operation away and make</span> + <span class="c1"># it equal to the constant result</span> + <span class="k">if</span> <span class="n">abstract_res</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">abstract_res</span><span class="o">.</span><span class="n">ones</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># otherwise emit the op</span> + <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> + +<p>The code follows the approach from the previous blog post very closely. The +only difference is that we apply the transfer function <em>first</em>, to be able to +detect whether the abstract domain can tell us that the result has to always be +a constant. This code makes all three tests pass.</p> +<h3 id="using-the-knownbits-domain-for-conditional-peephole-rewrites">Using the <code>KnownBits</code> Domain for Conditional Peephole Rewrites</h3> +<p>So far we are only using the <code>KnownBits</code> domain to find out that certain +operations have to produce a constant. 
We can also use the <code>KnownBits</code> domain +to check whether certain operation rewrites are correct. Let's use one of the +examples from the <a href="https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html">Mining JIT traces for missing optimizations with +Z3</a> +post, where Z3 found the inefficiency <code>(x &lt;&lt; 4) &amp; -0xf == x &lt;&lt; 4</code> in PyPy JIT +traces. We don't have shift operations, but we want to generalize this optimization +anyway. The general form of this rewrite is that under some circumstances <code>x &amp; +y == x</code>, and we can use the <code>KnownBits</code> domain to detect situations where this +must be true.</p> +<p>To understand <em>when</em> <code>x &amp; y == x</code> is true, we can think about individual pairs of +bits <code>a</code> and <code>b</code>. If <code>a == 0</code>, then <code>a &amp; b == 0 &amp; b == 0 == a</code>. If <code>b == 1</code> +then <code>a &amp; b == a &amp; 1 == a</code>. So if either <code>a == 0</code> or <code>b == 1</code> is true, +<code>a &amp; b == a</code> follows. 
And if either of these conditions is true for <em>all</em> the +bits of <code>x</code> and <code>y</code>, we can know that <code>x &amp; y == x</code>.</p> +<p>We can write a method on <code>KnownBits</code> to check for this condition:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">is_and_identity</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Return True if n1 &amp; n2 == n1 for any n1 in self and n2 in other.</span> +<span class="sd"> (or, equivalently, return True if n1 | n2 == n2)"""</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span> +</pre></div> + +<p>Since my reasoning about this feels ripe for errors, let's check that our +understanding is correct with Z3:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_prove_is_and_identity</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_and_identity</span><span class="p">(</span><span 
class="n">k2</span><span class="p">),</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> <span class="o">==</span> <span class="n">n1</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>Now let's use this in the toy optimizer. Here are two tests for this rewrite:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_remove_redundant_and</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># mask off the lowest four bits</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> + <span class="c1"># applying the same mask again is redundant</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var3</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span 
class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_and(optvar0, -16)</span> +<span class="s2">optvar2 = dummy(optvar1)"""</span> + +<span class="k">def</span> <span class="nf">test_remove_redundant_and_more_complex</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="c1"># var2 has bit pattern ????</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># var3 has bit pattern ...?1111</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_or</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># var4 is just var2</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span 
class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> + <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var4</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = getarg(1)</span> +<span class="s2">optvar2 = int_and(optvar0, 15)</span> +<span class="s2">optvar3 = int_or(optvar1, 15)</span> +<span class="s2">optvar4 = dummy(optvar2)"""</span> +</pre></div> + +<p>The first test could also be made to pass by implementing a reassociation +optimization that turns <code>(x &amp; c1) &amp; c2</code> into <code>x &amp; (c1 &amp; c2)</code> and then constant-folds the second <code>and</code>. But here we want to +use <code>KnownBits</code> and conditionally rewrite <code>int_and</code> to its first argument. 
So to make the tests pass, +we can change <code>simplify</code> like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">abstract_values</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># dict mapping Operation to KnownBits</span> + + <span class="k">def</span> <span class="nf">knownbits_of</span><span class="p">(</span><span class="n">val</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="o">...</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> + <span class="c1"># apply the transfer function on the abstract arguments</span> + <span class="n">name_without_prefix</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="o">.</span><span class="n">removeprefix</span><span class="p">(</span><span class="s2">"int_"</span><span class="p">)</span> + <span class="n">method_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"abstract_</span><span class="si">{</span><span class="n">name_without_prefix</span><span class="si">}</span><span class="s2">"</span> + <span class="n">transfer_function</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">,</span> <span class="n">method_name</span><span class="p">,</span> <span class="n">unknown_transfer_functions</span><span class="p">)</span> + <span class="n">abstract_args</span> <span 
class="o">=</span> <span class="p">[</span><span class="n">knownbits_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">abstract_res</span> <span class="o">=</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer_function</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">)</span> + <span class="c1"># if the result is a constant, we optimize the operation away and make</span> + <span class="c1"># it equal to the constant result</span> + <span class="k">if</span> <span class="n">abstract_res</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">abstract_res</span><span class="o">.</span><span class="n">ones</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># &lt;&lt;&lt;&lt; new code</span> + <span class="c1"># conditionally rewrite int_and(x, y) to x</span> + <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">k2</span> <span class="o">=</span> <span class="n">abstract_args</span> + <span class="k">if</span> <span class="n">k1</span><span class="o">.</span><span class="n">is_and_identity</span><span class="p">(</span><span class="n">k2</span><span class="p">):</span> + 
<span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># &gt;&gt;&gt;&gt; end changes</span> + <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> + +<p>And with that, the new tests pass as well. A real implementation would also +check the other argument order, but we leave that out for the sake of brevity.</p> +<p>This rewrite also generalizes the <a href="https://pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html">rewrites</a> <code>int_and(0, x) -&gt; 0</code> and +<code>int_and(-1, x) -&gt; x</code>, let's add a test for those:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_remove_and_simple</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> <span class="c1"># == 0</span> + <span class="n">var3</span> <span 
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> <span class="c1"># == -1</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> <span class="c1"># == var1</span> + <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var4</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = getarg(1)</span> +<span class="s2">optvar2 = dummy(optvar1)"""</span> +</pre></div> + +<p>This test just passes. 
And that's it for this post!</p> +<h3 id="conclusion">Conclusion</h3> +<p>In this post we've seen the implementation, testing and proofs about a 'known +bits' abstract domain, as well as its use in the toy optimizer to generalize +constant folding, and to implement conditional peephole rewrites.</p> +<p>In the next posts I'll write about the real implementation of a knownbits +domain in PyPy's JIT, its combination with the existing interval abstract +domain, how to deal with gaining information from conditions in the program, +and some loose ends.</p> +<p>Sources:</p> +<ul> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/Support/KnownBits.cpp">Known bits in LLVM</a></li> +<li><a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">Tristate numbers for known bits in Linux eBPF</a></li> +<li><a href="https://arxiv.org/abs/2105.05398">Sound, Precise, and Fast Abstract Interpretation with Tristate Numbers</a></li> +<li><a href="https://people.cs.rutgers.edu/~sn349/papers/agni-cav2023.pdf">Verifying the Verifier: eBPF Range Analysis Verification</a></li> +<li><a href="https://dougallj.wordpress.com/2020/01/13/bit-twiddling-addition-with-unknown-bits/">Bit-Twiddling: Addition with Unknown + Bits</a> + is a super readable blog post by Dougall J. I've taken the <code>ones</code> and + <code>unknowns</code> naming from this post, which I find significantly clearer than + <code>value</code> and <code>mask</code>, which the Linux kernel uses.</li> +<li><a href="https://bitmath.blogspot.com/">Bits, Math and Performance(?)</a>, a fantastic + blog by <a href="https://mastodon.gamedev.place/@harold">Harold Aptroot</a>. There are a + lot of relevant posts about known bits, range analysis etc. 
Harold is also + the author of <a href="http://haroldbot.nl/">Haroldbot</a>, a website that can be used + for bitvector calculations, and also checks bitvector identities.</li> +<li><a href="https://cea.hal.science/cea-01795779/document">Sharpening Constraint Programming approaches for Bit-Vector Theory</a></li> +<li><a href="https://users.cs.utah.edu/~regehr/papers/lctes06_2/fp019-regehr.pdf">Deriving Abstract Transfer Functions for Analyzing Embedded Software</a></li> +<li><a href="https://arxiv.org/abs/2105.00493">Synthesizing Abstract Transformers</a></li> +</ul> +<div class="footnote"> +<hr> +<ol> +<li id="fn:proof_bitwidths"> +<p>There's a subtlety about the Z3 proofs that I'm sort of +glossing over here. Python integers are of arbitrary width, and the +<code>KnownBits</code> code is actually carefully written to work for integers of any +size. This property is tested by the Hypothesis tests, which don't limit +the sizes of the generated random integers. However, the Z3 proofs only +check bitvectors of a fixed bitwidth of 64. There are various ways to deal +with this situation. For most "real" compilers, the bitwidth of integers +would be fixed anyway. Then the components <code>ones</code> and <code>unknowns</code> of the +<code>KnownBits</code> class would use the number of bits the corresponding integer +variable has, and the Z3 proofs would use the same width. This is what we +do in the PyPy JIT. <a class="footnote-backref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fnref:proof_bitwidths" title="Jump back to footnote 1 in the text">↩</a></p> +</li> +<li id="fn:tests_vs_proofs"> +<p>The less close connection between implementation and proof +for <code>abstract_eq</code> is one of the reasons why it makes sense to do +unit-testing <em>in addition</em> to proofs. 
For a more detailed explanation of +why both tests and proofs are good to +have, see <a href="https://siek.blogspot.com/2024/06/data-structures-and-algorithms-correctly.html#correct-software-via-write-test-and-prove:~:text=We%20recognize%20that%20once%20step,detect%20most%20of%20the%20bugs">Jeremy Siek's blog +post</a>, +as well as the <a href="https://www-cs-faculty.stanford.edu/~knuth/faq.html#:~:text=What's%20the%20exact%20citation%20of%20your%20oft%2Dcited%20comment%20about%20bugs?">Knuth +quote</a>. <a class="footnote-backref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fnref:tests_vs_proofs" title="Jump back to footnote 2 in the text">↩</a></p> +</li> +</ol> +</div>toy-optimizerz3https://www.pypy.org/posts/2024/08/toy-knownbits.htmlSat, 03 Aug 2024 14:00:00 GMTMining JIT traces for missing optimizations with Z3https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.htmlCF Bolz-Tereick<p>In my last post I've described <a href="https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html">how to use Z3 to find simple local peephole +optimization patterns</a> +for the integer operations in PyPy's JIT. An example is <code>int_and(x, 0) -&gt; +0</code>. In this post I want to scale up the problem of identifying possible +optimizations to much bigger instruction sequences, also using Z3. For that, I +am starting with the JIT traces of <strong>real benchmarks</strong>, after they have been +optimized by the optimizer of PyPy's JIT. Then we can ask Z3 to find +inefficient integer operations in those traces.</p> +<p>Starting from the optimized traces of real programs has some big +advantages over the "classical" superoptimization approach of generating and +then trying all possible sequences of instructions. It avoids the +combinatorial explosion that happens with the latter approach. 
Also, starting +from the traces of benchmarks or (even better) actual programs makes sure that +we actually care about the missing optimizations +that are found in this way. And because the traces are analyzed after they have +been optimized by PyPy's optimizer, we only get reports for <em>missing</em> +optimizations, that the JIT isn't able to do (yet).</p> +<p>The techniques and experiments I describe in this post are again the result of +a bunch of discussions with John Regehr at a conference a few weeks ago, as +well as reading his blog posts and papers. Thanks John! Also thanks to <a href="https://bernsteinbear.com/">Max +Bernstein</a> for super helpful feedback on the drafts +of this blog post (and for poking me to write things in general).</p> +<h3 id="high-level-approach">High-Level Approach</h3> +<p>The approach that I took works as follows:</p> +<ul> +<li>Run benchmarks or other interesting programs and then dump the IR of the JIT + traces into a file. The traces have at that point been already optimized by + the PyPy JIT's optimizer.</li> +<li>For every trace, ignore all the operations on non-integer variables.</li> +<li>Translate every integer operation into a Z3 formula.</li> +<li>For every operation, use Z3 to find out whether the operation is redundant + (how that is done is described below).</li> +<li>If the operation is redundant, the trace is less efficient than it could have + been, because the optimizer could also have removed the operation. Report the + inefficiency.</li> +<li>Minimize the inefficient programs by removing as many operations as possible + to make the problem easier to understand.</li> +</ul> +<p>In the post I will describe the details and show some pseudocode of the +approach. 
I'll also make the proper code public eventually (but it needs a +healthy dose of cleanups first).</p> +<h3 id="dumping-pypy-traces">Dumping PyPy Traces</h3> +<p>PyPy will write its JIT traces into the file <code>out</code> if the environment variable +<a href="https://doc.pypy.org/en/latest/man/pypy.1.html"><code>PYPYLOG</code></a> is set as follows:</p> +<div class="code"><pre class="code literal-block">PYPYLOG=jit-log-opt:out pypy &lt;program.py&gt; +</pre></div> + +<p>This environment variable works for PyPy, but also for other virtual machines +built with RPython.</p> +<p>(This is really a side point for the rest of the blog post, but since the +question came up I wanted to clarify it: Operations on integers in the Python +program that the JIT is running don't all correspond 1-to-1 with the <code>int_...</code> +operations in the traces. The <code>int_...</code> trace operations always operate on +machine words. The Python <code>int</code> type supports arbitrarily large integers. PyPy +will optimistically try to lower the operations on Python integers into machine +word operations, but adds the necessary guards into the trace to make sure that +overflow outside of the range of machine words is caught. In case one of these +guards fails the interpreter switches to a big integer heap-allocated +representation.)</p> +<h3 id="encoding-traces-as-z3-formulas">Encoding Traces as Z3 formulas</h3> +<p>The last blog post already contained the code to encode the results of +individual trace operations into Z3 formulas, so we don't need to repeat that +here. 
To encode traces of operations we introduce a Z3 variable for every +operation in the trace and then call the <code>z3_expression</code> function for every +single one of the operations in the trace.</p> +<p>For example, for the following trace:</p> +<div class="code"><pre class="code literal-block"><span class="k">[i1]</span> +<span class="na">i2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">uint_rshift(i1, 32)</span> +<span class="na">i3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_and(i2, 65535)</span> +<span class="na">i4</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">uint_rshift(i1, 48)</span> +<span class="na">i5</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_lshift(i4, 16)</span> +<span class="na">i6</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_or(i5, i3)</span> +<span class="na">jump(i6, i2) # equal</span> +</pre></div> + +<p>We would get the Z3 formula:</p> +<div class="code"><pre class="code literal-block">z3.And(i2 == LShR(i1, 32), + i3 == i2 &amp; 65535, + i4 == LShR(i1, 48), + i5 == i4 &lt;&lt; 16, + i6 == i5 | i3) +</pre></div> + +<p>Usually we won't ask for the formula of the whole trace at once. Instead we go +through the trace operation by operation and try to find inefficiencies in the +current one we are looking at. 
Roughly like this (pseudo-)code:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">newz3var</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">var_to_z3var</span> <span class="o">=</span> <span class="p">{}</span> + <span class="k">for</span> <span class="n">input_argument</span> <span class="ow">in</span> <span class="n">trace</span><span class="o">.</span><span class="n">inputargs</span><span class="p">:</span> + <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">input_argument</span><span class="p">]</span> <span class="o">=</span> <span class="n">newz3var</span><span class="p">(</span><span class="n">input_argument</span><span class="p">)</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">z3resultvar</span> <span class="o">=</span> <span class="n">newz3var</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">resultvarname</span><span class="p">)</span> + <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span 
class="mi">0</span><span class="p">]</span> + <span class="n">z3arg0</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">arg0</span><span class="p">]</span> + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span> + <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> + <span class="n">z3arg1</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">arg1</span><span class="p">]</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">z3arg1</span> <span class="o">=</span> <span class="kc">None</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># checking for inefficiencies, see the next sections</span> + <span class="o">...</span> + <span class="k">if</span> <span class="o">...</span><span class="p">:</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span> + + <span class="c1"># not inefficient, assert op into the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span 
class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="identifying-constant-booleans-with-z3">Identifying constant booleans with Z3</h3> +<p>To get started finding inefficiencies in a trace, we can +first focus on boolean variables. For every operation in the trace that +returns a bool we can ask Z3 to prove that this variable must be always True or +always False. Most of the time, neither of these proofs will succeed. But if Z3 +manages to prove one of them, we know we have found an inefficiency: instead of +computing the boolean result (e.g. by executing a comparison) the JIT's optimizer +could have replaced the operation with the corresponding boolean constant.</p> +<p>Here's an example of an inefficiency found that way: if <code>x &lt; y</code> and <code>y &lt; z</code> are +both true, PyPy's JIT could conclude that <code>x &lt; z</code> must also +be true. However, currently the JIT cannot make that conclusion because it +only reasons about the concrete ranges (lower and upper bounds) for every +integer variable, but it has no way to remember anything about relationships +between different variables. This kind of reasoning would quite often be useful +to remove list/string bounds checks. Here's a <a href="https://www.youtube.com/watch?app=desktop&amp;v=1hm5ZVmBEvo">talk about how LLVM does +this</a> (but it might be +too heavyweight for a JIT setting).</p> +<p>Here are some more examples found that way:</p> +<ul> +<li><code>x - 1 == x</code> is always False</li> +<li><code>x - (x == -1) == -1</code> is always False. 
The pattern <code>x - (x == -1)</code> happens a + lot in PyPy's hash computations: To be compatible with the CPython hashes we + need to make sure that no object's hash is -1 (CPython uses -1 as an error + value on the C level).</li> +</ul> +<p>Here's pseudo-code for how to implement checking boolean operations for +inefficiencies:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">has_boolean_result</span><span class="p">():</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">res</span> <span class="o">==</span> <span class="mi">0</span><span class="p">):</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">res</span> <span class="o">==</span> <span class="mi">1</span><span class="p">):</span> + <span class="k">return</span> 
<span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="mi">1</span> + <span class="c1"># checking for other inefficiencies, see the next sections</span> + <span class="o">...</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="identifying-redundant-operations">Identifying redundant operations</h3> +<p>A more interesting class of redundancy is to try to find two operations in a +trace that compute the same result. We can do that by asking Z3 to prove for +each pair of different operations in the trace that the result is +always the same. If a previous operation returns the same result, the JIT could +have re-used that result instead of re-computing it, saving time. Doing this +search for equivalent operations with Z3 is quadratic in the number of +operations, but since traces have a maximum length it is not too bad in +practice.</p> +<p>This is the real workhorse of my script so far, it's what finds most of the +inefficiencies. Here's a few examples:</p> +<ul> +<li>The very first and super useful example the script found is <code>int_eq(b, 1) == + b</code> if <code>b</code> is known to be a boolean (i.e. an integer 0 or 1). I have already + implemented this optimization in the JIT.</li> +<li>Similarly, <code>int_and(b, 1) == b</code> for booleans.</li> +<li><code>(x &lt;&lt; 4) &amp; -0xf == x &lt;&lt; 4</code></li> +<li><code>(((x &gt;&gt; 63) &lt;&lt; 1) &lt;&lt; 2) &gt;&gt; 3 == x &gt;&gt; 63</code>. 
In general the JIT is quite bad at + optimizing repeated shifts (the infrastructure for doing better with that is + already in place, so this will be a relatively easy fix).</li> +<li><code>(x &amp; 0xffffffff) | ((x &gt;&gt; 32) &lt;&lt; 32) == x</code>. Having the JIT optimize this + would maybe require first recognizing that <code>(x &gt;&gt; 32) &lt;&lt; 32</code> can be expressed + as a mask: <code>(x &amp; 0xffffffff00000000)</code>, and then using <code>(x &amp; c1) | (x &amp; c2) == + x &amp; (c1 | c2)</code></li> +<li>A commonly occurring pattern is variations of this one: + <code>((x &amp; 1345) ^ 2048) - 2048 == x &amp; 1345</code> (with different constants, of + course). xor is add without carry, and <code>x &amp; 1345</code> does not have the bit + <code>2048</code> set. Therefore the <code>^ 2048</code> is equivalent to <code>+ 2048</code>, which the <code>- + 2048</code> cancels. More generally, if <code>a &amp; b == 0</code>, then <code>a + b == a | b == a ^ b</code>. + I don't understand at all why this appears so often in the traces, but I + see variations of it a lot. 
LLVM can optimize this, but <a href="https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115829">GCC + can't</a>, thanks to + <a href="https://hachyderm.io/@pinskia/112752641328799157">Andrew Pinski for filing the + bug</a>!</li> +</ul> +<p>And here's some implementation pseudo-code again:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="o">...</span> + <span class="c1"># searching for redundant operations</span> + <span class="k">for</span> <span class="n">previous_op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="k">if</span> <span class="n">previous_op</span> <span class="ow">is</span> <span class="n">op</span><span class="p">:</span> + <span class="k">break</span> <span class="c1"># done, reached the current op</span> + <span class="n">previous_op_z3var</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">previous_op</span><span class="p">]</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">previous_op_z3var</span> <span class="o">==</span> <span 
class="n">res</span><span class="p">):</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">previous_op</span> + <span class="o">...</span> + <span class="c1"># more code here later</span> + <span class="o">...</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="synthesizing-more-complicated-constants-with-exists-forall">Synthesizing more complicated constants with exists-forall</h3> +<p>To find out whether some integer operations always return a constant result, we +can't simply use the same trick as for those operations that return boolean +results, because enumerating 2⁶⁴ possible constants and checking them all +would take too long. Like in the last post, we can use <code>z3.ForAll</code> to find out +whether Z3 can synthesize a constant for the result of an operation for us. +If such a constant exists, the JIT could have removed the operation, +and replaced it with the constant that Z3 provides.</p> +<p>Here are a few examples of inefficiencies found this way:</p> +<ul> +<li><code>(x ^ 1) ^ x == 1</code> (or, more generally: <code>(x ^ y) ^ x == y</code>)</li> +<li>if <code>x | y == 0</code>, it follows that <code>x == 0</code> and <code>y == 0</code></li> +<li>if <code>x != MAXINT</code>, then <code>x + 1 &gt; x</code></li> +</ul> +<p>Implementing this is actually slightly annoying. The <code>solver.add</code> calls for +non-inefficient ops add assertions to the solver, which are now confusing the +<code>z3.ForAll</code> query. 
We could remove all assertions from the solver, then do the +<code>ForAll</code> query, then add the assertions back. What I ended up doing instead was +instantiating a second solver object that I'm using for the <code>ForAll</code> queries, +that remains empty the whole time.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">empty_solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">var_to_z3var</span> <span class="o">=</span> <span class="p">{}</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="o">...</span> + <span class="c1"># searching for redundant operations</span> + <span class="o">...</span> + <span class="c1"># checking for constant results</span> + <span class="n">constvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'find_const'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span 
class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="n">var_to_z3var</span><span class="o">.</span><span class="n">values</span><span class="p">(),</span> + <span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span> + <span class="o">*</span><span class="n">solver</span><span class="o">.</span><span class="n">assertions</span><span class="p">(),</span> + <span class="n">expr</span> <span class="o">==</span> <span class="n">constvar</span> + <span class="p">)</span> + <span class="p">)</span> + <span class="k">if</span> <span class="n">empty_solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">empty_solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">const</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> 
+</pre></div> + +<h3 id="minimization">Minimization</h3> +<p>Analyzing an inefficiency by hand in the context of a larger trace is quite +tedious. Therefore I've implemented a (super inefficient) script to try to make +the examples smaller. Here's how that works:</p> +<ul> +<li>First throw out all the operations that occur <em>after</em> the inefficient operation + in the trace.</li> +<li>Then we remove all "dead" operations, ie operations that don't have their + results used (all the operations that we can analyze with Z3 are without side + effects).</li> +<li>Now we try to remove every guard in the trace one by one and check + afterwards, whether the resulting trace still has an inefficiency.</li> +<li>We also try to replace every single operation with a new argument to the + trace, to see whether the inefficiency is still present.</li> +</ul> +<p>The minimization process is sort of inefficient and I should probably be using + <a href="https://github.com/DRMacIver/shrinkray">shrinkray</a> or + <a href="https://github.com/csmith-project/creduce">C-Reduce</a> instead. However, it + seems to work well in practice and the runtime isn't too bad.</p> +<h3 id="results">Results</h3> +<p>So far I am using the JIT traces of three programs: 1) Booting Linux on the +<a href="https://docs.pydrofoil.org">Pydrofoil</a> RISC-V emulator, 2) booting Linux on the Pydrofoil ARM emulator, and 3) +running the PyPy bootstrap process on top of PyPy.</p> +<p>I picked these programs because most Python programs don't contain interesting +amounts of integer operations, and the traces of the emulators +contain a lot of them. I also used the bootstrap process because I still wanted +to try a big Python program and personally care about the runtime of this +program a lot.</p> +<p>The script identifies 94 +inefficiencies in the traces, a lot of them come from repeating +patterns. 
My next steps will be to manually inspect them all, categorize them, and +implement easy optimizations identified that way. I also want a way to sort the +examples by execution count in the benchmarks, to get a feeling for which of +them are most important.</p> +<p>I didn't investigate the full set of <a href="https://speed.pypy.org">Python +benchmarks</a> that PyPy uses yet, because I don't expect +them to contain interesting amounts of integer operations, but maybe I am wrong +about that? Will have to try eventually.</p> +<h3 id="conclusion">Conclusion</h3> +<p>This was again much easier to do than I would have expected! Given that I had +the translation of trace ops to Z3 already in place, it was a matter of about a +day of programming to use this infrastructure to find the first problems and +minimize them.</p> +<p>Reusing the results of existing operations or replacing operations by constants +can be seen as "zero-instruction superoptimization". I'll probably be rather +busy for a while to add the missing optimizations identified by my simple +script. 
But later extensions to actually synthesize one or several operations +in the attempt to optimize the traces more and find more opportunities should +be possible.</p> +<p>Finding inefficiencies in traces with Z3 is significantly less +annoying and also less error-prone than just manually inspecting traces and +trying to spot optimization opportunities.</p> +<h3 id="random-notes-and-sources">Random Notes and Sources</h3> +<p>Again, John's blog posts:</p> +<ul> +<li><a href="https://blog.regehr.org/archives/1109">Let’s Work on an LLVM Superoptimizer</a></li> +<li><a href="https://blog.regehr.org/archives/1146">Early Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1252">A Few Synthesizing Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1636">Synthesizing Constants</a></li> +</ul> +<p>and papers:</p> +<ul> +<li><a href="https://arxiv.org/pdf/1711.04422">A Synthesizing Superoptimizer</a></li> +<li><a href="https://dl.acm.org/doi/pdf/10.1145/3649837">Hydra: Generalizing Peephole Optimizations with Program Synthesis</a></li> +</ul> +<p>I remembered recently that I had seen the approach of optimizing the traces of +a tracing JIT with Z3 a long time ago, as part of the (now long dead, I think) +<a href="https://web.archive.org/web/20160304055149/http://research.microsoft.com/en-us/projects/spur/">SPUR +project</a>. +There's a <a href="https://web.archive.org/web/20161029162737/http://csl.stanford.edu/~christos/pldi2010.fit/tillmann.provers4jit.pdf">workshop +paper</a> +from 2010 about this. SPUR was trying to use Z3 built into the actual JIT (as +opposed to using Z3 only to find places where the regular optimizers could be +improved). In addition to bitvectors, SPUR also used the Z3 support for arrays +to model the C# heap and remove redundant stores. 
This is still another future +extension for all the Z3 work I've been doing in the context of the PyPy JIT.</p>jitz3https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.htmlFri, 19 Jul 2024 17:01:09 GMTFinding Simple Rewrite Rules for the JIT with Z3https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.htmlCF Bolz-Tereick<p>In June I was at the <a href="https://pldi24.sigplan.org/">PLDI conference</a> in +Copenhagen to present a <a href="https://dl.acm.org/doi/10.1145/3652588.3663316">paper</a> +I co-authored with <a href="https://bernsteinbear.com/">Max Bernstein</a>. I also finally +met <a href="https://blog.regehr.org/">John Regehr</a>, who I'd been talking to on social +media for ages but had never met. John has been working on compiler correctness +and better techniques for building compilers and optimizers for a very long +time. The blog post <a href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html">Finding JIT Optimizer Bugs using SMT Solvers and +Fuzzing</a> +was heavily inspired by this work. We talked a lot about his and his group's +work on using Z3 for +<a href="https://en.wikipedia.org/wiki/Superoptimization">superoptimization</a> and for +finding missing optimizations. I have applied some of the things John told me +about to the traces of PyPy's JIT, and wanted to blog about that. However, my +draft felt quite hard to understand. Therefore I have now written this current +post, to at least try to provide a somewhat gentler on-ramp to the topic.</p> +<p>In <em>this</em> post we will use the Python-API to Z3 to find local peephole rewrite +rules for the operations in the intermediate representation of PyPy's tracing +JIT. The code for this is simple enough that we can go through all of it.</p> +<p>The PyPy JIT produces traces of machine level instructions, which are optimized +and then turned into machine code. The optimizer uses a number of approaches to +make the traces more efficient. 
For integer operations it applies a number of +arithmetic simplification rules, for example <code>int_add(x, 0) -&gt; x</code>. When +implementing these rules in the JIT there are <strong>two problems</strong>: How do we know +that the rules are correct? And how do we know that we haven't forgotten any +rules? We'll try to answer both of these, but the first one in particular.</p> +<p>We'll be using Z3, a satisfiability modulo theories (SMT) solver which has good +bitvector support and most importantly an excellent Python API. We can use the +solver to reason about bitvectors, which are how we will model machine +integers.</p> +<p>To find rewrite rules, we will consider the binary operations (i.e. those +taking two arguments) in PyPy traces that take and produce integers. The +completely general form <code>op(x, y)</code> is not simplifiable on its own. But if +either <code>x == y</code> +or if one of the arguments is a constant, we can potentially simplify the +operation into a simpler form. The results are either the variable <code>x</code>, or a +(potentially different) constant. We'll ignore constant-folding where both +arguments of the binary operation are constants. The possible results for a +simplifiable binary operation are the variable <code>x</code> or another constant. 
This +leaves the following patterns as possibilities:</p> +<ul> +<li><code>op(x, x) == x</code></li> +<li><code>op(x, x) == c1</code></li> +<li><code>op(x, c1) == x</code></li> +<li><code>op(c1, x) == x</code></li> +<li><code>op(x, c1) == c2</code></li> +<li><code>op(c1, x) == c2</code></li> +</ul> +<p>Our approach will be to take every single supported binary integer operation, +instantiate all of these patterns, and try to ask Z3 whether the resulting +simplification is valid for all values of <code>x</code>.</p> +<h3 id="quick-intro-to-the-z3-python-api">Quick intro to the Z3 Python-API</h3> +<p>Here's a terminal session showing the use of the Z3 Python API:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; import z3</span> +<span class="go">&gt;&gt;&gt;&gt; # construct a Z3 bitvector variable of width 8, with name x:</span> +<span class="go">&gt;&gt;&gt;&gt; x = z3.BitVec('x', 8)</span> +<span class="go">&gt;&gt;&gt;&gt; # construct a more complicated formula by using operator overloading:</span> +<span class="go">&gt;&gt;&gt;&gt; x + x</span> +<span class="go">x + x</span> +<span class="go">&gt;&gt;&gt;&gt; x + 1</span> +<span class="go">x + 1</span> +</pre></div> + +<p>Z3 checks the "satisfiability" of a formula. This means that it tries to find +an example set of concrete values for the variables that occur in a formula, +such that the formula becomes true. 
Examples:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver = z3.Solver()</span> +<span class="go">&gt;&gt;&gt;&gt; solver.check(x * x == 3)</span> +<span class="go">unsat</span> +<span class="go">&gt;&gt;&gt;&gt; # meaning no x fulfils this property</span> +<span class="go">&gt;&gt;&gt;&gt;</span> +<span class="go">&gt;&gt;&gt;&gt; solver.check(x * x == 9)</span> +<span class="go">sat</span> +<span class="go">&gt;&gt;&gt;&gt; model = solver.model()</span> +<span class="go">&gt;&gt;&gt;&gt; model</span> +<span class="go">[x = 253]</span> +<span class="go">&gt;&gt;&gt;&gt; model[x].as_signed_long()</span> +<span class="go">-3</span> +<span class="go">&gt;&gt;&gt;&gt; # 253 is the same as -3 in two's complement arithmetic with 8 bits</span> +</pre></div> + +<p>In order to use Z3 to prove something, we can ask Z3 to find counterexamples +for the statement, meaning concrete values that would make the negation of the +statement true:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.check(z3.Not(x ^ -1 == ~x))</span> +<span class="go">unsat</span> +</pre></div> + +<p>The result <code>unsat</code> means that we just proved that <code>x ^ -1 == ~x</code> is true for +all <code>x</code>, because there is no value for <code>x</code> that makes <code>not (x ^ -1 == ~x)</code> +true (this works because -1 has all the bits set).</p> +<p>If we try to prove something incorrect in this way, the following happens:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.check(z3.Not(x ^ -1 == x))</span> +<span class="go">sat</span> +</pre></div> + +<p><code>sat</code> shows that <code>x ^ -1 == x</code> is (unsurprisingly) not always true, and we can +ask for a counterexample:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.model()</span> +<span class="go">[x = 0]</span> +</pre></div> + +<p>This way of 
proving this works because the <code>check</code> calls try to solve an +(implicit) "exists" quantifier, over all the Z3 variables used in the formula. +<code>check</code> will either return <code>z3.unsat</code>, which means that no concrete values make +the formula true; or <code>z3.sat</code>, which means that you can get some concrete +values that make the formula true by calling <code>solver.model()</code>.</p> +<p>In math terms we prove things using <code>check</code> by de-Morgan's rules for quantifiers:</p> +<p>$$ \lnot \exists x: \lnot f(x) \implies \forall x: f(x) $$</p> +<p>Now that we've seen the basics of using the Z3 API on a few small examples, +we'll use it in a bigger program.</p> +<h3 id="encoding-the-integer-operations-of-rpythons-jit-into-z3-formulas">Encoding the integer operations of RPython's JIT into Z3 formulas</h3> +<p>Now we'll use the API to reason about the integer operations of the PyPy JIT +intermediate representation (IR). The binary integer operations are:</p> +<div class="code"><pre class="code literal-block"><span class="n">opnames2</span> <span class="o">=</span> <span class="p">[</span> +<span class="s2">"int_add"</span><span class="p">,</span> +<span class="s2">"int_sub"</span><span class="p">,</span> +<span class="s2">"int_mul"</span><span class="p">,</span> +<span class="s2">"int_and"</span><span class="p">,</span> +<span class="s2">"int_or"</span><span class="p">,</span> +<span class="s2">"int_xor"</span><span class="p">,</span> +<span class="s2">"int_eq"</span><span class="p">,</span> +<span class="s2">"int_ne"</span><span class="p">,</span> +<span class="s2">"int_lt"</span><span class="p">,</span> +<span class="s2">"int_le"</span><span class="p">,</span> +<span class="s2">"int_gt"</span><span class="p">,</span> +<span class="s2">"int_ge"</span><span class="p">,</span> +<span class="s2">"uint_lt"</span><span class="p">,</span> +<span class="s2">"uint_le"</span><span class="p">,</span> +<span class="s2">"uint_gt"</span><span 
class="p">,</span> +<span class="s2">"uint_ge"</span><span class="p">,</span> +<span class="s2">"int_lshift"</span><span class="p">,</span> +<span class="s2">"int_rshift"</span><span class="p">,</span> +<span class="s2">"uint_rshift"</span><span class="p">,</span> +<span class="s2">"uint_mul_high"</span><span class="p">,</span> +<span class="s2">"int_pydiv"</span><span class="p">,</span> +<span class="s2">"int_pymod"</span><span class="p">,</span> +<span class="p">]</span> +</pre></div> + +<p>There's not much special about the integer operations. Like in LLVM, most of +them are signedness-independent: <code>int_add</code>, <code>int_sub</code>, <code>int_mul</code>, ... work +correctly for unsigned integers but also for +<a href="https://en.wikipedia.org/wiki/Two%27s_complement">two's-complement</a> signed +integers. Exceptions for that are order comparisons like <code>int_lt</code> etc. for +which we have unsigned variants <code>uint_lt</code> etc. All operations that produce a +boolean result return a full-width integer <code>0</code> or <code>1</code> (the PyPy JIT supports +only word-sized integers in its intermediate representation)</p> +<p>In order to reason about the IR operations, some ground work:</p> +<div class="code"><pre class="code literal-block"><span class="kn">import</span> <span class="nn">z3</span> + +<span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> +<span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> +<span class="n">solver</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">,</span> <span class="mi">10000</span><span class="p">)</span> <span class="c1"># milliseconds, ie 10s</span> +<span class="n">xvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span 
class="p">(</span><span class="s1">'x'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">constvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'const'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">constvar2</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'const2'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">TRUEBV</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">FALSEBV</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +</pre></div> + +<p>And here's the a function to turn an integer IR operation of PyPy's JIT into Z3 +formulas:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" computes a tuple of (result, valid_if) of Z3 formulas. `result` is the</span> +<span class="sd"> formula representing the result of the operation, given argument formulas</span> +<span class="sd"> arg0 and arg1. 
`valid_if` is a pre-condition that must be true for the</span> +<span class="sd"> result to be meaningful. """</span> + <span class="n">result</span> <span class="o">=</span> <span class="kc">None</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="kc">True</span> <span class="c1"># the precondition is mostly True, with few exceptions</span> + <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&amp;</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_or"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">|</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_xor"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span 
class="n">arg0</span> <span class="o">^</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_eq"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ne"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_le"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_gt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ge"</span><span 
class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_lt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_le"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_gt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_ge"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGE</span><span class="p">(</span><span class="n">arg0</span><span 
class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&lt;&lt;</span> <span class="n">arg1</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_rshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&lt;&lt;</span> <span class="n">arg1</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_rshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">LShR</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span 
class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_mul_high"</span><span class="p">:</span> + <span class="c1"># zero-extend args to 2*INTEGER_WIDTH bit, then multiply and extract</span> + <span class="c1"># highest INTEGER_WIDTH bits</span> + <span class="n">zarg0</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ZeroExt</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> + <span class="n">zarg1</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ZeroExt</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Extract</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">*</span> <span class="mi">2</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">zarg0</span> <span class="o">*</span> <span class="n">zarg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_pydiv"</span><span class="p">:</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">arg1</span> <span class="o">!=</span> <span class="mi">0</span> + <span class="n">r</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">/</span> <span class="n">arg1</span> + <span 
class="n">psubx</span> <span class="o">=</span> <span class="n">r</span> <span class="o">*</span> <span class="n">arg1</span> <span class="o">-</span> <span class="n">arg0</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">r</span> <span class="o">+</span> <span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="n">psubx</span><span class="p">,</span> <span class="o">-</span><span class="n">psubx</span><span class="p">)</span> <span class="o">&gt;&gt;</span> <span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_pymod"</span><span class="p">:</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">arg1</span> <span class="o">!=</span> <span class="mi">0</span> + <span class="n">r</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">%</span> <span class="n">arg1</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">r</span> <span class="o">+</span> <span class="p">(</span><span class="n">arg1</span> <span class="o">&amp;</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="n">r</span><span class="p">,</span> <span class="n">r</span><span class="p">)</span> <span class="o">&gt;&gt;</span> <span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> 
<span class="s2">"int_is_true"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">FALSEBV</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_zero"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">FALSEBV</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_neg"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="o">-</span><span class="n">arg0</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_invert"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="o">~</span><span class="n">arg0</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">assert</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"unknown operation "</span> <span class="o">+</span> <span class="n">opname</span> + <span class="k">return</span> <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> + +<span class="k">def</span> <span class="nf">cond</span><span class="p">(</span><span class="n">z3expr</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" helper function to turn a Z3 boolean result z3expr into a 1 or 0</span> +<span class="sd"> bitvector, using z3.If """</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> 
<span class="n">TRUEBV</span><span class="p">,</span> <span class="n">FALSEBV</span><span class="p">)</span> +</pre></div> + +<p>We map the semantics of a PyPy JIT operation to Z3 with the <code>z3_expression</code> +function. It turns the name of a JIT operation and its two (or one) arguments +into a pair of Z3 formulas, <code>result</code> and <code>valid_if</code>. The resulting formulas are +constructed with the operator overloading of Z3 variables/formulas.</p> +<p>The first element <code>result</code> of the result of <code>z3_expression</code> represents the result +of performing the operation. <code>valid_if</code> is a bool that represents a condition that +needs to be <code>True</code> in order for the result of the operation to be defined. E.g. +<code>int_pydiv(a, b)</code> is only valid if <code>b != 0</code>. Most operations are always valid, +so they return <code>True</code> as that condition (we'll ignore <code>valid_if</code> for a bit, but it +will become more relevant further down in the post).</p> +<p>We can define a helper function to prove things by finding counterexamples:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Try to prove a condition cond by searching for counterexamples of its negation. 
"""</span> + <span class="n">z3res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> + <span class="k">if</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unsat</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">True</span> + <span class="k">elif</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unknown</span><span class="p">:</span> <span class="c1"># eg on timeout</span> + <span class="k">return</span> <span class="kc">False</span> + <span class="k">elif</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">False</span> + <span class="k">assert</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"should be unreachable"</span> +</pre></div> + +<h3 id="finding-rewrite-rules">Finding rewrite rules</h3> +<p>Now we can start finding our first rewrite rules, following the first pattern +<code>op(x, x) -&gt; x</code>. 
We do this by iterating over all the supported binary +operation names, getting the z3 expression for <code>op(x, x)</code> and then asking Z3 to +prove <code>op(x, x) == x</code>.</p> +<div class="code"><pre class="code literal-block"><span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; x, </span><span class="si">{</span><span class="n">result</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +</pre></div> + +<p>This yields the simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<h3 id="synthesizing-constants">Synthesizing constants</h3> +<p>Supporting 
the next patterns is harder: <code>op(x, x) == c1</code>, <code>op(x, c1) == x</code>, and +<code>op(c1, x) == x</code>. We don't know which constants to pick to try to get Z3 to +prove the equality. We could iterate over common constants like <code>0</code>, <code>1</code>, +<code>MAXINT</code>, etc, or even over all the 256 values for a bitvector of length 8. +However, we will instead ask Z3 to find the constants for us too.</p> +<p>This can be done by using quantifiers, in this case <code>z3.ForAll</code>. The query we +pose to Z3 is "does there exist a constant <code>c1</code> such that for all <code>x</code> the +following is true: <code>op(x, c1) == x</code>?" Note that the constant <code>c1</code> is not +necessarily unique, there could be many of them. We generate several matching +constants, and add that they must be different to the condition of the second +and further queries.</p> +<p>We can express this in a helper function:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_constant</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">number_of_results</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">xvar</span><span class="p">],</span> + <span class="n">z3expr</span> + <span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">number_of_results</span><span class="p">):</span> + <span class="n">checkres</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> + <span 
class="k">if</span> <span class="n">checkres</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="c1"># if a solver check succeeds, we can ask for a model, which is</span> + <span class="c1"># concrete values for the variables constvar</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">yield</span> <span class="n">const</span> + <span class="c1"># make sure we don't generate the same constant again on the</span> + <span class="c1"># next call</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">constvar</span> <span class="o">!=</span> <span class="n">const</span><span class="p">,</span> <span class="n">condition</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="c1"># no (more) constants found</span> + <span class="k">break</span> +</pre></div> + +<p>We can use this new function for the three mentioned patterns:</p> +<div class="code"><pre class="code literal-block"><span class="c1"># try to find constants for op(x, x) == c</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span 
class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">constvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +<span class="c1"># try to find constants for op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; x"</span><span class="p">)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span 
class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; x"</span><span class="p">)</span> +<span class="c1"># this code is not quite correct, we'll correct it later</span> +</pre></div> + +<p>Together this yields the following new simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="cp"># careful, these are not all correct!</span> +<span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ne</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> 
</span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span 
class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span 
class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<p>Most of these look good at first glance, but the last one reveals a problem: +we've been ignoring the <code>valid_if</code> expression up to now. 
We can stop doing that by +changing the code like this, which adds <code>z3.And(valid_if, ...)</code> to the argument of +the calls to <code>find_constant</code>:</p> +<div class="code"><pre class="code literal-block"><span class="c1"># try to find constants for op(x, x) == c, op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +<span class="c1"># try to find constants for op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span 
class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">valid_if</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; x"</span><span class="p">)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">valid_if</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; x"</span><span class="p">)</span> +</pre></div> + 
+<p>And we get this list instead:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ne</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> 
</span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span 
class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span 
class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<h3 id="synthesizing-two-constants">Synthesizing two constants</h3> +<p>For the patterns <code>op(x, c1) == c2</code> and <code>op(c1, x) == c2</code> we need to synthesize +two constants. 
We can again write a helper method for that:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_2consts</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">number_of_results</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">xvar</span><span class="p">],</span> + <span class="n">z3expr</span> + <span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">number_of_results</span><span class="p">):</span> + <span class="n">checkres</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> + <span class="k">if</span> <span class="n">checkres</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="n">const2</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar2</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">yield</span> <span class="n">const</span><span 
class="p">,</span> <span class="n">const2</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Or</span><span class="p">(</span><span class="n">constvar</span> <span class="o">!=</span> <span class="n">const</span><span class="p">,</span> <span class="n">constvar2</span> <span class="o">!=</span> <span class="n">const2</span><span class="p">),</span> <span class="n">condition</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> +</pre></div> + +<p>And then use it like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="c1"># try to find constants c1, c2 such that op(c1, x) -&gt; c2</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_2consts</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar2</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span 
class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; </span><span class="si">{</span><span class="n">const2</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="c1"># try to find constants c1, c2 such that op(x, c1) -&gt; c2</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_2consts</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar2</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2">(x, </span><span class="si">%s</span><span class="s2">) -&gt; </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span><span class="p">))</span> +</pre></div> + +<p>Which yields some straightforward simplifications:</p> +<div class="code"><pre class="code literal-block"><span 
class="n">int_mul</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_and</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +</pre></div> + +<p>A few require a bit more thinking:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_or</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span> +</pre></div> + +<p>These are true because in two's complement, <code>-1</code> has all bits set.</p> +<p>The following ones require recognizing that <code>-9223372036854775808 == -2**63</code> is +the most negative signed 64-bit integer, and <code>9223372036854775807 == 2 ** 63 - +1</code> is the most positive one:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_lt</span><span class="p">(</span><span
class="mi">9223372036854775807</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">9223372036854775807</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">9223372036854775807</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="mi">9223372036854775807</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> 
</span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +</pre></div> + +<p>The following ones are true because the bitpattern for <code>-1</code> is the largest +unsigned number:</p> +<div class="code"><pre class="code literal-block"><span class="n">uint_lt</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +</pre></div> + +<h3 id="strength-reductions">Strength Reductions</h3> +<p>All the patterns so far only had a variable or a constant on the target of the +rewrite. 
We can also use the machinery to do strength reductions where we +generate a single-argument operation <code>op1(x)</code> for input operations <code>op(x, c1)</code> +or <code>op(c1, x)</code>. To achieve this, we try all combinations of binary and unary +operations. (We won't consider strength reductions where a binary operation +gets turned into a "cheaper" other binary operation here.)</p> +<div class="code"><pre class="code literal-block"><span class="n">opnames1</span> <span class="o">=</span> <span class="p">[</span> +<span class="s2">"int_is_true"</span><span class="p">,</span> +<span class="s2">"int_is_zero"</span><span class="p">,</span> +<span class="s2">"int_neg"</span><span class="p">,</span> +<span class="s2">"int_invert"</span><span class="p">,</span> +<span class="p">]</span> + +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="k">for</span> <span class="n">opname1</span> <span class="ow">in</span> <span class="n">opnames1</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="c1"># try to find a constant op(x, c) == g(x)</span> + <span class="n">result1</span><span class="p">,</span> <span class="n">valid_if1</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname1</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span
class="n">valid_if</span><span class="p">,</span> <span class="n">valid_if1</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">result1</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; </span><span class="si">{</span><span class="n">opname1</span><span class="si">}</span><span class="s2">(x)"</span><span class="p">)</span> + + <span class="c1"># try to find a constant op(c, x) == g(x)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">result1</span><span class="p">,</span> <span class="n">valid_if1</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname1</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">valid_if1</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">result1</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span> 
<span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; </span><span class="si">{</span><span class="n">opname1</span><span class="si">}</span><span class="s2">(x)"</span><span class="p">)</span> +</pre></div> + +<p>Which yields the following new simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_sub</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_mul</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> 
</span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_xor</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_eq</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_ne</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> 
+<span class="n">int_ne</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_le</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span 
class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +</pre></div> + +<h3 id="conclusions">Conclusions</h3> +<p>With not very little code we managed to generate a whole lot of local +simplifications for integer operations in the IR of PyPy's JIT. The rules +discovered that way are "simple", in the sense that they only require looking +at a single instruction, and not where the arguments of that instruction came +from. 
They also don't require any knowledge about the properties of the +arguments of the instructions (e.g. that they are positive).</p> +<p>The rewrites in this post have mostly been in PyPy's JIT already. But now we +mechanically confirmed that they are correct. I've also added the remaining +useful-looking ones, in particular <code>int_eq(x, 0) -&gt; int_is_zero(x)</code> etc.</p> +<p>If we wanted to scale this approach up, we would have to work much harder! +There are a bunch of problems that come with generalizing the approach to +looking at sequences of instructions:</p> +<ul> +<li> +<p>Combinatorial explosion: if we look at sequences of instructions, we very + quickly get a combinatorial explosion and it becomes intractable to try all + combinations.</p> +</li> +<li> +<p>Finding non-minimal patterns: Some complicated simplifications can be + instances of simpler ones. For example, because <code>int_add(x, 0) -&gt; x</code>, it's + also true that <code>int_add(int_sub(x, y), 0) -&gt; int_sub(x, y)</code>. If we simply + generate all possible sequences, we will find the latter simplification rule, + which we would usually not care about.</p> +</li> +<li> +<p>Unclear usefulness: if we simply generate all rewrites up to a certain number + of instructions, we will get a lot of patterns that are useless in the sense + that they typically aren't found in realistic programs. It would be much + better to somehow focus on the patterns that real benchmarks are using.</p> +</li> +</ul> +<p>In the <a href="https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html">next blog post</a> I'll discuss an alternative approach to simply generating +all possible sequences of instructions, that tries to address these problems. 
+This works by analyzing the real traces of benchmarks and mining those for +inefficiencies, which only shows problems that occur in actual programs.</p> +<h3 id="sources">Sources</h3> +<p>I've been re-reading a lot of blog posts from John's blog:</p> +<ul> +<li><a href="https://blog.regehr.org/archives/1109">Let’s Work on an LLVM Superoptimizer</a></li> +<li><a href="https://blog.regehr.org/archives/1146">Early Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1252">A Few Synthesizing Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1636">Synthesizing Constants</a></li> +</ul> +<p>but also papers:</p> +<ul> +<li><a href="https://arxiv.org/pdf/1711.04422">A Synthesizing Superoptimizer</a></li> +<li><a href="https://dl.acm.org/doi/pdf/10.1145/3649837">Hydra: Generalizing Peephole Optimizations with Program Synthesis</a></li> +</ul> +<p>Another of my favorite blogs has been <a href="https://www.philipzucker.com/">Philipp Zucker's +blog</a> in the last year or two, lots of excellent +posts about/using Z3 on there.</p>jitz3https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.htmlFri, 12 Jul 2024 19:14:09 GMT \ No newline at end of file diff --git a/checksums.html b/checksums.html new file mode 100644 index 000000000..124aabbc7 --- /dev/null +++ b/checksums.html @@ -0,0 +1,1080 @@ + + + + + +Checksums | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Checksums

                + + +
                +

                Here are the checksums

                +

                pypy3.10-v7.3.17 sha256:

                +
                53b6e5907df869c49e4eae7aca09fba16d150741097efb245892c1477d2395f2  pypy3.10-v7.3.17-aarch64.tar.bz2
                +e534110e1047da37c1d586c392f74de3424f871d906a2083de6d41f2a8cc9164  pypy3.10-v7.3.17-linux32.tar.bz2
                +fdcdb9b24f1a7726003586503fdeb264fd68fc37fbfcea022dcfe825a7fee18b  pypy3.10-v7.3.17-linux64.tar.bz2
                +a050e25e8d686853dd5afc363e55625165825dacfb55f8753d8225ebe417cfd2  pypy3.10-v7.3.17-macos_arm64.tar.bz2
                +6c2c5f2300d7564e711421b4968abd63243cb96f76e363975dd648ebf4a362ee  pypy3.10-v7.3.17-macos_x86_64.tar.bz2
                +6ad74bc578e9c6d3a8a1c51503313058e3c58c35df86f7485453c4be6ab24bf7  pypy3.10-v7.3.17-src.tar.bz2
                +00857673af7d92144a5e134c723891953a1e99ac002eff440330de23a8147e85  pypy3.10-v7.3.17-src.zip
                +cab794a03ddda26238c72942ea6f225612e0dc17c76cac6652da83a95024e6e8  pypy3.10-v7.3.17-win64.zip
                +

                pypy2.7-v7.3.17 sha256:

                +
                a8df5ce1650f4756933f8780870c91a0a40e7c9870d74629bf241392bcb5c2e3  pypy2.7-v7.3.17-aarch64.tar.bz2
                +a3aa0867cc837a34941047ece0fbb6ca190410fae6ad35fae4999d03bf178750  pypy2.7-v7.3.17-linux32.tar.bz2
                +9f3497f87b3372d17e447369e0016a4bec99a6b4d2a59aba774a25bfe4353474  pypy2.7-v7.3.17-linux64.tar.bz2
                +8573172db377ee0831bf20492cdee9bac4e0b194e3dfe8bf7c44ee257a824766  pypy2.7-v7.3.17-macos_arm64.tar.bz2
                +e3e1af1d6ad15e51d8d19ea36e1ac65c4c792314cc8b8dc5cf771ec4353b50f8  pypy2.7-v7.3.17-macos_x86_64.tar.bz2
                +50e06840f4bbde91448080a4118068a89b8fbcae25ff8da1e2bb1402dc9a0346  pypy2.7-v7.3.17-src.tar.bz2
                +593cedd368a59bd5ed5dc8df00961a42a50c5d75d2614a96b1c75d25612dadf1  pypy2.7-v7.3.17-src.zip
                +2ce2f4c205819902ee3ea2e80f8fc9ae9b18647bcfc8046ba83fe46b4139f734  pypy2.7-v7.3.17-win64.zip
                +

                pypy3.10-v7.3.16 sha256:

                +
                fc720999bc5050e1d3706b3b6445e695cf42bfc71ebc7c88ed6bb88828b1d385  pypy3.10-v7.3.16-aarch64.tar.bz2
                +0df48aa780159e879ac89a805d143e4a6cd1b842f98046f5a3f865814bfaa2a4  pypy3.10-v7.3.16-linux32.tar.bz2
                +404e6180d6caf9258eaab0c02c72018e9aa8eb03ab9094a0ff17ee5e3b265ac1  pypy3.10-v7.3.16-linux64.tar.bz2
                +6c003376667a95c7a228544649677b9927b8210d6444b901817aad24b8719b93  pypy3.10-v7.3.16-macos_arm64.tar.bz2
                +490f2c6ba2489f405444f3b4ad42166da6e2eb73489a9535b206067eaaf21737  pypy3.10-v7.3.16-macos_x86_64.tar.bz2
                +af97efe498a209ba18c7bc7d084164a9907fb3736588b6864955177e19d5216a  pypy3.10-v7.3.16-s390x.tar.bz2
                +4a3a3177d0a1f51d59982bb981d1d485403bda3419d5437b9e077f55f59424ff  pypy3.10-v7.3.16-src.tar.bz2
                +8f59b6859d7d49036afce8156ea52f9c6a1e8d1e08af01bd6c70444d092841f5  pypy3.10-v7.3.16-src.zip
                +e08415a2f35c6ecf2342b504bdfde11e4c5eca3fc5ef7fd2214ff064a5a54396  pypy3.10-v7.3.16-win64.zip
                +

                pypy3.9-v7.3.16 sha256:

                +
                de3f2ed3581b30555ac0dd3e4df78a262ec736a36fb2e8f28259f8539b278ef4  pypy3.9-v7.3.16-aarch64.tar.bz2
                +583b6d6dd4e8c07cbc04da04a7ec2bdfa6674825289c2378c5e018d5abe779ea  pypy3.9-v7.3.16-linux32.tar.bz2
                +16f9c5b808c848516e742986e826b833cdbeda09ad8764e8704595adbe791b23  pypy3.9-v7.3.16-linux64.tar.bz2
                +88f824e7a2d676440d09bc90fc959ae0fd3557d7e2f14bfbbe53d41d159a47fe  pypy3.9-v7.3.16-macos_arm64.tar.bz2
                +fda015431621e7e5aa16359d114f2c45a77ed936992c1efff86302e768a6b21c  pypy3.9-v7.3.16-macos_x86_64.tar.bz2
                +7a56ebb27dba3110dc1ff52d8e0449cdb37fe5c2275f7faf11432e4e164833ba  pypy3.9-v7.3.16-s390x.tar.bz2
                +5b75af3f8e76041e79c1ef5ce22ce63f8bd131733e9302081897d8f650e81843  pypy3.9-v7.3.16-src.tar.bz2
                +def4dae720dd09b868b9b8a7a1255f07f925d88a4543f99cd9ae1aeb0a49ff5e  pypy3.9-v7.3.16-src.zip
                +06ec12a5e964dc0ad33e6f380185a4d295178dce6d6df512f508e7aee00a1323  pypy3.9-v7.3.16-win64.zip
                +

                pypy2.7-v7.3.16 sha256:

                +
                be44e65dd8c00d2388b2580dbe2af6a5179f951a8f4979efc74360f92f3c7e96  pypy2.7-v7.3.16-aarch64.tar.bz2
                +a19712d7a6bd4f6d113e352c5271803c583b5129b76a357d387b1fa85204f8e5  pypy2.7-v7.3.16-linux32.tar.bz2
                +04b2fceb712d6f811274825b8a471ee392d3d1b53afc83eb3f42439ce00d8e07  pypy2.7-v7.3.16-linux64.tar.bz2
                +9cc13f4d6c4096820e1e0ddabb3959f853e45150ce0166a39aa23867e99f0145  pypy2.7-v7.3.16-macos_arm64.tar.bz2
                +e8744c1cef8b9e4eb2d2b6b368ed19a1c5cde482c7ef750f2d9f0807bb77fd1c  pypy2.7-v7.3.16-macos_x86_64.tar.bz2
                +09eb70b932e6aac484cf4b5f2de5845f71589f2cbb53e5ed37a497613b43cd53  pypy2.7-v7.3.16-s390x.tar.bz2
                +43721cc0c397f0f3560b325c20c70b11f7c76c27910d3df09f8418cec4f9c2ad  pypy2.7-v7.3.16-src.tar.bz2
                +54c5f8405bb28e3a48d8962ad1765e8536d53546e1c352bcabab36e5727dd609  pypy2.7-v7.3.16-src.zip
                +a51ac82cc0374f86b5eba571d4e5f23cdce5ac7cd3bd5b2d2d726c0d98684d7d  pypy2.7-v7.3.16-win64.zip
                +

                pypy3.10-v7.3.15 sha256:

                +
                52146fccaf64e87e71d178dda8de63c01577ec3923073dc69e1519622bcacb74  pypy3.10-v7.3.15-aarch64.tar.bz2
                +75dd58c9abd8b9d78220373148355bc3119febcf27a2c781d64ad85e7232c4aa  pypy3.10-v7.3.15-linux32.tar.bz2
                +33c584e9a70a71afd0cb7dd8ba9996720b911b3b8ed0156aea298d4487ad22c3  pypy3.10-v7.3.15-linux64.tar.bz2
                +d927c5105ea7880f7596fe459183e35cc17c853ef5105678b2ad62a8d000a548  pypy3.10-v7.3.15-macos_arm64.tar.bz2
                +559b61ba7e7c5a5c23cef5370f1fab47ccdb939ac5d2b42b4bef091abe3f6964  pypy3.10-v7.3.15-macos_x86_64.tar.bz2
                +209e57596381e13c9914d1332f359dc4b78de06576739747eb797bdbf85062b8  pypy3.10-v7.3.15-s390x.tar.bz2
                +837622130b36603a1893899bd9f529961a8e4a56c9eb67268d72ddf8920c9579  pypy3.10-v7.3.15-src.tar.bz2
                +67432b82dd7e436d818bd6cd38115564f13fc226ffd2940f3915ad68b0fc683b  pypy3.10-v7.3.15-src.zip
                +b378b3ab1c3719aee0c3e5519e7bff93ff67b2d8aa987fe4f088b54382db676c  pypy3.10-v7.3.15-win64.zip
                +

                pypy3.9-v7.3.15 sha256:

                +
                03e35fcba290454bb0ccf7ee57fb42d1e63108d10d593776a382c0a2fe355de0  pypy3.9-v7.3.15-aarch64.tar.bz2
                +c6209380977066c9e8b96e8258821c70f996004ce1bc8659ae83d4fd5a89ff5c  pypy3.9-v7.3.15-linux32.tar.bz2
                +f062be307200bde434817e1620cebc13f563d6ab25309442c5f4d0f0d68f0912  pypy3.9-v7.3.15-linux64.tar.bz2
                +300541c32125767a91b182b03d9cc4257f04971af32d747ecd4d62549d72acfd  pypy3.9-v7.3.15-macos_arm64.tar.bz2
                +18ad7c9cb91c5e8ef9d40442b2fd1f6392ae113794c5b6b7d3a45e04f19edec6  pypy3.9-v7.3.15-macos_x86_64.tar.bz2
                +deeb5e54c36a0fd9cfefd16e63a0d5bed4f4a43e6bbc01c23f0ed8f7f1c0aaf3  pypy3.9-v7.3.15-s390x.tar.bz2
                +6bb9537d85aa7ad13c0aad2e41ff7fd55080bc9b4d1361b8f502df51db816e18  pypy3.9-v7.3.15-src.tar.bz2
                +06dd38124b873343bdf566ca9076ff8e38ad82fd7f2feecd942480c2200a13c0  pypy3.9-v7.3.15-src.zip
                +a156dad8b58570597eaaabe05663f00f80c60bc11df4a9c46d0953b6c5eb9209  pypy3.9-v7.3.15-win64.zip
                +

                pypy2.7-v7.3.15 sha256:

                +
                31b41fca7280636d7818713b7a0fab8f34ece9c82cc88e51d305d43b3e6306d6  pypy2.7-v7.3.15-aarch64.tar.bz2
                +cb5c1da62a8ca31050173c4f6f537bc3ff316026895e5f1897b9bb526babae79  pypy2.7-v7.3.15-linux32.tar.bz2
                +e857553bdc4f25ba9670a5c173a057a9ff71262d5c5da73a6ddef9d7dc5d4f5e  pypy2.7-v7.3.15-linux64.tar.bz2
                +618d33df7ac6570d88a58183e3e15c56f63f862968cecbd2ee896eac6255cea6  pypy2.7-v7.3.15-macos_arm64.tar.bz2
                +72dac262fc63115b6ccd2c3c15e7afd1b2e7a65d7e97265c116246d1cf2cdffd  pypy2.7-v7.3.15-macos_x86_64.tar.bz2
                +eb442279ec3f1eb17da296e38b531d3ca50c6418eab208a020bca4646a1dea46  pypy2.7-v7.3.15-s390x.tar.bz2
                +a66ddaed39544a35bb7ab7a17dbf673a020c7cb3a614bd2b61a54776888daf2c  pypy2.7-v7.3.15-src.tar.bz2
                +a424a065d42b49f6e7f3576cdc3acb60778dd578be8d59f04eccd35c2ef15dc8  pypy2.7-v7.3.15-src.zip
                +ca3c813aec8f9304c7bdc0f69d8ea2a263d4247224ee094e0017338da84c75f2  pypy2.7-v7.3.15-win64.zip
                +

                pypy3.10-v7.3.14 sha256:

                +
                fbef65dfc69dcd6006d843553d268b331f1b13dfc3938492bd35f0f477b5bcf4  pypy3.10-v7.3.14-aarch64.tar.bz2
                +d37e7c7a03bed5dceca2ab7f821ad7655808cccf6908155f78f0effd811b7f4f  pypy3.10-v7.3.14-linux32.tar.bz2
                +a83879891dc0a6c1504da0954fba1125b21a2591782897231a8168100ea72b94  pypy3.10-v7.3.14-linux64.tar.bz2
                +0f09584e21ed8f45e8ff1e3dd1582f077ebdd23a1068298f45006f67bc692632  pypy3.10-v7.3.14-macos_arm64.tar.bz2
                +31ce62b7ea3b5b5bde68241ae9404f0a68f5a7d0094ef651062b7a64caecfd4e  pypy3.10-v7.3.14-macos_x86_64.tar.bz2
                +363e87ad3b6547cc68981c665cf049449bed44cf9e49cabbbcc61df73ea2d40b  pypy3.10-v7.3.14-s390x.tar.bz2
                +a3481af466103fa13740db4e27780e0423dcf8626b3340f60d3d3c28fbc11ae0  pypy3.10-v7.3.14-src.tar.bz2
                +95db3e9d22a4820ad9a683d4f6895fa611b16ed02bd709c86a4ac903f9b36721  pypy3.10-v7.3.14-src.zip
                +1713edd310f400935fe9a9f8fa0fd9da1a405eaf7b69564d00f437fb498327f8  pypy3.10-v7.3.14-win64.zip
                +

                pypy3.9-v7.3.14 sha256:

                +
                14b842f32f60ce2d9d130971f9bcbdb6875824a0e78fac36806d267e0982179c  pypy3.9-v7.3.14-aarch64.tar.bz2
                +4ad89a22369a6f2f83a7d8d047e0fc4cf5597f0921fa7afa23499ed05f663503  pypy3.9-v7.3.14-linux32.tar.bz2
                +febd770a616641ca8419c381c7fb224e515b892551d0db49a1231397ed38859d  pypy3.9-v7.3.14-linux64.tar.bz2
                +4f8f2464a743f855b8fc8bda7ce7994a674616db3b5c2c1955cd08502fa782ca  pypy3.9-v7.3.14-macos_arm64.tar.bz2
                +0e2fea9b2dadb82b7acf05f21c0144f7bb1cfaaa39c693ab1eba4aef5ed52680  pypy3.9-v7.3.14-macos_x86_64.tar.bz2
                +ba2451e9081db5bc724a05530a7f98817231de83ff6fdf15bad21a4e9b6dfeae  pypy3.9-v7.3.14-s390x.tar.bz2
                +560fe6161e159557e1fe612aaadf9b293eefded1da372e70b8e3b23bba598366  pypy3.9-v7.3.14-src.tar.bz2
                +16336170410dd13eb39fbacb412b640c9e3ab4dcdd3e2a8f3ba7978edae1dc2d  pypy3.9-v7.3.14-src.zip
                +9b3d8496f2a4729fdf20d9f835299902048950baad3a42019b67da75ca5b38b7  pypy3.9-v7.3.14-win64.zip
                +

                pypy2.7-v7.3.14 sha256:

                +
                98468f4cc704a2821401afdd001ebddd367e594e05a70c7767fb86f1364fb21a  pypy2.7-v7.3.14-aarch64.tar.bz2
                +b12b4b587da55c8f212ae854e31d29258451e069c65aca596e577644e520bc8b  pypy2.7-v7.3.14-linux32.tar.bz2
                +5938c3c6cddb2e8eb5e435cd3bf61d15134b94a9ac026e26a533bdda6c28a4a0  pypy2.7-v7.3.14-linux64.tar.bz2
                +a428e18fcf1470b032fb1f4d75795aeed9216b4314a4c8a3e4d7e13f10f8607e  pypy2.7-v7.3.14-macos_arm64.tar.bz2
                +8af24683621937e65c518fbca1eb34e17ffc741c2ac917e4ca20694348157d78  pypy2.7-v7.3.14-macos_x86_64.tar.bz2
                +5abc6a0f55a89c08def13b5f410b8e7bd706fe1b472f31db01ecbc4d0a49e8dc  pypy2.7-v7.3.14-s390x.tar.bz2
                +e096fe67ce2d8d4d5e7dceb84fe1ca854498f00766d31b27d32c8d8833131373  pypy2.7-v7.3.14-src.tar.bz2
                +680df6e172c5e5778fe3f7bd0a1f8902148f5de9decc5ec9252e72e94eb49bff  pypy2.7-v7.3.14-src.zip
                +a4c6d35e5ae68dfb773ec34b7d8f1503c8fbfcad817e6147babd6cfd3c8eb071  pypy2.7-v7.3.14-win64.zip
                +

                pypy3.10-v7.3.13 sha256:

                +
                ac476f01c9653358404f2e4b52f62307b2f64ccdb8c96dadcbfe355824d81a63  pypy3.10-v7.3.13-aarch64.tar.bz2
                +bfba57eb1f859dd0ad0d6fe841bb12e1256f1f023c7fbca083b536cccbc1233b  pypy3.10-v7.3.13-linux32.tar.bz2
                +54936eeafd9350a5ea0375b036272a260871b9bca82e1b0bb3201deea9f5a442  pypy3.10-v7.3.13-linux64.tar.bz2
                +efb3007b7aace0af6e3b30d381088a5bbc175973a6627b6b0d624a2ca2dc63ce  pypy3.10-v7.3.13-macos_arm64.tar.bz2
                +2c6238b9ece7b94ffdfd1d9b50619edef4b169a5c78adcdb691fce6709cd6610  pypy3.10-v7.3.13-macos_x86_64.tar.bz2
                +3c813c7efa6a026b281313b299c186c585155fc164c7538e65d41efdabff87c9  pypy3.10-v7.3.13-s390x.tar.bz2
                +4ac1733c19d014d3193c804e7f40ffccbf6924bcaaee1b6089b82b9bf9353a6d  pypy3.10-v7.3.13-src.tar.bz2
                +828fc66eca1c097e44bc910c78ab773a98747268c7ce264da97022e5aca358dc  pypy3.10-v7.3.13-src.zip
                +5b99422fb8978b2f4bbf97961bca49963a82dc47c2fa51b7d23c493db3a2e0f0  pypy3.10-v7.3.13-win64.zip
                +

                pypy3.9-v7.3.13 sha256:

                +
                317d7876c5825a086f854253648b967a432b993ce87695d2895d3ad6ed0d2716  pypy3.9-v7.3.13-aarch64.tar.bz2
                +ac695238b4a3635ac6b482e74e04e2ea78b31acca0decd5de601dfd2f4ebf35a  pypy3.9-v7.3.13-linux32.tar.bz2
                +323b05a9f607e932cda1995cbe77a96e4ea35994631aa6d734c8035e8479b74e  pypy3.9-v7.3.13-linux64.tar.bz2
                +a07b17a790a1952b551e69d47d77a5546ad5e666ed1bd90b9ad60baaca6aa51e  pypy3.9-v7.3.13-macos_arm64.tar.bz2
                +180802aa0122d4a05ec480bf3130c78591ba88fdde25d8e65a92d4a798b318a3  pypy3.9-v7.3.13-macos_x86_64.tar.bz2
                +213c88f652a99c4dc4e8e00b4b5b58f381c7f7e9ea1a9b65801fc0eb1e50df0a  pypy3.9-v7.3.13-s390x.tar.bz2
                +bc6147268105e7cb3bd57b401e6d97f66aa4ede269104b2712a7cdd9f02f68cd  pypy3.9-v7.3.13-src.tar.bz2
                +5036ba37fb07116754f3eab2df6d41f405f947ffbf8d99d62bf743dc1d2c195f  pypy3.9-v7.3.13-src.zip
                +85745a2055c4a8cefac9b6d3f7f305b1edaaf62468c8f640b4511d9dd21d091c  pypy3.9-v7.3.13-win64.zip
                +

                pypy2.7-v7.3.13 sha256:

                +
                f1e20f833cc86a097c1f1318069fc17d01c3988678c1438fe27ed567fcb5cfd0  pypy2.7-v7.3.13-aarch64.tar.bz2
                +b727d2e759a740f45bab1e333029d001c4384b52949bcbb4bd2ad7912eae8dad  pypy2.7-v7.3.13-linux32.tar.bz2
                +e41ceb5dc6c4d3a9311ed5f88edfeedbf3e8abbd1ed3c4f2e151a90a5cf4e1d7  pypy2.7-v7.3.13-linux64.tar.bz2
                +5b86cf0750abc188a0355380d10c7bab1dec51b610cde23ce78f30a9ef296618  pypy2.7-v7.3.13-macos_arm64.tar.bz2
                +50769df0091e8fa51c9e4356e0cb204e6f6aa54f502ec5a6e55aef03d0ac5675  pypy2.7-v7.3.13-macos_x86_64.tar.bz2
                +fbb2f3d92831c02b094f17e9609b95a6202d4bdcddae437e380ab14388d4556e  pypy2.7-v7.3.13-s390x.tar.bz2
                +976984bc6ca5ec9d37ae4e219b020cbed2751d1a02267033f59ed700ba8cec40  pypy2.7-v7.3.13-src.tar.bz2
                +34976f32358349b535081d5b5d48759d6f112a31352dc11c15dcfea44bb041d8  pypy2.7-v7.3.13-src.zip
                +0dc9c18f91f2aee97b95eaec2244e3b22e0183095f359c410d0090c54413dadc  pypy2.7-v7.3.13-win64.zip
                +

                pypy3.10-v7.3.12 sha256:

                +
                26208b5a134d9860a08f74cce60960005758e82dc5f0e3566a48ed863a1f16a1  pypy3.10-v7.3.12-aarch64.tar.bz2
                +811667825ae58ada4b7c3d8bc1b5055b9f9d6a377e51aedfbe0727966603f60e  pypy3.10-v7.3.12-linux32.tar.bz2
                +6c577993160b6f5ee8cab73cd1a807affcefafe2f7441c87bd926c10505e8731  pypy3.10-v7.3.12-linux64.tar.bz2
                +45671b1e9437f95ccd790af10dbeb57733cca1ed9661463b727d3c4f5caa7ba0  pypy3.10-v7.3.12-macos_arm64.tar.bz2
                +dbc15d8570560d5f79366883c24bc42231a92855ac19a0f28cb0adeb11242666  pypy3.10-v7.3.12-macos_x86_64.tar.bz2
                +043c13a585479428b463ab69575a088db74aadc16798d6e677d97f563585fee3  pypy3.10-v7.3.12-s390x.tar.bz2
                +86e4e4eacc36046c6182f43018796537fe33a60e1d2a2cc6b8e7f91a5dcb3e42  pypy3.10-v7.3.12-src.tar.bz2
                +191c275e3f6f2785da783cc7e951cc53cdf9df3b42d4533cd121c526e0b79991  pypy3.10-v7.3.12-src.zip
                +8c3b1d34fb99100e230e94560410a38d450dc844effbee9ea183518e4aff595c  pypy3.10-v7.3.12-win64.zip
                +

                pypy3.9-v7.3.12 sha256:

                +
                e9327fb9edaf2ad91935d5b8563ec5ff24193bddb175c1acaaf772c025af1824  pypy3.9-v7.3.12-aarch64.tar.bz2
                +aa04370d38f451683ccc817d76c2b3e0f471dbb879e0bd618d9affbdc9cd37a4  pypy3.9-v7.3.12-linux32.tar.bz2
                +84c89b966fab2b58f451a482ee30ca7fec3350435bd0b9614615c61dc6da2390  pypy3.9-v7.3.12-linux64.tar.bz2
                +0e8a1a3468b9790c734ac698f5b00cc03fc16899ccc6ce876465fac0b83980e3  pypy3.9-v7.3.12-macos_arm64.tar.bz2
                +64f008ffa070c407e5ef46c8256b2e014de7196ea5d858385861254e7959f4eb  pypy3.9-v7.3.12-macos_x86_64.tar.bz2
                +20d84658a6899bdd2ca35b00ead33a2f56cff2c40dce1af630466d27952f6d4f  pypy3.9-v7.3.12-s390x.tar.bz2
                +e7a2046c7e6c25fc386abbb5132e92a7cc2491e3935699a946cb5dcbb342c2aa  pypy3.9-v7.3.12-src.tar.bz2
                +c65e4082b6da1660041ccb23823e1cbd7759377c391f050e7c1ccad2220f08c0  pypy3.9-v7.3.12-src.zip
                +0996054207b401aeacace1aa11bad82cfcb463838a1603c5f263626c47bbe0e6  pypy3.9-v7.3.12-win64.zip
                +

                pypy2.7-v7.3.12 sha256:

                +
                e04dcb6286a7b4724ec3f0e50d3cc1ba8583301dd1658c06d7f37599e4201c59  pypy2.7-v7.3.12-aarch64.tar.bz2
                +abf3ae477bd0e526ac6dcefe0bfa845e1535aa053342c0d641219bfcde4b9b56  pypy2.7-v7.3.12-linux32.tar.bz2
                +1a61a2574b79466f606010f2999a2b995bd96cd085f91a78ebdd3d5c2c40e81d  pypy2.7-v7.3.12-linux64.tar.bz2
                +6b747aa076ae8597e49603c5dec4ca5935a1a0a132d7404a559be96a260d9bf7  pypy2.7-v7.3.12-macos_arm64.tar.bz2
                +6e89ffdd15537ce4ffce3145b65ee57c2e9c952892bd95b934012d2f009f503b  pypy2.7-v7.3.12-macos_x86_64.tar.bz2
                +80c0154d8b0949f9dc6a227c322abbc9590c8ae4c9f11c13bf4022aa38b82064  pypy2.7-v7.3.12-s390x.tar.bz2
                +dd61d88da274c2ce2cec77667d4a3df9a652bcc50e26f90991d4dd0af66bccf4  pypy2.7-v7.3.12-src.tar.bz2
                +99cfea9862299cb043914167f4ddc69171c3f38462b6e1ab170df0aab423ca0f  pypy2.7-v7.3.12-src.zip
                +84cd3b98812d47a1ddb36f3417cc96b3dbdfa32c2b4e16438f205e1253f7ccea  pypy2.7-v7.3.12-win64.zip
                +

                pypy3.10-v7.3.12rc2 sha256:

                +
                a6dc89b8100f423d5f8f5f579db3691e0ec5f14c2d92d529d70054263e202810  pypy3.10-v7.3.12rc2-aarch64.tar.bz2
                +5607812d1fc9ec62956555a88b75f9178fadba090759f7c0941341b9d761e6ef  pypy3.10-v7.3.12rc2-linux32.tar.bz2
                +6be46911c20152de7d317cf8b2b7c83933a18a9d4193c41e0b70810381fc8d09  pypy3.10-v7.3.12rc2-linux64.tar.bz2
                +7c353cce25d76482e6b03e298891e7a5433b1c825391bc9f14b93abdd365276b  pypy3.10-v7.3.12rc2-macos_arm64.tar.bz2
                +098e408004813c126f09989588d586428982278c2a79a5f216f55b29db2f05de  pypy3.10-v7.3.12rc2-macos_x86_64.tar.bz2
                +2a842af10a5b1f3be97866af21a7108951c45af7b0ffb757a8e1e1ffd2c76718  pypy3.10-v7.3.12rc2-s390x.tar.bz2
                +d8c51b7bb88dd1343195d088c95b4b53c704ae2c7a517ba8d8f8c728bf150683  pypy3.10-v7.3.12rc2-src.tar.bz2
                +cc695d4e48bc29867e171071524d97cd4cd903ec965ee0748c3dde2b012ae36a  pypy3.10-v7.3.12rc2-src.zip
                +cd3b1b409d41ea694a2d22f15afcab12305c058b8fa2a197c49e96b1c5fb776c  pypy3.10-v7.3.12rc2-win64.zip
                +

                pypy3.9-v7.3.12rc2 sha256:

                +
                0e50aafa4e92413573cff9d579613175e5cdc128bda91a47154c9909b47c2f4c  pypy3.9-v7.3.12rc2-aarch64.tar.bz2
                +37335affc962acd79fcd1f08cce19c3d2a3d2d2f6e9ba73d6c804160fd42b471  pypy3.9-v7.3.12rc2-linux32.tar.bz2
                +79a3d32a21534d784f2ac4934d157354aba4871b72c39ac7908e9d853c16c3ad  pypy3.9-v7.3.12rc2-linux64.tar.bz2
                +4b4adfb435c3677bf7c518413c2c53282789ceadd747bec19ed42ce0eb7192ed  pypy3.9-v7.3.12rc2-macos_arm64.tar.bz2
                +3b29d34919f53136a2272363d819eb4e678368a01d5a182feae04a78a505d15d  pypy3.9-v7.3.12rc2-macos_x86_64.tar.bz2
                +9d760b96db54f8d51c47c78397d70dbf61e1144de5afe6840deb3b9a7c265381  pypy3.9-v7.3.12rc2-s390x.tar.bz2
                +4835d2f3814c92851f7930398f397cd0e938de165329c019d86561d9482c9daf  pypy3.9-v7.3.12rc2-src.tar.bz2
                +453d84e4104216c23a466fc58f58231c051eafabf258c1c907b41ffe9955219b  pypy3.9-v7.3.12rc2-src.zip
                +559fa00f89eab23c87ac2132ef30fb456631f4ff4bb8009d60900be57594dbea  pypy3.9-v7.3.12rc2-win64.zip
                +

                pypy2.7-v7.3.12rc2 sha256:

                +
                561c6496251fbdf36ecfeaa08bc2dc89f24ef3044dde6d9f297efc798726e49d  pypy2.7-v7.3.12rc2-aarch64.tar.bz2
                +a66cfb8fd8a88a60bcefca14364c7e87f2932f978b81187572064e1df16c0285  pypy2.7-v7.3.12rc2-linux32.tar.bz2
                +03d68b7d43751807cc4e7743a3977f2359cc4b6f0acaad00057b1b4158efe51a  pypy2.7-v7.3.12rc2-linux64.tar.bz2
                +0cd0fc59894325ab30585fc2bee1d244b2b788d04e3aec46dafb0e2b3b232657  pypy2.7-v7.3.12rc2-macos_arm64.tar.bz2
                +75587e171ea77ccbdcc9e0f062c9bd55bc374083ac106eeb788321dc5f031aa6  pypy2.7-v7.3.12rc2-macos_x86_64.tar.bz2
                +5968a009c19bf723eda722e9ff1b95986a1b5c79247269532f99e0b25819089a  pypy2.7-v7.3.12rc2-s390x.tar.bz2
                +6c69d4260554ef677d9dfb3b81a1dbd6f4d7302ef0170d1c66616865a711317f  pypy2.7-v7.3.12rc2-src.tar.bz2
                +a4cbe00a2bef9181929b4577c535f327021ee6af596ac0ad8d577e2a67b44a5f  pypy2.7-v7.3.12rc2-src.zip
                +2bcab9251209b44eb0f7059f91c070d1de19abcfc42397e437ebe3be2faaaf5d  pypy2.7-v7.3.12rc2-win64.zip
                +

                pypy3.10-v7.3.12rc1 sha256:

                +
                3e92ba4977c1937913c5a4cb04ee31fa809cb44d12eefcfcd5b7ef64fa2d1a45  pypy3.10-v7.3.12rc1-aarch64.tar.bz2
                +889f887eada150cdbf3bfce5bb209fae90a64ad99590047c1123452431d43659  pypy3.10-v7.3.12rc1-linux32.tar.bz2
                +cbc86894e22bd06f5d99dbd274dcfe0c2cacfb213f6522e06153010f40423dcc  pypy3.10-v7.3.12rc1-linux64.tar.bz2
                +9e135570058c227515f742b0600c1a209f154a72247ba52073c0026f6bdc5022  pypy3.10-v7.3.12rc1-macos_arm64.tar.bz2
                +3f423b794962e0ddbf95a1f40591f008e7b62a603206f65a239b25953308fbf6  pypy3.10-v7.3.12rc1-macos_x86_64.tar.bz2
                +94d25c8777eff222e4cdb7419db7e49ad1b789e88fb6d59ab930e474180c74c1  pypy3.10-v7.3.12rc1-s390x.tar.bz2
                +8952f17d401babd69f9bd4f7a417c19f07e1ed7bd078721eadf90f55914793e4  pypy3.10-v7.3.12rc1-src.tar.bz2
                +c11b44ab9396bc6ce2a1ff5be514c27b1b327f79da6ba2cad635ea90e590ab5c  pypy3.10-v7.3.12rc1-src.zip
                +2a2c285909f67984691f7861637a633c06cb30e59374744de08c0dbfbd89a151  pypy3.10-v7.3.12rc1-win64.zip
                +

                pypy3.9-v7.3.12rc1 sha256:

                +
                4be87ceb5d522e8f0619a06660a7b68252add41b60ab4957d8f899d4893f6a15  pypy3.9-v7.3.12rc1-aarch64.tar.bz2
                +0219d3353eda1526828d4b48e773d045469c0b0dafd95b0bfae72b4ef258bd02  pypy3.9-v7.3.12rc1-linux32.tar.bz2
                +298ab60c5e1d56924767a4c2fcb5b3c66561c2128ca385c207193b2b3c61a5f9  pypy3.9-v7.3.12rc1-linux64.tar.bz2
                +759b5d4de479b67c01df168c482f00cfdc75475f8401bfecd4f6bd7f0be2df23  pypy3.9-v7.3.12rc1-macos_arm64.tar.bz2
                +5d3286920bba60af7bf8a4047b879a04302d2d0e7038965bef26f2dabd235b88  pypy3.9-v7.3.12rc1-macos_x86_64.tar.bz2
                +77a27d2cde6e101b94acbc663c3c530568ed509fcdb0eaec149a195410c6efba  pypy3.9-v7.3.12rc1-s390x.tar.bz2
                +7ef838e96bdd6e672868e705eb04cfbe67a5e4495e7bf374e6fc0d68fa285f7f  pypy3.9-v7.3.12rc1-src.tar.bz2
                +4bf7eeb2263051838e38ff483f734994c0e1cfd2b818eddbe9e30ae8d9f6fd83  pypy3.9-v7.3.12rc1-src.zip
                +a78186a26590d87c48a81902a0118f6c3c70f4ef895f3ceb2fcc714a338832a7  pypy3.9-v7.3.12rc1-win64.zip
                +

                pypy2.7-v7.3.12rc1 sha256:

                +
                79a87e1e7b3e6bd77117bedb2efe45c0de3cf9e055f688fc7a038969d058de1f  pypy2.7-v7.3.12rc1-aarch64.tar.bz2
                +0aef12d0a4fe998125c3e6758325905c7b7fc9b348915c4241af89953e04fdc0  pypy2.7-v7.3.12rc1-linux32.tar.bz2
                +eb7f8be5f120edc29211c2ccaff4be219dcfb82030db3f667ce2c88e859217f1  pypy2.7-v7.3.12rc1-linux64.tar.bz2
                +0552074ff977ea860b1989e298dd27d54f5d59e180b9b605922c0ba8becfcf6e  pypy2.7-v7.3.12rc1-macos_arm64.tar.bz2
                +6dc763c8d25b00c4931e1989e09a429065b41eccf1d39cf85eb09b35846615b4  pypy2.7-v7.3.12rc1-macos_x86_64.tar.bz2
                +b2a498c7d10150ad416b27be30b7ec38a61b208eecf2d58eadb6ce822e9d5ca3  pypy2.7-v7.3.12rc1-s390x.tar.bz2
                +23c1ecf2b28aae2aa676a1b2eb2bdbf7db18d8718489db6d8501fb9a4b232f49  pypy2.7-v7.3.12rc1-src.tar.bz2
                +60cf43bae08c87dfdd3e70be54604c6ca559c14ecf53181dc162c3befd5f8df0  pypy2.7-v7.3.12rc1-src.zip
                +5f0786c0c797700458ff0cb9cfe750dd5b81a7ca3175d9ffcb55b5418b707e9c  pypy2.7-v7.3.12rc1-win64.zip
                +

                pypy3.9-v7.3.11 sha256:

                +
                09175dc652ed895d98e9ad63d216812bf3ee7e398d900a9bf9eb2906ba8302b9  pypy3.9-v7.3.11-aarch64.tar.bz2
                +0099d72c2897b229057bff7e2c343624aeabdc60d6fb43ca882bff082f1ffa48  pypy3.9-v7.3.11-linux32.tar.bz2
                +d506172ca11071274175d74e9c581c3166432d0179b036470e3b9e8d20eae581  pypy3.9-v7.3.11-linux64.tar.bz2
                +91ad7500f1a39531dbefa0b345a3dcff927ff9971654e8d2e9ef7c5ae311f57e  pypy3.9-v7.3.11-macos_arm64.tar.bz2
                +d33f40b207099872585afd71873575ca6ea638a27d823bc621238c5ae82542ed  pypy3.9-v7.3.11-macos_x86_64.tar.bz2
                +e1f30f2ddbe3f446ddacd79677b958d56c07463b20171fb2abf8f9a3178b79fc  pypy3.9-v7.3.11-s390x.tar.bz2
                +b0f3166fb2a5aadfd5ceb9db5cdd5f7929a0eccca02b4a26c0dae0492f7ca8ea  pypy3.9-v7.3.11-src.tar.bz2
                +3d2f473590b1390478e281a2e0d209b5df7cc9f26c33e73baecf7bd0f62bc848  pypy3.9-v7.3.11-src.zip
                +57faad132d42d3e7a6406fcffafffe0b4f390cf0e2966abb8090d073c6edf405  pypy3.9-v7.3.11-win64.zip
                +

                pypy3.8-v7.3.11 sha256:

                +
                9a2fa0b8d92b7830aa31774a9a76129b0ff81afbd22cd5c41fbdd9119e859f55  pypy3.8-v7.3.11-aarch64.tar.bz2
                +a79b31fce8f5bc1f9940b6777134189a1d3d18bda4b1c830384cda90077c9176  pypy3.8-v7.3.11-linux32.tar.bz2
                +470330e58ac105c094041aa07bb05676b06292bc61409e26f5c5593ebb2292d9  pypy3.8-v7.3.11-linux64.tar.bz2
                +78cdc79ff964c4bfd13eb45a7d43a011cbe8d8b513323d204891f703fdc4fa1a  pypy3.8-v7.3.11-macos_arm64.tar.bz2
                +194ca0b4d91ae409a9cb1a59eb7572d7affa8a451ea3daf26539aa515443433a  pypy3.8-v7.3.11-macos_x86_64.tar.bz2
                +eab7734d86d96549866f1cba67f4f9c73c989f6a802248beebc504080d4c3fcd  pypy3.8-v7.3.11-s390x.tar.bz2
                +4d6769bfca73734e8666fd70503b7ceb06a6e259110e617331bb3899ca4e6058  pypy3.8-v7.3.11-src.tar.bz2
                +3e635c7d4d5ded1c5f41f7a9f277a0ee3dfd21a545516fb68e90240dca66fa07  pypy3.8-v7.3.11-src.zip
                +0f46fb6df32941ea016f77cfd7e9b426d5ac25a2af2453414df66103941c8435  pypy3.8-v7.3.11-win64.zip
                +

                pypy2.7-v7.3.11 sha256:

                +
                ea924da1defe9325ef760e288b04f984614e405580f5321eb6a5c8f539bd415a  pypy2.7-v7.3.11-aarch64.tar.bz2
                +30fd245fab7068c96a75b9ff1323ac55174c64fc8c4751cceb4b7a9bedc1851e  pypy2.7-v7.3.11-linux32.tar.bz2
                +ba8ed958a905c0735a4cfff2875c25089954dc020e087d982b0ffa5b9da316cd  pypy2.7-v7.3.11-linux64.tar.bz2
                +cc5696ab4f93cd3481c1e4990b5dedd7ba60ac0602fa1890d368889a6c5bf771  pypy2.7-v7.3.11-macos_arm64.tar.bz2
                +56deee9c22640f5686c35b9d64fdb1ce3abd044583e4078f0b171ca2fd2a198e  pypy2.7-v7.3.11-macos_x86_64.tar.bz2
                +8fe9481c473178e53266983678684a70fe0c42bafc95f1807bf3ef28770316d4  pypy2.7-v7.3.11-s390x.tar.bz2
                +1117afb66831da4ea6f39d8d2084787a74689fd0229de0be301f9ed9b255093c  pypy2.7-v7.3.11-src.tar.bz2
                +6df2ddd9a925eac5294ae5a5f8916baefbc4bc3298d7cdada18fc1fa71aa0670  pypy2.7-v7.3.11-src.zip
                +106942702de0df148e39fa44a33e76b8a362341e1460d4e5e61b3ff0e64e5514  pypy2.7-v7.3.11-win64.zip
                +

                pypy3.9-v7.3.10 sha256:

                +
                657a04fd9a5a992a2f116a9e7e9132ea0c578721f59139c9fb2083775f71e514  pypy3.9-v7.3.10-aarch64.tar.bz2
                +b6db59613b9a1c0c1ab87bc103f52ee95193423882dc8a848b68850b8ba59cc5  pypy3.9-v7.3.10-linux32.tar.bz2
                +95cf99406179460d63ddbfe1ec870f889d05f7767ce81cef14b88a3a9e127266  pypy3.9-v7.3.10-linux64.tar.bz2
                +e2a6bec7408e6497c7de8165aa4a1b15e2416aec4a72f2578f793fb06859ccba  pypy3.9-v7.3.10-macos_arm64.tar.bz2
                +f90c8619b41e68ec9ffd7d5e913fe02e60843da43d3735b1c1bc75bcfe638d97  pypy3.9-v7.3.10-macos_x86_64.tar.bz2
                +ca6525a540cf0c682d1592ae35d3fbc97559a97260e4b789255cc76dde7a14f0  pypy3.9-v7.3.10-s390x.tar.bz2
                +3738d32575ed2513e3e66878e4e4c6c208caed267570f3f9f814748830002967  pypy3.9-v7.3.10-src.tar.bz2
                +e3e2c41db0a5590d31233fd2909feeb83b1e7f997a473d74a11ad87ba4bbdc30  pypy3.9-v7.3.10-src.zip
                +07e18b7b24c74af9730dfaab16e24b22ef94ea9a4b64cbb2c0d80610a381192a  pypy3.9-v7.3.10-win64.zip
                +
                +2775f1eca62dd1eab0af09f8e4b1640b5c86f18a766ed46ff9aa7dc8aa916c13  pypy3.9-v7.3.10rc3-aarch64.tar.bz2
                +68b2f1b986217475fc98bc0e5a98b4bb0c602ec1d603abbeef9ada89c9ff7048  pypy3.9-v7.3.10rc3-linux32.tar.bz2
                +1cf9db691cadbf870c9af4a6af7ab89cbf24fef0469d63fd0d857656ee4adee6  pypy3.9-v7.3.10rc3-linux64.tar.bz2
                +b585ab42f95aa7f0e713c6c22aba030e5d49d78ba79e8d005e754384d33cfaa4  pypy3.9-v7.3.10rc3-macos_arm64.tar.bz2
                +73550941c02349c5d1051331f590962da9a0eff52e793295c1a3bd2a72dc461e  pypy3.9-v7.3.10rc3-macos_x86_64.tar.bz2
                +abb736466180c3cc68ff5cd0d9b07cfabebc26989eb7fc5e9a9512e1bbe234c2  pypy3.9-v7.3.10rc3-s390x.tar.bz2
                +a313e85a073f3a4d9c592e142e69c856b40afd29473665d7f41fe07d50ecbad2  pypy3.9-v7.3.10rc3-src.tar.bz2
                +6f5ead6ccdf7544eb5a7e33e352a361bfd19f6bfcd91f9e121843b4e2ae9c590  pypy3.9-v7.3.10rc3-src.zip
                +f5ae260d8557d7380d595c93ccd2b7bbaff718d8dd82051034444479a89e1c37  pypy3.9-v7.3.10rc3-win64.zip
                +

                pypy3.8-v7.3.10 sha256:

                +
                e4caa1a545f22cfee87d5b9aa6f8852347f223643ad7d2562e0b2a2f4663ad98  pypy3.8-v7.3.10-aarch64.tar.bz2
                +b70ed7fdc73a74ebdc04f07439f7bad1a849aaca95e26b4a74049d0e483f071c  pypy3.8-v7.3.10-linux32.tar.bz2
                +ceef6496fd4ab1c99e3ec22ce657b8f10f8bb77a32427fadfb5e1dd943806011  pypy3.8-v7.3.10-linux64.tar.bz2
                +6cb1429371e4854b718148a509d80143f801e3abfc72fef58d88aeeee1e98f9e  pypy3.8-v7.3.10-macos_arm64.tar.bz2
                +399eb1ce4c65f62f6a096b7c273536601b7695e3c0dc0457393a659b95b7615b  pypy3.8-v7.3.10-macos_x86_64.tar.bz2
                +c294f8e815158388628fe77ac5b8ad6cd93c8db1359091fa02d41cf6da4d61a1  pypy3.8-v7.3.10-s390x.tar.bz2
                +218a1e062f17aba89f61bc398e8498f13c048b9fcf294343f5d9d56c3ac9b882  pypy3.8-v7.3.10-src.tar.bz2
                +0e4dd55729a2bf8c9bf963c769004b287ef57576ddb402e71e387847a7c31c0a  pypy3.8-v7.3.10-src.zip
                +362dd624d95bd64743190ea2539b97452ecb3d53ea92ceb2fbe9f48dc60e6b8f  pypy3.8-v7.3.10-win64.zip
                +
                +d7feab3fd0e670dc66277ad710d2a26dd5ec3def68cb4fdf2697e570b74ab62e  pypy3.8-v7.3.10rc3-aarch64.tar.bz2
                +4a33b7e08033527e9f8c6dc2a3d6a8d0163c381b9e75813cfe1a7865caf335ae  pypy3.8-v7.3.10rc3-linux32.tar.bz2
                +7ab218ab7f05a156ad3ea3b498e6da94dd7e7e77dfe03ee77e5827af755a6207  pypy3.8-v7.3.10rc3-linux64.tar.bz2
                +d77a5f94690e8e74d3ae57d1f65ef657c670614559447a196da001de943e1fa5  pypy3.8-v7.3.10rc3-macos_arm64.tar.bz2
                +fa15127affd9dbc6d447cf48a99fe4795423132070b84b802d2dc8cbecd9607e  pypy3.8-v7.3.10rc3-macos_x86_64.tar.bz2
                +8d3e07840be537b6b879add1b34a082dde156f7c2a8c5d75be60e9192393533d  pypy3.8-v7.3.10rc3-s390x.tar.bz2
                +5284dfba00f4ffcdf29b732cf7f2e63f29d1f33295f826a2caefb1f782cedaef  pypy3.8-v7.3.10rc3-src.tar.bz2
                +d8a2992734463e8db5ca4209c5ce7f9fcc2965f9fbd975cb04a4e173b6d2411b  pypy3.8-v7.3.10rc3-src.zip
                +fab16618e7adf8c268c7f48032f51d6d4985734d672d18712fe8b557fe9c9abe  pypy3.8-v7.3.10rc3-win64.zip
                +

                pypy2.7-v7.3.10 sha256:

                +
                274342f0e75e99d60ba7a0cfb0e13792e7664163e01450d2f7f2f7825603a0ae  pypy2.7-v7.3.10-aarch64.tar.bz2
                +0b17132f62d2a0c3c4572c57eb53820f25611afad71f3d6a310202942baed6e1  pypy2.7-v7.3.10-linux32.tar.bz2
                +461fb6df524208af9e94ffb16989f628b585bdb4b9e97d81e668899fc3a064a3  pypy2.7-v7.3.10-linux64.tar.bz2
                +14b178f005603e3df6db7574b77b9c65ae79feda1a629214cafcb4eee7da679d  pypy2.7-v7.3.10-macos_arm64.tar.bz2
                +188551185ee945d5e42a3a619205d02ac31db77bdd5d98b6c11469e125c3bdb5  pypy2.7-v7.3.10-macos_x86_64.tar.bz2
                +0fac1ec1e05c70941f758be05d40ce7ffe6a42c0416e70b55d40a7523e3e70ae  pypy2.7-v7.3.10-s390x.tar.bz2
                +35e2cf4519cb51c4d5ffb4493ee24f0c7f42b4b04944903ca4b33981a04a3bc5  pypy2.7-v7.3.10-src.tar.bz2
                +ece8975f49b192cc6e3169301a3c3ef71822cc7b52e70d7d8b506f54f917e14e  pypy2.7-v7.3.10-src.zip
                +2915b5201a5f71546951bc41efd80f40b2ed709511bc526219a70f3ae37b918e  pypy2.7-v7.3.10-win64.zip
                +
                +85f0b2f0bffe9a9a0fe17382c25f595be7c7ca9a4d070eaf98cb4258bdc8f703  pypy2.7-v7.3.10rc3-aarch64.tar.bz2
                +38f0fe020ac7880ae4e843d2cacdfcceecd0d7dca5fd2769f13b60a1e6bf8e86  pypy2.7-v7.3.10rc3-linux32.tar.bz2
                +e6d7330c16f503e1c21dacb22c525974f1d81fea86ef32e0d21239d9d372b4d5  pypy2.7-v7.3.10rc3-linux64.tar.bz2
                +5f62122884e87b263ce3f416513e1f380276fc327570cff07daac864907b1d1e  pypy2.7-v7.3.10rc3-macos_arm64.tar.bz2
                +6de0c73285378cae79ee92566e38296e91382cd5df0322224d006dd2e2429489  pypy2.7-v7.3.10rc3-macos_x86_64.tar.bz2
                +0c350a480a928c9ed0fca0a531f333946269c32f9673c9d461772c48eccc5380  pypy2.7-v7.3.10rc3-s390x.tar.bz2
                +2514df50aeb2dafd8fd13b299dd3a1a30986e5e396a7ea253410d3126b7ad245  pypy2.7-v7.3.10rc3-src.tar.bz2
                +dbd30ad54104ffb9ada8717cec068958b15c4ad9a22e37b192acdd1495e9ec44  pypy2.7-v7.3.10rc3-src.zip
                +f95114991fbe1bc6aa87466a62efbba6d6e4e1a8c95b5efd43a402ece0371357  pypy2.7-v7.3.10rc3-win64.zip
                +

                pypy3.9-v7.3.9 sha256:

                +
                2e1ae193d98bc51439642a7618d521ea019f45b8fb226940f7e334c548d2b4b9  pypy3.9-v7.3.9-aarch64.tar.bz2
                +0de4b9501cf28524cdedcff5052deee9ea4630176a512bdc408edfa30914bae7  pypy3.9-v7.3.9-linux32.tar.bz2
                +46818cb3d74b96b34787548343d266e2562b531ddbaf330383ba930ff1930ed5  pypy3.9-v7.3.9-linux64.tar.bz2
                +59c8852168b2b1ba1f0211ff043c678760380d2f9faf2f95042a8878554dbc25  pypy3.9-v7.3.9-osx64.tar.bz2
                +774dca83bcb4403fb99b3d155e7bd572ef8c52b9fe87a657109f64e75ad71732  pypy3.9-v7.3.9-s390x.tar.bz2
                +2abaa1e9fe1ec0e233c9fbc377a0c8e9a0634080a8f4f30eb6898301f6618c12  pypy3.9-v7.3.9-src.tar.bz2
                +1c67e33882052ab53e464e398898abefd6df7ff7127bf754be88bb17938759f2  pypy3.9-v7.3.9-src.zip
                +be48ab42f95c402543a7042c999c9433b17e55477c847612c8733a583ca6dff5  pypy3.9-v7.3.9-win64.zip
                +

                pypy3.8-v7.3.9 sha256:

                +
                5e124455e207425e80731dff317f0432fa0aba1f025845ffca813770e2447e32  pypy3.8-v7.3.9-aarch64.tar.bz2
                +4b261516c6c59078ab0c8bd7207327a1b97057b4ec1714ed5e79a026f9efd492  pypy3.8-v7.3.9-linux32.tar.bz2
                +08be25ec82fc5d23b78563eda144923517daba481a90af0ace7a047c9c9a3c34  pypy3.8-v7.3.9-linux64.tar.bz2
                +91a5c2c1facd5a4931a8682b7d792f7cf4f2ba25cd2e7e44e982139a6d5e4840  pypy3.8-v7.3.9-osx64.tar.bz2
                +c6177a0016c9145c7b99fddb5d74cc2e518ccdb216a6deb51ef6a377510cc930  pypy3.8-v7.3.9-s390x.tar.bz2
                +5b5d9d9256f12a129af8384e2f581bdfab3bc0fbbe3a0a480d9c1d2e95490eb1  pypy3.8-v7.3.9-src.tar.bz2
                +d4f716f324ebbd7ec3c0e0e309c2d7dd76846f693f50b7796820acf346147401  pypy3.8-v7.3.9-src.zip
                +05022baaa55db2b60880f2422312d9e4025e1267303ac57f33e8253559d0be88  pypy3.8-v7.3.9-win64.zip
                +

                pypy3.7-v7.3.9 sha256:

                +
                dfc62f2c453fb851d10a1879c6e75c31ffebbf2a44d181bb06fcac4750d023fc  pypy3.7-v7.3.9-aarch64.tar.bz2
                +3398cece0167b81baa219c9cd54a549443d8c0a6b553ec8ec13236281e0d86cd  pypy3.7-v7.3.9-linux32.tar.bz2
                +c58195124d807ecc527499ee19bc511ed753f4f2e418203ca51bc7e3b124d5d1  pypy3.7-v7.3.9-linux64.tar.bz2
                +12d92f578a200d50959e55074b20f29f93c538943e9a6e6522df1a1cc9cef542  pypy3.7-v7.3.9-osx64.tar.bz2
                +fcab3b9e110379948217cf592229542f53c33bfe881006f95ce30ac815a6df48  pypy3.7-v7.3.9-s390x.tar.bz2
                +70426163b194ee46009986eea6d9426098a3ffb552d9cdbd3dfaa64a47373f49  pypy3.7-v7.3.9-src.tar.bz2
                +3643392817cfd0826f70be3d026c2f119904b2bfb40c39c32bad84f5a6aa02f5  pypy3.7-v7.3.9-src.zip
                +8acb184b48fb3c854de0662e4d23a66b90e73b1ab73a86695022c12c745d8b00  pypy3.7-v7.3.9-win64.zip
                +

                pypy2.7-v7.3.9 sha256:

                +
                aff4e4dbab53448f662cd01acb2251571d60f836d2f48382a7d8da54ca5b3442  pypy2.7-v7.3.9-aarch64.tar.bz2
                +bbf4e7343d43c8217099a9bffeed6a1781f4b5a3e186ed1a0befca65e647aeb9  pypy2.7-v7.3.9-linux32.tar.bz2
                +172a928b0096a7e00b7d58f523f57300c35c3de7f822491e2a7bc845375c23f8  pypy2.7-v7.3.9-linux64.tar.bz2
                +77314f5a6b2cc35d24e6f952bef89f5da612b90e4127a8034aed708d9ae483c4  pypy2.7-v7.3.9-osx64.tar.bz2
                +62481dd3c6472393ca05eb3a0880c96e4f5921747157607dbaa772a7369cab77  pypy2.7-v7.3.9-s390x.tar.bz2
                +39b0972956f6548ce5828019dbae12503c32d6cbe91a2becf88d3e42cc52197b  pypy2.7-v7.3.9-src.tar.bz2
                +3400e6b03cfcecd0a2f90271e4dd44e5fe862c7bf82a43535114ad57b57af555  pypy2.7-v7.3.9-src.zip
                +ca7b0f4c576995b388cfb4c796e3f6f20b037e5314571bf267daa068a3a2af31  pypy2.7-v7.3.9-win64.zip
                +

                pypy3.9-v7.3.8 sha256:

                +
                89d7ee12a8c416e83fae80af82482531fc6502321e75e5b7a0cc01d756ee5f0e  pypy3.9-v7.3.8-aarch64.tar.bz2
                +b7282bc4484bceae5bc4cc04e05ee4faf51cb624c8fc7a69d92e5fdf0d0c96aa  pypy3.9-v7.3.8-aarch64-portable.tar.bz2
                +a0d18e4e73cc655eb02354759178b8fb161d3e53b64297d05e2fff91f7cf862d  pypy3.9-v7.3.8-linux32.tar.bz2
                +129a055032bba700cd1d0acacab3659cf6b7180e25b1b2f730e792f06d5b3010  pypy3.9-v7.3.8-linux64.tar.bz2
                +95bd88ac8d6372cd5b7b5393de7b7d5c615a0c6e42fdb1eb67f2d2d510965aee  pypy3.9-v7.3.8-osx64.tar.bz2
                +37b596bfe76707ead38ffb565629697e9b6fa24e722acc3c632b41ec624f5d95  pypy3.9-v7.3.8-s390x.tar.bz2
                +546b7fc3789728869d5ada7b6a95ce9d03047e8489b92ada84613c900e431ee9  pypy3.9-v7.3.8-src.tar.bz2
                +c5cece54ce0444943ae43fe672b13b21b3915d1e71ac730589de8204ec6f417a  pypy3.9-v7.3.8-src.zip
                +c1b2e4cde2dcd1208d41ef7b7df8e5c90564a521e7a5db431673da335a1ba697  pypy3.9-v7.3.8-win64.zip
                +
                +81c58e0c0eb0f76801d0ac8cb528dd8a0b1e4138a4062e3e64e71beeadeccb79  pypy3.9-v7.3.8rc2-linux32.tar.bz2
                +22ec1af269d68f7288a48f49ca58cb55fb9cb78f6ae58341cd13484838327751  pypy3.9-v7.3.8rc2-linux64.tar.bz2
                +b49e569944f712f257e7557e61e21b36b388c9af09ce8a09085e93a51a8e3f95  pypy3.9-v7.3.8rc2-osx64.tar.bz2
                +47824c665d7992dafbe8f00749f72b606bc3478c80adaaea340100f349e7b207  pypy3.9-v7.3.8rc2-s390x.tar.bz2
                +53d47b101a6ff31b07b79429b0cf62e06efb29c3147799ab5aaac270ff17581b  pypy3.9-v7.3.8rc2-src.tar.bz2
                +c84e8094ecca6f90930d527e2c2ca6c37d1da6009ba16d8eef4d02d02a5b05b5  pypy3.9-v7.3.8rc2-src.zip
                +b118fd06197e1218917fa9577874d6bc31a7488f057d5000377c63ee6cd0beca  pypy3.9-v7.3.8rc2-win64.zip
                +
                +89dd0399a89a04b58c22e9b773747258807996bd5071dbf996a85bf8af432393  pypy3.9-v7.3.8rc1-linux32.tar.bz2
                +f3f90203afcf7ee359e8c8a871bfaa06d96f926781fd94fb81f471dcd32f7332  pypy3.9-v7.3.8rc1-linux64.tar.bz2
                +9a5d7217d8173bbdf1c7351b34651fee0596b0bcfe6fe4becae150d4a5469487  pypy3.9-v7.3.8rc1-osx64.tar.bz2
                +4651d804341046be824af0ca35b7ebbbb6d5cdcef0d4a373891398dba182d010  pypy3.9-v7.3.8rc1-src.tar.bz2
                +c4db62a854c2cc994d46fac0105a8e3bd4273093b9844c1f7cb69118fae6df72  pypy3.9-v7.3.8rc1-src.zip
                +ad214e4a44c893dc503e7e0b6f6bdfa7523db80b9d4890523f8ee96339d05fc9  pypy3.9-v7.3.8rc1-win64.zip
                +

                pypy3.8-v7.3.8 sha256:

                +
                fe41df391f87239925e573e195e631a9d03d37f471eb1479790ee13ca47a28af  pypy3.8-v7.3.8-aarch64.tar.bz2
                +0210536e9f1841ba283c13b04783394050837bb3e6f4091c9f1bd9c7f2b94b55  pypy3.8-v7.3.8-aarch64-portable.tar.bz2
                +bea4b275decd492af6462157d293dd6fcf08a949859f8aec0959537b40afd032  pypy3.8-v7.3.8-linux32.tar.bz2
                +089f8e3e357d6130815964ddd3507c13bd53e4976ccf0a89b5c36a9a6775a188  pypy3.8-v7.3.8-linux64.tar.bz2
                +de1b283ff112d76395c0162a1cf11528e192bdc230ee3f1b237f7694c7518dee  pypy3.8-v7.3.8-osx64.tar.bz2
                +ad53d373d6e275a41ca64da7d88afb6a17e48e7bfb2a6fff92daafdc06da6b90  pypy3.8-v7.3.8-s390x.tar.bz2
                +f1a378b264cdbfb0e03d77dfc4d105d02f91d542bd7c9c957d1f8083a9808f1f  pypy3.8-v7.3.8-src.tar.bz2
                +7abf870044c95b31c8e1a0a32e887485b56f3c0a3151401446b113a0a65111b4  pypy3.8-v7.3.8-src.zip
                +0894c468e7de758c509a602a28ef0ba4fbf197ccdf946c7853a7283d9bb2a345  pypy3.8-v7.3.8-win64.zip
                +
                +475883e59f6d2a90d273142da27f999a227d510f51b7cdec3f53ceaf832b6b4b  pypy3.8-v7.3.8rc2-linux32.tar.bz2
                +141abedd8f0f46f61d9f05243c4fe32a88c6d9f2219cd3cd6a1312f56d4bd5eb  pypy3.8-v7.3.8rc2-linux64.tar.bz2
                +3bd390bfa30f4225cc379d592c822b9bb2dea9530451904fa215b8649d614375  pypy3.8-v7.3.8rc2-osx64.tar.bz2
                +735751d124140cb75c24848199230fe41110761fcb830ba2a253baa5846ec86f  pypy3.8-v7.3.8rc2-s390x.tar.bz2
                +0ae9515b964865d5946bb48c41e1248cac00ba6f145f10ff230163f4a3c47c91  pypy3.8-v7.3.8rc2-src.tar.bz2
                +973ec5dab8b1243b71d25acca4d6db3d1545e62e0984a5d43d407052e4767662  pypy3.8-v7.3.8rc2-src.zip
                +089cbb1491eaf921bf905dc79936a95a90b0b5a06ebde3e26d1d2e98bdd2dcdd  pypy3.8-v7.3.8rc2-win64.zip
                +
                +56b62c57df91b4a04036535a94814da3c682ac5208d4a565f230fbc657d949e3  pypy3.8-v7.3.8rc1-linux32.tar.bz2
                +fac68364acdebed2a11f6d5a62fc10e7c44985bfe9baafdb991f65e25a375998  pypy3.8-v7.3.8rc1-linux64.tar.bz2
                +ed62e2f5e25bda752463e2acd881de5876ccd383ce3589630b880de204d8ad75  pypy3.8-v7.3.8rc1-osx64.tar.bz2
                +70aa9380fe19a3694d38aab92d46b96427dd8a98952a4d4637043739a485be4f  pypy3.8-v7.3.8rc1-src.tar.bz2
                +9abb90bc11c5ba53aa7f8c23ab95eba864bb253082d23aa8552d23b322ecef85  pypy3.8-v7.3.8rc1-src.zip
                +6a4d2405adc13c68140a48492178829a11ff8d3a22a27b9730166486be2688d0  pypy3.8-v7.3.8rc1-win64.zip
                +

                pypy3.7-v7.3.8 sha256:

                +
                4fb2f8281f3aaca72e6fe62ecc5fc054fcc79cd061ca3e0eea730f7d82d610d4  pypy3.7-v7.3.8-aarch64.tar.bz2
                +639c76f128a856747aee23a34276fa101a7a157ea81e76394fbaf80b97dcf2f2  pypy3.7-v7.3.8-aarch64-portable.tar.bz2
                +38429ec6ea1aca391821ee4fbda7358ae86de4600146643f2af2fe2c085af839  pypy3.7-v7.3.8-linux32.tar.bz2
                +409085db79a6d90bfcf4f576dca1538498e65937acfbe03bd4909bdc262ff378  pypy3.7-v7.3.8-linux64.tar.bz2
                +76b8eef5b059a7e478f525615482d2a6e9feb83375e3f63c16381d80521a693f  pypy3.7-v7.3.8-osx64.tar.bz2
                +5c2cd3f7cf04cb96f6bcc6b02e271f5d7275867763978e66651b8d1605ef3141  pypy3.7-v7.3.8-s390x.tar.bz2
                +35752be62b148fa6f7fb69e58e1f993c7cc319bea54928eb03ed2e75b8248d5f  pypy3.7-v7.3.8-src.tar.bz2
                +089fd12039ef92256fc218fc45652a93bbef1f5291181d07a4b55dad3f6987b9  pypy3.7-v7.3.8-src.zip
                +96df67492bc8d62b2e71dddf5f6c58965a26cac9799c5f4081401af0494b3bcc  pypy3.7-v7.3.8-win64.zip
                +
                +a85189cdbf717928a4c5c90f05ccf48668e38291d2ac438e644d06aa6fa1fb7e  pypy3.7-v7.3.8rc2-linux32.tar.bz2
                +b8fe346d90561f34db1f23b0213ce247c148b7922d3b9acbfb7fdb1824c708b0  pypy3.7-v7.3.8rc2-linux64.tar.bz2
                +480ad018194096736c47a2735ad453bbc0bd60117e7326508a723befe9543c28  pypy3.7-v7.3.8rc2-osx64.tar.bz2
                +ebc8d34d5b4c546cb2bdb22a848def94b07d23cc6833fd54b76226eb658126a2  pypy3.7-v7.3.8rc2-s390x.tar.bz2
                +2d3059daaaaae35ffd70387b37e9bfe91224a24951be20e5edfbe836300fbdb3  pypy3.7-v7.3.8rc2-src.tar.bz2
                +25df8cfc7510470c525e35d4a465499d0284ea4a895b08a1f75de3fb3a1698b3  pypy3.7-v7.3.8rc2-src.zip
                +3fe66039537920d141cd5fca018e9778e283613dd791dab41122223224585db0  pypy3.7-v7.3.8rc2-win64.zip
                +
                +6db124cda7eb9ee54dbdaf8e5edc052bc32bd59c1a535faf34b175e3e5cd855d  pypy3.7-v7.3.8rc1-linux32.tar.bz2
                +9f239262bcf31609b758a70dcf3c8aba4bfa9d1639285afba707414639ee5871  pypy3.7-v7.3.8rc1-linux64.tar.bz2
                +ed208dac960650f52c69cfc38d17af5e978acd1ad6f09de6aaac1603dea32ffa  pypy3.7-v7.3.8rc1-osx64.tar.bz2
                +9c2ec87b0c827f9d37ce7c11a9b7b4c1cc9a2182b7f86a1bb36ee209dffda49d  pypy3.7-v7.3.8rc1-src.tar.bz2
                +4cc32f99e4dbda8a20f1b9e0e95cdba59963a173e00a02baa574e4d00739b58f  pypy3.7-v7.3.8rc1-src.zip
                +6eb5a637534dbcaa496208061ad19faf5f4413c941a450e091e22ef49e3af9ec  pypy3.7-v7.3.8rc1-win64.zip
                +

                pypy2.7-v7.3.8 sha256:

                +
                ca1f8d3146c83002ee97615906b0930e821297dcce3063b5b28933a0690ef298  pypy2.7-v7.3.8-aarch64.tar.bz2
                +b5edfc995d83feea8b4c8aeffccb89753b4b182f076126550bd07cc35faa6208  pypy2.7-v7.3.8-aarch64-portable.tar.bz2
                +7c84f173bbcd73d0eb10909259d11b5cc253d4c6ea4492e6da8f2532df9b3da5  pypy2.7-v7.3.8-linux32.tar.bz2
                +1f2e84fb539ffce233c34769d2f11647955f894be091e85419e05f48011e8940  pypy2.7-v7.3.8-linux64.tar.bz2
                +e5c1ff39ad9916ea23e3deb8012fe42367b6b19284cf13b1a1ea2b2f53a43add  pypy2.7-v7.3.8-osx64.tar.bz2
                +b4ae4e708ba84602d976ad6ae391ef2eef4b1896d831b8f2b2ec69927dd92014  pypy2.7-v7.3.8-s390x.tar.bz2
                +0cdad270c62d3ccc53cc87eeb069a6dc46acaf95521b584624bcd6697d94fa1c  pypy2.7-v7.3.8-src.tar.bz2
                +13f70c6a0d4e5a59eb368c11d6b581ae09aa9715f96f84b890c5c9fa24cdaa93  pypy2.7-v7.3.8-src.zip
                +806a29a6c5550b1e669d8870683d3379138d3d43eb1e07bdf26d65a0691265f2  pypy2.7-v7.3.8-win64.zip
                +
                +3e9744307a60740191341df2b4feb42ca08452eff354156322b760e1aac3ef54  pypy2.7-v7.3.8rc2-linux32.tar.bz2
                +a13ceb4a881a8da75475feea3d55dc337b7e2c6cf58e1e33924fa17012ace4e5  pypy2.7-v7.3.8rc2-linux64.tar.bz2
                +6413048a6ab1ec5d7702a08f482443be0604a6f2019f32024a35e27c42ed7210  pypy2.7-v7.3.8rc2-osx64.tar.bz2
                +b015012ac2f72a3971d4b4691df2a6f2dc478f2abb2252dec79ad2b4c66c18ed  pypy2.7-v7.3.8rc2-s390x.tar.bz2
                +8b08ace5f402fe7b8b18416082534d2463409b6891ffa426a6989448c5d95064  pypy2.7-v7.3.8rc2-src.tar.bz2
                +b507dac295d94972c62c1faf2206db6333993df60864d0c23be0206d8560e278  pypy2.7-v7.3.8rc2-src.zip
                +270d289a6b32a83db1e0b1078801b2f36fce6d12e238346a2b8354bf31a64e1e  pypy2.7-v7.3.8rc2-win64.zip
                +
                +5ab938f2b0cff62be3869076f1fb99c859ef2df165ed33d329e2de4d32aaafef  pypy2.7-v7.3.8rc1-linux32.tar.bz2
                +124de0f3d327e39e0344b70d71298315714fe0b1115db80b463dda06bd618c58  pypy2.7-v7.3.8rc1-linux64.tar.bz2
                +183a9c0aa5c9ced4ce071ddedf6ae203a752574f06e96722077eb5708f583405  pypy2.7-v7.3.8rc1-osx64.tar.bz2
                +96c9f5a85759cc92000064d3b32ce89748870b35a48e631f713be3f29bf64f3c  pypy2.7-v7.3.8rc1-src.tar.bz2
                +a11e32d93da35a5ab7bf0a6cd37abce4f1697ef22c0bb46957f2360526c20c7b  pypy2.7-v7.3.8rc1-src.zip
                +e3b2e88b5785538ac3f7bccf3122e400b7d42f3871201fbfb2110b9eb93473be  pypy2.7-v7.3.8rc1-win64.zip
                +

                pypy3.8-v7.3.7 sha256:

                +
                cbd44e0a9146b3c03a9d14b265774a848f387ed846316c3e984847e278d0efd3  pypy3.8-v7.3.7-aarch64.tar.bz2
                +dfb9d005f0fc917edc60fd618143e4934c412f9168b55166f5519ba0a3b1a835  pypy3.8-v7.3.7-linux32.tar.bz2
                +5dee37c7c3cb8b160028fbde3a5901c68043dfa545a16794502b897d4bc40d7e  pypy3.8-v7.3.7-linux64.tar.bz2
                +1f044fe7bbdd443b7913ecf554683dab6dade5dcd7f47d4e6d01f4bb4cf84836  pypy3.8-v7.3.7-osx64.tar.bz2
                +ae7d6a76490b317a74b87788d596610c7ffd0ae2d3ffa2433d5bb5300f6b4b77  pypy3.8-v7.3.7-s390x.tar.bz2
                +21ae339f4f5016d6ca7300305f3e3b554373835cb3c39a9041fe30e6811c80c6  pypy3.8-v7.3.7-src.tar.bz2
                +aa9aa0a800d06048d301fbafa7892ff8978e2d63b23cc23a147f2fd1fd288baf  pypy3.8-v7.3.7-src.zip
                +8ceb03d2f7b73c6ce0758290bc42ba366a45c46e033eda36f1779d957a905735  pypy3.8-v7.3.7-win64.zip
                +

                pypy3.7-v7.3.7 sha256:

                +
                a1a84882525dd574c4b051b66e9b7ef0e132392acc2f729420d7825f96835216  pypy3.7-v7.3.7-aarch64.tar.bz2
                +0ab9e2e8ae1ac463bb811b9d3ba24d138f41f7378c17ca9e2d8dee51bf151d19  pypy3.7-v7.3.7-linux32.tar.bz2
                +8332f923755441fedfe4767a84601c94f4d6f8475384406cb5f259ad8d0b2002  pypy3.7-v7.3.7-linux64.tar.bz2
                +edc9df7d0f7c56f7ee05b24117bdb6c03aa65e768471e210c05ccdbbfd11a866  pypy3.7-v7.3.7-osx64.tar.bz2
                +7f91efc65a69e727519cc885ca6351f4bfdd6b90580dced2fdcc9ae1bf10013b  pypy3.7-v7.3.7-s390x.tar.bz2
                +2ed02ac9e710859c41bc82deafb08619792bb9a27eeaa1676c741ededd214dd7  pypy3.7-v7.3.7-src.tar.bz2
                +240ecf56c50b190cc7b728b07fc535be4b3d70a65406d0d8440edc02df4cce17  pypy3.7-v7.3.7-src.zip
                +53505dc0b57590290efd7656117ee5384bcd036f7f7c4f0bc3f5cd10299037d1  pypy3.7-v7.3.7-win64.zip
                +

                pypy3.8-v7.3.6 sha256:

                +
                704d5303096e8a3173e73435f3bb204e31a8bf02ed5ba617a4a0f1e7491edf50  pypy3.8-v7.3.6-aarch64.tar.bz2
                +e857a04a76285f0ef5bae84f6f5e9943ca415d499204c531b1c33fe8f015b48d  pypy3.8-v7.3.6-linux32.tar.bz2
                +8579ea990e95d2b7e101ef47fd9ebf25a9500d5086e8f708c43f9bae83306ece  pypy3.8-v7.3.6-linux64.tar.bz2
                +8195e52a20cf2a4f42c2d7e4969fbf44fe349c1f80f758e20525dd0f8c134bec  pypy3.8-v7.3.6-osx64.tar.bz2
                +a36208d5e950ec4b630b33d0aede8ca3da383d973fc5ca387082c7e5bad8d245  pypy3.8-v7.3.6-s390x.tar.bz2
                +f234c56eb0d4ab0afb196232fb38cd1ca8e19b1c65cf7b65eb691695499be259  pypy3.8-v7.3.6-src.tar.bz2
                +055caaab4171e29915aaad602c9a49fa46e2b50a3f56c650772e31467c541858  pypy3.8-v7.3.6-src.zip
                +1b216fd75f8f0a48633cc21dce7d6f25ba65016142df758842e1df661269b458  pypy3.8-v7.3.6-win64.zip
                +
                +
                +59c299e9657334d651e2154c77490a743cb507f4f39344f934b2975ca91b4b2f  pypy3.8-v7.3.6rc3-aarch64.tar.bz2
                +6cd36eb9857d6f7022099300c70666eb706f1e06b404234ea929a341fee40b68  pypy3.8-v7.3.6rc3-linux32.tar.bz2
                +acdbc39ade2ef2cf2b4bcf0eb387ec0ef0d257175751d32e9d730886405439d0  pypy3.8-v7.3.6rc3-linux64.tar.bz2
                +18fdba4a6c54c7df6fe2521858046ba865261c0e89557c4b53ef37eb7e562806  pypy3.8-v7.3.6rc3-osx64.tar.bz2
                +128ede0f5565b626431755d58eb632362c748508e53777d32184eba5da8fdb6d  pypy3.8-v7.3.6rc3-s390x.tar.bz2
                +0cb9c517a96850c4fba0494ee10b35e87861d71d8b1387e0588c316fa21230ee  pypy3.8-v7.3.6rc3-src.tar.bz2
                +54704168785a6b22580d46a4a39f5a2c3f81e5d9f0c8e5ba906ac01603d42cbf  pypy3.8-v7.3.6rc3-src.zip
                +1bd65ab6c82a696f2dcecd9b37679b474eadd149d96aab30438642236a1f7136  pypy3.8-v7.3.6rc3-win64.zip
                +
                +8ec2b28c6f1558a6abd0ce0a6fb504253b43b013a750c08c1e74470631afc1dd  pypy3.8-v7.3.6rc2-aarch64.tar.bz2
                +008e9a9336108821f0080011aafe54a71e42ffffb7223d5183e610f689a0f8aa  pypy3.8-v7.3.6rc2-linux32.tar.bz2
                +b1069fc7b08c2a230630f55f155c3ea016038471490ff0be020f850c5a8ec0cc  pypy3.8-v7.3.6rc2-linux64.tar.bz2
                +4298d6b1a8333746c43dd313eb6ccd64f11b3dde795921d07f02c8e32d1ac44b  pypy3.8-v7.3.6rc2-osx64.tar.bz2
                +9f3f7bb2842e626a85c8b314a3af959f98dc4a57fc0169c98b566b6fe645ea39  pypy3.8-v7.3.6rc2-s390x.tar.bz2
                +a9c3835e37e84a7667e3e548a176986a77663612d30594c7c4877ce0e712c6c9  pypy3.8-v7.3.6rc2-src.tar.bz2
                +cae1f0a13b0da3b9db87141e662c3db73564f8fa4e4f1dab2d838341bf8bacc1  pypy3.8-v7.3.6rc2-src.zip
                +6415bfd8afb6cef9cd7666de60f58d7fbbabae92042a9c1f3ce5e8ffe9ba4a26  pypy3.8-v7.3.6rc2-win64.zip
                +
                +18308f227c02ecb84ad21ed4a51bba8472acafe20386caef7ada0058d2d5a243  pypy3.8-v7.3.6rc1-aarch64.tar.bz2
                +9b16a894477cbdb1275ab253d7bc71e8d64ad7d12dd61c835242fdac2cdf6cc7  pypy3.8-v7.3.6rc1-linux32.tar.bz2
                +2abcd2a21f17216613c941a6bf6e26b395b089b9aa8f227af9e1b55c86d6d732  pypy3.8-v7.3.6rc1-linux64.tar.bz2
                +d3aebc5c862e223606e3a79c245a748da7b9aa7d0206a2400e6c7d906676ef34  pypy3.8-v7.3.6rc1-osx64.tar.bz2
                +e5013c21d21ca0eb16bc2e12c4093ec3095150b606830fb10f0c588629412b37  pypy3.8-v7.3.6rc1-s390x.tar.bz2
                +999747cb4eacbc23c14e9f71d42c784c35cf45b52a7de9113c6db0811300e526  pypy3.8-v7.3.6rc1-src.tar.bz2
                +3c9010fb3d1074c1ac350f0dbc8b215c53b2ab8ca3440d9ca4e903800e2ef1ce  pypy3.8-v7.3.6rc1-src.zip
                +cef32837d4ab2cd9fbb6173472b633c6996f6a7915d89c66f87f0f0c69edcda2  pypy3.8-v7.3.6rc1-win64.zip
                +

                pypy3.7-v7.3.6 sha256:

                +
                d446b6987eeaa03d706603863e83d6b99df69232cf1e06d3ee5706add6a84cd6  pypy3.7-v7.3.6-aarch64.tar.bz2
                +459e77c845b31fa9367f7b1b1122155f0ba7888b1d4ce4455c35d2111eeeb275  pypy3.7-v7.3.6-linux32.tar.bz2
                +c41d07063b1d002a91ad2a0763b4baaca2b306ec635889c2e4826e706cc7f9ca  pypy3.7-v7.3.6-linux64.tar.bz2
                +26f0c5c2a5f4a2ce35281d2fa760aa10715300dd110387eac43699a78ed32365  pypy3.7-v7.3.6-osx64.tar.bz2
                +3659bf96a177a53426ffc38d3619c6ee307e600c80e924edc9cee604680c141d  pypy3.7-v7.3.6-s390x.tar.bz2
                +9252ccaa130094205b3c7f0a2cad5adc0d9dfba31658ff3172f788dec1fdb348  pypy3.7-v7.3.6-src.tar.bz2
                +c2385436004d7d8d8978650efff1c22512ed9f9808c83ddfd68fe8fe812eb879  pypy3.7-v7.3.6-src.zip
                +341e69a369da5a1f4f69dbbd47e7dff5e745439b203e28c7afcf98308a24b003  pypy3.7-v7.3.6-win64.zip
                +
                +742fc6fa7bdc377e8a8c976f57ef643a9068a0427a5ffbb50f8ba32aa6986392  pypy3.7-v7.3.6rc3-aarch64.tar.bz2
                +b5382404935dd09b8a7ac160b593729151c9c907e6df029e3a7f312c53b5038a  pypy3.7-v7.3.6rc3-linux32.tar.bz2
                +33db78a3c9c9f78eaaf7f52c9c174b1e4c795e5d3294e8364002470a3ced0986  pypy3.7-v7.3.6rc3-linux64.tar.bz2
                +3218ef597290ec2983c692a01a6fe9ba5ebf05b8e95fed5e8431b750ec588544  pypy3.7-v7.3.6rc3-osx64.tar.bz2
                +4f555251083f633bf044a1bc68d6c50629a374d90f1bee66e245cfac0fdd86f5  pypy3.7-v7.3.6rc3-s390x.tar.bz2
                +f0f047f046bec43e433ee08db460c267518eb5b7df1f4d4d6bc3fd735c06a3bc  pypy3.7-v7.3.6rc3-src.tar.bz2
                +a27d35e75c2486029502590ee862e02af2a3453fa685b42916d618cdbc250fd0  pypy3.7-v7.3.6rc3-src.zip
                +67c2e0676b04bbb3bbcf13f5c1f6c97a420b576e362c4948bed0fcbbf64419ee  pypy3.7-v7.3.6rc3-win64.zip
                +
                +7c5877b27ece045af7603436d64c8589eadc920045341bb16c9a773b924b1dfc  pypy3.7-v7.3.6rc2-aarch64.tar.bz2
                +1afe2650a79ea2f234576986e599d504c1f4ab7928a50e3360cdac3b900c04b3  pypy3.7-v7.3.6rc2-linux32.tar.bz2
                +d590359ea1a674b51ea13c2a79d883db38b21c43494c986f90af1f34053111a6  pypy3.7-v7.3.6rc2-linux64.tar.bz2
                +bd9a96b9c5c542ef36e1e01f0e1987140d54f7bf04f0434bf3a3b9efe166c912  pypy3.7-v7.3.6rc2-osx64.tar.bz2
                +22cab4d077f39dc2ff74ebb0d4505e5e3a5b88f2b909643181f57d7b810391da  pypy3.7-v7.3.6rc2-s390x.tar.bz2
                +064e4f9fa408bacb67829782d95e2206b20319ae5b15e85993c76532350f57e8  pypy3.7-v7.3.6rc2-src.tar.bz2
                +4071597a7450fb0d886005c82c52ed7773e9b0c2015bc93968850071d3195f6d  pypy3.7-v7.3.6rc2-src.zip
                +6c6ac71a616882a53648d49e3b20dd1991c08e39a422e650cd58e2f12eecf19c  pypy3.7-v7.3.6rc2-win64.zip
                +
                +7cfb96afb7aa7478516c1747da77616edf92b46fda56570bcc3117bed46364c1  pypy3.7-v7.3.6rc1-aarch64.tar.bz2
                +8079707602a24ab1b61f8982c8ef858f2780e60c08e02354c377d428326f57dd  pypy3.7-v7.3.6rc1-linux32.tar.bz2
                +c40b7859933e14ca398e4eba0f70f9dbd521def5279acb4fc7c897d41ac0ac60  pypy3.7-v7.3.6rc1-linux64.tar.bz2
                +8d9fde2810f84564902cb37d2d8f7294e5c3ea1fd664ab186864c71edb517d83  pypy3.7-v7.3.6rc1-osx64.tar.bz2
                +8c4db2df86239c3e1fa5fb8a4efa5f5ec1f4d55f48ea92a01bd73bdce7fdf9bb  pypy3.7-v7.3.6rc1-s390x.tar.bz2
                +25b980da5a5ca89a67e3752dfb1bb6ee3cd0804b7961d0a12e2f9180afe5bd07  pypy3.7-v7.3.6rc1-src.tar.bz2
                +c2d21937db476d9c2d86f1e8622998278599f0cadda43a6335c6c7ada5403fec  pypy3.7-v7.3.6rc1-src.zip
                +a8d8a861dbff630f902d167da202b654e700b802b1c77643723cd246cef0b2ff  pypy3.7-v7.3.6rc1-win64.zip
                +

                pypy2.7-v7.3.6 sha256:

                +
                90e9aafb310314938f54678d4d6d7db1163b57c9343e640b447112f74d7f9151  pypy2.7-v7.3.6-aarch64.tar.bz2
                +7a1145f3a278ffab4da0e2d4c4bd024ab8d67106a502e4bb7f6d67337e7af2b7  pypy2.7-v7.3.6-linux32.tar.bz2
                +82127f43fae6ce75d47d6c4539f8c1ea372e9c2dbfa40fae8b58351d522793a4  pypy2.7-v7.3.6-linux64.tar.bz2
                +9a97de82037d4be1949ec0c35a4d638ba635e8b34948549ae2fa08abd2cbaa8c  pypy2.7-v7.3.6-osx64.tar.bz2
                +bb29ecbe1f4a05045f0804b3e741267fc2db742249747b36cdbbd18866c15f04  pypy2.7-v7.3.6-s390x.tar.bz2
                +0114473c8c57169cdcab1a69c60ad7fef7089731fdbe6f46af55060b29be41e4  pypy2.7-v7.3.6-src.tar.bz2
                +cd88f99eccce3b9921a3c7fa452b25d7b60d87ff580bb03237bb1cd0fe2dd031  pypy2.7-v7.3.6-src.zip
                +fcc8f6b3b472a77eaa754951f288fe234b4953bfba845888dd839b9b862cb891  pypy2.7-v7.3.6-win64.zip
                +
                +
                +e92e4ba12a62f053e70799e463c7fcb2663b9fa270a16764250385024180cde4  pypy2.7-v7.3.6rc3-aarch64.tar.bz2
                +918cf465e1339adcc66d9829b711e30d6a78d764ce74d79407ce35222f24e569  pypy2.7-v7.3.6rc3-linux32.tar.bz2
                +21d9ed5a80aee8c320321b32eb3ca0bc89d630646a7371ee560c15296e68e4aa  pypy2.7-v7.3.6rc3-linux64.tar.bz2
                +dcb0f049626b47d0bef1ff4f6d19c43b92f7c99a2cf2032afcbf3456b0e00425  pypy2.7-v7.3.6rc3-osx64.tar.bz2
                +648e6e02e31d0ee17428f90da7fc938c2b6d0a8bd790ca73887c94a1016013d7  pypy2.7-v7.3.6rc3-s390x.tar.bz2
                +0b868fe3b6c5a1a498b558395876a5d9cd3f0add649d5c281542db31a086c16b  pypy2.7-v7.3.6rc3-src.tar.bz2
                +eec6ec44cb9e4da0a29118fe98d4c289374af617e5279a77f6759a9713b68d2d  pypy2.7-v7.3.6rc3-src.zip
                +47f9003c5909271c3ee4ce81de3703e2f17e20d7eba7d7328e8dc29407107b3d  pypy2.7-v7.3.6rc3-win64.zip
                +
                +9de5474ae55d31b02b9d43be26d7b3ea70e24e6e8a24bdc1d2ee396e191f315d  pypy2.7-v7.3.6rc2-aarch64.tar.bz2
                +85a57d385a0e6072dfcf979654160fecb3f7d3d7a43352a28dff2c9dd63c7b01  pypy2.7-v7.3.6rc2-linux32.tar.bz2
                +5e5800b1dcc705476bdc1bb6a195e857390d3fafc6406ba27513bff461cfadf7  pypy2.7-v7.3.6rc2-linux64.tar.bz2
                +c6cb5bc6107bdbbf18a18db5b143a9d0476c6578f2d35792c49274d14f6f55ab  pypy2.7-v7.3.6rc2-osx64.tar.bz2
                +a490ab50a846c5587d525aba6ec6cbaeca758e9c6c6941ea0a1738bb78d32b22  pypy2.7-v7.3.6rc2-s390x.tar.bz2
                +1e3870ba5ca5567e4808893ca3361e79f1ba02424059e4459936810ff304ba63  pypy2.7-v7.3.6rc2-src.tar.bz2
                +38d18c15a64950822a404e98b9fba8aac671671e4d51553a60923de5992a6ddd  pypy2.7-v7.3.6rc2-src.zip
                +965f3581e53de1d55f150d78aa9d90b7717a243be494b78d9b88b30ab4a1a8be  pypy2.7-v7.3.6rc2-win64.zip
                +
                +b2957fc3a3fe3957529fdb3e0e85965d46f4b7c09e4101237869f34ddfe5f0d4  pypy2.7-v7.3.6rc1-aarch64.tar.bz2
                +37b9c8d41b5ba85b8ab9defd86da98b842f975d72c473bf92c3c1143a9c293cf  pypy2.7-v7.3.6rc1-linux32.tar.bz2
                +b83967849db84c6e7b7c80b2135788da9c235a89a689729fd044b58d1d92c12f  pypy2.7-v7.3.6rc1-linux64.tar.bz2
                +63a57129987f54ee692129b53fdf13d635cb6097dc0a1c8cd77f255fc95edda4  pypy2.7-v7.3.6rc1-osx64.tar.bz2
                +187e9de4fc4d7edc332275031a40f0de8dc882050b14d5e9b588808c51efedf9  pypy2.7-v7.3.6rc1-s390x.tar.bz2
                +be979c8742181d5646ee1b78eac467612cf61484713ae6862e2b3475b4325b98  pypy2.7-v7.3.6rc1-src.tar.bz2
                +c746176c507128e8e5aca14e5a0eaa101955b7cc860ceeba8b20f4f011da4061  pypy2.7-v7.3.6rc1-src.zip
                +c515b46bccf1b56fd2f7761a9e3984aa6d56843e848eae67a28fd58fb158a5a9  pypy2.7-v7.3.6rc1-win64.zip
                +

                pypy3.7-v7.3.5 sha256:

                +
                85d83093b3ef5b863f641bc4073d057cc98bb821e16aa9361a5ff4898e70e8ee  pypy3.7-v7.3.5-aarch64.tar.bz2
                +3dd8b565203d372829e53945c599296fa961895130342ea13791b17c84ed06c4  pypy3.7-v7.3.5-linux32.tar.bz2
                +9000db3e87b54638e55177e68cbeb30a30fe5d17b6be48a9eb43d65b3ebcfc26  pypy3.7-v7.3.5-linux64.tar.bz2
                +b3a7d3099ad83de7c267bb79ae609d5ce73b01800578ffd91ba7e221b13f80db  pypy3.7-v7.3.5-osx64.tar.bz2
                +dffdf5d73613be2c6809dc1a3cf3ee6ac2f3af015180910247ff24270b532ed5  pypy3.7-v7.3.5-s390x.tar.bz2
                +d920fe409a9ecad9d074aa8568ca5f3ed3581be66f66e5d8988b7ec66e6d99a2  pypy3.7-v7.3.5-src.tar.bz2
                +61bb9740eaac5dd93577e6b76e8bb1a998daa1df5314bc3b192e6803552e12ea  pypy3.7-v7.3.5-src.zip
                +072bd22427178dc4e65d961f50281bd2f56e11c4e4d9f16311c703f69f46ae24  pypy3.7-v7.3.5-win64.zip
                +
                +dbf579f7eb5c527d37ecd43da88cbad02920881b608eb7486d70b4fa31bfc146  pypy3.7-v7.3.5rc3-aarch64.tar.bz2
                +d2daf8b1966497d09be703b939bd0020394e0738095243396b3d5f87cef0d815  pypy3.7-v7.3.5rc3-linux32.tar.bz2
                +1f9712fa86a50b1de00eb776f3e99033c2a7911dceaa8bc9daf77aa3d2a95842  pypy3.7-v7.3.5rc3-linux64.tar.bz2
                +ff1d1ce25f60d9474a950ccc90c5c4af376cba2b8af83b4e30cf33de97611c7e  pypy3.7-v7.3.5rc3-osx64.tar.bz2
                +8e1c4035ba05161083105f452dfcd463c657085405444afc0acf26ceedb1e8a3  pypy3.7-v7.3.5rc3-s390x.tar.bz2
                +9f7215f77106a6df0c201b6025dffdc605cd0731d60ee85a81343a51e64edc76  pypy3.7-v7.3.5rc3-src.tar.bz2
                +21cae47ec47bead5d0c5e7a902a1bec85cab1eb30bf7190bd140309c20602110  pypy3.7-v7.3.5rc3-src.zip
                +8e40ddc6e4360602597bed44f3ae227d20f8eaa0adfb6a728d10805f76456b74  pypy3.7-v7.3.5rc3-win64.zip
                +
                +
                +c01e59167a26976e764f7b230f6febe0af59982911cd727c551191aed0a843c4  pypy3.7-v7.3.5rc2-aarch64.tar.bz2
                +7f8e55f34bf9422576a501c22ae8b82d5d6ffcbf40251a9daf53b5d8d96c2f43  pypy3.7-v7.3.5rc2-linux32.tar.bz2
                +93f9ccf44ec92145cf2fe17ac98a07f0adc08866b001c7f023b64a3729ed9710  pypy3.7-v7.3.5rc2-linux64.tar.bz2
                +4902ac65329447f2451d2b2b264a12fb95d97a4bb734c75410d2b5abc6e6de52  pypy3.7-v7.3.5rc2-osx64.tar.bz2
                +f0d4bbbe4000c836c17168cc709b233b6184039aad69bc9929c415a92bc462a9  pypy3.7-v7.3.5rc2-s390x.tar.bz2
                +b1ac30e5e7cd8d04c4472b5c4a71a414d6b0cf08a2026fd1bfc84994598abfda  pypy3.7-v7.3.5rc2-src.tar.bz2
                +c6c004550444c2f8749d7e34bcdfe404333b5f4bdf08af7745e28371c8358050  pypy3.7-v7.3.5rc2-src.zip
                +ea41d9e5cb94c7b9e7df2652b74fcc1018ce3e786c9636791b70e46d90e7e8ac  pypy3.7-v7.3.5rc2-win64.zip
                +
                +8dcd20e35e26bf92ce08fc8c97350acb4c773e19a78a89d3b4f28a8be63006d3  pypy3.7-v7.3.5rc1-aarch64.tar.bz2
                +04573fd71618d5c26b0828dd306fa02e9eece8a33a020081e55b60d9a6bc6240  pypy3.7-v7.3.5rc1-linux32.tar.bz2
                +97c1142f7ac99af03b2c56eb379af6e9ed4eef7d0d37675f4ca5ec33c841d62f  pypy3.7-v7.3.5rc1-linux64.tar.bz2
                +f4893667f0b978deb891b0b7d91a1117e25299f19c65b31281c40e87dea523d3  pypy3.7-v7.3.5rc1-osx64.tar.bz2
                +2880cfa6349aebc5c28aff5df06cabb8c8733dc7090f7f36410eb9ff3def37bc  pypy3.7-v7.3.5rc1-s390x.tar.bz2
                +ddccb7e8b24523f3f0e31e6c34b3a61c260b895ac9c7567f560f8ceda675fef8  pypy3.7-v7.3.5rc1-src.tar.bz2
                +f39baa99eb0cb4d1505cd43676f86c54cae142f88b9b875542520b8596368ba7  pypy3.7-v7.3.5rc1-src.zip
                +ab8c5e6bf756f6dda2eba5c2e8d65d8d5de9b3a2c54f2f7a3dfb4f111e40ba0d  pypy3.7-v7.3.5rc1-win64.zip
                +

                pypy2.7-7.3.5 sha256:

                +
                8dc2c753f8a94eca1a304d7736c99b439c09274f492eaa3446770c6c32ed010e  pypy2.7-v7.3.5-aarch64.tar.bz2
                +35bb5cb1dcca8e05dc58ba0a4b4d54f8b4787f24dfc93f7562f049190e4f0d94  pypy2.7-v7.3.5-linux32.tar.bz2
                +4858b347801fba3249ad90af015b3aaec9d57f54d038a58d806a1bd3217d5150  pypy2.7-v7.3.5-linux64.tar.bz2
                +8b10442ef31c3b28048816f858adde6d6858a190d9367001a49648e669cbebb6  pypy2.7-v7.3.5-osx64.tar.bz2
                +b91aaa5819ba8af90799eed8eaaba87ceca1fd4dbcbcdb2defc6d313d663b5dd  pypy2.7-v7.3.5-s390x.tar.bz2
                +c0444fd9873058c1c0d99e13a934e92285cb05992c9968bf523c32bf9bec0a9d  pypy2.7-v7.3.5-src.tar.bz2
                +c67214acee357d383bb2716269663406611e17cee580026d6d7baa7891afa85b  pypy2.7-v7.3.5-src.zip
                +0b90eded11ba89a526c4288f17fff7e75000914ac071bd6d67912748ae89d761  pypy2.7-v7.3.5-win64.zip
                +
                +0f83212202d51835dcedfdfe607fe157d1111a368f7f28738792417acd987c37  pypy2.7-v7.3.5rc3-aarch64.tar.bz2
                +6dc2fec9894121cc75500c84509c869648e6fa95c8e8084c81bf17191d80ba8c  pypy2.7-v7.3.5rc3-linux32.tar.bz2
                +8a918307a51a02ae222e71e2973a4d0dc520a3bae2d510a6571aaf53cf7cead7  pypy2.7-v7.3.5rc3-linux64.tar.bz2
                +9376ba404009ce435e7b04a3c194f783b841464031607081081429f079797faa  pypy2.7-v7.3.5rc3-osx64.tar.bz2
                +c95f5d5cef6181fe08f54824872c94f27177feb5d156fa6dae279a5b8228b13c  pypy2.7-v7.3.5rc3-s390x.tar.bz2
                +b643dd908e6d07d703f388798e0355e3378a8157833680cbea55c3cf3e4256e2  pypy2.7-v7.3.5rc3-src.tar.bz2
                +baeafa81e445a5b6c8da8ec92c8587a11104f7e125478d669d9eaa45492b7b90  pypy2.7-v7.3.5rc3-src.zip
                +21b21873124572043749bb5b19cc33a14ffbf6d8ea5e538006689cc4e3af3d5a  pypy2.7-v7.3.5rc3-win64.zip
                +
                +8250c8db8f227aec3d85f8866f8ad78d925ed338a5622f64c22d6a7fb0963b5a  pypy2.7-v7.3.5rc2-aarch64.tar.bz2
                +978ed1e445809adbaa0ca593abd445384c28d72344bf67184b5cee5e0f76fc3c  pypy2.7-v7.3.5rc2-linux32.tar.bz2
                +a933976a2adc840d07be9ed4ac1dc1b1986fd68f875c4258ed214a2ce9f5f659  pypy2.7-v7.3.5rc2-linux64.tar.bz2
                +cbdfe3f9e49cb96b5b182b19ce257a086dbb7204ba01c178db13b4e6272a3260  pypy2.7-v7.3.5rc2-osx64.tar.bz2
                +da2bf8e5e8f03f10ffd8c7e970e20ff702a91fc44a6bd0de51f1a79401804e79  pypy2.7-v7.3.5rc2-s390x.tar.bz2
                +b47ce66e8d716b22e7b78f1ec0e2d212a27afd355adcb94e00b6d76ffa9a513f  pypy2.7-v7.3.5rc2-src.tar.bz2
                +b031352443dff2202fcc0ee131887a232214363af1d87ba35886dc683b18eb85  pypy2.7-v7.3.5rc2-src.zip
                +47a355033a4c61e679f5ed34274a320adda8df2c27ed313bda0841dc8e11a354  pypy2.7-v7.3.5rc2-win64.zip
                +
                +4431bc2193f76b97add9726420c6d6ab14b46178e9cfeade5f596016b66b6549  pypy2.7-v7.3.5rc1-aarch64.tar.bz2
                +b0d2432bf50bfeeb00e91e048db6df1bba40ca54b0d19d9f61db0f3a4e6e2bf5  pypy2.7-v7.3.5rc1-linux32.tar.bz2
                +5a81b1e5733351a1e27e8072f474c60d24ab987dc1355873861b69961da425f5  pypy2.7-v7.3.5rc1-linux64.tar.bz2
                +d2e3077b6c0a84e07af5e4c5eb9c883e54bf649ef982dd5310b3e8e68dfffc0e  pypy2.7-v7.3.5rc1-osx64.tar.bz2
                +5d6a52bbed77855303dadf10a44c1f5e07920ad28948ecf6f13c57eed0c95f8b  pypy2.7-v7.3.5rc1-s390x.tar.bz2
                +45639e3b398f1dbac54f35e2aebc4770432519dd8838e0190708f1dcfa945356  pypy2.7-v7.3.5rc1-src.tar.bz2
                +67329cae37163b4838bb5768dd04ebc75ce1bbb0a62b74da404587f7344d80fc  pypy2.7-v7.3.5rc1-src.zip
                +6d36595d6cf6f61c33c0e36ae47d9f84abe1ab99cee6cb910a2517d4d3db6cb0  pypy2.7-v7.3.5rc1-win64.zip
                +

                pypy3.7-7.3.4 sha256:

                +
                a4148fa73b74a091e004e1f378b278c0b8830984cbcb91e10fa31fd915c43efe  pypy3.7-v7.3.4-aarch64.tar.bz2
                +04de1a2e80530f3d74abcf133ec046a0fb12d81956bc043dee8ab4799f3b77eb  pypy3.7-v7.3.4-linux32.tar.bz2
                +09d7298b44a38648a87995ec06e1e093761644e50f547c8bb0b2d7f4fe433548  pypy3.7-v7.3.4-linux64.tar.bz2
                +8a4f0e6c7e3845820202bf7f46b48e36886ceb820ff0767963fd74091c4f5d13  pypy3.7-v7.3.4-osx64.tar.bz2
                +7d6fb180c359a66a158ef6e81eeca88fbabbb62656a1700f425a70db18de2a0f  pypy3.7-v7.3.4-s390x.tar.bz2
                +74d3c1e79f3fc7d384ffb32d3d2a95c2d5f61b81091eccce12ac76030d96ad08  pypy3.7-v7.3.4-src.tar.bz2
                +80d4da3aaeb8b4cc5e4e4ea747f2e468e9f448da549aa7ada4d59c24380cda43  pypy3.7-v7.3.4-src.zip
                +0ff4e4653f1ff0653f105680eb101c64c857fa8f828a54a61b02f65c94b5d262  pypy3.7-v7.3.4-win64.zip
                +
                +647e34857d181e7560205eb877915b787836237929c7bd52860de626d5e85e9d  pypy3.7-v7.3.4rc2-aarch64.tar.bz2
                +cfc661034347d79ba907078b4e3acea4f09d0de0eaf474c5bde173666319780c  pypy3.7-v7.3.4rc2-linux32.tar.bz2
                +dcf1fa6dd5da4076f040ed4302a22c8da3838335e64cd118c29d69eb7d443d6b  pypy3.7-v7.3.4rc2-linux64.tar.bz2
                +c9ecc213cdc3169ef230d85e49d9d073ffc1ba0a36bc1d8483f724e31b9d9d12  pypy3.7-v7.3.4rc2-osx64.tar.bz2
                +fcc5c02382f67c7ee6f267b459131519b6a72e60ae370d6e398d54c0e07080f9  pypy3.7-v7.3.4rc2-s390x.tar.bz2
                +f1257d4d8a3d84e84ff85c83f4f5bc2e126727d7595c536ccbe1a03a280c0df6  pypy3.7-v7.3.4rc2-src.tar.bz2
                +dfab9881e2c42ae61115aa6ed77389f835094fd783dc08cf4dee1ebfdd4c1d47  pypy3.7-v7.3.4rc2-src.zip
                +b62b7aad962a8c42895a13b08d68b32254934d6d1b1f5f1f02f762cbe111b035  pypy3.7-v7.3.4rc2-win64.zip
                +
                +958a562528d24fdb33b9fd12f2076f4b546dc218e0793324558560823234adb1  pypy3.7-v7.3.4rc1-aarch64.tar.bz2
                +d05299744ac8c6f12bb3587541ce106f3a93d9ed64b0529c46e79b56efd27b24  pypy3.7-v7.3.4rc1-linux32.tar.bz2
                +bb7ee16bdf7c1bbbca45d1228502a5c276be33e27e849525aa5a61c0eaec5b4a  pypy3.7-v7.3.4rc1-linux64.tar.bz2
                +6d3aea12b744413c874e33ff456f6591049e12dc1a356d975dc0e29a047a151e  pypy3.7-v7.3.4rc1-osx64.tar.bz2
                +8deb01eb54b95e480d2ee03ee9148ba0c1684b410165c198e9f68a015656246e  pypy3.7-v7.3.4rc1-src.tar.bz2
                +bf247839954a4518327d5cbc9ab1a1b4296982c2fe78671d59a58373239e675e  pypy3.7-v7.3.4rc1-src.zip
                +0819de5a5212bddef0f615f7ced03dfd9f5d4ee115ec3564119d45b6b447843f  pypy3.7-v7.3.4rc1-win64.zip
                +

                pypy2.7-7.3.4 sha256:

                +
                9e741162ce486b14fbcf5aa377796d26b0529a9352fb602ee8b66c005f8420d1  pypy2.7-v7.3.4-aarch64.tar.bz2
                +653cc3f0612399e494021027f4463d62639dffa4345736a16d0704f3f8a61d5f  pypy2.7-v7.3.4-linux32.tar.bz2
                +d3f7b0625e770d9be62201765d7d2316febc463372fba9c93a12969d26ae03dd  pypy2.7-v7.3.4-linux64.tar.bz2
                +ee7bf42ce843596521e02c763408a5164d18f23c9617f1b8e032ce0675686582  pypy2.7-v7.3.4-osx64.tar.bz2
                +f19b70ca5bd918d1349444be775bc2194c8165b0140e6e8b87c3ee101765a5ba  pypy2.7-v7.3.4-s390x.tar.bz2
                +ff9b928237767efe08ccfba79dae489519b3c768fb6e3af52d39c2a8a1c21ca4  pypy2.7-v7.3.4-src.tar.bz2
                +e0811ecc272fee58e01b95c4c12f23b115a3e64075a1b50dcefe8faaa6cca869  pypy2.7-v7.3.4-src.zip
                +1080012d7a3cea65182528259b51d52b1f61a3717377c2d9ba11ef36e06162d5  pypy2.7-v7.3.4-win64.zip
                +
                +f0a11bd48a01b27595e659c3a1b7fb936ac6e0a21574f1fc2f57fd032830342a  pypy2.7-v7.3.4rc2-aarch64.tar.bz2
                +81dd5ac16b11f6f9ba0ff2536306dd85997a6cad86aa4e7971e7805264d61716  pypy2.7-v7.3.4rc2-linux32.tar.bz2
                +077acdb14e797878341fc6f50d87a2f0c9b7d25215c6b2f73541bacb7730f64d  pypy2.7-v7.3.4rc2-linux64.tar.bz2
                +6a220785a962c56db26dd56245aacb7cb6658879ecaad9ada04d26df56da172c  pypy2.7-v7.3.4rc2-osx64.tar.bz2
                +a3201493550457f932ddf743118635a7e8ff6b5c5fd69d0b8596dfeabcc5bffd  pypy2.7-v7.3.4rc2-s390x.tar.bz2
                +1965dfc3de6fdae83bd954fed206111a020898708d8754705fb1312473be35bf  pypy2.7-v7.3.4rc2-src.tar.bz2
                +1072727a4a948b16ccebb165015e43716ffc586f5249356c97c454b24aacb2dd  pypy2.7-v7.3.4rc2-src.zip
                +e20f206ba8751d2c17ad80c66b7f4bd63c2f500cbfa9e8a3906cd7d77955e00f  pypy2.7-v7.3.4rc2-win64.zip
                +
                +ee4894169260d3e4c55e06232c96d690e41d13e9f82f1512edcf6b8d960b695d  pypy2.7-v7.3.4rc1-aarch64.tar.bz2
                +fd736003d5a7f5f2744269d67dc9a96005a5a2ceac8987007bd27ab57681c0f2  pypy2.7-v7.3.4rc1-linux32.tar.bz2
                +ec1cd67c28416c359dbe1caddf7ae7a0be10e3fbe6435150d39d4b7492469852  pypy2.7-v7.3.4rc1-linux64.tar.bz2
                +cce4e360b31010e415e397ce8982535db482e36c0f13934eaa6d9e1e30eb2bc3  pypy2.7-v7.3.4rc1-osx64.tar.bz2
                +84930e433a81f16dcf81b678c12167ef951cd74534ee1ee8e6b0b27b0a128e1d  pypy2.7-v7.3.4rc1-src.tar.bz2
                +7bdc1e5431a7429bd2ec2853c86a68f09069f080b9765a87084904f52adab789  pypy2.7-v7.3.4rc1-src.zip
                +02befc534dbcc2da6ad4c7e60735d977dc8b4f6901630eb599d1684cb86a58c7  pypy2.7-v7.3.4rc1-win64.zip
                +

                pypy3.7-7.3.3 sha256:

                +
                ee4aa041558b58de6063dd6df93b3def221c4ca4c900d6a9db5b1b52135703a8  pypy3.7-v7.3.3-aarch64.tar.bz2
                +7d81b8e9fcd07c067cfe2f519ab770ec62928ee8787f952cadf2d2786246efc8  pypy3.7-v7.3.3-linux32.tar.bz2
                +37e2804c4661c86c857d709d28c7de716b000d31e89766599fdf5a98928b7096  pypy3.7-v7.3.3-linux64.tar.bz2
                +d72b27d5bb60813273f14f07378a08822186a66e216c5d1a768ad295b582438d  pypy3.7-v7.3.3-osx64.tar.bz2
                +92000d90b9a37f2e9cb7885f2a872adfa9e48e74bf7f84a8b8185c8181f0502d  pypy3.7-v7.3.3-s390x.tar.bz2
                +f6c96401f76331e474cca2d14437eb3b2f68a0f27220a6dcbc537445fe9d5b78  pypy3.7-v7.3.3-src.tar.bz2
                +9e4756903b14c5f971989a2f5a4de6ee19b21a59f2a798b3ad2ad0e71b2582a5  pypy3.7-v7.3.3-src.zip
                +a282ce40aa4f853e877a5dbb38f0a586a29e563ae9ba82fd50c7e5dc465fb649  pypy3.7-v7.3.3-win32.zip
                +
                +54a1697d39f136c3e3961afbd58a049e10a5ed10e6d230e6729d696c226d5185  pypy3.7-v7.3.3rc2-aarch64.tar.bz2
                +796c0b57b28850f9a212593f30baf7c241c0ed3fe857048d2ea50b3e13b9773b  pypy3.7-v7.3.3rc2-linux32.tar.bz2
                +be427afe0434ac42b4da997c841250c499286c57f1c1e9a764d49787bbeeda38  pypy3.7-v7.3.3rc2-linux64.tar.bz2
                +e670772077ea400c8f276f8bea301a0c3fa0f037f7e174ae08b34d46e43ce433  pypy3.7-v7.3.3rc2-osx64.tar.bz2
                +b230bfd935d6a4ecfaf890c91431b56cb53325ad988899542b178610f94d5970  pypy3.7-v7.3.3rc2-s390x.tar.bz2
                +c4a7f8c8a00073de1f987562bed486c372005e021505d3847562966541e0ea6f  pypy3.7-v7.3.3rc2-src.tar.bz2
                +26ba0babe260fbc9264c15070b129593ca871c7658a661eacf4c5e27507542f7  pypy3.7-v7.3.3rc2-src.zip
                +53959607ea55de6ec5cf15227c195e3356d56629e91279ce26744cb3e392a863  pypy3.7-v7.3.3rc2-win32.zip
                +
                +45357c23a05bc4e4828c0c0964142a7c45f0bcc6653cae67837ff00a02ececb2  pypy3.7-v7.3.3rc1-aarch64.tar.bz2
                +22c04f6984c986895999c73d845e57957d86ab788137e482b60f83aa4983e278  pypy3.7-v7.3.3rc1-linux32.tar.bz2
                +2069912448749295537c2b381957c5e07dec103fc9a3322f2ce8a57b3fa6e60c  pypy3.7-v7.3.3rc1-linux64.tar.bz2
                +9fbbf9cfb9ca699e00ea08aaec6248625541998c251033aa3e6d8c592c0a6ff9  pypy3.7-v7.3.3rc1-osx64.tar.bz2
                +f502ed792c9da1531a413cd8a7c4c8158c649d7820cb4a910a5852866579c365  pypy3.7-v7.3.3rc1-s390x.tar.bz2
                +6780d79e205768a5b2c1d6ecc9e1c4a8c05811cc6b130ed728ba1a53088e0406  pypy3.7-v7.3.3rc1-src.tar.bz2
                +edaed54347b69d2a3037e427c60eb88050226cf082d26fff594221cbedab9cd8  pypy3.7-v7.3.3rc1-src.zip
                +3c82f4569293dcff5085f0c61af1ba2671217256c58b6e6092629a406eee4fc5  pypy3.7-v7.3.3rc1-win32.zip
                +

                pypy3.6-7.3.3 sha256:

                +
                bc82cf7f0182b942a2cfad4a0d167f364bfbf18f434e100a2fe62bc88547ac9b  pypy3.6-v7.3.3-aarch64.tar.bz2
                +f183c61e66fd2c536a65695bd7ff770748c2884c235a589b9c6ac63690770c69  pypy3.6-v7.3.3-linux32.tar.bz2
                +4fb85fdd516482cab727bb9473b066ff8fb672940dedf7ccc32bf92957d29e0a  pypy3.6-v7.3.3-linux64.tar.bz2
                +84126fcb957f260de221244222152c981643144df1d817329781f555daa52e35  pypy3.6-v7.3.3-osx64.tar.bz2
                +0de9c33ff3500c6e7fd273d0a6d341bc839b0298f697c4d6fe141f2b54c5c3e2  pypy3.6-v7.3.3-s390x.tar.bz2
                +a23d21ca0de0f613732af4b4abb0b0db1cc56134b5bf0e33614eca87ab8805af  pypy3.6-v7.3.3-src.tar.bz2
                +df534213c27c6ecc8e7d4f2a6950305301711ea3e132ec7a836959146761c9d8  pypy3.6-v7.3.3-src.zip
                +b935253877b703d29b1b11f79e66944f1f88adb8a76f871abf765d4de9d25f8a  pypy3.6-v7.3.3-win32.zip
                +
                +58a35d069bc887c09f8106aec1c0da18241f887dc227bd9e31bd2819496b8256  pypy3.6-v7.3.3rc2-aarch64.tar.bz2
                +e171477f56ada45ce64df6f91ad4961c13b674d268b8b16850d1bae5eda43393  pypy3.6-v7.3.3rc2-linux32.tar.bz2
                +df2f421c3782e09ca304f00afd79d7ac24224c3346b41ddae9ab919f4b243538  pypy3.6-v7.3.3rc2-linux64.tar.bz2
                +1b2715c8bdf97bbe2135a13562aaeab3408c1459d714412a0b0c607309c5c48b  pypy3.6-v7.3.3rc2-osx64.tar.bz2
                +d1eaa8ea52f8ce7b02ddc08cff56a64405cfdc7f657edd9bfbb8788484ab9c01  pypy3.6-v7.3.3rc2-s390x.tar.bz2
                +3c91a1e911eee1baf9093dcb66899bd06a9ddc095ee60c51c2bca1626497148f  pypy3.6-v7.3.3rc2-src.tar.bz2
                +e9e5dc879afcddc7ffea09500a092fe00c9070d8fd5008ef0342e0b77c9f9161  pypy3.6-v7.3.3rc2-src.zip
                +7bfdc3544216003b96e76f133073084f2918c5cd29642211735c8507142d107a  pypy3.6-v7.3.3rc2-win32.zip
                +
                +9e65dff7a5bc34d32ea88b9436a9f9629542dd3eb8f948f49ecce40112530199  pypy3.6-v7.3.3rc1-aarch64.tar.bz2
                +13a67079e78eaa01dcc2a8aa986a50944bc4bf42469c3c39e3ecb0f0cee31439  pypy3.6-v7.3.3rc1-linux32.tar.bz2
                +17fb6dff3a5fd9d9e791ce1cd8ae9076e5f47b8b463b7575e4403f01656b0735  pypy3.6-v7.3.3rc1-linux64.tar.bz2
                +2f62a9c9876d83a2bf04d8e5e1373aa7e0dcd1e523a58216e60f20329a536b9b  pypy3.6-v7.3.3rc1-osx64.tar.bz2
                +a652572f3c783c4c9cfae477a6a64584f2df39e4df75773131ab512e486d61f3  pypy3.6-v7.3.3rc1-s390x.tar.bz2
                +bd5e6d6ba3bd9bc1a233c2dd77b518fd1d337a37670fe0e23edf837852254ee7  pypy3.6-v7.3.3rc1-src.tar.bz2
                +e26c8c95e2d131507a08c3e8b8010e6dd366e8e9bf6e77db6844bc5145be1932  pypy3.6-v7.3.3rc1-src.zip
                +773ffcabddc3bdc626318f24f0ba256153eca517775425b618c1c7b8b10f1680  pypy3.6-v7.3.3rc1-win32.zip
                +

                pypy2.7-7.3.3 sha256:

                +
                23b145b7cfbaeefb6ee76fc8216c83b652ab1daffac490558718edbbd60082d8  pypy2.7-v7.3.3-aarch64.tar.bz2
                +bfbc81874b137837a8ba8c517b97de29f5a336f7ec500c52f2bfdbd3580d1703  pypy2.7-v7.3.3-linux32.tar.bz2
                +f412b602ccd6912ddee0e7523e0e38f4b2c7a144449c2cad078cffbdb66fd7b1  pypy2.7-v7.3.3-linux64.tar.bz2
                +f34dc4f5ded1f6bcea05841aa9781b9307329e3ab755607917148568824ae0b0  pypy2.7-v7.3.3-osx64.tar.bz2
                +8254a7fb98ea66c33324a403d06ccb052d616a4176ce0130591693ceeb011cf7  pypy2.7-v7.3.3-s390x.tar.bz2
                +f63488051ba877fd65840bf8d53822a9c6423d947839023b8720139f4b6e2336  pypy2.7-v7.3.3-src.tar.bz2
                +5ce67ea6afb0cf1a3e20bbd4bbd375e375f572d5325524f9c7760edf8521f029  pypy2.7-v7.3.3-src.zip
                +b3e660dae8d25d8278fd6a0db77e76a16ac9a8c1dca22e7e103d39ed696dc69e  pypy2.7-v7.3.3-win32.zip
                +
                +4f2eee1d8ae2571d6fde76141237cf7717324dd6b6a1aa50036c42266d92cbce  pypy2.7-v7.3.3rc2-aarch64.tar.bz2
                +79c741bd28f293820382f4ecd81414a327745956fa402a5dcfe38900e7520214  pypy2.7-v7.3.3rc2-linux32.tar.bz2
                +b227698c4797170b7fdb427a56632fa7733695dd3b31fd404ce4c0939505f918  pypy2.7-v7.3.3rc2-linux64.tar.bz2
                +451fca86c965e498ce2ada9474c36d316a627bd6aeeeb808b952a447c938c936  pypy2.7-v7.3.3rc2-osx64.tar.bz2
                +83147a40ecc2ab39679129f7898756febd09422ee63a0074fb7f844964c189d8  pypy2.7-v7.3.3rc2-s390x.tar.bz2
                +1d60d7f9662278ba59f34cd20c0332993c0bb117009309bc06bd3cb651318c36  pypy2.7-v7.3.3rc2-src.tar.bz2
                +4810fb6761eccf6f3e6a14f7a8e4010548e551928fef27fb9482b0c7e3e501d5  pypy2.7-v7.3.3rc2-src.zip
                +72a43db2c5bd639023adad2a5c9fd7d4db639c5269dcfeb19ef5b0576771ea9b  pypy2.7-v7.3.3rc2-win32.zip
                +
                +061be51e14fc5f16ce38a61b3873239a0a74b02af51be5930b52941bbb3e6eb2  pypy2.7-v7.3.3rc1-aarch64.tar.bz2
                +395113ae0a9d1e352e5aef22b1d9e272b029b186d5e1c7e204dd6df044647fc1  pypy2.7-v7.3.3rc1-linux32.tar.bz2
                +1e160ff884fdcdc3388b3c88a00ee54d0b11e7b3c94c4787a217eeea76da63e3  pypy2.7-v7.3.3rc1-linux64.tar.bz2
                +761b6e9485dd218e63d231f351f908e74c6cc6bb38cc3b61992b92a0e5384f02  pypy2.7-v7.3.3rc1-osx64.tar.bz2
                +72d62a3d0bfcb1693f44d5bc3601d528188838df9fbb885e3e18770f81f97e5a  pypy2.7-v7.3.3rc1-s390x.tar.bz2
                +39fa3f6f0921785c4b44ab2e47777d64480737c710672f09913b2306a1430281  pypy2.7-v7.3.3rc1-src.tar.bz2
                +6b5b466e74505e59985ff9583587a417a200ab2d41829b8c72c74daef4c0d44c  pypy2.7-v7.3.3rc1-src.zip
                +403bce17882ca7f305fedd9f604f5657364e4ef76086064bbed0a31dfbf47155  pypy2.7-v7.3.3rc1-win32.zip
                +

                pypy3.6-7.3.2 sha256:

                +
                164d6a0503c83dd328e1a6bf7fcb2b2e977c1d27c6fcc491a7174fd37bc32a12  pypy3.6-v7.3.2-aarch64.tar.bz2
                +6fa871dedf5e60372231362d2ccb0f28f623d42267cabb49be11a3e10bee2726  pypy3.6-v7.3.2-linux32.tar.bz2
                +d7a91f179076aaa28115ffc0a81e46c6a787785b2bc995c926fe3b02f0e9ad83  pypy3.6-v7.3.2-linux64.tar.bz2
                +fd457bfeaf54aa69417b6aa4817df40e702dc8aaaf7e83ba005d391a1bddfa96  pypy3.6-v7.3.2-osx64.tar.bz2
                +16afbaa245c016c054d9300c19433efcc76c50664ff2c86d913ff76ed0a729dc  pypy3.6-v7.3.2-s390x.tar.bz2
                +fd6175fed63ff9fccd7886068078853078948d98afae9bd4f5554c6f7873c10d  pypy3.6-v7.3.2-src.tar.bz2
                +edcbcd3598a91de3115f86550d1bc76ac46fc0a3e86a1e951769a993f6fbcbf0  pypy3.6-v7.3.2-src.zip
                +13a39d46340afed20f11de24e9068968386e4bb7c8bd168662711916e2bf1da6  pypy3.6-v7.3.2-win32.zip
                +
                +62e525c6c71c8264c8476e2c4afe11d2aa07b71f9bcf6d694fc4aae27bfcbb66  pypy3.6-v7.3.2rc2-aarch64.tar.bz2
                +e9de7036c663f08f06f760340c5d165d8bdecad159abd14d0d93d1bde714ed38  pypy3.6-v7.3.2rc2-linux32.tar.bz2
                +e3ac3cf1560f8aee41e542bd999214cbbe0645a4786e4d8a5dc3d58b219429f3  pypy3.6-v7.3.2rc2-linux64.tar.bz2
                +7995b74b190f619feb3f393620f63dd0f7cae9e8e298c0616bd184090c356c90  pypy3.6-v7.3.2rc2-osx64.tar.bz2
                +9c09100e3302221dbe9776bb3f99e870a8404a2f6afd7a056fa3b7116f5ab013  pypy3.6-v7.3.2rc2-s390x.tar.bz2
                +b7d4b3cf3ba7e7749421b1eb857be32d8e5fede124cb2a1d1e1bc606a437b4c5  pypy3.6-v7.3.2rc2-src.tar.bz2
                +f5c4f219a974c69b949221082b789a455a67f9f6a37c173cb48a6246ab57f05c  pypy3.6-v7.3.2rc2-src.zip
                +0555340fdd2e2fcbf114d1f2b57d798269dfccddf1b6419dbe3ce937927b0504  pypy3.6-v7.3.2rc2-win32.zip
                +
                +1c69cca7292e3c3ffcb7a09f5cdeb51d45e24dc75510b2c9bb410b8ffc57a579  pypy3.6-v7.3.2rc1-aarch64.tar.bz2
                +d5738cffc11b364b5f0bf4883c2e1fd46431822f3bd126c7d8c83e9b5f0e6543  pypy3.6-v7.3.2rc1-linux32.tar.bz2
                +41cab069841cfc713cc2d0526034f04fcbd741d67d70212926a3ff90754a39f5  pypy3.6-v7.3.2rc1-linux64.tar.bz2
                +afabd1ea5a7da31df547c1d4b7028caef1dfaad0ba7e9dda81da2884dfe3062c  pypy3.6-v7.3.2rc1-osx64.tar.bz2
                +9202fa080d821cca5fe788acfdee3020449e3c36df720ede89ef7389ad6d4a37  pypy3.6-v7.3.2rc1-src.tar.bz2
                +8dc4d906720208d590133d580bc7976f7aca1fedf49c3dec1eba1fccb39e0bdc  pypy3.6-v7.3.2rc1-src.zip
                +29d47b72cf417d12b23161d898dae38f48e48788733623ffb09807e913fbeb44  pypy3.6-v7.3.2rc1-win32.zip
                +

                pypy3.7-7.3.2 sha256:

                +
                c5c35a37917f759c19e2a6b3df3b4d56298faa2fae83c143469bcbda42ca5dd2  pypy3.7-v7.3.2-aarch64.tar.bz2
                +34c7e1c7bd06e437ad43cc90a20f9444be1f0a264d0955e32098294c30274784  pypy3.7-v7.3.2-linux32.tar.bz2
                +a285ddcbc909d68c648585fae4f33b0ba24961bb4e8fafe5874cf725d6e83df6  pypy3.7-v7.3.2-linux64.tar.bz2
                +337dd4d9e529d2f221e0beb092236c18430e0564ab835c6bba425a1daf7c9958  pypy3.7-v7.3.2-osx64.tar.bz2
                +d4ce71ebba148bf83c24fc963e8282c9b7f0c81fcf6b612301b8efe6bd7658d1  pypy3.7-v7.3.2-s390x.tar.bz2
                +9274186eb0c28716a8c6134803b1df857bc3f496e25e50e605c4d95201c8817d  pypy3.7-v7.3.2-src.tar.bz2
                +23363123c607058dac29995cf281c4609a8d8d278841a8f05ea8559bdb1678a8  pypy3.7-v7.3.2-src.zip
                +e3c589be07760bc3042981c379b7fd1603e832a4db426075f09e090473846a96  pypy3.7-v7.3.2-win32.zip
                +
                +78fe46fa8706e325bd0bdb81d6f0865b7dae0ffb22a77c533a24fa960e885b1b  pypy3.7-v7.3.2rc2-aarch64.tar.bz2
                +2ed3489e1ea42b1807e79ba46a2dfb2c763bdd4d15efac0fd8ba9cf05ab436bb  pypy3.7-v7.3.2rc2-linux32.tar.bz2
                +6c67701914b7885e67d282c1286e9109fc79e73ab65b5c164492fb024b8deb7f  pypy3.7-v7.3.2rc2-linux64.tar.bz2
                +28b48a691276a806bcf0009df5e367d90159b9b4a4161ad9857454999e6915ec  pypy3.7-v7.3.2rc2-osx64.tar.bz2
                +544023b22670be740970bfc8d67a102dfa045cb229e40271a4197a9e8d3bc5da  pypy3.7-v7.3.2rc2-s390x.tar.bz2
                +9a3f29338340ab5e006300b68369745bd16f99943a7d48d8440c5a0ad67a5c68  pypy3.7-v7.3.2rc2-src.tar.bz2
                +73a6c2241d0a5ce7741a15f8cfd205a6f1eb10310799d912c069d6be58907ba7  pypy3.7-v7.3.2rc2-src.zip
                +9a44c694f9c642a7a127241466f72ca58f303d3e148bf5488e34a162c7d7a55b  pypy3.7-v7.3.2rc2-win32.zip
                +
                +a7e2376f5e64256aa2e3cf3d403b4c48753c9c2588c57e0fc6bddebefacb3a9d  pypy3.7-v7.3.2rc1-aarch64.tar.bz2
                +e2b2fa3f83f4a3cc138eb88c3bbf4fde395faec6bc04cd72721623865a366d96  pypy3.7-v7.3.2rc1-linux32.tar.bz2
                +8173935a5d1cae7238cb27e35bf881ab0ed0d8bd978d3cf6c80311ed596324ba  pypy3.7-v7.3.2rc1-linux64.tar.bz2
                +e730cf9e5be8566544a478bf2da4bc4ab84428ac4f4a7bb8e001ea4516a3f3be  pypy3.7-v7.3.2rc1-osx64.tar.bz2
                +209c2136654ea116c316c6d5305659e8e33d49b9f9f61eee36c06330bb3214ba  pypy3.7-v7.3.2rc1-src.tar.bz2
                +419020e81793030cb6d011e7c0b75183163a7586a31ae88a6a52689e9c45926e  pypy3.7-v7.3.2rc1-src.zip
                +a6fc9d568c05504759e945e70b94fc55f5e99748eb01da4fb5192231238fa1d7  pypy3.7-v7.3.2rc1-win32.zip
                +

                pypy2.7-7.3.2 sha256:

                +
                fce1f06f20ab8bcacb9ac1c33572d6425033de53c3a93fbd5391189cc3e106cb  pypy2.7-v7.3.2-aarch64.tar.bz2
                +78f30ac17abe3cc077fc2456ef55adb51b052c5126011b2a32bacc858acaca7d  pypy2.7-v7.3.2-linux32.tar.bz2
                +8d4f08116a97153a0f739de8981874d544b564cbc87dd064cca33f36c29da13b  pypy2.7-v7.3.2-linux64.tar.bz2
                +10ca57050793923aea3808b9c8669cf53b7342c90c091244e9660bf797d397c7  pypy2.7-v7.3.2-osx64.tar.bz2
                +042d5e99f660de098de979c4b27f7f8c1332d904db379bb2bf2c3402729749bb  pypy2.7-v7.3.2-s390x.tar.bz2
                +8189480d8350ad6364d05c2b39fd7d832644d4b1cd018f785126389df45928d1  pypy2.7-v7.3.2-src.tar.bz2
                +d891c55f4e657b5e3fe609cee02b2288790abb5554a544ca047f088310d129c4  pypy2.7-v7.3.2-src.zip
                +0fd62265e0421a02432f10a294a712a5e784a8e061375e6d8ea5fd619be1be62  pypy2.7-v7.3.2-win32.zip
                +
                +fa76bfc65200eeb3b32253e674a9339a417aef23f5a5c54e0c519bbbfefcdc7e  pypy2.7-v7.3.2rc2-aarch64.tar.bz2
                +40ff311202eca98ef3d6edeac4171470135087a8de34296f486c17ec376ebe51  pypy2.7-v7.3.2rc2-linux32.tar.bz2
                +379d458c1a9d38c2b3a6a32bd805786fc584739548a697a4ef7b683bcfdfda3e  pypy2.7-v7.3.2rc2-linux64.tar.bz2
                +3d515a233c83cbc833bcdd0b75354b20dc79b9f6ca892a5db9cadaea36c6bb5b  pypy2.7-v7.3.2rc2-osx64.tar.bz2
                +41344e1e4d27d774780e9cace6e70c5025b510c82de708ea55b64d21ed0c2f40  pypy2.7-v7.3.2rc2-s390x.tar.bz2
                +144bfc9607e6319ba950de9a4d1587020e3f1311cc25a79d1711de78c5992f4f  pypy2.7-v7.3.2rc2-src.tar.bz2
                +f9de3fe464ca11dfcdd6816b64051f03bdba7c66755b17ddd4f071c4d08cc0fb  pypy2.7-v7.3.2rc2-src.zip
                +01a9b5b266fde443698cb01c7bac843cc0ed8747f47f1e8930666a4303bf83b2  pypy2.7-v7.3.2rc2-win32.zip
                +
                +925543a3161153d9b15df49000e96ce2625bf4371619667b5f37616b699acc21  pypy2.7-v7.3.2rc1-linux32.tar.bz2
                +6216e1bbac3b86bfd38d16f0685c34c8c9c7aaf908ebd00388844ec295b89c17  pypy2.7-v7.3.2rc1-linux64.tar.bz2
                +a6fcdb44f12379eb1a547750322bd4c154b6e0c5ee30f9de2d9e2b86b2f2f319  pypy2.7-v7.3.2rc1-osx64.tar.bz2
                +9f58b5bacab010d945d9c31e8b7a2539034858f4cdf048f016d8d04430688cc6  pypy2.7-v7.3.2rc1-src.tar.bz2
                +0c86b52f6ad09dce1275427c18a216a0cbb5cf0db89eba2389e97ae81416eef7  pypy2.7-v7.3.2rc1-src.zip
                +bbb737f4ce714af0e7797fc951f5231b26ee10f8bca3d969c5b732982f952957  pypy2.7-v7.3.2rc1-win32.zip
                +

                pypy2.7-7.3.1 sha256:

                +
                094f23ab262e666d8740bf27459a6b1215a628dad9b6c2a88f1ed5c793fab267  pypy2.7-v7.3.1-aarch64.tar.bz2
                +cd155d06cd0956d9de4a16e8a6bdf0722cb45b5bc4bbf805825d393ebd6690ad  pypy2.7-v7.3.1-linux32.tar.bz2
                +be74886547df7bf7094096a11fc0a48496779d0d1b71901797b0c816f92caca3  pypy2.7-v7.3.1-linux64.tar.bz2
                +dfd4651243441d2f8f1c348e9ecc09848642d0c31bb323aa8ac320e5b9f232f0  pypy2.7-v7.3.1-osx64.tar.bz2
                +1b65e085118e44ac57d38a9ba79516c68bf1fdcd65c81c66b5b5ffff06b4463b  pypy2.7-v7.3.1-ppc64.tar.bz2
                +d81c7177e25bd8b1c99081e32362a29ee467ccd310b17a11161f4a9b96222b20  pypy2.7-v7.3.1-ppc64le.tar.bz2
                +71ad5132a6fd32af0b538c17ebd1e0bfe5f5dfa74b129bce242bd28357bf35fc  pypy2.7-v7.3.1-s390x.tar.bz2
                +fa3771514c8a354969be9bd3b26d65a489c30e28f91d350e4ad2f4081a9c9321  pypy2.7-v7.3.1-src.tar.bz2
                +71d764c94f467f9dd75b6af086e2b69e0d520bf6227bcb39055c24c799c135be  pypy2.7-v7.3.1-src.zip
                +e3c0dfb385d9825dd7723f26576d55d43ed92f1178f2399ab39e9fa11621a47b  pypy2.7-v7.3.1-win32.zip
                +

                pypy3.6-7.3.1 sha256:

                +
                0069bc3c1570b935f1687f5e128cf050cd7229309e48fad2a2bf2140d43ffcee  pypy3.6-v7.3.1-aarch64.tar.bz2
                +2e7a818c67f3ac0708e4d8cdf1961f30cf9586b3f3ca2f215d93437c5ea4567b  pypy3.6-v7.3.1-linux32.tar.bz2
                +f67cf1664a336a3e939b58b3cabfe47d893356bdc01f2e17bc912aaa6605db12  pypy3.6-v7.3.1-linux64.tar.bz2
                +d9c1778cd1ba37e129b495ea0f35ccdd9b68f5cd9d33ef0ce24e955c16d8840b  pypy3.6-v7.3.1-osx64.tar.bz2
                +ee02b3e65f0ca49dc09850b57835c2b65d1234f26f7991027ca6d65fadbaa4d9  pypy3.6-v7.3.1-ppc64.tar.bz2
                +089fd806629ebf79cb0cb4b0c303d8665f360903b79f0df9214b58dbc42e8231  pypy3.6-v7.3.1-ppc64le.tar.bz2
                +147592888e25678c1ae1c2929dc7420b3a0990117fdb25f235cb22476b4e4b5a  pypy3.6-v7.3.1-s390x.tar.bz2
                +0c2cc3229da36c6984baee128c8ff8bb4516d69df1d73275dc4622bf249afa83  pypy3.6-v7.3.1-src.tar.bz2
                +91e7ba30519f2c4c1833280acfb660b48392ef57c5ed0fa4e8af78587a7b8f20  pypy3.6-v7.3.1-src.zip
                +752fbe8c4abee6468e5ce22af82818f821daded36faa65f3d69423f9c217007a  pypy3.6-v7.3.1-win32.zip
                +

                pypy2.7-7.3.0 sha256:

                +
                a3dd8d5e2a656849fa344dce4679d854a19bc4a096a0cf62b46a1be127a5d56c  pypy2.7-v7.3.0-aarch64.tar.bz2
                +eac1308b7d523003a5f6d20f58406d52ab14611bcec750122ae513a5a35110db  pypy2.7-v7.3.0-linux32.tar.bz2
                +f4950a54378ac637da2a6defa52d6ffed96af12fcd5d74e1182fb834883c9826  pypy2.7-v7.3.0-linux64.tar.bz2
                +ca7b056b243a6221ad04fa7fc8696e36a2fb858396999dcaa31dbbae53c54474  pypy2.7-v7.3.0-osx64.tar.bz2
                +82e62869812aa2953a4f83e96c813cbc52973dfa5e42605e72b6610ac13f2481  pypy2.7-v7.3.0-ppc64.tar.bz2
                +592a6db77270b922ffa13cbeced9eabbc36c532ded9fc145f6a19073d3e78499  pypy2.7-v7.3.0-ppc64le.tar.bz2
                +d254b82a00021339762198e41ba7f72316010d0f9bd4dcd7b0755185da9c005e  pypy2.7-v7.3.0-s390x.tar.bz2
                +b0b25c7f8938ab0fedd8dedf26b9e73c490913b002b484c1b2f19d5844a518de  pypy2.7-v7.3.0-src.tar.bz2
                +42dc84a277e7a5e635fe39bbd745f06135902c229a257123332b7555800d915b  pypy2.7-v7.3.0-src.zip
                +a9e3c5c983edba0313a41d3c1ab55b080816c4129e67a6c272c53b9dbcdd97ec  pypy2.7-v7.3.0-win32.zip
                +

                pypy3.6-7.3.0 sha256:

                +
                b900241bca7152254c107a632767f49edede99ca6360b9a064141267b47ef598  pypy3.6-v7.3.0-aarch64.tar.bz2
                +7045b295d38ba0b5ee65bd3f078ca249fcf1de73fedeaab2d6ad78de2eab0f0e  pypy3.6-v7.3.0-linux32.tar.bz2
                +d3d549e8f43de820ac3385b698b83fa59b4d7dd6cf3fe34c115f731e26ad8856  pypy3.6-v7.3.0-linux64.tar.bz2
                +87b2545dad75fe3027b4b2108aceb9fdadcdd24e61ae312ac48b449fdd452bf3  pypy3.6-v7.3.0-osx64.tar.bz2
                +e2587e8da2abb12a86bf75941ce739124d2a1156367a9a3d729ac31d0841c300  pypy3.6-v7.3.0-ppc64.tar.bz2
                +d6f3b701313df69483b43ebdd21b9652ae5e808b2eea5fbffe3b74b82d2e7433  pypy3.6-v7.3.0-ppc64le.tar.bz2
                +0fe2f7bbf42ea88b40954d7de773a43179a44f40656f2f58201524be70699544  pypy3.6-v7.3.0-s390x.tar.bz2
                +48d12c15fbcbcf4a32882a883195e1f922997cde78e7a16d4342b9b521eefcfa  pypy3.6-v7.3.0-src.tar.bz2
                +8ae9efd0a2aadb19e892bbd07eca8ef51536296a3ef93964149aceba511e79ca  pypy3.6-v7.3.0-src.zip
                +30e6870c4f3d8ef91890a6556a98080758000ba7c207cccdd86a8f5d358998c1  pypy3.6-v7.3.0-win32.zip
                +

                pypy2.7-7.2.0 sha256:

                +
                57b0be053c6a5f069e23b843f38863cf7920f5eef7bc89f2e086e5c3a28a2ba9  pypy2.7-v7.2.0-aarch64.tar.bz2
                +76d666e5aee54b519d6ec1af4ef0cbdc85f7f9276dd554e97deb026adfd0c936  pypy2.7-v7.2.0-linux32.tar.bz2
                +05acf28e6a243026ecad933b9361d8f74b41f00818071b76b38c4694cc4c9599  pypy2.7-v7.2.0-linux64.tar.bz2
                +36aa2f2440e762333569118dd0b3d5371d575c40966effa194d116c5453ddb52  pypy2.7-v7.2.0-osx64.tar.bz2
                +fb51150a4ce94b0ca8587899ba69c41fc58a6b35c5340ea6926376ecb9cfcac4  pypy2.7-v7.2.0-ppc64.tar.bz2
                +5c4224525657c29b815cb2c6b3f9bc5a267368cc6adf0fedb235a6052929f65f  pypy2.7-v7.2.0-ppc64le.tar.bz2
                +bb7ae585ecb4d904c890e28a2c5b6bd379f57cc3d9e38ff45597ff54fa935eaa  pypy2.7-v7.2.0-s390x.tar.bz2
                +55cb7757784fbe3952102447f65b27d80e6c885a464a7af1a9ce264492439dcc  pypy2.7-v7.2.0-src.tar.bz2
                +897038550614d558f9f6718409b107e27903ef2b2b57ec250939d1b1ebdf0aba  pypy2.7-v7.2.0-src.zip
                +956eeaaaac053e5d0917e77a3d2ad1933ab5561eb3e6e71235780b5aa5fd2bb7  pypy2.7-v7.2.0-win32.zip
                +

                pypy2.7-7.1.1 sha256:

                +
                41ca390a76ca0d47b8353a0d6a20d5aab5fad8b0bb647b960d8c33e873d18ef5  pypy2.7-v7.1.1-linux32.tar.bz2
                +73b09ef0860eb9ad7997af3030b22909806a273d90786d78420926df53279d66  pypy2.7-v7.1.1-linux64.tar.bz2
                +31a17294dec96c2191885c776b4ee02112957dc874f7ba03e570537a77b78c35  pypy2.7-v7.1.1-osx64.tar.bz2
                +1ef94c3a9c67c2335cee0b21753036b4696ed588b9d54b7b8036a6ae47f7001d  pypy2.7-v7.1.1-s390x.tar.bz2
                +5f06bede6d71dce8dfbfe797aab26c8e35cb990e16b826914652dc093ad74451  pypy2.7-v7.1.1-src.tar.bz2
                +d9b07a2954ad6dbde94feffd848311e2b5169563d33e3e9f17969579b01a4158  pypy2.7-v7.1.1-src.zip
                +9c59226311f216a181e70ee7b5aa4d9665a15d00f24ae02acec9af7d96355f63  pypy2.7-v7.1.1-win32.zip
                +

                pypy2.7-7.1.0 sha256:

                +
                44ec91e8cb01caab289d8763c203f3aaf288d14325a6c42692bd1ac4e870d758  pypy2.7-v7.1.0-linux32.tar.bz2
                +fef176a29a2ef068c00c8098e59dab935ca6e956f089672b3f7351da95a034f5  pypy2.7-v7.1.0-linux64.tar.bz2
                +8be43685ce718b0768387450fc6dc395d60809b778b6146c353ef67826022153  pypy2.7-v7.1.0-osx64.tar.bz2
                +b065f55741bcb37863f1eca30ce91c9d79159371a6994100930cdc2ede3237bc  pypy2.7-v7.1.0-s390x.tar.bz2
                +b051a71ea5b4fa27d0a744b28e6054661adfce8904dcc82500716b5edff5ce4b  pypy2.7-v7.1.0-src.tar.bz2
                +e60ce30f9947844da43daaa7658adc0c05330681305225954114772f42df06ec  pypy2.7-v7.1.0-src.zip
                +76658c9ad679d562b8b6a09d006caa666406337b9834ff56db16980c5e549f20  pypy2.7-v7.1.0-win32.zip
                +

                pypy3.6-7.2.0 sha256:

                +
                f82dc9dc6c692417ee9727f23beae75364a5757ebdc657a2a1d0010ac3ad17ab  pypy3.6-v7.2.0-aarch64.tar.bz2
                +45e99de197cb3e974cfc8d45e0076ad2066852e61e56b3eafd1237efafd2c43e  pypy3.6-v7.2.0-linux32.tar.bz2
                +aa128e555ad0fe5c4c15104ae0903052bd232b6e3a73f5fe023d27b8fd0d6089  pypy3.6-v7.2.0-linux64.tar.bz2
                +836abb0ec303b90a684533711ed3b8269d3e8c64805b595e410920abdea678ac  pypy3.6-v7.2.0-osx64.tar.bz2
                +14021d196e393b3a6d2395ab94ceec347753715e37223efe4c50b7c141b351a2  pypy3.6-v7.2.0-ppc64.tar.bz2
                +6aef73a3b68e9a6c062cadd83d3db16790960cf97401ca6f2aad2195e9b05c35  pypy3.6-v7.2.0-ppc64le.tar.bz2
                +a11da8118064db102d159e9221319c428b298c4a87f26166fd6ae94be8d6ae0d  pypy3.6-v7.2.0-s390x.tar.bz2
                +0d7c707df5041f1593fe82f29c40056c21e4d6cb66554bbd66769bd80bcbfafc  pypy3.6-v7.2.0-src.tar.bz2
                +405ac35695dd374d5ea192cb44cb47231f9a65812cc7b6549df33df12ffe54db  pypy3.6-v7.2.0-src.zip
                +c926f622bec24a8b348591d631717ace83b3a6c3c2dac02b157b622b97d1fc9c  pypy3.6-v7.2.0-win32.zip
                +

                pypy3.6-7.1.1 sha256:

                +
                cb11ef4b0df569c28390b1ee93029159e1b90bfbad98df6abd629d5203b2abd9  pypy3.6-v7.1.1-linux32.tar.bz2
                +8014f63b1a34b155548852c7bf73aab2d41ebddf2c8fb603dc9dd8509be93db0  pypy3.6-v7.1.1-linux64.tar.bz2
                +a5c2f2bfa2b4a4d29e8a67baab95699b169054066df218a14f171bb84a6df0c0  pypy3.6-v7.1.1-osx64.tar.bz2
                +4a91bf2d9a142b6dbf82b5301cb510535ae9a54e1645546b2e0735a7b5ed85ba  pypy3.6-v7.1.1-s390x.tar.bz2
                +6a3ef876e3691a54f4cff045028ec3be94ab9beb2e99f051b83175302c1899a8  pypy3.6-v7.1.1-src.tar.bz2
                +4a3ebeb767740f2dc0b886d02797d21d7d69f154cf951bb991c19bd485e6cae1  pypy3.6-v7.1.1-src.zip
                +8b513b254de5f31890f5956569de9aec3a0a91d7aba72fc89d66901f4a8ccf49  pypy3.6-v7.1.1-win32.zip
                +

                pypy3.6-7.1.0 sha256:

                +
                031bfac61210a6e161bace0691b854dc15d01b0e624dc0588c544ee5e1621a83  pypy3.6-v7.1.0-linux32.tar.bz2
                +270dd06633cf03337e6f815d7235e790e90dabba6f4b6345c9745121006925fc  pypy3.6-v7.1.0-linux64.tar.bz2
                +d46e005ba095cb4a7006079ffbf4fe63c18cf5e9d8ce9ce8383efc1a4863ab5b  pypy3.6-v7.1.0-osx64.tar.bz2
                +243cd0cc188a94c1f064f402ae72b8ba4303eb3137eac53c53826472b8005098  pypy3.6-v7.1.0-s390x.tar.bz2
                +faa81f469bb2a7cbd22c64f22d4b4ddc5a1f7c798d43b7919b629b932f9b1c6f  pypy3.6-v7.1.0-src.tar.bz2
                +4858e7e8a0007bc3b381bd392208b28d30889a4e5a88a3c28e3d9dc4f25b654e  pypy3.6-v7.1.0-src.zip
                +77a0576a3d518210467f0df2d0d9a1892c664566dc02f25d974c2dbc6b4749e7  pypy3.6-v7.1.0-win32.zip
                +
                + +
                +
                + + \ No newline at end of file diff --git a/compat.html b/compat.html new file mode 100644 index 000000000..558af063f --- /dev/null +++ b/compat.html @@ -0,0 +1,188 @@ + + + + + +Python compatibility | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Python compatibility

                + + +
                +

                The goal of this page is to point out some of the differences between running +python with PyPy and with CPython.

                +

                TL;DR

                +

                Pure python code works, but there are a few differences with object lifetime +management. Modules that use the CPython C API will probably work, but will +not achieve a speedup via the JIT. We encourage library authors to use CFFI +and HPy instead.

                +

                If you are looking for how to use PyPy with the scientific python ecosystem, +we encourage you to use conda, since they repackage common libraries like +scikit-learn and SciPy for PyPy.

                +

                Refcounting, __del__, and resource use

                +

                The main difference in pure-python code that is not going to be fixed is that +PyPy does +not support refcounting semantics for "automatically" releasing state when +an object's __del__ is called. The following code won't fill the +file immediately, but only after a certain period of time, when the GC +does a collection and flushes the output, since the file is only closed when +the __del__ method is called:

                +
                open("filename", "w").write("stuff")
                +
                +

                The proper fix is

                +
                with open("filename", "w") as f:
                +    f.write("stuff")
                +
                +

                The same problem---not closing your files---can also show up if your +program opens a large number of files without closing them explicitly. +In that case, you can easily hit the system limit on the number of file +descriptors that are allowed to be opened at the same time.

                +

                PyPy can be run with the command-line option -X track-resources (as in, +pypy -X track-resources myprogram.py). This produces a ResourceWarning +when the GC closes a non-closed file or socket. The traceback for the place +where the file or socket was allocated is given as well, which aids finding +places where close() is missing.

                +

                Similarly, remember that you must close() a non-exhausted +generator in order to have its pending finally or with +clauses executed immediately:

                +
                def mygen():
                +    with foo:
                +        yield 42
                +
                +for x in mygen():
                +    if x == 42:
                +        break    # foo.__exit__ is not run immediately!
                +
                +# fixed version:
                +gen = mygen()
                +try:
                +    for x in gen:
                +        if x == 42:
                +            break
                +finally:
                +    gen.close()
                +
                +

                More generally, __del__() methods are not executed as predictably +as on CPython: they run "some time later" in PyPy (or not at all if +the program finishes running in the meantime). See more details +here.

                +

                Why is memory usage so high?

                +

                Note that PyPy returns unused memory to the operating system only after +a madvise() system call (at least Linux, OS X, BSD) or on Windows. It is +important to realize that you may not see this in top. The unused +pages are marked with MADV_FREE, which tells the system "if you +need more memory at some point, grab this page". As long as memory is +plentiful, the RES column in top might remain high. (Exceptions to +this rule are systems with no MADV_FREE, where we use +MADV_DONTNEED, which forcefully lowers the RES. This includes +Linux <= 4.4.)

                +

                More info

                +

                A more complete list of known differences is available at our dev site.

                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/contact.html b/contact.html new file mode 100644 index 000000000..dbee17cd7 --- /dev/null +++ b/contact.html @@ -0,0 +1,125 @@ + + + + + +Contact | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +
                + + \ No newline at end of file diff --git a/download.html b/download.html new file mode 100644 index 000000000..eaa8d7a18 --- /dev/null +++ b/download.html @@ -0,0 +1,191 @@ + + + + + +Download and Install | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Download and Install

                + + +
                +

                We provide pre-compiled binaries for many platforms and OSes. There are also +pre-compiled binaries available on conda-forge. We have found conda-forge +a convenient and cooperative community for distributing not only the +interpreter, but many packages like SciPy that are difficult to build and +which do not yet have binary PyPy builds available on PyPI.

                + + ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                PyPy latest

                OS

                PyPy3.10

                PyPy2.7

                Notes

                Linux x86 64 bit

                Download

                Download

                compatible with CentOS7 and later.

                Windows 64 bit

                Download

                Download

                compatible with any 64-bit Windows; +you might need the VC runtime library installer vcredist.x64.exe

                MacOS arm64

                Download

                Download

                MacOS >= 11. Not signed, for signed packages use Homebrew.

                MacOS x86_64

                Download

                Download

                MacOS >= 10.15, not for Mojave and below. Not signed, for signed +packages use Homebrew.

                Linux ARM64

                Download

                Download

                compatible with CentOS7 and later.

                +

                Source

                +

                More information

                +

                Visit the more information page for other platforms, information about +running PyPy, STM, instructions on building from source and more.

                +

                Checksums

                +

                Checksums for the downloads are here

                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/download_advanced.html b/download_advanced.html new file mode 100644 index 000000000..8641ab836 --- /dev/null +++ b/download_advanced.html @@ -0,0 +1,415 @@ + + + + + +Download (advanced) | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Download (advanced)

                + + +
                +

                We provide pre-compiled binaries for many platforms and OSes:

                +
                  +
                • the Python2.7 compatible release — PyPy2.7 v7.3.17

                • +
                • the Python3.10 compatible release — PyPy3.10 v7.3.17

                • +
                + + ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                PyPy latest

                OS

                PyPy3.10

                PyPy2.7

                Notes

                Linux x86 64 bit

                Download

                Download

                compatible with CentOS7 and later.

                Windows 64 bit

                Download

                Download

                compatible with any 64-bit Windows; +you might need the VC runtime library installer vcredist.x64.exe

                MacOS arm64

                Download

                Download

                MacOS >= 11. Not signed, for signed packages use Homebrew.

                MacOS x86_64

                Download

                Download

                MacOS >= 10.15, not for Mojave and below. Not signed, for signed +packages use Homebrew.

                Linux ARM64

                Download

                Download

                compatible with CentOS7 and later.

                + + ++++++ + + + + + + + + + + + + +
                Other Platforms

                OS

                PyPy3.10

                PyPy2.7

                Notes

                Linux x86 32 bit

                Download

                Download

                compatible with CentOS7 and later

                +

                "JIT Compiler" version

                +

                The binaries above include a Just-in-Time compiler. On x86-32, they only work on +CPUs that have the SSE2 instruction set (most of them do, nowadays). They also +contain stackless extensions, like greenlets.

                +

                Linux binaries and common distributions

                +

                Since version 7.3, the linux x86 binaries ship with versions +of OpenSSL, SQLite3, libffi, expat, and TCL/TK binary libraries linked in. This +makes the binaries "portable" so that they should run on any current glibc-based +linux platform. The ideas were adopted from the portable-pypy package.

                +

                This solution to the portability problem means that the versions of the +packaged libraries are frozen to the version shipped, so updating your system +libraries will not affect this installation of PyPy. Also see the note about +SSL certificates below.

                +

                There are other solutions:

                +
                  +
                • download PyPy from your release vendor (usually an outdated +version): Ubuntu (PPA), Debian, Homebrew, MacPorts, +Fedora, Gentoo and Arch are known to package PyPy, with various +degrees of being up-to-date. FreshPorts packages for FreeBSD.

                • +
                • recompile the CFFI-based TCL/TK, OpenSSL, or sqlite3 modules, using system +libraries and the scripts in pypy/lib_pypy/pypy_tools. This solution will +not solve compatibility issues with libffi, since that is baked into PyPy.

                • +
                • or translate your own PyPy.

                • +
                +

                Previous versions can be downloaded from here, or directly from the buildbot's +mirror.

                +

                If your CPU is really, really old, it may be a x86-32 without SSE2. +There is untested support for manually translating PyPy's JIT without +SSE2 (--jit-backend=x86-without-sse2) but note that your machine +is probably low-spec enough that running CPython on it is a better +idea in the first place.

                +

                PyPy-STM 2.5.1

                +

                This is a special version of PyPy! See the Software Transactional +Memory (STM) documentation.

                +

                Other versions

                +

                The other versions of PyPy are:

                +
                  +
                • Try the most up-to-date nightly binary builds, if the official +release is too old for what you want to do.

                • +
                • Reverse debugger: This version enables debugging your Python +programs by going forward and backward in time. See the RevDB +documentation.

                • +
                +
                  +
                • Old-style sandboxing: A special safe version. +This is NOT the version announced in-development during 2019! +Read the docs about sandboxing. +This version is not supported and not actively maintained. You +will likely have to fix some issues yourself, or checkout an old +version, or otherwise play around on your own. We provide this +documentation only for historical reasons. Please do not use in +production. For reference, there are some very old, unmaintained +binaries for Linux (32bit, 64bit).

                • +

                Installing

                +

                All binary versions are packaged in a tar.bz2 or zip file. When +uncompressed, they run in-place. You can uncompress them +either somewhere in your home directory or, say, in /opt. +If you want, put a symlink from somewhere like +/usr/local/bin/pypy to /path/to/pypy_expanded/bin/pypy. Do +not move or copy the executable pypy outside the tree --- put +a symlink to it, otherwise it will not find its libraries.

                +

                Installing more modules

                +

                The typical pip workflow for packages with binary extensions +requires that the package maintainers provide a wheel for PyPy, which is +sometimes too much work for the overburdened maintainers. For more information +see the installation documentation.

                +

                If you use your distribution's PyPy package we recommend you install packages +into a virtualenv. If you try to build a module and the build process complains +about "missing Python.h", you may need to install the pypy-dev package.

                +

                Building from source

                +

                (see more build instructions)

                +
                  +
                1. +

                  Get the source code. The preferred way is to checkout the current +trunk using git. The trunk usually works and is of course +more up-to-date:

                  +
                  git clone https://github.com/pypy/pypy
                  +
                  +

                  The trunk contains PyPy 2. For PyPy 3, switch to the correct branch:

                  +
                  # switch to the branch that implements Python 3.10
                  +git checkout branches/py3.10
                  +
                  +

                  Alternatively, get one of the following smaller packages for the source at +the same revision as the above binaries:

                  + +
                2. +
                3. +

                  Make sure you installed the dependencies. See the list here.

                  +
                4. +
                5. +

                  Enter the goal directory:

                  +
                  cd pypy/pypy/goal
                  +
                  +
                6. +
                7. +

                  Run the rpython script. Here are the common combinations +of options (works also with python instead of pypy; +requires CPython 2.7 or PyPy 2, even to build PyPy 3):

                  +
                  # get the JIT version
                  +pypy ../../rpython/bin/rpython -Ojit targetpypystandalone
                  +# get the no-jit version
                  +pypy ../../rpython/bin/rpython -O2 targetpypystandalone
                  +# get the sandbox version
                  +pypy ../../rpython/bin/rpython -O2 --sandbox targetpypystandalone
                  +
                  +
                8. +
                9. Enjoy Mandelbrot :-) It takes on the order of half an hour to +finish the translation, and about 3GB of RAM on a 32-bit system +and about 5GB on 64-bit systems. (Do not start a translation on a +machine with insufficient RAM! It will just swap forever. See +notes below in that case.)

                10. +
                11. If you want to install this PyPy as root, please read the next section, +Packaging.

                12. +
                +

                Notes:

                +
                  +
                • It is recommended to use PyPy to do translations, instead of using CPython, +because it is twice as fast. You should just start by downloading an +official release of PyPy (with the JIT). If you really have to use CPython +then note that we are talking about CPython 2.7 here, not CPython 3.x. +(Older versions like 2.6 are out.)

                • +
                • +

                  On some 32-bit systems, the address space limit of 2 or 3 GB of RAM +can be an issue. More generally you may be just a little bit low on +RAM. First note that 2 GB is really not enough nowadays; on Windows +you first need to refer to the Windows build instructions. More +precisely, translation on 32-bit takes at this point 2.7 GB if PyPy is +used and 2.9 GB if CPython is used. There are two workarounds:

                  +

                  1. use PyPy, not CPython. If you don't have any PyPy so far, not even +an older version, then you need to build one first, with some parts +removed. So, first translate with:

                  +
                  cpython2 rpython -Ojit targetpypystandalone \
                  +--withoutmod-micronumpy --withoutmod-cpyext
                  +
                  +

                  then copy pypy-c and libpypy_c.so somewhere else, and finally +call it with ...pypy-c ../../rpython/bin/rpython -Ojit.

                  +

                  2. if even using PyPy instead of CPython is not enough, try to tweak +some internal parameters. Example (slower but saves around 400MB):

                  +
                  PYPY_DONT_RUN_SUBPROCESS=1 PYPY_GC_MAX_DELTA=200MB \
                  +pypy --jit loop_longevity=300 ../../rpython/bin/rpython \
                  +-Ojit --source
                  +# then read the next point about --source
                  +
                  +
                • +
                • You can run translations with --source, which only builds the C +source files (and prints at the end where). Then you can cd there +and execute make. This is another way to reduce memory usage. +Note that afterwards, you have to run manually pypy-c +.../pypy/tool/build_cffi_imports.py if you want to be able to import +the cffi-based modules.

                • +
                • Like other JITs, PyPy doesn't work out of the box on some Linux +distributions that trade full POSIX compliance for extra security +features. E.g. with PAX, you have to run PyPy with paxctl -cm. +This also applies to translation (unless you use CPython to run the +translation and you specify --source).

                • +

                Packaging

                +

                Once PyPy is translated from source, a binary package similar to those +provided in the section "JIT Compiler" version above can be +created with the package.py script:

                +
                cd ./pypy/pypy/tool/release/
                +python package.py --help  # for information
                +python package.py --archive-name pypy-my-own-package-name
                +
                +

                It is recommended to use package.py because custom scripts will +invariably become out-of-date. If you want to write custom scripts +anyway, note an easy-to-miss point: some modules are written with CFFI, +and require some compilation. If you install PyPy as root without +pre-compiling them, normal users will get errors:

                +
                  +
                • PyPy 2.5.1 or earlier: normal users would see permission errors. +Installers need to run pypy -c "import gdbm" and other similar +commands at install time; the exact list is in package.py. Users +seeing a broken installation of PyPy can fix it after-the-fact if they +have sudo rights, by running once e.g. sudo pypy -c "import gdbm".

                • +
                • PyPy 2.6 and later: anyone would get ImportError: no module named +_gdbm_cffi. Installers need to run pypy _gdbm_build.py in the +lib_pypy directory during the installation process (plus others; +see the exact list in package.py). Users seeing a broken +installation of PyPy can fix it after-the-fact, by running pypy +/path/to/lib_pypy/_gdbm_build.py. This command produces a file +called _gdbm_cffi.pypy-41.so locally, which is a C extension +module for PyPy. You can move it at any place where modules are +normally found: e.g. in your project's main directory, or in a +directory that you add to the env var PYTHONPATH.

                • +

                Checksums

                +

                Checksums for the downloads are here

                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 000000000..00cf381e7 Binary files /dev/null and b/favicon.ico differ diff --git a/favicon2.ico b/favicon2.ico new file mode 100644 index 000000000..8e5cc5a9e Binary files /dev/null and b/favicon2.ico differ diff --git a/favicon2.png b/favicon2.png new file mode 100644 index 000000000..d14717362 Binary files /dev/null and b/favicon2.png differ diff --git a/favicon32x32.ico b/favicon32x32.ico new file mode 100644 index 000000000..9e34b3f45 Binary files /dev/null and b/favicon32x32.ico differ diff --git a/features.html b/features.html new file mode 100644 index 000000000..1aca2de90 --- /dev/null +++ b/features.html @@ -0,0 +1,206 @@ + + + + + + +PyPy - Features | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +

                PyPy - Features

                + + +
                +

                PyPy is a replacement for CPython. It is built using the RPython +language that was co-developed with it. The main reason to use it +instead of CPython is speed: it runs generally faster (see next section).

                +

                PyPy implements Python 2.7.18 and 3.10.14. +It supports all of the core language. It supports most of +the commonly used Python standard library modules. For known differences with +CPython, see our compatibility page.

                +

                The following CPU architectures are supported and maintained:

                + +

                PyPy's x86 version runs on several operating systems, such as Linux +(32/64 bits), MacOS (64 bits), Windows (32 bits), OpenBSD, FreeBSD. +Non-x86 versions are supported on Linux, and ARM64 is supported on MacOS.

                +

                If you are interested in helping, see our howtohelp page.

                +

                The main features of PyPy:

                +

                Speed

                +

                Our main executable comes with a Just-in-Time compiler. It is +really fast in running most benchmarks—including very large and +complicated Python applications, not just 10-liners.

                +

                There are two cases that you should be aware of, where PyPy will not be +able to speed up your code:

                +
                  +
                • Short-running processes: if it doesn't run for at least a few seconds, +then the JIT compiler won't have enough time to warm up.

                • +
                • If all the time is spent in run-time libraries (i.e. in C functions), +and not actually running Python code, the JIT compiler will not help.

                • +
                +

                So the case where PyPy works best is when executing long-running +programs where a significant fraction of the time is spent executing +Python code. This is the case covered by the majority of our +benchmarks, but not all of them --- the goal of PyPy is to get speed +but still support (ideally) any Python program.

                +

                Memory usage

                +

                Memory-hungry Python programs (several hundreds of MBs or more) might +end up taking less space than they do in CPython. It is not always +the case, though, as it depends on a lot of details. Also note that +the baseline is higher than CPython's.

                +

                Stackless

                +

                Support for Stackless and greenlets is now integrated in the normal +PyPy. More detailed information is available here.

                +

                Other features

                +

                PyPy has many secondary features and semi-independent +projects. We will mention here:

                +
                  +
                • +

                  Other languages: we also implemented other languages that make +use of our RPython toolchain: Prolog (almost complete), as +well as Smalltalk, JavaScript, Io, Scheme and Gameboy.

                  +

                  There is also a Ruby implementation called Topaz and a PHP implementation +called HippyVM.

                  +
                • +

                Sandboxing

                +

                PyPy's sandboxing is a working prototype for the idea of running untrusted +user programs. Unlike other sandboxing approaches for Python, PyPy's does not +try to limit language features considered "unsafe". Instead we replace all +calls to external libraries (C or platform) with a stub that communicates +with an external process handling the policy.

                +

                To run the sandboxed process, you need to get the full sources and +build pypy-sandbox from it (see Building from source). These +instructions give you a pypy-c that you should rename to +pypy-sandbox to avoid future confusion. Then run:

                +
                cd pypy/sandbox
                +pypy_interact.py path/to/pypy-sandbox
                +# don't confuse it with pypy/goal/pyinteractive.py!
                +
                +

                You get a fully sandboxed interpreter, in its own filesystem hierarchy +(try os.listdir('/')). For example, you would run an untrusted +script as follows:

                +
                mkdir virtualtmp
                +cp untrusted.py virtualtmp/
                +pypy_interact.py --tmp=virtualtmp pypy-sandbox /tmp/untrusted.py
                +
                +

                Note that the path /tmp/untrusted.py is a path inside the sandboxed +filesystem. You don't have to put untrusted.py in the real /tmp +directory at all.

                +

                To read more about its features, try pypy_interact.py --help or go to +our documentation site.

                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/howtohelp.html b/howtohelp.html new file mode 100644 index 000000000..46bf19986 --- /dev/null +++ b/howtohelp.html @@ -0,0 +1,129 @@ + + + + + +How to help? | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +

                How to help?

                + + +
                +

                How to help PyPy?

                +

                Here are some ideas to help PyPy move forward:

                +
                  +
                • use pypy for your projects and provide detailed feedback

                • +
                • talk to us about how to support Python 3.x

                • +
                • write blog posts or tweets about your experiences

                • +
                • help porting to new platforms

                • +
                • contact us and get involved

                • +
                • donate some money to enable others to help

                • +
                • take on our consultants and make PyPy work better for you

                • +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/images/2021-graphviz-02-cfg-types.png b/images/2021-graphviz-02-cfg-types.png new file mode 100644 index 000000000..eeb260886 Binary files /dev/null and b/images/2021-graphviz-02-cfg-types.png differ diff --git a/images/2021-graphviz-02-cfg-types.thumbnail.png b/images/2021-graphviz-02-cfg-types.thumbnail.png new file mode 100644 index 000000000..eeb260886 Binary files /dev/null and b/images/2021-graphviz-02-cfg-types.thumbnail.png differ diff --git a/images/2021-graphviz-03-classhier.png b/images/2021-graphviz-03-classhier.png new file mode 100644 index 000000000..571b209e9 Binary files /dev/null and b/images/2021-graphviz-03-classhier.png differ diff --git a/images/2021-graphviz-03-classhier.thumbnail.png b/images/2021-graphviz-03-classhier.thumbnail.png new file mode 100644 index 000000000..571b209e9 Binary files /dev/null and b/images/2021-graphviz-03-classhier.thumbnail.png differ diff --git a/images/2021-graphviz-04-classhier-detailed.png b/images/2021-graphviz-04-classhier-detailed.png new file mode 100644 index 000000000..eec320f97 Binary files /dev/null and b/images/2021-graphviz-04-classhier-detailed.png differ diff --git a/images/2021-graphviz-04-classhier-detailed.thumbnail.png b/images/2021-graphviz-04-classhier-detailed.thumbnail.png new file mode 100644 index 000000000..eec320f97 Binary files /dev/null and b/images/2021-graphviz-04-classhier-detailed.thumbnail.png differ diff --git a/images/2021-graphviz-05-call-graph.png b/images/2021-graphviz-05-call-graph.png new file mode 100644 index 000000000..bb6656d16 Binary files /dev/null and b/images/2021-graphviz-05-call-graph.png differ diff --git a/images/2021-graphviz-05-call-graph.thumbnail.png b/images/2021-graphviz-05-call-graph.thumbnail.png new file mode 100644 index 000000000..bb6656d16 Binary files /dev/null and b/images/2021-graphviz-05-call-graph.thumbnail.png differ diff --git a/images/2021-graphviz-06-trace.png 
b/images/2021-graphviz-06-trace.png new file mode 100644 index 000000000..1d623467b Binary files /dev/null and b/images/2021-graphviz-06-trace.png differ diff --git a/images/2021-graphviz-06-trace.thumbnail.png b/images/2021-graphviz-06-trace.thumbnail.png new file mode 100644 index 000000000..1d623467b Binary files /dev/null and b/images/2021-graphviz-06-trace.thumbnail.png differ diff --git a/images/2021-graphviz-07-parse-tree.png b/images/2021-graphviz-07-parse-tree.png new file mode 100644 index 000000000..b65f8864a Binary files /dev/null and b/images/2021-graphviz-07-parse-tree.png differ diff --git a/images/2021-graphviz-07-parse-tree.thumbnail.png b/images/2021-graphviz-07-parse-tree.thumbnail.png new file mode 100644 index 000000000..b65f8864a Binary files /dev/null and b/images/2021-graphviz-07-parse-tree.thumbnail.png differ diff --git a/images/2021-graphviz-08-json-parser.png b/images/2021-graphviz-08-json-parser.png new file mode 100644 index 000000000..211f08758 Binary files /dev/null and b/images/2021-graphviz-08-json-parser.png differ diff --git a/images/2021-graphviz-08-json-parser.thumbnail.png b/images/2021-graphviz-08-json-parser.thumbnail.png new file mode 100644 index 000000000..211f08758 Binary files /dev/null and b/images/2021-graphviz-08-json-parser.thumbnail.png differ diff --git a/images/2021-open-ended-traces-01-inlining.svg b/images/2021-open-ended-traces-01-inlining.svg new file mode 100644 index 000000000..a63558f6a --- /dev/null +++ b/images/2021-open-ended-traces-01-inlining.svg @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + Trace Attempt 1 + Operations from f + Operations from g + More operations from f + + Inlining + + + Abort due to trace limit + + + + Disable inlining of g + + + Trace Attempt 2 + Operations from f + Call to g + More operations from f + + End of trace + + Machine Code + Hitting the Trace Limit with Inlining + + \ No newline at end of file diff --git 
a/images/2021-open-ended-traces-01-inlining.thumbnail.svg b/images/2021-open-ended-traces-01-inlining.thumbnail.svg new file mode 100644 index 000000000..736af3c0e --- /dev/null +++ b/images/2021-open-ended-traces-01-inlining.thumbnail.svg @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + Trace Attempt 1 + Operations from f + Operations from g + More operations from f + + Inlining + + + Abort due to trace limit + + + + Disable inlining of g + + + Trace Attempt 2 + Operations from f + Call to g + More operations from f + + End of trace + + Machine Code + Hitting the Trace Limit with Inlining + + \ No newline at end of file diff --git a/images/2021-open-ended-traces-02-no-inlining.svg b/images/2021-open-ended-traces-02-no-inlining.svg new file mode 100644 index 000000000..753f0cf59 --- /dev/null +++ b/images/2021-open-ended-traces-02-no-inlining.svg @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + image/svg+xml + + + + + + + + Trace Attempt 1 + Operations from f + + Abort due to trace limit + + + + Mark f as huge + + + Trace Attempt 2 + Operations from f + + guard_always_fail + + Machine Code + no inlining + + Hitting the Trace Limit Without Inlining + + + \ No newline at end of file diff --git a/images/2021-open-ended-traces-02-no-inlining.thumbnail.svg b/images/2021-open-ended-traces-02-no-inlining.thumbnail.svg new file mode 100644 index 000000000..6df6c276e --- /dev/null +++ b/images/2021-open-ended-traces-02-no-inlining.thumbnail.svg @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + image/svg+xml + + + + + + + + Trace Attempt 1 + Operations from f + + Abort due to trace limit + + + + Mark f as huge + + + Trace Attempt 2 + Operations from f + + guard_always_fail + + Machine Code + no inlining + + Hitting the Trace Limit Without Inlining + + + \ No newline at end of file diff --git a/images/2021-open-ended-traces-03-complete.svg b/images/2021-open-ended-traces-03-complete.svg new file mode 100644 index 000000000..69eaf47f0 
--- /dev/null +++ b/images/2021-open-ended-traces-03-complete.svg @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + Operations from f + + guard_always_fail + Trace 1 + + + More operations from f + + guard_always_fail + Trace 2 + + + + Even moreoperations from f + + end of trace + Trace n + + + ... + Chaining Several Incomple Traces Together + + \ No newline at end of file diff --git a/images/2021-open-ended-traces-03-complete.thumbnail.svg b/images/2021-open-ended-traces-03-complete.thumbnail.svg new file mode 100644 index 000000000..6dab59cae --- /dev/null +++ b/images/2021-open-ended-traces-03-complete.thumbnail.svg @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + Operations from f + + guard_always_fail + Trace 1 + + + More operations from f + + guard_always_fail + Trace 2 + + + + Even moreoperations from f + + end of trace + Trace n + + + ... + Chaining Several Incomple Traces Together + + \ No newline at end of file diff --git a/images/2022-pypy-diagrams-collage-small.png b/images/2022-pypy-diagrams-collage-small.png new file mode 100644 index 000000000..a6a43e190 Binary files /dev/null and b/images/2022-pypy-diagrams-collage-small.png differ diff --git a/images/2022-pypy-diagrams-collage-small.thumbnail.png b/images/2022-pypy-diagrams-collage-small.thumbnail.png new file mode 100644 index 000000000..a6a43e190 Binary files /dev/null and b/images/2022-pypy-diagrams-collage-small.thumbnail.png differ diff --git a/images/2022-pypy-diagrams-collage.png b/images/2022-pypy-diagrams-collage.png new file mode 100644 index 000000000..0f52e0a62 Binary files /dev/null and b/images/2022-pypy-diagrams-collage.png differ diff --git a/images/2022-pypy-diagrams-collage.thumbnail.png b/images/2022-pypy-diagrams-collage.thumbnail.png new file mode 100644 index 000000000..0f52e0a62 Binary files /dev/null and b/images/2022-pypy-diagrams-collage.thumbnail.png differ diff --git 
a/images/2022-pypy-pictures-collage-small.jpg b/images/2022-pypy-pictures-collage-small.jpg new file mode 100644 index 000000000..df45ce57f Binary files /dev/null and b/images/2022-pypy-pictures-collage-small.jpg differ diff --git a/images/2022-pypy-pictures-collage-small.thumbnail.jpg b/images/2022-pypy-pictures-collage-small.thumbnail.jpg new file mode 100644 index 000000000..df45ce57f Binary files /dev/null and b/images/2022-pypy-pictures-collage-small.thumbnail.jpg differ diff --git a/images/2022-pypy-pictures-collage.jpg b/images/2022-pypy-pictures-collage.jpg new file mode 100644 index 000000000..5ed0068be Binary files /dev/null and b/images/2022-pypy-pictures-collage.jpg differ diff --git a/images/2022-pypy-pictures-collage.thumbnail.jpg b/images/2022-pypy-pictures-collage.thumbnail.jpg new file mode 100644 index 000000000..5ed0068be Binary files /dev/null and b/images/2022-pypy-pictures-collage.thumbnail.jpg differ diff --git a/images/2022-pypy-posts-per-year.svg b/images/2022-pypy-posts-per-year.svg new file mode 100644 index 000000000..0b4c7e8e1 --- /dev/null +++ b/images/2022-pypy-posts-per-year.svg @@ -0,0 +1,387 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/images/2022-pypy-posts-per-year.thumbnail.svg b/images/2022-pypy-posts-per-year.thumbnail.svg new file mode 100644 index 000000000..dc7f320ad --- /dev/null +++ b/images/2022-pypy-posts-per-year.thumbnail.svg @@ -0,0 +1,387 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/images/2024-complex-lattice.svg b/images/2024-complex-lattice.svg new file mode 100644 index 000000000..36a275856 --- /dev/null +++ b/images/2024-complex-lattice.svg @@ -0,0 +1,188 @@ + + +G + + + +top + + + + + +top + + + +bottom + + + + + +bottom + + + +-inf + +-inf + + + +bottom->-inf + + + + + +-2 + +-2 + + + +bottom->-2 + + + + + +-1 + +-1 + + + +bottom->-1 + + + + + +0 + +0 + + + +bottom->0 + + + + + +1 + +1 + + + +bottom->1 + + + + + +2 + +2 + + + +bottom->2 + + + + + ++inf + ++inf + + + +bottom->+inf + + + + + +negative + +negative + + + +-inf->negative + + + + + +-2->negative + + + + + +-1->negative + + + + + +0->top + + + + + +nonnegative + +nonnegative + + + +1->nonnegative + + + + + +2->nonnegative + + + + + ++inf->nonnegative + + + + + +nonzero + +nonzero + + + +negative->nonzero + + + + + +nonnegative->nonzero + + + + + +nonzero->top + + + + + \ No newline at end of file diff --git a/images/2024-complex-lattice.thumbnail.svg 
b/images/2024-complex-lattice.thumbnail.svg new file mode 100644 index 000000000..294b5f6a1 --- /dev/null +++ b/images/2024-complex-lattice.thumbnail.svg @@ -0,0 +1,188 @@ + + +G + + + +top + + + + + +top + + + +bottom + + + + + +bottom + + + +-inf + +-inf + + + +bottom->-inf + + + + + +-2 + +-2 + + + +bottom->-2 + + + + + +-1 + +-1 + + + +bottom->-1 + + + + + +0 + +0 + + + +bottom->0 + + + + + +1 + +1 + + + +bottom->1 + + + + + +2 + +2 + + + +bottom->2 + + + + + ++inf + ++inf + + + +bottom->+inf + + + + + +negative + +negative + + + +-inf->negative + + + + + +-2->negative + + + + + +-1->negative + + + + + +0->top + + + + + +nonnegative + +nonnegative + + + +1->nonnegative + + + + + +2->nonnegative + + + + + ++inf->nonnegative + + + + + +nonzero + +nonzero + + + +negative->nonzero + + + + + +nonnegative->nonzero + + + + + +nonzero->top + + + + + \ No newline at end of file diff --git a/images/download.svg b/images/download.svg new file mode 100644 index 000000000..a8e1c922b --- /dev/null +++ b/images/download.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/download.thumbnail.svg b/images/download.thumbnail.svg new file mode 100644 index 000000000..8c3c335da --- /dev/null +++ b/images/download.thumbnail.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/header-background.png b/images/header-background.png new file mode 100644 index 000000000..fd1fbf805 Binary files /dev/null and b/images/header-background.png differ diff --git a/images/header-background.thumbnail.png b/images/header-background.thumbnail.png new file mode 100644 index 000000000..fd1fbf805 Binary files /dev/null and b/images/header-background.thumbnail.png differ diff --git a/images/people/alex.jpg b/images/people/alex.jpg new file mode 100644 index 000000000..4e90a7d57 Binary files /dev/null and b/images/people/alex.jpg differ diff --git a/images/people/alex.thumbnail.jpg b/images/people/alex.thumbnail.jpg new file mode 100644 index 000000000..4e90a7d57 Binary 
files /dev/null and b/images/people/alex.thumbnail.jpg differ diff --git a/images/people/antocuni.png b/images/people/antocuni.png new file mode 100644 index 000000000..f1c5fb7f0 Binary files /dev/null and b/images/people/antocuni.png differ diff --git a/images/people/antocuni.thumbnail.png b/images/people/antocuni.thumbnail.png new file mode 100644 index 000000000..f1c5fb7f0 Binary files /dev/null and b/images/people/antocuni.thumbnail.png differ diff --git a/images/people/arigo.png b/images/people/arigo.png new file mode 100644 index 000000000..d35a82b51 Binary files /dev/null and b/images/people/arigo.png differ diff --git a/images/people/arigo.thumbnail.png b/images/people/arigo.thumbnail.png new file mode 100644 index 000000000..d35a82b51 Binary files /dev/null and b/images/people/arigo.thumbnail.png differ diff --git a/images/people/cfbolz.jpg b/images/people/cfbolz.jpg new file mode 100644 index 000000000..6dfe42e41 Binary files /dev/null and b/images/people/cfbolz.jpg differ diff --git a/images/people/cfbolz.thumbnail.jpg b/images/people/cfbolz.thumbnail.jpg new file mode 100644 index 000000000..6dfe42e41 Binary files /dev/null and b/images/people/cfbolz.thumbnail.jpg differ diff --git a/images/people/fijal.jpg b/images/people/fijal.jpg new file mode 100644 index 000000000..b12d61602 Binary files /dev/null and b/images/people/fijal.jpg differ diff --git a/images/people/fijal.thumbnail.jpg b/images/people/fijal.thumbnail.jpg new file mode 100644 index 000000000..b12d61602 Binary files /dev/null and b/images/people/fijal.thumbnail.jpg differ diff --git a/images/people/fijal_thumb.png b/images/people/fijal_thumb.png new file mode 100644 index 000000000..05b195b6e Binary files /dev/null and b/images/people/fijal_thumb.png differ diff --git a/images/people/fijal_thumb.thumbnail.png b/images/people/fijal_thumb.thumbnail.png new file mode 100644 index 000000000..05b195b6e Binary files /dev/null and b/images/people/fijal_thumb.thumbnail.png differ diff --git 
a/images/people/hakanardo.jpg b/images/people/hakanardo.jpg new file mode 100644 index 000000000..2800d7e75 Binary files /dev/null and b/images/people/hakanardo.jpg differ diff --git a/images/people/hakanardo.thumbnail.jpg b/images/people/hakanardo.thumbnail.jpg new file mode 100644 index 000000000..2800d7e75 Binary files /dev/null and b/images/people/hakanardo.thumbnail.jpg differ diff --git a/images/people/holger1.jpg b/images/people/holger1.jpg new file mode 100644 index 000000000..fe73614fe Binary files /dev/null and b/images/people/holger1.jpg differ diff --git a/images/people/holger1.thumbnail.jpg b/images/people/holger1.thumbnail.jpg new file mode 100644 index 000000000..fe73614fe Binary files /dev/null and b/images/people/holger1.thumbnail.jpg differ diff --git a/images/pypy-logo-nav-grey.png b/images/pypy-logo-nav-grey.png new file mode 100644 index 000000000..0ac40afe4 Binary files /dev/null and b/images/pypy-logo-nav-grey.png differ diff --git a/images/pypy-logo-nav-grey.thumbnail.png b/images/pypy-logo-nav-grey.thumbnail.png new file mode 100644 index 000000000..0ac40afe4 Binary files /dev/null and b/images/pypy-logo-nav-grey.thumbnail.png differ diff --git a/images/pypy-logo-nav.png b/images/pypy-logo-nav.png new file mode 100644 index 000000000..a148179fd Binary files /dev/null and b/images/pypy-logo-nav.png differ diff --git a/images/pypy-logo-nav.thumbnail.png b/images/pypy-logo-nav.thumbnail.png new file mode 100644 index 000000000..a148179fd Binary files /dev/null and b/images/pypy-logo-nav.thumbnail.png differ diff --git a/images/pypy-logo.png b/images/pypy-logo.png new file mode 100644 index 000000000..612ab0a77 Binary files /dev/null and b/images/pypy-logo.png differ diff --git a/images/pypy-logo.svg b/images/pypy-logo.svg new file mode 100644 index 000000000..f082dc54b --- /dev/null +++ b/images/pypy-logo.svg @@ -0,0 +1,220 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/images/pypy-logo.thumbnail.png b/images/pypy-logo.thumbnail.png new file mode 100644 index 000000000..612ab0a77 Binary files /dev/null and b/images/pypy-logo.thumbnail.png differ diff --git a/images/pypy-logo.thumbnail.svg b/images/pypy-logo.thumbnail.svg new file mode 100644 index 000000000..f082dc54b --- /dev/null +++ b/images/pypy-logo.thumbnail.svg @@ -0,0 +1,220 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/images/pypy_speed_graph.png b/images/pypy_speed_graph.png new file mode 100644 index 000000000..0621911ba Binary files /dev/null and b/images/pypy_speed_graph.png differ diff --git a/images/pypy_speed_graph.thumbnail.png b/images/pypy_speed_graph.thumbnail.png new file mode 100644 index 000000000..0621911ba Binary files /dev/null and b/images/pypy_speed_graph.thumbnail.png differ diff --git a/index.html b/index.html new file mode 100644 index 000000000..c9837b2c7 --- /dev/null +++ b/index.html @@ -0,0 +1,158 @@ + + + + + +PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +

                PyPy

                + + +
                +
                +
                +

                A fast, compliant alternative implementation of Python

                + + + Download PyPy +

                What is PyPy ?

                +

                Documentation (external link)

                +
                +
                PyPy logo
                +
                +

                On average, PyPy is 4.4 times faster than CPython 3.7. We currently support python 3.10 and 2.7.

                +
                PyPy vs. Python speed comparison graph

                PyPy (with JIT) benchmark times normalized to CPython. Smaller is +better. Based on the geometric average of all benchmarks

                +
                "... we are avid fans of PyPy and
                +commensurately thankful for the great work by the PyPy team over the
                +years. PyPy has enabled us to use Python for a larger part of our
                +toolset than CPython alone would have supported, and its smooth
                +integration with C/C++ through CFFI has helped us attain a better
                +tradeoff between performance and programmer productivity in our
                +projects"
                +-- Vilhjálmur Þorsteinsson, founder and CEO of Miðeind, Feb 2022
                +

                Advantages and distinct Features

                +
                  +
                • Speed: thanks to its Just-in-Time compiler, Python programs +often run faster on PyPy. (What is a JIT compiler?)

                • +
                • Memory usage: memory-hungry Python programs (several hundreds of +MBs or more) might end up taking less space than they do in CPython.

                • +
                • Compatibility: PyPy is highly compatible with existing python code. +It supports cffi, cppyy, and can run popular python libraries like +twisted and django. It can also run NumPy, Scikit-learn and more via a +c-extension compatibility layer.

                • +
                • Stackless: PyPy comes by default with support for stackless mode, +providing micro-threads for massive concurrency.

                • +
                • As well as other features.

                • +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/people.html b/people.html new file mode 100644 index 000000000..f086f0f22 --- /dev/null +++ b/people.html @@ -0,0 +1,209 @@ + + + + + +The PyPy Team (from 2008) | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +

                The PyPy Team (from 2008)

                + + +
                +

                Armin Rigo

                +images/people/arigo.png

                Armin Rigo is a former researcher at the Heinrich-Heine Universität +Düsseldorf (Germany). He studied Mathematics at the University +of Lausanne (Switzerland), obtained his Ph.D. in Logic and Set +Theory at the Free University of Brussels (Belgium) in 2002, and +worked at the University of Southampton (UK) until 2005. He is +the author of Psyco, the first just-in-time compiler for Python. +He is one of the founders and lead developers of the PyPy project +which began in 2003. He has taken part in all areas, from the Python +language definition to the RPython translation framework, +including the garbage collector and the tracing just-in-time +compiler.

                +

                Maciej Fijałkowski

                +images/people/fijal_thumb.png

                Maciej is a freelancer working mostly on PyPy for the past several years. +He has been a core developer since 2006, working on all kinds of parts in +the entire codebase including JIT, GC and assembler backends. +Maciej has been going to many conferences, advertising PyPy to a broader +audience for the past several years, including a keynote at Pycon 2010. +He's also the main maintainer of +jitviewer, a tool for analyzing performance of your python programs under +PyPy.

                +

                Carl Friedrich Bolz

                +images/people/cfbolz.jpg

                Carl Friedrich has been a core developer since 2005, currently doing his PhD at the +Heinrich-Heine Universität Düsseldorf (Germany). He has worked on most aspects +of PyPy, from the core interpreter to the GC to the JIT. He has published +several papers about the inner workings of PyPy, presenting them at various +scientific conferences. Carl Friedrich is also interested in other dynamic +language implementations and was the original author of the Prolog +implementation.

                +

                Carl Friedrich likes science fiction novels and sometimes plays the bassoon.

                +

                Antonio Cuni

                +images/people/antocuni.png

                Antonio Cuni loves skiing, mountains and programming languages. He studied +Computer Science at the University of Genova (Italy), and then at the same +university he obtained his Ph.D. in Computer Science in 2010, with a +dissertation about the PyPy CLI JIT backend. He has been a core PyPy +developer since 2006, working in various areas including the "object oriented +backends" for the CLI and JVM, the RPython translation framework, the Python +interpreter and the JIT compiler generator. Apart from PyPy, he is the author of +other popular tools such as pdb++.

                +

                Benjamin Peterson

                +

                Both a PyPy and CPython core developer, Benjamin knows way too much about the +nooks and crannies of the Python language. He is driven by a fascination with +interpreters and compilers of all shapes and sizes. Around the PyPy project, he +tries to be generally useful and has taken on major projects including rewriting +PyPy's Python compiler and porting PyPy to Python 2.7.

                +

                Alex Gaynor

                +images/people/alex.jpg

                Alex is a software engineer living in Washington, DC. He's been a PyPy developer +since 2010, and has worked on many parts of the codebase, including the JIT +compiler's optimizers, the RPython translation toolchain, and the Python +interpreter. In addition to his work on PyPy, Alex is also the creator of +Topaz, a Ruby VM built on RPython and a core developer of Django (a Python web +framework) and CPython, as well as a retired member of the board of directors +of the Python Software Foundation.

                +

                Håkan Ardö

                +images/people/hakanardo.jpg

                Håkan Ardö received his master of science degree in electrical +engineering from Lund University in 2002. He specialized in +VLSI-design and Image Processing. He worked as a software +engineer at Axis Communications 2002-2003 before doing his +PhD at the Centre for Mathematical Sciences of Lund University +2003-2009 in the Mathematical Imaging Group. His thesis work consisted +of designing image processing algorithms for traffic surveillance, +aiming for a system that automatically measures the safety of an +intersection or road segment. He is currently working part-time as a +postdoc at the Centre for Mathematical Sciences of Lund University +continuing this work and part-time as CTO with a spinoff company +Cognimatics. His contributions to PyPy started in 2010 and consist of +the array module as well as work on the JIT compiler's trace optimizers.

                +

                Holger Krekel

                +images/people/holger1.jpg

                Holger Krekel is a founder of the PyPy project and has participated in +PyPy core development for several years as well as maintained much of +its infrastructure. He also is the author of the popular py.test and +tox testing tools as well as execnet, a library for easily deploying +different interacting Python interpreters side by side. He helped +manage multiple PyPy funding contracts through his company merlinux and was a +PyPy representative within the Software Freedom Conservancy (SFC). He +holds a summa cum laude degree in computer science with a thesis about +artificial intelligence applied to the game of Go. As of 2011 he is on +another sabbatical-ish leave, caring for his newborn son, travelling +and pondering what comes next. Other than that he continues to care +for testing and some PyPy co-ordination bits behind the scene.

                +

                Samuele Pedroni

                +

                Samuele Pedroni got involved with PyPy almost at its inception in the +spring of 2003. One of the design contributors to PyPy, his help has +ranged from infrastructure and processes, through building out +RPython... optimizing the Python interpreter, to compressing resume +data in the last incarnation of the JIT compiler. Tempted away into the +application side of the software equation, these days he contributes +some words and wisdom to PyPy's paper writing.

                +

                Many more people

                +

                PyPy is and has always been an effort of many volunteers. Consult the LICENSE +file for details.

                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/performance.html b/performance.html new file mode 100644 index 000000000..3e399b3b7 --- /dev/null +++ b/performance.html @@ -0,0 +1,379 @@ + + + + + +Performance | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +

                Performance

                + + +
                +

                This document collects strategies, tactics and tricks for making your +code run faster under PyPy. Many of these are also useful hints for +stock Python and other languages. For contrast, we also describe some +CPython (stock Python) optimizations that are not needed in PyPy.

                +
                +

                Profiling: vmprof

                +

                As a general rule, when considering performance issues, follow these +three points: first measure them (it is counter-productive to fight +imaginary performance issues); then profile your code (it is useless +to optimize the wrong parts). Only optimize then.

                +

                PyPy 2.6 introduced vmprof, a very-low-overhead statistical profiler. +The standard, non-statistical cProfile is also supported, and can be +enabled without turning off the JIT. We do recommend vmprof anyway +because turning on cProfile can distort the result (sometimes massively, +though hopefully this should not be too common).

                +

                +

                Optimization strategy

                +

                These suggestions apply to all computer languages. They're here as +reminders of things to try before any Python or PyPy-specific tweaking.

                +

                Build a regression-test suite

                +

                Before you start tuning, build a regression-test suite for your code. +This front-loads a significant amount of work, but it means you can +try lots of optimizations without worrying so much about introducing +functional bugs.

                +

                Measure, don't guess

                +

                Human beings are bad at guessing or intuiting where the hotspots in code are. +Measure, don't guess; use a profiler to pin down the 20% of the +code where the code is spending 80% of its time, then speed-tune that.

                +

                Measuring will save you a lot of effort wasted on tuning parts of the code +that aren't actually bottlenecks.

                +

                As you tune, re-profile frequently so you can see how the hottest spots +are shifting around.

                +

                I/O-bound is different from compute-bound

                +

                Be aware of the difference between code that is compute-bound (slow +because it's doing a huge number of instructions) and code that is I/O +bound (slow because of disk or network delays).

                +

                Expect to get most of your gains from optimizing compute-bound code. +It's usually (though not always) a sign that you're near the end of +worthwhile tuning when profiling shows that the bulk of the +application's time is spent on network and disk I/O.

                +

                Tune your algorithms first

                +

                Generally, when your code is doing things that are O(n**2) or larger +in the size of your data set, the cost of those operations is going +to swamp any small gains you can pick up with the tricks we describe +here.

                +

                Tune your algorithms first. It's time to think about applying our +list of micro-tuning tips after you think you've optimized out +intrinsically expensive operations.

                +

                That said, be prepared for the possibility that you will discover +better-hidden algorithmic problems as you micro-tune. Likely +you will go through this cycle more than once.

                +

                Focus on tight loops

                +

                It's extremely common for high time costs to lurk within some +innocuous-looking code inside a tight loop - especially in code +that does something like a searching/matching/lookup operation +or any kind of graph traversal.

                +

                Probably the most common kind of performance-killer in compute-bound +code is an O(n**2) operation that is disguised by being some sort of +O(n) lookup or match inside an O(n) loop.

                +

                Another common time-sink is relatively expensive common-setup +operations that are performed inside tight loops but could be moved +to before they start. (For a representative case of this, see the +micro-tuning tip on regexp compilation.)

                +

                Smaller is faster

                +

                Modern computers have multiple levels of memory caching, some directly +on the processor chip. Causing a cache miss at any level incurs a +performance penalty proportional to random-access time for the next +outward (and much slower) layer of cache.

                +

                Accordingly, smaller is faster. Programs or routines with a small +enough working set to fit inside a fast cache will be as fast as +that cache is. To make your code fast, reduce the length of the +series of Python or JIT-compiler opcodes it generates by making +it simpler.

                +

                The tradeoff here is that algorithmic tuning often trades time for +space - that is, it increases the size of an algorithm's working set +by including pre-computations or tables or reverse maps in order to +avoid O(n**2) operations.

                +

                It's impossible to predict in advance where the sweet spot in that +tradeoff will be. You have to try different things and measure - +which takes us right back to "Measure, don't guess". And another +function of your regression test suite can be as a speed benchmark.

                +

                +

                Micro-tuning tips

                +

                These are in no particular order.

                +

                Keep it simple

                +

                Simple is better than complex. The PyPy JIT is not very smart; the +simpler your code is the better it will run. Here again, though, you face +a tradeoff: you may need to pay with more algorithmic complexity in order +to avoid brute-force operations that are O(n**2) or worse.

                +

                Write plain-vanilla code in plain-vanilla ways. The PyPy JIT has many +productions that optimize a common usage pattern against an uncommon +usage pattern.

                +

                Global variables

                +

                In CPython, global variables and functions (including package imports) +are much more expensive to reference than locals; avoid them. (This +is also good modularity practice).

                +

                The cost of CPython global references is high enough that, for example, if you +have code in a frequently-visited inner loop that uses int() a lot, it +may be worthwhile to create a local copy of the reference with "int = +int" in an enclosing block.

                +

                However, this is not true in JITted PyPy code. The "int = int" hack +won't buy you performance, it's just an extra copy. The modularity +reason for avoiding globals is still valid.

                +

                Regular expressions

                +

                Regular-expression compilation is expensive. If the regexp pattern in +a search, match, or replace operation is static (doesn't mutate at +runtime) refactor so it's only done once.

                +

                If the regexp compilation is in a class method, consider doing it as +the initializer of a regexp-valued static (shared) class member and +using that class member in your operation.

                +

                If the regexp compilation is in a free function, consider moving it +to module level and referencing the resulting regexp object +(but see the warning above about global variables).

                +

                Old- vs. new-style classes

                +

                New-style classes allow faster attribute access and take up less core +per instance than old-style classes. Much of this advantage may be +lost, however, if attribute names are not constant. For example: x.a += y or even setattr(x, 'a', y) will be much faster than a dynamic +version: setattr(x, 'a' + some_variable, y).

                +

                Classes that inherit from both new- and old-style classes are +extremely slow; avoid at all costs.

                +

                In PyPy, isinstance() called against an old-style class was very slow +until 2.0.

                +

                String concatenation is expensive

                +

                In CPython, you may want to replace:

                +
                s = head + body + maybe + tail
                +
                +

                with the admittedly less readable:

                +
                s = "%(head)s%(body)s%(maybe)s%(tail)s" % locals()
                +
                +

                or even:

                +
                s = "{head}{body}{maybe}{tail}".format(**locals())
                +
                +

                Both of the latter forms avoid multiple-allocation overhead. +But PyPy's JIT makes the overhead of intermediate concatenations +go away in linear code that keeps the number of concatenations +small, bound and constant. (And locals() is rather slow +with PyPy's JIT.)

                +

                On the other hand, in code like this with a string-valued foo() function:

                +
                for x in mylist:
                +    s += foo(x)
                +
                +

                the JIT cannot optimize out intermediate copies. This code is +actually quadratic in the total size of the mylist strings due to +repeated string copies of ever-larger prefix segments. (Such code +is always fine for bytearrays, because in this case += is an +in-place operation.)

                +

                This:

                +
                parts = []
                +for x in mylist:
                +    parts.append(foo(x))
                +s = "".join(parts)
                +
                +

                can be much faster because all the string concatenation in the last +line creates exactly one new string object with one C-level copy +sequence (and list operations are relatively cheap).

                +

                Frame introspection and tracing are slow

                +

                Certain function calls can disable PyPy's speed options over +stretches of surrounding code called "JIT scopes".

                +

                A JIT like PyPy's works based on the assumption that the only things +worth optimizing are loops that are executed often. Whenever the +interpreter enters a loop in the interpreted program, the JIT records +what the interpreter does, creating a trace. This trace is optimized, +compiled to machine code and executed when the loop is hit with the +conditions observed during tracing. This trace is one kind of JIT scope.

                +

                Another kind of JIT scope that matters is a function, considered as +a unit for inlining.

                +

                Note that a JIT scope is a run-time phenomenon, not a compile-time +one. It's not confined by source-code module boundaries. A library- +or foreign-module call in a frequently-called loop or inlined function +will be part of its JIT scope.

                +

                locals(), globals(), sys._getframe(), sys.exc_info(), and sys.settrace +work in PyPy, but they incur a performance penalty that can be huge by +disabling the JIT over the enclosing JIT scope.

                +

                (Thanks Eric S. Raymond for the text above)

                +

                Insider's point of view

                +

                This section describes performance issues from the point of view of +insiders of the project; it should be particularly interesting if you +plan to contribute in that area.

                +

                One of the goals of the PyPy project is to provide a fast and compliant +python interpreter. Some of the ways we achieve this are by providing a +high-performance garbage collector (GC) and a high-performance +Just-in-Time compiler (JIT). Results of comparing PyPy and CPython can +be found on the speed website. Those benchmarks are not a random +collection: they are a combination of real-world Python programs --- +benchmarks originally included with the (now dead) Unladen Swallow +project --- and benchmarks for which we found PyPy to be slow (and improved). +Consult the descriptions of each for details.

                +

                The JIT, however, is not a magic bullet. There are several characteristics +that might surprise people who are not used to JITs in +general or to the PyPy JIT in particular. The JIT is generally good at +speeding up straight-forward Python code that spends a lot of time in the +bytecode dispatch loop, i.e., running actual Python code --- as opposed +to running things that only are invoked by Python code. Good +examples include numeric calculations or any kind of heavily +object-oriented program. Bad examples include doing computations with +large longs --- which is performed by unoptimizable support code. When the +JIT cannot help, PyPy is generally slower than CPython.

                +

                More specifically, the JIT is known not to work on:

                +
                  +
                • Tests: The ideal unit tests execute each piece of tested code +once. This leaves no time for the JIT to warm up.

                • +
                • Really short-running scripts: A rule of thumb is if something runs below +0.2s the JIT has no chance, but it depends a lot on the program in question. +In general, make sure you warm up your program before running benchmarks, if +you're measuring something long-running like a server. The time required +to warm up the JIT varies; give it at least a couple of seconds. (PyPy's +JIT takes an especially long time to warm up.)

                • +
                • Long-running runtime functions: These are the functions provided +by the runtime of PyPy that do a significant amount of work. +PyPy's runtime is generally not as optimized as CPython's and we expect those +functions to take somewhere between the same time as CPython to twice as long. +This includes, for example, computing with longs, or sorting large lists. +A counterexample is regular expressions: although they take time, they +come with their own JIT.

                • +
                +

                Unrelated things that we know PyPy to be slow at (note that we're probably +working on it):

                +
                  +
                • CPython C extension modules: Any C extension module recompiled +with PyPy takes a very large hit in performance. PyPy supports C +extension modules solely to provide basic functionality. +If the extension module is for speedup purposes only, then it +makes no sense to use it with PyPy at the moment. Instead, remove it +and use a native Python implementation, which also allows opportunities +for JIT optimization. If the extension module is +both performance-critical and an interface to some C library, then it +might be worthwhile to consider rewriting it as a pure Python version +that uses CFFI for the interface.

                • +
                • Missing RPython modules: A few modules of the standard library +(like csv and cPickle) are written in C in CPython, but written +natively in pure Python in PyPy. Sometimes the JIT is able to do a +good job on them, and sometimes not. In most cases (like csv and +cPickle), we're slower than CPython, with the notable exception of +json and heapq.

                • +
                • Abuse of itertools: The itertools module is often "abused" in the +sense that it is used for the wrong purposes. From our point of view, +itertools is great if you have iterations over millions of items, but +not for most other cases. It gives you 3 lines in functional style +that replace 10 lines of Python loops (longer but arguably much easier +to read). The pure Python version is generally not slower even on +CPython, and on PyPy it allows the JIT to work much better --- simple +Python code is fast. The same argument also applies to filter(), +reduce(), and to some extent map() (although the simple case +is JITted), and to all usages of the operator module we can think +of.

                • +
                • Ctypes: Ctypes is slower than on CPython. Consider CFFI or HPy +instead which have special paths inside the JIT.

                • +
                +

                We generally consider things that are slower on PyPy than CPython to be bugs +of PyPy. If you find some issue that is not documented here, +please report it to our bug tracker for investigation.

                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/10/first-post-8150793557471983289.html b/posts/2007/10/first-post-8150793557471983289.html new file mode 100644 index 000000000..ebdd812d7 --- /dev/null +++ b/posts/2007/10/first-post-8150793557471983289.html @@ -0,0 +1,330 @@ + + + + + +First Post | PyPy + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                First Post

                + + + +
                +

                Welcome to the PyPy status blog. After we got a lot of positive feedback about the blog coverage of our Squeak/PyPy sprint in Bern we decided that having a general PyPy blog sounds like a good idea. We will try to periodically post about what is going on in the PyPy project, cover sprints and other events where PyPyers are present. If you have any wishes about things we should write about, feel free to leave a comment.

                +
                +

                Comments

                +
                +
                +
                + + Martijn Faassen wrote on 2007-10-31 15:53: +
                +
                +

                You should write about PyPy's upcoming "Grand US" tour!

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2007-10-31 17:16: +
                +
                +

                Hi Martijn!

                I think that is the plan, yes. But let's see whether they will have time to write blog posts :-).

                +
                +
                +
                +
                + + Steven Kryskalla wrote on 2007-11-03 02:40: +
                +
                +

                Good to see you guys are getting more involved in promoting and showing off Pypy. I check the mailing list from time to time for interesting developments, but a blog is much easier to keep track of!

                As far as ideas for posts, maybe something like the old python-dev summaries? (posts every week or two summarizing the new mailing list posts)

                Release announcements, sprint announcements / reports, technical information, tutorials, etc. would all be good too.

                +
                +
                +
                +
                + + Sarah Kerrigan wrote on 2007-11-20 18:54: +
                +
                +

                Even though there is a lot of work down the road, I am genuinely interested in the progress of this project. I'm taking a compilers class at UCR as a CS student so I'm furthering my appreciation of well written compilers.

                We had a guest speaker the other day, Jens Palsberg, who created a subset of Java, miniJava, (the language we are writing our compilers for), talk about the future of compilers. He said that the future is in the ability to generate code suitable for multi-threading. With hardware slowing down and resorting to increasing the amount of cores on a die instead of making them faster, this makes sense. I also asked questions about just-in-time compilers and about the possibilities to improve performance beyond current compilers using runtime information.

                To see you guys work on attacking those problems using a high-level language like python shows to me that we are getting closer to reaching those goals.

                Keep up the good work. This blog is a great idea. I can't wait to use PyPy to speed up all my python based applications in an expedient and robust fashion.

                +
                +
                +
                +
                + + Sarah Kerrigan wrote on 2007-11-20 18:56: +
                +
                +

                Also you should let your comments be displayed on the page without linking.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/11/pypy-cleanup-sprint-startup-4429006224971155209.html b/posts/2007/11/pypy-cleanup-sprint-startup-4429006224971155209.html new file mode 100644 index 000000000..119c44295 --- /dev/null +++ b/posts/2007/11/pypy-cleanup-sprint-startup-4429006224971155209.html @@ -0,0 +1,288 @@ + + + + + +PyPy cleanup sprint startup | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy cleanup sprint startup

                + + + +
                +

                The following week we will have a sprint in Gothenburg to clean up the PyPy codebase and make it ready for future developments. So far, only a few people are here, the others will arrive this afternoon. + +The Älvsborgsbron in Gothenburg from the ferry I took to get there.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/11/pypy-google-tech-talk-9082134238390123890.html b/posts/2007/11/pypy-google-tech-talk-9082134238390123890.html new file mode 100644 index 000000000..8ffc17c9a --- /dev/null +++ b/posts/2007/11/pypy-google-tech-talk-9082134238390123890.html @@ -0,0 +1,298 @@ + + + + + +PyPy Google Tech Talk | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Google Tech Talk

                + + + +
                +

                The Google Tech Talk that Samuele, Armin, Jacob and Laura gave during the US trip is now on YouTube: https://www.youtube.com/watch?v=GnPmErtqPXk

                +
                +

                Comments

                +
                +
                +
                + + Eric wrote on 2007-11-30 09:27: +
                +
                +

                Absolutely fascinating! I have to admit that there were a few (ok, a lot) of times where I couldn't quite follow along, but you guys are doing some absolutely amazing work.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/11/pypy-road-show-1-new-york-and-ibm-7837076523877011699.html b/posts/2007/11/pypy-road-show-1-new-york-and-ibm-7837076523877011699.html new file mode 100644 index 000000000..3341beaec --- /dev/null +++ b/posts/2007/11/pypy-road-show-1-new-york-and-ibm-7837076523877011699.html @@ -0,0 +1,298 @@ + + + + + +The PyPy Road Show (1): New York and IBM | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                The PyPy Road Show (1): New York and IBM

                + + + +
                +

                We're slowly getting adjusted to the jet-lag (except maybe Samuele). Time to blog...

                The past two days at IBM, in New York, have been quite interesting. The place is a research center. Feels University-like, but meeting rooms have no windows and climatization fixed on "polar" settings. The building is of course heated at this time of the year, and then the meeting rooms are climatized... I guess that just doesn't make sense to me.

                We gave a 1h30 talk to a general audience first. Then we had a compact schedule of meetings with various people or groups of people. In the early preparations for this trip we planned to stay only one day, but Martin Hirzel, our host, found too many people that wanted to talk with us :-)

                I think that both us and most of the people we talked with got interesting things out of the meetings. On our side, let me point a few highlights.

                We asked two people that worked on the GCs for the Jikes RVM if reusing them for RPython programs would make sense. They didn't scream "you're mad!", so I guess the answer is yes. Apparently, it has been done before, too. I'm still not sure I got this right, but it seems that Microsoft paid someone money to integrate them with Rotor... Then the real-time garbage-collection guys explained to us the things that we need to take care about when writing a VM: real-time GC needs not only write barriers and read barriers, but pointer-equality-comparison barriers... They have bad memories of trying to add a posteriori this kind of barrier into existing VMs, so it took us a bit of explaining to make them realize that adding new kinds of barriers is mostly trivial for us (I'm still not 100% sure they got it... bad memories can stick hard).

                Then we had discussions with JIT people. Mostly, this allowed us to confirm that Samuele has already got a good idea about what Java JITs like Hotspot can do, and in which kind of situation they work well. As expected, the most difficult bit for a PyPy-like JIT that would run on top of a JVM would be the promotion. We discussed approaches like first generating fall-back cases that include some instrumentation logic, and regenerating code with a few promoted values after some time if it seems like it will be a gain. Replacing a method with a new version is difficult to do in a way that is portable across Java VMs. There are still possible workarounds, but it also means that if we really want to explore this seriously, we should consider experimenting with specific VMs - e.g. the Jikes RVM gives (or could be adapted to give) hooks to replace methods with new versions of them, which is something that the JVM's own JIT internally does all the time.

                We showed the taint object space and the sandboxed PyPy to several groups of security people. I won't say much about it here, beyond the fact that they were generally interested by the fact that the corresponding code is very short and easy to play with. They are doing a lot on security in Java and... PHP, for web sites. Someone could write a PHP interpreter (!) in PyPy to get the same kind of results. But as Laura and Samuele put it, there are things in life you do for fun, and things you do for money :-)

                We're in Vancouver today and tomorrow. More about this later...

                Armin Rigo

                +
                +

                Comments

                +
                +
                +
                + + Miguel Filipe wrote on 2007-11-13 15:12: +
                +
                +

                Thast's amazing news.
                I always thought that the forest of groups wouring on VM technologies should work more closely.

                I sure am happy to know that PyPy is having input and talking to a bunch of ibm'ers who have worked or work on VM, JIT, GC technologies.

                Best regards,

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/11/pypy-road-show-5790414147905233059.html b/posts/2007/11/pypy-road-show-5790414147905233059.html new file mode 100644 index 000000000..3c507dd20 --- /dev/null +++ b/posts/2007/11/pypy-road-show-5790414147905233059.html @@ -0,0 +1,317 @@ + + + + + +The PyPy Road Show | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                The PyPy Road Show

                + + + +
                +

                Armin Rigo, Samuele Pedroni, Laura Creighton and Jacob Hallén are on a two-week-trip through the USA and Canada, to present PyPy to various companies and institutions. The next few blog entries will cover our experiences and adventures. + +Here is a glimpse of our schedule (all November 2007): +

                +
                  +
                • 4th: Chicago
                • +
                • 5th-6th: New York
                • +
                • 7th-8th: Vancouver
                • +
                • 9th-18th: San Francisco and the Bay Area +
                • +
                Notably, we meet with IBM Research in New York and give a Google Talk in the Bay Area. +
                +

                Comments

                +
                +
                +
                + + Arnar Birgisson wrote on 2007-11-14 11:08: +
                +
                +

                Hey there,

                Will they by any chance be stopping over in Iceland on their way back?

                cheers,
                Arnar

                +
                +
                +
                +
                + + Anonymous wrote on 2007-11-14 14:40: +
                +
                +

                Alas, we fly directly from SFO
                to Frankfurt, and then to
                Göteborg where we will immediately
                have a PyPy sprint. But we
                could come visit another day.
                Are you connected with CCP games?
                Or are there other people in Iceland who are interested in PyPy? I'd love to come to Iceland. I'll bet the PyPy team has other people who feel the same way. But let us take this off-line, ok?

                Laura

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/11/ropes-branch-merged-8782576892496878598.html b/posts/2007/11/ropes-branch-merged-8782576892496878598.html new file mode 100644 index 000000000..687e2c711 --- /dev/null +++ b/posts/2007/11/ropes-branch-merged-8782576892496878598.html @@ -0,0 +1,363 @@ + + + + + +Ropes branch merged | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Ropes branch merged

                + + + +
                +

                This afternoon we merged the ropes branch that I have been working on on the side for a while (also to cut down the number of currently active branches a bit, since we are doing major cleanups right now). It contained a new (optional) implementation of the unicode type using the rope data structure. Ropes essentially use concatenation trees to represent strings. The leaves of the trees contain either byte arrays or arrays of unicode characters. + + +Of course the fact that ropes are used is mostly completely transparent to the user (as usual in the pypy world :) ). Normal and unicode strings are implemented with them, but just from the behavior of these types the user has a hard time noticing. Of course there are significant changes in performance (in both directions). + +Using ropes to implement strings has some interesting effects. The most obvious one is that string concatenation, slicing and repetition is really fast (I suspect that it is amortized O(1), but haven't proved it). This is probably not helping most existing Python programs because people tend to code in such a way that these operations are not done too often. However, with ropes it is possible to do something like this: +

                +
                Python 2.4.1 (pypy 1.0.0 build 48942) on linux2
                +Type "help", "copyright", "credits" or "license" for more information.
                +>>>> import sys
                +>>>> a = "a" * sys.maxint
                +>>>> hash(a)
                +-768146060
                +
                + +So somebody who is targeting a Python implementation that has ropes could write his code in such a way that this is taken into account. Another interesting feature is that ropes try to share as much data as possible with each other, so if you create a large slice of a large string, the slice is not going to take much additional memory. + +One of the most interesting use-cases of ropes is in combination with unicode. The leaf nodes of a rope unicode string can be either a byte array or an array of unicode characters. This means that a unicode string that uses only characters that are latin-1 or ascii will use one byte of memory per character. If a unicode string contains mostly only unicode characters that are latin-1 and a few that are not, it will still use 1 byte for most of the latin-1 characters. This property also allows really fast encoding and decoding of unicode strings as long as they don't contain non-latin-1 characters (only with certain encodings of course): +
                >>>> s = "a" * sys.maxint
                +>>>> u = s.decode("ascii")
                +>>>> u = s.decode("latin-1")
                +>>>> u = s.decode("utf-8")
                +Encoding and decoding strings that contain a few non-latin-1 characters is again efficient: +
                >>>> u = "a" * 100000000 + u"\uffff"
                +>>>> s = u.encode("utf-8")
                +>>>> len(s)
                +100000003
                +I am not completely certain how useful this behaviour is for real-life applications, but it's kind of cool :-). It saves memory for european languages that contain few non-ascii characters. + +Of course there is at least one down-side to all of this, which is that string indexing is not O(1) any longer, because we have to walk down the tree to find the correct leaf where the character is actually in. I have not measured much, but I expect it to be quite fast in practice, because the trees are never deeper than 32 nodes. +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2007-11-23 18:54: +
                +
                +

                awesome.

                and what about pattern matching? for substring and regexps?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2007-11-23 22:59: +
                +
                +

                Substring matching should not be too slow, but there was no specific work on that. I think it only makes sense to optimize this once someone has a concrete application for that, because otherwise you don't know what you are optimizing for. So if anyone has ideas, I am interested to hear them.

                +
                +
                +
                +
                + + Bruce Hoult wrote on 2007-11-24 02:53: +
                +
                +

                Go and try this year's ICFP programming contest task (just the initial virtual machine part) using this.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2007-11-24 20:38: +
                +
                +

                Ah, nice idea. Somebody should try this.

                +
                +
                +
                +
                + + Titus Brown wrote on 2007-12-03 11:03: +
                +
                +

                I'm writing up a GHOP task related to this; let me know if anyone is interested in mentoring it.

                (I'm asking for a CPython implementation, and separately the ICFP implementation)

                --titus
                titus@idyll.org

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/11/sprint-discussions-releases-testing-1126468258904483211.html b/posts/2007/11/sprint-discussions-releases-testing-1126468258904483211.html new file mode 100644 index 000000000..4d9c5d95f --- /dev/null +++ b/posts/2007/11/sprint-discussions-releases-testing-1126468258904483211.html @@ -0,0 +1,323 @@ + + + + + +Sprint Discussions: Releases, Testing | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Sprint Discussions: Releases, Testing

                + + + +
                +

                During the sprint we had various discussions about technical issues as well as planning discussions about how we want to go about things. One of them was about the stability of PyPy, how to ensure stability, how to handle releases and approaches to being more "usable". I will describe this discussion in this post (there are also minutes of the meeting). + + + +The Meetings whiteboard + +

                +

                +

                Testing +

                First we discussed the current situation in terms of testing. PyPy has been extremely testing-oriented from the start, it is being developed almost exclusively in test-driven-development style. To deal with the large number of tests we already have some infrastructure in place:

                +

                As you can see, we are lacking in the Windows testing area, which is an even worse problem because none of the currently active developers has Windows as his primary OS. We should improve this by finding a Windows machine where the tests are run nightly and where we can log in to try bug-fixes quickly. The latter bit is important, we had a nightly windows test run before (thanks to Scott Dial) but it didn't help, because even if you tried to fix a bug you would have to wait until the next night to see whether it worked.

                Another very serious problem is that of aggregation: we have these various test runs that all have a web interface to check for errors but there is no easy way to find out which tests failed. You have to go to each page and even some sub-pages to see what needs fixing, which is a tedious process. The idea for solving this is to aggregate all the available information into some sort of testing-entry-point page that gives a quick overview of the regressions that happened during the night. It's not clear whether we can achieve that with existing tools (buildbots or whatever), but we will investigate that. +

                Releases +

                +

                The discussion about releases was more on a fundamental and less on a concrete level (especially when it comes to time-frames). We discussed what it means to make a release, because obviously it is more than just taking an SVN revision and putting a tarball of it onto the webpage. During the EU period we were required to make several releases, but those were not really meant to be more than technology previews for the brave adventurers to try. In the future we have the goal to release things that are more stable and hopefully more practically useful. The plan is to use medium-sized Python applications that have a chance to run on top of PyPy because they don't use too many extension modules (web apps being likely candidates) and that have good unit-tests themselves. The first step would be to find some applications that fit this description, fix the bugs that prevent PyPy from running them and from then on run them nightly on one of the testing machines to check for regressions. This would allow us to be more confident when stating that "PyPy works".

                Another thing to keep in mind for releases is the special features that our Python interpreter provides (e.g. the thunk and the taint object space, our stackless features, transparent proxies, sandboxing, special object implementations). Those features are neither tested by the CPython tests nor by any existing applications. Therefore we cannot really be confident that these features work and don't have too many bugs (in fact, the first time somebody really used the become feature of the thunk space in earnest he found a serious bug that is not fixed so far). To get around this problem, we plan to write small-to-medium sized example applications for each of these features (for stackless we can maybe use one of the existing stackless examples). This will hopefully find bugs and will also make it possible to evaluate whether the features make sense from a language design point of view.

                A minor thing to make releases easier is to be able to not only have the tests be run once a night but also be able to trigger them manually on the release branch before doing the release.

                +

                Publishing Cool Things +

                Since we decided that the releases we make should be stable and usable, we also discussed how we would go about making new "cool things" like features, experiments etc. better known. The consensus was that this blog is probably the best forum for doing this. In addition we discussed having a stabler snapshot of the trunk made to ensure that people wanting to play around with these features don't accidentally get +a broken version.

                +

                Helping Out +

                +

                Right now we are still in cleanup mode (the cleanup sprint is nearly done, but we haven't finished all the cleanups yet), so we won't be able to start on the above things right now. However, they will have a strong focus soon. So if you are interested in trying to run programs on top of PyPy or writing new ones that use the new features you are most welcome to do so and we will try to fix the bugs or help you do it (of course some tolerance against frustration is needed when you do that, because the bugs that turn up tend to be obscure). We have not been perfect at this in the past, but this will have to change.

                +
                +

                Comments

                +
                +
                +
                + + Bill Mill wrote on 2007-11-25 14:00: +
                +
                +

                Please do publish more about the cool things in pypy! I find that, for most languages, I get the right information level from blog announcements. Reading the mailing list is like drinking from a fire hose when I only want to stay informed of where you guys are at.

                (I post a lot on reddit too, and it's nicer to post blog articles than mailing list postings)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/11/sprint-discussions-wrapping-external-8731011170537270161.html b/posts/2007/11/sprint-discussions-wrapping-external-8731011170537270161.html new file mode 100644 index 000000000..59fab5659 --- /dev/null +++ b/posts/2007/11/sprint-discussions-wrapping-external-8731011170537270161.html @@ -0,0 +1,323 @@ + + + + + +Sprint Discussions: Wrapping External Libraries | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Sprint Discussions: Wrapping External Libraries

                + + + +
                +

                A more technical discussion during the sprint was about the next steps for the external module problem (minutes). One of PyPy's biggest problems in becoming more generally useful are C extension modules, which can't work with PyPy's Python interpreter. We already reimplemented many of the more commonly used extension modules in CPython's standard library in Python or RPython. However, there are more missing and there is no way to implement all the extension modules that other people have written.

                +

                +

                Whiteboard after the discussion. +

                +

                +

                +

                Therefore we need a different approach to this problem. Extension modules are commonly written for two different reasons, one being speed, the other being wrapping non-Python libraries. At the moment we want mostly to approach a solution for the latter problem, because we hope that the JIT will eventually make it possible to not have to write extension modules for speed reasons any more.

                There are two rough ideas to approach this problem in the near future (there are other, more long-term ideas that I am not describing now): One of them is to add the ctypes module to PyPy's Python interpreter, which would mean re-implementing it since the existing implementation is written in C.

                The other way would be to work on the existing way to get extensions in that PyPy provides, which are "mixed modules". Mixed modules are written in a combination of RPython and normal Python code. To then wrap C libraries you would use rffi, which is the foreign function interface of RPython.

                +

                The discussion round: Maciek Fijalkowski, Armin Rigo, Richard Emslie, Alexander Schremmer.

                Both approaches have problems: With ctypes you have no built-in way to query C header files for structure layouts and constants which requires you to hard-wire them, which is highly platform dependent. Mixed modules are not really fun to write, since they need to be RPython and we currently don't have a way to do separate compilation, so you always need to translate PyPy's whole Python interpreter to see whether your module is correct.

                In the meeting it was decided to first go for a ctypes replacement. The replacement would be written in pure Python, we already have a very thin wrapper around libffi which the new ctypes implementation would use. The goal to reach would be to get the pygame implementation in ctypes to run on PyPy.

                To make ctypes more useful in general to write this kind of wrappers, we will probably extract some code that we have already written for PyPy's own usage: it gives a way to write "imprecise" declarations ("a structure with at least fields called x and y which are of some kind of integer type") and turn them into exact ctypes declarations, internally using the C compiler to inspect the platform headers.

                After this is done we should approach separate compilation so that developing modules in RPython has a quicker turnaround time. This is somewhat involved to implement for technical reasons. There are ideas how to implement it quickly to make it usable for prototyping, but it's still a lot of work.

                +
                +

                Comments

                +
                +
                +
                + + Simon Burton wrote on 2007-11-25 18:37: +
                +
                +

                Is it not possible to test rpython extension modules for pypy on top of cpython ? (ie. without compilation)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2007-11-25 21:39: +
                +
                +

                Yop, sure it is. PyPy extension modules run through ctypes on top of CPython.

                +
                +
                +
                +
                + + Anonymous wrote on 2007-11-26 13:04: +
                +
                +

                I guess that Simon meant to ask why easier module testing requires separate compilation. The fact is that even if a module runs fine on top of CPython, there will be some RPython issues that are only visible when you try to translate it.

                Armin

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/11/sprint-pictures-3151912856495869652.html b/posts/2007/11/sprint-pictures-3151912856495869652.html new file mode 100644 index 000000000..783c9be1b --- /dev/null +++ b/posts/2007/11/sprint-pictures-3151912856495869652.html @@ -0,0 +1,304 @@ + + + + + +Sprint Pictures | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Sprint Pictures

                + + + +
                +

                The obligatory sprint picture post... + + + + +Alexander Schremmer, Armin Rigo, Maciek Fijalkowski, Antonio Cuni + +Anders Chrigström, Samuele Pedroni, Laura Creighton, Jacob Hallén, Carl Friedrich Bolz, Richard Emslie, Maciek Fijalkowski, Armin Rigo + +Holger Krekel + +Whiteboard with "real world goals" dependencies.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/11/unicode-support-in-rpython-in-recent-1444449848043047640.html b/posts/2007/11/unicode-support-in-rpython-in-recent-1444449848043047640.html new file mode 100644 index 000000000..c063700e8 --- /dev/null +++ b/posts/2007/11/unicode-support-in-rpython-in-recent-1444449848043047640.html @@ -0,0 +1,319 @@ + + + + + +Unicode support in RPython | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Unicode support in RPython

                + + + +
                +

                In the recent days we (Carl Friedrich, Anto and me) implemented native unicode support for RPython. This means that now you can write u'xxxx' directly in your RPython program, as well as unicode(some_string_variable) and most of the unicode methods should work as well. The things that don't work, are operations that require the unicode database (such as .upper() and friends) and encodings (unicode(x, encoding) for example). Right now our python interpreter does not use this at all, but that's the next step. +

                +Cheers,
                +fijal

                +
                +

                Comments

                +
                +
                +
                + + Miguel Filipe wrote on 2007-11-13 15:29: +
                +
                +

                Hi there,
                It would be nice for the pypy site to mention this blog, or update the news section.
                I stumbled here from reading the ML.

                BTW: for when a new release?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2007-11-13 15:55: +
                +
                +

                Hi Miguel,

                the blog is still somewhat unofficial so I don't want to give it completely official status by linking it from the PyPy page. But I guess a news item makes sense.

                There are no release-plans, we will discuss it next week on the sprint.

                Cheers,

                Carl Friedrich

                +
                +
                +
                +
                + + Anonymous wrote on 2008-01-16 18:52: +
                +
                +

                The blog is now official and posted
                on the PyPy website. The plan is to
                use it as the main channel for
                updates on what is happening.

                Jacob

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/12/faster-implementation-of-classic-1021557618590043616.html b/posts/2007/12/faster-implementation-of-classic-1021557618590043616.html new file mode 100644 index 000000000..419e6eece --- /dev/null +++ b/posts/2007/12/faster-implementation-of-classic-1021557618590043616.html @@ -0,0 +1,336 @@ + + + + + +Faster implementation of classic classes merged | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Faster implementation of classic classes merged

                + + + +
                +

                Old-style classes have so far been a bit neglected by PyPy's Python interpreter. By default, PyPy makes all classes new-style and you have to use a command-line switch (--oldstyle) at startup or at translation time to change that default. Then you would get a pure-Python implementation of classic classes. This implementation was extremely slow (around 20 times slower than classic classes in CPython). In the past we had hoped that we could get away with mostly only supporting new-style classes, however it seems that real-world software relies on old-style classes quite a bit, so we decided to offer a better migration path. + +A while ago I therefore started a re-implementation of classic classes in RPython to speed them up. This work is now finished, the branch I worked on got merged today. Speed for the old-style class benchmarks was improved greatly and I found quite a number of bugs in the old implementation too. New-style classes are still a bit faster than old-style in PyPy though, and this is unlikely to change.

                +
                +

                Comments

                +
                +
                +
                + + Michael Foord wrote on 2007-12-14 18:29: +
                +
                +

                Hey guys - it's great to hear so much about PyPy progress. Keep up the good work (coding *and* blogging of course).

                Michael

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2007-12-14 18:38: +
                +
                +

                Hi Michael!

                It seems we are slowly getting into this blogging thing. Good to hear that somebody is actually reading that stuff too :-).

                Cheers,

                Carl Friedrich

                +
                +
                +
                +
                + + Anonymous wrote on 2007-12-15 01:05: +
                +
                +

                Of course it is being read, more please :)!

                +
                +
                +
                +
                + + Anonymous wrote on 2007-12-20 14:10: +
                +
                +

                Dear Carl and other PyPy developers,
                Thank you for all of your hard work in getting PyPy to its present impressive state.
                I really enjoy reading about your activities and accomplishments on this blog and on the PyPy irc logs.

                -gyro

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2007-12-20 14:23: +
                +
                +

                Hi gyro!

                Really impressive that you chew through all the IRC-logs: Even I find that a lot of work sometimes :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/12/faster-than-c-8057790636822502084.html b/posts/2007/12/faster-than-c-8057790636822502084.html new file mode 100644 index 000000000..a74de3ebf --- /dev/null +++ b/posts/2007/12/faster-than-c-8057790636822502084.html @@ -0,0 +1,334 @@ + + + + + +faster than c | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                faster than c

                + + + +
                + Of course being "faster than c" means being faster than light. What did you think it means? :-) +
                +

                Comments

                +
                +
                +
                + + teki wrote on 2007-12-06 01:28: +
                +
                +

                "faster than c" means you can write code in other languages which look like do the same as the c one, but the c one is slower

                +
                +
                +
                +
                + + Anonymous wrote on 2007-12-06 09:20: +
                +
                +

                Took me some time to get the joke. =)

                +
                +
                +
                +
                + + Fredrik Johansson wrote on 2007-12-06 16:48: +
                +
                +

                Dream on; only imaginary things can be faster than c.

                (Relativistic mass formula with square root of a negative number for those who don't get it.)

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2007-12-06 17:09: +
                +
                +

                Sorry, sorry. I think I spent too much time around physicists. Further hint: notice the lower-case c.

                +
                +
                +
                +
                + + Eduardo O. Padoan wrote on 2007-12-18 16:06: +
                +
                +

                Benchmarking is relative :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/12/german-slides-of-talk-at-python-user-3715884461725333051.html b/posts/2007/12/german-slides-of-talk-at-python-user-3715884461725333051.html new file mode 100644 index 000000000..6c0e33c97 --- /dev/null +++ b/posts/2007/12/german-slides-of-talk-at-python-user-3715884461725333051.html @@ -0,0 +1,286 @@ + + + + + +(German) Slides of Talk at Python User Group Munich Available | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                (German) Slides of Talk at Python User Group Munich Available

                + + + +
                +

                Georg Brandl has put up the slides of the PyPy talk he gave at the Python User Group Munich. The slides are in German.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/12/good-news-from-garbage-collection-front-2678138026363485439.html b/posts/2007/12/good-news-from-garbage-collection-front-2678138026363485439.html new file mode 100644 index 000000000..908aaee8b --- /dev/null +++ b/posts/2007/12/good-news-from-garbage-collection-front-2678138026363485439.html @@ -0,0 +1,286 @@ + + + + + +Good news from the garbage collection front | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Good news from the garbage collection front

                + + + +
                +

                It seems that we can do better! Armin fixed a bug in our generational garbage collector, which caused variable sized objects (e.g. arrays) to be allocated outside of the nursery. This resulted in 50% speedup on synthetic benchmarks and about 10-20% on real world ones. Doing some preliminary measures, it seems that we spend roughly 10% of the time in garbage collection, which is good (and there is still some room for improvements!)

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.html b/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.html new file mode 100644 index 000000000..a927fa9c6 --- /dev/null +++ b/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.html @@ -0,0 +1,342 @@ + + + + + +Profiling for fun with valgrind | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Profiling for fun with valgrind

                + + + +
                +

                Recently I've been doing a lot of profiling on the PyPy executables to find speed bottlenecks. Valgrind (the original page seems to be down) is an extremely nice tool for doing this. It has several built-in tools that give you different types of profiles. The callgrind mode provides you with a lot of information including relative call costs. The cachegrind tool gives you less information, but what it gives you (e.g. cache misses) is much more accurate. The obvious choice would be to have a way to combine the results of two profiling runs to have both. In the last days I wrote a script that does this. It's available at my user's svn and has a pretty intuitive command line interface. The combining calculations are not perfect yet; total costs of functions can still be a bit bogus (they can sum up to whatever) but at least the relative figures are good. This means that we can stop looking at two different types of graphs now. + +An awesome tool for analyzing the profile data is kcachegrind. + + + +Which also proves that my 12'' display is too small at least for some things :-). + + +Update: pygrind is available under the MIT license.

                +
                +

                Comments

                +
                +
                +
                + + José Fonseca wrote on 2007-12-14 13:11: +
                +
                +

                Nice!

                In what license are you releasing pygrind? I would like to integrate pygrind's code into Gprof2Dot (LGPL) to be able to generate nice-looking graphs from cachegrind output.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2007-12-14 13:18: +
                +
                +

                Indeed, good question, thanks. I've updated blog post (I think it satisfies you, if not you can have it under LGPL if you like).

                +
                +
                +
                +
                + + José Fonseca wrote on 2007-12-14 14:43: +
                +
                +

                Excellent. Thanks!

                +
                +
                +
                +
                + + Aaron Bentley wrote on 2007-12-14 17:57: +
                +
                +

                If you like kcachegrind, you might like using it for profiling python:
                https://ddaa.net/blog/python/lsprof-calltree

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/12/pypy-talk-at-python-user-group-munich-1952379593354367249.html b/posts/2007/12/pypy-talk-at-python-user-group-munich-1952379593354367249.html new file mode 100644 index 000000000..b480daa47 --- /dev/null +++ b/posts/2007/12/pypy-talk-at-python-user-group-munich-1952379593354367249.html @@ -0,0 +1,286 @@ + + + + + +PyPy Talk at the Python User Group Munich | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Talk at the Python User Group Munich

                + + + +
                +

                Tomorrow evening there will be an introductory talk about PyPy at the Python User Group Munich. The talk will be given by CPython and PyPy contributor Georg Brandl and will be in German.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/12/pypy-tasks-in-ghop-5130253260153218709.html b/posts/2007/12/pypy-tasks-in-ghop-5130253260153218709.html new file mode 100644 index 000000000..1efd81aa4 --- /dev/null +++ b/posts/2007/12/pypy-tasks-in-ghop-5130253260153218709.html @@ -0,0 +1,297 @@ + + + + + +PyPy tasks in GHOP | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy tasks in GHOP

                + + + +
                +

                In the latest bunch of tasks that Titus released on Friday for the Google Highly Open Participation Contest there are several that are related to PyPy. Some of them are about presenting PyPy to a technical audience: Task 187, Task 188, Task 189, Task 190. + +Then there are three about Ropes, which are all rather challenging: +

                + +In addition there is a task to use PyPy's sandboxing features to provide an interactive Python tutorial on a web page: Task 220. + +We're really looking forward to working together with some bright students! +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/12/pypy-winter-sports-sprint-from-12-19th-5592383212609773292.html b/posts/2007/12/pypy-winter-sports-sprint-from-12-19th-5592383212609773292.html new file mode 100644 index 000000000..19196d460 --- /dev/null +++ b/posts/2007/12/pypy-winter-sports-sprint-from-12-19th-5592383212609773292.html @@ -0,0 +1,307 @@ + + + + + +PyPy Winter Sports Sprint from 12-19th of January in Leysin, Switzerland | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Winter Sports Sprint from 12-19th of January in Leysin, Switzerland

                + + + +
                + + + + + + + +
                +

                The next PyPy sprint will be held in Leysin, Switzerland, for +the fifth time. The overall idea of the sprint is to continue +working on making PyPy ready for general use.

                +
                +

                The proposed topics are: ctypes, JIT, testing, LLVM. This is +a fully public sprint, so newcomers and other topics are +welcome. And like previous winters, the main side goal is to +have fun in winter sports :-) See the sprint announcement +for details.

                +
                  +
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2007/12/various-performance-improvements-7027210611565246190.html b/posts/2007/12/various-performance-improvements-7027210611565246190.html new file mode 100644 index 000000000..dd4200504 --- /dev/null +++ b/posts/2007/12/various-performance-improvements-7027210611565246190.html @@ -0,0 +1,300 @@ + + + + + +Various Performance Improvements | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Various Performance Improvements

                + + + +
                +

                A few days ago, Armin discovered Gnuplot. He wrote a script that turns the results of the nightly benchmark runs into plots (lower is always better, all the numbers of the microbenchmarks are "times slower than CPython"). The corresponding microbenchmarks can be found in the repository. Staring at the plots revealed a strange performance regression around the revision 45000. After some investigation Armin found that a mostly unrelated change had disabled our method cache, which caused the regression. This was fixed. + +In addition, Armin did a few other small tweaks in the interpreter main loop, making sure that small bytecodes are inlined into the main loop. This gave another few percent of performance increase. Together with the GC improvements two weeks ago this leads to the fastest non-JIT PyPy ever. Unfortunately "fastest" is not really very fast yet in absolute terms, with realistic apps being around 3-4 times slower than CPython. Especially calls (in all their variants) are quite slow, which is something we should look into.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-01-06 06:05: +
                +
                +

                I'm amazed by the progress you guys have made. 3 - 4 times slower than CPython is actually really good considering what it does!

                PyPy is one of the most interesting computer language projects on the net.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/01/buildbots-and-better-platform-support-6965497451398110731.html b/posts/2008/01/buildbots-and-better-platform-support-6965497451398110731.html new file mode 100644 index 000000000..6f9ea93b7 --- /dev/null +++ b/posts/2008/01/buildbots-and-better-platform-support-6965497451398110731.html @@ -0,0 +1,315 @@ + + + + + +Buildbots and Better Platform Support | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Buildbots and Better Platform Support

                + + + +
                +

                In the last days we improved platform-support of PyPy's Python interpreter. +Jean-Paul Calderone has been tirelessly working for some time now on setting up a +buildbot for translating and testing PyPy. So far the basic mechanisms are +working and the buildbot is running on various machines, including some that +Michael Schneider (bigdog) lets us use, one of them being a Windows machine, +the other one with a 64bit Linux (lots of thanks to those two, you are +awesome!).

                +

                What is still missing is a nice way to visualize the test results to quickly see +which tests have started failing on which platforms. There is a prototype +already, which still needs some tweaking.

                +

                The availability of these machines has triggered some much-needed bug-fixing in +PyPy to make our Python interpreter work better on Windows and on 64 bit Linux. +Maciek and Michael Schneider worked on this quite a bit last week, with the +result that PyPy supports many more extension modules now on Windows and 64 bit +Linux. Since we now have the buildbot the hope is that the support also won't +disappear soon :-).

                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2008-02-06 20:37: +
                +
                +

                Cool
                I just found your blog and now I am going to read it every day:)
                I love reading about the progress you guys are making on PyPy.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/01/crashing-other-peoples-compilers-4574453763254909150.html b/posts/2008/01/crashing-other-peoples-compilers-4574453763254909150.html new file mode 100644 index 000000000..ed7dd893c --- /dev/null +++ b/posts/2008/01/crashing-other-peoples-compilers-4574453763254909150.html @@ -0,0 +1,380 @@ + + + + + +Crashing Other People's Compilers | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Crashing Other People's Compilers

                + + + +
                +

                Over the years PyPy has (ab?)used various external software for different +purposes, and we've discovered bugs in nearly all of them, mostly by pushing them +to their limits. For example, many compilers are not happy with 200MB of +source in one file. The Microsoft C compiler has a limit of 65536 lines of code +per file and the CLI was raising "System.InvalidProgramException: Method +pypy.runtime.Constants:.cctor () is too complex.", where too complex probably +means "too long". Just for fun, today we collected all projects we could think of +in which we found bugs:

                +
                + +
                +

                So one could say that PyPy is really just the most expensive debugging tool +ever :-).

                +
                +

                Comments

                +
                +
                +
                + + Michael Hudson-Doyle wrote on 2008-01-19 10:53: +
                +
                +

                You know, the one piece of external software we most depend on is one we haven't found bugs in: gcc (at least, I can't remember any problems). That's pretty impressive.

                I think you can probably add gdb to the list though.

                +
                +
                +
                +
                + + Alok wrote on 2008-01-20 13:16: +
                +
                +

                Can you, maybe, give a few examples of what you did. Linking to items about them if you wrote about it.

                +
                +
                +
                +
                + + Unknown wrote on 2008-01-21 21:46: +
                +
                +

                I'd be interested in knowing which projects were the most receptive to the bug reports.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-01-21 22:24: +
                +
                +

                Hi Brett,

                I think the project most receptive to bug reports is LLVM, where bugs that we find are usually fixed within a small number of days. I think in general Open Source projects react quite well, as you would expect. A negative example is graphviz, which still segfaults despite us producing a patch which fixes the problem.

                Microsoft proves to be completely unapproachable, it seems you have to pay them if you want to report a bug (should be the other way round, of course :-)).

                +
                +
                +
                +
                + + Unknown wrote on 2008-01-21 23:19: +
                +
                +

                @Carl:

                Thanks for the info, Carl. I have been contemplating trying to rely on them for compiling Python for testing purposes, especially with clang coming along (although I am waiting for them to address a bug I found =). Good to know they are responsive.

                And yes, it is really unfortunate that Microsoft doesn't make reporting bugs easy, but I guess no one wants to deal with the number of reports they would most likely get. =)

                +
                +
                +
                +
                + + /SiD wrote on 2008-02-13 22:14: +
                +
                +

                Regarding Microsoft bug reports, there's Connect. And I've got some degree of success with it.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/01/finding-gc-roots-using-llvm-or-parsing-1980376164990001937.html b/posts/2008/01/finding-gc-roots-using-llvm-or-parsing-1980376164990001937.html new file mode 100644 index 000000000..a5f56c142 --- /dev/null +++ b/posts/2008/01/finding-gc-roots-using-llvm-or-parsing-1980376164990001937.html @@ -0,0 +1,359 @@ + + + + + +Finding GC roots: using LLVM or parsing assembler files from GCC | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Finding GC roots: using LLVM or parsing assembler files from GCC

                + + + +
                +

                PyPy contains a framework for writing custom Garbage Collectors, and a few simple GCs have been written in this framework. A common issue with all these GCs is how to find all the stack roots, i.e. all the pointers to live GC-managed objects currently stored in local variables, in all the callers of the current function. The current solution is to maintain a custom shadow stack of roots, where all functions push and pop copies of their local variables of type "GC pointer". Clearly this is an overhead. Can we remove it?

                + +

                LLVM has recently grown some support for this. By emitting markers in the LLVM source and with the help of a bit of custom C++ code, we can generate stack maps for the functions compiled by LLVM. Then, with 100% non-portable code in our framework GC's root finding algorithm, we can walk the machine stack and locate where in each stack frame LLVM stores the GC pointers. (Yes, I mean non-portable: LLVM offers no help for doing that. Maybe it will at some point, though I didn't manage to explain why this is an issue to people working on this in LLVM so far...). I've tried that approach in the llvmgcroot branch. Over the manually-managed shadow stack, this gives speed improvements which are, very roughly, on the order of 5%.

                + +

                Note that this prevents some optimizations in LLVM, because it forces it to allocate all local variables of type "GC pointer" in the stack; it cannot keep them in registers and it must assume that they can be changed more or less at any time (as moving GCs do). Can we do better?

                + +

                Actually, yes. We can even do better in the C backend, using a GCC hack. GCC has this nice extension: +

                +
                asm("bla", constrains);
                +This is meant to generate assembler instructions directly from C. Internally, GCC considers the whole asm() as a single regular instruction of its intermediate language; the constraints are expressed in the same way as the constraints for all the prebuilt intermediate language instructions. They express things like input and output operands of the instruction, whether they can live in memory or in registers, whether the whole instruction has side-effects, etc. The nice thing about asm() is that it doesn't kill any optimization whatsoever in GCC - it's your job to make sure that you use the correct constraints. + +

                So what I've tried in the asmgcroot branch is to use asm() as markers. In this branch, the C backend produces code like this after each function call, for each local variable containing a live GC pointer:

                + +
                asm("/* GCROOT %0 */" : "=g"(localvar) : "0"(localvar) : "memory");
                + +

                This causes GCC to emit the following line in the assembler file it generates:

                + +
                /* GCROOT register-or-memory-containing-localvar */
                + +

                I won't go into the details of the asm() line above - the constraints are just enough to make sure that GCC doesn't optimize too much, but don't prevent most optimizations from occurring. For example, the localvar can be in a register.

                + +

                The assembler will just ignore the line above; it is a comment. But what we can do is write our own tool parsing the assembler files. This tool locates the /* GCROOT */ comments and follows where the register or memory location in the comment comes from (to do this it must follow the control flow and data flow of the function). This allows it to build a stack map: for each call instruction it knows exactly which registers and frame stack locations contain a live GC pointer. The stack map is then emitted in an extra assembler file that we link with the rest. As with LLVM above, the stack map is then used at run-time by non-portable code written in our GC's stack root tracker.

                + +

                Yes, that's rather insane. But at least, we don't need to modify the assembler file - just read it. If GCC is too clever in its optimizations, the custom parser will get lost and complain cleanly; but I think that it is relatively safe in the sense that GCC optimizations should not be able to make the custom parser produce wrong results.

                + +

                The branch is not merged because it's probably too insane to merge (not to mention, it's probably not portable to non-GCC compilers, and it is completely platform-specific). Still, it gives good results, better than the pure LLVM approach - on the order of 10% to 25% speed-ups for pypy-c.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-01-11 21:18: +
                +
                +

                How does Objective-C 2.0 handle this same problem?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-01-12 09:04: +
                +
                +

                Obviously it depends on the compiler, but the basic idea is that the natural place to support this is in the compiler itself. For example, instead of parsing the assembler produced by GCC, it would probably be possible to extend GCC to cleanly generate stack maps. (This is basically what I tried to do with LLVM, which gives a plug-in API to do that.)

                After a bit of googling, GCC doesn't seem to support Objective-C 2.0 yet. Moreover, the current Objective-C run-time library simply uses the conservative Boehm collector.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-01-15 08:28: +
                +
                +

                ObjC 2 does not use Boehm. The collecting thread suspends other threads and conservatively scans their stacks. It picks up values in registers by querying the kernel for the suspended thread state. It depends heavily on Mach.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-01-16 17:39: +
                +
                +

                llvm-gcc fully supports inline asm, so you could use the same hack you use with GCC with your llvm backend.

                Also, you might be interested in https://llvm.org/PR1917, which proposes a method of identifying GC pointers that doesn't disable most optimizations.

                +
                +
                +
                +
                + + Barry Kelly wrote on 2010-04-08 21:10: +
                +
                +

                To Anonymous saying Objective C 2 not using Boehm: that may be true (I don't know the details), but the Boehm GC also suspends other threads, conservatively scans their stacks and picks up values in registers using the OS.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/01/improve-net-integration-2239651503641931440.html b/posts/2008/01/improve-net-integration-2239651503641931440.html new file mode 100644 index 000000000..3c79fc219 --- /dev/null +++ b/posts/2008/01/improve-net-integration-2239651503641931440.html @@ -0,0 +1,328 @@ + + + + + +Improve .NET Integration | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Improve .NET Integration

                + + + +
                +

                A while ago Amit Regmi, a student from Canada, started working on the +clr module improvements branch as a university project.

                +

                During the sprint Carl Friedrich, Paul and I worked more on it and +brought it to a mergeable state.

                +

                It adds a lot of new features to the clr module, which is the +module that allows integration between pypy-cli (aka PyPy.NET) and +the surrounding .NET environment:

                +
                +
                  +
                • full support to generic classes;
                • +
                • a new importer hook, allowing things like from System import +Math and so on;
                • +
                • .NET classes that implement IEnumerator are treated +as Python iterators; e.g. it is possible to iterate over them +with a for loop.
                • +
                +
                +

                This is an example of a pypy-cli session:

                +
                +>>>> from System import Math
                +>>>> Math.Abs(-42)
                +42
                +>>>> from System.Collections.Generic import List
                +>>>> mylist = List[int]()
                +>>>> mylist.Add(42)
                +>>>> mylist.Add(43)
                +>>>> mylist.Add("foo")
                +Traceback (most recent call last):
                +  File "<console>", line 1, in <interactive>
                +TypeError: No overloads for Add could match
                +>>>> mylist[0]
                +42
                +>>>> for item in mylist: print item
                +42
                +43
                +
                +

                This is still to be considered an alpha version; there are a few known +bugs and probably a lot of unknown ones :-), so don't expect it to +work on every occasion. Still, it's a considerable step towards the real +world :-).

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/01/leysin-winter-sport-sprint-started-5478612778498579467.html b/posts/2008/01/leysin-winter-sport-sprint-started-5478612778498579467.html new file mode 100644 index 000000000..37085be8c --- /dev/null +++ b/posts/2008/01/leysin-winter-sport-sprint-started-5478612778498579467.html @@ -0,0 +1,310 @@ + + + + + +Leysin Winter Sport Sprint Started | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Leysin Winter Sport Sprint Started

                + + + +
                + + +

                The Leysin sprint started yesterday morning in the usual location. The view is spectacular (see photo) and the weather mostly sunny. The following people are sprinting: +

                +
                  +
                • Maciej Fijalkowski
                • +
                • Armin Rigo
                • +
                • Toby Watson
                • +
                • Paul deGrandis
                • +
                • Antonio Cuni
                • +
                • Carl Friedrich Bolz
                • +
                So it is a rather small sprint.

                We started working on various features and performance improvements for the high level backends (JVM and .NET) and on implementing ctypes for PyPy. Later this week we plan to spend a few days on the JIT, because Anto and I both need to get into it for our respective university projects.

                +
                +

                Comments

                +
                +
                +
                + + ajaksu wrote on 2008-01-14 22:29: +
                +
                +

                For those curious about what is going on: SVN commits

                Great work, guys! :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/01/pypy-keyboard-heatmap-4950995633665492453.html b/posts/2008/01/pypy-keyboard-heatmap-4950995633665492453.html new file mode 100644 index 000000000..d45609bc3 --- /dev/null +++ b/posts/2008/01/pypy-keyboard-heatmap-4950995633665492453.html @@ -0,0 +1,295 @@ + + + + + +PyPy Keyboard Heatmap | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Keyboard Heatmap

                + + + +
                +

                Today I saw the keyboard heatmap generator on the Blended Technologies +blog. I threw all the PyPy code at it to see whether the heatmap looks any +different than normal Python code. It doesn't:

                + +

                So now the excuse "I can't contribute to PyPy because it needs all those special +PyPy-keys" isn't working anymore :-).

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/01/pypynet-goes-windows-forms-7031406830502864570.html b/posts/2008/01/pypynet-goes-windows-forms-7031406830502864570.html new file mode 100644 index 000000000..a9015609b --- /dev/null +++ b/posts/2008/01/pypynet-goes-windows-forms-7031406830502864570.html @@ -0,0 +1,314 @@ + + + + + +PyPy.NET goes Windows Forms | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy.NET goes Windows Forms

                + + + +
                + +

                After having spent the last few days on understanding PyPy's JIT, +today I went back to hacking the clr module. As a result, it is now +possible to import and use external assemblies from pypy-cli, +including Windows Forms.

                +

                Here is a screenshot of the result you get by typing the following at +the pypy-cli interactive prompt:

                +
                +>>>> import clr
                +>>>> clr.AddReferenceByPartialName("System.Windows.Forms")
                +>>>> clr.AddReferenceByPartialName("System.Drawing")
                +>>>> from System.Windows.Forms import Application, Form, Label
                +>>>> from System.Drawing import Point
                +>>>>
                +>>>> frm = Form()
                +>>>> frm.Text = "The first pypy-cli Windows Forms app ever"
                +>>>> lbl = Label()
                +>>>> lbl.Text = "Hello World!"
                +>>>> lbl.AutoSize = True
                +>>>> lbl.Location = Point(100, 100)
                +>>>> frm.Controls.Add(lbl)
                +>>>> Application.Run(frm)
                +
                +

                Unfortunately at the moment you can't do much more than this, because +we still lack support for delegates and so it's not possible to +handle events. Still, it's a step in the right direction :-).

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/01/rpython-can-be-faster-than-c-2559071147541131237.html b/posts/2008/01/rpython-can-be-faster-than-c-2559071147541131237.html new file mode 100644 index 000000000..abd7e7462 --- /dev/null +++ b/posts/2008/01/rpython-can-be-faster-than-c-2559071147541131237.html @@ -0,0 +1,453 @@ + + + + + +RPython can be faster than C | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                RPython can be faster than C

                + + + +
                +

                (yes, C as in language, not c as in speed of light). I looked recently at the great computer language shootout, for some benchmarks and to make some speed comparisons. I used this benchmark, modified it to be rpythonic-enough and compared speeds. The code is here (the only change from the Python version was to create a class instead of tuple, so actually this version is more OO). Also the benchmark is very likely flawed because it favours better GCs :).
                +So, here we go: + +

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Language:Time of run (for N=14):
                Python version running on Python 2.5.1, distribution25.5s
                Python version running on PyPy with generational GC45.5s
                Python with psyco20s
                RPython translated to C using PyPy's generational GC0.42s
                compiling the Haskell version with GHC 6.6.11.6s
                compiling the C version with gcc 4.1.2 -O3 -fomit-frame-pointer0.6s
                +

                + +Also worth noticing is that when using psyco with the original version (with tuples) it is very fast (2s).

                + +So, PyPy's Python interpreter is 80% slower than CPython on this (not too horrible), but RPython is 40% faster than gcc here. Cool. The result is mostly due to our GC, which also proves that manual memory-management can be slower than garbage collection in some situations. Please note that this result does not mean that RPython is meant for you. It requires a completely different mindset than the one used to program in Python. Don't say you weren't warned! :-) +
                +

                Comments

                +
                +
                +
                + + Jonathan Ellis wrote on 2008-01-21 16:02: +
                +
                +

                "It requires a completely different mindset than the one used to program in Python."

                Can you elaborate? "RPython for Python programmers" would be an excellent addition to the docs or this blog. :)

                +
                +
                +
                +
                + + Michael Foord wrote on 2008-01-21 16:14: +
                +
                +

                I agree with Jonathan. There are many Python programmers who would *love* to be able to write Python extensions with RPython.

                I know that this is already possible, but there are two issues:

                * Lack of documentation on programming with RPython (I realise that this is a moving target)
                * Last I heard, the refcounting implementation made RPython extensions inefficient

                If these two issues were resolved (or mostly resolved) then a lot more people might start using the PyPy toolchain.

                Asides from my growsing, it looks like PyPy is becoming more impressive by the day. Congratulations.

                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2008-01-21 16:20: +
                +
                +

                As of today you can't write CPython extensions in RPython.

                Why not ask for the great computer language shootout to include RPython as one of their benchmarking languages? This could be a good and free advertising for the pypy project.

                +
                +
                +
                +
                + + Silveira Neto wrote on 2008-01-21 16:55: +
                +
                +

                mmm
                0.42
                42
                The answer...

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-01-21 17:06: +
                +
                +

                Hi Michael,

                Leonardo is correct, the extension compiler was removed from SVN in November. We had many discussions about this step, but eventually it turned out to be necessary for many reasons. The extcompiler never was that useful in the first place because the produced extensions weren't fast (one of the reasons being the bad refcounting indeed).

                The other reasons were that the extcompiler was impossible to maintain and was actually preventing progress, because it kept code alive that we wanted to get rid of.

                So at the moment you cannot use PyPy any more to produce CPython extensions, only standalone programs.

                It's completely possible that the extcompiler will be reborn in the future, but at the moment our priorities are really to make PyPy a good Python and not do tons of things on the side.

                Cheers,

                Carl Friedrich

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-01-21 18:06: +
                +
                +

                I would also say nowadays it's already possible to write extension modules in RPython... but just for PyPy, not for CPython :-).

                Jokes apart, if someone is really interested in writing part of their application in RPython (despite our warnings :-)), targeting PyPy could be an interesting alternative, as long as you don't need external libraries and the speed gain is more than what you lose in other areas where PyPy is actually slower.

                +
                +
                +
                +
                + + Justin wrote on 2008-01-21 21:17: +
                +
                +

                I think a lot of people are interested in using RPython for performance reasons. But about nobody will leave CPython atm, because extension modules are not working.

                At the moment, I wouldn't leave CPython since all I am doing is heavily based on scipy. And so my only option is (a) to wait PyPy being able to compile extensions for CPython or (b) PyPy making use of CPython extensions.

                As long as this is not going to happen, I probably will not use RPython for serious projects. :/

                +
                +
                +
                +
                + + Isaac Gouy wrote on 2008-01-25 17:11: +
                +
                + "Also the benchmark is very likely flawed because it favours better GCs :)"

                Why would that be a flaw? Note: this is an adaptation of a benchmark for testing GC


                Leonardo Santagada said "Why not ask for the great computer language shootout to include RPython ..."

                FAQ Why don't you include language X? +
                +
                +
                +
                + + Unknown wrote on 2008-01-25 21:03: +
                +
                +

                Once the RPython was translated to C by PyPy how did you compile the C?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-01-25 21:31: +
                +
                +

                > Why would that be a flaw? Note: this is an adaptation of a benchmark for testing GC

                I know, but I realised it after posting :) (We even have the original somewhere around to compare GCs). Also, honestly, a lot of the Python versions rely on libraries written in C, hence it took me a while to find one that is "pure enough".

                > Once the RPython was translated to C by PyPy how did you compile the C?

                With the very same options as bare gcc. -O3 -fomit-frame-pointer

                +
                +
                +
                +
                + + Anonymous wrote on 2008-01-28 20:18: +
                +
                +

                Did you try any of the other computer language shootout benchmarks?

                +
                +
                +
                +
                + + _ wrote on 2008-02-14 03:14: +
                +
                +

                More objects != OO.

                Perhaps you meant to say that it more closely reflects the domain?

                No, I don't know how I ended up on this blog post.

                +
                +
                +
                +
                + + Ariel Balter wrote on 2009-10-15 04:25: +
                +
                +

                How does RPython compare to Python Shedskin?

                +
                +
                +
                +
                + + Patric Dexheimer wrote on 2010-09-01 15:12: +
                +
                +

                "Can you elaborate? "RPython for Python programmers" would be an excellent addition to the docs or this blog. :)"

                +1 on this.

                Greetings from Brazil!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/01/visualizing-python-tokenizer-5020282079473796926.html b/posts/2008/01/visualizing-python-tokenizer-5020282079473796926.html new file mode 100644 index 000000000..4c1b72647 --- /dev/null +++ b/posts/2008/01/visualizing-python-tokenizer-5020282079473796926.html @@ -0,0 +1,395 @@ + + + + + +Visualizing a Python tokenizer | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Visualizing a Python tokenizer

                + + + +
                +

                Armin and I have been working on PyPy's parser and bytecode compiler for the Python language over the last few days. Armin implemented several bytecode optimizations that CPython has had for a while, whereas I tried to refactor our tokenizer and parser (because our existing parser is rather slow and also not very nice code). Armin is mostly done, whereas the new parser is not very far along yet.

                What is done, however, is the Python tokenizer. It is implemented in the usual way, by using a set of regular expressions to generate a deterministic finite automaton (DFA). This automaton is then turned into a big function which does the actual tokenization. Of course the picture is not quite as simple for Python, because it is not possible to tokenize Python using only regular expressions. To generate the proper "indent" and "dedent" tokens it would be necessary to keep state (the previous indentation levels) which a DFA cannot do. This is solved by postprocessing the tokens that the tokenizer produces to turn whitespace tokens into the proper indent and dedent tokens.

                +

                For debugging purposes I implemented a visualization tool for DFAs using PyPy's pygame-based graph viewer. The graph viewer is able to visualize interactively any graph given in the graph-description language of Graphviz. Looking at the tokenizing DFA for Python is rather instructive, both for understanding how tokenizing works and (maybe) for understanding the Python language. To try it, download the dot file of the DFA and run from a pypy checkout:

                +

                +
                $ python pypy/bin/dotviewer.py tokenizer.dot
                +

                The following is a screenshot of the graphviewer: + +

                For people who don't want to check out PyPy I generated a (rather big) png for the DFA.

                +

                Next thing I would like to do (apart from actually finishing the parser, of course :-) ) is visualize the Python grammar itself using syntax diagrams or something similar. So far I couldn't really find a program to do that, though.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-01-08 16:11: +
                + +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-01-08 17:52: +
                +
                +

                Hi Robin,

                Yes, this is sort of cool already, but it doesn't really give you as much information as the grammar itself. It's more of an overview-like thing.

                Cheers,

                Carl Friedrich

                +
                +
                +
                +
                + + Anonymous wrote on 2008-01-08 18:05: +
                +
                +

                Check out ANTLRWorks or ANTLRStudio for superb visualization of grammars.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-01-09 13:57: +
                +
                +

                Yeah, ANTLRWorks seems pretty nice in that respect. Thanks for the hint.

                +
                +
                +
                +
                + + Unknown wrote on 2008-01-18 13:52: +
                +
                +

                Hello!

                I was considering to use

                https://www.antlr.org/wiki/display/ANTLR3/Antlr3PythonTarget

                for a toy language project for uni (Computer Engineering, IT) because I guess ANTLR(works) and its mailing list could help me a lot in understanding the very basics of 'grammar design'...

                Or at least I hope so! :-)

                However, I've also been lurking the PyPy ml for quite a while now and was considering the possibility to implement the whole (*toy*) interpreter in RPython, so to understand a bit more of PyPy's design by actually coding something simple which makes some use of it. :-)

                So, would you consider trying to port the current ANTLR's Python runtime to RPython a good way for me to start doing something with PyPy?

                Would you consider the thing interesting? I know this possibility had been discussed on IRC some times ago and it wasn't thought to be that useful at last, but maybe you discussed the thing some more since then and changed idea, I don't know...

                How would you rate the difficulty of such a task for a PyPy and ANTLR newbie, also? :-)

                I wouldn't try doing that right now, anyway, but maybe in March I should manage to get some spare time for it. In the meantime, I'd try to read as many docs and sources as possible...

                Tell me your opinion, please! :-)

                Cheers,
                Matteo

                PS: I'm writing here because you were discussing PyPy's Python lexer and somebody wrote about ANTLRworks, but if you think I'd better send this message to the list please just tell me and I'll do so! :-)

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-01-18 16:32: +
                +
                +

                Hi Matteo,

                I would post to the pypy-dev mailing list instead, it is better to discuss such longish things there. Thanks for your interest!

                Cheers,

                Carl Friedrich

                +
                +
                +
                +
                + + Unknown wrote on 2008-05-19 14:45: +
                +
                +

                Does it use Graphviz's dot utility for rendering?

                +
                +
                +
                +
                + + Konrad wrote on 2008-06-04 18:36: +
                +
                +

                @techtonik

                Yes, it does.

                +
                +
                +
                +
                + + Unknown wrote on 2008-06-04 18:47: +
                +
                +

                That's bad it cann't be used as a standalone product without the need to install Graphvis.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-22 02:22: +
                + +
                +
                +
                + + wow power leveling wrote on 2009-04-22 02:27: +
                + +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/02/pypy-presence-on-various-conferences-in-6584680808789191759.html b/posts/2008/02/pypy-presence-on-various-conferences-in-6584680808789191759.html new file mode 100644 index 000000000..af90a36da --- /dev/null +++ b/posts/2008/02/pypy-presence-on-various-conferences-in-6584680808789191759.html @@ -0,0 +1,332 @@ + + + + + +PyPy presence on various conferences in the near future | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy presence on various conferences in the near future

                + + + +
                +

                Hello! I will have the pleasure of presenting PyPy on various conferences in the near future. They're (in chronological order): + +

                +
                  +
                • Studencki Festiwal Informatyczny in Krakow, POLAND 6-8 March 2008. I think this might be only interesting for Polish people (website, in Polish)
                • + +
                • Pycon Chicago, IL, USA. 14-17 March 2008. There should be also a PyPy sprint afterwards, including a newbie-friendly tutorial, everybody is welcome to join us! (Provided that I'll get the US visa, which seems to be a non-trivial issue for a Polish citizen)
                • +
                • RuPy, Poznan, POLAND 13-14 April 2008 (website). This is a small, but very friendly Ruby and Python conference. Last year was amazing, I can strongly recommend going there (Poznan is only 2h by train from Berlin and also has its own airport).
                • +
                + +Hope to see you at those places!

                + +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + + Michael Foord wrote on 2008-02-12 14:04: +
                +
                +

                Hey - I'll be at both the Polish conferences talking about IronPython. I hope you will be talking in English!

                Look forward to meeting up with you.

                Michael Foord

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-02-12 14:56: +
                +
                +

                Cheers Michael. Looking forward to see you!

                At rupy definitely. At sfi it depends on them (I'll try to, also that I have noone to help me with slides in polish :)

                +
                +
                +
                +
                + + Konrad wrote on 2008-02-15 23:52: +
                +
                +

                Hey Fijal.

                I think the Academic IT Festival in Cracow would be interesting not only for polish people. Large part of the talks will be given in English.

                Here's the link to the english version of the festival website: https://www.sfi.org.pl/news

                Konrad Delong, SFI :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/02/python-finalizers-semantics-part-1-1196956834543115766.html b/posts/2008/02/python-finalizers-semantics-part-1-1196956834543115766.html new file mode 100644 index 000000000..d9470a1fd --- /dev/null +++ b/posts/2008/02/python-finalizers-semantics-part-1-1196956834543115766.html @@ -0,0 +1,355 @@ + + + + + +Python Finalizers Semantics, Part 1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Python Finalizers Semantics, Part 1

                + + + +
                +

                Python's garbage collection semantics is very much historically grown and +implementation-driven. Samuele Pedroni therefore likes to call it the "'there +is no such thing as too much chocolate'-approach to GC semantics" :-). In this +two-part post series I am going to talk about the semantics of finalization +(__del__ methods) in CPython and PyPy.

                +

                The current behaviour is mostly all a consequence of the fact that CPython uses +reference counting for garbage collection. The first consequence is that if +several objects die at the same time, their finalizers are called in a +so-called topological order, which is a feature that some GCs have that +CPython offers by chance. This ensures that, in a __del__ method, none of the +attributes of the object have had their __del__ called yet. A simple +example:

                +
                +class B(object):
                +    def __init__(self, logfile):
                +        self.logfile = logfile
                +    def __del__(self):
                +        self.logfile.write("done doing stuff")
                +b = B(file("logfile.txt", "w"))
                +
                +

                If the instance of B dies now, both it and the logfile are dead. They will +get their __del__s called and it's important that the file's __del__ +gets called second, because otherwise the __del__ of B would try to +write to a closed file.

                +

                The correct ordering happens completely automatically if you use reference +counting: Setting b to None will decref the old value of b. This reduces +the reference count of this instance to 0, so the finalizer will be called. +After the __del__ has finished, this object will be freed and all the +objects it points to decrefed as well, which decreases the reference count of +the file to 0 and calls its __del__ as well, which closes the file.

                +

                The behaviour of PyPy's semispace and generational GCs wasn't very nice so far: +it just called the finalizers in an essentially random order. Last week Armin +came up with a somewhat complicated algorithm that solves this by emulating +CPython's finalization order, which we subsequently implemented. So PyPy does +what you expect now! The Boehm GC does a topological ordering by default, so it +wasn't a problem there.

                +

                A small twist on the above is when +there is a cycle of objects involving finalizers: +In this case a topological ordering is not possible, so that CPython refuses to +guess the finalization order and puts such cycles into gc.garbage. This +would be very hard for PyPy to do, since our GC implementation is essentially +independent from the Python interpreter. The same GCs work for our other +interpreters after all too. Therefore we decided to break such a cycle at an +arbitrary place, which doesn't sound too insane. The insane thing is for +a Python program to create a cycle of objects with finalizers and depend +on the order in which the finalizers are called. Don't do that :-) (After +all, CPython wouldn't even call the finalizers in this case.)

                +
                +

                Comments

                +
                +
                +
                + + SamB wrote on 2015-03-15 05:46: +
                +
                +

                The link to the "somewhat complicated algorithm" is a bit broken, but you can still get to it at the web archive.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2015-03-30 08:07: +
                +
                +

                Thanks, link updated.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/02/python-finalizers-semantics-part-2-2748812428675325525.html b/posts/2008/02/python-finalizers-semantics-part-2-2748812428675325525.html new file mode 100644 index 000000000..269303a22 --- /dev/null +++ b/posts/2008/02/python-finalizers-semantics-part-2-2748812428675325525.html @@ -0,0 +1,323 @@ + + + + + +Python Finalizers Semantics, Part 2: Resurrection | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Python Finalizers Semantics, Part 2: Resurrection

                + + + +
                +

                Continuing the last blog post about GC semantics in Python.

                +

                Another consequence of reference counting is that resurrection is easy to +detect. A dead object can resurrect itself if its finalizer stores it into a +globally reachable position, like this:

                +
                +class C(object):
                +    def __init__(self, num):
                +        self.num = num
                +    def __del__(self):
                +        global c
                +        if c is None:
                +            c = self
                +c = C(1)
                +while c is not None:
                +    c = None
                +    print "again"
                +
                +

                This is an infinite loop in CPython: Every time c is set to None in the +loop, the __del__ method resets it to the C instance again (note that +this is terribly bad programming style, of course. In case anybody was wondering +:-)). CPython can detect resurrection by checking whether the reference count +after the call to __del__ has gotten bigger.

                +

                There exist even worse examples of perpetual resurrection in particular in +combination with the cycle GC. If you want to see a particularly horrible one, +see this discussion started by Armin Rigo. In the ensuing thread Tim Peters +proposes to follow Java's example and call the finalizer of every object at most +once.

                +

                In PyPy the resurrection problem is slightly more complex, since we have GCs +that run collection from time to time and don't really get to know at which +precise time an object dies. If the GC discovers during a collection that an +object is dead, it will call the finalizer after the collection is finished. If +the object is then dead at the next collection, the GC does not know whether +the object was resurrected by the finalizer and then died in the meantime or +whether it was not resurrected. Therefore it seemed sanest to follow Tim's +solution and to never call the finalizer of an object a second time, which has +many other benefits as well.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/02/running-pyglet-on-pypy-3191536711417589549.html b/posts/2008/02/running-pyglet-on-pypy-3191536711417589549.html new file mode 100644 index 000000000..769cfabbc --- /dev/null +++ b/posts/2008/02/running-pyglet-on-pypy-3191536711417589549.html @@ -0,0 +1,383 @@ + + + + + +Running Pyglet on PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Running Pyglet on PyPy

                + + + +
                +

                As part of our efforts of making PyPy's Python interpreter usable we put quite some effort into interfacing with external libraries. We were able, in quite a short amount of time (I think beginning really from Leysin sprint, or slightly earlier) to provide a prototype of the ctypes library. It is written in completely normal Python, at applevel, based on a very thin wrapper around the libffi library. This makes development a lot easier, but it makes the resulting ctypes implementation rather slow. The implementation is not complete yet and it will still need quite some effort to make it feature-complete (ctypes has lots of details and special cases and +do-what-I-mean magic). Yet another point will be to make it faster, but that's for much later. +
                +The implementation is good enough to run those parts of Pyglet that don't depend on PIL (which PyPy doesn't have). Here are a few pictures of running Pyglet demos on top of compiled pypy-c. + + + +To compile a version of PyPy that supports ctypes, use this highly sophisticated command line + +
                ./translate.py --gc=generation ./targetpypystandalone.py --allworkingmodules --withmod-_rawffi +
                +Note: this works on linux only right now. +
                +The list of missing small ctypes features is quite extensive, but I consider the current implementation to be usable for most common cases. I would love to hear about libraries written in pure Python (using ctypes), to run them on top of PyPy and use them as test cases. If someone knows such a library, please provide a link.

                +
                +

                Comments

                +
                +
                +
                + + Richard Jones wrote on 2008-02-21 01:17: +
                +
                +

                This is very cool news indeed! The second screenshot seems to show a strange artefact though on the inside of the torus. Is that running the unmodified examples/opengl.py code, or has the example been modified to display a second torus? It should also be noted that pyglet is perfectly usable without PIL (as long as you have libgdk installed under Linux which almost everyone will do).

                +
                +
                +
                +
                + + Richard Jones wrote on 2008-02-21 03:16: +
                +
                +

                ps. it's "pyglet" with a little "p" :)

                +
                +
                +
                +
                + + René Dudfield wrote on 2008-02-21 04:04: +
                +
                +

                Very cool :)

                Here's two more ctypes things... which you probably know about already, but eh :)

                https://pyopengl.sf.net/
                https://www.pygame.org/ctypes/

                cu,

                +
                +
                +
                +
                + + Anonymous wrote on 2008-02-21 08:52: +
                +
                +

                Congratulations from me!

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-02-21 09:07: +
                +
                +

                hi illume,

                yep, we know about those, thank you anyway :-).

                Cheers,

                Carl Friedrich

                +
                +
                +
                +
                + + Anonymous wrote on 2008-02-21 13:53: +
                +
                +

                Very exciting to see my humble 'triangles' pyglet demo being used in the blue colored screenshot. If anyone's interested, the code for that is here: https://tartley.com/?p=264

                I should remember to put an explicit CC license on my whole site.

                +
                +
                +
                +
                + + Justin wrote on 2008-02-29 08:58: +
                +
                +

                https://utidylib.berlios.de/

                needs only ctypes, iirc.

                +
                +
                +
                +
                + + Gerhard Häring wrote on 2008-03-05 16:07: +
                +
                +

                Please try the ctypes-based pysqlite reimplementation at https://hg.ghaering.de/pysqlite3/

                It's meant to become the "sqlite3" module for PyPy.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-03-05 21:15: +
                +
                +

                Hi Gerhard,

                yip, we know about this sqlite implementation, thank you! We are already using it for our tests (and most of its tests already work).

                Cheers,

                Carl Friedrich

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/03/as-fast-as-cpython-for-carefully-taken-1984440931984637179.html b/posts/2008/03/as-fast-as-cpython-for-carefully-taken-1984440931984637179.html new file mode 100644 index 000000000..5b2227ab8 --- /dev/null +++ b/posts/2008/03/as-fast-as-cpython-for-carefully-taken-1984440931984637179.html @@ -0,0 +1,383 @@ + + + + + +As fast as CPython (for carefully taken benchmarks) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                As fast as CPython (for carefully taken benchmarks)

                + + + +
                +

                Good news everyone. A tuned PyPy compiled to C is nowadays as fast as CPython on the richards benchmark and slightly faster on the gcbench benchmark. +
                IMPORTANT: These are very carefully taken benchmarks where we expect pypy to be fast! PyPy is still quite a bit slower than CPython on other benchmarks and on real-world applications (but we're working on it). The point of this post is just that for the first time (not counting JIT experiments) we are faster than CPython on *one* example :-) +
                +The exact times as measured on my notebook (which is a Core Duo machine) are here: +
                +Compiled pypy with options: +
                +./translate.py --gcrootfinder=asmgcc --gc=generation targetpypystandalone.py --allworkingmodules --withmod-_rawffi --faassen + +(allworkingmodules and withmod-_rawffi are very likely irrelevant to those benchmarks) +
                +CPython version 2.5.1, release. +

                +
                  +
                • richards 800ms pypy-c vs 809ms cpython (1% difference)
                • +
                • gcbench 53700ms pypy-c vs 60215ms cpython (11% difference)
                • +
                +PyPy shines on gcbench, which is mostly just about allocating and freeing many objects. Our gc is simply better than refcounting, even though we've got shortcomings in other places. +
                + +About richards, there is a catch. We use a method cache optimization, and have an optimization which helps to avoid creating bound methods each time a method is called. This speeds up the benchmark by about 20%. Although a method cache was even implemented for CPython, it didn't make its way to the core because some C modules directly modify the dictionary of new-style classes. In PyPy, the greater level of abstraction means that this operation is just illegal. +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-03-05 13:08: +
                +
                +

                This is GREAT news!

                Keep up the good work guys, i will be closely following you all!

                Cheers!

                +
                +
                +
                +
                + + Anonymous wrote on 2008-03-06 22:40: +
                +
                +

                I have been watching PyPy for some time now and this news along with the ctypes news has me excited.

                +
                +
                +
                +
                + + Unknown wrote on 2008-03-07 10:06: +
                +
                +

                Great work.

                It is wonderful to see PyPy making progress towards the overall goal!

                +
                +
                +
                +
                + + Unknown wrote on 2008-03-12 18:02: +
                +
                +

                Awsome. :)

                +
                +
                +
                +
                + + Anonymous wrote on 2008-03-14 16:23: +
                +
                +

                Will PyPy be released before Duke Nukem gets released? Pray please tell and enlighten!
                Cursing all you skilled hackers for not doing an amd64 port of Psycho and pursuing something that will be irrelevant when it materializes.
                Have fun any way.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-03-14 21:54: +
                +
                +

                Yeah, it will be released usable. For real. We're getting into having nice and usable python interpreter, ctypes is a good example of feature that is ready to use. How fast it'll be? I don't know, hopefully faster than psyco.

                +
                +
                +
                +
                + + Unknown wrote on 2008-03-17 23:02: +
                +
                +

                "Will PyPy be released before Duke Nukem gets released?"

                I doubt it: Duke Nukem was released in 1991...

                https://en.wikipedia.org/wiki/Duke_Nukem_%28computer_game%29

                If you want to make a wisecrack, at least try to deliver it correctly.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-03-20 09:34: +
                +
                +

                And pypy 1.0 was released one year ago ... https://aspn.activestate.com/ASPN/Mail/Message/python-announce/3461501

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/03/bittorrent-on-pypy-7984272143557948160.html b/posts/2008/03/bittorrent-on-pypy-7984272143557948160.html new file mode 100644 index 000000000..c6e27d0c9 --- /dev/null +++ b/posts/2008/03/bittorrent-on-pypy-7984272143557948160.html @@ -0,0 +1,331 @@ + + + + + +Bittorrent on PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Bittorrent on PyPy

                + + + +
                +

                Hi all,

                + +

                Bittorrent now runs on PyPy! I tried the no-GUI BitTornado version (btdownloadheadless.py). It behaves correctly and I fixed the last few obvious places which made noticeable pauses. (However we know that there are I/O performance issues left: we make too many internal copies of the data, e.g. in a file.read() or os.read().)

                + +

                We are interested in people trying out other real-world applications that, like the GUI-less Bittorrent, don't have many external dependencies to C extension modules. Please report all the issues to us!

                + +

                The current magic command line for creating a pypy-c executable with as many of CPython's modules as possible is:

                + +
                +  cd pypy/translator/goal
                +  ./translate.py --thread targetpypystandalone.py --allworkingmodules --withmod-_rawffi --faassen
                +
                + +

                (This gives you a thread-aware pypy-c, which requires the Boehm gc library. The _rawffi module gives you ctypes support but is only tested for Linux at the moment.)

                +
                +

                Comments

                +
                +
                +
                + + Panos Laganakos wrote on 2008-03-18 12:23: +
                +
                +

                Pretty kewl stuff from PyPy :)

                +
                +
                +
                +
                + + Orangeman wrote on 2008-04-07 13:30: +
                +
                +

                I have a guide on most popular P2P technologies at https://sriraminhell.blogspot.com/2007/08/p2p-brief-introduction.html and on Bit Torrent at https://sriraminhell.blogspot.com/2007/08/peer-to-peer-ii-bit-torrent.html . Cheers!!

                +
                +
                +
                +
                + + Unknown wrote on 2008-04-19 11:53: +
                +
                +

                What rev number did you build on? I tried with the latest source from svn, but got an error almost immediately. No module named py.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/03/ctypes-configuration-tool-7414864595600362988.html b/posts/2008/03/ctypes-configuration-tool-7414864595600362988.html new file mode 100644 index 000000000..41ea0aa8e --- /dev/null +++ b/posts/2008/03/ctypes-configuration-tool-7414864595600362988.html @@ -0,0 +1,338 @@ + + + + + +ctypes configuration tool | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                ctypes configuration tool

                + + + +
                +

                As a part of implementing ctypes, we decided to make coding using ctypes better on its own (regardless of which Python interpreter you use). The concrete problem we're trying to solve is to make ctypes code more platform-independent than it is. Say you want to create a ctypes type for size_t: ctypes itself provides no mechanism for doing that, so you need to use a concrete integer type (c_int, c_long, c_short etc.). Your code either becomes platform dependent if you pick one of them or is littered with conditionals for all sorts of platforms. We created a small library, called ctypes_configure (which is actually a variation of something we use somewhere in the PyPy source tree), which tries to solve some platform dependencies by compiling and running small chunks of C code through a C compiler. It's sort of like configure in the Linux world, except for Python using ctypes. +

                +To install the library, you can just type easy_install ctypes_configure. The code is in an svn repository on codespeak and there is even some documentation and sample code. Also, even though the code lives in the pypy repository, it depends only on pylib, not on the whole of pypy. +
                +The library is in its early infancy (but we think it is already rather useful). In the future we could add extra features: for example, it might be possible to check whether the argtypes that are attached to the external functions are consistent with what is in the C headers, so that the following code wouldn't segfault but give a nice error: +

                +
                +libc = ctypes.CDLL("libc.so")
                +time = libc.time
                +time.argtypes = [ctypes.c_double, ctypes.c_double]
                +time(0.0, 0.0)
                +
                + +Also, we plan to add a way to install a package that uses ctypes_configure in such a way that the installed library doesn't need to call the C compiler any more later. +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-03-18 09:52: +
                +
                +

                Cool - it even works on Windows!.

                BTW: The content-type of the documentation seems wrong, firefox displays the html instead of rendering it.

                +
                +
                +
                +
                + + PJE wrote on 2008-03-18 16:51: +
                +
                +

                Since easy_install can compile C code, why not just compile an extension module with the configuration? Then, other modules can just import the pre-built configuration.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-03-18 17:50: +
                +
                +

                Sure. It's an obvious extension. I just got this from pypy source code and released separately. If it'll happen to be useful, I'll add more features.

                +
                +
                +
                +
                + + Unknown wrote on 2008-05-11 02:43: +
                +
                +

                Re: PJE

                I'm no expert (and I'm half asleep), but your approach sounds like it might run afoul of changes introduced by upgrading something without regenerating the pre-built configuration.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/03/py-lib-091-released-1654797401128918376.html b/posts/2008/03/py-lib-091-released-1654797401128918376.html new file mode 100644 index 000000000..262b822ba --- /dev/null +++ b/posts/2008/03/py-lib-091-released-1654797401128918376.html @@ -0,0 +1,310 @@ + + + + + +Py-Lib 0.9.1 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py-Lib 0.9.1 released

                + + + +
                +

                The Py-Lib 0.9.1 release is out! The Py-Lib is a very important support +library that PyPy uses for a lot of things – most importantly it contains +py.test, which PyPy uses for testing.

                +

                This is mostly a bugfix release, with a couple of new features sneaked in. +Most important changes:

                +
                  +
                • some new functionality (authentication, export, locking) in py.path's +Subversion APIs
                • +
                • numerous small fixes in py.test's rsession (experimental pluggable session) +and generative test features
                • +
                • some fixes in the py.test core
                • +
                +

                Download/Install: https://codespeak.net/py/0.9.1/download.html

                +

                Documentation/API: https://codespeak.net/py/0.9.1/index.html

                +

                UPDATE: the py-lib is now easy-installable with:

                +
                +easy_install py
                +
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/03/pypy-summer-of-code-participation-3403842530060519982.html b/posts/2008/03/pypy-summer-of-code-participation-3403842530060519982.html new file mode 100644 index 000000000..9531b4f99 --- /dev/null +++ b/posts/2008/03/pypy-summer-of-code-participation-3403842530060519982.html @@ -0,0 +1,293 @@ + + + + + +PyPy Summer of Code Participation | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Summer of Code Participation

                + + + +
                +

                As in the last years, PyPy will again participate in Google's Summer of Code +program under the umbrella of the Python Software Foundation. Unfortunately we +were a bit disorganized this year, so that our project ideas are only put up +now. The list of project ideas of PyPy can be found here.

                +

                Any interested student should mail to our mailing list or just come to the +#pypy channel on irc.freenode.net to discuss things.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/04/float-operations-for-jit-6499693696246367083.html b/posts/2008/04/float-operations-for-jit-6499693696246367083.html new file mode 100644 index 000000000..46da2e08e --- /dev/null +++ b/posts/2008/04/float-operations-for-jit-6499693696246367083.html @@ -0,0 +1,353 @@ + + + + + +Float operations for JIT | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Float operations for JIT

                + + + +
                +

                Recently, we taught the JIT x86 backend how to produce code for the x87 floating point coprocessor. This means that JIT is able to nicely speed up float operations (this is not true for our Python interpreter yet - we did not integrate it yet). This is the first time we started going beyond what is feasible in psyco - it would take a lot of effort to make floats work on top of psyco, way more than it will take on PyPy. +

                +This work is in very early stage and lives on a jit-hotpath branch, which includes all our recent experiments on JIT compiler generation, including tracing JIT experiments and huge JIT refactoring. +

                +Because we don't encode the Python's semantics in our JIT (which is really a JIT generator), it is expected that our Python interpreter with a JIT will become fast "suddenly", when our JIT generator is good enough. If this point is reached, we would also get fast interpreters for Smalltalk or JavaScript with relatively low effort. +

                +Stay tuned. +

                + +Cheers,
                +fijal

                +
                +

                Comments

                +
                +
                +
                + + Michael Foord wrote on 2008-04-17 14:22: +
                +
                +

                Having a fast implementation of Ruby written in Python would be very cool. :-p

                +
                +
                +
                +
                + + René Dudfield wrote on 2008-04-18 07:29: +
                +
                +

                Super cool!

                Are you going to add SIMD stuff to the i386 backend?

                Which is the main backend at the moment? LLVM?

                cheers,

                +
                +
                +
                +
                + + jlg wrote on 2008-04-18 10:22: +
                +
                +

                It would be amazing to run SciPy on PyPy with the JIT when this will be ready.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-04-19 04:20: +
                +
                +

                I'm interested in the choice of x87 as well. My understanding was that Intel (at least) was keeping x87 floating point around because of binary applications but that for single element floating point the SSE single-element instructions were the preferred option on any processor which supports SSE. (Unfortunately since they've got such different styles of programming I can understand if it's just that "older chips have to be supported, and we've only got enough programming manpower for 1 implementation".)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-04-20 16:28: +
                +
                +

                x87 because it's simpler and better documented. Right now would be ridiculously easy to reimplement it using SSE.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-04-21 11:30: +
                +
                +

                The main backend is the one for 386. We have no working LLVM JIT backend: although llvm advertizes supporting JIT compilation, what it really provides is a regular compiler packaged as a library that can be used at run-time. This is only suitable for some kinds of usages; for example, it couldn't be used to write a Java VM with good just-in-time optimizations (which need e.g. quick and lazy code generation and regeneration, polymorphic inline caches, etc.)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/04/googles-summer-of-code-4911168632727441622.html b/posts/2008/04/googles-summer-of-code-4911168632727441622.html new file mode 100644 index 000000000..b1ad7c973 --- /dev/null +++ b/posts/2008/04/googles-summer-of-code-4911168632727441622.html @@ -0,0 +1,306 @@ + + + + + +Google's Summer of Code | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Google's Summer of Code

                + + + +
                +

                PyPy got one proposal accepted for Google's Summer of Code under the Python +Software Foundation's umbrella. We welcome Bruno Gola into the PyPy +community. He will work on supporting all Python 2.5 features in PyPy and will +also update PyPy's standard library to support the modules that were modified +or new in Python 2.5.

                +

                Right now PyPy supports only Python 2.4 fully (some Python 2.5 features have +already sneaked in, though).

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-04-22 18:47: +
                +
                +

                Hello,

                I'm very glad to work on PyPy project this summer (ok, this winter here in Brazil =))!

                I hope this project helps to bring more people to the PyPy project, both users and developers =)

                Thanks Google, PSF, PyPy and Carl for mentoring it!

                Bruno

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/04/other-aprils-fools-ideas-955926452383759016.html b/posts/2008/04/other-aprils-fools-ideas-955926452383759016.html new file mode 100644 index 000000000..5153e6f5a --- /dev/null +++ b/posts/2008/04/other-aprils-fools-ideas-955926452383759016.html @@ -0,0 +1,344 @@ + + + + + +Other April's Fools Ideas | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Other April's Fools Ideas

                + + + +
                +

                While discussing what to post as an April Fool's joke yesterday, we had a +couple of other ideas, listed below. Most of them were rejected because they are +too incredible, others because they are too close to our wish list.

                +
                  +
                • quantum computer backend
                • +
                • Perl6 interpreter in RPython
                • +
                • Ruby backend to allow running "python on rails"
                • +
                • mandatory static typing at app-level, because it's the only way to increase +performances
                • +
                • rewrite PyPy in Haskell, because we discovered that dynamic typing is just +not suitable for a project of this size
                • +
                • a C front-end, so that we can interpret the C source of Python C extensions +and JIT it. This would work by writing an interpreter for LLVM bytecode in +RPython.
                • +
                • an elisp backend
                • +
                • a TeX backend (use PyPy for your advanced typesetting needs)
                • +
                • an SQL JIT backend, pushing remote procedures into the DB engine
                • +
                +
                +

                Comments

                +
                +
                +
                + + Leonardo Santagada wrote on 2008-04-02 16:20: +
                +
                +

                PoR - Python on Rails would be the funniest one...

                +
                +
                +
                +
                + + mernen wrote on 2008-04-02 18:07: +
                +
                +

                Oh, the C interpreter would be so awesome. The quantum computer backend, in the hands of a good writer, could become an excellent joke too, no matter how obviously fake. I'd love to see the ones about static typing too.

                +
                +
                +
                +
                + + Unknown wrote on 2008-04-11 23:14: +
                +
                +

                TeX backend ...
                You would be amazed a just how useful that stunt would be (grinBigly)
                of course that's sort of what PS is all about, eh.
                Did something similar (smaller scale) several decades ago. Great fun and extremely useful.

                Thanks for the grins.

                +
                +
                +
                +
                + + mernen wrote on 2008-06-05 21:20: +
                +
                +

                Whoah, anonymous, for a second or two this spam site almost looked convincing.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/04/trying-to-get-pypy-to-run-on-python-30-5082015544752137606.html b/posts/2008/04/trying-to-get-pypy-to-run-on-python-30-5082015544752137606.html new file mode 100644 index 000000000..30c62fb51 --- /dev/null +++ b/posts/2008/04/trying-to-get-pypy-to-run-on-python-30-5082015544752137606.html @@ -0,0 +1,385 @@ + + + + + +Trying to get PyPy to run on Python 3.0 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Trying to get PyPy to run on Python 3.0

                + + + +
                +

                As you surely know, Python 3.0 is coming; recently, they released +Python 3.0 alpha 3, and the final version is expected around +September.

                +

                As suggested by the migration guide (in the PEP 3000), we started by applying +2to3 to our standard interpreter, which is written in RPython (though +we should call it RPython 2.4 now, as opposed to RPython 3.0 -- see +below).

                +

                Converting was not seamless, but most of the resulting bugs were due to the +new dict views, str/unicode changes and the missing "reduce" built-in. +After forking and refactoring both our interpreter and the 2to3 script, +the Python interpreter runs on Python 3.0 alpha 3!

                +

                Next step was to run 2to3 over the whole translation toolchain, +i.e. the part of PyPy which takes care of analyzing the interpreter in +order to produce efficient executables; after the good results we got +with the standard interpreter, we were confident that it would have +been relatively easy to run 2to3 over it: unfortunately, it was not +:-(.

                +

                After letting 2to3 run for days and days uninterrupted, we decided to +kill it: we assume that the toolchain is simply too complex to be +converted in a reasonable amount of time.

                +

                So, we needed to think of something else; THE great idea we had was to +turn everything upside-down: if we can't port PyPy to Py3k, we can +always port Py3k to PyPy!

                +

                Under the hood, the 2to3 conversion tool operates as a graph +transformer: it takes the graph of your program (in the form of Python +2.x source file) and returns a transformed graph of the same program +(in the form of Python 3.0 source file). Since the entire translation +toolchain of PyPy is based on graph transformations, we could reuse it +to modify the behaviour of the 2to3 tool. We wrote a general +graph-inverter algorithm which, as the name suggests, takes a graph +transformation and builds the inverse transformation; then, we applied +the graph inverter to 2to3, getting something that we called 3to2: it +is important to underline that 3to2 was built by automatically +analysing 2to3 and reversing its operation with only the help of a few +manual hints. For this reason and because we are not keeping generated +files under version control, we do not need to maintain this new tool in +the Subversion repository.

                +

                Once we built 3to2, it was relatively easy to pipe its result to our +interpreter, getting something that can run Python 3.0 programs.

                +

                Performance-wise, this approach has the problem of being slower at +import time, because it needs to run (automatically) 3to2 every time +the source is modified; in the future, we plan to apply our JIT +techniques also to this part of the interpreter, trying to mitigate the +slowdown until it is not noticeable anymore to the final user.

                +

                In the next weeks, we will work on the transformation (and probably publish +the technique as a research paper, with a title like "Automatic Program +Reversion on Intermediate Languages").

                +

                UPDATE: In case anybody didn't guess or didn't spot the acronym: The above +was an April Fool's joke. Nearly nothing of it is true.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-04-01 15:33: +
                +
                +

                "After letting 2to3 run for days and days uninterrupted, we decided to kill it: we assume that the toolchain is simply too complex to be converted in a reasonable amount of time."

                That was silly. Twisted got converted. I suppose that not even a meta-programing-languages-framework can be bigger thaan Twisted. Better luck next year.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-04-01 16:02: +
                +
                +

                I have a working implementation of the Parrot virtual machine in Py3K. After running it through your converter (my hosting service only runs 2.1!), I find that my implementation now only supports Perl 5 and Snobol. What gives?

                +
                +
                +
                +
                + + fumanchu wrote on 2008-04-01 16:54: +
                +
                +

                Nice acronym, that.

                +
                +
                +
                +
                + + Paddy3118 wrote on 2008-04-01 19:37: +
                +
                +

                Nice one ;-)

                And the best I've read all day!

                - Paddy.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-04-17 20:54: +
                +
                +

                Looks like hosting python 2.5 scripts on a Py3k interpreter might become a USP for PyPY ;)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/04/wrapping-pyrepl-in-readline-api-362730784820949868.html b/posts/2008/04/wrapping-pyrepl-in-readline-api-362730784820949868.html new file mode 100644 index 000000000..63f4f743e --- /dev/null +++ b/posts/2008/04/wrapping-pyrepl-in-readline-api-362730784820949868.html @@ -0,0 +1,290 @@ + + + + + +Wrapping pyrepl in the readline API | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Wrapping pyrepl in the readline API

                + + + +
                +

                If you translate a pypy-c with --allworkingmodules and start it, you will probably not notice anything strange about its prompt - except when typing multiline statements. You can move the cursor up and continue editing previous lines. And the history is multiline-statements-aware as well. Great experience! Ah, and completion using tab is nice too.

                + +

                Truth be told, there is nothing new here: it was all done by Michael Hudson's pyrepl many years ago. We had already included pyrepl in PyPy some time ago. What is new is a pure Python readline.py which exposes the most important parts of the API of the standard readline module by wrapping pyrepl under the hood, without needing the GNU readline library at all. The PyPy prompt is based on this, benefitting automagically from pyrepl's multiline editing capabilities, with minor tweaks so that the prompt looks much more like CPython's than a regular pyrepl prompt does.

                + +

                You can also try and use this multiline prompt with CPython: check out pyrepl at https://codespeak.net/svn/pyrepl/trunk/pyrepl and run the new pythoni1 script.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/05/berlin-sprint-day-1-2-8761821946764492267.html b/posts/2008/05/berlin-sprint-day-1-2-8761821946764492267.html new file mode 100644 index 000000000..2730f8a4d --- /dev/null +++ b/posts/2008/05/berlin-sprint-day-1-2-8761821946764492267.html @@ -0,0 +1,335 @@ + + + + + +Berlin Sprint Day 1 + 2 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Berlin Sprint Day 1 + 2

                + + + +
                +

                After having survived the S3-Workshop which took place in Potsdam on Thursday +and Friday (a blog-post about this will follow later) we are now sitting in the +c-base in Berlin, happily sprinting. Below are some notes on what progress we +made so far:

                +
                +
                  +
                • The Gameboy emulator in RPython that Camillo Bruni is working on for his +Bachelor project at Uni Bern does now translate. It took him (assisted by +various people) a while to figure out the translation errors (essentially +because he wrote nice Python code that passed bound methods around, which the +RTyper doesn't completely like). Now that is fixed and the Gameboy emulator +translates and runs a test ROM. You cannot really see anything yet, because +there is no graphics support in RPython.
                • +
                • To get graphics support in RPython Armin and Karl started writing SDL +bindings for RPython, which both the Gameboy emulator and the SPy VM need. +They have basic stuff working, probably enough to support the Gameboy +already.
                • +
                • Alexander, Armin, Maciek and Samuele discussed how to approach separate +compilation for RPython, which isn't easy because the RPython type analysis +is a whole-program analysis.
                • +
                • Stephan, Peter and Adrian (at least in the beginning) worked on making PyPy's +stackless module more complete. They added channel preferences which +change details of the scheduling semantics.
                • +
                • Toon, Carl Friedrich and Adrian (a tiny bit) worked on SPy. There is a branch +that Toon started a while ago which contains many improvements but is also +quite unclear in many respects. There was some progress in cleaning that up. +This involved implementing the Smalltalk process scheduler (Smalltalk really +is an OS). There is still quite some work left though. While doing so, we +discovered many funny facts about Squeak's implementation details (most of +which are exposed to the user) in the process. I guess we should collect them +and blog about them eventually.
                • +
                • Samuele and Maciek improved the ctypes version of pysqlite that Gerhard +Häring started.
                • +
                • Armin, Samuele and Maciek found an obscure bug in the interaction between the +builtin-type-shortcut that Armin recently implemented and our multimethod +implementation. It's not clear which of the two is to blame; however, it +seems rather unclear how to fix the problem: Armin and Samuele have been stuck in a +discussion about how to approach a solution for a while and are hard to +talk to.
                • +
                • Stijn Timbermont, a Ph.D. student at the Vrije Universiteit Brussel who is +visiting the sprint for two days, was first looking at how our GCs are +implemented to figure out whether he can use PyPy for some experiments. The +answer to that seems to be no. Today he was hacking on a Pico interpreter +(without knowing too much about Python) and is making some nice progress, it +seems.
                • +
                +
                +

                Will try to blog more as the sprint progresses.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/05/berlin-sprint-finished-1597243123548564657.html b/posts/2008/05/berlin-sprint-finished-1597243123548564657.html new file mode 100644 index 000000000..f48998bbb --- /dev/null +++ b/posts/2008/05/berlin-sprint-finished-1597243123548564657.html @@ -0,0 +1,332 @@ + + + + + +Berlin Sprint Finished | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Berlin Sprint Finished

                + + + +
                +

                The Berlin sprint is finished; below are some notes on what we worked on during +the last three days:

                +
                +
                  +
                • Camillo worked tirelessly on the gameboy emulator with some occasional input +by various people. He is making good progress, some test ROMs run now on the +translated emulator. However, the graphics are still not completely working +for unclear reasons. Since PyBoy is already taken as a project name, we +considered calling it PyGirl (another name proposition was "BoyBoy", but the +implementation is not circular enough for that).
                • +
                +
                +
                +
                  +
                • On Monday Armin and Samuele fixed the problem with our multimethods so that +the builtin shortcut works again (the builtin shortcut is an optimization +that speeds up all operations on builtin non-subclassed types quite a bit).
                • +
                • Antonio and Holger (who hasn't been on a sprint in a while, great to have you +back!) worked on writing a conftest file (the plugin mechanism of py.test) +that would allow us to run Django tests using py.test, which seems to be not +completely trivial. They also fixed some bugs in PyPy's Python interpreter, +e.g. related to dictionary subclassing.
                • +
                • Karl started adding sound support to the RPython SDL-bindings, which will be +needed both by the Gameboy emulator and eventually by the SPy VM.
                • +
                • Armin and Maciek continued the work that Maciek had started a while ago of +improving the speed of PyPy's IO operations. In the past, doing IO usually +involved copying lots of memory around, which should have improved now. Armin +and Maciek improved and then merged the first of the two branches that +contained IO improvements, which speeds up IO on non-moving GCs (mostly the +Boehm GC). Then they continued working on the hybrid-io branch which is +supposed to improve IO on the hybrid GC (which was partially designed exactly +for this).
                • +
                • Toon and Carl Friedrich finished cleaning up the SPy improvement branch and +fixed all warnings that occur when you translate SPy there. An obscure bug in +an optimization prevented them from getting working executables, which at +this moment blocks the merging of that branch.
                • +
                +
                +

                By now everybody is home again (except for Anto, who booked his return flight +two days too late, accidentally) and mostly resting. It was a good sprint, with +some interesting results and several new people joining. And it was definitely +the most unusual sprint location ever :-).

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/05/general-performance-improvements-838741900863354293.html b/posts/2008/05/general-performance-improvements-838741900863354293.html new file mode 100644 index 000000000..2e761c837 --- /dev/null +++ b/posts/2008/05/general-performance-improvements-838741900863354293.html @@ -0,0 +1,454 @@ + + + + + +General performance improvements | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                General performance improvements

                + + + +
                +

                Hi all,

                + +

                During the past two weeks we invested some more efforts on the +baseline performance of pypy-c. Some of the tweaks we did +were just new ideas, and others were based on actual +profiling. The net outcome is that we now expect PyPy to be +in the worst case twice as slow than CPython on real +applications. Here are some small-to-medium-size benchmark +results. The number is the execution time, normalized to 1.0 +for CPython 2.4:

                + +

                +
                  +
                • +1.90 on templess (a simple templating language)
                • +
                • +1.49 on gadfly (pure Python SQL database)
                • +
                • +1.49 on translate.py (pypy's own translation toolchain)
                • +
                • +1.44 on mako (another templating system)
                • +
                • +1.21 on pystone
                • +
                • +0.78 on richards
                • +
                +

                (This is all without the JIT, as usual. The JIT is not ready yet.)

                + +

                You can build yourself a pypy-c with this kind of speed with +the magic command line (gcrootfinder is only for a 32-bit +Linux machine):

                + +

                +
                    pypy/translator/goal/translate.py --gc=hybrid --gcrootfinder=asmgcc targetpypystandalone --allworkingmodules --faassen
                + +

                The main improvements come from: + +

                +
                  +
                • A general shortcut for any operation between built-in objects: +for example, a subtraction of two integers or floats now dispatches +directly to the integer or float subtraction code, without looking up +the '__sub__' in the class.
                • +
                • A shortcut for getting attributes out of instances of user classes +when the '__getattribute__' special method is not overridden.
                • +
                • The so-called Hybrid Garbage Collector is now a +three-generations collector. + +More about our GCs... +
                • +
                • Some profiling showed bad performance in our implementation of +the built-in id() -- a trivial function to write in CPython, but a lot +more fun when you have a moving GC and your object's real address can +change.
                • +
                • The bytecode compiler's parser had a very slow linear search +algorithm that we replaced with a dictionary lookup.
                • +
                +

                These benchmarks are doing CPU-intensive operations. You can expect +a similar blog post soon about the I/O performance, as the +io-improvements branch gets closer to being merged +:-) The branch could also improve the speed of +string operations, as used e.g. by the templating systems.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-05-10 20:07: +
                +
                +

                We had the same problem with id() (called object_id()) in Rubinius. We currently hide an object's ID inside its metaclass (allocating one if there isn't one).

                Where did you guys store it?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-11 00:48: +
                +
                +

                The ID is stored in a special dictionary (a normal dictionary specialized to be allocated so that the GC won't see it) that is used in the GC as a mapping from addresses to integers. This dict is updated when necessary (usually when collecting).

                +
                +
                +
                +
                + + Unknown wrote on 2008-05-11 06:56: +
                +
                +

                Wow. That sure is nice.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-11 07:27: +
                +
                +

                My my, that must be a huge dictionary.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-11 09:12: +
                +
                +

                The dictionary is of course only filled for objects that were used in an id() call.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-05-11 09:19: +
                +
                +

                There are actually two dictionaries, at least when using one of the generational GCs: one for the first generation objects and one for the rest. The dictionary for the rest of the objects can probably get quite large, but it needs to be traversed once during each full collection only. It seems that full collections are rare enough: the full dictionary updating doesn't stand out in profiled runs.

                I didn't think about implementing id() at the language level, e.g. by extending the class of the object to add a field.
                We can't really do that in RPython. Moreover, that seems impractical for Python: if someone asks for the id() of an integer object, do all integers suddenly need to grow an 'id' field?

                +
                +
                +
                +
                + + Daivd wrote on 2008-05-11 09:41: +
                +
                +

                Great work!

                I have a few questions not answered by the FAQ that I hope someone will be able to answer.

                When might the JIT be ready enough? (no stress, just asking :)

                How much faster are CPython 2.5, 2.6 and 3.0? That seems to be relevant to the statement "we now expect PyPy to be in the worst case twice as slow than CPython".

                If I understand correctly, one of the purposes of PyPy is to make experimentation easier - so will making it compatible with 3.0 be fairly easy? Are there plans to do so?

                Is PyPy expected to one day become a serious "competitor" to CPython, in that you might want to run it in production? Is there a time set for when it will be ready for use by the general public (i.e me ;)?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-05-11 10:19: +
                +
                +

                So, answering questions one by one:

                JIT will be ready when it'll be ready, not earlier.

                CPython 2.5 is slightly faster for some operations. No real difference there. 2.6 was optimized for certain operations, but again, don't expect a huge difference. I think you can expect pypy to be in the range of 2x for any cpython. It is not even certain what 3.0 will look like, but being ultra fast is certainly not its primary goal.

                Regarding making pypy compatible with 3.0 - yes, that should be fairly easy although we don't have any immediate plans doing that.

                The final date for making pypy production ready is not set (and this is a gradual process), but as you can see here and here we're trying more and more to make it run existing applications.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-11 10:43: +
                +
                +

                Note that current benchmarks suggest that CPython 3.0 is yet much slower than CPython 2.x. It might be interesting to see whether this means that PyPy is much faster than CPython 3.0 running e.g. Pystone.
                Of course this fact would not be very surprising, esp. given that PyPy does not implement any CPy3k features.

                +
                +
                +
                +
                + + Luis wrote on 2008-10-12 22:43: +
                +
                +

                "JIT will be ready when it'll be ready, not earlier."

                Alright, alright... we know.
                But could you at least give us a very rough estimation for us, mere mortals? What does your heart tell you? :-)

                +
                +
                +
                +
                + + Spencer wrote on 2009-11-02 18:04: +
                +
                +

                What kind of computations are done in richards? I.e., what sort of applications can expect better performance in PyPy than in CPy?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/05/more-windows-support-1747028151130099034.html b/posts/2008/05/more-windows-support-1747028151130099034.html new file mode 100644 index 000000000..ac213d6ae --- /dev/null +++ b/posts/2008/05/more-windows-support-1747028151130099034.html @@ -0,0 +1,291 @@ + + + + + +More windows support | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                More windows support

                + + + +
                +

                Recently, thanks to Amaury Forgeot d'Arc and Michael Schneider, Windows became more of a first-class platform for PyPy's Python interpreter. Most RPython extension modules are now considered working (apart from some POSIX specific modules). Even CTypes now works on windows! +

                +Next step would be to have better buildbot support for all supported platforms (Windows, Linux and OS X), so we can control and react to regressions quickly. (Buildbot is maintained by JP Calderone) +

                +Cheers,
                +fijal

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/05/next-sprint-berlin-may-17-22nd-may-5362899847460267375.html b/posts/2008/05/next-sprint-berlin-may-17-22nd-may-5362899847460267375.html new file mode 100644 index 000000000..8e5e0af68 --- /dev/null +++ b/posts/2008/05/next-sprint-berlin-may-17-22nd-may-5362899847460267375.html @@ -0,0 +1,308 @@ + + + + + +Next Sprint: Berlin, May 17-22nd May | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Next Sprint: Berlin, May 17-22nd May

                + + + +
                +

                Our next PyPy sprint will take place in the crashed c-base space station, Berlin, Germany, Earth, Solar System. This is a fully public sprint: newcomers (from all planets) are welcome. Suggestion of topics (other topics are welcome too):

                + +
                  +
                • work on PyPy's JIT generator: we are refactoring parts of the + compiling logic, in ways that may also allow generating better + machine code for loops (people or aliens with knowledge on + compilers and SSA, welcome) + +
                • +
                • work on the SPy VM, PyPy's Squeak implementation, particularly the + graphics capabilities + +
                • +
                • work on PyPy's GameBoy emulator, which also needs graphics support + +
                • +
                • trying some large pure-Python applications or libraries on PyPy and + fixing the resulting bugs. Possibilities are Zope 3, Django and + others. +
                • +
                +

                For more information, see the full announcement. +

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/05/progresses-on-cli-jit-backend-front-1021772190959551376.html b/posts/2008/05/progresses-on-cli-jit-backend-front-1021772190959551376.html new file mode 100644 index 000000000..ef4cc2478 --- /dev/null +++ b/posts/2008/05/progresses-on-cli-jit-backend-front-1021772190959551376.html @@ -0,0 +1,467 @@ + + + + + +Progresses on the CLI JIT backend front | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Progresses on the CLI JIT backend front

                + + + +
                +

                In the last months, I've actively worked on the CLI backend for PyPy's +JIT generator, whose goal is to automatically generate JIT compilers +that produce .NET bytecode on the fly.

                +

                The CLI JIT backend is far from being complete and there is still a lot +of work to be done before it can handle PyPy's full Python +interpreter; nevertheless, yesterday I finally got the first .NET +executable that contains a JIT for a very simple toy language called +tlr, which implements an interpreter for a minimal register-based +virtual machine with only 8 operations.

                +

                To compile the tlr VM, follow these steps:

                +
                +
                  +
                1. +

                  get a fresh checkout of the oo-jit branch, i.e. the branch +where the CLI JIT development goes on:

                  +
                  +$ svn co https://codespeak.net/svn/pypy/branch/oo-jit
                  +
                  +
                2. +
                3. +

                  go to the oo-jit/pypy/jit/tl directory, and compile the tlr VM +with the CLI backend and JIT enabled:

                  +
                  +$ cd oo-jit/pypy/jit/tl/
                  +$ ../../translator/goal/translate.py -b cli --jit --batch targettlr
                  +
                  +
                4. +
                +
                +

                The goal of our test program is to compute the square of a given +number; since the only operations supported by the VM are addition and +negation, we compute the result by doing repetitive additions; I won't +describe the exact meaning of all the tlr bytecodes here, as they are +quite self-documenting:

                +
                +ALLOCATE,    3,   # make space for three registers
                +MOV_A_R,     0,   # i = a
                +MOV_A_R,     1,   # copy of 'a'
                +
                +SET_A,       0,
                +MOV_A_R,     2,   # res = 0
                +
                +# 10:
                +SET_A,       1,
                +NEG_A,
                +ADD_R_TO_A,  0,
                +MOV_A_R,     0,   # i--
                +
                +MOV_R_A,     2,
                +ADD_R_TO_A,  1,
                +MOV_A_R,     2,   # res += a
                +
                +MOV_R_A,     0,
                +JUMP_IF_A,  10,   # if i!=0: goto 10
                +
                +MOV_R_A,     2,
                +RETURN_A          # return res
                +
                +

                You can find the program also at the end of the tlr module; to get an +assembled version of the bytecode, ready to be interpreted, run this +command:

                +
                +$ python tlr.py assemble > square.tlr
                +
                +

                Now, we are ready to execute the code through the tlr VM; if you are +using Linux/Mono, you can simply execute the targettlr-cli script +that has been created for you; however, if you use Windows, you have +to manually fish the executable inside the targettlr-cli-data +directory:

                +
                +# Linux
                +$ ./targettlr-cli square.tlr 16
                +256
                +
                +# Windows
                +> targettlr-cli-data\main.exe square.tlr 16
                +256
                +
                +

                Cool, our program computed the result correctly! But, how can we be +sure that it really JIT compiled our code instead of interpreting it? +To inspect the code that is generated by our JIT compiler, we simply +set the PYPYJITLOG environment variable to a filename, so that the +JIT will create a .NET assembly containing all the code that has been +generated by the JIT:

                +
                +$ PYPYJITLOG=generated.dll ./targettlr-cli square.tlr 16
                +256
                +$ file generated.dll
                +generated.dll: MS-DOS executable PE  for MS Windows (DLL) (console) Intel 80386 32-bit
                +
                +

                Now, we can inspect the DLL with any IL disassembler, such as +ildasm or monodis; here is an excerpt of the disassembled code, +that shows how our square.tlr bytecode has been compiled to .NET +bytecode:

                +
                +.method public static  hidebysig default int32 invoke (object[] A_0, int32 A_1)  cil managed
                +{
                +    .maxstack 3
                +    .locals init (int32 V_0, int32 V_1, int32 V_2, int32 V_3, int32 V_4, int32 V_5)
                +
                +    ldc.i4 -1
                +    ldarg.1
                +    add
                +    stloc.1
                +    ldc.i4 0
                +    ldarg.1
                +    add
                +    stloc.2
                +    IL_0010:  ldloc.1
                +    ldc.i4.0
                +    cgt.un
                +    stloc.3
                +    ldloc.3
                +    brfalse IL_003b
                +
                +    ldc.i4 -1
                +    ldloc.1
                +    add
                +    stloc.s 4
                +    ldloc.2
                +    ldarg.1
                +    add
                +    stloc.s 5
                +    ldloc.s 5
                +    stloc.2
                +    ldloc.s 4
                +    stloc.1
                +    ldarg.1
                +    starg 1
                +
                +    nop
                +    nop
                +    br IL_0010
                +
                +    IL_003b:  ldloc.2
                +    stloc.0
                +    br IL_0042
                +
                +    ldloc.0
                +    ret
                +}
                +
                +

                If you know a bit of IL, you can see that the code generated is not +optimal, as there are some redundant operations like all those +stloc/ldloc pairs; however, while not optimal, it is still quite good +code, not much different from what you would get by writing the square +algorithm directly in e.g. C#.

                +

                As I said before, all of this is still work in progress and there is +still much to be done. Stay tuned :-).

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-05-28 20:11: +
                +
                +

                So the mono JIT would pick up that bytecode and further compile it to native code?

                Also, what would be needed for doing the same thing for the JVM?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-05-28 21:28: +
                +
                +

                Yes, that's exactly the idea; in fact, programs run by virtual machines generated this way are double jit-ed.

                Doing the same for the JVM won't be too hard, since most of the work we've done can be shared between the two JIT backends; unfortunately, at the moment the JVM backend is not as advanced as the CLI one, so before working on the JIT we would need more work on it. But indeed, having a JIT backend for the JVM is in our plans.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-29 10:13: +
                +
                +

                Great. Can't wait for advanced piggybacking :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/05/s3-workshop-potsdam-2008-writeup-6610637452403831794.html b/posts/2008/05/s3-workshop-potsdam-2008-writeup-6610637452403831794.html new file mode 100644 index 000000000..f4c9dfa22 --- /dev/null +++ b/posts/2008/05/s3-workshop-potsdam-2008-writeup-6610637452403831794.html @@ -0,0 +1,367 @@ + + + + + +S3-Workshop Potsdam 2008 Writeup | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                S3-Workshop Potsdam 2008 Writeup

                + + + +
                +

                Trying to give some notes about the S3 Workshop in Potsdam that several +PyPyers and Spies (Armin, Carl Friedrich, Niko, Toon, Adrian) attended before +the Berlin sprint. We presented a paper about SPy there. Below are some mostly +random notes about my (Carl Friedrich's) impressions of the conference and some +talk notes. Before that I'd like to give thanks to the organizers who did a +great job. The workshop was well organized, the social events were wonderful (a +very relaxing boat trip in the many lakes around Potsdam and a conference +dinner).

                +

                Video recordings of all the talks can be found on the program page.

                +
                +

                Invited Talks

                +

                "Late-bound Object Lambda Architectures" by Ian Piumarta was quite an inspiring +talk about VPRI's attempt at writing a flexible and understandable computing +system in 20K lines of code. The talk was lacking a bit in technical details, so +while it was inspiring I couldn't really say much about their implementation. +Apart from that, I disagree with some of their goals, but that's the topic of +another blog post.

                +

                "The Lively Kernel – A Self-supporting System on a Web Page" by Dan Ingalls. Dan +Ingalls is one of the inventors of the original Smalltalk and of Squeak. He was +talking about his latest work, the attempts of bringing a Squeak-like system to +a web browser using JavaScript and SVG. To get some feel for what exactly The +Lively Kernel is, it is easiest to just try it out (only works in Safari +and Firefox 3 above Beta 5 though). I guess in a sense the progress of the +Lively Kernel over Squeak is not that great but Dan seems to be having fun. Dan +is an incredibly enthusiastic, friendly and positive person, it was really great +meeting him. He even seemed to like some of the ideas in SPy.

                +

                "On Sustaining Self" by Richard P. Gabriel was a sort of deconstructivist +multi-media-show train wreck of a presentation that was a bit too weird for my +taste. There was a lot of music, there were sections in the presentation +where Richard discussed with an alter ego, whose part he had recorded in advance +and mangled with a sound editor. There was a large bit of a documentary +about Levittown. Even the introduction and the questions were weird, with Pascal +Costanza staring down the audience, without saying a word (nobody dared to ask +questions). I am not sure I saw the point of the presentation, apart from +getting the audience to think, which probably worked. It seems that there are +people (e.g. Christian Neukirchen) that liked the presentation, though.

                +
                +
                +

                Research Papers

                +

                "SBCL - A Sanely Bootstrappable Common Lisp" by Christophe Rhodes described the +bootstrapping process of SBCL (Steel Bank Common Lisp). SBCL can be bootstrapped +by a variety of Common Lisps, not just by itself. SBCL contains a complete +blueprint of the initial image instead of always getting the new image by +carefully mutating the old one. This bootstrapping approach is sort of similar +to that of PyPy.

                +

                "Reflection for the Masses" by Charlotte Herzeel, Pascal Costanza, and Theo +D'Hondt retraced some of the work of Brian Smith on reflection in Lisp. The +talk was not very good, it was way too long (40 min), quite hard to understand +because Charlotte Herzeel was talking in a very low voice. The biggest mistake +in her talk was in my opinion that she spent too much time explaining a more or +less standard meta-circular interpreter for Lisp and then running out of time +when she was trying to explain the modifications. I guess it would have been a +fair assumption that large parts of the audience know such interpreters, so +glossing over the details would have been fine. A bit of a pity, since the paper +seems interesting.

                +

                "Back to the Future in One Week - Implementing a Smalltalk VM in PyPy" +by Carl Friedrich Bolz, Adrian Kuhn, Adrian Lienhard, Nicholas D. Matsakis, +Oscar Nierstrasz, Lukas Renggli, Armin Rigo and Toon Verwaest, the paper with +the longest author list. We just made everybody an author who was at the sprint +in Bern. Our paper had more authors than all the other papers together :-). I +gave the presentation at the workshop, which went quite well, judging from the +feedback I got.

                +

                "Huemul - A Smalltalk Implementation" by Guillermo Adrián Molina. Huemul is a +Smalltalk implementation that doesn't contain an interpreter but directly +compiles all methods to assembler (and also saves the assembler in the image). +In addition, as much functionality (such as threading, GUI) as possible is +delegated to libraries instead of reimplementing them in Smalltalk +(as e.g. Squeak is doing). The approach seems to suffer from the usual problems +of manually writing a JIT, e.g. the VM seems to segfault pretty often. Also I +don't agree with some of the design decisions of the threading scheme, there is +no automatic locking of objects at all, instead the user code is responsible for +preventing concurrent accesses from messing up things (which even seems to lead +to segfaults in the default image).

                +

                "Are Bytecodes an Atavism?" by Theo D'Hondt argued that using AST-based +interpreters can be as fast as bytecode-based interpreters which he proved by +writing two AST-interpreters, one for Pico and one for Scheme. Both of these +implementations seem to perform pretty well. Theo seems to have many similar +views as PyPy, for example that writing simple straightforward interpreters is +often preferable to writing complex (JIT-)compilers.

                +
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/05/threads-and-gcs-1126087726480790112.html b/posts/2008/05/threads-and-gcs-1126087726480790112.html new file mode 100644 index 000000000..24ab9feb8 --- /dev/null +++ b/posts/2008/05/threads-and-gcs-1126087726480790112.html @@ -0,0 +1,405 @@ + + + + + +Threads and GCs | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Threads and GCs

                + + + +
                +

                Hi all,

                + +

                We can now compile a pypy-c that includes both thread support +and one of our semi-advanced garbage collectors. This means +that threaded Python programs can now run not only with a +better performance, but without the annoyances of the Boehm +garbage collector. (For example, Boehm doesn't like too much +seeing large numbers of __del__(), and our implementation of +ctypes uses them everywhere.)

                + +

                Magic translation command (example):

                + +

                +
                   translate.py --thread --gc=hybrid targetpypystandalone --faassen --allworkingmodules
                + +

                Note that multithreading in PyPy is based on a global +interpreter lock, as in CPython. I imagine that we will get +rid of the global interpreter lock at some point in the future +-- I can certainly see how this might be done in PyPy, unlike +in CPython -- but it will be a lot of work nevertheless. Given +our current priorities, it will probably not occur soon unless +someone steps in.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-05-29 00:04: +
                +
                +

                How could GIL be removed from PyPy?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-05-29 09:19: +
                +
                +

                By using fine-grained locking: locking every dictionary and list while it is used. This is what Jython does (or more precisely, what Jython asks the JVM to do for it). This certainly comes with a performance penalty, so it would only pay off if you actually have and can use multiple CPUs -- which is fine in PyPy: you would just translate different pypy-c's depending on the use case.

                This would be a pain to implement in CPython, in particular because of refcounting. Even if the Py_INCREF and Py_DECREF macros were made thread-safe, all C-level APIs that manipulate borrowed references might have to be redesigned.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-05-29 10:10: +
                +
                +

                Pyprocessing may serve multi-core cpu needs for the time being, as it's an almost drop-in replacement for the threading module.

                I think it uses ctypes, so it should work with pypy.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-05-29 18:36: +
                +
                +

                pyprocessing has it's own problems (not that threads has no problems at all :)

                1. Memory usage, you need basically n times more memory when n is number of processes

                2. you cannot pass arbitrary data between processes, just stuff that you can marshal/pickle which is a bit huge limitation.

                3. on the other hand, multiple processes provides you better control, although not via threading drop-in replacement.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-04 20:02: +
                +
                +

                The live demos seem to be down... :(

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-06-05 02:15: +
                +
                +

                Back online. Our test server is down as well, which makes it a bit hard to know stuff :(

                +
                +
                +
                +
                + + Connelly Barnes wrote on 2008-06-13 00:38: +
                +
                +

                In response to maciej, OSes that implement copy-on-write fork (Linux, but not Windows, unsure about Mac OS X), don't take n times more memory. Fine-grained locking and an OpenMP-like syntax would be potentially useful. Maybe you could get a student to prototype these for you. But I'm sure someone will find a way to parallelize Python eventually, or we'll all switch to some other language, as the number of cores goes to infinity.

                +
                +
                +
                +
                + + Connelly Barnes wrote on 2008-06-17 23:52: +
                +
                +

                In my previous comment, I was partly wrong: COW reduces memory usage, however, in CPython the refcounting will cause the interpreter to write to every area of memory, so the reduction may not be that significant. Also, IronPython supports fine-grained locks.

                +
                +
                +
                +
                + + nekto0n wrote on 2008-06-24 23:07: +
                +
                +

                Would it be better to lock not whole mutable object but just an element or slice(for lists) and not lock object for reading operations?
                It's a common method used in DBMS. A small and fast realisation(if it's possible to create) in PyPy whould be great =)

                +
                +
                +
                +
                + + Rushen Aly wrote on 2009-02-20 18:37: +
                +
                +

                Is there any calendar date for removal of GIL? or is it just a wish. Secondly, what is your speed aim compared with Java?
                Thanks...
                Rushen

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/better-profiling-support-for-pypy-1848129914083462080.html b/posts/2008/06/better-profiling-support-for-pypy-1848129914083462080.html new file mode 100644 index 000000000..12d7f06a7 --- /dev/null +++ b/posts/2008/06/better-profiling-support-for-pypy-1848129914083462080.html @@ -0,0 +1,290 @@ + + + + + +Better Profiling Support for PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Better Profiling Support for PyPy

                + + + +
                +

                As PyPy is getting more and more usable, we need better tools to use to work on certain applications running on top of PyPy. Out of this interest, I spent some time implementing the _lsprof module, which is a part of the standard library since Python2.5. It is necessary for the cProfile module, which can profile Python programs with high accuracy and a lot less overhead than the older, pure-python profile module. Together with the excellent +lsprofcalltree script, you can display this data using kcachegrind, which gives you great visualization possibilities for your profile data. +

                +Cheers,
                +fijal

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/german-introductory-podcast-about-3836017753197345761.html b/posts/2008/06/german-introductory-podcast-about-3836017753197345761.html new file mode 100644 index 000000000..4f4e22872 --- /dev/null +++ b/posts/2008/06/german-introductory-podcast-about-3836017753197345761.html @@ -0,0 +1,324 @@ + + + + + +German Introductory Podcast About Python and PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                German Introductory Podcast About Python and PyPy

                + + + +
                +

                During the Berlin Sprint Holger was interviewed by Tim Pritlove for Tim's +Podcast "Chaosradio Express". The whole thing is in German, so only +interesting to German-speakers. The PyPy episode can be found here. The +interview is touching on a lot of topics, starting with a fairly general intro +about what Python is and why it is interesting and then moving to explaining and +discussing PyPy. The bit about PyPy starts after about 45 minutes. There is also +a comment page about the episode.

                +
                +

                Comments

                +
                +
                +
                + + holger krekel wrote on 2008-06-15 18:54: +
                +
                +

                Thanks CF for linking - i found it actually a fun interview although i was caught a bit in surprise that it focused first a lot on Python-the-language and i didn't feel in evangelising mode.

                And what i again realized is that PyPy is not too well known or understood outside the Python world. Maybe it would help, also for getting some funding, if it were.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-16 10:19: +
                +
                +

                It seems a pity non-German speakers cannot benefit from this. Any chance of an English version?

                +
                +
                +
                +
                + + kriss wrote on 2008-06-16 13:49: +
                +
                +

                Great Podcast, I like your project - have to listen to the podcast a second time though. :-)

                Keep up the good work!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/hi-all-some-news-from-jit-front-7534695765973581706.html b/posts/2008/06/hi-all-some-news-from-jit-front-7534695765973581706.html new file mode 100644 index 000000000..bc8838b6c --- /dev/null +++ b/posts/2008/06/hi-all-some-news-from-jit-front-7534695765973581706.html @@ -0,0 +1,493 @@ + + + + + +JIT in Prolog | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                JIT in Prolog

                + + + +
                +

                Hi all,

                + +

                Some news from the JIT front. Progress on the JIT has been low-profile +in the past few months. No big results to announce yet, but we have +played with some new ideas, and they are now documented as a draft +research paper: Towards Just-In-Time Compilation and Specialisation of Prolog.

                + +

                Prolog? Yes. To understand this slightly unusual choice of programming +language, here is first some background about our JIT.

                + +

                PyPy contains not a JIT but a JIT generator, which means that we +only write an interpreter for a language (say, the complete Python +language), and we get a JIT "for free". More precisely, it's not for +free: we had to write the JIT generator, of course, as well as some +amount of subtle generic support code. The JIT generator preprocesses +the (complete Python) interpreter that we wrote and links the result +with the generic support code; the result is a (complete Python) JIT.

                + +

                The way that this works so far gives us a generated JIT that is very +similar to Psyco in the way +it works. +But Psyco has issues (and so the current PyPy JITs have the same issues): +it can sometimes produce too much machine code, +e.g. by failing to notice that two versions of the machine code are +close enough that they should really be one; and it can also sometimes +fail in the opposite way, by making a single sub-efficient version of +the machine code instead of several efficient specialized versions.

                + +

                A few months ago we chose to experiment with improving this +instead of finishing and polishing what we had so far. The choice was +mostly because we were (and still are) busy finishing and polishing +everything else in PyPy, so it was more fun to keep at least the JIT on +the experimental side. Besides, PyPy is now getting to a rather good +and complete state, and it is quite usable without the JIT already.

                + +

                Anyway, enough excuses. Why is this about Prolog?

                + +

                In PyPy, both the (complete Python) interpreter and the JIT support code +are in RPython. Now RPython is not +an extremely complicated language, but still, it is far from the top on a +minimalism scale. In general, this is good in practice (or at least I +think so): it gives +a reasonable balance because it is convenient to write interpreters +in RPython, while not being so bloated that it makes our translation +toolchain horribly complicated (e.g. writing garbage collectors for +RPython - or even JIT generators - is reasonable). Still, it is not the +best choice for early research-level experimentation.

                + +

                So what we did instead recently is hand-write, in Prolog, a JIT that +looks similar to what we would like to achieve for RPython with our JIT +generator. This gave much quicker turnaround times than we were used to +when we played around directly with RPython. We wrote tiny example +interpreters in Prolog (of course not a complete Python interpreter). +Self-inspection is trivial in Prolog, and generating Prolog code at +runtime is very easy too. Moreover, many other issues are also easier +in Prolog: for example, all data structures are immutable "terms". +Other languages than Prolog would have worked, too, but it happens to be +one that we (Carl Friedrich, Michael Leuschel and myself) are familiar +with -- not to mention that it's basically a nice small dynamic +language.

                + +

                Of course, all this is closely related to what we want to do in PyPy. +The fundamental issues are the same. Indeed, in PyPy, the major goals +of the JIT are to remove, first, the overhead of allocating objects all +the time (e.g. integers), and second, the overhead of dynamic dispatch +(e.g. finding out that it's integers we are adding). The equivalent +goals in Prolog are, first, to avoid creating short-lived terms, and +second, to remove the overhead of dispatch (typically, the dispatching +to multiple clauses). If you are familiar with Prolog you can find more +details about this in the paper. So far we already played with many possible solutions +in the Prolog JIT, and the paper describes the most mature one; we have +more experimentation in mind. The main point here is that these are +mostly language-independent techniques (anything that works both in +Prolog and in RPython has to be language-independent, right? :-)

                + +

                In summary, besides the nice goal of speeding up Prolog, we are trying +to focus our Prolog JIT on the issues and goals that have equivalents in +the PyPy JIT generator. So in the end we are pretty convinced that it +will give us something that we can backport to PyPy -- good ideas about +what works and what doesn't, as well as some concrete algorithms.

                +
                +

                Comments

                +
                +
                +
                + + Shalabh wrote on 2008-06-30 21:00: +
                +
                +

                What is the reason you would back-port the Prolog implementation to RPython, and not make Prolog itself the standard language for implementing the JIT?

                +
                +
                +
                +
                + + Michael Foord wrote on 2008-06-30 21:47: +
                +
                +

                THat sounds like the great subject of a thesis for Carl. :-)

                Congratulations guys.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-06-30 22:46: +
                +
                +

                shalabh: because (hopefully) porting back to rpython is saner than porting all of our interpreter (including modules) to prolog.

                +
                +
                +
                +
                + + nekto0n wrote on 2008-07-01 00:22: +
                +
                +

                A bit unsual approach =)
                Hope it'll help...

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-01 00:37: +
                +
                +

                What about making PyPy useful?

                There's still a need for a python compiler, but so far, you can't run standard libraries (eg PyObjC) and you run slow that cPython. -- Even Javascript is faster than you (squirrelfish).

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-01 02:12: +
                +
                +

                One thing I've never quite understood: how will the JIT-generation transform interact with more traditional optimization schemes?

                Concrete example: say in a function I want to perform some algebraic reductions of math operations which will change a lot of the instructions. Since the JIT generation turns the interpreter into a JIT, presumably I have to write the optimization at the interpreter level.

                I can see how that could work for the simplest kind of optimizations (special cases should be specialized at runtime after they go green, if I understand the rainbow colour scheme.)

                I don't see yet how the more complex optimizations I'd write on static, fixed-type code will look in this context. IIUC at interpreter level I can only access the JIT's observations via tests like "if type(a) == FloatType" which should be filled after they're known-- but that's inside the function itself, and I don't see how to access that information from anything outside.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-07-01 16:31: +
                +
                +

                dsm: This is a two-level approach, corresponding to two levels of optimisations that are useful for dynamic languages like Python: the "high level" is the unboxing and dispatching removing that I describe in the post (which by itself can give something like a factor 50-100 speed-up in the best cases). Traditional "low level" optimisations can be performed on top of that, by optimising the generated code that comes out of the "high level" (and this could give another 2-4x speed-up, i.e. the same difference as between "gcc" and "gcc -O3").

                In this Prolog experiment we are only focusing on how to get the high level optimisations.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-02 00:20: +
                +
                +

                The references in the paper are not properly numbered -- any idea if it could be fixed?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-07-02 00:39: +
                +
                +

                Michel: Thanks for noticing, it should be fixed.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-02 09:34: +
                +
                +

                Could you possibly profit from generating a JIT compiler for Lua (www.lua.org) and compare it to Mike Pall's Lua-Jit (https://luajit.org/)?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-03 09:48: +
                +
                +

                While the paper was too difficult for me to understand fully, it was still an interesting read and I appreciate you posting it.

                +
                +
                +
                +
                + + Unknown wrote on 2008-09-26 02:04: +
                +
                +

                FYI: There is a project called Pyke which adds Prolog-like inferencing to Python. This integrates with Python allowing you to include Python code snippets in your rules.

                Don't know if this would be useful, but you can check it out at https://pyke.sourceforge.net.

                +
                +
                +
                +
                + + Unknown wrote on 2009-01-09 23:51: +
                +
                +

                Shalabh: It's also important to note 3 big benefits of implementing a language in the language itself, or a subset thereof ("turtles all the way down").

                (1) Debugging and testing tools for programs written in the language then (hopefully) also work for debugging and testing the language implementation with minimal (or no) modification. This also HUGELY lowers the bar for ordinary users of the language to find and fix implementation bugs. This isn't a fault of Prolog, but 99.99% of Python users won't touch a Prolog debugger with a 10-foot pole.

                (2) The largest pool of people most interested in improving the language is presumably the expert heavy users of the language. Forcing them to learn a new language and/or implement the language in a language outside their expertise is a large disadvatage.

                (3) The difference between language builtins and user code is reduced. Also, it forces certain powerful constructs to (at times) be exposed in the language when they might otherwise only be exposed in the implementation language. Also, with "turtles all the way down", performance improvements in the language itself also often apply to the language builtins, which increases the benefit of improvements, which is important in the cost/benefit analysis for undertaking the performance improvements in the first place. Having "turtles all the way down" make some optimizations worthwhile that otherwise would be too much trouble to implement.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/list-comprehension-implementation-5289956690288817225.html b/posts/2008/06/list-comprehension-implementation-5289956690288817225.html new file mode 100644 index 000000000..61c36ba00 --- /dev/null +++ b/posts/2008/06/list-comprehension-implementation-5289956690288817225.html @@ -0,0 +1,457 @@ + + + + + +List comprehension implementation details | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                List comprehension implementation details

                + + + +
                +

                List comprehensions are a nice feature in Python. They are, however, just +syntactic sugar for for loops. E.g. the following list comprehension:

                +
                +def f(l):
                +    return [i ** 2 for i in l if i % 3 == 0]
                +
                +

                is sugar for the following for loop:

                +
                +def f(l):
                +    result = []
                +    for i in l:
                +        if i % 3 == 0:
                +            result.append(i ** 2)
                +    return result
                +
                +

                The interesting bit about this is that list comprehensions are actually +implemented in almost exactly this way. If one disassembles the two functions +above one gets sort of similar bytecode for both (apart from some details, like +the fact that the append in the list comprehension is done with a special +LIST_APPEND bytecode).

                +

                Now, when doing this sort of expansion there are some classical problems: what +name should the intermediate list get that is being built? (I said classical +because this is indeed one of the problems of many macro systems). What CPython +does is give the list the name _[1] (and _[2]... with nested list +comprehensions). You can observe this behaviour with the following code:

                +
                +$ python
                +Python 2.5.2 (r252:60911, Apr 21 2008, 11:12:42)
                +[GCC 4.2.3 (Ubuntu 4.2.3-2ubuntu7)] on linux2
                +Type "help", "copyright", "credits" or "license" for more information.
                +>>> [dir() for i in [0]][0]
                +['_[1]', '__builtins__', '__doc__', '__name__', 'i']
                +>>> [[dir() for i in [0]][0] for j in [0]][0]
                +['_[1]', '_[2]', '__builtins__', '__doc__', '__name__', 'i', 'j']
                +
                +

                That is a sort of nice decision, since you can not reach that name by any +"normal" means. Of course you can confuse yourself in funny ways if you want:

                +
                +>>> [locals()['_[1]'].extend([i, i + 1]) for i in range(10)]
                +[0, 1, None, 1, 2, None, 2, 3, None, 3, 4, None, 4, 5, None, 5, 6, None, 6, 7, None, 7, 8, None, 8, 9, None, 9, 10, None]
                +
                +

                Now to the real reason why I am writing this blog post. PyPy's Python +interpreter implements list comprehensions in more or less exactly the same way, +with one tiny difference: the name of the variable:

                +
                +$ pypy-c-53594-generation-allworking
                +Python 2.4.1 (pypy 1.0.0 build 53594) on linux2
                +Type "help", "copyright", "credits" or "license" for more information.
                +``the globe is our pony, the cosmos our real horse''
                +>>>> [dir() for i in [0]][0]
                +['$list0', '__builtins__', '__doc__', '__name__', 'i']
                +
                + +

                Now, that shouldn't really matter for anybody, should it? Turns out it does. The +following way too clever code is apparently used a lot:

                +
                +__all__ = [__name for __name in locals().keys() if not __name.startswith('_')
                +               or __name == '_']
                +
                +

                In PyPy this will give you a "$list0" in __all__, which will prevent the +import of that module :-(. I guess I need to change the name to match CPython's.

                +

                Lesson learned: no detail is obscure enough to not have some code depending +on it. Mostly problems on this level of obscurity are the things we are fixing +in PyPy at the moment.

                +
                +

                Comments

                +
                +
                +
                + + Brandon Rhodes wrote on 2008-06-10 03:09: +
                +
                +

                In fairness, the clever code does not depend on the name looking as it actually does in CPython; the clever code merely expects that variables auto-created by Python internals will begin with an underscore. Which is far more reasonable than actually expecting the specific name "_[1]" (and, wow, you're right, that does look weird; you've shown me something I've never seen before about Python!) to turn up in the variable list.

                +
                +
                +
                +
                + + Unknown wrote on 2008-06-10 06:38: +
                +
                +

                Actually, that piece of code is looking to export only public identifiers, right? It's trying to exclude things prefixed with an underscore that are in the file scope.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-10 07:32: +
                +
                +

                I would have said "Lesson learned: when MIT hackers in the 1960's come up with some funny thing called GENSYM, it's not just because they're weird; it really does serve a purpose". But then I'm an asshole Lisp hacker. :-)

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-10 09:50: +
                +
                +

                anonymous: Using gensym for getting the symbol wouldn't have helped in this case at all. The gensymmed symbol would still have shown up in the locals() dictionary. So depending on whether the gensym implementation returns symbols that start with an underscore or not, the same bug would have occurred.

                +
                +
                +
                +
                + + TuringTest wrote on 2008-06-10 10:28: +
                +
                +

                Other languages have the capability/design/philosophy to make such implementation details totally unobservable.

                Haskell has list comprehensions which expand into normal code. These cannot expose implementation details or temporary names.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-10 13:26: +
                +
                +

                turingtest: I agree that that would be preferable, but it's sort of hard with the current interpreter design. Also, it's a pragmatic implementation in that the interpreter didn't have to change at all to add the list comps.

                +
                +
                +
                +
                + + arkanes wrote on 2008-06-10 15:48: +
                +
                +

                The code's not overly clever, it's ridiculous, because it exactly duplicates the effects of not having __all__ at all. From foo import * already won't import names prefaced with an underscore. Also from the google code search it looks like it's mostly used in Paste, most of the other hits are false positives.

                The "from foo import *" case (without __all__ defined) is a good enough reason to match the cpython naming, though, the useless code in Paste notwithstanding.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-10 17:49: +
                +
                +

                carl: something like GENSYM would still help, since the symbol generated is not accessible from any package.

                That's the difference between gensym and mktemp. However, I don't believe that python has the concept of uninterned symbols (someone who knows more about python could correct me).

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-11 12:17: +
                +
                +

                arkanes: no, the "from foo import *" case isn't really changed by the different choice of symbols because the new variable is really only visible within the list comprehension and deleted afterwards. It doesn't leak (as opposed to the iteration variable).

                +
                +
                +
                +
                + + Unknown wrote on 2008-06-12 01:43: +
                +
                +

                arkanes: This is not the same as not having __all__ defined. __all__ would skip the function _() which is used to mark and translate strings with gettext. In other words, it is emulating the default no __all__ behavior and adding in _()

                Carl: doesn't the "$list0" get imported without the all? If not what keeps it from causing a problem normally? Could you not just delete the $list0 variable after assigning it to the LHS?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-12 11:05: +
                +
                +

                chris: yes, deleting this variable is exactly what PyPy does (and CPython as well). That's what I was trying to say in my last post.

                The bug with the __all__ only occurs because locals is called within the list comprehension. After the list comprehension is done there is no problem.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/next-sprint-vilniuspost-europython-10-3844544842675903586.html b/posts/2008/06/next-sprint-vilniuspost-europython-10-3844544842675903586.html new file mode 100644 index 000000000..e26c49c98 --- /dev/null +++ b/posts/2008/06/next-sprint-vilniuspost-europython-10-3844544842675903586.html @@ -0,0 +1,303 @@ + + + + + +Next sprint: Vilnius/Post EuroPython, 10-12th of July | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Next sprint: Vilnius/Post EuroPython, 10-12th of July

                + + + +
                +

                As happened in the last years, there will be a PyPy sprint just after +EuroPython. The sprint will take place in the same hotel as the +conference, from 10th to 12th of July.

                +

                This is a fully public sprint: newcomers are welcome, and on the first +day we will have a tutorial session for those new to PyPy development.

                +

                Some of the topics we would like to work on:

                +
                +
                  +
                • try out Python programs and fix them or fix PyPy or fix performance bottlenecks
                • +
                • some JIT improvement work
                • +
                • port the stackless transform to ootypesystem
                • +
                +
                +

                Of course, other topics are also welcome.

                +

                For more information, see the full announcement.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/pdb-and-rlcompleterng-2414105295687348881.html b/posts/2008/06/pdb-and-rlcompleterng-2414105295687348881.html new file mode 100644 index 000000000..c17f9e5b7 --- /dev/null +++ b/posts/2008/06/pdb-and-rlcompleterng-2414105295687348881.html @@ -0,0 +1,397 @@ + + + + + +Pdb++ and rlcompleter_ng | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Pdb++ and rlcompleter_ng

                + + + +
                +

                When hacking on PyPy, I spend a lot of time inside pdb; thus, I tried +to create a more comfortable environment where I can pass my nights +:-).

                +

                As a result, I wrote two modules:

                +
                +
                  +
                • +pdb.py, which extends the default behaviour of pdb, by adding +some commands and some fancy features such as syntax highlight and +powerful tab completion; pdb.py is meant to be placed somewhere in +your PYTHONPATH, in order to override the default version of pdb.py +shipped with the stdlib;
                • +
                • +rlcompleter_ng.py, whose most important feature is the ability +to show coloured completions depending on the type of the objects.
                • +
                +
                +

                To find more information about those modules and how to install them, +have a look at their docstrings.

                +

                It's important to underline that these modules are not PyPy specific, +and they work perfectly also on top of CPython.

                + +
                +

                Comments

                +
                +
                +
                + + Brodie Rao wrote on 2008-06-22 20:57: +
                +
                +

                That's pretty impressive, but I think having to modify readline itself in order to do this is a little excessive. readline's completion capabilities are pretty limited. I wonder if there are any better alternatives that could be used with Python.

                I have something similar set up for my Python prompt: https://bitheap.org/hg/dotfiles/file/tip/.pythonrc.py -- it allows completion and indentation, it persists command history with readline, and it prints documentation if you try to evaluate certain objects like functions, classes, and methods. It also pretty-prints output, but I'm still trying to tweak it so it's aware of the terminal width.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-06-23 12:32: +
                +
                +

                yes, I agree that having to modify readline is not too nice. I tried hard to avoid this but with bad luck :-/.

                I suppose I could try to reimplement readline in Python, but I think it would be too much work; if you are aware of something already done, please let me know :-).

                +
                +
                +
                +
                + + Pachi wrote on 2008-06-23 12:58: +
                +
                +

                Would this work be suitable for inclusion in the standard pdb module? That would be awesome.
                Thanks!

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-06-23 16:25: +
                +
                +

                There is readline implementation on top of pyrepl in pypy already :) PyPy by default does not use readline, but just uses this.

                +
                +
                +
                +
                + + Paul wrote on 2008-06-24 22:00: +
                +
                +

                Nice job antonio. I'd clean the code up, conform to new-style classes and proper MRO handling. I'd also think about refactoring some of those names and find something better suited. Overall, awesome job man.

                +
                +
                +
                +
                + + Unknown wrote on 2008-06-26 10:48: +
                +
                +

                This looks great. You've taken a step further than my own attempts here:
                https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/498182

                Two small comments though: it crashes on startup without the right config in ~/.pdbrc.py and once I got it started I see things like this when completing with tab:

                ^[[000;00m^[[00mtest^[[00m

                but syntax highlighting seems to work perfectly. Thanks!

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-06-27 11:57: +
                +
                +

                @Stephen: as described in the docs of rlcompleter_ng, to use colorized completion you need to use a patched version of readline, there is no chance to get it working without that.

                Could you describe in more detail what problem you encountered with ~/.pdbrc.py, so that I can fix it, please?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-10 20:26: +
                +
                +

                Antonio - I created a minor patch for rlcompleter_ng.py which will allow it to run on both Python 2 and 3.

                https://mikewatkins.ca/2008/12/10/colorized-interpreter/

                +
                +
                +
                +
                + + cool-RR wrote on 2011-05-05 18:39: +
                +
                +

                I was disappointed that pdb++ doesn't work on Windows. Apparently it uses the `termios` module which is not available on Windows.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/pypy-code-swarm-7038411918926116477.html b/posts/2008/06/pypy-code-swarm-7038411918926116477.html new file mode 100644 index 000000000..a82ed58b7 --- /dev/null +++ b/posts/2008/06/pypy-code-swarm-7038411918926116477.html @@ -0,0 +1,357 @@ + + + + + +PyPy code swarm | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy code swarm

                + + + +
                +

                Following the great success of code_swarm, I recently produced a +video that shows the commit history of the PyPy project.

                +

                The video shows the commits under the dist/ and branch/ +directories, which is where most of the development happens.

                +

                In the first part of the video, you can see clearly our sprint based +approach: the video starts in February 2003, when the first PyPy +sprint took place in Hildesheim: after a lot of initial activity, few +commits happened in the next two months, until the second PyPy sprint, +which took place in Gothenburg in late May 2003; around the minute +0:15, you can see the high commit rate due to the sprint.

                +

                The next two years follow more or less the same pattern: very high +activity during sprints, followed by long pauses between them; the +most interesting breaking point is located around the minute 01:55; +it's January 2005, and when the EU project starts, the number of +commits just explodes, as well as the number of people involved.

                +

                I also particularly appreciated minute 03:08 aka March 22, 2006: it's +the date of my first commit to dist/, and my nickname magically +appears; but of course I'm biased :-).

                +

                The soundtrack is NIN - Ghosts IV - 34: thanks to xoraxax for +having added the music and uploaded the video.

                +
                PyPy Codeswarm from solse@trashymail.com on Vimeo. +
                +

                Comments

                +
                +
                +
                + + nekto0n wrote on 2008-06-27 13:49: +
                +
                +

                Niiice =)

                +
                +
                +
                +
                + + akuhn wrote on 2008-06-28 11:33: +
                +
                +

                Question: in case of pair programming, whose name is shown? both names?

                +
                +
                +
                +
                + + Michael Hudson-Doyle wrote on 2008-06-29 23:50: +
                +
                +

                Cool. There was less of a drop off after the eu project ended than I expected!

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-30 11:29: +
                +
                +

                It was cool to see the sprint effects as well

                Cheers

                Bea

                +
                +
                +
                +
                + + Anonymous wrote on 2008-09-06 01:49: +
                +
                +

                The codeswarm seems to have moved:

                https://www.vimeo.com/1241231

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/pypy-improvements-5272963843122158791.html b/posts/2008/06/pypy-improvements-5272963843122158791.html new file mode 100644 index 000000000..8dcdcded9 --- /dev/null +++ b/posts/2008/06/pypy-improvements-5272963843122158791.html @@ -0,0 +1,396 @@ + + + + + +Funding of some recent progress by Google's Open Source Programs | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Funding of some recent progress by Google's Open Source Programs

                + + + +
                +

                As readers of this blog already know, PyPy development has +recently focused on getting the code base to a more usable state. One +of the most important parts of this work was creating an +implementation of the ctypes module for PyPy, which +provides a realistic way to interface with external libraries. The +module is now fairly complete (if somewhat slow), and has generated a +great deal of community interest. One of the main reasons this work +progressed so well was that we received funding from Google's Open +Source Programs Office. This is +really fantastic for us, and we cannot thank Google and Guido enough for helping PyPy progress +more rapidly than we could have with volunteer-only time!

                +

                This funding opportunity arose from the PyPy US road trip at the end +of last year, which included a visit to Google. You +can check out the video +of the talk we gave during our visit. We wrapped up our day with +discussions about the possibility of Google funding some PyPy work and +soon after we were at work on the proposal for improvements we'd +submitted.

                +

                One nice side-effect of the funding is indeed that we can use some of +the money for funding travels of contributors to our sprint meetings. +The next scheduled Google funding proposal also aims at making our +Python interpreter more usable and compliant with CPython. This will be done by trying to +fully run Django on top of PyPy. With +more efforts like this one we're hoping that PyPy can start to be used +as a CPython replacement before the end of 2008.

                +

                Many thanks to the teams at merlinux and Open End for making this development possible, including +Carl Friedrich Bolz, Antonio Cuni, Holger Krekel, Maciek Fijalkowski +at merlinux, Samuele Pedroni and yours truly at Open End.

                +

                We always love to hear feedback from the community, and you can get +the latest word on our development and let us know your thoughts here in the comments.

                +

                Bea Düring, Open End AB

                + +

                PS: Thanks Carl Friedrich Bolz for drafting this post.

                +
                +

                Comments

                +
                +
                +
                + + Bill Mill wrote on 2008-06-26 14:26: +
                +
                +

                congratulations! that's awesome.

                +
                +
                +
                +
                + + Christopher Armstrong wrote on 2008-06-26 14:52: +
                +
                +

                Congratulations, guys!

                +
                +
                +
                +
                + + nekto0n wrote on 2008-06-26 14:55: +
                +
                +

                That's great! I like that this project is getting bigger, growing faster :)
                I wish I could help, but don't know where to start :-[

                +
                +
                +
                +
                + + Brandon Corfman wrote on 2008-06-26 15:18: +
                +
                +

                I've been hard on Guido in the past for not throwing more support behind PyPy, and I'm very glad now to hear that Guido (and Google) are demonstrating its importance. Thanks all.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-26 16:06: +
                +
                +

                Wow, I am actually more excited by hearing that pypy will be a partial cpython replacement this year than by the google money. Pypy is the most interesting project going on right now in the python world.

                +
                +
                +
                +
                + + Unknown wrote on 2008-06-26 16:27: +
                +
                +

                Wow, this should be quite interesting.

                JT
                https://www.Ultimate-Anonymity.com

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-26 20:50: +
                +
                +

                Congrats. I'm very glad to keep hearing about efforts to make PyPy usable with real-world applications and frameworks. The PyPy project is starting to send out positive signals, and this is something I've been waiting for.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-26 22:52: +
                +
                +

                "With more efforts like this one we're hoping that PyPy can start to be used as a CPython replacement before the end of 2008."

                Out of curiosity, are there good reasons for anyone to want to do that?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/running-nevow-on-top-of-pypy-58891137802412513.html b/posts/2008/06/running-nevow-on-top-of-pypy-58891137802412513.html new file mode 100644 index 000000000..b5ebf2c71 --- /dev/null +++ b/posts/2008/06/running-nevow-on-top-of-pypy-58891137802412513.html @@ -0,0 +1,316 @@ + + + + + +Running Nevow on top of PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Running Nevow on top of PyPy

                + + + +
                +

                Another episode of the "Running Real Applications on Top of PyPy" series: +

                +Today's topic: Divmod's Nevow. Nevow (pronounced as the French "nouveau", or "noo-voh") is a web application construction kit written in Python. Which means it's just another web framework, but this time built on top of Twisted. +While, due to some small problems, we're not yet able to pass the full Twisted test suite on top of pypy-c, Nevow seems to be simple enough to work perfectly (959 out of 960 unit tests passing, with the last one recognized as pointless and about to be deleted). Also, thanks to +exarkun, Nevow now no longer relies on ugly details like refcounting. +

                +As usual, translate pypy using: +

                +
                +translate.py --gc=hybrid --thread targetpypystandalone --faassen --allworkingmodules --oldstyle +
                +
                +Of course, obligatory to the series, screenshot:
                +This is Nevow's own test suite. +

                +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + + Donovan Preston wrote on 2008-06-26 21:53: +
                +
                +

                Awesome!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/06/running-pylons-on-top-of-pypy-3234492105090025733.html b/posts/2008/06/running-pylons-on-top-of-pypy-3234492105090025733.html new file mode 100644 index 000000000..a976552a6 --- /dev/null +++ b/posts/2008/06/running-pylons-on-top-of-pypy-3234492105090025733.html @@ -0,0 +1,413 @@ + + + + + +Running Pylons on top of PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Running Pylons on top of PyPy

                + + + +
                +

                The next episode of the "Running Real Applications on Top of PyPy" series:

                +Yesterday, we spent some time with Philip Jenvey on tweaking Pylons and PyPy to cooperate with each other. While doing this we found some pretty obscure details, but in general things went well. +

                +After resolving some issues, we can now run all (72) Pylons tests on +top of pypy-c compiled with the following command: +

                +
                +translate.py --gc=hybrid --thread targetpypystandalone --faassen --allworkingmodules --oldstyle +
                +
                +and run some example application. Here is the obligatory screenshot (of course +it might be fake, as usual with screenshots). Note: I broke application on purpose to showcase cool debugger, default screen is just boring:
                +Please note that we run the example application without DB access, since +we need some more work to get SQLAlchemy to run on top of pypy-c together with +pysqlite-ctypes. Just one example of an obscure detail that sqlalchemy is +relying on in the test suite: +

                + class A(object):
                +   locals()[42] = 98 +
                +

                Update: This is only about new-style classes. +

                +This works on CPython and doesn't on PyPy.

                +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-06-11 00:46: +
                +
                +

                Very good to see this work! This is a good thing to be trying and hearing it makes me happy.

                We're busy working on making Zope 3 run on Jython, which should make some of our C level dependencies optional. These make a port to PyPy harder as well. Zope 3 libraries have umpteen thousands of tests that can be run, so that should give one some coverage. The libraries come packaged separately too.

                The trickiest part would be those bits that depend on the ZODB. Porting the ZODB to PyPy should allow new possibilities, but it'll be hard too, I imagine.

                +
                +
                +
                +
                + + holger krekel wrote on 2008-06-11 12:31: +
                +
                +

                Hi Martijn,

                in fact having zope3 work with pypy would be very nice. i discussed a bit with Phillip and he suggested to first get zope.interface and zope.component to work, then zope.proxy/zope.security. IIRC my first try with zope.interface yielded 3 failures out of 231 tests. I had to hack the test runner a bit to not rely on GC details - i guess that your work for Jython might imply that as well. What is the best way to follow your Jython work, btw?

                best & cheers,
                holger

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-11 12:38: +
                +
                +

                The Jython project is a summer of code project. Georgy Berdyshev is the student and is sending messages to jython-dev.

                Here was a recent status report:

                https://sourceforge.net/mailarchive/forum.php?thread_name=ee8eb53d0806082009g5aec43dbn3da1f35b751cba70%40mail.gmail.com&forum_name=jython-dev

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-11 12:48: +
                +
                +

                I see that the hyperlink to Georgy's report just now got eaten by the comment software. Here it is again, hopefully working this time.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-11 13:38: +
                +
                +

                Georgy Berdyshev is lurking in the #pypy channel (gberdyshev or similar), FWIW.

                +
                +
                +
                +
                + + mike bayer wrote on 2008-06-11 21:50: +
                +
                +

                Let's see who entered that line:

                4361 pje # This proves SA can handle a class with non-string dict keys
                4361 pje locals()[42] = 99 # Don't remove this line!


                pje ? Yes. That PJE. Complete with "don't remove this!"....we'll have to see what mr. guru was up to with that one. This test is also only present in the 0.5 branch which hasn't had alpha releases yet.

                Would love to hear some other examples of "obscure details" the test suite is relying upon...my guess would be extremely few or none besides this one example.

                +
                +
                +
                +
                + + PJE wrote on 2008-06-12 01:10: +
                +
                +

                It tests that SQLAlchemy isn't depending on class dictionaries containing only string keys.

                Unfortunately, this makes the test then depend on the ability to have non-string keys in the class dictionary. ;-)

                The test is to ensure that SQLAlchemy will be able to map objects whose *classes* have AddOns defined.

                By the way, as of PEP 3115, the locals() of a class can be an arbitrary object, so making compile-time assumptions about what *can't* be done with a class' locals() is probably not a good idea.

                Also, as of every existing version of Python>=2.2, a metaclass may add non-dictionary keys to the class dictionary during class.__new__. So, it has never been a valid assumption that class __dict__ keys *must* be strings. If PyPy is relying on that, it is already broken, IMO.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-06-12 09:57: +
                +
                +

                PJE: What you say is mostly beside the point. PyPy has no problem at all with non-string keys in (old-style) class dicts. The point is more that locals() cannot be used to assign things to this dictionary, see the docs:

                "locals()
                Update and return a dictionary representing the current local symbol table. Warning: The contents of this dictionary should not be modified; changes may not affect the values of local variables used by the interpreter."

                +
                +
                +
                +
                + + PJE wrote on 2008-06-12 11:39: +
                +
                +

                Well, if you plan on supporting, say, Zope or Twisted, you'll need to support modifying class-body frame locals.

                There really isn't any point to optimizing them, not only due to PEP 3115, but also due to pre-3115 metaclasses. (And just the fact that most programs don't execute a lot of class suites in tight loops...)

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-12 17:34: +
                +
                +

                Zope does things like:

                frame = sys._getframe(1)
                frame.f_locals['foo'] = bar

                It does this to make zope.interface.implements() work, among other things. This allows you to do the following:

                # IFoo is actually an instance, not a
                # class
                class IFoo(zope.interface.Interface):
                  pass

                class Myclass:
                  # stuffs information in the class
                  zope.interface.implements(IFoo)

                The martian library (which Grok uses) actually generates this into its directive construct.

                Some of this stuff could become class decorators in the future, I imagine, but we're stuck supporting this feature for the foreseeable future as well.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-06-12 17:40: +
                +
                +

                I didn't mean "generates this", but "generalizes this". I think PJE's PEAK library also has stuff for this ("class advisors").

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.html b/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.html new file mode 100644 index 000000000..58693c690 --- /dev/null +++ b/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.html @@ -0,0 +1,349 @@ + + + + + +EP2008: PyPy meets Jython | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                EP2008: PyPy meets Jython

                + + + +
                +

                One of the great events at EuroPython 2008 were our chats and meetings with the Jython and Sun people. The Jython people recently are pushing into releasing Python version 2.5 and they currently pursue many interesting sub projects. Coincidentally, PyPy also has tons of interesting areas and results :) So we eventually got into brainstorming a number of possible technical collab ideas. Further below is a first list as i wrote it down from our 10 people PyPy / Jython 30 minute close up meeting yesterday. + +It felt great to be able to talk to the Jython people this way - kudos to Sun for their clear commitments and open ways to go about things! I sense a genuine interest in fair collaboration with non-java developer communities. Seems like they are serious about not focusing on "Java this", "Java that" anymore but rather focus on the JVM platform. Good! And about language +independent interest in ambitious technology. Even Better! I am eager to see how things go from here. + +So here is the list of technical collab ideas: +

                +
                  +
                • ctypes - try to create _rawffi module in Java for Jython, which will enable Jython to reuse our existing ctypes implementation (and have PyPy use the Jython-rawffi for its own for PyPy.JVM)
                • +
                • generally see to share work / (continue) collaborate regarding extension modules
                • +
                • Jython/PyPy (and eventually IronPython): document known differences to CPython, maybe in a PEP
                • +
                • Python Interpreter for Jython (in order to run CPython's .pyc files): re-use pypy's bytecode evaluator, implement a "Jython object space".
                • +
                • re-use rpython-extension modules for jython (e.g. SRE), by compiling them to Java and reusing as a native library.
                • +
                • collaborate on testing framework / benchmarking, have a common site to show test results
                • +
                • make py.test compatible with jython
                • +
                • come up with a set of "pure Python language" tests, which would gather and refactor tests from CPython, PyPy and Jython.
                • +
                • look into using java types / jython approaches for implementing free threading.
                • +
                • share knowledge regarding JIT / psyco +
                • +
                If you have any more ideas, comments or would like to join efforts, let us know! + +Cheers and thanks to Ted Leung, Frank Wierzbicki, Jim Baker and Tobias Ivarsson from Sun and Jython fame respectively, + +Holger +
                +

                Comments

                +
                +
                +
                + + cartman wrote on 2008-07-10 09:50: +
                +
                +

                This is great news, a common VM for all Python implementations would be real cool :)

                +
                +
                +
                +
                + + Michael Foord wrote on 2008-07-10 11:44: +
                +
                +

                It would be great to get the IronPython folks involved as well.

                For example, .NET has an FFI and with the right effort could take advantage of ctypes extensions as well.

                +
                +
                +
                +
                + + holger krekel wrote on 2008-07-10 13:52: +
                +
                +

                fuzzyman: do you have any particular person/group in mind?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/07/europython-2008-pypy-talks-and-sprint-2255727845041197411.html b/posts/2008/07/europython-2008-pypy-talks-and-sprint-2255727845041197411.html new file mode 100644 index 000000000..9de60f0d1 --- /dev/null +++ b/posts/2008/07/europython-2008-pypy-talks-and-sprint-2255727845041197411.html @@ -0,0 +1,355 @@ + + + + + +Europython 2008 PyPy talks and sprint sum up | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Europython 2008 PyPy talks and sprint sum up

                + + + +
                +

                The EuroPython 2008 conference and sprints have finished - it certainly was +a very eventful and successful conference for PyPy. And many very interesting +non-PyPy talks as well. PyPy presentations are available online: PyPy status talk +PyPy for the rest of us, PyPy behind the scenes. Armin and Maciej also did a well-attended +talk about PyPy's garbage collection, but that was quite interactive, no slides. +

                +The talks were all well visited and we got good questions. However, we still +need to work on sorting out the "PyPy technology cloud" and how to present +it to different audiences. Anyway, we are happy to hear feedback or questions +about the talks! +

                +After the conference there was a three-day PyPy sprint. Despite +the fact that most PyPy core developers were zombies, +we made good progress. Particularly our newcomers did very well. +Here are some results: +

                +
                  +
                • itertools rewritten in RPython for performance by Jakub + Gustak and Andrew Durdin
                • + +
                • a new ctypes based dbm and hashlib module, both by Gasper Zejn + with support from Henrik Vendelbo, they also got ctypes to nicely work on OSX. (sorry for lack of proper letters in names :)
                • + +
                • implement builtin function call profiling by Stephan Diehl, Antonio and Armin.
                • + +
                • running + Pinax on top of pypy-c, by Henrik, Holger, Gasper.
                • + +
                • Jim Baker started a _rawffi.py for Jython using JNA aiming + to provide support to run PyPy's ctypes on top of Jython. + When Jython gets this to run, PyPy's JVM backend should be + able to use it. Talk about Code Reuse :)
                • + +
                • oldstyle classes are now the default, this makes + PyPy mimic very closely cpython's 2.5 object model.
                • + +
                • Andrew started a port of the Malbolge + interpreter written in Python to RPython (obviously the only missing + link for PyPy to take over the world).
                • + +
                • various cleanups (a new option "--lonepycfiles" helps with + saner imports, remove int-float comparison shortcuts, ...)
                • +
                +At the end of the sprint we also discussed initial plans for a 1.1 release which we'd like to make happen this year. So we are generally looking forward to a busy rest of 2008 and luckily this starts by many of us taking a good vacation first :)

                + +Cheers,
                +fijal & holger +
                +

                Comments

                +
                +
                +
                + + Armin Rigo wrote on 2008-07-16 00:20: +
                +
                +

                The option is not --lonepycfiles but --objspace-lonepycfiles, and using it makes imports *less* sane.

                +
                +
                +
                +
                + + holger krekel wrote on 2008-07-16 08:44: +
                +
                +

                oh, right. I meant to say that with the introduction (not the enabling) of the option imports are saner - in that pypy now by default ignores pyc files if there is no ".py" file. thanks for the attention.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/07/finding-bugs-in-pypy-with-fuz-7503072572107631526.html b/posts/2008/07/finding-bugs-in-pypy-with-fuz-7503072572107631526.html new file mode 100644 index 000000000..ff28a31f1 --- /dev/null +++ b/posts/2008/07/finding-bugs-in-pypy-with-fuz-7503072572107631526.html @@ -0,0 +1,360 @@ + + + + + +Finding Bugs in PyPy with a Fuzzer | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Finding Bugs in PyPy with a Fuzzer

                + + + +
                +

                Last week I played a bit with Fusil, which is a fuzzing framework. The idea is +to feed the interpreter code that calls the functions of a module with random values +of various types as arguments in the hope that one hits an unchecked case. This is +done until a problem is hit, the most common problem being a segfault. Victor Stinner, +the author of Fusil, is a regular in the PyPy IRC channel and thankfully helped me +getting started with Fusil. I used his project description for CPython as a starting +point and tweaked it a bit. Reason is that PyPy is harder to segfault and so +I tweaked Fusil to also count uncaught RPython-level exceptions as such a problem. +(RPython has full exception support, and if an RPython-exception escapes to the top +level, the Python interpreter aborts. One should not be able to exploit this, +but for a user it is bad enough, because such exceptions cannot be caught from +Python code.)

                +

                Using Fusil I found a number of cases where such exceptions happened (in some +pickle support-code, in the expat parser, in the os and in the termios +module) and also one or two segfaults (in the parser module, of all places). +I fixed all these problems so that by +now the fuzzer just runs for a very long time and only finds things that take +too long (so they count as a way to do a DoS attack) like +pow(12355123123L, 12351512123121L) or round(1, 1000000000) (the latter +should probably be fixed). This probably just means that the fuzzer is not good +enough, because there are certainly segfaults left in PyPy. However, the fact +that it is rather hard to find them validates our approach of using a +high-level memory-managed language for our interpreter. Victor tells me that it +is rather easy to find segfaults in CPython this way, he already found quite +some problems.

                +
                +

                Comments

                +
                +
                +
                + + Marius Gedminas wrote on 2008-07-13 20:42: +
                +
                +

                Nice post!

                I especially like your certainty that PyPy has segfaults left in it. :-)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-07-13 22:02: +
                +
                +

                What? Segfaults in PyPy? Shouldn't have any left by now :-)

                +
                +
                +
                +
                + + Armin Rigo wrote on 2008-07-13 22:04: +
                +
                +

                That previous comment was from me, accidentally logged in as Maciej, sorry. As usual, in PyPy confusion comes for free.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-07-14 07:17: +
                +
                +

                heh :) I was a bit surprised to see my comment which I did not write. Anyway, I agree with it :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2009-02-07 06:37: +
                +
                +

                You said you will love me wow gold the whole life, but WoW Gold you marry her. You said you will wow power leveling,come to marry me, but this will not be carried out forever.WoW Gold I am trying my best to forget you and do not love you anymore. wow leveling But I failed and I still love you. Maybe wow leveling she needs you more compared wow leveling with me. So I tell you that world of warcraft power leveling you should love world of warcraft power leveling her and take good world of warcraft leveling care of her. You said I was so kind.world of warcraft leveling Yes, because I love you,world of warcraft leveling I hope you will be power leveling happy forever.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/07/pypy-at-europython-2008-1488914968455397674.html b/posts/2008/07/pypy-at-europython-2008-1488914968455397674.html new file mode 100644 index 000000000..c17c65c89 --- /dev/null +++ b/posts/2008/07/pypy-at-europython-2008-1488914968455397674.html @@ -0,0 +1,348 @@ + + + + + +PyPy at the EuroPython 2008 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy at the EuroPython 2008

                + + + +
                +

                Greetings from Vilnius, Lithuania. There were already +two pypy talks, one performed by Jacob Hallen +PyPy for the rest of us and second +by Maciej Fijalkowski PyPy status talk. The thing that +we forgot to mention is that the PyPy sandboxing feature +can also easily limit CPU and RAM usage as well as +any other possible resource (like network transfer). +For anyone who would like to join, there is a PyPy +sprint after the conference.

                +Cheers,
                +arigo & fijal

                +
                +

                Comments

                +
                +
                +
                + + Jonathan Ellis wrote on 2008-07-07 18:13: +
                +
                +

                Can you post PDFs of those slides? The text is not rendering for me in NeoOffice.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-07 22:08: +
                +
                +

                Is Maciej using some secret cool reST presentation tool?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-07-08 03:52: +
                +
                +

                You can convert to PDF online.

                This link should work for the next 24 hours. You can regenerate it on the same site after that.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-07-08 06:19: +
                +
                +

                I checked in pdf version. I use rst2beamer + hacks :) Ask Antonio Cuni for details.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-07-08 08:20: +
                +
                +

                yes, we are using rst2beamer:
                https://www.agapow.net/programming/python/rst2beamer

                to have some hints how to use it, look at this script I used to generate my pycon-italy talk:
                https://codespeak.net/svn/pypy/extradoc/talk/pycon-italy-2008/makepdf

                I also wrote some rst macros that allows you to put some paragraphs in those nice beamer's exampleblock and alertblock:

                https://codespeak.net/svn/pypy/extradoc/talk/pycon-italy-2008/beamerdefs.txt

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/07/pypys-python-runs-pinax-django-1265543049596913506.html b/posts/2008/07/pypys-python-runs-pinax-django-1265543049596913506.html new file mode 100644 index 000000000..cc6300e27 --- /dev/null +++ b/posts/2008/07/pypys-python-runs-pinax-django-1265543049596913506.html @@ -0,0 +1,290 @@ + + + + + +PyPy's Python runs Pinax / Django | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy's Python runs Pinax / Django

                + + + +
                +

                During the EP2008 sprint we got Pinax running on top of PyPy. At our play1 server we have it running on top of pypy-c. Not that you'll notice many differences to the original site but that's the point, isn't it? ... Well, in fact i am too lazy to customize our play1 version now - i rather spent a nice evening with the other sprint guys :) + +Pinax integrates numerous reusable Django apps to take care of the things that many sites have in common. Many thanks particularly to Henrik Vendelbo who sorted out various Pinax and PyPy issues, and wrote up a nice DjangoAndPyPy wiki page describing the installation process. + +greetings from Vilnius (Lithuania), Holger

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/08/new-translation-option-opt-7737733390438084418.html b/posts/2008/08/new-translation-option-opt-7737733390438084418.html new file mode 100644 index 000000000..5acbafe73 --- /dev/null +++ b/posts/2008/08/new-translation-option-opt-7737733390438084418.html @@ -0,0 +1,347 @@ + + + + + +New translation option: --opt | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                New translation option: --opt

                + + + +
                +

                Hi all,

                + +

                A few command-line options for translate.py have changed. +Most interesting is that optimization levels are selected with +the option --opt, or -O for short. This replaces --allopts, +which was also called --faassen in reference to a person who +is actually not involved in PyPy (so that was a bit of a +strange joke). Also, --allworkingmodules is the default +nowadays, and can be cancelled with --no-allworkingmodules. +Threads are also included in --allworkingmodules now.

                + +

                Examples:

                + +

                +
                  +
                • +translate.py (reasonable default, corresponds to --opt=2) +
                • +
                • +translate.py --opt=3 (best, maybe 10-20% faster) +
                • +
                • +translate.py --opt=1 (translation is faster and less RAM-hungry) +
                • +
                +

                For more information, see: + +

                + +
                +

                Comments

                +
                +
                +
                + + holger krekel wrote on 2008-08-19 16:48: +
                +
                +

                maybe for a bit of background: Martijn Faassen regularly asked at our talks "how fast is PyPy now?" - at times when PyPy was going from 2000 to 500 to 50 to ???? times slower than CPython (nowadays at 1-6 times, btw). so with "--faassen" we were trying to translate an "as-fast-as-possible" pypy. so now we are getting dead serious (also Martijn actually asked for removing his name from the commandline) and introduced a we-are-becoming-a-real-compiler-with-opt-levels "-O" option :)

                Martijn, to be clear: i really appreciate having you and your questions in our talks and in general - it also pushed me to get out py.test releases ... :) holger

                +
                +
                +
                +
                + + Anonymous wrote on 2008-08-22 02:56: +
                +
                +

                Congrats to PyPy on having a more sensible option! (though an option called 'opt' made me think it stood for 'option' first :).

                Thanks Holger for the background. "actually not involved in PyPy" depends on your interpretation of what the word "involved" means. Besides performance related questions, as Holger indicates I've asked other questions of the PyPy project too. I wasn't altogether successful at it (nor altogether unsuccessful), and I'm on an extended break from asking any questions right now.

                I didn't write any of the PyPy code and also had nothing to do with the design of PyPy. I indeed asked for the --faassen option to be removed earlier this year. It was amusing (and flattering), but it also lead to some confusion concerning credit that I certainly don't deserve - that goes to the PyPy developers and project managers.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/08/pylibpytest-092-released-6233865913406513469.html b/posts/2008/08/pylibpytest-092-released-6233865913406513469.html new file mode 100644 index 000000000..ad2359c90 --- /dev/null +++ b/posts/2008/08/pylibpytest-092-released-6233865913406513469.html @@ -0,0 +1,340 @@ + + + + + +pylib/py.test 0.9.2 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                pylib/py.test 0.9.2 released

                + + + +
                +

                PyPy and its 14638 automated tests use the py.test tool which is also used by many other projects. PyPy developers have actually driven and contributed a lot to its development. + +I just released version 0.9.2 of the py lib mainly fixing Windows issues and providing better packaging and integration with setuptools. It's usable completely independently from PyPy - "easy_install py" gives you the py.test command line. Of course you can run py.test on top of a translated PyPy version as well. Here is a quick summary of what the py lib provides besides py.test: +

                +
                  +
                • +py.execnet: ad-hoc code distribution to SSH, Socket and local sub processes
                • +
                • +py.magic.greenlet: micro-threads on standard CPython ("stackless-light") and PyPy
                • +
                • +py.path: path abstractions over local and subversion files
                • +
                • +py.code: dynamic code compile and traceback printing support
                • +
                • tested against Linux, Win32, OSX, works on python 2.3-2.6
                • +
                +Good general entry points for installation and documentation: + +have fun, holger krekel +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-09-02 18:30: +
                +
                +

                We use py.test in the development of the Translate Toolkit and Pootle - many thanks :)

                I use Fedora so here are Fedora RPMs for pylib:
                https://translate.sourceforge.net/releases/testing/fedora/pylib-0.9.2-1.fc9.noarch.rpm

                +
                +
                +
                +
                + + holger krekel wrote on 2008-09-23 15:44: +
                +
                +

                Hi Dwayne!

                thanks a lot. I added a link to your RPM from the download page. Let me know if there was anything that hindered packaging.

                holger

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/08/pypy-runs-unmodified-django-10-beta-7105507436425430319.html b/posts/2008/08/pypy-runs-unmodified-django-10-beta-7105507436425430319.html new file mode 100644 index 000000000..b3c4b2ebf --- /dev/null +++ b/posts/2008/08/pypy-runs-unmodified-django-10-beta-7105507436425430319.html @@ -0,0 +1,294 @@ + + + + + +PyPy runs unmodified django 1.0 beta | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy runs unmodified django 1.0 beta

                + + + +
                +

                This is just a quick update post to the previous post - django folks committed all +outstanding tickets and we are able to run unmodified django +on top of pypy-c. Instructions how to do it are well explained +on the django wiki entry

                + +enjoy,
                +fijal

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/09/dsseldorf-pypy-sprint-5-13th-october-8919978872121664955.html b/posts/2008/09/dsseldorf-pypy-sprint-5-13th-october-8919978872121664955.html new file mode 100644 index 000000000..cbbd6009f --- /dev/null +++ b/posts/2008/09/dsseldorf-pypy-sprint-5-13th-october-8919978872121664955.html @@ -0,0 +1,295 @@ + + + + + +Düsseldorf PyPy sprint 5-13th October, 2008 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Düsseldorf PyPy sprint 5-13th October, 2008

                + + + +
                +

                The PyPy team is happy to announce the next sprint, which will take place in +the Computer Science Department of the University of Düsseldorf, Germany. +Sprinting will start on the 6th of October and go on till the 12th. Please +arrive on the day before if you want to come.

                +

                Topics of the sprint will be aiming at a 1.1 release and to work on integrating PyPy better +with small devices. Other topics are also welcome!

                +

                We will try to find a hotel with group rates, so if you are interested, please +sign up soon! See the announcement for more details.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/09/pycon-uk-javascript-and-gil-8387247619202094916.html b/posts/2008/09/pycon-uk-javascript-and-gil-8387247619202094916.html new file mode 100644 index 000000000..40ae89e04 --- /dev/null +++ b/posts/2008/09/pycon-uk-javascript-and-gil-8387247619202094916.html @@ -0,0 +1,385 @@ + + + + + +Pycon UK, Javascript and the GIL | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Pycon UK, Javascript and the GIL

                + + + +
                +

                Just got back from Pycon UK 2008 - here are some impressions.

                +Both the keynote speakers Mark Shuttleworth (Canonical) and +Ted Leung (Sun Microsystems) expressed their concerns about +Javascript becoming so fast and prominent that it could displace +Python in the future. They also highlighted the fact that +Multi-core systems get cheaper and more popular also on +desktop computers or notebooks. They challenged the community +to advance Python implementations to exploit it. Question was up +what PyPy can do here. As it stands, PyPy still uses the good old +Global Interpreter Lock (GIL) but our approaches should indeed +lend themselves well to do experimentation with free threading. +

                +During the 2-day conference we met many interesting people, most +notably the guys from Resolver, among them William Reade who is working on +IronClad -- which implements a fake python25.dll on top of +IronPython. He presented some good results for Numpy in his +lightning talk. This approach is surely something to follow +closely and potentially use for PyPy. +

                +We also had lunch and a couple of chats with Jacob Kaplan-Moss from +Django fame - he is apparently up to try use PyPy's sandboxing features +for one of his projects, cool! +

                +Conference itself was well organized for the 230 attending people - although +the venue might be a bit small for next year's EuroPython. Ah, and +we gave three well attended talks, find the slides here: +

                + +cheers,
                +Holger, Maciej, Anto (associated through merlinux, btw)
                +
                +

                Comments

                +
                +
                +
                + + Lucian wrote on 2008-09-17 19:36: +
                +
                +

                Is the work done on this (https://code.google.com/p/python-safethread/) useful conceptually?

                +
                +
                +
                +
                + + René Leonhardt wrote on 2008-09-17 21:27: +
                +
                +

                Is the new multiprocessing module going to offer improved multi-core performance?

                +
                +
                +
                +
                + + Colin Walters wrote on 2008-09-17 22:40: +
                +
                +

                Jython? About to hit 2.5, has a threading model for free from the JVM.

                +
                +
                +
                +
                + + Luis wrote on 2008-09-18 00:39: +
                +
                +

                I wonder how the new javascript improvements compare to pypy technically. For example, the tracing techniques of Mozilla's Tracemonkey look impressive, but I don't know if these techniques are conceptually close or not to pypy's. Is there anything you can learn from them (tracemonkey, chrome's v8, etc).

                Luis

                +
                +
                +
                +
                + + Miguel Filipe wrote on 2008-09-19 15:19: +
                +
                +

                ReneL:
                Yes, the new multiprocessing module will improve multi-core performance if you use it.
                That module allows an easy way to use multiple processes cooperatively in python. It tries to mimic the threading API.
                If you use multiple processes instead of threads you will avoid the Global Interpreter Lock.

                About new challenges for PyPy, multicore isn't the major problem.. the absence of a powerful JIT and GC is. Please keep working on a super-fast "VM+JIT" for python! (super linux performance is a must)

                +
                +
                +
                +
                + + holger krekel wrote on 2008-09-20 13:57: +
                +
                +

                colin: true, jython and also ironpython can make use of multiple threads. should have mentioned it. Doesn't mean that pypy-c shouldn't go for it, rather the opposite i'd think :)

                renel/multiprocessing module: i can imagine it helps with multi-core cpus. are there practical experiences using it yet?

                luis, miguel: there are considerable efforts on the PyPy/JIT front - particularly from Armin, Carl Friedrich and Antonio - would be worth a dedicated blog post to relate this to tracing JITs, V8, squirrelfish, etc. One thing i know is that we probably want to apply for funding to help completing the JIT.

                Miguel: We do have advanced GCs and are working on improving them, currently.

                to all: thanks for your feedback!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/09/pypypython-at-maemo-summit-6115106472056714072.html b/posts/2008/09/pypypython-at-maemo-summit-6115106472056714072.html new file mode 100644 index 000000000..2ca8038bc --- /dev/null +++ b/posts/2008/09/pypypython-at-maemo-summit-6115106472056714072.html @@ -0,0 +1,309 @@ + + + + + +PyPy/Python at the Maemo summit | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy/Python at the Maemo summit

                + + + +
                +

                Maciej and me visited the Maemo Summit in Berlin - +a community meetup around Nokia's Linux based +mobile platform. We spontaneously did a lightning +talk about a first running pypy-c on Maemo +and got nice feedback. + +

                +

                We also had a nice lunch with guys from the INDT in Brazil, including Marcio Marcedo and Marcelo Eduardo. It turns out that Python is used a lot on Maemo, for example the nice Canola UI is done with it. Will be interesting to see how this shapes up in relation to the iPhone and Android. + +

                +

                A lot of Nokia engineers were around and they announced that from October on they are going for weekly new releases of their SDK for the new Fremantle (Maemo-5) debian-based platform until the SDK becomes final - if we got this right. + +

                +

                Funnily enough, we met Marius Gedminas from the Programmers of Vilnius - he gave a lightning talk on his impressions as a community member. We think python programmers really should go much more to non-Python centric conferences. + +

                +

                The whole event took place at the C-Base - was a bit +crammed in some of the sessions with something like 200 people attending. +
                +cheers, Maciej and Holger

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/10/dsseldorf-sprint-report-days-1-3-5256639868851086032.html b/posts/2008/10/dsseldorf-sprint-report-days-1-3-5256639868851086032.html new file mode 100644 index 000000000..d7ef3af2a --- /dev/null +++ b/posts/2008/10/dsseldorf-sprint-report-days-1-3-5256639868851086032.html @@ -0,0 +1,380 @@ + + + + + +Düsseldorf Sprint Report Days 1-3 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Düsseldorf Sprint Report Days 1-3

                + + + +
                +

                The Düsseldorf sprint is currently in full progress and this post will try to +summarize what progress has been made in the last days. We are (again) sprinting +at the STUPS group of the Düsseldorf University. You can find the sprint +announcement and the daily planning file.

                +

                Holger and Samuele put quite some effort over several days into setting up and +improving PyPy's testing infrastructure. PyPy has a variety of tests. On the one +hand, there are of course our own tests. But then we also have the CPython tests +that should be run on top of pypy-c. Up to now we used a custom-made pile of +hacks, held together by lots of duct-tape. It consisted of a variety of +different machines running different things with different reporting solutions. +Some of the old test-results can still be found on wyvern. Now we are moving +to a buildbot based solution together with a custom reporter to have a view +similar to the old one. Some details are not quite finished yet, but most of the +things are already working rather well (currently all the results displayed +are from the 2.5-merge branch).

                +

                Another large (and ongoing) topic of work is the 2.5 branch. It contains the +work done by our Summer-of-Code student, Bruno Gola, of adding CPython 2.5 +features to PyPy's Python interpreter. While Bruno implemented most language +features and imported the 2.5 stdlib into PyPy, a lot of details were still +missing. In the last days nearly everybody worked on fixing small issues and +failing stdlib tests. While doing that we tried to categorize some CPython tests +as implementation-dependent so that we can skip them when running on PyPy.

                + +

                Memory Improvements

                +

                One goal of the sprint is to measure and to reduce the memory behaviour of our +Python interpreter. The idea is to make pypy-c a realistic option for use on +embedded devices. By memory behaviour we mean both the +dynamic memory usage (how many bytes does a dict or an instance take) as well as +the size of the executable and details of the GC strategy.

                +

                Alexander, Carl Friedrich and Antonio did some work on analyzing the static data +that a pypy-c executable contains. Our executables have the tendency to be +rather large, both due to a lot of code and due to a large amount of static +data. The analysis didn't give any really surprising results, the problem is +mostly that we have a lot of static data originating from a bit everywhere in +our program. Two big offenders are the unicodedata-module with about 750 KB +of static data and the multimethod-tables with about 150 KB of data.

                +

                Armin, Iko, Anto and Maciek worked on a new approach to malloc-removal. This is +(for PyPy) a crucial optimization of the translation toolchain that performs +escape analysis to find out which objects don't outlive the frame they were +allocated in. Since RPython is garbage-collected we usually have a lot of +allocations, so it is important to statically get rid of many of them. To +successfully do that, some inlining is needed to give the analysis more context. +This leads to the fact that we have rather aggressive inlining-settings to allow +as much malloc-removal as possible. The new approach tries to inline functions +only if this actually leads to the successful removal of a malloc operation. The +code is not finished quite yet, so it remains to be seen how successful it will +be.

                +

                Before the sprint Maciek had started to work on a mark-compact GC for PyPy. The +idea is that it is better for memory-constrained-environments because it does +not double the memory-requirements during collections. During the sprint Armin +and Maciek worked on cleaning up the code a bit and then merging the branch. +An interesting property of the mark-compact GC is that after a collection all +the memory that is not currently used by the program is returned to the +operating system. Right now the GC is not as fast as our more mature ones, but +it probably will be the basis for future tweaking.

                +

                A small thing that was done by Alexander and Carl Friedrich to make objects smaller is +to enable shared instance dictionaries also for instances of old-style +classes. Before it worked only for instances of new-style classes. Shared +instance dictionaries are a way to reduce the memory-usage of instances. In the +optimal case, it gives the same memory-savings that __slots__ are giving, +but without any behavioural changes. Conceptually it is very similar e.g. to +the notion of "map" in the Self project, or the hidden classes that Google Chrome's V8 +is using (click on the link, there are nice graphics). The +difference is that for them it is mostly a way to get faster attribute access, +and PyPy is so far only using it for memory savings (but that might change in +the future).

                +

                In parallel to all the other work, John Witulski worked tirelessly on advancing +the AMD64-JIT-backend. John has the implementation of this backend as the topic +of his Bachelor's thesis. He is progressing quite well (especially also +considering that this is his first sizeable Python project ever), just sometimes +being impaired by such annoyances as errors in the official Intel documentation. +By now the backend is supporting many integer operations and control flow.

                +
                +

                Comments

                +
                +
                +
                + + René Dudfield wrote on 2008-10-12 07:55: +
                +
                +

                Hello,

                sounds like some fun sprinting :)

                Have you considered mmap for some of those big memory users?

                Especially for unicode stuff, which mostly won't be used for many applications, it should be a good win -- both for load time, and memory use.

                Double plus extra combo win!!! for if you use multiple processes.

                cu,

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-21 18:05: +
                +
                +

                I'm not sure of what you mean.
                But modern operating systems (at least Linux) use on-demand loading of executables and libraries, so they never copy anything from an executable file to memory unless it is used.

                In fact, process startup is implemented internally by mmap()ing the executable and libraries into the address space of the new process.

                If you use multiple processes, it still works well - also data pages are shared, until some process writes to them.

                For MMU-less devices, the above does not apply (and ucLinux allows Linux to run on them).
                But in that case, I guess that no demand loading is available, and that mmap() copies the mapped data in memory - you need to explicitly swap in and out code segments (i.e. to use good old overlays), and no modern programming environment has direct support for them any more I guess.

                You can still emulate overlays with advanced usage of linker scripts however - you put some code in a section, create variables containing the begin and end offset of that section in the linker script, and copy data in memory from that section; but I think that making relocations to that code work flawlessly is impossible, you need to always refer to the buffer containing loaded data.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/10/prolog-jit-masters-thesis-finished-5462132148241449867.html b/posts/2008/10/prolog-jit-masters-thesis-finished-5462132148241449867.html new file mode 100644 index 000000000..4bfda4469 --- /dev/null +++ b/posts/2008/10/prolog-jit-masters-thesis-finished-5462132148241449867.html @@ -0,0 +1,301 @@ + + + + + +Prolog-JIT Master's-Thesis Finished | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Prolog-JIT Master's-Thesis Finished

                + + + +
                +

                As we already blogged, in the last half-year or so, Michael Leuschel, Armin +and me did a lot of JIT generator work on a Prolog prototype. The idea was to +experiment more quickly with some techniques than what would have been possible +with RPython. These experiments were quite successful in themselves. With very +little code we managed to get a JIT that is not doing too badly when compared to +existing projects for Prolog.

                +

                This Prolog work was also the subject of my Master's thesis. I finished the +thesis about two weeks ago (and since then have been mostly sleeping and then +sprinting). The thesis should be self-contained when it comes to explaining the +JIT concepts but needs knowledge of Prolog to be understandable.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/10/sprint-discussions-c-library-bindings-249141169883996521.html b/posts/2008/10/sprint-discussions-c-library-bindings-249141169883996521.html new file mode 100644 index 000000000..26b02c1eb --- /dev/null +++ b/posts/2008/10/sprint-discussions-c-library-bindings-249141169883996521.html @@ -0,0 +1,431 @@ + + + + + +Sprint Discussions: C++ Library Bindings | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Sprint Discussions: C++ Library Bindings

                + + + +
                +

                At the beginning of this year, PyPy grew ctypes support, thanks to generous +support by Google. This made it possible to interface with C libraries from +our Python interpreter, something that was possible but rather tedious before. +What we are lacking so far is a way to interface to large C++ libraries (like +GUI libraries). During the sprint we had a brainstorming session about possible +approaches for fixing this shortcoming.

                +

                For CPython there are a number of approaches in common use:

                +
                + +
                +

                Those all have the property that they produce some code that is then compiled +with a compiler to produce a CPython extension. The produced code also uses +functions from CPython's C-API. This model is not simple to use for PyPy in its +current state. Since PyPy generates C code automatically, a fixed C-level API +does not exist (it is not unlikely that at one point in the future we might have +to provide one, but not yet). At the moment, PyPy very much has a "Don't call +us, we call you"-approach.

                +

                A very different approach is followed by the Reflex package, which is +developed at CERN (which has an incredible amount of C++ libraries). It is not +mainly intended for writing Python bindings for C++ libraries but instead +provides reflection capabilities for C++. The idea is that for every C++ shared +library, an additional shared library is produced, which allows together with +Reflex to introspect properties of C++ classes, methods, etc. at runtime. These +facilities are then used for writing a small generic CPython extension module, +that allows CPython to use any C++ library for which this reflection information +was generated.

                +

                This approach is a bit similar to the ctypes module, apart from the fact +that ctypes does not use any reflection information, but the user has to +specify the data structures that occur in the C code herself. This makes it +sometimes rather burdensome to write cross-platform library bindings.

                +

                For PyPy the approach seems rather fitting: We would need to implement only the +generic extension module and could then use any number of C++ libraries. Of +course some more evaluation is needed (e.g. to find out whether there are any +restrictions for the C++ code that the library can use and how bothersome it is +to get this reflection information for a large library) but so far it seems +promising.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-10-14 17:39: +
                +
                +

                I've done a fair amount of complicated Boost.Python wrapping, and also implemented a small replacement for it with most of the complexity removed. There are two main reasons why Boost.Python is so complicated:

                1. It supports arbitrarily complex memory and sharing semantics on the C++ classes (and is runtime polymorphic on how the memory of wrapped objects is managed).

                2. It supports arbitrary overloading of C++ functions.

                If you remove those two generality requirements (by requiring that wrapped C++ objects are also PyObjects and banning overloading), it's possible to write very lightweight C++ bindings. Therefore, I think it's critical to factor the C/C++ API design so that as much of it as possible is writable in application level python on top of a small core that does the final C++ dispatch.

                For example, if you wrap a C++ vector class with a bunch of overloads of operator+ in Boost.Python, each call to __add__ has to do a runtime search through all the overloads asking whether each one matches the arguments passed. Each such check does a runtime search through a table of converters. It would be a terrible shame if that overhead isn't stripped by the JIT, which means it has to be in python.

                I think a good test library for thinking about these issues is numpy, since it has some memory management complexity as well as internal overloading.

                I could go on, but it'd probably be better to do that via email. :)

                Geoffrey

                +
                +
                +
                +
                + + René Dudfield wrote on 2008-10-14 21:05: +
                +
                +

                Please add a C API :)

                Once that is done, it's lots easier to interface with the outside world.

                For a lot of C++ apis I find it easy enough to write a C api on top of it.

                In fact many C++ apis provide a C API. Since that makes it easier to work with different C++ compilers. As you probably know different C++ compilers mangle things differently.

                It is possible to look at C++ code at runtime. You just need to be able to interpret the C++ symbols. I know someone did a prototype of this for vc6 on windows. He parsed the symbols, and then created the functions at run time with ctypes. However the approach is not portable between platforms, compilers, or even different versions of compilers. Of course this didn't allow you to use many of the C++ features, but only some.

                If you look at how swig works, you will see it kind of generates a C API for many C++ things.


                For libraries, it is custom to provide a C API. It just makes things easier.

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-15 01:11: +
                +
                +

                you might want to look at PyRoot [1,2,3] which is using the Reflex library to automatically wrap (and pythonize) the C++ libraries/types for which a Reflex dictionary has been (beforehand) generated.

                theoretically any piece of C++ can be wrapped as Reflex is using gccxml[4] to extract information from a library and to generate the dictionary library.

                Using it in one of CERN's LHC experiment which makes heavy (ab)use of templates (Boost) I can say that we almost had basically no problem.
                Usually the only problems we got were either at the gccxml level (resolution of typedef, default template arguments,...) or at the gccxml-to-reflex level (mainly naming conventions problems interfering with the autoloading of types at runtime)

                Being a client of gccxml is rather annoying as the development is... opaque.

                I know the Reflex guys were investigating at some point to migrate to an LLVM version (with GCC as a frontend) to replace gccxml.

                [1] https://wlav.web.cern.ch/wlav/pyroot/
                [2] https://root.cern.ch/viewcvs/trunk/bindings/pyroot/
                [3] https://www.scipy.org/PyRoot
                [4] https://www.gccxml.org/

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-15 11:16: +
                +
                +

                There's been some (small) discussion in the SWIG project of making an alternative output method which creates a simple C API for a C++ project, and wraps that with ctypes (generating the python side of the ctypes bindings, too). So far, this is purely theoretical, but all the pieces needed to do it are present in the SWIG source code. If reflex doesn't work out, this might be a reasonable alternative approach.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-10-15 13:41: +
                +
                +

                Wow. A lot of very informative posts. We'll definitely look to evaluate more what you all posted. Also, in case you want to discuss more, mailing list is usually better place for discussions. Feel free to send new ideas or more detailed info there.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2008-10-16 00:44: +
                +
                +

                fyi check out Elsa ( https://www.cubewano.org/oink ). It is much better than Reflex.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-10-16 10:57: +
                +
                +

                illume: Adding a C-API is rather hard, and probably not on our todo list, unless somebody pays for it :-).

                anonymous: From a quick glance I am not sure Elsa would really help. Yes, you can use it to parse C++ headers and get information about them. But as far as I see it, you cannot use it to create shared libraries that can be used to dynamically construct classes and dynamically call methods on them. Besides, the idea is to have a solution that works on both CPython and PyPy. Reflex already has a way to bind C++ libraries to CPython, so we only need to do the PyPy part.

                Anyway, if anybody is interested in more detailed discussions, we should all move to pypy-dev.

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-19 09:40: +
                +
                +

                I also have done a fair amount of Boost.Python wrapping. I even created a code generator for it - Py++( www.language-binding.net).

                The problem you want to solve is very complex. Exposing C++ code, as is, is not enough. You will have to create "bridge" between different concepts.

                For example C++ and Python iterators. In C++, in order to get the iterator value, you have to dereference it. In Python, you just have it( value ).

                This is just a single example, I have others.

                If you continue with the project - I would like to be involved.

                +
                +
                +
                +
                + + Unknown wrote on 2008-10-21 02:48: +
                +
                +

                Roman,

                I haven't looked at the code of Boost.Python since a long time, but the way "we" do the pythonization of the iteration over STL sequences is rather simple.

                when one writes:
                foos = std.vector('FooKlass')()
                # fill foos
                # iterate
                for foo in foos:
                print foo.value()

                what the PyROOT/Reflex layer is doing is looking at the dictionary for the std::vector(FooKlass), discovering that there is a pair of functions 'begin' and 'end' and it figures out one can create a python iterator from that pair.

                anyways, as Maciej pointed it out, we could try to move this discussion here[1] or there[2]...

                cheers,
                sebastien.

                [1] https://codespeak.net/pipermail/pypy-dev/2008q4/004847.html

                [2] https://codespeak.net/pipermail/pypy-dev/2008q4/004843.html

                +
                +
                +
                +
                + + Anonymous wrote on 2008-10-30 12:27: +
                +
                +

                Just to let you know, there is an upcoming paper in PythonPapers.org review on the topic of C++ wrapping in Python. Just watch out!

                +
                +
                +
                +
                + + ilkosta wrote on 2008-12-09 14:08: +
                +
                +

                maybe it's also worth evaluating the library xrtti, a Reflex comparable library but without CERN and ROOT dependencies.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/10/sprint-discussions-jit-generator-3301578822967655604.html b/posts/2008/10/sprint-discussions-jit-generator-3301578822967655604.html new file mode 100644 index 000000000..e310381a4 --- /dev/null +++ b/posts/2008/10/sprint-discussions-jit-generator-3301578822967655604.html @@ -0,0 +1,423 @@ + + + + + +Sprint Discussions: JIT Generator Planning | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Sprint Discussions: JIT Generator Planning

                + + + +
                +

                Background

                +

                Finally, the JIT post :-). First some background: Despite our plans at the end +of the EU period, PyPy's Python interpreter didn't get a good and widely +applicable JIT in the last year. The reason for that was that we discovered that +although the basic idea to generate JIT compilers is good, the concrete +prototype made during the EU period is basically flawed. It could have +been pushed a bit farther, but would have run into deep troubles eventually. One +of the problems would have been performance instability: change a seemingly +unrelated bit in your source program, and the performance changes in unexpected +ways, which is clearly not desirable. Another problem with that old approach is +that too much assembler code is generated, leading to memory problems, and also +that the generated assembler is bad in various ways, e.g. it is hard in that +approach to do proper register allocation.

                +

                Therefore we decided that it would be worthless to pursue this direction much +further. Instead we tried to research approaches to fixing the inherent +problems. This research was largely done in Prolog and I eventually wrote my +Master's thesis about it. From the Prolog work we got some good insights into +what needs to be done and what sorts of techniques are needed. Also, it inspired +Armin to do some more exploration on a small Python prototype which used the +lessons learned from Prolog and also some additional ideas from tracing JITs. So +far, however, the prototype is neither in RPython, nor much integrated with +PyPy.

                +

                This research is not the only thing happening in the JIT-area. During the last +year, Antonio Cuni was working on bringing the JIT to pypy-cli. This +consisted mostly of writing a .NET backend for the old JIT-generator. Some +further work is being done since August by John Witulski, who is writing an +AMD64 backend for the JIT-generator for his Bachelor's thesis.

                + +

                Where to go from there

                +

                During the sprint we discussed in which directions we should continue now. We +plan to work quite a bit on the JIT in the coming months. Both Armin and Anto +are in Düsseldorf for four months, and them and me plan to mostly work on the +JIT (as well as giving a lecture on "Dynamic Programming Languages", trying to +ensnare some more students).

                +

                The first step will be to experiment a bit more with Armin's prototype. So far +it looks rather promising, but there are some unsolved issues that we need to +look into first. The first issue is to think a bit about how to efficiently do +profiling to compile only important code paths. The other large issue is the +so-called "virtualizables". Roughly speaking, they are the frame objects of the +interpreter from which the JIT is generated. They need special treatment, +because on the one hand it is important that they get optimized away to make the +code fast, since the frames are accessed all the time for the local variables; +on the other hand they should still be usable for introspection if code is +around that is trying to look into them.

                +

                When this is done, the prototype needs to be ported to RPython, which is a +non-trivial task, since it is rather dynamic so far (it is rather important that +the unresolved issues are done before the porting, because once the prototype is +in RPython, experimentation will be harder). The porting has the potential to be +tedious, but in a sense it is "just work", as opposed to unclear research.

                +

                At this point it will become important to think about the backend interface. The +interface that the old frontend used to produce assembler code won't be usable +for the new approach, so things need to be rearranged slightly. Afterwards the +backends will have more information and be invoked at a slightly higher level, +which should allow them to produce better code.

                +

                When all this is done, the JIT generator will be in a rather good state and it +should become possible (modulo a lot of details, of course), to use it on the +Python interpreter.

                +

                Conclusion

                +

                I am intentionally not attaching any time estimates to this blog post. So far +our time estimates have not been very accurate when it comes to the JIT, which +only led to disappointment when the JIT failed to materialize. We hope that we +will progress in interesting ways in the next four months, but who knows. Note +that we are really quite disappointed ourselves that it took so much longer than +we planned and hoped. The reason for this is mostly that this work really is +research and sometimes it is just hard to predict what sort of problems turn up. +Partial evaluation (the basis for our JIT generator) is a 30-year-old technique +that was always just promising and never really successful, so the fact that we +think we can solve its problems in a few years is very much hubris anyway :-). +On the positive side, we think that we now know these problems much better than +ever before and that we have a plan that has a chance to succeed.

                +

                Also we are still convinced that our approach has huge potential, despite the +difficulties. If we manage to pull it off, it should be significantly simpler to +support new language features in the JIT and also to get speedups on some rather +interesting bits of the language. Some ideas we are having include generating a +JIT for the regex engine or speeding up ctypes-bindings to be nearly as fast as an +extension module (or faster?). Also the JIT will be such that by construction +the JIT-generated code behaves identically to the original code, which isn't +always true for Psyco, for example.

                +
                +

                Comments

                +
                +
                +
                + + Luis wrote on 2008-10-14 19:20: +
                +
                +

                Thank you very much for this post Carl! I have a couple of questions:
                You said you were experimenting with some ideas from tracing JITs. I wonder how much the new javascript VMs are influencing your work in pypy. Were these techniques considered from the beginning or is this just because of the latest success of tracemonkey? And if so, are there ideas from Chrome's v8 or Squirrelfish that could be applied too?
                Do they change in any way your expectations regarding the potential of pypy concerning speed?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-10-15 05:42: +
                +
                +

                Hi,

                I quess you will now the work done about hybrid frames in VisualWorks works, but since you mentioned the problem, anyone else could benefit from the following link:

                https://pages.cs.wisc.edu/~cymen/misc/interests/oopsla99-contexts.pdf

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-10-15 13:21: +
                +
                +

                @luis: we definitely read same papers (by Michael Franz and others) as tracemonkey authors. Work on tracing jit for pypy is older than release of tracemonkey (it's even older than first work on tracemonkey). Regarding chrome's v8 it seems the main optimization is implementation of hidden classes, which we kind of get for free combining jit and our existing optimization called shared dict.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-10-27 05:59: +
                +
                +

                From the sound of it, it sounds like it would be useful to have a 64-bit version of Psyco, otherwise there is no stopgap in the meantime...

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-01 13:50: +
                +
                +

                When will PyPy match Psyco's speed?

                +
                +
                +
                +
                + + Anonymous wrote on 2009-01-18 20:40: +
                +
                +

                Yes, especially on Linux, everything is moving to 64-bit. If there's no 64-bit Psyco, you can't get the benefits of 64-bit Python (memory) and use Psyco at the same time.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/10/sprint-discussions-release-planning-7097053444808236145.html b/posts/2008/10/sprint-discussions-release-planning-7097053444808236145.html new file mode 100644 index 000000000..9e4269cdd --- /dev/null +++ b/posts/2008/10/sprint-discussions-release-planning-7097053444808236145.html @@ -0,0 +1,337 @@ + + + + + +Sprint Discussions: Release Planning | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Sprint Discussions: Release Planning

                + + + +
                + + + +

                One of the discussions that happened during the sprint was about how to approach +the next PyPy release. There hasn't been a release since the end of the EU +period, which is not an optimal situation. Therefore we plan to make a 1.1 +release at the beginning of next year, ideally before Pycon US. We'd also like +to move towards time-based releases. This will be greatly helped by the +new buildbot infrastructure, which allows us to decide when the +state of the codebase is stable enough to release.

                +

                Another goal of the release is to involve more people from the wider PyPy +community by having bugdays and generally asking for more support. This will be +particularly useful for bugs on platforms that no one of the core developers +group is using.

                +

                Feature-wise the release will mostly contain CPython 2.5 language support, +including some new extension modules (like ctypes, expat, sqlite). +In addition we plan to make it easier to actually install and use the PyPy +Python interpreter, which means some sort of proper installation procedure and +supporting distutils on top of PyPy. Another part of the release will be +support for fully sand-boxing an interpreter.

                +

                Additionally there were also a large number of improvements on several levels +since the last release, like optimizations, faster oldstyle-classes, better +GCs, correct finalization behaviour, lots and lots of bugfixes, better +threading support (still with the GIL), some work on improving memory +behaviour, ...

                +

                In contrast to our last release, we will focus mainly on PyPy's Python +Interpreter and more particularly its C-version. There are also various +experimental interpreters that PyPy contains, like for Prolog, Smalltalk, +JavaScript and Scheme. We also don't intend to put the LLVM and JavaScript +backends in the release, since they are essentially unmaintained and at least +partially broken. If anybody is particularly interested in one of these +components, please feel free to step up and take responsibility for them. +Another thing that the release won't contain is a JIT. I plan to make another +blog-post about this soon, stay tuned.

                +
                +

                Comments

                +
                +
                +
                + + Michael Foord wrote on 2008-10-12 17:58: +
                +
                +

                Looking forward to news on the JIT.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/11/one-year-pypy-blog-3267056180369310162.html b/posts/2008/11/one-year-pypy-blog-3267056180369310162.html new file mode 100644 index 000000000..c64425db2 --- /dev/null +++ b/posts/2008/11/one-year-pypy-blog-3267056180369310162.html @@ -0,0 +1,476 @@ + + + + + +One year PyPy Blog | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                One year PyPy Blog

                + + + +
                +

                Last Friday the PyPy Status Blog had its first anniversary. Yay! After not +really buying into any of this new-fangled "blog" stuff for a long time we just +bit the bullet and got started. Totally surprisingly it even worked. We posted +76 posts in the last year, more than one per week. By now we have more than 800 +subscribers (according to feedburner), which is quite cool for a rather niche +blog.

                +

                To make our blog even more interesting, I would like to ask for some feedback +via the comments:

                +
                +
                  +
                • Which posts did you like in particular?
                • +
                • What sort of posts would you be interested in getting more of?
                • +
                • Any other improvements we could make?
                • +
                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-11-02 18:28: +
                +
                +

                For me the most interesting posts is about status of PyPy project. It will be great if you could post more frequently.

                +
                +
                +
                +
                + + Unknown wrote on 2008-11-02 20:40: +
                +
                +

                +1

                +
                +
                +
                +
                + + Michael Foord wrote on 2008-11-02 21:09: +
                +
                +

                It's been great to read about PyPy progress, congratulations for surviving a year and many thanks.

                I also like to hear the status updates - and wouldn't mind a bit more technical detail.

                In fact some deep dives into individual aspects of PyPy would be *great*, even if they're more effort to write...

                +
                +
                +
                +
                + + Eduardo O. Padoan wrote on 2008-11-02 21:45: +
                +
                +

                Greetings!
                What about quick Weekly Status Updates, with a summary of svn activity and stuff?

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-03 02:41: +
                +
                +

                It's not just a first for you, it's a first for me. This is the first blog I have ever subscribed to. You can attribute that to the fact that this subject is geniunly interesting.

                The blog format has many benefits. For one, it amortizes the effort required to understand the project. This allows me to take my time, wiki whatever I need to, and savor the details. It takes time for me to learn the concepts but in due time, I can see myself eventually contributing to the project. The other benefit is I can see all the hot topics revolved around the various pypy projects. The whole partial evaulation, for example, was something new I learned about.

                I would agree that increasing the rate of posts would be nice. While I can't say for others, in my personal experience, it seems that logged projects tend to finish faster than unlogged projects.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-03 02:42: +
                +
                +

                It's not just a first for you, it's a first for me. This is the first blog I have ever subscribed to. You can attribute that to the fact that this subject is geniunly interesting.

                The blog format has many benefits. For one, it amortizes the effort required to understand the project. This allows me to take my time, wiki whatever I need to, and savor the details. It takes time for me to learn the concepts but in due time, I can see myself eventually contributing to the project. The other benefit is I can see all the hot topics revolved around the various pypy projects. The whole partial evaulation, for example, was something new I learned about.

                I would agree that increasing the rate of posts would be nice. While I can't say for others, in my personal experience, it seems that logged projects tend to finish faster than unlogged projects.

                +
                +
                +
                +
                + + Bill Mill wrote on 2008-11-03 03:09: +
                +
                +

                > Which posts did you like in particular?

                I just scanned a bunch of entries, and "List comprehension implementation details" jumped out at me as a really nice one. I like that it points out some of the deep details of python that are easy for me to not think about because I'm not implementing it.

                > What sort of posts would you be interested in getting more of?

                More technical details posts, I really like the one about the JIT and Prolog too.

                I post your articles to reddit too, and I think "we can now run big software X" and efficency milestones are successfuly at attracting a lot of attention (if that's what you want!)

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2008-11-03 03:11: +
                +
                +

                Thanks so much for doing this! It makes me very jealous over here in CPython land.

                I like to hear about specific new projects and ideas you guys are working on.

                +
                +
                +
                +
                + + nshepperd wrote on 2008-11-03 05:39: +
                +
                +

                For me the most interesting things were the technical details posts, like Bill Mill said. But I get excited any time there is a new blog post. :)

                +
                +
                +
                +
                + + Unknown wrote on 2008-11-03 06:35: +
                +
                +

                Being in the scientific computation area at the moment, I'm very eager to hear about progress in the JIT framework, esp. for 64 bit Linux.

                Yet most other posts are also interesting.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-03 11:01: +
                +
                +

                > Which posts did you like in particular?

                Anything about the JIT and its progress.

                Good luck!

                +
                +
                +
                +
                + + Unknown wrote on 2008-11-03 12:57: +
                +
                +

                Hi,

                I think the blog is pretty good. Weekly summaries would make it rock, though.

                And I am also especially interested in hearing about progress on JIT work. And about any use of LLVM.

                Best
                Anders

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2008-11-03 14:46: +
                +
                +

                Thanks for all the friendly comments!

                So more technical posts it will be :-). Those are mostly even fun to write, it's just usually quite a bit of work. I actually have a vague plan to give a basic introduction of the ideas behind the JIT (but will still take some time, I am busy with the lecture at the moment).

                About more summaries of what happens: it requires a lot of discipline (see the Python-dev summaries) and I am not sure we have that :-). It would need somebody dedicated to care for it, and that one won't be me at the moment.

                +
                +
                +
                +
                + + Luis wrote on 2008-11-03 15:03: +
                +
                +

                Personaly, I get very anxious when you say "it will be ready when it's ready". Aaarghhh! Please, at least lie a little bit :-).
                For example: "Pypy is now 1.8x slower than cpython, but after [feature here] it will be 10x faster".
                Well, just kidding. Congratulations for all the great work and keep it up!

                +
                +
                +
                +
                + + Damian Cugley wrote on 2008-11-03 16:03: +
                +
                +

                I am not especially in favour of weekly summaries, unless there is some interesting progress to report. Otherwise you end up with someone filling in progress reports because they feel obliged to, rather than to celebrate new features, and it becomes a chore.

                That said, PyPy has many subprojects; maybe having a round-robin system where we get a progress report from a different project every week would be interesting.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-03 23:05: +
                +
                +

                I'm a regular Python user that wishes often for something a little faster with the same flexibility. So generally, I read this because I think you guys are on a good track for JIT optimisation and other fun things.

                I guess I'm looking forward to the eventual series of posts that talks about how you can start using this on your system to replace your system Python, followed by you talking the regular Python core developers into working directly in PyPy instead. =)

                +
                +
                +
                +
                + + Paul D. Eden wrote on 2008-11-03 23:36: +
                +
                +

                For me the best parts are the tutorials and howtos relating to rpython, translating to c, etc.

                +
                +
                +
                +
                + + Konrad wrote on 2008-11-07 11:54: +
                +
                +

                I'm interested in status updates and longer descriptions on how elements of PyPy work. Sprint summaries are fine as long as they carry one of the above (they usually do, though :>)

                +
                +
                +
                +
                + + John Mudd wrote on 2008-11-10 13:27: +
                +
                +

                I'm interested in anything to do with multi-thread support, GIL elimination, general status, progress and future plans.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/11/porting-jit-to-cli-part-1-8712941279840156635.html b/posts/2008/11/porting-jit-to-cli-part-1-8712941279840156635.html new file mode 100644 index 000000000..02514e3aa --- /dev/null +++ b/posts/2008/11/porting-jit-to-cli-part-1-8712941279840156635.html @@ -0,0 +1,519 @@ + + + + + +Porting the JIT to CLI (part 1) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Porting the JIT to CLI (part 1)

                + + + +
                +

                As the readers of this blog already know, I have been working on the CLI +JIT backend for some months: last Friday, it reached an important milestone, +as it is now able to produce huge speedups for a little dynamic language. To +know how huge the speedup is, read on :-).

                +

                The goal of the PyPy JIT generator is to take an interpreter and, with the help of +a few annotations, automatically generate a JIT compiler for it. In this post, +we will talk about the tlc virtual machine: while tlc is just a toy +language, it contains some features that make it an interesting target for our +JIT generator.

                +
                +

                The tlc virtual machine

                +

                tlc is executed by a stack-based, dynamically typed virtual machine (for +those who know a bit about the Python VM: does it sound familiar? :-)).

                +

                There are three types of objects: integers, nil, and cons cells (i.e. +lisp-like pairs of objects).

                +

                As the VM is very simple, it provides only a few opcodes:

                +
                +
                  +
                • opcodes to manipulate the stack, like PUSH, POP, etc.
                • +
                • integer operations, like ADD, MUL, all the comparisons, etc.: +these operations can only be applied to integers;
                • +
                • list operations, like CONS, CAR, CDR: these operations can +only be applied to lists;
                • +
                • other operations, including jumps and conditional jumps.
                • +
                +
                +

                The VM is interesting for our purposes because it has a lot of similarities +with Python (though on a smaller scale, of course):

                +
                +
                  +
                1. it has to do type-checks at runtime before doing most of the operations;
                2. +
                3. every time you do an arithmetic operation, it has to unbox the operands, +do the computation, and then box the result again.
                4. +
                +
                +

                This means that even if you have a program which only uses integers, you are +paying a lot of overhead.

                +

                To know more about this toy VM, look at its source code: the interesting +bits are the classes used to represent objects, and the interp_eval +function, which contains the main loop of the virtual machine. As you can +see, the implementation is quite straightforward; all the hint calls you +see are the special annotations needed by the JIT generator to produce better +code.

                +
                +
                +

                Let's JIT it!

                +

                So, the whole point is to generate a JIT compiler from it, isn't it?

                +

                First, checkout a fresh copy of the oo-jit branch:

                +
                +$ svn co https://codespeak.net/svn/pypy/branch/oo-jit
                +
                +

                Then, go to the oo-jit/pypy/jit/tl directory, and compile the tlc VM +with the CLI backend and JIT enabled:

                +
                +$ cd oo-jit/pypy/jit/tl/
                +$ ../../translator/goal/translate.py -b cli --jit --batch targettlc
                +...
                +lot of texts
                +...
                +
                +

                If everything went OK, you now have a targettlc-cli executable, which +accepts two arguments: the name of the file containing the tlc program we +want to run, and an integer to be passed to it.

                +

                Luckily, in the same directory we have a factorial.tlc file that contains +the bytecode for a function that -- guess? -- computes the factorial of a +given integer; let's try it:

                +
                +$ ./targettlc-cli factorial.tlc 5
                +Non jitted:    120 (0.009371 seconds)
                +Warmup jitted: 120 (0.208954 seconds)
                +Warmed jitted: 120 (0.000323999999999991 seconds)
                +
                +

                Cool, it seems that the result was computed correctly :-). As you can see from +the output, we ran the program three times:

                +
                +
                  +
                1. by plain interpretation, without any jitting;
                2. +
                3. with the jit enabled: this run includes the time spent by doing the +compilation itself, plus the time spent by running the produced code;
                4. +
                5. again with the jit enabled, but this time the compilation has already +been done, so we are actually measuring how good the code we produced is.
                6. +
                +
                +

                So, it's time to run a benchmark: let's try to compute the factorial of a very +big number; the result will be 0, because obviously after a while we overflow, +but after all we are interested in the time spent, not in the result:

                +
                +$ ./targettlc-cli factorial.tlc 5000000
                +Non jitted:    0 (19.93247 seconds)
                +Warmup jitted: 0 (0.293229999999998 seconds)
                +Warmed jitted: 0 (0.0494239999999984 seconds)
                +
                +$ python -c 'print 19.93247/0.0494239999999984'
                +403.295362577
                +
                +

                And no, I didn't make any mistake in copying&pasting: the jitted version is +really 400 times faster than the non-jitted one!

                +

                Warning: my laptop seems to be not very well suited for benchmarks, as the +results vary a lot from run to run; I've run the benchmarks a lot of times, +and I got speedup factors up to 500 times, so your results may be different.

                +
                +
                +

                More benchmarks

                +

                It's also interesting to compare the result with a manually written C# +version of the factorial, to see how good the code we produced is; to get +reasonable results, we need to compute a larger factorial, to let the code +run a bit longer:

                +
                +$ ./targettlc-cli --onlyjit factorial.tlc 100000000
                +Warmup jitted: 0 (0.980856 seconds)
                +Warmed jitted: 0 (0.769716 seconds)
                +
                +$ mono factorial.exe 100000000
                +C#:            0 (0.153777 seconds)
                +
                +$ python -c 'print 0.769716/0.153777'
                +5.00540392907
                +
                +

                We know that the generated code is far from being optimal, but probably the +factor of five is at least partially due to the fact that Mono's own JIT is optimized for +C#-like code, and our code has a completely different shape.

                +

                All the benchmarks above were run under Linux, with Mono 1.9.1. Here are the +results for the same benchmarks, but run with Microsoft CLR (on a different +machine, so the absolute values are not comparable):

                +
                +$ ./targettlc-cli factorial.tlc 5000000
                +Non jitted:    0 (15,640625 seconds)
                +Warmup jitted: 0 (0,4375 seconds)
                +Warmed jitted: 0 (0,03125 seconds)
                +
                +$ python -c 'print 15.640625/0.03125'
                +500.5
                +
                +$ ./targettlc-cli --onlyjit factorial.tlc 100000000
                +Warmup jitted: 0 (0,90625 seconds)
                +Warmed jitted: 0 (0,515625 seconds)
                +
                +$ ./factorial.exe 100000000
                +C#:            0 (0,34375 seconds)
                +
                +$ python -c 'print 0.515625/0.34375'
                +1.5
                +
                +

                The results are even better than before; this is probably thanks to CLR's JIT, +which does a better job than Mono when faced with something that is different +from the usual C#-like code.

                +
                +
                +

                Conclusions (for now)

                +

                This is a very important result, because it proves that PyPy's approach to JIT +compilers can be applied effectively also to OO virtual machines; the result +is even better than what I expected, because when generating code for .NET we +have much less freedom than when generating assembly code, and I had to play +some tricks to work around some .NET limitations.

                +

                Moreover, it worked on the first try :-). I tried to compile the tlc +virtual machine as soon as all the related JIT tests were passing, and +surprisingly everything worked just fine, even if it was the very first time I +was trying to apply some features of the JIT to something bigger than a test: +I think this is yet another proof that Test Driven Development just works!

                +

                Even if this is a major milestone, the CLI JIT backend is not yet completed: +as a consequence it still can't be used for the full PyPy, but all the +hardest problems should have been solved now.

                +

                Since a lot of readers asked for more technical details, especially about the +JIT, I will try to soon write a second blog post explaining how the CLI backend works +internally, with a brief look at the generated code to see what it looks like.

                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-11-04 01:50: +
                +
                +

                If you are benchmarking on Linux then watch out for CPU speed scaling. For example on Ubuntu by default the ondemand governor is used which runs the CPU at lowest possible speed and until there is any CPU demand at which point it runs at fastest. The time to switch varies (eg sometimes it can be instantaneous, other times a second or two, and other times not at all).

                Make sure to use: cpufreq-set -g performance

                That will run at maximum CPU speed the whole time.

                +
                +
                +
                +
                + + Lucian wrote on 2008-11-04 02:27: +
                +
                +

                Woohoo! Can't wait for more (and the jvm counterpart).

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-04 08:46: +
                +
                +

                I agree that this result is very important. For me and many others too i guess, a very good JIT is the most important missing part in python and other dynamic languages. Speed *is* important,

                For integers, psycho also showed huge performance gains. But i think the real proof of pypy's approach would be to show similar results for floating point operations also...

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-04 11:09: +
                +
                +

                awesome!

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-04 11:47: +
                +
                +

                Keep it on!

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-11-04 16:50: +
                +
                +

                hi, thanks for your valuable comments.

                Some notes:

                - I know about cpufreq-set, but even setting the governor to performance doesn't help, the timings vary a lot between different runs. If someone knows a way to run reliable benchmarks, it would be very appreciated!

                - I have plans to experiment also the JIT on the JVM: since HotSpot usually does a better job than CLR's JIT, it's possible/likely that the JVM is a better platform for our purposes. Also, the experimental Da Vinci Machine contains features that could be very useful for us. Unfortunately the PyPy non-JIT JVM backend is not as advanced as the CLI one, and it lacks some features that are really needed for writing a JIT backend.

                - Float operations are already (mostly) supported by our JIT backends; I bet that if you add a FloatObj to the tlc interpreter, you will see huge speedups as well. However, the real point of PyPy's approach is that once finished it will optimize much more than ints and floats, including features that are currently not implemented by psyco (e.g. generators).

                +
                +
                +
                +
                + + Ορέστης wrote on 2008-11-04 21:21: +
                +
                +

                Brilliant post! Keep us updated!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/11/porting-jit-to-cli-part-2-2456826431882963884.html b/posts/2008/11/porting-jit-to-cli-part-2-2456826431882963884.html new file mode 100644 index 000000000..5dc6d585d --- /dev/null +++ b/posts/2008/11/porting-jit-to-cli-part-2-2456826431882963884.html @@ -0,0 +1,486 @@ + + + + + +Porting the JIT to CLI (part 2) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Porting the JIT to CLI (part 2)

                + + + +
                +

                In my previous post, we saw that PyPy JIT generator can produce huge +speedups when applied to the tlc toy language.

                +

                In this post we will dive a bit into the internals of the PyPy JIT, to see how it +manages to do so. Note that this is a very high-level overview of how the +JIT works, and applies to all backends. Then, in the third post of this +series, we will look closer at the CLI JIT backend, seeing how it works around +some .NET limitations and what the generated code looks like.

                + +

                PyPy JIT for dummies

                +

                As you surely know, the key idea of PyPy is that we are too lazy to write a +JIT of our own: so, instead of passing nights writing a JIT, we pass years +coding a JIT generator that writes the JIT for us :-).

                +

                I'm not going to explain how the JIT generator does its job (perhaps this +will be the subject of another blog post), but rather how the generated JIT +works.

                +

                There are values that, if known at compile-time (i.e., when the JIT compiler +runs), let the JIT produce very efficient code. In a dynamic language, +types are the primary example: for instance, suppose you are a compiler and +you have to compile the following Python function:

                +
                +def mysum(a):
                +  return a + 1
                +
                +

                At compile time, you don't have any knowledge about the type of the parameter: +it could be an integer, a float, a user-defined object, etc. In this situation, +the only safe choice is to emit code which does the usual, slow, full lookup +to know how to perform the operations.

                +

                On the other hand, suppose that you knew in advance that the parameter is an +integer: this time, you could emit code that exploits this extra +knowledge, by performing directly a fast integer addition.

                +

                The idea behind PyPy JIT is that if you don't have enough knowledge to +generate efficient code, you stop compiling and wait until you know +exactly what you need. Concretely, you emit code that runs until the point +where you stopped the compilation, then it triggers a special procedure that +restarts the compiler. This time the JIT compiler knows everything +you need, because you can inspect the state of the running program.

                +

                Let's see an example: the first time the JIT compiles mysum, it produces +something like this pseudo-code:

                +
                +PyObject mysum_compiled(PyObject a)
                +{
                +  Type a_type = a.GetType();
                +  switch(a_type) {
                +      default: continue_compilation(a_type, <position>);
                +  }
                +}
                +
                +

                If you call mysum(41), the execution goes in the default branch of the +switch, thus calling continue_compilation: its job is to restart the JIT +compiler, which now can emit fast code because it knows the exact type of +a; then, it modifies the original mysum_compiled function, in +order to make it execute the newly generated code the next time it +encounters an integer at that point:

                +
                +PyObject mysum_compiled(PyObject a)
                +{
                +  Type a_type = a.GetType();
                +  switch(a_type) {
                +      PyInteger: return new PyInteger(a.value+1); // fast path!
                +      default: continue_compilation(a_type, <position>);
                +  }
                +}
                +
                +

                From now on, every time we call mysum with an integer argument, the JIT +compiler is not called anymore and the fast path is directly executed; if we +happen to call mysum with a float argument, the switch goes again in the +default branch, and the JIT compiler is started once more to produce fast +code also for this case. What happens in practice is that compile-time and +runtime are continuously intermixed, until the switches are stable enough and +the compiler is not needed anymore.

                +

                In PyPy jargon, this kind of "growable switch" is called flexswitch, and +it's one of the most important concepts of our JIT generator.

                + +

                Promotion

                +

                How can the JIT generator know which values are useful to know to generate +efficient code and which aren't? Unfortunately it can't, or at least our JIT +generator is not smart enough at the moment.

                +

                To get the best from it, the developers of the VM need to instruct it by +annotating the variables on which we want the JIT to stop until it knows the +actual values; this is done by using particular hints, called promote +and promote_class; variables annotated with such hints are said to be +promoted. If something is promoted, a flexswitch is used to gain +information about it, as seen in the last section.

                +

                For an example, let's look at an excerpt from the main dispatch loop of the tlc +virtual machine:

                +
                +elif opcode == ADD:
                +  a, b = stack.pop(), stack.pop()
                +  hint(a, promote_class=True)
                +  hint(b, promote_class=True)
                +  stack.append(b.add(a))
                +
                +

                This is the implementation of the ADD opcode: first, it pops two values from +the stack; then, it computes the result; finally, it pushes the result to the +stack again. In between, both the classes of a and b have been +promoted: this means that when the JIT emits the code for b.add(a), it +knows exactly what is happening: if it sees that both are instances of the +IntObj class, it inlines the method call and emits a fast integer addition +instead.

                + +

                Virtuals

                +

                The other important concept of the JIT is the presence of virtual +structures, virtual lists, and virtual dictionaries. Again, I'm not +going to explain in depth how they work, but only why they are so important for +generating highly efficient code.

                +

                The essence of virtuals is that you don't allocate objects until you really +need to do it, e.g. because they are being passed as an argument to some +external function. Instead, we store all the information we need as local +variables; e.g., in the case of a virtual structure, we create as many local +variables as the number of its fields: if the structure escapes the local +scope, we force it to a real object, by allocating memory on the heap and +initializing it from the current values of the local variables.

                +

                This technique allows the JIT to avoid the allocation of many temporary +objects that hold intermediate results; consider for example the following +Python loop:

                +
                +result = 0
                +for i in range(N):
                +  result += i
                +return result
                +
                +

                Without the JIT, at each iteration, a new int object is created and bound +to the result variable, while the previous one is discarded and not needed +anymore. By combining virtuals and promotion, the JIT can emit code that does +the whole computation locally, and allocates a real object only at the end, +when it escapes from the local scope because it is returned from the +function.

                + +

                Putting it all together

                +

                This is, essentially, how PyPy's generated JITs work. To summarize, our JITs +emit multiple versions of each chunk of code: each version is specialized +and optimized for one particular case.

                +

                The cost of selecting the right specialization to use (through flexswitches) +is almost always negligible compared to how much time you save by running the +fast version instead of the more-general-but-slow one. Moreover, each +specialized version knows the exact shape of the objects it's dealing with, so +they can be virtualized to make the generated code even more efficient.

                +

                At the end, the actual code generation is done by one of the JIT backends: +the backends exploit all the knowledge gathered by the previous steps to +produce highly efficient code, but this will be the subject of the next blog +post.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2008-11-07 12:46: +
                +
                +

                Wow... I love this approach. Keep up the great work and interesting posts!

                +
                +
                +
                +
                + + Luis wrote on 2008-11-07 18:12: +
                +
                +

                This is a very clear and didactic explanation. Thanks!

                +
                +
                +
                +
                + + Anonymous wrote on 2008-11-07 20:14: +
                +
                +

                can't wait for the next one

                +
                +
                +
                +
                + + kbob wrote on 2008-11-08 01:35: +
                +
                +

                What does the flexswitch compile into? I'm guessing it would look like

                t = type(obj);
                if (t == int)
                ...
                else if (t == float)
                ...
                else
                ....

                but maybe there's a better way (or maybe the answer is backend-dependent).

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-11-08 08:53: +
                +
                +

                @bob
                your guess is right, how to implement the flexswitch is backend-dependent. This is the hardest part for .NET, as the flexswitch needs to grow dynamically (i.e., you have to add more case after the .NET method has already been compiled). It will be subject of the next blog post.

                +
                +
                +
                +
                + + Unknown wrote on 2009-01-05 22:46: +
                +
                +

                It seems that an implication of the JIT way is that, by adopting a consistent habit of implementing type driven Generic Functions, the JIT could accomplish nearly all of the possible optimizations in a single pass. In other words, by definition, each type based variation of a Generic Function call can only be fired when data of that type is provided as a parameter.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/12/porting-jit-to-cli-part-3-3519327524638923621.html b/posts/2008/12/porting-jit-to-cli-part-3-3519327524638923621.html new file mode 100644 index 000000000..4f3491993 --- /dev/null +++ b/posts/2008/12/porting-jit-to-cli-part-3-3519327524638923621.html @@ -0,0 +1,776 @@ + + + + + +Porting the JIT to CLI (part 3) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Porting the JIT to CLI (part 3)

                + + + +
                +

                In my two previous posts, we talked about the PyPy JIT generator, seeing +that it can produce huge speedups and how its backend-independent frontend +works.

                +

                In this post, we will look closer at the internals of the CLI JIT backend; in +particular, we will see how we work around some serious limitations of the +platform, and why these workarounds didn't have any serious impact on the +performances of our toy virtual machine.

                +
                + + + + + +

                One of the core aspects of the PyPy translator is the concept of flow graph: a +flow graph is a data structure that represents the code we are operating on. +It is composed of a set of basic blocks, each block containing a sequence +of operations; blocks are connected together by links, and each link can +carry a variable number of arguments whose value is passed to the target +block. In case a block contains more than one outgoing link, the one to +follow is selected by looking at the value of a designated variable (the +exitswitch), thus making it possible to implement conditional jumps. To have +a more complete description of the flow graphs model, check the documentation.

                + +

                As we saw in the previous post, the generated JIT compiler makes heavy use of +flexswitches to generate efficient code, continuously intermixing +JIT-compile time and runtime.

                +

                In terms of graphs, we can think of a flexswitch as a special block whose +links change over time. In particular, adding a new case to the flexswitch is +equivalent to creating a link whose target is a new block where the just +generated code starts. Thus, the graph grows over time, as shown by +the following images:

                + + + + + + +

                In the images above, the block containing the flexswitch is colored in +cyan. In the first picture, there is only one block connected to the +flexswitch: this block contains the code to restart the JIT compilation. The +second picture shows the graph after the first case has been added: you can +clearly see that a new block has been created and attached to the flexswitch. +Finally, the third picture shows the graph after a while, with a lot of new +blocks attached.

                +
                +
                +

                Translate graphs to CLI

                +

                Conceptually, the goal of the CLI JIT backend is to express these graphs in +terms of CLI bytecode.

                +

                Translating a single block is easy, as it is just a list of sequential +operations, and it's straightforward to map each operation to the equivalent +CLI opcode or to a call to a helper method. Moreover, we need a way to +express links between the various basic blocks: if the links are known in +advance, rendering them is as easy as emitting a (potentially conditional) jump to +the target block. Thus, we won't discuss this part in detail, as it is quite +straightforward.

                +

                The hard part is how to implement flexswitches: at the time when we are +emitting the code, some of the blocks of this growable graph don't even exist: +how can we make a jump to a non-existent block of code? For backends that +emit assembly code, it is rather easy: when they need to add a new case to the +flexswitch, they can just patch the existing code to insert a jump to a +newly allocated area of the memory, where the new code is being generated.

                +

                For CLI this approach is not feasible, as the VM will never allow us to modify +existing code. Thus, we need to think of a different approach.

                +
                +
                +

                Graphs and methods

                +

                In .NET, the basic unit of compilation is the method: the only way to +execute some bytecode is to wrap it into a method. Moreover, it is not +possible to execute a method until it has been completed, and after this point +it is no longer possible to add new code.

                +

                Because of all these constraints we cannot simply map each graph to its own +method, since we saw that our graphs can grow after they have already been +executed a few times.

                +

                Hence, we need to distinguish between the two concepts:

                +
                +
                  +
                • a graph is the logical unit of code as seen by the JIT compiler: +concretely, the CLI JIT backend renders it as one or more methods;
                • +
                • a method is a collection of basic blocks; each method has the so +called parent graph, i.e. the graph its blocks logically belong to.
                • +
                +
                +

                The first method of a graph is called main method (which has +nothing to do with the Main static methods found in .exe files); other +methods are called children methods.

                +

                When we want to add a new case to the flexswitch, we create a method +containing all the new code; then we wrap the method inside a delegate (the +.NET equivalent of a function pointer) and pass it to the flexswitch, so that +it can later invoke it.

                +
                +
                + +

                Using this approach, after a while the blocks of our original graph are +scattered over a lot of different methods; however, there are no constraints +about how these blocks can be linked together, so it happens to have links +between blocks which are not in the same method. In the following, we will +refer to them as non-local links.

                +

                If the non-local block we want to jump to happens to be at the beginning of +its containing method, it is enough to invoke the method; but, what if we want +to jump somewhere in the middle? What we really want is to produce a method +which has multiple entry-points; again, doing it in assembly would be +trivial, but the virtual machine does not provide any support for it, so we +need a workaround.

                +

                Each method in a graph is assigned a unique 16-bit method id; each block in +a method is assigned a progressive 16-bit block number. From these two +numbers, we can compute the block id as an unsigned integer, by storing +the method id in the first 16 bits and the block number in the second 16 bits. +By construction, the block id is guaranteed to be unique in the graph.

                +

                The following picture shows a graph composed of three methods; the id of each +method is shown in red, while the block ids are shown in red (for the method +id part) and black (for the block number part). The graph contains three +non-local links; in particular, note the link between blocks 0x00020001 +and 0x00010001 which connects two blocks that reside in different methods.

                + + + +

                Every method contains a special dispatch block, (not shown in the picture above) whose goal is to jump to +the specified block number inside the method itself. The first argument of a +child method is always a block id; when the method starts, it immediately +jumps to the dispatch block, and thus to the desired block.

                +

                For example, suppose we have a method which contains 3 blocks numbered 0, 1, +2; here is what its dispatch block looks like; for simplicity it is shown as +C# code, but it is actually generated as IL bytecode:

                +
                +// dispatch block
                +int methodid = (blockid & 0xFFFF0000) >> 16;  // take the first 16 bits
                +int blocknum = blockid & 0x0000FFFF;          // take the second 16 bits
                +
                +if (methodid != MY_METHOD_ID) {
                +// jump_to_unknown block
                +...
                +}
                +
                +switch(blocknum) {
                +case 0:
                +goto block0;
                +case 1:
                +goto block1;
                +case 2:
                +goto block2;
                +default:
                +throw new Exception("Invalid block id");
                +}
                +
                +

                Whenever we want to jump to a non-local block, it is enough to store the block +id in the appropriate variable and jump to the dispatch block. If the block +resides in a different method, the jump_to_unknown block is entered; this +special block is implemented differently by the main method and the child +methods, as we will see soon.

                +

                Each time a new method is added to the graph, we build a delegate +for it, and store it in a special array +called method_map; since we assign the method id sequentially starting +from 0, we are sure that to fetch the method whose id is n we can simply +load the n-th element of the array.

                +

                The jump_to_unknown block of the main method uses this array to select the +right method, and calls it (FlexSwitchCase is the type of delegates for +all children methods):

                +
                +// jump_to_unknown block of the main method
                +FlexSwitchCase meth = method_map[methodid];
                +blockid = meth(blockid, ...); // execute the method
                +goto dispatch_block;
                +
                +

                Each child method returns a block id specifying the next block to jump to; +after its execution, we assign the return value to the blockid variable, +and jump again to the dispatch block, which will jump again to the appropriate +block.

                +

                Keeping this in mind, it is straightforward to implement the +jump_to_unknown block of children methods: it is enough to return the +target block id to the caller, and let its dispatch loop do the right thing. +If the caller is also a child method, it will return it again, until we reach +the dispatch loop of the main method, which will finally do the jump. In +theory, we could implement things differently and jump directly from a +child method to another one, but in that case the call stack could grow +indefinitely in case of a tight loop between two blocks residing in different +methods.

                +

                To implement the dispatch block we can exploit the switch opcode of the +CLI; if the .NET JIT is smart enough, it can render it using an indirect jump; +overall, jumping to a non-local block consists of an indirect function call +(by invoking the delegate) plus an indirect jump (by executing the switch +opcode); even if this is more costly than a simple direct jump, we will see in +the next section that this is not the main source of overhead when following a +non-local link.

                +

                Obviously, the slow dispatching logic is needed only when we want to jump to a +non-local block; if the target block happens to reside in the same method as +the current one, we can directly jump to it, completely removing the overhead.

                +

                Moreover, the dispatch blocks are emitted only if needed, i.e. if the parent +graph contains at least one flexswitch; graphs without flexswitches are +rendered in the obvious way, by making one method per graph.

                +
                +
                +

                The slow bit: passing arguments

                +

                Jumping to the correct block is not enough to follow a link: as we said +before, each link carries a set of arguments to be passed from the source to +the target block. As usual, passing arguments across local links is easy, as +we can just use local variables to hold their values; on the other hand, +non-local links make things more complex.

                +

                The only way to jump to a block is to invoke its containing method, so the +first solution that comes to mind is to specify its input arguments as +parameters of the method; however, each block has potentially a different +number (and different types) of input arguments than every other block, so we +need to think of something else.

                +

                An alternative solution could be to compute the union of the sets of input +arguments of all the blocks in the method, and use this set as a signature +for the method; this way, there would be enough space to specify the input +arguments for every block we might want to jump to, each block ignoring the +exceeding unused parameters.

                +

                Unfortunately, all the children methods must have the very same signature, +as they are all called from the same calling site in the dispatch block of the +main method. Since the union of the set of input arguments (and hence the +computed signature) varies from method to method, this solution cannot work.

                +

                We might think to determine the signature by computing the union of input +arguments of all blocks in the graph; this way, all the children methods +would have the same signature. But as we said above, the graph grows new +blocks at runtime, so we cannot determine in advance which set of input +arguments we will need.

                +

                To solve the problem we need a way to pass a variable number of arguments +without knowing in advance either their number or their types. Thus, we use +an instance of this class:

                +
                +public class InputArgs {
                +public int[] ints;
                +public float[] floats;
                +public object[] objs;
                +...
                +}
                +
                +

                Since the fields are arrays, they can grow as needed to contain any number of +arguments; arguments whose type is primitive are stored in the ints or +floats array, depending on their type; arguments whose type is a reference +type are stored in the objs array: it's up to each block to cast each +argument back to the needed type.

                +

                This solution imposes a huge overhead on both writing and reading arguments:

                +
                +
                  +
                • when writing, we need to make sure that the arrays are big enough to +contain all the arguments we need; if not, we need to allocate a bigger +array. Moreover, for each argument we store into the array the virtual +machine performs a bound-check, even if we know the index will never be +out of bounds (because we checked the size of the array in advance);
                • +
                • when reading, the same bound-check is performed for each argument read; +moreover, for each value read from the objs array we need to insert a +downcast.
                • +
                +
                +

                To mitigate the performance drop, we avoid allocating a new InputArgs +object each time we do a non-local jump; instead, we preallocate one at the +beginning of the main method, and reuse it all the time.

                +

                Our benchmarks show that passing arguments in arrays is about 10 times slower +than passing them as real parameters of a method. Unfortunately, we couldn't +come up with anything better.

                +
                +
                +

                Implement flexswitches

                +

                Now, we can exploit all this machinery to implement flexswitches, as this is +our ultimate goal. As described above, the point is to be able to add new +cases at runtime, each case represented as a delegate. Here is an excerpt +of the C# class that implements a flexswitch that switches over an integer +value:

                +
                +public class IntLowLevelFlexSwitch
                +{
                +public uint default_blockid = 0xFFFFFFFF;
                +public int numcases = 0;
                +public int[] values = new int[4];
                +public FlexSwitchCase[] cases = new FlexSwitchCase[4];
                +
                +public void add_case(int value, FlexSwitchCase c)
                +{
                +...
                +}
                +
                +public uint execute(int value, InputArgs args)
                +{
                +for(int i=0; i<numcases; i++)
                +if (values[i] == value) {
                + return cases[i](0, args);
                +}
                +return default_blockid;
                +}
                +}
                +
                +

                For each case, we store both the triggering value and the corresponding +delegate; the add_case method takes care to append value and c to +the values and cases arrays, respectively (and resize them if +necessary). The interesting bit is the execute method: it takes a value +and a set of input arguments to be passed across the link and jumps to the +right block by performing a linear search in the values array.

                +

                As shown by previous sections, the first argument of a FlexSwitchCase is +the block id to jump to; since when we go through a flexswitch we always want +to jump to the first block of the method, we pass the special value 0 as a +block id, which precisely means jump to the first block. This little +optimization lets us avoid explicitly storing the block id for the first +block of all the cases.

                +

                The value returned by execute is the next block id to jump to; if the +value is not found in the values array, we return the default_blockid, +whose value has been set before by the JIT compiler; default_blockid +usually points to a block containing code to restart the JIT compiler again; +when the JIT compiler restarts, it emits more code for the missing case, then +calls add_case on the flexswitch; from now on, the new blocks are wired +into the existing graph, and we finally managed to implement growable +graphs.

                +
                +
                +

                Performances

                +

                As we saw, implementing growable graphs for CLI is a pain, as the virtual machine +offers very little support, so we need an incredible amount of workarounds. +Moreover, the code generated is much worse than what an assembly backend could +produce, and the cost of following a non-local link is very high compared to +local links.

                +

                However, our first blog post showed that we still get very good +performances; how is it possible?

                +

                As usual in computer science, most of the time of a running program is +spent in a tiny fraction of the code; our benchmark is no exception, and the +vast majority of the time is spent in the inner loop that multiplies numbers; +the graph is built in such a way that all the blocks that are part of the +inner loop reside in the same method, so that all links inside are local (and +fast).

                +

                Flexswitches and non-local links play a key role to select the right +specialized implementation of the inner loop, but once it is selected they are +not executed anymore until we have finished the computation.

                +

                It is still unclear how things will look when we compile the full +Python language instead of a toy one; depending on the code, it could be +possible to have non-local links inside the inner loop, thus making +performance much worse.

                +
                +
                +

                Alternative implementations

                +

                Before implementing the solution described here, we carefully studied a lot of +possible alternatives, but all of them either didn't work because of a +limitation of the virtual machine or they could work but with terrible +performances.

                +

                In particular, in theory it is possible to implement non-local links using +tail calls, by putting each block in its own method and doing a tail call +instead of a jump; this would also solve the problem of how to pass arguments, +as each method could have its own signature matching the input args of the +block. I would like to explain this solution in a more detailed way as I +think it's really elegant and nice, but since this post is already too long, +I'll stop here :-).

                +

                In theory, if the .NET JIT were smart enough it could inline and optimize away +the tail calls (or at least many of those) and give us very efficient code. +However, one benchmark I wrote shows that tail calls are up to 10 times +slower (!!!) than normal calls, thus making it impractical to use them for our +purposes.

                +
                +
                +

                Conclusion

                +

                Despite the complexity of the implementation, our results are extremely good; +the speedup we got is impressive, and it proves that PyPy's approach to JIT +compilers can work well also on top of object oriented virtual machines like +.NET or the JVM.

                +

                Generating bytecode for those machines at runtime is not a new idea; Jython, +IronPython, JRuby and other languages have been doing this for years. +However, Jython and IronPython do only a simple "static" translation, which +doesn't take advantage of the information gathered at runtime to generate +better, faster and specialized code. Recently, JRuby grew a new strategy to +JIT-compile only hotspots, taking advantage of some information gathered +while interpreting the code; this is still a "one-shot" compilation, where the +compiled code does not change over time.

                +

                To my knowledge, PyPy brings the first example of a +language which implements a true JIT compiler on top of the underlying JIT +compiler of the virtual machine, emitting bytecode that changes and adapts +over time. If someone knows other languages doing that, I would really +like to know more.

                +

                Being so innovative, the problem of this approach is that the current virtual +machines are not designed to support it in a native way, and this forces us to +put a lot of workarounds that slow down the generated code. The hope is that +in the future the virtual machines will grow features that help us to generate +this kind of code. The experimental Da Vinci VM seems to go in the right +direction, so it is possible that in the future I will try to write a JIT +backend for it.

                +

                At the moment, the CLI JIT backend is almost complete, and all the hardest +problems seem to be solved; the next step is to fix all the remaining bugs +and implement some minor features that are still missing, then try to apply it +to the full Python language and see what the outcome is.

                +
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2008-12-07 22:33: +
                +
                +

                JikesRVM + LLVM

                https://osdir.com/ml/java.jikes.rvm.devel/2003-09/msg00059.html

                Don't know if it succeeded.

                +
                +
                +
                +
                + + Yosef wrote on 2008-12-08 08:15: +
                +
                +

                The comment about assembly-code patching is interesting. Do you mean assembly code backends can do runtime patching of previously generated code? I thought this is impossible, because operating systems mark executable pages as read-only. How is that dealt with?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2008-12-08 09:29: +
                +
                +

                Most executable pages are read only, but there is nothing that stops you from creating ones that are rw. You just pass different flags to mmap.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-12-08 19:11: +
                +
                +

                @Yosef
                about patching generated code, see fijal's comment. Btw, this is exactly the same approach used by psyco

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-11 08:22: +
                +
                +

                It's wrong to say IronPython only does static translation. The Call Site stuff happens and generates IL at run time, and generates different code depending on the types. In fact you may want to look at how they do it, becuase they regenerate the IL for a method multiple times, which may be another way of implementing Flex switches

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2008-12-12 09:02: +
                +
                +

                @Ben Young
                do you have a link that explains in more detail what you mean?
                As far as I know, DLR's callsites are just a way to do polymorphic inline caches, but nothing more. In particular, they don't do any specialization of the called code.

                You are right that we could do the same to implement flexswitches, though I think this is a minor optimization, as right now the real performance problem is how to pass arguments across non-local links.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-12 16:26: +
                +
                +

                Hi Antonio

                IronPython can create a method that looks like this

                void object add(object a, object b)
                {
                throw new Exception("I don't know how to add")
                }

                into

                void object add(object a, object b)
                {
                if(a is int && b is int)
                return (int)a + (int)b

                throw new Exception("I don't know how to add")
                }

                and can further add new tests at runtime. The code do do the adding is written directly into the method body and there's no futher call needed. This is runtime code generation, not just caching

                In your case, instead of having multiple methods implementing different blocks you could just rewrite the whole "master" method every time the flexswitch changes. That way there's no call overhead at all. That's what the DLR does. I think the main thing it's missing is promotion, so shared tests can't be moved up a level, and it doesn't do inlining.

                +
                +
                +
                +
                + + Anonymous wrote on 2008-12-18 08:32: +
                +
                +

                I'm a beginner programmer, so please excuse my beginner questions :-)

                I just started learning Python as my first programming language. Several of my programmer friends have said I should learn Java instead, one reason being the difference in performance - specifically for doing natural language processing / AI stuff which is the area I am interested in.

                With PyPy, do you think it is likely that in the near future, Python's performance may be close to that of Java? I do plan on learning multiple languages, but it would be nice if I could stick with Python for as long as possible :-)

                +
                +
                +
                +
                + + Lucian wrote on 2008-12-20 12:56: +
                +
                +

                @Anonymous

                Probably. People have great hopes for PyPy, but you can never know how it will turn out, if at all.

                Right now, you can use things like numpy, psycho, shedskin, cython/pyrex and a few others to speed up you code, only needing to know a few things about C or C++. Google them.

                +
                +
                +
                +
                + + Luis wrote on 2008-12-20 14:38: +
                +
                +

                @Sin

                You don't need to know any c or c++ to use psyco or shedskin. Only python.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-05 03:07: +
                +
                +

                WoW shares many wow gold of its features with previously launched games. Essentially, you battle with wow gold cheap monsters and traverse the countryside, by yourself or as a buy cheap wow gold team, find challenging tasks, and go on to higher aoc gold levels as you gain skill and experience. In the course of your journey, you will be gaining new powers that are increased as your skill rating goes up. All the same, in terms of its features and quality, that is a ture stroy for this.WoW is far ahead of all other games of the genre the wow power leveling game undoubtedly is in a league of its own and cheapest wow gold playing it is another experience altogether.

                Even though WoW is a Cheap Wow Gold rather complicated game, the controls and interface are done in warhammer gold such a way that you don't feel the complexity. A good feature of the game is that it buy wow items does not put off people with lengthy manuals. The instructions bygamer cannot be simpler and the pop up tips can help you start playing the game World Of Warcraft Gold immediately. If on the other hand, you need a detailed manual, the instructions are there for you to access. Buy wow gold in this site,good for you, BUY WOW GOLD.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-11 04:00: +
                +
                +

                My friends and I like to buy Anarchy credits, because the Anarchy Online credits is very useful to upgrade equipment. Only your equipment becomes better, then you can win this game. In Anarchy gold, you can buy everything you want in this game. Tomorrow will be my birthday, so my friends promise to buy AO credits as gifts. I am so happy. They understand me so well, Anarchy online gold is my favorite.
                I like angels gold very much because it is very useful. In fact at first sight I have fallen in love with angels online gold. So no matter how much I have spent to buy angels gold, I never regret. Because of cheap angels online gold, I meet a lot of friends.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2008/12/pycon-2009-9090464449197911432.html b/posts/2008/12/pycon-2009-9090464449197911432.html new file mode 100644 index 000000000..246c2ed97 --- /dev/null +++ b/posts/2008/12/pycon-2009-9090464449197911432.html @@ -0,0 +1,349 @@ + + + + + +Pycon 2009 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Pycon 2009

                + + + +
                +

                Hello.

                +

                +Both of our PyPy talks have been accepted for Pycon US 2009. Although both +are somewhat related to PyPy, they're vastly different in +topics, attitude and target audience.

                +

                +The first one is a classic PyPy status talk - we'll mostly talk about +our achievements from the last year (readers of this blog are aware of most, +but not all :) as well as some general introduction and plans for the future. +

                +

                +The second one is about PyPy's sandboxing features. This is in my opinion +a very underestimated feature, also by us, because it's not really well +advertised or documented. The main purpose of the talk is to present +to the general public how this works and how to use it. Hopefully we will +get to work and publish about this a bit more ahead of Pycon already. +Unlike Zope's Restricted Python, it provides you with the full python +language, inside a fully +virtualized sandbox, controlled from an external process by a custom +security policy. Stay tuned for more :-) +

                +

                +See you at Pycon 2009! +

                +

                +Cheers,
                +fijal and holger +

                +
                +

                Comments

                +
                +
                +
                + + Alex wrote on 2008-12-24 07:17: +
                +
                +

                Can't wait to hear it, Fijal gave a fantastic talk last year and I'm excited for this year's as well. Really hoping it doesn't conflict with my panel :)

                +
                +
                +
                +
                + + Anonymous wrote on 2009-01-07 10:24: +
                +
                +

                hi,

                would you have somrthing to say about that ?:

                https://www.python-forum.org/pythonforum/viewtopic.php?f=1&t=10744&sid=304ad6507b0db8420ae1df9f6c1522cd

                thx

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-01-10 09:03: +
                +
                +

                Well, I'm not really up to discuss with some rants.

                cheers,
                fijal

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/01/wroclaw-2009-pypy-sprint-and-talk-8240928228677982487.html b/posts/2009/01/wroclaw-2009-pypy-sprint-and-talk-8240928228677982487.html new file mode 100644 index 000000000..44d8a0b93 --- /dev/null +++ b/posts/2009/01/wroclaw-2009-pypy-sprint-and-talk-8240928228677982487.html @@ -0,0 +1,309 @@ + + + + + +Wroclaw 2009 PyPy sprint and talk | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Wroclaw 2009 PyPy sprint and talk

                + + + +
                +

                The next PyPy sprint will be held in Wrocław, Poland 7-14th February 2009. This is a fully public +sprint and all newcomers are welcome. Preceding the sprint there +will be a talk at University of Technology in Wrocław held on the 22nd of January.

                + +

                For detailed info about the sprint, look here.

                + +

                The talk will be a general, high-level overview of the PyPy project. There is a very nice poster, made by Jakub Gustak and Bartosz Skowron (in Polish):

                + + + +Talk details: +
                  +
                • Location: Politechnika Wrocławska, budynek C-13, sala 0.31
                • +
                • +
                • +
                • Date: 22nd January 2009, 19:00
                • +
                • Language: very likely Polish, although the talk can also be in English if some non-native Polish speakers show up. +
                • +
                + +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/02/wroclaw-2009-sprint-progress-report-2510073170049635489.html b/posts/2009/02/wroclaw-2009-sprint-progress-report-2510073170049635489.html new file mode 100644 index 000000000..0fd9d4100 --- /dev/null +++ b/posts/2009/02/wroclaw-2009-sprint-progress-report-2510073170049635489.html @@ -0,0 +1,368 @@ + + + + + +Wroclaw 2009 sprint progress report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Wroclaw 2009 sprint progress report

                + + + +
                +

                Hello.

                + +We have just finished probably the smallest sprint ever +in PyPy history. For most of the time it was just me +and Armin pairing together.

                + +We also had a chance to work a bit with people from +the University, but there were definitely not enough +core developers to organize the work in a reasonable +manner. At some point we ended up having two pairs containing +four people each.

                + +Jakub and Bartosz (who were our gentle hosts) worked +on getting PyPy's sandbox integrated with django. +It's still just an example of what you can do (i.e. you +can do much more), but it's already interesting to look +at. The code can be found in user dir. This server (not yet online anywhere, sorry) +is able to run untrusted python code provided by the user inside +a fully configurable sandbox.

                + +We also implemented missing peepholer optimizations from +CPython, finding out that some peepholer tests were failing, +just because PyPy is optimizing better :-)

                + +The main part of the sprint was work on the JIT (most notably the fifth +generation of the JIT), which was moved +from the obscure directory in Carl's user in svn (which contains +branches these days!) into a PyPy branch. It's still very much +work in progress and a lot of pen and paper or handwaving was +involved, but we were able to implement a lot of basics in record time. +

                +Right now we need a lot of rest after the exhausting sprint, +but after that, stay tuned for more information about +the progressing JIT!

                + +Cheers,
                +fijal

                +
                +

                Comments

                +
                +
                +
                + + Michael Foord wrote on 2009-02-14 14:14: +
                +
                +

                Great to see both concrete work on the JIT and some practical applications for PyPy making progress. Keep up the good work.

                See you at PyCon.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-02-16 08:47: +
                +
                +

                Great to see the JIT evolving.

                Zejn

                +
                +
                +
                +
                + + Olle Jonsson wrote on 2009-02-21 01:51: +
                +
                +

                Huzzah! And yay for practical apps.

                +
                +
                +
                +
                + + stuaxo wrote on 2009-02-24 12:48: +
                +
                +

                Great to see progress being made on this :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/03/applying-tracing-jit-to-interpreter-3287844903778799266.html b/posts/2009/03/applying-tracing-jit-to-interpreter-3287844903778799266.html new file mode 100644 index 000000000..a4627044d --- /dev/null +++ b/posts/2009/03/applying-tracing-jit-to-interpreter-3287844903778799266.html @@ -0,0 +1,617 @@ + + + + + +Applying a Tracing JIT to an Interpreter | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Applying a Tracing JIT to an Interpreter

                + + + +
                +

                After I had failed once more to explain to someone on IRC what the idea behind +the current JIT generator work of PyPy is, I decided to just write a blog post to +explain it. Here it is :-). The post turned out to be a bit long, so please bear +with me.

                +

                The goal of the post is to give an understanding of how PyPy's JIT generator is +going to work. To do this, I will look at what happens when you write an +interpreter in Java and apply a completely normal tracing JIT to it (for this +reason all the code examples will be in some sort of pseudo-Java). The +resulting generated machine code is bad, so I will explain a way to fix the +occurring problem.

                +

                The techniques I describe here are conceptually similar to what we are doing in +PyPy. The details (as usual) are different. The reason why I am trying to +explain things in this way is that I can start from tracing JITs, which are a +known existing technique.

                +

                To understand the following, it is helpful to already know a bit about how a normal +tracing JIT works. I will give a reminder of how it works, but there also +exist a couple of more thorough introductions on the web already. +I also will leave out a lot of details about the more detailed workings of +tracing JITs and only explain the things that are relevant to what I am trying +to get to here.

                +

                Tracing JITs

                +

                Tracing JITs are an idea explored by the Dynamo project in the context of +dynamic optimization of machine code at runtime. The techniques were then +successfully applied to Java VMs and are now being used by Mozilla's +TraceMonkey JavaScript VM. They are built on some basic assumptions:

                +
                +
                  +
                • programs spend most of their runtime in loops
                • +
                • several iterations of the same loop are likely to take similar code paths
                • +
                • the best way to gain information about the behaviour of a program is to +observe it
                • +
                +
                +

                The basic approach of a tracing JIT is to only generate machine code for +commonly executed loops and to interpret the rest of the program. The code for +those common loops however should be highly optimized, including aggressive +inlining.

                +

                The generation of loops works as follows: At first, everything is interpreted. +The interpreter does a bit of lightweight profiling to figure out which loops +are run often. When a common loop is identified, the interpreter enters a +special mode (called tracing mode). When in tracing mode, the interpreter +records a history (the trace) of all the operations it executes, in addition +to actually performing the operations. During tracing, the tracer repeatedly +checks whether the interpreter is at a position in the program that it had seen +earlier in the trace. If this happens, the trace recorded corresponds to a loop +in the program that the tracing interpreter is running. At this point, this loop +is turned into machine code by taking the trace and making machine code versions +of all the operations in it.

                +

                This process assumes that the path through the loop that was traced is a +"typical" example of possible paths (which is statistically likely). Of course +it is possible that later another path through the loop is taken, therefore the +machine code will contain guards, which check that the path is still the same. +If during execution of the machine code a guard fails, the machine code is left +and execution falls back to using interpretation (there are more complex +mechanisms in place to still produce more code for the cases of guard failures, +but they are of no importance for this post).

                +

                It is important to understand when the tracer considers a loop in the trace to +be closed. This happens when the position key is the same as at an earlier +point. The position key describes the position of the execution of the program, +e.g. usually contains things like the function currently being executed and the +program counter position of the tracing interpreter.

                +

                Let's look at a small example. Take the following code:

                +
                +int sum_1_to_n(int n) {
                +    int result = 0;
                +    while (n >= 0) {
                +        result += n;
                +        n -= 1;
                +    }
                +    return result;
                +}
                +
                +

                The tracing JIT will at one point trace the execution of the while loop in +sum_1_to_n. The trace might look as follows:

                +
                +guard_true(n >= 0);
                +result += n;
                +n -= 1;
                +<loop_back>
                +
                +

                This trace will then be turned into machine code. Note that the machine code +loop is by itself infinite and can only be left via a guard failure.

                +

                A slightly more complex example:

                +
                +int f(int a, int b) {
                +    if (b % 46 == 41)
                +        return a - b;
                +    else
                +        return a + b;
                +}
                +
                +int strange_sum(int n) {
                +    int result = 0;
                +    while (n >= 0) {
                +        result = f(result, n);
                +        n -= 1;
                +    }
                +    return result;
                +}
                +
                +

                The trace of the loop in strange_sum would maybe look like this:

                +
                +guard_true(n >= 0);
                +a = result;
                +b = n;
                +guard_false(b % 46 == 41);
                +result = a + b;
                +n -= 1;
                +<loop_back>
                +
                +

                This would then be turned into machine code. Note how f was inlined into the +loop and how the common else case was turned into machine code, while the +other one is implemented via a guard failure.

                +

                Applying a Tracing JIT to an Interpreter

                +

                In the rest of the post we will explore what happens when the program that is +being executed/compiled by the tracing JIT is itself a (bytecode) interpreter +for another language.

                +

                A stylized bytecode interpreter for a simple programming language could look as +follows:

                +
                +W_Object interpret(String bytecode, ...) {
                +    Stack<W_Object> stack = new Stack<W_Object>();
                +    int pc = 0;
                +    while (true) { // bytecode dispatch loop
                +        char instruction = bytecode.charAt(pc);
                +        pc += 1;
                +        switch (instruction) {
                +            case ADD:
                +                W_Object arg2 = stack.pop();
                +                W_Object arg1 = stack.pop();
                +                stack.push(do_addition(arg1, arg2));
                +                break;
                +            case SUB:
                +                W_Object arg2 = stack.pop();
                +                W_Object arg1 = stack.pop();
                +                stack.push(do_substraction(arg1, arg2));
                +                break;
                +            case RETURN:
                +                return stack.pop();
                +            case JUMP_BACKWARD:
                +                pc -= (int)bytecode.charAt(pc);
                +                break;
                +            case LOAD_INTEGER:
                +                int value = (int)bytecode.charAt(pc);
                +                pc += 1;
                +                stack.push(new W_Integer(value));
                +                break;
                +            case PRINT:
                +                do_print(stack.pop());
                +                break;
                +            case DUP:
                +                stack.push(stack.peek());
                +                break;
                +            case JUMP_IF_TRUE:
                +                ...
                +            ...
                +        }
                +    }
                +
                +

                If we apply a tracing JIT to this function, it will trace and compile the +execution of one bytecode, because after one bytecode the bytecode dispatch loop +is closed. E.g. it might trace and produce machine code for the execution of a +SUB. (Sidenote: this interpret function is an example where one of the +assumptions of a tracing JIT breaks down: two iterations of the bytecode dispatch +loop are rarely going to follow the same code path, because usually two +consecutive bytecodes encode different instructions).

                +

                The important bit to remember here is that the tracing JIT will produce a +machine code loop that corresponds to the bytecode dispatch loop in the +interpret function. Let's see how we can change that.

                +

                Improving the Generated Code

                +

                If we want to make use of the fact that the program that is being jitted is +itself an interpreter, we need to change the tracing JIT a bit. To be more +precise we add a way for the user of the tracing JIT to add information to the +position key that the tracing JIT uses to decide when a loop is closed. This is +done by a call to a magic function add_to_position_key. This allows the +program writer to influence the tracing JIT's behaviour.

                +

                The semantics of add_to_position_key is as follows: The method itself does +not do anything. It has an effect only when it is seen during tracing. If it is +seen during tracing, the tracer adds the argument of the call to the position +key that the tracer is using to find out whether a loop was closed or not.

                +

                In the example of the interpret function above, we would add a call to this +function into the while loop as follows:

                +
                +W_Object interpret(String bytecode, ...) {
                +    Stack stack = new Stack();
                +    int pc = 0;
                +    while (true) { // bytecode dispatch loop
                +        add_to_position_key(pc);
                +        add_to_position_key(bytecode);
                +        char instruction = bytecode.charAt(pc);
                +        pc += 1;
                +        switch (instruction) {
                +            case ADD:
                +    ...
                +
                +

                When the modified tracing JIT now traces the interpret function executing a +SUB, something interesting happens. When the bytecode loop is closed, the +modified tracing JIT does not consider the trace to be a loop, because the value of +pc has been increased by one, so the position key differs. Instead it +continues to trace, effectively unrolling the bytecode dispatch loop of +interpret.

                +

                The only way for a loop to be considered closed is if the pc variable has +the same value a second time. This can only happen after a JUMP_BACKWARD +instruction has been executed. A JUMP_BACKWARD instruction will only be in +the bytecode when the bytecode represents a loop. This means that the modified +tracing JIT will trace the interpret function and will only consider that +the trace represents a loop when the bytecode itself represents a loop! Thus, a +machine code loop will eventually be created that corresponds to the loop in the +bytecode.

                +

                Let's look at an example. If we have a bytecode that corresponds to the +following instructions:

                +
                +pc |   instruction
                +---+---------------------
                +0  |  LOAD_INTEGER 0
                +2  |  DUP
                +3  |  PRINT
                +4  |  LOAD_INTEGER 1
                +6  |  ADD
                +7  |  JUMP_BACKWARD 6
                +
                +

                This loop will print integers starting from 0 and going on from there. The +modified tracing JIT will unroll the bytecode dispatch until it sees the +JUMP_BACKWARD bytecode. After that bytecode the pc will be 2 again. Thus +the earlier position key is repeated, which means that the loop will be closed. +The produced machine code will do the equivalent of the following Java code:

                +
                +...
                +guard_true(pc == 2)
                +guard_true(bytecode == "... correct bytecode string ...")
                +while (true) {
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == DUP);
                +    stack.push(stack.peek());
                +
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == PRINT);
                +    do_print(stack.pop());
                +
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == LOAD_INTEGER)
                +    value = (int)bytecode.charAt(pc);
                +    pc += 1
                +    stack.push(W_Integer(value))
                +
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == ADD)
                +    arg2 = stack.pop()
                +    arg1 = stack.pop()
                +    stack.push(do_addition(arg1, arg2))
                +
                +    instruction = bytecode.charAt(pc);
                +    pc += 1;
                +    guard_true(instruction == JUMP_BACKWARD)
                +    pc -= (int)bytecode.charAt(pc);
                +}
                +
                +

                This is machine code that essentially does what the bytecode above did. Of +course the code still contains some remnants of the interpreter (like the program +counter manipulations, the stack handling, etc), which would have to be removed +by some clever enough optimization step. If this were done, the result would look a +lot more natural.

                +

                Summary

                +

                If a tracing JIT is enhanced by a way to influence its loop-closing behaviour we +can significantly improve its performance when the jitted program is itself an +interpreter. The result is that in such a case the produced machine code +will correspond to the functions that are being interpreted, not to the code of +the interpreter itself.

                +

                Now, what does all this have to do with PyPy? What we have been working on for a +while is a sort of tracing JIT for RPython which can be customized with a +function very similar to the add_to_position_key described above. This will +make it possible to make the tracing JIT generate code that corresponds to the +code that the interpreter interprets. For example, we would add a call to +add_to_position_key to SPy, PyPy's Smalltalk VM. Then the tracing JIT will +produce machine code for Smalltalk-level loops, with all the usual benefits of a +tracing JIT (like inlining of intermediate methods, constant-folding, ...). +This JIT differs from normal tracing JITs in that it also supports very powerful +constant-folding and allocation-removal optimizations. Those optimizations will +(hopefully) be the content of a later blog post.

                +

                The basics of this process have been working fine for quite a while. What the +work currently focuses on is to improve the optimizers to remove not only the +bytecode manipulation code, but also the stack handling, and a large number of +other inefficiencies.

                +
                +

                Comments

                +
                +
                +
                + + Benjamin Peterson wrote on 2009-03-03 03:04: +
                +
                +

                Wow, that's very cool! PyPy is an amazing novel project, but how did you guys ever think of this?

                +
                +
                +
                +
                + + Unknown wrote on 2009-03-03 13:47: +
                +
                +

                great explanaition.. thanks!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-03 15:01: +
                +
                +

                Very nice. You might want to have a look at Sullivan, et al (2003) Dynamic Native Optimization of Interpreters. They also identify the need to record not only the native PC but also the interpreter's virtual PC to identify useful trace heads. They provide three intrinsic functions (compared to your single add_to_position_key) to achieve this. Further, they provide three more intrinsics to support constant propagation.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-03-04 15:27: +
                +
                +

                Wow, that's extremely interesting! It's indeed very similar to what I describe in the blog post (apparently Armin knew of the paper, but I didn't). Of course they are severely hampered by the fact that the system is working on assembler level, so they don't really have enough information available to do really interesting optimizations.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/03/good-news-everyone-421421336094214242.html b/posts/2009/03/good-news-everyone-421421336094214242.html new file mode 100644 index 000000000..04b5322ba --- /dev/null +++ b/posts/2009/03/good-news-everyone-421421336094214242.html @@ -0,0 +1,411 @@ + + + + + +Good news everyone! | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Good news everyone!

                + + + +
                +

                +A quick update from the JIT front. As of yesterday, we're now able to translate +a highly-experimental Python interpreter that contains a JIT. It mostly crashes +immediately, mostly due to some unsupported operations in the assembler backend, +but for a carefully crafted program, we're able to get massive speedups. +For something as complex as: +

                +
                +  i = 0
                +  while i < 10000000:
                +   i = i + 1
                +
                +

                +our JIT is about 20x faster than CPython. That's still about 3x slower than +Psyco, but looking at assembler code it's obvious that we can speed it up +a lot. This is very good news, since we don't encode python semantics at +all in the JIT. The JIT is automatically generated from the Python interpreter +source code. This means we should be able to expand it to handle more complex +python programs relatively quickly (interested assembler experts needed!). +

                +

                +This is actually the fifth incarnation of JIT that happened over the last +two years. It's by far simpler and more promising than any of the previous +approaches. Expect more details soon! +

                +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-03-10 16:14: +
                +
                +

                Very exciting news indeed.
                Congratulations!

                +
                +
                +
                +
                + + Zemantic dreams wrote on 2009-03-10 16:49: +
                +
                +

                This is exciting. The world is waiting.

                (I am still sad that Psyco development was discontinued and never ported to 64bit)

                +
                +
                +
                +
                + + nekto0n wrote on 2009-03-10 17:34: +
                +
                +

                Great news!
                Activity in blog shows that project is full of enthusiasm.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-10 18:08: +
                +
                +

                wow, that's really great =)

                +
                +
                +
                +
                + + Eric van Riet Paap wrote on 2009-03-10 18:33: +
                +
                +

                Congratulations! Very nice to read about these milestones.

                I did not follow llvm development but does anyone know if they made some arrangements by now that would enable the PyPy JIT generator to leverage their optimizers?

                +
                +
                +
                +
                + + Harold Fowler wrote on 2009-03-11 12:16: +
                +
                +

                Wow, you are right, that is good news.

                RT
                www.privacy.at.tc

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-11 16:57: +
                +
                +

                I'm wondering why something like this would be faster than CPython? New to the whole python scene so I'm really just curios.

                +
                +
                +
                +
                + + René Dudfield wrote on 2009-03-11 19:59: +
                +
                +

                nice one :)

                In the mean time... I wrote an optimized version of that program for CPython:

                i = 10000000

                CPython is 10000000x faster than the pypy jit!!!!!!

                +
                +
                +
                +
                + + Tim Wintle wrote on 2009-03-12 02:48: +
                +
                +

                Congratulations!

                This is very exciting.

                @Anonymous - it's because the standard python interpreter doesn't use a JIT, which makes dynamic languages quite slow.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-07 15:42: +
                +
                +

                I'm waiting for production solution!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/03/jit-bit-of-look-inside-7472130507462677287.html b/posts/2009/03/jit-bit-of-look-inside-7472130507462677287.html new file mode 100644 index 000000000..503a8e742 --- /dev/null +++ b/posts/2009/03/jit-bit-of-look-inside-7472130507462677287.html @@ -0,0 +1,450 @@ + + + + + +JIT - a bit of look inside | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                JIT - a bit of look inside

                + + + +
                +

                +The previous post about our JIT explained a bit from the 1000 km +perspective how the tracing JIT would approach a language like Python. +

                +

                +I would like to step a bit inside and give a zoom to some of its features that +are already working. +While probably not the most innovative, I think it's very nice to look +at the way we work with the JIT and what tools we use. +

                +

                +The main cool thing is that you can work on and try the JIT (including trying +it on the Python interpreter!) without even generating a single bit of +assembler. How? Let's start with something very simple. Let's take +a simple interpreter for language X. +

                +

                +Language X has 3 opcodes: CO_INCREASE, CO_DECREASE and CO_JUMP_BACK_3. +CO_INCREASE increases the accumulator by one, CO_DECREASE decreases +it by one, and CO_JUMP_BACK_3 jumps 3 opcodes back if the accumulator is smaller +than 100 (this is only to keep a halting condition possible). +The interpreter for language X looks like this: +

                +

                +

                +
                +    jitdriver = JitDriver(greens = ['i'], reds = ['res', 'a'])
                +    code = [CO_INCREASE, CO_INCREASE, CO_INCREASE,
                +            CO_JUMP_BACK_3, CO_INCREASE, CO_DECREASE]
                +            
                +    def add(res, a):
                +        return res + a
                +
                +    def sub(res, a):
                +        return res - a
                +
                +    def main_interpreter_loop(a):
                +        i = 0
                +        res = 0
                +        c = len(code)
                +        while i < c:
                +            jitdriver.jit_merge_point(res=res, i=i, a=a)
                +            elem = code[i]
                +            if elem == CO_INCREASE:
                +                res = add(res, a)
                +            elif elem == CO_DECREASE:
                +                res = sub(res, a)
                +            else:
                +                if res > 100:
                +                    pass
                +                else:
                +                    i = i - 3
                +                    jitdriver.can_enter_jit(res=res, i=i, a=a)
                +                    continue
                +            i = i + 1
                +        return res
                +
                +

                +All very simple code, except the jitdriver hints, which instruct the JIT how to +behave (they are the equivalent of the ``add_to_position_key`` of the last blog +post). +

                +

                +Let's look at how this code is processed. This will also give a glance +at how we work in this code. This particular piece can be found +on a branch in pypy/jit/metainterp/test/test_loop.py +and can be run with ./test_all.py jit/metainterp/test/test_loop.py -k test_example -s --view from the pypy directory. The -s option lets you see the debugging output, while +--view will show you some graphs. So, let's look at graphs in order: +

                + + +And the same picture with a bit of zoom for the first block: + + +

                + +This is the call graph of an interpreter loop, nothing magic so far. This is an +intermediate representation of translation toolchain input. If you look around +you can follow how the opcodes are dispatched (with a chain of ifs) and helpers +called. Next graph is very boring, because it's a bit lower level representation +of the same thing (you exit with q or escape btw :). +

                +

                +When we exit the graph viewer, we can see the trace generated by interpreting +this graph with a given bytecode (variable code in paste above). It's something +like: +

                +
                +
                +        [compiler] ENTER
                +        [runner:cpu]    call__4 [(''), * GCREF hidden, 0] -> 0
                +        [runner:cpu]    int_eq [0, 0] -> True
                +        [runner:cpu]    int_add [9, 1] -> 10
                +        [runner:cpu]    int_add [0, 1] -> 1
                +        [runner:cpu]    int_lt [1, 6] -> True
                +        [runner:cpu]    call__4 [(''), * GCREF hidden, 1] -> 0
                +        [runner:cpu]    int_eq [0, 0] -> True
                +        [runner:cpu]    int_add [10, 1] -> 11
                +        [runner:cpu]    int_add [1, 1] -> 2
                +        [runner:cpu]    int_lt [2, 6] -> True
                +        [runner:cpu]    call__4 [(''), * GCREF hidden, 2] -> 0
                +        [runner:cpu]    int_eq [0, 0] -> True
                +        [runner:cpu]    int_add [11, 1] -> 12
                +        [runner:cpu]    int_add [2, 1] -> 3
                +        [runner:cpu]    int_lt [3, 6] -> True
                +        [runner:cpu]    call__4 [(''), * GCREF hidden, 3] -> 1
                +        [runner:cpu]    int_eq [1, 0] -> False
                +        [runner:cpu]    int_eq [1, 2] -> False
                +        [runner:cpu]    int_gt [12, 100] -> False
                +        [runner:cpu]    int_sub [3, 3] -> 0
                +        [compiler] LEAVE
                +
                +

                +It's entering JIT, doing some primitive operations for bytecode dispatching +and repeating the loop. Note that at the end of the interpreted loop +(not to be confused with the interpreter loop), we see int_sub [3, 3] +which resets the bytecode position to the beginning. At this time JIT +(instructed by can_enter_jit hint) notices that all green variables +are the same (here only i), +hence we can compile the efficient loop from this point. +

                + + +

                +The loop contains 3 additions and a check (for res < 100), exactly +the same as our interpreted program would do, but completely without +interpretation overhead! +

                +

                +As you might have noticed, there is no assembler involved so far. All of this +instruction execution is done directly, in pure python. In fact, the +code for executing instructions is located in jit/backend/llgraph +which directly interprets instructions. This is by far simpler (and easier +to debug) than x86 assembler. +

                +

                +And this is basically it: the very simple interpreter and a jit for it. +Of course we actually can generate assembler for that. Also the missing +piece is optimizing the generated graphs. While for this example, +by removing the interpretation overhead, we're done, with more complex +examples it's important to further optimize traces. Hopefully this and +how we actually generate assembler will be topics for next blog posts. +

                +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + + Brent Millare wrote on 2009-03-05 20:25: +
                +
                +

                Great article. I like how it is the simplest case that can explain the most basic work flow. You have code, the interpreter, and the generated code as part of the running JIT.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/03/next-leysin-winter-sprint-1791506307881043273.html b/posts/2009/03/next-leysin-winter-sprint-1791506307881043273.html new file mode 100644 index 000000000..5c2ac6967 --- /dev/null +++ b/posts/2009/03/next-leysin-winter-sprint-1791506307881043273.html @@ -0,0 +1,316 @@ + + + + + +The next Leysin Winter Sprint | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                The next Leysin Winter Sprint

                + + + +
                +

                PyPy Leysin Winter Sprint (14-21th April 2009)

                + +

                The next PyPy sprint will be in Leysin, Switzerland, for the +sixth time. This sprint will take place immediately after +Easter. This is a fully public sprint: newcomers and topics +other than those proposed below are welcome.

                + + + + +
                + + + +
                  +
                • The overall idea of the sprint is to continue working on making PyPy ready +for general use. There are a few tasks left in there. In parallel, we +will continue the work on the JIT, if there is general interest. And as +usual, we are ready to add any other task -- please mention on the mailing +list what you would like to work on; the list of tasks is not really fixed.
                • +
                • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off for ski until Sunday, the 19th; afterwards, the +installations close. (There was quite a lot of snow this winter, so +there should be some left even though it's relatively late in the season.)
                • +
                +
                +

                For more information see the announcement.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/03/pypy-on-mobiles-at-openbossa-845760004725129519.html b/posts/2009/03/pypy-on-mobiles-at-openbossa-845760004725129519.html new file mode 100644 index 000000000..16fa270ca --- /dev/null +++ b/posts/2009/03/pypy-on-mobiles-at-openbossa-845760004725129519.html @@ -0,0 +1,287 @@ + + + + + +PyPy on Mobiles, at OpenBossa | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy on Mobiles, at OpenBossa

                + + + +
                +

                Next week i am going to give a talk on PyPy at OpenBossa, a developer conference on embedded platforms. I've written up a bit more of my background and why i find it very interesting to go there on my blog. Probably will mostly follow up there or on twitter and not much here on the PyPy blog because it's not all about PyPy. To summarize how i see it: i think there is great potential for Python and PyPy on mobiles and am thrilled to hear about what's going on currently and to discuss opportunities.

                +cheers, holger +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/03/pypy-talk-at-openbossa-09-5135830287297423499.html b/posts/2009/03/pypy-talk-at-openbossa-09-5135830287297423499.html new file mode 100644 index 000000000..44bcc8f1a --- /dev/null +++ b/posts/2009/03/pypy-talk-at-openbossa-09-5135830287297423499.html @@ -0,0 +1,391 @@ + + + + + +PyPy talk at OpenBossa 09 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy talk at OpenBossa 09

                + + + +
                +

                Yesterday i gave my PyPy status/mobile perspectives at OpenBossa, Nokia's developer conference for embedded platforms in Brazil. Found it a bit of a tough task to do that in 50 minutes. I had some 50, later more developers attending the talk and was happy with the questions and the feedback. Guess it's a good sign if the number of people grows during a talk :) It was the first time i tried to work more with pictures and actually used some deviantART photos from Marikaz to mark section transitions. I summarize/highlight some key points here in the post.

                +

                After intro and 2.5 compatibility status, i talked about our measurements of PyPy's Python on Nokia's N810 internet tablet. The best bit is that for almost all Python data structures PyPy has smaller memory representations than CPython. Particularly good are class instances which often score at 50% of CPython's sizes. Startup time is also often better and can be improved. On the bad side, PyPy's quite large base interpreter size and its bytecode execution is often worse. In the talk i also outline ideas for "perfect PYC files" for minimizing module import times and maximizing sharing across interpreter processes. I also briefly discussed the PyPy situation with extension modules and regarding C++ libs. Most of these ideas arose from sprint discussions last year. In the morning i also had some good talk with Stefan Seefeld about Boost Python and the new QT4 bindings. Maybe to use Boost Python is also a good opportunity - but PyPy does not currently have a C-level or C++ level API.

                +

                In subsequent lunch discussions people agreed that PyPy has three main interesting areas currently:

                +
                  +
                • the Python Just-In-Time Compiler
                • +
                • a virtualized, sandboxed Python interpreter
                • +
                • an efficient Python interpreter for small devices
                • +
                +

                I think our upcoming 1.1 release will be a good point in time for many people to look some more into PyPy. I hope we are crossing the chasm soon. It's been a while since the project started :) Getting some more sponsoring to sustain and increase our current efforts probably wouldn't hurt.

                +

                Now i am off to spend my last day in Recife / Brazil, fly back to Germany in the evening and then spend time on preparing for Pycon 2009. And I guess i am going to enjoy some naturally cold air - at least my two jogging sessions at Brazilian beaches, at a sustained 30 degrees celsius, were tough. I guess i shouldn't complain, though :)

                +

                Was great meeting all the Brazilian guys and the few women - just had breakfast with Kate Alhola, kernel hacker and working on the new "Fremantle" graphical platform. Many thanks go to Marcio Marcedo and the Python team at INDT who invited me here. Hope to come again next year and eventually talk more about the Zone VM :)

                +

                If you are interested in some more not so pypy-specific bits about the conference and what i experienced, you might head over to my tetamap blog.

                +

                holger

                +
                +

                Comments

                +
                +
                +
                + + Mikko Ohtamaa wrote on 2009-03-12 22:13: +
                +
                +

                Hi Holger,

                About start up times: We have researched them a lot when developing few applications on Nokia's PyS60.

                Our conclusion, for now, is that its imports and module body bytecode execution (loading modules, classes, creating functions) which takes the most of the time during the start up. Unfortunately there is no real way to speed up this process, except lazily trying to load all your code.

                We have experienced with unexec() like solution. Unexec() was an old Emacs trick where a.out binary code and data segments are dumped to disk. When the application is loaded for the next time, this dump is just blitted to memory and execution continues. Kind of application level hibernation. You wouldn't actually need to distribute .pyc files at all for embedded devices, you could just give out a target specific binary dump containing ready memory layout.

                Of course, it is not so straightforward on modern system with DLLs and other funny pointers. We got some tests working with CPython and PyS60 emulator - but there would be tons of work to make it actually usable (patching all system libs and DLL loads to be suspend friendly).

                Some discussion here:

                https://mail.python.org/pipermail/python-dev/2008-December/084466.html

                Please reply at mikko (at) redinnovation (dot) com if you are interested in to hear more.

                +
                +
                +
                +
                + + Mikko Ohtamaa wrote on 2009-03-12 22:14: +
                +
                +

                God I hate too small edit boxes.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-13 05:24: +
                +
                +

                Do you have a link for those Qt4 bindings?

                +
                +
                +
                +
                + + Paddy3118 wrote on 2009-03-13 07:35: +
                +
                +

                On too small edit boxes: I use: It's All Text! for Firefox, together with Vim.

                - Paddy.

                +
                +
                +
                +
                + + holger krekel wrote on 2009-03-15 20:19: +
                +
                +

                Hi Mikko!

                thanks a lot for your comment and the pointer to the python-dev thread!

                Like Martin von Loewis i'd be very interested to know more numbers regarding how the time for python imports is usually spent - i'd suspect the major bit comes from unmarshalling and the involved malloc/copying of data work. If that is true then what i presented in the talk as "perfect pyc files" is probably a good idea. It's basically what Martin suggested.

                I find the unexec ideas interesting, especially on platforms where fork does not exist. PyPy could probably have a very compact interpreter state representation if we perform garbage collection before writing to disk. When using moving GCs those objects would map very compactly into the oldest-generation memory and thus be mostly avoided by subsequent GC collects.

                Of course, there also is time consumed for linking DLLs - only forking is efficient in avoiding this overhead. But it doesn't exist on Symbian, right?

                If you have any more info on the exact numbers on import times, i'd be very curious. We might also have some numbers from PyPy - need to check.

                I am also available on holger.krekel at gmail com. You are also very welcome to post to pypy-dev (https://codespeak.net/mailman/listinfo/pypy-dev)

                cheers,
                holger

                +
                +
                +
                +
                + + holger krekel wrote on 2009-03-15 20:39: +
                +
                +

                anoymous: pypy does not have qt4 bindings yet.

                paddy318: thanks! i'll check this out once i get firefox upgrade on the ubuntu machine i am currently using. (why are we in 2009 still having this concept of "installing" apps/plugins, let alone finding a matching one?)

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-16 00:57: +
                +
                +

                I was referring to this:
                "In the morning i also had some good talk with Stefan Seefeld about Boost Python and the new QT4 bindings."

                From that it sounds like there are new Qt4 bindings for CPython somewhere, using Boost. I have tried searching, but was not able to find anything.

                +
                +
                +
                +
                + + holger krekel wrote on 2009-03-16 08:56: +
                +
                +

                Anonymous, i also only found the 2005 announcement. I mailed Stefan to find out some more. Maybe it's just existing in some developers repository as of yet. I'll let you know if i find out something more actual.

                +
                +
                +
                +
                + + holger krekel wrote on 2009-03-16 12:04: +
                +
                +

                Anonymous: ok, seems like recent bindings use SIP, see https://www.riverbankcomputing.co.uk/software/pyqt/download

                not sure about the status of boost cpython based qt bindings.
                holger

                +
                +
                +
                +
                + + arman wrote on 2009-03-20 21:29: +
                +
                +

                I wonder if an unexec like functionality can be developed by developing a mechanism for pickling the current interpreter state (e.g. loaded modules).

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/03/vm-summit-nice-to-see-friendly-8755773725359396485.html b/posts/2009/03/vm-summit-nice-to-see-friendly-8755773725359396485.html new file mode 100644 index 000000000..be152270b --- /dev/null +++ b/posts/2009/03/vm-summit-nice-to-see-friendly-8755773725359396485.html @@ -0,0 +1,419 @@ + + + + + +VM summit: nice to see friendly competition | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                VM summit: nice to see friendly competition

                + + + +
                +

                +So Google has launched the unladen swallow project +with this first goal: +

                +
                +    Produce a version of Python at least 5x faster than CPython.
                +
                +

                +We discussed some details with Collin Winter, Jeffrey Yasskin and Thomas Wouters +during the VM summit yesterday. We were a bit confused about usage +of the term JIT, because as far as we understood, it's going to be upfront +compilation into LLVM. In the past we have looked into LLVM + – at one point PyPy extensively used it but it +wasn't clear how we could make good use of it. +They also consider changing to something other than LLVM. It's gonna be +interesting to see how this works out. +

                +

                +It's good to see friendly competition, and we think that we should take up +the challenge and see if we can produce faster pickling, run 2to3 and +Django faster than what they can come up with. We also talked +to IronPython and Jython developers and all agreed that some +common benchmarks would be good. And maybe do weekly +press releases about small speed increases? :) +

                +

                +The idea of the VM summit here in Chicago was to bring together implementors +of various virtual machine languages. There were members of the communities of +IronPython, CPython, GemStone's MagLev, Rubinius, Mozilla's TraceMonkey, Parrot, +Sun's Da Vinci Machine, Microsoft's DLR, Jython and JRuby. +Everybody got to talk 5-10 minutes on their current status and +challenges. It is clear that you cannot begin to cover the +complexities and architectures of the involved projects. +But that wasn't too much of a problem because the rest of +the day everybody freely and dynamically grouped on their +issues of choice. We established some more personal contacts; +it was great to chat with people like Andreas Gal from the University of +California, Irvine, who has a very similar idea about the JIT +that we have. Actually, we could probably have mixed our +two presentations and nobody would have actually noticed :-). +

                +

                +At the end of the presentation part, John Rose presented his +slides. John is a Hotspot developer, and while not precisely a dynamic +language implementor, he has a lot of experience in virtual +machine implementation. It's very good to see the JVM being extended towards +supporting dynamic-language specific things, in order to be something +more than just a good platform for Java. We'll probably have +some extra meetup with him the next days. +

                +cheers,
                +holger and fijal +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-03-26 14:21: +
                +
                + So Google has launched the unladen swallow project with this first goal

                I'm not sure this is a Google project. It's hosted on Google code for sure, but anyone can do that. +
                +
                +
                +
                + + Anonymous wrote on 2009-03-27 01:16: +
                +
                +

                All three of the primary developers are Google employees.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-03-27 03:01: +
                +
                + We were a bit confused about usage of the term JIT, because as far as we understood, it's going to be upfront compilation into LLVM.

                The LLVM supports JIT, so compiling Python into LLVM bytecode will give JIT for free. +
                +
                +
                +
                + + Anonymous wrote on 2009-03-27 03:24: +
                +
                +

                Anonymous#1, this is extremely valuable note to take in an open source world, you know ;-)

                PyPy folks, keep up the good work!

                But... Sometimes I miss updates on this blog. Not in the sense that you slack on it, but in the sense that I miss some "technicaly intermediate" updates when there are no news on breathrouths.

                One thing I miss most is the retro style articles on how some things that are "established" now got to be this way. The design by evolution things. Stuff that both educates and helps to get acquainted with the code one might like to hack one day.

                +
                +
                +
                +
                + + Luis wrote on 2009-03-28 02:51: +
                +
                +

                I don't understand: These days, Google's v8 claims to be 56x faster than common javascript, tracemonkey is in the same league, as well as nitro, etc. Way before, psyco sped up python (theoretically) up to c's speed for algorithmic code, and up to 4x for common code.

                Now Unladen Swallow aims to "only" 5x speed up. Isn't it to little, seeing what the above projects are getting nowadays?
                Or am I getting confussed by their terminology? (what's exactly the meaning of 5x here?).

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-03-28 14:28: +
                +
                +

                The exact meaning of 5x is I *think* "5x on tests derived from google internal apps". It's a bit little, but note that the great speedups of JS engines are for simple algorithmic code (the one that psyco speedups great).

                It would be a good speedup for a lot of people though (of course we aim to speed up stuff according to JS engines ;)

                Cheers,
                fijal

                +
                +
                +
                +
                + + Luis wrote on 2009-03-28 14:34: +
                +
                +

                Maciej, this is a reply posted on the project's FAQ page:

                Comment by collinw, Today (8 hours ago)

                luismgz: translating Python to Javascript would be easy to implement for about 80% of the language, but you'd hit a wall in implementing that last 20%. Just ask the Jython, PyPy? and IronPython? teams how hard 100% compatibility is. They've done some really heroic work to implement every dark and musty corner of the language, and I think they'd be the first to tell you that it's easy to get something like the Fibonacci function working, but things like metaclasses are a different story. We hope to side-step that by reusing as much of CPython as possible.

                Psyco's claimed benefits of 100x speed-up on algorithmic code is rarely seen in real applications. It can certainly be used to optimize hotspots that fit Psyco's profile, but in examining the performance of some Google applications that use Psyco, we found that they see only a ~10% improvement in overall CPU usage. While that might be a valuable savings for some applications, it's not 100x, nor even the 2-4x low-end estimate that I've seen in the Psyco docs.

                Are our performance goals too modest? We don't think so. Our team is small -- only two full-time engineers -- and we want to allow for unexpected surprises along the way. We feel that 5x is a good goal for the time being, especially given that we may need to make changes to LLVM along the way. If things go astoundingly well, we may raise those numbers, but for now, we're comfortable with our stated goals.

                +
                +
                +
                +
                + + Luis wrote on 2009-05-28 23:35: +
                +
                +

                Maciej said: "It would be a good speedup for a lot of people though (of course we aim to speed up stuff according to JS engines ;)"

                What ever happend to the "secret goal" of being faster than c...?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/04/11-final-released-225813777919757859.html b/posts/2009/04/11-final-released-225813777919757859.html new file mode 100644 index 000000000..2fb6c9396 --- /dev/null +++ b/posts/2009/04/11-final-released-225813777919757859.html @@ -0,0 +1,331 @@ + + + + + +1.1 final released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                1.1 final released

                + + + +
                +

                We just released PyPy 1.1 final. Not much changed since the beta, apart +from some more fixed bugs. Have fun with it!

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-04-28 17:09: +
                +
                +

                Congratulations on the new release!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-28 19:09: +
                +
                +

                Congrats! This is a great project :)

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-29 11:33: +
                +
                +

                Any chance of prebuilt binaries? I tried to compile but had to give up after 2 hours (I guess my laptop is not up to the task).

                By the way, you should put the release note somewhere on the main page of the PyPy site. Currently this page gives no indication that a release of PyPy exists at all.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-04-30 11:08: +
                +
                +

                Thanks, added a link from the main page to release-1.1.0.html.

                About binaries: there are just too many possible combinations, not only of platforms but of kinds of pypy-c. I suppose that we can list other people's pages with some of them, if they mention them to us.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/04/4-weeks-of-gdb-522864241041643529.html b/posts/2009/04/4-weeks-of-gdb-522864241041643529.html new file mode 100644 index 000000000..0dc8faa96 --- /dev/null +++ b/posts/2009/04/4-weeks-of-gdb-522864241041643529.html @@ -0,0 +1,459 @@ + + + + + +4 weeks of GDB | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                4 weeks of GDB

                + + + +
                +

                Hello.

                +

                +So, according to our jit +plan we're mostly done with point 1, that is to provide a JIT that compiles +python code to assembler in the most horrible manner possible but doesn't +break. That meant mostly 4 weeks of glaring at GDB and megabytes of assembler +generated by C code generated from python code. The figure of 4 weeks proves +that our approach is by far superior to the one of psyco, since Armin says it's +"only 4 weeks" :-) +

                +

                +Right now, pypy compiled with JIT can run the whole CPython test suite +without crashing, which means we're done with obvious bugs and the only +ones waiting for us are really horrible. (Or they really don't exist. +At least they should never be about obscure Python corner cases: they can +only be in the 10'000 lines of relatively clear code that is our JIT +generator.) +

                +

                +But... the fun thing is that we can actually concentrate on optimizations! +So the next step is to provide a JIT that is correct *and* actually speeds +up python. Stay tuned for more :-) +

                +Cheers,
                +fijal, armin & benjamin +

                +UPDATE: for those of you blessed with no knowledge of C, gdb stands for GNU debugger, a classic debugger for C. (It's also much more powerful than python debugger, pdb, which is kind of surprising).

                +
                +

                Comments

                +
                +
                +
                + + Alexander Kellett wrote on 2009-04-30 23:15: +
                +
                +

                *bow*

                +
                +
                +
                +
                + + Luis wrote on 2009-05-01 00:00: +
                +
                +

                I love this kind of posts. Keep'em coming!

                +
                +
                +
                +
                + + Unknown wrote on 2009-05-01 01:06: +
                +
                +

                This is probably the most exciting thing I've heard since I started tracking PyPy. Can't wait to see how fast JIT Python flies. :-)

                +
                +
                +
                +
                + + René Dudfield wrote on 2009-05-01 01:56: +
                +
                +

                nice one! Really looking forward to it.

                Is this for just i386? Or is this for amd64/ppc etc?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-05-01 02:11: +
                +
                +

                amd64 and ppc are only available in enterprise version :-)

                We cannot really solve all problems at once, it's one-by-one approach.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-05-01 09:47: +
                +
                +

                illume: if you are comparing with Psyco, then it's definitely "any platform provided someone writes a backend for it". Writing a backend is really much easier than porting the whole of Psyco...

                Our vague plans include an AMD64 backend and an LLVM-JIT one, the latter being able to target any platform that LLVM targets.

                +
                +
                +
                +
                + + DSM wrote on 2009-05-01 10:33: +
                +
                +

                Nice!

                I assume that it would be (relatively, as these things go) straightforward for those us interested to turn the x86 assembly backend into a C backend?

                I know that even mentioning number-crunching applications gets certain members of the pypy team beating their heads against the wall (lurkers can read the grumbling on irc too!). But with a delegate-to-C backend, those of us who have unimplemented architectures and are in the happy regime where we don't care about compilation overhead can get the benefits of icc's excellent optimizations without having to do any of the work. We'd just need to make sure that the generated C is code that icc can handle. (There are unfortunately idioms that gcc and icc don't do very well with.)

                To be clear, I'm not suggesting that the pypy team itself go this route: at the moment it feels like the rest of us should stay out of your way.. laissez les bon temps roulez! :^)

                I'm asking instead if there are any obvious gotchas involved in doing so.

                +
                +
                +
                +
                + + Tim Parkin wrote on 2009-05-01 11:28: +
                +
                +

                Congrats for stage one... exciting times for python..

                +
                +
                +
                +
                + + proteusguy wrote on 2009-05-02 11:48: +
                +
                +

                Nice job guys! Once you announce that PyPy is pretty much of comparable (90% or better) speed to that of CPython then we will be happy to start running capacity tests of our web services environment on top of it and report back our results.

                Given the growing number of python implementations has there ever been a discussion of PyPy replacing CPython as the canonical implementation of python once it consistently breaks performance & reliability issues? I don't know enough of the details to advocate such a position - just curious if there's been any official thought to the possibility.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-05-04 13:55: +
                +
                +

                DSM, Proteusguy: I'd be happy to answer your questions on the pypy-dev mailing list. I think that there is no answer short enough to fit a blog post comment.

                +
                +
                +
                +
                + + Jacob Hallén wrote on 2009-05-04 14:45: +
                +
                +

                proteusguy: It is our hope that PyPy can one day replace CPython as the reference implementation, but this depends on many factors. Most of them are way out of our control. It will depend very much on the level of PyPy uptake in the community, but this is just a first step. With enough adoption, the Python developers (the people actually making new versions of CPython) need to be convinced that working from PyPy as a base to develop the language makes sense. If they are convinced, Guido may decide that it is a good idea and make the switch, but not before then.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-05-08 15:36: +
                +
                +

                See https://moderator.appspot.com/#9/e=c9&t=pypy for Guido's opinion about PyPy.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-05-10 11:19: +
                +
                +

                Surely gdb is more powerful than pdb because many more people are forced to used gdb. c code is much harder to debug than python code, and needs debugging more often than python code.

                +
                +
                +
                +
                + + Cacas Macas wrote on 2009-05-14 08:31: +
                +
                +

                Good day.
                I use Python for about 3 years and i am following your blog almost every day to see news.
                I am very excited to see more Pypy, though i don't understand how to use it (?!?!) and i never managed to install it!
                Wikipedia says "PyPy is a followup to the Psyco project" and i use Psyco, so Pypy must be a very good thing. I use Psyco very intense, in all my applications, but it's very easy to use.
                I have Windows and Windows document is incomplete "https://codespeak.net/pypy/dist/pypy/doc/windows.html". I have MinGW compiler.
                Pypy is not very friendly with users. I think more help documents would be very useful. When i will understand how to install Pypy, i will use it.
                Keep up the good work!

                +
                +
                +
                +
                + + stracin wrote on 2009-06-13 14:56: +
                +
                +

                """Rumors have it that the secret goal is being faster-than-C which is nonsense, isn't it?"""

                what does this statement from the pypy homepage mean?

                that c-pypy will be faster than cpython?

                or that code run in c-pypy will be faster than compiled C code? :o

                because of the "nonsense" i think you mean the latter? but isn't it nonsense? :) would be awesome though.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/04/beta-for-110-released-4604559533184706699.html b/posts/2009/04/beta-for-110-released-4604559533184706699.html new file mode 100644 index 000000000..d1720d034 --- /dev/null +++ b/posts/2009/04/beta-for-110-released-4604559533184706699.html @@ -0,0 +1,506 @@ + + + + + +Beta for 1.1.0 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Beta for 1.1.0 released

                + + + +
                +

                Today we are releasing a beta of the upcoming PyPy 1.1 release. There +are some Windows and OS X issues left that we would like to address +between now and the final release but apart from this things should be +working. We would appreciate feedback.

                +

                The PyPy development team.

                +
                +

                PyPy 1.1: Compatibility & Consolidation

                +

                Welcome to the PyPy 1.1 release - the first release after the end of EU +funding. This release focuses on making PyPy's Python interpreter more +compatible with CPython (currently CPython 2.5) and on making the +interpreter more stable and bug-free.

                +

                PyPy's Getting Started lives at:

                +
                +https://codespeak.net/pypy/dist/pypy/doc/getting-started.html +
                +
                +

                Highlights of This Release

                +
                + +
                +
                +
                +

                Other Changes

                +
                + +
                +
                +
                +

                What is PyPy?

                +

                Technically, PyPy is both a Python interpreter implementation and an +advanced compiler, or more precisely a framework for implementing dynamic +languages and generating virtual machines for them.

                +

                The framework allows for alternative frontends and for alternative +backends, currently C, Java and .NET. For our main target "C", we can +"mix in" different garbage collectors and threading models, +including micro-threads aka "Stackless". The inherent complexity that +arises from this ambitious approach is mostly kept away from the Python +interpreter implementation, our main frontend.

                +

                Socially, PyPy is a collaborative effort of many individuals working +together in a distributed and sprint-driven way since 2003. PyPy would +not have gotten as far as it has without the coding, feedback and +general support from numerous people.

                +

                Have fun,

                +
                +

                the PyPy release team, [in alphabetical order]

                +

                Amaury Forgeot d'Arc, Anders Hammerquist, Antonio Cuni, Armin Rigo, +Carl Friedrich Bolz, Christian Tismer, Holger Krekel, +Maciek Fijalkowski, Samuele Pedroni

                +

                and many others: +https://codespeak.net/pypy/dist/pypy/doc/contributor.html

                +
                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Benjamin Peterson wrote on 2009-04-20 01:21: +
                +
                +

                Congratulations! PyPy is becoming more and more viable every day. I hope I can continue to become more involved in this awesome project.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-21 01:18: +
                +
                +

                pypy is a very interesting project!

                i have a question. do you think pypy-c without jit can ever reach the speed of c-python? why is it slower?

                or will you put all the optimization efforts into the jit now? doesn't the performance difference matter because the jit will make it up anyway?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-04-21 04:36: +
                +
                +

                PyPy without jit can (and is sometimes) be faster than cpython, for various reasons, including garbage collector.

                On the other hand, we rather won't sacrifice simplicity for speed and we hope that jit will go that part. Also the funny thing is that since we generate our jit, it gets better as interpreter gets simpler, because jit generator is able to find out more on it's own. So in fact we might give up on some optimizations in favor of simplicity, because jit will be happier.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Luis wrote on 2009-04-21 14:04: +
                +
                +

                Sorry for my anxiety, but is there any rough estimation on when the jit will be in a usable state?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-04-21 22:14: +
                +
                +

                Personally, I'm doing it in my free time. That means I'm giving no estimates, because it makes no sense. If you wish to go into some contractual obligations on our sides, we're up to discuss I suppose :-)

                +
                +
                +
                +
                + + Luis wrote on 2009-04-21 22:33: +
                +
                +

                Maciej, I know how hard you are working on this. I didn't mean to sound disrespectful and I don't want to bother you... It's just that as everyone else, I'm anxoiusly looking forward to seeing pypy's magic in action. By the way, the new post is very much appreciated. Thanks!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-06-29 07:47: +
                +
                +

                I am desperately looking for some help building PyPy. I have posted a an Issue (#443) about my issues in the PyPy site.

                If anyone from the release/Dev. team can give me a hand, I would seriously appreciate this!

                I can be reached at wnyrodeo@yahoo.com

                Thanks.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/04/leysin-sprint-report-1416905818217912359.html b/posts/2009/04/leysin-sprint-report-1416905818217912359.html new file mode 100644 index 000000000..872fdbd96 --- /dev/null +++ b/posts/2009/04/leysin-sprint-report-1416905818217912359.html @@ -0,0 +1,413 @@ + + + + + +Leysin Sprint Report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Leysin Sprint Report

                + + + +
                +

                The Leysin sprint is nearing its end, as usual here is an attempt at a summary +

                +

                of what we did.

                +Beautiful Leysin Landscape
                +

                Release Work

                +

                Large parts of the sprint were dedicated to fixing bugs. Since the easy bugs +seem to have been fixed long ago, those were mostly very annoying and hard bugs. +This work was supported by our buildbots, which we tried to get free of +test-failures. This was worked on by nearly all participants of the sprint +(Samuele, Armin, Anto, Niko, Anders, Christian, Carl Friedrich). One +particularly annoying bug was the differences in the tracing events that PyPy +produces (fixed by Anders, Samuele and Christian). Some details about larger +tasks are in the sections below.

                +

                The work culminated in the beta released on Sunday.

                +
                +

                Stackless

                +

                A large number of problems came from our stackless features, which do some +advanced things and thus seem to contain advanced bugs. Samuele and Carl +Friedrich spent some time fixing tasklet pickling and unpickling. This was +achieved by supporting the (un)pickling of builtin code objects. In addition +they fixed some bugs in the finalization of tasklets. This needs some care +because the __del__ of a tasklet cannot run at arbitrary points in time, but +only at safe points. This problem was a bit subtle to get right, and popped up +nearly every morning of the sprint in form of a test failure.

                +

                Armin and Niko added a way to restrict the stack depth of the RPython-level +stack. This can be useful when using stackless, because if this is not there it is +possible that you fill your whole heap with stack frames in the case of an +infinite recursion. Then they went on to make stackless not segfault when +threads are used at the same time, or if a callback from C library code is in +progress. Instead you get a RuntimeError now, which is not good but better +than a segfault.

                +
                +Anto and Armin working on the JIT + +
                +

                Killing Features

                +

                During the sprint we discussed the fate of the LLVM and the JS backends. Both +have not really been maintained for some time, and are even partially untested +(their tests were skipped). Also their usefulness appears to be limited. The JS +backend is cool in principle, but has some serious limitations due to the fact +that JavaScript is really a dynamic language, while RPython is rather static. +This made it hard to use some features of JS from RPython, e.g. RPython does not +support closures of any kind.

                +

                The LLVM backend had its own set of problems. For +a long time it produced the fastest form of PyPy's Python interpreter, by first +using the LLVM backend, applying the LLVM optimizations to the result, then +using LLVM's C backend to produce C code, then applying GCC to the result :-). +However, it is not clear that it is still useful to directly produce LLVM +bitcode, since LLVM has rather good C frontends nowadays, with llvm-gcc and +clang. It is likely that we will use LLVM in the future in our JIT (but that's +another story, based on different code).

                +

                Therefore we decided to remove these two backends from SVN, which Samuele and +Carl Friedrich did. They are not dead, only resting until somebody who is +interested in maintaining them steps up.

                +
                +
                +

                Windows

                +

                One goal of the release is good Windows-support. Anders and Samuele set up a new +windows buildbot which revealed a number of failures. Those were attacked by +Anders, Samuele and Christian as well as by Amaury (who was not at the sprint, +but thankfully did a lot of Windows work in the last months).

                +
                +
                +

                OS X

                +

                Christian with some help by Samuele tried to get translation working again under +Mac OS X. This was a large mess, because of different behaviours of some POSIX +functionality in Leopard. It is still possible to get the old behaviour back, +but whether that was enabled or not depended on a number of factors such as +which Python is used. Eventually they managed to successfully navigate that maze +and produce something that almost works (there is still a problem remaining +about OpenSSL).

                +
                +Samuele and Carl Friedrich pretending to work on something +
                +

                Documentation

                +

                The Friday of the sprint was declared to be a documentation day, where (nearly) +no coding was allowed. This resulted in a newly structured and improved getting +started document (done by Carl Friedrich and Samuele, with some help from Niko) and +a new document describing differences to CPython (Armin, Carl Friedrich) as +well as various improvements to existing documents (everybody else). Armin +undertook the Sisyphean task of listing all talks, papers and related stuff +of the PyPy project.

                +
                +
                +
                +

                Various Stuff

                +
                +

                Java Backend Work

                +

                Niko and Anto worked on the JVM backend for a while. First they had to fix +translation of the Python interpreter to Java. Then they tried to improve the +performance of the Python interpreter when translated to Java. Mostly they did a +lot of profiling to find performance bottlenecks. They managed to improve +performance by 40% by overriding fillInStackTrace of the generated exception +classes. Apart from that they found no simple-to-fix performance problems.

                +
                +
                +

                JIT Work

                +

                Armin gave a presentation about the current state of the JIT to the sprinters as +well as Adrian Kuhn, Toon Verwaest and Camillo Bruni of the University of Bern +who came to visit for one day. There was a bit of work on the JIT going on too; +Armin and Anto tried to get closer to having a working JIT on top of the CLI.

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2009-04-22 07:46: +
                +
                +

                Guys, are you going to make a new release with the things done during the sprint? Thanks.

                (pypy is a great work; Keep it up!)

                +
                +
                +
                +
                + + vak wrote on 2009-11-03 12:30: +
                +
                +

                hi,
                could you please make a new blog-post and tell us about news regarding LLVM and PyPy, please?

                thanks in advance!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/04/leysin-sprint-started-4551365436232104640.html b/posts/2009/04/leysin-sprint-started-4551365436232104640.html new file mode 100644 index 000000000..c4aa472ae --- /dev/null +++ b/posts/2009/04/leysin-sprint-started-4551365436232104640.html @@ -0,0 +1,286 @@ + + + + + +Leysin Sprint Started | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Leysin Sprint Started

                + + + +
                +

                The Leysin Sprint started today. The weather is great and the view is wonderful, as usual. Technically we are working on the remaining test failures of the nightly test runs and are generally trying to fix various long-postponed bugs. I will try to give more detailed reports as the sprint progresses.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/04/pycon-videos-are-online-909873128878039557.html b/posts/2009/04/pycon-videos-are-online-909873128878039557.html new file mode 100644 index 000000000..6c6f07e40 --- /dev/null +++ b/posts/2009/04/pycon-videos-are-online-909873128878039557.html @@ -0,0 +1,311 @@ + + + + + +Pycon videos are online | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Pycon videos are online

                + + + +
                +

                Hi. +

                +

                +We haven't yet written a full PyCon summary, but both of our talks are now online: PyPy status talk and python in a sandbox.

                +

                Update:

                +

                Slides are also available: PyPy status talk and Python in a sandbox.

                + + +Enjoy!
                +fijal & holger +
                +

                Comments

                +
                +
                +
                + + larsr wrote on 2009-04-08 15:25: +
                +
                +

                I found the slides to the python in a sandbox to be useful too.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/04/roadmap-for-jit-377358891902851723.html b/posts/2009/04/roadmap-for-jit-377358891902851723.html new file mode 100644 index 000000000..53ef257c6 --- /dev/null +++ b/posts/2009/04/roadmap-for-jit-377358891902851723.html @@ -0,0 +1,534 @@ + + + + + +Roadmap for JIT | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Roadmap for JIT

                + + + +
                +

                Hello. +

                +

                +First a disclaimer. This post is more about plans for future than current +status. We usually try to write about things that we have done, because +it's much much easier to promise things than to actually make it happen, +but I think it's important enough to have some sort of roadmap. +

                +

                +In recent months we came to the point where the 5th generation of +JIT prototype was working as nicely +or even a bit nicer than the 1st one back in 2007. Someone might ask "so why +did you spend all this time without going forward?". And indeed, we spent +a lot of time moving sideways, but as posted, we also spent a lot of time +doing some other things, which are important as well. +The main advantage of the current JIT incarnation is that it is much much simpler than +the first one. Even I can comprehend it, which is much of an improvement :-) +

                +

                +So, the prototype is working and gives very nice speedups in range of 20-30x +over CPython. We're pretty confident this prototype will work and will +produce fast python interpreter eventually. So we decided that now we'll +work towards changing prototype into something stable and solid. This +might sound easy, but in fact it's not. Having stable assembler backend +and optimizations that keep semantics is not as easy as it might sound. +

                +

                +The current roadmap, as I see it, looks as follows: +

                +
                  +
                • Provide a JIT that does not speed things up, but produces assembler without + optimizations turned on, that is correct and able to run CPython's library + tests on a nightly basis. +
                • +
                • + Introduce simple optimizations, that should make the above JIT a bit faster than + CPython. With optimizations disabled the JIT is producing incredibly dumb + assembler, which is slower than the corresponding C code, even with removal + of interpretation overhead (which is not very surprising). +
                • +
                • + Backport optimizations from JIT prototype, one by one, keeping an eye + on how they perform and making sure they don't break anything. +
                • +
                • + Create new optimizations, like speeding up attribute access. +
                • +
                • + Profit. +
                • +
                +

                +This way, we can hopefully provide a working JIT, which gives fast python +interpreter, which is a bit harder than just a nice prototype. +

                +

                +Tell us what you think about this plan. +

                +Cheers,
                +fijal & others. +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-04-21 20:58: +
                +
                +

                I think it's a great idea. If the test suite succeeds on the basic JIT, it's much easier to spot regressions when you start adding the cool stuff. It also gives you a solid foundation to build on.

                Good luck, this project is amazing :)

                +
                +
                +
                +
                + + rjw wrote on 2009-04-21 21:54: +
                +
                +

                Its not obvious from this post what would actually be the difference between the prototype and the final jit with all the prototypes optimisations. So ... it sounds like a lot of work for zero gain. I'm sure there is missing information, like what is actually missing from or wrong with the prototype ( is it in a different language? Prolog?) Without this information its impossible to judge this plan.

                +
                +
                +
                +
                + + Michael Foord wrote on 2009-04-21 22:54: +
                +
                +

                This sounds like a very pragmatic approach and is very encouraging. Nice work guys - very much looking forward to what the future has to offer.

                +
                +
                +
                +
                + + Tim Parkin wrote on 2009-04-21 23:06: +
                +
                +

                I'm extremely excited about seeing this happen. It is an unfortunate fact that the majority of people won't get PyPy until they see a 'big win'. Once they've noticed the big win they will start to see the 'hidden genius'. I'm glad that you are taking such a professional approach to this next phase and look forward to the day when people will start to look give PyPy the attention it deserves (if not for quite the right reason).

                +
                +
                +
                +
                + + Alex wrote on 2009-04-22 00:34: +
                +
                +

                I agree with Michael, one of the hallmarks of Python philosophy has always been "make it right, and then make it fast", sounds like you guys have taken this to heart.

                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2009-04-22 02:52: +
                +
                +

                Great guys, the plan seems very solid and reasonable!

                responding to rjw: I think the problem was that the prototype was really incomplete, putting all the complexity needed for the rest of the language could be done without removing the optimizations but would make bug finding way harder.

                I hope that this could be the only new feature for the next pypy release. Focusing on the JIT might be the best way to attract many more eyes and hands to the project.

                +
                +
                +
                +
                + + Michael Hudson-Doyle wrote on 2009-04-22 04:12: +
                +
                +

                This sounds like a very sane plan. Good luck with it!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-22 07:59: +
                +
                +

                I like how for once step 2 isn't "???", but a well thought out plan =).

                +
                +
                +
                +
                + + Zemantic dreams wrote on 2009-04-22 10:20: +
                +
                +

                guys, you rock! I can't wait to see the results!

                bye
                Andraz Tori, Zemanta

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-22 13:21: +
                +
                +

                Very sensible plan! Good luck guys. Here's to pypy taking over the world (-:

                +
                +
                +
                +
                + + herse wrote on 2009-04-22 19:36: +
                +
                +

                "It's super easy to provide 95% of python in a reasonable speed, just the last 5% gets tricky."

                i often come across this statement.

                wouldn't it make sense then to offer a pypy compile option for producing an interpreter which leaves away those 5% in favor of speed for people who don't need those 5%?

                or isn't this feasible or wanted for some reason?

                i am just curious... :) pypy is an awesome project and i am looking forward to the jit!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-24 09:34: +
                +
                +

                The roadmap is okay. The only thing I miss is a rough timeline.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-04-24 22:18: +
                +
                +

                Tenretn hör eviece ne Pypy tan cafretn anretx. Lbisi programma o oitcenno ih ecafretn cabpöo, anretn 'retupmo ih nis secorpbut pypy eka LD oitcenno huob raa rawtfo laweri anosre Python code?

                +
                +
                +
                +
                + + René Leonhardt wrote on 2009-04-24 23:26: +
                +
                +

                Congratulations, the LLVM backend for JIT has been accepted, I am eager to see the results :)

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-04-28 20:18: +
                +
                +

                herse: that's an approach which is often mentioned, but which does not make sense in PyPy. The JIT is generated from the language spec; whether this spec covers 95% or 100% of Python doesn't change anything. The 95%-versus-100% debate only makes sense at another level, e.g. if we wanted to make PyPy faster without a JIT at all.

                +
                +
                +
                +
                + + Richard Emslie wrote on 2009-04-29 23:47: +
                +
                +

                Awesome work thus far & congratulations guys. Sounds like a good strategy to having something that works. Best of luck and I'm looking forward to see how things pan out. :-)

                +
                +
                +
                +
                + + herse wrote on 2009-04-30 05:12: +
                +
                +

                """The JIT is generated from the language spec; whether this spec covers 95% or 100% of Python doesn't change anything."""

                i see. the whole pypy idea really sounds awesome to me.

                i have another question. your python interpeter is written in rpython so it is supposed to be simpler to work with than the c implementation. but i could imagine that it is incredibly hard to debug problems in pypy-c? doesn't this counterbalance the advantage again?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-04-30 05:58: +
                +
                +

                We're usually not debugging problems in pypy-c. It turns out that 99% of the problems you can debug by running on top of CPython, so you can test things really deeply, without compilation.

                +
                +
                +
                +
                + + Collin Winter wrote on 2009-06-08 23:21: +
                +
                +

                This looks like a good plan. I look forward to sharing ideas with you in the future :)

                When you say, "So, the prototype is working and gives very nice speedups in range of 20-30x over CPython", what benchmarks is that on? Can you be more specific?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/05/icooolps-submissions-6705901656116873587.html b/posts/2009/05/icooolps-submissions-6705901656116873587.html new file mode 100644 index 000000000..5e4a3173f --- /dev/null +++ b/posts/2009/05/icooolps-submissions-6705901656116873587.html @@ -0,0 +1,330 @@ + + + + + +ICOOOLPS Submissions | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                ICOOOLPS Submissions

                + + + +
                +

                Both of the papers that people from the PyPy team submitted to ICOOOLPS have +been accepted. They are:

                +
                +
                  +
                • "Faster than C#: efficient implementation of dynamic languages on .NET" +(pdf1) by Armin, Anto and Davide Ancona, who is Anto's Ph.D. advisor
                • +
                • "Tracing the Meta-Level: PyPy’s Tracing JIT Compiler" (pdf2) by Carl +Friedrich, Armin, Anto and Maciek
                • +
                +
                +

                (the pdfs are obviously the submitted versions, not the final ones).

                +

                This year ICOOOLPS (Implementation, Compilation, Optimization of +Object-Oriented Languages, Programs and Systems) is being held on July the 6th +at ECOOP 2009 in Genova, Italy. Other than these two papers, Anto and Carl +Friedrich will also present a PyPy tutorial, on July the 7th.

                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2009-05-16 11:22: +
                +
                +

                It does seem like an odd idea to trace the bytecode of an interpreter of the bytecode of a language, rather than just tracing the bytecode for a language. For example, it requires that you annotate the interpreter to retain information that you would otherwise naturally have, and it requires that you trace lots of extra bookkeeping code in the interpreter.

                Given that you're writing a JIT that traces the execution of some bytecode, what advantages does tracing the outer bytecode have over tracing the inner bytecode? Is it that the outer bytecode is simpler than the inner bytecode; if so, is there no way to (inefficiently) compile the inner bytecode to the outer bytecode?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-05-16 12:08: +
                +
                +

                John: The main reason for writing a JIT that traces the bytecode of the "outer" interpreter (which we call language interpreter in the paper) is that then we need to write only one tracing JIT in PyPy, and can use it for a variety of languages.

                The tracing of the extra bookkeeping code is not a problem is not such a large problem, as the paper shows. None of these opcodes are actually part of the final trace.

                If you want to discuss this more, I would suggest that we move this discussion to pypy-dev@codespeak.net which is the project mailing list. Not everybody is reading comments here :).

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/06/europython-8318355560715932819.html b/posts/2009/06/europython-8318355560715932819.html new file mode 100644 index 000000000..31a2b2f4f --- /dev/null +++ b/posts/2009/06/europython-8318355560715932819.html @@ -0,0 +1,307 @@ + + + + + +EuroPython | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                EuroPython

                + + + +
                +

                EuroPython is coming. We have two 30-minute talks that we will present. In addition, the sprint takes place the 29th of June (there will be no-one from the team on the 28th of June), as well as on the 3rd and 4th of July.

                +
                +

                Comments

                +
                +
                +
                + + Luis wrote on 2009-07-09 19:37: +
                +
                +

                Please guys, can anyone of you tell something about Europython's vm panel news? I've been searching for comments on blogs in the last days but I couldn't find anything... There were many interesting presentations (hotpy, crosstwiner, psyco v2, etc...) but no comments so far! Is there any significant news in that field? How do these projects compare to pypy...?

                +
                +
                +
                +
                + + Kumo wrote on 2009-07-11 00:44: +
                +
                +

                Will you publish the slides of the 2nd talk, as you did with the 1st?

                I am looking forward to reading the slides as well as more comments about the talks.

                Keep the good work!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/06/jit-progress-7289127796450840053.html b/posts/2009/06/jit-progress-7289127796450840053.html new file mode 100644 index 000000000..8367eb0bb --- /dev/null +++ b/posts/2009/06/jit-progress-7289127796450840053.html @@ -0,0 +1,368 @@ + + + + + +JIT progress | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                JIT progress

                + + + +
                +

                In the last days I finally understood how to do virtualizables. Now the frame overhead is gone. This was done with the help of discussion with Samuele, porting ideas from PyPy's first JIT attempt. +

                +

                +This is of course work in progress, but it works in PyPy (modulo a few XXXs, but no bugs so far). The performance of the resulting code is quite good: even with Boehm (the GC that is easy to compile to but gives a slowish pypy-c), a long-running loop typically runs 50% faster than CPython. That's "baseline" speed, moreover: we will get better speed-ups by applying optimizations on the generated code. Doing so is in progress, but it suddenly became easier because that optimization phase no longer has to consider virtualizables -- they are now handled earlier. +

                +

                Update: Virtualizables is basically a way to avoid frame overhead. The frame object +is allocated and has a pointer, but the JIT is free to unpack its fields (for example python +level locals) and store them somewhere else (stack or registers). Each external (out of jit) access +to a frame managed by the jit needs to go via special accessors that can ask the jit where those variables +are.

                +
                +

                Comments

                +
                +
                +
                + + Luis wrote on 2009-06-23 22:06: +
                +
                +

                I have no clue of what you're talking about, bit it sounds great! Keep it up!!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-06-23 23:51: +
                +
                +

                What are virtualizables?

                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2009-06-24 00:06: +
                +
                +

                From what I understand virtualizables are objects that you use to represent objects that are expensive to construct. For example frame objects in python are very expensive so they are virtualizables and if a function is executed and it doesn't try to access its frame object it is never created.

                Probably armin can give a more precise answer.

                What I want to know, couldn't CPython have virtualizables for frame objects? I guess the answer is that it could but would involve a lot of C code.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-06-24 00:09: +
                +
                +

                Ok, I updated the post with quick explanation of what actually virtualizables are. Leonardo: you need compiler in the first place for that :-) Psyco has some kind of virtualizables (but psyco frames are read only).

                Cheers,
                fijal

                +
                +
                +
                +
                + + Unknown wrote on 2009-06-24 10:12: +
                +
                +

                Could you use virtualizables to avoid constructing the frame at all, and then only allocate it if it is accessed?

                +
                +
                +
                +
                + + Anonymous wrote on 2009-06-24 14:22: +
                +
                +

                @Leonardo:

                I'm guessing that yes, CPython COULD have virtualizables. However, the people who built CPython a) didn't know about them, b) didn't know how to code that in "C", or c) didn't consider it a priority item.

                Either way, these are the types of advantages I would imagine coding python using python would expose. Optimize what you need to, and then start to see the real ROI of PyPy!

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2009-06-24 14:50: +
                +
                +

                @Ben: no. In the current incarnation, the JITs generated by PyPy optimize only hot loops, when they are executed more than N times. At that point, the frame object has already been allocated.

                The real advantage of virtualizables is that they allows to:

                1) produce very fast code, as if the frame weren't allocated at all (e.g. by storing local variables on the stack or in the registers)

                2) they don't compromise the compatibility with CPython; in particular, sys._getframe() & co. still works fine, because the JIT knows how and when to synchronize the virtualizable (i.e., the frame) with the values that are on the stack.


                @gregturn: I don't see how you can implement something similar to virtualizables without writing a compiler, and CPython is not such a thing :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/06/news-from-jit-front-367552118380842303.html b/posts/2009/06/news-from-jit-front-367552118380842303.html new file mode 100644 index 000000000..e327667a9 --- /dev/null +++ b/posts/2009/06/news-from-jit-front-367552118380842303.html @@ -0,0 +1,468 @@ + + + + + +News from the jit front | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                News from the jit front

                + + + +
                +

                +As usual, progress is going slower than predicted, +but nevertheless, we're working hard to make some progress. +

                +

                +We recently managed to make our nice GCs cooperate with our JIT. This is +one point from our detailed plan. As of now, we have a JIT with GCs and +no optimizations. It already speeds up some things, while slowing down +others. The main reason for this is that the JIT generates assembler which is kind +of ok, but it does not do the same level of optimizations gcc would do. +

                +

                +So the current status of the JIT is that it can produce assembler out +of executed python code (or any interpreter written in RPython actually), +but the results are not high quality enough since we're missing optimizations. +

                +

                +The current plan, as of now, looks as follows: +

                +
                  +
                • +Improve the handling of GCs in JIT with inlining of malloc-fast + paths, that should speed up things by a constant, not too big factor. +
                • +
                • +Write a simplified python interpreter, which will be a base for experiments + and to make sure that our JIT does correct things with regard to + optimizations. That would work as mid-level integration test. +
                • +
                • +Think about ways to inline loop-less python functions into their parent's loop. +
                • +
                • +Get rid of frame overhead (by virtualizables) +
                • +
                • +Measure, write benchmarks, publish +
                • +
                • +Profit +
                • +
                + +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-06-16 08:03: +
                +
                +

                nice to see the progresses on pypy jit!!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-06-16 09:22: +
                +
                +

                Do you expect to produce jit faster, then Unladen-Swallow's LLVM based ?

                +
                +
                +
                +
                + + Anonymous wrote on 2009-06-16 13:20: +
                +
                +

                Thanks for all the hard work, guys. Keep it up!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-06-16 13:46: +
                +
                +

                ah, this jit business is so exciting!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-06-16 17:00: +
                +
                +

                I am not really shure how this plan relates to the roadmap that was presented in April.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-06-16 18:15: +
                +
                +

                How this plan relates: it does not. Fijal's style is to give the current idea of the plans. Don't believe him too much :-) This and April's plan need somehow to be added to each other, or something :-)

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-06-16 18:22: +
                +
                +

                Unladen-Swallow's LLVM JIT is a very different beast: it compiles each Python function as a unit. You can only get a uniform bit of speedup this way (maybe 2-3x). By contrast, what we are doing gives a non-uniform speedup: like Psyco, we will probably obtain speedups between 2x and 100x depending on the use case.

                (Of course the plan is to be faster than Psyco in the common case :-)

                +
                +
                +
                +
                + + Luis wrote on 2009-06-17 00:11: +
                +
                +

                Armin: regarding Unladen-Swallow, does this approach prevent coming up later with a tracing jit? Or it could be done on top of it?

                +
                +
                +
                +
                + + Nighteh3 wrote on 2009-06-17 05:45: +
                +
                +

                Sweet !! Good luck guys :)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-06-17 05:55: +
                +
                +

                No no no no, trust me :-)

                The thing is that I'm trying to present "current plan"
                as live as it can be. Which means we might change
                our mind completely. But otherwise, the whole blog
                would be mostly empty and boring...

                Cheers,
                fijal

                +
                +
                +
                +
                + + tobami wrote on 2009-06-17 11:22: +
                +
                +

                Could you please, elaborate on the second point about a simplified python interpreter?

                +
                +
                +
                +
                + + tobami wrote on 2009-06-17 11:26: +
                +
                +

                Also, wouldn't it be better to refactor the plan as follows?:

                - Improve the handling of GCs in JIT with inlining of malloc-fast paths, that should speed up things by a constant, not too big factor.
                - Measure, write benchmarks
                - Write a simplified python interpreter, which will be a base for experiments and to make sure that our JIT does correct things with regard to optimizations. That would work as mid-level integration test.
                - Think about ways to inline loop-less python functions into their parent's loop.
                - Measure, publish benchmarks, RELEASE 1.2
                - Get rid of frame overhead (by virtualizables)
                - Measure, publish benchmarks
                - Iterate...

                +
                +
                +
                +
                + + Anonymous wrote on 2009-06-17 14:01: +
                +
                +

                Concerning current ideas vs April's roadmap: I understand that plans change and that's ok of course. But as April's roadmap isn't mentioned at all, I have no idea how the current ideas relate to the previous roadmap (like the current ideas replace the old road map or parts of it / they are additional ideas and the old roadmap is postponed / they are a detailing of (parts of) April's roadmap). Maybe that's obvious to people with better pypy-knowledge than me. I understand Armin's comment that they are additional ideas.

                Keep up the good work!

                Branko

                +
                +
                +
                +
                + + Anonymous wrote on 2009-06-18 14:40: +
                +
                +

                What about threading? Will we have a GIL-less interpreter in the end (assuming the GCs support that)?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/07/ecoop-2009-8415055006373020774.html b/posts/2009/07/ecoop-2009-8415055006373020774.html new file mode 100644 index 000000000..8a3041112 --- /dev/null +++ b/posts/2009/07/ecoop-2009-8415055006373020774.html @@ -0,0 +1,439 @@ + + + + + +ECOOP 2009 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                ECOOP 2009

                + + + +
                +

                Last week (from 6th to 10th of July) Anto, Armin and I (Carl Friedrich) were in +the magnificent city of Genova, Italy at the ECOOP conference. In this blog +post I want to give a (necessarily personal) account of what we did there.

                +
                +

                Workshop days: ICOOOLPS

                +

                The first two days of the conference were the workshop days. On Monday we +attended the ICOOOLPS workshop, (see the programme of the workshop). We +had gotten two papers accepted at the workshop (one about layering PyPy's JIT +on top of the CLR and one about the basic idea of PyPy's tracing JIT) and +thus gave two presentations at the workshop, one was given by Anto, the other +by me. Both went reasonably well, we got some positive feedback.

                +

                Nearly all the other talks were rather interesting as well. I particularly liked +the one by Hans Schippers, who presented a machine model built on delegation +called delMDSOC. The model is meant to implement most features that a language +would need that makes it possible to separate cross-cutting concerns. In the +talk at ICOOOLPS he presented an extension to the model that adds concurrency +support, using a combination of actors and coroutines. He then showed that the +concurrency mechanisms of Java, Salsa (an extension of Java adding actors) and +Io can be mapped to this model.

                +

                Furthermore there were two interesting invited talks, one by Andreas Gal +(Mozilla), and one by Cliff Click (Azul Systems). Andreas explained how +TraceMonkey works. This was very useful for me, because his talk was just before +mine and I could thus kill most of my introduction about tracing JIT compilers +and have more time for the really interesting stuff :-). Cliff talked about +implementing other languages on top of the JVM and some of the pitfalls in +getting them to perform well.

                +

                All in all, ICOOOLPS was a very enjoyable workshop, also with many interesting +discussions.

                +

                On Tuesday there were more workshops, but also the PyPy tutorial, so I only went +to a few talks of the COP workshop and spent the rest of the morning +preparing the tutorial (see next section).

                +
                +
                +

                Tutorial

                +

                On Tuesday afternoon we gave a PyPy Tutorial, as part of the ECOOP summer +school. The first lesson we learned was that (as opposed to a community +conference) people don't necessarily want to actually take their laptop out and +try stuff. We gave a slow walk-through about the full life-cycle of development +of a dynamic language interpreter using PyPy's tool-chain: Starting from writing +your interpreter in RPython, testing it on top of CPython to translating it to +C, .NET or Java to actually adding hints to get a JIT inserted.

                +

                There were about seven people attending the tutorial, a couple of which were +very interested and were asking questions and discussing. Some of the +discussions were even very technical, e.g. one about the details of our +type-inference algorithm for RPython and why we cannot do a bottom-up analysis +but have to use forward-propagation instead.

                +

                Jan Vitek of Purdue University told of some of the problems of the OVM +project, which is (among other things) a Java implementation in Java (OVM also +wants to support implementing VMs for other languages with it, if I understood +correctly). He said that the project has +essentially gotten too large and complicated, which means that it is very hard +for new people to get into the project. While PyPy doesn't have some of the +problems of a full Java implementation (e.g. right now our concurrency support +is minimal) I definitely think that some of these risks apply to PyPy as well +and we should find ways to improve the situation in this regard. Channeling +Samuele: Somewhere inside the large lumbering blob of PyPy there is an elegant +core trying to get out.

                +
                +
                +

                Main Conference

                +

                From Wednesday till Friday the main conference was happening. Many of the +talks were not all that interesting for me, being quite Java centric. One talk +that I liked a lot was "Making Sense of Large Heaps", which was presented by +Nick Mitchell (IBM). He presented a tool called "Yeti" that can be used to +analyze large heaps of Java programs. The tool uses some clever algorithms and +heuristics to summarize the heap usage of data structures in intelligent ways to +make it easier to find possible memory-wasters in a program. Nick also gave Anto +and me a demo of the tool, where we tried to apply it to pypy-jvm (we found +out that a fifth of the static data in there belongs to the parser/compiler :-( +).

                +

                On each of the days of the conference there was a keynote. I missed the one by +Simon Peyton-Jones on Wednesday about type classes in Haskell. On Thursday, +David Ungar was awarded the Dahl-Nygaard-Prize for his work on the Self +programming language. Subsequently he gave a really inspiring keynote with the +title "Self and Self: Whys and Wherefores" where he recollected Self's history, +both on a technical as well as on a social level. Parts of the talk were +snippets from the movies Self: The Movie and Alternate Reality Kit, both +of which I highly recommend.

                +

                The keynote on Friday was by Cliff Click with the title "Java on 1000 Cores: +Tales of Hardware/Software Co-design". He described the custom CPU architecture +that Azul Systems has developed to run Java server applications on hundreds of +cores. The talk mostly talked about the hardware, which I found very interesting +(but some people didn't care for too much). Azul's CPU is essentially 54 in-order +RISC cores in a single processor. The cores have a lot of extensions that make +it easier to run Java on them, e.g. hardware read- and write-barriers, +hardware-transactional-memory and hardware escape-detection (!).

                +

                In addition to the talks, there is of course always the hallway track (or coffee +track) which is the track where you stand in the hallway and discuss with +people. As usual, this was the most interesting part of the conference. One of +those talks was Anto and me giving a PyPy demo to David Ungar. We had a very +interesting discussion about VM implementation in general and the sort of +debugging tools you need to write in particular. He liked PyPy a lot, which +makes me very happy. He also liked the fact that I have actually read most Self +papers :-).

                +
                +
                +

                Comments

                +
                +
                +
                + + Alexander Kellett wrote on 2009-07-16 19:09: +
                +
                +

                The link to delMDSOC should be https://www.hpi.uni-potsdam.de/hirschfeld/projects/delmdsoc/

                Alex

                +
                +
                +
                +
                + + Anonymous wrote on 2009-07-17 04:10: +
                +
                +

                Glad it went well.

                I gather there wasn't any sprint at EuroPython? I was hoping for some news.

                If you can get a real python implementation out there that is starting to get faster than CPython, you could get some momentum really quickly.

                I hope Unladen Swallow doesn't end up stealing your potential userbase, or at least dividing it.

                +
                +
                +
                +
                + + Donovan Preston wrote on 2009-07-18 01:28: +
                +
                +

                I <3 Self.

                +
                +
                +
                +
                + + Terrence wrote on 2009-07-18 05:45: +
                +
                +

                Is something like your PyPy Tutorial online somewhere? I have a befunge interpreter that I've been meaning to get working on pypy but I have almost no idea where to begin. I've been reading pypy's code on and off for awhile now and it's very slow going. If there were some way to get up and running faster, I would really like to know about it.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-07-20 10:25: +
                +
                +

                You can find the tutorial
                here but the part written down is quite limited. If you need starting points, look at pypy/translator/goal/target*.py (for example targetjsstandalone, which runs our partial JS interpreter).

                +
                +
                +
                +
                + + Terrence wrote on 2009-07-21 08:40: +
                +
                +

                Thank you for the link. I had started with targetpypystandalone.py, which, on reflection, appears to be more towards the deep end of the pool. The javascript target is exactly what I'm looking for.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/07/pypy-numeric-experiments-2221073696038673235.html b/posts/2009/07/pypy-numeric-experiments-2221073696038673235.html new file mode 100644 index 000000000..34d150a07 --- /dev/null +++ b/posts/2009/07/pypy-numeric-experiments-2221073696038673235.html @@ -0,0 +1,434 @@ + + + + + +PyPy numeric experiments | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy numeric experiments

                + + + +
                +

                +Because PyPy will be presenting at the upcoming euroscipy conference, I have been playing recently with the idea of NumPy and PyPy integration. My idea is to integrate PyPy's JIT with NumPy or at least a very basic subset of it. Time constraints make it impossible to hand write a JIT compiler that understands NumPy. But given PyPy's architecture we actually have a JIT generator, so we don't need to write one :-) +

                + +

                +Our JIT has shown that it can speed up small arithmetic examples significantly. What happens with something like NumPy? +

                +

                +I wrote a very minimal subset of NumPy in RPython, called micronumpy (only single-dimension int arrays that can only get and set items), and a benchmark against it. The point of this benchmark is to compare the performance of a builtin function (numpy.minimum) against the equivalent hand-written function, written in pure Python and compiled by our JIT. +

                +

                +The goal is to prove that it is possible to write algorithms in Python instead of C without loss of efficiency. Sure, we can write some functions (like minimum in the following example), but there is a whole universe of other ufuncs which would be cool to have in Python instead, assuming this could be done without a huge loss in efficiency. +

                +

                +Here are the results. This is comparing PyPy svn revision 66303 in the pyjitpl5 branch against python 2.6 with NumPy 1.2.1. The builtin numpy.minimum in PyPy is just a naive implementation in RPython, which is comparable to the speed of a naive implementation written in C (and thus a bit slower than the optimized +version in NumPy): +

                + + + + + + + + + + + + + + + + + +
                NumPy (builtin function)0.12s
                PyPy's micronumpy (builtin function)0.28s
                CPython (pure Python)11s
                PyPy with JIT (pure Python)0.91s
                +

                +As we can see, PyPy's JIT is slower than NumPy's optimized C version, but still much faster than CPython (12x). +

                +

                +Why is it slower? When you actually look at assembler, it's pretty obvious that it's atrocious. There's a lot of speedup to be gained out of just doing simple optimizations on resulting assembler. There are also pretty obvious limitations, like x86 backend not being able to emit opcodes for floats or x86_64 not being there. Those limitations are not fundamental in any sense and can be relatively straightforward to overcome. Therefore it seems we can get C-level speeds for pure Python implementations of numeric algorithms using NumPy arrays in PyPy. I think it's an interesting perspective that Python has the potential of becoming less of a glue language and more of a real implementation language in the scientific field. +

                +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-07-17 20:50: +
                +
                +

                I have the feeling you are confessing PyPy's secret goal ;-).

                +
                +
                +
                +
                + + Anonymous wrote on 2009-07-18 08:48: +
                +
                +

                a really efficient python for science: THAT would be a real milestone for dynamic languages; and start their era...

                +
                +
                +
                +
                + + tobami wrote on 2009-07-21 10:51: +
                +
                +

                Very, very interesting.

                Something I missed though was a real naive C implementation. You state it is about as fast as "PyPy's micronumpy", but it would have been nice to post the numbers. Of course, the problem is that the code would be different (C, instead of Python), but still...

                +
                +
                +
                +
                + + Anonymous wrote on 2009-07-22 09:37: +
                +
                +

                What would it take to get this really started? Some of our group would happily help here, if there is a sort of a guideline (a TODO list?) that tells what must be done (i.e. as a friend put it, we would be codemonkeys).

                +
                +
                +
                +
                + + Yosef wrote on 2009-07-27 07:19: +
                +
                +

                The difference in pure-python speed is what is most interesting for me, as however much NumPy you use, sometimes important parts of the software still can't be easily vectorized (or at all). If PyPy can let me run compiled NumPy (or Cython) code glued with lightning-fast Python, this leaves me with almost no performance problems. Add to that the convenience of vectorization as a means of writing short, readable code, and it's a winning combination.

                +
                +
                +
                +
                + + Zeev wrote on 2009-07-29 09:37: +
                +
                +

                Saying that implementing efficient code generation for floating point code on x86 in your jit is going to be straight forward is disingenuous.

                +
                +
                +
                +
                + + René Dudfield wrote on 2009-07-30 04:02: +
                +
                +

                Here's a project using corepy, runtime assembler to create a faster numpy:

                https://numcorepy.blogspot.com/

                There's also projects like pycuda, and pygpu which generate numpy code to run on GPUs.

                It gets many times faster than standard numpy.

                pygame uses SDL blitters, and its own blitters - which are specialised array operations for images... these are many times faster than numpy in general - since they are hand optimized assembler, or very efficiently optimised C.

                Remember that hand optimized assembler can be 10x faster than even C, and that not all C code is equal.

                So it seems that even the pypy generated C code could even be faster.

                What about applying pypy to CUDA, or OpenCL C like languages?

                cu,

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-08-02 22:17: +
                +
                +

                @ilume

                I think you're completely missing the point. These experiments are performed using pure-python code that happens to operate on numpy arrays. Assembler generation happens when interpreting this code by the interpreter, so it's not really even the level of hand-written C. Corenumpy on the other hand is trying to speed up numpy operations itself (which is also a nice goal, but completely different).

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-13 09:48: +
                +
                +

                Hi Maciej! Would you mind blogging an update on PyPy / C interfaces and NumPy?

                I am extensively using NumPy / SciPy / NLopt (apart from the stuff I import from there, my stuff is mostly pure Python algorithms, which interpreter spends most time working on).

                The latest improvements in PyPy JIT really sound like if they could magically dramatically speed up my stuff...

                I don't mind trying PyPy out in production if it will yield significant speedups and otherwise debugging why wouldn't it, but I need access to C stuff from within Python.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-04-13 09:53: +
                +
                +

                Stay tuned, I'll blog about it when I have more results. The progress has been slow so far, but it might accelerate

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-13 13:37: +
                +
                +

                Hi! Thanks, can't wait for it... :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/08/gothenburg-jit-sprint-report-3309138497953458138.html b/posts/2009/08/gothenburg-jit-sprint-report-3309138497953458138.html new file mode 100644 index 000000000..2443c4676 --- /dev/null +++ b/posts/2009/08/gothenburg-jit-sprint-report-3309138497953458138.html @@ -0,0 +1,373 @@ + + + + + +Gothenburg JIT sprint report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Gothenburg JIT sprint report

                + + + +
                +

                Finally, we managed to squeeze in some time to write a report about what +has been going on at the mysterious JIT sprint in Gothenburg, Sweden. +The main goals of the sprint were to lay down the groundwork for getting +more JIT work going in the next months and get more of the PyPy developers +up to speed with the current state of the JIT. One of the elements was +to get better stability of the JIT, moving it slowly from being a prototype to +actually working nicely on larger programs.

                + +

                The secret goal of the sprint was to seek more speed, which Anto and +Carl Friedrich did even during the break day:

                + + +

                We spent the first two days improving test coverage of the x86 backend +and the optimizer. Now we have 100% coverage with unittests +(modulo figleaf bugs), which does not mean anything, but it's better +than before.

                + +

                Then we spent quite some time improving the optimizer passes, so +now we generate far less code than before the sprint, because a lot of +it is optimized away. On the interpreter side, we marked more objects +(like code objects) as immutable, so that reading fields from them +can be constant-folded.

                +

                Another important optimization that we did is to remove consecutive +reading of the same fields from the same structure, if no code in between +can change it.

                +

                Our JIT is a hybrid environment, where only hot loops of code are jitted +and the rest stays being interpreted. We found out that the performance +of the non-jitted part was suboptimal, because all accesses to python +frames went through an extra layer of indirection. We removed this layer +of indirection, in the case where the jit and the interpreter cannot +access the same frame (which is the common case).

                +

                We also spent some time improving the performance of our x86 backend, +by making it use more registers and by doing more advanced variable +renaming at the end of loops. It seems that using more registers is not as +much of a win as we hoped, because modern day processors are much +smarter than we thought.

                +

                The most mind bending part was finding why we lose performance by +making the JIT see more of the interpreter. It took us two very frustrating +days and 36 gray hairs to find out that from the JIT we call a different malloc +function in the Boehm GC, which is by far slower than the version that +we use from the interpreter. This meant that the more we jitted, the +slower our code got, purely because of the mallocs.

                +

                Now that this is fixed, the world makes much more sense again.

                +

                A lot of the sprint's work is not directly measurable in the performance +figures, but we did a lot of work that is necessary for performance to +improve in the next weeks. After we have done a bit more work, we should +be able to provide some performance figures for programs that are +more realistic than just loops that count to ten million (which are +very fast already :).

                +

                Now we're going to enjoy a couple of days off to recover from the sprint.

                +

                Bästa hälsningar,
                +Carl Friedrich, fijal

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-08-25 14:26: +
                +
                +

                Excellent summary. You should never doubt the value of these updates, they are essential for maintaining awareness.

                +
                +
                +
                +
                + + Bourne wrote on 2009-08-25 18:11: +
                +
                +

                Congrats on your impressive work! This sounds more and more promising.

                +
                +
                +
                +
                + + Freakazo wrote on 2009-08-28 15:15: +
                +
                +

                Updates like this are extremely interesting to read, and it gives me a months worth of new terms and technology to learn :D

                Can't wait to use Pypy!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.html b/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.html new file mode 100644 index 000000000..9ddbc1f0f --- /dev/null +++ b/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.html @@ -0,0 +1,409 @@ + + + + + +PyPy gets a new compiler | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy gets a new compiler

                + + + +
                +

                Today, I merged the parser-compiler branch, which I have been working on over the summer. It contained a total rewrite of both PyPy's Python parser and AST compiler. PyPy's old parser was (in)famous internally for being complicated and slow (with many algorithmic complexities greater than O(n)). The new parser is an as-simple-as-I-could-make-it LL(1) parser like CPython's (though it doesn't share the hacks of CPython's parser).

                + +

                The new compiler is based on the Abstract Syntax Trees (AST) that CPython 2.5 introduced instead of PyPy's old AST based on the compiler package's. This means that Python code running on PyPy will be able to use the same _ast interface as CPython. PyPy's _ast implementation supports AST features that CPython 2.6 added, including compiling modified AST to bytecode and executing it. In this rewrite, some more obscure compiler features were added, too. For example, jumps in bytecode can now be greater than 65535 bytes! (That's like an if statement with 7000 lines of code in the body.)

                + +

                While the PyPy translation toolchain still has many obscure details and hacks, this merge completes the process of making the actual Python interpreter very clean. Hopefully, this will make adding new features much easier and make PyPy less frustrating to maintain as well as providing application level code with an improved AST interface!

                +
                +

                Comments

                +
                +
                +
                + + Jeremy Cowles wrote on 2009-08-25 23:03: +
                +
                +

                Nice, keep up the good work!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-08-26 08:12: +
                +
                +

                Thank you.. Keep it up.

                +
                +
                +
                +
                + + random user wrote on 2009-08-26 17:52: +
                +
                +

                Very nice. Thanks for all of your work!

                +
                +
                +
                +
                + + tobami wrote on 2009-08-31 10:43: +
                +
                +

                Hi, the Gothenburg sprint news are very interesting.

                What are your thoughts about a release roadmap?. Do you intend to release a pypy 1.2 with improved compatibility and speed but no JIT, and later include the JIT (version 1.5, maybe?)?.

                I think publishing some kind of roadmap would be useful, as a project suffers when its release cycles are BOTH long and unpredictable.

                +
                +
                +
                +
                + + tobami wrote on 2009-08-31 10:51: +
                +
                +

                Also, starting from the next stable release, it would be great to publish some kind of benchmarks page to keep track of performance across different versions (cpython 2.6 vs pypy 1.1 vs pypy 1.2 vs pypy with JIT).

                Now that I think of it, do you need some kind of help with the website?. I think starting with the next pypy's release, the project will get a lot more visibility and a nicer and better structured website would be a definite plus. If you feel it would be a useful task I could help there.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-08-31 16:05: +
                +
                +

                Hey.

                Both, the benchmarks (that would also include say jython) and a nice website for people who actually want to use it would be a very nice addon. We definitely would appreciate some help with it.

                If you have any ideas feel free to continue discussion on pypy-dev.

                Cheers,
                fijal

                +
                +
                +
                +
                + + tobami wrote on 2009-08-31 21:46: +
                +
                +

                Hi Maciej, as you suggested, I have subscribed to the pypy-dev mailing list and have started the discussion.

                Cheers,

                Miquel

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-09-01 15:47: +
                +
                +

                Hey Miguel.

                I fail to see your post.

                Cheers,
                fijal

                +
                +
                +
                +
                + + tobami wrote on 2009-09-02 10:28: +
                +
                +

                it got rejected. I have written to pypy-dev-owner to see where the problem is.

                Cheers

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-09-05 10:13: +
                +
                +

                @tobami

                you should subscribe to the list first.
                We get far too much spam to accept
                posts from non-members.

                Cheers,
                fijal

                +
                +
                +
                +
                + + tobami wrote on 2009-09-05 21:21: +
                +
                +

                @Maciej,

                well, I subscribed first, that is the problem. I now get sent the pypy-dev mailing list, but my post got rejected anyway. And pypy-owner hasn't answered yet.

                What can I do?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-09-06 19:57: +
                +
                +

                @tobami

                you did something wrong. pypy-dev
                is not a moderated list (from
                members, that is). Can you leave your mail, so we can no longer spam here? Mine is fijal at merlinux.eu

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/09/first-results-of-jit-6674537807334018925.html b/posts/2009/09/first-results-of-jit-6674537807334018925.html new file mode 100644 index 000000000..88d89f7eb --- /dev/null +++ b/posts/2009/09/first-results-of-jit-6674537807334018925.html @@ -0,0 +1,710 @@ + + + + + +First results of the JIT | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                First results of the JIT

                + + + +
                +

                Hi all,

                + +

                Just a quick note to tell you that we are progressing on the +JIT front. Here are the running times of the richards +benchmark on my laptop:

                + +
                  +
                • 8.18 seconds with CPython 2.5.2; + +
                • +
                • 2.61 seconds with pypy-c-jit (3x faster than CPython); + +
                • +
                • 1.04 seconds if you ignore the time spent making assembler (8x faster than CPython); + +
                • +
                • 1.59 seconds on Psyco, for reference (5x faster than CPython).
                • +
                +

                Yes, as this table shows, we are spending 1.57 seconds in the JIT +support code. That's too much -- even ridiculously so -- for anything but a +long-running process. We are working on that :-)

                + +

                If you want to build your own pypy-c-jit (for x86-32 only for now):

                + +
                  +
                • you need a Subversion checkout of trunk; + +
                • +
                • run pypy/translator/goal/translate.py with the -Ojit + option; + +
                • +
                • as usual, wait a long time (and be sure you have more than 1GB of RAM).
                • +
                +

                For now pypy-c-jit spews a lot of debugging output and +there are a few known +examples where it crashes. As we like to repeat, however, it's a complete JIT: +apart from the crashes (the bugs are probably in the JIT support code), it supports the whole Python language from the start -- in the sense of doing correct things. Future work includes +Python-specific improvements by e.g. tweaking the data structures used to store Python objects so that they are more JIT-friendly.

                + +

                EDIT: Oh yes, fijal reminds me that CPython 2.6 is 30% faster than CPython 2.5 on this benchmark (which is mostly my "fault", as I extracted a small part of PyPy and submitted it as a patch to CPython that works particularly well for examples like richards). It does not fundamentally change the fact that we are way faster though.

                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2009-09-27 17:56: +
                +
                +

                This thing just got interesting.

                Why this particular benchmark?

                +
                +
                +
                +
                + + cjrh wrote on 2009-09-27 19:32: +
                +
                +

                Fantastic!

                At this point, it would be a really good idea for the pypy team to prepare downloadable binaries or setup tools, or eggs for making it extremely easy for a new user to try it out. Now that the performance is starting to become interesting, many more people will want to experiment with it and you don't want that enthusiasm hampered by a somewhat involved setup process.

                +
                +
                +
                +
                + + Unknown wrote on 2009-09-27 19:40: +
                +
                +

                > it would be a really good idea for the pypy team to prepare downloadable binaries or setup tools, or eggs

                I second this notion. I am among the group of people who are quite tempted to try things out, but not sure how much work I'll have to do first.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-09-27 20:27: +
                +
                +

                Me too I'd like pre-built binaries

                +
                +
                +
                +
                + + Anonymous wrote on 2009-09-27 20:29: +
                +
                +

                I agree. Please put some binaries on your page to make it easier for everyone to survey what you've done!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-09-27 20:30: +
                +
                +

                This particular benchmark happens to be the one we use; there is no deep reason besides its relative simplicity (but it is not a microbenchmark, it's really computing something). We will of course make more tests with a better range of benchmarks when things start to settle a bit. Right now we are busy developing, and the numbers change every week.

                It's also for this reason that there is no nicely-packaged release, sorry :-)
                Note that translating your own pypy-c-jit is not a lot of work for you. It is just a lot of work for your CPU :-) You just do "svn co", "cd pypy/translator/goal" and "./translate -Ojit".

                +
                +
                +
                +
                + + Anonymous wrote on 2009-09-27 20:41: +
                +
                +

                I would appreciate binaries because I don't have a computer with multi-GB RAM. I tried translating pypy a few months ago but gave up after several hours (the computer was just swapping constantly).

                I can wait some longer, but regular binary releases (even if just unstable trunk snapshots) would be useful.

                Anyway, keep up the good work! This is looking really promising.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-09-28 00:47: +
                +
                +

                Quite nice, thank you..

                +
                +
                +
                +
                + + PavPanchekha wrote on 2009-09-28 03:27: +
                +
                +

                Perhaps there is some way to store generated assembler code? I don't know too much about assembler or the JIT backend, but I assume that it'd be possible to stick the generated assembler code into a comment (or, if those don't exist, a docstring) in the .pyc file, so that a library that is commonly imported won't have to waste time generating assembler.

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2009-09-28 03:45: +
                +
                +

                @PavPanchekha We specialize the assembler aggressively, so that probably wouldn't be so useful. We have a lot of room to improve on assembly generation, though.

                +
                +
                +
                +
                + + Unknown wrote on 2009-09-28 07:09: +
                +
                +

                Thanks for the update!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-09-28 11:40: +
                +
                + Anonymous said: I would appreciate binaries because I don't have a computer with multi-GB RAM.

                I do have such a computer, but I would still appreciate binaries, because the current trunk does not translate for me:

                [translation:ERROR] File "/tmp/pypy/pypy/annotation/annrpython.py", line 227, in addpendingblock
                [translation:ERROR] assert s_oldarg.contains(s_newarg)
                [translation:ERROR] AssertionError':
                [translation:ERROR] .. v1703 = simple_call((function mmap), v1702, map_size_0, (7), (34), (-1), (0))
                [translation:ERROR] .. '(pypy.rlib.rmmap:628)alloc' +
                +
                +
                +
                + + Armin Rigo wrote on 2009-09-28 11:59: +
                +
                +

                You are probably missing a dependency. See https://codespeak.net/pypy/dist/pypy/doc/getting-started-python.html#translating-the-pypy-python-interpreter

                +
                +
                +
                +
                + + della wrote on 2009-09-28 13:43: +
                +
                +

                Great work! Is it possible to build the 32-bit binary on a 64-bit machine without too much effort? Having those instructions would certainly help us 64-bit people :)

                +
                +
                +
                +
                + + Luis wrote on 2009-09-28 14:02: +
                +
                +

                I guess the time spent making assembler is only the first time the code is executed. Is that right? If so, we can consider an 8x speedup as the most accurate result. Or not?

                +
                +
                +
                +
                + + nshepperd wrote on 2009-09-28 14:41: +
                +
                +

                @della: I use a 32-bit chroot on my own x64 machine. I don't know if that counts as "too much effort" (certainly it theoretically shouldn't require that), but it has been for me the most painless way to do it.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-09-28 15:16: +
                +
                +

                @Luis: yes, it's only first time.
                Well, depends how you count, but it
                can be considered 8x speedup...

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-09-28 20:17: +
                +
                +

                Here are prebuilt C sources (in which "tracing" time was reduced by 20-30% since the blog post):

                https://wyvern.cs.uni-duesseldorf.de/~arigo/chain.tar.bz2

                Linux x86-32 only. You still need a svn checkout of PyPy, and you still need to compile them with gcc -- but it does not take too long: edit the first entry of the Makefile to point to your checkout of PyPy and type "make". This still assumes that all dependencies have been installed first. Don't complain if the #includes are at the wrong location for your distribution; you would get them right if you translated the whole thing yourself. In fact, don't complain if it does not compile for any reason, please :-) C sources like that are not really supposed to be portable, because they are just intermediates in the translation process.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-09-28 21:28: +
                +
                +

                You are probably missing a dependency. See https://codespeak.net/pypy/dist/pypy/doc/getting-started-python.html#translating-the-pypy-python-interpreter

                Dear Armin, it seems like this document should mention libexpat1-dev and libssl-dev as dependencies, too. Anyway, I managed to build pypy-c, and here are the results for some small benchmarks I wrote. (Is there a way here at blogger.com to not break the formatting?)

                python 2.5 psyco pypy-c
                richards 14.9 2.9 3.9
                mergesort 27.6 4.8 26.3
                convexhull 9.4 5.6 6.3
                bigcityskyline 46.9 3.1 7.6
                fft 14.1 15.4 25.0

                Thank you all for your efforts.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-09-29 07:47: +
                +
                +

                Thanks for the missing dependencies; added to the development version of the page. Thanks also for the numbers you report. The next obvious thing we miss is float support (coming soon!), which shows in some of your benchmarks.

                +
                +
                +
                +
                + + René Dudfield wrote on 2009-09-29 08:06: +
                +
                +

                Hi,

                this is so unbelievably awesome, it's going to take me a while to recover from all the awesomeness.

                CONGRATS!

                ps. a nice improvement for users is to get your ./configure script to find dependencies and report the ones missing, and ones used (s/configure/setup.py/g).

                +
                +
                +
                +
                + + Anonymous wrote on 2009-09-29 13:09: +
                +
                +

                nice!

                so what is your guess at the moment? how fast can pypy get if you further optimize the jit?

                +
                +
                +
                +
                + + Anonymous wrote on 2009-09-29 13:47: +
                +
                +

                Dear Pypy developers, is it possible to switch off the very aggressive JIT logging in pypy-c? First, this could make pypy-c a drop-in replacement for cpython. (Many more beta-testers.) Second, the logging itself seems to be somewhat resource-intensive.

                Very cool Mandelbrot ascii art, by the way.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-09-30 08:42: +
                +
                +

                Dear anonymous.

                you can compile ./translate.py -Ojit --jit-debug=profile

                There is no runtime switch unfortunately, so far.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2009-09-30 10:32: +
                +
                +

                Thank you! For many of us, the translation-time switch will be just as good.

                +
                +
                +
                +
                + + della wrote on 2009-10-01 09:31: +
                +
                +

                I can't seem to compile (32-bit Ubuntu 9.10 chroot), by manually executing the Makefile in /tmp/usession-0/testing_1 I get this traceback:

                File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 1210, in (module)
                tracker.process(f, g, filename=fn)
                File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 229, in process
                lines = self.process_function(lines, entrypoint, filename)
                File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 244, in process_function
                table = tracker.computegcmaptable(self.verbose)
                File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 285, in computegcmaptable
                self.parse_instructions()
                File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 364, in parse_instructions
                meth = self.find_missing_visit_method(opname)
                File "/home/della/pkg/pypy/trunk/pypy/translator/c/gcc/trackgcroot.py", line 390, in find_missing_visit_method
                raise UnrecognizedOperation(opname)
                __main__.UnrecognizedOperation: jc

                there are some type warnings also for pointers, I don't know if they could be any useful. Maybe you can help me?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-10-01 16:56: +
                +
                +

                Thanks for the report, della. Fixed, if you want to try again. Parsing gcc output is a bit delicate as the exact set of operations used depends on the specific version and command-line options passed to gcc.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-10-01 16:58: +
                +
                +

                Since the blog post, here are the updated numbers: we run richards.py in 2.10 seconds (almost 4x faster than CPython), and only spend 0.916 seconds actually running the assembler (almost 9x faster than CPython).

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-01 18:25: +
                +
                +

                Very nice. Do you expect to get faster than psyco?

                +
                +
                +
                +
                + + Luis wrote on 2009-10-02 01:27: +
                +
                +

                This is very exciting! Please, try to post updates to these figures... thanks!

                +
                +
                +
                +
                + + Unhelpful wrote on 2009-10-04 01:13: +
                +
                +

                I was having the same problem as della, and your fix seems to work, but it's breaking somewhere else now. I don't think I have a dependency problem, I can build a working pypy-c without jit. Running make manually produces heaps of warnings about incompatible pointers, some probably harmless (int* vs long int*, these should be the same on x86-32), but others worry me more, like struct stat* vs struct stat64*, or struct SSL* vs char**. I put the complete output of a manual run of make online.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-04 01:40: +
                +
                +

                Interestingly, the translation itself seems to consume at most about 960MB of ram. It's easy to translate on a system even with only a gig of ram if you stop everything else.

                Try switching run levels or the like.

                The -Ojit option seems to cause an error in translation with Revision 68125, when translated using Python 2.5.2 on Debian Lenny.

                +
                +
                +
                +
                + + proteusguy wrote on 2009-10-04 07:31: +
                +
                +

                First off - congratulations and good job on the great progress. I've been watching this project since the 2007 PyCon in DFW and it's great to see these promising results.

                That said, while I know there's still a lot of work to do and this is very much an in-progress thing, I'm very much looking forward to an excuse to try this stuff out in anger - real practical situations. For me that means some statistical calculation engines (Monte Carlo analysis) front ended by web services. In both situations this brings up two constraints: a) must support 64bit (because our data sets rapidly go above 4GB RAM) and b) must not be overly memory hungry (because any significant incremental overhead really hurts when your data sets are already over 4GB RAM).

                For now we use Psyco for small stuff but have to re-implement in C++ once we hit that 32-bit limit. PyPy is very exciting as a practical alternative to Psyco because of anticipated 64bit support. I wonder if, due to the existence of Psyco already, PyPy shouldn't focus first on 64bit instead?

                Few things would speed up progress more than getting PyPy used out in the wild - even if only by those of us who appreciate it's very much in flux but still understand how to benefit from it.

                I understand you guys have your focus and goals and encourage you to keep up the good work. Just thought I'd throw this out as an idea to consider. I'm sure there are a lot like me anxious to give it a spin.

                -- Ben Scherrey

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-10-04 11:25: +
                +
                +

                Andrew: can you update and try again? If you still have the .c files around it is enough to go there and type "make"; otherwise, restart the build. It should still crash, but give us more information about why it does.

                +
                +
                +
                +
                + + Unhelpful wrote on 2009-10-04 16:04: +
                +
                +

                The new traceback is:

                Traceback (most recent call last):
                File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 1211, in <module>
                assert fn.endswith('.s')
                AssertionError

                Is the position in input tracked? that might help, or I could package my .gcmap files.

                +
                +
                +
                +
                + + Unhelpful wrote on 2009-10-04 16:15: +
                +
                +

                The trouble seems to be implement.gcmap and implement_9.gcmap. These are both empty, and trigger the assertion error.

                Running trackgcroot as the Makefile does, but without those two files, permits compilation to continue, but linking fails with undefined references to various symbols with the prefix 'pypy_g_'.

                I suspected the changes might have invalidated the old .gcmap files, so I tried removing them, and got this when it tried to generate implement.gcmap:

                Traceback (most recent call last):
                File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 1214, in <module>
                tracker.process(f, g, filename=fn)
                File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 229, in process
                lines = self.process_function(lines, entrypoint, filename)
                File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 244, in process_function
                table = tracker.computegcmaptable(self.verbose)
                File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 285, in computegcmaptable
                self.parse_instructions()
                File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 365, in parse_instructions
                insn = meth(line)
                File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 741, in visit_jmp
                self.conditional_jump(line)
                File "/home/chshrcat/build/pypy-trunk/pypy/translator/c/gcc/trackgcroot.py", line 757, in conditional_jump
                raise UnrecognizedOperation(line)
                __main__.UnrecognizedOperation: jmp T.14141

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-04 16:19: +
                +
                +

                A correction/clarification to last night's post:

                There isn't a bug in the -Ojit translation process, I was just missing a dependency that I could've sworn I've installed before.

                The translation process only takes < 1GB memory if done without any options. Attempting to translate with the -Ojit option takes at least 2.5GB of RAM, as I tried last night (with it as the only running process) and it consumed my swapfile and ran out of memory.

                Is there any documented way to use a translated pypy binary to build other pypy translations? That might help reduce the build requirements, and would also be mighty cool.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-10-04 16:50: +
                +
                +

                NickDaly: checked in, please try. Also, please come to the mailing list instead of posting here if you have further comments to make... https://codespeak.net/mailman/listinfo/pypy-dev

                +
                +
                +
                +
                + + Michael Allman wrote on 2009-10-05 10:56: +
                +
                +

                Is pypy-c-jit written in C or Python or something else? I ask because of the "c" in pypy-c-jit.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-10-05 14:28: +
                +
                +

                Michael: It is written in RPython (a subset of Python) but then translated to C. By convention we therefore call the executable-name pypy-c. If the executable also contains a JIT, we call it pypy-c-jit.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-10-05 15:48: +
                +
                +

                Ben Scherrey: 64bit support might happen not too far in the future. Not using too much memory is a different problem, that might take a while longer. It has two aspects, one is that the JIT itself uses way too much memory at the moment. We will work on that soon.

                The other aspect is making sure that your dataset does not take too much heap. It depends a bit which data structures you use, but it's not likely to be that great right now. That might change at some point, I have some ideas in that direction, but not really time to work on them soon.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/09/pypy-sprint-in-dusseldorf-6-nov-13-nov-8153983964308175836.html b/posts/2009/09/pypy-sprint-in-dusseldorf-6-nov-13-nov-8153983964308175836.html new file mode 100644 index 000000000..27c232b1a --- /dev/null +++ b/posts/2009/09/pypy-sprint-in-dusseldorf-6-nov-13-nov-8153983964308175836.html @@ -0,0 +1,343 @@ + + + + + +PyPy sprint in Düsseldorf, 6 Nov - 13 Nov | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy sprint in Düsseldorf, 6 Nov - 13 Nov

                + + + +
                +

                The next PyPy sprint will be held in the Computer Science department of +Heinrich-Heine Universität Düsseldorf from the 6th to the 13th of +November 2009. This is a fully public sprint, everyone is welcome to +join us.

                +
                +

                Topics and goals

                +

                At the sprint we intend to work on the JIT generator in PyPy and on +applying it to the PyPy Python interpreter.

                +

                The precise work that will be done is not fixed, as we don't know in +which state the JIT will be in November. However, possible areas of +work might include:

                +
                  +
                • tweaking the interpreter/objspace to be more JIT-friendly, e.g. +instance implementation code, call code
                • +
                • if there is interest, starting non-x86-32 JIT backends
                • +
                • trying out existing software to find features where the optimizations +of the JIT could be improved
                • +
                • improving our benchmarking infrastructure
                • +
                +

                We will give special priority to topics that "non-core" people find +interesting (as long as they are somehow JIT-related).

                +

                For an introduction of how our JIT-generation process works, please +refer to our blog:

                +

                https://morepypy.blogspot.com/2009/03/jit-bit-of-look-inside.html

                +

                There is also a more dense academic paper about the subject:

                +

                https://codespeak.net/svn/pypy/extradoc/talk/icooolps2009/bolz-tracing-jit-final.pdf

                +
                +
                +

                Location

                +

                The sprint will take place in a seminar room of the computer science +department. It is in the building 25.12 of the university campus. For +travel instructions see

                +
                +https://stups.cs.uni-duesseldorf.de/anreise/esbahn.php +
                +
                +
                +

                Registration

                +

                If you'd like to come, please subscribe to the pypy-sprint mailing +list and drop a note about your interests and post any questions. +More organisational information will be sent to that list. We'll keep a +list of people which we'll update (which you can do yourself if +you have codespeak commit rights).

                +
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2009-09-25 08:53: +
                +
                +

                Following the svn mailing list, there appears to have been a number of quite large refactorings of the JIT recently. Is there a good description of what they are going to achieve, and what the performance gains are? A blog post with an update would be really cool

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.html b/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.html new file mode 100644 index 000000000..28f911ccc --- /dev/null +++ b/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.html @@ -0,0 +1,511 @@ + + + + + +First pypy-cli-jit benchmarks | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                First pypy-cli-jit benchmarks

                + + + +
                +

                As the readers of this blog already know, I've been working on porting the +JIT to CLI/.NET for the last months. Now that it's finally possible to get a +working pypy-cli-jit, it's time to do some benchmarks.

                +

                Warning: as usual, all of this has to be considered to be an alpha version: +don't be surprised if you get a crash when trying to run pypy-cli-jit. Of +course, things are improving very quickly so it should become more and more +stable as days pass.

                +

                For this time, I decided to run four benchmarks. Note that for all of them we +run the main function once in advance, to let the JIT recognize the hot +loops and emit the corresponding code. Thus, the results reported do +not include the time spent by the JIT compiler itself, but give a good +measure of how good the code generated by the JIT is. At this point in time, +I know that the CLI JIT backend spends way too much time compiling stuff, but +this issue will be fixed soon.

                +
                +
                  +
                • +f1.py: this is the classic PyPy JIT benchmark. It is just a function +that does some computationally intensive work with integers.
                • +
                • +floatdemo.py: this is the same benchmark involving floating point +numbers that has already been described in a previous blog post.
                • +
                • +oodemo.py: this is just a microbenchmark doing object oriented stuff +such as method calls and attribute access.
                • +
                • +richards2.py: a modified version of the classic richards.py, with a +warmup call before starting the real benchmark.
                • +
                +
                +

                The benchmarks were run on a Windows machine with an Intel Pentium Dual Core +E5200 2.5GHz and 2GB RAM, both with .NET (CLR 2.0) and Mono 2.4.2.3.

                +

                Because of a known mono bug, if you use a version older than 2.1 you need +to pass the option -O=-branch to mono when running pypy-cli-jit, else it +will just loop forever.

                +

                For comparison, we also run the same benchmarks with IronPython 2.0.1 and +IronPython 2.6rc1. Note that IronPython 2.6rc1 does not work with mono.

                +

                So, here are the results (expressed in seconds) with Microsoft CLR:

                +
                + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Benchmark | pypy-cli-jit | ipy 2.0.1 | ipy 2.6 | ipy2.01/ pypy | ipy2.6/ pypy
                f1 | 0.028 | 0.145 | 0.136 | 5.18x | 4.85x
                floatdemo | 0.671 | 0.765 | 0.812 | 1.14x | 1.21x
                oodemo | 1.25 | 4.278 | 3.816 | 3.42x | 3.05x
                richards2 | 1228 | 442 | 670 | 0.36x | 0.54x
                +
                +

                And with Mono:

                +
                + ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Benchmark | pypy-cli-jit | ipy 2.0.1 | ipy2.01/ pypy
                f1 | 0.042 | 0.695 | 16.54x
                floatdemo | 0.781 | 1.218 | 1.55x
                oodemo | 1.703 | 9.501 | 5.31x
                richards2 | 720 | 862 | 1.20x
                +
                +

                These results are very interesting: under the CLR, we are between 5x faster +and 3x slower than IronPython 2.0.1, and between 4.8x faster and 1.8x slower +than IronPython 2.6. On the other hand, on mono we are consistently faster +than IronPython, up to 16x. It is also interesting to note that +pypy-cli runs faster on CLR than mono for all benchmarks except richards2.

                +

                I've not investigated yet, but I think that the culprit is the terrible +behaviour of tail calls on CLR: as I already wrote in another blog post, +tail calls are ~10x slower than normal calls on CLR, while being only ~2x +slower than normal calls on mono. richards2 is probably the benchmark that +makes most use of tail calls, thus explaining why we have a much better result +on mono than CLR.

                +

                The next step is probably to find an alternative implementation that does not +use tail calls: this probably will also improve the time spent by the JIT +compiler itself, which is not reported in the numbers above but so far +is surely too high to be acceptable. Stay tuned.

                +
                +

                Comments

                +
                +
                +
                + + Michael Foord wrote on 2009-10-15 15:01: +
                +
                +

                Perhaps you should try another run with the .NET 4 beta. They have at least *mostly* fixed the terrible performance of tail calls there.

                Anyway - interesting stuff, keep up the good work. What is the current state of .NET integration with pypy-cli?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2009-10-15 15:17: +
                +
                +

                Oh, I didn't know about .NET 4 beta. Have you got any link that explains how they fixed the tail call stuff? I'll surely give it a try.

                About the .NET integration: no news from this front. Nowadays I'm fully concentrated on the JIT because I need some (possibly good :-)) results for my phd thesis. When pypy-cli-jit is super-fast, I'll try to make it also useful :-)

                +
                +
                +
                +
                + + Michael Foord wrote on 2009-10-15 15:30: +
                +
                +

                Here's at least one link (with some references) on the tail call improvements in .NET 4:

                https://extended64.com/blogs/news/archive/2009/05/10/tail-call-improvements-in-net-framework-4.aspx

                +
                +
                +
                +
                + + Michael Foord wrote on 2009-10-15 15:31: +
                +
                +

                I'm also intrigued as to why you didn't benchmark IronPython 2.6 on Mono? I thought that on very recent versions of Mono you could build and run IronPython 2.6 fine now?

                +
                +
                +
                +
                + + Michael Foord wrote on 2009-10-15 15:34: +
                +
                +

                Ah, I see now you say that it doesn't work. Hmmm... there are definitely folks who maintain a version that does work (perhaps needing Mono 2.4.3 which I guess is trunk?).

                See the download previews here anyway: https://ironpython-urls.blogspot.com/2009/09/more-from-mono-moonlight-2-monodevelop.html

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-15 16:09: +
                +
                +

                I wonder if this paper would be useful? It's a way to do continuations using the stack on .NET. Maybe you can use it to speed up tail calls?

                https://www.cs.brown.edu/~sk/Publications/Papers/Published/pcmkf-cont-from-gen-stack-insp/

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2009-10-19 11:58: +
                +
                +

                @Michael: from the link you posted, it seems that tail call improvements in .NET 4 are only for x86_64, but my benchmarks were run on 32 bit, so I don't think it makes a difference. Anyway, I'll try to benchmark with .NET 4 soon, thanks for the suggestion.

                @Anonymous: the paper is interesting, but I don't think it's usable for our purposes: throwing and catching exceptions is incredibly costly in .NET, we cannot really use them too heavily. The fact that the paper says nothing about performance is also interesting :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/10/gc-improvements-6174120095428192954.html b/posts/2009/10/gc-improvements-6174120095428192954.html new file mode 100644 index 000000000..d617ca1b7 --- /dev/null +++ b/posts/2009/10/gc-improvements-6174120095428192954.html @@ -0,0 +1,769 @@ + + + + + +GC improvements | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                GC improvements

                + + + +
                +

                In the last week, I (Armin) have been taking some time off the +JIT work to improve our GCs. More precisely, our GCs now take +one or two words less for every object. This further reduces the +memory usage of PyPy, as we will show at the end.

                + +

                Background information: RPython object model

                + +

                We first need to understand the RPython object model as +implemented by our GCs and our C backend. (Note that the +object model of the Python interpreter is built on top of +that, but is more complicated -- e.g. Python-level objects +are much more flexible than RPython objects.)

                + +

                Consider these two RPython classes:

                + +
                +class A:
                +    def __init__(self, x):
                +        self.x = x
                +    def f(self):
                +        return self.x * 42
                +
                +class B(A):
                +    def __init__(self, x, y):
                +        self.x = x
                +        self.y = y
                +    def f(self):
                +        return self.x + self.y
                +
                + +

                The instances of A and B look like this in memory (all cells +are one word):

                + +

                + + + + + +
                GC header | vtable ptr of A | hash | x
                +

                + + + + + + +
                GC header | vtable ptr of B | hash | x | y
                +

                The first word, the GC header, describes the layout. It +encodes on half a word the shape of the object, including where it +contains further pointers, so that the GC can trace it. The +other half contains GC flags (e.g. the mark bit of a +mark-and-sweep GC).

                + +

                The second word is used for method dispatch. It is similar to a +C++ vtable pointer. It points to static data that is mostly a +table of methods (as function pointers), containing e.g. the method f +of the example.

                + +

                The hash field is not necessarily there; it is only present in classes +whose hash is ever taken in the RPython program (which includes being +keys in a dictionary). It is an "identity hash": it works like +object.__hash__() in Python, but it cannot just be the address of +the object in case of a GC that moves objects around.

                + +

                Finally, the x and y fields are, obviously, used to store the value +of the fields. Note that instances of B can be used in places that +expect a pointer to an instance of A.

                + +

                Unifying the vtable ptr with the GC header

                + +

                The first idea of saving a word in every object is the observation +that both the vtable ptr and the GC header store information about +the class of the object. Therefore it is natural to try to only have +one of them. The problem is that we still need bits for the GC flags, +so the field that we have to remove is the vtable pointer.

                + +

                This means that method dispatch needs to be more clever: it +cannot directly read the vtable ptr, but needs to compute it +from the half-word of the GC header. Fortunately, this can be +done with no extra instruction on the assembler level. Here is +how things will look in the end, assuming a 32-bit x86 +machine (but note that as usual we just generate portable C).

                + +

                The trick for achieving efficiency is that we store all +vtables together in memory, and make sure that they don't take +more than 256 KB in total (16 bits, plus 2 bits of alignment). +Here is what the assembler code (produced by the normal C +compiler, e.g. gcc) for calling a method looks like. Before +the change:

                + +
                +MOV EDX, [EAX + 4]               # load the vtable ptr from object EAX
                +MOV EDX, [EDX + method_offset]   # load the function pointer from the vtable
                +CALL EDX
                +
                + +

                Instead, we now have:

                + +
                +MOVZX EDX, [EAX]     # load the 16-bit part of the GC header from EAX
                +MOV EDX, [vtable_start + 4*EDX + method_offset]
                +CALL EDX
                +
                + +

                Note that the complex addressing scheme done by the second MOV +is still just one instruction: the vtable_start and +method_offset are constants, so they are combined. And as the +vtables are anyway aligned at a word boundary, we can use +4*EDX to address them, giving us 256 KB instead of just 64 KB +of vtables.

                + +

                Optimizing the hash field

                + +

                In PyPy's Python interpreter, all application-level objects +are represented as an instance of some subclass of W_Root. +Since all of these objects could potentially be stored in a +dictionary by the application Python program, all these +objects need a hash field. Of course, in practice, only a +fraction of all objects in a Python program end up having +their hash ever taken. Thus this field of W_Root is wasted +memory most of the time.

                + +

                (Up to now, we had a hack in place to save the hash field +on a few classes like W_IntegerObject, but that meant that +the Python expression ``object.__hash__(42)'' would raise +a TypeError in PyPy.)

                + +

                The solution we implemented now (done by some Java GCs, among +others) is to add a hash field to an object when the +(identity) hash of that object is actually taken. This means +that we had to enhance our GCs to support this. When objects +are allocated, we don't reserve any space for the hash:

                + +object at 0x74B028 + + + + +
                ...00...xy
                +

                When the hash of an object is taken, we use its current memory +address, and set a flag in the GC header saying that this +particular object needs a hash:

                + +object at 0x74B028 + + + + +
                ...01...xy
                +

                If the GC needs to move the object to another memory location, +it will make the new version of the object bigger, i.e. it +will also allocate space for the hash field:

                + +object at 0x825F60 + + + + + +
                ...11...xy0x74B028
                +

                This hash field is immediately initialized with the old memory +address, which is the hash value that we gave so far for the +object. To not disturb the layout of the object, we always +put the extra hash field at the end. Of course, once set, +the hash value does not change even if the object needs to +move again.

                + +

                Results

                + +

                Running the following program on PyPy's Python interpreter +with n=4000000:

                + +
                +def make_linked_list(n):
                +    a = None
                +    i = 0
                +    while i < n:
                +        b = X()
                +        b.next = a
                +        a = b
                +        i += 1
                +
                + +

                the two optimizations together save 32 MB of RAM (i.e. 8 bytes +per object). The version of PyPy we measured this with was built +as follows:

                + +
                +./translate.py --gcremovetypeptr targetpypystandalone --objspace-std-withsharingdict
                +
                + +

                The total amount of RAM used on a 32-bit Linux is 247 MB, +completing in 10.3 seconds. On CPython, it consumes 684 MB +and takes 89 seconds to complete... This nicely shows that +our GCs are much faster at allocating objects, and that our +objects can be much smaller than CPython's.

                + +

                Armin Rigo & Carl Friedrich Bolz

                +
                +

                Comments

                +
                +
                +
                + + Shahms wrote on 2009-10-16 16:53: +
                +
                +

                Not really GC related and you may have covered this in another post, but how does PyPy handle id() in a world where the object may move? Is the hash field reused for this when necessary as well? If so, how do you deal with the possibility of another object being allocated at the same address as the original object? If not, how do you avoid having an object's id() change when it's moved?

                +
                +
                +
                +
                + + kbob wrote on 2009-10-16 17:55: +
                +
                +

                Very nice. Using the address for the hash value was especially clever. But how random are those hash values?

                +
                +
                +
                +
                + + Alex wrote on 2009-10-16 19:15: +
                +
                +

                kbob: If PyPy is anything like CPython the randomness isn't so important. The CPython dictionary hash collision resolution strategy is extremely efficient, even amongst hashes with very similar values.

                +
                +
                +
                +
                + + Lucian wrote on 2009-10-16 19:39: +
                +
                +

                This is all sorts of cool. I can't wait for a mostly-production-ready PyPy with JIT.

                On a somewhat related note, how do the JIT and ctypes interact right now, if at all?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-10-16 19:43: +
                +
                +

                Shahms: Excellent question! The implementation of id that we have is basically a weak key dict mapping objects to ids on demand. This has the fun side-effect that the ids of PyPy's objects start with 1 and count up from there.

                This is rather inefficient (e.g. your garbage collections become linearly slower the more objects you have that have their id taken), but there is not much else you can do. Jython uses a similar solution. For this reason, calling id a lot is essentially discouraged in code you want to run on PyPy.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-10-16 19:50: +
                +
                +

                kbob: I think they should be random enough. You get a collision if you ask the hash of object a, then a collection happens that moves a, then you ask object b for its hash and object b happens to be in the place where object a was before. That sounds unlikely.

                If you write contrived code that has a loop that repeatedly allocates an object, asks its hash by putting it into a dict and then forces a nursery collection, you can get collisions: all those objects will be at the beginning of the nursery when their hash is taken. Unlikely again to occur in practice.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-10-16 19:57: +
                +
                +

                Alex: you are right. We use exactly CPython's algorithm for implementing dicts, so having bad hash functions is not a big problem. However, if you really have hash value collisions (e.g. everything hashes to 1) your dict still degenerates to essentially a linear search.

                +
                +
                +
                +
                + + Skandalfo wrote on 2009-10-16 20:05: +
                +
                +

                Wow! You guys are my computing heroes.

                Whenever I talk to other people about your project, I always state you are the best example I can imagine of REAL innovation in computer languages.

                That said, I gather the only thing making id() different from hash() is that you need to guarantee that the values for live objects are always unique.

                You could just use the same strategy as with the hash, sticking the id value along the object the next time the object is moved by the GC.

                Meanwhile, from the time id() is called to the time the object is moved, you can just temporarily store an {address: id} mapping somewhere. Entries would be removed from the map once the objects get moved. From then on the id would be attached to the object.

                If GC cycles are frequent, the map doesn't have to grow too large.

                I don't know if the need for id reuse after the id space gets exhausted is important or not. Once you get to the end of the space, you would have to scan the map and heap to find a convenient "hole" to reuse, I suppose.

                +
                +
                +
                +
                + + Shahms wrote on 2009-10-16 20:19: +
                +
                +

                Thanks, Carl. Following up what Skandalfo said, (although this is probably a poor forum for such discussions), it seems like you could reuse the hash field for id as well. Given that the minimum size for a Python object is > 1 byte, you should have at least that much space for offsetting the hash/id. As the GC/allocator has to store information about addresses and blocks anyway it should be a relatively simple matter of building and maintaining a bloom filter of offsets in use for a particular base address.

                Of course, this also constraints the addresses at which Python objects may be allocated and the lower bits in the address may already be used for other purposes...

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-10-16 20:37: +
                +
                +

                Skandalfo, Shahms: I guess there are possible ways to make id a bit faster than what we have now. What we have now is well-tested and works reasonably enough. I assume anyway that there is not too much Python code whose performance depends critically on having an extremely efficient implementation of id (and if there is, I am prepared to ask the author to rewrite the code instead :-) ).

                +
                +
                +
                +
                + + Skandalfo wrote on 2009-10-16 20:38: +
                +
                +

                Shahms: I confess I don't understand your proposal. Do you mean you can have at most as many live objects as the available address space divided by the object alignment?

                When I talked about id space I wasn't referring to the memory required to store the per-object id value, but the fact that if you assign the id values using sequential values, and those values are, for instance, 64 bit integers, you could theoretically create and destroy a lot of objects in a long lived process and the sequence would wrap around.

                About making hash/id the same, I've just checked that CPython does indeed use the id() value as the value returned by the default hash() implementation.

                You could just do the same, and use the id value as the "master" one. For hash() you would just call id(). This allows you to use just one value attached to the objects for both functions.

                The cost of that approach would be having to assign an id immediately (having to put it into the temporary map, then having to look it up in the map until the next time the object is moved) for the call to hash() (with no call to id()) too.

                The good thing compared to the weak key dict, is that the temporary map doesn't need to be garbage collected at all. The entries are removed when objects are moved (or collected).

                +
                +
                +
                +
                + + Shahms wrote on 2009-10-16 20:44: +
                +
                +

                Carl, no doubt you're right. I know that I can probably count the number of times I've needed to use id() on one hand and I'm pretty sure the vast majority of those cases was sticking an unhashable object in a dict.

                +
                +
                +
                +
                + + Skandalfo wrote on 2009-10-16 20:53: +
                +
                +

                Carl, Shahms: I couldn't agree more about id() not being important.

                Probably Guido should have refrained from making it available in CPython at the time. I suppose it was just easy to add it to the language with the memory allocation model of CPython. The fact is that I don't really see any use for id() once you have the "is" operator and the hash() method...

                +
                +
                +
                +
                + + Michael Hudson-Doyle wrote on 2009-10-16 22:19: +
                +
                +

                Yay, I remember talking about removing the gc type pointer, oh, about 3.5 years ago :) Cool that it got done, sounds like a neat pair of hacks.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-10-17 01:17: +
                +
                +

                @Lucian:

                ctypes and JIT works just fine together.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-17 09:57: +
                +
                +

                Doesn't deepcopy use id() a lot? I remember once using deepcopy on a complicated structure, resulting in thousands of id() calls.

                +
                +
                +
                +
                + + RonnyPfannschmidt wrote on 2009-10-17 10:08: +
                +
                +

                what about pickle - as far as i remember its memo code for dealing with object cycles is using id, too

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-10-17 16:32: +
                +
                +

                Too bad for the current implementation of pickle and deepcopy. The fault in that case is CPython's general view that id() is cheap, despite repeated attempts to convince them otherwise. These attempts have been done notably by guys from Jython, even before PyPy time (indeed, id() is a mess for any implementation apart from CPython's simple non-moving GC).

                A suitable replacement would be e.g. a 'collections.identitydict' type, if someone feels like going Yet Another Time to python-dev with this issue.

                +
                +
                +
                +
                + + Marius Gedminas wrote on 2009-10-17 22:20: +
                +
                +

                When I was writing objgraph I saw no way of traversing arbitrary object graphs without using id().

                collections.identitydict sounds like a nice idea. Has anyone written a PEP for it?

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-18 09:14: +
                +
                +

                Is there any possibility to translate pypy under OSX 10.6 as 32bit? Translation works but I get an "ValueError: bad marshal data" when running pypy-c. I assume that is due to the fact that I got a 64bit binary.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-10-18 18:49: +
                +
                +

                @Anonymous:

                Try deleting all your .pyc files and see what happens.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-10-19 10:30: +
                +
                +

                Marius: as I said, feel free to :-) but the current situation is, no, not as far as I know.

                +
                +
                +
                +
                + + klaussfreire wrote on 2009-10-19 16:38: +
                +
                +

                Wouldn't it free up the GC from all that burden if only a set of live ids were kept? (ie: no weak dict)

                So, when you get an id() call, you check the object to see if there's a cached id (much like the hash hack) - if not, you generate a random (or sequential) unused id and store it both in the "live ids" set and in the object's structure, as done with hash values.

                So, successive calls to id() would be as fast as in CPython, and garbage collection would be fast too (only an extra set deletion per object whose id was obtained).

                In fact, this set could be implemented as a bit array with "free lists", which could be very very efficient, given that its size will be bound by the number of live objects.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-10-21 08:11: +
                +
                +

                Claudio: this doesn't work (unless I misunderstood). You cannot add a field like the hash field at any point in time, but only during collections when the object moves.

                +
                +
                +
                +
                + + klaussfreire wrote on 2009-10-21 13:34: +
                +
                +

                Yes, I've been thinking about that too.

                But that can be patched - the weak key dict could still be used for those objects that haven't been collected yet. Since addition of the id would most likely happen in the nursery, or the first generation at most (big assumption), I don't think the dict would grow very big even under heavy id() usage.

                +
                +
                +
                +
                + + omul cu 6233 wrote on 2009-11-02 21:51: +
                +
                +

                Wohoo, nice performance

                +
                +
                +
                +
                + + Unknown wrote on 2010-04-14 15:14: +
                +
                +

                I'm astonished a bit by your need to pack vtables together within 256KB. How many bits do you need for mark-and-sweep marking or similar stuff? The usual solution I've seen for this is to use the low two bits of the vtable pointer for flags, usually, and mask them off when reading the vtable pointer. Would it work here?

                If that isn't enough, then you have to pack vtables together as you do (maybe in a bigger space if you can use more bits).

                +
                +
                +
                +
                + + PJE wrote on 2010-09-22 18:22: +
                +
                +

                I can think of one place where I use a lot of id() calls, and that's in PEAK-Rules' generic function implementation, for indexing "is" tests.

                For example, if you have a bunch of methods that test if "x is Something" (for different values of Something), then a dictionary of id()'s is used to identify which of these tests went off. While the total number of Somethings isn't likely to be high, the weakref dict in PyPy means that every 'x' the function is called with will end up burning memory and speed to hold an id forever.

                While it's perhaps the case that I could avoid this by using a linear search (ugh) in cases where the number of Somethings is small, it's an example of a place where id() makes an operation neat, fast, and simple in regular Python.

                Of course, if there were another way to put arbitrary (i.e possibly-unhashable, comparable only by identity) objects in a dictionary, and then determine whether a given object was one of them, that'd certainly be a suitable substitute.

                Or, if PyPy offered a temp_id() that would simply let you *check* identity, without forcing the object to hold onto it, that'd work fine too. Say, if there was a has_id() that told you if an id() is outstanding for the object already, or a get_id() that returned None for an object whose id() had never been taken.

                With an API like that, I could prevent memory/speed blowup by not having each call of the function adding more objects to PyPy's id() dict.

                (Heck, perhaps such an API should be added across Python versions, i.e., to CPython and Jython as well.)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-09-22 18:30: +
                +
                +

                @PJE PyPy offers collections.identity_dict, or something similar which would have the effect how you like (but internally doesn't use id operation, just the object identity).

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-07 02:34: +
                +
                +

                This program in C# takes 589 milliseconds, and 52 MB RAM. 17x faster, 4.75x less RAM.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-09-15 10:37: +
                +
                +

                And in assembly it will be even faster and smaller.

                Python has many lovely attributes, but efficiency is not its primary virtue. That said, making it more efficient is still a plus, which this work is doing

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.html b/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.html new file mode 100644 index 000000000..e0e66da2a --- /dev/null +++ b/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.html @@ -0,0 +1,574 @@ + + + + + +PyPy's JIT now supports floats | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy's JIT now supports floats

                + + + +
                +

                +Hello. +

                + +

                +We've just merged a branch which adds float support to the x86 backend. +This means that floating point operations are now super fast +in PyPy's JIT. Let's have a look at an example, provided by +Alex Gaynor +and stolen from the Factor blog. +

                + +

                +The original version of the benchmark was definitely tuned for the performance needs of CPython. +

                +

                +For running this on PyPy, I changed to a bit simpler version of the program, +and I'll explain a few changes that I did, which reflect the current +limitations of PyPy's JIT. They're not very deep and they might be +already gone while you're reading it: +

                + +
                  +
                • Usage of __slots__. This is a bit ridiculous, but we spend quite a bit + of time to speed up normal instances of new-style classes which are + very fast, yet ones with __slots__ are slower. To be fixed soon.
                • + +
                • Usage of reduce. This one is even more obscure, but reduce is not + perceived as a thing producing loops in a program. Moving to + a pure-Python version of reduce fixes the problem.
                • + +
                • Using x ** 2 vs x * x. In PyPy, reading a local variable is a + no-op when JITted (the same as reading a local variable in C). However + multiplication is a simpler operation than the power operation.
                • +
                +

                +I also included the original Java benchmark. Please +note that the original Java version is similar to my modified one +(not the one specifically tuned for CPython) +

                + +The performance figures below (for n = 1 000 000), average of 10 runs: + +
                  +
                • CPython 2.6: 7.56s +
                • +
                • CPython & psyco 2.6: 4.44s +
                • +
                • PyPy: 1.63s +
                • +
                • Java (JVM 1.6, client mode): 0.77s +
                • +
                +

                +and while JVM is much faster, it's very good that we can even compare :-) +

                + +Cheers
                +fijal +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-10-06 17:26: +
                +
                +

                So it's much faster than Psyco and only about 2x slower than the JVM. That's impressive, as Python is much more dynamic!

                Congrats and thanks for the regular updates, it's much appreciated.

                +
                +
                +
                +
                + + Luis wrote on 2009-10-06 17:31: +
                +
                +

                Very exciting!
                By the way, this result doesn't include the time to generate assembler. Right?

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-06 17:37: +
                +
                +

                Great, you guys are heroes!

                Btw, what's the next big hurdle to run real-world programs? Memory use? Threads?

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-06 17:47: +
                +
                +

                Great job! I really appreciate your work.

                @Luis: I think, it does include the assembler. I just compiled trunk and ran the modified benchmark on python 2.6 and pypy-c-jit. Best time of 10 runs:
                Python 2.6.2: 0.911113977432
                Pypy: 0.153664112091
                So it's nearly 6x faster for me (including the time for generating the assembler, of course) - even much better than on the posted numbers...I don't know, if cpython was run with the unmodified version of the benchmark though.

                +
                +
                +
                +
                + + William wrote on 2009-10-06 19:36: +
                +
                +

                I'd be interested to see the results for a much longer run (n = 10 000 000?).

                +
                +
                +
                +
                + + Panos Laganakos wrote on 2009-10-06 19:55: +
                +
                +

                Wicked! Keep the sweetness coming :)

                +
                +
                +
                +
                + + Unknown wrote on 2009-10-07 03:15: +
                +
                +

                Very exciting. Thanks! These are nearing "holy crap" numbers.

                <mindControl>siiiixty foooouuur biiiiit<mindControl>

                :-)

                +
                +
                +
                +
                + + René Dudfield wrote on 2009-10-07 11:35: +
                +
                +

                awesome! things are really starting to move along now :)

                I tried the same little benchmark with the shedskin python to C++ compiler for comparison:

                cpython2.5: 16.2770409584
                cpython2.6: 12.2321541309
                shedskin: 0.316256999969

                Shedskin is 38.6 times faster than cpython2.6, and 51.4 times faster than cpython2.5... and to extrapolate from your numbers 3.9 times faster than the jvm.

                Of course that doesn't include the time it takes to generate the C++ and then compile it with g++ (using the old 4.0.1 g++, not the latest 4.4). I also didn't include the python interpreter startup cost.

                btw, I found map, reduce and filter all to be faster with pure python versions when using psyco too.

                cu!

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-10-07 13:42: +
                +
                +

                @illume

                that's a bit unfair comparison, since shedskin is not python. you can compare RPython and shedskin though. RPython is sometimes faster than C even...

                And also, yes, in PyPy or psyco time we include compilation time.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Luis wrote on 2009-10-07 14:34: +
                +
                +

                I'm still confused.. if you post the average of 10 runs, and assembler is generated only in the first run, then this time is diluted. Shouldn't you compute the average of 10 runs, but excluding the first one? (that means, running it 11 times and ignoring the first one?).

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-07 18:31: +
                +
                +

                @Luis: no, I think fijal started the pypy-c interpreter 10 times, and each time it generates assembly (it's not cached afaik).

                +
                +
                +
                +
                + + Luis wrote on 2009-10-07 19:28: +
                +
                +

                Well, no matter how they measure it, this is definitely within the "Holy Crap" range...

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-10-07 20:37: +
                +
                +

                @Luis:

                Maybe I should... I really run this 10 times while assembler was generated only during the first time. But also diluting assembler generation time over runs is kind of real-life effect...

                +
                +
                +
                +
                + + Baczek wrote on 2009-10-08 16:06: +
                +
                +

                how about including unladen swallow results?

                +
                +
                +
                +
                + + Michael Allman wrote on 2009-10-08 18:26: +
                +
                +

                How come the pypy JIT is compiled AOT to C? I thought the idea of PyPy was to implement a python runtime in python? Why not run the JIT on a python runtime?

                Awesome work. I wish the Ruby folk were as motivated...

                Cheers.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-08 18:32: +
                +
                +

                I seem to recall grumblings from C++ programmers a few years ago when Java started supporting multi-core architecture, which made Java execution as fast or faster than C++ with much less development effort (for free with the Java interpreter vs hand-written C++ support).

                If your testing machine is a multi-core/processor machine, it might be appropriate to say that PyPy is now as fast as C++ (without explicit multi-core support). Wow!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-10-09 11:39: +
                +
                +

                Michael: because our goal is to have a general framework, not a Python-centered solution. For example, the JIT generator works mostly out of the box with any other language that we implemented in RPython (which includes Smalltalk).

                +
                +
                +
                +
                + + hihhu wrote on 2009-10-09 18:06: +
                +
                +

                Great work!

                How large an effort would it be to have eg. Perl or Ruby working with this? Just out of curiosity, I'm trying to understand this project better.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-10-09 20:23: +
                +
                +

                In the correct original version of the benchmark there are two calls to sin(). A good compiler optimizes one of them away. A worse compiler doesn't. So it's more fair to put back the second sin in the Python code too.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-10-11 19:37: +
                +
                +

                @hihu:

                It would be a bit easier than writing the interpreter in C, since RPython is much nicer. Also, you get JIT for almost free and decent GC for free. On the other hand, writing interpreters is quite a bit of work on its own.

                @Anonymous:

                Indeed, well, spotted, it would be more fair. However, there is no measurable difference (at least in pypy running time).

                PS. We have weekends, too.

                Cheers,
                fijal

                +
                +
                +
                +
                + + della wrote on 2009-10-12 08:48: +
                +
                +

                Would a Pypy implementation of Perl/Ruby/PHP mean that it would be possible to use libraries developed in one language for the other one? That would be very cool indeed.

                And, for that matter, would that mean interoperability between python2 and python3 modules when the py3 interpreter will be done? :)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-10-12 15:33: +
                +
                +

                @della.

                In general, that would not be that simple. You need to somehow map data types between interpreters in an unclear manner. For example, what would happen if you call Python2.x function passing argument that is py3k dict (which has different interface)?

                Cheers,
                fijal

                +
                +
                +
                +
                + + della wrote on 2009-10-13 09:34: +
                +
                +

                One would imagine having different interfaces for the same objects when accessed from 2.x and 3.x code. Would that be difficult?

                Of course, I understand mapping data structures between languages that have many more differences between them than py2 and py3 would definitely be more complex.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-11-02 18:08: +
                +
                +

                Not to rain on the parade, but Java's trig functions are very slow outside of -pi/2,pi/2 range to correct terrible fsin/fcos results on Intel x86.

                See https://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4857011

                Your benchmark should include something to measure the error, or not use trig functions as a benchmark when comparing to Java.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/11/dusseldorf-sprint-report-2505348213879053352.html b/posts/2009/11/dusseldorf-sprint-report-2505348213879053352.html new file mode 100644 index 000000000..38ae7854b --- /dev/null +++ b/posts/2009/11/dusseldorf-sprint-report-2505348213879053352.html @@ -0,0 +1,425 @@ + + + + + +Düsseldorf Sprint Report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Düsseldorf Sprint Report

                + + + +
                +

                While the Düsseldorf sprint is dwindling off, we put our minds to the task of retelling +our accomplishments. The sprint was mostly about improving the JIT and we +managed to stick to that task (as much as we managed to stick to anything). The +sprint was mostly filled with doing many small things.

                +
                +

                Inlining

                +

                Carl Friedrich and Samuele started the sprint trying to tame the JIT's inlining. +Until now, the JIT would try to inline everything in a loop (except other loops) +which is what most tracing JITs actually do. This works great if the resulting +trace is of reasonable length, but if not it would result in excessive memory +consumption and code cache problems in the CPU. So far we just had a limit on +the trace size, and we would abort tracing when the limit was reached. This +would happen again and again for the same loop, which is not useful at all. The +new approach introduced is to be more clever when tracing is aborted by marking +the function with the largest contribution to the trace size as non-inlinable. The +next time this loop is traced, it usually then gives a reasonably sized trace.

                +

                This gives a problem because now some functions that don't contain loops are not +inlined, which means they never get assembler code for them generated. To remedy +this problem we also make it possible to trace functions from their start (as +opposed to just tracing loops). We do that only for functions that can not be +inlined (either because they contain loops or they were marked as +non-inlinable as described above).

                +

                The result of this is that the Python version telco decimal benchmark runs +to completion without having to arbitrarily increase the trace length limit. +It's also about 40% faster than running it on CPython. This is one of the first +non-tiny programs that we speed up.

                +
                +
                +

                Reducing GC Pressure

                +

                Armin and Anto used some GC instrumentation to find places in pypy-c-jit +that allocate a lot of memory. This is an endlessly surprising exercise, as +usually we don't care too much about allocations of short-lived objects when +writing RPython, as our GCs usually deal well with those. They found a few +places where they could remove allocations, most importantly by making one of +the classes that make up traces smaller.

                +
                +
                +

                Optimizing Chains of Guards

                +

                Carl Friedrich and Samuele started a simple optimization on the trace level that +removes superfluous guards. A common pattern in a trace is to have stronger +and stronger guards about the same object. As an example, often there is first a +guard that an object is not None, later followed by a guard that it is exactly +of a given class and then even later that it is a precise instance of that +class. This is inefficient, as we can just check the most precise thing in the +place of the first guard, saving us guards (which take memory, as they need resume data). +Maciek, Armin and Anto later improved on that by introducing a new guard that +checks for non-nullity and a specific class in one guard, which allows us to +collapse more chains.

                +
                +
                +

                Improving JIT and Exceptions

                +

                Armin and Maciek went on a multi-day quest to make the JIT and Python-level +exceptions like each other more. So far, raising and catching exceptions would +make the JIT generate code that has a certain amusement value, but is not really +fast in any way. To improve the situation, they had to dig into the exception +support in the Python interpreter, where they found various inefficiencies. They +also had to rewrite the exceptions module to be in RPython (as opposed to +just pure Python + an old hack). Another problem is that tracebacks give you +access to interpreter frames. This forces the JIT to deoptimize things, as +the JIT keeps some of the frame's content in CPU registers or on the CPU stack, +which reflective access to frames prevents. +Currently we try to improve the simple cases where the traceback is never +actually accessed. This work is not completely finished, but some cases are +already significantly faster.

                +
                +
                +

                Moving PyPy to use py.test 1.1

                +

                Holger worked on porting PyPy to use the newly released py.test 1.1. PyPy +still uses some very old support code in its testing infrastructure, which makes +this task a bit annoying. He also gave the other PyPy developers a demo of some +of the newer py.test features and we discussed which of them we want to start +using to improve our tests to make them shorter and clearer. One of the things +we want to do eventually is to have fewer skipped tests than now.

                +
                +
                +

                Using a Simple Effect Analysis for the JIT

                +

                One of the optimizations the JIT does is caching fields that are read out of +structures on the heap. This cache needs to be invalidated at some points, for +example when such a field is written to (as we don't track aliasing much). +Another case is a call in the assembler, as the target function could +arbitrarily change the heap. This of course is imprecise, since most functions +don't actually change the whole heap, and we have an analysis that finds out +which sorts of types of structs and arrays a function can mutate. During the +sprint Carl Friedrich and Samuele integrated this analysis with the JIT, to help +it invalidate caches less aggressively. Later Anto and Carl Friedrich also +ported this support to the CLI version of the JIT.

                +
                +
                +

                Miscellaneous

                +

                Samuele (with some assistance of Carl Friedrich) set up a buildbot slave on a +Mac Mini at the University. This should let us stabilize on Mac OS X. So far +we still have a number of failing tests, but now we are in a situation to +sanely approach fixing them.

                +

                Anto improved the CLI backend to support the infrastructure for producing the +profiling graphs Armin introduced.

                +

                The guinea-pigs that were put into Carl Friedrich's care have been fed (which +was the most important sprint task anyway).

                +

                Samuele & Carl Friedrich

                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-11-13 17:57: +
                +
                +

                Great news and a nice read. Out of curiosity, did you also improve performance for the richards or pystone benchmarks?

                +
                +
                +
                +
                + + hubert wrote on 2009-11-14 05:05: +
                +
                +

                this is a very fascinating project and i enjoy reading the blog even if i am not really a computer scientist and don't have a very deep understanding of many details. :)

                something i always wonder about... wouldn't it be possible to use genetic algorithms in compiler technology? like a python to machine code compiler that evolves to the fastest solution by itself? or is there still not enough computing power for something like that?

                +
                +
                +
                +
                + + pollo wrote on 2009-11-14 11:18: +
                +
                +

                Very interesting. Thanks for all your work!

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-11-14 14:57: +
                +
                +

                @Anonymous: Richards and Pystone become less and less important as benchmarks, we are trying to look into more application-like larger things now.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/11/dusseldorf-sprint-started-7608527610228870250.html b/posts/2009/11/dusseldorf-sprint-started-7608527610228870250.html new file mode 100644 index 000000000..9e40c6d85 --- /dev/null +++ b/posts/2009/11/dusseldorf-sprint-started-7608527610228870250.html @@ -0,0 +1,325 @@ + + + + + +Düsseldorf Sprint Started | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Düsseldorf Sprint Started

                + + + +
                +

                The Düsseldorf sprint starts today. Only Samuele and I are here so far, but that should change over the course of the day. We will mostly work on the JIT during this sprint, trying to make it a lot more practical. For that we need to decrease its memory requirements some more and to make it use less aggressive inlining. We will post more as the sprint progresses.

                +
                +

                Comments

                +
                +
                +
                + + kataton wrote on 2009-11-10 07:39: +
                +
                +

                Looking forward to amazing new developments...

                +
                +
                +
                +
                + + Luis wrote on 2009-11-12 12:40: +
                +
                +

                Are you planning a new release anytime soon? (hopefully with JIT?)

                +
                +
                +
                +
                + + Anonymous wrote on 2009-11-13 13:48: +
                +
                +

                A release is planned for the February-March timeframe.

                /Jacob Hallén

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-11-17 10:08: +
                +
                +

                Actually, I would plan the release for the end of the next sprint, which should be in January.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/11/hi-all-this-week-i-worked-on-improving-6515977421244851229.html b/posts/2009/11/hi-all-this-week-i-worked-on-improving-6515977421244851229.html new file mode 100644 index 000000000..07c7a7367 --- /dev/null +++ b/posts/2009/11/hi-all-this-week-i-worked-on-improving-6515977421244851229.html @@ -0,0 +1,334 @@ + + + + + +Logging and nice graphs | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Logging and nice graphs

                + + + +
                +

                Hi all,

                + +

                This week I worked on improving the system we use for logging. Well, it was not really a "system" but rather a pile of hacks to measure in custom ways timings and counts and display them. So now, we have a system :-)

                + +

                The system in question was integrated in the code for the GC and the JIT, which are two independent components as far as the source is concerned. However, we can now display a unified view. Here is for example pypy-c-jit running pystone for (only) 5000 iterations:

                + + + +

                The top long bar represents time. The bottom shows two summaries of the total time taken by the various components, and also plays the role of a legend to understand the colors at the top. Shades of red are the GC, shades of green are the JIT.

                + +

                Here is another picture, this time on pypy-c-jit running 10 iterations of richards:

                + + + +

                We have to look more closely at various examples, but a few things immediately show up. One thing is that the GC is put under large pressure by the jit-tracing, jit-optimize and (to a lesser extent) the jit-backend components. So large in fact that the GC takes at least 60-70% of the time there. We will have to do something about it at some point. The other thing is that on richards (and it's likely generally the case), the jit-blackhole component takes a lot of time. "Blackholing" is the operation of recovering from a guard failure in the generated assembler, and falling back to the interpreter. So this is also something we will need to improve.

                + +

                That's it! The images were generated with the following commands:

                + +
                PYPYLOG=/tmp/log pypy-c-jit richards.py
                +python pypy/tool/logparser.py draw-time /tmp/log --mainwidth=8000 --output=filename.png
                + +EDIT: nowadays the command-line has changed to:
                python rpython/tool/logparser.py draw-time /tmp/log --mainwidth=8000 filename.png
                +
                +

                Comments

                +
                +
                +
                + + pollo wrote on 2009-11-02 01:09: +
                +
                +

                Nice work.
                I think you'll cause a revolution when this project delivers its goals, opening python (and other dynamic languages) to a much wider range of uses.

                +
                +
                +
                +
                + + René Dudfield wrote on 2009-11-02 07:56: +
                +
                +

                ooh, pretty graphs :) It's been very good to follow pypy progress through the blog.

                Can the gc/jit be made to take up a maximum amount of time, or be an incremental process? This is important for things requiring real time - like games, audio, multimedia, robots, ninjas etc.

                A note, that some other languages do gc/jit in other threads. But I imagine, pypy is concentrating on single threaded performance at the moment.

                I'm sure you're aware of both those things already, but I'm interested to see what the pypy approach to them is?

                cu,

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/11/pypy-on-rupy-2009-5675275348619189353.html b/posts/2009/11/pypy-on-rupy-2009-5675275348619189353.html new file mode 100644 index 000000000..bd13f4424 --- /dev/null +++ b/posts/2009/11/pypy-on-rupy-2009-5675275348619189353.html @@ -0,0 +1,324 @@ + + + + + +PyPy on RuPy 2009 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy on RuPy 2009

                + + + +
                +

                Hello. +

                +

                +It's maybe a bit late to announce, but there will be a PyPy talk +at the RuPy conference this weekend in +Poznan. Precisely, I'll be talking mostly about PyPy's JIT and +how to use it. Unfortunately the talk is on Saturday, at 8:30 in the morning. +

                +

                +EDIT: Talk is online, together with examples +

                +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + + ulrik wrote on 2009-11-03 19:12: +
                +
                +

                I, and many interested with me, appreciate links to slides, videos or transcripts of the talk once it has been held. PyPy is exciting! Good luck in Poznan.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-11-03 21:11: +
                +
                +

                All materials for pypy talks are available in talk directory.

                Cheers,
                fijal

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/11/some-benchmarking-9211261260383281459.html b/posts/2009/11/some-benchmarking-9211261260383281459.html new file mode 100644 index 000000000..4314c95d9 --- /dev/null +++ b/posts/2009/11/some-benchmarking-9211261260383281459.html @@ -0,0 +1,623 @@ + + + + + +Some benchmarking | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Some benchmarking

                + + + +
                +

                Hello. +

                +

                +Recently, thanks to the surprisingly helpful Unhelpful, also known as Andrew Mahone, +we have a decent, if slightly arbitrary, set of performance graphs. +It contains a couple of benchmarks already +seen on this blog as well as some taken from The Great Computer +Language Benchmarks Game. These benchmarks don't even try to represent "real applications" +as they're mostly small algorithmic benchmarks. Interpreters used: +

                +
                  +
                1. +PyPy trunk, revision 69331 with --translation-backendopt-storesink, which is +now on by default +
                2. +
                3. +Unladen swallow trunk, r900 +
                4. +
                5. CPython 2.6.2 release
                6. +
                +

                +Here are the graphs; the benchmarks and the runner script are available +

                + + + +And zoomed in for all benchmarks except binary-trees and fannkuch. + + +

                +As we can see, PyPy is generally somewhere between the same speed +as CPython to 50x faster (f1int). The places where we're the same +speed as CPython are places where we know we have problems - for example generators are +not sped up by the JIT and they require some work (although not as much by far +as generators & Psyco :-). The glaring inefficiency is in the regex-dna benchmark. +This one clearly demonstrates that our regular expression engine is really, +really, bad and urgently requires attention. +

                +

                +The cool thing here is, that although these benchmarks might not represent +typical python applications, they're not uninteresting. They show +that algorithmic code does not need to be far slower in Python than in C, +so using PyPy one need not worry about algorithmic code being dramatically +slow. As many readers would agree, that kills yet another usage of C in our +lives :-) +

                +Cheers,
                +fijal +
                +

                Comments

                +
                +
                +
                + + Luis wrote on 2009-11-18 22:09: +
                +
                +

                Wow! This is getting really interesting. Congratulations!
                By the way, it would be great if you include psyco in future graphs, so speed junkies can have a clearer picture of pypy's progress.

                +
                +
                +
                +
                + + Eric Florenzano wrote on 2009-11-18 22:14: +
                +
                +

                Very interesting, congratulations on all the recent progress! It would be very interesting to see how PyPy stacks up against Unladen Swallow on Unladen Swallow's own performance benchmark tests, which do include a bit more real-world scenarios.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-11-18 22:31: +
                +
                +

                @Eric: yes, definitely, we're approaching that set of benchmarks

                @Luis: yes, definitely, will try to update tomorrow, sorry.

                +
                +
                +
                +
                + + Paddy3118 wrote on 2009-11-19 04:06: +
                +
                +

                It's good, but...

                We are still in the realms of micro-benchmarks. It would be good to compare their performances when working on something larger. Django or Zope maybe?

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2009-11-19 07:52: +
                +
                +

                These last months, you seem to have had almost exponential progress. I guess all those years of research are finally paying off. Congratulations!

                Also, another graph for memory pressure would be nice to have. Unladen Swallow is (was?) not very good in that area, and I wonder how PyPy compares.

                [nitpick warning]
                As a general rule, when mentioning trunk revisions, it's nice to also mention a date so that people know the test was fair. People assume it's from the day you did the tests, and confirming that would be nice.
                [/nitpick warning]

                +
                +
                +
                +
                + + Antoine wrote on 2009-11-19 09:45: +
                +
                +

                How about benchmarking against CPython trunk as well?

                cheers

                Antoine.

                +
                +
                +
                +
                + + Tony Landis wrote on 2009-11-19 16:02: +
                +
                +

                What about memory consumption? That is almost as important to me as speed.

                +
                +
                +
                +
                + + wilk wrote on 2009-11-19 16:04: +
                +
                +

                Congratulations !

                Please could you remind us how to build and test pypy-jit?

                +
                +
                +
                +
                + + Anonymous wrote on 2009-11-19 23:38: +
                +
                +

                I'm curious why mandelbrot is much less accelerated than, say, nbody. Does PyPy not JIT complex numbers properly yet?

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2009-11-20 03:03: +
                +
                +

                @wilk ./translate.py -Ojit targetpypystandalone.py

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2009-11-20 03:11: +
                +
                +

                @Anon Our array module is in pure Python and much less optimized than CPython's.

                +
                +
                +
                +
                + + Leo wrote on 2009-11-20 07:11: +
                +
                +

                How long until I can do

                pypy-c-jit translate.py -Ojit targetpypystandalone.py

                ?

                So far, when I try, I get

                NameError: global name 'W_NoneObject' is not defined
                https://paste.pocoo.org/show/151829/

                +
                +
                +
                +
                + + holger krekel wrote on 2009-11-20 07:37: +
                +
                +

                AFAIU it's not PyPy's regex engine being "bad" but rather the fact that the JIT generator cannot consider and optimize the loop in the regex engine, as it is a nested loop (the outer one being the bytecode interpretation one).

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-11-20 10:41: +
                +
                +

                @holger: yes, that explains why regexps are not faster in PyPy, but not why they are 5x or 10x slower. Of course our regexp engine is terribly bad. We should have at least a performance similar to CPython.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-11-20 15:35: +
                +
                +

                Benjamin, is it really an issue with array? The inner loop just does complex arithmetic. --Anon

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2009-11-20 22:41: +
                +
                +

                @Anon I'm only guessing. Our math is awfully fast.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2009-11-20 23:54: +
                +
                +

                @Anon, @Benjamin
                I've just noticed that W_ComplexObject in objspace/std/complexobject.py is not marked as _immutable_=True (as it is e.g. W_IntObject), so it is totally possible that the JIT is not able to optimize math with complexes as it does with ints and floats. We should look into it, it is probably easy to discover

                +
                +
                +
                +
                + + vak wrote on 2009-11-20 23:58: +
                +
                +

                guys, sorry, who cares about *seconds*??

                why didn't you normalize to the test winners? :)

                +
                +
                +
                +
                + + Leo wrote on 2009-11-21 09:06: +
                +
                +

                So, um, has anyone managed to get JIT-ed pypy to compile itself?

                When I tried to do this today, I got this:

                https://paste.pocoo.org/show/151829/

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-11-21 11:26: +
                +
                +

                @Leo:

                yes, we know that bug. Armin is fixing it right now on faster-raise branch.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2009-11-21 17:47: +
                +
                +

                antonio: good point. On the second thought, though, it's not a *really* good point because we don't have _immutable_=True on floats either...

                +
                +
                +
                +
                + + Leo wrote on 2009-11-21 19:35: +
                +
                +

                @Maciej Great! It'll be awesome to have a (hopefully much faster??) JITted build ... it currently takes my computer more than an hour ...

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2009-11-22 01:45: +
                +
                +

                @Leo it's likely to take tons of memory, though.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-11-22 10:13: +
                +
                +

                Would perhaps also be nice to compare the performance with one of the current Javascript-Engines(V8, SquirrelFish etc.)

                +
                +
                +
                +
                + + Tom Clarke wrote on 2009-11-22 12:08: +
                +
                +

                Nice comparisons - and micro-performance looking good. Congratulations.

                HOWEVER - there is no value in having three columns for each benchmark. The overall time is arbitrary, all that matters is relative so you might as well normalise all graphs to CPython = 1.0, for example. The relevant information is then easier to see!

                +
                +
                +
                +
                + + Unknown wrote on 2009-11-23 19:24: +
                +
                +

                it's called "The Computer Language
                Benchmarks Game" these days...

                +
                +
                +
                +
                + + Luis wrote on 2009-11-23 21:10: +
                +
                +

                Tom is right, normalizing the graphs to cpython = 1.0 would make them much more readable.
                Anyway, this is a very good Job from Unhelpful.
                Thanks!

                +
                +
                +
                +
                + + Anonymous wrote on 2009-11-27 13:54: +
                +
                +

                Do any of those benchmarks work with shedskin?

                +
                +
                +
                +
                + + ¬¬ wrote on 2009-11-30 07:26: +
                +
                +

                glad to see someone did something with my language shootout benchmark comment ;)

                +
                +
                +
                +
                + + Anonymous wrote on 2009-12-01 19:07: +
                +
                +

                I checked https://www.looking-glass.us/~chshrcat/python-benchmarks/results.txt but it doesn't have the data for unladen swallow. Where are the numbers?

                +
                +
                +
                +
                + + Term Paper wrote on 2010-02-18 07:05: +
                +
                +

                I'm curious why mandelbrot is much less accelerated than, say, nbody. Does PyPy not JIT complex numbers properly yet?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html b/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html new file mode 100644 index 000000000..7f567b736 --- /dev/null +++ b/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html @@ -0,0 +1,438 @@ + + + + + +Using CPython extension modules with PyPy, or: PyQt on PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Using CPython extension modules with PyPy, or: PyQt on PyPy

                + + + +
                +
                + +

                If you have ever wanted to use CPython extension modules on PyPy, +we want to announce that there is a solution that should be compatible +with quite a few of the available modules. It is neither new nor written +by us, but nevertheless works great with PyPy.

                +

                The trick is to use RPyC, a transparent, symmetric remote procedure +call library written in Python. The idea is to start a +CPython process that hosts the PyQt libraries +and connect to it via TCP to send RPC commands to it.

                +

                I tried to run PyQt applications +using it on PyPy and could get quite a bit of the functionality of these +working. Remaining problems include regular segfaults of CPython +because of PyQt-induced memory corruption and bugs because classes +like StandardButtons behave incorrectly when it comes to arithmetical operations.

                +

                Changes to RPyC needed to be done to support remote unbound __init__ methods, +shallow call by value for list and dict types (PyQt4 methods want real lists and dicts +as parameters), and callbacks to methods (all remote method objects are wrapped into +small lambda functions to ease the call for PyQt4).

                +

                If you want to try RPyC to run the PyQt application of your choice, you just +need to follow these steps. Please report your experience here in the blog +comments or on our mailing list.

                +
                +
                  +
                1. Download RPyC from the RPyC download page.
                2. +
                3. Download this patch and apply it to RPyC by running +patch -p1 < rpyc-3.0.7-pyqt4-compat.patch in the RPyC directory.
                4. +
                5. Install RPyC by running python setup.py install as root.
                6. +
                7. Run the file rpyc/servers/classic_server.py using CPython.
                8. +
                9. Execute your PyQt application on PyPy.
                10. +
                +
                +

                PyPy will automatically connect to CPython and use its PyQt libraries.

                +

                Note that this scheme works with nearly every extension library. Look +at pypy/lib/sip.py on how to add new libraries (you need to create +such a file for every proxied extension module).

                +

                Have fun with PyQt

                +

                Alexander Schremmer

                +
                +
                +

                Comments

                +
                +
                +
                + + intgr wrote on 2009-11-30 13:03: +
                +
                +

                OT: you should separate labels by commas, so that Blogspot recognizes them as distinct labels.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2009-11-30 13:08: +
                +
                +

                intgr: Thanks, done.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-11-30 19:38: +
                +
                + "regular segfaults of CPython because of PyQt-induced memory corruption and bugs because classes like StandardButtons behave incorrectly when it comes to arithmetical operations."

                These sound interesting. Could you please elaborate? A link would suffice, if these are already documented by non-pypy people. Thanks! +
                +
                +
                +
                + + holger krekel wrote on 2009-12-01 09:17: +
                +
                +

                cool stuff, alexander! Generic access to all CPython-provided extension could remove an importing blocker for PyPy usage, allows incremental migrations.

                Besides, I wonder if having two processes, one for application and one for bindings can have benefits to stability.

                +
                +
                +
                +
                + + Alexander Schremmer wrote on 2009-12-01 10:28: +
                +
                +

                Dear anonymous,

                the StandardButtons bug was already communicated to a Nokia employee.
                If you are interested in the segfaults, contact me and I give you the source code that I used for testing.

                +
                +
                +
                +
                + + Zemantic dreams wrote on 2009-12-03 06:33: +
                +
                +

                This is an important step forward!

                There are probably two reasons why people use extensions: bindings to libraries and performance.

                Unfortunately this specific approach does not address performance. Is there anything on horizon that would allow near-CPython API for extensions. So modules would just need to be recompiled against PyPy bindings for CPython API? Probably not 100% compatible, but close?

                Any chances of that happening?

                Andraz Tori, Zemanta

                +
                +
                +
                +
                + + Alexander Schremmer wrote on 2009-12-03 08:51: +
                +
                + Any chances of that happening?

                In theory, this is possible, but a lot of work. Nobody has stepped up to implement it, yet. +
                +
                +
                +
                + + Unhelpful wrote on 2009-12-04 07:08: +
                +
                +

                Isn't the exposure of refcounts in the CPython C API going to be a bit of a problem for implementing the API on pypy? perhaps a "fake" refcount could be associated with an object when it is first passed to an extension? This could still be problematic if the extension code expects to usefully manipulate the refcount, or to learn anything by examining it...

                +
                +
                +
                +
                + + Alexander Schremmer wrote on 2009-12-04 10:10: +
                +
                + Isn't the exposure of refcounts in the CPython C API going to be a bit of a problem for implementing the API on pypy?

                Indeed, it would be part of the task to introduce support in the GCs for such refcounted objects. Note that real refcounting is necessary because the object could be stored in an C array, invisible to the GC. +
                +
                +
                +
                + + Unhelpful wrote on 2009-12-04 10:32: +
                +
                +

                I'm trying to think of ways around that, but any API change to make objects held only in extensions trackable by the GC would probably be much worse than adding refcounted objects, wouldn't it, unless the extension were written in rpython...

                +
                +
                +
                +
                + + handsomegui wrote on 2015-04-28 15:16: +
                +
                +

                Any news on this PyQt on PyPy topic? With the latest PyPy 2.5.1? Thanks.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/12/accelerating-pypy-development-by-8973749020516679741.html b/posts/2009/12/accelerating-pypy-development-by-8973749020516679741.html new file mode 100644 index 000000000..2e28451da --- /dev/null +++ b/posts/2009/12/accelerating-pypy-development-by-8973749020516679741.html @@ -0,0 +1,411 @@ + + + + + +Accelerating PyPy development by funding | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Accelerating PyPy development by funding

                + + + +
                +

                PyPy has recently made some great speed and memory progress towards providing the most efficient Python interpreter out there. We also just announced +our plans for the pypy-1.2 release. Much of this is driven by personal +commitment, by individuals and companies investing time and money. +Now we'd appreciate some feedback and help regarding getting money +into the PyPy project to help its core members (between +5 and 15 people depending how you count) to sustain themselves. We see +several options:

                +
                  +
                • use a foundation structure and ask for tax-exempt donations to the +project, its developers and infrastructure. We just got +a letter from the Software Freedom Conservancy that they view +our application favourably so this option becomes practical hopefully +soon.
                • +
                • offer to implement certain features like a 64bit JIT-backend, +Numpy for PyPy or a streamlined installation in exchange for money, +contributed in small portions/donations. Do you imagine you or your +company would sponsor PyPy on a small scale for efforts like this? +Any other bits you'd like to see?
                • +
                • offer to implement larger scale tasks by contracting PyPy related companies, +namely Open End and merlinux who have successfully done such +contracts in the past. Please don't hesitate to contact +holger@merlinux.eu and bea@openend.se if you want to start a +conversation on this.
                • +
                • apply for public/state funding - in fact we are likely to get some +funding through Eurostars, more on that separately. Such funding +is usually only 50-60% of actual employment and +project costs, and is tied to research questions rather than +to make PyPy a production-usable interpreter, though.
                • +
                +

                Anything else we should look out for?

                +

                cheers & thanks for any feedback, +Maciej and Holger

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-12-21 18:28: +
                +
                +

                What's the status of possible mobile applications for PyPy? That seems nearer in terms of potential products and thus 'commercial' funding.

                +
                +
                +
                +
                + + Po wrote on 2009-12-21 21:57: +
                +
                +

                Have you guys looked into jitting regular expressions?
                I am not quite sure how hard it would be but having very fast regexps would be a great selling point for Pypy.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-12-22 00:24: +
                +
                +

                What about activating the Python Users Groups around the world? I think the case has to be made for PyPy still to the regular folk, if you will. So - what if you conducted a video showing off its potential, or maybe a series of videos, much like the "Summer of NHibernate" series. All the while, on the same site as the videos, you have a "tips" jar for donations. The videos would serve as great marketing campaign and would invite the development community into the fold, earning the buy-in you seek. This kind of attention in the community would only serve the project well when attracting the larger fish to the pond.

                Just my thoughts. :)

                +
                +
                +
                +
                + + holger krekel wrote on 2009-12-22 08:59: +
                +
                +

                @kitblake good point. The main blocker for making PyPy useful on mobile phones is support for GUI apps. Alexander's recent PyPy QT experiments are teasing in this direction. To fully exploit PyPy's super-efficient memory usage we probably need to provide native bindings. That and maybe a GIL-less interpreter would make PyPy a superior choice for mobile devices.

                However, GUI-bindings/free threading are orthogonal to the ongoing JIT-efforts. Somehow PyPy suffers a bit from its big potential (there also is stackless and sandboxing etc.). Question is: (How) can we make donation/other funding guide PyPy developments and at the same time support dedicated developers?

                +
                +
                +
                +
                + + holger krekel wrote on 2009-12-22 09:07: +
                +
                +

                @Ryan Interesting idea. What would you suppose to see in such a video?

                +
                +
                +
                +
                + + Niki wrote on 2009-12-22 09:58: +
                +
                +

                What if new PySide code generator targets RPython?
                https://www.pyside.org/

                +
                +
                +
                +
                + + Alexander Schremmer wrote on 2009-12-22 21:39: +
                +
                +

                Niki, generally that's a viable approach. Pyside is moving to shiboken, a new framework for generating bindings. Somebody would have to check how large the effort is to port it to RPython.
                Currently, Pyside is using Boost::Python AFAIK.

                +
                +
                +
                +
                + + Anonymous wrote on 2009-12-30 14:02: +
                +
                +

                could you accept donations via a paypal button or something like that? It's simple and easy but I think it's unlikely to be sufficient.

                I'm always amazed at the MoveOn organization... it seems like every week they send out mail like 'hey! we need $400,000 to stop The Man! Can you send us $20?' followed by 'Thanks! We've achieved our goal!'

                I don't know how many people or how much each one donates but they always meet their goal!

                +
                +
                +
                +
                + + holger krekel wrote on 2010-01-04 15:11: +
                +
                +

                anonymous: yes, establishing some way to accept money via paypal is high on our list. if nothing else we can use some private trusted account. moveon is rather geared towards general politics, i guess, so not directly applicable. But i remember there was some open source market place which allows to bid for certain features ...

                +
                +
                +
                +
                + + Anonymous wrote on 2010-01-06 05:18: +
                +
                +

                Have you considered moving to any sort of DVCS (Hg, Git, etc)? Or, given your current management style, does a centralized VCS or a DVCS add more to the project?

                Googling "open source bounties", finds Stack Overflow suggesting self-hosting bounties for the best results, which I suppose, makes sense. The people interested in taking bounties would be the ones already at your site. Being one of a million bounty providers on a site wouldn't generate much traffic.

                Thinking out loud, moving to a DVCS might actually help the bounty process, assuming you'd want to move in that direction.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/12/leysin-winter-sprint-23-30th-january-7768876505015446348.html b/posts/2009/12/leysin-winter-sprint-23-30th-january-7768876505015446348.html new file mode 100644 index 000000000..a1c019693 --- /dev/null +++ b/posts/2009/12/leysin-winter-sprint-23-30th-january-7768876505015446348.html @@ -0,0 +1,323 @@ + + + + + +Leysin Winter Sprint: reported | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Leysin Winter Sprint: reported

                + + + +
                + + + + +
                + +

                Update: the sprint has been postponed to some later date.

                + +The next PyPy sprint will probably still be in Leysin, Switzerland, for the +seventh time. + +
                       + +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-12-05 07:56: +
                +
                +

                It would be nice if there are prebuilt binaries in the next release.
                Certainly if it's faster there are a lot of graphics based projects where this would be interesting (pygame, pygelt, cocos2d, shoebot etc).

                +
                +
                +
                +
                + + Anonymous wrote on 2009-12-05 14:22: +
                +
                +

                @Anonymous:
                Probably they would be still slower, because ctypes is very slow in PyPy afaik.
                Someone mentioned in irc that the long time goal for ctypes is, that the jit doesn't use libffi at all but does direct assembler-to-c calls instead, if I remember correctly. - what should be superfast.
                That would of course be absolutely awesome. :)
                (and it's also the secret reason, why I only use pypy compatible modules for my pyglet game ;)
                Unfortunately I don't know if this is going to happen anytime "soon" / before the 1.2 release (at least I can't find it on extradoc/planning/jit.txt) but I know many people who would instantly drop cpython then. :P
                Heck, if I only had more clue about, how difficult this is to implement...

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2009/12/planning-next-release-of-pypy-4193252449406707091.html b/posts/2009/12/planning-next-release-of-pypy-4193252449406707091.html new file mode 100644 index 000000000..0dba53567 --- /dev/null +++ b/posts/2009/12/planning-next-release-of-pypy-4193252449406707091.html @@ -0,0 +1,352 @@ + + + + + +Planning a next release of PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Planning a next release of PyPy

                + + + +
                +

                The PyPy core team is planning to make a new release before the next Pycon US.

                +

                The main target of the 1.2 release is packaging the good results +we have achieved applying our current JIT compiler generator to our +Python interpreter. Some of that progress has been chronicled in +recent posts on the status blog. By releasing them in a +relatively stable prototype we want to encourage people to try them with their +own code and to gather feedback in this way. By construction the JIT compiler +should support all Python features, what may vary are the speedups +achieved (in some cases the JIT may produce worse results than the PyPy +interpreter which we would like to know) and the extra memory required +by it.

                +

                For the 1.2 release we will focus on the JIT stability first, less on +improving non-strictly JIT areas. The JIT should be good at many things +as shown by previous blog postings. We want the JIT compiler in the +release to work well on Intel 32 bits on Linux, with Mac OS X and +Windows being secondary targets. Which compilation targets work will +depend a bit on contributions.

                +

                In order to finalize the release we intend to have a concentrated +effort ("virtual sprint") from the 22nd to the 29th of +January. Coordination will happen as usual through the #pypy irc +channel on freenode. Samuele Pedroni will take the role of release +manager as he already did in the past.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2009-12-17 15:37: +
                +
                +

                Good News!
                Can't wait to try pypy as my standard python vm on my desktop machine.

                Btw: Are there any plans yet for python generators support in the jit?
                Because that's the only feature that I'm currently missing when using pypy.
                I have some medium sized apps, that I'd like to try, but they often use generators, so these will be slower with jit than without, won't they?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-12-17 16:48: +
                +
                +

                @Anonymous.

                Generators won't be sped up by JIT. This does not mean that JIT can't run or can't speed up other parts of your program. And yes, there are plans of supporting that.

                Cheers,
                fijal

                +
                +
                +
                +
                + + servo wrote on 2009-12-18 03:11: +
                +
                +

                I want to get involved in the development of PyPy, but I'm just a student with some experience with compilers. Is there a list of junior contributions that can be done by somebody just starting?

                Thanks!

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2009-12-18 12:11: +
                +
                +

                @servo

                Show on #pypy on freenode IRC. We'll find you something, don't worry :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/01/nightly-graphs-of-pypys-performance-8360469412941669946.html b/posts/2010/01/nightly-graphs-of-pypys-performance-8360469412941669946.html new file mode 100644 index 000000000..bed037b68 --- /dev/null +++ b/posts/2010/01/nightly-graphs-of-pypys-performance-8360469412941669946.html @@ -0,0 +1,399 @@ + + + + + +Nightly graphs of PyPy's performance | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Nightly graphs of PyPy's performance

                + + + +
                +

                Hello.

                +

                In the past few months, we made tremendous progress on the JIT front. +To monitor the progress daily, we introduced recently some cool graphs +that plot revision vs performance. They are based on unladen swallow +benchmark runner and they're written entirely in JavaScript, using canvas +via the JQuery and Flot libraries. +It's amazing what you can do in JavaScript these days... They are also +tested via the very good oejskit plugin, that integrates py.test +with JavaScript testing, driven by the command line.

                +

                As you can probably see, we're very good on some benchmarks and not that +great on others. Some of the bad results come from the fact that while we +did a lot of JIT-related work, other PyPy parts did not see that much +love. Some of our algorithms on the builtin data types are inferior to those +of CPython. This is going to be an ongoing focus for a while.

                +

                We want to first improve on the benchmarks for a couple +of weeks before doing a release to gather further feedback.

                +

                Cheers, +fijal

                +
                +

                Comments

                +
                +
                +
                + + Bill Mill wrote on 2010-01-25 17:38: +
                +
                +

                So... what's a revision number that I can use? Am I just supposed to guess? The page should have a reasonable default revision number.

                +
                +
                +
                +
                + + Bill Mill wrote on 2010-01-25 17:40: +
                +
                +

                for anyone else looking, 70700 is a reasonable place to start. (The graphs are really nice by the way, I'm not hating!)

                +
                +
                +
                +
                + + Anonymous wrote on 2010-01-25 18:12: +
                +
                +

                a couple of suggestions:

                1. scale for X axis (dates are likely to be more interesting than revision numbers)

                1a. scale for Y axis

                2. Add another line: unladen swallow performance

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2010-01-25 19:36: +
                +
                +

                +1 for Anonymous's suggestions 1 and 2.

                +
                +
                +
                +
                + + RPG wrote on 2010-01-25 20:18: +
                +
                +

                This is cool.

                Unladen Swallow's perf should also be considered if possible.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-01-25 21:04: +
                +
                +

                Hey.

                Regarding revisions - by default it points to the first one we have graphs from, so you can just slice :) Also, yeah, revision numbers and dates should show up, will fix that. We don't build nightly unladen swallow and we don't want to run it against some older version, because they're improving constantly.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2010-01-25 23:55: +
                +
                +

                Wonderful idea, great implementation (axis are needed, tooltips would be interesting for long series), impressive results.

                I hope you guys exploit this to raise interest in PyPy in this pre-release period. Just take a look at the response you get to posts involving numbers, benchmarks, etc. (BTW, keep linking to the funding post) :)

                A series of short posts discussing hot topics would be a sure way to keep Pypy around the news until the release, so you get as much feedback as possible.

                Suggestions:

                - Possible factors in slower results (discuss points in the Some Benchmarking post);

                - One-off comparisons to different CPython versions, Unladen Swallow, ShedSkin, [C|J|IronP]ython (revisit old benchmarks posts?);

                - Mention oprofile and the need for better profiling tools in blog, so you can crowdsource a review of options;

                - Ping the Phoronix Test Suite folks to include Pypy translation (or even these benchmarks) in their tests: Python is an important part of Linux distros;

                - Don't be afraid to post press-quotable numbers and pics, blurbs about what Pypy is and how much it's been improving, etc. Mention unrelated features of the interpreter (sandboxable!), the framework (free JIT for other languages), whatever;

                - The benchmark platform (code, hardware, plans for new features).

                +
                +
                +
                +
                + + Unknown wrote on 2010-01-26 06:32: +
                +
                +

                Regarding comparison with unladen swallow: I think having a point per month would be good enough for comparison purposes.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-01-26 08:53: +
                +
                +

                @Anonymous: Great suggestions! I'll look at these issues. In fact, things like profiling have been high on our todo list, but we should advertise it more. We surely miss someone who'll be good at PR :-)

                +
                +
                +
                +
                + + Luis wrote on 2010-02-24 10:51: +
                +
                +

                Something's wrong with plot one's scale: the speed ups are represented by a first line of 2x, a second one of 4x and the third one is 8x. Shouldn't it be 6x instead?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/02/pycon-2010-report-6986911457623699520.html b/posts/2010/02/pycon-2010-report-6986911457623699520.html new file mode 100644 index 000000000..95b8d175c --- /dev/null +++ b/posts/2010/02/pycon-2010-report-6986911457623699520.html @@ -0,0 +1,349 @@ + + + + + +Pycon 2010 report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Pycon 2010 report

                + + + +
                +

                Hello.

                +

                Greetings to everybody from Pycon 2010 Atlanta. Right now I'm sitting in +a sprint room with people sprinting on various projects, like CPython, +twisted etc. The conference was really great, and I've seen some good talks, +although I've been too exhausted from my own talks to go to too many. +Probably I should stay away from proposing that many talks to next pycon :-)

                +

                The highlight of sprints was that we got a common mercurial repository at python.org for python benchmarks. We might be able to come up with +"the python benchmark suite" which will mostly consist +of simple benchmarks using large python libraries, rather than microbenchmarks. +The repository was started by the Unladen Swallow people and we already +have common commit access among PyPy, CPython, Unladen Swallow, Jython +and Iron Python. We don't yet have a common place to run benchmarks, +but we should be able to fix that soon.

                +

                Regarding the talks, there are online videos for +How to write cross-interpreter python programs and Speed of PyPy talks, +among other talks from Pycon. +There should be a video for my short keynote shortly.

                +

                The talks were well received as there is interest in PyPy's progress.

                +

                +

                +

                Cheers,
                +fijal

                +
                +

                Comments

                +
                +
                +
                + + Luis wrote on 2010-02-24 20:53: +
                +
                +

                Hi, I just wanted to say that there's something wrong with the PLOT ONE graphic. The speedups are expressed by horizontal lines (each one is 2x). The third line shows 8x instead of 6x.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-02-25 00:56: +
                +
                +

                It was nice meeting you. I hope you have fun in South Africa :)

                Antoine.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-02-25 04:43: +
                +
                +

                @Luis

                It's called a logarithmic scale. It means you get 2x 4x 8x 16x etc.

                +
                +
                +
                +
                + + Luis wrote on 2010-02-25 20:40: +
                +
                +

                @Fijal
                I see... please excuse my ignorance :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/03/blog-coverage-of-speedpypyorg-2291955489972824511.html b/posts/2010/03/blog-coverage-of-speedpypyorg-2291955489972824511.html new file mode 100644 index 000000000..86cb57fe8 --- /dev/null +++ b/posts/2010/03/blog-coverage-of-speedpypyorg-2291955489972824511.html @@ -0,0 +1,316 @@ + + + + + +Blog coverage of speed.pypy.org | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Blog coverage of speed.pypy.org

                + + + +
                +

                If you want to read a detailed analysis about why speed.pypy.org is cool, head over to Saveen Reddy's blog at the MSDN.

                +
                +

                Comments

                +
                +
                +
                + + PAOLO BASSO wrote on 2010-03-20 13:15: +
                +
                +

                First of all congratulations for the great work, I can say I am a newbie in Python world but I follow with interest this project. I tried the release with the JIT compiler with also the parallel python module and the speed gain is sensible. I compared also the performance with psyco on 3 or 4 benchmarks and it seems that the time for the execution is usually more or less the same. Do you think there will be the possibility again for a massive speed improvement in future releases or the level of max performance is not so far? How much faster could it be in the future?

                Thanks,

                Paolo

                +
                +
                +
                +
                + + Luis wrote on 2010-03-22 20:12: +
                +
                +

                Question:
                According to the Computer Language Benchmarks Game, there are three benchmarks that perform way slower in Pypy against Python 3 ( see here: https://shootout.alioth.debian.org/u32/benchmark.php?test=all&lang=pypy&lang2=python3 ).

                Those are:
                1) reverse-complement
                2) regex-dna
                3) pidigits

                I know that regex-dna performs slower because regex haven't been optimized yet, but what's the reason for the other two? Do they use regex too?

                +
                +
                +
                +
                + + Anonymous wrote on 2010-03-24 17:02: +
                +
                +

                @Luis pidigits is about using gmpy for cpython vs longs for pypy. It's a bit apples vs oranges. That said, CPython's longs are still faster than pypy's so we definitely can improve. This area needs some love :)

                Reverse complement is string benchmark and I did not look but it might be that the speed of str.translate is suboptimal.

                Cheers,
                fijal, hiding

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/03/hello-5058108566628405592.html b/posts/2010/03/hello-5058108566628405592.html new file mode 100644 index 000000000..5bed687d0 --- /dev/null +++ b/posts/2010/03/hello-5058108566628405592.html @@ -0,0 +1,532 @@ + + + + + +Benchmarking twisted | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Benchmarking twisted

                + + + +
                +

                Hello.

                +

                I recently did some benchmarking of twisted on top of PyPy. For the very +impatient: PyPy is up to 285% faster than CPython. For more patient people, +there is a full explanation of what I did and how I performed measurements, +so they can judge for themselves.

                +

                The benchmarks are living in twisted-benchmarks and were mostly written +by Jean Paul Calderone. Even though he called them "initial exploratory +investigation into a potential direction for future development resulting +in performance oriented metrics guiding the process of optimization and +avoidance of complexity regressions", they're still much much better than +average benchmarks found out there.

                +

                The methodology was to run each benchmark for +quite some time (about 1 minute), measuring the number of requests every 5s. +Then I looked at a dump of the data and subtracted the time it took +for JIT-capable interpreters to warm up (up to 15s), averaging +everything after that. Averages of requests per second are in the table below (the higher the better):

                + ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                benchname | CPython | Unladen swallow | PyPy
                names | 10930 | 11940 (9% faster) | 15429 (40% faster)
                pb | 1705 | 2280 (34% faster) | 3029 (78% faster)
                iterations | 75569 | 94554 (25% faster) | 291066 (285% faster)
                accept | 2176 | 2166 (same speed) | 2290 (5% faster)
                web | 879 | 854 (3% slower) | 1040 (18% faster)
                tcp | 105M | 119M (7% faster) | 60M (46% slower)
                +

                To reproduce, run each benchmark with:

                +
                +benchname.py -n 12 -d 5
                +

                WARNING: running tcp-based benchmarks that open new connection for each +request (web & accept) can exhaust number of some kernel structures, +limit n or wait until next run if you see drops in request per second.

                +

                The first obvious thing is that various benchmarks are more or less amenable +to speedups by JIT compilation. Accept and tcp getting smallest speedups, if at +all. This is understandable, since JIT is mostly about reducing interpretation +and frame overhead, which is probably not large when it comes to accepting +connections. However, if you actually loop around, doing something, JIT +can give you a lot of speedup.

                +

                The other obvious thing is that PyPy is the fastest python interpreter +here, almost across-the board (Jython and IronPython won't run twisted), +except for raw tcp throughput. However, speedups can vary and I expect +this to improve after the release, as there are points, where PyPy can +be improved. Regarding raw tcp throughput - this can be a problem for +some applications and we're looking forward to improve this particular +bit.

                +

                The main reason to use twisted for this comparison is a lot of support from +twisted team and JP Calderone in particular, especially when it comes to +providing benchmarks. If some open source project wants to be looked at +by PyPy team, please provide a reasonable set of benchmarks and infrastructure.

                +

                If, however, you're a closed source project fighting with performance problems +of Python, we're providing contracting for investigating opportunities, how +PyPy and not only PyPy, can speed up your project.

                +

                Cheers,
                +fijal

                +

                Benchmark descriptions:

                +
                  +
                • +names - simple DNS server
                • +
                • +web - simple http hello world server
                • +
                • +pb - perspective broker, RPC mechanism for twisted
                • +
                • +iterations - empty twisted loop
                • +
                • +accept - number of tcp connections accepted per second
                • +
                • +tcp - raw socket transfer throughput
                • +
                +

                Used interpreters:

                +
                  +
                • CPython 2.6.2 - as packaged by ubuntu
                • +
                • Unladen swallow svn trunk, revision 1109
                • +
                • PyPy svn trunk, revision 71439
                • +
                +

                Twisted version used: svn trunk, revision 28580

                +

                Machine: unfortunately 32bit virtual-machine under qemu, running ubuntu karmic, +on top of Quad core intel Q9550 with 6M cache. Courtesy of Michael Schneider.

                +
                +

                Comments

                +
                +
                +
                + + Alexander Solovyov wrote on 2010-03-01 15:42: +
                +
                +

                Would be nice to see at least rough approximation of amount of RAM used by each implementation. :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2010-03-01 18:58: +
                +
                +

                Great as always.

                I'm looking forward to use PyPy in production with the next stable release in march. =)

                +
                +
                +
                +
                + + Yuri Baburov wrote on 2010-03-01 20:37: +
                +
                +

                Is it possible to run the same tests with CPython+Psyco?
                That would be really interesting to see!

                +
                +
                +
                +
                + + Tim Parkin wrote on 2010-03-01 20:39: +
                +
                +

                Congrats... things continue to look interesting :-)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-03-01 21:31: +
                +
                +

                @Yuri

                No, psyco has limitations on frames that break zope.interface which twisted depends on.

                +
                +
                +
                +
                + + Doc Button wrote on 2010-03-02 07:02: +
                +
                +

                I agree with Yuri, it would be of interest to record memory stats for each benchmark run.

                +
                +
                +
                +
                + + KoObz wrote on 2010-03-02 19:09: +
                +
                +

                Awesome results Maciej.

                Question: what's it gonna take for pypy to supplant Cpython?

                You're faster and I'm guessing you have nowhere near the manpower of Cpython. Plus, you're written in Python so future work will be much easier. Seems like a no brainer to embrace pypy.

                +
                +
                +
                +
                + + Luis wrote on 2010-03-02 23:04: +
                +
                +

                Question: After having read many comments and posts from pypy's developers lately, I got the impression (I might be wrong though), that you are betting all on tracing for getting speedups, (that the slow interpreter will eventually be compensated by the magic of tracing).
                However, other projects that rely on tracing seem to favor a dual approach, which is a traditional method-at-a-time jit (which can evenly speed up all kinds of code) plus tracing for getting the most out of highly numerical code (luajit 2.0, Mozilla's jaegermonkey, for example).

                Is this accurate, or am I wrong? Do you think that the current tracing strategy will eventually get speedups for those benchmarks that are currently on par or way below cpython? Or will you have to add a more traditional approach for the baseline?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-03-03 00:14: +
                +
                +

                Hey Luis.

                That's a very interesting question. I will try answer couple of your points, but feel free to move to pypy-dev mailing list if you want to continue discussion.

                We indeed bet on tracing (or jitting in general) to compensate for slower interpretation than CPython. However, our tracing is far more general than spidermonkeys - for example we can trace a whole function from start and not require an actual loop. We hope to generalize tracing so it can eventually trace all constructs.

                The main difference between ahead-of-time and tracing is that tracing requires an actual run, while ahead-of-time tries to predict what will happen. Results are generally in favor of tracing, although the variation will be larger (tracing does statistically correct branch prediction, not necessarily always the correct one).

                Regarding benchmarks, most of those benchmarks that we're slower than CPython showcase that our tracing is slow (they don't contain warmup). And again, for some of those we'll just include warmup (like twisted.web which is web server, makes sense in my opinion), for other we'll try to make tracing faster. And again, the speed of tracing is not the property of tracing, but rather pypy's limitation right now.

                Some other benchmarks are slow because we don't JIT regular expressions (spambayes). This should be fixed, but it's again unrelated to tracing.

                To summarize: I don't expect us trying dual approach (one jit is enough fun, believe me), but instead generalizing tracing and making it more efficient. How this will go, we'll see, I hope pretty well.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2010-03-03 09:09: +
                +
                +

                @Luis

                other than Maciek's points, which I subscribe, it should be said
                that, since each language has a different semantics, the
                efficiency of a traditional "method-at-a-time" JIT can vary
                dramatically. In particular, the dynamism of Python is so deep
                that a traditional JIT cannot win much: Jython and IronPython do
                exactly that, but for most use cases are slower than CPython. If
                you are interested, Chapter 2 of my PhD thesis explores these
                topics :-)
                https://codespeak.net/svn/user/antocuni/phd/thesis/thesis.pdf

                +
                +
                +
                +
                + + Anonymous wrote on 2010-03-10 00:35: +
                +
                +

                great results!
                As for the warm-up, would it be possible to save some of the tracing decisions in some file (.pyt?) to help on next startup?
                -shai

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-03-10 22:08: +
                +
                +

                @Anonymous

                Saving the results is hard, but not impossible. There are other possibilities (like keeping process around) though.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Unknown wrote on 2016-07-30 10:28: +
                +
                +

                May I have ur test code?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/03/heroes-of-12-release-7211722984024027191.html b/posts/2010/03/heroes-of-12-release-7211722984024027191.html new file mode 100644 index 000000000..57b619d93 --- /dev/null +++ b/posts/2010/03/heroes-of-12-release-7211722984024027191.html @@ -0,0 +1,342 @@ + + + + + +Heroes of the 1.2 Release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Heroes of the 1.2 Release

                + + + +
                +

                Now that the release is done I wanted to list and to thank some people that +were essential in the process of getting it out of the door, particularly +because the work of some of them is not very visible usually.

                +

                Armin Rigo and Maciej Fijałkowski tirelessly worked on most aspects of +the release, be it fixing the last known bugs and performance problems, +packaging or general wizardry.

                +

                Amaury Forgeot d'Arc made sure that PyPy 1.2 actually supports Windows as a +platform properly and compiled the Windows binaries.

                +

                Miquel Torres designed and implemented our new speed overview page, +https://speed.pypy.org which is a great tool for us to spot performance +regressions and to showcase our improvements to the general public.

                +

                tav designed the new user-oriented web page, https://pypy.org which is a lot +nicer for people that only want to use PyPy as a Python implementation (and not +be confused by how PyPy is actually made).

                +

                Holger Krekel fixed our main development server codespeak.net, even while +being on vacation and not really having online connectivity. Without that, we +couldn't actually have released anything.

                +

                Bartosz Skowron worked a lot on making Ubuntu packages for PyPy, which is +really cool. Even though he didn't quite finish in time for the release, we will +hopefully get them soon.

                +

                Thanks to all you guys!

                +
                +

                Comments

                +
                +
                +
                + + Nicola Larosa wrote on 2010-03-13 10:53: +
                +
                +

                Many thanks to all of you for the hard work, PyPy is shaping up very nicely. :-)

                +
                +
                +
                +
                + + Bartosz Skowron wrote on 2010-03-13 14:45: +
                +
                +

                Heh, I would finish the Ubuntu package if i didn't have restricted Internet access (only port 80 is working in the hotel where i'm staying now). please wait till Monday :)

                +
                +
                +
                +
                + + Philipp Strube wrote on 2010-03-13 14:54: +
                +
                +

                Awesome. Will try this out for our cloud hosting platform.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/03/introducing-nightly-builds-and-ubuntu-3346203966988761264.html b/posts/2010/03/introducing-nightly-builds-and-ubuntu-3346203966988761264.html new file mode 100644 index 000000000..aebdaa365 --- /dev/null +++ b/posts/2010/03/introducing-nightly-builds-and-ubuntu-3346203966988761264.html @@ -0,0 +1,381 @@ + + + + + +Introducing nightly builds and ubuntu PPA | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Introducing nightly builds and ubuntu PPA

                + + + +
                +

                Hello. +

                +

                +We're pleased to announce two things that we were constantly asked for: Nightly builds and Ubuntu PPA for 1.2 release made by Bartosz Skowron. There are no nightly build ubuntu packages (yet). +

                +

                +Nightly builds are what they are - pure pypy executables with JIT compiled in (for linux only now). They require either a pypy checkout or a release download. The main difference is that by default they display more debugging information than release builds and that they contain recent bugfixes and improvements of course :-) +

                +Cheers
                +fijal +
                +

                Comments

                +
                +
                +
                + + ben.b.boyer wrote on 2010-03-25 17:08: +
                +
                +

                great!

                +
                +
                +
                +
                + + nekto0n wrote on 2010-03-26 09:39: +
                +
                +

                Niiice =) Using PyPy becomes easier.
                Could please disable jit on amd64 or perhaps build 32-bit deb for amd64 machines?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-03-26 17:15: +
                +
                +

                @nek0ton building 32bit JIT for 64bit is hard since you need 32bit libraries. We just don't build nightly 64bit (nor release contained it).

                +
                +
                +
                +
                + + nekto0n wrote on 2010-03-26 17:24: +
                +
                +

                @fijal Why so? 32bit libraries are available on ubuntu (with ia32 suffix), kernel is build with 32bit support option. Don't see any problem here.
                I understand why not to build 64bit release - JIT is the goal.
                P.S. Maybe unavailable amd64 build would force someone to digg and fix that issue? =) Are there any guides available to do it?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-03-26 21:29: +
                +
                +

                the reason is precisely what you described - you need custom libraries linked with special suffix or place which is probably distribution dependent.

                +
                +
                +
                +
                + + Unknown wrote on 2010-03-31 12:46: +
                +
                +

                What would it take to make a 64 bit native everything (amd64)?

                Btw. I noticed the supported modules list seems to be incomplete at https://pypy.org/compat.html
                At least os, subprocess seem to be there even if not listed, probably more?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-03-31 16:44: +
                +
                +

                @harri.

                The general answer is that both subprocess and os are written in Python (and not C), so we don't list them. However I wonder how we can list things not to confuse people who don't know that. Any ideas (listing all possible modules is a bit too much).

                +
                +
                +
                +
                + + Unknown wrote on 2010-04-07 14:39: +
                +
                +

                If the supported modules is over 50% of all, how about just listing modules that still require work? I suspect many people are unaware that PyPy is getting feature complete, usable for real work.

                +
                +
                +
                +
                + + Unknown wrote on 2011-07-20 13:08: +
                +
                +

                Any reason the PPA doesn't have a newer 1.5 build for natty?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/03/introducing-pypy-12-release-2791388655442447862.html b/posts/2010/03/introducing-pypy-12-release-2791388655442447862.html new file mode 100644 index 000000000..00207f8a5 --- /dev/null +++ b/posts/2010/03/introducing-pypy-12-release-2791388655442447862.html @@ -0,0 +1,520 @@ + + + + + +Introducing the PyPy 1.2 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Introducing the PyPy 1.2 release

                + + + +
                +

                We are pleased to announce PyPy's 1.2 release. +This version 1.2 is a major milestone and it is the first release to ship +a Just-in-Time compiler that is known to be faster than CPython +(and unladen swallow) on some real-world applications (or the best benchmarks +we could get for them). The main theme for the 1.2 release is speed.

                +

                The JIT is stable and we don't observe crashes. Nevertheless we would +recommend you to treat it as beta software and as a way to try out the JIT +to see how it works for you.

                +

                Highlights:

                +
                  +
                • The JIT compiler.
                • +
                • Various interpreter optimizations that improve performance as well as help +save memory. Read our various blog posts about achievements.
                • +
                • Introducing a new PyPy website at pypy.org made by tav and improved +by the PyPy team.
                • +
                • Introducing speed.pypy.org made by Miquel Torres, a new service that monitors our performance +nightly.
                • +
                • There will be ubuntu packages on PyPy's PPA made by Bartosz Skowron, +however various troubles prevented us from having them as of now.
                • +
                +

                Known JIT problems (or why you should consider this beta software) are:

                +
                  +
                • The only supported platform is 32bit x86 for now, we're looking for help with +other platforms.
                • +
                • It is still memory-hungry. There is no limit on the amount of RAM that +the assembler can consume; it is thus possible (although unlikely) that +the assembler ends up using unreasonable amounts of memory.
                • +
                +

                If you want to try PyPy, go to the download page on our excellent new site +and find the binary for your platform. If the binary does not work (e.g. on +Linux, because of different versions of external .so dependencies), or if +your platform is not supported, you can try building from the source.

                +

                The PyPy release team,
                +Armin Rigo, Maciej Fijalkowski and Amaury Forgeot d'Arc

                +

                Together with
                +Antonio Cuni, Carl Friedrich Bolz, Holger Krekel, Samuele Pedroni and many others.

                +
                +

                Comments

                +
                +
                +
                + + Brian Slesinsky wrote on 2010-03-12 18:37: +
                +
                +

                The front page of the new PyPy site should include some of these caveats about it being beta software; it gives the wrong impression about PyPy's current status.

                +
                +
                +
                +
                + + Peter wrote on 2010-03-12 18:50: +
                +
                +

                Congratulations! This is great news!

                +
                +
                +
                +
                + + stuaxo wrote on 2010-03-12 20:21: +
                +
                +

                Is it possible to install distribute in this?

                +
                +
                +
                +
                + + Martijn Faassen wrote on 2010-03-12 20:34: +
                +
                +

                Congrats! Now to port a lot of software onto this!

                +
                +
                +
                +
                + + Anonymous wrote on 2010-03-12 21:50: +
                +
                +

                Congratulations! I've been looking forward to this.

                Question: does PyPy have an API for creating native modules?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-03-12 22:25: +
                +
                +

                @Anonymous:

                if you mean wrapping C libraries we recommend using ctypes.

                +
                +
                +
                +
                + + Unknown wrote on 2010-03-12 23:31: +
                +
                +

                awesome! congratulations!

                why is spambayes so slow? does it use regular expressions?

                +
                +
                +
                +
                + + Isaac Gouy wrote on 2010-03-13 00:28: +
                +
                +

                Why is there a problem with nbody and itertools ?

                pypy temporarily in the benchmarks game.

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2010-03-13 01:15: +
                +
                +

                @horace: yes, regexes are probably the problem.

                @Issac: combinations is a 2.6 feature, which we don't support.

                +
                +
                +
                +
                + + Isaac Gouy wrote on 2010-03-13 01:33: +
                +
                + combinations is a 2.6 feature, which we don't support

                Would anyone care to contribute a modified working nbody program to the benchmarks game? ;-) +
                +
                +
                +
                + + Armin Rigo wrote on 2010-03-13 03:11: +
                +
                +

                @Isaac: we have nbody_modified in our benchmarks, source code here.

                +
                +
                +
                +
                + + Unknown wrote on 2010-03-13 10:38: +
                +
                +

                Thanks for creating windows binaries! I waited long time for that...

                +
                +
                +
                +
                + + René Dudfield wrote on 2010-03-13 12:52: +
                +
                +

                Congrats to all the pypy peoples!

                +
                +
                +
                +
                + + Vitéz Gábor wrote on 2010-03-13 13:18: +
                +
                +

                Great work! Keep it up!

                +
                +
                +
                +
                + + Shin Guey wrote on 2010-03-13 15:46: +
                +
                +

                I just tried the windows binary.

                Oh damn, it is really FAST!!!

                3x performance gain...
                C:\work\bzr-proj>pypy script.py -t i2d -f longdata.txt
                solve parallel
                m = 1 total = 128
                m = 2 total = 16384
                m = 3 total = 2097152
                Require M stage: 3
                Time taken 00:00:05 (907ms)

                C:\work\bzr-proj>python script.py -t i2d -f longdata.txt
                solve parallel
                m = 1 total = 128
                m = 2 total = 16384
                m = 3 total = 2097152
                Require M stage: 3
                Time taken 00:00:15 (093ms)

                +
                +
                +
                +
                + + Shin Guey wrote on 2010-03-13 15:52: +
                +
                +

                Forgot about the memory usage, python consume ~4MB and pypy consume ~24MB. Pypy need 6x more memory, but I don't care about this in my script since the performance gain is significant.

                I really want to know the pypy vs luajit, I think luajit should be much faster. I am in progress in converting my script to lua but that is painful, my knowledge on lua doesn't match with python.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-03-13 16:11: +
                +
                +

                @shin if you have a comparison to LuaJIT, I would be extremely interested to hear the results! I agree that LuaJIT will likely be faster though.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-03-13 19:09: +
                +
                +

                can't wait to test it out!

                +
                +
                +
                +
                + + large file transfer wrote on 2010-03-14 14:49: +
                +
                +

                I really want to know the pypy vs luajit, I think luajit should be much faster. I am in progress in converting my script to lua but that is painful, my knowledge on lua doesn't match with python.

                +
                +
                +
                +
                + + cjrh wrote on 2010-03-15 07:27: +
                +
                +

                Thanks for windows binaries!

                +
                +
                +
                +
                + + Anonymous wrote on 2010-04-09 13:47: +
                +
                +

                Congratulations !

                Please mention in the download section that VC2005 redistributables are needed to run it on Win !

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/03/introducing-speedpypyorg-1822874891591164256.html b/posts/2010/03/introducing-speedpypyorg-1822874891591164256.html new file mode 100644 index 000000000..f48845a56 --- /dev/null +++ b/posts/2010/03/introducing-speedpypyorg-1822874891591164256.html @@ -0,0 +1,476 @@ + + + + + +Introducing speed.pypy.org | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Introducing speed.pypy.org

                + + + +
                +

                Hello.

                +

                Some time ago, we introduced our nightly performance graphs. This was a quick +hack to allow us to see performance regressions. Thanks to Miquel Torres, +we can now introduce https://speed.pypy.org, which is a Django-powered web +app sporting a more polished visualisation of our nightly performance runs.

                +

                While this website is not finished yet, it's already far better than our previous +approach :-)

                +

                Details about announcement on pypy-dev are found here.

                +

                If you are interested in having something similar for other benchmark runs, contact Miquel (tobami at gmail).

                +

                Quoting Miquel: "I would also like to note, that if other performance-oriented +opensource projects are interested, I would be willing to see if we can set-up +such a Speed Center for them. There are already people interested in +contributing to make it into a framework to be plugged into buildbots, software +forges and the like. Stay tuned!"

                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2010-03-03 17:12: +
                +
                +

                Excellent! We really ought to deploy this for unladen, too. Unfortunately, I don't think I'll have the time to get that going. :(

                +
                +
                +
                +
                + + Unknown wrote on 2010-03-03 19:48: +
                +
                +

                In my mind PyPy with its JIT will/should eventually get us close to matching or beating Java performance for the non-dynamic subset of python. Would that be a fair statment? If so is there some bench mark that allows us to compare that. What that be usefull?

                +
                +
                +
                +
                + + Philip Jenvey wrote on 2010-03-03 22:49: +
                +
                +

                I would love to see this become a Python implementation shootout, a single place where we could compare the speeds of CPython/PyPy/Unladen/Jython/IronPython

                +
                +
                +
                +
                + + Anonymous wrote on 2010-03-03 23:09: +
                +
                +

                This is great! It's excellent to see the fruits of the pypy jit work so clearly.

                I'd also like to see this in place for other Python implementations.

                +
                +
                +
                +
                + + matt harrison wrote on 2010-03-03 23:40: +
                +
                +

                Awesome work. One more feature request: Track memory usage.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-03-03 23:56: +
                +
                +

                @matt

                Ok. So I've seen those feature request often enough. These benchmarks are not good for tracking memory usage - they'll simply measure the amount interpreter allocates at the beginning. If you provide better ones, we'll do it.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Unknown wrote on 2010-03-04 04:59: +
                +
                +

                With the JIT would a script that does not use the dynamic aspects of python be able to match the speed of Java?

                +
                +
                +
                +
                + + tobami wrote on 2010-03-04 09:46: +
                +
                +

                @Reid: maybe I can help you out setting it up. You could actually even begin saving results to speed.pypy.org right away with minimal configuration changes (though I understand you may prefer to have your own site and DB).

                +
                +
                +
                +
                + + tobami wrote on 2010-03-04 09:54: +
                +
                +

                @Philip, Anonymous

                The first features are catering to trunk development, which was the most urgent thing.

                But my plan all along was to implement a third tab for comparing implementations (among other things. See mailing list announcement for details).

                So your wish should come to pass :-)

                +
                +
                +
                +
                + + Neil wrote on 2010-03-04 12:08: +
                +
                +

                Neat! I still like the original graphs though, it's nice to see the history for all the benchmarks together.

                I think the 'average' is pretty meaningless - it implies that a simple average of all the benchmarks will correspond to the typical real-world speed up you will get using pypy with your existing python code, which I don't think is true.

                +
                +
                +
                +
                + + tobami wrote on 2010-03-04 12:20: +
                +
                +

                @Neil
                a view showing all timeline graphs at once is also planned.

                About the average, of course you can not take from it that pypy-c-jit is nearly 3 times as fast as cpython. Because it depends on the particular choice of benchmarks, which right now is not at all representative of actual real-world usage.

                Regardless, it is there so that a developer gets an overall feeling for how a given revision change has affected performance across all benchmarks.

                We can't avoid the risk of people reaching wrong conclusions, but that is always the case with statistics, averages and benchmarks ;-)

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-03-04 13:14: +
                +
                +

                @sarvi: reaching the speed of Java is a really non-trivial goal, because Sun's JVM has really been highly optimized over many years. I guess it will take us a long time (if at all) to reach such levels of performance.

                +
                +
                +
                +
                + + Unknown wrote on 2010-03-05 04:17: +
                +
                +

                I understand JVM is highly optimized.
                And overtime and once yall have more momentum industry funding I am sure your VM will get just as optimized. I am sure Google will pick you guys up soon. I have no doubt about it. Unladen Swallow seems a waste of time once yall get more credibility.

                Even then I do expect Dynamic scripting capabilities to perform slower the Java.

                I am just hoping that eventually the non-dynamic parts of python will perform on par with Java.

                And we can all program in just Python and C. :-))

                +
                +
                +
                +
                + + della wrote on 2010-03-06 08:55: +
                +
                +

                Great work! BTW, could it be possible to also have a quick link to the source code of the benchmarks in the website?

                +
                +
                +
                +
                + + tobami wrote on 2010-03-06 12:51: +
                +
                +

                @della

                yeah, such things are missing right now.

                An about page, and possibly an explanation (with links to the code) of each benchmark are probably going to be implemented. Currently there is only tooltip explanations for some.

                +
                +
                +
                +
                + + Luis wrote on 2010-03-09 01:28: +
                +
                +

                Another silly question:
                AFAIK, the benchmark improvements seen lately are due to the way you measure avergages, by excluding warmup time. Seeing that warmup takes time that may be critical in some situations, I wonder if it's possible to somehow "save" the generated jited code so it can be reused after the first time it's generated.
                This way, it would be possible to distribute programs already "warmed up", kind of a compiled version of them. Sorry if this doesn't make sense at all... for a clueless ignorant like me, it does!

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-03-09 17:09: +
                +
                +

                @Luis

                Hey. It's a valid option, but it's however at least hard (if not next to impossible). There is work planned on reducing warmup time, so it won't matter that much instead.

                Cheers,
                fijal

                +
                +
                +
                +
                + + stuaxo wrote on 2010-03-09 18:02: +
                +
                +

                It would be nice if the timeline had the date on it (only where the date changes, and the beginning + end).

                +
                +
                +
                +
                + + large file transfer wrote on 2010-03-17 17:42: +
                +
                +

                I am sure your VM will get just as optimized. I am sure Google will pick you guys up soon. I have no doubt about it. Unladen Swallow seems a waste of time once yall get more credibility.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/03/state-of-pypy-talk-from-pycon-6748503931490058986.html b/posts/2010/03/state-of-pypy-talk-from-pycon-6748503931490058986.html new file mode 100644 index 000000000..fdeb0ea2e --- /dev/null +++ b/posts/2010/03/state-of-pypy-talk-from-pycon-6748503931490058986.html @@ -0,0 +1,296 @@ + + + + + +State of PyPy talk from Pycon | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                State of PyPy talk from Pycon

                + + + +
                +

                Hello.

                +

                +The last PyPy video from pycon has been uploaded. It's a very short (less than 10 minutes) "keynote" talk about the state of PyPy.

                +

                +Enjoy!
                +fijal

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/04/pypy-on-google-open-source-blog-1192495586835103069.html b/posts/2010/04/pypy-on-google-open-source-blog-1192495586835103069.html new file mode 100644 index 000000000..581c90b7e --- /dev/null +++ b/posts/2010/04/pypy-on-google-open-source-blog-1192495586835103069.html @@ -0,0 +1,311 @@ + + + + + +PyPy on google open source blog | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy on google open source blog

                + + + +
                +

                Hello

                +

                Bea Düring, from the PyPy team, wrote a post for google open source blog covering PyPy's 1.2 release. It's also the first public mention of the fact that google provided financial support for PyPy's 2.5 compatibility. Thanks!

                +

                Cheers
                +fijal

                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2010-04-09 22:51: +
                +
                +

                Interesting read, thank you. By the way, are there any plans to push for 3.x compatibility?

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2010-04-10 02:59: +
                +
                +

                @Fahrrad The plan is to work towards 2.7 compatibility this summer.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html b/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html new file mode 100644 index 000000000..d085a1045 --- /dev/null +++ b/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html @@ -0,0 +1,504 @@ + + + + + +Using CPython extension modules with PyPy natively, or: PyPy can load .pyd files with CPyExt! | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Using CPython extension modules with PyPy natively, or: PyPy can load .pyd files with CPyExt!

                + + + +
                +

                PyPy is now able to load +and run CPython extension modules (i.e. .pyd and .so files) natively by using the new CPyExt +subsystem. +Unlike the solution presented in another blog post (where extension modules like +numpy etc. were run on CPython and proxied through TCP), this solution does not require +a running CPython anymore. We do not achieve full binary compatibility +yet (like Ironclad), but recompiling the extension is generally enough.

                +

                The only prerequisite is that the necessary functions of the C API of CPython are already +implemented in PyPy. If you are a user or an author of a module and miss certain functions +in PyPy, we invite you to implement them. Up until now, a lot of people (including a lot of +new committers) have stepped up and implemented a few functions to get their favorite module +running. See the end of this post for a list of names.

                +

                Regarding speed, we tried the following: even though there is a bit of overhead when running +these modules, we could run the regular expression engine of CPython (_sre.so) and execute +the spambayes benchmark of the Unladen Swallow benchmark suite (cf. speed.pypy.org) and +experience a speedup: +It became two times faster on pypy-c than with the built-in regular +expression engine of PyPy. From Amdahl's Law it follows that the _sre.so must run several +times faster than the built-in engine.

                +

                Currently pursued modules include PIL and others. Distutils support is nearly ready. +If you would like to participate or want information on how to use this new feature, come and join +our IRC channel #pypy on freenode.

                +

                Amaury Forgeot d'Arc and Alexander Schremmer

                +

                Further CPyExt Contributors:

                +
                  +
                • Alex Gaynor +
                • +
                • Benjamin Peterson +
                • +
                • Jean-Paul Calderone +
                • +
                • Maciej Fijalkowski +
                • +
                • Jan de Mooij +
                • +
                • Lucian Branescu Mihaila +
                • +
                • Andreas Stührk +
                • +
                • Zooko Wilcox-O'Hearn
                • +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2010-04-10 03:38: +
                +
                +

                Holy crap, this is huge! Is it available in the PPA already? I guess this would put all benchmarks past CPython speed (except for outliers like the euler14 thing).

                +
                +
                +
                +
                + + Anonymous wrote on 2010-04-10 04:09: +
                +
                +

                Great news! What is the status of numpy/scipy support?

                +
                +
                +
                +
                + + Alex wrote on 2010-04-10 06:16: +
                +
                +

                @Anonymous I don't think anyone has started trying to test numpy or scipy yet, however fundamentally it's just a matter of implementing missing functions. For me starting on numpy in my next goal, after PIL.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-04-10 08:32: +
                +
                +

                This is very good news. JIT compiled Python can never fully replace extension modules (existing ones, or the need for new ones), so extension support should be a high priority for the PyPy project. I hope you can eventually get rid of that overhead.

                +
                +
                +
                +
                + + holger krekel wrote on 2010-04-10 14:41: +
                +
                +

                wow, just coming back from vacation and have to say: great news and great work, guys! Historically speaking, this is the third approach to the "ext" module issue and if the promise works out as it seems to do, probably the last as far as leveraging cpy ext modules are concerned! I wonder - does it still make sense to have "native" extension modules, the ones we currently have as "mixed" modules?

                +
                +
                +
                +
                + + Anonymous wrote on 2010-04-10 16:14: +
                +
                +

                Let me ask for a bit more detail. I depend on a module (https://homepages.inf.ed.ac.uk/lzhang10/maxent_toolkit.html), that is currently unsupported, as far as I know. I'd really like to port it to pypy. Where to start?

                Is it possible that the module runs without modifications? Can I check this simply by building a pypy-trunk, and write "import cmaxent"?

                +
                +
                +
                +
                + + Bartosz SKOWRON wrote on 2010-04-10 16:29: +
                +
                +

                @Anonymous: No it's not in the PPA. We provide only the latest release (1.2 in this case) and weekly builds for trunk (which haven't been announced on the blog yet). CPython extension modules live in their own branch. The branch will be merged into the trunk sooner or later.

                PS. The weekly builds are available here at https://launchpad.net/~pypy

                +
                +
                +
                +
                + + Alexander Schremmer wrote on 2010-04-10 18:43: +
                +
                +

                @Anonymous

                To test your module, you need to compile and load it. For compilation, you can use a compiled pypy binary and run setup.py build_ext with your setup file. For hints about manual compilation and module loading, visit our IRC channel.

                +
                +
                +
                +
                + + Alexander Schremmer wrote on 2010-04-10 18:45: +
                +
                +

                @holger

                MixedModules allow you to implement modules in RPython (using the PyPy API) and Python at the same time. CPyExt is for modules written in C using the CPython API. So both solutions are for different needs.

                +
                +
                +
                +
                + + Unknown wrote on 2010-04-10 20:52: +
                +
                +

                what about embedding pypy? will this work too in the future?

                the reason i ask is blender. there were some security concerns among blender developers recently. blender uses embedded cpython for scripting. normal scripts (like exporters) which have to be evoked by the user aren't that much of a problem but blender also supports python expressions for animation parameters. without a sandbox downloading and opening .blend files from unknown sources is kind of risky since a malicious python expression theoretically could wipe your harddisk.

                pypy with its support for a sandbox could be a very good replacement for cpython in blender (also because of its speed) but if it isn't compatible with the cpython api then a swap probably would be way too much effort.

                +
                +
                +
                +
                + + Alexander Schremmer wrote on 2010-04-10 23:07: +
                +
                + @horace
                what about embedding pypy?

                That should work as easy as extending. +
                +
                +
                +
                + + holger krekel wrote on 2010-04-11 16:54: +
                +
                +

                @alexander True, mixed modules are for rpython-implemented modules and need to be translated together with the pypy interpreter and could make use of the JIT. My question more aimed at the issue for which use cases / goals which kind of extension module mechanism makes sense.
                IOW, some discussion and web page regarding rpy-ext/ctypes/cpy-ext would make sense, i guess. Or is it somewhere already?

                +
                +
                +
                +
                + + Alexander Schremmer wrote on 2010-04-11 17:03: +
                +
                + @holger
                some discussion and web page regarding rpy-ext/ctypes/cpy-ext would make sense

                Yes, someone could write down guidelines. Using the C API runs your module fast in case of CPython. A bit slower on ironpython and PyPy.

                Using ctypes gives your module access to these three interpreters as well, but it will run slower. One advantage here is that you do not need to write C to create a wrapper around a library. If your objective is speed and lower memory usage, then CTypes does not work either.

                Mixed modules make your module work only on PyPy and provide a decent speed and a mixture of a decent (Python) and a bit harder to grasp (RPython) programming language. This only makes sense as a platform if your users are also using PyPy. +
                +
                +
                +
                + + René Dudfield wrote on 2010-04-12 13:56: +
                +
                +

                Super awesome! Can't wait to get home and try it out.

                +
                +
                +
                +
                + + Gary Robinson wrote on 2010-07-10 12:45: +
                +
                +

                It's a few months later, and I'm wondering what progress has been made. Early comments mentioned that nobody had tried numpy or scipy yet -- has that changed?

                Also, does this make the multiprocessing library available? Or, is pp (parallel processing) available?

                I'm very excited about PyPy because of the JIT. But for my work I also need some form of utilizing multiple CPU's. Right now I'm using unladen swallow with the multiprocessing module.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-13 10:07: +
                +
                +

                Yup, I'd love to hear about the progress on this.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-09 13:40: +
                +
                +

                Any chance this will be released sometime?

                +
                +
                +
                +
                + + Alexander Schremmer wrote on 2011-05-09 14:01: +
                +
                +

                It was already released, just check out the current PyPy release.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/05/efficient-and-elegant-regular-2727904462179540436.html b/posts/2010/05/efficient-and-elegant-regular-2727904462179540436.html new file mode 100644 index 000000000..5d0b8a6c3 --- /dev/null +++ b/posts/2010/05/efficient-and-elegant-regular-2727904462179540436.html @@ -0,0 +1,616 @@ + + + + + +An Efficient and Elegant Regular Expression Matcher in Python | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                An Efficient and Elegant Regular Expression Matcher in Python

                + + + +
                +

                Two weeks ago, I was at the Workshop Programmiersprachen und Rechenkonzepte, +a yearly meeting of German programming language researchers. At the workshop, +Frank Huch and Sebastian Fischer gave a really excellent talk about an +elegant regular expression matcher written in Haskell. One design goal of the +matcher was to run in time linear to the length of the input string (i.e. +without backtracking) and linear in the size of the regular expression. The +memory use should also only be linear in the regular expression.

                +

                During the workshop, some of the Haskell people and I then implemented the +algorithm in (R)Python. Involved were Frank, Sebastian, Baltasar Trancón y +Widemann, Bernd Braßel and Fabian Reck.

                +

                In this blog post I want to describe this implementation and show the code of +it, because it is quite simple. In a later post I will show what optimizations +PyPy can perform on this matcher and also do some benchmarks.

                +

                A Note on terminology: In the rest of the post "regular expression" is meant +in the Computer Science sense, not in the POSIX sense. Most importantly, that +means that back-references are not allowed.

                +

                Another note: This algorithm could not be used to implement PyPy's re +module! So it won't help to speed up this currently rather slow implementation.

                +
                +

                Implementing Regular Expression Matchers

                +

                There are two typical approaches to implementing regular expressions. A naive one is +to use a back-tracking implementation, which can lead to exponential matching +times given a sufficiently evil regular expression.

                +

                The other, more complex one, is to transform the regular expression into a +non-deterministic finite automaton (NFA) and then transform the NFA into a +deterministic finite automaton (DFA). A DFA can be used to efficiently match +a string; the problem with this approach is that turning an NFA into a DFA can +lead to exponentially large automata.

                +

                Given this problem of potential memory explosion, a more sophisticated approach +to matching is to not construct the DFA fully, but instead use the NFA for +matching. This requires some care, because it is necessary to keep track of +which set of states the automaton is in (it is not just one state, because the +automaton is non-deterministic).

                +

                The algorithm described here is essentially equivalent to this approach; however, +it does not need an intermediate NFA and represents a state of a corresponding +DFA as a marked regular expression (represented as a tree of nodes). For many +details about an alternative approach to implementing regular expressions +efficiently, see Russ Cox's excellent article collection.

                +
                +
                +

                The Algorithm

                +

                In the algorithm the regular expression is represented as a tree of nodes. The +leaves of the tree match exactly one character each (except for the epsilon node, which +matches the empty string). The inner nodes of the tree combine other nodes in +various ways, like alternative, sequence or repetition. Every node in the tree +can potentially have a mark. The meaning of the mark is that a node is marked +if its sub-expression matches the string seen so far.

                +

                The basic approach of the algorithm is that for every character of the input +string the regular expression tree is walked and a number of the nodes in the +regular expression are marked. At the end of the string, if the top-level node +is marked, the string matches, otherwise it does not. At the beginning of the +string, one mark gets shifted into the regular expression from the top, and then +the marks that are in the regex already are shifted around for every additional +character.

                +

                Let's start looking at some code, and an example to make this clearer. The base +class of all regular expression nodes is this:

                +
                class Regex(object):
                +    def __init__(self, empty):
                +        # empty denotes whether the regular expression
                +        # can match the empty string
                +        self.empty = empty
                +        # mark that is shifted through the regex
                +        self.marked = False
                +
                +    def reset(self):
                +        """ reset all marks in the regular expression """
                +        self.marked = False
                +
                +    def shift(self, c, mark):
                +        """ shift the mark from left to right, matching character c."""
                +        # _shift is implemented in the concrete classes
                +        marked = self._shift(c, mark)
                +        self.marked = marked
                +        return marked
                +
                +

                The match function which checks whether a string matches a regex is:

                +
                def match(re, s):
                +    if not s:
                +        return re.empty
                +    # shift a mark in from the left
                +    result = re.shift(s[0], True)
                +    for c in s[1:]:
                +        # shift the internal marks around
                +        result = re.shift(c, False)
                +    re.reset()
                +    return result
                +
                +

                The most important subclass of Regex is Char, which matches one +concrete character:

                +
                class Char(Regex):
                +    def __init__(self, c):
                +        Regex.__init__(self, False)
                +        self.c = c
                +
                +    def _shift(self, c, mark):
                +        return mark and c == self.c
                +
                +

                Shifting the mark through Char is easy: a Char instance retains a mark +that is shifted in when the current character is the same as that in the +instance.

                +

                Another easy case is that of the empty regular expression Epsilon:

                +
                class Epsilon(Regex):
                +    def __init__(self):
                +        Regex.__init__(self, empty=True)
                +
                +    def _shift(self, c, mark):
                +        return False
                +
                +

                Epsilons never get a mark, but they can match the empty string.

                +
                +

                Alternative

                +

                Now the more interesting cases remain. First we define an abstract base class +Binary for the case of composite regular expressions with two children, and +then the first subclass Alternative which matches if either of two regular +expressions matches the string (usual regular expressions syntax a|b).

                +
                class Binary(Regex):
                +    def __init__(self, left, right, empty):
                +        Regex.__init__(self, empty)
                +        self.left = left
                +        self.right = right
                +
                +    def reset(self):
                +        self.left.reset()
                +        self.right.reset()
                +        Regex.reset(self)
                +
                +class Alternative(Binary):
                +    def __init__(self, left, right):
                +        empty = left.empty or right.empty
                +        Binary.__init__(self, left, right, empty)
                +
                +    def _shift(self, c, mark):
                +        marked_left  = self.left.shift(c, mark)
                +        marked_right = self.right.shift(c, mark)
                +        return marked_left or marked_right
                +
                +

                An Alternative can match the empty string if either of its children can. +Similarly, shifting a mark into an Alternative shifts it into both of its +children. If either of the children is marked afterwards, the Alternative +is marked too.

                +

                As an example, consider the regular expression a|b|c, which would be +represented by the objects Alternative(Alternative(Char('a'), Char('b')), Char('c')). +Matching the string "a" would lead to the following marks in +the regular expression objects (green nodes are marked, white ones are +unmarked):

                + +alternativea.gif

                At the start of the process, no node is marked. Then the first char is matched, +which adds a mark to the Char('a') node, and the mark will propagate up the +two Alternative nodes.

                +
                +
                +

                Repetition

                +

                The two remaining classes are slightly trickier. Repetition is used to match +a regular expression any number of times (usual regular expressions syntax +a*):

                +
                class Repetition(Regex):
                +    def __init__(self, re):
                +        Regex.__init__(self, True)
                +        self.re = re
                +
                +    def _shift(self, c, mark):
                +        return self.re.shift(c, mark or self.marked)
                +
                +    def reset(self):
                +        self.re.reset()
                +        Regex.reset(self)
                +
                +

                A Repetition can always match the empty string. The mark is shifted into the +child, but if the Repetition is already marked, this will be shifted into +the child as well, because the Repetition could match a second time.

                +

                As an example, consider the regular expression (a|b|c)* matching the string +abcbac:

                +repetition.gif

                For every character, one of the alternatives matches, thus the repetition matches +as well.

                +
                +
                +

                Sequence

                +

                The only missing class is that for sequences of expressions, Sequence (usual +regular expressions syntax ab):

                +
                class Sequence(Binary):
                +    def __init__(self, left, right):
                +        empty = left.empty and right.empty
                +        Binary.__init__(self, left, right, empty)
                +
                +    def _shift(self, c, mark):
                +        old_marked_left = self.left.marked
                +        marked_left = self.left.shift(c, mark)
                +        marked_right = self.right.shift(
                +            c, old_marked_left or (mark and self.left.empty))
                +        return (marked_left and self.right.empty) or marked_right
                +
                +

                A Sequence can be empty only if both its children are empty. The mark +handling is a bit delicate. If a mark is shifted in, it will be shifted to the +left child regular expression. If that left child is already marked before the +shift, that mark is shifted to the right child. If the left child can match the +empty string, the right child gets the mark shifted in as well.

                +

                The whole sequence matches (i.e. is marked), if the left child is marked after +the shift and if the right child can match the empty string, or if the right +child is marked.

                +

                Consider the regular expression abc matching the string abcd. For the +first three characters, the marks wander from left to right; when the d is +reached, the matching fails.

                +sequence.gif +
                +
                +

                More Complex Example

                +

                As a more complex example, consider the expression ((abc)*|(abcd))(d|e) +matching the string abcabcabcd.

                +complex.gif

                Note how the two branches of the first alternative match the first abc in +parallel, until it becomes clear that only the left alternative (abc)* can +work.

                +
                +
                +

                Complexity

                +

                The match function above loops over the entire string without going back and +forth. Each iteration goes over the whole tree every time. Thus the complexity +of the algorithm is O(m*n) where m is the size of the regular expression +and n is the length of the string.

                +
                +
                +
                +

                Summary & Outlook

                +

                So, what have we achieved now? The code shown here can match regular expressions +with the desired complexity. It is also not much code. By itself, the Python +code shown above is not terribly efficient. In the next post I will show how the +JIT generator can be used to make the simple matcher shown above really fast.

                +
                +
                +

                Comments

                +
                +
                +
                + + Marius Gedminas wrote on 2010-05-21 16:41: +
                +
                +

                Have you seen Russ Cox's series of articles about regular expressions?

                Google Chrome's regexp library is also interesting.

                Google appears to have put a lot of research in efficient regexp algorithms while paying attention to backwards-compatibility concerns, as existing applications often rely on backtracking.

                +
                +
                +
                +
                + + kay schluehr wrote on 2010-05-21 20:26: +
                +
                + Most importantly, that means that back-references are not allowed.

                Limited backreferences can be integrated within this pattern matching scheme. General backreferences are only possible with backtracking but unless you want to solve NP complete problems using POSIX style regexps they might not be necessary. +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-05-21 21:48: +
                +
                +

                Marius: The Russ Cox site is linked from the article :-).

                Kay: Thanks for the link, will check it out.

                +
                +
                +
                +
                + + Thomas wrote on 2010-05-21 22:07: +
                +
                +

                I do not use regular expressions very heavily and am very new to pypy in general (1.2 works pretty good for me on my pure python code). From this article I don't see a full explaination why this basic algorithm couldn't be used for pypy. Is it primarily due to concerns about backward compatiblity or something more interesting? I am looking forward to the article to come about applying the JIT.

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2010-05-22 20:58: +
                +
                +

                @Thomas Python's re library requires backtracking.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-05-23 00:24: +
                +
                +

                This is just beautiful, I hope some version of this will be available for PyPy users: sandboxing + non-pathological REs sounds like a nice combo.

                +
                +
                +
                +
                + + Damian Cugley wrote on 2010-05-24 16:22: +
                +
                +

                Though these regexes can't be used as a drop-in replacement for the re module, if there were strikingly faster it might be worth having them as an alternative. The backtracking features are so seldom required that a faster, non-backtracking algorithm might prove popular with people who worry about matching speed.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-05-25 12:53: +
                +
                +

                It would be fun to read an article where you take the real Python regexes and apply PyPy's JIT code generation to them, i.e. when you call re.compile(...), you'd get native code out of it, specialized for the regex being compiled. After all, haven't you used the JIT on "toy" languages before? Regexes are a "toy" language, albeit a useful one..

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-05-25 14:21: +
                +
                +

                Anonymous2: Yes, but PyPy's current implementation of the re module is a bit of a mess, and not really fast. It's rather unclear how easy/possible it would be to generate a good JIT for it.

                +
                +
                +
                +
                + + Unhelpful wrote on 2010-06-04 21:17: +
                +
                +

                Instead of a "special" interpreter for REs in RPython, and a JIT for it, what about "compiling" REs to Python bytecode, and letting the existing PyPy JIT trace and compile them if they end up being used often enough? This is probably slower in the case of lots of throwaway REs that are used once, but when a few REs are used repeatedly it ought to work.

                +
                +
                +
                +
                + + Anonymous wrote on 2017-11-21 15:28: +
                +
                +

                "As an example, consider the regular expression a|b|c, which would be represented by the objects Alternative(Alternative(Char('a'), Char('b')), Char('c'))"

                But how to create such a representation, when you scan input regex literal by literal?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/05/pypy-in-googles-summer-of-code-2010-5321939902318322352.html b/posts/2010/05/pypy-in-googles-summer-of-code-2010-5321939902318322352.html new file mode 100644 index 000000000..7d5d01df5 --- /dev/null +++ b/posts/2010/05/pypy-in-googles-summer-of-code-2010-5321939902318322352.html @@ -0,0 +1,374 @@ + + + + + +PyPy in Google's Summer of Code 2010 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy in Google's Summer of Code 2010

                + + + +
                +

                Good news everyone.

                +

                This year, thanks to Google's generosity and PSF support, we got two and a +half students for PyPy's Summer of Code. We didn't cut any students, but one +of the projects is a joint project of PyPy and numpy. Hereby I present +descriptions, in my own words with my own opinions and in arbitrary order. For +more details please follow links to particular blogs.

                +
                +

                +Jason Creighton: 64bit JIT backend for PyPy

                +

                Intel 64bit (and I mean x86_64) compatibility for the JIT has been one of the top +requested features (along with GIL removal). While GIL removal is not really an +easy task, having our JIT emit 64bit assembler is sort of easy, thanks to our +JIT backend abstraction. It will likely be faster, thanks to the abundance of +registers.

                +
                +
                +

                +Bartosz Skowron: Fast ctypes for PyPy

                +

                Historically, a weak point of PyPy was compatibility with extension modules. We +have progressed quite a bit in recent years, first introducing ctypes for +pypy, then progressing towards CPython extension modules. However, ctypes is +well known to be slow (and it's even slower on PyPy), and writing CPython +extension modules is ugly; on PyPy they will only ever work through a compatibility layer, +which will keep them slow. What happens if we try to apply JIT technology to +ctypes? Maybe we can compile calls to C code from Python as direct calls in +compiled assembler? Why not?

                +

                This project will look at how JIT technology can be employed to do some +sort of FFI. There is no guarantee we'll get super-fast ctypes as a result, +but it's good to see progress in that area.

                +
                +
                +

                +Dan Roberts: Numpy in PyPy

                +

                This is a joint project of numpy and PyPy. The main objective is to bring +numpy to PyPy, possibly fast. The official mentor for this project is +Stefan van der Walt from the numpy community. During the initial meeting it was +agreed that probably the best way to go would be to support the original numpy +with CPython extension compatibility and then provide a minimal native numpy +framework for pypy. The former would retain full compatibility, while the +latter would have JIT integration, in line with our previous +numeric experiments. There would be an explicit interface for converting +one kind of array to the other, for convenience.

                +
                +

                Overall, I'm very happy to see so much support for PyPy from SoC. I hope all +three proposals will be successful!

                +

                Cheers,
                +fijal & pypy team.

                +
                +

                Comments

                +
                +
                +
                + + Michael Twomey wrote on 2010-06-01 11:48: +
                +
                +

                Some really nice stuff in there, very interested in the potential for JIT + numpy, keep up the good work!

                +
                +
                +
                +
                + + Anonymous wrote on 2010-06-01 11:53: +
                +
                +

                Cool projects. Two of them live as PyPy branches:

                https://codespeak.net/viewvc/pypy/branch/x86-64-jit-backend/

                https://codespeak.net/viewvc/pypy/branch/fast-ctypes/

                Where can we follow the NumPy work? :)

                +
                +
                +
                +
                + + Unknown wrote on 2010-06-07 16:16: +
                +
                +

                when will pypy catch up with python 3.1? will it happen during the python language moratorium (pep 3003)?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-06-10 20:08: +
                +
                +

                @horace

                Depends when you can help :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/05/running-wxpython-on-top-of-pypy-52246787415886751.html b/posts/2010/05/running-wxpython-on-top-of-pypy-52246787415886751.html new file mode 100644 index 000000000..d4eb12b68 --- /dev/null +++ b/posts/2010/05/running-wxpython-on-top-of-pypy-52246787415886751.html @@ -0,0 +1,410 @@ + + + + + +Running wxPython on top of pypy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Running wxPython on top of pypy

                + + + +
                +

                Hello,

                +

                These last three weeks we have been busy working on the cpyext subsystem, which +allows pypy to execute extension modules written with the Python C API.

                +

                Today we hacked enough to have wxPython compile, and run its wonderful demo. +This: + +cannot be distinguished from the same run with a +standard python interpreter, but this: + +shows an exception that +CPython never produces.

                +

                wxPython is a big extension module: it has more than 500 classes and 7500 +functions, most of the code is automatically generated by swig. It uses +advanced techniques, like "Original Object Return" and cross-platform +polymorphism, that effectively allows the developer to seamlessly subclass C++ +objects in Python and write GUI applications efficiently.

                +

                The demo application runs reasonably fast, it feels slower than with CPython, +but I did not activate the JIT option of pypy. It still crashes in some places +(the demo is very comprehensive and covers all the aspects of wxPython), and +threads are expected to not work at the moment.

                +

                We had to modify the code of wxPython a little, mainly because it often stores +borrowed references into C++ objects. This does not work well in pypy, where +all other counted references can disappear and the address of the object is allowed +to change. The solution is to use weak references instead. The patch is here, +it will eventually be merged into the upstream wxPython version.

                +

                This first real test proves that CPython extensions can be migrated to pypy +without much pain. It also points out some places which can be improved, like +better diagnostics in crashes, better support of distutils...

                +

                Amaury Forgeot d'Arc

                +
                +

                Comments

                +
                +
                +
                + + René Dudfield wrote on 2010-05-03 17:09: +
                +
                +

                sweet as!

                +
                +
                +
                +
                + + Dan Villiom Podlaski Christiansen wrote on 2010-05-03 18:00: +
                +
                +

                Nice! Do you have any plans for making Mac nightlies with this available? I'd love to try out PyPy, but the one time I tried bootstrapping, it used all available memory. After I had let it run overnight but it didn't finish, I killed it…

                +
                +
                +
                +
                + + Bourne wrote on 2010-05-03 19:59: +
                +
                +

                This is very good news.

                Finishing wxPython and the JIT is probably all that's needed to make PyPy a **great** alternative to CPython. (but I guess you figured that already)

                Thanks!

                +
                +
                +
                +
                + + Stu wrote on 2010-05-03 23:52: +
                +
                +

                Sweet ! I wonder if pycairo and pygtk... at the moment I don't know if it's cairo or python slowing down my app (I have an idea it's both, but running it in pypy does seem attractive).

                +
                +
                +
                +
                + + René Dudfield wrote on 2010-05-04 09:33: +
                +
                +

                Are there docs for how to compile extensions somewhere? I had a quick look, but couldn't find them.

                +
                +
                +
                +
                + + The Cannon Family wrote on 2010-05-04 19:28: +
                +
                +

                this is a major accomplishment in terms of usability, many people use Python extension modules, way to go. (and next steps, PIL).

                +
                +
                +
                +
                + + Amaury Forgeot d'Arc wrote on 2010-05-04 22:38: +
                +
                +

                PIL also works with PyPy. I've only tried basic tests though (like gif->png conversion)

                +
                +
                +
                +
                + + Amaury Forgeot d'Arc wrote on 2010-05-04 22:40: +
                +
                +

                @illume: you have to compile pypy with the option "--withmod-cpyext", then it should be enough to run "/path/to/pypy-c setup.py build"

                +
                +
                +
                +
                + + Unknown wrote on 2010-05-07 13:44: +
                +
                +

                Well done! The Italian Python Community has an article on this (here, in Italian)

                +
                +
                +
                +
                + + Anonymous wrote on 2010-05-09 09:49: +
                +
                +

                Wow. PyPy is coming along quite nicely :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/06/blackhole-interpreter-2752965445510091289.html b/posts/2010/06/blackhole-interpreter-2752965445510091289.html new file mode 100644 index 000000000..5b2a9c375 --- /dev/null +++ b/posts/2010/06/blackhole-interpreter-2752965445510091289.html @@ -0,0 +1,445 @@ + + + + + +"Blackhole" interpreter | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                "Blackhole" interpreter

                + + + +
                +

                Hi all,

                + +

                Here are a few words about the JIT's "great speedup in compiling +time" advertized on the PyPy 1.3 release (see the + +previous blog post). +The exact meaning behind these words needs a fair bit of +explanation, so here it is in case you are interested.

                + +

                If you download a version of PyPy 1.3 that includes a JIT +compiler, you get an executable that could be qualified as rather +fat: it actually contains three interpreters. You have on the +one hand the regular Python interpreter. It is here because it's +not possible to JIT-compile every single piece of Python code you +try to run; only the most executed loops are JIT-compiled. They +are JIT-compiled with a tracing interpreter that operates one +level down. This is the second interpreter. This tracing step +is quite slow, but it's all right because it's only invoked on +the most executed loops (on the order of 100 to 1000 times in +total in a run of a Python script that takes anyway seconds or +minutes to run).

                + +

                So apart from the JIT compilation itself, we have two worlds in +which the execution proceeds: either by regular interpretation, +or by the execution of assembler code generated by the JIT +compiler. And of course, we need to be able to switch from one +world to the other quickly: during regular interpretation we have +to detect if we already have generated assembler for this piece +of code and if so, jump to it; and during execution of the +assembler, when a "guard" fails, i.e. when we meet a path of +execution for which we did not produce assembler, then we need to +switch back to regular interpretation (or occasionally invoke the +JIT compiler again).

                + +

                Let us consider the cost of switching from one world to another. +During regular interpretation, if we detect that we already have +assembler corresponding to this Python loop, then we just jump to +it instead of interpreting the Python loop. This is fairly +cheap, as it involves just one fast extra check per Python loop. +The reverse is harder because "guard" failures can occur at any +point in time: it is possible that the bit of assembler that we +already executed so far corresponds to running the first 4 Python +opcodes of the loop and a half. The guard that failed just now +is somewhere in the middle of interpreting that opcode -- say, +multiplying these two Python objects.

                + +

                It's almost impossible to just "jump" at the right place in the +code of the regular interpreter -- how do you jump inside a +regular function compiled in C, itself in a call chain, resuming +execution of the function from somewhere in the middle?

                + +

                So here is the important new bit in PyPy 1.3. Previously, what +we would do is invoke the JIT compiler again in order to follow +what needs to happen between the guard failure and the real end +of the Python opcode. We would then throw away the trace +generated, as the only purpose was to finish running the current +opcode. We call this "blackhole interpretation". After the end +of the Python opcode, we can jump to the regular interpreter +easily.

                + +

                Doing so was straightforward, but slow, in case it needs to be +done very often (as is the case in some examples, but not all). +In PyPy 1.3, this blackhole interpretation step has been +redesigned as a time-critical component, and that's where the +third interpreter comes from. It is an interpreter that works +like the JIT compiler, but without the overhead of tracing (e.g. +it does not need to box all values). It was designed from the +ground up for the sole purpose of finishing the execution of the +current Python opcode. The bytecode format that it interprets is +also new, designed for that purpose, and the JIT compiler itself +(the second interpreter) was adapted to it. +The old bytecode format in PyPy 1.2 is gone +(it was more suited for the JIT compiler, but less for blackhole +interpretation).

                + +

                In summary, it was a lot of changes in the most front-end-ish +parts of the JIT compiler, even though it was mostly hidden +changes. I hope that this longish blog post helped bring it a +bit more to the light :-)

                +
                +

                Comments

                +
                +
                +
                + + GRon wrote on 2010-06-26 21:06: +
                +
                +

                Interesting, is there any documentation for the different bytecode sets you have/had?

                I would be especially interested in the differences, and the reasons for those design decisions.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2010-06-26 23:11: +
                +
                +

                I fear not. The bytecode set is quite custom, made to represent RPython code, which is at the level (roughly speaking) of Java -- with a few additional instructions to guide the JIT compiler. The latest version uses a register-based machine, which is more convenient than a Java-like stack-based approach starting from the control flow graphs of RPython functions. It has three independent sets of registers: integers, pointers, and floating-point (pointers are different from integers at this level because the GC needs to track them and possibly move them). Register numbers are encoded in one byte, so there is room for 256 registers of each kind, but in practice doing a simple register allocation step on each graph means that no bytecode ends up using more than ~15 registers. A few parts are needed only by the JIT compiler and not by the blackhole interpreter; these are encoded "off-line" to avoid slowing down the blackhole interpreter.

                Well, I could talk at length about all the details of the format, but in truth there is nothing very deep there :-) See the comments in https://codespeak.net/svn/pypy/trunk/pypy/jit/codewriter/codewriter.py as well as the tests like test/test_flatten.py and test/test_regalloc.py.

                +
                +
                +
                +
                + + Zeev wrote on 2010-06-27 01:40: +
                +
                +

                Does the PyPy JIT replace a running interpreted loop with a compiled one mid-run or only on the next iteration or only the next time this loop starts?

                Is there a way to ask the PyPy interpreter to tell me what it jitted as it ran some code?

                Or will it be too difficult for me to relate the produced machine code with my python source code (because it's not a straightforward method jit)?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-06-27 17:00: +
                +
                +

                Hi Zeev.

                Only at the next iteration of the loop. However, you have to have at least ~1000 iterations before it happens.

                There is a variety of tools that we use for inspecting generated loops. There is no programmable interface from python yet, but there are some external tools.

                Run: PYPYJITLOG=jit-log-opt:log pypy

                and you'll get a file log which contains all the loops. There are tools in the source checkout pypy/jit/tool, loopviewer.py, showstats.py and traceviewer.py which can help you viewing those loops. They'll contain debug_merge_points which are with info about python opcodes (including functions and file), but they can span several functions. Have fun :)

                If you want more info, drop by on #pypy at irc.freenode.net.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Luis wrote on 2010-06-30 02:20: +
                +
                +

                Is this Blackhole interpreter the Jaegermonkey of pypy?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2010-07-08 16:26: +
                +
                +

                Luis: no.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2010-07-08 16:47: +
                +
                +

                @Luis: just to expand a bit Armin's answer :-).

                Jaegermonkey is a method-by-method compiler that Tracemonkey uses *before* the tracing compiler enters in action. In pypy, this is equivalent to the normal Python interpreter that profiles your loops to find the hot ones, with the obvious difference that Jaegermonkey is a compiler, while ours is an interpreter.

                The blackhole interpreter is something that is used internally by our tracing jit compiler, and AFAIK it has no equivalent in tracemonkey.

                +
                +
                +
                +
                + + Luis wrote on 2010-07-08 23:27: +
                +
                +

                I see. Thanks Armin and Antonio.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/06/jit-for-regular-expression-matching-3877859053629057968.html b/posts/2010/06/jit-for-regular-expression-matching-3877859053629057968.html new file mode 100644 index 000000000..a8740f222 --- /dev/null +++ b/posts/2010/06/jit-for-regular-expression-matching-3877859053629057968.html @@ -0,0 +1,673 @@ + + + + + +A JIT for Regular Expression Matching | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                A JIT for Regular Expression Matching

                + + + +
                +

                This is part 2 of a series, see Part 1 for an introduction. In this post +I want to describe how the JIT generator of the PyPy project can be used to turn +the elegant but not particularly fast regular expression matcher from the first +part into a rather fast implementation. In addition, I will show some speed +measurements against various regular expression implementations.

                +

                Again, note the disclaimer: This technology could not easily be used +to implement Python's re-module.

                +
                +

                Example Expression and First Numbers

                +

                The regular expression I will use as an example in the rest of this post is +the expression (a|b)*a(a|b){20}a(a|b)*. It matches all strings that have two +a with exactly 20 characters between them. This regular expression has +the property that the corresponding DFA needs 2**(n+1) different states, where n is the number of characters between the two a (here 20). As +an input string, we use a random string (of varying lengths) that does not +match the regular expression. I will give all results as number of chars matched +per second. While this is not a particularly typical regular expression, it +should still be possible to get some ballpark numbers for the speeds of various +implementations – as we will see, the differences between implementations are +huge anyway.

                +

                All the benchmarks were performed on my laptop, which has an Intel Core2 Duo +P8400 processor with 2.26 GHz and 3072 KB of cache on a machine with 3GB RAM +running Ubuntu Linux 10.04.

                +

                To get a feeling for the orders of magnitude involved, the CPython re module +(which is implemented in C and quite optimized) can match 2'500'000 chars/s. +Google's new re2 implementation still matches 550'000 chars/s. Google's +implementation is slower, but their algorithm gives complexity and space +guarantees similar to our implementation in the last blog post.

                +

                On the other end of the performance scale is the pure-Python code from the last +blog post running on CPython. It can match only 12'200 chars/s and is thus 200 +times slower than the re module.

                +
                +
                +

                Translating the Matcher

                +

                The code described in the last blog post is not only normal Python code, but +also perfectly valid RPython code. Nothing particularly dynamic is going on in +the code, thus it can be translated with PyPy's translation toolchain to C code. +The resulting binary is considerably faster and can match 720'000 chars/s, 60 +times faster than the untranslated version.

                +

                Another approach is to write equivalent versions of the algorithms in lower +level languages. This has been done for C++ by Sebastian Fischer and for Java by +Baltasar Trancón y Widemann. The algorithm is object-oriented enough to be +mapped very closely to the respective languages. The C++ version is +a little bit faster than the RPython version translated to C, at 750'000 chars/s. That's +not very surprising, given their similarity. The Java version is more than twice +as fast, with 1'920'000 chars/s. Apparently the Java JIT compiler is a lot +better at optimizing the method calls in the algorithm or does some other +optimizations. One reason for this could be that the Java JIT can assume that +the classes it sees are all there are (and it will invalidate the generated +machine code if more classes are loaded), whereas the C++ compiler needs to +generate code that works even in the presence of more regular expression +classes.

                +
                +
                +

                Generating a JIT

                +

                To get even more performance out of the RPython code, it is possible to generate +a JIT for it with the help of the PyPy translation toolchain. To do this, the +matching code needs to be extended somewhat by some hints that tell PyPy's JIT +generator how this is to be done. The JIT generator can automatically produce a +JIT compiler from an RPython interpreter of the source language. In our case, +we view the regular expression matcher as an interpreter for regular +expressions. Then the match function corresponds to the +dispatch loop of a traditional interpreter.

                +

                Our regular expression matcher is a very peculiar interpreter. The matcher +works by running exactly one loop (the one in match) as many times as the +input string is long, irrespective of the "program", i.e. the particular +regular expressions. In addition, within the loop there are no conditions (e.g. +if statements) at all, it is just linear code. This makes it almost perfectly +suited +to the JIT generator, which produces a tracing JIT. A tracing JIT compiles the +hot loops of a program (i.e. regular expression) and has to do extra work if +there are conditions in the loop. In our case, there is exactly one loop per +regular expression, without any condition.

                +
                +

                JIT Hints

                +

                The hints that are needed for the match function of the last blog post can +be seen here (the function is slightly rewritten, e.g. the JIT only +properly supports a while loop as the main dispatch loop):

                +
                jitdriver = jit.JitDriver(reds=["i", "result", "s"], greens=["re"])
                +
                +def match(re, s):
                +    if not s:
                +        return re.empty
                +    # shift a mark in from the left
                +    result = re.shift(s[0], 1)
                +    i = 1
                +    while i < len(s):
                +        jitdriver.can_enter_jit(i=i, result=result, s=s, re=re)
                +        jitdriver.jit_merge_point(i=i, result=result, s=s, re=re)
                +        # shift the internal marks around
                +        result = re.shift(s[i], 0)
                +        i += 1
                +    re.reset()
                +    return result
                +
                +

                The jitdriver is an instance describing the data of the interpreter we are +dealing with. The arguments to the constructor need to list all local variables +of the dispatch loop. The local variables are classified into two classes, red +ones and green ones. The green ones hold the objects that make up the program +that the interpreter currently runs and which position in the program is +currently being executed. In a typical bytecode interpreter, the bytecode object +and the program counter would be green. In our case, the regular expression is +the program, so it is green. The rest of the variables are red.

                +

                The green variables are treated specially by the JIT generator. At runtime, for +a given value of the green variables, one piece of machine code will be +generated. This piece of machine code can therefore assume that the value of +the green variable is constant.

                +

                There are two additional hints, which are method calls on the +jitdriver instance. The jit_merge_point method marks the beginning of +the main interpreter loop. The can_enter_jit function marks the point where +a loop in the user program can be closed, which in our case is trivial, it's +just at the end of the interpreter loop (for technical reasons it is put at the beginning, because nothing must happen between the can_enter_jit and jit_merge_point invocations).

                +

                Those are the hints that the JIT generator needs to function at all. We added +some additional hints, that give the JIT generator more information to work +with. Those hints are immutability information, which means that certain +instance fields can not be changed after the object has been constructed. Apart +from the marked field, none of the fields of any of the Regex subclasses +can change. For example for the Char class this is expressed in the +following way:

                +
                class Char(Regex):
                +    _immutable_fields_ = ["c"]
                +    def __init__(self, c):
                +        ...
                +
                +

                These hints allow the generated JIT to constant-fold reads out of the immutable +fields in some situations.

                +
                +
                +

                Adaptions to the Original Code

                +

                In the introduction above I wrote that the code within the loop in match +uses no conditions. It is indeed true that none of the _shift methods +have an if statement or similar. However, there are some hidden conditions +due to the fact that the and and or boolean operators are used, which +are short-circuiting. Therefore the JIT-version of the code needs to be adapted +to use the non-short-circuiting operators & and |.

                +
                +
                +

                JIT Example

                +

                To get an impression of what the generated machine code looks like, consider the +regular expression (a|b)*. As regular expression objects this would be +Repetition(Alternative(Char('a'), Char('b'))). The machine code in its intermediate, +machine-independent form looks as follows (I have slightly cleaned it up and +added comments for clarity):

                +
                # arguments of the loop
                +# i0 is i in the match function
                +# result0 is result in the match function
                +# s0 is s in the match function
                +[i0, result0, s0] # those are the arguments to the machine code
                +char = s0[i0] # read the character
                +# read the current mark:
                +i5 = ConstPtr(ptr_repetition).marked
                +i7 = char == 'a' # is the character equal to 'a'
                +i8 = i5 & i7
                +i10 = char == 'b' # is the character equal to 'b'
                +i11 = i5 & i10
                +# write new mark
                +ConstPtr(ptr_chara).marked = i8
                +i13 = i8 | i11
                +# write new mark
                +ConstPtr(ptr_charb).marked = i11
                +# write new mark
                +ConstPtr(ptr_alternative).marked = i13
                +# increment the index
                +i17 = i0 + 1
                +i18 = len(s0)
                +# write new mark
                +ConstPtr(ptr_repetition).marked = i13
                +# check that index is smaller than the length of the string
                +i19 = i17 < i18
                +if not i19:
                +    go back to normally running match
                +jump(i17, i13, s0) # start from the top again
                +
                +

                The various ConstPtr(ptr_*) denote constant addresses of parts of the regular +expression tree:

                +
                  +
                • +ptr_repetition is the Repetition +
                • +
                • +ptr_chara is Char('a') +
                • +
                • +ptr_charb is Char('b') +
                • +
                • +ptr_alternative is the Alternative +
                • +
                +

                Essentially the machine code reads the next char out of the string, the current +mark out of the Repetition and then performs some boolean operations on +those, writing back the new marks. Note in particular how the generated +machine code does not need to do any method calls to shift and _shift and +that most field reads out of the regular expression classes have been optimized +away, because the fields are immutable. Therefore the machine code does not +need to deconstruct the tree of regular expression objects at all, it just +knows where in memory the various parts of it are, and encodes that directly +into the code.

                +
                +
                +

                Performance Results With JIT

                +

                With the regular expression matcher translated to C and with a generated JIT, +the regular expression performance increases significantly. Our running example +can match 16'500'000 chars/s, which is more than six times faster than the +re module. This is not an entirely fair comparison, because the re +module can give more information than just "matches" or "doesn't match", but +it's still interesting to see. A more relevant comparison is that between the +program with and without a JIT: Generating a JIT speeds the matcher up by more +than 20 times.

                +
                +
                +
                +

                Conclusion

                +

                So, what have we actually won? We translated the relatively simple and very slow +regular expression matching algorithm from the last post to C and were thus able +to speed it up significantly. The real win is gained by also generating a JIT +for the matcher, which can be regarded as a simple interpreter. The resulting +matcher is rather fast.

                +

                The lesson from these posts is not that you can or should write a practical +and general regular expression module in this way – indeed, enhancing the +algorithm to support more features of the re module would be a lot of work +and it is also unclear what the speed results for more realistic regular +expressions would be. However, it makes for a great case study of the JIT +generator. It was relatively straightforward to generate a JIT for the regex +matcher, and the speed results were great (Admittedly I know rather a lot about +PyPy's JIT though). This approach is generalizable to many programs that are +sufficiently "interpreter-like" (whatever that exactly means).

                +

                All the results that appeared at various points in this blog post can be seen +here:

                + +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Implementationchars/sspeedup over pure Python
                Pure Python code12'2001
                Python re module2'500'000205
                Google's re2 implementation550'00045
                RPython implementation translated to C720'00059
                C++ implementation750'00061
                Java implementation1'920'000157
                RPython implementation with JIT16'500'0001352
                +
                +

                Sources

                +

                All the source code can be found in my Subversion user directory on Codespeak.

                +
                +
                +

                Edit:

                +

                Armin is right (see first comment). I fixed the problem.

                +
                +

                Comments

                +
                +
                +
                + + Armin Rigo wrote on 2010-06-08 13:11: +
                +
                +

                Warning: the first example is wrong: there should be no code executed between can_enter_jit() and jit_merge_point(). In this case, there is the exit condition of the loop. It needs to be rewritten as a "while True:" loop with a "break" just before can_enter_jit().

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-06-08 13:35: +
                +
                +

                @Armin: Damn, you're right. I fixed the blog post.

                +
                +
                +
                +
                + + Nelson Elhage wrote on 2010-06-08 15:36: +
                +
                +

                What happens if you don't replace and and or?

                Without those changes, the modifications for JIT really are pretty small --
                mostly just some annotations in the main loop and at toplevel for each
                class. With those changes, though, you need to potentially check the entire
                codebase of your interpreter.

                Pretty fun performance results, though.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-06-08 16:40: +
                +
                +

                @Nelson: If you don't change the "and" and "or" you get a lot of assembler code generated, and it's not particularly fast.

                Note that this "and" and "or" business is quite specific to this particular example. Usually you can work more incrementally by generating a JIT, then looking at the produced assembler and then doing some small changes in the interpreter to improve parts of it. Each such change is usually localized to one part of the interpreter and improves the performance of some language feature.

                This example is not really large enough to show this way of working, though :-). Maybe at some point I should write a walk-through for some interpreter.

                +
                +
                +
                +
                + + Kumo wrote on 2010-06-08 22:55: +
                +
                +

                Would it be possible to create a pypy or cpython extension module this way?

                +
                +
                +
                +
                + + Jared Forsyth wrote on 2010-06-09 21:27: +
                +
                +

                Could you post your 'test runner' code? I'm running some tests (with your code) and getting drastically different numbers...

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-06-10 10:51: +
                +
                +

                @jabapyth: there is no test runner code. I am simply running something like

                genrandom 20 1000000 | time regex-c

                What performance results are you getting? Are you sure that you translated jitregex.py with -Ojit? Otherwise the JIT is not put into the executable.

                +
                +
                +
                +
                + + Maxim Yegorushkin wrote on 2010-08-02 00:19: +
                +
                +

                boost::regex is not mentioned. It's got both recursive and non-recursive implementations. And it is the base of the standard C++ TR1 regex. Would be interesting to stack it up against other results because it is

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-08-02 08:52: +
                +
                +

                We can't possibly include all regex engines (even if we would like to). However, sources are out there and you can always rerun those benchmarks and see how it compares :-)

                Cheers,
                fijal

                +
                +
                +
                +
                + + Nikhil wrote on 2013-01-12 21:25: +
                +
                +

                I'm not able to access the code on codespeak.net. Has the code been moved to some other place?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-01-12 22:40: +
                +
                +

                The code has been merged to PyPy since I think. Look up cfbolz repos on bitbucket though

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/06/pypy-13-released-8546085566902489304.html b/posts/2010/06/pypy-13-released-8546085566902489304.html new file mode 100644 index 000000000..44b3aaafe --- /dev/null +++ b/posts/2010/06/pypy-13-released-8546085566902489304.html @@ -0,0 +1,358 @@ + + + + + +PyPy 1.3 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.3 released

                + + + +
                +

                Hello.

                +

                We're pleased to announce the release of PyPy 1.3. This release has two major +improvements. First of all, we stabilized the JIT compiler since the 1.2 release, +answered user issues, fixed bugs, and generally improved speed.

                +

                We're also pleased to announce alpha support for loading CPython extension +modules written in C. While the main purpose of this release is increased +stability, this feature is in alpha stage and it is not yet suited for +production environments.

                +
                +

                Highlights of this release

                +
                  +
                • +

                  We introduced support for CPython extension modules written in C. As of now, +this support is in alpha, and it's very unlikely unaltered C extensions will +work out of the box, due to missing functions or refcounting details. The +support is disabled by default, so you have to do:

                  +
                  +import cpyext
                  +
                  +

                  before trying to import any .so file. Also, libraries are source-compatible +and not binary-compatible. That means you need to recompile binaries, using +for example:

                  +
                  +pypy setup.py build
                  +
                  +

                  Details may vary, depending on your build system. Make sure you include +the above line at the beginning of setup.py or put it in your PYTHONSTARTUP.

                  +

                  This is an alpha feature. It'll likely segfault. You have been warned!

                  +
                • +
                • +

                  JIT bugfixes. A lot of bugs reported for the JIT have been fixed, and its +stability greatly improved since the 1.2 release.

                  +
                • +
                • +

                  Various small improvements have been added to the JIT code, as well as a great +speedup of compiling time.

                  +
                • +
                +
                +

                +Cheers,
                +Maciej Fijalkowski, Armin Rigo, Alex Gaynor, Amaury Forgeot d'Arc and the PyPy team +

                +

                +Update: The correct command to build extensions is "pypy setup.py build", not "python setup.py build" as it was stated before.

                +
                +

                Comments

                +
                +
                +
                + + Isaac Gouy wrote on 2010-06-27 23:18: +
                +
                +

                fyi benchmarks game

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-06-28 07:04: +
                +
                +

                Thanks. I don't think we improved in any of the areas measured by those benchmarks (even if, only by a tiny bit).

                Cheers,
                fijal

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/07/cern-sprint-report-wrapping-c-libraries-6547377950791793143.html b/posts/2010/07/cern-sprint-report-wrapping-c-libraries-6547377950791793143.html new file mode 100644 index 000000000..118266abf --- /dev/null +++ b/posts/2010/07/cern-sprint-report-wrapping-c-libraries-6547377950791793143.html @@ -0,0 +1,361 @@ + + + + + +CERN Sprint Report – Wrapping C++ Libraries | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                CERN Sprint Report – Wrapping C++ Libraries

                + + + +
                +

                The last five days we have been sprinting in a meeting room in the Computing +Center at CERN in Genève, Switzerland. Present are Armin Rigo, Antonio Cuni, +Carl Friedrich Bolz and Wim Lavrijsen (LBL). The goal of the sprint was to use +some of the C++ technology developed at CERN to make it possible to use C++ +libraries from PyPy's Python interpreter. For this we used the Reflex +library, which provides reflection information for C++ classes. We discussed +using Reflex in PyPy during the Düsseldorf sprint of 2008, please read +that blog post if you want some more details on how Reflex works. There is +support for this sort of C++/Python integration also for CPython, using the +PyROOT module.

                +

                The sprint was very successful. On Monday we had a few discussions about how +Reflex could best be integrated with PyPy. One of the goals of the sprint was to +make the approach JIT-friendly from the start, so that calls to C++ libraries +can be reasonably fast. After the discussion we started coding on the +reflex-support branch. This branch adds a new cppyy builtin module to +PyPy's Python interpreter (why we chose that name is left as an exercise to the +reader). This module can be used to load C++ classes, construct instances and +call static and instance methods on them.

                +

                The work has just started, as of now, the argument and return types of the +methods are restricted to some simple C types, such as int, double and +char* and pointers to class instances. Most of the work necessary to +properly resolve overloaded methods is done, but default arguments are not.

                +

                As an example, suppose there is a C++ class like this:

                +
                class example01 {
                +private:
                +    static int count;
                +    int somedata;
                +public:
                +
                +    example01(int a) : somedata(a) {
                +        count++;
                +    }
                +    ~example01() {
                +        count--;
                +    }
                +    static int getCount() {
                +        return count;
                +    }
                +
                +    int addDataToInt(int a) {
                +        return somedata + a;
                +    }
                +};
                +int example01::count = 0;
                +
                +

                You can now use it from PyPy's Python interpreter in the following way, after +you have used Reflex to generate reflection information for the class:

                +
                import cppyy
                +cppyy.load_lib("example01Dict.so") # contains the Reflex information
                +example01_class = cppyy.gbl.example01
                +instance = example01_class(7)
                +assert example01_class.getCount() == 1
                +res = instance.addDataToInt(4)
                +assert res == 11
                +res = instance.addDataToInt(-4)
                +assert res == 3
                +instance.destruct() # so far explicit destruction needed
                +assert example01_class.getCount() == 0
                +
                +

                We also did some very early JIT work and some early performance measurements. +The rough figures are that cppyy is two times faster at calling a simple C++ +method from Python than PyROOT. To get a feeling for how fast things could +go in the end, we also implemented a proof-of-concept for some more advanced JIT +technology (which requires a patch for Reflex and uses a GCC extension). With +this, the speedup over PyROOT is a factor of 20. Of course, this is still a +lot slower than a C++ to C++ method call (probably by at least an order of +magnitude).

                +

                The sprint was very productive because we managed to get the right people into +the same room working together. Wim has a lot of experience with C++ and Reflex, +and is the author of PyROOT, and of course the others know a lot about PyPy +(at the end of the sprint, Anto was very glad that he stopped using C++ a long +time ago). Also, working at CERN was very cool. The atmosphere is amazing, and +we got to visit the ATLAS control room. Extremely advanced technology, and +also research on a completely different scale than what we are used to.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/07/comparing-spur-to-pypy-8835011873209414462.html b/posts/2010/07/comparing-spur-to-pypy-8835011873209414462.html new file mode 100644 index 000000000..7372da6e1 --- /dev/null +++ b/posts/2010/07/comparing-spur-to-pypy-8835011873209414462.html @@ -0,0 +1,488 @@ + + + + + +Comparing SPUR to PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Comparing SPUR to PyPy

                + + + +
                +

                Recently, I've become aware of the SPUR project of Microsoft Research and +read some of their papers (the tech report "SPUR: A Trace-Based JIT Compiler +for CIL" is very cool). I found the project to be very interesting and since +their approach is in many ways related to what PyPy is doing, I now want to +compare and contrast the two projects.

                +
                +

                A Tracing JIT for .NET

                +

                SPUR consists of two parts: On the one hand it is a VM for CIL, the +bytecode of the .NET VM. This VM uses a tracing JIT compiler to compile the +programs it is running to machine code. As opposed to most existing VMs that +have a tracing JIT it does not use an interpreter at all. Instead it +contains various variants of a JIT compiler that produce different versions of +each method. Those are:

                +
                  +
                • a profiling JIT which produces code that does lightweight profiling when +running the compiled method
                • +
                • a tracing JIT which produces code that produces a trace when running the +compiled method
                • +
                • a transfer-tail JIT which is used to produce code which is run to get from a +failing guard back to the normal profiling version of a method
                • +
                • an optimizing JIT that actually optimizes traces and turns them into machine code
                • +
                +
                +

                Optimizations Done by the Optimizing JIT

                +

                SPUR's optimizing JIT does a number of powerful optimizations on the traces before it +turns them into machine code. Among them are usual compiler optimizations such +as register allocation, common subexpression elimination, loop invariant code +motion, etc.

                +

                It also performs some optimizations that are specific to the tracing context and +are thus not commonly found in "normal" compilers:

                +
                  +
                • +guard implication: if a guard is implied by an earlier guard, it is removed
                • +
                • +guard strengthening: if there is a sequence of guards that become stronger +and stronger (i.e. each guard implies the previous one), the first guard in +the sequence is replaced by the last one, and all others are removed. This can +greatly reduce the number of guards and is generally safe. It can shift a +guard failure to an earlier point in the trace, but the failure would have +occurred at some point in the trace anyway.
                • +
                • +load/store optimizations: this is an optimization for memory reads/writes. +If several loads from the same memory location occur without writes in +between, all but the first one are removed. Similarly, if a write to a memory +location is performed, this write is delayed as much as possible. If there is +a write to the same location soon afterwards, the first write can be removed.
                • +
                • +escape analysis: for allocations that occur in a loop, the optimizer checks +whether the resulting object escapes the loop. If not, the allocation is moved +before the loop, so that only one object needs to be allocated, instead of one +every loop iteration.
                • +
                • +user-controlled loop unrolling: not exactly an optimization, but an +interesting feature anyway. It is possible to annotate a CIL method with a +special decorator [TraceUnfold] and then the tracing JIT will fully unroll +the loops it contains. This can be useful for loops that are known to run a +small and fixed number of iterations for each call-site.
                • +
                • +user controlled tracing: The user can also control tracing up to a point. +Methods can be annotated with [NativeCall] to tell the tracer to never +trace their execution. Instead they appear as a direct call in the trace.
                • +
                +
                +
                +
                +

                A JavaScript Implementation

                +

                In addition to the tracing JIT I just described, SPUR also contains a JavaScript +implementation for .NET. The approach of this implementation is to translate +JavaScript to CIL bytecode, doing some amount of type inference to detect +variables that have fixed types. All operations where no precise type could be +determined are implemented with calls to a JavaScript runtime system, which does +the necessary type dispatching. The JavaScript runtime is implemented in C#.

                +

                The JavaScript implementation and the CLI VM with a tracing JIT sound quite +unrelated at first, but together they amplify each other. The tracing JIT traces +the JavaScript functions that have been translated to CLI bytecode. Since the +JavaScript runtime is in C#, it exists as CLI bytecode too. Thus it can be +inlined into the JavaScript functions by the tracer. This is highly beneficial, +since it exposes the runtime type dispatching of the JavaScript operations to +the optimizations of the tracing JIT. Particularly the common subexpression +elimination helps the JavaScript code. If a series of operations is performed on +the same object, the operations will all do the same type checks. All but the +type checks of the first operation can be removed by the optimizer.

                +
                +

                Performance Results

                +

                The speed results of the combined JavaScript implementation and tracing JIT are +quite impressive. It beats TraceMonkey for most benchmarks in SunSpider (apart +from some string-heavy benchmarks that are quite slow) and can compete with V8 +in many of them. However, all this is steady-state performance and it seems +SPUR's compile time is rather bad currently.

                +
                +
                +

                Further Possibilities

                +

                A further (so far still hypothetical) advantage of SPUR is that the approach can +optimize cases where execution crosses the border of two different systems. If +somebody wrote an HTML layout engine and a DOM in C# to get a web browser and +integrated it with the JavaScript implementation described above, the tracing +JIT could optimize DOM manipulations performed by JavaScript code as well as +callbacks from the browser into JavaScript code.

                +

                Of course the approach SPUR takes to implement JavaScript is completely +generalizable. It should be possible to implement other dynamic languages in the +same way as JavaScript using SPUR. One would have to write a runtime system for +the language in C#, as well as a compiler from the language into CIL bytecode. +Given these two elements, SPUR's tracing JIT compiler would probably do a +reasonable job at optimizing this other language (of course in practise, the +language implementation would need some tweaking and annotations to make it +really fast).

                +
                +
                +
                +

                Comparison With PyPy

                +

                The goals of PyPy and SPUR are very similar. Both projects want to implement +dynamic languages in an efficient way by using a tracing JIT. Both apply the +tracing JIT "one level down", i.e. the runtime system of the dynamic language is +visible to the tracing JIT. This is the crucial point of the approach of both +projects. Since the runtime system of the dynamic language is visible to the +tracing JIT, the JIT can optimize programs in that dynamic language. It does not +itself need to know about the semantics of the dynamic language. This makes the +tracing JIT usable for a variety of dynamic languages. It also means that the +two halves can be implemented and debugged independently.

                +

                In SPUR, C# (or another language that is compilable to CIL) plays the role of +RPython, and CIL is equivalent to the intermediate format that PyPy's +translation toolchain uses. Both formats operate on a similar abstraction level, +they are quite close to C, but still have support for the object system of their +respective language and are garbage-collected.

                +

                SPUR supports only a JavaScript implementation so far, which could maybe change in +the future. Thus JavaScript in SPUR corresponds to Python in PyPy, which was the +first dynamic language implemented in PyPy (and is also the reason for PyPy's +existence).

                +

                There are obviously also differences between the two projects, although many of +them are only skin-deep. The largest difference is the reliance of SPUR on +compilers on all levels. PyPy takes the opposite approach of using interpreters +almost everywhere. The parts of PyPy that correspond to SPUR's compilers are (I +will use the Python implementation of PyPy as an example):

                +
                  +
                • the JavaScript-to-CIL compiler corresponds to the Python interpreter of PyPy
                • +
                • the profiling JIT corresponds to a part of PyPy's translation toolchain +which adds some profiling support in the process of turning RPython code into +C code,
                • +
                • the tracing JIT corresponds to a special interpreter in the PyPy JIT which +executes an RPython program and produces a trace of the execution
                • +
                • the transfer-tail JIT corresponds to PyPy's blackhole interpreter, also +called fallback interpreter
                • +
                • the optimizing JIT corresponds to the optimizers and backends of PyPy's JIT
                • +
                +
                +

                PyPy's Optimizations

                +

                Comparing the optimizations that the two projects perform, the biggest +difference is that PyPy does "trace stitching" instead of fully supporting trace +trees. The difference between the two concerns what happens when a new trace +gets added to an existing loop. The new trace starts from a guard in the +existing loop that was observed to fail often. Trace stitching means that the +loop is just patched with a jump to the new trace. SPUR instead recompiles the +whole trace tree, which gives the optimizers more opportunities, but also makes +compilation a lot slower. Another difference is that PyPy does not perform +loop-invariant code motion yet.

                +

                Many of the remaining optimizations are very similar. PyPy supports guard +implication as well as guard strengthening. It has some load/store +optimizations, but PyPy's alias analysis is quite rudimentary. On the other +hand, PyPy's escape analysis is very powerful. PyPy also has support for the +annotations that SPUR supports, using some decorators in the pypy.rlib.jit +module. User-controlled loop unrolling is performed using the unroll_safe +decorator, tracing of a function can be disabled with the dont_look_inside +decorator.

                +

                PyPy has a few more annotations that were not mentioned in the SPUR tech report. +Most importantly, it is possible to declare a function as pure, using the +purefunction decorator. PyPy's optimizers will remove calls to a function +decorated that way if the arguments to the call are all constant. In addition it +is possible to declare instances of classes to be immutable, which means that +field accesses on constant instances can be folded away. Furthermore there is +the promote hint, which is spelled x = hint(x, promote=True). This will +produce a guard in the trace, to turn x into a constant after the guard.

                +
                +
                +
                +

                Summary

                +

                Given the similarity between the projects' goals, it is perhaps not so +surprising to see that PyPy and SPUR have co-evolved and reached many similar +design decisions. It is still very good to see another project that does many +things in the same way as PyPy.

                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2010-07-04 09:27: +
                +
                +

                Besides being similar projects, is it possible to cross the streams? Could PyPy's CLI backend take the place of the JavaScript-to-CIL compiler (or is that the wrong parallel)?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-07-04 21:55: +
                +
                +

                @Anonymous: I guess you could program stuff in RPython, compile to CIL and get it jitted by SPUR. However, this wouldn't work well for our main RPython programs, which are all interpreters. Using a tracing JIT on an interpreter without doing anything special is not helping much.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/07/europython-2010-report-7803731360759120212.html b/posts/2010/07/europython-2010-report-7803731360759120212.html new file mode 100644 index 000000000..826a566a5 --- /dev/null +++ b/posts/2010/07/europython-2010-report-7803731360759120212.html @@ -0,0 +1,353 @@ + + + + + +EuroPython 2010 report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                EuroPython 2010 report

                + + + +
                +

                So, EuroPython 2010 is over, I am flying home and it's time to write a report +about the conference from the PyPy point of view.

                +

                As usual, the conference was very interesting and went very well. The quality +of the talks I attended was high on average and most importantly I could +meet a lot of interesting people to discuss various things.

                +

                On the first day, Armin, Amaury and I presented the usual PyPy status talk +(here are the slides): +the talk is an extended version of the one that I and Armin presented at +Pycon Italia in May and is divided into three parts: first I talked about the +current status of the project, what is the content of the recent 1.2 and 1.3 +releases and showed a demo of a simple Django application that renders a +Mandelbrot fractal and is measurably faster on PyPy than on CPython. In the +second part of the talk, Armin gave an introduction about the ideas that stand +behind the JIT. Finally, in the third part Amaury explained how the new +cpyext module lets PyPy compile and load existing CPython extensions +written in C.

                +

                I think that the talk was well received: the only drawback is that there was +no time to answer questions at the end of the presentation. However, we +received a lot of "offline" questions after the talk finished and throughout the +whole conference: it is always great to see that people are interested in our +work, and I'd like to thank everybody for the feedback that they gave to us.

                +

                PyPy was also mentioned in Mark Shannon's interesting talk, where he +compared the optimization techniques used by PyPy, Unladen Swallow and +HotPy, which is Mark's own PhD project. Moreover, Henrik Vendelbo +gave a talk about how to tweak PyPy to produce a standalone +executable which embeds a whole python application to make deployment easier, +while Andrew Francis explained his implementation of the Go select +statement based on the stackless.py module implemented in PyPy. Personally, +I am glad to see that people start to think of PyPy as a useful starting +point to experiment with new features and use cases that we did not think +about: after all, one of PyPy's explicit goals is to be "flexible and easy to +experiment with".

                +

                After the conference there were the usual post EuroPython sprints: this +year we had not planned a PyPy sprint, but some people showed interest +in it and since Armin and I happened to be still around the day after the +conference, we decided to do a mini 1-day sprint, with 6 or 7 people +present. Since there were only two core developers it was impossible to use +our usual pairing scheme, in which every newcomer pairs with someone who is +experienced with the source code to gain knowledge of it. However, I think it +was still a successful day of work, and we managed to fix a couple of bugs +that were standing in our issue tracker. Again, I'd like to thank all the +people that came and worked with us during the sprint.

                +

                In conclusion I really enjoyed the EuroPython 2010 experience: the fact that I +managed to find a place in Birmingham where to eat a good Italian-style "gelato" +helped a lot :-).

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2010-07-26 12:22: +
                +
                +

                Just awesome.

                +
                +
                +
                +
                + + Paul Boddie wrote on 2010-08-09 23:03: +
                +
                +

                Finding gelato hopefully won't be a problem at next year's EuroPython. ;-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/07/play-on-regular-expression-9014941705636345998.html b/posts/2010/07/play-on-regular-expression-9014941705636345998.html new file mode 100644 index 000000000..7278fd43f --- /dev/null +++ b/posts/2010/07/play-on-regular-expression-9014941705636345998.html @@ -0,0 +1,286 @@ + + + + + +A Play on Regular Expression | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                A Play on Regular Expression

                + + + +
                +

                The paper where the algorithms we described in the recent blog posts come from is now available. It is written as a play in three Acts with a cast of three and is very readable and funny. The Haskell code is at Sebastian Fischer's github pages.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/08/call-for-benchmarks-2605012131351543912.html b/posts/2010/08/call-for-benchmarks-2605012131351543912.html new file mode 100644 index 000000000..94e9fd399 --- /dev/null +++ b/posts/2010/08/call-for-benchmarks-2605012131351543912.html @@ -0,0 +1,537 @@ + + + + + +Call for Benchmarks | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Call for Benchmarks

                + + + +
                +

                As you know, a lot of PyPy's recent development effort has gone into speeding up +execution of Python programs. However, an additional good property of PyPy's +Python interpreter is that most objects are represented in a much more compact +way than in CPython. We would like to investigate some more advanced techniques +to reduce the memory usage of Python programs further.

                +

                To do this it is necessary to investigate the memory behaviour of real programs +with large heaps. For speed measurements there are standard benchmarks, but for +memory improvements there is nothing comparable: the memory behaviour of large +programs is not that well understood. Therefore we are looking for programs that we +can study and use as benchmarks.

                +

                Specifically we are looking for Python programs with the following properties:

                +
                  +
                • large heaps of about 10MB-1GB
                • +
                • should have non-trivial runtime as well (in the range of a few seconds), to +judge the speed impact of optimizations
                • +
                • ideally pure-Python programs that don't use extension modules so that they run +under both CPython and PyPy (this is optional, but makes my life much easier).
                • +
                +

                We are also rather interested in programs that do a lot of string/unicode +processing.

                +

                We would be grateful for all ideas. Telling us about a program also has the +advantage that we will work on optimizing PyPy for it :-).

                +
                +

                Comments

                +
                +
                +
                + + lasi wrote on 2010-08-17 12:15: +
                +
                +

                I'm not think very much about it. But Zodb, durus or dobbin could be useful.

                +
                +
                +
                +
                + + Zeev wrote on 2010-08-17 12:26: +
                +
                +

                portage, the official Gentoo Linux package manager, does package dependency resolution and can take a few seconds for large updates. It parses package metadata from text files.

                +
                +
                +
                +
                + + Peter Goodman wrote on 2010-08-17 12:31: +
                +
                +

                You could run a program that determinizes a large NFA. Given an existing Python program that can determinize an NFA, you could give it an expanded version of the NFA on page 15 here: https://www.springerlink.com/content/cq16j1uv511g793g/fulltext.pdf. Another way is to take some complex NFAs, concatenate them, and determinize.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-08-17 13:09: +
                +
                +

                Bazaar and mercurial take a lot of memory (time as well) when updating/merging etc. large repositories, especially if they contain large files.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-08-17 13:23: +
                +
                +

                Pylint (https://www.logilab.org/project/pylint) could be a nice target. Pure Python, the size of the heap and run time depend on what kind of code you throw at it.

                +
                +
                +
                +
                + + VanL wrote on 2010-08-17 14:15: +
                +
                +

                You could try loading and manipulating a large graph with NetworkX. Pure Python, and the size and runtime could be tuned by varying the size of the graph and the algorithms that are run.

                +
                +
                +
                +
                + + Unknown wrote on 2010-08-17 14:51: +
                +
                + Whoosh comes to mind. People will always be grateful if you speed up search for them :) +
                +
                +
                +
                + + Anonymous wrote on 2010-08-17 15:15: +
                +
                +

                The CDPedia creates and manipulates its index with a pure-python inverted index implementation.

                It could be extracted and made into a benchmark - there are other pure-python inverted indices around, those could also work.

                They do tend to use lots and lots of memory, the CDPedia's implementation uses the builtin array module for byte sequence manipulation and bare strings as data store (it's highly optimized for lowering CPython's memory usage), but there are a few dict-heavy places yet.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-08-17 16:32: +
                +
                +

                Agreed that Bazaar and Mercurial would be interesting use cases, especially for projects with large revision history graphs.

                Memory usage analysis has come up recently on the bzr list:
                https://lists.ubuntu.com/archives/bazaar/2010q3/069549.html

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-08-17 16:33: +
                +
                +

                All great ideas, thanks a lot!

                +
                +
                +
                +
                + + Anonymous wrote on 2010-08-17 17:42: +
                +
                +

                Python Natural Language Toolkit
                https://www.nltk.org/

                Give a huge corpus (Wikipedia?) and do any operation on it -- nltk will take huge loads of memory in all kinds of custom objects, lists and tuples.

                +
                +
                +
                +
                + + Pingveno wrote on 2010-08-17 21:25: +
                +
                +

                From what I understand, PyExcelerator, a writer/reader for Excel files, takes huge amounts of memory for very large files. It uses pure Python objects for each cell, which kills memory use when you're writing many millions of cells.

                +
                +
                +
                +
                + + Dan Stromberg wrote on 2010-08-17 22:53: +
                +
                +

                A couple of possibilities from my own OSS code:

                https://stromberg.dnsalias.org/~strombrg/treap

                https://stromberg.dnsalias.org/~strombrg/pyindex.html



                I'd most likely be happy to relicense the treap code as needed to facilitate inclusion. The pyindex code is under a UCI (I believe it's BSDish) license, and would probably need to remain so.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-08-18 15:18: +
                +
                +

                I really didn't think about it much, I'm just trying to chew through my RSS backlog, and ran into a post about pkgcore dealing with memory issues just a few minutes after I read this call for benchmarks.

                Maybe you could use that.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-08-18 23:45: +
                +
                +

                You might want to lok at MiniLight:

                https://www.hxa.name/minilight/#comparison

                +
                +
                +
                +
                + + none wrote on 2010-08-20 13:40: +
                +
                +

                I'm the author of a scientific application that can be suited to your needs. It runs both with Python 2.x and PyPy, so I bundled a distribution with some example benchmarks if this interests you: https://dl.dropbox.com/u/7931953/pypy-bench.tar.bz2 (see bench.README)

                An interesting observation in my opinion is that on small runs, CPython outperforms PyPy but this progressively reverses on longer runs.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-08-20 14:44: +
                +
                +

                @all: thanks for the proposals, I am looking at them.

                @Franck: This is probably due to the JIT, which needs some time to compile at the beginning. Later, the assembler exists and executes quickly. Will look at your code, thanks for providing it.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-08-20 18:58: +
                +
                +

                Hello, i am the author of an chess program being written entirely in python. I haven't published it jet, because i am a bit ashame of its poor quality. However it should suffice for the sole purpose of benchmarking. Please drop me a note if you are interested. My email adress is: larudwer at freenet dot de

                Some Notes:
                The Program is just console mode (UCI), no gui.

                it eats up all the memory you have

                cpython is almost twice as fast as pypy-1.3 on this program and psyco accelerates it by another factor of two.

                +
                +
                +
                +
                + + Unknown wrote on 2010-08-21 17:33: +
                +
                +

                You could consider Tahoe-LAFS. A good reason to use it is that it is a practicality-oriented, widely deployed tool with significant memory usage that we routinely spend engineering effort to track and manage.

                Here are some graphs of the memory usage of different versions of Tahoe-LAFS over time:

                32-bit machine:
                https://tahoe-lafs.org/tahoe-figleaf-graph/hanford.allmydata.com-tahoe_memstats.html

                64-bit machine:
                https://tahoe-lafs.org/tahoe-figleaf-graph/hanford.allmydata.com-tahoe_memstats_64.html

                Here are some open tickets about memory usage in our issue tracker:

                https://tahoe-lafs.org/trac/tahoe-lafs/query?status=!closed&keywords=~memory&order=priority

                The reason not to use Tahoe-LAFS as a subject is that it uses several native-code libraries to for the CPU-intensive inner loops (cryptography, erasure coding). I really want those libraries, and hence Tahoe-LAFS, to be usable with cpyext as soon as possible, but I haven't tried and I assume that cpyext isn't 100% there yet.

                By the way the easiest way to measure the performance of Tahoe-LAFS would be to run its unit tests and measure the memory usage and runtime. This is not only the easiest way, but it is also a pressing issue for us! Tahoe-LAFS unit tests take too long to run, and this causes problems for us, and we very much like it if they could run to completion faster.

                https://tahoe-lafs.org/trac/tahoe-lafs/ticket/20# unit tests take too long

                Here are our buildbots showing unit test runtime among other things:

                https://tahoe-lafs.org/buildbot/waterfall?show_events=true&builder=Kyle+OpenBSD-4.6+amd64&builder=hardy-amd64&builder=Arthur+lenny+c7+32bit&builder=Eugen+lenny-amd64&builder=David+A.+OpenSolaris+i386&builder=Ruben+Fedora&builder=Zooko+zomp+Mac-amd64+10.6+py2.6&builder=FreeStorm+WinXP-x86+py2.6&builder=tarballs

                +
                +
                +
                +
                + + Adam Sampson wrote on 2010-08-22 16:22: +
                +
                +

                rawdog (disclosure of bias: I wrote it) sounds like it might be of use. It's an RSS aggregator that generates static HTML. Pure Python 2, with lots of string processing, mostly in the feedparser module. Memory usage and runtime depends on how many feeds it's reading and how much history it keeps, since it does everything in memory at the moment, using pickle for persistant state. (With my 800-odd feeds and two-month history, writing the entire store to HTML will use a few hundred meg of memory and run for several minutes.)

                A future redesign will use a more sensible database-backed approach...

                +
                +
                +
                +
                + + Bob Ziuchkovski wrote on 2010-08-24 00:10: +
                +
                +

                Scapy would be a great one to benchmark. Depending on the size of the packet capture, it can consume quite a bit of proc/mem when loading and dissecting large captures. I run it at work on Cpython and would love to see it running/optimized under pypy. The only problem is that I believe it uses some 2.6 pythonisms.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-08-27 19:03: +
                +
                +

                Thanks again for all the additional pointers. Still investigating all of them.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-09-05 05:01: +
                +
                +

                How about Nucular, a search engine written in python by aaron watters.

                https://nucular.sourceforge.net/

                +
                +
                +
                +
                + + Anonymous wrote on 2010-09-10 20:27: +
                +
                +

                In my view, the natural competitors to PyPy (in the domain of fast interpreters for dynamic languages) are Tracemonkey and V8. Therefore, translations of the Sunspider, V8, and Dromaeo benchmarks would be appropriate.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-09-17 14:58: +
                +
                +

                bitbake looks like a good candidate. It's a derivative of portage, and used to crosscompile linux distro for embedded device.

                With non trivial distro, it can use up to 400Mb. It already use psyco if available, and can be interesting compare speed/memory usage with pypy.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/08/europython-2010-videos-available-8446190660370796142.html b/posts/2010/08/europython-2010-videos-available-8446190660370796142.html new file mode 100644 index 000000000..73d307f87 --- /dev/null +++ b/posts/2010/08/europython-2010-videos-available-8446190660370796142.html @@ -0,0 +1,335 @@ + + + + + +EuroPython 2010 Videos available | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                EuroPython 2010 Videos available

                + + + +
                +

                Hi all,

                +

                the videos of the talks from EuroPython 2010 are now available on +blip.tv: in particular, there are the three videos of the PyPy talk.

                +

                Part 1: What's new in PyPy 1.2 and 1.3 (by Antonio Cuni)

                +

                Part 2: Just in Time compilation (by Armin Rigo)

                +

                Part 3: cpyext (by Amaury Forgeot d'Arc)

                +

                Moreover, here is Mark Shannon's talk which compares HotPy, Unladen Swallow +and PyPy:

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2010-08-17 18:36: +
                +
                +

                Can you post the links to the blip.tv pages, so I can go there and download the videos?

                The blip.tv viewer applet has no such link, and even digging the source isn't helpful (it seems that they use different identifiers for embed applets than for the same videos on their own website). Grrr!

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2010-08-17 20:07: +
                +
                +

                Sure, here are the links:
                Part 1: https://europythonvideos.blip.tv/file/3981017/

                Part 2: https://europythonvideos.blip.tv/file/3981028/

                Part 3: https://europythonvideos.blip.tv/file/4000720/

                HotPy: https://europythonvideos.blip.tv/file/3980963/

                +
                +
                +
                +
                + + Lucian wrote on 2010-08-18 01:10: +
                +
                +

                @Anonymous

                You can also click the title of the video in the Blip.tv embedded player.

                +
                +
                +
                +
                + + horace wrote on 2010-08-19 18:51: +
                +
                +

                if i remember correctly, a while ago you were looking for alternative names for pypy.

                i just came across the wikipeda article for the "black mamba" which states that it is the fastest snake of the world.

                how about the name "black mamba"? :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/08/pyohio-2568618480482575546.html b/posts/2010/08/pyohio-2568618480482575546.html new file mode 100644 index 000000000..f7c9f248f --- /dev/null +++ b/posts/2010/08/pyohio-2568618480482575546.html @@ -0,0 +1,353 @@ + + + + + +PyOhio | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyOhio

                + + + +
                +

                This weekend I delivered a talk at PyOhio (an annual conference in Columbus, OH, USA) on PyPy and Unladen Swallow. The talk covered reasons that Python, the language, is hard to optimize, why CPython is slow, and a few optimizations that PyPy and Unladen Swallow have implemented. The slides from my talk are online, and the talk was recorded so a video will follow. I gave a similar talk to ChiPy (the Chicago Python user group), which was also recorded and the video is available. Both audiences were excited about the futures for PyPy and Unladen Swallow, and for the future of a faster Python.

                +

                Alex

                +
                +

                Comments

                +
                +
                +
                + + tucuman87 wrote on 2010-08-05 13:55: +
                +
                +

                I do not understand why is python so hard to optimize- after all, LuaJIT is VERY fast, and I thought Lua has the same dynamical features as python. I'm no Python nor Lua expert, but it would be nice knowing...

                Thanks!

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-08-05 13:57: +
                +
                +

                Did you actually watch the video? it's explained there.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-08-05 19:27: +
                +
                +

                Any chance of putting the slides somewhere that can be directed downloaded?

                +
                +
                +
                +
                + + Alex wrote on 2010-08-05 19:33: +
                +
                +

                There's a link to download the slides on the right hand side. This link: https://www.scribd.com/document_downloads/direct/35240506?extension=pdf&ft=1281033139&lt=1281036749&uahk=mAWsHOEi/etYRUUXWst+oYKiWIU
                should also work.

                +
                +
                +
                +
                + + tucuman87 wrote on 2010-08-05 20:42: +
                +
                +

                I've seen the video, and the question I asked is not answered: what dynamic feature Python has that Lua doesn't?

                is such a specific feature responsible for the fast LuaJIT?

                Thanks.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-08-05 20:47: +
                +
                +

                The main meta-reason is that Python is a very complex language compared to lua, so you have to take into account a lot of things that you don't care about in lua. one example is new style classes with insane semantics about descriptors.

                +
                +
                +
                +
                + + Luis wrote on 2010-08-06 04:16: +
                +
                +

                The author of Luajit, Mike Pall, participated in a long thread posted here https://lambda-the-ultimate.org/node/3851#comment-57700 , as well as Maciej and others.

                There, he said about python:
                "Python (the core language) just has different quirks that need to be worked around. But no show stoppers.
                What is more challenging, is to efficiently handle Python (the programming environment) with all of its classes and methods. In particular the plethora of container types, since you really want to inline their accessors.
                Since I don't believe in multi-language VMs, LuaJIT is pretty Lua-specific on all levels. Your best bet right now is to join the PyPy effort (they already handle the issues I mentioned)."

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/08/using-virtualenv-with-pypy-7238942727709530503.html b/posts/2010/08/using-virtualenv-with-pypy-7238942727709530503.html new file mode 100644 index 000000000..2cd7430fa --- /dev/null +++ b/posts/2010/08/using-virtualenv-with-pypy-7238942727709530503.html @@ -0,0 +1,423 @@ + + + + + +Using virtualenv with PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Using virtualenv with PyPy

                + + + +
                +

                Thanks to the work that was recently done on the sys-prefix branch, it is +now possible to use virtualenv with PyPy.

                +

                To try it, you need:

                +
                +
                  +
                • a recent version of PyPy: PyPy 1.3 does not contain the necessary logic to +work with virtualenv, so you need a more recent PyPy from subversion +trunk. You can either build it by yourself or download one of our +precompiled nightly builds +
                • +
                • a copy of virtualenv-pypy: this is a fork of virtualenv that contains +all the patches needed to work with PyPy, and hopefully will be merged +back at some point. It should be totally compatible with the official +version of virtualenv, so it is safe to use it even to create non-PyPy +environments. If you notice some weird behavior that does not happen with +the standard virtualenv, please let us know.
                • +
                +
                +

                The directory layout has been redesigned in a way that it is possible to use +virtualenv to install a PyPy either from a precompiled tarball or from an svn +checkout:

                +
                +# from a tarball
                +$ virtualenv -p /opt/pypy-c-jit-76426-linux/bin/pypy my-pypy-env
                +
                +# from the svn checkout
                +$ virtualenv -p /path/to/pypy-trunk/pypy/translator/goal/pypy-c my-pypy-env
                +
                +

                Once the environment has been created, you can enter it as usual. Note that +bin/python is now a symlink to bin/pypy.

                +

                Enjoy it :-)

                +
                +

                Comments

                +
                +
                +
                + + René Dudfield wrote on 2010-08-02 17:13: +
                +
                +

                Another great step for pypy being used in more productions.

                +
                +
                +
                +
                + + Konrad wrote on 2010-08-03 17:50: +
                +
                +

                Good job!

                +
                +
                +
                +
                + + Alexei Boronine wrote on 2010-08-05 16:22: +
                +
                +

                I recently made a script called pypyenv for easily installing PyPy in a virtualenv side by side with CPython, sharing site-packages. It will allow one to experiment with PyPy in a working virtualenv without breaking current code.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2010-08-05 19:16: +
                +
                +

                @Alex: nice. pypyenv is obviously something different that virtualenv-pypy, but it might be useful if someone wants to try PyPy.

                However, I don't think that sharing site-packages is a good idea: it works as long as you have only pure python packages, but it stops as soon as you build some C extension, as the .so produced by PyPy are incompatible with CPython

                +
                +
                +
                +
                + + Alexei Boronine wrote on 2010-08-06 00:58: +
                +
                +

                @Antonio

                Interesting point, I'll mention it in the README. (by the way, thank you for your work, PyPy rocks my socks!)

                +
                +
                +
                +
                + + Xiong Chiamiov wrote on 2010-08-11 23:31: +
                +
                +

                A bit unrelated, but would it be possible to have nightly releases with consistent filenames? Right now, they all include the svn revision number (I assume that's what it is), which makes it difficult to write a script that downloads and installs the latest version.

                Specifically, I'm looking to create an Arch pkgbuild, because it takes too damn long to compile on my notebook, and I don't want to use the stable release.

                +
                +
                +
                +
                + + jezdez wrote on 2010-09-03 15:10: +
                +
                +

                FYI, the fixes have been merged in virtualenv's main repo.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-12-14 10:28: +
                +
                +

                it seems there's no pypy/translator/goal/pypy-c anymore?
                how to init virtualenv from pypy source now?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-12-14 10:31: +
                +
                +

                It's not in a checkout you have to compile it first.

                +
                +
                +
                +
                + + Asmo wrote on 2012-04-20 12:47: +
                +
                +

                Is the information here still valid? Or should virtualenv work fine with pypy?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-04-20 12:50: +
                +
                +

                Virtualenv should work just fine

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/09/escape-analysis-in-pypys-jit-1780048403046080197.html b/posts/2010/09/escape-analysis-in-pypys-jit-1780048403046080197.html new file mode 100644 index 000000000..0cff0620e --- /dev/null +++ b/posts/2010/09/escape-analysis-in-pypys-jit-1780048403046080197.html @@ -0,0 +1,581 @@ + + + + + +Escape Analysis in PyPy's JIT | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Escape Analysis in PyPy's JIT

                + + + +
                +

                The goal of a just-in-time compiler for a dynamic language is obviously to +improve the speed of the language over an implementation of the language that +uses interpretation. The first goal of a JIT is thus to remove the +interpretation overhead, i.e. the overhead of bytecode (or AST) dispatch and the +overhead of the interpreter's data structures, such as operand stack etc. The +second important problem that any JIT for a dynamic language needs to solve is +how to deal with the overhead of boxing of primitive types and of type +dispatching. Those are problems that are usually not present in statically typed +languages.

                +

                Boxing of primitive types means that dynamic languages need to be able to handle +all objects, even integers, floats, etc. in the same way as user-defined +instances. Thus those primitive types are usually boxed, i.e. a small +heap-structure is allocated for them, that contains the actual value.

                +

                Type dispatching is the process of finding the concrete implementation that is +applicable to the objects at hand when doing a generic operation. An +example would be the addition of two objects: The addition needs to check what +the concrete objects that should be added are, and choose the implementation +that is fitting for them.

                +

                Last year, we wrote a blog post and a paper about how PyPy's meta-JIT +approach works. These explain how the meta-tracing JIT can remove the overhead +of bytecode dispatch. In this post (and probably a followup) we want to explain +how the traces that are produced by our meta-tracing JIT are then optimized to +also remove some of the overhead more closely associated to dynamic languages, +such as boxing overhead and type dispatching. The most important technique to +achieve this is a form of escape analysis that we call virtual objects. +This is best explained via an example.

                +
                +

                Running Example

                +

                For the purpose of this blog post, we are going to use a very simple object +model, that just supports an integer and a float type. The objects support only +two operations, add, which adds two objects (promoting ints to floats in a +mixed addition) and is_positive, which returns whether the number is greater +than zero. The implementation of add uses classical Smalltalk-like +double-dispatching. These classes could be part of the implementation of a very +simple interpreter written in RPython.

                +
                class Base(object):
                +    def add(self, other):
                +        """ add self to other """
                +        raise NotImplementedError("abstract base")
                +    def add__int(self, intother):
                +        """ add intother to self, where intother is a Python integer """
                +        raise NotImplementedError("abstract base")
                +    def add__float(self, floatother):
                +        """ add floatother to self, where floatother is a Python float """
                +        raise NotImplementedError("abstract base")
                +    def is_positive(self):
                +        """ returns whether self is positive """
                +        raise NotImplementedError("abstract base")
                +
                +class BoxedInteger(Base):
                +    def __init__(self, intval):
                +        self.intval = intval
                +    def add(self, other):
                +        return other.add__int(self.intval)
                +    def add__int(self, intother):
                +        return BoxedInteger(intother + self.intval)
                +    def add__float(self, floatother):
                +        return BoxedFloat(floatother + float(self.intval))
                +    def is_positive(self):
                +        return self.intval > 0
                +
                +class BoxedFloat(Base):
                +    def __init__(self, floatval):
                +        self.floatval = floatval
                +    def add(self, other):
                +        return other.add__float(self.floatval)
                +    def add__int(self, intother):
                +        return BoxedFloat(float(intother) + self.floatval)
                +    def add__float(self, floatother):
                +        return BoxedFloat(floatother + self.floatval)
                +    def is_positive(self):
                +        return self.floatval > 0.0
                +
                +

                Using these classes to implement arithmetic shows the basic problem that a +dynamic language implementation has. All the numbers are instances of either +BoxedInteger or BoxedFloat, thus they consume space on the heap. Performing many +arithmetic operations produces lots of garbage quickly, thus putting pressure on +the garbage collector. Using double dispatching to implement the numeric tower +needs two method calls per arithmetic operation, which is costly due to the +method dispatch.

                +

                To understand the problems more directly, let us consider a simple function that +uses the object model:

                +
                def f(y):
                +    res = BoxedInteger(0)
                +    while y.is_positive():
                +        res = res.add(y).add(BoxedInteger(-100))
                +        y = y.add(BoxedInteger(-1))
                +    return res
                +
                +

                The loop iterates y times, and computes something in the process. To +understand the reason why executing this function is slow, here is the trace +that is produced by the tracing JIT when executing the function with y +being a BoxedInteger:

                +
                +# arguments to the trace: p0, p1
                +# inside f: res.add(y)
                +guard_class(p1, BoxedInteger)
                +    # inside BoxedInteger.add
                +    i2 = getfield_gc(p1, intval)
                +    guard_class(p0, BoxedInteger)
                +        # inside BoxedInteger.add__int
                +        i3 = getfield_gc(p0, intval)
                +        i4 = int_add(i2, i3)
                +        p5 = new(BoxedInteger)
                +            # inside BoxedInteger.__init__
                +            setfield_gc(p5, i4, intval)
                +# inside f: BoxedInteger(-100)
                +p6 = new(BoxedInteger)
                +    # inside BoxedInteger.__init__
                +    setfield_gc(p6, -100, intval)
                +
                +# inside f: .add(BoxedInteger(-100))
                +guard_class(p5, BoxedInteger)
                +    # inside BoxedInteger.add
                +    i7 = getfield_gc(p5, intval)
                +    guard_class(p6, BoxedInteger)
                +        # inside BoxedInteger.add__int
                +        i8 = getfield_gc(p6, intval)
                +        i9 = int_add(i7, i8)
                +        p10 = new(BoxedInteger)
                +            # inside BoxedInteger.__init__
                +            setfield_gc(p10, i9, intval)
                +
                +# inside f: BoxedInteger(-1)
                +p11 = new(BoxedInteger)
                +    # inside BoxedInteger.__init__
                +    setfield_gc(p11, -1, intval)
                +
                +# inside f: y.add(BoxedInteger(-1))
                +guard_class(p0, BoxedInteger)
                +    # inside BoxedInteger.add
                +    i12 = getfield_gc(p0, intval)
                +    guard_class(p11, BoxedInteger)
                +        # inside BoxedInteger.add__int
                +        i13 = getfield_gc(p11, intval)
                +        i14 = int_add(i12, i13)
                +        p15 = new(BoxedInteger)
                +            # inside BoxedInteger.__init__
                +            setfield_gc(p15, i14, intval)
                +
                +# inside f: y.is_positive()
                +guard_class(p15, BoxedInteger)
                +    # inside BoxedInteger.is_positive
                +    i16 = getfield_gc(p15, intval)
                +    i17 = int_gt(i16, 0)
                +# inside f
                +guard_true(i17)
                +jump(p15, p10)
                +
                +

                (indentation corresponds to the stack level of the traced functions).

                +

                The trace is inefficient for a couple of reasons. One problem is that it checks +repeatedly and redundantly for the class of the objects around, using a +guard_class instruction. In addition, some new BoxedInteger instances are +constructed using the new operation, only to be used once and then forgotten +a bit later. In the next section, we will see how this can be improved upon, +using escape analysis.

                +
                +
                +

                Virtual Objects

                +

                The main insight to improve the code shown in the last section is that some of +the objects created in the trace using a new operation don't survive very +long and are collected by the garbage collector soon after their allocation. +Moreover, they are used only inside the loop, thus we can easily prove that +nobody else in the program stores a reference to them. The +idea for improving the code is thus to analyze which objects never escape the +loop and may thus not be allocated at all.

                +

                This process is called escape analysis. The escape analysis of +our tracing JIT works by using virtual objects: The trace is walked from +beginning to end and whenever a new operation is seen, the operation is +removed and a virtual object is constructed. The virtual object summarizes the +shape of the object that is allocated at this position in the original trace, +and is used by the escape analysis to improve the trace. The shape describes +where the values that would be stored in the fields of the allocated objects +come from. Whenever the optimizer sees a setfield that writes into a virtual +object, that shape summary is thus updated and the operation can be removed. +When the optimizer encounters a getfield from a virtual, the result is read +from the virtual object, and the operation is also removed.

                +

                In the example from last section, the following operations would produce two +virtual objects, and be completely removed from the optimized trace:

                +
                +p5 = new(BoxedInteger)
                +setfield_gc(p5, i4, intval)
                +p6 = new(BoxedInteger)
                +setfield_gc(p6, -100, intval)
                +
                +

                The virtual object stored in p5 would know that it is a BoxedInteger, and that +the intval field contains i4; the one stored in p6 would know that +its intval field contains the constant -100.

                +

                The following operations, that use p5 and p6 could then be +optimized using that knowledge:

                +
                +guard_class(p5, BoxedInteger)
                +i7 = getfield_gc(p5, intval)
                +# inside BoxedInteger.add
                +guard_class(p6, BoxedInteger)
                +# inside BoxedInteger.add__int
                +i8 = getfield_gc(p6, intval)
                +i9 = int_add(i7, i8)
                +
                +

                The guard_class operations can be removed, because the classes of p5 and +p6 are known to be BoxedInteger. The getfield_gc operations can be removed +and i7 and i8 are just replaced by i4 and -100. Thus the only +remaining operation in the optimized trace would be:

                +
                +i9 = int_add(i4, -100)
                +
                +

                The rest of the trace is optimized similarly.

                +

                So far we have only described what happens when virtual objects are used in +operations that read and write their fields. When the virtual object is used in +any other operation, it cannot stay virtual. For example, when a virtual object +is stored in a globally accessible place, the object needs to actually be +allocated, as it will live longer than one iteration of the loop.

                +

                This is what happens at the end of the trace above, when the jump operation +is hit. The arguments of the jump are at this point virtual objects. Before the +jump is emitted, they are forced. This means that the optimizer produces code +that allocates a new object of the right type and sets its fields to the field +values that the virtual object has. This means that instead of just the jump, the +following operations are emitted:

                +
                +p15 = new(BoxedInteger)
                +setfield_gc(p15, i14, intval)
                +p10 = new(BoxedInteger)
                +setfield_gc(p10, i9, intval)
                +jump(p15, p10)
                +
                +

                Note how the operations for creating these two instances have been moved down the +trace. It looks like for these operations we actually didn't win much, because +the objects are still allocated at the end. However, the optimization was still +worthwhile even in this case, because some operations that have been performed +on the forced virtual objects have been removed (some getfield_gc operations +and guard_class operations).

                +

                The final optimized trace of the example looks like this:

                +
                +# arguments to the trace: p0, p1
                +guard_class(p1, BoxedInteger)
                +i2 = getfield_gc(p1, intval)
                +guard_class(p0, BoxedInteger)
                +i3 = getfield_gc(p0, intval)
                +i4 = int_add(i2, i3)
                +i9 = int_add(i4, -100)
                +
                +guard_class(p0, BoxedInteger)
                +i12 = getfield_gc(p0, intval)
                +i14 = int_add(i12, -1)
                +
                +i17 = int_gt(i14, 0)
                +guard_true(i17)
                +p15 = new(BoxedInteger)
                +setfield_gc(p15, i14, intval)
                +p10 = new(BoxedInteger)
                +setfield_gc(p10, i9, intval)
                +jump(p15, p10)
                +
                +

                The optimized trace contains only two allocations, instead of the original five, +and only three guard_class operations, from the original seven.

                +
                +
                +

                Summary

                +

                In this blog post we described how simple escape analysis within the scope of +one loop works. This optimization reduces the allocation of many intermediate +data structures that become garbage quickly in an interpreter. It also removes a +lot of the type dispatching overhead. In a later post, we will explain how this +optimization can be improved further.

                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2010-09-13 19:38: +
                +
                +

                Beautiful post. I love it when people dare to broach more 'advanced' subjects in blog format.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-09-14 11:49: +
                +
                +

                Thanks a lot :-).

                +
                +
                +
                +
                + + jdb wrote on 2010-09-15 10:21: +
                +
                +

                +1, thanks

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/09/pypy-in-googles-summer-of-code-2010-1267220161643618015.html b/posts/2010/09/pypy-in-googles-summer-of-code-2010-1267220161643618015.html new file mode 100644 index 000000000..a619767fa --- /dev/null +++ b/posts/2010/09/pypy-in-googles-summer-of-code-2010-1267220161643618015.html @@ -0,0 +1,324 @@ + + + + + +PyPy in Google's Summer of Code 2010 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy in Google's Summer of Code 2010

                + + + +
                +

                Hello.

                +

                This year we had a record of two and a half applications (one was on a cross +section of PyPy and numpy) accepted for the Google +SoC program. Since it ended a couple of weeks ago, we wanted to present the results that +were achieved. All three projects were completed successfully, although the rate +of success varied quite a bit.

                +

                The Numpy proposal progressed significantly on making numpy compatible with +PyPy's CPython extension module support, but failed to bring PyPy's numpy +implementation into a usable shape (which is a somewhat ambitious goal, one +might argue). The experiments done during the projects are living on the +micronumpy branch.

                +

                The Fast ctypes proposal did some useful experiments on how to JIT external +calls from PyPy to C, however, the actual code as of now is not very +interesting and it's quite far from providing a full ctypes replacement (or +equivalent).

                +

                Definitely the most successful proposal was a 64bit (x86_64) backend for PyPy's +JIT. It not only includes a working 64bit JIT (merged into PyPy trunk), but also +a working asmgcc for x86_64 linux platform, that makes it possible to run the JIT +on this architecture with our advanced garbage collectors. One can say that +x86_64 is now no longer a second-class citizen for PyPy, although it definitely +didn't receive as much testing as the x86 platform. Expect this to be a major +selling point for the next PyPy release :-)

                +

                Cheers, +fijal & the PyPy team

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2010-09-24 05:26: +
                +
                +

                Awesome news.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/09/using-escape-analysis-across-loop-2887031293132023676.html b/posts/2010/09/using-escape-analysis-across-loop-2887031293132023676.html new file mode 100644 index 000000000..5ced238bb --- /dev/null +++ b/posts/2010/09/using-escape-analysis-across-loop-2887031293132023676.html @@ -0,0 +1,419 @@ + + + + + +Using Escape Analysis Across Loop Boundaries for Specialization | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Using Escape Analysis Across Loop Boundaries for Specialization

                + + + +
                +

                This blog post is a successor to the one about escape analysis in PyPy's +JIT. The examples from there will be continued here. This post is a bit +science-fictiony. The algorithm that PyPy currently uses is significantly more +complex and much harder than the one that is described here. The resulting +behaviour is very similar, however, so we will use the simpler version (and we +might switch to that at some point in the actual implementation).

                +

                In the last blog post we described how escape analysis can be used to remove +many of the allocations of short-lived objects and many of the type dispatches +that are present in a non-optimized trace. In this post we will improve the +optimization to also handle more cases.

                +

                To understand some more what the optimization described in the last blog post +can achieve, look at the following figure:

                +

                + +new lifetimes

                +

                The figure shows a trace before optimization, together with the lifetime of +various kinds of objects created in the trace. It is executed from top to +bottom. At the bottom, a jump is used to execute the same loop another time. +For clarity, the figure shows two iterations of the loop. +The loop is executed until one of the guards in the trace fails, and the +execution is aborted.

                +

                Some of the operations within this trace are new operations, which each create a +new instance of some class. These instances are used for a while, e.g. by +calling methods on them, reading and writing their fields. Some of these +instances escape, which means that they are stored in some globally accessible +place or are passed into a function.

                +

                Together with the new operations, the figure shows the lifetimes of the +created objects. Objects in category 1 live for a while, and are then just not +used any more. The creation of these objects is removed by the +optimization described in the last blog post.

                +

                Objects in category 2 live for a while and then escape. The optimization of the +last post deals with them too: the new that creates them and +the field accesses are deferred, until the point where the object escapes.

                +

                The objects in category 3 and 4 are in principle like the objects in category 1 +and 2. They are created, live for a while, but are then passed as an argument +to the jump operation. In the next iteration they can either die (category +3) or escape (category 4).

                +

                The optimization of the last post considered the passing of an object along a +jump to be equivalent to escaping. It was thus treating objects in category 3 +and 4 like those in category 2.

                +

                The improved optimization described in this post will make it possible to deal +better with objects in category 3 and 4. This will have two consequences: on +the one hand, more allocations are removed from the trace (which is clearly +good). As a side-effect of this, the traces will also be type-specialized.

                +
                +

                Optimizing Across the Jump

                +

                Let's look at the final trace obtained in the last post for the example loop. +The final trace was much better than the original one, because many allocations +were removed from it. However, it also still contained allocations:

                +
                +step 1 +
                +

                The two new BoxedIntegers stored in p15 and p10 are passed into +the next iteration of the loop. The next iteration will check that they are +indeed BoxedIntegers, read their intval fields and then not use them +any more. Thus those instances are in category 3.

                +

                In its current state the loop +allocates two BoxedIntegers at the end of every iteration, that then die +very quickly in the next iteration. In addition, the type checks at the start +of the loop are superfluous, at least after the first iteration.

                +

                The reason why we cannot optimize the remaining allocations away is because +their lifetime crosses the jump. To improve the situation, a little trick is +needed. The trace above represents a loop, i.e. the jump at the end jumps to +the beginning. Where in the loop the jump occurs is arbitrary, since the loop +can only be left via failing guards anyway. Therefore it does not change the +semantics of the loop to put the jump at another point into the trace and we +can move the jump operation just above the allocation of the objects that +appear in the current jump. This needs some care, because the arguments to +jump are all currently live variables, thus they need to be adapted.

                +

                If we do that for our example trace above, the trace looks like this:

                +
                +step 2 +
                +

                Now the lifetime of the remaining allocations no longer crosses the jump, and +we can run our escape analysis a second time, to get the following trace:

                +
                +step3 +
                +

                This result is now really good. The code performs the same operations as +the original code, but using direct CPU arithmetic and no boxing, as opposed to +the original version which used dynamic dispatching and boxing.

                +

                Looking at the final trace it is also completely clear that specialization has +happened. The trace corresponds to the situation in which the trace was +originally recorded, which happened to be a loop where BoxedIntegers were +used. The now resulting loop does not refer to the BoxedInteger class at +all any more, but it still has the same behaviour. If the original loop had +used BoxedFloats, the final loop would use float_* operations +everywhere instead (or even be very different, if the object model had +user-defined classes).

                +
                +
                +

                Entering the Loop

                +

                The approach of placing the jump at some other point in the loop leads to +one additional complication that we glossed over so far. The beginning of the +original loop corresponds to a point in the original program, namely the +while loop in the function f from the last post.

                +

                Now recall that in a VM that uses a tracing JIT, all programs start by being +interpreted. This means that when f is executed by the interpreter, it is +easy to go from the interpreter to the first version of the compiled loop. +After the jump is moved and the escape analysis optimization is applied a +second time, this is no longer easily possible. In particular, the new loop +expects two integers as input arguments, while the old one expected two +instances.

                +

                To make it possible to enter the loop directly from the interpreter, there +needs to be some additional code that enters the loop by taking as input +arguments what is available to the interpreter, i.e. two instances. This +additional code corresponds to one iteration of the loop, which is thus +peeled off:

                +
                +step 4 +
                +
                +
                +

                Summary

                +

                The optimization described in this post can be used to optimize away +allocations in category 3 and improve allocations in category 4, by deferring +them until they are no longer avoidable. A side-effect of these optimizations +is also that the optimized loops are specialized for the types of the variables +that are used inside them.

                +
                +
                +

                Comments

                +
                +
                +
                + + Ole Laursen wrote on 2010-09-24 17:18: +
                +
                +

                Interesting, like the previous post. Keep 'em coming. :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/10/dusseldorf-sprint-report-2010-371223200425847723.html b/posts/2010/10/dusseldorf-sprint-report-2010-371223200425847723.html new file mode 100644 index 000000000..a74317a09 --- /dev/null +++ b/posts/2010/10/dusseldorf-sprint-report-2010-371223200425847723.html @@ -0,0 +1,391 @@ + + + + + +Düsseldorf Sprint Report 2010 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Düsseldorf Sprint Report 2010

                + + + +
                + + +

                This year's installment of the yearly PyPy Düsseldorf Sprint is drawing to a +close. As usual, we worked in the seminar room of the programming language +group at the University of Düsseldorf. The sprint was different from previous +ones in that we had fewer people than usual and many actually live in +Düsseldorf all the time.

                +

                David spent the sprint working on the arm-backend branch, which is adding an +ARM backend to the JIT. With the help of Armin he added support for bridges in +the JIT and generally implemented missing operations, mostly for handling integers so far.

                +

                Ronny and Anto worked the whole week trying to come up with a scheme for +importing PyPy's SVN history into a mercurial repository without losing too +much information. This is a non-trivial task, because PyPy's history is gnarly. +We are nearly at revision 79000 and when we started using it, Subversion was at +version 0.1. All possible and impossible ways to mangle and mistreat a +Subversion repository have been applied to PyPy's repo, so most of the +importing tools just give up. Ronny and Anto came up with a new plan and new +helper scripts every day, only to then discover another corner case that they +hadn't thought of. Now they might actually have a final plan (but they said +that every day, so who knows?).

                +The branch history of PyPy's repository (every box is a branch)

                Carl Friedrich and Lukas started working in earnest on memory benchmarks to +understand the memory behaviour of Python code better. They have now +implemented a generic memory benchmark runner and a simple analysis that walks +all objects and collects size information about them. They also added some +benchmarks that were proposed in the comments of the recent call for +benchmarks. As soon as some results from that work are there, we will post +about them.

                +

                There were also some minor tasks performed during the sprint. Armin implemented +the _bisect module and the dict.popitem method in RPython. Armin and +Carl Friedrich made the new memory-saving mapdict implementation more suitable +to use without the JIT (blog post should come about that too, at some point). +They also made classes with custom metaclasses a lot faster when the JIT is +used.

                +

                The last three days of the sprint were spent working on Håkan's +jit-unroll-loops branch. The branch is meant to move loop invariants out of +the loop, using techniques very similar to what is described in the recent post +on escape analysis across loop boundaries (see? it will soon stop being +science-fiction). Some of the ideas of this approach also come from LuaJIT +which also uses very aggressive loop invariant code motion in its optimizers. +Moving loop invariants outside of the loop is very useful, because many of the +lookups that Python programs do in loops are loop invariants. An example is if +you call a function in a loop: The global lookup can often be done only once.

                +

                This branch fundamentally changes some of the core assumptions of the JIT, so +it is a huge amount of work to make it fit with all the other parts and to +adapt all tests. That work is now nearly done, some failing tests remain. The +next steps are to fix them and then do additional tests with the translated +executable and look at the benchmarks.

                +
                +

                Comments

                +
                +
                +
                + + Luis wrote on 2010-11-04 13:58: +
                +
                +

                It's great to see improvements in pypy. At this moment, the only three benchmarks that perform better in cpython than in pypy are spitfire, slow spitfire and twisted_tcp.

                What's the reason for the lower performance on these benchmarks? Is it the same reason for the three or there are multiple causes?

                Luis

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-04 14:03: +
                +
                +

                Hey.

                spitfire and slowspitfire are a 'won't fix' benchmarks (at least in the near future). The spitfire_cstringio is using the same thing, but cStringIO instead of a list of strings.

                Twisted_tcp is slightly more complex and has something to do with pushing a lot of data through sockets. In pypy you usually have to copy data before write, because it can potentially be moved in the GC.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Luis wrote on 2010-11-04 19:46: +
                +
                +

                Thanks! I suppose "won't fix" has a meaning in a pypy context. What does it mean?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-04 21:16: +
                +
                +

                won't fix means we won't fix it ;-) To be precise it means we know this program is slow, but also there is a way to write this program to be fast, please use the other way.

                +
                +
                +
                +
                + + Luis wrote on 2010-11-04 23:41: +
                +
                +

                So it doesn't make much sense including these benchmarks in speed.pypy.org, don't you think?
                Perhaps it should be described somewhere what are the strengths and weaknesses of this implementation, suggesting the right approach for each task. Something like "best practices" or something like that...

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-05 07:36: +
                +
                +

                I think deleting it from the nightly run doesn't make sense. It still measures something and helps us catch regressions.

                The document you're proposing is actually a really neat idea. I've already done a couple of presentations on it, so it's only about gathering knowledge ("only").

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/10/next-pypy-sprint-4850394963147107623.html b/posts/2010/10/next-pypy-sprint-4850394963147107623.html new file mode 100644 index 000000000..44aef14c2 --- /dev/null +++ b/posts/2010/10/next-pypy-sprint-4850394963147107623.html @@ -0,0 +1,292 @@ + + + + + +Next PyPy sprint | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Next PyPy sprint

                + + + +
                +

                Hi all,

                + +

                The next PyPy sprint is scheduled for the end of the month, from the 25th to the 31st of October 2010. It will be done at the university of Düsseldorf, Germany, where three of us are working.

                + +

                Please see this link for more information.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/10/peace-of-green-4230271053903469504.html b/posts/2010/10/peace-of-green-4230271053903469504.html new file mode 100644 index 000000000..29bb9db4f --- /dev/null +++ b/posts/2010/10/peace-of-green-4230271053903469504.html @@ -0,0 +1,320 @@ + + + + + +The peace of green | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                The peace of green

                + + + +
                +

                No, we are not going to talk about the environment (i.e., the set of variables +as printed by /usr/bin/env. What else? :-)).

                +

                After months in which we had a couple of tests failing every day, we finally +managed to turn (almost) everything green today, at least on Linux. Enjoy +this screenshot taken from the nightly build page:

                + + + + +

                As usual, the full buildbot results can be seen from the summary page.

                +

                cheers, +Anto

                +
                +

                Comments

                +
                +
                +
                + + intgr wrote on 2010-10-25 16:32: +
                +
                +

                Am I sensing a PyPy 1.4 release?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2010-10-25 21:05: +
                +
                +

                @intgr: "yes", although we don't have any concrete plan to do a release. But it's true that if we keep all our tests green, doing a release is much less effort

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/10/phd-thesis-about-pypys-cli-jit-backend-969267841095296323.html b/posts/2010/10/phd-thesis-about-pypys-cli-jit-backend-969267841095296323.html new file mode 100644 index 000000000..71ec3b5cb --- /dev/null +++ b/posts/2010/10/phd-thesis-about-pypys-cli-jit-backend-969267841095296323.html @@ -0,0 +1,373 @@ + + + + + +PhD Thesis about PyPy's CLI JIT Backend | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PhD Thesis about PyPy's CLI JIT Backend

                + + + +
                +

                Hi all,

                +

                A few months ago I finished my PhD studies and now my thesis is available, +just in case someone does not have anything better to do than read it :-).

                +

                The title of the thesis is High performance implementation of Python for +CLI/.NET with JIT compiler generation for dynamic languages, and it's mainly +based on my work on the CLI backend for the PyPy JIT (note that the CLI JIT +backend is currently broken on trunk, but it's still working in the cli-jit +branch).

                +

                The thesis might be useful also for people that are not directly interested in +the CLI JIT backend, as it also contains general information about the inner +workings of PyPy which are independent from the backend: in particular, +chapters 5 and 6 explain how the JIT frontend works.

                +
                +
                Here is the summary of chapters:
                +
                  +
                1. Introduction
                2. +
                3. The problem
                4. +
                5. Enter PyPy
                6. +
                7. Characterization of the target platform
                8. +
                9. Tracing JITs in a nutshell
                10. +
                11. The PyPy JIT compiler generator
                12. +
                13. The CLI JIT backend
                14. +
                15. Benchmarks
                16. +
                17. Conclusion and Future Work
                18. +
                +
                +

                cheers, +Anto

                +
                +

                Comments

                +
                +
                +
                + + The Cannon Family wrote on 2010-10-22 18:42: +
                +
                +

                congratulations.

                +
                +
                +
                +
                + + Eric van Riet Paap wrote on 2010-10-22 18:43: +
                +
                +

                Yes Anto, congratulations!

                +
                +
                +
                +
                + + Lino wrote on 2010-10-23 01:26: +
                +
                +

                Impressive work, Antonio.

                ciao

                +
                +
                +
                +
                + + Anonymous wrote on 2010-10-23 08:36: +
                +
                +

                Very interesting stuff, still busily reading... could you write a short bibtex entry for citation? Thanks

                +
                +
                +
                +
                + + glyph wrote on 2010-10-23 20:56: +
                +
                +

                Congratulations!

                (but when are we going to see it merged to trunk... ;-))

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2010-10-24 10:08: +
                +
                +

                thank you, guys :-)

                @anonymous: here you can find the bibtex for the thesis, as well as for other PyPy related papers: https://codespeak.net/svn/pypy/extradoc/talk/bibtex.bib

                @glyph: unfortunately, trunk has diverged a lot since the cli-jit branch, and merging is not an easy issue. There are also fundamental features that on CLI cannot be implemented as efficiently as on x86. It's on my todo list, but no concrete plan so far :-(

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/11/efficiently-implementing-python-objects-3838329944323946932.html b/posts/2010/11/efficiently-implementing-python-objects-3838329944323946932.html new file mode 100644 index 000000000..5e2d613ad --- /dev/null +++ b/posts/2010/11/efficiently-implementing-python-objects-3838329944323946932.html @@ -0,0 +1,624 @@ + + + + + +Efficiently Implementing Python Objects With Maps | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Efficiently Implementing Python Objects With Maps

                + + + +
                +

                As could be foreseen by my Call for Memory Benchmarks post a while ago, I am +currently working on improving the memory behaviour of PyPy's Python +interpreter. In this blog post I want to describe the various data a Python +instance can store. Then I want to describe how a branch that I did and that was +recently merged implements the various features of instances in a very +memory-efficient way.

                +
                +

                Python's Object Model

                +

                All "normal" new-style Python instances (i.e. instances of subclasses of object +without added declarations) store two (or possibly three) kinds of information.

                +
                +

                Storing the Class

                +

                Every instance knows which class it belongs to. This information is accessible +via the .__class__ attribute. It can also be changed to other (compatible +enough) classes by writing to that attribute.

                +
                +
                +

                Instance Variables

                +

                Every instance also has an arbitrary number of attributes stored (also called +instance variables). The instance variables used can vary per instance, which is +not the case in most other class-based languages: traditionally (e.g. in +Smalltalk or Java) the class describes the shape of its instances, +which means that the +set of admissible instance variable names is the same for all instances of a +class.

                +

                In Python on the other hand, it is possible to add arbitrary attributes to an +instance at any point. The instance behaves like a dictionary mapping attribute +names (as strings) to the attribute values.

                +

                This is actually how CPython implements instances. Every instance has a +reference to a dictionary that stores all the attributes of the instance. This +dictionary can be reached via the .__dict__ attribute. To make things more +fun, the dictionary can also be changed by writing to that attribute.

                +
                +
                +

                Example

                +

                As an example, consider the following code:

                +
                class A(object):
                +    pass
                +
                +instance1 = A()
                +instance1.x = 4
                +instance1.y = 6
                +instance1.z = -1
                +
                +instance2 = A()
                +instance2.x = 1
                +instance2.y = 2
                +instance2.z = 3
                +
                +

                These two instances would look something like this in memory:

                + +

                (The picture glosses over a number of details, but it still shows the essential +issues.)

                +

                This way of storing things is simple, but unfortunately rather inefficient. Most +instances of the same class have the same shape, i.e. the same set of instance +attribute names. That means that the key part of all the dictionaries is +identical (shown grey here). Therefore storing that part repeatedly in all +instances is a waste. In addition, dictionaries are themselves rather large. +Since they are typically implemented as hashmaps, which must not be too full to +be efficient, a dictionary will use something like 6 words on average per key.

                +
                +
                +

                Slots

                +

                Since normal instances are rather large, CPython 2.2 introduced slots, to make +instances consume less memory. Slots are a way to fix the set of attributes an +instance can have. This is achieved by adding a declaration to a class, like +this:

                +
                class B(object):
                +    __slots__ = ["x", "y", "z"]
                +
                +

                Now the instances of B can only have x, y and z as attributes +and don't have a dictionary at all. Instead, the instances of B get +allocated with enough size to hold exactly the number of instance variables that +the class permits. This clearly saves a lot of memory over the dictionary +approach, but has a number of disadvantages. It is obviously less flexible, as +you cannot add additional instance variables to an instance if you happen to +need to do that. It also introduces a set of rules and corner-cases that can +be surprising sometimes (e.g. instances of a subclass of a class with slots that +doesn't have a slots declaration will have a dict).

                +
                +
                +
                +

                Using Maps for Memory-Efficient Instances

                +

                As we have seen in the diagram above, the dictionaries of instances of the same +class tend to look very similar and share all the keys. The central idea to use +less memory is to "factor out" the common parts of the instance dictionaries +into a new object, called a "map" (because it is a guide to the landscape of the +object, or something). After that factoring out, the representation of the +instances above looks something like this:

                + +

                Every instance now has a reference to its map, which describes what the instance +looks like. The actual instance variables are stored in an array (called +storage in the diagram). In the example here, the map describes that the +instances have three attributes x, y and z. The numbers after the +attributes are indexes into the storage array.

                +

                If somebody adds a new attribute to one of the instances, the map for that +instance will be changed to another map that also contains the new attribute, +and the storage will have to grow a field with the new attribute. The maps are +immutable, immortal and reused as much as possible. This means, that two +instances of the same class with the same set of attributes will have the same +map. This also means that the memory the map itself uses is not too important, +because it will potentially be amortized over many instances.

                +

                Note that using maps makes instances nearly as small as if the correct slots had +been declared in the class. The only overhead needed is the indirection to the +storage array, because you can get new instance variables, but not new slots.

                +

                The concept of a "map" that describes instances is kind of old and comes from +the virtual machine for the Self programming language. The optimization was +first described in 1989 in a paper by Chambers, Ungar and Lee with the title An +Efficient Implementation of Self, a Dynamically-Typed Object-Oriented Language +Based on Prototypes. A similar technique is used in Google's V8 JavaScript +engine, where the maps are called hidden classes and in the Rhino +JavaScript engine.

                +

                The rest of the post describes a number of further details that occur if +instances are implemented using maps.

                +
                +

                Supporting Dictionaries with Maps

                +

                The default instance representation with maps as shown above works without +actually having a dictionary as part of each instance. If a dictionary is +actually requested, by accessing the .__dict__ attribute, it needs to be +created and cached. The dictionary is not a normal Python dictionary, but a thin +wrapper around the object that forwards all operations to it. From the user's +point of view it behaves like a normal dictionary though (it even has the +correct type).

                +

                The dictionary needs to be cached, because accessing .__dict__ several times +should always return the same dictionary. The caching happens by using a +different map that knows about the dictionary and putting the dictionary into +the storage array:

                + +

                Things become really complex if the fake dict is used in strange ways. As long +as the keys are strings, everything is fine. If somebody adds other keys to the +dict, they cannot be represented by the map any more (which supports only +attributes, i.e. string keys in the __dict__). If that happens, all the +information of the instance will move into the fake dictionary, like this:

                + +

                In this picture, the key -1 was added to the instance's dictionary. Since +using the dictionary in arbitrary ways should be rare, we are fine with the +additional time and memory that the approach takes.

                +
                +
                +

                Slots and Maps

                +

                Maps work perfectly together with slots, because the slots can just be stored +into the storage array used by the maps as well (in practise there are some +refinements to that scheme). This means that putting a __slots__ on a +class has mostly no effect, because the instance only stores the values of the +attributes (and not the names), which is equivalent to the way slots are stored +in CPython.

                +
                +
                +
                +

                Implementation Details

                +

                In the diagrams above, I represented the maps as flat objects. In practise this +is a bit more complex, because it needs to be efficient to go from one map to +the next when new attributes are added. Thus the maps are organized in a tree.

                +

                The instances with their maps from above look a bit more like this in practise:

                + +

                Every map just describes one attribute of the object, with a name and an +index. Every map also has a back field, that points to another map +describing what the rest of the object looks like. This chain ends with a +terminator, which also stores the class of the object.

                +

                The maps also contain the information necessary for making a new object of +class A. Immediately after the new object has been created, its map is the +terminator. If the x attribute is added, its map is changed to the +second-lowest map, and so on. The blue arrows show the sequence of maps that +the new object goes through when the attributes x, y, z are added.

                +

                This representation of maps as chains of objects sounds very inefficient if an +object has many attributes. The whole chain has to be walked to find the index. +This is true to some extent. The problem goes away in the presence of the JIT, +which knows that the chain of maps is an immutable structure, and will thus +optimize away all the chain-walking. If the JIT is not used, there are a few +caches that try to speed up the walking of this chain (similar to the method +cache in CPython and PyPy).

                +
                +
                +

                Results

                +

                It's hard to compare the improvements of this optimization in a fair way, as +the trade-offs are just very different. Just to give an impression, a million +objects of the same class with three fields on a 32bit system takes:

                +

                without slots:

                +
                  +
                • 182 MiB memory in CPython
                • +
                • 177 MiB memory in PyPy without maps
                • +
                • 40 MiB memory in PyPy with maps
                • +
                +

                with slots:

                +
                  +
                • 45 MiB memory in CPython
                • +
                • 50 MiB memory in PyPy without maps
                • +
                • 40 MiB memory in PyPy with maps
                • +
                +

                Note how maps make the objects a bit more efficient than CPython using slots. +Also, using slots has no additional effect in PyPy.

                +
                +
                +

                Conclusion

                +

                Maps are a powerful approach to shrinking the memory used by many similar +instances. I think they can be pushed even further (e.g. by adding information +about the types of the attributes) and plan to do so in the following months. +Details will be forthcoming.

                +
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2010-11-13 17:28: +
                +
                +

                Not sure if you are glossing over this, but it seems trivial to avoid the map chain walking by duplicating all of the information in a maps back pointer chain into the map itself. However, the lookup keys are still strings, so your options are some kind of frozen hashtable (which could be nice) or a sorted array.

                Both of those still seem much more efficient than chasing pointers.

                +
                +
                +
                +
                + + Erez wrote on 2010-11-13 19:08: +
                +
                +

                What about the additional running time overhead?

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-13 20:48: +
                +
                +

                I was surprised not to see any real-world benchmarks (since you collected them earlier). That leaves the impression that it might be disappointing (knowing that the object/class ratio generally isn't very large).

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-11-13 21:48: +
                +
                +

                @Reid:
                I am glossing over the runtime overhead, because the JIT completely removes it, as it knows that the maps are immutable. So you only have a problem if you don't want a JIT, in which case maps indeed make some things a bit slower. Duplicating the information everywhere is possible, but I would like to avoid it (we had a prototype that did it, and it became messy quickly).

                @Erez
                There is no additional runtime overhead if you have the JIT – in fact, things become faster, because the JIT can turn an attribute access into a array field read out of the storage array at a fixed offset.

                @Anonymous
                I have not presented any real-world benchmarks, because I actually did not get around to running them. Yes, I collected some and started writing a memory benchmark framework. But I didn't have time for a full analysis yet. I plan to do such an analysis hopefully soon.

                Anyway, maps never make anything larger, so it is really just a matter of how many instances there are in practice. This will just depend on the benchmark.

                +
                +
                +
                +
                + + Zeev wrote on 2010-11-14 00:13: +
                +
                +

                Does this optimization enable building pypy using pypy without having 16GB of ram?

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-14 01:29: +
                +
                +

                Is there anything intrinsic to PyPy in this, or can this optimisation be used in CPython as well?

                +
                +
                +
                +
                + + ot wrote on 2010-11-14 03:16: +
                +
                +

                To remove the chain-walking overhead when the code is not JITted, would it be possible to use a persistent hashtable, like for example the hash trie used in Clojure (see https://blog.higher-order.net/2009/09/08/understanding-clojures-persistenthashmap-deftwice/)? They are quite simple to implement and very fast (almost as fast as a normal hashtable lookup)

                +
                +
                +
                +
                + + Allen Short wrote on 2010-11-14 08:34: +
                +
                +

                ot: i'm part way to implementing that for Python.

                https://bazaar.launchpad.net/~washort/%2Bjunk/perseus/annotate/head:/perseus/__init__.py

                +
                +
                +
                +
                + + ot wrote on 2010-11-14 21:23: +
                +
                +

                @Allen: interesting, I wonder how much code would need to be changed to make it RPython...

                +
                +
                +
                +
                + + verte wrote on 2010-11-15 07:45: +
                +
                +

                What are the pypy "with slots" and "without slots" numbers? They are different even though you point out that they have no effect. Is the pypy in question one with sharing dicts?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-15 07:56: +
                +
                +

                @verte mapdicts help pypy objects with or without slots (although much more for the latter). There is no difference in pypy with mapdict between having slots or not having slots.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-11-15 12:58: +
                +
                +

                @Zeev no, the translation results from last week already included this optimization. Maps don't help translation much, because we already added all the necessary slot declarations to save memory on CPython.

                @ot Would be possible, yes. Not sure it is worth it, given that the total set of attributes of typical instances is not very large. The data structure looks interesting though.

                +
                +
                +
                +
                + + ot wrote on 2010-11-15 13:21: +
                +
                +

                @Carl: yes, it was just hypothetical, "if it is a bottleneck". I think anyway that even with very small instance dictionaries there could be a benefit: most keys would be resolved within the first layer of the trie, so with a single lookup or two at most. But it could be premature optimization.

                +
                +
                +
                +
                + + verte wrote on 2010-11-16 00:30: +
                +
                +

                I still don't understand. Was there a difference with __slots__ on pypy before mapdict? Why are the numbers different on pypy without mapdict? Are those the numbers with or without the old sharing dictimpl? If without, what is the memory usage with sharing dicts?

                +
                +
                +
                +
                + + barnert wrote on 2014-08-03 23:31: +
                +
                +

                This came up on StackOverflow, but let me answer it here.

                While the CPython split-dict implementation in 3.3+ (PEP 412) may be inspired by your design, it's not the same, and it doesn't provide nearly as much savings.

                The first difference is that it still has a full dict struct in the instance. For classes without that many attributes (i.e., most of them), the dict struct is almost as big as the hash table, so this means you typically only get half the savings as in PyPy. However, this means the thing you can access by __dict__ isn't created dynamically, it acts exactly like a dict even at the C API level, and it can transparently (again, even at the C API level) convert itself to a combined dict if needed (if the table needs to expand and there's more than one reference to the shared key table). The fact that the difference between 3.2 and 3.3 is completely undetectable to any code that doesn't directly access the hash buckets is a big part of the reason Mark Shannon was able to get everyone to agree to accept it, and as far as I know there's no serious consideration for changing it.

                The second difference is that the dict struct's array is kept in the same (sparse) order as the shared key table, rather than being a compact array indexed by the values of the shared key table. This means it's kept at least 1/3rd unloaded, meaning that again you get less space savings than with PyPy. There's less of a good rationale here; there's a small performance cost to the slightly more complicated code needed for indexing PyPy-style, and it would make small combined dicts one word larger (which could affect classes whose instances all have different attributes, or cases where you have a huge number of classes with few instances, or other edge cases). The PEP implies that using the sparse implementation is probably overly conservative, and leaves open the possibility of changing it after 3.3.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/11/improving-memory-behaviour-to-make-self-856966667913962461.html b/posts/2010/11/improving-memory-behaviour-to-make-self-856966667913962461.html new file mode 100644 index 000000000..d810baa70 --- /dev/null +++ b/posts/2010/11/improving-memory-behaviour-to-make-self-856966667913962461.html @@ -0,0 +1,498 @@ + + + + + +Improving Memory Behaviour to Make Self-Hosted PyPy Translations Practical | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Improving Memory Behaviour to Make Self-Hosted PyPy Translations Practical

                + + + +
                +

                In our previous blog post, we talked about how fast PyPy can translate +itself compared to CPython. However, the price to pay for the 2x speedup was +a huge amount of memory: actually, it was so huge that a standard -Ojit +compilation could not be completed on 32-bit because it required more than the +4 GB of RAM that are addressable on that platform. On 64-bit, it consumed +8.3 GB of RAM instead of the 2.3 GB needed by CPython.

                +

                This behavior was mainly caused by the JIT, because at the time we wrote the +blog post the generated assembler was kept alive forever, together with some +big data structure needed to execute it.

                +

                In the past two weeks Anto and Armin attacked the issue in the jit-free +branch, which has been recently merged to trunk. The branch solves several +issues. The main idea of the branch is that if a +loop has not been executed for a certain amount of time (controlled by the new +loop_longevity JIT parameter) we consider it "old" and no longer needed, +thus we deallocate it.

                +

                (In the process of doing this, we also discovered and fixed an +oversight in the implementation of generators, which led to generators being +freed only very slowly.)

                +

                To understand the freeing of loops some more, let's look at how many loops are +actually created during a translation. +The purple line in the following graph shows how many loops (and bridges) are +alive at any point in time with an infinite longevity, which is equivalent to +the situation we had before the jit-free branch. By contrast, the blue +line shows the number of loops that you get in the current trunk: the +difference is evident, as now we never have more than 10000 loops alive, while +previously we got up to about 37000 ones. The time on the X axis is expressed +in "Giga Ticks", where a tick is the value read out of the Time Stamp Counter +of the CPU.

                + + + +

                The grey vertical bars represent the beginning of each phase of the +translation:

                +
                  +
                • +annotate performs control flow graph construction and type inference.
                • +
                • +rtype lowers the abstraction level of the control flow graphs with types to that of C.
                • +
                • +pyjitpl constructs the JIT.
                • +
                • +backendopt optimizes the control flow graphs.
                • +
                • +stackcheckinsertion finds the places in the call graph that can overflow the C stack and inserts checks that raise an exception instead.
                • +
                • +database_c produces a database of all the objects the C code will have to know about.
                • +
                • +source_c produces the C source code.
                • +
                • +compile_c calls the compiler to produce the executable.
                • +
                +

                You can nicely see how the number of alive graphs drops shortly after the +beginning of a new phase.

                +

                Those two fixes, freeing loops and generators, improve the memory usage greatly: +now, translating PyPy +on PyPy on 32-bit consumes 2 GB of RAM, while on CPython it consumes 1.1 GB. +This result can even be improved somewhat, because we are not actually freeing +the assembler code itself, but +only the large data structures around it; we can consider it as a residual +memory leak of around 150 MB in this case. This will be fixed in the +jit-free-asm branch.

                +

                The following graph shows the memory usage in more detail:

                +
                +
                  +
                • the blue line (cpython-scaled) shows the total amount of RAM that the +OS allocates for CPython. Note that the X axis (the time) has been +scaled down so that it spans as much as the PyPy one, to ease the +comparison. Actually, CPython took more than twice as much time as PyPy to +complete the translation
                • +
                • the red line (VmRss) shows the total amount of RAM that the +OS allocates for PyPy: it includes both the memory directly handled by +our GC and the "raw memory" that we need to allocate for other tasks, such +as the assembly code generated by the JIT
                • +
                • the brown line (gc-before) shows how much memory is used by the GC +before each major collection
                • +
                • the yellow line (gc-after) shows how much memory is used by the GC +after each major collection: this represents the amount of memory which is +actually needed to hold our Python objects. The difference between +gc-before and gc-after (the GC delta) is the amount of memory that the GC +uses before triggering a new major collection
                • +
                +
                + + + +

                By comparing gc-after and cpython-scaled, we can see that PyPy +uses mostly the same amount of memory as CPython for storing the application +objects (due to reference counting the memory usage in CPython is always very +close to the actually necessary memory). The extra memory +used by PyPy is due to the GC delta, to the machine code generated by the JIT +and probably to some other external effect (such as e.g. Memory +Fragmentation).

                +

                Note that the GC delta can be set arbitrarily low (another recent addition -- +the default value depends on the actual RAM on your computer; it probably +works to translate if your computer has precisely 2 GB, because in this +case the GC delta and thus the total memory usage will be somewhat +lower than reported here), but the cost is to have more +frequent major collections and thus a higher run-time overhead. The same is +true for the memory needed by the JIT, which can be reduced by telling the JIT +to compile less often or to discard old loops more frequently. As often +happens in computer science, there is a trade-off between space and time, and +currently for this particular example PyPy runs twice as fast as CPython by +doubling the memory usage. We hope to improve even more on this trade-off.

                +

                On 64-bit, things are even better as shown by the following graph:

                + + + +

                The general shape of the lines is similar to the 32-bit graph. However, the +relative difference to CPython is much better: we need about 3 GB of RAM, just +24% more than the 2.4 GB needed by CPython. And we are still more than 2x +faster!

                +

                The memory saving is due (partly?) to the vtable ptr optimization, which is +enabled by default on 64-bit because it has no speed penalty (see +Unifying the vtable ptr with the GC header).

                +

                The net result of our work is that now translating PyPy on PyPy is practical +and takes less than 30 minutes. It's impressive how quickly you get used to +translation taking half the time -- now we cannot use CPython any more for that +because it feels too slow :-).

                +
                +

                Comments

                +
                +
                +
                + + crncosta wrote on 2010-11-26 16:29: +
                +
                +

                Big huge improvement since last post. Kudos!! :-)

                Please don't get me wrong, but I need to ask: is there any plan to merge pypy into cPython (or even replace it)?

                BTW, I'm following the blog (please, keep this regular posts) and planing to make a donation to support your next sprint due to the regular and very well done work.

                congratulations again.

                +
                +
                +
                +
                + + Martijn Faassen wrote on 2010-11-26 17:09: +
                +
                +

                This is incredibly cool! Congrats!

                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2010-11-26 17:38: +
                +
                +

                This is amazing. It was kind of a let down when you reported it used too much memory. But now I can on my laptop translate pypy in 32 and 64 bits using pypy itself :)

                The world is good again :)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-26 17:48: +
                +
                +

                @crncosta

                There are no plans for merging PyPy to CPython. I don't think "replacing" is a good word, but you can use PyPy for a lot of things already, so it is a viable Python implementation together with CPython.

                +
                +
                +
                +
                + + Luis wrote on 2010-11-26 18:20: +
                +
                +

                I am curious... Has there ever been interest from Google to sponsor this project?
                I know about unladen swallow, but has anyone there expressed interest in using pypy somewhere in their organization?

                Sorry for the off topic question...

                +
                +
                +
                +
                + + Peter wrote on 2010-11-26 18:28: +
                +
                +

                Always fascinating to read about the work you're doing. Please keep posting, and keep up the good work. You really are heroes.

                +
                +
                +
                +
                + + WI wrote on 2010-11-27 01:09: +
                +
                +

                Wow! this is great news.. keep us posted on what other developments you have.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-27 05:02: +
                +
                +

                like luis i am also curious about why google doesn't show a lot more interest in pypy. unladen swallow didn't really work out or did it?

                +
                +
                +
                +
                + + Symbol wrote on 2010-11-27 11:37: +
                +
                +

                Kudos!

                KUTGW!!

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-28 08:33: +
                +
                +

                Congrats!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/11/pypy-14-ouroboros-in-practice-5437628000869417542.html b/posts/2010/11/pypy-14-ouroboros-in-practice-5437628000869417542.html new file mode 100644 index 000000000..791434e35 --- /dev/null +++ b/posts/2010/11/pypy-14-ouroboros-in-practice-5437628000869417542.html @@ -0,0 +1,583 @@ + + + + + +PyPy 1.4: Ouroboros in practice | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.4: Ouroboros in practice

                + + + +
                +

                We're pleased to announce the 1.4 release of PyPy. This is a major breakthrough +in our long journey, as PyPy 1.4 is the first PyPy release that can translate +itself faster than CPython. Starting today, we are using PyPy more for +our every-day development. So may you :) You can download it here:

                +

                https://pypy.org/download.html

                +
                +

                What is PyPy

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement +for CPython. It is fast (pypy 1.4 and cpython 2.6 comparison).

                +

                New Features

                +

                Among its new features, this release includes numerous performance improvements +(which made fast self-hosting possible), a 64-bit JIT backend, as well +as serious stabilization. As of now, we can consider the 32-bit and 64-bit +linux versions of PyPy stable enough to run in production.

                +

                Numerous speed achievements are described on our blog. Normalized speed +charts comparing pypy 1.4 and pypy 1.3 as well as pypy 1.4 and cpython 2.6 +are available on the benchmark website. For the impatient: yes, we got a lot faster!

                +
                +
                +

                More highlights

                +
                  +
                • PyPy's built-in Just-in-Time compiler is fully transparent and +automatically generated; it now also has very reasonable memory +requirements. The total memory used by a very complex and +long-running process (translating PyPy itself) is within 1.5x to +at most 2x the memory needed by CPython, for a speed-up of 2x.
                • +
                • More compact instances. All instances are as compact as if +they had __slots__. This can give programs a big gain in +memory. (In the example of translation above, we already have +carefully placed __slots__, so there is no extra win.)
                • +
                • +Virtualenv support: now PyPy is fully compatible with virtualenv: note that +to use it, you need a recent version of virtualenv (>= 1.5).
                • +
                • Faster (and JITted) regular expressions - huge boost in speeding up +the re module.
                • +
                • Other speed improvements, like JITted calls to functions like map().
                • +
                +

                Cheers,
                +Carl Friedrich Bolz, Antonio Cuni, Maciej Fijalkowski, +Amaury Forgeot d'Arc, Armin Rigo and the PyPy team

                +
                +
                +

                Comments

                +
                +
                +
                + + ipc wrote on 2010-11-26 18:42: +
                +
                +

                congratulations!

                +
                +
                +
                +
                + + why wrote on 2010-11-26 18:47: +
                +
                +

                This is unacceptable. Christmas is not until next month!!!

                +
                +
                +
                +
                + + Tim Parkin wrote on 2010-11-26 19:09: +
                +
                +

                Massive congratulations - exciting!

                +
                +
                +
                +
                + + Unknown wrote on 2010-11-26 19:18: +
                +
                +

                Sweet! Keep up the great work !

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-26 19:41: +
                +
                +

                Woohoo!!

                +
                +
                +
                +
                + + Martijn Faassen wrote on 2010-11-26 20:07: +
                +
                +

                Awesome!

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-26 20:59: +
                +
                +

                Hip hip hooooraaaay!!!!

                +
                +
                +
                +
                + + ipc wrote on 2010-11-26 22:51: +
                +
                +

                all I want for Christmas is stackless support in a 64-bit pypy-c-jit :) 'two greenlets switching and a partridge in a pear tree!'

                +
                +
                +
                +
                + + Unknown wrote on 2010-11-26 23:14: +
                +
                +

                Congratulations. I hope the PPA is going to be updated soon. Too lazy to build it myself, right now. (:

                +
                +
                +
                +
                + + Paul Boddie wrote on 2010-11-26 23:29: +
                +
                +

                Is there a -j <number-of-cores> option for the translation process? It's a bit unfortunate that 15 cores on the nice machine I'm using can't be put to use making it translate faster. (Or unfortunate that I didn't read the documentation, maybe.)

                +
                +
                +
                +
                + + ipc wrote on 2010-11-26 23:54: +
                +
                +

                --make-jobs=N only some parts of the translation process is parallel.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-27 00:10: +
                +
                +

                Eta until numpy scipy?

                +
                +
                +
                +
                + + Paul Boddie wrote on 2010-11-27 01:00: +
                +
                +

                The report of 2.4GB usage on x86-64 is accurate, but it took about 7800s on a 2.33GHz Xeon. Next time I'll try and exercise some of the other cores, though.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-27 04:54: +
                +
                +

                so pypy on average is now about 2x faster than cpython?

                and unladen swallows goal was beeing 5x faster? was that totally unrealistic?

                +
                +
                +
                +
                + + Leonard Ritter wrote on 2010-11-27 10:59: +
                +
                +

                You are my heroes!

                +
                +
                +
                +
                + + Symbol wrote on 2010-11-27 11:37: +
                +
                +

                Just Awesome!!!

                KUTGW!

                +
                +
                +
                +
                + + Daivd wrote on 2010-11-27 12:02: +
                +
                +

                Does this release include the -free branch that was mentioned in the previous post? The 2x memory requirements lead me to believe so.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-27 13:45: +
                +
                +

                @Daivd
                yes, it does

                @Anonymous
                5x improvement is not a well defined goal, however it's a good marketing thing. PyPy is 2x faster on translation, 60x faster on some benchmarks while slower on other. What does it mean to be 5x faster?

                +
                +
                +
                +
                + + Christian S. Perone wrote on 2010-11-27 14:23: +
                +
                +

                Sounds great, great work, great thanks !

                +
                +
                +
                +
                + + scientist wrote on 2010-11-27 14:34: +
                +
                +

                Do you know why the purely numerical benchmarks nbody and spectral-norm are still so much slower in PyPy compared to e.g. LuaJIT?

                +
                +
                +
                +
                + + tobami wrote on 2010-11-27 14:44: +
                +
                +

                This is awesome. PyPy 1.4 addresses the 2 slowest benchmarks, slowspitfire and spambayes. There is no benchmark anymore where PyPy is much slower than CPython.

                To me, this marks the first time you can say that PyPy is ready for general "consumption". Congratulations!

                PS: The best comparison to appreciate how much of an improvement 1.4 has been is:
                https://speed.pypy.org/comparison/?exe=2%2B35,1%2B41,1%2B172&ben=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20&env=1&hor=false&bas=2%2B35&chart=normal+bars

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-27 17:37: +
                +
                +

                @scientist

                Sure, because LuaJIT is crazy when it comes to optimizations :-) We'll get there eventually, but purely numerical stuff is not as high on our list as other things.

                +
                +
                +
                +
                + + Luis wrote on 2010-11-27 18:37: +
                +
                +

                @maciej: in an old thread (have tracing compilers won?) you replied to Mike Pall saying that pypy was in a way middle ground, that it didn't offer as much opportunities for micro optimizations as luajit.

                You were discussing about keeping high level constructions from the user program to perform more tricks.

                Has the situation changed?
                Do you really think now that you'll get there?

                Anyway, let me tell you that you are all already my super heroes :-)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-27 18:46: +
                +
                +

                Heh, I don't remember that :-)

                Anyway, LuaJIT has more options for microoptimziations simply because Lua is a simpler language. That doesn't actually make it impossible for PyPy, it simply make it harder and taking more time (but it's still possible). I still think we can get (but predicting future is hard) where LuaJIT is right now, but racing Mike would be a challenge that we might loose ;-)

                That said, even in simple loops there are obvious optimizations to be performed, so we're far from being done. We're going there, but it's taking time ;-)

                +
                +
                +
                +
                + + Victor wrote on 2010-11-27 19:33: +
                +
                +

                Congrats to all PyPy developers for making huge contributions to Python performance, JIT and implementation research and delivering an end product that will help many developers to get more done.

                IIUC, we still have ARM, jit-unroll-loops, more memory improvements, Python 2.7 (Fast Forward branch) and a bunch of other cool improvements in the works, besides some known interesting targets that will eventually be tackled (e.g. JITted stackless).

                I wish more big Python apps and developers would play with PyPy and report the results.

                Cheers!

                P.S.: Fijal: see https://lambda-the-ultimate.org/node/3851#comment-57715

                +
                +
                +
                +
                + + Michal M. wrote on 2010-11-29 18:55: +
                +
                +

                Congratulations.
                However, you suggest people used it in production environment - please, give us version compatible at least with CPython 2.6.
                I hope that you plan it but at first you wanted to have stable and fast base. :)

                +
                +
                +
                +
                + + Amaury Forgeot d'Arc wrote on 2010-12-01 22:21: +
                +
                +

                @Michal:
                There is already an ongoing effort to port PyPy to Python 2.7.

                But we need some help! It's a good way to become a PyPy developer.
                And no, you don't have to be a JIT expert to implement itertools.combinations or asian codecs.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-09 00:18: +
                +
                +

                kudos to whip-smart guys for this wonderful piece of software.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/11/running-large-radio-telescope-software-7600337209616168504.html b/posts/2010/11/running-large-radio-telescope-software-7600337209616168504.html new file mode 100644 index 000000000..69d3cc914 --- /dev/null +++ b/posts/2010/11/running-large-radio-telescope-software-7600337209616168504.html @@ -0,0 +1,432 @@ + + + + + +Running large radio telescope software on top of PyPy and twisted | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Running large radio telescope software on top of PyPy and twisted

                + + + +
                +

                Hello.

                +

                As some of you already know, I've recently started working on a +very large radio telescope at SKA South Africa. This telescope's +operating software runs almost exclusively on Python (several high throughput +pieces are in C or CUDA or directly executed by FPGAs). Some cool telescope pictures:

                + +
                + +
                +
                + +
                + +

                (photos courtesy of SKA South Africa)

                +

                Most of the operation software is using the KatCP protocol to talk between devices. +The currently used implementation is Open Source software with a custom home built +server and client. As part of the experiments, I've implemented a Twisted based +version and run it on top of CPython and PyPy for both the default +implementation and the one based on Twisted to see how those perform.

                +

                There are two testing scenarios: the first one is trying to saturate the connection +by setting up multiple sensors that report state every 10ms, the second one +is measuring a round-trip between sending a request and receiving the response. +Both numbers are measuring the number of requests per 0.2s, so the more the better. On the X axis is the number of simultaneously connected clients.

                +

                All benchmark code is available in the KatCP repository.

                +

                The results are as follows:

                + +
                + +
                +
                + +
                +

                As you can see, in general Twisted has larger overhead for a single client +and scales better as the number of clients increases. That's I think expected, +since Twisted has extra layers of indirection. The round trip degradation of +Twisted has to be investigated, but for us scenario1 is by far more important.

                +

                All across the board PyPy performs much better than CPython for both +Twisted and a home-made solution, which I think is a pretty good result.

                +

                Note: we didn't roll this set up into production yet, but there are high +chances for both twisted and PyPy to be used in some near future.

                +

                Cheers, +fijal

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2010-11-16 01:56: +
                +
                +

                Why not try PyZmq (https://www.zeromq.org/bindings:python):)? the IPython project(https://ipython.scipy.org/moin/) is also moving from Twisted
                to PyZmq.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-16 06:23: +
                +
                +

                Sorry this is not an apropriate forum to discuss this. One of the reasons would be that Twisted and PyZmq are doing two completely different things and PyZmq won't work on PyPy.

                +
                +
                +
                +
                + + Michal M. wrote on 2010-11-16 07:59: +
                +
                +

                Oh, I envy you. And congratulations.
                Keep working.
                I wait for 2.6 compatible ver. of PyPy to try it with my little project.

                A widząc, że prawdopodobnie rodak, to tym bardziej się cieszę.

                +
                +
                +
                +
                + + glyph wrote on 2010-11-16 09:13: +
                +
                +

                Maciej, this is great news. Congratulations.

                I look forward to making PyPy+Twisted even faster from the Twisted side :).

                +
                +
                +
                +
                + + Alessio Sclocco wrote on 2010-11-17 14:00: +
                +
                +

                Hi Maciej,

                You say that there you are mostly using Python and sometimes C, CUDA or FPGAs.
                I am writing my master thesis in the Netherlands, it is about the efficient implementation of a beam forming algorithm (the one used by the LOFAR) on modern GPUs using CUDA and OpenCL. Do you have some papers or other material there about the telescope software ? I would be really interested on citing it on the related works part.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-17 16:26: +
                +
                +

                Hey Alessio. I think this blog is not really a good medium for 2-way communication feel free to come to #pypy on irc.freenode.net or write to me directly at fijall at gmail.

                In general, we don't want beam forming to be performed on GPU (because it's hard), but rather on custom-built hardware and FPGAs.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-21 10:40: +
                +
                +

                I have a program using Python and Twisted where I load tested both server and client connections (the program can do both the server and client protocol). I tested both types out to 100 connections (at 50 milli-second polling intervals) while measuring CPU load.

                What I found was that when acting as a server it scaled fairly linearly. When acting as the client side however, load rose to a peak about 60 clients, then fell by a third until 80 clients, and then rose again until at 100 clients it reached the same load level as at 60. If you have a similar situation you may need to watch out for this phenomenon.

                I also found that using the epoll reactor on Linux made a *big* difference to capacity in my applications, much more so than any normal program optimization efforts that I made. I have multiple clients and multiple server ports all running simultaneously, so I'm not sure how this may translate to your application if you are only using Twisted as a server.

                Here's a link to my project web site where I show the connections versus CPU load chart (first chart):

                https://mblogic.sourceforge.net/mblogichelp/general/Capacity-en.html

                I haven't tested this with PyPy as I don't have a combination of anything that is both 32-bit *and* new enough to run a recent version.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-21 12:16: +
                +
                +

                PyPy has 64bit support these days.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-22 20:28: +
                +
                +

                I also made the previous anonymous post on the 21st. I haven't been able to get the 64 bit JIT version to run or build. That may be my fault, but I haven't been able to test it (this isn't a problem that I want to waste your time on however).

                I have tested the non-JIT Pypy using a simplified version of my server and client programs, using asyncore instead of Twisted. The server and client use a standard industrial automation protocol to talk to each other over a TCP socket. The programs also make heavy use of list slicing and struct.

                The non-JIT version passes all the tests I have for the server, and runs my application performance test at roughly 1/3 the speed of CPython 2.6. This is very impressive, as I have never been able to get either IronPython (on Mono) nor Jython to even run the programs, let alone pass my functional tests. The fact that Pypy (non-JIT) can run these programs perfectly without changes is something that I find very promising.

                Please continue the good work, and thank you for what you've done so far!

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-23 08:13: +
                +
                +

                Hey, great to hear!

                Well, the non-JIT version would rather be slow, but that's fine :) We try very hard to produce a compliant python interpreter and twisted folk helped us greatly with getting all the posix stuff right.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Unknown wrote on 2010-11-26 23:53: +
                +
                +

                I would be very interested if someone could provide some info on how to get twisted working on pypy. I have managed to install twisted in the pypy setup but starting it produces:
                AttributeError: 'module' object has no attribute 'load_dynamic'
                coming from zope

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/11/snake-which-bites-its-tail-pypy-jitting-5161284681004717142.html b/posts/2010/11/snake-which-bites-its-tail-pypy-jitting-5161284681004717142.html new file mode 100644 index 000000000..cbb15cf15 --- /dev/null +++ b/posts/2010/11/snake-which-bites-its-tail-pypy-jitting-5161284681004717142.html @@ -0,0 +1,598 @@ + + + + + +A snake which bites its tail: PyPy JITting itself | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                A snake which bites its tail: PyPy JITting itself

                + + + +
                + + + +

                We have to admit: even if we have been writing for years about the fantastic +speedups that the PyPy JIT gives, we, the PyPy developers, still don't use it +for our daily routine. Until today :-).

                +

                Readers brave enough to run translate.py to translate PyPy by themselves +surely know that the process takes quite a long time to complete, about an hour +on super-fast hardware and even more on average computers. Unfortunately, it +happened that translate.py was a bad match for our JIT and thus ran much +slower on PyPy than on CPython.

                +

                One of the main reasons is that the PyPy translation toolchain makes heavy use +of custom metaclasses, and until a few weeks ago metaclasses disabled some of +the central optimizations which make PyPy so fast. During the recent +Düsseldorf sprint, Armin and Carl Friedrich fixed this problem and +re-enabled all the optimizations even in the presence of metaclasses.

                +

                So, today we decided that it was time to benchmark again PyPy against itself. +First, we tried to translate PyPy using CPython as usual, with the following +command line (on a machine with an "Intel(R) Xeon(R) CPU W3580 @ 3.33GHz" and +12 GB of RAM, running a 32-bit Ubuntu):

                +
                +$ python ./translate.py -Ojit targetpypystandalone --no-allworkingmodules
                +
                +... lots of output, fractals included ...
                +
                +[Timer] Timings:
                +[Timer] annotate                       ---  252.0 s
                +[Timer] rtype_lltype                   ---  199.3 s
                +[Timer] pyjitpl_lltype                 ---  565.2 s
                +[Timer] backendopt_lltype              ---  217.4 s
                +[Timer] stackcheckinsertion_lltype     ---   26.8 s
                +[Timer] database_c                     ---  234.4 s
                +[Timer] source_c                       ---  480.7 s
                +[Timer] compile_c                      ---  258.4 s
                +[Timer] ===========================================
                +[Timer] Total:                         --- 2234.2 s
                +
                +

                Then, we tried the same command line with PyPy (SVN revision 78903, x86-32 JIT +backend, downloaded from the nightly build page):

                +
                +$ pypy-c-78903 ./translate.py -Ojit targetpypystandalone --no-allworkingmodules
                +
                +... lots of output, fractals included ...
                +
                +[Timer] Timings:
                +[Timer] annotate                       ---  165.3 s
                +[Timer] rtype_lltype                   ---  121.9 s
                +[Timer] pyjitpl_lltype                 ---  224.0 s
                +[Timer] backendopt_lltype              ---   72.1 s
                +[Timer] stackcheckinsertion_lltype     ---    7.0 s
                +[Timer] database_c                     ---  104.4 s
                +[Timer] source_c                       ---  167.9 s
                +[Timer] compile_c                      ---  320.3 s
                +[Timer] ===========================================
                +[Timer] Total:                         --- 1182.8 s
                +
                +

                Yes, it's not a typo: PyPy is almost two times faster than CPython! +Moreover, we can see that PyPy is faster in each of the individual steps apart +from compile_c, which consists of just a call to make to invoke gcc. +The slowdown comes from the fact that the Makefile also contains a lot of +calls to the trackgcroot.py script, which happens to perform badly on PyPy +but we have not investigated why yet.

                +

                However, there is also a drawback: on this specific benchmark, PyPy consumes +much more memory than CPython. The reason why the command line above contains +--no-allworkingmodules is that if we include all the modules the +translation crashes when it's complete at 99% because it consumes all the 4GB +of memory which is addressable by a 32-bit process.

                +

                A partial explanation is that so far the assembler generated by the PyPy JIT +is immortal, and the memory allocated for it is never reclaimed. This is +clearly bad for a program like translate.py which is divided into several +independent steps, and for which most of the code generated in each step could +safely be thrown away when it's completed.

                +

                If we switch to 64-bit we can address the whole 12 GB of RAM that we have, and +thus translating with all working modules is no longer an issue. This is the +time taken with CPython (note that it does not make sense to compare with the +32-bit CPython translation above, because that one does not include all the +modules):

                +
                +$ python ./translate.py -Ojit
                +
                +[Timer] Timings:
                +[Timer] annotate                       ---  782.7 s
                +[Timer] rtype_lltype                   ---  445.2 s
                +[Timer] pyjitpl_lltype                 ---  955.8 s
                +[Timer] backendopt_lltype              ---  457.0 s
                +[Timer] stackcheckinsertion_lltype     ---   63.0 s
                +[Timer] database_c                     ---  505.0 s
                +[Timer] source_c                       ---  939.4 s
                +[Timer] compile_c                      ---  465.1 s
                +[Timer] ===========================================
                +[Timer] Total:                         --- 4613.2 s
                +
                +

                And this is for PyPy:

                +
                +$ pypy-c-78924-64 ./translate.py -Ojit
                +
                +[Timer] Timings:
                +[Timer] annotate                       ---  505.8 s
                +[Timer] rtype_lltype                   ---  279.4 s
                +[Timer] pyjitpl_lltype                 ---  338.2 s
                +[Timer] backendopt_lltype              ---  125.1 s
                +[Timer] stackcheckinsertion_lltype     ---   21.7 s
                +[Timer] database_c                     ---  187.9 s
                +[Timer] source_c                       ---  298.8 s
                +[Timer] compile_c                      ---  650.7 s
                +[Timer] ===========================================
                +[Timer] Total:                         --- 2407.6 s
                +
                +

                The results are comparable with the 32-bit case: PyPy is still almost 2 times +faster than CPython. And it also shows that our 64-bit JIT backend is as good +as the 32-bit one. Again, the drawback is in the consumed memory: CPython +used 2.3 GB while PyPy took 8.3 GB.

                +

                Overall, the results are impressive: we knew that PyPy can be good at +optimizing small benchmarks and even middle-sized programs, but as far as we +know this is the first example in which it heavily optimizes a huge, real world +application. And, believe us, the PyPy translation toolchain is complex +enough to contain all kinds of dirty tricks and black magic that make Python +lovable and hard to optimize :-).

                +
                +

                Comments

                +
                +
                +
                + + Victor wrote on 2010-11-09 17:50: +
                +
                +

                This is amazing, huge kudos to all PyPy developers!

                Do these results include "Håkan's jit-unroll-loops branch" you mentioned in sprint report? When are we going to get a release containing these improvements? And do the nightly builds include them?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-11-09 18:05: +
                +
                +

                @Victor: No, Håkan's branch has not been merged. It still has some problems that we don't quite know how to solve.

                The nightly builds include all other improvements though. We plan to do a release at some point soon.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-09 18:28: +
                +
                +

                This is great!

                One question: A while back, after the GSoC project for 64-bit, there was an issue with asmgcc-64 such that the 64-bit GC was slower than it should be.

                It appears from the performance described in this post, that that must be resolved now. Is that right?

                Thanks,
                Gary

                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2010-11-09 18:36: +
                +
                +

                There should be a way to not only throw away jit memory but somehow tell pypy to try to not use more than say 3gb of ram so it will not hit swap on 4gb machines.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-09 18:39: +
                +
                +

                @Gary yes, that is correct

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-09 23:00: +
                +
                +

                Wow, cool work!

                +
                +
                +
                +
                + + Eric van Riet Paap wrote on 2010-11-09 23:26: +
                +
                +

                Excellent.. congratulations!

                +
                +
                +
                +
                + + Unknown wrote on 2010-11-09 23:30: +
                +
                +

                Wow, looks great!

                Many thanks for posting the benchmark – and for your relentless work on pypy!

                One thing: Could you add tests comparing with programs converted to python3?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2010-11-10 07:21: +
                +
                +

                @ArneBab: I'm not sure what you mean, but consider that at the moment PyPy does not support Python 3, so it does not make sense to compare against it.

                +
                +
                +
                +
                + + Michael Foord wrote on 2010-11-10 16:04: +
                +
                +

                PyPy continues to get more and more impressive.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2010-11-10 17:02: +
                +
                +

                For reference, at some point (long ago) I tried to use Psyco to speed up translate.py on CPython; but it didn't make any difference -- I'm guessing it's because we have nested scope variables at a few critical points, which Psyco cannot optimize. Now I no longer have a need for that :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-10 23:19: +
                +
                +

                Very cool achievement. I'm curious however to know why the compile_c section is slower. I thought it was mostly waiting on external programs to run and so should have been a similar time to cpython? Congratulations!

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2010-11-11 10:28: +
                +
                +

                @Anonymous: you are right when you say that compile_c mostly invokes gcc, but also a python script called trackgcroot.py.

                The python script is run with the same interpreter used for translate.py (so pypy in this case), and it happens that it's slower than with cpython.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-11 13:03: +
                +
                +

                How come the 64 bit timings are so much worse than the 32 bit timings (both CPython and PyPy)?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-11-11 13:11: +
                +
                +

                @Anonymous: Because the 64bit version is translating all modules, which simply gives the translator a lot more to do. We cannot do that yet on 32bit due to memory problems.

                +
                +
                +
                +
                + + Victor wrote on 2010-11-11 17:26: +
                +
                +

                @cfbolz Well, but you sure can run the 64bit version with the same module list as you did for 32bit... So if running the benchmark again in the same conditions isn't a lot of work, it'd provide yet another interesting data point ;)

                +
                +
                +
                +
                + + adimasci wrote on 2010-11-12 07:13: +
                +
                +

                Nice work !

                +
                +
                +
                +
                + + Anonymous wrote on 2010-11-12 07:29: +
                +
                +

                In other words: The pypy jit compiler leaks a massive amount of memory. Will you address this issue?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-11-12 07:58: +
                +
                +

                Technically it's not "leaking". And yes, we're trying to address this issue.

                +
                +
                +
                +
                + + Tim Parkin wrote on 2010-11-23 10:10: +
                +
                +

                Yes I think the word you wanted was "uses" instead of "leaks". The latter implies unforeseen problems and errors, the former implies that memory usage hasn't been addressed yet... Just to reiterate - PyPy currently *uses* more memory than CPython.

                +
                +
                +
                +
                + + Tim Parkin wrote on 2010-11-23 10:10: +
                +
                +

                Oh and a huge congratulations for this achievement!!!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/11/speeding-up-pypy-by-donations-6035529829962326007.html b/posts/2010/11/speeding-up-pypy-by-donations-6035529829962326007.html new file mode 100644 index 000000000..334824f71 --- /dev/null +++ b/posts/2010/11/speeding-up-pypy-by-donations-6035529829962326007.html @@ -0,0 +1,345 @@ + + + + + +Speeding up PyPy by donations | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Speeding up PyPy by donations

                + + + +
                +
                +

                PyPy joins the Software Freedom Conservancy

                + +

                Good news. PyPy is now a member of the Software Freedom Conservancy (SFC), +see the SFC blog post. This allows us to manage non-profit monetary aspects of +the project independently from a company or particular persons. So we +can now officially receive donations both from people preferring right or +left sides, see the Donate buttons on our home page and our blog. +And you can use PayPal or Google Checkout. Donations are tax-exempt in the +USA and hopefully soon in Europe as well.

                +

                What's it going to get used for? For the immediate future we intend to use +the donations for funding travels of core contributors to PyPy sprints +who otherwise can't afford to come. So if you have no time but some +money you can help to encourage coding contributors to care for PyPy. +If we end up with bigger sums we'll see and take suggestions. Money +spending decisions will be done by core PyPy people according to +non-profit guidelines. And we'll post information from time to time +about how much we got and where the money went.

                +

                If you have any questions regarding the SFC membership or donations +you may send email to sfc at pypy.org which will be observed +by Carl Friedrich Bolz, Jacob Hallen and Holger Krekel - the initial +PyPy SFC representatives on behalf of the PyPy team. Many thanks go +out to Bradley M. Kuhn for helping to implement the PyPy SFC membership.

                +

                cheers,

                +

                Holger & Carl Friedrich

                +
                +
                +

                Comments

                +
                +
                +
                + + glyph wrote on 2010-11-11 08:13: +
                +
                +

                Congratulations, welcome to the SFC family! It's been great for Twisted. Just donated $25 myself - now go make Twisted faster on PyPy :).

                +
                +
                +
                +
                + + holger krekel wrote on 2010-11-11 13:13: +
                +
                +

                Thanks glyph. I realized we should have mentioned Twisted already in the post since you are working through the SFC for some time now. In fact, your being there was a good argument for us to also consider going there, so thanks for that :)

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2010-11-11 13:14: +
                +
                +

                @glyph cool, thanks! As for making Twisted faster, we already did some of that: https://bit.ly/aGCY6r
                No clue how these benchmarks reflect an actual application of course :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/12/leysin-winter-sprint-8115212435349091722.html b/posts/2010/12/leysin-winter-sprint-8115212435349091722.html new file mode 100644 index 000000000..1a07d82a4 --- /dev/null +++ b/posts/2010/12/leysin-winter-sprint-8115212435349091722.html @@ -0,0 +1,342 @@ + + + + + +Leysin Winter sprint | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Leysin Winter sprint

                + + + +
                +

                Hi all,

                + + + + +
                + +

                The next sprint will be in Leysin, Switzerland, during the week of the 16th-22nd of January 2011.

                + +

                Now that we have released 1.4, and plan to release 1.4.1 soon, the sprint is going to be mainly working on fixing issues reported by various users. Of course this does not prevent people from showing up with a more precise interest in mind.

                + +

                As usual, the break day on the sprint will likely be a day of skiing :-)

                + +

                Hoping to see you there.

                + +
                + +
                +

                Update: there are actually a number of branches that we want to polish and merge into trunk: at least fast-forward, jit-unroll-loops, arm-backend and jitypes2. For more details, see the announcement.

                +
                +

                Comments

                +
                +
                +
                + + Victor wrote on 2010-12-10 10:24: +
                +
                +

                Armin,
                Don't forget to ping pypy-sprint about this to avoid people getting confused again ;)

                +
                +
                +
                +
                + + Armin Rigo wrote on 2010-12-10 16:28: +
                +
                +

                Victor: sorry, I don't get you. Do you mean, to tell people about the updates I did to the blog post? Or just to send the announcement to pypy-sprint too (I only sent it to pypy-dev so far)?

                +
                +
                +
                +
                + + Victor wrote on 2010-12-10 22:12: +
                +
                +

                I meant just to send the announcement to pypy-sprint too. IIRC, someone looked there for the last sprint and got confused by the announcement of a sprint in the same month of last year.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/12/oh-and-btw-pypy-gets-funding-through-3568486750776147382.html b/posts/2010/12/oh-and-btw-pypy-gets-funding-through-3568486750776147382.html new file mode 100644 index 000000000..5fdcfb486 --- /dev/null +++ b/posts/2010/12/oh-and-btw-pypy-gets-funding-through-3568486750776147382.html @@ -0,0 +1,323 @@ + + + + + +Oh, and btw: PyPy gets funding through "Eurostars" | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Oh, and btw: PyPy gets funding through "Eurostars"

                + + + +
                +

                There is a supporting reason why we made so many advances in the last year: +funding through Eurostars, a European research funding program. +The title of our proposal (accepted in 2009) is: "PYJIT - a fast +and flexible toolkit for dynamic programming languages based on PyPy". +And the participants are Open End AB, the Heinrich-Heine-Universität +Düsseldorf (HHU), and merlinux GmbH.

                +

                It's not hard to guess what PYJIT is actually about, is it? +Quoting: "The PYJIT project will deliver a fast and flexible +Just-In-Time Compiler toolkit based on PyPy to the market of dynamic +languages. Our main aim is to showcase our project's results for the +Open Source language Python, providing unprecedented levels of +flexibility and with speed hitherto only available using statically +typed languages." (Details in German or in Swedish :-)

                +

                A subgoal is to improve our development and testing infrastructure, +mainly showcased by Holger's recent py.test releases, the testing tool +used by PyPy for its 16K tests and the speed.pypy.org infrastructure +(web app programmed by Miquel Torres on his own time).

                +

                The overall scope of this project is smaller than that of the previous EU project +from 2004 to 2007. The persons that are (or were) getting money to work +on PyPy are Samuele Pedroni (at Open End), Maciej Fijalkowski (as a +subcontractor), Carl Friedrich Bolz, Armin Rigo, Antonio Cuni (all at +HHU), and Holger Krekel (at merlinux) as well as Ronny Pfannschmidt (as +a subcontractor).

                +

                The Eurostars funding lasts until August 2011. What comes afterwards? +Well, for one, many of the currently funded people have done work without +getting funding in previous years. This will probably continue. +We also have non-funded people in the core group right now and we'll +hope to enlarge it further. But of course there are still large tasks +ahead which may greatly benefit from funding. We have set up a +donation infrastructure and maybe we can win one or more larger +organisations to provide higher or regular sums of money to fund future +development work. Another possibility for companies is to pay +PyPy developers to help and improve PyPy for their particular use cases.

                +

                And finally, your help, donations and suggestions are always +welcome and overall we hope to convince more and more people it's +worthwhile to invest into PyPy's future.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/12/pypy-14-release-aftermath-2979780282210978576.html b/posts/2010/12/pypy-14-release-aftermath-2979780282210978576.html new file mode 100644 index 000000000..c8c662335 --- /dev/null +++ b/posts/2010/12/pypy-14-release-aftermath-2979780282210978576.html @@ -0,0 +1,404 @@ + + + + + +PyPy 1.4 release aftermath | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.4 release aftermath

                + + + +
                +

                A couple days have passed since the announcement of the 1.4 release, and this +is a short summary of what happened afterwards. Let's start with +numbers:

                +
                  +
                • 16k visits to the release announcement on our blog
                • +
                • we don't have download statistics unfortunately
                • +
                • 10k visits to speed center +
                • +
                • most traffic comes from referring sites, reddit alone creating above a third +of our traffic
                • +
                +

                Not too bad for a project that doesn't have a well-established user base.

                +

                Lessons learned:

                +
                  +
                • Releases are very important. They're still the major way projects communicate +with the community, even if we have nightly builds that are mostly stable.
                • +
                • No segfaults were reported, no incompatibilities between JIT and normal +interpretation. We think that proves (or at least provides a lot of +experimental evidence) that our write-once-and-then-transform method is +effective.
                • +
                • A lot of people complained about their favorite module in C not working, we +should have made it clearer that CPyExt is in alpha state. Indeed, we +would like to know which C extension modules do work :-).
                • +
                • Some people reported massive speedups, others reported slowdowns compared +to CPython. Most of those slowdowns relate to modules being inefficient +(or doing happy nonsense), like ctypes. This is expected, given that +not all modules are even jitted (although having them jitted is usually +a matter of a couple of minutes).
                • +
                • Nobody complained about a lack of some stdlib module. We implemented the ones +which are used more often, but this makes us wonder if less used stdlib modules +have any users at all.
                • +
                +

                In general feedback has been overwhelmingly positive and we would like to +thank everyone trying (and especially those reporting problems).

                +

                Cheers,
                +fijal

                +
                +

                Comments

                +
                +
                +
                + + Bryan Murphy wrote on 2010-12-01 14:26: +
                +
                +

                I love what you guys are doing. Keep up the good work!

                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2010-12-01 17:07: +
                +
                +

                There was a complaint about the lack of ssl module by someone trying to use pg8000 with pypy. I wonder if pypy should focus on openssl or on the ssl module.

                +
                +
                +
                +
                + + Anonymous wrote on 2010-12-01 20:47: +
                +
                +

                pyglet actually seems to use the MacOS module to create some windows.

                +
                +
                +
                +
                + + Paul Boddie wrote on 2010-12-02 00:16: +
                +
                +

                I'm very impressed with what you've all achieved!

                I've been testing PyPy 1.4 with some code I'm working on which only depends on two pure-Python non-stdlib libraries, and although the result was a 50% longer running time than with Python 2.5, it's remarkable that the code behaves in the same way and produces the same results. When trying to produce a fully compatible implementation of something, it's no trivial task to get identical behaviour (even though I'm not really using "exotic" or "frivolous" language features): some corner case usually comes along and makes things difficult. To see a non-trivial example work just like "normal Python" is surely evidence that PyPy is ready for a wider audience.

                As for my code, I've been doing some profiling generally - it uses things like the array and bisect modules substantially - and will attempt to see how profile-directed improvements affect PyPy's performance.

                Keep up the good work!

                +
                +
                +
                +
                + + The Cannon Family wrote on 2010-12-03 07:11: +
                +
                +

                A lot of the standard library looks like it has volumes of _legacy_ code depending on it, even if the current bleeding edge people use it less. In my mind supporting essentially all the standard library is a good long term goal, but as pointed out, parts of it can wait. Eventually I would like to see Tkinter support, and I would surmise that it is the most used of the stuff that is not implemented. We use it in a couple items (+/- 10% of total code, not likely to change). I would guess that the situations where these obscure parts of the standard library are being used are the parts where speed is maybe not the most important thing, supporting an existing workflow or parsing legacy data is the key.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2010-12-05 08:09: +
                +
                +

                @The Cannon Family

                The question is why those legacy people would move to PyPy? PyPy is bleeding edge in a way.

                Besides a lot of those modules are like audioop or ossaudiodev. I don't see legitimate usecase for those, even in legacy code.

                +
                +
                +
                +
                + + Richard Jones wrote on 2010-12-09 04:02: +
                +
                +

                I'm very, very impressed and can't wait to use pypy in a real project. I'm blocked at the moment because I need pyglet on OS X (no MacOS module).

                I gave an introduction to cython at the local Python user group and for a lark I ran the original pure-Python code up against the cython version.

                cpython: 1.4s
                cython: 0.2s
                pypy: 0.2s

                Hmm :-)

                +
                +
                +
                +
                + + Xavier Combelle wrote on 2010-12-15 16:36: +
                +
                +

                I don't know how it is representative but for this usecase
                there is a factor 7 between pypy and cpython 2.7

                cpython 2.7
                >>> timeit.Timer('sum(x for x in xrange(100000))').repeat(10,100)
                [1.3338480523322112, 1.5916376967269201, 1.5959533140645483, 1.8427266639818676,
                1.3473615220676294, 1.842070271069737, 1.3346074032759319, 1.5859678554627408,
                1.8533299541683306, 1.5872797264355398]

                pypy 1.4
                >>>> timeit.Timer('sum(x for x in xrange(100000))').repeat(10,100)
                [7.5079355199007978, 7.9444552948765477, 7.2710836043080178, 7.5406516611307666,
                7.5192312421594352, 7.4927645588612677, 7.5075613773735768, 7.5201248774020826,
                7.7839006757141931, 7.5898334809973278]

                but maybe it is not representative

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/12/pypy-141-7283625923182122073.html b/posts/2010/12/pypy-141-7283625923182122073.html new file mode 100644 index 000000000..58b56c694 --- /dev/null +++ b/posts/2010/12/pypy-141-7283625923182122073.html @@ -0,0 +1,470 @@ + + + + + +PyPy 1.4.1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.4.1

                + + + +
                +

                Here is PyPy 1.4.1 :-)

                + +

                Update: Win32 binaries available.

                + +

                Enjoy!

                + +

                Release announcement

                + +

                We're pleased to announce +the 1.4.1 release of PyPy. +This release consolidates all the bug fixes that occurred since the +previous release. To everyone that took the trouble to report +them, we want to say thank you.

                + +

                What is PyPy

                + +

                PyPy is a very compliant Python interpreter, almost a drop-in +replacement for CPython. Note that it still only emulates Python +2.5 by default; the fast-forward branch with Python 2.7 +support is slowly getting ready but will only be integrated in +the next release.

                + +

                In two words, the advantage of trying out PyPy instead of CPython +(the default implementation of Python) is, for now, the +performance. Not all programs are faster in PyPy, but we are +confident that any CPU-intensive task will be much faster, at +least if it runs for long enough (the JIT has a slow warm-up +phase, which can take several seconds or even one minute on the +largest programs).

                + +

                Note again that we do support compiling and using C extension +modules from CPython (pypy setup.py install). However, this +is still an alpha feature, and the most complex modules typically +fail for various reasons; others work (e.g. PIL) but take a +serious performance hit. Also, for Mac OS X see below.

                + +

                Please note also that PyPy's performance was optimized almost +exclusively on Linux. It seems from some reports that on Windows +as well as Mac OS X (probably for different reasons) the +performance might be lower. We did not investigate much so far.

                + +

                More highlights

                + +
                  +
                • We migrated to Mercurial (thanks to Ronny Pfannschmidt and + Antonio Cuni for the effort) and moved to bitbucket. The new + command to check out a copy of PyPy is: +
                  hg clone https://bitbucket.org/pypy/pypy + +

                  +
                • +
                • In long-running processes, the assembler generated by old + JIT-compilations is now freed. There should be no more leak, + however long the process runs. + +

                  +
                • +
                • Improve a lot the performance of the binascii module, and + of hashlib.md5 and hashlib.sha. + +

                  +
                • +
                • Made sys.setrecursionlimit() a no-op. Instead, we rely purely + on the built-in stack overflow detection mechanism, which also + gives you a RuntimeError -- just not at some exact recursion + level. + +

                  +
                • +
                • Fix argument processing (now e.g. pypy -OScpass works like + it does on CPython --- if you have a clue what it does there + :-) ) + +

                  +
                • +
                • cpyext on Mac OS X: it still does not seem to work. I get + systematically a segfault in dlopen(). Contributions welcome. + +

                  +
                • +
                • Fix two corner cases in the GC (one in minimark, one in + asmgcc+JIT). This notably prevented pypy translate.py -Ojit + from working on Windows, leading to crashes. + +

                  +
                • +
                • Fixed a corner case in the JIT's optimizer, leading to Fatal + RPython error: AssertionError. + +

                  +
                • +
                • Added some missing built-in functions into the 'os' module. + +

                  +
                • +
                • Fix ctypes (it was not propagating keepalive information from + c_void_p). + +
                • +
                +
                +

                Comments

                +
                +
                +
                + + Symbol wrote on 2010-12-22 12:00: +
                +
                +

                Wow, and I thought 1.4.1 would come out after the january sprint!

                A christmas present :->

                What would be the focus of the january sprint then?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2010-12-22 12:09: +
                +
                +

                There are still a number of branches that have not been merged into trunk yet: at least fast-forward (Python 2.7), jit-unroll-loops (better JITting of arithmetic and short loops), arm-backend (JIT support on ARM) and jitypes2 (turn ctypes calls into real assembler-level calls with the JIT). There is also the stackless+JIT integration pending. Finally the sprint will also be a place to try out and run some applications. So it's not like we are out of work :-)

                +
                +
                +
                +
                + + Unknown wrote on 2010-12-22 13:10: +
                +
                +

                I'm interested in the performance improvement in hashlib.sha. I haven't seen that one before on https://speed.pypy.org . Could you give me more details?

                Regards,

                Zooko

                +
                +
                +
                +
                + + Armin Rigo wrote on 2010-12-22 13:59: +
                +
                +

                Actually, hashlib.sha was not the same as sha.sha: the former used to be a ctypes call to the OpenSSL lib, whereas the latter uses our built-in sha implementation. So hashlib.sha was faster in theory, but killed by the overhead of using ctypes. Now, at least in a default version of pypy, the hashlib.md5 and .sha are redirected to the built-in md5.md5 and sha.sha.

                Another issue was that with the built-in md5.md5 and sha.sha, on 64-bit, there was a 1.5x speed impact due to the C compiler not recognizing an expression that was meant to be a 32-bit integer rotation.

                I guess that https://speed.pypy.org don't show this because they use directly md5.md5 or sha.sha, and are on 32-bit.

                +
                +
                +
                +
                + + Martijn Faassen wrote on 2010-12-22 14:14: +
                +
                +

                Thanks for PyPy 1.4.1. I reported two issues concerning buildout with PyPy 1.4, and they all got fixed!

                So PyPy 1.4.1 is now compatible with buildout, which is really convenient as it makes it easy for me to test other projects.

                +
                +
                +
                +
                + + shadinger wrote on 2010-12-28 16:00: +
                +
                +

                I compiled 1.4.1 on Win32 using Visual C++ 2010.

                Do you want to add it to the download page?

                To whom shall I send it?

                Happy new year.

                +
                +
                +
                +
                + + Andrei wrote on 2011-01-28 20:08: +
                +
                +

                Hello,

                sorry, I'm a bit new here - is it possible that PyPy makes Python run in a browser? Somehow "translating" all the Python into Javascript?

                I'm wondering because I saw you run, for example, CLI, so perhaps PyPy may somehow enable Python in a browser?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-01-29 10:23: +
                +
                +

                Andrei: not directly. We played at some point with translating RPython code to Javascript, but it didn't give enough benefits (because it's not full Python that we can translate, just "RPython"). The alternative would be to translate the whole PyPy interpreter to Javascript, but that would give a result that is both huge (in term of download size) and horribly slow (100x slower than Javascript maybe).

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/12/pypy-migrates-to-mercurial-3308736161543832134.html b/posts/2010/12/pypy-migrates-to-mercurial-3308736161543832134.html new file mode 100644 index 000000000..fd996fd3a --- /dev/null +++ b/posts/2010/12/pypy-migrates-to-mercurial-3308736161543832134.html @@ -0,0 +1,361 @@ + + + + + +PyPy migrates to Mercurial | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy migrates to Mercurial

                + + + +
                +

                The assiduous readers of this blog surely remember that during the last +Düsseldorf sprint in October, we started the process for migrating our main +development repository from Subversion to Mercurial. Today, after more than +two months, the process has finally been completed :-).

                +

                The new official PyPy repository is hosted on BitBucket.

                +

                The migration has been painful because the SVN history of PyPy was a mess and +none of the existing conversion tools could handle it correctly. This was +partly because PyPy started when subversion was still at version 0.9 when some +best-practices were still to be established, and partly because we probably +managed to invent all the possible ways to do branches (and even some of the +impossible ones: there is at least one commit which you cannot do with the +plain SVN client but you have to speak to the server by yourself :-)).

                +

                The actual conversion was possible thanks to the enormous work done by Ronny +Pfannschmidt and his hackbeil tool. I would like to personally thank Ronny +for his patience to handle all the various requests we asked for.

                +

                We hope that PyPy development becomes even more approachable now, at least from +a version control point of view.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2010-12-14 20:19: +
                +
                +

                Awesome! Besides simplifying life for potential new contributors, it's very nice to be able to follow progress using the shortlog on bitbucket.org.

                +
                +
                +
                +
                + + Vladimir wrote on 2010-12-14 21:08: +
                +
                +

                Over 9000 branches :/

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2010-12-14 22:34: +
                +
                +

                @Владимир: 9000? I count 459 on my local repo, which is still a lot, but not so much :-)
                Anyway, most of them are closed, it's just that bitbucket displays also those. And I think that the huge number of branches is another evidence of the "we are not heroes" thing :-)
                https://morepypy.blogspot.com/2010/12/we-are-not-heroes-just-very-patient.html

                +
                +
                +
                +
                + + Michael Foord wrote on 2010-12-15 01:38: +
                +
                +

                Hey, you guys are *my* heroes. :-)

                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2010-12-15 13:03: +
                +
                +

                "PyPy is faster than CPython, again" should be the title. Faster at migrating to mercurial

                :)

                Great work, now pypy could be even more self hosting if it would run hg on it, when it becomes faster than cpython and stable to do so.

                +
                +
                +
                +
                + + Bernhard Leiner wrote on 2010-12-15 20:28: +
                +
                +

                PyPy running Mercurial is actually not to far away...

                https://markmail.org/message/wjik2ecanvmt463y#query:+page:1+mid:qbdxn3566j2y7piu+state:results

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2010/12/we-are-not-heroes-just-very-patient-7114408885070101720.html b/posts/2010/12/we-are-not-heroes-just-very-patient-7114408885070101720.html new file mode 100644 index 000000000..6b1185a5c --- /dev/null +++ b/posts/2010/12/we-are-not-heroes-just-very-patient-7114408885070101720.html @@ -0,0 +1,356 @@ + + + + + +We are not heroes, just very patient | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                We are not heroes, just very patient

                + + + +
                +

                Inspired by some of the comments to the release that said "You are heroes", I thought a bit about the longish history of PyPy and hunted around for some of the mailing list posts that started the project. Then I put all this information together into the following timeline:

                + +

                There is also a larger version of the timeline. Try to click on some of the events, the links usually go to the sprint descriptions. I also tried to find pictures for the sprints but succeeded for only half of them, if anybody still has some, I would be interested. It's kind of fun to browse around in some of the old sprint descriptions to see how PyPy evolved. Some of the current ideas have been around for a long time, some are new. In the description of the releases I put estimates for the speed of the release.

                +
                +

                Comments

                +
                +
                +
                + + Symbol wrote on 2010-12-01 14:37: +
                +
                +

                Many promising projects bite the dust not due to lack of talent, interest, need or support, but perseverance.

                Not only do I believe that pypy has yet to realize it's full potential, I believe that it will actually achieve it. And then some.

                So again, keep up the good work!!
                p.s
                (my flattr account is not yet operational ;-<)

                +
                +
                +
                +
                + + Symbol wrote on 2010-12-01 14:41: +
                +
                +

                Question,
                What do the funds(EU, eurostars) cover?

                I see that there had been a burst of activity during the EU period.

                Does this mean that funding is a bottleneck to this project? Would the end of the current eurostars funding be an obstacle?

                +
                +
                +
                +
                + + Skandalfo wrote on 2010-12-01 19:16: +
                +
                +

                Then you are just very patient heroes :-D

                +
                +
                +
                +
                + + holger krekel wrote on 2010-12-01 22:16: +
                +
                +

                Sure, funding does make a difference. There are couple of people currently (Anto, Armin, Carl Friedrich, partially Maciej, me ...) who get some money through the Eurostars project. This does make a difference in terms of how much time can be devoted. I guess there should be a clarifying blog post on this and maybe also some opinions and views on how things can continue after the funding period (which ends second half next year).

                +
                +
                +
                +
                + + Nik Haldimann wrote on 2010-12-02 03:28: +
                +
                +

                Amazing how far you have come. Congrats!

                I found myself in 3 of those old sprint pictures, and I remember all of them as very good times that overall probably taught me more than the school I was attending during that time.

                +
                +
                +
                +
                + + glyph wrote on 2010-12-02 07:21: +
                +
                +

                This timeline sort of makes the point. You are heroes ;). Patience is harder than a few nights of crazy hacking and brilliant ideas.

                +
                +
                +
                +
                + + Martijn Faassen wrote on 2010-12-10 12:21: +
                +
                +

                Yeah, you have more heroic patience than I tended to display
                cheerleading/criticizing the project.

                Currently there's nothing left to criticize for me -- I think everything's being done pretty much right (communication, releases, even work on C-module support!).

                But that might change once I start to use the project seriously. :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html b/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html new file mode 100644 index 000000000..1efade25f --- /dev/null +++ b/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html @@ -0,0 +1,476 @@ + + + + + +A JIT Backend for ARM Processors | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                A JIT Backend for ARM Processors

                + + + +
                +
                +In the past few months, I have been developing as a part of my master thesis +the ARM backend for the PyPy JIT, in the arm-backend branch. Currently, it is still work in progress: all integer and object operations are working and +the support for floating point is also under development.
                +ARM processors are very widely used, being deployed in servers, some netbooks +and mainly mobile devices such as phones and tablets. One of our goals is to be +able to run PyPy on phones, especially on Android. Currently it is not yet possible +to translate and compile PyPy for Android automatically, but there has been +some work on using Android's NDK to compile PyPy's generated C code.
                +The JIT Backend targets the application profile of the ARMv7 instruction set +architecture which is found for example in the Cortex-A8 processors used in many Android powered devices and in Apple's A4 processors built into the latest iOS devices. To develop and +test the backend we are using a BeagleBoard-xM which has a 1 GHz ARM +Cortex-A8 and 512 MB of RAM running the ARM port of Ubuntu 10.10.
                +Currently on Linux it is possible to translate and cross-compile PyPy's Python +interpreter as well as other interpreters with the ARM JIT backend enabled +using Scratchbox 2 to provide a build environment and the GNU ARM cross +compilation toolchain. So far the backend only supports the Boehm garbage +collector which does not produce the best results combined with the JIT, but we +plan to add support for the other GCs in the future, doing so should increase +the performance of PyPy on ARM.
                +While still debugging the last issues with the backend we already can run some +simple benchmarks on Pyrolog, a prolog interpreter written in RPython. +Even using Boehm as the GC the results look very promising. In the benchmarks +we compare Pyrolog to SWI-Prolog, a prolog interpreter written in C, which +is available from the package repositories for Ubuntu's ARM port.
                +The benchmarks can be found in the pyrolog-bench repository.
                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                BenchmarkSWI-Prolog in ms.Pyrolog in ms.Speedup
                iterate60.06.010.0
                iterate_assert130.06.021.67
                iterate_call3310.05.0662.0
                iterate_cut60.0359.00.16713
                iterate_exception4950.0346.014.306
                iterate_failure400.0127.03.1496
                iterate_findall740.0No res.
                iterate_if140.06.023.333
                +The iterate_call benchmark, which constructs a predicate and calls it at +runtime, with a speedup of 662 times over SWI-Prolog is an example where the +JIT can show its strength. The Pyrolog interpreter and the JIT treat +dynamically defined predicates as static ones and can generate optimized code +in both cases. Whereas SWI only compiles statically defined rules and has to +fall back to interpretation on dynamic ones.
                +For simple benchmarks running on PyPy's Python interpreter we see some speedups +over CPython, but we still need to debug the backend a bit more before we can +show numbers on more complex benchmarks. So, stay tuned.
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-01-30 10:01: +
                +
                +

                Awesome stuff. I have a panda board and another xm that's usually not doing much if you want to borrow some cycles :-)

                When you support floats will you be aiming for hard float? It's the way of the future, I hear...

                +
                +
                +
                +
                + + Unknown wrote on 2011-01-30 11:47: +
                +
                +

                I am curious if you had any use for ThumbEE (or Jazelle RCT) to speed up?

                +
                +
                +
                +
                + + David Schneider wrote on 2011-01-30 19:05: +
                +
                +

                @mwhudson: thanks it would be great to be able to test on more hardware.

                For the float support we still need to investigate a bit, but if possible I would like to target hard floats.

                @dbrodie: currently we are targeting the arm state, so not at the moment.

                +
                +
                +
                +
                + + Martijn Faassen wrote on 2011-01-31 14:11: +
                +
                +

                One would imagine conserving memory would be an important factor on mobile devices. Even though mobile devices have a growing amount of memory available, it will still be less than desktops for the forseeable future. Memory pressure can create real slowdowns.

                A JIT normally takes more memory, but on the other hand PyPy offers features to reduce usage of memory. Could you share some of your thinking on this?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-02-05 19:51: +
                +
                +

                Martijn: you are describing the situation as well as we (at least I) know it so far: while PyPy has in many cases a lower non-JIT memory usage, the JIT adds some overhead. But it seems to be within ~200MB on "pypy translate.py", which is kind of the extreme example in hugeness. So already on today's high-end boards with 1GB of RAM, it should easily fit. Moreover it can be tweaked, e.g. it's probably better on these systems to increase the threshold at which JITting starts (which also reduces the number of JITted code paths). So I think that the possibility is real.

                +
                +
                +
                +
                + + Dan wrote on 2011-04-30 16:40: +
                +
                +

                Showing speedups over repetitive instructions (which caching & JIT are really good at) is irrelevant.

                What happens when people use real benchmarks, like constraint-based solvers and non-iterative stuff (maybe take a look at the other benchmarks) ...

                Prolog is a declative language, not a sysadmin scripting language.

                Also, the SWI implementation adds so many functionalities, it's like making a «Extract chars from an RDBMS vs Text files» benchmark.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-05-02 19:02: +
                +
                +

                @Dan

                Why are you so defensive? This benchmark is clearly not about how fast Pyrolog is, but how the ARM JIT backend performs, using trivial Prolog microbenchmarks, with SWI to give a number to compare against.

                Pyrolog is a minimal Prolog implementation that is (at least so far) mostly an experiment to see how well PyPy's JIT technology can do on an non-imperative language. This paper contains more interesting benchmarks:

                https://portal.acm.org/citation.cfm?id=1836102

                +
                +
                +
                +
                + + jamu wrote on 2011-05-16 13:11: +
                +
                +

                Hi,
                Is there a way to cross compile on a host machine (but not with scratch box) where I have tool chain and file system for the target?

                Any instructions for building with arm back-end?

                Cheers

                +
                +
                +
                +
                + + David Schneider wrote on 2011-06-08 20:41: +
                +
                +

                @jamu: scratchbox 2 is currently the only option to cross-translate pypy for ARM. You can find some documentation about the cross translation at https://foss.heptapod.net/pypy/pypy/-/tree/branch/arm-backend-2/pypy/doc/arm.rst

                +
                +
                +
                +
                + + vak wrote on 2011-09-30 10:12: +
                +
                +

                Sounds very cool, are there any updates?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/01/loop-invariant-code-motion-1998392217676829154.html b/posts/2011/01/loop-invariant-code-motion-1998392217676829154.html new file mode 100644 index 000000000..d1963b8a4 --- /dev/null +++ b/posts/2011/01/loop-invariant-code-motion-1998392217676829154.html @@ -0,0 +1,858 @@ + + + + + +Loop invariant code motion | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Loop invariant code motion

                + + + +
                +

                Recently, the jit-unroll-loops branch was merged. It implements the +idea described in +Using Escape Analysis Across Loop Boundaries for Specialization. +That post does only talk about virtuals, but the idea turned out +to be more far reaching. After the metainterpreter produces a trace, +several optimizations are applied to the trace before it is turned +into binary code. Removing allocations is only one of them. There are also +for instance +

                +
                  +
                • Heap optimizations that remove memory accesses by reusing results + previously read from or written to the same location. +
                • +
                • Reusing of the results of pure operations if the same pure + operation is executed twice. +
                • +
                • Removal of redundant guards. +
                • +
                • ... +
                • +
                +A lot of these optimizations are in one way or another removing +operations from the trace and/or reusing previous results. All of these +optimizations could benefit from being able to operate across loop +boundaries. Not only in the sense that operations operating on loop +invariants could be moved out of the loop entirely. But also that +results produced at the end of an iteration could be reused at the +beginning of the next even if there are no loop invariants involved. + +

                + +This is achieved by unrolling the trace into two iterations, and +letting the optimizer work on this two-iteration-trace. +The optimizer will now be able to optimize the second iteration more than the +first since it can reuse results from the first iteration. The +optimized version of the first iteration we call the preamble and the +optimized version of the second iteration we call the loop. The +preamble will end with a jump to the loop, while the loop will end +with a jump to itself. This means that the preamble will be executed +once for the first iteration, the loop will be executed for all following +iterations. + +

                +

                +

                +

                Sqrt example

                +Here is an example of a Python implementation of sqrt using a fairly +simple algorithm + +

                + + + +

                +
                def sqrt(y, n=10000):
                +    x = y / 2
                +    while n > 0:
                +        n -= 1
                +        x = (x + y/x) / 2
                +    return x
                +
                +

                + +If it is called with sqrt(1234.0), +a fairly long trace is produced. From this trace +the optimizer creates +the +following preamble (Loop 1) and loop (Loop 0) + + +

                +

                + +

                +

                + +Looking at the preamble, it starts by making sure that it is not +currently being profiled, the guard +on i5, and that the function object has not been changed +since the trace was made, the guard on p3. Somewhat +intermixed with that, the +integer variable n is unboxed, by making sure p11 +points to an integer object and reading out the integer value from +that object. +These operations are not needed in the +loop (and have been removed from it) as emitting the same guards again +would be redundant and n becomes a virtual before the +end of the preamble. +

                +
                +        guard_value(i5, 0, descr=<Guard6>) 
                +        guard_nonnull_class(p11, ConstClass(W_IntObject), descr=<Guard7>) 
                +        guard_value(p3, ConstPtr(ptr15), descr=<Guard8>) 
                +        i16 = getfield_gc_pure(p11, descr=<W_IntObject.inst_intval>)
                +
                + +Next come a test and a guard implementing the while statement +followed by the decrementing of n. These operations appear +both in the preamble and in the loop +
                +        i18 = int_gt(i16, 0)
                +        guard_true(i18, descr=<Guard9>) 
                +        i20 = int_sub(i16, 1)
                +
                + +After that the two floating point variables x and y +are unboxed. Again this is only needed in the preamble. Note how the +unboxed value of y, called f23, is passed unchanged +from the preamble to the loop in arguments of the jump +to allow it to be reused. It will not become a virtual +since it is never changed within the loop. +
                +        guard_nonnull_class(p12, 17652552, descr=<Guard10>) 
                +        guard_nonnull_class(p10, 17652552, descr=<Guard11>) 
                +        f23 = getfield_gc_pure(p10, descr=<W_FloatObject.inst_floatval>)
                +        f24 = getfield_gc_pure(p12, descr=<W_FloatObject.inst_floatval>)
                +
                + +Following that is the actual calculations performed in the loop in +form of floating point operations (since the function was called with +a float argument). These appear in both the loop +and the preamble. +
                +        i26 = float_eq(f24, 0.000000)
                +        guard_false(i26, descr=<Guard12>) 
                +        f27 = float_truediv(f23, f24)
                +        f28 = float_add(f24, f27)
                +        f30 = float_truediv(f28, 2.000000)
                +
                + +Finally there are some tests checking if a signal was received +(such as when the user presses ctrl-C) and thus should execute some +signal handler or if we need to hand over to another thread. This is +implemented with a counter that is decreased once every iteration. It +will go below zero after some specific number of iterations, tunable by +sys.setcheckinterval. The counter is read from and written to +some global location where it also can be made negative by a C-level +signal handler. +
                +        i32 = getfield_raw(32479328, descr=<pypysig_long_struct.c_value>)
                +        i34 = int_sub(i32, 2)
                +        setfield_raw(32479328, i34, descr=<pypysig_long_struct.c_value>)
                +        i36 = int_lt(i34, 0)
                +        guard_false(i36, descr=<Guard13>) 
                +        jump(p0, p1, p2, p4, p10, i20, f30, f23, descr=<Loop0>)
                +
                + +

                +

                +

                Bridges

                + +When a guard fails often enough, the meta-interpreter is started again +to produce a new trace starting at the failing guard. The tracing is +continued until a previously compiled loop is entered. This could +either be the same loop that contains the failing guard +or some completely different loop. If it is the same loop, executing +the preamble again may be unnecessary. +It is preferable to end the bridge with a jump directly to +the loop. To achieve this the optimizer tries to produce short + preambles that are inlined at the end of bridges allowing +them to jump directly to the loop. Inlining is better than jumping to +a common preamble because most of the inlined short preamble can +typically be removed again by the optimizer. +Creating such a short +preamble is however not always possible. Bridges jumping to loops for which +no short preamble can be generated have to end with a jump to the +full preamble instead. + +

                + +The short preamble is created by comparing the operations in the +preamble with the operations in the loop. The +operations that are in the preamble but not in the loop +are moved to the short preamble whenever it is safe to move them to +the front of the operations remaining. In other words, the full preamble +is equivalent to the short preamble followed by one iteration of the +loop. + +

                +

                + +This much has currently been implemented. To give the full picture +here, there are two more features that +hopefully will be implemented in the near future. +The first is to replace the full preamble, used by the interpreter +when it reaches a compiled loop, with the short preamble. +This is currently not done and is probably not as straight forward as +it might first seem. The problem is where to resume interpreting on a +guard failure. However, implementing that should save some +memory. Not only +because the preamble will become smaller, but mainly because the +guards will appear either in the loop or in the preamble, but not +in both (as they do now). That means there will only be a single bridge and +not potentially two copies once the guards are traced. + +

                +

                + +The sqrt example above would with a short preamble result in a trace +like this + +

                +

                + +

                +If it is executed long enough, the last guard will be traced to form a +bridge. The trace will inherit the virtuals from its parent. This can +be used to optimize away the part of the inlined short preamble +that deals with virtuals. The resulting bridge should look +something like + +
                +    [p0, p1, p2, p3, p4, f5, i6]
                +    i7 = force_token()
                +    setfield_gc(p1, i7, descr=<PyFrame.vable_token>)
                +    call_may_force(ConstClass(action_dispatcher), p0, p1, descr=<VoidCallDescr>)
                +    guard_not_forced(, descr=<Guard19>) 
                +    guard_no_exception(, descr=<Guard20>) 
                +
                +    guard_nonnull_class(p4, 17674024, descr=<Guard21>) 
                +    f52 = getfield_gc_pure(p4, descr=<W_FloatObject.inst_floatval>)
                +    jump(p1, p0, p2, p3, p4, i38, f53, f52, descr=<Loop0>)
                +
                + +Here the first paragraph comes from the traced bridge and the second +is what remains of the short preamble after optimization. The +box p4 is +not a virtual (it contains a pointer to y which is never +changed), and it is only virtuals +that the bridge inherits from its parents. This is why the last two +operations currently cannot be removed. + + +

                + +Each time the short preamble is inlined, a new copy of each of the +guards in it is generated. Typically the short preamble is inlined in +several places and thus there will be several copies of each of those +guards. +If they fail often enough bridges +from them will be traced (as with all guards). But since there +typically are several copies of each guard the same bridge +will be generated in +several places. To prevent this, mini-bridges from the inlined guards +are produced already during the inlining. These mini-bridges contain +nothing but a jump to the preamble. + +

                +

                +The mini-bridges need the arguments of the preamble to be able +to jump to it. These arguments contain among other things, boxed +versions of the +variables x and y. Those variables are virtuals in +the loop, and have to be allocated. Currently those allocations +are placed in front of the inlined guard. Moving those allocations into +the mini-bridges is the second feature that +hopefully will be implemented in the near future. + +After this feature is +implemented, the result should look something like +

                +

                + + + +

                +

                +

                +

                Multiple specialized versions

                + +Floating point operations were generated in the trace above +because sqrt was called with a float argument. If it is +instead called with an int argument, integer operations will be generated. The +somewhat more complex situation is when both ints and floats are +used as arguments. Then the jit needs to generate multiple versions of +the same loop, specialized in different ways. The details, given +below, on how this is achieved are somewhat involved. For the casual +reader it would make perfect sense to skip to the next section here. + +

                + +Consider the case when sqrt is first called with a float +argument (but with n small enough not to generate the +bridge). Then the trace shown above will be +generated. If sqrt is now called with an int argument, the +guard in the preamble testing that the type of the input object is float +will fail: +

                +
                +        guard_nonnull_class(p12, 17652552, descr=<Guard10>) 
                +
                +It will fail every iteration, so soon enough a bridge will be +generated from this guard in the preamble. This bridge will end with a +jump to the same loop, and the optimizer will try to inline +the short preamble at the end of it. This will however fail +since now there are two guards on p12. One that makes sure it +is an int and one that makes sure it is a float. The optimizer +will detect that the second guard will always fail and mark the bridge +as invalid. Invalid loops are not passed on to the backend for +compilation. + +

                + +If a loop is detected to be invalid while inlining the short preamble, +the metainterpreter will continue to trace for yet another +iteration of the loop. This new trace can be compiled as above and +will produce a new loop with a new preamble that are now specialized +for int arguments instead of float arguments. The bridge that +previously became invalid will now be tried again. This time inlining +the short preamble of the new loop instead. This will produce a set of +traces connected like this + +

                +

                + + +(click for some hairy details) +

                +

                + +The height of the boxes in this figure represents how many instructions +they contain (presuming the missing features from the previous section +are implemented). Loop 0 is specialized for floats and its preamble has +been split into two boxes at the failing guard. Loop 2 is specialized +for ints and is larger than Loop 0. This is mainly because the integer +division in python does not map to the integer division of the +machine, but has to be implemented with several instructions (integer +division in python truncates its result towards minus +infinity, while the machine integer division truncates towards +0). Also the height of the bridge is about the same as the height of +Loop 2. This is because it contains a full iteration of the loop. + +

                +

                + +

                +

                A More Advanced Example

                + +Let's conclude with an example that is a bit more advanced, where this unrolling +approach actually outperforms the previous approach. Consider +making a +fixed-point +implementation of the square root using 16 bits of decimals. This can be +done using the same implementation +of sqrt but calling it with an object of a class representing +such fixed-point real numbers: + +

                +

                +
                class Fix16(object):
                +    def __init__(self, val, scale=True):
                +        if isinstance(val, Fix16):
                +            self.val = val.val
                +        else:
                +            if scale:
                +                self.val = int(val * 2**16)
                +            else:
                +                self.val = val
                +
                +    def __add__(self, other):
                +        return  Fix16(self.val + Fix16(other).val, False)
                +
                +    def __sub__(self, other):
                +        return  Fix16(self.val - Fix16(other).val, False)
                +
                +    def __mul__(self, other):
                +        return  Fix16((self.val >> 8) * (Fix16(other).val >> 8), False)
                +
                +    def __div__(self, other):
                +        return  Fix16((self.val << 16) / Fix16(other).val, False)
                +
                + +

                + +Below is a table comparing the runtime of the sqrt function above with +different argument types on different python interpreters. Pypy 1.4.1 +was released before the optimizations described in this post were in place +while they are in place in the +nightly + build from January 5, +denoted pypy in the table. There are also the running times for the same +algorithms implemented in C and compiled with "gcc -O3 +-march=native". Tests were executed on a 2.53GHz Intel Core2 +processor with n=100000000 iterations. +Comparing the integer versions with C may be considered a +bit unfair because of the more advanced integer division operator in +python. The left part of this table shows runtimes of sqrt in +a program containing a single call to sqrt (i.e. only a single +specialized version of the loop is needed). The right part shows the +runtime of sqrt when it has been called with a different +type of argument before. + +

                +

                + +

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                First callSecond call
                floatintFix16  floatintFix16
                cpython 28.18 s 22.13 s 779.04 s 28.07 s 22.21 s 767.03 s
                pypy 1.4.1 1.20 s 6.49 s 11.31 s 1.20 s 6.54 s 11.23 s
                pypy 1.20 s 6.44 s 6.78 s 1.19 s 6.26 s 6.79 s
                gcc 1.15 s 1.82 s 1.89 s 1.15 s 1.82 s 1.89 s
                +

                + +For this to work in the last case, when Fix16 is the argument type in +the second call, +the trace_limit had to be increased from its default value to prevent +the metainterpreter from aborting while tracing the second version of +the loop. Also sys.setcheckinterval(1000000) was used to prevent the +bridge from being generated. With the bridge the performance of the +last case is significantly worse. Maybe because the optimizer currently +fails to generate a short preamble for it. But the slowdown +seems too big for that to be the only explanation. Below are the runtime +numbers with checkinterval set to its default value of 100: + +

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                First callSecond call
                floatintFix16  floatintFix16
                cpython 28.71 s 22.09 s 781.86 s 28.28 s 21.92 s 761.59 s
                pypy 1.4.1 1.21 s 6.48 s 11.22 s 1.72 s 7.58 s 12.18 s
                pypy 1.21 s 6.27 s 7.22 s 1.20 s 6.29 s 90.47 s
                +

                +

                +

                Conclusions

                +Even though we are seeing speedups in a variety of different small +benchmarks, more complicated examples are not affected much by these +optimizations. It might partly be because larger examples have longer +and more complicated loops, and thus allowing optimizations to operate +across loop boundary will have a smaller relative effect. Another problem is +that with more complicated examples there will be more bridges, and bridges +are currently not handled very well (most of the time all virtuals are +forced at the end of the bridge as explained above). But moving those +forcings into the mini bridges should fix that. +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-01-13 07:22: +
                +
                +

                Great post.

                +
                +
                +
                +
                + + Eric wrote on 2012-11-20 16:17: +
                +
                +

                Do you think you could fix the pictures?
                I only see black images with a exclamation marks.

                thanks

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-05 01:33: +
                +
                +

                Something has eaten the images. Please fix, if you can.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/01/pypy-wants-you-4543209863582915733.html b/posts/2011/01/pypy-wants-you-4543209863582915733.html new file mode 100644 index 000000000..d71e49c14 --- /dev/null +++ b/posts/2011/01/pypy-wants-you-4543209863582915733.html @@ -0,0 +1,383 @@ + + + + + +PyPy wants you! | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy wants you!

                + + + +
                + + +

                If you ever considered contributing to PyPy, but never did so far, this is a +good moment to start! :-)

                +

                Recently, we merged the fast-forward branch which brings Python 2.7 +compatibility, with the plan of releasing a new version of PyPy as soon as all +tests pass.

                +

                However, at the moment there are still quite a few failing tests because +of new 2.7 features that have not been implemented yet: many of them are easy +to fix, and doing it represents a good way to gain confidence with the code +base, for those who are interested in it. Michael Foord wrote a little howto +explaining the workflow for running lib-python tests.

                +

                Thus, if you are willing to join us in the effort of having a PyPy compatible +with Python 2.7, probably the most sensible option is to come on the #PyPy IRC +channel on Freenode, so we can coordinate with each other so as not to fix the same test +twice.

                +

                Moreover, if you are a student and are considering participating in the next +Google Summer of Code this is a good time to get into pypy. You have the +opportunity to get a good understanding of pypy for when you decide what you +would like to work on over the summer.

                +
                +

                Comments

                +
                +
                +
                + + Oliver Sherouse wrote on 2011-01-21 19:15: +
                +
                +

                Would you mind giving us a hint of what skills programmers would need to be actually useful? I know you don't want to scare anybody off, but PyPy is kind of the ultimate evolution of what you can do with the language, and I get the sense (perhaps wrongly!) that it goes places where desktop-and-web-app guys like me are a bit out of our depth and actually might waste time more than anything else.

                I'm asking this here because I'm pretty sure that others are going to be thinking the same thing.

                +
                +
                +
                +
                + + nekto0n wrote on 2011-01-21 20:37: +
                +
                +

                Seems a lot of volantiers applied - buildbot.pypy.org renders 502 Proxy Error

                +
                +
                +
                +
                + + holger krekel wrote on 2011-01-22 11:35: +
                +
                +

                Nofrak: you ask good questions. I'd say you need to know your way around Python programming in general which you most certainly do if you have done desktop or Web apps in Python.

                Secondly, it's important to know a bit about the basic structure of an Python interpreter. Reading some docs, among them Chapter 1 of https://codespeak.net/pypy/trunk/pypy/doc/coding-guide.html#overview-and-motivation should help.

                Thirdly, methodology: PyPy is written in a test-driven way, and for the Python interpreter there are several places for tests: one is the (sometimes slightly modified) standard CPython tests in the lib-python/(modified-)2.7.0 directory, another is pypy/objspace/std/test. The implementation of the interpreter mainly is written down in pypy/objspace/std/*.py.

                Hope that helps a bit. IRC is a good place to ask for further directions, of course.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-01-22 20:31: +
                +
                +

                And then what do we do after fixing a failing test case? For each patch, create a new bug in the bug tracker and attach it?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-01-22 22:59: +
                +
                +

                @Anonymous: creating a new issue in the bug tracker is not necessary: you can just come on IRC or write to pypy-dev attaching your patch, or you can e.g. fork the project on bitbucket and send a pull request, or you can send us the mercurial bundle, etc. etc.

                There is no really any bureaucracy for this :)

                +
                +
                +
                +
                + + Simon JOnes wrote on 2011-01-26 23:12: +
                +
                +

                What is the best IRC channel to go on?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-01-27 05:33: +
                +
                +

                #pypy on freenode

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-16 19:01: +
                +
                +

                What's the story on PyPy for the Python 3 language? Python 3 is over 2 years old, and Python 2.x is looking older every day. I might consider getting involved, but I don't want to feel like I'm spending time contributing to a dead-end branch of the language.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/02/pypy-faster-than-c-on-carefully-crafted-5614784244310486765.html b/posts/2011/02/pypy-faster-than-c-on-carefully-crafted-5614784244310486765.html new file mode 100644 index 000000000..53d6548b7 --- /dev/null +++ b/posts/2011/02/pypy-faster-than-c-on-carefully-crafted-5614784244310486765.html @@ -0,0 +1,630 @@ + + + + + +PyPy faster than C on a carefully crafted example | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy faster than C on a carefully crafted example

                + + + +
                +

                Good day everyone.

                +

                The recent round of optimizations, especially loop invariant code motion, +has been very good for small to medium examples. There is work ongoing to +make them scale to larger ones, however there are a few examples worth showing +how well they perform. The following example, besides getting benefits +from loop invariants, also shows a difference between static and dynamic +compilation. In fact, after applying all the optimizations C does, only a +JIT can use the extra bit of runtime information to run even faster.

                +

                The example is as follows. First Python. I create two files, x.py:

                +
                +def add(a, b):
                +  return a + b
                +
                +

                And y.py:

                +
                +from x import add
                +
                +def main():
                +    i = 0
                +    a = 0.0
                +    while i < 1000000000:
                +        a += 1.0
                +        add(a, a)
                +        i += 1
                +
                +main()
                +
                +

                For C, x.c:

                +
                +double add(double a, double b)
                +{
                +  return a + b;
                +}
                +
                +

                and y.c:

                +
                +double add(double a, double b);
                +
                +int main()
                +{
                +  int i = 0;
                +  double a = 0;
                +  while (i < 1000000000) {
                +    a += 1.0;
                +    add(a, a);
                +    i++;
                +  }
                +}
                +
                +

                Results?

                +
                  +
                • 1.97s - PyPy
                • +
                • 3.07s - C
                • +
                +Compilation options: +
                  +
                • PyPy trunk (386ed41eae0c), running pypy-c y.py
                • +
                • C - gcc -O3 (GCC 4.4.5 shipped with Ubuntu Maverick)
                • +
                +

                Hence, PyPy 50% faster than C on this carefully crafted example. The reason +is obvious - static compiler can't inline across file boundaries. In C, +you can somehow circumvent that, however, it wouldn't anyway work +with shared libraries. In Python however, even when the whole import system +is completely dynamic, the JIT can dynamically find out what can be inlined. +That example would work equally well for Java and other decent JITs, it's +however good to see we work in the same space :-)

                +

                Cheers,
                +fijal

                +

                EDIT: Updated GCC version

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-02-04 11:43: +
                +
                +

                > The reason is obvious - static compiler can't inline across file boundaries.

                That's what link-time optimizations are for, which where added to GCC in 2009; however, your point concerning shared libaries is valid...

                +
                +
                +
                +
                + + Zeev wrote on 2011-02-04 11:55: +
                +
                +

                I added a printf("%f\n",a) to the end of the file so the compiler wouldn't optimize the whole thing away. On my Cure 2 Duo 2.33Ghz, I got for gcc -O3:

                1000000000.000000

                real 0m4.396s
                user 0m4.386s
                sys 0m0.007s

                and for gcc -O3 -flto -fwhole-program:


                1000000000.000000

                real 0m1.312s
                user 0m1.308s
                sys 0m0.003s

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-04 11:59: +
                +
                +

                Great work!

                Now you just have to identify and remove dead code in your jit. Then you could remove the call to 'add' altogether.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-02-04 12:23: +
                +
                +

                In this strange example, in our JIT, the call to 'add' is indeed removed because of inlining, and then the addition that occurs in there is removed because of dead code elimination.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-02-04 12:56: +
                +
                +

                @Zeev yes, but C equivalent of Python import is indeed shared libraries, where -fwhole-program no longer works.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-02-04 13:01: +
                +
                +

                @Armin note that even when the result is accumulated (addition is not removed, although the call is still inlined), PyPy is still faster. Not as much though: 2.5s vs 3.0s

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-04 13:23: +
                +
                +

                For completeness's sake, what's the output of `gcc --version` in your example?

                +
                +
                +
                +
                + + klauss wrote on 2011-02-04 14:37: +
                +
                +

                Not to mention specialization: python's (and pypy's) add() can add pretty much anything - strings if you will.

                The JIT will inline a specialized version particular to the call site, whereas C can only apply generalized optimizations.

                +
                +
                +
                +
                + + Greg Milner wrote on 2011-02-05 02:02: +
                +
                +

                Everyone knows Python runs faster than C...

                By about 6 weeks.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-05 13:01: +
                +
                +

                There's another simple case where pypy could (in principle) do very much better than standard C: turn pow(x, i) into sqrt(x*x*x) if i == 3/2, and other reductions. In practice if you don't know what i is at compiletime you often bundle the simplifications into a function (at the cost of some ifs) but a JIT could do a very nice job on this automagically whenever i is fixed, which it usually is.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-06 14:12: +
                +
                +

                You wrote: "PyPy 50% faster than C on this carefully crafted example".

                The truth is: PyPy is 35% faster than the C code (using C as the baseline), because it completes in 65% of the time required by the C version.

                The C code takes 50% more time to execute (is slower by 50%, 1.5x slower) than the PyPy code (using PyPy as the baseline).

                +
                +
                +
                +
                + + haypo wrote on 2011-02-08 22:58: +
                +
                +

                Test with gcc (Debian 20110126-0ubuntu1) 4.6.0 20110126 (experimental) [trunk revision 169285]: "/usr/lib/gcc-snapshot/bin/gcc [OPTIONS] x.c y.c -o x && time ./x". OPTIONS=-O0: 10.1s; OPTIONS=-O3: 9.1s; OPTIONS=-O3 -flto: 0.002s. Woops, 0.002 second? I checked: the result is correct :-) LTO rocks!

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-02-09 06:43: +
                +
                +

                @haypo print the result so the loop doesn't get removed as dead code. Besides, the problem is really the fact that -flto is unfair since python imports more resemble shared libraries than statically-compiled files.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-05 06:40: +
                +
                +

                In general, if you want to compare the performance of languages, you're actually supposed to try to write the *fastest* implementation in each language. Not just some arbitrary one.

                In this example, the program has no output, so both implementations are crap and could be made a lot faster.

                Come up with a program that has testable output, and see if someone can't comment with a C program that's faster than your python.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-12-01 00:09: +
                +
                +

                RIDICULOUS!

                +
                +
                +
                +
                + + Eric wrote on 2012-11-20 15:46: +
                +
                +

                Pypy isn't faster than C, even on this example for multiple reasons:

                First it's conceptual: C is almost as optimized as assembly (it's often referred to as a super assembler) so even if Pypy ends-up generating some assembly code, it has first to evaluate the runtime environment to figure out the type of variables and emit assembly code, and all this process is not free... so Pypy can only asymptotically reach the same level as C and assembly.

                Second, the test is flawed: I did a slight modification that shouldn't change the results: I've inlined the add() in both python and C. Oh! surprise: Pypy keeps the same time whereas C is 4x faster than before (without inlining).

                So to make it fair, we need to use the best capabilities of both languages:
                - python: I'm sure the author provided the best python implementation, and the fact that inlining add() doesn't change results kinda proves this)
                - C: when you inline the function you get:

                [code]

                static inline double add_double(double a, double b) {
                return a + b;
                }

                int main()
                {
                unsigned int i;
                double a = 0.0;

                for (i = 0; i < N; i++) {
                a += 1.0;
                add_double(a, a);
                }
                printf("%f\n", a);
                }

                [/code]

                Results:
                C inlined: 1.10s
                C: 3.98s
                Pypy inlined: 3.30s
                Pypy: 3.28s

                Conclusion:
                - When using the right C code, on the same example C is 3 times faster than Pypy.
                - As demonstrated, the statement that Pypy is faster than C is simply biased by unoptimized C code.

                +
                +
                +
                +
                + + Staff wrote on 2012-11-21 06:07: +
                +
                +

                @Eric This post is not trying to argue that Python is "better" or even faster than C. It is just pointing out that certain classes of optimizations (i.e. whole program optimizations) come naturally to the PyPy JIT.

                This is, of course, only one small facet of why a program runs fast. The author admits that it is a contrived example to illustrate the point.

                Taking the point to an extreme, one could see a PyPy program run faster than a C program if the C program made many calls to simple shared libraries. For example, if one dynamically links a C stdlib into their program, and uses it heavily, the equivalent python code may conceivably run faster.

                +
                +
                +
                +
                + + Eric wrote on 2012-11-21 14:44: +
                +
                +

                Please read the title of this article again: "PyPy faster than C on a carefully crafted example"

                Based on a specific example or not it doesn't matter, I'm simply not comfortable with reading strong statements like this that are obviously false to any serious computer scientist and misleading to beginners. It's false because it's the conclusion of a test which is biased.

                The root of benchmarking is to get rid of any bias
                In this case the obvious bias is that Pypy is optimized and C isn't (as demonstrated above with inline functions).

                You can't transpose only what you want in real life and not the other: your argument that in real life the C could use external library hence be slower is valid, but then you have to compare with real life Python scripts which can't be as much optimized by Pypy as this crafted example. So in real life you get a C code that may be slowed down a bit by dynamic linking, and python scripts that are much slower because Pypy isn't ready to match C speed for everything (yet).

                If you want to use a crafted Python example, you have to compare it to a crafted C example, so that you can compare apples with apples.

                All that is methodology, that said JIT is quite powerful and it's impressive in itself to beat CPython by a large margin.

                +
                +
                +
                +
                + + keegano wrote on 2013-02-06 22:53: +
                +
                +

                Eric: Your comments about "real life" are irrelevant - the post is about a specific, contrived example. I don't think anyone would argue that a high-level, garbage-collected language like python could ever beat out C in general - it's simply a demonstration that, in a very specific instance, equivalent code in python and C can run faster in python because of the JIT making optimizations that can't occur at compile time.

                +
                +
                +
                +
                + + Eric wrote on 2013-02-06 23:02: +
                +
                +

                You're assuming that python is faster even on this crafted example, but keep in mind that this comparison is biased because the C version isn't optimal.

                +
                +
                +
                +
                + + Eric wrote on 2013-02-06 23:02: +
                +
                +

                you're assuming that python is faster even on this crafted example, but keep in mind that this comparison is biased because the C version isn't optimal.

                +
                +
                +
                +
                + + Staff wrote on 2013-02-07 01:42: +
                +
                +

                stop feeding this troll

                +
                +
                +
                +
                + + Eric wrote on 2013-02-07 10:18: +
                +
                +

                point taken, but do update the article to take into account my remark: both the title and the conclusion of the "demonstration" are false, even on a contrived example as you can barely find any C code that would be slower than the code generated by your JIT for the simple reason that C is really too close to assembly and that JIT adds an overhead.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-02-07 10:30: +
                +
                +

                Hey Eric.

                Your argument is incredibly flawed. You can compile faster version of assembler (or is C the fastest assembler ever?) if you try hard enough. Why not?

                +
                +
                +
                +
                + + Eric wrote on 2013-02-07 10:48: +
                +
                +

                Please don't digress, what I say is simple:
                The article states that Pypy generates code faster than C on a crafted example.
                I demonstrate there is a more optimized C code than the author's one, hence that the whole article is wrong... end of the story.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-02-07 10:52: +
                +
                +

                No, it's a reasonable piece of C. You don't inline your printf code, do you? dynamic linking is a thing that people use.

                +
                +
                +
                +
                + + Eric wrote on 2013-02-09 11:38: +
                +
                +

                You're right, people very often use dynamic linking. However the following is not a reasonable piece of Python code:

                def add(a, b): return a + b

                People rarely use that and more importantly they don't write a loop that calls it 1 billion times.

                The point is that the reasoning spans two levels (hence is flawed/biased):
                - in Python the author took a crafted piece of Python that is not meaningful in real life because it has the property to do what he wants at the Pypy level
                - in C the author uses a very common mechanism that isn't fully optimized (not as much as Python/Pypy is optimized).

                I know you will not agree since you're all proud that "Pypy is faster than C" (lol it's nonsense even on a "crafted example") but you have to compare apples with apples.

                +
                +
                +
                +
                + + Dvd Fo wrote on 2013-09-20 18:29: +
                +
                +

                @Eric what you don't understand is the point of the article. The actual point is to demonstrate a nice property of PyPy JIT, which is able to generate fast code when it can. Comparing to C in this manner proves that PyPy's generated machine code is relevant with regard to speed.
                Of course this example is fragile because it relies on suboptimal C code, but this serves only to prove the point about PyPy.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-12-07 06:14: +
                +
                +

                @Eric... Nonsense.. Are you an ambassador for C ?

                +
                +
                +
                +
                + + Eric wrote on 2013-12-07 08:44: +
                +
                +

                Do argue if you disagree, don't troll.

                I think everything has been said already anyway.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/02/pypy-san-franciso-bay-area-tour-2011-6179180737090334330.html b/posts/2011/02/pypy-san-franciso-bay-area-tour-2011-6179180737090334330.html new file mode 100644 index 000000000..348e03e21 --- /dev/null +++ b/posts/2011/02/pypy-san-franciso-bay-area-tour-2011-6179180737090334330.html @@ -0,0 +1,411 @@ + + + + + +The PyPy San Franciso Bay Area Tour 2011 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                The PyPy San Francisco Bay Area Tour 2011

                + + + +
                +

                PyPy is coming to the San Francisco Bay Area in the beginning of March with +a series of talks and a mini sprint.

                + +
                  +
                • +

                  Wednesday March 2, 4:15 p.m. Armin Rigo gives +a +talk at Stanford. open to the public.

                  + +
                • +
                • +

                  Thursday March 3, 6:00 p.m. General talk at Yelp, 706 Mission St 9th Floor, + San Francisco CA 94103 open to the public.

                  + +
                • +
                • +

                  Saturday and Sunday March 5 and 6. + PyPy mini sprint at noisebridge. + 2169 Mission street between 17th and 18th in San Francisco. Open to the public.

                  + +
                • +
                • +

                  Monday March 7th, 11:30 a.m. Google Tech talk in Mountain View at the + Googleplex. Not open to the public (but the video should be available + later).

                  + +
                • +
                • +

                  Monday March 7th, 2:30 p.m. Talk at Mozilla in Mountain View. Not + open to the public (but Mozilla developers can videoconference).

                  +
                • +
                +

                From the PyPy project team we will have Armin Rigo, Maciej Fijałkowski +(from 6th March), Laura Creighton and Jacob Hallén and possibly +Christian Tismer attending.

                + +

                Most of the talks will focus on (some of) the highlights and the +status of pypy:

                + +
                  +
                • most Python benchmarks run much faster than with CPython or Psyco +
                • +
                • the real-world PyPy compiler toolchain itself (200 KLocs) runs twice as fast +
                • +
                • supports x86 32 and 64bit and is in the process of supporting ARM +
                • +
                • full compatibility with CPython (more than Jython/IronPython) +
                • +
                • full (and JIT-ed) ctypes support to call C libraries from Python +
                • +
                • supports Stackless Python (in-progress) +
                • +
                • new "cpyext" layer which integrates existing CPython C extensions +
                • +
                • an experimental super-fast JIT-compilation of calls to C++ libraries +
                • +
                +

                As is usual for us, there is vastly more material that is available for +us to cover than time, especially when it comes to possible future +directions for PyPy. We want to reserve a certain amount of time at +each talk purely to discuss things that are of interest to audience +members. However, if you already know what you wish we would discuss, +and are attending a talk (or even if you aren't), please let us know. +You can either reply to this blog post, or mail Laura directly at +lac at openend.se .

                + +

                Apart from getting more technical and project insight, our travel is +also a good possibility for companies in the SF area to talk to us +regarding contracting. In September 2011 our current "Eurostars" research +project ends and some of us are looking for ways to continue working on +PyPy through consulting, subcontracting or hiring. The two companies, +Open End and merlinux, have successfully done a number of such contracts +and projects in the past. If you want to talk business or get together for +lunch or dinner, let us know! If you would like us to come to your company +and make a presentation, let us know! If you have any ideas about what +we should discuss in a presentation so that you could use it to convince +the powers-that-be at your place of employment that investing time and +money in PyPy would be a good idea, let us know!

                + +

                On Tuesday March 8th we will be heading for Atlanta for the Python VM +and Language Summits before attending PyCon. Maciej Fijałkowski and +Alex Gaynor will be giving a talk entitled +Why is +Python slow and how can PyPy help? +Maciej will also be giving the talk +Running +ultra large telescopes in Python which is +partially about his experiences using PyPy in the Square Kilometer Array +project in South Africa. There will be a PyPy Sprint March 14-17. +All are welcome.

                +
                +

                Comments

                +
                +
                +
                + + Dan wrote on 2011-02-13 01:42: +
                +
                +

                I wanted to let everyone know, there is a PSF sponsored code sprint in Portland, Oregon on February 26th starting at 9am. If you're going to be in the area, it promises to be a great time. We've got a great plan for the day which can be seen in this google doc. I hope to see some of you there!

                --Dan

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-16 00:51: +
                +
                +

                We'll be giving a talk at Dropbox in San Francisco at 16:00 on Friday March 4th.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-22 05:52: +
                +
                +

                And we'll be dropping by the Google building in San Francisco at 10.45 a.m.
                on Tuesday March 1st to chat with
                Googlers there and give an informal
                talk.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/02/pypy-winter-sprint-report-4155886720346408516.html b/posts/2011/02/pypy-winter-sprint-report-4155886720346408516.html new file mode 100644 index 000000000..9e0f0d069 --- /dev/null +++ b/posts/2011/02/pypy-winter-sprint-report-4155886720346408516.html @@ -0,0 +1,377 @@ + + + + + +PyPy Winter Sprint Report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Winter Sprint Report

                + + + +
                +

                A few weeks ago I had the great fortune to attend the PyPy winter sprint in Leysin Switzerland. I've wanted to contribute to PyPy for a long time and I thought diving into a sprint might be a good way to get familiar with some of the code. What I wasn't expecting was to be using RPython to implement new methods on built-in Python objects on the first day. The main thing I took away from the sprint was just how easy it is to get involved in developing PyPy (well, some bits of it at least and being surrounded by core developers helps). I wrote up a very short description of how to get started here, but I'll do a longer blog post with examples on my own blog soon(ish).

                +The sprint was kicked off by Armin merging the "fast-forward" branch of PyPy onto trunk. "fast-forward" brings PyPy from Python 2.5 compatibility to Python 2.7. Along with this it brought a large number of test failures, as the sterling work done by Benjamin Peterson and Amaury Forgeot d'Arc was not complete. This immediately set the primary sprint goal to reduce the number of test failures.

                +We made a great deal of progress on this front, and you can see how close PyPy is now from the buildbots.

                +Jacob Hallén and I started working through the list of tests with failures alphabetically. We made short work of test_asyncore and moved onto test_bytes where I was stuck for the rest of the sprint. I spent much of the remaining days working with Laura Creighton on the pypy bytearray implementation to make it more compatible with Python 2.7. This meant adding new methods, changing some of the Python protocol method implementations and even changing the way that bytearray is constructed. All in all great fun and a great introduction to working with RPython.

                +A big part of the compatibility with Python 2.7 work was done by Laura and Armin who basically rewrote the math module from scratch. This was needed to incorporate all the improvements made (mostly by Mark Dickinson) in CPython in 2.7. That involved a lot of head-scratching about such subtleties as whether -0.0 should be considered almost equal to 0.0 and other fun problems.


                + + + +
                The first meal together, before everyone had arrived
                +If you add on top of this the wonderful people, the beautiful scenery, the Swiss cheese fondues, managing to not kill myself with a day's skiing and traditional pypy card games, I can heartily recommend pypy sprints as a close approximation of geek nirvana.

                + + +
                View of the mountains from the sprint
                +
                +Working on 2.7 compatibility wasn't the only work that happened during the sprint. Other activities included:
                  +
                • Antonio Cuni worked on the "jittypes" branch. This is a reimplementation of the core of the PyPy ctypes code to make it jittable. The goal is that for common cases the jit should be able to turn ctypes calls from Python into direct C level calls. This work was not completed but very close and is great for the future of integrating C libraries with PyPy. As ctypes is also available in CPython and IronPython, and hopefully will be available in Jython soon, integrating C code with Python through ctypes is the most "implementation portable" technique.
                • +
                • David Schneider continued his work on the JIT backend for ARM. PyPy has been cross-compilable to ARM for a long time, but bringing the JIT to ARM will provide a *fast* PyPy for ARM, which includes platforms like Android. Again David didn't complete this work but did complete the float support.
                • +
                • Håkan Ardo was present for two days and continued his crazy-clever work on JIT optimisations, some of which are described in the Loop invariant code motion blog entry.
                • +
                • Holger Krekel worked on updating the PyPy test suite to the latest version of py.test and also worked with me on the interminable bytearray changes for part of the sprint.
                • +
                • No one was sure what  Maciej Fijałkowski worked on but he seemed to be quite busy.
                • +
                +I think that was most of the work done during the actual sprint. There was also a great deal of healthy discussion about the future of PyPy. Expect lots more interesting and exciting developments over the coming year.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-02-14 15:00: +
                +
                +

                "There was also a great deal of healthy discussion about the future of PyPy."

                World domination?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-02-14 16:19: +
                +
                +

                Very nice report, thanks a lot Michael!

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-15 01:16: +
                +
                +

                > world domination?

                why yes of course! the ouroboros is their symbol; PyPy is, evidently, backed by the templars

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-15 16:21: +
                +
                +

                > world domination?

                Mongol General: Pypy devs! What is best in life?
                Pypy dev: To crush your enemies, see them driven before you, and to hear the lamentation of their women.
                Mongol General: That is good! That is good.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-02-15 16:41: +
                +
                +

                @Anonymous: Let's not get too far off-track. Also, I don't really like being ascribed a rather violent quote by (supposedly) Genghis Khan, so stop that please.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-02-15 17:44: +
                +
                +

                @Carl, it wasn't Genghis Khan.
                It was Conan the Barbarian, impersonated by former California governor.
                Not to be taken too seriously... :-)

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-02-15 20:40: +
                +
                +

                @Anonymous: https://www.barbariankeep.com/ctbsecrets.html

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/03/bay-area-2011-tour-summary-9117372109664978472.html b/posts/2011/03/bay-area-2011-tour-summary-9117372109664978472.html new file mode 100644 index 000000000..b578a6ebe --- /dev/null +++ b/posts/2011/03/bay-area-2011-tour-summary-9117372109664978472.html @@ -0,0 +1,504 @@ + + + + + +Bay Area 2011 Tour Summary | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Bay Area 2011 Tour Summary

                + + + +
                +

                We spent the week in the San Francisco Bay Area showing off PyPy. +Here are notes and photos of the tour.

                +
                +

                Day 1: Google SF

                +

                Google has offices in downtown San Francisco. They are at a beautiful +place and the views are spectacular. We thank Wesley Chun and Guido van +Rossum for organizing this meeting. Between 25 and 30 engineers showed +up. Some of them were Python programmers, but others were C++ +programmers; and they all seem to have real problems that they want to +solve with PyPy. We didn't have prepared slides so far, so we mostly +ran demos and talked. As predicted, Google would love SWIG support. +They suggested that we rename the translation toolchain (as we vaguely +thought too) to separate it more from PyPy's Python interpreter; up +until today, many had no idea that they could use PyPy for other +languages. All in all, it was very positive and people looked forward +to meeting up at PyCon.

                +
                +
                +

                Day 2: Stanford

                + + + + +

                This was the most academically-oriented talk. You can find the +abstract, the slides (PgUp/PgDown to navigate) and the video here. +There were around 35 people in the audience, and maybe 1000 real-time +video watchers (who didn't get to ask questions). The live audience +seemed to be a mixture of students, professors, and people from the +local industry. We thank David Allison and Andy Freeman for organizing +it. It has been two or three years since they invited me (Armin) and I +finally managed to get here :-)

                +

                The slides are longer than the talk; we focused on the JIT because that +was what the audience was most interested in. They were really +impressed at the stability, the tests, and that we don't have lots of +bugs reported in the JIT of our latest public release. We later found +out that many who came to the talk believed that they were going to get +a talk about how we jitted a subset of python because real python is too +hard -- impossible to do. They came to heckle with examples of how +python was impossible. So they were amazed when the first slide of +Armin's presentation was "Python is complicated", and the next slide +"Python is messy". It was a positive outcome. We made new fans :-)

                +
                +
                +

                Day 3: Yelp

                + + + + + +

                As you can see in the image, tons of people showed up -- ~140. Thanks +to Grace Law, who is the coordinator for the SF Python Meet-up, and to +Jimmy Retzlaff and Ashley King-Bishof from Yelp. Yelp is also located +in downtown San Francisco. This looks like the place to be if you are a +start-up in California (and not in Silicon Valley): lots of enthusiastic +young people are here, and they are hiring. Yelp has an enormous open +space, suitable for huge parties, and the coolest beer dispensers on the +planet, made as a hack-a-thon project by three Yelp engineers (pictured +below):

                + + + + + + + + + +

                By the way, their management structure seems to be flat. There are +almost no line managers, i.e. managers for the engineering staff; +instead they self-organize into teams. This is not what you expect +for the USA; things appear to have changed a lot.

                +

                The talk was in two sections, "PyPy from the user's point of view" and +"How the JIT works". Good feedback; impressed that we support all of +Python 2.7 (including all the modules that are in C in the stdlib), and +impressed that the Python 3.0 conversion is not considered a big deal by +us, although we have no precise date yet. The plan is, of course, just +to tweak the interpreter until it supports both (by adding the necessary +conditions); the other aspects like GC and the JIT will not be affected +at all.

                +
                +
                +

                Day 4: Dropbox

                + + + + + + + +

                This was another place full of excited, successful young people. The +CTO looks like he turned 30 last week, and he's been CTO for 4 years +now. The three of us were quite obviously the oldest people there. We +felt old. They have another great big open barn complex. It's +loud. Very loud. Loud refrigerators, loud street noise, loud machinery +in the walls doing who knows what, loudly.

                +

                This was the first tech talk at dropbox. Thanks to Rian Hunter for +organizing it. They have a big kitchen, and we held the talk in there. +There was a skylight, which made the room too bright, so harder to read +the slides than would otherwise be the case. They were jazzed about our +visit, and wanted copies of all the pictures Jacob took before he left.

                +

                They seemed familiar with Google V8, and thought that how long it took +to build PyPy was a great incentive for us to make PyPy faster. They +are very interested in fast ctypes, fast SWIG, fast Cython. They were +pleased and surprised that we don't have too much JIT bloat (typically +~10% of the total RAM usage).

                +

                The mobile developers want a smaller Python more than a faster one. +Python takes too much memory given the tiny amount available on a lot of +cell phones. Not that we have an answer to this problem now.

                +

                They were pleased to learn that we will soon be able to JIT ctypes code. +And the fact that Armin knows many ways to segfault CPython was a bit of +a shock. We talked for an hour after the presentation. Again, a very +positive outcome.

                +
                +
                +

                Days 5 and 6: Noisebridge sprint

                + + + +

                About six people showed up for the sprint. (Late. Californians really +do start the day at 11.) Noisebridge is a very eclectic place; people +show up to do pretty much everything from sewing to breaking apart +equipment to making robots and beer. It's donation-driven. Thanks to +Jim Stockford for volunteering the space and arranging this and helping +us set up for the sprint.

                +

                During the sprint, we did a little bit of everything; there was no clear +pattern. Ademan worked on sqlite, Greg Price looked to see if his +software could run on PyPy, Will worked on the documentation, and a few +of us fixed some more 2.7 tests. Alex Gaynor and Fijal joined us, too.

                +
                +
                +

                Day 7: Google Mountain View and Mozilla

                +

                We gave two talks on the 7th day of our trip so we were already quite +exhausted. Fortunately new people joined, so the talks were actually split +between multiple people. We would like to thank Peter Norvig and Ben Bayer +for inviting us to Google and Andreas Gal, Brendan Eich and Dave Herman +for inviting us to Mozilla. Both talks should hopefully appear online +at some point soon, but as of now we don't have a link.

                +

                It was pretty incredible to find ourselves at Mozilla talking with at +least 15 people who deeply understood the ideas of tracing JITs and +also understood why we made the decision to generate our JIT +instead of writing it. They suffered from having to write a JavaScript +JIT (even multiple ones) by hand, as Armin did with Psyco. He deeply +sympathizes. The discussion afterwards was very successful and we're +looking forward to cooperating with them. Many exciting things were +discussed as possibilities.

                +

                Next day we went to Pycon, which is ongoing and a topic for yet another +blog post.

                +
                +
                +

                Comments

                +
                +
                +
                + + Luis wrote on 2011-03-11 00:29: +
                +
                +

                Great post, but the links are broken...

                +
                +
                +
                +
                + + ipc wrote on 2011-03-11 11:39: +
                +
                +

                thank you for sharing! The tour seems like a very good way to draw the attention of a lot of smart and influential people to the fantastic work you've been doing.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-03-11 14:12: +
                +
                +

                @Luis thanks, fixed I hope. bitbucket is not very good at permalinks and I forgot extradoc has "tip" and not "default"

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-03-11 15:31: +
                +
                +

                fijal: bitbucket serves html files as binary or something. This means that at least in Firefox we don't get the "ui" subdirectory, just the raw html. Annoying.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-03-11 15:38: +
                +
                +

                @armin: I think that bitbucket's choice is the only reasonable one, else it could be probably exploited to do some sort of Cross Side Scripting attack

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-03-11 15:52: +
                +
                +

                Eh. That means we should host them somewhere else I fear.

                +
                +
                +
                +
                + + Andreas Mueller wrote on 2012-08-16 12:29: +
                +
                +

                The link to the video seems to be broken. At least I can't find the video on the page that is linked to.
                Could you please check?
                Thanks,
                Andy

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/03/controlling-tracing-of-interpreter-with-871085470935630424.html b/posts/2011/03/controlling-tracing-of-interpreter-with-871085470935630424.html new file mode 100644 index 000000000..c39d64d77 --- /dev/null +++ b/posts/2011/03/controlling-tracing-of-interpreter-with-871085470935630424.html @@ -0,0 +1,544 @@ + + + + + +Controlling the Tracing of an Interpreter With Hints, Part 1: Controlling the Extent of Tracing | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Controlling the Tracing of an Interpreter With Hints, Part 1: Controlling the Extent of Tracing

                + + + +
                +

                The question I was asked most often during my recent US trip was how exactly +the hints work that interpreter authors can use to improve the execution speed +of the programs running on their interpreters. Since those hints are not really +documented all that well, I decided to write blog posts about them. This is the +first one.

                +
                +

                Background

                +

                First, let's recap some basics: PyPy's approach to implementing dynamic +languages is to write an interpreter for +the language in RPython. This interpreter can be translated to C and then +further to machine code. The interpreter consists of code in the form of a +large number of generated C functions and some data. Similarly, the user +program consists of functions in the language the interpreter executes.

                +

                As was explained in a blog post and a paper two years ago, PyPy's JIT is a +meta-tracer. Since we want to re-use our tracer for a variety of languages, we +don't trace the execution of the user program, but instead trace the execution +of the interpreter that is running the program. This means that the traces +don't contain the bytecodes of the language in question, but RPython-level +operations that the interpreter did to execute the program.

                +

                On the other hand, the loops that are traced by the tracer are the loops in the +user program. This means that the tracer stops tracing after one iteration of +the loop in the user function that is being considered. At this point, it can +have traced many iterations of the interpreter main loop.

                +

                Here's a diagram of this process:

                + + + +

                On the left you see the levels of execution. The CPU executes the binary of +PyPy's Python interpreter, which consists of RPython functions that have been +compiled first to C, then to machine code. Some of these functions contain +loops, others don't. The interpreter runs a Python program written by a +programmer (the user). If the tracer is used, it traces operations on the level +of the interpreter. However, the extent of the trace is determined by the loops +in the user program.

                +
                +
                +

                How Far Should Tracing Go

                +

                When the tracer encounters a function call at the interpreter level, e.g. the +interpreter main loop calling a helper function, it can do one of two things:

                +
                  +
                1. it can trace into the helper function, effectively inlining it into the trace.
                2. +
                3. it can not trace into the function and instead record a call to that function +as an operation in the trace. Such a call operation in the trace is sometimes +called a residual call.
                4. +
                +

                As a default, the tracer will try to trace into the helper because that will +give more information to the optimizer, allowing it to do a better job. This is +particularly important for the allocation removal optimization, because if a +freshly allocated object is passed as an argument to a residual call, its +allocation cannot be optimized away.

                +

                There is a problem however if the helper function itself contains a loop. The +tracer records the linear sequence of operations that are being executed. Thus +when it encounters a loop on the interpreter level it records all the +operations of every iteration of the loop itself, with the net effect of +unrolling it. The only places where the tracer stops and tries to close the +trace is in the main loop of the interpreter. When the tracer encounters the +main loop, it also checks whether the original user loop has been closed, and +thus whether it can stop tracing.

                +

                For most helper functions in the interpreter that contain loops, fully +unrolling does not make sense. If a loop is unrolled, the trace is specific to +the number of iterations that was seen during tracing. If the trace is later +executed with a different number of iterations, the trace will be left via a +guard failure, which is inefficient. Therefore the default behaviour of the +tracer is to never trace into a function on the interpreter level that contains +a loop, but to trace into all non-looping helper functions.

                +

                This default behaviour is essentially a heuristic, but one that usually makes +sense. We want to produce just enough traces to make the resulting code +efficient, but not more. Therefore we trace as much as possible (everything by +default) except the functions with loops, where tracing would produce code that +is less general than it could be.

                +

                As an example for a helper with a loop, take string concatenation. It loops over +the characters of both arguments and copies them over into the result string. It +does not make sense to unroll the loops in this function. If we do that, +the resulting trace can only be used for strings of the length that was seen +during tracing. In practice, the string lengths are usually different each run, +meaning that the trace with unrolling is not run to completion in most cases.

                +
                +
                +

                Influencing the Default Behaviour

                +

                Sometimes the default behaviour is not actually what is wanted. This is +something the interpreter author has to decide, usually by looking at the traces +that are produced and deciding that they should be improved. There are two ways +in which the default is wrong:

                +
                  +
                • +false negatives: if a helper function that does contain a loop should +be traced into, unrolling the loop.
                • +
                • +false positives: if a helper function that does not contain a loop is +inlined into the trace, but the interpreter author decides that this is not +helpful.
                • +
                +

                If the interpreter author finds false negatives or false positives, she can fix +that by applying a hint to the tracer. These hints take the form of function +decorators (which both live in the pypy.rlib.jit module). In the next two +subsections I will describe these two function decorators and their use.

                +
                +

                Unrolling Functions With Loops

                +

                The first decorator, used to fix false negatives, is the unroll_safe +decorator. It is used to tell the tracer to always trace into a function that +has a loop, effectively unrolling the loop. This decorator should be used only +if the loop in the helper function is expected to always run for the same number +of iterations. This sounds like a strong restriction, but in practice it is less +severe: The number of iterations needs to only be the same in the context where +the helper function is traced from.

                +

                It is easiest to understand this condition via an example. Let's look at the +BUILD_TUPLE bytecode in Python. It takes one argument, the length n of +the tuple being built. The bytecode pops n arguments from the stack, turns +them into a tuple and pushes that tuple on the stack. Thus the function that +implements BUILD_TUPLE in PyPy's Python interpreter calls a helper +popvalues which pops n values from the stack and returns them in a list. +This helper is implemented with a loop and would thus not be traced into by +default. The loop in the helper can run for very different numbers of +iterations, because it is used in a variety of places. However, for every +concrete BUILD_TUPLE bytecode, the argument will be constant. Therefore it +is safe (and even necessary) to annotate popvalues with the unroll_safe +decorator.

                +

                A different example is the implementation of the isinstance builtin. It is +used to check whether an object a is an instance of a class B like +this: isinstance(a, B). The second argument of the function can also be a +tuple of classes to check whether an object is an instance of one of a number of +classes: isinstance(a, (A, B, C, D)). To implement this second case, the +implementation of isinstance contains a loop iterating over the elements of +the tuple. The number of loop iterations can vary, but is usually fixed for each +individual call site which typically just lists a few classes in the source +code. Therefore it is also safe to annotate the implementation of isinstance +with the unroll_safe decorator.

                +
                +
                +

                Preventing the Tracing of Functions

                +

                The second decorator dont_look_inside is used to fix false positives. It +tells the JIT to never trace into the decorated function and just always produce +a residual call instead. This decorator is in many ways less important than the +unrolling one (except for a special situation that I will describe in a +follow-up post). It is used if tracing into a function is not expected to yield +any speed benefits, because the optimizer will not be able to improve it much. +This is often the case if the called helper function does not contain any +"dynamic" behaviour. In such a situation it is better to just leave the function +call in the trace, because that produces less code.

                +

                An example would be the import mechanism in Python. It's very unlikely that any +performance improvement can be had by turning part of it into assembler. +Therefore we hide it from the tracer by annotating its functions with +dont_look_inside.

                +
                +
                +
                +

                Conclusion

                +

                In this post we discussed two hints that can be used to control precisely which +parts of the interpreter should be meta-traced. If these hints are used +carefully, this can go a long way to making the interpreter produce traces that +contain exactly the interesting part of the execution, and will contain calls to +the functions that can not be optimized by tracing techniques.

                +

                In the next part of this series I will discuss a different set of hints that can +be used to strongly optimize traces.

                +
                +
                +

                Comments

                +
                +
                +
                + + Victor wrote on 2011-03-12 21:28: +
                +
                +

                Would it be possible (i.e. is the code amenable) to programmatically randomly sprinkle these decorators around and compare effects on speed (or on measurable trace quality)?

                It would make JIT generation a bit more meta :)

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2011-03-13 10:42: +
                +
                +

                Thanks for the very interesting post!

                Sorry if the following questions are naive, but you post makes me wonder if not tracing at all the functions which contain loops with a varying number of iteration means that no optimization is possible at all for those loops? Also, wouldn't it be possible to detect there is a loop and produce a special kind of trace in that case which do not duplicate the body of the loop? I guess that if it was possible and useful, you'd have done it, so I guess the real question is: why doesn't this work?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-14 09:54: +
                +
                +

                @Victor: yes, there are probably ways to do place some of the hints more automatically. However, you will always have to look at the traces and think about how to improve them, so we chose the pragmatic path and didn't do anything magic.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-14 10:02: +
                +
                +

                @Gaëtan: those are excellent questions!

                Yes, functions in the interpreter with loops that we do not trace are not optimized at all. For most of these functions this is not a problem, e.g. string concatenation does not have much optimization potential anyway. However, there are some functions with loops (like the implementation of the map builtin) that would benefit from tracing, and we don't have a good general solution for that yet.

                One of the ideas for solutions are indeed to try to start new traces in the interpreter functions with loops. We did not get around to playing with this yet, as there are not so many cases in the Python interpreter where this leads to a huge benefit.

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2011-03-14 13:50: +
                +
                +

                I'm puzzled now. I fail to see why those loops "do not have much optimization potential". I can understand that it's hard to optimize them because of the trace problem but I thought they would benefit from optimization like any other code (eg avoiding boxing/unboxing temporary variables), especially since they are within a loop, hence any gain will be multiplied by the number of iterations.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-14 14:01: +
                +
                +

                @Gaëtan:
                is it possible that you are mixing up the two levels involved? The post talked only about functions in the interpreter, not about the functions in pure Python that a user of the interpreter might write. To clarify:

                - All loops on the application level, i.e. in the program the user wrote, are traceable and will be traced if they are executed often enough.

                - Some loops in the interpreter itself are not. Most of these loops do not do any boxing/unboxing, so they won't benefit from optimization. For some of the loops that would benefit we added some manual hacks to trace them anyway, e.g. for the implementation of "map". Some others still need to be improved, e.g. any, all, zip, ...

                +
                +
                +
                +
                + + Unknown wrote on 2011-03-15 14:52: +
                +
                +

                Carl, thanks for the post. The information is very helpful.

                While I understand special casing to overwrite the default tracing/not-tracing rules can help performance, I wonder how well are the default heuristics performing. Do you have any bulk part estimation of the performance loss by turning off special casing? And how many hints (related to whether to trace or unroll) do you have to introduce to PyPy?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-15 16:00: +
                +
                +

                Hi Peng,

                Thanks :-). No, I didn't really do benchmarks yet, plan to do so in the future (these blog posts will turn into a paper soonish).

                There are about 20-30 unroll_safe hints and equally many dont_look_inside hints. Some of them are really important, ie the speed would be abysmal without them. Most of them are really in the bytecode dispatch area, they are cases that e.g. Jython would not have, because in Jython the Python-to-Java compiler takes care of them.

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2011-03-16 10:45: +
                +
                +

                No, I wasn't confusing the two levels involved (if pypy wasn't optimizing variable-length loops in userlevel code, it wouldn't optimize much I guess).

                My point was more theoretical: I guess that, in theory, those loops would benefit from optimizations like any other part of the interpreter. Your answer leads me to believe that *in practice* this isn't an issue because there are either not that many of them in the interpreter and/or they are not in speed critical parts and most of those that are important speed-wise have been taken care of manually in some way or another.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-16 12:15: +
                +
                +

                @Gaëtan: yes, that's a good interpretation. At some point we might still think about a more general solution for this problem, to get the remaining rare cases fixed, but for now we have a lot of the common ones covered.

                +
                +
                +
                +
                + + Matty wrote on 2017-06-07 12:50: +
                +
                +

                @Gaëtan
                Untraceable Interpreter-level loops don't need to be optimized by the jit because they are agressively optimized by the C compiler (remeber that rpython is translated to C)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/03/controlling-tracing-of-interpreter-with_15-3281215865169782921.html b/posts/2011/03/controlling-tracing-of-interpreter-with_15-3281215865169782921.html new file mode 100644 index 000000000..4288c992b --- /dev/null +++ b/posts/2011/03/controlling-tracing-of-interpreter-with_15-3281215865169782921.html @@ -0,0 +1,569 @@ + + + + + +Controlling the Tracing of an Interpreter With Hints, Part 2: Controlling Optimization | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Controlling the Tracing of an Interpreter With Hints, Part 2: Controlling Optimization

                + + + +
                +

                This is part 2 of a series on how to speed up an interpreter written with PyPy +by adding JIT hints to the interpreter. Part 1 described how to control the +extent of tracing. In this post I will describe how to add hints that +influence the optimizer. If applied correctly these techniques can give +really big speedups by pre-computing parts of what happens at runtime. On the other +hand, if applied incorrectly they might lead to code bloat, thus making the +resulting program actually slower.

                +
                +

                Background

                +

                Before sending the trace to the backend to produce actual machine code, it is +optimized. The optimizer applies a number of techniques to remove or reduce +the number of operations: most of these are well known compiler optimization +techniques, with the difference that it is easier to apply them in a tracing +JIT because it only has to deal with linear traces. Among the techniques:

                + +

                In some places it turns out that if the interpreter author rewrites some parts +of the interpreter with these optimizations in mind the traces that are produced +by the optimizer can be vastly improved.

                +

                In this post I will describe two hints that allow the interpreter author to +increase the optimization opportunities for constant folding. For constant +folding to work, two conditions need +to be met:

                +
                  +
                • the arguments of an operation actually need to all be constant, +i.e. statically known by the optimizer
                • +
                • the operation needs to be pure, i.e. always yield the same result given +the same arguments.
                • +
                +

                The PyPy JIT generator automatically detects the majority of these conditions. +However, for the cases in which the automatic detection does not work, the +interpreter author can apply hints to improve the optimization +opportunities. There is one kind of hint for both of the conditions above.

                +

                Note: These hints are written by an interpreter developer and applied to the +RPython source of the interpreter. Normal Python users will never see them.

                +
                +
                +

                Where Do All the Constants Come From

                +

                It is worth clarifying what is a "constant" in this context. A variable of +the trace is said to be constant if its value is statically known by the +optimizer.

                +

                The simplest example of constants are literal values. For example, if in the +RPython source code we have a line like y = x + 1, the second operand will +be a constant in the trace.

                +

                However, the optimizer can statically know the value of a variable even if it +is not a constant in the original source code. For example, consider the +following fragment of RPython code:

                +
                if x == 4:
                +    y = y + x
                +
                +

                If the fragment is traced with x being 4, the following trace is +produced:

                +
                +guard(x == 4)
                +y = y + x
                +
                +

                In the trace above, the value of x is statically known thanks to the +guard. Remember that a guard is a runtime check. The above trace will run to +completion when x == 4. If the check fails, execution of the trace is +stopped and the interpreter continues to run.

                +

                There are cases in which it is useful to turn an arbitrary variable +into a constant value. This process is called promotion and it is an old idea +in partial evaluation (it's called "the trick" there). Promotion is also heavily +used by Psyco and by all older versions of PyPy's JIT. Promotion is a technique +that only works well in JIT compilers, in +static compilers it is significantly less applicable.

                +

                Promotion is essentially a tool for trace specialization. In some places in the +interpreter it would be very useful if a variable were constant, even though it +could have different values in practice. In such a place, promotion is used. The +typical reason to do that is if there is +a lot of computation depending on the value of that variable.

                +

                Let's make this more concrete. If we trace a call to the following function:

                +
                def f1(x, y):
                +    z = x * 2 + 1
                +    return z + y
                +
                +

                We get a trace that looks like this:

                +
                +v1 = x * 2
                +z = v1 + 1
                +v2 = z + y
                +return(v2)
                +
                +

                Observe how the first two operations could be constant-folded if the value of +x were known. Let's assume that the value of x can vary, but does so +rarely, i.e. only takes a few different values at runtime. If this is the +case, we can add a hint to promote x, like this:

                +
                def f2(x, y):
                +    x = hint(x, promote=True)
                +    z = x * 2 + 1
                +    return z + y
                +
                +

                The meaning of this hint is that the tracer should pretend that x is a +constant +in the code that follows. When just running the code, the function has no +effect, as it simply returns its first argument. When tracing, some extra work +is done. Let's assume that this changed function is traced with +the arguments 4 and 8. The trace will be the same, except for one +operation at the beginning:

                +
                +guard(x == 4)
                +v1 = x * 2
                +z = v1 + 1
                +v2 = z + y
                +return(v2)
                +
                +

                The promotion is turned into a guard operation in the trace. The guard +captures the value of x as it was at runtime. From the point of view of the +optimizer, this guard is not any different than the one produced by the if +statement in the example above. After the guard, the rest of the trace can +assume that x is equal to 4, meaning that the optimizer will turn this +trace into:

                +
                +guard(x == 4)
                +v2 = 9 + y
                +return(v2)
                +
                +

                Notice how the first two arithmetic operations were constant folded. The hope is +that the guard is executed quicker than the multiplication and the addition that +were now optimized away.

                +

                If this trace is executed with values of x other than 4, the guard will +fail, and execution will continue in the interpreter. If the guard fails often +enough, a new trace will be started from the guard. This other trace will +capture a different value of x. If it is e.g. 2, then the optimized +trace looks like this:

                +
                +guard(x == 2)
                +v2 = 5 + y
                +return(v2)
                +
                +

                This new trace will be attached to the guard instruction of the first trace. If +x takes on even more values, a new trace will eventually be made for all of them, +linking them into a chain. This is clearly not desirable, so we should promote +only variables that don't vary much. However, adding a promotion hint will never produce wrong +results. It might just lead to too much assembler code.

                +

                Promoting integers, as in the examples above, is not used that often. +However, the internals of dynamic language interpreters often +have values that are variable but vary little in the context of parts of a user +program. An example would be the types of variables in a user function. Even +though in principle the argument to a Python function could be any Python type, +in practice the argument types tend to not vary much. Therefore it is possible to +promote the types. In the next blog post I will give a complete example for how +this works.

                +
                +
                +

                Declaring New Pure Operations

                +

                In the last section we saw a way to turn arbitrary variables into constants. All +pure operations on these constants can be constant-folded. This works great for +constant folding of simple types, e.g. integers. Unfortunately, in the context of an +interpreter for a dynamic +language, most operations actually manipulate objects, not simple types. The +operations on objects are often not pure and might even have side-effects. If +one reads a field out of a constant reference to an object this cannot +necessarily be folded away because the object can be mutated. Therefore, another +hint is needed.

                +

                As an example, take the following class:

                +
                class A(object):
                +    def __init__(self, x, y):
                +        self.x = x
                +        self.y = y
                +
                +    def f(self, val):
                +        self.y = self.compute() + val
                +
                +    def compute(self):
                +        return self.x * 2 + 1
                +
                +

                Tracing the call a.f(10) of some instance of A yields the following +trace (note how the call to compute is inlined):

                +
                +x = a.x
                +v1 = x * 2
                +v2 = v1 + 1
                +v3 = v2 + val
                +a.y = v3
                +
                +

                In this case, adding a promote of self in the f method to get rid of the +computation of the first few operations does not help. Even if a is a +constant reference to an object, reading the x field does not necessarily +always yield the same value. To solve this problem, there is another annotation, +which lets the interpreter author communicate invariants to the optimizer. In +this case, she could decide that the x field of instances of A is +immutable, and therefore compute +is a pure function. To communicate this, there is a purefunction decorator. +If the code in compute should be constant-folded away, we would change the +class as follows:

                +
                class A(object):
                +    def __init__(self, x, y):
                +        self.x = x
                +        self.y = y
                +
                +    def f(self, val):
                +        self = hint(self, promote=True)
                +        self.y = self.compute() + val
                +
                +    @purefunction
                +    def compute(self):
                +        return self.x * 2 + 1
                +
                +

                Now the trace will look like this:

                +
                +guard(a == 0xb73984a8)
                +v1 = compute(a)
                +v2 = v1 + val
                +a.y = v2
                +
                +

                Here, 0xb73984a8 is the address of the instance of A that was used +during tracing. The call to compute is not inlined, so that the optimizer +has a chance to see it. Since the compute function is marked as pure, and its +argument +is a constant reference, the call will be removed by the optimizer. The final +trace looks like this:

                +
                +guard(a == 0xb73984a8)
                +v2 = 9 + val
                +a.y = v2
                +
                +

                (assuming that the x field's value is 4).

                +

                On the one hand, the purefunction annotation is very powerful. It can be +used to constant-fold arbitrary parts of the computation in the interpreter. +However, the annotation also gives you ample opportunity to mess things up. If a +function is annotated to be pure, but is not really, the optimizer can produce +subtly wrong code. Therefore, a lot of care has to be taken when using this +annotation.

                +
                +

                Observably Pure Functions

                +

                Why can't we simply write an analysis to find out that the x fields of the +A instances are immutable and deduce that compute is a pure function, +since it only reads the x field and does not have side effects? This might +be possible in this particular case, but in practice the functions that are +annotated with the purefunction decorator are usually more complex. +The easiest example for this is that of a function that uses memoization to +cache its results. If you analyze this function, it looks like the function has +side effects, because it changes the memoizing dictionary. However, because this side +effect is not externally visible, the function from the outside is pure. This is +a property that is not easily detectable by analysis. Therefore, the purity +of this function needs to be annotated.

                +
                +
                +

                Immutable Fields

                +

                One of the most common cases of pure functions is reading immutable +values out of objects. Since this is so common, we have special syntactic sugar +for it. An RPython class can have a class attribute _immutable_fields_ set to +a list of strings, listing the fields that cannot be changed. This is equivalent +to using getters and annotating them with purefunction.

                +
                +
                +
                +

                Conclusion

                +

                In this blog post I explained two more hints that can be used in the source code +of the interpreter. They are used to influence what the optimizer does with the +trace. I realize the examples given here are a bit too small; in the next +installment I will give a worked-out example that puts all the pieces together.

                +
                +
                +

                Comments

                +
                +
                +
                + + Gaëtan de Menten wrote on 2011-03-16 10:56: +
                +
                +

                Again a very interesting post. I would like some precisions for one sentence:
                "If x takes on even more values, a new trace will eventually be made for all of them, linking them into a chain."

                Does it mean they are all tried in sequence, or is there some dispatch mechanism? If there isn't, wouldn't it be beneficial to have one in place (probably using a hash table of some sort) when there is more than a few values? Or is the number of "generated branches" never supposed to be large enough to make such an approach worthwile?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-16 12:27: +
                +
                +

                @Gaëtan:

                Right now it's just a linear search always, which is clearly not ideal and we might very well fix this in the future. Currently we have the hope that in practice the number of values is always small, but we never measured.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/03/controlling-tracing-of-interpreter-with_21-6524148550848694588.html b/posts/2011/03/controlling-tracing-of-interpreter-with_21-6524148550848694588.html new file mode 100644 index 000000000..dc50e9989 --- /dev/null +++ b/posts/2011/03/controlling-tracing-of-interpreter-with_21-6524148550848694588.html @@ -0,0 +1,762 @@ + + + + + +Controlling the Tracing of an Interpreter With Hints, Part 3: Putting it All Together | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Controlling the Tracing of an Interpreter With Hints, Part 3: Putting it All Together

                + + + +
                +

                This is part 3 of the series on how to speed up an interpreter written with +PyPy by adding JIT hints to the interpreter. Part 1 described how to control +the extent of tracing. Part 2 described how to influence the optimizer with +promotion and pure functions. In this post I describe a worked-out example of +a small object model for a dynamic language and how to make it efficient using +the hints described in the previous posts.

                +
                +

                A Simple Object Model

                +

                To implement a dynamic language efficiently, the operations on its objects need +to be fast. Most dynamic languages have object models that are made by using +dictionaries everywhere. Let's look at an example of how the JIT can be made to +optimize such operations.

                +

                For the purpose of this blog post we will use a very simple and bare-bones +object model that just supports very simple classes and instances, without any +inheritance or any fancy features. The model has classes, which contain methods. +Instances have a class. Instances have their own attributes. When looking up an +attribute on an instance, the instance's attributes are searched. If the +attribute is not found there, the class' attributes are searched.

                +

                To implement this object model, we could use the following RPython code as part +of the interpreter source code:

                +
                class Class(object):
                +    def __init__(self, name):
                +        self.name = name
                +        self.methods = {}
                +
                +    def instantiate(self):
                +        return Instance(self)
                +
                +    def find_method(self, name):
                +        result = self.methods.get(name)
                +        if result is not None:
                +            return result
                +        raise AttributeError(name)
                +
                +    def change_method(self, name, value):
                +        self.methods[name] = value
                +
                +
                +class Instance(object):
                +    def __init__(self, cls):
                +        self.cls = cls
                +        self.attributes = {}
                +
                +    def getfield(self, name):
                +        result = self.attributes.get(name)
                +        if result is not None:
                +            return result
                +        raise AttributeError(name)
                +
                +    def write_attribute(self, name, value):
                +        self.attributes[name] = value
                +
                +    def getattr(self, name):
                +        try:
                +            return self.getfield(name)
                +        except AttributeError:
                +            return self.cls.find_method(name)
                +
                +

                In this straightforward implementation the methods and attributes are just +stored in dictionaries on the classes/instances. While this object model is very +simple it already contains all the hard parts of Python's object model. Both +instances and classes can have arbitrary fields, and they are changeable at +any time. Moreover, instances can change their class after they have been +created.

                +

                When using this object model in +an interpreter, a huge amount of time will be spent doing lookups in these +dictionaries. To make the language efficient using a tracing JIT, we need to +find a way to get rid of these dictionary lookups somehow.

                +

                Let's assume we trace through code that sums three attributes, such as:

                +
                inst.getattr("a") + inst.getattr("b") + inst.getattr("c")
                +
                +

                The trace could look like this:

                +
                # inst.getattr("a")
                +attributes1 = inst.attributes
                +result1 = dict.get(attributes1, "a")
                +guard(result1 is not None)
                +
                +# inst.getattr("b")
                +attributes2 = inst.attributes
                +v1 = dict.get(attributes2, "b")
                +guard(v1 is None)
                +cls1 = inst.cls
                +methods1 = cls1.methods
                +result2 = dict.get(methods1, "b")
                +guard(result2 is not None)
                +v2 = result1 + result2
                +
                +# inst.getattr("c")
                +attributes3 = inst.attributes
                +v3 = dict.get(attributes3, "c")
                +guard(v3 is None)
                +cls2 = inst.cls
                +methods2 = cls2.methods
                +result3 = dict.get(methods2, "c")
                +guard(result3 is not None)
                +
                +v4 = v2 + result3
                +return(v4)
                +
                +

                In this example, the attribute a is found on the instance, but the +attributes b and c are found on the class. The trace indeed contains +five calls to dict.get, which is slow.

                +
                +
                +

                Making Instance Attributes Faster Using Maps

                +

                The first step in making getattr faster in our object model is to optimize +away the dictionary lookups on the instances. The hints we have looked at in the +two earlier blog posts don't seem to help with the current object model. There is +no pure function to be seen, and the instance is not a candidate for promotion, +because there tend to be many instances.

                +

                This is a common problem when trying to apply hints. Often, the interpreter +needs a small rewrite to expose the pure functions and nearly-constant objects +that are implicitly there. In the case of instance fields this rewrite is not +entirely obvious. The basic idea is as follows. In theory instances can have +arbitrary fields. In practice however many instances share their layout (i.e. +their set of keys) with many other instances.

                +

                Therefore it makes sense to factor the layout information out of the instance +implementation into a shared object. This shared layout object is called a +map. Maps are an old idea that comes originally from the SELF language. They are +also used by many JavaScript implementations such as V8. I've written about maps +before, so I won't explain them fully again.

                +

                The rewritten Instance class using maps looks like this:

                +
                class Map(object):
                +    def __init__(self):
                +        self.attribute_indexes = {}
                +        self.other_maps = {}
                +
                +    @purefunction
                +    def getindex(self, name):
                +        return self.attribute_indexes.get(name, -1)
                +
                +    @purefunction
                +    def new_map_with_additional_attribute(self, name):
                +        if name not in self.other_maps:
                +            newmap = Map()
                +            newmap.attribute_indexes.update(self.attribute_indexes)
                +            newmap.attribute_indexes[name] = len(self.attribute_indexes)
                +            self.other_maps[name] = newmap
                +        return self.other_maps[name]
                +
                +
                +EMPTY_MAP = Map()
                +
                +class Instance(object):
                +    def __init__(self, cls):
                +        self.cls = cls
                +        self.map = EMPTY_MAP
                +        self.storage = []
                +
                +    def getfield(self, name):
                +        map = hint(self.map, promote=True)
                +        index = map.getindex(name)
                +        if index != -1:
                +            return self.storage[index]
                +        raise AttributeError(name)
                +
                +    def write_attribute(self, name, value):
                +        map = hint(self.map, promote=True)
                +        index = map.getindex(name)
                +        if index != -1:
                +            self.storage[index] = value
                +            return
                +        self.map = map.new_map_with_additional_attribute(name)
                +        self.storage.append(value)
                +
                +    def getattr(self, name):
                +        try:
                +            return self.getfield(name)
                +        except AttributeError:
                +            return self.cls.find_method(name)
                +
                +

                Instances no longer use dictionaries to store their fields. Instead, they have a +reference to a map, which maps field names to indexes into a storage list. The +storage list contains the actual field values. The maps are shared between +objects with the same layout. Therefore they have to be immutable, which means +that their getindex method is a pure function. When a new attribute is added +to an instance, a new map needs to be chosen, which is done with the +new_map_with_additional_attribute method on the previous map. Now that we have +introduced maps, it is safe to promote the map everywhere, because we assume +that the number of different instance layouts is small.

                +

                With this changed instance implementation, the trace we had above changes to the +following, where 0xb74af4a8 is the memory address of the Map instance that +has been promoted:

                +
                # inst.getattr("a")
                +map1 = inst.map
                +guard(map1 == 0xb74af4a8)
                +index1 = Map.getindex(map1, "a")
                +guard(index1 != -1)
                +storage1 = inst.storage
                +result1 = storage1[index1]
                +
                +# inst.getattr("b")
                +map2 = inst.map
                +guard(map2 == 0xb74af4a8)
                +index2 = Map.getindex(map2, "b")
                +guard(index2 == -1)
                +cls1 = inst.cls
                +methods1 = cls1.methods
                +result2 = dict.get(methods1, "b")
                +guard(result2 is not None)
                +v2 = result1 + result2
                +
                +# inst.getattr("c")
                +map3 = inst.map
                +guard(map3 == 0xb74af4a8)
                +index3 = Map.getindex(map3, "c")
                +guard(index3 == -1)
                +cls2 = inst.cls
                +methods2 = cls2.methods
                +result3 = dict.get(methods2, "c")
                +guard(result3 is not None)
                +
                +v4 = v2 + result3
                +return(v4)
                +
                +

                The calls to Map.getindex can be optimized away, because they are calls to +a pure function and they have constant arguments. That means that index1/2/3 +are constant and the guards on them can be removed. All but the first guard on +the map will be optimized away too, because the map cannot have changed in +between. The optimized trace looks like this:

                +
                # inst.getattr("a")
                +map1 = inst.map
                +guard(map1 == 0xb74af4a8)
                +storage1 = inst.storage
                +result1 = storage1[0]
                +
                +# inst.getattr("b")
                +cls1 = inst.cls
                +methods1 = cls1.methods
                +result2 = dict.get(methods1, "b")
                +guard(result2 is not None)
                +v2 = result1 + result2
                +
                +# inst.getattr("c")
                +cls2 = inst.cls
                +methods2 = cls2.methods
                +result3 = dict.get(methods2, "c")
                +guard(result3 is not None)
                +
                +v4 = v2 + result3
                +return(v4)
                +
                +

                The index 0 that is used to read out of the storage array is the result +of the constant-folded getindex call. This trace is already much better than +the original one. Now we are down from five dictionary lookups to just two.

                +
                +
                +

                Versioning of Classes

                +

                Instances were optimized making the assumption that the total number of +Instance layouts is small compared to the number of instances. For classes we +will make an even stronger assumption. We simply assume that it is rare for +classes to change at all. This is not totally reasonable (sometimes classes contain +counters or similar things) but for this simple example it is good enough.

                +

                What we would really like is if the Class.find_method method were pure. +But it cannot be, because it is always possible to change the class itself. +Every time the class changes, find_method can potentially return a +new value.

                +

                Therefore, we give every class a version number, which is increased every time a +class gets changed (i.e., the content of the methods dictionary changes). +This means that the result of methods.get() for a given (name, +version) pair will always be the same, i.e. it is a pure operation. To help +the JIT to detect this case, we factor it out in a helper method which is +explicitly marked as @purefunction. The refactored Class looks like +this:

                +
                class VersionTag(object):
                +    pass
                +
                +class Class(object):
                +    def __init__(self, name):
                +        self.name = name
                +        self.methods = {}
                +        self.version = VersionTag()
                +
                +    def find_method(self, name):
                +        self = hint(self, promote=True)
                +        version = hint(self.version, promote=True)
                +        result = self._find_method(name, version)
                +        if result is not None:
                +            return result
                +        raise AttributeError(name)
                +
                +    @purefunction
                +    def _find_method(self, name, version):
                +        return self.methods.get(name)
                +
                +    def change_method(self, name, value):
                +        self.methods[name] = value
                +        self.version = VersionTag()
                +
                +

                What is interesting here is that _find_method takes the version +argument but it does not use it at all. Its only purpose is to make the call +pure (because when the version number changes, the result of the call might be +different than the previous one).

                +

                The trace with this new class implementation looks like this:

                +
                # inst.getattr("a")
                +map1 = inst.map
                +guard(map1 == 0xb74af4a8)
                +index1 = Map.getindex(map1, "a")
                +guard(index1 != -1)
                +storage1 = inst.storage
                +result1 = storage1[index1]
                +
                +# inst.getattr("b")
                +map2 = inst.map
                +guard(map2 == 0xb74af4a8)
                +index2 = Map.getindex(map2, "b")
                +guard(index2 == -1)
                +cls1 = inst.cls
                +guard(cls1 == 0xb7aaaaf8)
                +version1 = cls1.version
                +guard(version1 == 0xb7bbbb18)
                +result2 = Class._find_method(cls1, "b", version1)
                +guard(result2 is not None)
                +v2 = result1 + result2
                +
                +# inst.getattr("c")
                +map3 = inst.map
                +guard(map3 == 0xb74af4a8)
                +index3 = Map.getindex(map3, "c")
                +guard(index3 == -1)
                +cls2 = inst.cls
                +guard(cls2 == 0xb7aaaaf8)
                +version2 = cls2.version
                +guard(version2 == 0xb7bbbb18)
                +result3 = Class._find_method(cls2, "c", version2)
                +guard(result3 is not None)
                +
                +v4 = v2 + result3
                +return(v4)
                +
                +

                The calls to Class._find_method can now be optimized away, also the +promotion of the class and the version, except for the first one. The final +optimized trace looks like this:

                +
                # inst.getattr("a")
                +map1 = inst.map
                +guard(map1 == 0xb74af4a8)
                +storage1 = inst.storage
                +result1 = storage1[0]
                +
                +# inst.getattr("b")
                +cls1 = inst.cls
                +guard(cls1 == 0xb7aaaaf8)
                +version1 = cls1.version
                +guard(version1 == 0xb7bbbb18)
                +v2 = result1 + 41
                +
                +# inst.getattr("c")
                +v4 = v2 + 17
                +return(v4)
                +
                +

                The constants 41 and 17 are the results of the folding of the +_find_method calls. This final trace is now very good. It no longer performs any +dictionary lookups. Instead it contains several guards. The first guard +checks that the map is still the same. This guard will fail if the same +code is executed with an instance that has another layout. The second guard +checks that the class of inst is still the same. It will fail if the trace is +executed with an instance of another class. The third guard checks that the +class did not change since the trace was produced. It will fail if somebody +calls the change_method method on the class.

                +
                +
                +

                Real-World Considerations

                +

                The techniques used above for the simple object model are used for the object +model of PyPy's Python interpreter too. Since Python's object model is +considerably more complex, some additional work needs to be done.

                +

                The first problem that needs to be solved is that Python supports (multiple) +inheritance. Therefore looking up a method in a class needs to consider the +whole method resolution order. This makes the versioning of classes more +complex. If a class is changed its version changes. At the same time, the +versions of all the classes inheriting from it need to be changed as well, +recursively. This makes class changes expensive, but they should be rare. On the +other hand, a method lookup in a complex class hierarchy is as optimized in the +trace as in our object model here.

                +

                A downside of the versioning of classes that we haven't yet fixed in PyPy is +that some classes do change a lot. An example would be a class that keeps a +counter of how many instances have been created so far. This is very slow right +now, but we have ideas about how to fix it in the future.

                +

                Another optimization is that in practice the shape of an instance is correlated +with its class. In our code above, we allow both to vary independently. +In PyPy's Python interpreter we act somewhat more cleverly. The class of +an instance is not stored on the instance itself, but on the map. This means +that we get one fewer promotion (and thus one fewer guard) in the trace, because the class doesn't need to +be promoted after the map has been.

                +
                +
                +

                More General Patterns

                +

                The techniques we used above to make instance and class lookups faster are +applicable in more general cases than the one we developed them for. A more +abstract view of maps is that of splitting a data-structure into a part that +changes slowly, and a part that changes quickly. In the concrete example of maps +we split the original dictionary into the map (the slow-changing part) and the +storage array (the quick-changing part). All the computation on the +slow-changing part can be constant-folded during tracing so that only the +manipulation of the quick-changing part remains.

                +

                Similarly, versions can be used to constant-fold arbitrary functions of large data +structures. The version needs to be updated carefully every time the result of +this function can change. Therefore this is useful only if the data structure is +expected to change slowly.

                +
                +
                +

                Conclusion

                +

                In this post I showed how to use purefunction and promote to make a +small but still relevant dynamic object model no longer use any dictionary lookups +after tracing. Instead a number of guards are inserted into the +trace to check whether the assumptions about the objects are still true. This +makes operations on objects seriously faster. I plan to write another small post +that shows the speed benefits for PyPy's Python interpreter for exactly these +operations.

                +
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2011-03-21 19:33: +
                +
                +

                Very clever indeed.
                I think and additional speedup can be achieved
                by using a technique from smalltalk intrepters: Method lookup cache.
                The cache is organized so that function
                cache(class, method) returns a pointer to the method.
                The early Smalltalk implementors reported pretty spectacular speedups when this cache was implemented.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-03-21 20:03: +
                +
                +

                SO MUCH AWESOME.

                +
                +
                +
                +
                + + RonnyPfannschmidt wrote on 2011-03-21 22:07: +
                +
                +

                @vadiml: the jit+version tags already acts as method lookup cache for jited code
                it basically inlines lookup(class, method)

                +
                +
                +
                +
                + + Unknown wrote on 2011-03-22 07:46: +
                +
                +

                @RonnyPfannschmidt: thinking more about it
                yes, you're right of course

                +
                +
                +
                +
                + + Anonymous wrote on 2011-03-23 18:37: +
                +
                +

                I'm wondering about VersionTag(). The guard you've shown looks at its memory address. Doesn't PyPy use compacting garbage collectors? I seem to recall that from earlier posts about the cost of id().

                +
                +
                +
                +
                + + Anonymous wrote on 2011-03-23 20:23: +
                +
                +

                Hmm. And now I think I know why twisted isn't any faster in pypy. I remember looking at the source a few years ago and being horrified to see that they were changing class methods during runtime. I guessed to avoid one layer of dispatch in state machines. Anyway, it's an "optimisation" that will hurt pypy.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-24 09:11: +
                +
                +

                @Marius: You are right. The trace is a bit simplified, in practice there is an indirection so that if the GC moves the object, the trace still works.

                @Anonymous: can you find that place in twisted? would be very interesting to see. Also it probably means we should implement these ideas about making changing classes not quite so inefficient.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/03/controlling-tracing-of-interpreter-with_26-3072929156700508140.html b/posts/2011/03/controlling-tracing-of-interpreter-with_26-3072929156700508140.html new file mode 100644 index 000000000..c32a69167 --- /dev/null +++ b/posts/2011/03/controlling-tracing-of-interpreter-with_26-3072929156700508140.html @@ -0,0 +1,412 @@ + + + + + +Controlling the Tracing of an Interpreter With Hints, Part 4: Benchmarks | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Controlling the Tracing of an Interpreter With Hints, Part 4: Benchmarks

                + + + +
                +

                This is part 4 and the final part of the series on how to speed up an interpreter +written with PyPy by adding JIT hints to the interpreter. Part 1 described how +to control the extent of tracing. Part 2 described how to influence the +optimizer with promotion and pure functions. Part 3 described a simple object +model and how it can be optimized by doing small rewrites. In this (short) post +I present some benchmarks.

                +
                +

                Benchmarks

                +

                For the benchmarks I ran a subset of the benchmarks on https://speed.pypy.org +with CPython and four different executables of PyPy's Python interpreter (all +with a JIT). The executables contain all combinations of enabling maps (which +make instance attributes fast) and type versions (which make method lookup +fast).

                +
                  +
                • +pypy-slow: contains neither maps nor type versions.
                • +
                • +pypy-map: contains maps but not type versions.
                • +
                • +pypy-version: contains type versions but not maps.
                • +
                • +pypy-full: contains both maps and type versions
                • +
                +

                The results are as follows:

                + +

                The graph shows the speedup over CPython's numbers. The results are quite +interesting. Maps by themselves do not speed up much over the bare JIT, whereas +type versions alone improve on the JIT baseline in many cases. However, maps +are not useless. In combination with type versions they add a nice improvement +over just type versions in a number of benchmarks (most notably +raytrace-simple and richards but also in crypto-pyaes, django +and go).

                +

                It's clear that type versions can be arbitrarily effective. A method lookup on a +class can be arbitrarily slow, if the inheritance hierarchy becomes deeper and +deeper. The full lookup is replaced by one promotion if type versions are +enabled.

                +

                Maps on the other hand always replace one dict lookup with one promotion. Since +dict lookups are already very fast, this by itself does not lead to a gigantic +improvement. Only in combination with type versions do they show their full +potential.

                +
                +
                +

                Comments

                +
                +
                +
                + + Winston Ewert wrote on 2011-03-26 20:17: +
                +
                +

                It's not clear to me why version + maps combine so well. Maps should effectively eliminate lookups on the instance dict and versions eliminate lookups on the class dict. Both versions would seem to eliminate different classes of lookups, so I'm not seeing why we have dramatic improvement when using them together.

                +
                +
                +
                +
                + + Alex wrote on 2011-03-26 20:19: +
                +
                +

                I'm not an expert at CPU architecture, but ISTM eliminating both can eliminate a large number of memory reads which would help with pipelining and other very low level optimizations.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-26 21:33: +
                +
                +

                @Winston: I actually have no clue :-). The numbers are hard to deny though. I plan to stare at the traces a bit next week, can comment here if I find something interesting.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-27 14:52: +
                +
                +

                @Winston: ok, I probably found out. Your reasoning is too simple because usually you do several lookups on the same object in a row. Every lookup looks first in the class, then in the instance. So it looks a bit like this:

                lookup name1 in obj.__class__
                lookup name1 in obj.__dict__
                lookup name2 in obj.__class__
                lookup name2 in obj.__dict__
                lookup name2 in obj.__class__
                lookup name2 in obj.__dict__

                when using maps, every lookup in the dict is simply reading the map, promoting it and then a read. after the promotion of the map, the instance's layout is fully known. however, if type versions are disabled, the lookups in the class are complex operations that are opaque to the JIT. Therefore the JIT assumes they can change the layout and thus the map of the object.

                If you also enable type versions, then the class lookups are understandable to the JIT. therefore the JIT can see that the class lookup didn't change the layout of the class. This means that after the first instance lookup, the following instance lookups cost nothing at all.

                +
                +
                +
                +
                + + klaussfreire wrote on 2011-03-28 15:04: +
                +
                +

                I think an important improvement brought about by maps is the memory footprint reduction.

                It won't matter all the time, but it makes all classes as space-efficient as if they used __slots__, all automagically, which is no small thing.

                For programs that handle lots of small objects, this can really make a difference, in memory consumption and speed (less memory to shuffle around will invariably be faster)

                Perhaps the benchmark suite doesn't have enough of those cases.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-03-28 22:16: +
                +
                +

                @cfbolz I think one reason why maps+version tags are fast is because we lack jit.unroll_safe on several lookup functions when version tags are disabled. Marking them as unrollable would speed things up.

                The reasoning behind this is that old style classes which have maps, but no version tags are much faster than new style classes with version tags disabled.

                +
                +
                +
                +
                + + Winston Ewert wrote on 2011-03-30 00:41: +
                +
                +

                Thanks for taking the time to answer my query.

                The use of class versions eliminates the opaque function being called because the JIT knows the return will be constant. This allows optimizations to work correctly. But this makes me wonder how much of the improvement is due to class versions and how much is due to lack of opaqueness.

                At any rate, I always find the posts on this blog very interesting. It definitely some neat stuff you are doing here.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-03-30 11:30: +
                +
                +

                @fijal I thought old-style classes had celldicts? That's yet another thing, but your point is still correct.

                +
                +
                +
                +
                + + Benjamin wrote on 2011-04-27 22:48: +
                +
                +

                I'd love to see a blog post about conventions to favor or avoid while writing python code to best take advantage of these excellent features. For example, your previous post implied something like this would be faster than changing the class directly:

                class Counter(object):
                ....def __init__(self):
                ........self.count = 0
                ....def increment(self):
                ........self.count += 1

                class Many(object):
                ....counter = Counter()
                ....def __init__(self):
                ........self.counter.increment()

                Granted, it would be preferable, from a coding standpoint, to just use a simple class attribute, but the adaptations that would likely work best for the pypy JIT seem like far smaller divergences from the 'ideal' python than many other lengths people go to when coding for speed, particularly compared to something like cython.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/03/thank-you-to-psf-5934275567667314914.html b/posts/2011/03/thank-you-to-psf-5934275567667314914.html new file mode 100644 index 000000000..c7382042f --- /dev/null +++ b/posts/2011/03/thank-you-to-psf-5934275567667314914.html @@ -0,0 +1,349 @@ + + + + + +A thank you to the PSF | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                A thank you to the PSF

                + + + +
                +

                This year's PyCon was an incredible time; several members of the PyPy team were +there, and we'll be blogging more about our experiences in the coming days. +However, we quickly wanted to extend a thank you to the Python Software +Foundation (PSF).

                +

                As you may have heard, on Friday morning at PyCon Jesse Noller handed the PyPy +team a check for $10,000, on behalf of the PSF. This was in recognition of our +success over the past few years in bringing PyPy from a research project +to a fast, compliant, production-ready Python implementation, and to allow us +to continue our work on making it faster and more up-to-date with upstream +version changes.

                +

                Beyond the large check, we're grateful for the endorsement this represents, +not only of our work on PyPy, but also of all alternative Python VMs. +The PSF has shifted its focus from representing just CPython to representing +the Python Language, regardless of its implementation, something we are very +appreciative of.

                + +

                From left to right, PyPy people present at PyCon 2011: Maciej Fijałkowski, Armin Rigo, Alex Gaynor, Laura Creighton and Jacob Hallén

                + +

                Thank you, PSF.

                +
                +

                Comments

                +
                +
                +
                + + Hodgestar wrote on 2011-03-22 00:17: +
                +
                +

                Congratulations! It's great to see the PSF embracing the broader Python ecosystem.

                +
                +
                +
                +
                + + Steve wrote on 2011-03-22 03:24: +
                +
                +

                It's nice to be able to offer this support as an indication that we aren't just the CPython Software Foundation. It is a well-deserved award, and we know it will be put to good use.

                +
                +
                +
                +
                + + Unknown wrote on 2011-03-23 14:47: +
                +
                +

                Yyes. Keep it Going! =)

                +
                +
                +
                +
                + + Unknown wrote on 2011-05-03 08:34: +
                +
                +

                Wow, congratulations! PyPy has gone a long way.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/03/us-trip-report-popl-microsoft-ibm-3874568000250679204.html b/posts/2011/03/us-trip-report-popl-microsoft-ibm-3874568000250679204.html new file mode 100644 index 000000000..ad3ac6332 --- /dev/null +++ b/posts/2011/03/us-trip-report-popl-microsoft-ibm-3874568000250679204.html @@ -0,0 +1,454 @@ + + + + + +US Trip Report: POPL, Microsoft, IBM | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                US Trip Report: POPL, Microsoft, IBM

                + + + +
                +

                Some notes from my recent trip (from 23rd of January to 17th of February) to the +US, where I presented PyPy at various scientifically oriented places. In +summary, there seems to be quite a bit of interest in PyPy within the research +community; details below.

                +
                +

                PEPM/POPL/STOP

                +

                From the 24th to the 29th of January I was in Austin, Texas at the POPL +conference, where I gave a talk at one of the workshops, PEPM (Partial +Evaluation and Program Manipulation). The title of our paper is +"Allocation Removal by Partial Evaluation in a Tracing JIT", the abstract is:

                +
                +The performance of many dynamic language implementations suffers from high +allocation rates and runtime type checks. This makes dynamic languages less +applicable to purely algorithmic problems, despite their growing +popularity. In this paper we present a simple compiler optimization based +on online partial evaluation to remove object allocations and runtime type +checks in the context of a tracing JIT. We evaluate the optimization using +a Python VM and find that it gives good results for all our (real-life) +benchmarks.
                +

                The talk (slides) seemed to be well-received and there was +a good discussion afterwards. PEPM in general was a very enjoyable workshop +with many interesting talks on partial evaluation (which I am very interested +in) and a great keynote by Olivier Danvy about "A Walk in the Semantic Park".

                +

                POPL itself was a bit outside of the area I am most knowledgeable in, most of +the talks being on formal topics. Some of the talks that stuck to my mind:

                +
                  +
                • +"The Design of Kodu: A Tiny Visual Programming Language for Children on the +Xbox 360", the keynote by Matthew MacLaurin from Microsoft Research. I didn't +know about Kodu before, and was very impressed by it.
                • +
                +
                  +
                • +"Automating String Processing in Spreadsheets using Input-Output Examples" +(paper) by Sumit Gulwani (also from MS Research) describes a plugin to Excel +that can automate many common string processing tasks by giving a couple of +examples, which are then abstracted into a generic string manipulation. Very +cool.
                • +
                +
                  +
                • +"Dynamic Inference of Static Types for Ruby" (paper) by Michael Furr, +Jong-hoon (David) An, Jeffrey S. Foster and Michael Hicks describes an +approach to type inference that works by observing the actual types seen +during unit-testing. Similar things have been done a few times before, +however, the paper actually gives a correctness result.
                • +
                +
                  +
                • +"The Essence of Compiling with Traces" (paper) by Shu-Yu Guo and Jens +Palsberg describes a formalization of a simple imperative language and +proves that executing it using trace compilation will do exactly the same +thing as using an interpreter. It also looks at what conditions an +optimization on traces must fulfill to still produce valid results.
                • +
                +

                After the main conference, I took part in the STOP (Scripts to Programs) +workshop. It had a great keynote "Scripting in a Concurrent World" by John Field +about the Thorn language and a few interesting other talks.

                +
                +
                +

                Microsoft Research

                +

                After POPL I went to Redmond to visit Microsoft Research for a week, +specifically the RiSE group. This is the group that did the SPUR project, +a meta-tracing JIT for C# applied to a JavaScript interpreter in C#. I compared +PyPy to SPUR last year. I am very grateful to Microsoft for inviting me +there.

                +

                At Microsoft I gave a talk about "PyPy's Approach to Implementing Dynamic +Languages Using a Tracing JIT Compiler", the slides of which can be found +here. The talk was filmed and is online. People seemed to be impressed +with the "product qualities" of PyPy, e.g. the buildbot infrastructure and +speed tracking website.

                +

                The rest of the time I discussed with various researchers in the RiSE group, +particularly with Nikolai Tillmann. We talked a lot about similarities and +differences between SPUR and PyPy and tried to understand our respective projects +better. SPUR is a really great project and I learned a lot in the discussions, +for example about the optimizations and heuristics their trace compiler uses.

                +

                Another very cool project done by the RiSE group that I learned more about is +PEX. PEX is a unit test generator for C# that tries to produce unit tests for +so-far untested execution paths within methods. There is an online puzzle +version of it, if you want to get an impression of the technology (including a +very impressive C# IDE in the browser).

                +
                +
                +

                IBM

                +

                For the last part of the trip I stayed in New York City for two weeks, +mostly as a vacation. However, I also visited IBM Watson Research Center for +two days, to which I had been invited by David Edelsohn.

                +

                The first day I gave the same presentation I had given at Microsoft (with some +improvements to the slides), again it was quite well received. The rest of +the time I spent in (very fruitful) discussions with various people and teams, +among them the Liquid Metal team and the Thorn team.

                +

                The second day I met with members of the FIORANO group, who are working on +dynamic compilation for dynamic languages and Java. They explored various ways +to speed up Python, both by improving the CPython interpreter as well as with +JIT compilation techniques.

                +

                Another of their projects is to add a trace compiler to IBM's J9 JVM, about +which the paper "A Trace-based Java JIT Compiler Retrofitted from a +Method-based Compiler" is going to appear at CGO. I discussed tracing JITs with +Peng Wu, one of the authors of that paper. Peng tries to systematically look at +the various heuristics found in the different VMs that use tracing JITs. This +is a very different perspective from the one I usually have, focusing on how to +improve PyPy's specific heuristics. Therefore that discussion helped me think +about the issues more generally.

                +

                Another goal of the group is to try to find benchmarks that are representative +for typical Python workloads, which is something that has been done very +carefully for Java e.g. when developing the DaCapo benchmark suite. The +benchmarks that the Python community uses have not been selected in such a +careful and measured way, so I think that trying to be more systematic there is +a very worthwhile endeavour.

                +
                +
                +

                Comments

                +
                +
                +
                + + holger krekel wrote on 2011-03-05 14:04: +
                +
                +

                Thanks for the interesting overview of your travels and research interactions! I i agree that getting better and more systematic benchmarks for Python would be worthwhile.

                +
                +
                +
                +
                + + Ivan wrote on 2011-03-07 20:36: +
                +
                +

                I find this project fascinating.

                I wonder what's the theoretical limit of this approach for improving the performance of python (or any other language implemented in pypy)?

                Do you have any rought estimation on how far you can go? Have you reached a limit or you are just scratching the possibilities?

                For example, do you think you can compete with javascript v8 or luajit?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-03-08 13:59: +
                +
                +

                Hi Ivan.

                In general I don't think there are limits of approach other than say time and money. Python is a complex language.

                Can you come up with an example where PyPy is actually slower than V8 *other* than computer language shootout? Programs on computer language shootout are just not nicely optimized for PyPy.

                +
                +
                +
                +
                + + Ivan wrote on 2011-03-08 16:10: +
                +
                +

                Hi Fijall,

                I'm afraid I don't know about benchmarks and comparison between these languages, other than the shootout. I guess this is the first reference someone gets when comparing languages, since it's the most popular out there.

                But it would be great if there was a resource to compare against other languages. At least, from a marketing point of view, it would be very good for pypy.

                May I know why the shootout is not a good parameter?

                And, is there any other benchmarks comparing pypy against v8, tracemonkey/jägermonkey, etc..?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-03-08 16:21: +
                +
                +

                Hi Ivan.

                Shootout is not good because it contains heavily tuned programs, some of them even massively stretching the benchmark restrictions. They're tailored towards specific implementations, contain specific per-benchmark options etc. Nobody looked at python programs at detail and especially from PyPy perspective. This would need to be done first to compare those fairly, until it's not done, it's comparing naive version to a heavily optimized one and not comparing languages.

                From what I measured roughly PyPy comes on par with tracemonkey and about 2x slower V8. But those were very unscientific experiments and I'll deny everything :)

                I don't think there is any good cross-language comparison and that's at least partly due to the fact that workloads differ in different languages. Most shootout programs for example are tailored towards C workloads. Optimizing precisely for them (even if you have a good programs) is kind of fun, but it does not represent what we try to achieve, that is speeding up large python programs.

                I hope this answers your question.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2011-03-10 23:15: +
                +
                +

                to me it seems like you have reached the goals of unladen swallow and unladen swallow was a bit of a failure?

                if google wants a faster python, why don't they fund you? it would be awesome if the core team could work on it full-time. :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/04/pypy-15-released-catching-up-302997959079576809.html b/posts/2011/04/pypy-15-released-catching-up-302997959079576809.html new file mode 100644 index 000000000..ee970b4dd --- /dev/null +++ b/posts/2011/04/pypy-15-released-catching-up-302997959079576809.html @@ -0,0 +1,518 @@ + + + + + +PyPy 1.5 Released: Catching Up | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.5 Released: Catching Up

                + + + +
                +

                We're pleased to announce the 1.5 release of PyPy. This release updates +PyPy with the features of CPython 2.7.1, including the standard library. Thus +all the features of CPython 2.6 and CPython 2.7 are now supported. It +also contains additional performance improvements. You can download it here:

                +
                +https://pypy.org/download.html +
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.1. It's fast (pypy 1.5 and cpython 2.6.2 performance comparison) +due to its integrated tracing JIT compiler.

                +

                This release includes the features of CPython 2.6 and 2.7. It also includes a +large number of small improvements to the tracing JIT compiler. It supports +Intel machines running Linux 32/64 or Mac OS X. Windows is beta (it roughly +works but a lot of small issues have not been fixed so far). Windows 64 is +not yet supported.

                +

                Numerous speed achievements are described on our blog. Normalized speed +charts comparing pypy 1.5 and pypy 1.4 as well as pypy 1.5 and cpython +2.6.2 are available on our benchmark website. The speed improvement over 1.4 +seems to be around 25% on average.

                +
                +
                +

                More highlights

                +
                  +
                • The largest change in PyPy's tracing JIT is adding support for loop invariant +code motion, which was mostly done by Håkan Ardö. This feature improves the +performance of tight loops doing numerical calculations.
                • +
                • The CPython extension module API has been improved and now supports many more +extensions. For information on which ones are supported, please refer to our +compatibility wiki.
                • +
                • These changes make it possible to support Tkinter and IDLE.
                • +
                • The cProfile profiler is now working with the JIT. However, it skews the +performance in unstudied ways. Therefore it is not yet usable to analyze +subtle performance problems (the same is true for CPython of course).
                • +
                • There is an external fork which includes an RPython version of the +postgresql. However, there are no prebuilt binaries for this.
                • +
                • Our developer documentation was moved to Sphinx and cleaned up.
                • +
                • and many small things :-)
                • +
                +

                Cheers,

                +

                Carl Friedrich Bolz, Laura Creighton, Antonio Cuni, Maciej Fijalkowski, +Amaury Forgeot d'Arc, Alex Gaynor, Armin Rigo and the PyPy team

                +
                +
                +

                Comments

                +
                +
                +
                + + kost BebiX wrote on 2011-04-30 16:59: +
                +
                +

                Cool. Blog design became blue :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-30 17:37: +
                +
                +

                Unless there is something Intel specific - maybe calling it x86/x86-64 might be a good idea since this suggests that pypy does not work on amd / via chips.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-30 21:33: +
                +
                +

                do you have plans to add CPython 2.7.1 to speed.pypy.org?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-30 22:21: +
                +
                +

                Is it just me or does cProfile seem rather broken (at least on Windows)? I get random subtimings that are negative or in the billions.

                >>>> cProfile.run("[abs(1) for n in xrange(10**6)]")
                1000002 function calls in 1.000 seconds

                Ordered by: standard name

                ncalls tottime percall cumtime percall filename:lineno(function)
                1 -137.813 -137.813 1.000 1.000 :1()
                1000000 138.813 0.000 138.813 0.000 {abs}
                1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Prof
                iler' objects}

                +
                +
                +
                +
                + + Zooko wrote on 2011-04-30 22:34: +
                +
                +

                Where's the flattr button? I want to give you a euro tip again, just like I do every time you blog.

                Also: way to go on releasing PyPy 1.5! This project is really growing up!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-05-01 11:10: +
                +
                +

                Anonymous: cProfile on Windows works for me. It might be details of your Windows version or whatever. Can you open it as a proper bug report? Thanks! https://codespeak.net/issue/pypy-dev/

                +
                +
                +
                +
                + + Unknown wrote on 2011-05-01 11:24: +
                +
                +

                Awesome! Looking forward to PyPy on NaCl.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-05-01 12:20: +
                +
                +

                @zooko: I don't know why the flattr button went away. I re-uploaded the template to blogger and now it seems to be there again, can you confirm?

                +
                +
                +
                +
                + + etal wrote on 2011-05-01 13:40: +
                +
                +

                Great stuff. Do you think PyPy is ready to be re-packaged for Debian yet?

                I'm looking at this:
                https://bugs.debian.org/538858

                I have a feeling the popcon would be quite a bit higher nowadays.

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2011-05-02 08:19: +
                +
                +

                Congratulations to the whole team. What's coming next now that this large milestone is completed?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-02 11:17: +
                +
                +

                Is it just me or does the download page still point to the 1.4.1 release?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-05-02 11:23: +
                +
                +

                @Anonymous: what is the "download page" you are talking about? For me,
                https://pypy.org/download.html

                shows only links to PyPy 1.5. Maybe it's a browser cache issue?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-02 11:31: +
                +
                +

                This is insane.

                I clicked on the link multiple times yesterday and today (after restarting firefox) and only now the page refreshed correctly.

                Just shows you that anything can happen.

                +
                +
                +
                +
                + + vak wrote on 2011-05-03 16:43: +
                +
                +

                btw, regarding https://bitbucket.org/pypy/compatibility/wiki/Home -- i am using pymongo driver under pypy without problems (not yet checked against the fresh pypy 1.5 though)

                +
                +
                +
                +
                + + vak wrote on 2011-05-04 09:19: +
                +
                +

                minor thing -- version isn't updated?

                Python 2.7.1 (b590cf6de419, Apr 30 2011, 02:00:34)
                [PyPy 1.5.0-alpha0 with GCC 4.4.3] on linux2

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-05 12:29: +
                +
                +

                Great news, 25% speedup over PyPy 1.4 is just another great step forward. I'm looking forward for times when Python will be fastest dynamic object-oriented language and it will be more and more popular. I feel that these times are very close thanks to PyPy.

                What about adding PyPy to The Computer Language Benchmarks Game?

                +
                +
                +
                +
                + + Damian Cugley wrote on 2011-05-07 10:36: +
                +
                +

                I have not yet managed to build C extensions on Mac OS X with distribute/distutils/whatever because sysconfig.get_config_var returns None. Is there a quick way to fix this?

                +
                +
                +
                +
                + + Damian Cugley wrote on 2011-05-07 10:38: +
                +
                +

                @anonymous The Computer Language Benchmarks Game only permits one implementation per language, and CPython 3.2 is the implementation they use for Python.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-07 14:09: +
                +
                +

                Would it be easy to implement mutable builtin classes (for example for adding new methods to int or str) in pypy?

                +
                +
                +
                +
                + + Thomas Heller wrote on 2011-06-07 17:38: +
                +
                +

                I'm speechless :-)

                This is the first time I use pypy and it works out of the box even with my fancy Windows GUI toolkit (written completely in ctypes) out of the box.

                Great work, guys!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/04/pypy-goteborg-post-easter-sprint-april-16274563331982977.html b/posts/2011/04/pypy-goteborg-post-easter-sprint-april-16274563331982977.html new file mode 100644 index 000000000..dceaee2c2 --- /dev/null +++ b/posts/2011/04/pypy-goteborg-post-easter-sprint-april-16274563331982977.html @@ -0,0 +1,393 @@ + + + + + +PyPy Göteborg Post-Easter Sprint April 25 - May 1 2011 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Göteborg Post-Easter Sprint April 25 - May 1 2011

                + + + +
                +

                The next PyPy sprint will be in Gothenburg, Sweden. It is a public sprint, +very suitable for newcomers. We'll focus on making the 1.5 release (if +it hasn't already happened) and whatever interests the Sprint attendees.

                +
                +

                Topics and goals

                +

                The main goal is to polish and release PyPy 1.5, supporting Python 2.7 +as well as the last few months' improvements in the JIT (provided that +it hasn't already happened). Other topics:

                +
                  +
                • Going over our documentation, and classifying our docs in terms of +mouldiness. Deciding what needs writing, and maybe writing it.
                • +
                • Helping people get their code running with PyPy
                • +
                • maybe work on EuroPython Training, and talks
                • +
                • Summer of Code preparation
                • +
                • speed.pypy.org
                • +
                • any other programming task is welcome too -- e.g. tweaking the +Python or JavaScript interpreter, Stackless support, and so on.
                • +
                +
                +
                +

                Location

                +

                The sprint will be held in the apartment of Laura Creighton and Jacob Hallén +which is at Götabergsgatan 22 in Gothenburg, Sweden. Here is a map. This is +in central Gothenburg. It is between the tram stops of Vasaplatsen and +Valand, (a distance of 4 blocks) where many lines call -- the 2, 3, 4, 5, +7, 10 and 13.

                +

                Probably cheapest and not too far away is to book accommodation at SGS +Veckobostader. The Elite Park Avenyn Hotel is a luxury hotel just a +few blocks away. There are scores of hotels a short walk away from the +sprint location, suitable for every budget, desire for luxury, and desire +for the unusual. You could, for instance, stay on a boat. Options are +too numerous to go into here. Just ask on the mailing list or on the blog.

                +

                Hours will be +from 10:00 until people have had enough. It's a good idea to arrive a +day before the sprint starts and leave a day later. In the middle of +the sprint there usually is a break day and it's usually ok to take +half-days off if you feel like it.

                +
                +
                +

                Good to Know

                +

                Sweden is not part of the Euro zone. One SEK (krona in singular, kronor +in plural) is roughly 1/10th of a Euro (9.36 SEK to 1 Euro).

                +

                The venue is central in Gothenburg. There is a large selection of +places to get food nearby, from edible-and-cheap to outstanding. We +often cook meals together, so let us know if you have any food allergies, +dislikes, or special requirements.

                +

                Sweden uses the same kind of plugs as Germany. 230V AC.

                +

                The Sprint will be held the week following Easter. This means, as always, +that Gothcon will be taking place the weekend before (Easter weekend). +Gothcon, now in its 35th year, is the largest European game players' conference. +Some of you may be interested in arriving early for the board games. +The conference site is only in Swedish, alas. You don't need to register +in advance unless you are planning to host a tournament (and it's too +late for that anyway).

                +
                +
                +

                Getting Here

                +

                If you are coming by train, you will arrive at the Central Station. It is +about 12 blocks to the site from there, or you can take a tram.

                +

                There are two airports which are local to Göteborg, Landvetter (the main +one) and Gothenburg City Airport (where some budget airlines fly). +If you arrive at Landvetter the airport bus stops right downtown at +Elite Park Avenyn Hotel which is the second stop, 4 blocks from the +Sprint site, as well as the end of the line, which is the Central Station. +If you arrive at Gothenburg City Airport take the bus to the end of the +line. You will be at the Central Station.

                +

                You can also arrive by ferry, from either Kiel in Germany or Frederikshavn +in Denmark.

                +
                +
                +

                Who's Coming?

                +

                If you'd like to come, please let us know when you will be arriving and +leaving, as well as your interests. We'll keep a list +of people which we'll update (which you can do yourself if you +have bitbucket pypy commit rights).

                +
                +
                +

                Comments

                +
                +
                +
                + + intgr wrote on 2011-04-04 22:37: +
                +
                +

                "e.g. tweaking the Python or JavaScript interpreter"

                Are you implying that PyPy has a JavaScript interpreter now?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-04-05 13:58: +
                +
                +

                It had one since a few years. It's not complete though: https://bitbucket.org/pypy/lang-js/overview

                +
                +
                +
                +
                + + vak wrote on 2011-04-28 08:59: +
                +
                +

                any updates from the event?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/04/tutorial-part-2-adding-jit-8121732841568309472.html b/posts/2011/04/tutorial-part-2-adding-jit-8121732841568309472.html new file mode 100644 index 000000000..547dae883 --- /dev/null +++ b/posts/2011/04/tutorial-part-2-adding-jit-8121732841568309472.html @@ -0,0 +1,652 @@ + + + + + +Tutorial Part 2: Adding a JIT | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Tutorial Part 2: Adding a JIT

                + + + +
                +

                This is the second part of a tutorial written by Andrew Brown. The first +part described how to write an interpreter with PyPy.

                +
                +

                Adding JIT

                +

                Translating RPython to C is pretty cool, but one of the best features of PyPy +is its ability to generate just-in-time compilers for your interpreter. +That's right, from just a couple hints on how your interpreter is structured, +PyPy will generate and include a JIT compiler that will, at runtime, translate +the interpreted code of our BF language to machine code!

                +

                So what do we need to tell PyPy to make this happen? First it needs to know +where the start of your bytecode evaluation loop is. This lets it keep track of +instructions being executed in the target language (BF).

                +

                We also need to let it know what defines a particular execution frame. Since +our language doesn't really have stack frames, this boils down to what's +constant for the execution of a particular instruction, and what's not. These +are called "green" and "red" variables, respectively.

                +

                Refer back to example2.py for the following.

                +

                In our main loop, there are four variables used: pc, program, bracket_map, and +tape. Of those, pc, program, and bracket_map are all green variables. They +define the execution of a particular instruction. If the JIT sees the +same combination of green variables as before, it knows execution has jumped back and +must be in a loop. The variable "tape" is our red variable; it's what's +being manipulated by the execution.

                +

                So let's tell PyPy this info. Start by importing the JitDriver class and making +an instance:

                +
                from pypy.rlib.jit import JitDriver
                +jitdriver = JitDriver(greens=['pc', 'program', 'bracket_map'],
                +        reds=['tape'])
                +
                +

                And we add this line to the very top of the while loop in the mainloop +function:

                +
                jitdriver.jit_merge_point(pc=pc, tape=tape, program=program,
                +        bracket_map=bracket_map)
                +
                +

                We also need to define a JitPolicy. We're not doing anything fancy, so this is +all we need somewhere in the file:

                +
                def jitpolicy(driver):
                +    from pypy.jit.codewriter.policy import JitPolicy
                +    return JitPolicy()
                +
                +

                See this example at example3.py

                +

                Now try translating again, but with the flag --opt=jit:

                +
                +$ python ./pypy/pypy/translator/goal/translate.py --opt=jit example3.py
                +
                +

                It will take significantly longer to translate with JIT enabled, almost 8 +minutes on my machine, and the resulting binary will be much larger. When it's +done, try having it run the mandelbrot program again. A world of difference: +12 seconds, compared to 45 seconds before!

                +

                Interestingly enough, you can see when the JIT compiler switches from +interpreted to machine code with the mandelbrot example. The first few lines of +output come out pretty fast, and then the program gets a boost of speed and +gets even faster.

                +
                +
                +

                A bit about Tracing JIT Compilers

                +

                It's worth it at this point to read up on how tracing JIT compilers work. +Here's a brief explanation: The interpreter is usually running your interpreter +code as written. When it detects that a loop of code in the target language (BF) is +executed often, that loop is considered "hot" and marked to be traced. The next +time that loop is entered, the interpreter gets put in tracing mode where every +executed instruction is logged.

                +

                When the loop is finished, tracing stops. The trace of the loop is sent to an +optimizer, and then to an assembler which outputs machine code. That machine +code is then used for subsequent loop iterations.

                +

                This machine code is often optimized for the most common case, and depends on +several assumptions about the code. Therefore, the machine code will contain +guards, to validate those assumptions. If a guard check fails, the runtime +falls back to regular interpreted mode.

                +

                A good place to start for more information is +https://en.wikipedia.org/wiki/Just-in-time_compilation

                +
                +
                +

                Debugging and Trace Logs

                +

                Can we do any better? How can we see what the JIT is doing? Let's do two +things.

                +

                First, let's add a get_printable_location function, which is used during debug +trace logging:

                +
                def get_location(pc, program, bracket_map):
                +    return "%s_%s_%s" % (
                +            program[:pc], program[pc], program[pc+1:]
                +            )
                +jitdriver = JitDriver(greens=['pc', 'program', 'bracket_map'], reds=['tape'],
                +        get_printable_location=get_location)
                +
                +

                This function is passed in the green variables, and should return a string. +Here, we're printing out the BF code, surrounding the currently executing +instruction with underscores so we can see where it is.

                +

                Download this as example4.py and translate it the same as example3.py.

                +

                Now let's run a test program (test.b, which just prints the letter "A" 15 or so +times in a loop) with trace logging:

                +
                +$ PYPYLOG=jit-log-opt:logfile ./example4-c test.b
                +
                +

                Now take a look at the file "logfile". This file is quite hard to read, so +here's my best shot at explaining it.

                +

                The file contains a log of every trace that was performed, and is essentially a +glimpse at what instructions it's compiling to machine code for you. It's +useful to see if there are unnecessary instructions or room for optimization.

                +

                Each trace starts with a line that looks like this:

                +
                +[3c091099e7a4a7] {jit-log-opt-loop
                +
                +

                and ends with a line like this:

                +
                +[3c091099eae17d jit-log-opt-loop}
                +
                +

                The next line tells you which loop number it is, and how many ops are in it. +In my case, the first trace looks like this:

                + + + +
                 1
                + 2
                + 3
                + 4
                + 5
                + 6
                + 7
                + 8
                + 9
                +10
                +11
                +12
                +13
                +14
                +15
                +16
                +17
                +18
                +19
                +20
                +21
                +22
                +23
                +24
                +25
                +26
                +27
                +28
                +29
                +
                  [3c167c92b9118f] {jit-log-opt-loop
                +  # Loop 0 : loop with 26 ops
                +  [p0, p1, i2, i3]
                +  debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)
                +  debug_merge_point('+<[>[>_+_<-]>.[<+>-]<<-]++++++++++.', 0)
                +  i4 = getarrayitem_gc(p1, i2, descr=<SignedArrayDescr>)
                +  i6 = int_add(i4, 1)
                +  setarrayitem_gc(p1, i2, i6, descr=<SignedArrayDescr>)
                +  debug_merge_point('+<[>[>+_<_-]>.[<+>-]<<-]++++++++++.', 0)
                +  debug_merge_point('+<[>[>+<_-_]>.[<+>-]<<-]++++++++++.', 0)
                +  i7 = getarrayitem_gc(p1, i3, descr=<SignedArrayDescr>)
                +  i9 = int_sub(i7, 1)
                +  setarrayitem_gc(p1, i3, i9, descr=<SignedArrayDescr>)
                +  debug_merge_point('+<[>[>+<-_]_>.[<+>-]<<-]++++++++++.', 0)
                +  i10 = int_is_true(i9)
                +  guard_true(i10, descr=<Guard2>) [p0]
                +  i14 = call(ConstClass(ll_dict_lookup__dicttablePtr_Signed_Signed), ConstPtr(ptr12), 90, 90, descr=<SignedCallDescr>)
                +  guard_no_exception(, descr=<Guard3>) [i14, p0]
                +  i16 = int_and(i14, -9223372036854775808)
                +  i17 = int_is_true(i16)
                +  guard_false(i17, descr=<Guard4>) [i14, p0]
                +  i19 = call(ConstClass(ll_get_value__dicttablePtr_Signed), ConstPtr(ptr12), i14, descr=<SignedCallDescr>)
                +  guard_no_exception(, descr=<Guard5>) [i19, p0]
                +  i21 = int_add(i19, 1)
                +  i23 = int_lt(i21, 114)
                +  guard_true(i23, descr=<Guard6>) [i21, p0]
                +  guard_value(i21, 86, descr=<Guard7>) [i21, p0]
                +  debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)
                +  jump(p0, p1, i2, i3, descr=<Loop0>)
                +  [3c167c92bc6a15] jit-log-opt-loop}
                +
                +
                +

                I've trimmed the debug_merge_point lines a bit; they were really long.

                +

                So let's see what this does. This trace takes 4 parameters: 2 object pointers +(p0 and p1) and 2 integers (i2 and i3). Looking at the debug lines, it seems to +be tracing one iteration of this loop: "[>+<-]"

                +

                It starts executing the first operation on line 4, a ">", but immediately +starts executing the next operation. The ">" had no instructions, and looks +like it was optimized out completely. This loop must always act on the same +part of the tape, so the tape pointer is constant for this trace. An explicit +advance operation is unnecessary.

                +

                Lines 5 to 8 are the instructions for the "+" operation. First it gets the +array item from the array in pointer p1 at index i2 (line 6), adds 1 to it and +stores it in i6 (line 7), and stores it back in the array (line 8).

                +

                Line 9 starts the "<" instruction, but it is another no-op. It seems that i2 +and i3, passed into this routine, are the two tape pointers used in this loop, +already calculated. We can also deduce that p1 is the tape array. It's not clear +what p0 is.

                +

                Lines 10 through 13 perform the "-" operation: get the array value (line 11), +subtract (line 12) and set the array value (line 13).

                +

                Next, on line 14, we come to the "]" operation. Lines 15 and 16 check whether +i9 is true (non-zero). Looking up, i9 is the array value that we just +decremented and stored, now being checked as the loop condition, as expected +(remember the definition of "]"). Line 16 is a guard: if the condition is not +met, execution jumps somewhere else, in this case to the routine called +<Guard2>, which is passed one parameter: p0.

                +

                Assuming we pass the guard, lines 17 through 23 are doing the dictionary lookup +to bracket_map to find where the program counter should jump to. I'm not too +familiar with what the instructions are actually doing, but it looks like there +are two external calls and 3 guards. This seems quite expensive, especially +since we know bracket_map will never change (PyPy doesn't know that). We'll +see below how to optimize this.

                +

                Line 24 increments the newly acquired instruction pointer. Lines 25 and 26 make +sure it's less than the program's length.

                +

                Additionally, line 27 guards that i21, the incremented instruction pointer, is +exactly 86. This is because it's about to jump to the beginning (line 29) and +the instruction pointer being 86 is a precondition to this block.

                +

                Finally, the loop closes up at line 28 so the JIT can jump to loop body <Loop0> +to handle that case (line 29), which is the beginning of the loop again. It +passes in parameters (p0, p1, i2, i3).

                +
                +
                +

                Optimizing

                +

                As mentioned, every loop iteration does a dictionary lookup to find the +corresponding matching bracket for the final jump. This is terribly +inefficient: the jump target is not going to change from one loop iteration to the next. +This information is constant and should be compiled in as such.

                +

                The problem is that the lookups are coming from a dictionary, and PyPy is +treating it as opaque. It doesn't know the dictionary isn't being modified or +isn't going to return something different on each query.

                +

                What we need to do is provide another hint to the translation to say that the +dictionary query is a pure function, that is, its output depends only on its +inputs and the same inputs should always return the same output.

                +

                To do this, we use a provided function decorator pypy.rlib.jit.purefunction, +and wrap the dictionary call in a decorated function:

                +
                @purefunction
                +def get_matching_bracket(bracket_map, pc):
                +    return bracket_map[pc]
                +
                +

                This version can be found at example5.py

                +

                Translate again with the JIT option and observe the speedup. Mandelbrot now +only takes 6 seconds! (from 12 seconds before this optimization)

                +

                Let's take a look at the trace from the same function:

                +
                [3c29fad7b792b0] {jit-log-opt-loop
                +# Loop 0 : loop with 15 ops
                +[p0, p1, i2, i3]
                +debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)
                +debug_merge_point('+<[>[>_+_<-]>.[<+>-]<<-]++++++++++.', 0)
                +i4 = getarrayitem_gc(p1, i2, descr=<SignedArrayDescr>)
                +i6 = int_add(i4, 1)
                +setarrayitem_gc(p1, i2, i6, descr=<SignedArrayDescr>)
                +debug_merge_point('+<[>[>+_<_-]>.[<+>-]<<-]++++++++++.', 0)
                +debug_merge_point('+<[>[>+<_-_]>.[<+>-]<<-]++++++++++.', 0)
                +i7 = getarrayitem_gc(p1, i3, descr=<SignedArrayDescr>)
                +i9 = int_sub(i7, 1)
                +setarrayitem_gc(p1, i3, i9, descr=<SignedArrayDescr>)
                +debug_merge_point('+<[>[>+<-_]_>.[<+>-]<<-]++++++++++.', 0)
                +i10 = int_is_true(i9)
                +guard_true(i10, descr=<Guard2>) [p0]
                +debug_merge_point('+<[>[_>_+<-]>.[<+>-]<<-]++++++++++.', 0)
                +jump(p0, p1, i2, i3, descr=<Loop0>)
                +[3c29fad7ba32ec] jit-log-opt-loop}
                +
                +

                Much better! Each loop iteration is an add, a subtract, two array loads, two +array stores, and a guard on the exit condition. That's it! This code doesn't +require any program counter manipulation.

                +

                I'm no expert on optimizations; this tip was suggested by Armin Rigo on the +pypy-dev list. Carl Friedrich has a series of posts on how to optimize your +interpreter that are also very useful: https://bit.ly/bundles/cfbolz/1

                +
                +
                +

                Final Words

                +

                I hope this has shown some of you what PyPy is all about other than a faster +implementation of Python.

                +

                For those that would like to know more about how the process works, there are +several academic papers explaining the process in detail that I recommend. In +particular: Tracing the Meta-Level: PyPy's Tracing JIT Compiler.

                +

                See https://readthedocs.org/docs/pypy/en/latest/extradoc.html

                +
                +
                +

                Comments

                +
                +
                +
                + + Winston Ewert wrote on 2011-04-06 21:59: +
                +
                +

                Some interpreters are written to evaluate directly from the AST. i.e. they never generate bytecode, instead each node in the ast simply has the code to execute it as a "virtual" function. Could PyPy JIT such an interpreter? Or does it essentially assume a bytecode based interpreter?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-07 05:56: +
                +
                +

                In theory it should be able to, if it's written in RPython. Perhaps it would be harder to place the hints for the jit engine?

                As far as I understand it, it still traces some kind of bytecode (generated from the RPython code), but uses the can_enter_jit hints to determine what to trace and the length of a trace.

                If it'll be fast is another question though. Why not give it a try? (E.g. one could implement the LLVM kaleidoscope language in RPython.)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-04-07 06:05: +
                +
                +

                @Winston in theory nothing prevents JIT from working on AST-based interpreters. In practice however, it would require a bit of engineering to convince the JIT that the green (constant) argument is a complex object structure. That's however just engineering

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-04-07 09:24: +
                +
                +

                It's actually not a problem at all to have an AST-based interpreter. In fact, the Prolog uses "ASTs" (Prolog is homoiconic, so the ASTs are just Prologs normal data structures).

                Maciej: that's not a problem if your ASTs are actually immutable. If they aren't you have a problem which indeed requires some engineering.

                +
                +
                +
                +
                + + Quiz wrote on 2011-04-07 10:45: +
                +
                +

                The effect of the loop "[>+<-]" is

                tape[position+1] += tape[position]
                tape[position] = 0

                We saw that PyPy can optimize the program counter away in this loop--but this loop could be executed in constant time. Will PyPy ever be able to optimize it to that degree?

                +
                +
                +
                +
                + + Winston Ewert wrote on 2011-04-10 01:53: +
                +
                +

                Well, you finally motivated me to give it a try. I optimized the BF example and managed to get some pretty nice speed boosts all without dipping into the low level (aside from reading the log)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-13 09:50: +
                +
                +

                Great article, man! Many thanks and keep on rocking!

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-07 08:47: +
                +
                +

                Great tutorial, but where can I find the 'test.b' file (mentioned for the tracing JIT) for a try?

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-22 10:50: +
                +
                +

                hi guys. can jit merge points not be put inside methods? Going off example3.py, if I take the body of the while loop and move it into a method of the Tape class (along with the jitdriver), all the speed gains go away. can anyone explain why this happens? Thanks!

                +
                +
                +
                +
                + + Sarah Mount wrote on 2016-07-30 23:12: +
                +
                +

                BTW the link to https://bit.ly/bundles/cfbolz/1 has bit-rotted.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/04/tutorial-writing-interpreter-with-pypy-3785910476193156295.html b/posts/2011/04/tutorial-writing-interpreter-with-pypy-3785910476193156295.html new file mode 100644 index 000000000..fbb198f53 --- /dev/null +++ b/posts/2011/04/tutorial-writing-interpreter-with-pypy-3785910476193156295.html @@ -0,0 +1,719 @@ + + + + + +Tutorial: Writing an Interpreter with PyPy, Part 1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Tutorial: Writing an Interpreter with PyPy, Part 1

                + + + +
                +

                This is a guest blog post written by Andrew Brown, with help from the PyPy developers +on the pypy-dev mailing list.

                +

                This tutorial's master copy and supporting files live at +https://bitbucket.org/brownan/pypy-tutorial/

                +
                +

                When I first learned about the PyPy project, it took me a while to figure out +exactly what it was about. For those that don't already know, it's two things:

                +
                  +
                • A set of tools for implementing interpreters for interpreted languages
                • +
                • An implementation of Python using this toolchain
                • +
                +

                The second part is probably what most people think PyPy is, but this tutorial +is not about their Python interpreter. It is about writing your own +interpreter for your own language.

                +

                This is the project I undertook to help myself better understand how PyPy works +and what it's all about.

                +

                This tutorial assumes you know very little about PyPy, how it works, and even +what it's all about. I'm starting from the very beginning here.

                +
                +

                What PyPy Does

                +

                Here's a brief overview of what PyPy can do. Let's say you want to write an +interpreted language. This involves writing some kind of source code parser, a +bytecode interpretation loop, and lots of standard library code.

                +

                That's quite a bit of work for moderately complicated languages, and there's a +lot of low level work involved. Writing the parser and compiler code usually +isn't fun; that's why there are tools out there to generate parsers and +compilers for you.

                +

                Even then, you still must worry about memory management in your interpreter, +and you're going to be re-implementing a lot if you want data types like +arbitrary precision integers, nice general hash tables, and such. It's enough +to put someone off from implementing their idea for a language.

                +

                Wouldn't it be nice if you could write your language in an existing high level +language like, for example, Python? That sure would be ideal: you'd get all the +advantages of a high level language like automatic memory management and rich +data types at your disposal. Oh, but an interpreted language interpreting +another language would be slow, right? That's twice as much interpreting going +on.

                +

                As you may have guessed, PyPy solves this problem. PyPy is a sophisticated +toolchain for analyzing and translating your interpreter code to C code (or JVM +or CLI). This process is called "translation", and it knows how to translate +quite a lot of Python's syntax and standard libraries, but not everything. All +you have to do is write your interpreter in RPython, a subset of the Python +language carefully defined to allow this kind of analysis and translation, and +PyPy will produce for you a very efficient interpreter.

                +

                Because efficient interpreters should not be hard to write.

                +
                +
                +

                The Language

                +

                The language I've chosen to implement is dead simple. The language runtime +consists of a tape of integers, all initialized to zero, and a single pointer +to one of the tape's cells. The language has 8 commands, described here:

                +
                +
                >
                +
                Moves the tape pointer one cell to the right
                +
                +
                +
                <
                +
                Moves the tape pointer one cell to the left
                +
                +
                +
                Increments the value of the cell underneath the pointer
                +
                -
                +
                Decrements the value of the cell underneath the pointer
                +
                +
                +
                [
                +
                If the cell under the current pointer is 0, skip to the instruction after +the matching ]
                +
                +
                +
                ]
                +
                Skip back to the matching [ (evaluating its condition)
                +
                +
                +
                .
                +
                Print out a single byte to stdout from the cell under the pointer
                +
                +
                +
                ,
                +
                Read in a single byte from stdin to the cell under the pointer
                +
                +

                Any unrecognized bytes are ignored.

                +

                Some of you may recognize this language. I will be referring to it as BF.

                +

                One thing to notice is that the language is its own bytecode; there is no +translation from source code to bytecode. This means that the language can be +interpreted directly: the main eval loop of our interpreter will operate right +on the source code. This simplifies the implementation quite a bit.

                +
                +
                +

                First Steps

                +

                Let's start out by writing a BF interpreter in plain old Python. The first step +is sketching out an eval loop:

                +
                def mainloop(program):
                +    tape = Tape()
                +    pc = 0
                +    while pc < len(program):
                +        code = program[pc]
                +
                +        if code == ">":
                +            tape.advance()
                +        elif code == "<":
                +            tape.devance()
                +        elif code == "+":
                +            tape.inc()
                +        elif code == "-":
                +            tape.dec()
                +        elif code == ".":
                +            sys.stdout.write(chr(tape.get()))
                +        elif code == ",":
                +            tape.set(ord(sys.stdin.read(1)))
                +        elif code == "[" and value() == 0:
                +            # Skip forward to the matching ]
                +        elif code == "]" and value() != 0:
                +            # Skip back to the matching [
                +
                +        pc += 1
                +
                +

                As you can see, a program counter (pc) holds the current instruction index. The +first statement in the loop gets the instruction to execute, and then a +compound if statement decides how to execute that instruction.

                +

                The implementations of [ and ] are left out here, but they should change the +program counter to the value of the matching bracket. (The pc then gets +incremented, so the condition is evaluated once when entering a loop, and once +at the end of each iteration.)

                +

                Here's the implementation of the Tape class, which holds the tape's values as +well as the tape pointer:

                +
                class Tape(object):
                +    def __init__(self):
                +        self.thetape = [0]
                +        self.position = 0
                +
                +    def get(self):
                +        return self.thetape[self.position]
                +    def set(self, val):
                +        self.thetape[self.position] = val
                +    def inc(self):
                +        self.thetape[self.position] += 1
                +    def dec(self):
                +        self.thetape[self.position] -= 1
                +    def advance(self):
                +        self.position += 1
                +        if len(self.thetape) <= self.position:
                +            self.thetape.append(0)
                +    def devance(self):
                +        self.position -= 1
                +
                +

                As you can see, the tape expands as needed to the right, indefinitely. We +should really add some error checking to make sure the pointer doesn't go +negative, but I'm not worrying about that now.

                +

                Except for the omission of the "[" and "]" implementation, this code will work +fine. However, if the program has a lot of comments, it will have to skip over +them one byte at a time at runtime. So let's parse those out once and for all.

                +

                At the same time, we'll build a dictionary mapping between brackets, so that +finding a matching bracket is just a single dictionary lookup. Here's how:

                +
                def parse(program):
                +    parsed = []
                +    bracket_map = {}
                +    leftstack = []
                +
                +    pc = 0
                +    for char in program:
                +        if char in ('[', ']', '<', '>', '+', '-', ',', '.'):
                +            parsed.append(char)
                +
                +            if char == '[':
                +                leftstack.append(pc)
                +            elif char == ']':
                +                left = leftstack.pop()
                +                right = pc
                +                bracket_map[left] = right
                +                bracket_map[right] = left
                +            pc += 1
                +
                +    return "".join(parsed), bracket_map
                +
                +

                This returns a string with all invalid instructions removed, and a dictionary +mapping bracket indexes to their matching bracket index.

                +

                All we need is some glue code and we have a working BF interpreter:

                +
                def run(input):
                +    program, map = parse(input.read())
                +    mainloop(program, map)
                +
                +if __name__ == "__main__":
                +    import sys
                +    run(open(sys.argv[1], 'r'))
                +
                +

                If you're following along at home, you'll also need to change the signature of +mainloop() and implement the bracket branches of the if statement. Here's the +complete example: example1.py

                +

                At this point you can try it out to see that it works by running the +interpreter under python, but be warned, it will be very slow on the more +complex examples:

                +
                +$ python example1.py 99bottles.b
                +
                +

                You can find 99bottles.b, mandel.b, and several other example programs (not written by me) in +my repository.

                +
                +
                +

                PyPy Translation

                +

                But this is not about writing a BF interpreter, this is about PyPy. So what +does it take to get PyPy to translate this into a super-fast executable?

                +

                As a side note, there are some simple examples in the pypy/translator/goal +directory of the PyPy source tree that are helpful here. My starting point for +learning this was the example "targetnopstandalone.py", a simple hello world +for PyPy.

                +

                For our example, the module must define a name called "target" which returns the +entry point. The translation process imports your module and looks for that +name, calls it, and the function object returned is where it starts the +translation.

                +
                def run(fp):
                +    program_contents = ""
                +    while True:
                +        read = os.read(fp, 4096)
                +        if len(read) == 0:
                +            break
                +        program_contents += read
                +    os.close(fp)
                +    program, bm = parse(program_contents)
                +    mainloop(program, bm)
                +
                +def entry_point(argv):
                +    try:
                +        filename = argv[1]
                +    except IndexError:
                +        print "You must supply a filename"
                +        return 1
                +
                +    run(os.open(filename, os.O_RDONLY, 0777))
                +    return 0
                +
                +def target(*args):
                +    return entry_point, None
                +
                +if __name__ == "__main__":
                +    entry_point(sys.argv)
                +
                +

                The entry_point function is passed the command line arguments when you run the +resulting executable.

                +

                A few other things have changed here too. See the next section...

                +
                +
                +

                About RPython

                +

                Let's talk a bit about RPython at this point. PyPy can't translate arbitrary +Python code because Python is a bit too dynamic. There are restrictions on what +standard library functions and what syntax constructs one can use. I won't be +going over all the restrictions, but for more information see +https://readthedocs.org/docs/pypy/en/latest/coding-guide.html#restricted-python

                +

                In the example above, you'll see a few things have changed. I'm now using low +level file descriptors with os.open and os.read instead of file objects. +The implementations of "." and "," are similarly tweaked (not shown above). +Those are the only changes needed for this code; the rest is simple enough for +PyPy to digest.

                +

                That wasn't so hard, was it? I still get to use dictionaries, expandable lists, +and even classes and objects! And if low level file descriptors are too low for +you, there are some helpful abstractions in the rlib.streamio module included +with PyPy's "RPython standard library."

                +

                For the example thus far, see example2.py

                +
                +
                +

                Translating

                +

                If you haven't already, check out the latest version of PyPy from +their bitbucket.org repository:

                +
                +$ hg clone https://bitbucket.org/pypy/pypy
                +
                +

                (A recent revision is necessary because of a bugfix that makes my example +possible)

                +

                The script to run is in "pypy/translator/goal/translate.py". Run this script, +passing in our example module as an argument.

                +

                [A note added much later: this script has been moved to "rpython/bin/rpython".]

                +
                +$ python ./pypy/pypy/translator/goal/translate.py example2.py
                +
                +

                (You can use PyPy's python interpreter for extra speed, but it's not necessary)

                +

                PyPy will churn for a bit, drawing some nice looking fractals to your console +while it works. It takes around 20 seconds on my machine.

                +

                The result from this is an executable binary that interprets BF programs. +Included in my repository are some example BF programs, including a mandelbrot +fractal generator, which takes about 45 seconds to run on my computer. Try it +out:

                +
                +$ ./example2-c mandel.b
                +
                +

                Compare this to running the interpreter un-translated on top of python:

                +
                +$ python example2.py mandel.b
                +
                +

                Takes forever, doesn't it?

                +

                So there you have it. We've successfully written our own interpreter in RPython +and translated it with the PyPy toolchain.

                +
                +

                (more in the next blog post...)

                +
                +
                +

                Comments

                +
                +
                +
                + + Dunk wrote on 2011-04-05 14:10: +
                +
                +

                nice post!

                +
                +
                +
                +
                + + DaNmarner wrote on 2011-04-05 16:35: +
                +
                +

                Hmmmmmm, yum.

                I'm going to translate this into Chinese, if you don't mind?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-05 16:56: +
                +
                +

                "devance"? I think you meant "retract".

                +
                +
                +
                +
                + + Paul Smith wrote on 2011-04-06 04:09: +
                +
                +

                On my Ubuntu 10.10 laptop, the PyPy BF interpreter ran hanoi in ~20 sec and mandel in ~40 sec. By comparison, the beef BF interpreter (written in C) ran these in ~10 and ~20 sec., respectively. Not too shabby, PyPy.

                +
                +
                +
                +
                + + Unknown wrote on 2011-04-06 10:22: +
                +
                +

                Nice article though I'm really missing a simple benchmark between the python interpreter and the pypy interpreter. "Takes forever" vs "45 seconds" isn't as awesome of a conclusion as I'd hoped for.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-06 14:52: +
                +
                +

                @temptemptemp13: I think you are missing something much more substantial. This article is not about Python at all. It is about how to use the PyPy toolchain to implement a different language - in this case the brainfuck programming language.

                While BF isn't a very useful language, it has the nice properties of being very small. Almost all of the language fits in a blog post.

                +
                +
                +
                +
                + + Unknown wrote on 2011-04-08 10:32: +
                +
                +

                Thanks. I've finally understood what PyPy is.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-04-12 18:24: +
                +
                +

                I like how this article became family-friendly by actually avoiding calling BF by its name :-)

                +
                +
                +
                +
                + + Davide wrote on 2011-04-15 03:52: +
                +
                +

                Amazing! Thanks for posting. I was wondering, what's about a pure C or C++ implementations, as close as reasonable to the python one? So I wrote them. You can read more details here, but the bottom line is that PyPy is (marginally) faster than C++, and (marginally) slower than C :-O

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-04-15 07:53: +
                +
                +

                @Davide: you should compare your C version against the PyPy version WITH the JIT, as explained here:

                https://morepypy.blogspot.com/2011/04/tutorial-part-2-adding-jit.html

                I bet that PyPy will easily win :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-12-12 01:15: +
                +
                +

                Nice post. I just want to report that I tried running

                /usr/share/pypy-1.6/pypy/translator/goal/translate.py example2.py

                and got the following error.
                This is with an Ubuntu 1.7 pypy package rebuilt on Debian squeeze (the 1.6 is a typo, it should be 1.7).

                [translation:ERROR] Error:
                [translation:ERROR] Traceback (most recent call last):
                [translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/goal/translate.py", line 308, in main
                [translation:ERROR] drv.proceed(goals)
                [translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/driver.py", line 809, in proceed
                [translation:ERROR] return self._execute(goals, task_skip = self._maybe_skip())
                [translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/tool/taskengine.py", line 116, in _execute
                [translation:ERROR] res = self._do(goal, taskcallable, *args, **kwds)
                [translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/driver.py", line 286, in _do
                [translation:ERROR] res = func()
                [translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/driver.py", line 441, in task_backendopt_lltype
                [translation:ERROR] from pypy.translator.backendopt.all import backend_optimizations
                [translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/backendopt/all.py", line 2, in
                [translation:ERROR] from pypy.translator.backendopt import removenoops
                [translation:ERROR] File "/usr/share/pypy-1.6/pypy/translator/backendopt/removenoops.py", line 5, in
                [translation:ERROR] from pypy import conftest
                [translation:ERROR] File "/usr/share/pypy-1.6/pypy/conftest.py", line 1, in
                [translation:ERROR] import py, pytest, sys, os, textwrap, types
                [translation:ERROR] ImportError: No module named pytest
                [translation] start debugger...
                > /usr/share/pypy-1.6/pypy/conftest.py(1)()
                -> import py, pytest, sys, os, textwrap, types
                (Pdb+)

                So, it looks like pytest needs to be installed. This does not appear to be available as a Debian package.

                Regards, Faheem Mitha
                (faheem at faheem dot info)

                +
                +
                +
                +
                + + James Mills wrote on 2013-02-14 05:44: +
                +
                +

                This is a great post for anyone interested in programming languages :) Great post!

                +
                +
                +
                +
                + + ℭacilhας, ℒa ℬatalema wrote on 2013-02-23 02:12: +
                +
                +

                Now, with os.read() and os.write():

                [translation:ERROR] Error:
                [translation:ERROR] Traceback (most recent call last):
                [translation:ERROR] File "/opt/local/lib/pypy/src/pypy-pypy-07e08e9c885c/pypy/translator/goal/translate.py", line 303, in main
                [translation:ERROR] drv.proceed(goals)
                [translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/translator/driver.py", line 771, in proceed
                [translation:ERROR] return self._execute(goals, task_skip = self._maybe_skip())
                [translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/translator/tool/taskengine.py", line 116, in _execute
                [translation:ERROR] res = self._do(goal, taskcallable, *args, **kwds)
                [translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/translator/driver.py", line 283, in _do
                [translation:ERROR] res = func()
                [translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/translator/driver.py", line 319, in task_annotate
                [translation:ERROR] s = annotator.build_types(self.entry_point, self.inputtypes)
                [translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/annotation/annrpython.py", line 89, in build_types
                [translation:ERROR] return self.build_graph_types(flowgraph, inputcells, complete_now=complete_now)
                [translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/annotation/annrpython.py", line 142, in build_graph_types
                [translation:ERROR] self.complete()
                [translation:ERROR] File "/opt/local/lib/pypy-2.0-b1/src/pypy-pypy-07e08e9c885c/pypy/annotation/annrpython.py", line 217, in complete
                [translation:ERROR] raise AnnotatorError(text)
                [translation:ERROR] AnnotatorError: -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
                [translation:ERROR] Blocked block -- operation cannot succeed
                [translation:ERROR]
                [translation:ERROR] v1 = ord(v0)
                [translation:ERROR] In :
                [translation:ERROR] Happened at file /Users/cacilhas/Workspace/Personal/brainfuck/src/brainfuck/parser.py line 29
                [translation:ERROR]
                [translation:ERROR] ==> tape.set(ord(os.read(0, 1)))
                [translation:ERROR]
                [translation:ERROR] Known variable annotations:
                [translation:ERROR] v0 = SomeString(can_be_None=True)

                +
                +
                +
                +
                + + Dvd Fo wrote on 2013-08-26 12:25: +
                +
                +

                I think that your "," implementation is incorrect, os.read returns an empty string on EOF, thus [0] triggers an exception.
                According to Wikipedia, setting the cell to 0, -1 or leaving the cell unchanged each may be used to tell EOF apart from other characters.

                +
                +
                +
                +
                + + James wrote on 2015-12-02 05:50: +
                +
                +

                I followed this tutorial again several years later :) (just for fun) using the newly published rpython toolchain now available up on PyPi. You can now just: pip install rpython -- I also wanted to point out that recent versions of the RPython toolchain have made advances in what it can translate it seems; specifically I did not need to change the open(...).read() parts to lower level os.read() calls.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/04/using-tkinter-and-idle-with-pypy-6156563216925585965.html b/posts/2011/04/using-tkinter-and-idle-with-pypy-6156563216925585965.html new file mode 100644 index 000000000..c645f2fc0 --- /dev/null +++ b/posts/2011/04/using-tkinter-and-idle-with-pypy-6156563216925585965.html @@ -0,0 +1,461 @@ + + + + + +Using Tkinter and IDLE with PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Using Tkinter and IDLE with PyPy

                + + + +
                +

                We are pleased to announce that Tkinter, the GUI library based on TCL/TK, now +works with PyPy.
                +Tkinter is composed of two parts:

                +
                +
                  +
                • +_tkinter, a module written in C which interfaces with the TCL world
                • +
                • +Tkinter, a pure Python package which wraps _tkinter to expose the +pythonic API we are used to
                • +
                +
                +
                +
                +
                + +
                +The PyPy version of _tkinter reuses the C code found in CPython and +compiles it through the PyPy C-API compatibility layer, cpyext. To make it +work with PyPy, we had to modify it slightly, in order to remove the +dependency on some API functions which are not supported by PyPy. In particular, we +removed the dependency on the PyOS_InputHook variable, which allows a nice +integration of Tkinter and the Python interactive prompt: the result is that, +unlike CPython, in PyPy Tk windows created at the interactive prompt are not +shown until we manually call the mainloop method. Apart from this +inconvenience, all the rest works fine.
                +At the moment, _tkinter is not distributed with PyPy because our build +system does not support automatic compilation of C extensions. Instead, it is +necessary to install it manually, either directly from source or by +easy_installing/pip installing tkinter-pypy from PyPI.
                +For everything to work correctly, you need a recent build of PyPy: the +following is a step-by-step guide to install _tkinter in a PyPy nightly +build for Linux 64 bit; for other architectures, look at the nightly build +page:
                $ wget https://buildbot.pypy.org/nightly/trunk/pypy-c-jit-43485-1615dfd7d8f1-linux64.tar.bz2
                +
                +$ tar xfv pypy-c-jit-43485-1615dfd7d8f1-linux64.tar.bz2
                +
                +$ cd pypy-c-jit-43485-1615dfd7d8f1-linux64/
                +
                +$ wget https://peak.telecommunity.com/dist/ez_setup.py
                +
                +$ ./bin/pypy ez_setup.py    # install setuptools
                +
                +$ ./bin/easy_install tkinter-pypy
                +
                +Once you complete the steps above, you can start using Tkinter from your +python programs. In particular, you can use IDLE, the IDE which is part of +the Python standard library. To start IDLE, type:
                $ ./bin/pypy -m idlelib.idle
                +
                +Have fun :-) +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2011-04-20 15:09: +
                +
                +

                It is sooo ancient. I'd think twice before bundling anything potentially exploitable (read - compiled C modules) with PyPy.

                +
                +
                +
                +
                + + RonnyPfannschmidt wrote on 2011-04-20 22:59: +
                +
                +

                i fail to see how this is more exploitable than say ctypes (which is already shipped)

                +
                +
                +
                +
                + + Brandon Corfman wrote on 2011-04-22 17:01: +
                +
                +

                I'm really REALLY happy about this ... Tkinter, multiprocessing, and 2.7 support were my remaining roadblocks to using PyPy. I'm d/l now to give it a try with Raven Checkers. I hope that I won't need to look back.

                +
                +
                +
                +
                + + Joaquin Abian wrote on 2011-05-13 20:41: +
                +
                +

                I tried to install tkinter on win 7. When I do pypy ez_setup.py I get a traceback that finish with:

                File "ez_setup.py", line 212, in main
                from setuptools.command.easy_install import main
                ZipImportError: 'setuptools.command.install'

                Some hint on how to solve it?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-05-18 15:13: +
                +
                +

                @Joaquin:
                indeed, ez_setup seems not to work on windows. It might be related to this, although I did not investigate further:
                https://bugs.pypy.org/issue725

                Instead of ez_setup, you can try to follow these instructions and install distribute/pip, which we recommend anyway nowadays:
                https://doc.pypy.org/en/latest/getting-started.html#installing-pypy

                Note however that tkinter-pypy is not precompiled for windows, so you need to have the necessary developer tools installed. If you manage to build a precompiled binary of tkinter-pypy, I'd be happy to put it in pypi :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-24 16:52: +
                +
                +

                Seems that tcl8.4-dev and tk8.4-dev needs to be installed!
                This should be insert into the "install instruction" ;)

                +
                +
                +
                +
                + + Daniel Petti wrote on 2012-05-29 19:01: +
                +
                +

                What does "command 'cc' failed with error 1" mean? I keep getting that upon installing tkinter-pypy

                +
                +
                +
                +
                + + Anonymous wrote on 2012-10-22 17:27: +
                +
                +

                I'm unable to compile it on Windows (MinGW and also tried with VS 2010). Getting the following error:

                fatal error: tcl.h: No such file or directory

                My TCL installed under a different directory. How can I point the compiler to use tcl.h file from that directory?

                +
                +
                +
                +
                + + Rich Wandell wrote on 2013-05-03 14:47: +
                +
                +

                I am having an incredible amount of problems attempting to build tkinter for pypy on windows. Is there anywhere I can download a pre built version?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-10-28 18:14: +
                +
                +

                This is outdated. But how to use Tkinter currently under windows?

                +
                +
                +
                +
                + + Unknown wrote on 2014-02-02 11:18: +
                +
                +

                I think I've managed to compile Tkinter for Windows. Could anyone interested please try it out? Just download this archive and extract it into your Pypy folder:
                https://dl-web.dropbox.com/get/Public/Tkinter%20for%20Windows.zip?_subject_uid=29914669&w=AACPaRHDWsfcxafgdXsHV405wJNIsKrYzRXZMHwIKPuiNA&dl=1

                +
                +
                +
                +
                + + Luis wrote on 2014-05-11 22:35: +
                +
                +

                XJDHDR: The link is not working. Do you still have the file available to download?

                +
                +
                +
                +
                + + Unknown wrote on 2014-05-12 17:27: +
                +
                +

                @Luis
                The file is still available. Try this link:
                https://dl.dropboxusercontent.com/u/29914669/Tkinter%20for%20Windows.zip

                Dropbox must have changed something on their end.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/05/numpy-follow-up-6928627691060102514.html b/posts/2011/05/numpy-follow-up-6928627691060102514.html new file mode 100644 index 000000000..18d4aff5e --- /dev/null +++ b/posts/2011/05/numpy-follow-up-6928627691060102514.html @@ -0,0 +1,551 @@ + + + + + +NumPy Follow up | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy Follow up

                + + + +
                +

                Hi everyone. Since yesterday's blog post we got a ton of feedback, so we want +to clarify a few things, as well as share some of the progress we've made, in +only the 24 hours since the post.

                +

                Reusing the original NumPy

                +

                First, a lot of people have asked why we cannot just reuse the original NumPy +through cpyext, our CPython C-API compatibility layer. We believe this is +not the best approach, for a few reasons:

                +
                +
                  +
                1. +cpyext is slow, and always will be slow. It has to emulate far too many +details of the CPython object model that don't exist on PyPy (e.g., +reference counting). Since people are using NumPy primarily for speed this +would mean that even if we could have a working NumPy, no one would want to +use it. Also, as soon as the execution crosses the cpyext boundary, it +becomes invisible to the JIT, which means the JIT has to assume the worst +and deoptimize stuff away.
                2. +
                3. NumPy uses many obscure documented and undocumented details of the CPython +C-API. Emulating these is often difficult or impossible (e.g. we can't fix +accessing a struct field, as there's no function call for us to intercept).
                4. +
                5. It's not much fun. Frankly, working on cpyext, debugging the crashes, +and everything else that goes with it is not terribly fun, especially when +you know that the end result will be slow. We've demonstrated we can build +a much faster NumPy, in a way that's more fun, and given that the people +working on this are volunteers, it's important to keep us motivated.
                6. +
                +
                +

                Finally, we are not proposing to rewrite the entirety of NumPy or, god +forbid, BLAS, or any of the low level stuff that operates on C-level arrays, +only the parts that interface with Python code directly.

                +

                C bindings vs. CPython C-API

                +

                There are two issues on C code, one has a very nice story, and the other not so +much. First is the case of arbitrary C-code that isn't Python related, things +like libsqlite, libbz2, or any random C shared library on your system. +PyPy will quite happily call into these, and bindings can be developed either +at the RPython level (using rffi) or in pure Python, using ctypes. +Writing bindings with ctypes has the advantage that they can run on every +alternative Python implementation, such as Jython and IronPython. Moreover, +once we merge the jitypes2 branch ctypes calls will even be smoking +fast.

                +

                On the other hand there is the CPython C-extension API. This is a very specific +API which CPython exposes, and PyPy tries to emulate. It will never be fast, +because there is far too much overhead in all the emulation that needs to be +done.

                +

                One of the reasons people write C extensions is for speed. Often, with PyPy +you can just forget about C, write everything in pure python and let the JIT +do its magic.

                +

                In case the PyPy JIT alone isn't fast enough, or you just want to +use existing C code then it might make sense to split +your C-extension into 2 parts, one which doesn't touch the CPython C-API and +thus can be loaded with ctypes and called from PyPy, and another which does +the interfacing with Python for CPython (where it will be faster).

                +

                There are also libraries written in C to interface with existing C codebases, +but for which performance is not the main goal. For these the right solution +is to try using CPyExt, and if it works that's great, but if it fails the +solution will be to rewrite using ctypes, where it will work on all Python +VMs, not just CPython.

                +

                And finally there are rare cases where rewriting in RPython makes more sense. +NumPy is one of the few examples of these because we need to be able to give +the JIT hints on how to appropriately vectorize all of the operations on an +array. In general writing in RPython is not necessary for almost any +libraries; NumPy is something of a special case because it is so ubiquitous +that every ounce of speed is valuable, and the way people use it leads to +code structure where the JIT benefits enormously from extra hints and the +ability to manipulate memory directly, which is not possible from Python.

                +

                Progress

                +

                On a more positive note, after we published the last post, several new people +came and contributed improvements to the numpy-exp branch. We would like to +thank all of them:

                +
                +
                  +
                • nightless_night contributed: An implementation of __len__, fixed bounds +checks on __getitem__ and __setitem__.
                • +
                • brentp contributed: Subtraction and division on NumPy arrays.
                • +
                • MostAwesomeDude contributed: Multiplication on NumPy arrays.
                • +
                • hodgestar contributed: Binary operations between floats and NumPy arrays.
                • +
                +
                +

                Those last two were technically an outstanding branch we finally merged, but +hopefully you get the picture. In addition there was some exciting work done by +regular PyPy contributors. I hope it's clear that there's a place to jump in +for people with any level of PyPy familiarity. If you're interested in +contributing please stop by #pypy on irc.freenode.net, the pypy-dev mailing +list, or send us pull requests on bitbucket.

                +

                Alex

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-05-05 23:14: +
                +
                +

                How does this suggestion to use ctypes to interface with external C modules square with the python-dev antipathy towards doing that?

                "Given the choice of using either ctypes or an external package, I prefer the external package." Martin v. Löwis

                "If it means using ctypes to interface with system C libraries, I'm -10 on it :)" Antoine Pitrou

                +
                +
                +
                +
                + + Alex wrote on 2011-05-05 23:19: +
                +
                +

                I don't know what to say for them, besides they apparently don't hate it so much as to remove it from the stdlib :)

                +
                +
                +
                +
                + + Michael Foord wrote on 2011-05-06 00:08: +
                +
                +

                Isn't there another fairly major drawback to implementing in RPython - that you can only use it if it is compiled (translated) at the same time as pypy. So effectively pypy *has* to be distributed with all the RPython extensions you will ever use, or you have to retranslate *everything* whenever you add a new extension.

                Developing cross-platform, cross-architecture, stuff with ctypes can also be a lot more painful than writing extensions using the Python C API (and having the compiler make some decisions at compile time rather than having to do it all at runtime).

                +
                +
                +
                +
                + + Robert Kern wrote on 2011-05-06 04:54: +
                +
                +

                Most of python-dev's "antipathy" towards using ctypes is focused on using ctypes for stdlib modules, not on general principles. For security, stability, and portability reasons, many platforms need to disable ctypes when they build Python. Consequently, there is a policy that no stdlib module can use ctypes. They are not recommending against using ctypes in general.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-06 05:19: +
                +
                +

                One major worry is how well you will end up tracking NumPy development. Will you evenutally add an "RPython glue" subdir to NumPy's distribution?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-06 05:59: +
                +
                +

                thanks for the follow-up. I won't argue with points 1 and 3, but I think 2 can be reasonably addressed: I don't think the usage of internal details is pervasive in the code, and most of it is for historical reasons. We cannot remove them altogether from the numpy headers for backward compatibility reasons, but we can replace most of it inside numpy itself.

                I am still a bit confused though: from your description, it seems that you intend to fork numpy to replace some pieces from C to RPython, but if I look at the numpy-ext branch, I see a rewrite of numpy in rpython. Maybe you are talking about another code ?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-06 08:22: +
                +
                +

                I think that the most important part of numpy is array operations (indexing, +-*/, broadcasting, etc). So it would be good enough to implement only array class in RPython and call to numpy using ctypes/cpyext for all other stuff. I've read somewhere about the plans to impose separation between numpy and scipy so numpy holds only implementation of fast arrays and scipy will hold all non-trivial operations on them. IMHO such separation will be ideal for pypy too.

                +
                +
                +
                +
                + + Wladimir wrote on 2011-05-06 08:42: +
                +
                +

                Thanks for the clear explanation. I really wondered why it was so hard to re-use the existing numpy.

                +
                +
                +
                +
                + + Antoine P. wrote on 2011-05-06 15:02: +
                +
                +

                Thanks Robert for clarifying our position :)

                Another issue with ctypes is that it doesn't work on all systems.

                Yet another issue with ctypes is that it is currently unmaintained (which won't help fixing portability issues :-)).

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-06 17:26: +
                +
                +

                I am sory for the silly question, but how do I install this module in an existing pypy instalation ?

                Thanks for the great job !

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-06 21:15: +
                +
                +

                OK I see ...

                hg clone https://foss.heptapod.net/pypy/pypy/-/tree/branch/numpy-exp .....

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-07 03:49: +
                +
                +

                I like the idea of reimplementing part of Numpy in pypy to leverage the JIT in pypy. The existence of numexpr demonstrates the deficiency of Numpy as a Python library. A JIT is much more appropriate for what effectively should be a DSL.

                But I would recommend something grander, perhaps for the longer term. I think if pypy could produce do for Python what McVM and McJIT propose to do for Matlab, it would be game-changing for Python and pypy. It would make pypy not only competitive with Matlab in ways that Numpy and Scipy are not yet and may never be, but also with F#. The rapid uptake of F# in financial industry in particular, despite the availability of Matlab, showcases the need for a fast prototyping language that does not rely on calling Fortran code for speed. I know I am looking for such language; Numpy and Python simply don't offer enough power and flexibility. I hope I can choose pypy.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-11 00:31: +
                +
                +

                Any idea about an eta on merging the jitypes2 branch (and/or a little more info on what it does to speed ctypes up so much)?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-05-11 07:33: +
                +
                +

                @anonymous: the jitypes2 branch is mostly ready, but we are hunting two bugs and won't be merged until we fix them.

                The speedup comes from the fact that ctypes call are seen by the JIT, and directly compiled into a call to the corresponding C function. Thus, most of the overhead of ctypes itself is optimized away.

                +
                +
                +
                +
                + + Unknown wrote on 2011-05-11 19:51: +
                +
                +

                I wonder if an RPython/cython backend might be possible. cython is already my favorite way to write CExtensions and it generates code for both python 2.x and 3.x. It would be great if it could be adapted for PyPy extensions.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-12 18:51: +
                +
                +

                Hi!

                Thanks a lot for the previous post and the follow up! I really appreciate that you could find time to make a write up on the progress that you made so far on this extremely important feature.

                This all sounds very cool, but also to me it seems that it's very important to work with NumPy / SciPy developers, so that the parts that have to be replaced would be isolated and maintained in parallel for RPython and C API, or rewritten in ctypes (not sure if this is even possible). This way this eternal catch-up trap that many seem to be afraid of will not happen.

                Also, I wonder in how much money this would actually translate. Maybe Enthought could sponsor some development...

                Regarding Cython... I also use it to write trivial extensions to implement computation kernels outside Python in C. It would be great if Cython were able to generate something that would work with PyPy as well...

                Thanks!

                +
                +
                +
                +
                + + Laura Creighton wrote on 2011-05-13 17:55: +
                +
                +

                CLM:We actually have a GSoC student proposal from Romain Guillebert to
                investigate this idea.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-05-23 08:55: +
                +
                +

                @Anonymous the idea is that you should not use Cython at all and PyPy's JIT should handle the computational kernel just fine.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-26 11:18: +
                +
                +

                I don't know why do you decide to use ctypes - in numpy community it is considered as obsolete already for a long time (maybe several years), is not under active development, and now Cython is recommended by default tool for it:

                https://docs.scipy.org/doc/numpy/user/misc.html?highlight=cython#interfacing-to-c

                Also, I guess you could search for some volunteers to work on numpy-PYPY in numpy-user, scipy-user, scipy-dev mail lists.

                I'm interested in operations like hstack, vstack, max, min, argmin, nanmax, nanargmin (along a given axis) etc - are they already available? Or when it will be done?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/05/numpy-in-pypy-status-and-roadmap-8332894230779779992.html b/posts/2011/05/numpy-in-pypy-status-and-roadmap-8332894230779779992.html new file mode 100644 index 000000000..6d7a81698 --- /dev/null +++ b/posts/2011/05/numpy-in-pypy-status-and-roadmap-8332894230779779992.html @@ -0,0 +1,744 @@ + + + + + +Numpy in PyPy - status and roadmap | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Numpy in PyPy - status and roadmap

                + + + +
                +

                Hello.

                +

                NumPy integration is one of the single most requested features for PyPy. This +post tries to describe where we are, what we plan (or what we don't plan), and +how you can help.

                +

                Short version for the impatient: we are doing experiments, which show that +PyPy+numpy can be faster and better than CPython+numpy. We have a plan on how +to move forward, but at the moment there is lack of dedicated people or money +to tackle it.

                +
                +

                The slightly longer version

                +

                Integrating numpy in PyPy has been my pet project on an on-and-off (mostly off) +basis over the past two years. There were some experiments, then a long +pause, and then some more experiments which are documented below.

                +

                The general idea is not to use the existing CPython module, but to +reimplement numpy in RPython (i.e. the language PyPy is implemented in), thus +letting our JIT achieve extra speedups. The really cool thing about this part +is that numpy will automatically benefit from any general JIT improvements, +without any need for extra tweaking.

                +

                At the moment, there is a branch called numpy-exp which contains a +translatable version of a very minimal version of numpy in the module called +micronumpy. Example benchmarks show the following:

                + +++++ + + + + + + + + + + + + + + + + + +
                 | add | iterate
                CPython 2.6.5 with numpy 1.3.0 | 0.260s (1x) | 4.2 (1x)
                PyPy numpy-exp @ 3a9d77b789e1 | 0.120s (2.2x) | 0.087 (48x)
                +

                The add benchmark spends most of the time inside the + operator on +arrays (doing a + a + a + a + a), which in CPython is implemented in C. +As you can see from the table above, the PyPy version is already ~2 times +faster. (numexpr is still faster than PyPy, but we're working on it).

                +

                The exact way array addition is implemented is worth another blog post, but in +short it lazily evaluates the expression and computes it at the end, avoiding +intermediate results. This approach scales much better than numexpr +and can lead to speeding up all the operations that you can perform on matrices.

                +

                The next obvious step to get even more speedups would be to extend the JIT to +use SSE operations on x86 CPUs, which should speed it up by about an additional +2x, as well as using multiple threads to do operations.

                +

                iterate is also interesting, but for entirely different reasons. On CPython +it spends most of the time inside a Python loop; the PyPy version is ~48 times +faster, because the JIT can optimize across the python/numpy boundary, showing +the potential of this approach: users are not grossly penalized for writing +their loops in Python.

                +

                The drawback of this approach is that we need to reimplement numpy in RPython, +which takes time. A very rough estimate is that it would be possible to +implement a useful subset of it (for some definition of useful) in a period +of between one and three man-months.

                +

                It also seems that the result will be faster for most cases and the same speed +as original numpy for other cases. The only problem is finding the dedicated +persons willing to spend quite some time on this; however, I am willing to +both mentor such a person and encourage him or her.

                +

                A good starting point for helping would be to look at what's already +implemented in micronumpy modules and try extending it. Adding a - operator +or adding integers would be an interesting start. Drop by #pypy on +irc.freenode.net or get in contact with developers via some other channel (such +as the pypy-dev mailing list) if you want to help.

                +

                Another option would be to sponsor NumPy development. In case you're +interested, please get in touch with us or leave your email in comments.

                +

                Cheers,
                +fijal

                +
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2011-05-04 17:30: +
                +
                +

                While the RPython approach does sound valuable long-term, do you know if anyone has experimented with cpyext and the CPython extension module as a near-term alternative?

                +
                +
                +
                +
                + + matt harrison wrote on 2011-05-04 17:30: +
                +
                +

                Great post. (I'm another person who would like numpy on pypy).
                What are the guidelines for when something should be implemented in RPython? For me personally there are a few instances I would trade some of the dynamicism of Python for speed in my own code.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-05-04 17:35: +
                +
                +

                @Nick the mixed approach (use cpyext and pieces in RPython) sounds maybe valuable short term, but it can burn people easily. RPython-only is way more elegant and gives you wins upfront. Since there is noone willing to invest time in short term approach, this sounds like a no-brainer.

                @matt almost nothing should be implemented in RPython, except the interpreter itself. Writing Python should be fast enough. Numpy is a notable example where we want to leverage last bits and pieces of JIT and be really really fast. For example you can't really leverage SSE from Python layer.

                +
                +
                +
                +
                + + Davide wrote on 2011-05-04 18:12: +
                +
                +

                Are you in touch with Numpy developers? Are they eager to "stop" using Python and move to RPython? I mean, if this work needs to be redone for each version of Numpy, we will be always lagging behind, and always spend lot of efforts. On the other hand, if Numpy devs will start using the RPython for and let die the pure-Python one, then, the porting effort would me much more meaningful, and I believe it will be easier to find a group of people interested in doing it (myself, maybe)

                +
                +
                +
                +
                + + Davide wrote on 2011-05-04 18:13: +
                +
                +

                And what about SciPy?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-04 18:15: +
                +
                +

                I've got to say that this worries me more than it encourages me.

                1) It doesn't sound like this path will lead to easier integration of scipy. If I'm wrong please let me know! But if I'm right, the reality is that most of the reason I care about numpy is because scipy depends on it, and I care about scipy.

                2) What about the numpy refactoring effort, which is supposed to be making a better C interface for numpy, which works with IronPython as well as CPython (https://lists.ironpython.com/pipermail/users-ironpython.com/2010-December/014059.html)? Why not just encourage that effort, and leverage it for PyPy integration? Is there a reason it won't work for numpy even though it works for both IronPython and CPython? (

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-05-04 18:27: +
                +
                +

                @Davide it's not Python vs RPython, it's C (which numpy is implemented in) vs RPython. No numpy users will be requires to use RPython for anything.

                @Gary I believe you're wrong. The idea stays the same - you can call arbitrary C code that will manipulate raw memory and do what it wants to do. The idea is to implement only the interface part (which uses CPython C API) and not the C part, which will work anyway. So at the end, we hope to leverage that effort. Also we're not microsoft and we can't pay large sums of money to do it and having small subset of numpy that's really fast appeals much more to me than a large effort that only gives numpy for pypy (that's not faster than cpython's one).

                +
                +
                +
                +
                + + Davide wrote on 2011-05-04 19:12: +
                +
                +

                @Maciej: It was clear to me that numpy users shouldn't change anything, but I thought you intended to change only the Python part of Numpy, not the C part.

                Now, if you plan to change the whole C sections, that's a huge job. What are your plans for dependencies like the BLAS, LAPACK and the likes? Would you reimplement them in RPython as well?

                And regardless of the answer, my question is still valid: do you see this project as a "catch-up porting" of Numpy, with the version for CPython going on by itself? Or do you see the RPython fork becoming the mainstream Numpy? And if it's the latter, how that would perform on CPython? I think these questions are the key of the matter.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-05-04 19:18: +
                +
                +

                see my reply above about BLAS/LAPACK etc. Regarding the C part, it's a big task, but I think not too big. Also it's relatively easy to come up with working piece that's not full, nontheless useful.

                This won't work on CPython, period.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-04 19:25: +
                +
                +

                @Maciej -- sorry if I'm being dense, but are you saying that the approach you're outlining will allow for scipy to work with numpy?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-05-04 19:29: +
                +
                +

                @Gary an uneducated guess would be "yes". Probably with some work and scipy refactoring.

                +
                +
                +
                +
                + + cool-RR wrote on 2011-05-04 19:34: +
                +
                +

                Thanks for writing this post Maciej! It's great to have some visibility on your plans about this issue.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-04 19:47: +
                +
                +

                OK. As I've argued before in various pypy groups, I think one of the groups that will most strongly benefit from pypy's speed is the scientific community -- but they need numpy and scipy. So now that I know that this plan will (hopefully) allow for both of those to be used from pypy, I'm encouraged by it.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-04 19:49: +
                +
                +

                @Maciej: The parts of Scipy written in Python are for the most part not large. The main work would be in reimplementing the C code that uses Numpy's C-API, and figuring out a way to interface with Fortran code.

                +
                +
                +
                +
                + + Joseph wrote on 2011-05-04 20:21: +
                +
                +

                You say you lack sufficient resources to put in a large effort, but your answers to CPython extensions is "reimplement everything RPython". Would it not make more sense to improve cpyext so that you get good performance out of it (maybe even JIT compatible)? This seems like a better answer then re-writing every single CPython extension and trying to keep the RPython implementation in sync.

                +
                +
                +
                +
                + + Peter Cock wrote on 2011-05-04 20:33: +
                +
                +

                Have you tried micronumpy under Jython? I'm assuming RPython, being just a subset of Python, should also work there, and might double as a way to get (some of) NumPy on Jython.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-05-04 20:34: +
                +
                +

                @Joseph cpyext will always be only a semi-permanent compatibility layer. Making numpy work with cpyext is both unrewarding (hard work with obscure bugs), but also significantly harder to make fast, in some places completely impossible. Yes, it doesn't make sense for all extensions, it doesn't even make sense for most. Numpy is however special, since speed is the reason of its existence. Also, frankly, when it comes down to my free time "let's make this cool JITed code run 50x faster than CPython" beats "let's stare puzzled at this segfault".

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-05-04 20:35: +
                +
                +

                @Joseph anyway, it's exactly for the same reason "why write another interpreter if you can just improve CPython". Because it's easier at the end.

                +
                +
                +
                +
                + + Corbin Simpson wrote on 2011-05-04 21:45: +
                +
                +

                To everybody asking why we cannot just use cpyext: I already tried it. It's not gonna happen without hacking the crap out of numpy. Additionally, it's going to be slow: Numpy is not fast for most operations, because of double-unboxing. Only vector ops are fast. JITing the operations is going to be a big win.

                For those of you not believing numpy is slow, look at numexpr (https://code.google.com/p/numexpr/) which implements many of the same ideas that we are planning on implementing.

                +
                +
                +
                +
                + + Jonas B. wrote on 2011-05-04 21:45: +
                +
                +

                Extremely exciting! Perhaps this is a good time to document the internals of NumPy a bit better while you scour the source to reimplement in RPython.

                Perhaps this is a good fit for a Kickstarter (or similar) project? I believe this requires very talented and dedicated developers and paying them professionally by raising money on the Internet should be possible. It's been done before.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-04 22:58: +
                +
                +

                Yes, having a couple of Kickstarter projects for PyPy would be nice. It seems the current view is "we'll wait for someone wanting a feature enough to fund it". Picking one or two known valuable features to put on Kickstarter would provide for a nice test: can you raise more money by asking for it in a targeted way?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-05 01:23: +
                +
                +

                Two comments:

                One, you guys need to make up your minds with respect to how people are supposed to interface C code with PyPy, and make one well-supported way. The sooner, the better.

                Two, as long as your numpy clone implements the (new-style) Python array interface, it should "just work" with Scipy, with everything else being a Scipy bug. (Correct me if I'm wrong.)

                Andreas

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-05 01:58: +
                +
                +

                Doesn't getting SciPy to work involve interfacing with a lot of Fortran code?

                +
                +
                +
                +
                + + Unknown wrote on 2011-05-05 04:47: +
                +
                +

                To address some of the criticism you're receiving, it may be worth making another post clarifying the points made in the comments and elsewhere:

                - numpy+cpyext has been tried and found wanting (and very hard to debug)
                - no developers available that are interested in beating their heads against that particular wall
                - pure C and Python components of numpy should remain largely the same
                - only the Python bindings layer that uses the CPython C API needs to be reimplemented
                - RPython has its own FFI which is PyPy's preferred way to interface to non-Python code (https://pypy.readthedocs.org/en/latest/rffi.html)
                - cpyext is a useful tool for compatibility with relatively simple C extensions that don't stress the C API greatly, but numpy is not such an extension.

                +
                +
                +
                +
                + + david wrote on 2011-05-05 09:42: +
                +
                +

                Hi maciej, I am david (we quickly met at pycon where I presented myself as a numpy guy).

                I think part of the misunderstanding is around the meaning of "numpy in pypy". Rewriting an array class on top of pypy is certainly valuable, and I am in no position to tell other people what to do in their free time. But I don't think it can realistically mean people will be able to use this instead of numpy after 2-3 man months: how will interfacing with BLAS/LAPACK work ? How will interfacing with the vast amount of fortran code in scipy work ?

                If cpyext is indeed a dead-end, it would be valuable to know why. Personally, I would certainly be happy to fix parts of numpy that make cpyext impractical, even if it meant it were twice slower than on cpython. Because I could still benefit from pypy *elsewhere*, without having to rewrite all the numpy/scipy/etc... code.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-05-05 09:53: +
                +
                +

                @david please look above at my responses. there will still be a piece of memory you can pass to LAPACK or BLAS or something. the RPython part is about the interface only and not C-only part. If you want to improve numpy, please separate C-only parts from interface parts as much as possible, using C from RPython is a no-brainer.

                +
                +
                +
                +
                + + Dániel Varga wrote on 2011-05-05 10:34: +
                +
                +

                Maciej, let me second Nick's polite request for a more detailed post about the plan.

                If even David, an actual numpy developer can misunderstand your description, what do you expect from the unwashed masses of scipy users like me? :) Fortunately it does not take too much effort to alleviate the worries. All you have to do is explain to everyone that the plan takes into account the giant amount of C and Fortran code in numpy/scipy, and takes into account the fact that forking numpy/scipy is infeasible.

                +
                +
                +
                +
                + + Bluebird wrote on 2011-05-05 11:49: +
                +
                +

                Didn't you say in another post that the JIT is more efficient at optimizing Python code than RPython ?

                +
                +
                +
                +
                + + cournape wrote on 2011-05-05 12:17: +
                +
                +

                @daniel: I don't think there is a misunderstanding as much as different people wanting different things. I believe that Maciej and other pypy people are more interested in leveraging pypy and its JIT to do things which are indeed quite complicated in numpy today (avoid temporary, fast iterators in python, etc...). I have little doubt that pypy is a better platform than cpython to experiment with this kind of thing.

                I am more surprised about the claim that numpy is so tied to cpython internals. It certainly depends on the C API, but mostly public API, documented as such.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-05-05 12:45: +
                +
                +

                @nick: thank you very much for giving all relevant pieces of information that are missing from the original post!

                +
                +
                +
                +
                + + glyph wrote on 2011-05-05 19:32: +
                +
                +

                Hey Maciej! This sounds absolutely awesome. I hope you can find someone to do the necessary work. I think you might need to explain a little better in a separate post where that 48x speedup comes from, and why RPython is a necessary part of it. I think I understand why, but clearly some of the commenters don't :).

                +
                +
                +
                +
                + + Anonymous wrote on 2011-06-21 22:50: +
                +
                +

                Well, if the answer of "How to make numpy available in pypy" is "do a complicated rewrite of numpy," then I'm pretty skeptical about the pypy project. I primarily use numpy, but also scipy sometimes and Image sometimes. As a user it's most important to me that code runs. Speed is not as critical. For example if I take stddev() of an array I first want that to run, and only secondarily want it efficient. If there's a library that I might want to use, and I can't expend a reasonable amount of effort to wrap it, or else someone else can do that, then I don't find pypy that encouraging at all. Since there are lots of libraries out there, and it has been convincingly argued that Python's primary utility is its library support.

                +
                +
                +
                +
                + + Alex wrote on 2011-06-21 22:56: +
                +
                +

                @Anonymous: While you may not be concerned with performance, a great many people are. The only way to have arbitrary numpy stuff work in theory would be CPyExt, but as we've said that's fraught with complications in that a) it won't work out of the box on something that uses as many corners of the CPython C-API as NumPy, and b) will always be slow. Given people's desire for speed with respect to NumPy we consider reimplementing it a reasonable course.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-06-22 00:33: +
                +
                +

                Alex -- I'm not saying speed is unimportant. What I'm saying is being able to easily make existing CPython extension modules compile against numpy is very important to people. If there is a 20% slowdown or a 10% speedup of the C extension in many cases that is no big deal. Most importantly it would put PyPy on rather equal standing with CPython. And then the JIT pure Python code might win out for efficiency, so PyPy might be a net win for many users.

                On the other hand doing research into lazy evaluation and vectorizing and loop restructuring, can obviously make numpy faster, but is more of a tangent, than being helpful to the majority of users who just want to run their CPython extensions at roughly the same speed under PyPy. Until people can actually run their extensions easily (which I argue is the major value that Python has) I doubt there will be much adoption of PyPy.

                Say I can already add lists of floats and take their standard deviation using numpy, using the C extension library. It isn't clear to me why this should be substantially less efficient under PyPy than under CPython.

                We see the same issue with Python 3.0 adoption. Personally I think it makes bad language changes such as getting rid of string % operator which I use constantly, so I'd avoid it for that reason. But far more importantly it can't run a lot of the libraries I use, with comparable performance. So it's completely a no go to me for that reason.

                So I am suggesting that optimizing a single library by rewriting it, seems a case of premature optimization when most libraries can't even run with PyPy.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-06-22 07:36: +
                +
                +

                It's a tough call, but for me most libraries run under PyPy. There are few that don't but I can usually work around that. Regarding numpy - no one wants slower numpy *no matter what*. Besides, it's not clear whether making numpy behave using CPyext would take less effort than writing it from scratch - the first reasonable subset can be expected *much* faster, when doing a rewrite.

                Numpy really *is* special, for all my needs, I want a small subset that performs reasonably well, not a whole thing that performs poorly. It's a matter of taste, but it's also much more fun, which plays a lot in terms of people spending free time on it. Would you rather add functionality for X that you need or fix next obscure segfault?

                Cheers,
                fijal

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-06-22 10:14: +
                +
                +

                @Anonymous Clarifying: We're hoping to reuse most parts of numpy (and scipy), especially those written in pure C. The "only" part requiring rewriting is the part that uses CPython C API, which is mostly the array interface.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-06-23 04:23: +
                +
                +

                Maciej -- I didn't realize large parts of these libraries could be reused. So maybe once the PyPy C extension facilities are working well enough that important 3rd party libraries can be compiled, I'll just switch to PyPy for performance. It sure does sound more fun to make numpy functions compile down to heavily optimized RPython and get big speed gains. But I still maintain that users would appreciate being able to get all arbitrary libraries to build in the first place, e.g. if scipy or library X depends on the numpy C interface, and that gets broken in the PyPy numpy implementation, then users won't be able to use their desired library at all. So I guess I'm just arguing that the most C extension modules that can work with numpy, the better. Since if we wanted fast but no libraries we'd be using C :-).

                +
                +
                +
                +
                + + Davide wrote on 2011-06-23 16:19: +
                +
                +

                Maciej (et al.),
                it looks like this issue isn't clear yet to people. Let's see if I can help.

                Numpy is made of 3 "piece" (doesn't matter if they are separate pieces or mingled together, they are there): a pure python part, a pure C part and a C-to-python "glue". All of them are very important to numpy, but the C-to-python glue is special, in that both python and C need to access the same data structures without any conversion or copy (otherwise it will be slow). I'm not sure what exactly numpy is doing for this "special glue" part, but that's the point where pypy suffer: of course pypy works just fine with pure python, and doesn't "care" at all about the C sections. So one option is to rewrite the C-to-python pieces of numpy. I'm sorry but it's still unclear to me if you want also to rewrite the C part or not (here you said kind-of-yes: https://morepypy.blogspot.com/2011/05/numpy-in-pypy-status-and-roadmap.html?showComment=1304533136864#c3499269873134208179 and here you said no: https://morepypy.blogspot.com/2011/05/numpy-in-pypy-status-and-roadmap.html?showComment=1308734098907#c2151221303214453177 so probably you should clarify better)

                Now, if I understand it right, your plan is to fork numpy for this purpose (either rewrite the C-to-python glue only, or the C part also). I believe this will fail, and the reason is pretty simple: first, even before you start, you already say that you don't have people/money/time to commit to this project. Second, maintaining a fork is a huge, huge task. You might easily introduce bugs, break feature, etc - while people are expecting something that "just works" as drop-in replacement, so even a "almost success" from a technical point of view, can be a big failure for adopter, if it doesn't behave. Last, but not least, numpy is a moving target, and you'll always play catch up. Is this the game you want to play??

                Now, I don't want to tell you what you have to do for fun, but if you want to have chances of success, you have to change the "politics" of your plan. I trust you that technically your plan is fine, but rather than implementing it within a numpy fork (or worst: rewrite), I suggest that you work with the numpy and/or CPython community, to see if you can write a wrapper around cpyext (or whatever they are using for C-to-Python glue). This wrapper (at compiler time) should either become cpyext (or whatever) if you are using CPython, or become "something else" if you are using pypy. If you persuade numpy people to use this wrapper you'll have the same numpy code base working as is in CPython and pypy. Sure you will not be exploiting the faster-than-C capabilities of pypy, but you can get there more smoothly: improving the speed one feature at time, while the rest of the framework is still working and thus useful, and thus increasing its user base, people interested in it (and some of them may become contributors).

                Instead your plan sounds like: implement one feature at time, while the rest of the framework doesn't work and thus nobody uses it in production, let alone care about its speed. On top of which, you'll be trying to catch-up with numpy.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-06-23 18:12: +
                +
                +

                @Anonymous there are many things I disagree with and I'm not going to fork numpy.

                The basis is - I claim there is more use for fast numpy which is incomplete than slow complete numpy.

                I would refer you to yet another blog post (personal this time) explaining more why I do what I do: https://lostinjit.blogspot.com

                +
                +
                +
                +
                + + Connelly Barnes wrote on 2011-08-24 04:13: +
                +
                +

                Here is a completely different approach taken by IronPython for Scipy+Numpy compatibility:

                https://www.johndcook.com/blog/2009/03/19/ironclad-ironpytho/

                It's basically a bidirectional FFI. Have a CPython and an IronPython both running, and wrap objects so that IronPython objects can be used by CPython and vice versa. This requires some platform specific binary level compatibility, in their case, DLL hacking, to allow the FFI to work in both directions.

                It seems like that approach should be practical for getting all of large libraries such as Scipy or Numpy working in Pypy. Since it's already been demonstrated to work for IronPython.

                The above roadmap proposes instead speeding up the core array object by coding it in RPython.

                But I wonder if these two approaches could work together. For example Numpy could be configured to use ordinary CPython array objects, or PyPy compiled RPython array objects. Then the FFI just has to take care to wrap objects appropriately that are in the "other interpreter".

                Thoughts?

                +
                +
                +
                +
                + + Connelly Barnes wrote on 2011-10-13 00:30: +
                +
                +

                As a follow up to my previous comment, I noticed there is a bidirectional FFI for Python called RPyC that was previously discussed on the Pypy blog:

                https://morepypy.blogspot.com/2009/11/using-cpython-extension-modules-with.html

                I have no idea if it has been tried with Numpy yet.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/05/playing-with-linear-programming-on-pypy-4040572987275633047.html b/posts/2011/05/playing-with-linear-programming-on-pypy-4040572987275633047.html new file mode 100644 index 000000000..e2a5825cd --- /dev/null +++ b/posts/2011/05/playing-with-linear-programming-on-pypy-4040572987275633047.html @@ -0,0 +1,445 @@ + + + + + +Playing with Linear Programming on PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Playing with Linear Programming on PyPy

                + + + +
                +

                Fancy high-level interfaces often come with a high runtime overhead +making them slow. Here is an experiment with building such an +interface using constructions that PyPy should be good at +optimizing. The idea is to allow the JIT in PyPy to remove the +overhead introduced by using a fancy high-level python interface +on top of a low-level C interface. The application considered is +Linear +programming. It is a tool used to solve linear optimization +problems. It can for example be used to find the nonnegative values +x, y and z that give the maximum value of + +

                +
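                + $$f(x, y, z) = 10x + 6y + 4z$$ +
                + +without violating the constraints + +
                + $$\begin{aligned} x + y + z &\le 100 \\ 10x + 4y + 5z &\le 600 \\ 2x + 2y + 6z &\le 300 \end{aligned}$$ +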
                + +There exist general-purpose solvers for these kinds of problems that +are very fast and can literally handle millions of variables. To use +them however the problem has to be transformed into some specific +matrix form, and the coefficients of all the matrices +have to be passed to the solver using some API. This transformation is +a tedious and error-prone step that forces you to work with matrix +indexes instead of readable variable names. Also it makes maintaining +an implementation hard since any modification has to be transformed +too. + +

                +The example above comes from the manual of +the glpk library. That +manual continues by describing how to convert this problem into the +standard form of glpk (which involves introducing three new variables) +and then gives the c-code needed to call the +library. Relating that c-code to the problem above without the +intermediate explanation of the manual is not easy. A common +solution here is to build a hi-level interface that allows a more +natural way of defining the matrices and/or allow the equations to be +entered symbolically. Unfortunately, such interfaces often become +slow. For the benchmark below for example, +cvxopt +requires 20 minutes to setup a problem that takes 9.43 seconds to solve +(this seems a bit extreme, am I doing something wrong?). + +

                +The high-level interface I constructed on top of the +glpk library is +pplp and it allows +the equations to be entered symbolically. The above problem can be +solved using +
                +    lp = LinearProgram()
                +    x, y, z = lp.IntVar(), lp.IntVar(), lp.IntVar()
                +    lp.objective = 10*x + 6*y + 4*z
                +    lp.add_constraint( x + y + z <= 100 )
                +    lp.add_constraint( 10*x + 4*y + 5*z <= 600 )
                +    lp.add_constraint( 2*x + 2*y + 6*z <= 300 )
                +    lp.add_constraint( x >= 0 )
                +    lp.add_constraint( y >= 0 )
                +    lp.add_constraint( z >= 0 )
                +
                +    maxval = lp.maximize()
                +    print maxval
                +    print x.value, y.value, z.value
                +
                + +

                +To benchmark the API I used it to solve a +minimum-cost + flow problem with 154072 nodes and 390334 arcs. The C library + needs 9.43 s to solve this and the pplp interface adds another 5.89 + s under PyPy and 28.17 s under CPython. A large amount of time is + still spent setting up the problem, but it's a significant + improvement over the 20 minutes required on CPython by + cvxopt. It is + probably not designed to be fast on this kind of benchmark. I have + not been able to get cvxopt to work under PyPy. The benchmark used is + available here +
                +

                Comments

                +
                +
                +
                + + The Cannon Family wrote on 2011-05-11 23:27: +
                +
                +

                for the first equation do you not perhaps mean f(x,y,z) = 10x+6y+4z instead of z = 10x+6y+4z ?

                +
                +
                +
                +
                + + Hakan Ardo wrote on 2011-05-12 07:29: +
                +
                +

                Yes, there is a typo there, I'll update the post. Thanx for noting.

                +
                +
                +
                +
                + + Winston Ewert wrote on 2011-05-12 14:28: +
                +
                +

                That seems like a lot of overhead for the wrapper, what is up with that? I mean, I'd expect the wrapper to reasonably quickly pass it off to the C library.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-12 16:48: +
                +
                +

                you should try www.solverfoundation.com using ironpython too.

                +
                +
                +
                +
                + + Hakan Ardo wrote on 2011-05-12 18:53: +
                +
                +

                Winston: It is indeed. What cvxopt spends 20 min on I don't know. One guess would be that it is passing the ~2 million coefficients involved to C one by one, possible with a bit of error checking for each of them. As for the 6 s used by pplp, it needs to convert the equations into the matrices glpk wants. That means shuffling the coefficients around a bit and some bookkeeping to keep track of which goes where.

                Anonymous: OK, how would the above example look in that case?

                +
                +
                +
                +
                + + Hakan Ardo wrote on 2011-05-14 12:24: +
                +
                +

                Thanx for noting, I've fixed the post (again).

                +
                +
                +
                +
                + + Unknown wrote on 2011-05-30 18:48: +
                +
                +

                have you tried openopt[1]?

                [1] openopt.org

                +
                +
                +
                +
                + + Joachim Dahl wrote on 2011-08-05 09:37: +
                +
                +

                Are you distinguishing between the time it takes to setup the optimization problem and the time it takes to actually solve it?

                GLPK is a simplex solver written in C, and CVXOPT is an interior point solver written in Python/C and is not particularly optimized for sparse problems. Nevertheless, you should check that you actually formulate a large sparse problem in CVXOPT, and not a dense one.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/05/pypy-genova-pegli-post-europython-4004229800858530064.html b/posts/2011/05/pypy-genova-pegli-post-europython-4004229800858530064.html new file mode 100644 index 000000000..376782d00 --- /dev/null +++ b/posts/2011/05/pypy-genova-pegli-post-europython-4004229800858530064.html @@ -0,0 +1,382 @@ + + + + + +PyPy Genova-Pegli Post-EuroPython Sprint June 27 - July 2 2011 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Genova-Pegli Post-EuroPython Sprint June 27 - July 2 2011

                + + + +
                +

                The next PyPy sprint will be in Genova-Pegli, Italy, the week after EuroPython +(which is in Florence, about 3h away by train). This is a fully public sprint: +newcomers and topics other than those proposed below are welcome.

                +
                +

                + +Goals and topics of the sprint

                +
                  +
                • +
                  +Now that we have released 1.5, the sprint itself is going to be mainly +working on fixing issues reported by various users. Possible topics +include, but are not limited to:
                  +
                  +
                    +
                  • fixing issues in the bug tracker
                  • +
                  • improve cpyext, the C-API compatibility layer, to support more extension +modules
                  • +
                  • finish/improve/merge jitypes2, the branch which makes ctypes JIT friendly
                  • +
                  • general JIT improvements
                  • +
                  • improve our tools, like the jitviewer or the buildbot infrastructure
                  • +
                  • make your favorite module/application working on PyPy, if it doesn't yet
                  • +
                  +
                  +
                • +
                • +
                  +Of course this does not prevent people from showing up with a more precise +interest in mind. If there are newcomers, we will gladly give introduction +talks.
                  +
                • +
                • +
                  +Since we are almost on the beach, we can take one day off for summer +relaxation and/or tourist visits nearby :-).
                  +
                • +
                +
                +
                +

                + +Exact times

                +The work days should be 27 June - 2 July 2011. People may arrive on +the 26th already and/or leave on the 3rd.
                +
                +

                + +Location & Accommodation

                +Both the sprint venue and the lodging will be at Albergo Puppo in +Genova-Pegli, Italy. Pegli is a nice and peaceful little quarter of Genova, +and the hotel is directly on the beach, making it a perfect place for those +who want to enjoy the sea in the middle of the Italian summer, as a quick +search on Google Images shows :-)

                +The place has a good ADSL Internet connexion with wireless installed. You can +of course arrange your own lodging anywhere but I definitely recommend lodging +there too.
                +Please confirm that you are coming so that we can adjust the reservations as +appropriate. The prices are as follows, and they include breakfast and a +parking place for the car, in case you need it:
                +
                  +
                • single room: 70 €
                • +
                • double room: 95 €
                • +
                • triple room: 105 €
                • +
                +
                +Please register by hg:
                +https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/genova-pegli-2011/people.txt +
                +or on the pypy-dev mailing list if you do not yet have check-in rights:
                +https://mail.python.org/mailman/listinfo/pypy-dev +
                +In case you want to share a room with someone else but you don't know who, +please let us know (either by writing it directly in people.txt or by writing +on the mailing list) and we will try to arrange it.
                +
                +

                Comments

                +
                +
                +
                + + vak wrote on 2011-05-25 11:39: +
                +
                +

                Hi,

                as for upcoming sprint...

                The grid on https://speed.pypy.org/timeline/ is a totally great idea. However the benchmark tests listed represent no progress since a long time already.

                Q1. Does it mean that the set is not representative any more and should be extended?

                Q2. Is it possible to include some micro benchmarks, please? (Oh, please!)

                +
                +
                +
                +
                + + vak wrote on 2011-06-14 14:31: +
                +
                +

                no answers -- it's a pity

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/05/pypy-usage-survey-1402303968715807009.html b/posts/2011/05/pypy-usage-survey-1402303968715807009.html new file mode 100644 index 000000000..f812b6b46 --- /dev/null +++ b/posts/2011/05/pypy-usage-survey-1402303968715807009.html @@ -0,0 +1,519 @@ + + + + + +PyPy Usage Survey | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Usage Survey

                + + + +
                +

                We've been working on PyPy for a long time. But readers of this blog will know +that in the past year something has changed: we think PyPy is production ready. +And it's not just us, this week LWN.net wrote an article about how PyPy +sped up one of their scripts by a factor of three, noting that, "plans are to +run gitdm under PyPy from here on out". All in all we think PyPy is pretty +great, but not everyone is using it yet, and we want to know why. We want your +feedback on why PyPy isn't ready to be your only Python yet, and how we can +improve it to make that happen.

                +

                Therefore, we've put together a quick survey. Whether you're using PyPy or not, +if you could take a few minutes to fill it out and let us know how we're doing, +we'd really appreciate it. You can find the form here.

                +

                Thanks, +The PyPy team

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-05-16 18:23: +
                +
                +

                We are very interested in using PyPy in production but our project is based on lxml library and both are incompatible. Do you suggest any fix for this? I'm not sure if PyPy would compensate the reduction if performance of a pure Python XML library.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-16 18:55: +
                +
                +

                Biggest blocker right now is gevent, which I believe would require pypy stackless and JIT to get along plus some work to make gevent use ctypes in place of cpython api.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-16 19:12: +
                +
                +

                I suggest that you reproduce this survey on StackOverflow (if it's acceptable there, maybe Programmers?) and Quora, maybe Convore too. Posting to comp.lang.python would also help.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-16 19:22: +
                +
                +

                Pypy needs to either be a dropin replacement for python or provide a significant (order of magnitude) difference in performance that moving to pypy won't be as big of a deal when you lose the support of so many 3rd party libraries.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-16 19:35: +
                +
                +

                1. Installation is long and non-intuitive. I'd like to see PyPy packaged up for all the major distros + Mac OSX via Fink, Homebrew, and MacPorts.

                2. A comprehensive listing of modules that can and cannot be used with PyPy. I'm still not quite clear as to how PyPy interacts with the major web frameworks and WSGI (haven't researched it much either).

                3. Drop-in replacement for Python 2.7. I want my scripts that I wrote in Python to run in PyPy with no complications.

                +
                +
                +
                +
                + + Pavel wrote on 2011-05-16 19:46: +
                +
                +

                Could you provide the downloads with PGP signatures, please? We would like to use PyPy in production to run our payment processing system backend, but verified integrity and authenticity of its source code is strictly required.

                +
                +
                +
                +
                + + Victor wrote on 2011-05-16 20:05: +
                +
                + 2. A comprehensive listing of modules that can and cannot be used with PyPy. I'm still not quite clear as to how PyPy interacts with the major web frameworks and WSGI (haven't researched it much either).

                This is available at the PyPy Compatibility Wiki (I should update it this week, lots of new information around). +
                +
                +
                +
                + + Anonymous wrote on 2011-05-16 20:20: +
                +
                +

                We would use it across all our deployments (hundreds of thousands of LOCs) and gladly contribute and invest in pypy as soon as you guys implement python3 spec. Literally can't wait.

                +
                +
                +
                +
                + + Daniel Kluev wrote on 2011-05-17 06:52: +
                +
                +

                I'd love to use PyPy in some of my projects, but they rely on lots of 3rd-party C/C++-based libraries.

                1) lxml, thats an absolute must for most of my applications. Original ETree now lacks many features lxml has, so there is no ready pure-python replacement avail.
                2) Some my own boost::python libraries. I didn't actually try to compile them on PyPy, but as I was told on IRC, support for b::p is still marginal.
                3) PycURL, PyV8, PyQt, wxPython and so on.

                +
                +
                +
                +
                + + Martin Gfeller wrote on 2011-05-17 09:14: +
                +
                +

                We would like to profit from the speedup, but it would be a major piece of work for us, as we're currently running Zope 2.13 (which we could replace, because we make only limited use of it and have our own hybrid database). However, before making an investment, we need to be sure that:


                - PyPy won't go away like Psyco did. A kind of "mainstream endorsement" by PSF would be helpful


                - numpy and scipy are available

                - a decent ODBC package is available (we're using mxODBC) at the moment

                - full support on Windows 32 and 64 bit



                Best regards, Martin

                Swisscom IT Services Finance

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-05-17 09:18: +
                +
                +

                @martin

                * numpy, scipy support is on the way

                * 32bit windows is done, 64bit windows will happen, it's on the todo list

                * PSF has just endorsed PyPy in front of 1000 people crowd on pycon giving us a 10000$ check (https://3.bp.blogspot.com/-yLUKuyRgjdg/TYfklB5Jg4I/AAAAAAAABKM/_5Rv2thqzA0/s1600/pycon_cheque.jpg).

                That answers roughly half to 3/4 of your issues, no bad, we're getting there :)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-17 15:48: +
                +
                +

                I would like to repeat the numpy and scipy thing. I have to add matplotlib, which a lot of people use for plotting. Personally I also cannot live without h5py, which is awesome for storing and handling numerical data. I have no idea if it will work with pypy, because it does require numpy first.

                I'm looking forward to pypy becoming faster, better supported, and more popular! I am convinced that it will.

                +
                +
                +
                +
                + + wilk wrote on 2011-05-17 16:38: +
                +
                +

                I've a project wich use psyco with a factor 15 (computation of train path) ! yes really, this project is in production (unfortunately not open source) ! I just tried it with pypy 1.5, and it works with the same factor (congratulation to you). So i'm sure that we'll use pypy.

                But like my other project, i don't change something wich already works. Most of them don't need speed improvement.

                On one scrabble game i'd like to replace a scrabble solver in C (if someone wants to help, it's opensource ?)

                I also hope to see a debian package in the next debian release...

                Thanks for your work, i follow it !

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-18 13:26: +
                +
                +

                On my server I'm running couple of Django based ecommerce systems. I hope to be running more of them soon (hopefully). There is also PostgreSQL. Still not using PyPy but I just can't wait to check if it will be faster and if so then how much. I don't know yet how to run Django app on production on PyPy but as soon I check and run couple of performance tests I will surely give some feedback.

                +
                +
                +
                +
                + + raptor wrote on 2011-05-23 00:53: +
                +
                +

                Its all about compatibility with 3rd party libs, C libs or boost::python. Otherwise those who want to JIT their Python are just going to wait a bit longer for PEP 3146 so they can have a good LLVM based JIT in standard Python.

                https://www.python.org/dev/peps/pep-3146/

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-23 03:48: +
                +
                +

                The pypy group should make a full featured ide with a gui designer with built in packaging to .exe and linux .deb and .rpm that only runs the pypy vm. That would bring the desktop application programmers in by the droves.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-05-23 07:35: +
                +
                +

                @Hart: unladen swallow is dead:

                https://qinsb.blogspot.com/2011/03/unladen-swallow-retrospective.html

                +
                +
                +
                +
                + + Anonymous wrote on 2011-05-23 15:21: +
                +
                +

                Well, basically, it's NumPy, SciPy, Matplotlib and MayaVi. I'm also using Cython to optimize computation intensive code paths, but of course it would be nicer to stick to pure Python and let JIT do it's magic.

                +
                +
                +
                +
                + + lazyweb wrote on 2011-05-23 18:44: +
                +
                +

                Arrgh, gevent does not work with pypy? There's my blocker.

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2011-05-30 12:59: +
                +
                +

                How long are you planning to keep this poll open? I hope you will blog about its results when it's closed...

                +
                +
                +
                +
                + + Almir Karic wrote on 2011-06-02 02:49: +
                +
                +

                would love to see the results

                +
                +
                +
                +
                + + Anonymous wrote on 2011-06-02 15:21: +
                +
                +

                I'm interesting in psycopg2and PIL libraries.

                +
                +
                +
                +
                + + Caetano wrote on 2011-06-02 15:29: +
                +
                +

                The only thing that makes me not using pypy is the lack of supporting python bynaries .so, .pyd, etc.
                I know that is a hard feature to implement because is needed to stub the CPython api.
                but I think when its done will there is no reasons to not using pypy for anybody.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-04 21:21: +
                +
                +

                Numpy, scipy, matplotlib, and image are the stick ups for me.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/05/server-migration-in-progress-2113491786141182920.html b/posts/2011/05/server-migration-in-progress-2113491786141182920.html new file mode 100644 index 000000000..e87653d22 --- /dev/null +++ b/posts/2011/05/server-migration-in-progress-2113491786141182920.html @@ -0,0 +1,311 @@ + + + + + +Server migration in progress | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Server migration in progress

                + + + +
                +

                Hi all,

                + +

                We are in the process of migrating the hosting machine for PyPy, moving away from codespeak.net and towards a mixture of custom servers (e.g. for buildbot.pypy.org) and wide-scale services (e.g. for the docs, now at readthedocs.org).

                + +

                When this is done, a proper announcement will be posted here. In the meantime, we have already moved the mailing lists, now hosted on python.org. The subscribers' lists have been copied, so if you didn't notice anything special for the past week, then everything works fine :-) This concerns pypy-dev, pypy-issue and pypy-commit. Two notes: +

                +
                  +
                • Some settings have not been copied, notably if you used to disable mail delivery. Sorry about that; you have to re-enter such settings. +
                • +
                • Following the move, about 50 addresses have been dropped for being invalid. I'm unsure why they were not dropped earlier, but in case sending mail to you from python.org instead of codespeak.net fails, then you have been dropped from the mailing lists, and you need to subscribe again. +
                • +
                +
                +

                Comments

                +
                +
                +
                + + Henrik Vendelbo wrote on 2011-05-17 16:15: +
                +
                +

                I enjoy PyPy a lot, and would use it for production.

                However I tend to have a lot of problems when I upgrade to the latest source as my PyPy modules/extensions break and I will have to reimplement them with the new internal APIs.

                It would be great if there was a bit more stability around the structure of main and how to write a module.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/06/global-interpreter-lock-or-how-to-kill-8270246310848099963.html b/posts/2011/06/global-interpreter-lock-or-how-to-kill-8270246310848099963.html new file mode 100644 index 000000000..e5510255d --- /dev/null +++ b/posts/2011/06/global-interpreter-lock-or-how-to-kill-8270246310848099963.html @@ -0,0 +1,820 @@ + + + + + +Global Interpreter Lock, or how to kill it | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Global Interpreter Lock, or how to kill it

                + + + +
                +

                People that listened to my (Armin Rigo) lightning talk at EuroPython know that +suddenly, we have a plan to remove the Global Interpreter Lock --- the +infamous GIL, the thing in CPython that prevents multiple threads from +actually running in your Python code in parallel.

                +

                That's not actually new, because Jython has been doing it all along. +Jython works by very carefully adding locks to +all the mutable built-in types, and by relying on the underlying Java +platform to be efficient about them (so that the result is faster than, +say, very carefully adding similar locks in CPython). By "very +carefully", I mean really really carefully; for example, +'dict1.update(dict2)' needs to lock both dict1 and dict2, but if you do +it naively, then a parallel 'dict2.update(dict1)' might cause a +deadlock.

                +

                All of PyPy, CPython and IronPython have a GIL. But for PyPy we are considering +a quite different approach than Jython's, based on Software +Transactional Memory. This is a recent development in computer +science, and it gives a nicer solution than locking. Here is a short +introduction to it.

                +

                Say you want to atomically pop an item from 'list1' and append it to +'list2':

                +
                +def f(list1, list2):
                +    x = list1.pop()
                +    list2.append(x)
                +
                +

                This is not safe in multithreaded cases (even with the GIL). Say that +you call f(l1, l2) in thread 1 and f(l2, l1) in thread 2. What +you want is that it has no effect at all (x is moved from one list to +the other, then back). But what can occur is that instead the top of +the two lists are swapped, depending on timing issues.

                +

                One way to fix it is with a global lock:

                +
                +def f(list1, list2):
                +    global_lock.acquire()
                +    x = list1.pop()
                +    list2.append(x)
                +    global_lock.release()
                +
                +

                A finer way to fix it is with locks that come with the lists:

                +
                +def f(list1, list2):
                +    acquire_all_locks(list1.lock, list2.lock)
                +    x = list1.pop()
                +    list2.append(x)
                +    release_all_locks(list1.lock, list2.lock)
                +
                +

                The second solution is a model for Jython's, while the first is a model +for CPython's. Indeed, in CPython's interpreter, we acquire the GIL, +then we do one bytecode (or actually a number of them, like 100), then +we release the GIL; and then we proceed to the next bunch of 100.

                +

                Software Transactional Memory (STM) gives a third solution:

                +
                +def f(list1, list2):
                +    while True:
                +        t = transaction()
                +        x = list1.pop(t)
                +        list2.append(t, x)
                +        if t.commit():
                +            break
                +
                +

                In this solution, we make a transaction object and use it in all +reads and writes we do to the lists. There are actually several +different models, but let's focus on one of them. During a transaction, +we don't actually change the global memory at all. Instead, we use the +thread-local transaction object. We store in it which objects we +read from, which objects we write to, and what values we write. It is +only when the transaction reaches its end that we attempt to "commit" +it. Committing might fail if other commits have occurred in between, +creating inconsistencies; in that case, the transaction aborts and +must restart from the beginning.

                +

                In the same way as the previous two solutions are models for CPython and +Jython, the STM solution looks like it could be a model for PyPy in the +future. In such a PyPy, the interpreter would start a transaction, do +one or several bytecodes, and then end the transaction; and repeat. +This is very similar to what is going on in CPython with the GIL. In +particular, it means that it gives programmers all the same guarantees +as the GIL does. The only difference is that it can actually run +multiple threads in parallel, as long as their code does not interfere +with each other. (In particular, if you need not just the GIL but actual +locks in your existing multi-threaded program, then this will not +magically remove the need for them. You might get an additional built-in +module that exposes STM to your Python programs, if you prefer it over +locks, but that's another question.)

                +

                Why not apply that idea to CPython? Because we would need to change +everything everywhere. In the example above, you may have noted that I +no longer call 'list1.pop()', but 'list1.pop(t)'; this is a way to tell +that the implementation of all the methods needs to be changed, in order +to do their work "transactionally". This means that instead of really +changing the global memory in which the list is stored, it must instead +record the change in the transaction object. If our interpreter is +written in C, as CPython is, then we need to write it explicitly +everywhere. If it is written instead in a higher-level language, as +PyPy is, then we can add this behavior as a set of translation rules, and +apply them automatically wherever it is necessary. Moreover, it can be +a translation-time option: you can either get the current "pypy" with a +GIL, or a version with STM, which would be slower due to the extra +bookkeeping. (How much slower? I have no clue, but as a wild guess, +maybe between 2 and 5 times slower. That is fine if you have enough +cores, as long as it scales nicely :-)

                +

                A final note: as STM research is very recent (it started around 2003), +there are a number of variants around, and it's not clear yet which one +is better in which cases. As far as I can tell, the approach described +in "A Comprehensive Strategy for Contention Management in Software +Transactional Memory" seems to be one possible state-of-the-art; it also +seems to be "good enough for all cases".

                +

                So, when will it be done? I cannot say yet. It is still at the idea +stage, but I think that it can work. How long would it take us to +write it? Again no clue, but we are looking at many months rather +than many days. This is the sort of thing that I would +like to be able to work on full time after the Eurostars funding +runs out on September 1. We are currently looking at ways to use +crowdfunding to raise money so that I can do exactly that. Expect +a blog post about that very soon. But this looks like a perfect +candidate for crowdfunding -- there are at least thousands of you who +would be willing to pay 10s of Euros to Kill the GIL. Now we only +have to make this happen.

                +
                +

                Comments

                +
                +
                +
                + + Michael Foord wrote on 2011-06-29 17:54: +
                +
                +

                If you concurrently run two transactions that interfere with each other - and they both restart on failure - isn't there a possibility that neither would ever complete? How would you mitigate against that? (Fallback to a global lock after a certain number of transaction failures perhaps?)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-06-29 18:13: +
                +
                +

                There's a thing that is not clear to me: how do you detect failures during commits?

                +
                +
                +
                +
                + + jdhardy wrote on 2011-06-29 18:16: +
                +
                +

                IronPython doesn't have a GIL - it's the same as Jython.

                +
                +
                +
                +
                + + Michael Foord wrote on 2011-06-29 18:17: +
                +
                +

                Plus transactions have to be scoped around code that is side-effect free (or you can guarantee containing the side-effects within the transaction). Why STM research was done in Haskell I guess. Anyway, it sounds like a hard problem. That's why Armin is interested I guess... :-)

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-06-29 18:23: +
                +
                +

                @michael: if two transactions conflict, you rollback only one of those, and from the external the effect is the same as having one locked by the GIL

                About side effects: the plan is to close a transaction before a side effect operation and reopen a new one after it: this is what happens already with the GIL, which is released e.g. before I/O calls.

                At least, this is how I understand it, and since I'm not Armin I might be wrong :-)

                +
                +
                +
                +
                + + Michael Foord wrote on 2011-06-29 18:26: +
                +
                +

                @antonio
                Ah, that makes sense. Thanks. :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-06-29 18:30: +
                +
                +

                This sounds like a great idea...

                What happens when transaction interleaves together and fail? Both threads will still continue trying so to me this appears to be somewhat as efficient as locks. (Note I know nothing in this topic and would definitely like to learn more).

                +
                +
                +
                +
                + + Sebastian Noack wrote on 2011-06-29 19:14: +
                +
                +

                I don't think that the primary reason STM is slower than the GIL, is the extra bookkeeping, but the fact that things need to be repeated. However, I could imagine, that STM still might yield better response times than acquiring locks, in some cases.

                +
                +
                +
                +
                + + Tuomas Jorma Juhani Räsänen wrote on 2011-06-29 20:27: +
                +
                +

                STM is not ot that "recent" though:

                Nir Shavit and Dan Touitou. Software transactional memory. In PODC '95: Proceedings of the fourteenth annual ACM symposium on Principles of distributed computing, pages 204-213, New York, NY, USA, 1995. ACM.

                +
                +
                +
                +
                + + xyproto wrote on 2011-06-29 20:34: +
                +
                +

                I can imagine the reason this is efficient is because code often work on different parts of memory in different threads.

                +
                +
                +
                +
                + + ChrisW wrote on 2011-06-29 22:17: +
                +
                +

                Hmm, ZODB has this kind of optimistic transaction committing, it results in having to deal with ConflictErrors and slowness from retrying requests when they conflict amongst other pain. If that's the price for losing the GIL, I'll stick with the GIL, thanks...

                +
                +
                +
                +
                + + gertjan wrote on 2011-06-29 22:48: +
                +
                +

                Well when it comes to removing the GIL I have always had my hopes on pypy, and I'd be very happy to contribute some coin to make it happen. I'll be looking out for that crowdfunding post.

                +
                +
                +
                +
                + + Zemantic dreams wrote on 2011-06-29 23:00: +
                +
                +

                Ok, so where can we give a small contribution?




                Andraz Tori, Zemanta

                +
                +
                +
                +
                + + Richard wrote on 2011-06-30 00:32: +
                +
                +

                Have you read about Microsoft's abandoned attempt to bring STM to .NET? Have you considered the problems they had?

                +
                +
                +
                +
                + + Jon Morgan wrote on 2011-06-30 05:56: +
                +
                +

                Interesting idea, but some questions:
                1. What do C extensions do? (extensions designed for CPython that are using GIL methods). Would they still be able to be used, or would they have to be rewritten for PyPy?

                2. What happens if repeatable operations are interleaved with operations that are not repeatable? (e.g. logging values to a file - we wouldn't want it to happen twice if there was a conflict, unless of course you are using that logging to trace what is happening...).

                +
                +
                +
                +
                + + Ben wrote on 2011-06-30 10:30: +
                +
                +

                @Michael Foord: In state-of-the-art lazy[1] STM systems, the probability of two transactions continually causing each other to restart is minuscule. A transaction only causes another one to restart when it tries to commit. So when somebody restarts, it means that someone else has successfully committed.

                [1] In "Lazy" STMs, transactions only get exclusive access to the things they're trying to write to for a very short window of time at the end. This means they have to record writes in a transaction log, as Armin described, because there might be many pending writes for the same object. An alternative design is "eager" STM, where transactions write directly and have to "undo" their writes if they get aborted. Eager systems look good on paper, but in my opinion they're not worth it. With eager STM, the runtime system has to be very carefully designed to avoid livelock (when the system hangs because some transactions constantly abort each other). Lazy STM is almost impossible to livelock in practice, because even if some transactions are highly conflicting at least one of them (almost always) has to commit.

                +
                +
                +
                +
                + + Ben wrote on 2011-06-30 10:52: +
                +
                +

                Also, my honours project was implementing most of an STM system, and I've been a long time fan of (and sometime tinkerer with) PyPy, so I would be very interested in where this goes.

                And I know this is extremely premature, but if there were enough money coming in for this project and the PyPy team were willing to include outside developers, I would absolutely love to put serious work into this.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-06-30 11:28: +
                +
                +

                @Richard: reading the web page you point out, Microsoft's STM attempt (like most others I'm aware of) seems to work at a different level: basically as a library for application programmers. I can go through all 4 points and show why they are not relevant in our context:

                * any visible I/O (e.g. writing to a file or a log) is going to end the transaction and start the next one, just like the GIL is released and re-acquired around most calls to the C library's write() function

                * the 2nd issue is moot, because STM will be an internal detail in PyPy, not a user-visible feature

                * the 3rd issue he describes is about "update-in-place" STM, which I believe is not the best solution: we want instead to keep a local log of the changes, and apply them only at commit-time (as described e.g. in the paper I pointed out)

                * the final issue is the lack of real successes with STM. Well, we can't do anything about that ahead of time :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-06-30 11:29: +
                +
                +

                One note on the lock-based example you gave, that locks list1 and then list2: It isn't free of deadlocks!

                Having two threads call the function simultaneously with swapped args may cause a deadlock. See the bank account problem.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-06-30 11:49: +
                +
                +

                @Anonymous: yes, I know it can deadlock. I have hidden the problem into some theoretical function acquire_all_locks(), which should somehow make sure that all locks are atomically acquired, in any order (which I think is possible by first sorting the locks according to their address in memory). I didn't want to put too much emphasis on the negative side of locks :-)

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-06-30 11:51: +
                +
                +

                @Jon Morgan:

                1. We would most probably still
                have a GIL for the CPython C
                extensions. Only one can run at a
                time, but any number of PyPy
                threads can run at the same time.
                (This is because the CPython C
                extensions never access PyPy's own
                objects directly --- they cannot,
                because PyPy's own objects can
                move, and the C code is not
                prepared for that.)

                2. Logging to a file is done with a
                call to a function like write().
                In CPython and so far in PyPy, the
                call to write() is preceded by
                "release GIL" and followed by
                "re-acquire GIL". In the STM PyPy,
                it would be preceded by "end the
                current transaction" and "start the
                next transaction". This gives the
                same behavior. But we may have to
                think a bit harder about writes
                that are buffered, because it seems
                that if all threads write into the
                same buffer then it will cause many
                transaction conflicts.

                Note however that we are talking
                here about very short-lived
                transactions. Even if you have 20
                threads all writing to the same log
                file, each thread is going to run
                much more than 20 bytecodes between
                any two writes to the log file.
                You only get conflicts if two of
                these threads are running the
                write() call at the same time, and
                such a conflict only causes one of
                the threads to roll back and retry
                the write(), not more.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-06-30 11:54: +
                +
                +

                @tuomasjjrasanen: yes, actually the first paper is from the 80's. But I think that it's only from around 2003 or 2004 that research seriously started, in the sense that papers were produced regularly, from several teams.

                +
                +
                +
                +
                + + Kevin Granade wrote on 2011-06-30 14:47: +
                +
                +

                To address the anonymous question near the start of the comments, one way to detect commit collision is to copy a global generation counter at the start of your transaction, and then compare your stored copy to the current generation counter at commit time (after taking a lock), and if no one else has incremented the generation counter, you do so and complete your operation.

                So transaction does:
                self.generation = global.generation

                And commit does:
                if lock(global.lock):
                if self.generation == global.generation:
                global.generation += 1
                return True
                unlock(global.lock)
                return False

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-06-30 16:47: +
                +
                +

                I am not sure what to make out of the solution (=STM) to GIL you proposed in the article. You are essentially suggesting to slow down all Python programs in PyPy by a factor of, say, 4 and hope to recover the loss for a very small percentage of programs on an 8-core machine.

                That can't be right. Please tell me I am dreaming ... :)

                +
                +
                +
                +
                + + Michael Foord wrote on 2011-06-30 19:29: +
                +
                +

                So if there is only one thread transactions will be disabled?

                I wonder how "fine grained" transactions will be: if you have parallel operations working concurrently on a large array do you think you will be able to allow threads to simultaneously modify different areas of the array?

                +
                +
                +
                +
                + + Ben wrote on 2011-06-30 21:22: +
                +
                +

                @⚛: That's kind of how parallelization goes. There are overheads, and the only way to make up for them is to hope you have enough parallel speedup. STM (and any approach to this problem based on fine-grained locking) would work best if only a small known set of objects are shared between threads, and only those are synchronized, which unfortunately cannot be the case for a general GIL-removal proposal.

                However I think PyPy's JIT could potentially help a little here. The escape analysis PyPy already does can also prove "this value cannot be accessed by another thread" and used to avoid logging some values, since they cannot conflict with parallel transactions. There are probably some more STM-specific optimizations the JIT could do as well.

                +
                +
                +
                +
                + + Ben wrote on 2011-06-30 21:27: +
                +
                +

                @Michael Foord: STM definitely can be made as fine-grained as you like. Some existing STM systems operate at the level of machine words. Given that this one will be operating at the interpreter level, I would guess that code working on different sections of the same object (or array) would be able to run in parallel, but I guess it depends on how the tradeoffs play out.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-06-30 22:12: +
                +
                +

                @⚛: to complete Ben's answer: yes, you are correct, but that's why the translation step "insert STM logic" is never going to be mandatory. You will get either a regular pypy-c-gil or a pypy-c-stm, as two different executables, and you will choose the one most suited for your particular program. I still expect pypy-c-gil to be the most used one, with pypy-c-stm an alternative that is only useful for people with massively multi-threaded programs.

                +
                +
                +
                +
                + + EmilK wrote on 2011-07-01 10:55: +
                +
                +

                It would be cool, if the python programmer could mark "uncritical" sections, such that the stm book keeping is disabled for those sections where the programmer knows that there is no concurrency.

                +
                +
                +
                +
                + + Jacob Hallén wrote on 2011-07-01 14:17: +
                +
                +

                @EmilK: I think that would be very uncool. You would allow the developer to introduce bugs that would be extremely hard to locate. Parallel programs are quite difficult to get right to start with, and anyone who does not have complete understanding of what constitutes a critical section will be very likely to make an error.

                +
                +
                +
                +
                + + Skandalfo wrote on 2011-07-02 20:18: +
                +
                +

                There's an intermediate option between the GIL and the careful locking done by Jython, that I had a look at some time ago for making Python more thread friendly.

                Just exchanging the GIL for a global readers-writer lock would allow Python to use way more concurrency. You would run all Python code under a reader lock for operations that were read-only on objects. For modifying built in mutable objects, or for things like the one involving both lists in the Jython example, or when calling into C modules, you would have to acquire the writer version of the lock.

                Python threads would relinquish the reader lock each N opcodes, just like it's done now for the GIL, and I guess the acquisition of the writer lock should be given priority over the reader ones.

                This approach should be simpler to implement than using the transactional memory approach, and it should be possible to bake it into CPython too. I think I remember having read some discussion about this somewhere, but it didn't seem to come to anything...

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-07-06 14:26: +
                +
                +

                @Skandalfo: this cannot work with CPython, because of reference counting -- every bytecode modifies reference counts, so needs the "write" lock. But it could be a possible idea to consider in PyPy.

                +
                +
                +
                +
                + + WhiteLynx wrote on 2011-07-06 19:42: +
                +
                +

                I love this idea.

                Just musing on an implementation detail here, but isn't the "lazy" STM implementation's transaction system effectively just an in-memory implementation of copy-on-write semantics? It might be interesting to take a look at other things that have used COW for inspiration. (ZFS and btrfs come to mind) I like the idea that committing a transaction for a given object would just involve changing the object's address in memory to the modified copy.

                Also, I'd be interested to see the read/write lock system get implemented, because it seems like it might be a better choice for programs that only use a couple threads.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-06 21:30: +
                +
                +

                What is wrong with Jython's lock model? Java is a pretty efficient language, no? And there is also no need to acquire locks for objects that you can prove won't cause conflicts...

                +
                +
                +
                +
                + + Skandalfo wrote on 2011-07-06 21:47: +
                +
                +

                @Armin Rigo: If the problem for the RW-lock approach in CPython is just about reference count updates and checks, perhaps those could be done via atomic primitives, as supported on most modern architectures. This is what boost::shared_ptr does, IIRC, for the pointers to be thread-safe by default.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-07-09 13:18: +
                +
                +

                @Skandalfo: right, indeed. I don't know exactly the cost of such atomic operations. Maybe it's fine, but I fear that doing tons of increfs/decrefs all the time (as needed for refcounts in CPython's simple interpreter) has an important cost.

                +
                +
                +
                +
                + + Tuure Laurinolli wrote on 2011-07-11 20:10: +
                +
                +

                @Armin Rigo

                You'd need similar atomic instructions for an STM implementation too - although perhaps not as many? In any case they should be about as cheap as L1 cache writes unless there's contention, but then things are going to be slow in any case if you have contention. Of course you might have false sharing of objects etc. to muddle things up.

                In any case, what sort of semantics would a GIL-free Python have in multi-threaded case, compared to current GIL-infested Python? Each opcode can be assumed to execute atomically?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-17 12:32: +
                +
                +

                One thread have one interpreter.
                Threads interactive like os native thread, use the os interactive method wrap by py.

                I want to embed multi interpreter in my c code!

                Please kill GIL!!!

                +
                +
                +
                +
                + + Raymin wrote on 2011-07-17 12:48: +
                +
                +

                One thread have one interpreter.
                Threads interactive like os native thread, use the os interactive method wrap by py.

                I want to embed multi interpreter in my c code!

                Please kill GIL!!!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-07-24 13:07: +
                +
                +

                @Tuure Laurinolli: yes, but PyPy has no refcounts. I was just discussing the pro/cons of the proposed locking solution on CPython (which is off-topic as far as this original blog post is concerned). I don't even want to think about STM for CPython :-)

                For your second question, from the user's point of view, the semantics we would get with STM are automatically the same as with the GIL, which is why I like the approach.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-29 14:08: +
                +
                +

                Also, what about the performance if the lazy commit method used in the post? Every transaction will create additional memory? Is that really efficient, IMHO this model is aiming a very small number of use cases??

                +
                +
                +
                +
                + + klaussfreire wrote on 2011-10-14 21:26: +
                +
                +

                I can see a use for STM in CPython, too, though. Even though it seems to be not applicable, it need not be true.

                I worked on making the reference counting thread-friendly, in the sense that when you have multiple threads reading a big data structure, CPython's reference counting turns all the reads into writes, which is awful for performance.

                I wrote a patch to pack all writes in the same memory page (ie, reference pools, external reference counting), and was working on a patch for STM reference count updates.

                The thing with STM and reference counting, is that many operations cancel out at the end of the transaction. Like when you just read objects while performing computations, you acquire a reference, work, then release it.

                In the end, STM here would remove the need to write to shared memory.

                In the process of working on that patch, I can tell CPython can be made to use STM techniques. You have thread-local storage at the VM level already, macros handle almost all reference counting operations, it's all abstracted enough that it might be possible.

                For reference counting, the only problem is that STM is way slower for single threaded applications. WAY slower. For multithreaded, it pays off considerably, but CPython guys are very strongly set in favouring single-threaded performance.

                +
                +
                +
                +
                + + halfaleague wrote on 2011-10-28 03:55: +
                +
                +

                How can we fund this?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-10-28 07:31: +
                +
                +

                @halfaleague get in contact. pypy@sfconservancy.org is the right address for non-profit funding inquiries.

                +
                +
                +
                +
                + + Daniel Waterworth wrote on 2011-12-11 07:40: +
                +
                +

                I managed to write a Haskell STM implementation in a single morning. It may not be the most efficient implementation (I've found it to be about half the speed of the GHC implementation in the limited testing I've done), but it's really simple and only uses atomic CAS.

                https://gist.github.com/1454995

                +
                +
                +
                +
                + + shawn wrote on 2011-12-31 20:38: +
                +
                +

                have you looked at all at "Worlds" as a simpler interface to STM?

                https://www.vpri.org/pdf/tr2011001_final_worlds.pdf

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/06/report-back-from-our-survey-2083371215707583264.html b/posts/2011/06/report-back-from-our-survey-2083371215707583264.html new file mode 100644 index 000000000..d014fc587 --- /dev/null +++ b/posts/2011/06/report-back-from-our-survey-2083371215707583264.html @@ -0,0 +1,412 @@ + + + + + +Report back from our survey | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Report back from our survey

                + + + +
                +

                Hi all,

                +

                I'm here to report back the results of our survey. First, we're very pleased to +report that a number of you guys are happily running PyPy in production! Most +(97%) of the respondents using PyPy are using it because it's faster, but a +further 26% (respondents could choose multiple answers) are using it because of +lower memory usage. Of users who aren't using PyPy, the most common reason was +C extensions, followed by "Other".

                +

                From reading the extra comments section there are a few things we've learned:

                +
                  +
                1. Google docs needs a better UI for this stuff
                2. +
                3. A huge number of people want NumPy and SciPy, it was easily the most +requested C extension (25% of respondents said something about NumPy). We've +already blogged on the topic of our plans for NumPy.
                4. +
                5. Having packages in the various OS's repositories would be a big help in +getting users up and running.
                6. +
                +

                A huge thanks to everyone who responded! Finally, if you're using PyPy in +production we'd love to get a testimonial from you, if you're willing to spare +a few minutes to give us a quote or two please get in contact with us via our +mailing list.

                +

                Thanks, +Alex

                +
                +

                Comments

                +
                +
                +
                + + Paul wrote on 2011-06-08 10:18: +
                +
                +

                I'm surprised more people didn't mention Python 3 support as a big breaker. I certainly did.

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-06-08 14:16: +
                +
                +

                "... we're very pleased to report that a number of you guys are happilly running PyPy in production"

                You decided to keep the actual number of users a secret? Why?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-06-08 14:20: +
                +
                +

                @⚛ I think Alex was simply too lazy to count :-) At some point there were 600 respondents and roughly 10% of them used pypy in production, which is pretty good IMO.

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-06-08 18:05: +
                +
                +

                @Maciej Fijalkowski: Ok, thanks for the clarification.

                +
                +
                +
                +
                + + Marko Tasic wrote on 2011-06-08 20:42: +
                +
                +

                I'm using pypy 1.5 with jit in production for highly reliable and responsive distributed and decentralized systems, and I'm happy with it.

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-06-09 07:22: +
                +
                +

                @Marko Tasic: If I may ask a question. You wrote that you are using PyPy for highly reliable systems. I know what you mean, but it seems to me that certain features of Python are in contradiction with high reliability. For example, it is in practice impossible to know at compile-time whether you misspelled a variable or parameter in Python source code. My question would be: why are you using a language which has only rudimentary compile-time error detection to implement a high reliability system?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-06-09 07:58: +
                +
                +

                @⚛ Not even trying to argue with you, comments on this blog is not a proper place to discuss whether Python is good for high-reliability systems. Please take the discussion somewhere else

                Thanks,
                fijal

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-06-09 09:38: +
                +
                +

                @Maciej Fijalkowski: I will of course do what you ask, but I would like you to point me to at least one blog comment that: (1) Is initially saying that Python/PyPy is *good* for task X, and (2) You or somebody else from the PyPy team wrote "Please take the discussion about X somewhere else".

                Thanks

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-06-09 09:41: +
                +
                +

                @⚛ The line might be blurry, but "I'm using PyPy for X" or "I'm not using PyPy for X, because ..." is on topic. While "Python can be used for X" or "Python can't be used for X, because ..." is not on topic. This is a fine line between language implementation (which is PyPy about) and language design (which PyPy is not about, python-dev/python-list/python-ideas mailing lists are about that).

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2011-06-11 01:06: +
                +
                +

                What about a FFI to C or C++? Something like LuaJit's FFI, which is really good.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-06-15 10:10: +
                +
                +

                Lack of support for numpy and scipy are what keep me from using pypy. Am using python for analysis of ultra high throughput DNA sequencing data.

                Would be very curious to see how much performance I could gain by using pypy.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/07/realtime-image-processing-in-python-6985924592886873374.html b/posts/2011/07/realtime-image-processing-in-python-6985924592886873374.html new file mode 100644 index 000000000..93e46d197 --- /dev/null +++ b/posts/2011/07/realtime-image-processing-in-python-6985924592886873374.html @@ -0,0 +1,600 @@ + + + + + +Realtime image processing in Python | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Realtime image processing in Python

                + + + +
                +

                Image processing is notoriously a CPU intensive task. To do it in realtime, +you need to implement your algorithm in a fast language, hence trying to do it +in Python is foolish: Python is clearly not fast enough for this task. Is it? +:-)
                +Actually, it turns out that the PyPy JIT compiler produces code which is fast +enough to do realtime video processing using two simple algorithms implemented +by Håkan Ardö.
                sobel.py implements a classical way of locating edges in images, the +Sobel operator. It is an approximation of the magnitude of the image +gradient. The processing time is spent on two convolutions between the +image and 3x3-kernels.
                magnify.py implements a pixel coordinate transformation that rearranges +the pixels in the image to form a magnifying effect in the center. +It consists of a single loop over the pixels in the output image copying +pixels from the input image.
                +You can try by yourself by downloading the appropriate demo:

                +
                + +
                +To run the demo, you need to have mplayer installed on your system. The +demo has been tested only on linux, it might (or not) work also on other +systems:
                $ pypy pypy-image-demo/sobel.py
                +
                +$ pypy pypy-image-demo/magnify.py
                +
                +By default, the two demos use an example AVI file. To have more fun, you can +use your webcam by passing the appropriate mplayer parameters to the scripts, +e.g.:
                $ pypy demo/sobel.py tv://
                +
                +By default magnify.py uses nearest-neighbor interpolation. By adding the +option -b, bilinear interpolation will be used instead, which gives +smoother result:
                $ pypy demo/magnify.py -b
                +
                +There is only a single implementation of the algorithm in +magnify.py. The two different interpolation methods are implemented by +subclassing the class used to represent images and embedding the +interpolation within the pixel access method. PyPy is able to achieve good +performance with this kind of abstraction because it can inline +the pixel access method and specialize the implementation of the algorithm. +In C++ that kind of pixel access method would be virtual and you'll need to use +templates to get the same effect without incurring runtime overhead.
                + + + + +
                +The video above shows PyPy and CPython running sobel.py side by +side (PyPy taking input from the webcam, CPython from the test +file). Alternatively, to get a feeling for how much faster PyPy is than +CPython, try to run the demo with the latter. These are the average fps +(frames per second) that I get on my machine (Ubuntu 64 bit, Intel i7 920, 4GB +RAM) when processing the default test.avi video and using the prebuilt +PyPy binary found in the full tarball linked above. For sobel.py:
                +
                  +
                • PyPy: ~47.23 fps
                • +
                • CPython: ~0.08 fps
                • +
                +
                +For magnify.py:
                +
                  +
                • PyPy: ~26.92 fps
                • +
                • CPython: ~1.78 fps
                • +
                +
                +This means that on sobel.py, PyPy is 590 times faster. On +magnify.py the difference is much less evident and the speedup is "only" +15x.
                +It must be noted that this is an extreme example of what PyPy can do. In +particular, you cannot expect (yet :-)) PyPy to be fast enough to run an +arbitrary video processing algorithm in real time, but the demo still proves +that PyPy has the potential to get there. +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-07-07 17:47: +
                +
                +

                Pypy is awesome!

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-07 18:19: +
                +
                +

                I have a n00b problem: On Mac OS X 10.5.8, the precompiled pypy binary crashes with this message:
                dyld: Library not loaded: /usr/lib/libssl.0.9.8.dylib

                What's up with this? Thanks, and sorry for being offtopic.

                +
                +
                +
                +
                + + metapundit.net wrote on 2011-07-07 19:17: +
                +
                +

                I saw this demo recently when Dan Roberts presented at Baypiggies. We broke into spontaneous applause when the pypy runtime ran at a watchable speed after cpython ran at less than 1 frame/second. Very impressive!

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-07 21:07: +
                +
                +

                Anonymous, can you read?

                "prebuilt PyPy binaries for linux 32 and 64 bits"
                "The demo has been tested only on linux, it might (or not) work also on other systems"

                Mac OS X is not Linux.

                +
                +
                +
                +
                + + schmichael wrote on 2011-07-07 21:23: +
                +
                +

                Perhaps add a comment to sobel.py explaining what "pypyjit.set_param(trace_limit=200000)" does?

                +
                +
                +
                +
                + + Luis wrote on 2011-07-07 22:27: +
                +
                +

                The only chamge I'd like to see in this project is its name... Trying to gather news from twitter for example, makes me search amongst thousands of comments in japanese (pypy means "boobies" in japanese), other incomprehensible comments in malay and hundreds of music fans of Look-Ka PYPY (WTF??)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-07 22:58: +
                +
                +

                Other Anonymous: Yes, I can read. I should have given a bit more context, but I was offtopic anyway. My goal was not running the demo, my goal was running pypy. I used the OS X binary from pypy.org. For those who are really good at reading, this was probably clear from the fact that my binary only crashed at library loading time.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-07-07 23:03: +
                +
                +

                @Anonymous: most probably, the prebuilt PyPy for Mac Os X was built on a system different (older?) than yours.

                For a quick workaround, you can try to do "ln -s /usr/lib/libssl-XXX.dylib /usr/lib/libssl.0.9.8.dylib". This should at least make it working, but of course it might break in case you actually use libssl.

                The proper fix is to recompile PyPy by yourself.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-07-07 23:08: +
                +
                +

                @schmichael

                to avoid the potential problem of infinite tracing, the JIT bails out if it traces "too much", depending on the trace_limit.
                In this case, the default trace_limit is not enough to fully optimize the whole algorithm, hence we need to help the JIT by telling it to trace a bit more than usual.

                I agree that having to mess up with the internal parameters of the JIT is suboptimal. I plan to address this issue in the next weeks.

                +
                +
                +
                +
                + + relet wrote on 2011-07-07 23:43: +
                +
                +

                How does it perform against python-opencv?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-07 23:47: +
                +
                +

                Antonio: Thanks for the quick reply. Unfortunately pypy can't be misled with the symlink hack: "Reason: Incompatible library version: pypy requires version 0.9.8 or later, but libssl.0.9.8.dylib provides version 0.9.7"

                It seem like the prebuilt was created on a 10.6, and it does not work on vanilla 10.5 systems. Not a big deal, but is good to know.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-08 04:44: +
                +
                +

                Thanks for posting this. pypy is great. I'm trying to figure out how to write modules in RPython. I was sad that I missed the Baypiggies presentation.

                +
                +
                +
                +
                + + René Dudfield wrote on 2011-07-08 07:35: +
                +
                +

                Hello,

                it's lovely that pypy can do this. This result is amazing, wonderful, and is very kittens. pypy is fast at running python code (*happy dance*).

                But.

                It also makes kittens cry when you compare to CPython in such a way.

                The reality is that CPython users would do this using a library like numpy, opencv, pygame, scipy, pyopengl, freej (the list of real time video processing python libraries is very large, so I won't list them all here).

                Of course python can do this task well, and has for more than 10 years.

                This code does not take advantage of vectorization through efficient SIMD, multiple cores or graphics hardware, and isn't careful with reusing memory - so is not within an order of magnitude of the speed of CPython code with libraries doing real time video processing.

                Anyone within the field would ask about using these features.

                Another question they would ask is about pauses. How does the JIT affect pauses in animation? What are the rules for when the JIT warms up, and how can you tell when the code will start running fast? How does the GC affect pauses? If there is a way to turn off the GC, or reuse memory in some way such that the GC won't cause the program to fail(Remember that in realtime a pause is a program fail). Does the GC pool memory of similar size objects automatically? Does the GC work well with 256MB-1GB-16GB sized objects? In a 16GB system, can you use 15GB of objects, and then delete those objects to then use another 15GB of different objects? Or will the program swap, or fragment memory causing pauses?

                Please don't make kittens cry. Be realistic with CPython comparisons.


                At the moment the python implementation is not as elegant as a vector style implementation. A numpy/matlab/CUDA/OpenCL approach looks really nice for this type of code. One speed up might be to reuse memory, or act in place where possible. For example, not copying the image... unless the GC magically takes care of that for you.

                +
                +
                +
                +
                + + Jacob Hallén wrote on 2011-07-08 08:21: +
                +
                +

                @illume:More or less everyone knows that you can speed up your code by writing or using an extension library. Unfortunately this introduces a dependency on the library (for instance libssl mentioned in the comment thread) and it usually increases the complexity of your code.

                Using PyPy you can solve computationally intensive problems in plain Python. Writing in Python saves development time. This is what the comparison is all about.

                +
                +
                +
                +
                + + René Dudfield wrote on 2011-07-08 12:23: +
                +
                +

                hi @jacob: below is code which runs either multi core, vectorised SIMD, and on a GPU if you like. You'll notice that it is way shorter and more elegant than the 'pure python' code.

                def sobelEdgeDetect(im=DImage, p=Position):
                ....wX = outerproduct([1,2,1],[-1,0,1])
                ....wY = transpose(wX)

                ....Gx = convolve(wX,im,p)
                ....Gy = convolve(wY,im,p)

                ....return sqrt(Gx**2 + Gy**2)

                If pypy is 5x slower than C, and SIMD is 5x faster than C... and using multiple cores is 8x faster than a single core you can see this python code is (5 * 5 * 8) 200x faster than the pypy code. This is just comparing CPU based code. Obviously GPU code for real time image processing is very fast compared to CPU based code.

                Things like numpy, pyopengl etc come packaged with various OSes - but chosing those dependencies compared to depending on pypy is a separate issue I guess (but many cpython packaged libraries are packaged for more platforms than pypy).

                Of course using tested, and debugged existing code written in python will save you development time: for example using sobel written with the scipy library:
                https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.filters.sobel.html

                The fact is CPython is fast enough, more elegant, and will save you time for realtime image processing - unless you ignore the reality that people use CPython libraries for these tasks.

                Finally the given code does not prove that the frames are all processed in realtime. They give an average time over all of the frames. Realtime video requires that you meet your target speed for every frame. It would need to be extended to measure each frame to make sure that each frame is within the required time budget.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-07-08 12:31: +
                +
                +

                @illume: I think you completely missed the point of the blog post. This is not about "you should use pypy to do video processing", it's about "pypy runs pure python code very fast".

                +
                +
                +
                +
                + + René Dudfield wrote on 2011-07-08 12:58: +
                +
                +

                @Antonio Cuni, I'm saying the post reads like cpython can not do "realtime image processing in python" and that pypy can.

                +
                +
                +
                +
                + + tismer wrote on 2011-07-08 14:21: +
                +
                +

                @illume:
                This example shows pure python code and compares its execution time in cpython and pypy. Nothing else. Writing graphics code in pure python that runs not dreadfully slow was to my knowledge never before shown.
                If enough people understand the potential of this technique and put their time into it, we will hopefully come closer to your (5 * 5 * 8) acceleration in pypy, too.
                I will for sure work on this.

                +
                +
                +
                +
                + + Eventh wrote on 2011-07-08 14:41: +
                +
                +

                SIMD instructions and multi core support is something PyPy has potential to support, given time and funding.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-07-08 21:20: +
                +
                +

                The typical optimization path here would be implementing the necessary numpy array operations for the algorithms described. I wonder how a proper numpy implementation would compare.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-07-09 13:38: +
                +
                +

                I think you are still missing the point of the post. It was not "use pure Python to write your video processing algos". That's of course nonsense, given the amount and quality of existing C extension modules to do that.

                The point is that when you want to experiment with writing a new algorithm of any kind, it is now possible to do it in pure Python instead of, say, C code. If later your project needs to move past the experimentation phase, you will have to decide if you want to keep that Python code, rewrite it in C, or (if applicable) use SIMD instructions from Python or from C, or whatever.

                The real point of this demo is to show that PyPy makes Python fast enough as an early experimentation platform for almost any kind of algorithm. If you can write in Python instead of in C, you'll save 50% of your time (random estimate); and then for the 5% of projects that go past the experimentation phase and where Python is not enough (other random estimate), spend more time learning other techniques and using them. The result is still in your favor, and it's only going to be more so as PyPy continues to improve.

                +
                +
                +
                +
                + + Yaacov wrote on 2011-10-18 23:31: +
                +
                +

                I was hoping to experiment with this amazing demo on my Windows-based computers. Any advice for how I would start making the required changes?

                Jacob

                +
                +
                +
                +
                + + Anonymous wrote on 2012-07-24 13:38: +
                +
                +

                dead links

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-07-24 13:41: +
                +
                +

                Unfortunately the server died :( I'm not sure where exactly are packaged demos, but they can be run from:

                https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/talk/iwtc11/benchmarks/image

                +
                +
                +
                +
                + + Unknown wrote on 2012-10-04 22:08: +
                +
                +
                The python code for this seems to be now here:
                https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/talk/dls2012/demo +
                +
                +
                +
                + + Unknown wrote on 2012-10-04 22:09: +
                +
                +

                The scripts can be found here:

                https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/153804ce4fc3/talk/dls2012/demo

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/08/pypy-16-kickass-panda-559424594592497545.html b/posts/2011/08/pypy-16-kickass-panda-559424594592497545.html new file mode 100644 index 000000000..89a3015bd --- /dev/null +++ b/posts/2011/08/pypy-16-kickass-panda-559424594592497545.html @@ -0,0 +1,540 @@ + + + + + +PyPy 1.6 - kickass panda | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.6 - kickass panda

                + + + +
                +

                We're pleased to announce the 1.6 release of PyPy. This release brings a lot +of bugfixes and performance improvements over 1.5, and improves support for +Windows 32bit and OS X 64bit. This version fully implements Python 2.7.1 and +has beta level support for loading CPython C extensions. You can download it +here:

                +
                +https://pypy.org/download.html +
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.1. It's fast (pypy 1.6 and cpython 2.6.2 performance comparison) +due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64 or Mac OS X. Windows 32 +is beta (it roughly works but a lot of small issues have not been fixed so +far). Windows 64 is not yet supported.

                +

                The main topics of this release are speed and stability: on average on +our benchmark suite, PyPy 1.6 is between 20% and 30% faster than PyPy 1.5, +which was already much faster than CPython on our set of benchmarks.

                +

                The speed improvements have been made possible by optimizing many of the +layers which compose PyPy. In particular, we improved: the Garbage Collector, +the JIT warmup time, the optimizations performed by the JIT, the quality of +the generated machine code and the implementation of our Python interpreter.

                +
                +
                +

                Highlights

                +
                  +
                • Numerous performance improvements, overall giving considerable speedups:
                    +
                  • better GC behavior when dealing with very large objects and arrays
                  • +
                  • +fast ctypes: now calls to ctypes functions are seen and optimized +by the JIT, and they are up to 60 times faster than PyPy 1.5 and 10 times +faster than CPython
                  • +
                  • improved generators(1): simple generators now are inlined into the caller +loop, making performance up to 3.5 times faster than PyPy 1.5.
                  • +
                  • improved generators(2): thanks to other optimizations, even generators +that are not inlined are between 10% and 20% faster than PyPy 1.5.
                  • +
                  • faster warmup time for the JIT
                  • +
                  • JIT support for single floats (e.g., for array('f'))
                  • +
                  • optimized dictionaries: the internal representation of dictionaries is now +dynamically selected depending on the type of stored objects, resulting in +faster code and smaller memory footprint. For example, dictionaries whose +keys are all strings, or all integers. Other dictionaries are also smaller +due to bugfixes.
                  • +
                  +
                • +
                • JitViewer: this is the first official release which includes the JitViewer, +a web-based tool which helps you to see which parts of your Python code have +been compiled by the JIT, down to the assembler. The jitviewer 0.1 has +already been released and works well with PyPy 1.6.
                • +
                • The CPython extension module API has been improved and now supports many +more extensions. For information on which ones are supported, please refer to +our compatibility wiki.
                • +
                • Multibyte encoding support: this was one of the last areas in which we were +still behind CPython, but now we fully support them.
                • +
                • Preliminary support for NumPy: this release includes a preview of a very +fast NumPy module integrated with the PyPy JIT. Unfortunately, this does +not mean that you can expect to take an existing NumPy program and run it on +PyPy, because the module is still unfinished and supports only some of the +numpy API. However, barring some details, what works should be +blazingly fast :-)
                • +
                • Bugfixes: since the 1.5 release we fixed 53 bugs in our bug tracker, not +counting the numerous bugs that were found and reported through other +channels than the bug tracker.
                • +
                +

                Cheers,

                +

                Hakan Ardo, Carl Friedrich Bolz, Laura Creighton, Antonio Cuni, +Maciej Fijalkowski, Amaury Forgeot d'Arc, Alex Gaynor, +Armin Rigo and the PyPy team

                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-08-18 18:59: +
                +
                +

                Finally :) I'm really looking forward to test this code out :)

                +
                +
                +
                +
                + + René Dudfield wrote on 2011-08-18 19:01: +
                +
                +

                Congrats team pypy!

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-18 21:15: +
                +
                +

                I look forward to support Python 3

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-18 21:58: +
                +
                +

                "and has beta level support for loading CPython C extensions"

                does this mean that the regular Numpy and Scipy can be used with this.

                +
                +
                +
                +
                + + almir karic wrote on 2011-08-18 22:54: +
                +
                +

                no.

                "Unfortunately, this does not mean that you can expect to take an existing NumPy program and run it on PyPy"

                thanks for the release pypy team!

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-19 03:37: +
                +
                +

                Impressive as always. Thanks for releasing such great software.
                Keep up the good work.

                Anghel

                +
                +
                +
                +
                + + profu wrote on 2011-08-19 05:13: +
                +
                +

                Where is the windows version?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-19 07:36: +
                +
                +

                I did some benchmark with some simple parameterized SELECT statements, and found that pg8000 on pypy 1.6 is more than one time slower than pg8000 on python 2.7.1, while the later is already more than one time slower than psycopg2 on python 2.7.1.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-19 07:55: +
                +
                +

                Still can't build and run python-ldap extension... :(
                That's a deal-breaker for me.

                +
                +
                +
                +
                + + Maciej Szumocki wrote on 2011-08-19 08:34: +
                +
                +

                What kind of problems prevent releasing a Windows 64bit version?

                +
                +
                +
                +
                + + Lenz wrote on 2011-08-19 12:59: +
                +
                +

                Congrats !!! Realy amazing job !!

                By the way, where can I find more informations about the alredy implemented numpy functions ?

                Thanks.

                +
                +
                +
                +
                + + jensck wrote on 2011-08-19 18:28: +
                +
                +

                Amazing - PyPy just keeps making leaps and bounds forward for compat. and processing performance. I don't know how you guys keep up such pace, but I dig it!

                How is typical memory usage these days? It's been a while since anything was reported on its resource usage vs. CPython. Maybe such a benchmark could be added to the speed site?

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-08-19 21:24: +
                +
                +

                PyPy 1.5: 68 seconds
                PyPy 1.6: 65 seconds
                Python 3.2 (Intel C compiler): 36 seconds

                Extrapolation to the future:
                PyPy 1.17: 35 seconds ?

                Jokes aside, PyPy's compatibility with CPython is good.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-20 01:05: +
                +
                +

                I'm still most looking forward to the day your jit makes some .pyj files. Initialization time for the jit is a bit high, especially if you use pypy integrated into other scripts where the init time might impact performance, numpy had no dtypes, and laked almost every function, but atleast it's a step in the right direction :)

                Memory usage for one of my own test apps (building a one dimensional dict with int keys, and object (sometimes by reference from a second dict) resulted in 76MB to python2.7 and 108MB to pypy 1.6. So memory usage is still a bit behind tho (the pypy runtime was better with around 35% tho).

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-21 14:54: +
                +
                +

                Is there a 32-bit OSX version somewhere? 64-bit seems to eat up memory in my tests...
                Impressive stuff, though :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-22 10:56: +
                +
                +

                But man! What's wrong with Windows?

                Will the windows version will be dropped?

                Version 1.5 does not fully work on Windows and now you release 1.6 you does not provide a windows version...

                So, I really want to know if it will be future support for windows.

                This will help to decide if pypy will be an option or one just have to find other options to speed up the programs.

                Please, clarify this.

                Bests,

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-22 18:25: +
                +
                +

                Pypy does support Windows 32bit, with a couple of bugs, the windows support have been improved from 1.5 to 1.6. Perhaps it will be fully working by 1.7.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-23 09:45: +
                +
                +

                Ok, but where to download PYPY for Win32 ?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-23 12:48: +
                +
                +

                I believe you got to compile it yourself.

                +
                +
                +
                +
                + + vak wrote on 2011-08-24 11:16: +
                +
                +

                just impressive. If you guys could resurrect the numpy operation like:

                boolean_array = arr > value

                it would be just a dream. This important operation returns not an array, but a value now.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/08/pypy-is-faster-than-c-again-string-6756589731691762127.html b/posts/2011/08/pypy-is-faster-than-c-again-string-6756589731691762127.html new file mode 100644 index 000000000..2359fee29 --- /dev/null +++ b/posts/2011/08/pypy-is-faster-than-c-again-string-6756589731691762127.html @@ -0,0 +1,867 @@ + + + + + +PyPy is faster than C, again: string formatting | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy is faster than C, again: string formatting

                + + + +
                +

                String formatting is probably something you do just about every day in Python, +and never think about. It's so easy, just "%d %d" % (i, i) and you're +done. No thinking about how to size your result buffer, whether your output +has an appropriate NULL byte at the end, or any other details. A C +equivalent might be:

                +
                +char x[44];
                +sprintf(x, "%d %d", i, i);
                +
                +

                Note that we had to stop for a second and consider how big numbers might get +and overestimate the size (44 = (length of the biggest number on 64bit (20) + +1 for the sign) * 2 + 1 (for the space) + 1 (NUL byte)); it took the authors of +this post, fijal and alex, 3 tries to get the math right on this :-)

                +

                This is fine, except you can't even return x from this function, a more +fair comparison might be:

                +
                +char *x = malloc(44 * sizeof(char));
                +sprintf(x, "%d %d", i, i);
                +
                +

                x is slightly overallocated in some situations, but that's fine.

                +

                But we're not here to just discuss the implementation of string +formatting, we're here to discuss how blazing fast PyPy is at it, with +the new unroll-if-alt branch. Given the Python code:

                +
                +def main():
                +    for i in xrange(10000000):
                +        "%d %d" % (i, i)
                +
                +main()
                +
                +

                and the C code:

                +
                +#include <stdio.h>
                +#include <stdlib.h>
                +
                +
                +int main() {
                +    int i = 0;
                +    char x[44];
                +    for (i = 0; i < 10000000; i++) {
                +        sprintf(x, "%d %d", i, i);
                +    }
                +}
                +
                +

                Run under PyPy, at the head of the unroll-if-alt branch, and +compiled with GCC 4.5.2 at -O4 (other optimization levels were tested, +this produced the best performance). It took 0.85 seconds to +execute under PyPy, and 1.63 seconds with the compiled binary. We +think this demonstrates the incredible potential of dynamic +compilation, GCC is unable to inline or unroll the sprintf call, +because it sits inside of libc.

                +

                Benchmarking the C code:

                +
                +#include <stdio.h>
                +#include <stdlib.h>
                +
                +
                +int main() {
                +    int i = 0;
                +    for (i = 0; i < 10000000; i++) {
                +        char *x = malloc(44 * sizeof(char));
                +        sprintf(x, "%d %d", i, i);
                +        free(x);
                +    }
                +}
                +
                +

                Which, as discussed above, is more comparable to the Python, gives a +result of 1.96 seconds.

                +

                Summary of performance:

                + +++++++ + + + + + + + + + + + + + + + + + + + + + + + +
                PlatformGCC (stack)GCC (malloc)CPythonPyPy (unroll-if-alt)
                Time1.63s1.96s10.2s0.85s
                relative to C1x0.83x0.16x1.9x
                +

                Overall PyPy is almost 2x faster. This is clearly a win for dynamic +compilation over static - the sprintf function lives in libc and so +cannot be specialized over the constant string, which has to be parsed +every time it's executed. In the case of PyPy, we specialize +the assembler if we detect the left hand string of the modulo operator +to be constant.

                +

                Cheers,
                +alex & fijal

                +
                +

                Comments

                +
                +
                +
                + + salmon wrote on 2011-08-02 19:23: +
                +
                +

                What about '{0}'.format('pypy') ?
                Is this also faster?

                +
                +
                +
                +
                + + JoeHillen wrote on 2011-08-02 19:59: +
                +
                +

                Where can we see this "unroll-if-alt" branch?

                +
                +
                +
                +
                + + Greg Haines wrote on 2011-08-02 20:13: +
                +
                +

                Are you sure the compiler isn't optimizing away the actual execution since you're not doing anything with the result?

                +
                +
                +
                +
                + + Thomas Schilling wrote on 2011-08-02 20:18: +
                +
                +

                How are those two loops equivalent? You're not printing anything in the Python loop. I/O buffering etc. can eat quite a bit of runtime. It would also be nice to see what the particular improvements in this "unroll-if-alt" branch are.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-02 20:19: +
                +
                +

                How about doing something like that:
                ....
                char p[5] = "%d %d"
                //and then
                sprintf(x, p, i,i);
                ....

                ?

                +
                +
                +
                +
                + + Andrew Pendleton wrote on 2011-08-02 20:25: +
                +
                +

                @Thomas the C one doesn't print anything, either; sprintf just returns a string. printf is the one that prints.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-02 20:26: +
                +
                +

                @Thomas the C one doesn't print anything either, so it sounds pretty equivalent to me.

                +
                +
                +
                +
                + + Johan Tibell wrote on 2011-08-02 20:28: +
                +
                +

                This doesn't really have anything to do with dynamic compilation, but cross module optimization. There are static compilers, such as the Glasgow Haskell Compiler, that do this. If the compilation strategy depended on runtime data (e.g. measure hot spots), it would be dynamic compilation.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-02 20:56: +
                +
                +

                *yawn* If you want to see ridiculously fast string formatting, look at the Boost's Spirit library (specifically Karma). Small test case, but point well illustrated: https://www.boost.org/doc/libs/1_47_0/libs/spirit/doc/html/spirit/karma/performance_measurements/numeric_performance/int_performance.html Or look at Spirit's input parser for even integers: https://alexott.blogspot.com/2010/01/boostspirit2-vs-atoi.html

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-08-02 20:57: +
                +
                +

                @JoeHillen: the unroll-if-alt branch is inside the main pypy repo on bitbucket (together with all the other branches).

                @Greg: yes, we checked the generated code, it's not optimized away.

                @anonymous: why it should be any faster? String literals in C are constants, it's not that you need to create a new one at each iteration

                @Johan: note that the PyPy approach can generate code optimized for a formatting string loaded from a disk, or computed at runtime. No static compiler could do that.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-02 21:10: +
                +
                +

                What machine are you on that an int is 64 bits? Hardly anybody uses ILP64 or SILP64 data models ( https://en.wikipedia.org/wiki/64-bit#Specific_C-language_data_models ). Maybe a fourth try is in order? :P

                +
                +
                +
                +
                + + Johan Tibell wrote on 2011-08-02 21:14: +
                +
                +

                Antonio, that is indeed neat.

                +
                +
                +
                +
                + + Unknown wrote on 2011-08-02 22:04: +
                +
                +

                So when are you going to teach PyPy that the result of an unused string formatting can be deleted, and then delete the loop? ;)

                I'm not sure how you'd get there from a tracing JIT, though. WIth Python, you still have to call all the formatting and stringification methods because they might have side effects. You only get to know that the entire operation is a no-op after you've inlined everything, but by then it will be at a low enough representation that it's hard to tell.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-02 22:04: +
                +
                +

                sizeof(char)==1. By definition. Argh.

                PS: negative karma for lying headline

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-02 22:15: +
                +
                +

                Check that you're not spending all your time in malloc/free(). Also use the return value from a failed snprintf(), plus 1, to size your output buffer.

                +
                +
                +
                +
                + + Unknown wrote on 2011-08-02 22:21: +
                +
                +

                @Anonymous 2: Even if all the time were spent in malloc/free, PyPy has to dynamically allocate the string data structure, as well as provide a buffer to fill with the characters from the integers, since it has no way of knowing how much space will be needed (could be a custom integer class).

                However, you're right that malloc and free are slow and a good gc system would have a faster allocator.

                +
                +
                +
                +
                + + vsergeev wrote on 2011-08-02 22:24: +
                +
                +

                a quick tip to minimize the math in determining your sprintf buffer size for your experiment:
                #include < stdint.h >
                len = snprintf(NULL, 0, "%d %d", INT32_MIN, INT32_MIN);
                will give you the string length required (not including null terminating byte) to fit the formatted string.

                Similarly, %lld and INT64_MIN will do the trick (on the right platform) for 64-bit signed integers.

                (not that I advocate fixed sized buffers for formatted strings based on min/max digit lengths for any real application)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-02 22:33: +
                +
                +

                You wrote:
                and compiled with GCC 4.5.2 at -O4

                Please read the manual of GCC. There you will see that every optimization level above 3 is handled as it would be 3. '-O4' is nothing else than '-O3'.

                It is also known that optimizing with -O3 may lead to several problems at runtime (e.g. memory delays for short programs or memory allocation failure in larger programs).
                That's why the recommended optimization level is '2' (or 's' for embedded systems) and not '3'.

                Did you test with a realtime kernel?
                How about the scheduler?

                Maybe you should double check your test environment.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-02 22:47: +
                +
                +

                For all you complaining about test eviorment. Pypy would still have to do that internaly. If they should be truely comparable, then you need to also include snprintf inside the loop, making C even slower. Also, I doubt you will get 200% performance boost from scheduler change.

                unroll-if-alt will be included in 1.6 right? Also when will 1.6 be released?

                +
                +
                +
                +
                + + Thomas Schilling wrote on 2011-08-02 22:50: +
                +
                +

                @Andrew, @hobbs: Oh, sorry I overlooked the "s" in "sprintf". It would still be nice compare the generated machine code to explain the differences.

                Whenever, someone claims language L1 implementation A is faster than language L2 implementation B there are obvious questions about (1) fairness of comparison, (2) what is being measured. In this case PyPy is specializing on the format string interpreter (does that require library annotations?) which a C compiler could do in principle here (but probably doesn't.) So, I'm always a bit suspicious when I see these kinds of comparisons.

                @Johan: GHC's cross-module optimization often comes at the expense of binary compatibility. A JIT has a big advantage here.

                +
                +
                +
                +
                + + René Dudfield wrote on 2011-08-02 23:33: +
                +
                +

                The python faster than C day has come! Congrats.

                ps. Did you try it with (Link Time Optimization)LTO? that is with gcc the option: -flto ? Also, are you using PGO with gcc?

                +
                +
                +
                +
                + + nekto0n wrote on 2011-08-02 23:40: +
                +
                +

                @salmon According to this commit new style formatting is supported too.

                Someone correct me if I'm wrong.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-02 23:49: +
                +
                +

                I think that computation is not correct yet. IIRC, you only get 20 digits in an unsigned 64-bit quantity.

                Worse, (again IIRC) sprintf is locale dependent. It may insert thousands separators.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-03 00:31: +
                +
                +

                This is not a good performance test because all printf function have high constant complexity, without looking at format string, check it

                +
                +
                +
                +
                + + Strohan wrote on 2011-08-03 01:54: +
                +
                +

                wouldn't it be better to run your test with a more modern c++ library like cstring?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-03 03:07: +
                +
                +

                If 1.9x is "almost 2x faster", then what is "1x faster"?

                +
                +
                +
                +
                + + Poposhka wrote on 2011-08-03 05:09: +
                +
                +

                post the Assembly code, map files and call graph or it didnt happen!!!!!!!!

                +
                +
                +
                +
                + + Reinis I. wrote on 2011-08-03 07:13: +
                +
                +

                "one time faster" is bad English.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-03 08:38: +
                +
                +

                What performance impact does the malloc/free produce in the C code? AFAIK Python allocates memory in larger chunks from the operating system. Probably Python does not have to call malloc after initialization after it allocated the first chunk.

                AFAIK each malloc/free crosses the boundaries between user-mode/kernel-mode.

                So, IMHO you should compare the numbers of a C program which
                does not allocate dynamic memory more than once and uses an internal memory management system.

                These numbers would be interesting.

                Have fun

                +
                +
                +
                +
                + + Damian Cugley wrote on 2011-08-03 08:44: +
                +
                +

                The point here is not that the Python implementation of formatting is better than the C standard library, but that dynamic optimisation can make a big difference. The first time the formatting operator is called its format string is parsed and assembly code for assembling the output generated. The next 999999 times that assembly code is used without doing the parsing step. Even if sprintf were defined locally, a static compiler can’t optimise away the parsing step, so that work is done redundantly every time around the loop.

                In a language like Haskell something similar happens. A string formatting function in the style of sprintf would take a format string as a parameter and return a new function that formats its arguments according to that string. The new function corresponds to the specialized assembly code generated by PyPy’s JIT. I think if you wanted to give the static compiler the opportunity to do optimizations that PyPy does at runtime you would need to use a custom type rather than a string as the formatting spec. (NB my knowledge of functional-language implementation is 20 years out of date so take the above with a pinch of salt.)

                +
                +
                +
                +
                + + Dave Kirby wrote on 2011-08-03 12:50: +
                +
                +

                @Anonymous:

                The C code shown does not do any malloc/free. The sprintf function formats the string into the char array x, which is allocated on the stack. It is highly unlikely that the sprintf function itself mallocs any memory.

                +
                +
                +
                +
                + + Paul Jaros wrote on 2011-08-03 15:45: +
                +
                +

                I'm following the progress on pypy since many years and the potential is and has always been here. And boy, pypy has come a looong way.

                You are my favorite open-source project and I am excited to see what will happen next. Go pypy-team, go!

                +
                +
                +
                +
                + + Stepan Koltsov wrote on 2011-08-03 18:25: +
                +
                +

                PyPy does nothing 1.9 times faster than C.

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-08-03 19:51: +
                +
                +

                You wrote: "We think this demonstrates the incredible potential of dynamic compilation, ..."

                I disagree. You tested a microbenchmark. Claims about compiler or language X winning over Y should be made after observing patterns in real programs. That is: execute or analyse real C programs which are making use of 'sprintf', record their use of 'sprintf', create a statistics out of the recorded data and then finally use the statistical distributions to create Python programs with a similar distribution of calls to '%'.

                Trivial microbenchmarks can be deceiving.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-04 01:11: +
                +
                +

                @Dave Kirby:

                There are two C programs there. One on the stack, one with a malloc / free in the loop.

                Which one is used for the faster claim?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-08-04 08:47: +
                +
                +

                @Anonymous: this branch, unroll-if-alt, will not be included in the release 1.6, which we're doing right now (it should be out any day now). It will only be included in the next release, which we hope to do soonish. It will also be in the nightly builds as soon as it is merged.

                +
                +
                +
                +
                + + Connelly Barnes wrote on 2011-08-04 20:50: +
                +
                +

                Is string/IO performance in general being worked on in Pypy? Last I looked Pypy showed it was faster than CPython in many cases on its benchmarks page, but for many string/IO intensive tasks I tried Pypy v1.5 on, it was slower.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-08-05 07:06: +
                +
                +

                @Connelly yes, for some definition of working (being thought about). that's one reason why twisted_tcp is slower than other twisted benchmarks. We however welcome simple benchmarks as bugs on the issue tracker

                +
                +
                +
                +
                + + tt wrote on 2011-08-05 10:05: +
                +
                +

                This is a horribly flawed benchmark which illustrates absolutely nothing. First of all, an optimizing JIT should be (easily) able to detect that your inner loop has no side effects and optimize it away. Secondly, with code like that you should expect all kinds of weirds transformations by the compiler, hence - you can't be really sure what you are comparing here. As many here have pointed out, you should compare the output assembly.

                Anyway, if you really want to do a benchmark like that, do it the right way. Make the loop grow a string by continuous appending and write the string to the file in the end (time the loop only). This way you will get accurate results which really compare the performance of two compilers performing the same task.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-05 11:28: +
                +
                +

                try in nodejs:

                var t = (new Date()).getTime();

                function main() {
                var x;
                for (var i = 0; i < 10000000; i++)
                x = i + " " + i;
                return x;
                }
                x = main();

                t = (new Date()).getTime() - t;
                console.log(x + ", " + t);

                +
                +
                +
                +
                + + tt wrote on 2011-08-05 16:15: +
                +
                +

                I have now put a small, slightly more realistic benchmark. I used following code.

                Python

                def main():
                x = ""
                for i in xrange(50000):
                x = "%s %d" % (x, i)
                return x

                x = main()

                f = open("log.txt", "w")
                f.write(x)
                f.close()

                C
                #include
                #include
                #include


                int main() {
                int i;
                char *x = malloc(0);
                FILE *file;

                *x = 0x00;

                for (i = 0; i < 50000; i++) {
                char *nx = malloc(strlen(x) + 16); // +16 bytes to be on the safe side

                sprintf(nx, "%s %d", x, i);
                free(x);
                x = nx;
                }

                file = fopen("log1.txt","w");
                fprintf(file, "%s", x);
                fclose(file);
                }

                JavaScript (NodeJS)

                var fs = require('fs');

                String.prototype.format = function() {
                var formatted = this;
                for (var i = 0; i < arguments.length; i++) {
                var regexp = new RegExp('\\{'+i+'\\}', 'gi');
                formatted = formatted.replace(regexp, arguments[i]);
                }
                return formatted;
                };


                function main() {
                var x = "";
                for (var i = 0; i < 50000; i++)
                x = "{0} {1}".format(x, i);
                return(x)
                }

                x = main();
                fs.writeFile('log.txt', x)


                Note for JS example: I did not want to use the stuff like i + " " + i because it bypasses the format function call. Obviously, using the + operator the nodejs example would be much faster (but pypy probably as well).

                Also, I used PyPy 1.5 as I did not find any precompiled PyPy 1.6 for OS X.

                Results:

                PyPy: real 0m13.307s
                NodeJS: real 0m44.350s
                C: real 0m1.812s

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-08-05 18:32: +
                +
                +

                @tt: This is a very inefficient C/C++ implementation of the idea "make a loop grow a string by continuous appending and write the string to the file in the end". In addition, it appears to be an uncommon piece of C/C++ code.

                +
                +
                +
                +
                + + tt wrote on 2011-08-05 20:03: +
                +
                +

                Well, I never said anything about writing super efficient C code. Anyway, I don't see how you want to implement string formatting more efficiently - if we talk about general usage scenario. You can't really reuse the old string buffer, you basically have to allocate new one each time the string grows. Or pre-allocate a larger string buffer and do some substring copies (which will result in a much more complicated code). Anyway, the malloc() on OS X is very fast.

                My point is: even this C code, which you call inefficient is around 6 times faster then pypy 1.5

                +
                +
                +
                +
                + + Antiplutocrat wrote on 2011-08-05 21:08: +
                +
                +

                @tt except one of the main points of the post was that they had implemented a *new* feature (unroll-if-alt, I believe) that sped things up a bunch.

                I'm not sure how much any comparison that *doesn't* use this new feature is worth ...

                So many haters ! ;)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-06 00:59: +
                +
                +

                The compare is good because both use standard langauge fetatures to do the same thing, using a third part lib is not the same, then I have to code the same implant in RPython and people would still complain do to RPython often being faster then C regardless.

                Python could have detected that the loop is not doing anything, but give that one value had a __str__ call it could've broken some code. Anyway, C compiler could also see that you didn't do anything with the value and optimalize it the same way.

                +
                +
                +
                +
                + + tt wrote on 2011-08-06 11:05: +
                +
                +

                @Antiplutocrat:
                Honestly, I expected a bit more objectivity from posters here. I am really disappointed that you compare me to "haters" (whoever that may be).

                Your point about unroll-if-alt is absolutely valid and I myself have explicitly stated that I did not use that feature. At no point I have refuted that the original blog post was wrong - it is still very well possible that PyPy 1.6 is faster then C in this usage scenario. The main goal of my post was to make clear that the original benchmarks were flawed, as they grant the compiler too much space for unpredictable optimizations. I believe that my benchmark code produces more realistic results and I suggest that the authors of this blog entry re-run the benchmark using my code (or something similar, which controls for unpredictable optimizations).

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-06 16:35: +
                +
                +

                @tt: Code is doing something else so it's not the same.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-12 17:07: +
                +
                +

                Only a quick note OffTopic: in the python FAQ, one could update adding PyPy besides Psyco in the performance tips:

                https://docs.python.org/faq/programming.html#my-program-is-too-slow-how-do-i-speed-it-up

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-08-14 18:34: +
                +
                +

                @Anonymous: I agree with your other paragraphs, but not with the one where you wrote that "... OLDER version (4.5.x) of GCC whilst a newer version (4.6.x) is available with major improvements to the optimizer in general".

                I am not sure, what "major improvements" in GCC 4.6 do you mean? Do you have benchmark numbers to back up your claim?

                As far as well-written C code is concerned, in my opinion, there haven't been any "major improvements" in GCC for more than 5+ years. There have been improvements of a few percent in a limited number of cases - but nothing major.

                Even LTO (link-time optimization (and lets hope it will be safe/stable to use when GCC 4.7 is released)) isn't a major boost. I haven't seen LTO being able to optimize calls to functions living in dynamic libraries (the bsearch(3) function would be a nice candidate). And I also haven't seen GCC's LTO being able to optimize calls to/within the Qt GUI library when painting pixels or lines onto the screen.

                The main point of the PyPy article was that run-time optimizations in PyPy have a chance of surpassing GCC in certain cases.

                Personally, I probably wouldn't willingly choose to work on a project like PyPy - since, err, I believe that hard-core JIT optimizations on a dynamically typed language like Python are generally a bad idea - but I am (in a positive way) eager to see what the PyPy team will be able to do in this field in the years to come.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-14 20:17: +
                +
                +

                @⚛: I indeed do not have benchmarks for these claims, but GCC 4.6 indeed added some newer optimization techniques to its assortment. Maybe these may not have had a significant influence in said case but they might have somewhere else. I'm merely saying: you can't really compare the latest hot inventions with something that is surpassed (e.g. compare Java 7 to a program output by Visual C++ back form the VS 2003 IDE).

                All by all, I'm not saying that Python sucks and don't want to sound like a fanboy (on the contrary, Linux uses a great deal of Python and if this could mean a major speedup, then why the hell not ;).

                I guess I was pissed off because the written article sounds very much fanboyish and pro-Python (just look at the title alone).

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-01 17:25: +
                +
                +

                So a loop that doesn't print in Python is compared to a loop in C that does and that was compiled on one of the slowest C compilers out there.

                YearOfTheLinuxDesktopIsAtHand(TM)

                +
                +
                +
                +
                + + Cees Timmerman wrote on 2012-11-12 15:36: +
                +
                +

                @Anonymous, "the C one doesn't print anything, either; sprintf just returns a string. printf is the one that prints." - Andrew Pendleton, this page, August 2, 2011 9:25 PM

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/08/visualization-of-jitted-code-6202490807361942120.html b/posts/2011/08/visualization-of-jitted-code-6202490807361942120.html new file mode 100644 index 000000000..e36a0df17 --- /dev/null +++ b/posts/2011/08/visualization-of-jitted-code-6202490807361942120.html @@ -0,0 +1,424 @@ + + + + + +Visualization of JITted code | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Visualization of JITted code

                + + + +
                +

                Hello.

                +

                We're proud to announce the first public release of the jitviewer. As of now, +jitviewer is a slightly internal tool that helps you understand how your Python +source code is compiled by PyPy's JIT all the way down to machine code.

                +

                To install it, you need a very recent version of PyPy +(newer than 9th of August), for example one of the nightly builds:

                +
                +
                  +
                • install pip and distribute either by creating a PyPy virtualenv +or by following the installation instructions.
                • +
                • make sure to have a source code checkout of PyPy and put it in your +PYTHONPATH.
                • +
                • +pip install jitviewer. Note that you need to run the pip +executable which belongs to PyPy, not the globally installed one.
                • +
                +
                +

                Have a look at the README for how to start it, or try the online demo if +you just want to play with it.

                +

                The jitviewer is a web application written with flask and jinja2. If +you have experience with web development and you want to help PyPy, don't +hesitate to contact us, there are plenty of things to improve in it :-).

                +
                +

                What does the jitviewer really do?

                +

                At the top of the page, you will see the list of pieces of code which have been +compiled by the JIT. You will see entries for both normal loops and for +"entry bridges". This is not the right place to discuss the difference +between those, but you most probably want to look at loops, because usually +it's where most of the time is spent.

                +

                Note that for each loop, you will see the name of the function which contains +the first instruction of the loop. However, thanks to the inlining done +by the JIT, it will contain also the code for other functions.

                +

                Once you select a loop, the jitviewer shows how the JIT has compiled the +Python source code into assembler in a hierarchical way. It displays four +levels:

                +
                  +
                • +

                  Python source code: only the lines shown in azure have been compiled for +this particular loop, the ones in gray have not.

                  +
                • +
                • +

                  Python bytecode, the one you would get by doing:

                  +
                  +def f(a, b):
                  +   return a + b
                  +
                  +import dis
                  +dis.dis(f)
                  +
                  +

                  The opcodes are e.g. LOAD_FAST, LOAD_GLOBAL etc. The opcodes +which are not in bold have been completely optimized away by the JIT.

                  +
                • +
                • +

                  Intermediate representation of jit code (IR). This is a combination of +operations (like integer addition, reading fields out of structures) and +guards (which check that the assumptions we made are actually true). Guards +are in red. These operations are "at the same level as C": so, for example, ++ takes two unboxed integers which can be stored into the register +of the CPU.

                  +
                • +
                • +

                  Assembler: you can see it by clicking on "Show assembler" in the menu on the +right.

                  +
                • +
                +

                Sometimes you'll find that a guard fails often enough that a new piece of +assembler is required to be compiled. This is an alternative path through the +code and it's called a bridge. You can see bridges in the jitviewer when +there is a link next to a guard. For more information about their purpose look up +the jit documentation.

                +
                +
                +

                I'm still confused

                +

                Jitviewer is not perfect when it comes to explaining what's going on. Feel free +to pop up on IRC or send us a mail to the mailing list, we'll try to explain +and/or improve the situation. Consult the contact page for details.

                +

                Cheers,
                +fijal & antocuni

                +
                +
                +

                Comments

                +
                +
                +
                + + Paul Smith wrote on 2011-08-13 20:47: +
                +
                +

                I'm getting a TemplateNotFound jinja2 exception when I run the jitviewer.py as shown in the README.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-08-13 20:48: +
                +
                +

                I think you have to python setup.py install it in a virtualenv. It might not work from the checkout any more.

                +
                +
                +
                +
                + + Paul Smith wrote on 2011-08-13 21:31: +
                +
                +

                That fixed it, thanks.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-14 10:25: +
                +
                +

                Would it be possible to get some screenshots of jitviewer, as the online demo is currently down?

                +
                +
                +
                +
                + + Garito wrote on 2011-08-16 19:23: +
                +
                +

                The demo doesn't work

                Please, could you put it back?

                Thanks a lot!

                I'm developing a programming languaje based on mindmaps and I would like to know if my code works with pypy...

                +
                +
                +
                +
                + + Unknown wrote on 2011-10-02 08:33: +
                +
                +

                jitviewer repository - https://bitbucket.org/pypy/jitviewer

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/08/we-need-software-transactional-memory-6513983438425039230.html b/posts/2011/08/we-need-software-transactional-memory-6513983438425039230.html new file mode 100644 index 000000000..82026cf1a --- /dev/null +++ b/posts/2011/08/we-need-software-transactional-memory-6513983438425039230.html @@ -0,0 +1,927 @@ + + + + + +We need Software Transactional Memory | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                We need Software Transactional Memory

                + + + +
                +

                Hi all. Here is (an extract of) a short summary paper about my current position on +Software Transactional Memory as a general tool in the implementation +of Python or Python-like languages. Thanks to people on IRC for discussion on making +this blog post better (lucian, Alex Gaynor, rguillebert, timonator, Da_Blitz). +For the purpose of the present discussion, we are comparing Java with Python +when it comes to multi-threading.

                + +

                The problem in complex high-level languages

                +

                Like Java, the Python language gives guarantees: it is not acceptable +for the Python virtual machine to crash due to incorrect usage of +threads. A primitive operation in Java is something like reading or +writing a field of an object; the corresponding guarantees are along the +lines of: if the program reads a field of an object, and another thread +writes to the same field of the same object, then the program will see +either the old value, or the new value, but not something else entirely, +and the virtual machine will not crash.

                +

                Higher-level languages like Python differ from Java by the fact that a +"primitive operation" is far more complex. It may for example involve +looking in several hash maps, perhaps doing updates. In general, it is +completely impossible to map every operation that must be atomic to a +single processor instruction.

                + +

                Jython: fine-grained locking

                +

                This problem has been solved "explicitly" in the Jython interpreter that +runs on top of Java. The solution is explicit in the following sense: +throughout the Jython interpreter, every single operation makes careful +use of Java-level locking mechanisms. This is an application of +"fine-grained locking". For example, operations like attribute lookup, +which need to perform look-ups in a number of hash maps, are protected +by acquiring and releasing locks (in __getattribute__).

                +

                A draw-back of this solution is the attention to detail required. +If even one place misses a lock, then there is either a +bug --- and such bugs occur in cases that are increasingly rare and hard +to debug as the previous bugs are fixed --- or we just file it under "differences +from CPython". There is however the risk of +deadlock, if two threads attempt to lock the same objects in different +order.

                + +

                In practice, the situation is actually not as bad as +I may paint it: the number of locks in Jython is reasonable, and allows for +all the "common cases" to work as expected. +(For the uncommon cases, see below.)

                + +

                Performance-wise, the Java virtual machine itself comes with locks that +have been heavily optimized over a long period of time, so the +performance is acceptable. However if this solution were coded in C, it +would need a lot of extra work to optimize the locks manually (possibly +introducing more of the subtle bugs).

                + +

                CPython: coarse-grained locking

                +

                CPython, the standard implementation of Python in C, took a different +and simpler approach: it has a single global lock, called the Global +Interpreter Lock (GIL). It uses "coarse-grained locking": the lock is +acquired and released around the whole execution of one bytecode (or +actually a small number of bytecodes, like 100). This solution is +enough to ensure that no two operations can conflict with each other, +because the two bytecodes that invoke them are themselves +serialized by the GIL. It is a solution which avoids --- unlike Jython +--- writing careful lock-acquiring code all over the interpreter. It +also offers even stronger guarantees: every bytecode runs entirely +atomically.

                +

                Nowadays, the draw-back of the GIL approach is obvious on multi-core +machines: by serializing the execution of bytecodes, starting multiple +threads does not actually let the interpreter make use of more than one core.

                +

                PyPy, the Python implementation in Python, takes the same approach so +far.

                + +

                Existing usage

                +

                As we have seen, we have the following situation: the existing Python +language, as CPython implements it, offers very strong guarantees about +multi-threaded usage. It is important to emphasize that most existing +multi-threaded Python programs actually rely on such strong guarantees. +This can be seen for example in a program that takes a populated list +and does in several threads:

                +
                +next_item = global_list.pop()
                +
                +

                This implicitly relies on the fact that pop() will perform atomic +removal from the list. If two threads try to pop() from the same list +at the same time, then the two operations will occur in one order or the +other; but they will not e.g. return the same object to both threads or +mess up the internal state of the list object.

                +

                With such an example in mind, it should be clear that we do not want a +solution to the multi-core issue that involves dropping these strong +guarantees. It is ok however to lower the barrier, as Jython does; but +any Python implementation must offer some guarantees, or not offer +multi-threading at all. This includes the fact that a lot of methods on +built-in types are supposed to be atomic.

                + +

                (It should be noted that not offering multi-threading at all is actually +also a (partial) solution to the problem. Recently, several "hacks" +have appeared that give a programmer more-or-less transparent access to +multiple independent processes (e.g. multiprocessing). While these provide appropriate +solutions in some context, they are not as widely applicable as +multi-threading. As a typical example, they fail to apply when the +multiple cores need to process information that cannot be serialized at +all --- a requirement for any data exchange between several processes.)

                + +

                Here is an example of how Jython's consistency is weaker than CPython's GIL. +It takes uncommon examples to show it, and the fact that it does not work +like a CPython programmer expects it to is generally considered as an +implementation detail. Consider:

                +
                Thread 1:  set1.update(set2)
                +Thread 2:  set2.update(set3)
                +Thread 3:  set3.update(set1)
                +

                Each operation is atomic in the case of CPython, but decomposed in two steps +(which can each be considered atomic) in the case of Jython: reading from the +argument, and then updating the target set. Suppose that initially +set1 = {1}, set2 = {2}, set3 = {3}. On CPython, independently of +the order in which the threads run, we will end up with at least one of the +sets being {1, 2, 3}. On Jython, it is possible that all +three sets end up as containing two items only. The example is a bit +far-fetched but should show that CPython's consistency is strictly stronger +than Jython's.

                + +

                PyPy

                +

                PyPy is a Python interpreter much like CPython or Jython, but the way it +is produced is particular. It is an interpreter written in RPython, a +subset of Python, which gets turned into a complete virtual machine (as +generated C code) automatically by a step called the "translation". In +this context, the trade-offs are different from the ones in CPython and +in Jython: it is possible in PyPy, and even easy, to apply arbitrary +whole-program transformations to the interpreter at "translation-time".

                +

                With this in mind, it is possible to imagine a whole-program +transformation that would add locking on every object manipulated in +RPython by the interpreter. This would end up in a situation similar to +Jython. However, it would not automatically solve the issue of +deadlocks, which is avoided in the case of Jython by careful manual +placement of the locks. (In fact, being deadlock-free is a global +program property that cannot be automatically ensured or verified; any +change to Jython can in theory break this property, and thus introduce +subtle deadlocks. The same applies to non-atomicity.)

                +

                In fact, we can easily check that if the interpreter accesses (for +both reading and writing) +objects A and B in a bytecode of thread 1, and objects B and A (in the +opposite order) in a bytecode of thread 2 --- and moreover if you need to +have accessed the first object before you can decide that you will need +to access the second object --- then there is no way (apart from the GIL) to avoid +a deadlock while keeping the strong guarantee of atomicity. Indeed, if +both threads have progressed to the middle of the execution of their +bytecode, then A has already been mutated by thread 1 and similarly B +has already been mutated by thread 2. It is not possible to +successfully continue running the threads in that case.

                + +

                Using Software Transactional Memory

                +

                Software Transactional Memory (STM) is an approach that gives a solution +to precisely the above problem. If a thread ended up in a situation +where continuing to run it would be wrong, then we can abort and +rollback. This is similar to the notion of transaction on databases. +In the above example, one or both threads would notice that they are +about to run into troubles and abort. This means more concretely that +they need to have a way to restart execution at the start of the +bytecode, with all the side-effects of what they did so far being either +cancelled or just not committed yet.

                +

                We think that this capacity to abort and rollback is the missing piece +of the puzzle of multi-threaded implementations of Python. +Actually, according to the presentation of the problem given +above, it is unavoidable that any solution that wants to offer the +same level of consistency and atomicity as CPython would involve +the capacity of aborting and rolling back --- which means precisely +that STM cannot be avoided.

                + +

                Ok, but why not settle down with Jython's +approach and put careful locks left and right throughout the interpreter? +Because (1) we would have to consider every operation's atomicity and make decisions +(or steal Jython's) and document them +here; +(2) it would also be really a lot of work, to optimize these locks e.g. with the +JIT as well as the JVM does; and (3) it is not the PyPy way to require manually +tweaking your code everywhere for a feature that should be orthogonal. Point +(3) is probably the most important here: you need to redo the work for every +language you implement in PyPy. +It also implies my own point (4): it is not fun :-)

                + +

                In more detail, the process would work as follows. (This gives an +overview of one possible model; it is possible that a different model +will end up being better.) In every thread:

                +
                  +
                • At the start of a bytecode, we start a "transaction". This means +setting up a thread-local data structure to record a log of what +occurs in the transaction.
                • +
                • We record in the log all objects that are read, as well as the +modifications that we would like to make.
                • +
                • During this time, we detect "read" inconsistencies, shown by the +object's "last-modified" timestamp being later than the start time +of the current transaction, and abort. This prevents the rest of +the code from running with inconsistent values.
                • +
                • If we reach the end of the bytecode without a "read" inconsistency, +then we atomically check for "write" inconsistencies. These are +inconsistencies which arise from concurrent updates to objects +in the other threads --- either our "write" objects, or our "read" +objects.
                • +
                • If no inconsistency is found, we "commit" the transaction by copying +the delayed writes from the log into main memory.
                • +
                +

                The points at which a transaction starts or ends are exactly the +points at which, in CPython, the Global Interpreter Lock is +respectively acquired and released. If we ignore the fact that (purely for +performance) CPython acquires and releases the GIL only every N bytecodes, +then this means:

                +
                  +
                1. Before any bytecode we acquire the GIL (start a transaction), and after +the bytecode we release it (end the transaction); and +
                2. +
                3. Before doing an external call to the C library or the OS we release the GIL +(end the transaction) and afterwards re-acquire it (start the next transaction). +
                4. +
                +So in particular this model is well suited to the STM condition that we cannot +do anything in a transaction that cannot be rolled back, like --- precisely --- +system calls. Indeed, by construction, these system calls occur outside a +transaction, because in CPython they occur with the GIL released. + +

                Performance

                +

                A large number of implementation details are still open for now. +From a user's point of view (i.e. the programmer using Python), +the most relevant one is the overall performance impact. We +cannot give precise numbers so far, and we expect the initial +performance to be abysmally bad (maybe 10x slower); however, with +successive improvements to the locking mechanism, to the global +program transformation inserting the locks, to the garbage +collector (GC), and to the Just-in-Time (JIT) compiler, we +believe that it should be possible to get a roughly reasonable +performance (up to maybe 2x slower). For example, the GC can +maintain flags on the objects to know that they did not escape +their creation thread, and do not need any logging; and the JIT +compiler can aggregate several reads or writes to an object into +one. We believe that these are the kind of optimizations that +can give back a lot of the performance lost.

                + +

                The state of STM

                +

                Transactional Memory is itself a relatively old idea, originating +from a 1986 paper by Tom Knight. At first based on hardware +support, the idea of software-only transactional memory (STM) was +popularized in 1995 and has recently been the focus of intense +research.

                +

                The approach outlined above --- using STM to form the core of the +implementation of a language --- is new, as far as we know. So +far, most implementations provide STM as a library feature. It +requires explicit usage, often in the form of explicitly +declaring which objects must be protected by STM (object-based +STMs). It is only recently that native STM support has started +to appear, notably in the Clojure language.

                +

                STM is described on Wikipedia as an approach that "greatly +simplifies conceptual understanding of multithreaded programs and +helps make programs more maintainable by working in harmony with +existing high-level abstractions such as objects and modules." +We actually think that these benefits are important enough to +warrant being exposed to the Python programmer as well, instead +of being used only internally. This would give the Python +programmer a very simple interface:

                +
                +with atomic:
                +    <these operations are executed atomically>
                +
                +

                (This is an old idea. Funny how back in 2003 people, including me, thought that this was a hack. Now I'm writing a blog post to say "it was not a hack; it's explicitly using locks that is a hack." I'm buying the idea of composability.)

                + +

                From a practical point of view, I started looking seriously at +the University of Rochester STM (RSTM), a C++ library that has +been a focus of --- and a collection of results from --- recent +research. One particularly representative paper is +A +Comprehensive Strategy for Contention Management in Software +Transactional Memory by Michael F. Spear, Luke Dalessandro, +Virendra J. Marathe and Michael L. Scott.

                + +

                Conclusion

                +

                Taking these ideas and applying them in the context of an +implementation of a complex high-level language like Python comes +with its own challenges. In this context, using PyPy makes sense +as both an experimentation platform and as a platform that is +recently gaining attention for its performance. The alternatives +are unattractive: doing it in CPython for example would mean +globally rewriting the interpreter. In PyPy instead, we write it +as a transformation that is applied systematically at translation-time. +Also, PyPy is a general platform for generating fast interpreters +for dynamic languages; the STM implementation in PyPy would work +out of the box for other language implementations as well, instead +of just for Python.

                +

                Update: +

                +
                  +
                • This is mostly me (Armin Rigo) ranting aloud and trying experiments; +this post should not be confused as meaning that the whole PyPy team +will now spend the next years working on it full-time. +As I said it is orthogonal to the actual Python interpreter, and it is in +any case a feature that can be turned on or off during translation; I know +that in many or most use cases, people are more interested in getting a +fast PyPy rather than one which is twice as slow but scales well. +
                • +
                • Nothing I said is really new. For proof, see +Riley and Zilles (2006) +as well as Tabba (2010) who both experimented with Hardware Transactional Memory, turning CPython or PyPy interpreter's GIL into start/end transactions, as I describe here. +
                • +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-08-23 13:40: +
                +
                +

                How to handle composability ("with atomic") when something inside composed block turns out to make a system call? With explicit locking, this shouldn't be a problem.

                +
                +
                +
                +
                + + ajuc wrote on 2011-08-23 14:43: +
                +
                +

                Re sys calls in transactions:

                In clojure it is solved by requiring that code in transaction is side effect free.

                You can tag code as having side effects by macro "io!" :

                (defn launch-missiles
                “Launch attack on remote targets with everything we have.”
                []
                (io!
                (doseq [missile (all-silos)]
                (fire missile))))

                Then if you try to execut this code in transaction clojure will complain, because you can't really rollback launching nuclear missiles :)

                +
                +
                +
                +
                + + ajuc wrote on 2011-08-23 14:49: +
                +
                +

                Ehh, I should've thought more before posting.

                Code in transactions need not be side effect free - in fact in clojure side effects are the whole point of transactions. But this code should only change STM controlled variables, not outside world.

                And "io!" macro is for marking code that changes things outside of STM.

                Sorry for confusion.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-08-23 14:56: +
                +
                +

                Here are my current hacks in C, based on RSTM: https://bitbucket.org/arigo/arigo/raw/default/hack/stm/c , from the repo https://bitbucket.org/arigo/arigo .

                +
                +
                +
                +
                + + Thomas Schilling wrote on 2011-08-23 14:56: +
                +
                +

                Implementing STM at a core level is certainly a nice research topic, but I wonder whether it's the best way forward for Python.

                STM works well in Haskell because it has the type system to enforce several constraints. Also most data is immutable in Haskell, so threading is mostly safe by default.

                Most Python objects are mutable (by default), so users have to be very careful when using multi-threading. STM gives you a nice, composable primitive to protect your critical sections, but it does not tell where your critical sections are.

                You dismiss multiprocessing because of serialization issues, but what about multiprocessing within the same process? You have a VM already, so my guess would be that it wouldn't be that hard to implement software processes (a la Erlang). Sure, using message passing may lead to a fair amount of copying, but I it seems to be much easier to implement and easier to use than shared-memory concurrency + STM.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-08-23 15:23: +
                +
                +

                @Thomas Schilling: I don't see how having a "multiprocessing" that uses the same process, rather than different processes, makes a difference. In both cases you need to write your threading code specially and care about explicitly transferring objects via shared memory --- either to another OS thread in the same process, or to a different process altogether.

                +
                +
                +
                +
                + + René Dudfield wrote on 2011-08-23 16:04: +
                +
                +

                closures

                +
                +
                +
                +
                + + Sam Wilson wrote on 2011-08-23 16:32: +
                +
                +

                I'm with illume... look at what Apple has done with blocks. This seems like a very efficient way forward.

                Separately, you are missing something about the Java-side.

                For many of the data structures in Java there are atomic and non-atomic versions. That is, when you are using a data structure on a single thread, you grab the non-atomic version. This way, you don't pay for the overhead of the locking. But, when you are sharing a data structure between threads, you use the atomic version. As a by-product of history, though it is a nice by-product, you usually get the atomic version by default. That is to say, you have to go looking for trouble by explicitly asking for the non-atomic version.

                By baking this into the language, you are forcing a single policy on all programs, rather than letting the programmer choose what policy is going to be best in that scenario. Either that, or they will be forced to put code guards all over the place.

                To me, it seems like the language/runtime should provide the most basic of atomic operations, and the run-time library on top should provide the policy. That's the Java approach, in a nutshell. It gives the programmer flexibility and keeps the core runtime simple and easier to optimize.

                Granted, you want a high-level language where the programmer doesn't make a lot of these decisions. So... looking at your own arguments... you are expecting an initial 10x performance hit relative to the current GIL-python approach, with hopes of getting it down to 2x performance... If that's the case, why not just stick with the GIL and have Python programmers take advantage of multiprocessing by creating co-operative programs using a message passing API. In some ways, it's a little more TAUP to do it that way, isn't it?

                +
                +
                +
                +
                + + nekto0n wrote on 2011-08-23 16:37: +
                +
                +

                What about replaying syscalls? Is it possible that such situation will happen?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-08-23 16:45: +
                +
                +

                @Anonymous: this case can be handled on a case-by-case basis (e.g. special-casing "prints" to buffer), but it also has a general solution: we turn the transaction into an "inevitable" transaction, i.e. one which cannot fail.

                I already have support for this in my demo code, because it is needed to handle the cases where the nesting of the C program is such that setjmp/longjmp can no longer work. The typical example is the RETURN_VALUE bytecode. It starts a transaction, returns to the caller by popping off some C frames, then ends the transaction in the caller. When we return from the C frame of the callee, in the middle of the transaction, we notice that we won't have the setjmp around any longer, so we are not allowed to abort and rollback any more.

                Inevitable transactions have the property of being "a bit like" a GIL in the sense that you can only have one in total, and other transactions cannot commit before it does. In case of the RETURN_VALUE, it's a very short transaction so it shouldn't really be a problem. For the case of a user-specified "with atomic:" block, it can make all the other threads pause. Not ideal, but at least better than nothing...

                +
                +
                +
                +
                + + TomV wrote on 2011-08-23 16:49: +
                +
                +

                Could you explain a bit more what PyPy currently does to prevent these kinds of problems?

                +
                +
                +
                +
                + + nekto0n wrote on 2011-08-23 16:52: +
                +
                +

                @TomV PyPy uses GIL

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-08-23 16:54: +
                +
                +

                @Sam Wilson: as you know, the PyPy approach is to sacrifice nothing to performance for the user, and get reasonably good (if not exactly Java-level) performance anyway :-)

                I should also mention generally that for some programs that I have in mind, using a message-passing API would be a complete rewrite (if it is really possible at all), whereas "just" making them multithreaded can be done. The "translate.py" of PyPy falls into this category. It is a program that heavily use objects within objects within objects in a big non-nicely-separable "mess", and I would not dare to think about how to send parts of this object graph over a messaging API and get back localized updates.

                Of course there are also other use cases where you can naturally get a model that plays nicely with message passing.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-08-23 17:03: +
                +
                +

                @nekto0n: that's not really possible in general, because you need to have the return value of the syscall to decide what to do next, which normally means that you have to really do the syscall.

                +
                +
                +
                +
                + + nekto0n wrote on 2011-08-23 17:12: +
                +
                +

                @armin please describe what will happen if 2 threads call write() on a single socket object? what exactly should/will happen when the interpreter begins to dispatch CALL bytecode?

                I think, it's the most questionable part of STM approach.

                +
                +
                +
                +
                + + Rodrigo Araújo wrote on 2011-08-23 17:33: +
                +
                +

                some change in my code

                https://paste.pocoo.org/show/463085/

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-08-23 17:34: +
                +
                +

                @nekto0n: nothing particular. The two threads will run the calls in parallel, just like CPython, which calls the send() function without any GIL acquired. What exactly occurs depends on the OS and not on the language.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-23 17:37: +
                +
                +

                I disagree with the fact that threads whose transactions would be invalidated, are stealing CPU timeshares from other processes / threads.

                STM is an 'egoist' approach

                +
                +
                +
                +
                + + kost BebiX wrote on 2011-08-23 20:54: +
                +
                +

                I know this might sound stupid, but is it possible to enable/disable STM on the fly? Like to enable it only for several threads involved.

                +
                +
                +
                +
                + + kost BebiX wrote on 2011-08-23 20:55: +
                +
                +

                Or just not open transaction when there's only 1 thread?

                +
                +
                +
                +
                + + Unknown wrote on 2011-08-23 22:43: +
                +
                +

                Hi,

                I thought a bit about what you said about Jython. Mostly, I was thinking about a way to do this automatically instead of making it explicitly.

                I came up with this first draft: https://github.com/albertz/automatic_object_locking

                This will obviously also be very slow but it should be possible to optimize this well (similarly to STM). And I think it is much easier than STM.

                -Albert

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-24 07:39: +
                +
                +

                Funny to see how Python eats itself like an Ouroboros. Wrong design decisions that made concurrency almost impossible, dirty hacks ("dirty" compared to, for example, Erlang's approach to SMP — almost linear scalability with a number of cores with 10-20% static overhead thanks to locks) that PyPy team are trying to do to solve problems introduced by Guido's ignorance, and a lot of Python "programmers" that don't understand what SMP is. Python is a ghetto, for real.

                +
                +
                +
                +
                + + Paul Harrison wrote on 2011-08-24 07:51: +
                +
                +

                Seems like it should be possible to guarantee performance not much worse than with a GIL.

                Am I right in thinking there is a locked section where changes are written to memory? The execution before this is effectively just some speculative computation to speed up the locked section. If it turns out there's an inconsistency, just execute the locked section as you would normally. If the speculative computation is failing most of the time or is slow, switch to not doing it -- and we are back to GIL performance.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-08-24 10:29: +
                +
                +

                @all: please come to the #pypy irc channel on irc.freenode.net if you want to discuss this further.

                +
                +
                +
                +
                + + Thomas Schilling wrote on 2011-08-24 12:01: +
                +
                +

                @Armin: Each in-memory process would use its own part of the heap so there would be no locking necessary except during message sending. You also don't need to have a 1-to-1 mapping of OS threads to processes. You could schedule N processes onto M OS threads (preferably chosen to match the number of CPU cores).

                Of course, if you don't want a message-passing model (as you mentioned in another comment) then fine.

                My argument is just that: STM is difficult to implement, difficult to make fast, and it still isn't that easy to use. A message passing model is much easier to implement and easier to use for end users. (You can still get deadlocks, but you could provide libraries for standard communication patterns which you only have to get right once, like Erlang's OTP.)

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-08-24 13:39: +
                +
                +

                I think that there is some confusion here about what the underlying problem that you are trying to solve is.

                The underlying (fundamental) problem that transactional memory as a method to replace GIL in Python is trying to solve is: automatic parallelization. That *is* hard.

                Mediocre implementations of transactional memory are trivial to implement. Almost anybody can do it. Of course, the performance will be horrible.

                If we stick to the idea about the underlying problem (automatic parallelization) and keep it in our minds while thinking, it is clear and utterly obvious that *any* implementation of transactional memory which is slower than serial execution is simply missing the target. The target is, obviously, to run the program faster than serial execution. Otherwise, it would be totally pointless.

                Based on this reasoning, it is an *obvious* conclusion that a transactional memory implementation simply cannot be allowed to result in lower performance than serial execution of the code. Allowing lower performance would be completely irrational.

                We are humans, not animals. Rationality is our distinctive feature. We have to try to follow rationality.

                In light of this, saying that "It is OK for transactional memory to result in 2x slowdown" is irrational. I will write it one more time: accepting 2x slowdown is irrational.

                Now, it is crucial to note that there are various kinds of performance measurements. And it is OK to slow down one performance indicator while boosting another performance indicator. For example, in web server environment, it is OK to slow down the delivery of individual web pages by a factor 1.3 - while boosting the number of requests per second by 2.3. That is *rational* and perfectly OK. Also, 3x developer productivity boost would be OK.

                Following this example, if transactional memory is allowed to slow down performance of the program (compared to serial execution) by 2x, a person who follows rationality would immediately be drawn to seek for the evidence of a greater-than-2x performance boost in another area of the program.

                Omitting developer productivity, how are the PyPy developers going to deliver the *mandatory* greater-than-2x performance boost (in some area) without actually solving the underlying hard problems requiring hard-core code analysis?

                If PyPy's transactional memory implementation would serialize calls to the Linux kernel (because it is hard to emulate them in user-space), then this alone would prevent some programs from achieving the more-than-2x performance boost. This is because it is impossible to boost program performance (in some areas, given a particular performance indicator) unless the modified program is allowed to call kernel functions out-of-order or in parallel.

                -----

                Note: I am *not* saying that PyPy should give up. I am just trying to note that you do not seem to know what you are doing. But I may be wrong.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-08-24 16:50: +
                +
                +

                Of course the sentence "It is OK for transactional memory to result in 2x slowdown" was meant "on one thread". As soon as your program uses more than 2 threads, on a more-than-2-CPUs machine, then you win.

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-08-24 17:20: +
                +
                +

                I read "Tabba (2010)" (Tabba: Adding Concurrency in Python Using a Commercial Processor’s Hardware Transactional Memory Support) just now.


                The article:

                - benchmark "iterate": This function is not making calls to other functions. The authors are independently running 16 instances of the "iterate" function on a 16-core CPU using 16 threads. The speedup in respect to unmodified CPython is 7x. The slowdown in respect to 16 CPython processes is 2.2x.

                - benchmark "count": This is similar to "iterate". The speedup in respect to unmodified CPython is 4.5x. The slowdown in respect to 16 CPython processes is 3.5x.

                - benchmark "pystone": This function is making calls to other functions. 16 instances of the "pystone" function on a 16-core CPU using 16 threads. The speedup in respect to unmodified CPython is 0.9x. The slowdown in respect to 16 CPython processes is 17x.


                My analysis:

                - iterate: The fact that N instances of this function can run in parallel without any interference can be determined easily. The algorithm to determine this is trivial. (Not to mention, the pointless loop in the function can be replaced by a NOP in a dead-code elimination pass).

                - count: same as "iterate".

                - pystone: It is not trivial to determine whether multiple instances can run in parallel. So, it should presumably run single-threaded.

                - The article is *not* mentioning any real problem that was solved by TM in the case of "iterate", "count" or "pystone". That is logical, since the truth is that there is no real problem to solve here. The benchmark functions can be trivially run in 16 CPython Linux processes - anybody can do that (even your grandma).


                My summary:

                - In case of the two functions for which it *can* be trivially determined whether their instances can run in parallel, the TM approach results in a 2x-3x slowdown compared to the most basic auto-parallelization algorithm.

                - In case of the function for which it *cannot* be trivially determined whether multiple instances can run in parallel, the TM approach running on 4-16 threads achieved 90% (loss of 10%) of the speed of single-threaded CPython without TM. On 1 thread, the TM approach is 2.1x slower.


                Bottom line:

                Call me crazy, but my conclusion from this article is that TM (at least the TM approach from the article) is not working at all.

                +
                +
                +
                +
                + + Greg Wilson wrote on 2011-08-24 17:43: +
                +
                +

                Cool to see this happening. What's also cool is the result reported in Rossbach et al's study (https://www.neverworkintheory.org/?p=122): novices using STM did better in simple programming problems than students using traditional mechanisms, even though they thought they had done worse. "Baroque syntax" may be part of the problem; I'm sure the paper's authors would be happy to chat.

                +
                +
                +
                +
                + + Timo wrote on 2011-08-27 13:52: +
                +
                +

                ⚛, you're missing a very important bit of the paper. In it, the authors say, that the Rock hardware only holds 256 bytes of write-buffer content, while Riley and Zilles¹ determined the average write-buffer size needed for transactions to not fail prematurely would be "less than 640 bytes", which is almost three times as much as Rock offers.

                Thus, the big slowdown that the pystone benchmark experiences could be caused by the shortcomings of the TM built into Rock.

                I do have to agree, though, that the "benchmarks" used in the paper are not very satisfactory. However, the magical "simple parallelization algorithm" you summon in your comment would break down quite easily shortly after the complexity of the situation increases by just a bit, would it not?

                ¹ I only briefly glanced over the paper, so if anyone read it more thoroughly, they can feel free to correct me.

                +
                +
                +
                +
                + + Unknown wrote on 2011-08-28 00:01: +
                +
                +

                I thought Erlang successfully solved this problem years ago? And I don't think anything scales better than it. So why aren't we just copying them? Message passing, where each thread or process share absolutely nothing, is the sanest and safest way to do concurrent and multi-threaded programming. I mean, you don't even have to worry about locking! STM always seemed complicated to me.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-30 03:00: +
                +
                +

                is there a branch we can check this out?

                +
                +
                +
                +
                + + squeaky_pl wrote on 2011-09-01 10:31: +
                +
                +

                Hardware transactional memory anyone? https://arstechnica.com/hardware/news/2011/08/ibms-new-transactional-memory-make-or-break-time-for-multithreaded-revolution.ars

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-09-21 19:10: +
                +
                +

                @squeaky_pl: thanks for the link. In some way researching this is ultimately doomed: either transactional memory doesn't work, or it does and in 5 or 10 years all CPUs will have good hardware support and will be able to run existing software like CPython with minor changes. :-)

                +
                +
                +
                +
                + + staila wrote on 2011-11-03 05:31: +
                +
                +

                We are actually working on implementing this directly into stailaOS.

                +
                +
                +
                +
                + + Unknown wrote on 2012-05-12 10:24: +
                +
                +

                @Mystilleef agree 100%

                +
                +
                +
                +
                + + Unknown wrote on 2012-07-05 22:42: +
                +
                +

                The high-level semantics that the Python VM provides through the GIL are perfect for most programs, and for most programmers' knowledge about concurrency.

                What is the purpose of going after the GIL?

                If it's just a performance boost on multiple cores, then a GIOL (global IO lock) implemented on the VM, as the GIL is, should be considered. The VM could run several OS threads blocking them on IO and releasing GIL.

                If the purpose is to make concurrent programming easy and correct, it can be proven that it is not possible.

                Yet, there are alternatives that don't alter the language or the semantics that can be explored.

                Erlang-style message passing can be provided through object proxies implemented on top or beneath the VM, so the threads/processes can even run on different computers.

                In short, an Actor model is much preferable to a shared-memory one.

                https://en.wikipedia.org/wiki/Actor_model

                +
                +
                +
                +
                + + Alex moner wrote on 2014-10-21 17:38: +
                +
                +

                In general, it is completely impossible to map every operation that must be atomic to a single processor instruction. Uni-source

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/08/wrapping-c-libraries-with-reflection-3916959558080483711.html b/posts/2011/08/wrapping-c-libraries-with-reflection-3916959558080483711.html new file mode 100644 index 000000000..b6e9f30f3 --- /dev/null +++ b/posts/2011/08/wrapping-c-libraries-with-reflection-3916959558080483711.html @@ -0,0 +1,411 @@ + + + + + +Wrapping C++ Libraries with Reflection — Status Report One Year Later | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Wrapping C++ Libraries with Reflection — Status Report One Year Later

                + + + +
                +

                Well over a year ago, work was started on the cppyy module which lives in the +reflex-support branch. +Since then, work has progressed at a varying pace and has included a recent +sprint in Düsseldorf, last July.

                +

                Let's first take a step back and recap why we're interested in doing this, +given that it is perfectly possible to use C++ through generated bindings and +cpyext. +cppyy makes use of reflection information generated for the C++ classes of +interest, and has that reflection information available at run time. +Therefore, it is able to open up complex C++ types to the JIT in a +conceptually similar manner as simple types are open to it. +This means that it is possible to get rid of a lot of the marshalling layers +when making cross-language calls, resulting in much lower call overhead than +is possible when going through the CPython API, or other methods of wrapping.

                +

                There are two problems that need to be solved: C++ language constructs need to +be presented on the Python side in a natural way; and cross-language impedance +mismatches need to be minimized, with some hints of the user if need be. +For the former, the list of mapped features has grown to a set that is +sufficient to do real work. +There is now support for:

                +
                +
                  +
                • builtin, pointer, and array types
                • +
                • namespaces, classes, and inner classes
                • +
                • global functions, global data
                • +
                • static/instance data members and methods
                • +
                • default variables, object return by value
                • +
                • single and multiple (virtual) inheritance
                • +
                • templated classes
                • +
                • basic STL support and pythonizations
                • +
                • basic (non-global) operator mapping
                • +
                +
                +

                The second problem is harder and will always be an on-going process. +But one of the more important issues has been solved at the recent Düsseldorf +sprint, namely, that of reclaiming C++ objects instantiated from the Python +side by the garbage collector.

                +

                Performance has also improved, especially that of the nicer "pythonized" +interface that the user actually sees, although it still misses out on +about a factor of 2.5 in comparison to the lower-level interface (which has +gotten uglier, so you really don't want to use that). +Most of this improvement is due to restructuring so that it plays nicer with +the JIT and libffi, both of which themselves have seen improvements.

                +

                Work is currently concentrated on the back-ends: a CINT back-end is underway +and a LLVM/CLang pre-compiled headers (PCH) back-end is planned. +The latter is needed for this code to be released in the wild, rather than +just used in high energy physics (HEP), as that would be easier to support. +Also, within HEP, CLang's PCH are foreseen to be the future format of +reflection information.

                +

                At the end of the Düsseldorf sprint, we tried a little code that did something +actually "useful," namely the filling of a histogram with some random values. +We did get it to work, but trying cppyy on a large class library showed +that a good warning system for such things like missing classes was sorely +needed. +That has been added since, and revisiting the histogram example later, here is +an interesting note: the pypy-c run takes 1.5x the amount of time of that +of the compiled, optimized, C++ code. +The run was timed start to finish, including the reflection library loading +and JIT warm-up that is needed in the case of Python, but not for the compiled +C++ code. +However, in HEP, scientists run many short jobs while developing their +analysis codes, before submitting larger jobs on the GRID to run during lunch +time or overnight. +Thus, a more realistic comparison is to include the compilation time needed +for the C++ code and with that, the Python code needs only 55% of the time +required by C++.

                +

                The choice of a programming language is often a personal one, and such +arguments like the idea that C++ is hard to use typically do not carry much +weight with the in-crowd that studies quantum field dynamics for fun. +However, getting the prompt with your analysis results back faster is a sure +winner. We hope that cppyy will soon have progressed far enough to make it +useful first to particle physicists and then other uses for wrapping C++ +libraries.

                + +Wim Lavrijsen, Carl Friedrich Bolz, Armin Rigo +
                +

                Comments

                +
                +
                +
                + + René Dudfield wrote on 2011-08-31 10:15: +
                +
                +

                Hi,

                nice result. Wrapping C++ code can be even more tiresome than C, especially with large code bases. This will be a very welcome tool.


                This question has probably been answered before... but I ask anyway since I couldn't find the answer.

                Can the jit information be saved, so it does not need to be worked out again? Assuming all of the dependencies have not changed (.py files, pypy itself, .so files etc). Maybe if location independent code can not be saved, then trace hints or some higher level structure could be saved to inform the jit about what traces to jit? That sounds like a solution to jit warm up for code that is used repeatedly.

                cu.

                +
                +
                +
                +
                + + Wim Lavrijsen wrote on 2011-08-31 18:53: +
                +
                +

                Hi,

                thanks! :)

                There was a recent thread on saving JIT information on pypy-dev:

                https://mail.python.org/pipermail/pypy-dev/2011-August/008073.html

                and the conclusion there was that it is too hard to be of benefit because too many parts contain addresses or calculated variables that were turned into constants.

                For our (HEP) purposes, it would be of limited benefit: in the development cycle, the .py's would change all the time, and it is a safe assumption that the user codes that are being developed are the most "hot." If there is anything in the supporting code that is "hot" (most likely in the framework) it'd be in C/C++ at that point anyway.

                Rather, I'd like to have an easy way of letting the user determine which portions of the code will be hot. Saving not having to run a hot loop 1000x in interpreted mode before the JIT kicks in, is going to be more valuable in scientific codes where the hot loops tend to be blatantly obvious.

                Cheers,
                Wim

                +
                +
                +
                +
                + + Anonymous wrote on 2011-08-31 19:49: +
                +
                +

                This is great. I have been looking for just such a tool to wrap C++ numerical code.

                I guess I have two questions:
                1. Is there any documentation on how to use it?
                2. It is very important to be able to translate between NumPy data structure and C++ data structure for me, so is there any plan to make this easy?

                Thanks for great work.

                +
                +
                +
                +
                + + Wim Lavrijsen wrote on 2011-08-31 22:18: +
                +
                +

                Hi,

                thanks! :)

                ad 1) it's not at the level of being usable in a production environment. I have two known issues to resolve and probably some more unknowns. I've posted a description on pypy-dev, and I'm helping a few patient, very friendly users along. But actual documentation suggests a level of support that currently can't be offered, because all the current (and soon to disappear) caveats would need documenting as well.

                ad 2) not sure what data translation you're thinking of, but in the CPython equivalent, support was added for the buffer interface and MemoryView. Those, or something similar, will be there so that numpy arrays etc. can be built from return values, from public data members, and passed into function calls as arguments. Those are not translations, but rather extraction of the data pointers (which is typically intended and the most efficient, to be sure).

                Cheers,
                Wim

                +
                +
                +
                +
                + + wholesale electronics wrote on 2011-12-17 01:23: +
                +
                +

                Maybe if location independent code can not be saved, then trace hints or some higher level structure could be saved to inform the jit about what traces to jit?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/09/py3k-for-pypy-fundraiser-8139653689520709617.html b/posts/2011/09/py3k-for-pypy-fundraiser-8139653689520709617.html new file mode 100644 index 000000000..29da1c64a --- /dev/null +++ b/posts/2011/09/py3k-for-pypy-fundraiser-8139653689520709617.html @@ -0,0 +1,486 @@ + + + + + +Py3k for PyPy fundraiser | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k for PyPy fundraiser

                + + + +
                +

                Hi,

                +

                We would like to announce a donation campaign for implementing Python 3 in PyPy.
                +Please read our detailed plan for all the details and donate using the
                +button on that page!

                +

                Thanks,
                +The PyPy Team

                +
                +

                Comments

                +
                +
                +
                + + stan wrote on 2011-09-21 18:21: +
                +
                +

                Two comments:

                1. It would be really nice to see a semi-frequently updated progress bar (live is best) with # of dollars and # of contributions for the fundraising.

                Part of the excitement created by sites like Kickstarter (and the Humble Indie Bundle and so on) is seeing how your small contribution adds to the whole. A donate button feels more like throwing your money into a dark hole (a very reasonable and worthwhile hole, but a hole nonetheless). Take advantage of some video game psychology and give us that "level up" feedback when we contribute! :)

                2. I know you don't want to oversubscribe yourselves, but would you consider doing a similar funding drive for Numpy support? PLEASE???

                +
                +
                +
                +
                + + Konstantine Rybnikov wrote on 2011-09-21 18:55: +
                +
                +

                Totally agree with stan about progress bar. Recent novacut's donation campaign showed importance of that a lot (since people saw that they need to hurry up with fundings and did lots of them in last couple of days).

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-09-21 19:30: +
                +
                +

                @stan: 1. progress bar will be coming soon

                2. we are actively working on putting up an equivalent page for Numpy support.

                +
                +
                +
                +
                + + stan wrote on 2011-09-21 19:45: +
                +
                +

                Awesome! I want to be first in line to pitch $50 into the Numpy jar.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-09-21 21:03: +
                +
                +

                Awesome! In fact I regard Python 3 as much more important than any other features you could add now. 10% more performance is not nearly in the same league as Python3 support. Will happily spend some money on this.

                +
                +
                +
                +
                + + João Bernardo wrote on 2011-09-22 00:01: +
                +
                +

                Great!! I was waiting for that

                +
                +
                +
                +
                + + Anonymous wrote on 2011-09-22 01:03: +
                +
                +

                For complete support that's like 200,000$. I understand it's a willing feature, but I don't think the pypy community and followers are that huge.

                Btw, nice getting all the benchmarks above CPython 2.6 :)

                +
                +
                +
                +
                + + Sojin wrote on 2011-09-22 05:41: +
                +
                +

                Great work guys! I think keeping this amazing project alive is important for the Python eco-system... Here comes my $$.

                +
                +
                +
                +
                + + Laurent wrote on 2011-09-22 14:56: +
                +
                +

                I've heard that Py3K support for PyPy will be implemented in Python 2.X anyway. Is that true?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2011-09-22 16:49: +
                +
                +

                @Laurent: to be more precise, py3k will be implemented in RPython, which is indeed a subset of Python 2.

                Right now we don't have any plan to port RPython to Python 3: it's not a priority and it won't give any advantage to the PyPy end users.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-09-23 00:31: +
                +
                +

                cpython3 is a fork of cpython2, but here you intend to support both versions with the same codebase. Does not this make the task much harder, and peeking into cpython3 code for guidance less useful? Also, isn't it possible that the resulting large set of switch (PYVERSION) {} statements will make the code less readable and maintainable?

                Anyway, I have full faith in your assessment of the best approach, but I am still interested in your explanation. :)

                +
                +
                +
                +
                + + Zinahe wrote on 2011-09-30 16:25: +
                +
                +

                Just made my donation. GOD SPEED.

                I second stan's idea of providing a progress bar showing the overall status of the fundraising.

                +
                +
                +
                +
                + + Harald Armin Massa wrote on 2011-09-30 21:28: +
                +
                +

                a) please, please get the pages lac showed in her lightning talk at pycon.uk online.
                - There are pictures of people in it, and it is easier to donate to people than to something abstract
                - there is text what happened
                - there is text that anonymous donation is possible

                b) please, work on the feedback. It is CRUCIAL to show the actual state. Giving 5€ and nothing happens is dull. Giving 5€ and a number goes up - good. Giving 500€ and a rendered bar moves a pixel - awesome!

                c) I found the Python3PyPy fundraiser easily. I did not find the numpypy fundraiser. Please, put lacs pages up :) if I can vote for them somewhere, please let me know.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-09-30 21:36: +
                +
                +

                @Harald poke lac harder so she deploys it :)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-05 01:04: +
                +
                +

                It's been a couple of weeks and the progress bar still isn't there. Although there is a link for it that doesn't work.
                Please fix this and make it visible without having to click anything.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-06 23:45: +
                +
                +

                Hi! Please create the same kind of bucket for numpy support. I'm a big fan of Py3k, but I'm an even bigger fan of numpy - and I need it for my work. I'll donate to Py3k now, but I'll donate a bigger sum to both when I see the new bucket.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-10-07 00:10: +
                +
                +

                We're waiting for the final ok of the proposal so it can be said it benefits the public good. Any day now :)

                +
                +
                +
                +
                + + Stefan M wrote on 2011-10-08 19:48: +
                +
                +

                * Who needs Python 3 support??? *

                It looks like the PyPy project is adding things just to improve something and keep doing something but for whose sake?

                What I really need is proper support for C extensions. Without it, people who use Python professionally, like myself, cannot switch to PyPy and we are stuck with Cython and/or Psyco.

                Who steers the development of Pypy and why would these people refuse to realize what hinders thousands of developers, who would love to use Pypy to make the switch from CPython ???

                Please tell me which real software projects use PyPy and for what reason they would need Py3K support!


                Go ahead and add more language constructs that you can use to run academic programs even faster and keep ignoring what is really necessary to push Pypy into day-to-day usability

                (* frustrated *)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-10-08 20:11: +
                +
                +

                Hi Stefan. There is no one who steers direction in PyPy. Since this is open source, people either work on what they like, because it's fun or scratches their itch. Note that Python 3 work is something that people expressed interest in funding -- if they fund it enough, why wouldn't developers work on it? It's more interesting than most jobs.

                With regard to C extensions - it's good enough for many people, like quora to run on PyPy. Improving the support is boring and frustrating, so I don't think anyone would be willing to invest significant amount of his *free time* into that. However, feel free to speak with your money, you know how to find me.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Stefan M wrote on 2011-10-13 03:46: +
                +
                +

                Hi Maciej,

                I realize that I came across in a somewhat obnoxious way. Sorry for that - I simply did not realize that PyPy is a true hobbyist project (at least currently).
                I wish I could contribute funding but though I am using Python a lot at work, we are a National Lab and struggling to keep our government funding ourselves.

                I hope a deep-pocket corporate will fund the Numpy development

                Cheers, Stefan

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-26 11:09: +
                +
                +

                i wonder why you don't get (more) funding from google?

                you seem to have reached the goal of unladen swallow now and there still is room for improvement.

                and it would be peanuts for them anyway. :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html b/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html new file mode 100644 index 000000000..10028f51b --- /dev/null +++ b/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html @@ -0,0 +1,461 @@ + + + + + +More Compact Lists with List Strategies | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                More Compact Lists with List Strategies

                + + + +
                +

                Since we are getting closer to merging the list-strategy branch I want to try to explain this memory optimization today.

                +

                Datatypes in PyPy are stored as W_<type>Objects (e.g. W_StringObject to represent strings, W_IntObject to represent ints). This is necessary due to the dynamic nature of Python. So the actual value (e.g. string, integer) is stored inside that box, resulting in an indirection. When having a large amount of such boxed objects, for example in a list, the wasted memory can become quite large.

                +

                If you have a closer look at such lists, you will see that in many of them only one type of data is stored and only few (and smaller) lists store mixed types. Another thing to observe is that those lists won't change the types of the objects they contain at runtime very often. For instance a list of a million integers is very unlikely to suddenly get a string appended to it.

                +

                List Strategies

                +

                The goal of this work is to write an optimization that exploits this behaviour. Instead of wrapping all items in a list, we implement lists in a way that they are optimized for storing certain (primitive) datatypes. These implementations store the content of the list in unwrapped form, getting rid of the extra indirection and wrapper objects.

                +

                One approach would be to add a level of indirection, making each W_ListObject instance point to another object that stores the actual content. For this other object, several implementations would exist, for every datatype we want to store without wrapping it (as well as a general one that deals with arbitrary content). The data layout would look something like this:

                +

                This approach has the problem that we need two indirections to get to the data and that the implementation instances need memory themselves.

                +

                What we would like to do is to make the W_ListObject point to an RPython list directly, that contains either wrapped or unwrapped data. This plan has the problem that storing different unwrapped data is not directly possible in RPython.

                +

                To solve the problem, we use the rerased RPython library module. It allows us to erase the type of an object, in this case lists, and returns something similar to void-star in C, or Object in Java. This object is then stored on the W_ListObject in the field storage. If we want to work with the list, for example to append or delete items, we need to unerase the storage again.

                +

                Example for rerase:

                +
                storage = erase([1 ,2 ,3 ,4])
                +# storage is an opaque object that you can do nothing with
                +....
                +l = unerase(storage)
                +l.clear()
                +
                +

                Now that we know how to make the W_ListObject point directly to wrapped or unwrapped data, we need to find out how to actually do any operations on this data. This can be accomplished by adding another field to our W_ListObject. This field points to a ListStrategy object. The actual implementation of W_ListObject is now deferred to those ListStrategy classes. For instance, a W_ListObject which holds only integers will use the IntegerListStrategy.

                +

                When the type of content is being changed, we need to change the used strategy as well as the storage in compatible ways. For example when we add a string to the list of integers we need to switch to the ObjectListStrategy and change the storage to be a list of wrapped objects. Thus the currently used strategy always knows what to do with what is currently in the storage.

                +

                As you can see, we now save one level of indirection by storing some of the data unwrapped. Of course each operation on a list needs to go via the strategy, but since we save one indirection for each element stored in that list and the Strategy classes are singletons, the benefits outweigh the costs.

                +

                Currently there are only strategies for integers and strings since many lists seem to have these datatypes. Other strategies, e.g. for floats and unicode strings, are planned. We also implemented two special strategies for empty lists and range-lists. The EmptyListStrategy's storage is None. If objects are added to the list we just switch to the appropriate strategy (determined by the item's type). RangeListsStrategies do not store any items at all. Instead they only store values describing the range of the list, i.e. start, step and length. On any operation that changes the data of the list we switch to the IntegerListStrategy.

                +

                A nice side-effect of storing unwrapped datatypes is that we can implement optimized methods for certain cases. For instance, since comparison of unwrapped integers is now much faster than comparison between arbitrary objects, we can rewrite the sorting methods for lists containing integers.

                +

                Microbenchmarks

                +

                Finally here is an early overview of the memory consumption of different Python implementations: CPython, PyPy and PyPy-list which uses list-strategies. To demonstrate how powerful list-strategies can be in the best case, we wrote benchmarks that create a list of integers, a list of strings and a range-list, each with one million elements, and then read out the heap size of the process as reported by the OS.

                +

                The results are as follows:

                +

                The savings on integers and strings in this ideal case are quite big.

                +

                The benchmark for range-lists is a little unfair, since in CPython one could accomplish the same memory behaviour using xrange. However, in PyPy users won't notice that internally the list does not store all items, making it still possible to use all list methods, such as append or delete.

                +

                Conclusion

                +

                We hope that list strategies bring memory savings for applications that use homogeneous lists of primitive types. Furthermore, operations on such lists tend to be somewhat faster as well. This also integrates well with the JIT. The list strategies optimizations will be merged into PyPy's default branch at some point in the next months. An equivalent optimization for dictionaries has already been merged (and is part of PyPy 1.6), one for sets is coming in the future.

                +

                Lukas Diekmann and Carl Friedrich Bolz

                +
                +

                Comments

                +
                +
                +
                + + Winston Ewert wrote on 2011-10-11 13:10: +
                +
                +

                Nice.

                But isn't there a small change in semantics to do that? If I push a python int object onto a list and then pop it back off I'll have the exact same object. But if you unwrap the object and store it as a plain int and then repop it I don't have the exact same object. I've got a new object.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-11 13:20: +
                +
                +

                It seems to be very nice.

                By the way, are object attributes optimized the same way? Objects of the same class can be expected to frequently store data of the same type in the same attribute.
                I've found a nearly-year-old post on maps ( https://morepypy.blogspot.com/2010/11/efficiently-implementing-python-objects.html ), but it does not mention attribute value types... has this idea been considered?

                +
                +
                +
                +
                + + Unknown wrote on 2011-10-11 13:25: +
                +
                +

                I can see float support presenting some interesting challenges being emblematic of a wider issue. It would be very easy for someone to have a list of "floats" but if they populated it with any literals, most likely they'll be integer literals, missing any of the float optimization.

                For most apps this won't be a problem but if someone is trying to optimize their application they might see this as a performance heisenbug. For example they write a hard coded list and it is slow, read it from a file and it is fast.

                One approach is for there to be a document on some website (that gets out of date) that lists PyPy micro-optimizations. Someone would then need continually audit their code against that list. This doesn't seem practical.

                I've seen posted some low level visualization tools. I'd be curious how practical it would be to have a higher level profiler tool integrate with the JIT to detect patterns like the list of mixed float/int situation to flag these micro-optimizations in a more automated fashion.

                +
                +
                +
                +
                + + Alex wrote on 2011-10-11 13:27: +
                +
                +

                Winston: Indeed, very clever of you to notice :) However, we noticed as well, going forward integers (and other primitives) identity will be a function of their value, not the identity of their box. This means that for all ints `i is x` if and only if `i == x`. This also means that `id()` is now a function of value for primitives. Don't rely on that though! Just like we don't want people relying on `i is x` if `i == x and -100 < i < 200`, we don't want people relying on this either.

                Anonymous:

                Yes, this is definitely a consideration, I keep meaning to make time to work on this.

                +
                +
                +
                +
                + + evilpies wrote on 2011-10-11 14:08: +
                +
                +

                Well interesting, SpiderMonkey is considering to implement something like this, because NaN-boxing usually wastes a lot of memory.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-10-11 19:52: +
                +
                +

                @Ed I think float list can accommodate a limited set of integer values (those that can be represented correctly when interpreted as float) without any issue. You would then however need to tag which one is integer and which one is float, having to keep a bitmap. That's certainly possible, but a bit of a mess.

                +
                +
                +
                +
                + + Alex wrote on 2011-10-11 20:23: +
                +
                +

                fijal: I think better than obscure hacks like a bitmap allowing integers as floats, perhaps it would be better just to eventually have logging of when you get fallbacks like that. For eventual integration with the jitviewer of course :)

                +
                +
                +
                +
                + + Winston Ewert wrote on 2011-10-11 20:54: +
                +
                +

                A general runtime warning system that could say things like: "list of floats decaying to list of objects because of adding int", "two ints being compared via is", etc. might be useful. That could handle any number of situations with surprising semantics or performance.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-11 21:47: +
                +
                +

                This is very interesting. I have been thinking along somewhat similar lines for a while (but for performance reasons, rather than memory size), and so have already reviewed how I use lists in my own code. In my own programs, having non-uniform data types in a list is extremely rare. However, some lists are lists of lists (or tuples or dictionaries). The most common large lists however tend to be lists of strings.

                1) If I correctly understand your explanation of what you are doing, your "list strategies" are effectively marking uniform lists as being either of one of a few known basic types (e.g. IntegerListStrategy), or just a traditional list of objects. Is that correct?

                2) Do you think there are any meaningful performance optimisations which could be gained when the list type is known in advance?

                3) What about built-in functions such as all(), any(), len(), min(), max(), etc? Would they be able to make use of this to improve their performance?

                4) Would the underlying array data format be exposed for people who want to write extensions making direct use of it (e.g. for things like SIMD libraries)?

                5) Could this allow a list to be in shared memory and directly accessed by another program?

                6) Would the new list format be compatible with a "memoryview" (as str and bytearray are)?

                7) My own earlier thoughts had involved marking a list as being of a uniform or non-uniform data when the list is created or altered, and using optimised code for the expected type for uniform lists. One sticky point however was threading, as a different type could be appended in another thread, which means that the consuming function would have to somehow be aware of this. Would your concept have a problem with threading if appending a string to an integer list suddenly required changing the underlying list strategy while another thread was accessing the same list?

                8) Python 3.x seems to favour iterators over creating lists (e.g. map, filter, range are replaced by what used to be imap, ifilter, and xrange), and generators were introduced to complement list comprehensions in order to save memory. Does this have any implications for what you are doing?

                9) Could your list concept be applied by the CPython developers to CPython? This might help ensure that any subtle semantic issues which arise as a result apply equally to CPython, rather than having people call them "Pypy bugs".

                10) What about changing the Python language semantics to allow a user to specify that a list must be of a specific uniform type, and raising a type error if an element(s) of an unexpected type is added to the list? This is actually a language feature that I would like to have in order to catch errors without having to write code to examine each individual data element (as that can be slow and error prone in itself).

                11) Finally, why is there such a large start-up memory use in your micro-benchmarks when comparing Pypy-list to CPython? Is this just general overhead from Pypy itself, or is that due to something related to converting the list format to a particular "list strategy"?

                +
                +
                +
                +
                + + Alex wrote on 2011-10-11 23:14: +
                +
                +

                Anonymous: Wow a lot of questions, I'll try to answer them :)

                1) Yes.

                2) Probably not, you get the most performance gains when you have a large list, and if it's large the very-very-very-small initial transition is amortized over many elements.

                3) Many of those are pure-python and will thus automatically gain these benefits, max() and min() unfortunately are not.

                4) Probably not, we don't expose this data in any other place nor do we have any APIs for it.

                5) I suppose in theory, again we have no API for it.

                6) No, it wouldn't be, since that's not a part of the list API. We don't define the language, we just implement it (faster).

                7) No, there's no problem with this, you simply need to lock (or whatever the equivalent in STM is) the list and do the modifications.

                8) No, I don't think it does.

                9) Yes, it could be applied to CPython with slightly more difficulty, and it would see the memory gains. However, it would see performance losses (as you do with the array module on CPython) because it would need to box/unbox at every iteration, whereas the JIT is able to remove that.

                10) Propose it to python-ideas, we don't define the language.

                11) I can't understand the charts, so I can't answer this one.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-12 03:15: +
                +
                +

                Alex: I'm the anonymous with all the questions. Thank you for your detailed answers. I completely understand that there are side issues that you don't want to deal with at this time.

                As for the possible performance effects of the proposed new list data format if applied to CPython, doing the operation: "y = reduce(operator.add, x, 0)" where x is either a list or array of 1,000,000 integers does not seem to produce a measurable difference in speed for me (Python 2.6 on 64 bit Ubuntu). Any differences seem to go either way when the test is repeated, so they seem equivalent within the margin of error. An equivalent for loop yields the same result (except for being slower, of course).

                When extracting or replacing slices for lists and arrays (e.g. "y = x[i:i + 50]" and "x[i:i + 50] = y") within a for loop, the array version seems to be significantly *faster* than the list version for large slices (e.g. 50), and approximately the same for small slices (e.g. 5).

                Theoretically, yes the implementation with array should always be slower, but I can't seem to get that result when I attempt to measure it. Perhaps I'm doing something wrong, but it appears from the (admittedly minimal) testing that I have done that significant speed penalties for CPython cannot simply be assumed.

                I realize that ultimately this isn't a matter for the Pypy developers to concern themselves with, but should the question ever arise I don't think it can be dismissed out of hand.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-10-12 08:18: +
                +
                +

                Some additional thoughts to @Anonymous questions:

                3) What about built-in functions such as all(), any(), len(), min(), max(), etc? Would they be able to make use of this to improve their performance?

                len does not depend on the content of the list, so it does not win. all, any, min and max could be improved, yes.

                7) My own earlier thoughts had involved marking a list as being of a uniform or non-uniform data when the list is created or altered, and using optimised code for the expected type for uniform lists. One sticky point however was threading, as a different type could be appended in another thread, which means that the consuming function would have to somehow be aware of this. Would your concept have a problem with threading if appending a string to an integer list suddenly required changing the underlying list strategy while another thread was accessing the same list?

                The JIT does indeed produce special optimized code for the type of list it is currently observing, making operations faster. The fact that another thread could change the type of the list is not a problem, because we have a GIL and thus the JIT knows at which points another thread can run.

                10) What about changing the Python language semantics to allow a user to specify that a list must be of a specific uniform type, and raising a type error if an element(s) of an unexpected type is added to the list? This is actually a language feature that I would like to have in order to catch errors without having to write code to examine each individual data element (as that can be slow and error prone in itself).

                this already exists. it's called the array module.

                11) Finally, why is there such a large start-up memory use in your micro-benchmarks when comparing Pypy-list to CPython? Is this just general overhead from Pypy itself, or is that due to something related to converting the list format to a particular "list strategy"?

                The higher startup memory is also there in the PyPy without list strategies, so those have nothing to do with it.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-13 09:54: +
                +
                +

                I too had trouble understanding the chart. The vertical axis doesn't have negative numbers to represent a delta, just ignore the signs.

                The blue area is an algebraically positive area, representing the startup memory use. The yellow area represents the memory use delta after doing the 1e6 items list operations.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-10-19 13:26: +
                +
                +

                Re list of floats-and-ints: a fully compatible way is to use the NaN-tagging idea from SpiderMonkey, i.e. have a special encoding of NaN that is normally not used, and that still leaves 32 bits of extra information. We would then represent ints in the list as such a NaN-encoded float. (At least it works as long as the integer is not too large, on 64-bit platforms.)

                +
                +
                +
                +
                + + Ole Laursen wrote on 2011-11-18 14:55: +
                +
                +

                Neat!

                Nice work people. I'm amazed it's so simple to do after all, just switch type based on what the first element is. It must be a big boon for garbage collection, too?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-12-16 14:07: +
                +
                +

                The benchmark measures virtual memory (don't know on which architecture); measuring RSS would be more representative of the actual amount of RAM spent storing the data. Presumably it would also be more favourable to PyPy, since moving garbage collection doubles the amount of virtual memory.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/10/numpy-funding-and-status-update-2380711174693638392.html b/posts/2011/10/numpy-funding-and-status-update-2380711174693638392.html new file mode 100644 index 000000000..a277774d8 --- /dev/null +++ b/posts/2011/10/numpy-funding-and-status-update-2380711174693638392.html @@ -0,0 +1,388 @@ + + + + + +Numpy funding and status update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Numpy funding and status update

                + + + +
                +

                Hi everyone,

                +

                It's been a little while since we wrote about NumPy on PyPy, so we wanted to +give everyone an update on what we've been up to, and what's up next for us.

                +

                We would also like to note that we're launching a funding campaign +for NumPy support in PyPy. Details can be found on the donation page.

                +

                Some of the things that have happened since last we wrote are:

                +
                  +
                • We added dtype support, meaning you can now create arrays of a bunch of +different types, including bools, ints of various sizes, and floats.
                • +
                • More array methods and ufuncs, including things like comparison methods +(==, >, etc.)
                • +
                • Support for more and more argument types, for example you can index by a +tuple now (only works with tuples of length one, since we only have +single-dimension arrays thus far).
                • +
                +

                Some of the things we're working on at the moment:

                +
                  +
                • More dtypes, including complex values and user-defined dtypes.
                • +
                • Subscripting arrays by other arrays as indices, and by bool arrays as masks.
                • +
                • Starting to reuse Python code from the original numpy.
                • +
                +

                Some of the things on the near horizon are:

                +
                  +
                • Better support for scalar data, for example did you know that +numpy.array([True], dtype=bool)[0] doesn't return a bool object? +Instead it returns a numpy.bool_.
                • +
                • Multi-dimensional array support.
                • +
                +

                If you're interested in helping out, we always love more contributors, +Alex, Maciej, Justin, and the whole PyPy team

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-10-12 23:34: +
                +
                +

                What is the best way to contact people about this? Our company has some interest in sponsoring this work, but it wasn't clear from this or the donations page how to actually talk to anyone about it. Maybe I'm missing the obvious.

                +
                +
                +
                +
                + + Alex wrote on 2011-10-12 23:53: +
                +
                +

                Anonymous: The address to contact is "pypy at sfconservancy.org". Thanks!

                +
                +
                +
                +
                + + stan wrote on 2011-10-13 00:14: +
                +
                +

                Yay! Time to put my money where my mouth is. :)

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-14 07:31: +
                +
                +

                What does it mean "Starting to reuse Python code from the original numpy"? If it is copy-paste and something will be changed in numpy git trunk, will it be automatically taken into account by your numpy for PyPy?

                +
                +
                +
                +
                + + Luis wrote on 2011-10-15 04:05: +
                +
                +

                This is off topic but, congratulations! You already achieved Unladen Swallow's performance goal of 5x faster than cpython on average.

                https://code.google.com/p/unladen-swallow/wiki/ProjectPlan#Performance

                https://speed.pypy.org/

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-17 08:51: +
                +
                +

                You probably have already seen that, but there is an interesting comment from Travis Oliphant about the porting of numpy to pypy :

                https://technicaldiscovery.blogspot.com/2011/10/thoughts-on-porting-numpy-to-pypy.html

                +
                +
                +
                +
                + + D wrote on 2011-10-17 10:50: +
                +
                +

                You haven't answered my question about reuse numpy code for 3 days, I guess because you don't know it overall. I'm not 100% agree with neither Travis opinion nor Stefan M comment from https://morepypy.blogspot.com/2011/09/py3k-for-pypy-fundraiser.html , but in answer to Stefan M you say "Since this is open source, people either work on what they like, because it's fun or scratches their itch" and "Improving the [C extensions] support is boring and frustrating". Guys, AFAIK you received FP7 support for developing some soft for users, not for fun. You should spend some efforts for boring yet important work toward the mentioned things, if you would like to obtain further increase of users number and finance support. Also, clarification about reusing CPython numpy code is also highly appreciated.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/10/pypy-goteborg-post-halloween-sprint-nov-7335004338996313725.html b/posts/2011/10/pypy-goteborg-post-halloween-sprint-nov-7335004338996313725.html new file mode 100644 index 000000000..51588e8b8 --- /dev/null +++ b/posts/2011/10/pypy-goteborg-post-halloween-sprint-nov-7335004338996313725.html @@ -0,0 +1,358 @@ + + + + + +PyPy Göteborg Post-Hallowe'en Sprint Nov 2nd - Nov 9th | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Göteborg Post-Hallowe'en Sprint Nov 2nd - Nov 9th

                + + + +
                +

                The next PyPy sprint will be in Gothenburg, Sweden. It is a public sprint, +suitable for newcomers. We'll focus on making a public kickoff for +both the numpy/pypy integration project +and the Py3k support project, +as well as whatever interests the Sprint attendees. Since both of these +projects are very new, there will be plenty of work suitable for newcomers +to PyPy.

                +

                Other topics might include:

                +
                  +
                • Helping people get their code running with PyPy
                • +
                • work on a FSCons talk?
                • +
                • state of the STM Vinnova project (We most likely, but not for certain will +know whether or not we are approved by this date.)
                • +
                +
                +

                Other Useful dates

                +

                GothPyCon - Saturday Oct 29.

                +

                FSCONS Friday Nov 11 - Sunday Nov 13.

                +
                +
                +

                Location

                +

                The sprint will be held in the apartment of Laura Creighton and Jacob Hallén +which is at Götabergsgatan 22 in Gothenburg, Sweden. Here is a map. This is +in central Gothenburg. It is between the tram stops of Vasaplatsen and +Valand, (a distance of 4 blocks) where many lines call -- the 2, 3, 4, 5, +7, 10 and 13.

                +

                Probably cheapest and not too far away is to book accommodation at SGS +Veckobostader. The Elite Park Avenyn Hotel is a luxury hotel just a +few blocks away. There are scores of hotels a short walk away from the +sprint location, suitable for every budget, desire for luxury, and desire +for the unusual. You could, for instance, stay on a boat. Options are +too numerous to go into here. Just ask in the mailing list or on the blog.

                +

                Hours will be +from 10:00 until people have had enough. It's a good idea to arrive a +day before the sprint starts and leave a day later. In the middle of +the sprint there usually is a break day and it's usually ok to take +half-days off if you feel like it. Of course, many of you may be interested +in sticking around for FSCons, held the weekend after the sprint.

                +
                +
                +

                Good to Know

                +

                Sweden is not part of the Euro zone. One SEK (krona in singular, kronor +in plural) is roughly 1/10th of a Euro (9.36 SEK to 1 Euro).

                +

                The venue is central in Gothenburg. There is a large selection of +places to get food nearby, from edible-and-cheap to outstanding. We +often cook meals together, so let us know if you have any food allergies, +dislikes, or special requirements.

                +

                Sweden uses the same kind of plugs as Germany. 230V AC.

                +
                +
                +

                Getting Here

                +

                If you are coming by train, you will arrive at the Central Station. It is +about 12 blocks to the site from there, or you can take a tram.

                +

                There are two airports which are local to Göteborg, Landvetter (the main +one) and Gothenburg City Airport (where some budget airlines fly). +If you arrive at Landvetter the airport bus stops right downtown at +Elite Park Avenyn Hotel which is the second stop, 4 blocks from the +Sprint site, as well as the end of the line, which is the Central Station. +If you arrive at Gothenburg City Airport take the bus to the end of the +line. You will be at the Central Station.

                +

                You can also arrive by ferry, from either Kiel in Germany or Frederikshavn +in Denmark.

                +
                +
                +

                Who's Coming?

                +

                If you'd like to come, please let us know when you will be arriving and +leaving, as well as letting us know your interests. We'll keep a list +of people which we'll update (which you can do yourself if you +have bitbucket pypy commit rights).

                +
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/10/speeding-up-json-encoding-in-pypy-8937643890263223898.html b/posts/2011/10/speeding-up-json-encoding-in-pypy-8937643890263223898.html new file mode 100644 index 000000000..d46947d87 --- /dev/null +++ b/posts/2011/10/speeding-up-json-encoding-in-pypy-8937643890263223898.html @@ -0,0 +1,549 @@ + + + + + +Speeding up JSON encoding in PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Speeding up JSON encoding in PyPy

                + + + +
                +

                Hi

                +

                Recently I put a bit of effort into speeding up JSON in PyPy. I started with +writing a benchmark, which is admittedly not a very good one, but it's +better than nothing (suggestions on how to improve it are welcome!).

                +

                For this particular benchmark, the numbers are as follows. Note that CPython by +default uses the optimized C extension, while PyPy uses the pure Python one. +PyPy trunk contains another pure Python version which has been optimized +specifically for the PyPy JIT. Detailed optimizations are described later in +this post.

                +

                The number reported is the time taken for the third run, when things are +warmed up. Full session here.

                + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + +
                CPython 2.6 | 22s
                CPython 2.7 | 3.7s
                CPython 2.7 no C extension | 44s
                PyPy 1.5 | 34s
                PyPy 1.6 | 22s
                PyPy trunk | 3.3s
                +

                Lessons learned:

                +
                +

                Expectations are high

                +

                A lot of performance critical stuff in Python world is already written in hand +optimized C. Writing C (especially when you interface with CPython C API) is +ugly and takes significant effort. This approach does not scale well when +there is a lot of code to be written or when there is a very tight coupling +between the part to be rewritten and the rest of the code. Still, people would +expect PyPy to be better at "tasks" and not precisely at running equivalent +code, hence a comparison between the C extension and the pure python version +is sound. Fortunately it's possible to outperform the C extension, but requires +a bit of effort on the programmer side as well.

                +
                +
                +

                Often interface between the C and Python part is ugly

                +

                This is very clear if you look at json module as implemented in CPython's +standard library. Not everything is in C (it would probably be just too +much effort) and the interface to what is in C is guided via profiling not +by what kind of interface makes sense. This especially is evident comparing CPython 2.6 to 2.7. +Just adapting the code to an interface with C made the Python version slower. +Removing this clutter improves the readability a lot and improves PyPy's version +a bit, although I don't have hard numbers.

                +
                +
                +

                JitViewer is crucial

                +

                In case you're fighting with PyPy's performance, jitviewer is worth a shot. +While it's not completely trivial to understand what's going on, it'll +definitely show you what kind of loops got compiled and how.

                +
                +
                +

                No nice and fast way to build strings in Python

                +

                PyPy has a custom thing called __pypy__.builders.StringBuilder. It has +a few features that make it much easier to optimize than other ways like +str.join() or cStringIO.

                +
                  +
                • You can specify the start size, which helps a lot if you can even provide +a rough estimate on the size of the string (less copying)
                • +
                • Only append and build are allowed. While the string is being built you +can't seek or do anything else. After it's built you can never append any more.
                • +
                • Unicode version available as well as __pypy__.builders.UnicodeBuilder.
                • +
                +
                +
                +

                Method calls are ok, immutable globals are ok

                +

                PyPy's JIT seems to be good enough for at least the simple cases. Calling +methods for common infrastructure or loading globals (instead of rebinding as +locals) is fast enough and improves code readability.

                +
                +
                +

                String copying is expensive

                +

                Edit: see the comment at the end

                +

                If you use re.sub, the current implementation will always create a copy +of the string even if there was no match to replace. +If you know your regexp is simple, first try to check if there is +anything to replace. This is a pretty hard optimization to +do automatically -- simply matching the regular expression can be too costly +for it to make sense. In our particular example however, the regexp is really +simple, checking ranges of characters. It also seems that this is by far the +fastest way to escape characters as of now.

                +
                +
                +

                Generators are slower than they should be

                +

                I changed the entire thing to simply call builder.append instead of +yielding to the main loop where it would be gathered. This is kind of a PyPy +bug that using generators extensively is slower, but a bit hard to fix. +Especially in cases where there is relatively little data being passed around +(few bytes), it makes sense to gather it first. If I were to implement an +efficient version of iterencode, I would probably handle chunks of +predetermined size, about 1000 bytes instead of yielding data every few bytes.

                +
                +
                +

                I must admit I worked around PyPy's performance bug

                +

                For obscure (although eventually fixable) reasons, this:

                +
                +for c in s: # s is string
                +  del c
                +
                +

                is faster than:

                +
                +for c in s:
                +  pass
                +
                +

                This is a PyPy performance bug and should be fixed, but on a different branch ;-)

                +
                +
                +

                PyPy's JIT is good

                +

                I was pretty surprised, but the JIT actually did make stuff work nicely. +The changes that were done were relatively minor and straightforward, once +the module was cleaned to the normal "pythonic" state. +It is worth noting that it's possible to write code in Python and make it +run really fast, but you have to be a bit careful. Again, jitviewer is your +friend when determining why things are slow. I hope we can write more tools +in the future that would more automatically guide people through potential +performance pitfalls.

                +

                Cheers, +fijal

                +

                Edit: I was wrong about re.sub. It just seems to be that the JIT is figuring match better than sub, will be fixed soon

                +
                +
                +

                Comments

                +
                +
                +
                + + Ian McKellar wrote on 2011-10-27 17:20: +
                +
                +

                It would be neat to get UnicodeBuilder and StringBuilder in to mainline Python. They'd be more efficient in CPython than existing string construction methods and it would be easier to write more performant portable Python.

                +
                +
                +
                +
                + + Yury S wrote on 2011-10-27 17:32: +
                +
                +

                Can you elaborate a bit on the slowness of generators?

                +
                +
                +
                +
                + + Alex wrote on 2011-10-27 17:52: +
                +
                +

                Ian: yes it would, python-ideas/dev has had this discussion many times, if you want to convince them of the merit of this idea, feel free to try, but I've gotten weary of this discussion

                +
                +
                +
                +
                + + Anonymous wrote on 2011-10-27 23:27: +
                +
                +

                This is not meant to derail the rather nice performance numbers, but I wouldn't call the json/simplejson code "pythonic" in the first place.

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2011-10-28 07:06: +
                +
                +

                I wonder if using a constant object to dump in each iteration doesn't skew the benchmark in favor of pypy, whereas the jit couldn't optimize as much with a varying object (which is what usually happens in real-life scenarios).

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-10-28 07:27: +
                +
                +

                @Gaetan it certainly could in theory. In practice it does not occur here, but I only know that from looking at traces. However, creating a new object each time would make the benchmark more of an object creation one (probably GC related)

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2011-10-28 07:42: +
                +
                +

                @Maciej: not if you build the list of objects to dump out of the timed loop, or did I miss something?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-10-28 07:47: +
                +
                +

                True, that might be a bit biggish though. Anyway as I said, it's good enough, JIT does not assume such things are constant. In fact it would execute exactly the same code for similarly shaped objects (different if all objects slightly differ in shape though)

                +
                +
                +
                +
                + + James Thiele wrote on 2011-10-28 16:11: +
                +
                +

                Interfacing Python to C isn't ugly if you use Cython.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-10-28 16:31: +
                +
                +

                That is probably a matter of taste which we should not discuss among gentlemen, I however find pure python better than Python-Cython-C combination. Also parsing JSON in C is not fun at all.

                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2011-10-31 19:15: +
                +
                +

                The guys from ultrajson have a benchmark here https://github.com/esnme/ultrajson/blob/master/python/benchmark.py

                and the results are in the README https://github.com/esnme/ultrajson/blob/master/README

                would be interesting to run those benchmarks (of course, first warming up the jit), and comparing the results to ultrajson.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-10-31 20:43: +
                +
                +

                feel free leonardo :)

                +
                +
                +
                +
                + + Leonardo Santagada wrote on 2011-11-01 14:42: +
                +
                +

                It was just a suggestion on how to improve it, like you asked. If it was just going to be ignored I would not have bothered.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/11/gothenburg-sprint-report-8371395613874909242.html b/posts/2011/11/gothenburg-sprint-report-8371395613874909242.html new file mode 100644 index 000000000..5a05075a7 --- /dev/null +++ b/posts/2011/11/gothenburg-sprint-report-8371395613874909242.html @@ -0,0 +1,365 @@ + + + + + +Gothenburg sprint report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Gothenburg sprint report

                + + + +
                +

                In the past week, we have been busy hacking on PyPy at the Gothenburg sprint, the second of 2011. The sprint was held at Laura's and Jacob's place, and here is a brief report of what happened.


                +In the first day we welcomed Mark Pearse, who was new to PyPy and at his first sprint. Mark worked the whole sprint in the new SpecialisedTuple branch, whose aim is to have a special implementation for small 2-items and 3-items tuples of primitive types (e.g., ints or floats) to save memory. Mark paired with Antonio for a couple of days, then he continued alone and did an amazing job. He even learned how to properly do Test Driven Development :-).

                +Antonio spent a couple of days investigating whether it is possible to use application checkpoint libraries such as BLCR and DMTCP to save the state of the PyPy interpreter between subsequent runs, thus saving also the JIT-compiled code to reduce the warmup time. The conclusion is that these are interesting technologies, but more work would be needed (either on the PyPy side or on the checkpoint library side) before it can have a practical usage for PyPy users.

                +Then, Antonio spent most of the rest of the sprint working on his ffistruct branch, whose aim is to provide a very JIT-friendly way to interact with C structures, and eventually implement ctypes.Structure on top of that. The "cool part" of the branch is already done, and the JIT already can compile set/get of fields into a single fast assembly instruction, about 400 times faster than the corresponding ctypes code. What is still left to do is to add a nicer syntax (which is easy) and to implement all the ctypes peculiarities (which is tedious, at best :-)).

                +As usual, Armin did tons of different stuff, including fixing a JIT bug, improving the performance of file.readlines() and working on the STM branch (for Software Transactional Memory), which is now able to run RPython multithreaded programs using software transactions (as long as they don't fill up all the memory, because support for the GC is still missing :-)). Finally, he worked on improving the Windows version of PyPy. While doing so he discovered together with Anto a terrible bug which led to a continuous leak of stack space because the JIT called some functions using the wrong calling convention.

                +Håkan, with some help from Armin, worked on the jit-targets branch, whose goal is to heavily refactor the way the traces are internally represented by the JIT, so that in the end we can produce (even :-)) better code than what we do nowadays. More details in this mail.

                +Andrew Dalke worked on a way to integrate PyPy with FORTRAN libraries, and in particular the ones which are wrapped by Numpy and Scipy: in doing so, he wrote f2pypy, which is similar to the existing f2py but instead of producing a CPython extension module it produces a pure Python module based on ctypes. More work is needed before it can be considered complete, but f2pypy is already able to produce a wrapper for BLAS which passes most of the tests under CPython, although there's still work left to get it working for PyPy.

                + + + +
                Armin and Håkan with Laura's "5x faster" cake
                Christian Tismer worked the whole sprint on the branch to make PyPy compatible with Windows 64 bit. This needs a lot of work because a lot of PyPy is written under the assumption that the long type in C has the same bit size as void*, which is not true on Win64. Christian says that in the past Genova-Pegli sprint he completed 90% of the work, and in this sprint he did the other 90% of the work. Obviously, what is left to complete the task is the third 90% :-). More seriously, he estimated a total of 2-4 person-weeks of work to finish it.

                +But, all in all, the best part of the sprint has been the cake that Laura baked to celebrate the "5x faster than CPython" achievement. Well, actually our speed page reports "only" 4.7x, but that's because in the meantime we switched from comparing against CPython 2.6 to comparing against CPython 2.7, which is slightly faster. We are confident that we will reach the 5x goal again, and that will be the perfect excuse to eat another cake :-) +
                +

                Comments

                +
                +
                +
                + + Albien wrote on 2011-11-15 00:40: +
                +
                +

                Freaking amazing guys together!!!

                +
                +
                +
                +
                + + Kumo wrote on 2011-11-15 03:28: +
                +
                +

                "5x faster than CPython cake". Sounds delicious.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-15 10:18: +
                +
                +

                awesome! what do you think? how much room for improvement is there? is 10x possible? :)

                +
                +
                +
                +
                + + Luis wrote on 2011-11-15 13:52: +
                +
                +

                Congratulations! I guess that 5x faster (Unladen Swallow's performance goal) means that pypy is now "officially" fast.

                As Anonymous asked above, I also wonder how much room for improvement there is from now on.
                Have all the low hanging fruits been picked already? Can we expect this pace of improvement to go on for a while? Or you are close to hit the limit?

                Well, I know it's hard to predict... I'd just like to know what your heart tells you :-)

                Thank you guys for all the hard work!

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-18 15:56: +
                +
                +

                does pygame work with pypy? would be awesome... what about pyopengl?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-19 00:28: +
                +
                +

                Sorry, but pyopengl requires either numpy or Numeric, which unfortunately ain't supported yet.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-12-17 01:06: +
                +
                +

                Five times faster than CPython. Great! How does it compare to C?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/11/pypy-17-on-win32-4962523601794245248.html b/posts/2011/11/pypy-17-on-win32-4962523601794245248.html new file mode 100644 index 000000000..d7a9c0ffd --- /dev/null +++ b/posts/2011/11/pypy-17-on-win32-4962523601794245248.html @@ -0,0 +1,290 @@ + + + + + +PyPy 1.7 on Win32 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.7 on Win32

                + + + +
                +

                Hi all,

                + +

                We have fixed _continuation on Win32 (thanks Stakkars), and so we have now a Win32 version of PyPy 1.7.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/11/pypy-17-widening-sweet-spot-4260962828394182017.html b/posts/2011/11/pypy-17-widening-sweet-spot-4260962828394182017.html new file mode 100644 index 000000000..d65d1a687 --- /dev/null +++ b/posts/2011/11/pypy-17-widening-sweet-spot-4260962828394182017.html @@ -0,0 +1,613 @@ + + + + + +PyPy 1.7 - widening the sweet spot | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.7 - widening the sweet spot

                + + + +
                +

                We're pleased to announce the 1.7 release of PyPy. As has become a habit, this +release brings a lot of bugfixes and performance improvements over the 1.6 +release. However, unlike the previous releases, the focus has been on widening +the "sweet spot" of PyPy. That is, classes of Python code that PyPy can greatly +speed up should be vastly improved with this release. You can download the 1.7 +release here:

                +
                +https://pypy.org/download.html +
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 1.7 and cpython 2.7.1 performance comparison) +due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 32/64 or +Windows 32. Windows 64 work is ongoing, but not yet natively supported.

                +

                The main topic of this release is widening the range of code which PyPy +can greatly speed up. On average on +our benchmark suite, PyPy 1.7 is around 30% faster than PyPy 1.6 and up +to 20 times faster on some benchmarks.

                +
                +
                +

                Highlights

                +
                  +
                • +

                  Numerous performance improvements. There are too many examples of Python +constructs that should now run faster to list them all.

                  +
                • +
                • +

                  Bugfixes and compatibility fixes with CPython.

                  +
                • +
                • +

                  Windows fixes.

                  +
                • +
                • +

                  PyPy now comes with stackless features enabled by default. However, +any loop using stackless features will interrupt the JIT for now, so there is no real +performance improvement for stackless-based programs yet. Contact pypy-dev for +info on how to help remove this restriction.

                  +
                • +
                • +

                  NumPy effort in PyPy was renamed numpypy. In order to try using it, simply +write:

                  +
                  +import numpypy as numpy
                  +
                  +

                  at the beginning of your program. There has been huge progress on numpy in PyPy +since 1.6, the main feature being the implementation of dtypes.

                  +
                • +
                • +

                  JSON encoder (but not decoder) has been replaced with a new one. This one +is written in pure Python, but is known to outperform CPython's C extension +up to 2 times in some cases. It's about 20 times faster than +the one that we had in 1.6.

                  +
                • +
                • +

                  The memory footprint of some of our RPython modules has been drastically +improved. This should impact any applications using for example cryptography, +like tornado.

                  +
                • +
                • +

                  There was some progress in exposing even more CPython C API via cpyext.

                  +
                • +
                +
                +
                +

                Things that didn't make it, expect in 1.8 soon

                +

                There is ongoing work which, while it didn't make it into the release, is +probably worth mentioning here. This is what you should probably expect in +1.8 some time soon:

                +
                  +
                • Specialized list implementation. There is a branch that implements lists of +integers/floats/strings as compactly as array.array. This should drastically +improve performance/memory impact of some applications
                • +
                • NumPy effort is progressing forward, with multi-dimensional arrays coming +soon.
                • +
                • There are two brand new JIT assembler backends, notably for the PowerPC and +ARM processors.
                • +
                +
                +
                +

                Fundraising

                +

                It's maybe worth mentioning that we're running fundraising campaigns for +NumPy effort in PyPy and for Python 3 in PyPy. In case you want to see any +of those happen faster, we urge you to donate to numpy proposal or +py3k proposal. In case you want PyPy to progress, but you trust us with +the general direction, you can always donate to the general pot.

                +
                +

                Cheers,
                Maciej Fijałkowski, Armin Rigo and the entire PyPy team

                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2011-11-21 12:29: +
                +
                +

                Could you put a link to some sort of NEWS file, a list of issue tracker tickets, or at least the relevant span of the revision control tool so that I could browse what sorts of changes have gone into trunk since 1.6?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-21 12:54: +
                +
                +

                "PyPy now comes with stackless features enabled by default"

                Could you please tell a bit more about it? Is it just sort of internal optimizations, something under the hood? Or does it mean tail recursion optimization? Or cooperative multitasking with greenlets? What's the API for stackless features?

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-21 14:27: +
                +
                +

                Is it so hard to wait until you have a Windows build before announcing a release?

                Or not telling in the release that the Windows binary is available?

                +
                +
                +
                +
                + + Benjamin Peterson wrote on 2011-11-21 15:30: +
                +
                +

                @Zooko

                hg log -rrelease-1.6:release-1.7

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-11-21 16:38: +
                +
                +

                I am getting a segmentation fault.

                +
                +
                +
                +
                + + D wrote on 2011-11-21 18:37: +
                +
                +

                So if I want to run PyPy on my code with numpy I have to replace in each file "import numpy" by "import numpypy", "from numpy import ..." by "from numpypy import ...". And each time I want to switch beween PyPy and CPython, I have to search and replace all those occurrences backward. Well done...

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-21 19:35: +
                +
                +

                Thank you for all your work, it's nice to see how far you have come in so little time! Keep raising the bar.

                +
                +
                +
                +
                + + Amaury wrote on 2011-11-21 21:06: +
                +
                +

                @D: Please take it the easy way and add "sys.modules['numpy'] = numpypy" at the start of your program.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-11-21 21:08: +
                +
                +

                @⚛ report a bug to bugs.pypy.org

                @D it's gonna stay like this until it's finished. The problem is that most programs won't run out of the box anyway as of now, because of some missing functionality. We'll probably rename it back once it's finished.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-11-21 21:09: +
                +
                +

                @D: all you need is to create a file "numpy.py" that contains "from numpypy import *". (The real reason we did this temporary renaming is because numpy developers asked us to.)

                More likely, though, you are probably going to hit some unimplemented feature anyway, as our numpy(py) is still incomplete.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-21 22:49: +
                +
                +

                Re: numpypy. The standard in the bad old days with three different and subtly incompatible array libraries was "try: import ...; except: ..."

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-11-22 07:14: +
                +
                +

                @Maciej: I am *not* going to submit a bug report, on purpose. When developing software for the masses, there are always two sets of users. One set comprises the users who report bugs, the other set comprises the users who are experiencing issues but do not report bugs.

                The ideal state would be that there are no bugs, but this is only theoretical of course.

                As an experiment, I have decided not to tell you any information about the segmentation fault. Nothing. Absolutely nothing.

                The question is what measures are you going to take to solve this PyPy issue.

                Good luck ...

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-11-22 08:09: +
                +
                +

                @⚛ we're going to do nothing with that. Most probably you're using a CPython C extension or some illegal ctypes invocation or older version of jinja that did that or something... Besides, there is absolutely no point in trying to fix a bug that noone can potentially provide any information for.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-11-22 09:07: +
                +
                +

                @Maciej:

                PyPy 1.6 worked OK (but it was slower than CPython).

                "we're going to do nothing with that."

                OK

                "Most probably you're using a CPython C extension or some illegal ctypes invocation or older version of jinja that did that or something..."

                I don't think so. GDB says that the EIP register stops at an address which does not seem to belong to the PyPy executable nor to any dynamically loaded library. This leads me to the conclusion that the issue is in the x86 code generated by PyPy.

                "Besides, there is absolutely no point in trying to fix a bug that noone can potentially provide any information for."

                I am not saying you have to fix it. I am just saying that PyPy 1.7 generates code that segfaults.

                Does PyPy employ partial verification when generating x86 code?

                +
                +
                +
                +
                + + Jorgen wrote on 2011-11-22 09:28: +
                +
                +

                @Flower

                "As an experiment, I have decided not to tell you any information about the segmentation fault. Nothing. Absolutely nothing."

                So you want to conduct an experiment into 'How to help out an open source project by withholding crucial information'? And I thought the ideas of my PhD-advisor were bad ...

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-22 09:56: +
                +
                +

                The point he, she, or it is making is that PyPy should contain a theorem prover to verify the code it generates so it is possible to prove mathematically that it never generates bad code—and that anything else is beneath the contempt of a serious computer scientist. If you need information about a segfault in order to debug it, you obviously have not thought it through thoroughly enough.

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-11-22 10:02: +
                +
                +

                @Jorgen and @Maciej:

                Well, I previously wrote here that "The question is what measures are you (=the PyPy team) going to take to solve this PyPy issue."

                This sentence of mine contained the additional information that: I believe that it is a PyPy issue.

                Maciej then wrote: "Most probably you're using a CPython C extension or ... that did that or something". This means he was trying to put the blame on others (C extensions or whatever) rather than admitting that it might be an issue attributable to PyPy and PyPy alone.

                Then you (Jorgen) wrote "So you want to conduct an experiment into 'How to help out an open source project by withholding crucial information'?". And that is exactly what I intend to do: to help the PyPy project by withholding crucial information.

                It will work.

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2011-11-22 10:14: +
                +
                +

                @Damian:

                "... PyPy should contain a theorem prover to verify the code it generates so it is possible to prove mathematically that it never generates bad code"

                I believe such a thing is impossible.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-22 11:36: +
                +
                +

                It's possible if you let the verifier reject legal code. It's probably not realistic though, RPython (or is that the JIT-annotation language?) would have to be designed to be verifiable for whatever property you want to verify.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2011-11-22 14:28: +
                +
                +

                @⚛: you're sitting in your own corner of the world thinking that we will try hard to figure out which segfault you could possibly mean, and that it will help the PyPy project :-) I've heard many misconceptions of how Open Source works, but I've never heard this one.

                How it really works is: you think you have a genuine segfault and want to report it, in which case you file a bug to https://bugs.pypy.org, and maybe we have to discuss more to figure out why, for example, it appears on your machine and not ours, or which configuration you need to reproduce it; sometimes it can take efforts on both parties to even reproduce the problem.

                You are free to not play this game, but then just like Maciej said, you will be fully ignored. Even if it's a real bug, it's likely that over time someone else will report or fix it. I'm not trying to force you to "reveal" it to us; feel free to ignore me. I'm just explaining how I believe Open Source works.

                The difference for us is small, because a real bug will be seen and reported by others too. The difference for you is whether you would like to contribute and get our thanks, or don't care about it.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-22 23:50: +
                +
                +

                The pypy team "could" solve it. But it would be a massive waste of time, and of cource the changes are that they are unable to because of problems in your setup. I most certainly hope no open source team really spend their time on such ghost hunts.

                +
                +
                +
                +
                + + Anonymous wrote on 2011-11-23 04:25: +
                +
                +

                https://democreatorreview.blogspot.com/

                +
                +
                +
                +
                + + Winston Ewert wrote on 2011-11-23 04:42: +
                +
                +

                Somewhat off the topic of this post, but I'm wondering what the special optimization of string lists would be. I can see obvious benefits to storing ints/floats directly in the list rather then as boxed numbers, but not so much for strings since they have be stored using an indirection anyways.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2011-11-23 09:15: +
                +
                +

                @Winston:

                astutely observed (as always). There are two points to string lists:

                1) PyPy's strings have one extra indirection, e.g. the data is not stored in the string box. This is due to RPython restrictions. With string lists, one indirection can be removed.

                2) If the JIT knows that the full list stores only strings, it can actually generate better code, because it does not need to check the type of the item that was just read out of the list.

                +
                +
                +
                +
                + + vacation homes in kissimmee florida wrote on 2011-11-25 09:30: +
                +
                +

                This means he was trying to put the blame on others....

                +
                +
                +
                +
                + + wholesale electronics wrote on 2011-12-17 01:20: +
                +
                +

                omething under the hood? Or does it mean tail recursion optimization?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/12/come-see-us-at-pycon-2012-610420698450130659.html b/posts/2011/12/come-see-us-at-pycon-2012-610420698450130659.html new file mode 100644 index 000000000..a4095819b --- /dev/null +++ b/posts/2011/12/come-see-us-at-pycon-2012-610420698450130659.html @@ -0,0 +1,320 @@ + + + + + +Come see us at PyCon 2012 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Come see us at PyCon 2012

                + + + +
                +

                PyCon 2012 is coming up in just a few short months, and PyPy will be well
                +represented there. We'll be delivering a tutorial, two talks, plus we'll be
                +around for the sprints.

                +

                Here are the abstracts for the tutorials and talks:

                +
                  +
                • +How to get the most out of your PyPy, by Maciej Fijalkowski, Alex Gaynor
                  +and Armin Rigo: For many applications PyPy can provide performance benefits
                  +right out of the box. However, little details can push your application to
                  +perform much better. In this tutorial we'll give you insights on how to push
                  +PyPy to its limits. We'll focus on understanding the performance
                  +characteristics of PyPy, and learning the analysis tools in order to maximize
                  +your applications' performance. This is the tutorial. +
                • +
                • +Why PyPy by example, by Maciej Fijalkowski, Alex Gaynor and Armin Rigo:
                  +One of the goals of PyPy is to make existing Python code faster; however an
                  +even broader goal was to make it possible to write things in Python that
                  +previously would have needed to be written in C or another low-level language. This
                  +talk will show examples of this, and describe how they represent the
                  +tremendous progress PyPy has made, and what it means for people looking at
                  +using PyPy.
                • +
                • +How the PyPy JIT works, by Benjamin Peterson: The Python community is
                  +abuzz about the major speed gains PyPy can offer for pure Python code. But how
                  +does the PyPy JIT actually work? This talk will discuss how the PyPy JIT is
                  +implemented. It will include descriptions of the tracing, optimization, and
                  +assembly generation phases. I will demonstrate each step with an example loop.
                • +
                +

                If you have any questions let us know! We look forward to seeing people at
                +PyCon and chatting about PyPy and the entire Python ecosystem.

                +

                See you there,
                +Maciej Fijalkowski, Alex Gaynor, Benjamin Peterson, Armin Rigo, and the entire PyPy team

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/12/leysin-winter-sprint-6862532189897876336.html b/posts/2011/12/leysin-winter-sprint-6862532189897876336.html new file mode 100644 index 000000000..fa5249aa2 --- /dev/null +++ b/posts/2011/12/leysin-winter-sprint-6862532189897876336.html @@ -0,0 +1,363 @@ + + + + + +Leysin Winter Sprint | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Leysin Winter Sprint

                + + + +
                +

                PyPy Leysin Winter Sprint: 15-22nd January 2012

                + +

                +The next PyPy sprint will be in Leysin, Switzerland, for the +eighth time. This is a fully public sprint: newcomers and topics +other than those proposed below are welcome.

                + +

                Goals and topics of the sprint

                + +
                  +
                • Py3k: work towards supporting Python 3 in PyPy + +
                • +
                • NumPyPy: work towards supporting the numpy module in PyPy + +
                • +
                • JIT backends: integrate tests for ARM; look at the PowerPC 64; + maybe try again to write an LLVM- or GCC-based one + +
                • +
                • STM and STM-related topics; or the Concurrent Mark-n-Sweep GC + +
                • +
                • And as usual, the main side goal is to have fun in winter sports :-) + We can take a day off for ski. +
                • +
                +

                Exact times

                + +

                The work days should be 15-21 January 2012 (Sunday-Saturday). The +official plans are for people to arrive on the 14th or the 15th, and to +leave on the 22nd.

                + +

                Interested? Read more...

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2011-12-28 01:30: +
                +
                +

                How is the STM work going, btw?

                Do you have any indications yet on whether it'll be workable in an imperative VM?

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-02 11:49: +
                +
                +

                any news on the win64 port?

                +
                +
                +
                +
                + + Klaus Ramelow wrote on 2012-01-07 12:56: +
                +
                +

                Leysin Winter Sprint
                Exact times

                The work days should be 15-21 January 2011 (Sunday-Saturday).

                I assume it will be January 2012

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-01-09 19:54: +
                +
                +

                STM work is slowly progressing, as you must have noticed in pypy-dev.

                The Win64 port's progress is unknown, sorry.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2011/12/plotting-using-matplotlib-from-pypy-6389240123679375092.html b/posts/2011/12/plotting-using-matplotlib-from-pypy-6389240123679375092.html new file mode 100644 index 000000000..3faf9afa8 --- /dev/null +++ b/posts/2011/12/plotting-using-matplotlib-from-pypy-6389240123679375092.html @@ -0,0 +1,441 @@ + + + + + +Plotting using matplotlib from PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Plotting using matplotlib from PyPy

                + + + +
                +

                Big fat warning This is just a proof of concept. It barely works. There are +missing pieces left and right, which were replaced with hacks so I can get this +to run and prove it's possible. Don't try this at home, especially your home. +You have been warned.

                +

                There has been a lot of talking about PyPy not integrating well with the +current scientific Python ecosystem, and numpypy (a NumPy reimplementation +on top of pypy) was dubbed "a fancy array library". I'm going to show that +integration with this ecosystem is possible with our design.

                +

                First, the demo:

                +
                +#!/usr/bin/env pypy
                +
                +# numpy, pypy version
                +import numpypy as numpy
                +# DRAGONS LIVE THERE (fortunately hidden)
                +from embed.emb import import_mod
                +
                +pylab = import_mod('matplotlib.pylab')
                +
                +if __name__ == '__main__':
                +    a = numpy.arange(100, dtype=int)
                +    b = numpy.sin(a)
                +    pylab.plot(a, b)
                +    pylab.show()
                +
                +

                And you get:

                + + + +

                Now, how to reproduce it:

                +
                  +
                • +

                  You need a PyPy without cpyext, because I did not find a linker that would support +overriding symbols. Right now there are no nightlies like this, so you have +to compile it yourself, like:

                  +
                  +./translate.py -Ojit targetpypystandalone.py --withoutmod-cpyext
                  +
                  +

                  That would give you a PyPy that's unable to load some libraries like PIL, but +perfectly working otherwise.

                  +
                • +
                • +

                  Speaking of which, you need a reasonably recent PyPy.

                  +
                • +
                • +

                  The approach is generally portable; however, the implementation has been +tested only on 64-bit Linux. A few tweaks might be required.

                  +
                • +
                • +

                  You need to install python2.6, the python2.6 development headers, and have +numpy and matplotlib installed on that python.

                  +
                • +
                • +

                  You need a checkout of my hacks directory and put embedded on your +PYTHONPATH, your pypy checkout also has to be on the PYTHONPATH.

                  +
                • +
                +
                +

                Er wait, what happened?

                +

                What didn't happen is we did not reimplement matplotlib on top of PyPy. What +did happen is we embed CPython inside of PyPy using ctypes. We instantiate it +and follow the embedding tutorial for CPython. Since numpy arrays are not +movable, we're able to pass around an integer that represents the memory +address of the array data and reconstruct it in the embedded interpreter. Hence +with relatively little effort we managed to reuse the same array data on both +sides to plot an array. Easy, no?

                +

                This approach can be extended to support anything that's not too tied with +python objects. SciPy and matplotlib both fall into the same category +but probably the same strategy can be applied to anything, like GTK or QT. +It's just a matter of extending a hack into a working library.

                +

                To summarize, while we're busy making numpypy better and faster, it seems +that all external libraries on the C side can be done using an embedded Python +interpreter with relatively little effort. To get to that point, I spent +a day and a half to learn how to embed CPython, with very little prior +experience in the CPython APIs. Of course you should still keep as much as +possible in PyPy to make it nice and fast :)

                +

                Cheers, +fijal

                +
                +
                +

                Comments

                +
                +
                +
                + + Kumo wrote on 2011-12-09 04:06: +
                +
                +

                Pretty cool!

                +
                +
                +
                +
                + + Eli Bressert wrote on 2011-12-09 20:27: +
                +
                +

                Two thumbs up! This is quite exciting! Looking forward to further followup from this.

                How does Scipy look in terms of implementation, e.g. wrapping fortran code with f2py? Could it become achieved?

                +
                +
                +
                +
                + + Pankaj wrote on 2011-12-10 06:14: +
                +
                +

                freaking awesome :)

                +
                +
                +
                +
                + + Laptop repair wrote on 2011-12-10 13:02: +
                +
                +

                PyPy Version is showing best result, it is giving extra protection to program.

                +
                +
                +
                +
                + + dac wrote on 2011-12-13 20:52: +
                +
                +

                Good work. Is this approach your long term plan for supporting scientific python libraries or just a stop-gap solution until "proper" support can be added to pypy (or to the library)?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2011-12-13 23:06: +
                +
                +

                @dac this can scale to the entire matplotlib/scipy fully. Whether scientific community people will take up a gargantuan task of moving SciPy/matplotlib out of using CPython C API is beyond my knowledge, but even if it'll happen, it won't happen in short-to-mid-term.

                So overall I think it's a good midterm solution, that might just stay forever.

                +
                +
                +
                +
                + + Anonymous wrote on 2014-01-04 09:23: +
                +
                +

                Another solution containing dragons, that someone might find useful:
                1) create new python file, that would print diagrams
                2) send data from main program running in pypy to the second python file using call from subprocess
                eg. call(["python", "pythondiagrams.py", "-data", str(my_data).replace(" ", ";")]), data should be be text type and contain separator other than space
                3) parse input data using argparse and convert them using ast

                +
                +
                +
                +
                + + Konstantin Lopuhin wrote on 2014-02-02 12:32: +
                +
                +

                Also, seems that embed must live below the root of pypy source tree (else it fails to create proper paths to ".o" output files in rpython.translator.platform.Platform._make_o_file).

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/01/comparing-partial-evaluation-and-7255412724168990164.html b/posts/2012/01/comparing-partial-evaluation-and-7255412724168990164.html new file mode 100644 index 000000000..34b5069cb --- /dev/null +++ b/posts/2012/01/comparing-partial-evaluation-and-7255412724168990164.html @@ -0,0 +1,673 @@ + + + + + +Comparing Partial Evaluation and Tracing, Part 1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Comparing Partial Evaluation and Tracing, Part 1

                + + + +
                +

                As part of writing my PhD I am currently thinking about the relationship +between PyPy's meta-tracing approach with various previous ideas to +automatically get a (JIT-)compiler from only an interpreter of a language. One +of the most-researched ideas along these lines is that of partial evaluation. +Partial evaluation has basically the same goals as PyPy when it comes to +compilers: Write an interpreter, and get a compiler for free. The methods for +reaching that goal are a bit different. In this series of blog posts, I am +trying to explore the similarities and differences of partial evaluation and +PyPy's meta-tracing.

                +

                A Flowgraph Language

                +

                To be able to clearly understand what "partial evaluation" is and what +"meta-tracing" is I will show an "executable model" of both. To that end, I am +defining a small imperative language and will then show what a partial evaluator +and a tracer for that language look like. All this code will be +implemented in Prolog. (Any pattern-matching functional language would do, but I +happen to know Prolog best. Backtracking is not used, so you can read things +simply as functional programs.) In this post I will start with +the definition of the language, and a partial evaluator for it. The code +written in this blog post can be found fully here: https://paste.pocoo.org/show/541004/

                +

                The language is conceptually similar to PyPy's flow graphs, but a bit more +restricted. It does not have function calls, only labelled basic blocks +that consist of a series of linearly executed operations, followed by a +conditional or an unconditional jump. Every operation is assigning a value to a +variable, which is computed by applying some operation to some arguments.

                +

                A simple program to raise x to the yth power in that language looks like +this:

                +
                +power:
                +    res = 1
                +    if y goto power_rec else goto power_done
                +
                +power_rec:
                +    res = res * x
                +    y = y - 1
                +    if y goto power_rec else goto power_done
                +
                +power_done:
                +    print_and_stop(res)
                +
                +

                To represent the same program as Prolog data structures, we use the +following Prolog code:

                +
                block(power, op1(res, same, const(1),
                +             if(y, power_rec, power_done))).
                +block(power_rec, op2(res, mul, var(res), var(x),
                +                 op2(y, sub, var(y), const(1),
                +                 if(y, power_rec, power_done)))).
                +block(power_done, print_and_stop(var(res))).
                +
                +

                Every rule of block declares one block by first giving the label of the +block, followed by the code. Code is a series of op1 or op2 statements +terminated by a jump, an if or a print_and_stop. op1 statements +are operations with one argument of the form op1(res_variable, +operation_name, argument, next_statement). Arguments can be either variables +in the form var(name) or constants in the form const(value).

                +

                To run programs in this flowgraph language, we first need some helper +functionality. The first few helper functions are concerned with the handling of +environments, the data structures the interpreter uses to map variable +names occurring in the program to the variables' current values. In Python +dictionaries would be used for this purpose, but in Prolog we have to emulate +these by lists of key/value pairs (not very efficient, but good enough):

                +
                lookup(X, [], _) :- throw(key_not_found(X)).
                +lookup(Key, [Key/Value | _], Value) :- !.
                +lookup(Key, [_ | Rest], Value) :- lookup(Key, Rest, Value).
                +
                +write_env([], X, V, [X/V]).
                +write_env([Key/_ | Rest], Key, Value, [Key/Value | Rest]) :- !.
                +write_env([Pair | Rest], Key, Value, [Pair | NewRest]) :- write_env(Rest, Key, Value, NewRest).
                +
                +remove_env([], _, []).
                +remove_env([Key/_ | Rest], Key, Rest) :- !.
                +remove_env([Pair | Rest], Key, [Pair | NewRest]) :- remove_env(Rest, Key, NewRest).
                +
                +resolve(const(X), _, X).
                +resolve(var(X), Env, Y) :- lookup(X, Env, Y).
                +
                +

                The implementation of these functions is not too important. The lookup +function finds a key in an environment list, the write_env function adds a +new key/value pair to an environment, remove_env removes a key. The +resolve function is used to take either a constant or a variable and return +a value. If it's a constant, the value of that constant is returned, if it's a +variable it is looked up in the environment. Note how the last argument of +lookup and resolve is actually a return value, which is the typical +approach in Prolog.

                +

                So far we have not specified what the primitive operations that can occur in the +program actually mean. For that we define a do_op function which +executes primitive operations:

                +
                do_op(same, X, X).
                +do_op(mul, X, Y, Z) :- Z is X * Y.
                +do_op(add, X, Y, Z) :- Z is X + Y.
                +do_op(sub, X, Y, Z) :- Z is X - Y.
                +do_op(eq, X, Y, Z) :- X == Y -> Z = 1; Z = 0.
                +do_op(ge, X, Y, Z) :- X >= Y -> Z = 1; Z = 0.
                +do_op(readlist, L, I, X) :- nth0(I, L, X).
                +do_op(Op, _, _, _) :- throw(missing_op(Op)).
                +
                +

                Again the last argument is an output variable.

                +

                Now we can start executing simple operations. For that an interp predicate +is defined. It takes as its first argument the operation to execute and as the +second argument the current environment. E.g. to execute primitive operations +with one or two arguments:

                +
                interp(op1(ResultVar, Op, Arg, Rest), Env) :-
                +    resolve(Arg, Env, RArg),
                +    do_op(Op, RArg, Res),
                +    write_env(Env, ResultVar, Res, NEnv),
                +    interp(Rest, NEnv).
                +
                +interp(op2(ResultVar, Op, Arg1, Arg2, Rest), Env) :-
                +    resolve(Arg1, Env, RArg1),
                +    resolve(Arg2, Env, RArg2),
                +    do_op(Op, RArg1, RArg2, Res),
                +    write_env(Env, ResultVar, Res, NEnv),
                +    interp(Rest, NEnv).
                +
                +

                First the arguments are resolved into values. Afterwards the operation is executed, +and the result is written back into the environment. Then interp is called on +the rest of the program. Similarly easy are the unconditional jump and +print_and_stop:

                +
                interp(jump(L), Env) :-
                +    block(L, Block),
                +    interp(Block, Env).
                +
                +
                +interp(print_and_stop(Arg), Env) :-
                +    resolve(Arg, Env, Val),
                +    print(Val), nl.
                +
                +

                In the unconditional jump we simply get the target block and continue executing +that. To execute print_and_stop we resolve the argument, print the value and +then are done.

                +

                The conditional jump is only slightly more difficult:

                +
                interp(if(V, L1, L2), Env) :-
                +    lookup(V, Env, Val),
                +    (Val == 0 ->
                +        block(L2, Block)
                +    ;
                +        block(L1, Block)
                +    ),
                +    interp(Block, Env).
                +
                +

                First the variable is looked up in the environment. If the variable is zero, +execution continues at the second block, otherwise it continues at the first +block.

                +

                Given this interpreter, we can execute the above example program like this, on a +Prolog console:

                +
                $ swipl -s cfglang.pl
                +?- block(power, Block), interp(Block, [x/10, y/10]).
                +10000000000
                +
                +

                Partial Evaluation of the Flowgraph Language

                +

                Let's look at what a partial evaluator for this simple flowgraph language would +look like. Partial evaluation (PE), also called specialization, is a program +manipulation technique. PE takes an input program and transforms it into a +(hopefully) simpler and faster output program. It does this by assuming that +some variables in the input program are constants. All operations that act only +on such constants can be folded away. All other operations need to remain in the +output program (called residual program). Thus the partial evaluator proceeds +much like an interpreter, just that it cannot actually execute some operations. +Also, its output is not just a value, but also a list of remaining operations that +could not be optimized away.

                +

                The partial evaluator cannot use normal environments, because unlike the +interpreter not all variables' values are known to it. It will therefore work on +partial environments, which store just the known variables. For these partial +environments, some new helper functions are needed:

                +
                plookup(Key, [], var(Key)).
                +plookup(Key, [Key/Value | _], const(Value)) :- !.
                +plookup(Key, [_ | Rest], Value) :- plookup(Key, Rest, Value).
                +
                +presolve(const(X), _, const(X)).
                +presolve(var(V), PEnv, X) :- plookup(V, PEnv, X).
                +
                +

                The function plookup takes a variable and a partial environment and returns +either const(Value) if the variable is found in the partial environment or +var(Key) if it is not. Equivalently, presolve is like resolve, +except that it uses plookup instead of lookup.

                +

                With these helpers we can start writing a partial evaluator. The following two +rules are where the main optimization in the form of constant folding happens. +The idea is that when the partial evaluator sees an operation that involves +only constant arguments, it can constant-fold the operation, otherwise it +can't:

                +
                pe(op1(ResultVar, Op, Arg, Rest), PEnv, NewOp) :-
                +    presolve(Arg, PEnv, RArg),
                +    (RArg = const(C) ->
                +        do_op(Op, C, Res),
                +        write_env(PEnv, ResultVar, Res, NEnv),
                +        RestResidual = NewOp
                +    ;
                +        remove_env(PEnv, ResultVar, NEnv),
                +        NewOp = op1(ResultVar, Op, RArg, RestResidual)
                +    ),
                +    pe(Rest, NEnv, RestResidual).
                +
                +pe(op2(ResultVar, Op, Arg1, Arg2, Rest), PEnv, NewOp) :-
                +    presolve(Arg1, PEnv, RArg1),
                +    presolve(Arg2, PEnv, RArg2),
                +    (RArg1 = const(C1), RArg2 = const(C2) ->
                +        do_op(Op, C1, C2, Res),
                +        write_env(PEnv, ResultVar, Res, NEnv),
                +        RestResidual = NewOp
                +
                +    ;
                +        remove_env(PEnv, ResultVar, NEnv),
                +        NewOp = op2(ResultVar, Op, RArg1, RArg2, RestResidual)
                +    ),
                +    pe(Rest, NEnv, RestResidual).
                +
                +

                The pe predicate takes a partial environment, the current operations and +potentially returns a new operation. To partially evaluate a simple operation, its arguments are +looked up in the partial environment. If all the arguments are constants, the +operation can be executed, and no new operation is produced. Otherwise, we need +to produce a new residual operation which is exactly like the one currently +looked at. Also, the result variable needs to be removed from the partial +environment, because it was just overwritten by an unknown value.

                +

                The potentially generated residual operation is stored into the output argument +NewOp. The output argument of the recursive call is the last argument of +the newly created residual operation, which will then be filled by the +recursive call. This is a typical approach in Prolog, but may look strange if +you are not familiar with it.

                +

                Note how the first case of these two rules is just like interpretation. The +second case doesn't really do anything, it just produces a residual operation. +This relationship between normal evaluation and partial evaluation is very +typical.

                +

                The unconditional jump and print_and_stop are not much more complex:

                +
                pe(jump(L), PEnv, jump(LR)) :-
                +    do_pe(L, PEnv, LR).
                +
                +pe(print_and_stop(Arg), Env, print_and_stop(RArg)) :-
                +    presolve(Arg, Env, RArg).
                +
                +

                To partially evaluate an unconditional jump we again produce a jump. The target +label of that residual jump is computed by asking the partial evaluator to +produce residual code for the label L with the given partial environment. +print_and_stop is simply turned into a print_and_stop. We will see the +code for do_pe soon.

                +

                Conditional jumps are more interesting:

                +
                pe(if(V, L1, L2), PEnv, NewOp) :-
                +    plookup(V, PEnv, Val),
                +    (Val = const(C) ->
                +        (C = 0 ->
                +            L = L2
                +        ;
                +            L = L1
                +        ),
                +        do_pe(L, PEnv, LR),
                +        NewOp = jump(LR)
                +    ;
                +        do_pe(L1, PEnv, L1R),
                +        do_pe(L2, PEnv, L2R),
                +        NewOp = if(V, L1R, L2R)
                +    ).
                +
                +

                First we look up the value of the condition variable. If it is a constant, we +can produce better code, because we know statically that only one path is +reachable. Thus we produce code for that path, and then emit an unconditional +jump there. If the condition variable is not known at partial evaluation time, +we need to partially evaluate both paths and produce a conditional jump in the +residual code.

                +

                This rule is the one that causes the partial evaluator to potentially do much +more work than the interpreter, because after an if sometimes both paths +need to be explored. In the worst case this process never stops, so a real +partial evaluator would need to ensure somehow that it terminates. There are +many algorithms for doing that, but I will ignore this problem here.

                +

                Now we need to understand what the do_pe predicate is doing. Its most +important task is to make sure that we don't do the same work twice by +memoizing code that was already partially evaluated in the past. For that it +keeps a mapping of Label, Partial Environment to Label of the residual +code:

                +
                do_pe(L, PEnv, LR) :-
                +    (code_cache(L, PEnv, LR) ->
                +        true
                +    ;
                +        gensym(L, LR),
                +        assert(code_cache(L, PEnv, LR)),
                +        block(L, Code),
                +        pe(Code, PEnv, Residual),
                +        assert(block(LR, Residual))
                +    ).
                +
                +

                If the code cache indicates that label L was already partially evaluated +with partial environment PEnv, then the previous residual code label +LR +is returned. Otherwise, a new label is generated with gensym, the code cache +is informed of that new label with assert, then the block is partially +evaluated and the residual code is added to the database.

                +

                For those who know partial evaluation terminology: This partial evaluator is a +polyvariant online partial evaluator. "Polyvariant" means that for every label, +several specialized versions of the block can be generated. "Online" means that +no preprocessing is done before the partial evaluator runs.

                + +

                Partial Evaluation Example

                +

                With this code we can look at the classical example of partial evaluation (it's +probably the "Hello World" of partial evaluation). We +can ask the partial evaluator to compute a power function, where the exponent +y is a fixed number, e.g. 5, and the base x is unknown:

                +
                ?- do_pe(power, [y/5], LR).
                +LR = power1.
                +
                +

                To find out which code was produced, we can use listing:

                +
                ?- listing(code_cache)
                +code_cache(power, [y/5], power1).
                +code_cache(power_rec, [y/5, res/1], power_rec1).
                +code_cache(power_rec, [y/4], power_rec2).
                +code_cache(power_rec, [y/3], power_rec3).
                +code_cache(power_rec, [y/2], power_rec4).
                +code_cache(power_rec, [y/1], power_rec5).
                +code_cache(power_done, [y/0], power_done1).
                +
                +?- listing(block)
                +.... the block definition of the user program ....
                +block(power_done1, print_and_stop(var(res))).
                +block(power_rec5, op2(res, mul, var(res), var(x), jump(power_done1))).
                +block(power_rec4, op2(res, mul, var(res), var(x), jump(power_rec5))).
                +block(power_rec3, op2(res, mul, var(res), var(x), jump(power_rec4))).
                +block(power_rec2, op2(res, mul, var(res), var(x), jump(power_rec3))).
                +block(power_rec1, op2(res, mul, const(1), var(x), jump(power_rec2))).
                +block(power1, jump(power_rec1)).
                +
                +

                The code_cache tells which residual labels correspond to which original +labels under which partial environments. Thus, power1 contains the code of +power under the assumption that y is 5. Looking at the block listing, +the label power1 corresponds to code that simply multiplies res by x +five times without using the variable y at all. The loop that was present +in the original program has been fully unrolled, the loop variable y has +disappeared. Hopefully this is faster than the original program.

                + +

                Conclusion

                +

                In this blog post we saw an interpreter for a simple flow graph language in +Prolog, together with a partial evaluator for it. The partial evaluator +essentially duplicates every rule of the interpreter. If all the arguments of +the current operation are known, it acts like the interpreter, otherwise it +simply copies the operation into the residual code.

                +

                Partial evaluation can be used for a variety of applications, but the most +commonly cited one is that of applying it to an interpreter. To do that, the +program that the interpreter runs is assumed to be constant by the partial +evaluator. Thus a specialized version of the interpreter is produced that does +not use the input program at all. That residual code can be seen as a compiled +version of the input program.

                +

                In the next blog post in this series we will look at writing a simple tracer for +the same flowgraph language.

                +
                +

                Comments

                +
                +
                +
                + + 單中杰 wrote on 2012-01-26 16:57: +
                +
                +

                Excellent example and explanation! I look forward to the next installment!

                But down with gensym! Instead, you can just let LR=pair(L,PEnv).

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-01-26 17:36: +
                +
                +

                For those not too familiar with Prolog: assert(foo(..)) is not at all like the "assert" of Python or C code. Instead, it adds the rule 'foo(..)' in the database of rules. In other words, it is as if 'foo(..)' was added to the currently running program, as an extra rule.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2012-01-27 10:01: +
                +
                +

                單中杰: Thanks for the compliments.

                I really like the idea of getting rid of gensym that way. It had never occurred to me to simply use a non-atomic term as a label, very nice.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-27 13:29: +
                +
                +

                Very interesting, but I'm a bit confused - what does block(X, Y) do? It isn't defined anywhere.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2012-01-27 13:38: +
                +
                +

                @Anonymous: block(L, O) lists all the labels and operations corresponding to the labels that exist in the user program. See the very beginning of the post. Also, when partial evaluation creates new code it adds new cases to block(L, O), with the statement assert(block(..., ...)).

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/01/numpypy-progress-report-running-3336055571122066974.html b/posts/2012/01/numpypy-progress-report-running-3336055571122066974.html new file mode 100644 index 000000000..50c4552ef --- /dev/null +++ b/posts/2012/01/numpypy-progress-report-running-3336055571122066974.html @@ -0,0 +1,656 @@ + + + + + +NumPyPy progress report - running benchmarks | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPyPy progress report - running benchmarks

                + + + +
                +

                Hello.

                +

                We're excited to let you know about some of the great progress we've made on +NumPyPy: both completeness and performance. In this blog entry we mostly +will talk about performance and how much progress we have made so far. +

                +

                Word of warning: this +work is in progress -- we're maybe half way to where we want to be and there are +many trivial and not so trivial optimizations to be written. (For example, we +haven't even started to implement important optimizations, like vectorization.)

                +
                +

                Benchmark

                +

                We chose a laplace equation solver, based on SciPy's PerformancePython wiki. +Unfortunately, the different implementations on the wiki page accidentally use +two different algorithms, which have different convergences, and very different +performance characteristics on modern computers. As a result, we implemented +our own versions in both C and Python (with and without NumPy). The full source +can be found in fijal's hack repo, all these benchmarks were performed at +revision 18502dbbcdb3.

                +

                First, let me describe various algorithms used. Note that some of them contain +PyPy-specific hacks to work around limitations in the current implementation. +These hacks will go away eventually and the performance will improve. +Numerically the algorithms used are identical, however exact data layout in +memory differs between them.

                +

                A note about all the benchmarks: they each were run once, but the +performance is very stable across runs.

                +

                Starting with the C version, it implements a trivial laplace transform +using two loops and double-reference memory (array of int*). The double +reference does not matter for performance and the two algorithms are +implemented in inline-laplace.c and laplace.c. They were both compiled +with gcc 4.4.5 at -O3. The inline version modifies the array in-place while the non-inline version stores results in a copy. That makes them converge at different rates, hence the different number of iterations.

                +

                A straightforward version of those in Python is implemented in laplace.py +using, respectively, inline_slow_time_step and slow_time_step. +slow_2_time_step does the same thing, except it copies arrays in-place +instead of creating new copies. Table below compares running PyPy against C:

                + +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                benchnumber of iterationstime per iteration
                laplace C2196.3ms
                inline-laplace C27820ms
                slow python21917ms
                slow 2 python21914ms
                inline_slow python27823.7ms
                +

                An important thing to notice is the data dependency of the inline +version causes a huge slowdown for the C versions. This is not a severe +disadvantage for us though -- the brain-dead Python version takes longer +and PyPy is not able to take advantage of the knowledge that the data is +independent. The results are in the same ballpark as the C versions -- +15% - 170% slower, but the algorithm +one chooses matters more than the language. By comparison, the slow versions +take about 5.75s each on CPython 2.6 per iteration and, by estimation, +are about 200x slower than the PyPy equivalent, if I had the patience to +measure the full run.

                +

                The next step is to use NumPy expressions. The first problem we run into is +that computing the error requires walking the entire array a second time. This +is fairly inefficient in terms of cache access, so I took the liberty of +computing the errors every 15 steps. This results in the convergence being +rounded to the nearest 15 iterations, but speeds things up considerably. +numeric_time_step takes the most braindead approach of replacing the array +with itself, like this:

                +
                +u[1:-1, 1:-1] = ((u[0:-2, 1:-1] + u[2:, 1:-1])*dy2 +
                +                       (u[1:-1,0:-2] + u[1:-1, 2:])*dx2)*dnr_inv
                +
                +

                We need 3 arrays here -- one is an intermediate (PyPy only needs one, for all of +those subexpressions), one is a copy for computing the error, and one is the +result. This works automatically because in NumPy + or * creates an +intermediate, while NumPyPy avoids allocating the intermediate if possible.

                +

                numeric_2_time_step works in pretty much the same way:

                +
                +src = self.u
                +self.u = src.copy()
                +self.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +
                +                      (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv
                +
                +

                except the copy is now explicit rather than implicit.

                +

                numeric_3_time_step does the same thing, but notice one doesn't have to copy +the entire array, it's enough to copy the border pieces and fill the rest with +zeros:

                +
                +src = self.u
                +self.u = numpy.zeros((self.nx, self.ny), 'd')
                +self.u[0] = src[0]
                +self.u[-1] = src[-1]
                +self.u[:, 0] = src[:, 0]
                +self.u[:, -1] = src[:, -1]
                +self.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +
                +                      (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv
                +
                +

                numeric_4_time_step is the one that tries hardest to resemble the C version. +Instead of doing an array copy, it actually notices that one can alternate +between two arrays. This is exactly what the C version does. The +remove_invalidates call is a PyPy specific hack - we hope to remove this +call in the near future, but, in short, it promises "I don't have any unbuilt +intermediates that depend on the value of the argument", which means one doesn't +have to compute sub-expressions one is not actually using:

                +
                +remove_invalidates(self.old_u)
                +remove_invalidates(self.u)
                +self.old_u[:,:] = self.u
                +src = self.old_u
                +self.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +
                +                      (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv
                +
                +

                This one is the most comparable to the C version.

                +

                numeric_5_time_step does the same thing, but notices one doesn't have to copy +the entire array, it's enough to just copy the edges. This is an optimization +that was not done in the C version:

                +
                +remove_invalidates(self.old_u)
                +remove_invalidates(self.u)
                +src = self.u
                +self.old_u, self.u = self.u, self.old_u
                +self.u[0] = src[0]
                +self.u[-1] = src[-1]
                +self.u[:, 0] = src[:, 0]
                +self.u[:, -1] = src[:, -1]
                +self.u[1:-1, 1:-1] = ((src[0:-2, 1:-1] + src[2:, 1:-1])*dy2 +
                +                      (src[1:-1,0:-2] + src[1:-1, 2:])*dx2)*dnr_inv
                +
                +

                Let's look at the table of runs. As before, gcc 4.4.5, compiled at -O3, +and PyPy nightly 7bb8b38d8563, on an x86-64 machine. All of the numeric methods +run for 226 steps, slightly more than the 219, rounding to the next 15 when the +error is computed.

                + +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                benchmarkPyPyCPython
                numeric21ms35ms
                numeric 214ms37ms
                numeric 313ms29ms
                numeric 411ms31ms
                numeric 59.3ms21ms
                +

                We think that these preliminary results are pretty good. They're not as fast as +the C version (or as fast as we'd like them to be), but we're already much +faster than NumPy on CPython -- almost always by more than 2x on this relatively +real-world example. This is not the end, though. In fact, it's hardly the +beginning! As we continue work, we hope to make even more use of the +high level information that we have. Looking at the assembler generated by +gcc for this example, it's pretty clear we can outperform it thanks to better +aliasing information and hence better possibilities for vectorization. +Stay tuned.

                +

                EDIT: fixed the benchmark name +

                +

                EDIT2: added info that first table is about PyPy

                +

                Cheers, +fijal

                +
                +
                +

                Comments

                +
                +
                +
                + + D wrote on 2012-01-10 20:24: +
                +
                +

                Nice to hear, but what we (numpy users) really need is 2-dimensional matrices with basic arithmetic operations (+, -, /, *, sin, cos, pow etc) and other related methods, e.g. min(array,axis), nanmax(array, axis), argmax(array,axis), nanargmin(array, axis) etc. While CPython soft dependent on these operations works more or less fast, with PyPy it mere doesn't work at all. I hope first of all you'll focus on it instead of speed improvement for single-dimensional arrays.
                Regards, D.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-01-10 20:27: +
                +
                +

                It would be really cool if you try before complaining. I think all of it works on a nightly build, except the axis argument which is on a branch being worked on.

                +
                +
                +
                +
                + + D wrote on 2012-01-10 20:28: +
                +
                +

                Also, IIRC NumPyPy still misses linalg.solve method for solving systems of linear equations, that is highly important for lots of soft. Connecting sparse SLE solver (like umfpack or superlu from scipy.sparse) also would be very essential.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-01-10 20:30: +
                +
                +

                We're working on it. Stay tuned

                +
                +
                +
                +
                + + D wrote on 2012-01-10 20:32: +
                +
                +

                Maciej, anything about 2-dimensional matrix implementations with related operations haven't been mentioned in blog, so why I have to know about it? I only installed and tried stable PyPy 1.7, because I had tried building PyPy from sources and found it damned hard, especially for my limited hardware (2 GB RAM).

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-01-10 20:33: +
                +
                +

                Good point, we'll write a blog post what has been implemented as well. Try nightly

                +
                +
                +
                +
                + + Adam wrote on 2012-01-10 21:02: +
                +
                +

                A Laplace transform is something quite different to solving Laplace's equation with finite differences...

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-01-10 21:07: +
                +
                +

                fixed, thanks

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-10 21:13: +
                +
                +

                It may be nice to link to the nightly builds so that people can try this out :)

                +
                +
                +
                +
                + + Chris LeBlanc wrote on 2012-01-10 23:12: +
                +
                +

                This is excellent! Great work, the potential of this project is very exciting. I was quietly wishing for this since pypy first started.

                I use NumPy all the time, and any increase in performance makes a big difference. This is one of the main advantages of NumPyPy over NumPy, so it makes sense to focus on it.

                There seems to be lots of complaining about missing features and such, but having a solid foundation to work from seems to be the most important thing. Missing features can be added down the line.

                I remember reading a blog post last year about using transactional memory as a way of removing the GIL. If you could combine that with NumPyPy to run numerical tasks in parallel, that would make a lot of scientific programmers very happy. I don't know if this is feasible, but it sure would be nice.

                Keep up the good work.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-01-10 23:18: +
                +
                +

                Hi Chris.

                We have vague plans how to parallelize numpy expressions without even having to remove the GIL. That way you'll have workers that are able to perform (or help perform) numeric tasks, but the interpreter itself will still run in a single thread. The same goes for GPUs and MIC.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-11 10:55: +
                +
                +

                Nightly builds
                https://buildbot.pypy.org/nightly/trunk

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-11 13:33: +
                +
                +

                Please when you consider parallelizing things, do remember about leaving an explicit switch to turn it off!

                I run my Python stuff on clusters through a queuing system and it will be VERY unhappy if single processes use more than one thread without informing the scheduler.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-11 13:34: +
                +
                +

                Hey, by the way, your progress on NumPy is amazing and highly appreciated.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-01-11 15:31: +
                +
                +

                @Anonymous of course, this is a given that we'll leave the switch to turn it off. It might be not even on by default, that's up for discussion

                +
                +
                +
                +
                + + Paul Harrison wrote on 2012-01-12 02:42: +
                +
                +

                Chris, if you haven't considered this already, it's sometimes possible to achieve parallelism with multiple processes using memory mapped files as numpy arrays. It's a bit awkward, but it can also make for an easier path to a computation that is resumable or can be run on a cluster.

                GIL removal would be wonderful, but it's a pretty ambitious idea. Then again, these pypy folk seem able to deliver on some pretty amazing stuff.

                +
                +
                +
                +
                + + Peter S wrote on 2012-01-16 10:33: +
                +
                +

                I am closely following these developments with numpypy and I just succesfully tested the last nightly build, which I find very impressive!

                For research purposes, the main thing we need is scipy.stats.ttest_1samp to work on pypy. Is there an estimation on when scipypy will be available?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/01/numpypy-status-update-6434340612277938795.html b/posts/2012/01/numpypy-status-update-6434340612277938795.html new file mode 100644 index 000000000..9fb87396d --- /dev/null +++ b/posts/2012/01/numpypy-status-update-6434340612277938795.html @@ -0,0 +1,353 @@ + + + + + +NumPyPy status update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPyPy status update

                + + + +
                +

                Hello.

                +

                This is just a quick status update on the NumPy in PyPy project that very +recently became my day job. I should give my thanks once again to Getco, +Nate Lawson and other contributors who donated above $40000 towards the goal.

                +

                Recently we (Alex Gaynor, Matti Picus and me) implemented a few interesting things +that a lot of people use:

                +
                  +
                • more ufuncs
                • +
                • most ufuncs now accept the axis parameter (except all and any)
                • +
                • fixed string representation of arrays, now it's identical to numpy (uses +pretty much the same code)
                • +
                • +ndarray.flat should be working correctly
                • +
                • +ndarray.flatten, ndarray.ravel, ndarray.take +
                • +
                • indexing arrays by boolean arrays of the same size
                • +
                • and various bugfixes.
                • +
                +

                We would also like to introduce the nightly report of numpy status. This +is an automated tool that does package introspection. While it gives some +sort of idea how much of numpy is implemented, it's by no means the authority. +Your tests should be the authority. It won't report whether functions +support all kinds of parameters (for example masked arrays and the out parameter +are completely unsupported) or that functions work at all. We also +reserve the right to incorporate jokes in that website, so don't treat it +that seriously overall :-)

                +

                Thanks, and stay tuned. We hope to post here regular updates on the +progress.

                +

                Cheers,
                +fijal & the PyPy team

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-01-28 14:54: +
                +
                +

                I use "out" parameter very often in my code (with numpy.take), without this one my code would run much worse (because huge arrays of hundreds MB would copy many times inside a big cycle). How currently the "out" parameter is handled (warning, error, nothing)?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-01-28 15:01: +
                +
                +

                It just errors with more or less acceptable error message. Note that pypy does not create intermediates for most of operations, so if you have a lot of them chained actually using out will be worse than not using it.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-29 23:31: +
                +
                +

                I'm new to python but not to Cpython/numpy/scipy/matplotlib and I fail to understand what you are doing.

                * In a nutshell, what's numpypy? Is it a rewrite of the numpy code to make it compatible with pypy? or are you working on pypy itself to be able to run numpy as it is??

                * if numpypy is a rewrite of numpy, that's good but how do you plan to keep numpy and numpypy sync (in terms of functionalities)??

                * Using numpy with pypy will be great but what about scipy qnd matplotlib??
                Many users need at least these two modules on top of numpy;

                I would be very happy with pypy being able to work with unpachted numpy/scipy/matplotlib.

                I think your website should summarise these issues on its front page.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html b/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html new file mode 100644 index 000000000..51549a78d --- /dev/null +++ b/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html @@ -0,0 +1,339 @@ + + + + + +Py3k and Numpy First Stage: Thanks to all who Gave | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k and Numpy First Stage: Thanks to all who Gave

                + + + +
                +

                Last year was quite successful for PyPy fundraising through the Software Freedom Conservancy, and Conservancy and PyPy are very excited to announce that enough was raised to begin the first stages on the Py3k and Numpy grant proposals.

                +

                As of the end of 2011, 135 different individuals gave to the Py3k campaign, and 114 to the Numpy campaign. We thank each of you who donated to help make this work possible. Meanwhile, if you haven't given to support these projects, we do hope you'll give generously now to help fund their second stages later this year!

                +

                We're also particularly excited that a few donors gave particularly large donations to support this work; those big donations really filled in the gap to help us get started!

                +

                Specifically, we're pleased to announce that Google donated $35000 towards implementing Python 3 in PyPy. Google's general support of the Python community is well known, and their specific support of our grant proposal is much appreciated.

                +

                Meanwhile, Numpy was supported in part by contributions from Nate Lawson, Cantab Capital Partners, and Getco, as well as more than a hundred other contributors.

                +

                With these donations combined with many others, we're now starting work on both projects. This week, the Conservancy signed contracts with Antonio Cuni and Benjamin Peterson to work towards the Stage 1.1 goals in the Py3k proposal (and is negotiating for another contractor as well), and with Maciej Fijałkowski to work towards the Stage 1 goals in the Numpy proposal.

                +

                In 2012, PyPy will continue regular sprint meetings, at which Py3K and Numpy efforts will certainly have a place. We have some limited funds to fund travels of contributors to those meetings.

                +

                We're very thankful for all who donated so far to support these efforts, and we hope that now that work has begun, even more donors will come forward to help us finish the job. In the meantime, watch for the commits showing up from these developers and other contributors in the PyPy repositories!

                +

                Cheers, The PyPy Team

                +
                +

                Comments

                +
                +
                +
                + + Gaëtan de Menten wrote on 2012-01-28 20:35: +
                +
                +

                It seems strange to me that Amaury Forgeot d'Arc wasn't the first one to be contracted for working on Py3k support. From the commit messages, he seems to have done most of the work in the py3k branch so far, or is he the unnamed third contractor?

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-28 23:12: +
                +
                +

                What about a Py2k8, is there any hope? Will at least 2.7 still be supported?

                +
                +
                +
                +
                + + Amaury Forgeot d'Arc wrote on 2012-01-28 23:22: +
                +
                +

                @Gaëtan: The reason is simple: I already have a regular day job, 40 hours a week, and I cannot have another remuneration without consent of my employer.

                Actually I started the py3k branch before the funding proposal, and even before that I've been trying different ways to do the transition from str to unicode.
                Then, my understanding of the JIT and other optimizations is very poor. And there are important changes to do around the representation of unicode for example, or the int/long unification, if we want pypy3k to be as fast as 2.7.

                I am quite happy of the current state: some people are paid to do and finish the real job, and volunteers can have fun and help in some parts, working on the most interesting project around Python.

                +
                +
                +
                +
                + + Amaury Forgeot d'Arc wrote on 2012-01-28 23:27: +
                +
                +

                @Anonymous: there won't be any Python 2.8 (search for PEP404 for the reasons), but as stated in the py3k grant proposal: https://pypy.org/py3donate.html
                "The goal of the PyPy community is to support both Python 2 and Python 3 for the forseeable future"

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/01/pypy-internship-at-ncar-2244162842744077724.html b/posts/2012/01/pypy-internship-at-ncar-2244162842744077724.html new file mode 100644 index 000000000..0b47ddbac --- /dev/null +++ b/posts/2012/01/pypy-internship-at-ncar-2244162842744077724.html @@ -0,0 +1,343 @@ + + + + + +PyPy internship at NCAR | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy internship at NCAR

                + + + +
                +

                Hello, everyone

                +

                I would like to inform you that there is a very interesting opportunity +for doing an internship at NCAR in the lovely town of Boulder, situated +on the foothills of the Rocky Mountains. Before you read on, make sure you:

                +
                  +
                • are a student of a US University, who is legally eligible to work in the US
                • +
                • are at least finishing second year this year
                • +
                • apply before February 3rd.
                • +
                +

                The internship itself will focus on using PyPy (in some way) to provide +a high performance numeric kernel for an atmospheric model, and measuring how +fast we can go. This is very much in line with what the current effort on +NumPy in PyPy is about. The internship will be mentored by Davide del Vento +and I hope to have some influence over where it goes myself :-)

                +

                A few interesting links:

                + +

                Feel free to contact Davide for details about the proposal and pypy-dev or +me directly for details about PyPy.

                +

                Cheers, +fijal

                +
                +

                Comments

                +
                +
                +
                + + Rahul wrote on 2012-01-16 05:03: +
                +
                +

                It looks good opportunity for a student. You can also post it on https://jobs.pythonweekly.com/

                +
                +
                +
                +
                + + Cameron Sparr wrote on 2012-02-01 22:56: +
                +
                +

                I've applied for the internship already but was hoping to get some more details so I could make some last-minute edits to my application! Do you have Davide Del Vento's contact info?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-02-02 08:34: +
                +
                +

                send me a mail

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/01/simple-tracer-for-flow-graph-language-6930951890987229484.html b/posts/2012/01/simple-tracer-for-flow-graph-language-6930951890987229484.html new file mode 100644 index 000000000..a764b1b5b --- /dev/null +++ b/posts/2012/01/simple-tracer-for-flow-graph-language-6930951890987229484.html @@ -0,0 +1,661 @@ + + + + + +A Simple Tracer for the Flow Graph Language | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                A Simple Tracer for the Flow Graph Language

                + + + +
                +

                Part 2 of Comparing Partial Evaluation to Tracing

                +

                This is the second blog post in a series about comparing partial evaluation and +tracing. In the first post of the series I introduced a small flow-graph +language together with an interpreter for it. Then I showed a partial evaluator +for the language. In this post I will show how a tracer for the same language +works and how it relates to both execution and to partial evaluation. +The code from this post can be found here: https://paste.pocoo.org/show/543542/

                +

                Tracing Execution

                +

                The idea of a tracer (for the described language and also in general) is to do completely normal +interpretation but at the same time keep a log of all the normal operations +(i.e. non-control-flow operations) that were performed. This continues until the +tracer executes the code block where it started, in which case the trace +corresponds to a closed loop. Then tracing stops and the last operation is +replaced by a jump to the start. After tracing has ended, the trace can be +executed, optionally optimizing it before that.

                +

                To write a tracer, we start from the rules of the interpreter, rename the +predicate to trace and add some extra arguments. Thus, the following rules +in the interpreter:

                +
                interp(op1(ResultVar, Op, Arg, Rest), Env) :-
                +    resolve(Arg, Env, RArg),
                +    do_op(Op, RArg, Res),
                +    write_env(Env, ResultVar, Res, NEnv),
                +    interp(Rest, NEnv).
                +
                +interp(op2(ResultVar, Op, Arg1, Arg2, Rest), Env) :-
                +    resolve(Arg1, Env, RArg1),
                +    resolve(Arg2, Env, RArg2),
                +    do_op(Op, RArg1, RArg2, Res),
                +    write_env(Env, ResultVar, Res, NEnv),
                +    interp(Rest, NEnv).
                +
                +

                become the following rules in the tracer:

                +
                trace(op1(ResultVar, Op, Arg, Rest), Env, op1(ResultVar, Op, Arg, T), TraceAnchor) :-
                +    resolve(Arg, Env, RArg),
                +    do_op(Op, RArg, Res),
                +    write_env(Env, ResultVar, Res, NEnv),
                +    trace(Rest, NEnv, T, TraceAnchor).
                +
                +trace(op2(ResultVar, Op, Arg1, Arg2, Rest), Env, op2(ResultVar, Op, Arg1, Arg2, T), TraceAnchor) :-
                +    resolve(Arg1, Env, RArg1),
                +    resolve(Arg2, Env, RArg2),
                +    do_op(Op, RArg1, RArg2, Res),
                +    write_env(Env, ResultVar, Res, NEnv),
                +    trace(Rest, NEnv, T, TraceAnchor).
                +
                +

                Note how the bodies of the trace rules correspond exactly to the bodies of +the interp rules, the only difference is the recursive call to trace. +The meaning of the arguments of trace is as follows: The first and second argument are +the operation currently executed and the environment, +like in the interpreter. The argument +after that is an output argument that collects the currently traced operation, +in the example above it is exactly like the operation that was executed. +TraceAnchor is additional information about the trace that is being built +right now, most of the time it is just handed on to the recursive call of +trace. We will see later what it contains.

                +

                The rule for print_and_stop is very simple, as execution (and therefore also +tracing) simply stops there:

                +
                trace(print_and_stop(V), Env, print_and_stop(V), _) :-
                +    resolve(V, Env, Val),
                +    print(Val), nl.
                +
                +

                Left are the rules for the control operations jump and if. A trace +linearizes one execution path, it contains no jumps. However, when a jump to the +starting label is reached, tracing should stop. Therefore, the implementation of +jump contains two cases:

                +
                trace(jump(L), Env, T, TraceAnchor) :-
                +    (TraceAnchor = traceanchor(L, FullTrace) ->
                +        T = loop,
                +        write(trace), nl, write(FullTrace), nl,
                +        do_optimize(FullTrace, OptTrace),
                +        write(opttrace), nl, write(OptTrace), nl,
                +        runtrace(OptTrace, Env, OptTrace)
                +    ;
                +        block(L, Block),
                +        trace(Block, Env, T, TraceAnchor)
                +    ).
                +
                +

                Let's dissect this code in small increments. First, we see what TraceAnchor +is. It is a term of the form +traceanchor(StartLabel, FullTrace). StartLabel is a label in the program +where tracing started (and where it should end as well, when the loop is +closed). The argument FullTrace is an accumulator which contains the full +trace that is being built right now.

                +

                The condition at the start of the rule checks whether the target label L is +the same as the one stored in the trace anchor. If that is the case, we can stop +tracing. The rest of the trace T is assigned the operation loop, which +jumps back to the beginning of the trace. Afterwards we print and optimize the +trace, then run it, using the FullTrace part of the traceanchor.

                +

                If the label we jump to is not the StartLabel we simply continue tracing +without recording any operation. This part of the rule is again extremely +similar to the interpretation of jump.

                +

                For now, we will not use any interesting optimizations, just return the +unoptimized trace unchanged:

                +
                do_optimize(FullTrace, FullTrace).
                +
                +

                The missing operation now is if. An if statement needs special treatment, +because it is a way where control flow can diverge from the trace. The trace is +linear, therefore it can only record one of the two possible paths. When +executing the trace it is possible for the other path to be taken. Therefore +we need to make sure that the same conditions that were true or false during +tracing are still true or false during the execution of the trace. This is done +with a guard operation, which checks for this condition. The following rule +implements it:

                +
                trace(if(V, L1, L2), Env, T, TraceAnchor) :-
                +    lookup(V, Env, Val),
                +    (Val == 0 ->
                +        L = L2, T = guard_false(V, [], L1, NT)
                +    ;
                +        L = L1, T = guard_true(V, [], L2, NT)
                +    ),
                +    trace(jump(L), Env, NT, TraceAnchor).
                +
                +

                It is very similar to the interp rule of if. The rule inserts a +guard_true into the trace if the condition is true, and a guard_false if +the condition is false. The arguments of the guard are: The variable that is +being guarded, an empty list (the reason for that will be explained in a later +post), the label where execution needs to continue when the guard fails and the +rest of the trace.

                +

                Let's also add a small helper predicate that can be used to conveniently start +tracing:

                +
                do_trace(L, Env) :-
                +    block(L, StartBlock),
                +    trace(StartBlock, Env, ProducedTrace, traceanchor(L, ProducedTrace)).
                +
                +

                The predicate takes a label and an environment and executes the label with the +given environment by first producing a trace, then executing the trace and +eventually jumping back to interpretation, if a guard fails. It does this by +reading the code at label L with the block statement, and then calling +trace with an unbound variable ProducedTrace to hold the trace and a trace +anchor that contains the label where tracing started and the produced trace +variable.

                +

                With that predicate and the trace so far we can already trace the power +implementation from the last blog post, just not execute the trace (which we +will do in the next section):

                +
                ?- do_trace(power_rec, [res/1, x/10, y/20]).
                +trace
                +op2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))
                +opttrace
                +op2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))
                +...
                +
                +

                The computed trace is:

                +
                +op2(res,mul,var(res),var(x),
                +op2(y,sub,var(y),const(1),
                +guard_true(y,[],power_done,
                +loop)))
                +
                +

                which is exactly the content of the loop from power_rec. Note how the if +is turned into a guard_true which jumps to power_done if the guard +fails.

                +

                A real tracing system would need a way for the tracer to get started, e.g. by +doing profiling in an interpreter and starting the tracer for labels that are +jumped to often. Also, traces for the same label are usually cached in some way. +These details are left out in this simple model.

                +

                Executing Traces

                +

                In a real tracing system, the traces would be turned into machine code and +executed by the CPU. In our small model, we will simply write another +interpreter for them. This interpreter is very simple and looks again very +similar to interp.

                +
                runtrace(op1(ResultVar, Op, Arg, Rest), Env, TraceFromStart) :-
                +    resolve(Arg, Env, RArg),
                +    do_op(Op, RArg, Res),
                +    write_env(Env, ResultVar, Res, NEnv),
                +    runtrace(Rest, NEnv, TraceFromStart).
                +
                +runtrace(op2(ResultVar, Op, Arg1, Arg2, Rest), Env, TraceFromStart) :-
                +    resolve(Arg1, Env, RArg1),
                +    resolve(Arg2, Env, RArg2),
                +    do_op(Op, RArg1, RArg2, Res),
                +    write_env(Env, ResultVar, Res, NEnv),
                +    runtrace(Rest, NEnv, TraceFromStart).
                +
                +

                These rules are completely equivalent to the interp rules for op1 and +op2. runtrace needs an extra argument, TraceFromStart, which is +always just handed over to the recursive call of runtrace.

                +

                When the end of the trace is reached and the loop statement is encountered, +we simply start from the beginning:

                +
                runtrace(loop, Env, TraceFromStart) :-
                +    runtrace(TraceFromStart, Env, TraceFromStart).
                +
                +

                The remaining question is what to do when encountering guards. In that case the +guard condition needs to be checked. If the guard succeeds, executing the trace can +continue. Otherwise the trace is aborted and the interpreter resumes execution:

                +
                runtrace(guard_true(V, ResumeVars, L, Rest), Env, TraceFromStart) :-
                +    lookup(V, Env, Val),
                +    (Val == 0 ->
                +        resume_interp(Env, ResumeVars, L)
                +    ;
                +        runtrace(Rest, Env, TraceFromStart)
                +    ).
                +
                +runtrace(guard_false(V, ResumeVars, L, Rest), Env, TraceFromStart) :-
                +    lookup(V, Env, Val),
                +    (Val == 0 ->
                +        runtrace(Rest, Env, TraceFromStart)
                +    ;
                +        resume_interp(Env, ResumeVars, L)
                +    ).
                +
                +
                +resume_interp(Env, [], L) :-
                +    block(L, Block),
                +    interp(Block, Env).
                +
                +

                Note how the execution is handed over to the interpreter at the label that was +encoded as the third argument in the guard operation. +What the ResumeVars are for we will see in a later post. For now we assume +that it is always an empty list.

                +

                With this interpreter for traces we can now trace and then execute the example:

                +
                :- do_trace(power_rec, [res/1, x/10, y/20]).
                +trace
                +op2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))
                +opttrace
                +op2(res,mul,var(res),var(x),op2(y,sub,var(y),const(1),guard_true(y,[],power_done,loop)))
                +100000000000000000000
                +
                +

                Of course this example is not very exciting, because the trace looks more or less exactly +like the original code as well. There will be more exciting examples in a later +post.

                +

                Extension: Promotion

                +

                As it is, the tracer does not actually add much to the interpreter. It +linearizes control flow, but nothing deeply advanced happens. In this section I +will add a crucial but simple to implement extension to the control flow language that allows the tracer +to do more interesting things. This extension is called promotion.

                +

                Promotion is basically a hint that the programmer can add to her control flow +graph program. A promotion is an operation promote(V, L) that takes a +variable V and a label L. When the interpreter runs this statement, it +simply jumps to the label L and ignores the variable:

                +
                interp(promote(_, L), Env) :-
                +    interp(jump(L), Env).
                +
                +

                However, the tracer does something much more interesting. For the tracer, the +promote statement is a hint that it would be very useful to know the value +of V and that the rest of the trace should keep that value as a constant. +Therefore, when the tracer encounters a promotion, it inserts a special kind of +guard called guard_value:

                +
                trace(promote(V, L), Env, guard_value(V, Val, [], L, T), TraceAnchor) :-
                +    lookup(V, Env, Val),
                +    trace(jump(L), Env, T, TraceAnchor).
                +
                +

                The guard_value is an interesting operation, because it freezes the current +value FVal of variable V into the trace. When the trace is executed, the +guard checks that the current value of the variable and the frozen value are the +same. If yes, execution continues, if not, the trace is aborted:

                +
                runtrace(guard_value(V, FVal, ResumeVars, L, Rest), Env, TraceFromStart) :-
                +    lookup(V, Env, Val),
                +    (Val == FVal ->
                +        runtrace(Rest, Env, TraceFromStart)
                +    ;
                +        resume_interp(Env, ResumeVars, L)
                +    ).
                +
                +

                What can this operation be used for? It's a way to communicate to the tracer +that variable V is not changing very often and that it is therefore useful +to freeze the current value into the trace. This can be done even without +knowing the value of V in advance.

                +

                Let's look at a (slightly contrived) example:

                +
                +l:
                +    c = i >= 0
                +    if c goto b else goto l_done
                +
                +l_done:
                +    print_and_stop(var(i))
                +
                +b:
                +    promote(x, b2)
                +
                +b2:
                +    x2 = x * 2
                +    x3 = x2 + 1
                +    i = i - x3
                +    goto l
                +
                +

                Encoded in Prolog syntax:

                +
                block(l, op2(c, ge, var(i), const(0),
                +         if(c, b, l_done))).
                +block(l_done, print_and_stop(var(i))).
                +
                +block(b, promote(x, b2)).
                +block(b2, op2(x2, mul, var(x), const(2),
                +          op2(x3, add, var(x2), const(1),
                +          op2(i, sub, var(i), var(x3),
                +          jump(l))))).
                +
                +

                This is a simple loop that counts down in steps of x * 2 + 1, whatever x +might be, until i >= 0 is no longer true. Assuming that x doesn't change +often, it is worth promoting it to be able to constant-fold x * 2 + 1 and +not have to redo it every iteration. This is done with the promotion of x +(of course optimizing this loop with loop invariant code motion would work as +well, because x doesn't actually change during the loop).

                +

                To trace this, we can run the following query:

                +
                ?- do_trace(b, [i/100, x/5]).
                +trace
                +guard_value(x,5,[],b2,op2(x2,mul,var(x),const(2),op2(x3,add,var(x2),const(1),op2(i,sub,var(i),var(x3),op2(c,ge,var(i),const(0),guard_true(c,[],l_done,loop))))))
                +opttrace
                +guard_value(x,5,[],b2,op2(x2,mul,var(x),const(2),op2(x3,add,var(x2),const(1),op2(i,sub,var(i),var(x3),op2(c,ge,var(i),const(0),guard_true(c,[],l_done,loop))))))
                +-10
                +
                +

                Writing the trace in a more readable way:

                +
                guard_value(x,5,[],b2,
                +op2(x2,mul,var(x),const(2),
                +op2(x3,add,var(x2),const(1),
                +op2(i,sub,var(i),var(x3),
                +op2(c,ge,var(i),const(0),
                +guard_true(c,[],l_done,
                +loop))))))
                +
                +

                After the guard_value the operations performed on x could be +constant-folded away, because the guard ensures that x is 5 before +execution continues. To actually do the constant-folding we would need some +optimization component that optimizes traces. This will be done in the next blog +post.

                +

                In this section I mostly talked about how promotion is realized in the tracer, +not what it is used for or how to use it. Promotion is one of the most important +ingredients that's responsible for the success of PyPy's tracing approach. How +this works is discussed in detail in the paper "Runtime feedback in a +meta-tracing JIT for efficient dynamic languages".

                +

                Conclusion

                +

                In this blog post we have seen a very minimalistic tracer and an interpreter for +the produced traces. The tracer is very much like the original interpreter, it +just also keeps track of which operations were executed, in addition to +executing the program. Tracing stops when a loop is closed, then the trace can +be optimized and run. Running a trace continues until a failing guard is hit. At +that point, execution goes back to the normal interpreter (and stays there, in +this very simple implementation).

                +

                I also presented an extension of tracing that makes it possible to add a hint +called promote to the original program that tells the tracer to feed back a +runtime value into the trace and freeze it there. This extension would be +impossible to do in the partial evaluator from the last post, because partial +evaluation is done strictly before run time, so if a variable isn't already +known, its likely runtime value cannot be found out.

                +

                In the next post I will show how to optimize traces before executing them and +how the optimizer for traces is related to partial evaluation.

                +
                +

                Comments

                +
                +
                +
                + + larsr wrote on 2012-02-01 13:54: +
                +
                +

                Hi, these posts are great!

                A question: shouldn't runtrace resume tracing instead of running the interpreter (in resume_interp)?

                And perhaps a clarification: when the blog post calls do_trace all of the necessary code has not been shown yet, so one can't really follow along at the keyboard there just yet.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2012-02-02 15:36: +
                +
                +

                @larsr: thanks!

                Yes, in principle you are right that there could be a mechanism that starts to trace from the point where a guard fails. This is an element of tracing JITs that the current code leaves off, it would need to be solved together with the caching of traces.

                A lot of things are just sketched in this implementation, e.g. only one trace ever is started, once you end up in the interpreter the tracer never starts again.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-03-04 11:46: +
                +
                +

                trace(if(V, L1, L2), Env, T, TraceAnchor) :-
                lookup(V, Env, Val),
                (Val == 0 ->
                L = L2, T = guard_false(V, [], L1, NT)
                ;
                L = L1, T = guard_true(V, [], L2, NT)
                ),
                trace(jump(L), Env, NT, TraceAnchor). This trac is okay, but python IDE is not Adroid supported. ビーグレン

                +
                +
                +
                +
                + + quadhier wrote on 2020-08-04 15:52: +
                +
                +

                Hi! Great posts. But the source code url is invalid now, could you please provide the source code again? THANKS!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/01/transactional-memory-ii-7225309560970774590.html b/posts/2012/01/transactional-memory-ii-7225309560970774590.html new file mode 100644 index 000000000..3659214a2 --- /dev/null +++ b/posts/2012/01/transactional-memory-ii-7225309560970774590.html @@ -0,0 +1,672 @@ + + + + + +Transactional Memory (II) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Transactional Memory (II)

                + + + +
                +

                Here is an update about the previous blog post about the +Global Interpreter Lock (GIL). In 5 months, the point of view +changed quite a bit.

                +

                Let me remind you that the GIL is the technique used in both CPython and +PyPy to safely run multi-threaded programs: it is a global lock that +prevents multiple threads from actually running at the same time. The +reason to do that is that it would have disastrous effects in the +interpreter if several threads access the same object concurrently --- to +the point that in CPython even just manipulating the object's reference +counter needs to be protected by the lock.

                +

                So far, the ultimate goal to enable true multi-CPU usage has been to +remove the infamous GIL from the interpreter, so that multiple threads +could actually run in parallel. It's a lot of work, but this has been +done in Jython. The reason that it has not been done in CPython so far +is that it's even more work: we would need to care not only about +carefully adding fine-grained locks everywhere, but also about reference +counting; and there are a lot more C extension modules that would need +care, too. And we don't have locking primitives as performant as +Java's, which have been hand-tuned for ages (e.g. to use help from the +JIT compiler).

                +

                But we think we have a plan to implement a different model for using +multiple cores. Believe it or not, this is better than just removing +the GIL from PyPy. You might get to use all your cores without ever +writing threads.

                +

                You would instead just use some event dispatcher, say from Twisted, from +Stackless, or from your favorite GUI; or just write your own. From +there, you (or someone else) would add some minimal extra code to the +event dispatcher's source code, to exploit the new transactional features +offered by PyPy. Then you would run your program on a +special version of PyPy, and voilà: you get some form of automatic parallelization. +Sounds magic, but the basic idea is simple: start handling multiple +events in parallel, giving each one its own transaction. More about +it later.

                + +

                Threads or Events?

                +

                First, why would this be better than "just" removing the GIL? Because +using threads can be a mess in any complex program. Some authors (e.g. +Lee) have argued that the reason is that threads are fundamentally +non-deterministic. This makes it very hard to reason about them. +Basically the programmer needs to "trim" down the non-determinism (e.g. +by adding locks, semaphores, etc.), and it's hard to be sure when he's +got a sufficiently deterministic result, if only because he can't write +exhaustive tests for it.

                +

                By contrast, consider a Twisted program. It's not a multi-threaded +program, which means that it handles the "events" one after the other. +The exact ordering of the events is not really deterministic, because +they often correspond to external events; but that's the only source of +non-determinism. The actual handling of each event occurs in a nicely +deterministic way, and most importantly, not in parallel with the +handling of other events. The same is true about other libraries like +GUI toolkits, gevent, or Stackless.

                +

                (Of course the Twisted and the Stackless models, to cite only these two, +are quite different from each other; but they have in common the fact +that they are not multi-threaded, and based instead on "events" --- +which in the Stackless case means running a tasklet from one switch() +point to the next one.)

                +

                These two models --- threads or events --- are the two main models we +have right now. The latter is more used in Python, because it is much +simpler to use than the former, and the former doesn't give any benefit +because of the GIL. A third model, which is the only one that gives +multi-core benefits, is to use multiple processes, and do inter-process +communication.

                + +

                The problem

                +

                Consider the case of a big program that has arbitrarily complicated +dependencies. Even assuming a GIL-less Python, this is likely enough to +prevent the programmer from even starting a multi-threaded rewrite, +because it would require a huge mess of locks. He could also consider +using multiple processes instead, but the result is annoying as well: +the complicated dependencies translate into a huge mess of inter-process +synchronization.

                +

                The problem can also be down-sized to very small programs, like the kind +of hacks that you do and forget about. In this case, the dependencies +might be simpler, but you still have to learn and use subtle locking +patterns or a complex inter-process library, which is overkill for the +purpose.

                +

                (This is similar to how explicit memory management is not very hard for +small programs --- but still, nowadays a lot of people agree that +automatic memory management is easier for programs of all sizes. I +think the same will eventually be true for using multiple CPUs, but the +correct solution will take time to mature, like garbage collectors did. +This post is a step in hopefully the right direction :-))

                + +

                Events in Transactions

                +

                Let me introduce the notion of independent events: two events are +independent if they don't touch the same set of objects. In a multi-threaded +world, it means that they can be executed in parallel without needing any lock +to ensure correctness.

                +

                Events might also be mostly independent, i.e. they rarely access the same +object concurrently. Of course, in a multi-threaded world we would still need +locks to ensure correctness, but the point is that the locks are rarely causing +pauses: lock contention is low.

                +

                Consider again the Twisted example I gave above. There are often several +events pending in the dispatch queue (assuming the program is using 100% +of our single usable CPU, otherwise the whole discussion is moot). The case I am +interested in is the case in which these events are generally mostly +independent, i.e. we expect few conflicts between them. However +they don't have to be proved independent. In fact it is fine if +they have arbitrarily complicated dependencies as described above. The +point is the expected common case. Imagine that you have a GIL-less +Python and that you can, by a wave of your hand, have all the careful +locking mess magically done. Then what I mean here is the case in which +such a theoretical program would run mostly in parallel on multiple +cores, without waiting too often on the locks.

                +

                In this case, the solution I'm proposing is that with minimal tweaks +in the event dispatch loop, we can +handle multiple events on multiple threads, each in its own transaction. +A transaction is basically a tentative execution of the corresponding +piece of code: if we detect conflicts with other concurrently executing +transactions, we abort the whole transaction and restart it from +scratch.

                +

                By now, the fact that it can basically work should be clear: multiple +transactions will only get into conflict when modifying the same data +structures, which is the case where the magical wand above would have +put locks. If the magical program could progress without too many +locks, then the transactional program can progress without too many +conflicts. In a way, you get even more than what the magical program +can give you: each event is dispatched in its own transaction, which +means that from each event's point of view, we have the illusion that +nobody else is running concurrently. This is exactly what all existing +Twisted-/Stackless-/etc.-based programs are assuming.

                +

                Note that this solution, without transactions, already exists in some +other languages: for example, Erlang is all about independent events. +This is the simple case where we can just run them on multiple cores, +knowing by construction of the language that you can't get conflicts. +Of course, it doesn't work for Python or for a lot of other languages. +From that point of view, what I'm suggesting is merely that +transactional memory could be a good model to cope with the risks of +conflicts that come from not having a special-made language.

                + +

                Not a perfect solution

                +

                Of course, transactional memory +(TM) is not a perfect solution either. Right now, the biggest issue is +the performance hit that comes from the software implementation (STM). +In time, hardware support (HTM) is likely to show up and help +mitigate the problem; but I won't deny the fact that in some cases, +because it's simple enough and/or because you really need the top +performance, TM is not the best solution.

                +

                Also, the explanations above are silent on what is a hard point for TM, +namely system calls. The basic general solution is to suspend other +transactions as soon as a transaction does its first system call, so +that we are sure that the transaction will succeed. Of course this +solution is far from optimal. Interestingly, it's possible to do better +on a case-by-case basis: for example, by adding in-process buffers, we +can improve the situation for sockets, by having recv() store in a +buffer what is received so that it can be re-recv()-ed later if the +transaction is aborted; similarly, send() or writes to log files can be +delayed until we are sure that the transaction will commit.

                +

                From my point of view, the most important point is that the TM solution +comes from the correct side of the "determinism" scale. With threads, +you have to prune down non-determinism. With TM, you start from a +mostly deterministic point, and if needed, you add non-determinism. The +reason you would want to do so is to make the transactions shorter: +shorter transactions have less risk of conflicts, and when there are +conflicts, fewer things to redo. So making transactions shorter +increases the parallelism that your program can achieve, while at the +same time requiring more care.

                +

                In terms of an event-driven model, the equivalent would be to divide the +response of a big processing event into several events that are handled +one after the other: for example, the first event sets things up and fires the second +event, which does the actual computation; and afterwards a third event +writes the results back. As a result, the second event's transaction +has little risk of getting aborted. On the other hand, the writing +back needs to be aware of the fact that it's not in the same transaction +as the original setting up, which means that other unrelated +transactions may have run in-between.

                + +

                One step towards the future?

                +

                These, and others, are the problems of the TM approach. They are "new" +problems, too, in the sense that the existing ways of programming don't +have these problems.

                +

                Still, as you have guessed, I think that it is overall a win, and +possibly a big win --- a win that might be on the same scale for the age +of multiple CPUs as automatic garbage collection was 20 years ago for +the age of RAM size explosion.

                +

                Stay tuned for more!

                +

                --- Armin (and reviews by Antonio and Fijal)

                + +
                UPDATE: please look at the tiny transaction module I wrote as an example. The idea is to have the same interface as this module, but implemented differently. By making use of transactional memory internally, it should be possible to safely run on multiple CPUs while keeping the very same programmer interface. +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2012-01-14 15:17: +
                +
                +

                Great article, great solution to a big problem...

                I am really looking forward to this :-)

                As an experiment I have developed Pyworks, which makes objects concurrent and methods asynchronious. But it makes little sense to do performance test on an multicore CPU because of the GIL.

                The code for Pyworks can be found at https://bitbucket.org/raindog/pyworks

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-14 15:38: +
                +
                +

                > These two models --- threads or events --- are the two main models we have right now.

                Where does Go-style concurrency fit in?

                +
                +
                +
                +
                + + gasche wrote on 2012-01-14 16:50: +
                +
                +

                If you go that road, you will certainly find out that Transactional Memory is much, much harder to get right than it looks like in today effectful/imperative languages. Sure, it looks wonderful on paper, but if your language doesn't help you control side-effects it will give you a very hard time.

                Currently, there is satisfying STM support in Haskell (because of its tight type-based control of side-effects) and Clojure (beacuse of its tight control on mutability), and it might be getting into Scala.

                I doubt Python can easily get such control, at least without an important reorganization of idiomatic practices and frameworks, that go beyond the "let's be event-driven" decision. Which makes your "this is going to work magically" story a bit hard to believe.

                There has been intense research on this topic for some decades now, and several attempts at getting it to work in current mainstream languages have mostly failed.

                See for example this long retrospective of the STM.NET effort at Microsoft Research, by Joe Duffy:
                A (brief) retrospective on transactional memory
                or this shorter blog post by Brian Hurt:
                The problem with STM: your languages still suck.

                I was a bit disappointed that you didn't cite any of the relevant literature in your post. It made me suspicious of "reiventing the wheel"...

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-14 16:57: +
                +
                +

                One major use-case for multithreading involves a large, unchanging data structure which many threads access. I.e., the data structure is loaded by a parent task, then not modified again; a number of threads are then spawned to use it for calculations.

                In CPython, the GIL makes this impossible if only because the reference counters need to be protected. With Cython in threads, however, you can turn off the GIL and do some work on C-style data structures.

                I'm wondering whether the STM PyPy effort could have a very useful, and very early, benefit: simply enabling an unchanging data structure to be accessed by a number of processors via the kinds of events you describe. There wouldn't be a need for transactions, because the programmer would take responsibility for only sharing unchanging structures between simultaneously-executing events.

                But it seems like the basic requirements for this kind of facility might be met in in early stage of STM development. And a solution that allowed multiple processors to access large, unchanging structures would be very useful in certain applications. I know I have one in mind that I'm looking at CPython/Cython for, but I'd rather see if I could get the performance I need from PyPy.

                Just thought it was worth mentioning.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-01-14 19:27: +
                +
                +

                @Anonymous: in the extract you cite I meant "the two main models in Python". As far as I can tell, Go does concurrency by enforcing all communications to go via channels, so I would classify it as a "special-made" language. This solution might be nice and usable, but it does not really work at all in languages like Python.

                +
                +
                +
                +
                + + Daniel Waterworth wrote on 2012-01-14 20:27: +
                +
                +

                @Armin, CSP may be built into Go, but IMO this was a mistake, there is no requirement for it to be a language feature; it fits nicer as library. See [python-csp] for a python implementation.

                [python-csp] https://code.google.com/p/python-csp/wiki/Tutorial

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-01-14 21:11: +
                +
                +

                @gasche: I know about Haskell, Clojure and Scala, and I just read the two blog posts you pointed to.

                I'm not talking about giving explicit TM to the end programmers. I'm instead considering TM as an internal, implementation-only feature. That makes it very similar to GCs.

                I know the points and issues of traditional TM systems, which are nicely reported by Joe Duffy in "A (brief) retrospective on transactional memory". These are of course perfectly valid issues, but I think they do not apply (or "not that much") in the particular context I'm talking about. For example, this includes the large sections about nested transactions, and about consistency between the transactional and non-transactional worlds (Weak or Strong Atomicity, The Privatization Problem). Even "Where is the Killer App?" is obvious in this case: any existing Twisted App is potentially a Killer App.

                Sorry for not including references to papers. I must admit I don't know any paper that describes a similar use case for TM.

                +
                +
                +
                +
                + + Simon Weber wrote on 2012-01-14 21:45: +
                +
                +

                The link to the previous blog post is broken. It should be: https://morepypy.blogspot.com/2011/06/global-interpreter-lock-or-how-to-kill.html

                +
                +
                +
                +
                + + Anonymous wrote on 2012-01-15 07:24: +
                +
                +

                > @Armin, CSP may be built into Go, but IMO this was a mistake, there is no requirement for it to be a language feature; it fits nicer as library. See [python-csp] for a python implementation.

                Stackless (which PyPy enables) supports Go-style channels as well, no?

                https://www.stackless.com/wiki/Channels

                +
                +
                +
                +
                + + René Dudfield wrote on 2012-01-15 08:03: +
                +
                +

                Your idea could work for other easy to inject into points, such as loops, and comprehensions. Especially with much of the work in pypy already done for identifying information about loops.

                How does this compare to grand central dispatch and blocks? https://en.wikipedia.org/wiki/Grand_Central_Dispatch

                Events are a very good way to model concurrency, and are widely used. It is a great place to dispatch concurrency into parallelism.

                Closures/blocks provide a fairly decent way to get some of the protection of STM - and in many programs give you the 80% solution. For code that plays nicely and avoids mutable, or global data - this works. Luckily, a lot of event based code is already written in this way. As you say, they are "generally mostly independent".

                Making the bad cases a quick fail, like in JavaScript worker threads could be an ok option. As soon as someone tries to access global data(do a system call, access the DOM, or access data outside the closure even), the program would fail there. Then you could fix those cases, or "add non-determinism" as you say. I think I'd prefer fail fast here, rather than have to detect these problems, and have them silently pass by.

                You still have scheduling problems, and trying to figure out task size. As well, this does not solve lots of other problems. However, it is cool that it could be applied automatically, and probably 'safely'.

                Another random thought... you could probably mark chunks of code as 'pure' as your run through them, and if they do a system call or mutate global data mark them as 'unpure' and don't try them again.

                I very much look forward to reading your results as you implement more.

                +
                +
                +
                +
                + + Eric van Riet Paap wrote on 2012-01-15 08:56: +
                +
                +

                When Armin gets this excited I'd fasten my seatbelt and put my goggles on.

                Thank you for letting me be an (otherwise mostly silent) observer.

                Please keep shifting boundaries!

                - Eric

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-01-16 10:08: +
                +
                +

                Update: please look at the tiny transaction module I wrote as an example. The idea is to have the same interface as this module, but implemented differently. By making use of transactional memory internally, it should be possible to safely run on multiple CPUs while keeping the very same programmer interface.

                https://bitbucket.org/arigo/arigo/raw/default/hack/stm/transactionmodule/

                +
                +
                +
                +
                + + René Dudfield wrote on 2012-01-16 12:11: +
                +
                +

                @Armin: That transaction code looks very simple. It seems trivial to implement a map/mapReduce style function on top of your transaction module.

                It is a very similar API to worker pool APIs which many thread using programs use. The main difference is that you combine the join() in the run method. It seems that a threaded web server for example could use this? What would happen if each incoming request comes in, and is put into the transaction (and say the 10th request has an error)? Would it be better to use multiple transactions?

                Have you thought how thread local storage would work?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-01-16 12:55: +
                +
                +

                @notme: yes, a web server or anything can use this instead of using threads. It's of course missing a convincing select() or poll() version for that.

                The details haven't been thought out; right now an exception interrupts everything. In an STM model it's unclear if concurrent transactions should still be allowed to complete or not. Anyway the point is that exceptions should not really occur because precisely they interrupt everything --- you would typically add instead in every transaction code like "try: .. except: traceback.print_exc()".

                Thread local storage: what would be the point?

                +
                +
                +
                +
                + + Unknown wrote on 2012-01-18 10:06: +
                +
                +

                I also see no reason for Thread local memory.

                I like the idea of thinking about TM in the same line as GC. When you have GC the changes to the language is that you don't need to write free/dealloc.

                Having TM would mean that you don't have to write acquire_GIL

                +
                +
                +
                +
                + + headius wrote on 2012-01-24 04:22: +
                +
                +

                The devil's in the details.

                I'm not sure I buy your conclusions here. STM is not a panacea for solving concurrency issues, and it has some key limitations that limit its general applicability.

                On what granularity do you plan to have transactions? How do you know? Perhaps the VM will have enough knowledge of a given thread's activities to limit transactional overhead to only those structures in memory that are shared, but there still needs to be some indirection in case another thread hops in and starts making changes.

                Where do transactions start and end? In STMs I know, the in-transaction overhead for reading and writing data is *much* higher, since it needs to know if someone else has committed a transaction first and be able to roll back.

                Perhaps this is all intended to be hidden, and you never actually have "threads" that the user can see. But if you're going to parallelize, you'll have threads *somewhere* that are going to contend for resources. If they're going to contend for resources, even in an STM, they're going to have to check for contention, register their interest, and then you're back to the indirection overhead.

                Perhaps I'm not understanding what your end goal is. You can't simply turn the world into a series of transactions unless you want every read and write to have transaction overhead or you have some clear way of limiting transaction overhead to only where it's needed. You cite Erlang...but Erlang deals with immutable objects, and there's far less need for anything like an STM. Others have mentioned Clojure...but again, Clojure is mostly immutable structures, and transactional overhead is limited to Refs, where you'll make single coarse-grained reads and writes.

                Am I missing the point? Are you not suggesting VM-wide STM, with the resulting transactional overhead for every read and write?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-01-24 10:03: +
                +
                +

                @Charles: Indeed, I am suggesting VM-wide STM, with the resulting transactional overhead for every read and write. I actually got such a VM yesterday (with no GC): it seems to be about 10x slower on a single thread.

                Note that even 10x slower is a plus if it scales to dozens of processors. But of course, a better point of view is that some years ago the regular pypy *was* 10x slower than CPython. It was a lot of effort but we managed to make it only 1.5-2x slower. And this is all without counting the JIT. If STM bogs down to a generally-not-triggered read barrier before every read, then the performance impact could be well under 2x.

                Please note also that I don't care about Java-like performance where even losing 10% of performance would be a disaster. If we end up with a pypy-tm that is 2x slower than a regular pypy, I would be quite happy, and I believe that there is a non-negligible fraction of the Python users that would be, too.

                On granularity: for now I'm going with the idea that the granularity is defined "naturally" in the source program as the amount of work done every time some central dispatch loop calls some code. There might be several dispatch loops in total, too. This is true in the cases I can think of: typical Twisted or Stackless programs, pypy's "translate.py", the richards benchmark, etc.

                Please look at https://paste.pocoo.org/show/539822/ for an example of what I'm talking about. It's a diff against the standard richards.py: it is a pure Python user program in which I added calls to the new 'transaction' module. At this level there is no hint of Transactional Memory.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-01-31 17:13: +
                +
                +

                @Gary Robinson: (off-topic:) for this kind of use case, you can use os.fork() after the immutable data is ready. It "kind of works" both in pypy and in cpython, although not really --- in cpython the reference counts are modified, causing the pages to get unshared between processes; and in pypy the garbage collector (GC) has the same effect, so far. It could be solved in pypy by more tweaks to the GC.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-02-01 18:43: +
                +
                +

                @armin: @Anonymous: in the extract you cite I meant "the two main models in Python". As far as I can tell, Go does concurrency by enforcing all communications to go via channels, so I would classify it as a "special-made" language. This solution might be nice and usable, but it does not really work at all in languages like Python.

                Armin, Stackless Python uses a model that at the API level is very similar to Go. Go borrows from the Bell Labs family of languages (i.e. Newsqueak). The fundamental idea is that message passing is used to share information between threads/processes/coroutines. In this regard, Go is in the same camp as say, Erlang (although the messaging systems are different).


                What I think is interesting and workable for Python are efforts in languages like Polyphonic C# (see the paper "Scalable Join Patterns") and Concurrent/Parallel ML, where lock-free libraries and STM techniques are used under the hood to improve the efficiency of the messaging/synchronisation system. In this fashion, the programmer has a conceptually clean concurrency model and still can make the important decisions about how to partition the problem.

                Cheers,
                Andrew

                +
                +
                +
                +
                + + Anonymous wrote on 2012-02-01 18:59: +
                +
                +

                @daniel@Armin, CSP may be built into Go, but IMO this was a mistake, there is no requirement for it to be a language feature; it fits nicer as library. See [python-csp] for a python library

                I have looked at Python-CSP a long time ago. I recall it being verbose. However I use Stackless Python. And using PyPy's stackless.py, I implemented select() and join patterns. Sometimes I wish I had language support: they cut down on silly mistakes and make the code less verbose for simple cases. However what I have found is that the language can get in the way. For instance, in Go, one has to come up with hacks to do something simple like do a select on an arbitrary number of channels. Perhaps I am wrong but I suspect stuff like select()'s design was influenced by the fact Newsqueak was originally designed to make a windowing system easier to write. So one is monitoring only a handful of channels. In contrast, this is not the way Stackless Python programmes are written.

                Cheers,
                Andrew

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-02-01 20:39: +
                +
                +

                A link to a group that did the same thing (thanks a lot Andrew for this link!):

                https://research.microsoft.com/en-us/projects/ame/

                In particular the May 2007 paper (HotOS) nicely summarizes exactly what I'm trying to say, and I think it is clearer than me, if I have to judge from feedback :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2012-02-27 17:57: +
                +
                +

                Speaking as someone maintaining a large application that uses Twisted, this sounds great.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.html b/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.html new file mode 100644 index 000000000..16fca1489 --- /dev/null +++ b/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.html @@ -0,0 +1,669 @@ + + + + + +Almost There - PyPy's ARM Backend | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Almost There - PyPy's ARM Backend

                + + + +
                +
                +In this post I want to give an update on the status of the ARM backend for PyPy's JIT and describe some of the issues and details of the backend.
                +
                +

                + + + + +Current Status

                +It has been more than a year that I have been working on the ARM backend. Now it is in a shape that lets us measure meaningful numbers and also ask for some feedback. Since the last post about the backend we have added support for floating point operations as well as for PyPy's framework GCs. Another area of work was to keep up with the constant improvements done in the main development branch, such as out-of-line guards, labels, etc. It has been possible for about a year to cross-translate the PyPy Python interpreter and other interpreters such as Pyrolog, with a JIT, to run benchmarks on ARM. Up until now there remained some hard to track bugs that would cause the interpreter to crash with a segmentation fault in certain cases when running with the JIT on ARM. Lately it was possible to run all benchmarks without problems, but when running the translation toolchain itself it would crash. During the last PyPy sprint in Leysin Armin and I managed to fix several of these hard to track bugs in the ARM backend with the result that it is now possible to run the PyPy translator on ARM itself (at least until it runs out of memory), which is a kind of litmus test for the backend itself and used to crash before. Just to point it out, we are not able to complete a PyPy translation on ARM, because on the hardware we have currently available there is not enough memory. But up to the point we run out of memory the JIT does not hit any issues.

                +
                +
                +

                + + + + +Implementation Details

                +The hardware requirements to run the JIT on ARM follow those for Ubuntu on ARM which targets ARMv7 with a VFP unit running in little endian mode. The JIT can be translated without floating point support, but there might be a few places that need to be fixed to fully work in this setting. We are targeting the ARM instruction set, because at least at the time we decided to use it, it seemed to be the best choice in terms of speed while having some size overhead compared to the Thumb2 instruction set. It appears that the Thumb2 instruction set should give comparable speed with better code density but has a few restrictions on the number of registers available and the use of conditional execution. Also the implementation is a bit easier using a fixed width instruction set and we can use the full set of registers in the generated code when using the ARM instruction set.

                +
                +
                +

                + + + + +The calling convention on ARM

                +The calling convention on ARM uses 4 of the general purpose registers to pass arguments to functions, further arguments are passed on the stack. The presence of a floating point unit is not required for ARM cores, for this reason there are different ways of handling floats with relation to the calling convention. There is a so called soft-float calling convention that is independent of the presence of a floating point unit. For this calling convention floating point arguments to functions are stored in the general purpose registers and on the stack. Passing floats around this way works with software and hardware floating point implementations. But in presence of a floating point unit it produces some overhead, because floating point numbers need to be moved from the floating point unit to the core registers to do a call and moved back to the floating point registers by the callee. The alternative calling convention is the so-called hard-float calling convention which requires the presence of a floating point unit but has the advantage of getting rid of the overhead of moving floating point values around when performing a call. Although it would be better in the long term to support the hard-float calling convention, we need to be able to interoperate with external code compiled for the operating system we are running on. For this reason at the moment we only support the soft-float to interoperate with external code. We implemented and tested the backend on a BeagleBoard-xM with a Cortex-A8 processor running Ubuntu 11.04 for ARM.

                +
                +
                +

                + + + + +Translating for ARM

                +The toolchain used to translate PyPy currently is based on Scratchbox2. Scratchbox2 is a cross-compiling environment. Development had stopped for a while, but it seems to have revived again. We run a 32-bit Python interpreter on the host system and perform all calls to the compiler using a Scratchbox2 based environment. A description on how to set up the cross translation toolchain can be found here.

                +
                +
                +

                + + + + +Results

                +The current results on ARM, as shown in the graph below, show that the JIT currently gives a speedup of about 3.5 times compared to CPython on ARM. The benchmarks were run on the aforementioned BeagleBoard-xM with a 1GHz ARM Cortex-A8 processor and 512MB of memory. The operating system on the board is Ubuntu 11.04 for ARM. We measured the PyPy interpreter with the JIT enabled and disabled comparing each to CPython Python 2.7.1+ (r271:86832) for ARM. The graph shows the speedup or slowdown of both PyPy versions for the different benchmarks from our benchmark suite normalized to the runtime of CPython. The data used for the graph can be seen below.
                + +
                +
                +The speedup is less than the speedup of 5.2 times we currently get on x86 on our own benchmark suite (see https://speed.pypy.org for details). There are several possible reasons for this. Comparing the results for the interpreter without the JIT on ARM and x86 suggests that the interpreter generated by PyPy, without the JIT, has a worse performance when compared to CPython than it does on x86. Also it is quite possible that the code we are generating with the JIT is not yet optimal. Also there are some architectural constraints that produce some overhead. One of these differences is the handling of constants: most ARM instructions only support 8 bit (that can be shifted) immediate values, larger constants need to be loaded into a register, something that is not necessary on x86.

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Benchmark | PyPy JIT | PyPy no JIT
                ai | 0.484439780047 | 3.72756749625
                chaos | 0.0807291691934 | 2.2908692212
                crypto_pyaes | 0.0711114832245 | 3.30112318509
                django | 0.0977743245519 | 2.56779947601
                fannkuch | 0.210423735698 | 2.49163632938
                float | 0.154275334675 | 2.12053281495
                go | 0.330483034202 | 5.84628320479
                html5lib | 0.629264389862 | 3.60333138526
                meteor-contest | 0.984747426912 | 2.93838610037
                nbody_modified | 0.236969593082 | 1.40027234936
                pyflate-fast | 0.367447191807 | 2.72472422146
                raytrace-simple | 0.0290527461437 | 1.97270054339
                richards | 0.034575573553 | 3.29767342015
                slowspitfire | 0.786642551908 | 3.7397367403
                spambayes | 0.660324379456 | 3.29059863111
                spectral-norm | 0.063610783731 | 4.01788986233
                spitfire | 0.43617131165 | 2.72050579076
                spitfire_cstringio | 0.255538702134 | 1.7418593111
                telco | 0.102918930413 | 3.86388866047
                twisted_iteration | 0.122723986805 | 4.33632475491
                twisted_names | 2.42367797135 | 2.99878698076
                twisted_pb | 1.30991837431 | 4.48877805486
                twisted_tcp | 0.927033354055 | 2.8161624665
                waf | 1.02059811932 | 1.03793427321
                +
                +

                +

                + + + + +The next steps and call for help

                +Although there probably still are some remaining issues which have not surfaced yet, the JIT backend for ARM is working. Before we can merge the backend into the main development line there are some things that we would like to do first, in particular we are looking for a way to run all the PyPy tests to verify that things work on ARM before we can merge. Additionally there are some other longterm ideas. To do this we are looking for people willing to help, either by contributing to implement the open features or that can help us with hardware to test.

                +The incomplete list of open topics:
                  +
                • We are looking for a better way to translate PyPy for ARM than the one described above. I am not sure if there currently is hardware with enough memory to directly translate PyPy on an ARM based system; this would require between 1.5 and 2 GB of memory. A fully QEMU based approach could also work, instead of Scratchbox2 that uses QEMU under the hood.
                • +
                • Test the JIT on different hardware.
                • +
                • Experiment with the JIT settings to find the optimal thresholds for ARM.
                • +
                • Continuous integration: We are looking for a way to run the PyPy test suite to make sure everything works as expected on ARM, here QEMU also might provide an alternative.
                • +
                • A long term plan would be to port the backend to ARMv5 ISA and improve the support for systems without a floating point unit. This would require to implement the ISA and create different code paths and improve the instruction selection depending on the target architecture.
                • +
                • Review of the generated machine code the JIT generates on ARM to see if the instruction selection makes sense for ARM.
                • +
                • Build a version that runs on Android.
                • +
                • Improve the tools, i.e. integrate with jitviewer.
                • +
                +So if you are interested or willing to help in any way contact us.
                +
                +

                Comments

                +
                +
                +
                + + Michael Hudson-Doyle wrote on 2012-02-02 00:20: +
                +
                +

                Awesome news. We might be able to donate some time in the Linaro validation lab to running tests, I'll see what we can do...

                +
                +
                +
                +
                + + Anonymous wrote on 2012-02-02 21:55: +
                +
                +

                "Just to point it out, we are not able to complete a PyPy translation on ARM, because on the hardware we have currently available there is not enough memory."

                Can't you just add more swap?

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2012-02-03 09:08: +
                +
                +

                You wrote: "The speedup is less than the speedup of 5.2 times you currently get on x86."

                The removed comment was meant to point out that the author of the blog post does not (and cannot) know the actual speedups (and slowdowns) people are getting on their machines.

                The speedup of 5.2 you mentioned is contradicting my own experience.

                I suggest you rewrite your sentence into "The speedup is less than the speedup of 5.2 times we currently get on x86."

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-02-03 09:15: +
                +
                +

                "The speedup of 5.2 you mentioned is contradicting my own experience."

                Did you run the very same benchmark suite or some arbitrary programs? If arbitrary programs then sorry, but it's seriously impossible for us to optimize stuff we don't know about. Please submit bug tracker issues for that.

                I agree the speedup should be qualified "on our own benchmark suite", but if you don't contribute benchmarks, you can't complain.

                +
                +
                +
                +
                + + David Schneider wrote on 2012-02-03 11:30: +
                +
                +

                @⚛ to avoid misunderstandings I updated the sentence in question to make it clear that I was comparing the performance of the ARM backend running our own benchmark suite to the results of the benchmarks as shown on speed.pypy.org.

                +
                +
                +
                +
                + + Naos wrote on 2012-02-03 23:19: +
                +
                +

                Every time I visit comments to blog posts on this page I see some hater or two or even more who, don't know why, have weird problems without a reason. People chill out, you get this brilliant piece of software and you do not have to pay for it.

                +
                +
                +
                +
                + + Jan Ziak (atomsymbol) wrote on 2012-02-04 08:36: +
                +
                +

                @Naos: I do *not* hate PyPy. I like it and want to make it better. To do that, I am using a method different from your method. I would like the PyPy team to figure out how to run my benchmark faster without me disclosing the name of the benchmark. I believe that in the end this method will lead to a couple of universal optimizations in PyPy.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-02-04 09:38: +
                +
                +

                @⚛ Contrary to what you might believe, this ends up with you being annoying and nothing else. If you think we don't think all the time about general improvements, you're wrong, but pointless, unscientific complaining is not welcomed here.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-02-06 10:55: +
                +
                +

                will it work on the raspberry pi or does it use a different arm architecture?

                (i am confused by arms. :) apparently it isn't like in the x86 world where everything is compatible with each other.)

                +
                +
                +
                +
                + + Anonymous wrote on 2012-02-06 11:46: +
                +
                +

                @Anonymous

                > The hardware requirements to run the JIT on ARM follow those for Ubuntu on ARM which targets ARMv7 with a VFP unit running in little endian mode.

                This is higher than Raspberry Pi's ARMv6.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-02-06 22:14: +
                +
                + A long term plan would be to port the backend to ARMv5 ISA and improve the support for systems without a floating point unit. This would require to implement the ISA and create different code paths and improve the instruction selection depending on the target architecture.

                so someday it will support the v6 too? or isn't v5 and v6 compatible either? +
                +
                +
                +
                + + David Schneider wrote on 2012-02-09 14:18: +
                +
                +

                @Anonymous ARMv6 should be backwards compatible to the ARMv5 ISA. So it should be possible to use the JIT on ARMv6 once it works for ARMv5.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-02-10 08:10: +
                +
                +

                Really nice with raspberry Pi being released and ARM proto boards gaining traction. Looking forward to develop w/ this on rPi

                +
                +
                +
                +
                + + d.q. wrote on 2012-03-14 16:42: +
                +
                +

                I have an arm system with enough swap, microsd or cifs or both... though a setup like this will probably take until next release to translate pypy, won't it?
                Can also do usermode qemu of course.

                +
                +
                +
                +
                + + David Schneider wrote on 2012-04-18 10:05: +
                +
                +

                @d.q. It would at least take until the next release, if not several...
                A qemu based solution would be interesting. If you are interested I would propose you join #pypy on IRC to discuss

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-01 01:28: +
                +
                +

                I'm pretty sure the guys over at Boundary Devices are making a SabreLite variant with 2GB ram.

                https://boundarydevices.com/imx6-options-single-dual-core-2gb-ddr/

                You may need to email them directly to purchase it still, but they're responsive and a pleasure to work with.

                +
                +
                +
                +
                + + Eric van Riet Paap wrote on 2013-03-19 14:15: +
                +
                +

                Anyone try to create a qemu based solution? Even if only a build system. I would be interested in having some documentation about how to build pypy for Raspberry Pi. I know it's a non-compatible ARM version but we have to start somewhere. Plus I have a few RPis lying around to play with...

                +
                +
                +
                +
                + + Eric van Riet Paap wrote on 2013-03-19 14:17: +
                +
                +

                Anyone tried to build with a qemu setup? I have several Raspberry Pi's around that I could play with. I know the JIT will not work because of the difference in ARM versions but it's a start.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/02/introductionary-article-about-rpython-5386281283454207551.html b/posts/2012/02/introductionary-article-about-rpython-5386281283454207551.html new file mode 100644 index 000000000..03658bb92 --- /dev/null +++ b/posts/2012/02/introductionary-article-about-rpython-5386281283454207551.html @@ -0,0 +1,317 @@ + + + + + +Introductory Article About RPython | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Introductory Article About RPython

                + + + +
                + Laurence Tratt from King's College London has written a long and detailed introduction to the goals and significance of RPython over on his blog. Laurie has been implementing his Converge Language in RPython in the last months. He is one of the first people external to the PyPy team who have pushed a sizeable RPython-based VM quite far, adding and tuning JIT hints. The post describes some of that work and his impressions of RPython and PyPy.

                +"RPython, to my mind, is an astonishing project. It has, almost single-handedly, opened up an entirely new approach to VM implementation. As my experience shows, creating a decent RPython VM is not a huge amount of work (despite some frustrations). In short: never again do new languages need come with unusably slow VMs. That the the PyPy / RPython team have shown that these ideas scale up to a fast implementation of a large, real-world language (Python) is another feather in their cap." 
                +
                +

                Comments

                +
                +
                +
                + + Luis wrote on 2012-02-10 01:38: +
                +
                +

                My English is not very good, but I suspect "Introductionary" is not a word. I would use "introductory" instead.

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2012-02-10 10:07: +
                +
                +

                It probably wasn't before, but now it is! (and a pretty nice word, no?)

                Fixed.

                +
                +
                +
                +
                + + Luis wrote on 2012-02-10 23:56: +
                +
                +

                "It probably wasn't before, but now it is! (and a pretty nice word, no?)"

                Well, it surely sounds more sophisticated :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/02/larger-example-for-flow-graph-language-6139699450091061040.html b/posts/2012/02/larger-example-for-flow-graph-language-6139699450091061040.html new file mode 100644 index 000000000..3ea2066a4 --- /dev/null +++ b/posts/2012/02/larger-example-for-flow-graph-language-6139699450091061040.html @@ -0,0 +1,696 @@ + + + + + +A Larger Example for the Flow Graph Language | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                A Larger Example for the Flow Graph Language

                + + + +
                +

                Part 4 of Comparing Partial Evaluation to Tracing

                +

                This is the fourth and final blog post in a series about comparing partial evaluation and +tracing. We've come a long way: In the first post of the series I showed an interpreter for a small flow-graph +language together with a partial evaluator for it. In the second post I showed how a tracer for +the same language works and how it relates to both execution and to partial +evaluation. The third post described an optimizer for traces.

                +

                In this final post we can compare and contrast the two different approaches of +tracing and partial evaluation by means of an example. The programs in the flow +chart language seen so far have been rather small, so I want to give an example +of a larger program: an interpreter for an extremely simple bytecode +instruction set. I will look at how the partial evaluator deals with that +interpreter, and +what the tracer does with it. The code for +that, as well as all the code of the series can be found here: https://paste.pocoo.org/show/550282/ (some small +additions have been made, such as a nicer way to print traces).

                +

                A Bytecode Interpreter

                +

                Writing programs in the flow graph language is painful, but I still want to give +an example that is a bit more interesting than the tiny ones that we've seen so +far. The example is an interpreter for the bytecode of a very trivial +register-based language. The language has four registers, one of which is an +accumulator on which all the actual operations are performed.

                +

                The opcodes of the language are:

                +
                  +
                • +jump_if_a, jumps to a target address when the accumulator is non-zero
                • +
                • +mov_a_r0, mov_a_r1, mov_a_r2 move the value of the accumulator to +the respective register
                • +
                • +mov_r0_a, mov_r1_a, mov_r2_a move the value of a register to +the accumulator
                • +
                • +add_r0_to_a, add_r1_to_a, add_r2_to_a add the value of the +register to the accumulator
                • +
                • +decr_a decrement the accumulator
                • +
                • +return_a stop the program and print the accumulator
                • +
                +

                The interpreter has a main loop that reads the opcode at the current program +counter, does a (lengthy) dispatch to the right bytecode via a series of if +statements and then executes the right opcode. Afterwards the next opcode is +treated equivalently.

                +

                Here is a part of the source code in the flow graph language. As pseudocode:

                +
                +bytecode_loop:
                +    opcode = bytecode[pc]
                +    pc = pc + 1
                +    c = opcode == 'jump_if_a'
                +    if c goto op_jump_if_a else goto not_jump_if_a
                +
                +# select the right bytecode via a long series of if statements
                +not_jump_if_a:
                +    c = opcode == 'mov_a_r0'
                +    if c goto op_mov_a_r0 else goto not_mov_a_r0
                +not_mov_a_r0:
                +    c = opcode == 'mov_a_r1'
                +    if c goto op_mov_a_r1 else goto not_mov_a_r1
                +...
                +
                +# bytecode implementations
                +op_mov_a_r0:
                +    r0 = a
                +    goto bytecode_loop
                +
                +op_jump_if_a:
                +    c = a == 0
                +    target = bytecode[pc]
                +    pc += 1
                +    if c goto bytecode_loop else goto op_jump_if_a_jump
                +
                +op_jump_if_a_jump:
                +    pc = target
                +    goto bytecode_loop
                +...
                +
                +

                And actually working, as Prolog facts (the full implementation can be found at +the link above):

                +
                % bytecode dispatch loop
                +block(bytecode_loop,
                +      op2(opcode, readlist, var(bytecode), var(pc),
                +      op2(pc, add, var(pc), const(1),
                +      op2(c, eq, var(opcode), const(jump_if_a),
                +      if(c, op_jump_if_a, not_jump_if_a))))).
                +
                +% select the right bytecode via a long series of if statements
                +block(not_jump_if_a,
                +      op2(c, eq, var(opcode), const(mov_a_r0),
                +      if(c, op_mov_a_r0, not_mov_a_r0))).
                +block(not_mov_a_r0,
                +      op2(c, eq, var(opcode), const(mov_a_r1),
                +      if(c, op_mov_a_r1, not_mov_a_r1))).
                +...
                +
                +% bytecode implementations
                +block(op_jump_if_a,
                +      op2(c, eq, var(a), const(0),
                +      op2(target, readlist, var(bytecode), var(pc),
                +      op2(pc, add, var(pc), const(1),
                +      if(c, bytecode_loop, op_jump_if_a_jump))))).
                +block(op_jump_if_a_jump,
                +      op1(pc, same, var(target),
                +      promote(bytecode, bytecode_loop))).
                +block(op_mov_a_r0,
                +      op1(r0, same, var(a), jump(bytecode_loop))).
                +...
                +
                +

                The bytecode_loop block is the main dispatch loop. It reads an opcode out of the +bytecode list at the program counter position, then has a long series of if +statements that compares the current opcode to the various existing opcodes. +The full code of the interpreter can be found under the link above.

                +

                The bytecodes of the interpreter don't really permit hugely complex +programs, but they can be used to compute the square of a +number with the following program:

                +
                +mov_a_r0     # r0 = a
                +mov_a_r1     # r1 = a
                +# 2:
                +mov_r0_a     # r0--
                +decr_a
                +mov_a_r0
                +mov_r2_a     # r2 += a
                +add_r1_to_a
                +mov_a_r2
                +mov_r0_a     # if r0!=0: goto 2
                +jump_if_a 2
                +mov_r2_a     # return r2
                +return_a
                +
                +

                Partially Evaluating the Bytecode Interpreter

                +

                The partial evaluator from the first blog post can be easily used to partially +evaluate the bytecode interpreter. The static input is the bytecode for +computing the square and the initial program counter value, as given above. The +dynamic input is the content of the accumulator (the number to be squared). +This can be done as follows:

                +
                ?- bytecode_square(B),
                +Env = [bytecode/B, pc/0],
                +do_pe(bytecode_loop, Env, Label),
                +REnv = [a/16, r0/0, r1/0, r2/0],
                +interp(jump(Label), REnv), listing(block).
                +256
                +:- dynamic block/2.
                +
                +<lots of generated code>
                +
                +

                The code that is generated by the partial evaluation process is somewhat hard to +read. It contains a lot of passages like this:

                +
                ...
                +block(op_return_a1, print_and_stop(var(a))).
                +block(not_decr_a1, jump(op_return_a1)).
                +block(not_add_r2_to_a2, jump(not_decr_a1)).
                +block(not_add_r1_to_a2, jump(not_add_r2_to_a2)).
                +block(not_add_r0_to_a3, jump(not_add_r1_to_a2)).
                +block(not_mov_r2_a3, jump(not_add_r0_to_a3)).
                +block(not_mov_r1_a5, jump(not_mov_r2_a3)).
                +block(not_mov_r0_a5, jump(not_mov_r1_a5)).
                +block(not_mov_a_r27, jump(not_mov_r0_a5)).
                +block(not_mov_a_r18, jump(not_mov_a_r27)).
                +block(not_mov_a_r09, jump(not_mov_a_r18)).
                +block(not_jump_if_a11, jump(not_mov_a_r09)).
                +block(bytecode_loop12, jump(not_jump_if_a11)).
                +block(op_mov_r2_a2, op1(a, same, var(r2), jump(bytecode_loop12))).
                +...
                +
                +

                I.e. lots of blocks that do nothing but jump to another block, interspersed with +some blocks that contain an actual operation. I cleaned the output up manually +and got something like the following (this sort of cleanup is something a good +partial evaluation system would do itself, after partial evaluation has +occurred):

                +
                block(bytecode_loop1,
                +    op1(r0, same, var(a),
                +    op1(r1, same, var(a),
                +    op1(a, same, var(r0),
                +    op2(a, sub, var(a), const(1),
                +    op1(r0, same, var(a),
                +    op1(a, same, var(r2),
                +    op2(a, add, var(a), var(r1),
                +    op1(r2, same, var(a),
                +    op1(a, same, var(r0),
                +    op2(c, eq, var(a), const(0),
                +    if(c, bytecode_loop11, op_jump_if_a_jump1)))))))))))).
                +
                +block(bytecode_loop11,
                +    op1(a, same, var(r2),
                +    print_and_stop(var(a))).
                +
                +block(op_jump_if_a_jump1,
                +    op1(a, same, var(r0),
                +    op2(a, sub, var(a), const(1),
                +    op1(r0, same, var(a),
                +    op1(a, same, var(r2),
                +    op2(a, add, var(a), var(r1),
                +    op1(r2, same, var(a),
                +    op1(a, same, var(r0),
                +    op2(c, eq, var(a), const(0),
                +    if(c, bytecode_loop11, op_jump_if_a_jump1)))))))))).
                +
                +

                What do we see here? The partial evaluator has generated a block bytecode_loop1, +which corresponds to the initialization opcodes mov_a_r0 and mov_a_r1 together +with one iteration of the loop. Then it either jumps to a copy of the main loop +(label op_jump_if_a_jump1) or to block bytecode_loop11, which prints the result +and then stops. The residual code does exactly what the bytecode did: It +squares the accumulator then prints that. All the uses of the bytecode and +pc variables are gone.

                +

                Why did the partial evaluator produce two copies of the main loop that +look the same? The reason for that is that in the second copy, the additional +static information target = 2 is known, where target is a variable in +the interpreter source that stores the jump target, for very brief periods of +time. This additional static information does not have any effect on the +residual code, so the same code is uselessly generated twice. This is an +example of overspecialization.

                +

                Tracing the Interpreter

                +

                In this section we will look at what happens if we try to trace the interpreter. +The naive way of doing that yields traces that are not very useful, because they +abort after one iteration. We will look at a way of avoiding this problem. The +problems described in this section are at the core of the paper Tracing the +meta-level: PyPy's tracing JIT compiler (that paper uses a slightly more +advanced version of the bytecode interpreter as an example).

                +

                To trace the interpreter, it is useful to change the bytecode_loop block from above +to always promote the bytecode and the pc variables, because without +knowing them the trace produced is not really interesting. This is similar to +making these variables static in the partial evaluation example above:

                +
                block(bytecode_loop,
                +      promote(bytecode, bytecode_loop_promote_bytecode)).
                +block(bytecode_loop_promote_bytecode,
                +      promote(pc, bytecode_loop_promote_pc)).
                +block(bytecode_loop_promote_pc,
                +      op2(opcode, readlist, var(bytecode), var(pc),
                +      op2(pc, add, var(pc), const(1),
                +      op2(c, eq, var(opcode), const(jump_if_a),
                +      if(c, op_jump_if_a, not_jump_if_a))))).
                +...
                +
                +

                The rest of the interpreter stays unchanged.

                +

                To trace the interpreter we would start naively at the bytecode_loop label, because +that's the label in the interpreter that is jumped to most often (which a +profiler could establish easily). The following command can be used for that +(this output prints traces in a slightly more readable way than in previous blog +posts):

                +
                ?- bytecode_square(B),
                +        A = 16, Env = [bytecode/B, pc/2, a/A, r0/A, r1/A, r2/0],
                +        do_trace(bytecode_loop, Env).
                +trace
                +  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
                +  guard_value(pc,2,[],bytecode_loop_promote_pc)
                +  op2(opcode,readlist,var(bytecode),var(pc))
                +  op2(pc,add,var(pc),const(1))
                +  op2(c,eq,var(opcode),const(jump_if_a))
                +  guard_false(c,[],op_jump_if_a)
                +  op2(c,eq,var(opcode),const(mov_a_r0))
                +  guard_false(c,[],op_mov_a_r0)
                +  op2(c,eq,var(opcode),const(mov_a_r1))
                +  guard_false(c,[],op_mov_a_r1)
                +  op2(c,eq,var(opcode),const(mov_a_r2))
                +  guard_false(c,[],op_mov_a_r2)
                +  op2(c,eq,var(opcode),const(mov_r0_a))
                +  guard_true(c,[],not_mov_r0_a)
                +  op1(a,same,var(r0))
                +  loop
                +
                +opttrace
                +  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
                +  guard_value(pc,2,[bytecode/[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]],bytecode_loop_promote_pc)
                +  op1(a,same,var(r0))
                +  op1(bytecode,same,const([mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]))
                +  op1(pc,same,const(3))
                +  op1(opcode,same,const(mov_r0_a))
                +  op1(c,same,const(1))
                +  loop
                +
                +256
                +B = [mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a, mov_a_r2, mov_r0_a|...],
                +A = 16,
                +Env = [bytecode/[mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a|...], pc/2, a/16, r0/16, r1/16, r2/0]
                +
                +

                These traces are very short. They start with promoting the bytecode and the +pc, followed by the execution of the opcode mov_r0_a, which is the +one at position 2 in the given bytecode. Then they increment the pc and +loop back to the beginning. Looking at the optimized trace, it is clear that the +trace is essentially useless. It will run only for one iteration, because in the +second iteration the pc is 3, thus the guard_value at the beginning +will fail.

                +

                This problem can be solved by tracing more than just one iteration of the +bytecode dispatch loop, which is called meta-tracing. To get this behaviour, in +this simple example it is enough to start (and thus end) tracing at a different +label, op_jump_if_a_jump. This label is hit when the interpreter executes a +jump_if_a bytecode and the jump is taken. In a loop on the level of the +executed bytecode program there is one such jump. Thus tracing from this label, +a full loop in the bytecode program is traced, containing potentially many +iterations of the bytecode dispatch loop in the control flow graph language.

                +

                Doing that yields the following:

                +
                ?- bytecode_square(B),
                +        A = 16, Env = [bytecode/B, pc/11, a/A, r0/A, r1/A, r2/0, target/2],
                +        do_trace(op_jump_if_a_jump, Env).
                +trace
                +  op1(pc,same,var(target))
                +  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop)
                +  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
                +  guard_value(pc,2,[],bytecode_loop_promote_pc)
                +  op2(opcode,readlist,var(bytecode),var(pc))
                +  op2(pc,add,var(pc),const(1))
                +  op2(c,eq,var(opcode),const(jump_if_a))
                +  guard_false(c,[],op_jump_if_a)
                +  op2(c,eq,var(opcode),const(mov_a_r0))
                +  guard_false(c,[],op_mov_a_r0)
                +  op2(c,eq,var(opcode),const(mov_a_r1))
                +  guard_false(c,[],op_mov_a_r1)
                +  op2(c,eq,var(opcode),const(mov_a_r2))
                +  guard_false(c,[],op_mov_a_r2)
                +  op2(c,eq,var(opcode),const(mov_r0_a))
                +  guard_true(c,[],not_mov_r0_a)
                +  op1(a,same,var(r0))
                +  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
                +  guard_value(pc,3,[],bytecode_loop_promote_pc)
                +  op2(opcode,readlist,var(bytecode),var(pc))
                +  ...
                +  lots of operations omitted
                +  ...
                +  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop_promote_bytecode)
                +  guard_value(pc,9,[],bytecode_loop_promote_pc)
                +  op2(opcode,readlist,var(bytecode),var(pc))
                +  op2(pc,add,var(pc),const(1))
                +  op2(c,eq,var(opcode),const(jump_if_a))
                +  guard_true(c,[],not_jump_if_a)
                +  op2(c,eq,var(a),const(0))
                +  op2(target,readlist,var(bytecode),var(pc))
                +  op2(pc,add,var(pc),const(1))
                +  guard_false(c,[],bytecode_loop)
                +  loop
                +
                +opttrace
                +  op1(pc,same,var(target))
                +  guard_value(bytecode,[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],[],bytecode_loop)
                +  guard_value(pc,2,[bytecode/[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]],bytecode_loop_promote_pc)
                +  op1(a,same,var(r0))
                +  op2(a,sub,var(a),const(1))
                +  op1(r0,same,var(a))
                +  op1(a,same,var(r2))
                +  op2(a,add,var(a),var(r1))
                +  op1(r2,same,var(a))
                +  op1(a,same,var(r0))
                +  op2(c,eq,var(a),const(0))
                +  guard_false(c,[bytecode/[mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a],pc/11,opcode/jump_if_a,target/2],bytecode_loop)
                +  op1(bytecode,same,const([mov_a_r0,mov_a_r1,mov_r0_a,decr_a,mov_a_r0,mov_r2_a,add_r1_to_a,mov_a_r2,mov_r0_a,jump_if_a,2,mov_r2_a,return_a]))
                +  op1(pc,same,const(11))
                +  op1(opcode,same,const(jump_if_a))
                +  op1(target,same,const(2))
                +  op1(c,same,const(0))
                +  loop
                +
                +256
                +B = [mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a, mov_a_r2, mov_r0_a|...],
                +A = 16,
                +Env = [bytecode/[mov_a_r0, mov_a_r1, mov_r0_a, decr_a, mov_a_r0, mov_r2_a, add_r1_to_a|...], pc/11, a/16, r0/16, r1/16, r2/0, target/2] .
                +
                +

                That looks better. The trace corresponds to the interpreter running all the +bytecodes in the loop of the squaring function in the example bytecode above. +The optimized code starts with +two guards (checking that the bytecode is still the one for the squaring +function, checking that the pc is 2) and then only does the operations +that actually do the computation. No bytecode dispatching is performed, thus the +interpretation overhead is fully removed, apart from the two guard_value +operations at the beginning.

                +

                Many of the assignments in the trace are superfluous, e.g. all the copying back +and forth between registers r0, r1, r2 and accumulator a. This +could be easily solved by an even more intelligent optimization utilizing SSA +form.

                +

                Conclusion About the Interpreter

                +

                Both partial evaluation and meta-tracing can be used to transform the example +bytecode computing a square into a form that shows the essential computation +that is going on, without the interpretation overhead. The naive partial evaluator +produces lots of extra blocks that just jump around, which could be solved with +a post-processing step. The tracer by itself produces uselessly short traces, +but with a simple trick of starting the trace at a different point the results +become a lot better.

                +

                In a real meta-tracing system, the meta-tracer would need a way for the author +of the interpreter +to mark which bytecode corresponds to a backward jump. It would also need better +integration with the interpreter to start tracing automatically, as well as +cache the traces. Additionally, it would have to deal better with guards that fail a +lot, attaching new traces to the failing guards. However, all that is "just" +engineering on top of the ideas presented in this series of blog posts.

                +

                High-Level Conclusion

                +

                Some concluding high-level thoughts about the similarities of tracing and +partial evaluation: Tracing and partial evaluation try to tackle a similar +problem, that of automatically reducing the interpreter overhead; their +approaches are slightly different, though.

                +

                Tracing is very close to normal evaluation, only keeping some extra information +in the process. But then, the optimizer that is used in a tracer +is again very similar in structure to a partial evaluator. The task of the +optimizer is much simpler though, because it does not need to deal with control +flow at all, just a linear list of operations.

                +

                So in a sense tracing is taking those parts of partial evaluation that work (the +"just evaluate those things that you can, and leave the others") and replacing +the parts that don't (controlling unfolding) by a much more pragmatic mechanism. +That mechanism observes actual execution runs of the program to choose control +flow paths that are typical. At the same time, the tracer's focus is on loops, +because they are where most programs spend significant amounts of time.

                +

                Another point of view of tracing is that it is a form of partial evaluation that +replaces the control components of a partial evaluator with an oracle (the +actual execution runs) that provides the information about which paths to look at.

                +

                Already in the quite trivial interpreter here the effects of this are visible. +The simple partial evaluator over-specializes the loop and produces two +versions of it that are identical. The tracer doesn't, and it +also generates only code for the loop itself, not for the initialization +opcodes.

                +

                That's it for this series. To those that made it, thanks for following along. +Also thanks to Samuele and Sven, who consistently gave me good feedback on the +posts before I put them here.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/02/optimizing-traces-of-flow-graph-4169388883059419385.html b/posts/2012/02/optimizing-traces-of-flow-graph-4169388883059419385.html new file mode 100644 index 000000000..a30547e6f --- /dev/null +++ b/posts/2012/02/optimizing-traces-of-flow-graph-4169388883059419385.html @@ -0,0 +1,508 @@ + + + + + +Optimizing Traces of the Flow Graph Language | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Optimizing Traces of the Flow Graph Language

                + + + +
                +

                Part 3 of Comparing Partial Evaluation to Tracing

                +

                This is the third blog post in a series about comparing partial evaluation and +tracing. In the first post of the series I introduced a small flow-graph +language together with an interpreter for it. Then I showed a partial evaluator +for the language. In the second post of the series I showed how a tracer for +the same language works and how it relates to both execution and to partial +evaluation. Then I added support for promotion to that tracer.

                +

                In this post I will show how to optimize the traces that are produced by the +tracer and compare the structure of the optimizer to that of partial +evaluation.

                +

                The code from this post can be found here: https://paste.pocoo.org/show/547304/

                +

                Optimizing Traces

                +

                In the last post we saw how to produce a linear trace with guards by +interpreting a control flow graph program in a special mode. A trace always ends with +a loop statement, which jumps to the beginning. The tracer is just logging +the operations that are done while interpreting, so the trace can contain +superfluous operations. On the other hand, the trace also contains some of the +runtime values through promotions and some decisions made on them which can be +exploited by optimization. An example for this is the trace produced by the +promotion example from the last post:

                +
                op2(c,ge,var(i),const(0),
                +guard_true(c,[],l_done,
                +guard_value(x,5,[],b2,
                +op2(x2,mul,var(x),const(2),
                +op2(x3,add,var(x2),const(1),
                +op2(i,sub,var(i),var(x3),
                +loop))))))
                +
                +

                After the guard_value(x, 5, ...) operation, x is known to be 5: If +it isn't 5, execution falls back to the interpreter. Therefore, operations +on x after the guard can be constant-folded. To do that sort of +constant-folding, +an extra optimization step is needed. That optimization step walks along the +trace, remembers which variables are constants and what their values are using a +partial environment. The optimizer removes operations that have only constant +arguments and leaves the others in the trace. This process is actually +remarkably similar to partial evaluation: Some variables are known to be +constants, operations on only constant arguments are optimized away, the rest +remains.

                +

                The code for optimizing operations looks as follows:

                +
                optimize(op1(ResultVar, Op, Arg, Rest), PEnv, NewOp) :-
                +    presolve(Arg, PEnv, RArg),
                +    (RArg = const(C) ->
                +        do_op(Op, C, Res),
                +        write_env(PEnv, ResultVar, Res, NEnv),
                +        NewOp = RestResidual
                +    ;
                +        remove_env(PEnv, ResultVar, NEnv),
                +        NewOp = op1(ResultVar, Op, RArg, RestResidual)
                +    ),
                +    optimize(Rest, NEnv, RestResidual).
                +
                +optimize(op2(ResultVar, Op, Arg1, Arg2, Rest), PEnv, NewOp) :-
                +    presolve(Arg1, PEnv, RArg1),
                +    presolve(Arg2, PEnv, RArg2),
                +    (RArg1 = const(C1), RArg2 = const(C2) ->
                +        do_op(Op, C1, C2, Res),
                +        write_env(PEnv, ResultVar, Res, NEnv),
                +        NewOp = RestResidual
                +    ;
                +        remove_env(PEnv, ResultVar, NEnv),
                +        NewOp = op2(ResultVar, Op, RArg1, RArg2, RestResidual)
                +    ),
                +    optimize(Rest, NEnv, RestResidual).
                +
                +

                Just like partial evaluation! It even reuses the helper function presolve +from the partial evaluator and a partial environment PEnv. When the +arguments of the operation are known constants in the partial environment, the +operation can be executed at optimization time and removed from the trace. +Otherwise, the operation has to stay in the output trace. The result variable +(as in the partial evaluator) needs to be removed from the partial environment, +because it was just overwritten by an unknown result.

                +

                Now we need to deal with guards in the trace.

                +
                optimize(guard_true(V, [], L, Rest), PEnv, NewOp) :-
                +    plookup(V, PEnv, Val),
                +    (Val = const(C) ->
                +        NewOp = RestResidual
                +    ;
                +        NewOp = guard_true(V, PEnv, L, RestResidual)
                +    ),
                +    optimize(Rest, PEnv, RestResidual).
                +
                +optimize(guard_false(V, [], L, Rest), PEnv, NewOp) :-
                +    plookup(V, PEnv, Val),
                +    (Val = const(C) ->
                +        NewOp = RestResidual,
                +        NEnv = PEnv
                +    ;
                +        write_env(PEnv, V, 0, NEnv),
                +        NewOp = guard_false(V, PEnv, L, RestResidual)
                +    ),
                +    optimize(Rest, NEnv, RestResidual).
                +
                +

                When the variable that is being guarded is actually known to be a constant, we +can remove the guard. Note that it is not possible that the guard of that +constant fails: The tracer recorded the operation while running with real +values, therefore the guards have to succeed for values the optimizer +discovers to be constant.

                +

                guard_false is slightly different from guard_true: after the former we +know that the argument is actually 0. After guard_true we only know that +it is not equal to zero, but not which precise value it has.

                +

                Another point to note in the optimization of guards is that the second argument +of the guard operation, which was so far always just an empty list, is now +replaced by the partial environment PEnv. I will discuss further down why +this is needed.

                +

                Optimizing guard_value is very similar, except that it really gives precise +information about the variable involved:

                +
                optimize(guard_value(V, C, [], L, Rest), PEnv, NewOp) :-
                +    plookup(V, PEnv, Val),
                +    (Val = const(C1) ->
                +        NewOp = RestResidual,
                +        NEnv = PEnv
                +    ;
                +        write_env(PEnv, V, C, NEnv),
                +        NewOp = guard_value(V, C, PEnv, L, RestResidual)
                +    ),
                +    optimize(Rest, NEnv, RestResidual).
                +
                +

                This operation is the main way the optimizer gains constant variables that +it then exploits to do constant-folding on later operations. This is a chief +difference from partial evaluation: There the optimizer knows the value of some +variables from the start. When optimizing traces, at the beginning the value of +no variable is known. Knowledge about some variables is only later gained +through guards.

                +

                Now we are missing what happens with the loop statement. In principle, it is +turned into a loop statement again. However, at the loop statement a few +additional operations need to be emitted. The reason is that we optimized away +operations and thus assignments when the result value of the variable was a +constant. That means the involved variable still potentially has some older +value. The next iteration of the loop would continue with this older value, +which is obviously wrong. Therefore we need to emit some assignments before the +loop statement, one per entry in the partial environment:

                +
                optimize(loop, PEnv, T) :-
                +    generate_assignments(PEnv, T).
                +
                +generate_assignments([], loop).
                +generate_assignments([Var/Val | Tail], op1(Var, same, const(Val), T)) :-
                +    generate_assignments(Tail, T).
                +
                +

                As an example of how generate_assignments works, let's look at +the following case. When the partial environment is [x/5, y/10], the +following assignments are generated:

                +
                ?- generate_assignments([x/5, y/10], Out).
                +Out = op1(x, same, const(5), op1(y, same, const(10), loop)).
                +
                +

                That's all the code of the optimizer. While the basic structure is quite similar to partial evaluation, +it's a lot less complex as well. What made the partial evaluator hard was that +it needs to deal with control flow statements and with making sure that code is +reused if the same block is partially evaluated with the same constants. Here, +all these complexities go away. The tracer has already removed all control flow +and replaced it with guards and one loop operation at the end. Thus, the +optimizer can simply do one pass over the operations, removing some (with some +extra care around the loop statement).

                +

                With this machinery in place, we can optimize the trace from the promotion +example of the last post:

                +
                ?- optimize(
                +    guard_value(x,3,[],b2,
                +    op2(x2,mul,var(x),const(2),
                +    op2(x3,add,var(x2),const(1),
                +    op2(i,sub,var(i),var(x3),
                +    op2(c,ge,var(i),const(0),
                +    guard_true(c,[],l_done, loop)))))),
                +    [],
                +    LoopOut).
                +LoopOut = guard_value(x, 3, [], b2, op2(i, sub, var(i), const(7), op2(c, ge, var(i), const(0), guard_true(c, [x/3, x2/6, x3/7], l_done, op1(x, same, const(3), op1(x2, same, const(6), op1(x3, same, const(7), loop)))))))
                +
                +

                More readably, the optimized version is:

                +
                guard_value(x, 3, [], b2,
                +op2(i, sub, var(i), const(7),
                +op2(c, ge, var(i), const(0),
                +guard_true(c, [x/3, x2/6, x3/7], l_done,
                +op1(x, same, const(3),
                +op1(x2, same, const(6),
                +op1(x3, same, const(7),
                +loop)))))))
                +
                +

                As intended, the operations on x after the guard_value have all been +removed. However, some additional assignments (to x, x2, x3) at the end have been generated as +well. The assignments look superfluous, but the optimizer does not have +enough information to easily recognize this. That can be fixed, but only at the +cost of additional complexity. (A real system would transform the trace into +static single assignment form to answer such questions.)

                +

                Resuming to the Interpreter

                +

                Why does the code above need to add the partial environment to +the guards that cannot be optimized away? The reason is related to why we needed +to generate assignments before the loop statement. The problem is that the optimizer +removes assignments to variables when it knows the values of these variables. +That means that when switching back from running the optimized trace to the +interpreter, a number of variables are not updated in the environment, making +the execution in the interpreter incorrect.

                +

                In the example above, this applies to the variables x2 and x3. When the +second guard fails, they have not been assigned in the optimized case. +Therefore, the guard lists them and their (always constant) values.

                +

                When switching back these assignments need to be made. Thus we need to adapt the +resume_interp function from the last blog post as follows:

                +
                write_resumevars([], Env, Env).
                +write_resumevars([Key / Value | Rest], Env, NEnv) :-
                +    write_env(Env, Key, Value, Env1),
                +    write_resumevars(Rest, Env1, NEnv).
                +
                +resume_interp(Env, ResumeVars, L) :-
                +    write_resumevars(ResumeVars, Env, NEnv),
                +    block(L, Block),
                +    interp(Block, NEnv).
                +
                +

                On resuming, the ResumeVars (a former partial environment) are simply added +back to the normal environment before going back to the interpreter.

                +

                The data attached to guards about what needs to be done to resume to the +interpreter when the guard fails is often a very complex part of a tracing +system. The data can become big, yet most guards never fail. Therefore, most +real systems try hard to compress the attached data or try to share it between +subsequent guards.

                +

                Summary

                +

                In this post we have shown how to optimize traces by applying a variant of the +partial evaluation principle: Perform all the operations that have only constant +arguments, leave the others alone. However, optimizing traces is much simpler, +because no control flow is involved. All the questions about control flow have +already been solved by the tracing component.

                +

                In the next and final post of the series I will show a larger example of how +tracing and partial evaluation can be used to optimize a small bytecode +interpreter.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/02/py3k-status-update-8840622949715145821.html b/posts/2012/02/py3k-status-update-8840622949715145821.html new file mode 100644 index 000000000..9781eb413 --- /dev/null +++ b/posts/2012/02/py3k-status-update-8840622949715145821.html @@ -0,0 +1,364 @@ + + + + + +Py3k status update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update

                + + + +
                +
                Thanks to all the people who donated to the py3k proposal, we managed to collect enough money to start to work on the first step. This is a quick summary of what I did since I began working on this.
                +First of all, many thanks to Amaury Forgeot d'Arc, who started the py3k branch months ago, and already implemented lots of features including e.g. switching to "unicode everywhere" and the int/long unification, making my job considerably easier :-)
                +I started to work on the branch at the last Leysin sprint together with Romain Guillebert, where we worked on various syntactical changes such as extended tuple unpacking and keyword-only arguments. Working on such features is a good way to learn about a lot of the layers which the PyPy Python interpreter is composed of, because often you have to touch the tokenizer, the parser, the ast builder, the compiler and finally the interpreter.
                +Then I worked on improving our test machinery in various ways, e.g. by optimizing the initialization phase of the object space created by tests, which considerably speeds up small test runs, and adding the possibility to automatically run our tests against CPython 3, to ensure that we are not trying to fix a test which is meant to fail :-). I also set up our buildbot to run the py3k tests nightly, so that we can have an up-to-date overview of what is left to do.
                +Finally I started to look at all the tests in the interpreter/ directory, trying to unmangle the mess of failing tests. Lots of tests were failing because of simple syntax errors (e.g., by using the no longer valid except Exception, e syntax or the old print statement), others for slightly more complex reasons like unicode vs bytes or the now gone int/long distinction. Others were failing simply because they relied on new features, such as the new lexical exception handlers.
                +To give some numbers, at some point in January we had 1621 failing tests in the branch, while today we are under 1000 (to be exact: 999, and this is why I've waited until today to post the status update :-)).
                +Before ending this blog post, I would like to thank once again all the people who donated to PyPy, who let me do this wonderful job. That's all for now, I'll post more updates soon.
                +cheers, Antonio
                +
                +

                Comments

                +
                +
                +
                + + Piotr Husiatyński wrote on 2012-02-16 16:12: +
                +
                +

                Will the py3 branch be finally merged into main branch and the pypy will be albe to run both 2.x and 3.x depending on the boot switch or the 2.x support will be dropped or the will be no merge?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2012-02-16 16:23: +
                +
                +

                @Piotr: the work in the py3k branch is destroying some of the python2 semantics, so we won't merge the twos as long as we support python2 (and we'll do it for a long time, because pypy itself is written in python2).
                The current plan is just to keep the development in parallel, and regularly merge "default" into "py3k".

                +
                +
                +
                +
                + + Alberto Berti wrote on 2012-02-16 18:24: +
                +
                +

                Ciao Antonio,

                very good news, i'm glad that my little monetary contribution allowed you to be paid to work on that. Keep us posted!

                Cheers,

                Alberto

                +
                +
                +
                +
                + + Seb wrote on 2012-02-16 23:07: +
                +
                +

                So, it has to be asked: any plan of rewriting PyPy in python 3? :D

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2012-02-16 23:30: +
                +
                +

                @Alberto: thank you very much :-)

                @Seb: not in the short/middle term (and it's unclear whether we want to go there at all)

                +
                +
                +
                +
                + + Echo wrote on 2012-02-17 10:21: +
                +
                +

                Good luck preventing the python2 and python3 branches from convergng; it's what merges do, and the alternative is a lot of cherry-picking. A lot of codebases use feature-flipping instead.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2012-02-17 10:30: +
                +
                +

                @Echo: no, the plan is to regularly merge "default" (i.e. python2) into "py3k". Note that in "default" we are mostly developing other parts than the Python interpreter (e.g., the JIT compiler), so this should not be too much of a problem, although it will be annoying sometimes

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/02/pypy-18-business-as-usual-7266036404915945090.html b/posts/2012/02/pypy-18-business-as-usual-7266036404915945090.html new file mode 100644 index 000000000..59a43b319 --- /dev/null +++ b/posts/2012/02/pypy-18-business-as-usual-7266036404915945090.html @@ -0,0 +1,444 @@ + + + + + +PyPy 1.8 - business as usual | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.8 - business as usual

                + + + +
                +

                We're pleased to announce the 1.8 release of PyPy. As usual, this +release brings a lot of bugfixes, together with performance and memory +improvements over the 1.7 release. The main highlight of the release +is the introduction of list strategies which makes homogeneous lists +more efficient both in terms of performance and memory. This release +also upgrades us from Python 2.7.1 compatibility to 2.7.2. Otherwise +it's "business as usual" in the sense that performance improved +roughly 10% on average since the previous release.

                +

                you can download the PyPy 1.8 release here:

                +
                +https://pypy.org/download.html +
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 1.8 and cpython 2.7.1 performance comparison) +due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 32/64 or +Windows 32. Windows 64 work has been stalled, we would welcome a volunteer +to handle that.

                +
                +
                +

                Highlights

                +
                  +
                • +

                  List strategies. Now lists that contain only ints or only floats should +be as efficient as storing them in a binary-packed array. It also improves +the JIT performance in places that use such lists. There are also special +strategies for unicode and string lists.

                  +
                • +
                • +

                  As usual, numerous performance improvements. There are many examples +of python constructs that now should be faster; too many to list them.

                  +
                • +
                • +

                  Bugfixes and compatibility fixes with CPython.

                  +
                • +
                • +

                  Windows fixes.

                  +
                • +
                • +

                  NumPy effort progress; for the exact list of things that have been done, +consult the numpy status page. A tentative list of things that have +been done:

                  +
                    +
                  • multi dimensional arrays
                  • +
                  • various sizes of dtypes
                  • +
                  • a lot of ufuncs
                  • +
                  • a lot of other minor changes
                  • +
                  +

                  Right now the numpy module is available under both numpy and numpypy +names. However, because it's incomplete, you have to import numpypy first +before doing any imports from numpy.

                  +
                • +
                • +

                  New JIT hooks that allow you to hook into the JIT process from your python +program. There is a brief overview of what they offer.

                  +
                • +
                • +

                  Standard library upgrade from 2.7.1 to 2.7.2.

                  +
                • +
                +
                +
                +

                Ongoing work

                +

                As usual, there is quite a bit of ongoing work that either didn't make it to +the release or is not ready yet. Highlights include:

                +
                  +
                • Non-x86 backends for the JIT: ARMv7 (almost ready) and PPC64 (in progress)
                • +
                • Specialized type instances - allocate instances as efficient as C structs, +including type specialization
                • +
                • More numpy work
                • +
                • Since the last release there was a significant breakthrough in PyPy's +fundraising. We now have enough funds to work on the first stages of numpypy +and py3k. We would like to thank once again everyone who donated.
                • +
                • It's also probably worth noting, we're considering donations for the +Software Transactional Memory project. You can read more about our plans +
                • +
                +

                Cheers,
                +The PyPy Team

                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-02-10 11:08: +
                +
                +

                As usual, excellent work!
                The faster Pypy becomes, the less the need to use limited languages just for speed considerations.
                List specialization is really cool and seems to boost performance and reduce memory usage considerably. I'd love seeing specializations for tuples of ints/floats/strings as structs.
                On a side note, what stops people from using RPython as a compiled language (in terms of speed) with a nicer syntax?

                +
                +
                +
                +
                + + Daivd wrote on 2012-02-10 12:57: +
                +
                +

                Well done!

                I find nothing on the comparison page about memory (maybe because it's called speed.pypy.org...). How are you stacking up against CPython there, on benchmarks and real word examples? I realize a JIT will always need some memory overhead, but perhaps you have done enough clever things now, like list strategies, to be competitive anyway?

                +
                +
                +
                +
                + + halfaleague wrote on 2012-02-10 14:03: +
                +
                +

                I would donate to this.
                Would this give us 'true' multithreading? a la clojure?

                +
                +
                +
                +
                + + Unknown wrote on 2012-02-10 17:45: +
                +
                +

                Seems like you guys are ahead of the curve with STM: https://arstechnica.com/business/news/2012/02/transactional-memory-going-mainstream-with-intel-haswell.ars

                +
                +
                +
                +
                + + kurdakov wrote on 2012-02-12 11:29: +
                +
                +

                Did anybody test if it works with
                pypy?

                https://github.com/mvantellingen/psycopg2-ctypes

                would be great to have out of the box postgresql support for Django

                +
                +
                +
                +
                + + Joko Susilo wrote on 2012-02-13 11:02: +
                +
                +

                i will try it first

                +
                +
                +
                +
                + + Anonymous wrote on 2012-02-13 14:23: +
                +
                +

                Just donated for py3k. I think it would make sense to allow donations for STM as well.

                +
                +
                +
                +
                + + One Wellness Place wrote on 2012-04-20 10:44: +
                +
                +

                I will try it.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/03/call-for-donations-for-software-8853699867109654713.html b/posts/2012/03/call-for-donations-for-software-8853699867109654713.html new file mode 100644 index 000000000..4f879526a --- /dev/null +++ b/posts/2012/03/call-for-donations-for-software-8853699867109654713.html @@ -0,0 +1,407 @@ + + + + + +Call for donations for Software Transactional Memory | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Call for donations for Software Transactional Memory

                + + + +
                +

                Hi all,

                + +

                The Software Transactional Memory +call for donations is up. From the proposal:

                + + + + + +
                +Previous attempts on Hardware Transactional Memory focused on parallelizing existing programs written using the thread or threading modules. However, as argued here, this may not be the most practical way to achieve real multithreading; it seems that better alternatives would offer good scalability too. Notably, Transactional Memory could benefit any event-based system that is written to dispatch events serially (Twisted-based, most GUI toolkit, Stackless, gevent, and so on). The events would internally be processed in parallel, while maintaining the illusion of serial execution, with all the corresponding benefits of safety. This should be possible with minimal changes to the event dispatchers. This approach has been described by the Automatic Mutual Exclusion work at Microsoft Research, but not been implemented anywhere (to the best of our knowledge). +

                +Note that, yes, this gives you both sides of the coin: you keep using your non-thread-based program (without worrying about locks and their drawbacks like deadlocks, races, and friends), and your programs benefit from all your cores. +

                +In more details, a low-level built-in module will provide the basics to start transactions in parallel; but this module will be only used internally in a tweaked version of, say, a Twisted reactor. Using this reactor will be enough for your existing Twisted-based programs to actually run on multiple cores. You, as a developer of the Twisted-based program, have only to care about improving the parallelizability of your program (e.g. by splitting time-consuming transactions into several parts; the exact rules will be published in detail once they are known). +
                +

                The point is that your program is always correct, and can be tweaked to improve performance. This is the opposite of what explicit threads and locks give you, which is a performant program which you need to tweak to remove bugs. Arguably, this approach is the reason why you use Python in the first place :-)

                + +

                Armin

                +
                +

                Comments

                +
                +
                +
                + + Konstantine Rybnikov wrote on 2012-03-08 21:13: +
                +
                +

                Great news, really looking into experimenting with that, good luck!

                My question is: will it map to os thread being created on each event dispatch or can it potentially be somehow optimized? I mean, you can potentially end up with code that has tons of small events, and creating os thread on each event would slow down your program.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-03-08 23:22: +
                +
                +

                @k_bx it's not like that at all. There are links in the proposal that may enlighten, depending on what you already know.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-03-09 01:49: +
                +
                +

                Indeed, it is creating a pool of N threads and reusing them, where N is configurable. Ideally it should default to the number of cores you have, detected in some (sadly non-portable) way.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-03-09 09:06: +
                +
                +

                Are any of you affiliated with a university? Since this is research, maybe you can get a grant for a post-doc or a PhD position.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-03-09 10:03: +
                +
                +

                Trivial comment - on the donation page in the "What is Transactional Memory?" section, I think a (TM) has been turned into a superscript TM (as in trademark).

                +
                +
                +
                +
                + + Steve Phillips wrote on 2012-03-10 00:50: +
                +
                +

                This sounds exciting for the kinds of Python programs that would benefit from TM, but can anyone give a ballpark estimate of what percentage of programs that might be?

                Personally, I write various (non-evented) Python scripts (replacements for Bash scripts, IRC bot, etc) and do a lot of Django web dev. It's not clear that I or similar people would benefit from Transactional Memory.

                Is that correct?

                +
                +
                +
                +
                + + Anonymous wrote on 2012-03-10 01:23: +
                +
                +

                Could u update the donation page? It doesn't seem to be tallying the amounts.

                I am really excited to see this work even if it is pure research (I donated $200). It would be awesome if

                stm:
                ....pre:
                ........# init transaction state
                ....trans:
                ........# parallel stuff

                So it would be easy to retry failed transactions or be able to reorder them for contention or perf.

                +
                +
                +
                +
                + + kurdakov wrote on 2012-03-17 17:07: +
                +
                +

                offtopic:

                there is a project to help bring C# and C++ together

                https://github.com/mono/cxxi
                and fork https://github.com/kthompson/cxxi

                in essence: there is a generation step which allows then to easily use C++ objects in C# and vice versa.

                considering that ctypes are very much like p/invoke, it looks like pypy might have something similar for python/C++ environments , this might allow much easier to port, for example, Blender to use pypy as scripting language.

                +
                +
                +
                +
                + + Arne Babenhauserheide wrote on 2012-03-22 14:08: +
                +
                +

                Could you post an example snippet of code which would benefit from that?

                I ask because I have trouble really imagining example code.

                Something minimal with the least possible amount of extension modules which I could just throw into the pypy and pypy-tm interpreter and see the difference.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-04-02 15:12: +
                +
                +

                I wrote a minimal example here:

                https://foss.heptapod.net/pypy/pypy/-/tree/branch//stm-gc/lib_pypy/transaction.py

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/03/py3k-status-update-2-4018939509128176130.html b/posts/2012/03/py3k-status-update-2-4018939509128176130.html new file mode 100644 index 000000000..78d70418d --- /dev/null +++ b/posts/2012/03/py3k-status-update-2-4018939509128176130.html @@ -0,0 +1,348 @@ + + + + + +Py3k status update #2 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #2

                + + + +
                +

                This is the second status update about my work on the py3k branch, which I can work on thanks to all of the people who donated to the py3k proposal.

                +

                Since my previous status update, things have improved a lot: first of all, I fixed the syntax of many more tests, which were failing on the branch because they used constructs which are no longer valid in Python 3, such as u'' strings, the print statement or the old except Exception, e syntax. I have to say that this work is tedious and not very rewarding, but it has to be done anyway, so that the real failures can stand out.

                +

                Then, I spent most of the rest of the time killing features which are present in Python 2 and are gone in Python 3.

                +

                Some of them were easy and mechanical: for example, I removed all the function attributes such as func_code and func_closure, which have been renamed to __code__ and __closure__, and then I had to find and fix all the places which still expected the old ones.

                +

                Some were trickier: I removed support for the cmp function and the __cmp__ special method, but this also meant that I had to fix a few types which relied on it to be comparable (for example, did you know that the cells contained in __closure__ are comparable?). At the same time, I also removed the old behavior which in Python 2 allows us to compare arbitrary objects with <, > & co.: in Python 3 the only comparisons allowed between incompatible types are == and !=.

                +

                Speaking of old special methods, __hex__ and __oct__ are gone as well (and I didn't even know about their existence before removing them :-))

                +

                But the most important breakthrough was the removal of the _file module, containing the implementation of the file type in Python 2, which is now gone since in Python 3 files are handled by the _io module. Killing the module was not straightforward, because some of the importing logic was tightly tied to the internal implementation of files, so it needed some refactoring. Finally, I had to fix the marshal module to correctly detect text files vs. byte files.

                +

                Besides these things, I fixed tons of smaller issues here and there. As a result, there are many fewer failing tests than a few weeks ago. Obviously the number itself does not mean much, because sometimes fixing a single test takes hours, and some other times by changing one line one fixes tens of tests. But in the end, seeing it drop from 999 to 650 is always nice and rewarding :-).

                +

                The road for having a pypy3k is still long, but everything is going fine so far. Stay tuned for more updates!

                +

                cheers, Antonio

                +
                +
                +

                Comments

                +
                +
                +
                + + Larry Hastings wrote on 2012-03-02 01:17: +
                +
                +

                You might consider leaving the u prefix in--PEP 414 puts it back, and it just got accepted.

                https://www.python.org/dev/peps/pep-0414/

                +
                +
                +
                +
                + + Unknown wrote on 2012-03-02 05:57: +
                +
                +

                It's cleaner to flush them out - the forward porting effort is targeting 3.2, so the stdlib should respect that. (i.e. if it runs on PyPy by default, it should run on CPython 3.2 as well).

                Otherwise we'll end up with confusing cases of "this runs on 3.2 in PyPy, but CPython reports a syntax error"

                +
                +
                +
                +
                + + Unknown wrote on 2012-03-02 05:59: +
                +
                +

                For importing support, you may want to look at the current state of the 3.3 importlib implementation. Brett's on the verge of hooking that up as CPython's native import system - it should be possible for PyPy3k to use it as well.

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2012-03-02 08:28: +
                +
                +

                @Larry: yes, I've considered that, but as Nick says we are targeting 3.2, so it's much easier to just kill it in the meantime. Adding it back will be very easy.

                @Nick: yes, using importlib is something which I also have considered. However, I'd prefer not to diverge too much from the "default" branch (because we are going to regularly merge default into py3k for a long time). I suppose that as long as the current importing logic works fine, we'll keep it :-)

                +
                +
                +
                +
                + + shaurz wrote on 2012-03-03 15:01: +
                +
                +

                The u"" syntax is coming back.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/04/numpy-on-pypy-progress-report-6048076549081013253.html b/posts/2012/04/numpy-on-pypy-progress-report-6048076549081013253.html new file mode 100644 index 000000000..19119fc11 --- /dev/null +++ b/posts/2012/04/numpy-on-pypy-progress-report-6048076549081013253.html @@ -0,0 +1,590 @@ + + + + + +NumPy on PyPy progress report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy on PyPy progress report

                + + + +
                +

                Hello.

                +

                A lot of things happened in March, like pycon. I was also busy doing other +things (pictured), so apologies for the late numpy status update.

                +

                However, a lot of things have happened and numpy continues to be one of the +main points of entry for hacking on PyPy. Apologies to all the people whose +patches I don't review in a timely manner, but seriously, you do a lot of +work.

                +

                This list of changes is definitely not exhaustive, and I might be forgetting +important contributions. In a loose order:

                +
                  +
                • +

                  Matti Picus made out parameter work for a lot of (but not all) +functions.

                  +
                • +
                • +

                  We merged record dtypes support. The only missing dtypes left are complex +(important), datetime (less important) and object (which will probably +never be implemented because it makes very little sense and is a mess with moving GCs).

                  +
                • +
                • +

                  Taavi Burns and others implemented lots of details, including lots of ufuncs. +On the completely unscientific measure of "implemented functions" on the +numpypy status page, we're close to 50% of numpy working. In reality +it might be more or less, but after complex dtypes we're getting very close +to running real programs.

                  +
                • +
                • +

                  Bool indexing of arrays of the same size should work, leaving only +arrays-of-ints indexing as the last missing element of fancy indexing.

                  +
                • +
                • +

                  I did some very early experiments on SSE. This work is seriously +preliminary - in fact the only implemented operation is addition of +float single-dimension numpy arrays. However, results are encouraging, +given that our assembler generator is far from ideal:

                  + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    +

                  Numpy

                  +
                  +

                  PyPy SSE

                  +
                  +

                  PyPy

                  +
                  +

                  GCC non-looped

                  +
                  +

                  GCC looped

                  +
                  +

                  a+b

                  +
                  +

                  0.6s

                  +
                  +

                  0.3s

                  +
                  +

                  0.4s

                  +
                  +

                  0.3s

                  +
                  +

                  0.25s

                  +
                  +

                  a+b+c

                  +
                  +

                  1.9s

                  +
                  +

                  0.35s

                  +
                  +

                  0.5s

                  +
                  +

                  0.7s

                  +
                  +

                  0.32s

                  +
                  +

                  a+b+c+d+e

                  +
                  +

                  3.2s

                  +
                  +

                  0.36s

                  +
                  +

                  0.8s

                  +
                  +

                  1.7s

                  +
                  +

                  0.51s

                  +
                  +

                  The benchmark repo is available. GCC was run with -O3, no further +options specified. PyPy was run with default options, the SSE branch is under +backend-vector-ops, but it's not working completely yet.

                  +

                  One might argue that C and Python are not the same code - indeed they are not. +It just shows a possible approach to writing numeric code.

                  +
                • +
                +

                Next step would be to just continue implementing missing features such as

                +
                  +
                • specialised arrays i.e. masked arrays and matrices
                • +
                • core modules such as fft, linalg, random.
                • +
                • numpy's testing framework
                • +
                +

                The future is hard to predict, but we're not far off!

                +

                Cheers,
                fijal

                + +

                UPDATE: Indeed, string and unicode dtypes are not supported yet. They're as important as the complex dtype.

                +
                +

                Comments

                +
                +
                +
                + + Jeff Terrace wrote on 2012-04-17 18:53: +
                +
                +

                I think the string dtype is missing too?

                +
                +
                +
                +
                + + Anonymous wrote on 2012-04-17 19:57: +
                +
                +

                Hello,

                May you get a bit more precise on the GCC test ?

                For instance, is the GCC code using SSE too ? Is it written in a single loop (x[i] = a[i] + b[i] + c[i]) or in several consecutive loops first a+b then (a+b) + c ?

                Just to know :-)

                +
                +
                +
                +
                + + Winston Ewert wrote on 2012-04-17 20:03: +
                +
                +

                One thing I'll note is that I do from time to time use the object dtype. Occasionally, I've got multidimensional arrays of objects, and the array operations from numpy are useful. I don't really get a speed advantage there, but the interface from numpy is useful. But its not super necessary and certainly not a priority.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-04-17 20:04: +
                +
                +

                Sorry, didn't RTFA completely. I just had a look at the C code.

                Still, a question: is PyPy doing the optimization of combining operations in one step ?

                A "good" Fortran compiler should be able to do those optimizations, for instance.

                +
                +
                +
                +
                + + Gaël wrote on 2012-04-17 21:17: +
                +
                +

                You should compare to numpy with a JIT, such as numexpr, it would be interesting to see whether PyPy is able to beat the numexpr JIT.

                +
                +
                +
                +
                + + x wrote on 2012-04-17 22:55: +
                +
                +

                Very cool!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-04-18 10:07: +
                +
                +

                "busy doing other things (pictured)". Pictured where? :-)

                +
                +
                +
                +
                + + Ralf Gommers wrote on 2012-04-18 20:45: +
                +
                +

                Hi, Numpy masked arrays, matrices and the testing framework are pure Python, so why do you need to implement them?

                +
                +
                +
                +
                + + Alex wrote on 2012-04-18 22:20: +
                +
                +

                Ralf, we don't have to implement the pure-python stuff, so much as we need to make sure the features of NumPy's core that they depend on are implemented.

                +
                +
                +
                +
                + + EOL (Eric O LEBIGOT) wrote on 2012-04-19 10:17: +
                +
                +

                Support for objects is actually quite useful: please reconsider adding it.

                Here is a very useful case: the manipulation of arrays of numbers with uncertainties (special uncertainties.UFloat objects). Numbers with uncertainties behave very much like regular numbers: it is very useful to be able to use the regular NumPy syntax for array operations, for calculating matrix inverses when the matrices contain number with uncertainties, etc. I know many people use these features.

                It would be *great* (read: irreplaceable :) to have support for the object NumPy dtype.

                +
                +
                +
                +
                + + Unknown wrote on 2012-04-19 13:47: +
                +
                +

                This sounds really cool!

                And it would be awesome if you’d manage to coordinate with numpy, so the projects merge to a single python codebase with two separate backends: One C-Extension based for CPython and one Pure-Python based for pypy.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-04-19 17:19: +
                +
                +

                Any chance comparing with Fortran? There are assumptions about pointers and alignment that Fortran compiler can make.

                +
                +
                +
                +
                + + Unknown wrote on 2012-04-20 15:26: +
                +
                +

                Nice...but what is the next step?
                Numpy alone is not that useful.

                "We" need at least scipy and matplotlib.

                Are you going to port all these modules? I don't think so.

                One way forward could be to have numpy in pypy and at least scipy and matplotlib working with the pypy C api at a decent speed.

                What do you think?

                +
                +
                +
                +
                + + Anonymous wrote on 2012-04-22 20:07: +
                +
                +

                What about pickling? I'd love to experiment with hybrid CPython/PyPy execution using some magic from the multiprocessing module or a similar parallel computation framework.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-07-30 21:02: +
                +
                +

                Hello,

                This is a very promising result, thank you for sharing it.
                Could you give a few more details about the differences wrt to numpy?

                What would people have to do to use numpypy with scipy?

                +
                +
                +
                +
                + + Raul Durand wrote on 2012-08-06 16:52: +
                +
                +

                I think the numpy.linalg module is pretty important.
                How to move efforts into this?

                +
                +
                +
                +
                + + Raul Durand wrote on 2012-08-06 16:53: +
                +
                +

                I think the numpy.linalg module is pretty important.
                How to move efforts into this?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/04/py3k-status-update-3-6975588144646689872.html b/posts/2012/04/py3k-status-update-3-6975588144646689872.html new file mode 100644 index 000000000..0000804d0 --- /dev/null +++ b/posts/2012/04/py3k-status-update-3-6975588144646689872.html @@ -0,0 +1,342 @@ + + + + + +Py3k status update #3 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #3

                + + + +
                +
                This is the third status update about my work on the py3k branch, which I can work on thanks to all of the people who donated to the py3k proposal.

                +A lot of work has been done during the last month: as usual, the list of changes is too big to be reported in a detailed way, so this is just a summary of what happened.

                +One of the most active areas was killing old and deprecated features. In particular, we killed support for the __cmp__ special method and its cousins, the cmp builtin function and keyword argument for list.sort() and sorted(). Killing is easy, but then you have to fix all the places which break because of this, including all the types which relied on __cmp__ to be comparable, fixing all the tests which tried to order objects which are no longer orderable now, or implementing new behavior like forbidding calling hash() on objects which implement __eq__ but not __hash__.

                +Among the other features, we killed lots of now-gone functions in the operator module, the builtins apply(), reduce() and buffer, and the os.* functions to deal with temporary files, which have been deprecated in favour of the new tempfile module.

                +The other topic which can't be missing from a py3k status update is, as usual, string-vs-unicode. In this round, we fixed bugs in string formatting (in particular to teach format() to always use unicode strings) and various corner cases about when to call the (possibly overridden) __str__ method on subclasses of str. Believe me, you don't want to know the precise rules :-).

                +Other features which we worked on and fixed tests for include, but are not limited to, marshal, hashlib, zipimport, _socket and itertools, plus the habitual endless lists of tests which fail for shallow reasons such as the syntactic differences, int vs long, range() vs list(range()) etc. As a result, the number of failing tests dropped from 650 to 235: we are beginning to see the light at the end of the tunnel :-)

                +Benjamin finished implementing Python 3 syntax. Most of it was small cleanups and tweaks to be compatible with CPython such as making True and False keywords and preventing . . . (note spaces between dots) from being parsed as Ellipsis. Larger syntax additions included keyword only arguments and function annotations.

                +Finally, we did some RPython fixes, so that it is possible again to translate PyPy in the py3k branch. However, the resulting binary is a strange beast which mixes python 2 and python 3 semantics, so it is unusable for anything but showing friends how cool it is.

                +I would like to underline that I was not alone in doing all this work. In particular, a lot of people joined the PyPy sprint at Pycon and worked on the branch, as you can clearly see in this activity graph. I would like to thank all who helped!
                +
                +cheers,
                +Antonio and Benjamin
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2012-04-11 11:19: +
                +
                +

                Very cool work!

                Thanks for the update! I‘ll need to see if I can already let it hit my own python3 project (I had to convert that to python2.x to make it run with pypy, being able to get rid of that step would be really cool!)

                Do you already have prebuilt binaries of pypy3?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2012-04-14 11:21: +
                +
                +

                I don't think that there is any chance that a python3 project will run as of now, there are still tons of features missing. So far my job as mostly been to fix all the failing tests in the PyPy testsuite. When I'll have finished, I'll be able to start with new features.

                And, for the same reason: no prebuilt binaries yet, sorry.

                +
                +
                +
                +
                + + Unknown wrote on 2012-04-19 13:41: +
                +
                +

                OK, thanks for the info!

                I’m anxious to test it, once you give it a chance to run simple unicode-using code!

                +
                +
                +
                +
                + + Anonymous wrote on 2012-05-30 20:01: +
                +
                +

                Pocoo's pastebin has unfortunately permanently shut down. Any chance you could repaste how cool it is somewhere else?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/04/pycon-2012-wrap-up-559575896040055505.html b/posts/2012/04/pycon-2012-wrap-up-559575896040055505.html new file mode 100644 index 000000000..70cd84acc --- /dev/null +++ b/posts/2012/04/pycon-2012-wrap-up-559575896040055505.html @@ -0,0 +1,332 @@ + + + + + +PyCon 2012 wrap up | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyCon 2012 wrap up

                + + + +
                +

                So, PyCon happened. This was the biggest PyCon ever and probably the biggest +gathering of Python hackers ever.

                +

                From the PyPy perspective, a lot at PyCon was about PyPy. Listing things:

                +
                  +
                • David Beazley presented an excellent keynote describing his experience +diving head-first into PyPy and at least partly failing. He, however, did +not fail to explain bits and pieces about PyPy's architecture. +Video is available.
                • +
                • We gave tons of talks, including the tutorial, why pypy by example +and pypy's JIT architecture +
                • +
                • We had a giant influx of new committers, easily doubling the number of pull +requests ever created for PyPy. The main topics for newcomers were numpy and +py3k, disproving what David said about PyPy being too hard to dive into ;)
                • +
                • Guido argued in his keynote that Python is not too slow. In the meantime, +we're trying to prove him correct :-)
                • +
                +

                We would like to thank everyone who talked to us, shared ideas and especially +those who participated in sprints - we're always happy to welcome newcomers!

                +

                I'm sure there are tons of things I forgot, but thank you all!

                +

                Cheers, +fijal

                +
                +

                Comments

                +
                +
                +
                + + Dave Beazley wrote on 2012-04-14 00:16: +
                +
                +

                I'm so happy to be proven wrong!

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-04-14 09:36: +
                +
                +

                I think "proven" is a bit strong word, we're trying though :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/04/pypy-sprint-in-leipzig-june-22-27-6450601012927549960.html b/posts/2012/04/pypy-sprint-in-leipzig-june-22-27-6450601012927549960.html new file mode 100644 index 000000000..664a7eeb9 --- /dev/null +++ b/posts/2012/04/pypy-sprint-in-leipzig-june-22-27-6450601012927549960.html @@ -0,0 +1,319 @@ + + + + + +PyPy sprint in Leipzig, Germany (June 22-27) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy sprint in Leipzig, Germany (June 22-27)

                + + + +
                +

                The next PyPy sprint will be held --- for the first time in a while --- +in a place where we haven't been so far: Leipzig, Germany, at the +Python Academy's Teaching Center. It will take place from the 22nd +to the 27th of June 2012, before EuroPython. Thanks to Mike Müller for +organizing it!

                +

                This is a fully public sprint; everyone is welcome to join us. All days are +full sprint days, so it is recommended to arrive on the 21st and leave on the 28th.

                +

                Topics and goals

                +

                Open. Here are some goals:

                +
                  +
                • numpy: progress towards completing the numpypy module; try to +use it in real code
                • +
                • stm: progress on Transactional Memory; try out the transaction module on real code.
                • +
                • jit optimizations: there are a number of optimizations we can still +try out or refactor.
                • +
                • work on various, more efficient data structures for the Python language. +A good example would be lazy string slicing/concatenation or more efficient +objects.
                • +
                • any other PyPy-related topic is fine too.
                • +
                +

                Grants

                +

                For students, we have the possibility to support some costs via PyPy +funds. Additionally, we can support you applying for grants from the +PSF and other sources.

                +

                Registration

                +

                If you'd like to come, please sign up either by announcing yourself on +pypy-dev, or by directly adding yourself to the list of people. +(We need to have a head count for the organization.) If you are new to +the project please drop a note about your interests and post any +questions.

                +

                More...

                +

                For more information, please see the sprint announcement.

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/04/stm-update-and-thanks-everybody-6071745734932940294.html b/posts/2012/04/stm-update-and-thanks-everybody-6071745734932940294.html new file mode 100644 index 000000000..35f689e88 --- /dev/null +++ b/posts/2012/04/stm-update-and-thanks-everybody-6071745734932940294.html @@ -0,0 +1,400 @@ + + + + + +STM update (and thanks everybody) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                STM update (and thanks everybody)

                + + + +
                +

                A short update on the Software Transactional Memory (STM) side. Let me remind you that the work is to add STM internally into PyPy, with the goal of letting the user's programs run on multiple cores after a minor adaptation. (The goal is not to expose STM to the user's program.) I will soon write some official documentation that explains in more detail exactly what you get. For now you can read the previous blog posts, and you can also find technical details in the call for donation itself; or directly look at how I adapted the examples linked to later in this post.

                +

                I have now reached the point where the basics seem to work. There is no integration with the JIT so far; moreover the integration with the Garbage Collection subsystem is not finished right now, but at least it is "not crashing in my simple tests and not leaking memory too quickly". (It means that it is never calling __del__ so far, although it releases memory; and when entering transactional mode or when going to the next transaction, all live objects become immortal. This should still let most not-too-long-running programs work.)

                +

                If you want to play with it, you can download this binary (you need to put it in a place with the paths lib-python and lib_pypy, for example inside the main directory from a regular nightly tarball or from a full checkout). This version was compiled for Linux x86 32-bit from the stm-gc branch on the 25th of April. It runs e.g. the modified version of richards. This branch could also be translated for Linux x86-64, but not for other OSes nor other CPUs for now.

                +

                The resulting pypy-stm exposes the same interface as the pure Python transaction module, which is an emulator (running on CPython or any version of PyPy) which can be used to play around and prepare your programs. See the comments in there. A difference is that the real pypy-stm doesn't support epoll right now, so it cannot be used yet to play with a branch of Twisted that was already adapted (thanks Jean-Paul Calderone); but that's coming soon. For now you can use it to get multi-core usage on purely computational programs.

                +

                I did for example adapt PyPy's own translate.py: see the tweak in rpython/rtyper.py. Lines 273-281 are all that I needed to add, and they are mostly a "simplification and parallelization" of the lines above. There are a few more places in the whole translate.py that could be similarly modified, but overall it is just that: a few places. I did not measure performance, but I checked that it is capable of using multiple cores in the RTyping step of translation, with --- as expected --- some still-reasonable number of conflicts, particularly at the beginning when shared data structures are still being built.

                +

                On a few smaller, more regular examples like richards, I did measure the performance. It is not great, even taking into account that it has no JIT so far. Running pypy-stm with one thread is roughly 5 times slower than running a regular PyPy with no JIT (it used to be better in previous versions, but they didn't have any GC; nevertheless, I need to investigate). However, it does seem to scale. At least, it scales roughly as expected on my 2-real-cores, 4-hyperthreaded-cores laptop (i.e. for N between 1 and 4, the N-threaded pypy-stm performs similarly to N independent pypy-stm's running one thread each).

                +

                And finally...

                +

                ...a big thank you to everyone who contributed some money to support this! As you see on the PyPy site, we got more than 6700$ so far in only 5 or 6 weeks. Thanks to that, my contract started last Monday, and I am now paid a small salary via the Software Freedom Conservancy (thanks Bradley M. Kuhn for organizational support from the SFC). Again, thank you everybody!

                +

                UPDATE: The performance regression was due to disabling an optimization, the method cache, which caused non-deterministic results --- the performance could vary by a factor of two. Today, as a workaround, I made the method cache transaction-local for now; it is only effective for transactions that run for long enough (maybe 0.1ms or 1ms), but at least it is there in this situation. In the version of richards presented above, the transactions are too short to make a difference (around 0.015ms).

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-04-27 20:37: +
                +
                +

                I don't get it. It's great that pypy libs and so on will be multithreaded with good performance, but how does that help you to write a multithreaded program with good performance, if you don't expose the tools you used to do that?

                +
                +
                +
                +
                + + Alexander Sedov wrote on 2012-04-27 20:44: +
                +
                +

                Interface is exposed; transaction module it is.

                +
                +
                +
                +
                + + Texatril wrote on 2012-04-27 20:44: +
                +
                +

                I think the idea is that the GIL would be gone since internally the interpreter would use STM, and at the programmer level, you would be free to use the normal threading mechanisms

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-04-27 20:49: +
                +
                +

                @Texatril no, the point is you would not have to. You write a normal event-based program with transaction module and boom it works. It's easier than writing correct multithreaded code.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-04-27 20:50: +
                +
                +

                Ah, you kinda contradicted yourself by saying the goal wasn't to expose STM to users' programs, but then saying that it exposed the same API as the transaction module.

                The transaction module is pretty horrible though. Might I suggest a better syntax than the transaction module? Something like exceptions would be better:

                begin:
                ...
                commit

                or:

                transaction:
                ...
                rollback:
                retry

                perhaps with an option (in a later version?) to replace the "retry" with alternate code.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-04-27 20:52: +
                +
                +

                @Anonymous that would be a bad API, because you cannot fail a transaction. It'll be automatically retried until it finishes. That's in-line with correct programs, just multithreaded

                +
                +
                +
                +
                + + Unknown wrote on 2012-04-28 09:37: +
                +
                +

                Anonymous: the user level API is any asynchronous event handling framework that uses the transaction library internally to handle events in parallel.

                So, for example, you take *any* Twisted program and run it on pypy-stm and it will use the available number of cores to process events without losing any of the normal correctness guarantees of event-based programming.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-04-29 07:43: +
                +
                +

                The goal is really not to expose STM to the user. The pure Python transaction module is a working implementation, running on a single core but running. The fact that pypy-stm provides an alternate implementation, based on STM and giving multi-core usage --- this is the implementation detail.

                That's why it has the kind of API you see, and not some STM syntax like "begin: rollback: commit". I also dislike custom keywords, because then we can no longer run the program on CPython or non-STM-based PyPys. But I know I am no language designer myself, so the details are open for discussion.

                Nick: thanks for the clarifications. Note however that the transaction module is also meant to be used directly, e.g. in CPU-intensive computational programs that don't use any event framework, like I did in rpython/rtyper.py.

                +
                +
                +
                +
                + + Unknown wrote on 2012-05-01 06:10: +
                +
                +

                That sounds great!

                From the code I wondered, though, if it’s not actually only 2 lines:

                for block in pending:
                transaction.add(self.specialize_block, block)
                transaction.run()

                That sounds like map() - for example like the futures module:

                with concurrent.futures.ThreadExecutor() as e:
                e.map(...)

                Similarly something like

                with transaction.Runner() as r:
                r.map(self.specialize_block, block)

                might be easier.

                Anyway: Your STM project sounds great!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-05-01 08:51: +
                +
                +

                @arne: right, maybe. It points to a similarity, at least. This simple example corresponds nicely to map(), but in other examples (like richards) we add() more transactions from within transactions. Nevertheless, using the "with ... as t:" syntax might work, by passing the "t" inside transactions in order to call t.map() or t.add() on it too.

                This would also open the door to naturally nest these constructs. Right now if you call transaction.run() inside a transaction, you get an error. Such a case is more work to support in the current implementation, but from the surface it looks like a transaction.Runner() kind of interface should allow us to express what we need.

                +
                +
                +
                +
                + + Unknown wrote on 2012-05-03 19:51: +
                +
                +

                @Armin: Nice! Congrats for the great project!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/05/stm-update-back-to-threads-6622746581767639355.html b/posts/2012/05/stm-update-back-to-threads-6622746581767639355.html new file mode 100644 index 000000000..2dbf4ca2b --- /dev/null +++ b/posts/2012/05/stm-update-back-to-threads-6622746581767639355.html @@ -0,0 +1,487 @@ + + + + + +STM update: back to threads? | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                STM update: back to threads?

                + + + +
                +

                Hi again,

                +Here is another update on the status of Software Transactional Memory on PyPy.

                +Those of you who have been closely following this blog since last year know that, from the very first post about STM, I explored various design ideas about the API that we should get when programming in Python.

                +I went full circle, and now I am back to where I started (with one important difference: a very roughly working implementation of pypy-stm).

                +What I realized is that the "thread" module is not that bad after all --- I mean, yes, it is a horribly low-level interface, but it is general enough to build various interesting things on top of it. What the "stm-thread" branch of PyPy contains is, basically, the regular "thread" module in which the GIL was replaced with STM. It gives multicore capabilities to any program based on multiple threads. (This is so far exactly the same idea as the one being investigated for Hardware Transactional Memory. It is roughly also what you would get if you managed to convince GCC 4.7 to compile CPython using STM.)

                +Now while this might already be quite interesting to some people, here is how it relates to all I said previously: namely, threads are bad, and some new "transaction" module would be a better idea.

                +There is one new core functionality in the "stm-thread" branch: it is "thread.atomic", a context manager that can be used in a "with" statement (exact name subject to change). In terms of the GIL, it prevents the GIL from being released in the "with" block. In terms of STM, it prevents a "transaction break", which means that the whole "with" statement runs in one single transaction. (From the Python programmer's point of view, the net effect is the same.)

                +So far, no ground-breaking news. But what I missed previously is that this is enough to give multicore capabilities even to a program that is not using threads so far. It is possible to rewrite an equivalent of the old transaction module in a few pages of pure Python, using "thread.atomic". Something along the following lines: start N threads that each reads from a Queue.Queue() the next job to do, and does it in a "with thread.atomic" block. The STM version of PyPy is then able to run these atomic blocks concurrently. The key point is that the slightly delicate handling of threads should be nicely hidden inside the new "transaction" module, and from outside the observed behavior would be exactly as if the transactions that we schedule are run serially.

                +The point I kept missing was that, yes, this sounds like nonsense, because it seems that we create N threads just to serialize their work again in "thread.atomic" sections. In fact this would be nonsense in any model that would "just" remove the GIL to let multiple threads run concurrently without crashing. Indeed, you have multiple threads, but their atomic blocks would be again a sort of GIL: only one of them would run at a time. And this is indeed the simple model of execution that you get even with STM --- but not the model of performance. The performance with STM scales with the number of cores, as long as there is enough non-conflicting work to do.

                +So in summary the complete circle back to the starting point is that threads might be a good low-level model. It lends itself naturally to, say, a kind of program in which the main thread polls file descriptors using select() or the Linux epoll(), and the work received is split among N other threads --- which is the kind of program you would naturally write in other languages that don't have a GIL, say Java. The other threads can then use "thread.atomic" blocks to protect sections of their work. The traditional Transactional Memory point of view is that you use such blocks to guard the short sections of code that communicate with other threads or modify global state, but nothing prevents you from using much larger sections: you should be able to scale them up to the size of a native "unit of work", so that every unit is naturally atomic. And then it's only a matter of design: you can tweak an existing module that does the thread pooling to add one "with thread.atomic"; or do it yourself from scratch; or (if the design is compatible enough) just plug in the proposed pure-Python "transaction" module. Or if you feel like it you can even use threads directly (but keep in mind that using threads too explicitly is not a composable abstraction, whereas higher-level designs typically are).

                +At the end of the day, you can write or reuse programs whose global structure you are already familiar with, for example with a thread pool (that can be hidden in a library if you prefer), or any other structure with or without explicit threads. But you can do so without all the mess that comes with threads like locks and deadlocks. From that angle it is really similar to Garbage Collection: e.g. the Boehm GC (now used by GCC itself) lets you write C code like you are used to, but forgetting all you had to learn about careful explicit memory management.

                +
                +

                Comments

                +
                +
                +
                + + Benjamin wrote on 2012-05-08 04:38: +
                +
                +

                So I'm not sure if I fully grok STM, but my basic understanding of the workflow for a transaction is this:

                1. Make a copy of whatever it is you're planning to use, ie, 'stuff'.
                2. Do anything that doesn't have side effects (writing to memory/disk).
                3. Acquire a lock & compare the state of the parts of 'stuff' you want to change to the current state.
                4a. If 'stuff to write' is unchanged, write it and release lock.
                4b. Otherwise, release lock and restart transaction.

                With the context manager, how is 'stuff' determined? Does it record everything in locals()? That seems like it might be excessive. Would it make sense to expose 'stuff' to the programmer?

                If you were to expose 'stuff' to the programmer, I'd think you'd want a new local context where the only variables available were those explicitly specified as 'stuff' (and builtins, etc) so as to avoid congruency accidents. Something like:

                with atomic(f, x, y, z, q) as f, x, y, z, q:
                z += f(x, y)
                y = x
                x = q.pop()

                This would also help remind folks to keep their transactions small.

                Furthermore, this could easily be transformed into a very useful (function) decorator that uses the function's arguments as the 'stuff'.

                Am I missing something? Are my suggestions reasonable?

                +
                +
                +
                +
                + + Unknown wrote on 2012-05-08 06:09: +
                +
                + this might give you some insight into another approach for passing messages (aka information) between threads which might be GIL friendly. +
                +
                +
                +
                + + Frankier wrote on 2012-05-08 07:29: +
                +
                +

                @Benjamin:

                My understanding is STM is using these type of transactions: https://en.wikipedia.org/wiki/Optimistic_concurrency_control

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-05-08 08:17: +
                +
                +

                @Benjamin: no, that's not reasonable at all in the context of large transactions. "Help remind folks to keep their transactions small" is precisely what I don't want: I want large transactions. This might be harder to do efficiently, it might be more conflict-prone, etc.; but what I don't want is the classical situation where you have to be very careful about keeping your transactions as small as possible, because that's just as hard and error-prone as using locks.

                What I want is for "the average programmer" to not use the "thread" module at all, including "thread.atomic". This should be part of a library that does thread pooling and dispatching (large) transactions.

                +
                +
                +
                +
                + + Kristján Valur wrote on 2012-05-08 11:33: +
                +
                +

                You know, of course, that stackless has an "atomic" property, and stacklesslib has a stacklesslib.utils.atomic ctxtmgr.

                I recently modified stackless so that the "atomic" property also inhibited GIL release, so that inter-thread tasklet operations could be made safe.

                On a whim I scoured the python archives and found that such a property had been proposed to cPython but rejected (unwisely imho) in favor of general locking.

                Perhaps we can get them to reconsider?

                +
                +
                +
                +
                + + Kristján Valur wrote on 2012-05-08 11:41: +
                +
                +

                Oh, and btw:
                an "atomic" property in regular cPython (and stackless) of course only prevents preemptive release of the GIL. Any blocking IO calls will still cause a "co-operative" GIL release. For this reason, "atomic" cannot replace generic locks completely.

                How does this play with longer "transactions" in STM?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-05-08 11:54: +
                +
                +

                @Kris: ah, interesting. You did the same as what I attempted in my hack of CPython at https://bitbucket.org/arigo/cpython-withatomic . This didn't really work out, though, because the stdlib (including file objects) use regular locks. A simple "print" in an atomic block could lead to deadlocks: the atomic block can block waiting for the stdout's file lock to be released, but it does so without releasing the GIL. Now the lock would typically be released by another thread --- if only it could grab the GIL for a short while.

                You can see the workaround I found in the last few commit messages of the above repository, but I'm not satisfied with it... In general I'm still unsure what the best way is. For now in pypy-stm I'm going to hack on a case-by-case basis to convert the locks to atomic sections.

                Perhaps it is possible to do it semi-generically, e.g. convert all syntactically nested "with lock:" statements in the user code into "with atomic:" statements (similar to next year's Intel CPUs, which will have "lock elision" to help convert from lock-based to HTM programs). As far as I know, this idea doesn't work in all situations, e.g. if you acquire a lock in one thread and release it in another thread.

                As far as I can say, this issue is the main blocker preventing any further progress on the CPython side. It is certainly the reason I stopped pushing for it last year.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-05-08 11:58: +
                +
                +

                @Kris: ah, ok: you have a version of "atomic" that doesn't prevent the GIL from being released around I/O calls. This is different from the version described in this post, which is also what I assumed in my previous answer. In a "with atomic" block, the GIL is not released under any circumstance (equivalently, the whole "atomic" block runs as a single transaction), so that the programmer can assume that a "with atomic" block is truly atomic.

                +
                +
                +
                +
                + + Unknown wrote on 2012-05-08 12:59: +
                +
                +

                How would a code example look for thread.atomic?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-05-08 13:23: +
                +
                +

                @Arne: here is an example using thread.atomic directly. In your multithreaded application, at some point, you want to remove an item from list1 and add it to list2, knowing that list1 and list2 are also accessed by other threads. Then you write:

                with thread.atomic:
                x = list1.pop()
                list2.append(x)

                This is a classical STM example. What I'm pushing for is not that, though: it is for not writing multithreaded code in the first place. With the proper library code you can write code like the first few lines of transaction. The library code would itself use thread.atomic, but not you directly.

                +
                +
                +
                +
                + + Kristján Valur wrote on 2012-05-08 15:02: +
                +
                +

                Yes, sorry for not being clear, Armin. But an "atomic" flag that inhibits involuntary thread switching is useful too, because it is a fast "lock" around all kinds of code:

                with atomic:
                foo = foo+1 #multi-threading-safe

                without the overhead of real locks.
                In our GIL world, real locks only benefit areas that incur thread-blocking operations such as IO.

                Anyway, that is off-topic, I suppose :)

                +
                +
                +
                +
                + + Kristján Valur wrote on 2012-05-08 15:06: +
                +
                +

                Of course, we cannot replace thread._Lock with an "atomic" equivalent, because it is a non-recursive entity, also used for such things as condition variables!.

                Not a very wise move, in retrospect.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-05-08 16:38: +
                +
                +

                @Kris: indeed. I found out a way that should in all cases either work or raise an exception if unsupported (and not unexpectedly deadlock).

                The unsupported situation is: we are in a "with atomic" block trying to acquire a lock, and this lock is acquired already. In this case, there is nothing the interpreter can do automatically. It can only complain rather than deadlocking: no other thread is going to run in parallel to release the lock.

                This should let the "common use case" work, which is locks used as scoped mutexes. Caveat: only as long as you use them either only in "with atomic" blocks --- because they appear to be fully serialized, so the mutex will never block --- or only outside "with atomic" blocks.

                This leaves the case of mixed usage as unsupported, but I don't see how it could reasonably be supported.

                So for now, pypy-stm will raise "oups deadlock" if you try to use "print" statements both inside and outside atomic blocks in parallel... that's the best I could come up with so far.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-05-09 00:38: +
                +
                +

                thanks for the article. might want to reword "This is so far exactly the idea same than the one being investigated for Hardware Transactional Memory.". :)

                +
                +
                +
                +
                + + Ole Laursen wrote on 2012-05-11 12:20: +
                +
                +

                To expand slightly on what someone else commented, there was a talk not too long ago by some guys who found out using queues to communicate between threads can be a pretty hefty bottleneck. They were using the JVM.

                The talk is interesting because they actually measured the stuff they do and compared it with how it affects the CPU pipelines/caches. The queue discussion is around 32 minutes into the talk.

                It's perhaps not relevant for pypy-stm at the moment, but it's definitely relevant for anyone interested in high-performance multithreaded code.

                +
                +
                +
                +
                + + Dima Q wrote on 2012-05-18 10:03: +
                +
                +

                Good job, Armin!

                This is exactly what Python needs, and if it turns out hard rather than insanely hard, all the better!

                +
                +
                +
                +
                + + Jonas W. wrote on 2012-05-21 17:51: +
                +
                +

                I am not entirely sure about the concept which is being implemented in PyPy-stm or better, which is planned for a parallel PyPy in the future.

                I think I am a pretty conservative programmer, and I actually dislike the idea of running code twice because of conflicts which could have been foreseen at development time ;). I still see the advantages STM brings regarding development time.

                So I'm wondering about a point which was not entirely clear in your post. You're saying you don't want people to (be forced to?) write short transactions. However, I could still in a project which is both CPU and memory intensive try to keep the thread.atomic sections as small as possible to avoid unnecessary overheads but still get effective logs?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-05-21 22:42: +
                +
                +

                @Jonas: it is not always obvious at development time -- to say the least -- how to avoid all conflicts. Think about how hard it is to add automatic GC to C++ in a large project: it's messy but you might get pretty far with just reference counting -- until some point when you lose because of cyclic references. If instead you had used a proper GC-managed language, the problem would just not exist. It's the same about Transactional Memory and conflicts: you can either think harder and harder about using locks correctly, until your program becomes really messy; then you give up and use TM, solving the issue instantly and letting you think again about your original problem.

                Regarding the transaction size: with a good implementation, big transactions should not be slower than small transactions. The only potential drawback of having big transactions is that the risks of conflicts might increase (depending on your program).

                Note that this question has a different answer for Python than for C, where code outside transactions runs faster than code within transactions. It is not so in Python. The reason is that transactions are always needed in Python: either explicitly, or implicitly in order to protect the interpreter structures (in replacement of the famous GIL).

                +
                +
                +
                +
                + + Connelly Barnes wrote on 2012-05-30 05:53: +
                +
                +

                Is there any plan to add type declarations as some optional mode in PyPy, like Cython allows? Because PyPy can sometimes give some speed up, but when it doesn't it seems the alternative for the user is to go back to CPython + Cython.

                +
                +
                +
                +
                + + Unknown wrote on 2012-06-05 12:26: +
                +
                +

                @Armin: Looks nice!

                But you’re right: The explicit transaction still looks nicer.

                I think though, that both can nicely complement each other:

                (1) The transaction is efficient for pushing out parts of the code from the main run to get it multithreaded (think “#pragma omp parallel for” from OpenMP).

                (2) The thread.atomic is efficient for protecting stuff inside a threaded application. Also I like that I don’t have to explicitly state which variables I want to protect. And I like that it is not full locking: If I don’t actually get a conflict, other code still runs in parallel.

                The first actually looks more interesting though, because it might be possible to make every for-loop run like this, as long as later runs are not dependent on the result of previous runs. This would require quite heavy runtime analysis, though.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/06/architecture-of-cppyy-9077100041707701102.html b/posts/2012/06/architecture-of-cppyy-9077100041707701102.html new file mode 100644 index 000000000..5b1f7271c --- /dev/null +++ b/posts/2012/06/architecture-of-cppyy-9077100041707701102.html @@ -0,0 +1,552 @@ + + + + + +Architecture of Cppyy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Architecture of Cppyy

                + + + +
                +

                The cppyy module makes it possible to call into C++ from PyPy through the +Reflex package. +Work started about two years ago, with a follow-up sprint a year later. +The module has now reached an acceptable level of maturity and initial +documentation with setup instructions, as well as a list of the currently +supported language features, are now available here. +There is a sizable (non-PyPy) set of unit and application tests that is still +being worked through, not all of them of general applicability, so development +continues its current somewhat random walk towards full language coverage. +However, if you find that cppyy by and large works for you except for certain +specific features, feel free to ask for them to be given higher priority.

                +

                Cppyy handles bindings differently than what is typically found in other +tools with a similar objective, so this update walks through some of these +differences, and explains why choices were made as they are.

                +

                The most visible difference, is from the viewpoint of the Python programmer +interacting with the module. +The two canonical ways of making Python part of a larger environment, are to +either embed or extend it. +The latter is done with so-called extension modules, which are explicitly +constructed to be very similar in their presentation to the Python programmer +as normal Python modules. +In cppyy, however, the external C++ world is presented from a single entrance +point, the global C++ namespace (in the form of the variable cppyy.gbl). +Thus, instead of importing a package that contains your C++ classes, usage +looks like this (assuming class MyClass in the global namespace):

                +
                +>>>> import cppyy
                +>>>> m = cppyy.gbl.MyClass()
                +>>>> # etc.
                +
                +

                This is more natural than it appears at first: C++ classes and functions are, +once compiled, represented by unique linker symbols, so it makes sense to give +them their own unique place on the Python side as well. +This organization allows pythonizations of C++ classes to propagate from one +code to another, ensures that all normal Python introspection (such as +issubclass and isinstance) works as expected in all cases, and that it +is possible to represent C++ constructs such as typedefs simply by Python +references. +Achieving this unified presentation would clearly require a lot of internal +administration to track all C++ entities if they each lived in their own, +pre-built extension modules. +So instead, cppyy generates the C++ bindings at run-time, which brings us to +the next difference.

                +

                Then again, that is not really a difference: when writing or generating a +Python extension module, the result is some C code that consists of calls into +Python, which then gets compiled. +However, it is not the bindings themselves that are compiled; it is the code +that creates the bindings that gets compiled. +In other words, any generated or hand-written extension module does exactly +what cppyy does, except that they are much more specific in that the bound +code is hard-wired with e.g. fixed strings and external function calls. +The upshot is that in Python, where all objects are first-class and run-time +constructs, there is no difference whatsoever between bindings generated at +run-time, and bindings generated at ... well, run-time really. +There is a difference in organization, though, which goes back to the first +point of structuring the C++ class proxies in Python: given that a class will +settle in a unique place once bound, instead of inside a module that has no +meaning in the C++ world, it follows that it can also be uniquely located in +the first place. +In other words, cppyy can, and does, make use of a class loader to +auto-load classes on-demand.

                +

                If at this point, this all reminds you a bit of ctypes, just with some extra +bells and whistles, you would be quite right. +In fact, internally cppyy makes heavy use of the RPython modules that form the +guts of ctypes. +The difficult part of ctypes, however, is the requirement to annotate +functions and structures. +That is not very pleasant in C, but in C++ there is a whole other level of +complexity in that the C++ standard specifies many low-level details, that are +required for dispatching calls and understanding object layout, as +"implementation defined." +Of course, in the case of Open Source compilers, getting at those details is +doable, but having to reverse engineer closed-source compilers gets old rather +quickly in more ways than one. +More generally, these implementation defined details prevent a clean interface, +i.e. without a further dependency on the compiler, into C++ like the one that +the CFFI module provides for C. +Still, once internal pointers have been followed, offsets have been calculated, +this objects have been provided, etc., etc., the final dispatch into binary +C++ is no different than that into C, and cppyy will therefore be able to make +use of CFFI internally, like it does with ctypes today. +This is especially relevant in the CLang/LLVM world, where stub functions +are done away with. +To get the required low-level details then, cppyy relies on a back-end, rather +than getting it from the programmer, and this is where Reflex (together with +the relevant C++ compiler) comes in, largely automating this tedious process.

                +

                There is nothing special about Reflex per se, other than that it is relatively +lightweight, available, and has proven to be able to handle huge code bases. +It was a known quantity when work on cppyy started, and given the number +of moving parts in learning PyPy, that was a welcome relief. +Reflex is based on gccxml, and can therefore handle pretty much any C or +C++ code that you care to throw at it. +It is also technically speaking obsolete as it will not support C++11, since +gccxml won't, but its expected replacement, based on CLang/LLVM, is not +quite there yet (we are looking at Q3 of this year). +In cppyy, access to Reflex, or any back-end for that matter, is through a +thin C API (see the schematic below): cppyy asks high level questions to the +back-end, and receives low-level results, some of which are in the form of +opaque handles. +This ensures that cppyy is not tied to any specific back-end. +In fact, currently it already supports another, CINT, but that back-end is +of little interest outside of High Energy Physics (HEP). +The Python side is always the same, however, so any Python code based on cppyy +does not have to change if the back-end changes. +To use the system, a back-end specific tool (genreflex for Reflex) is +first run on a set of header files with a selection file for choosing the +required classes. +This produces a C++ file that must be compiled into a shared library, and a +corresponding map file for the class loader. +These shared libraries, with their map files alongside, can be put anywhere +as long as they can be located through the standard paths for the dynamic +loader. +With that in place, the setup is ready, and the C++ classes are available to +be used from cppyy.

                + +

                So far, nothing that has been described is specific to PyPy. +In fact, most of the technologies described have been used for a long time +on CPython already, so why the need for a new, PyPy-specific, module? +To get to that, it is important to first understand how a call is mediated +between Python and C++. +In Python, there is the concept of a PyObject, which has a reference count, a +pointer to a type object, and some payload. +There are APIs to extract the low-level information from the payload for use +in the C++ call, and to repackage any results from the call. +This marshalling is where the bulk of the time is spent when dispatching. +To be absolutely precise, most C++ extension module generators produce slow +dispatches because they don't handle overloads efficiently, but even in there, +they still spend most of their time in the marshalling code, albeit in calls +that fail before trying the next overload. +In PyPy, speed is gained by having the JIT unbox objects into the payload only, +allowing it to become part of compiled traces. +If the same marshalling APIs were used, the JIT is forced to rebox the payload, +hand it over through the API, only to have it unboxed again by the binding. +Doing so is dreadfully inefficient. +The objective of cppyy, then, is to keep all code transparent to the JIT until +the absolute last possible moment, i.e. the call into C++ itself, therefore +allowing it to (more or less) directly pass the payload it already has, with +an absolute minimal amount of extra work. +In the extreme case when the binding is not to a call, but to a data member of +an object (or to a global variable), the memory address is delivered to the +JIT and this results in direct access with no overhead. +Note the interplay: cppyy in PyPy does not work like a binding in the CPython +sense that is a back-and-forth between the interpreter and the extension. 
+Instead, it does its work by being transparent to the JIT, allowing the JIT to +dissolve the binding. +And with that, we have made a full circle: if to work well with the JIT, and +in so doing achieve the best performance, you can not have marshalling or do +any other API-based driving, then the concept of compiled extension modules is +out, and the better solution is in run-time generated bindings.

                +

                That leaves one final point. +What if you do want to present an extension module-like interface to +programmers that use your code? +But of course, this is Python: everything consists of first-class objects, +whose behavior can be changed on the fly. +In CPython, you might hesitate to make such changes, as every overlay or +indirection results in quite a bit of overhead. +With PyPy, however, these layers are all optimized out of existence, making +that a non-issue.

                +

                This posting laid out the reasoning behind the organization of cppyy. +A follow-up is planned, to explain how C++ objects are handled and +represented internally.

                +

                Wim Lavrijsen

                +
                +

                Comments

                +
                +
                +
                + + Fernando Perez wrote on 2012-06-25 21:00: +
                +
                +

                Thanks for this excellent post; any chance you'll make it to Scipy'2012 in Austin? I still remember your talk at one of the very old Scipys at Caltech as one of the best we've had; it would be great to catch up on the implications of your continued work on this front since. With the recent progress on cython and numpy/numba, fresh ideas on the C++ front are a great complement.

                +
                +
                +
                +
                + + Sebastien Binet wrote on 2012-06-26 09:28: +
                +
                +

                Wim,

                I know you are quite attached to details so I was surprised by:

                """
                Reflex is based on gccxml, and can therefore handle pretty much any C or C++ code that you care to throw at it
                """

                but that's not true: gccxml being an interesting and useful hack of the C++ frontend of GCC, it can only correctly parse the subset of C which is valid C++.

                here are a few links:
                https://stackoverflow.com/questions/1201593/c-subset-of-c-where-not-examples

                https://en.wikipedia.org/wiki/Compatibility_of_C_and_C%2B%2B

                I discovered it the hard way...

                +
                +
                +
                +
                + + Anonymous wrote on 2012-06-26 09:45: +
                +
                +

                @Sebastien, GCC-XML must be able to parse the entirety of C, since it has to support "extern C" blocks, mustn't it?

                +
                +
                +
                +
                + + Sebastien Binet wrote on 2012-06-26 12:30: +
                +
                +

                "extern C" is "just" modifying the symbol mangling mechanism of the identifiers inside the extern-C block.

                just try this example from the link I posted earlier:
                https://stackoverflow.com/questions/1201593/c-subset-of-c-where-not-examples

                """
                struct A { struct B { int a; } b; int c; };
                struct B b; // ill-formed: b has incomplete type (*not* A::B)
                """

                even if you create a foo.h like so:

                """
                #ifdef __cplusplus
                extern "C" {
                #endif

                struct A { struct B { int a; } b; int c; };
                struct B b;
                #ifdef __cplusplus
                }
                #endif
                """

                and compile some main.c/cxx (which just includes that header) with gcc/g++, you'll get:

                """
                $ gcc main.c
                $ echo $?
                0

                $ g++ main.cxx
                In file included from main.cxx:2:0:
                foo.h:7:12: error: aggregate ‘B b’ has incomplete type and cannot be defined
                zsh: exit 1 g++ main.cxx
                """

                gccxml is using the C++ parser, thus my first remark :}

                +
                +
                +
                +
                + + Sebastien Binet wrote on 2012-06-26 12:54: +
                +
                +

                Also, as we are in the nitpicking and parsing department, any C++ keyword which isn't a C one, can be correctly used in a C file, making that file landing in the valid-C-which-isnt-in-the-C++-subset-of-C
                (e.g.: class,new,this to name a few of the most popular types or identifiers one can find in C codebases)

                +
                +
                +
                +
                + + Wim Lavrijsen wrote on 2012-06-26 17:59: +
                +
                +

                @Fernando: no, no travel for me anytime soon. If Py4Science is still going, though, I can always walk down the hill, of course. :)

                I've seen Numba (Stefan brought it up on the pypy-dev list), but it appears to be focused on C. With LLVM, we are using the AST directly. I don't think you can drive C++ through llvm-py.

                @Sebastien: the "details" that you are missing are in that "pretty much any" is not the same as "all." Worse, Reflex has a whole toolchain of gccxml, genreflex, C++ compiler, and finally the Reflex API. You lose information at every step along the way. It's one more reason for CLang/LLVM, but as said, that's for Q3/2012.

                Note though that there are two kinds of C headers that one may encounter. Those that are in a pure C environment, and those for mixed C/C++ use (e.g. Python.h and the system headers). In the former case, no-one would drag in the dependency on a C++ compiler, just to use Reflex. Using e.g. CFFI is a much better option. In the other case, there is no problem either way.

                Cheers,
                Wim

                +
                +
                +
                +
                + + Anonymous wrote on 2012-06-27 11:46: +
                +
                +

                On a similar note, what's the state of embedding PyPy into C++ (or does cppyy make that case fully obsolete?)?

                +
                +
                +
                +
                + + Wim Lavrijsen wrote on 2012-06-27 18:16: +
                +
                +

                @anonymous: there was a recent thread on pypy-dev, showing a successful embedding: https://mail.python.org/pipermail/pypy-dev/2012-March/009661.html

                If done through C++, you can use the Python C-API (through cpyext), but AFAIK, that doesn't play nicely with threads yet.

                Cheers,
                Wim

                +
                +
                +
                +
                + + Matthias wrote on 2012-06-28 16:50: +
                +
                +

                From my past experience wrapping a C++ library to python is a whole lot more than just being able to call functions and having objects.

                For example using a binding generator like SWIG you need to annotate your source, because the source alone does not have sufficient information to generate proper bindings (at least no bindings that feel python-like).

                So I am wondering how Cppyy behaves in this area.

                E.g. how does this play with templates? I will probably still need to define up-front which instantiations I need to be available in python?

                How does it deal with object ownership? E.g. what happens if the C++ code decides to delete an object that python still points to? Or how are shared pointers dealt with?

                How is type mapping handled? E.g. you might want to call functions taking MyString with "standard" python strings instead of having to construct MyString() objects first and then passing those.

                +
                +
                +
                +
                + + Wim Lavrijsen wrote on 2012-06-28 18:36: +
                +
                +

                @Matthias: there are several follow-up posts planned to explain everything in detail, so just a few quick answers now.

                Pythonizations are handled automatically based on signature, otherwise by allowing user defined pythonization functions.

                Template instantiations are still needed in the Reflex world, but with CLang/LLVM, those can be generated by the backend (CINT can perform the instantiations automatically as well).

                Object ownership can be handled heuristically if the C++ side behaves (this is e.g. the case for most of ROOT). If that's not the case, extra annotations per function or per object are needed. In addition, communication with the memory regulator (a tracker of all proxies on the python side) through a callback on both sides is possible.

                Type mappings happen through custom converters that are to be coded up in either Python or C++. Standard mappings (e.g. the use of std::string in the way that you describe for MyString) have been added by default. Type mappings can also be done based on signature in some cases.

                Not everything of the above is implemented in cppyy yet, but all have been solved before in PyROOT on CPython. It's just a matter of time to implement things for cppyy. The important point, however, is that none of this needs a separate language: most of it can be handled automatically, with a little work of the programmer in python proper or, worst case, with a C++ helper.

                Cheers,
                Wim

                +
                +
                +
                +
                + + Anonymous wrote on 2013-09-20 06:58: +
                +
                +

                Hmm is anyone else experiencing problems with the pictures on this blog loading?
                I'm trying to find out if its a problem on my end or if it's the
                blog. Any feed-back would be greatly appreciated.

                my site ... Splendyr REview - https://livingwaychristianfriendshipgroup.com/members/starcormi/activity/932712/ -

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/06/europython-sprint-5668923199392472912.html b/posts/2012/06/europython-sprint-5668923199392472912.html new file mode 100644 index 000000000..24b927712 --- /dev/null +++ b/posts/2012/06/europython-sprint-5668923199392472912.html @@ -0,0 +1,321 @@ + + + + + +EuroPython sprint | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                EuroPython sprint

                + + + +
                +

                Hi all,

                EuroPython is next week. We will actually be giving a presentation on Monday, in one of the plenary talks: PyPy: current status and GIL-less future. This is the first international PyPy keynote we give, as far as I know, but not the first keynote about PyPy [David Beazley's video] :-)

                +The other talks are PyPy JIT under the hood and to some extent Performance analysis tools for JITted VMs. This year we are also trying out a help desk. Finally, we will have the usual sprint after EuroPython on Saturday and Sunday.

                +See you soon!

                +Armin.

                +
                +

                Comments

                +
                +
                +
                + + holger krekel wrote on 2012-06-28 10:35: +
                +
                +

                Don't you consider the David Beazley keynote at Pycon 2012 as a talk about PyPy? (even if not from a core dev)

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-06-28 10:38: +
                +
                +

                That's what the link "the first keynote about PyPy" is about. It's a link to the pypy blog where we talk about David's keynote. I did not find a direct page at us.pycon.org...

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-06-28 10:39: +
                +
                +

                That's what the link "the first keynote about PyPy" is about. It's a link to the pypy blog where we talk about David's keynote. I did not find a direct page at us.pycon.org...

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/06/py3k-status-update-4-4834053219477515637.html b/posts/2012/06/py3k-status-update-4-4834053219477515637.html new file mode 100644 index 000000000..dd94fe7a1 --- /dev/null +++ b/posts/2012/06/py3k-status-update-4-4834053219477515637.html @@ -0,0 +1,350 @@ + + + + + +Py3k status update #4 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #4

                + + + +
                +

                This is the fourth status update about our work on the py3k branch, which we
                +can work on thanks to all of the people who donated to the py3k proposal.

                +

                For various reasons, less work than usual has been done since the last status
                +update. However, some interesting things happened anyway.

                +

                As readers know, so far we spent most of the effort in fixing all PyPy's own
                +tests which started to fail for various py2/py3 differences. Most of them
                +failed for shallow reasons, e.g. syntactic changes or the int/long
                +unifications. Others failed for subtle differences and needed a bit more care,
                +for example the fact that unbound methods are gone in Py3k.

                +

                The good news is that finally we are seeing the light at the end of the
                +tunnel. Most of them have been fixed. For some other tests, we introduced the
                +concept of "py3k-skipping": some optimizations and modules are indeed failing,
                +but right now we are concentrating on completing the core language and so we
                +are not interested in those. When the core language is done, we will be
                +able to easily find and work on the py3k-skipped tests. In particular, for
                +now we disabled the Int and String dict strategies, which are broken
                +because of the usual int/long unification and str vs bytes. As for modules,
                +for now _continuation (needed for stackless) and _multiprocessing do
                +not work yet.

                +

                Another non-trivial feature we implemented is the proper cleaning of exception
                +variables when we exit except blocks. This is a feature which touches
                +lots of levels of PyPy, starting from astcompiler, down to the bytecode
                +interpreter. It took two days of headache, but at the end we made it :-).

                +

                Additionally, Amaury did a lot of improvements to cpyext, which had been
                +broken since forever on this branch.

                +

                As for the next plans, now that things are starting to work and PyPy's own
                +tests mostly pass, we can finally start to run the compiled PyPy against
                +CPython's test suite. It is very likely that we will have tons of failures at
                +the beginning, but once we start to fix them one by one, a Py3k-compatible
                +PyPy will be closer and closer.

                +
                +

                Comments

                +
                +
                +
                + + Connelly Barnes wrote on 2012-06-27 18:28: +
                +
                +

                Does anyone actually use Python 3? That whole project of Guido's reminds me of "things you should never do: rewrites."

                https://www.neilgunton.com/doc/?o=1&doc_id=8583

                +
                +
                +
                +
                + + Unknown wrote on 2012-06-29 10:55: +
                +
                +

                I cheered at your update when I saw it originally - but did not write this here.

                Since no one else did that, yet, I want to go back to fix the mistake:

                Great work!

                I’m anxious to see my python3 code running under pypy!

                +
                +
                +
                +
                + + z1r0un wrote on 2013-08-06 04:24: +
                +
                +

                @Connelly Barnes:
                Wat.
                I use Python3 almost exclusively, mainly because filter, map, and friends return iterators as FSM intended. I haven't done much string work, but that's another major win. And it's not like 2.7's EOL'd.
                In short, un-bunch your knickers and roll with the times.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/06/pypy-19-yard-wolf-7006180436602667005.html b/posts/2012/06/pypy-19-yard-wolf-7006180436602667005.html new file mode 100644 index 000000000..79132e219 --- /dev/null +++ b/posts/2012/06/pypy-19-yard-wolf-7006180436602667005.html @@ -0,0 +1,402 @@ + + + + + +PyPy 1.9 - Yard Wolf | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 1.9 - Yard Wolf

                + + + +
                +

                We're pleased to announce the 1.9 release of PyPy. This release brings mostly
                +bugfixes, performance improvements, other small improvements and overall
                +progress on the numpypy effort.
                +It also brings an improved situation on Windows and OS X.

                +

                You can download the PyPy 1.9 release here:

                +
                https://pypy.org/download.html
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for
                +CPython 2.7. It's fast (pypy 1.9 and cpython 2.7.2 performance comparison)
                +due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64 or
                +Windows 32. Windows 64 work is still stalling, we would welcome a volunteer
                +to handle that.

                +
                +
                +

                Thanks to our donors

                +

                But first of all, we would like to say thank you to all people who
                +donated some money to one of our four calls:

                +
                +

                Thank you all for proving that it is indeed possible for a small team of
                +programmers to get funded like that, at least for some
                +time. We want to include this thank you in the present release
                +announcement even though most of the work is not finished yet. More
                +precisely, neither Py3k nor STM are ready to make it in an official release
                +yet: people interested in them need to grab and (attempt to) translate
                +PyPy from the corresponding branches (respectively py3k and
                stm-thread).

                +
                +
                +

                Highlights

                +
                  +
                • This release still implements Python 2.7.2.
                • +
                • Many bugs were corrected for Windows 32 bit. This includes new
                  +functionality to test the validity of file descriptors; and
                  +correct handling of the calling conventions for ctypes. (Still not
                  +much progress on Win64.) A lot of work on this has been done by Matti Picus
                  +and Amaury Forgeot d'Arc.
                • +
                • Improvements in cpyext, our emulator for CPython C extension modules.
                  +For example PyOpenSSL should now work. We thank various people for help.
                • +
                • Sets now have strategies just like dictionaries. This means for example
                  +that a set containing only ints will be more compact (and faster).
                • +
                • A lot of progress on various aspects of numpypy. See the numpy-status
                  +page for the automatic report.
                • +
                • It is now possible to create and manipulate C-like structures using the
                  +PyPy-only _ffi module. The advantage over using e.g. ctypes is that
                  _ffi is very JIT-friendly, and getting/setting of fields is translated
                  +to few assembler instructions by the JIT. However, this is mostly intended
                  +as a low-level backend to be used by more user-friendly FFI packages, and
                  +the API might change in the future. Use it at your own risk.
                • +
                • The non-x86 backends for the JIT are progressing but are still not
                  +merged (ARMv7 and PPC64).
                • +
                • JIT hooks for inspecting the created assembler code have been improved.
                  +See JIT hooks documentation for details.
                • +
                • +select.kqueue has been added (BSD).
                • +
                • Handling of keyword arguments has been drastically improved in the best-case
                  +scenario: proxy functions which simply forward *args and **kwargs
                  +to another function now perform much better with the JIT.
                • +
                • List comprehension has been improved.
                • +
                +
                +
                +

                JitViewer

                +

                There will be a corresponding 1.9 release of JitViewer which is guaranteed to work
                +with PyPy 1.9. See the JitViewer docs for details.

                +

                Cheers,
                +The PyPy Team

                +
                +
                +

                Comments

                +
                +
                +
                + + Dmitrey wrote on 2012-06-08 11:11: +
                +
                +

                I have took a look at the mentioned numpypy table (https://buildbot.pypy.org/numpy-status/latest.html), and it lies in many ways. At first, some methods marked as "done" and undone yet, e.g. consider searchsorted:
                >>>> from numpypy import searchsorted
                >>>> searchsorted([1,2,3],[2,3])
                Traceback (most recent call last):
                File "", line 1, in
                File "/home/dmitrey/Install/pypy-c-jit-55492-ac392fb76904-linux/lib_pypy/numpypy/core/fromnumeric.py", line 763, in searchsorted
                raise NotImplementedError('Waiting on interp level method')
                NotImplementedError: Waiting on interp level method

                (and AFAIK there are many other similar numpypy funcs that are present in dir(numpypy), but only raise NotImplementedError).

                At 2nd, some funcs like all and any, also mentioned there as "done", don't work with "axis" parameter and thus also should be unmarked.

                FYI as a temporary replacement for some missing in PyPy yet numpy funcs (atleast_1d, atleast_2d, hstack, vstack, cumsum, isscalar, asscalar, asfarray, flatnonzero, tile, zeros_like, ones_like, empty_like, where, searchsorted;
                with "axis" parameter: nan(arg)min, nan(arg)max, all, any )

                I have implemented them in AppLevel (thus PyPy developers refuce to commit them, but some users could be interested right now), see https://openopt.org/PyPy for more details and my sincere opinion on the situation.

                Best wishes for PyPy developers and users, D.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-06-08 12:52: +
                +
                +

                Hi Dmitrey, nice to hear from you.

                The page is automatically generated - we should probably just disable those functions, I can't remember the exact reason why they're there in the first place.

                When it comes to missing arguments - you just can't help it. It's an automatically generated page that should give only an overview.

                As far as your patches go - yes, we need tests and we also need tests that cover corner cases. This is very important for us, we can live without the rest (like implementations on the interp-level). We do care about quality a lot.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Dmitrey wrote on 2012-06-08 15:24: +
                +
                +

                hi fijal,
                as far as I remember, main reasons of PyPy developers (I don't remember namely) to reject my funcs propositions were AppLevel vs InterpLevel, not corner testcases (they even said "don't start the func, it must be InterpLevel"). Thus to speedup OpenOpt port on PyPy I went other way and as you probably have seen that some OpenOpt Suite functionality is already available in PyPy and works some times faster.

                If applevel is ok for some of those funcs mentioned above, you or any other PyPy programmer can take anything from the code; as for me, I have lots of other things to do with my projects, especially now, before regular release, and thus cannot allocate time to create testcases for the numpy funcs.

                BTW, what about fancy indexing with int arrays (https://bugs.pypy.org/issue1130) - when it will be implemented? It's very important for many Python projects and hangs for a long time already.

                +
                +
                +
                +
                + + Peter Thomson wrote on 2012-06-10 16:56: +
                +
                +

                Congratulations to the new release to the best and most awesome team there is. We work daily with Python and PyPy and always look forward to the latest release :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/06/release-01-of-cffi-4760622823232463868.html b/posts/2012/06/release-01-of-cffi-4760622823232463868.html new file mode 100644 index 000000000..04a02658a --- /dev/null +++ b/posts/2012/06/release-01-of-cffi-4760622823232463868.html @@ -0,0 +1,366 @@ + + + + + +Release 0.1 of CFFI | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Release 0.1 of CFFI

                + + + +
                +
                +

                Hi.

                +

                We're pleased to announce the first public release, 0.1 of CFFI, a way to call C from Python.
                +(This release does not support PyPy yet --- but we announce it here as it is planned for the
                +next release :-)

                +

                The package is available on bitbucket as well as documented. You can also install it
                +straight from the python package index (pip).

                +

                The aim of this project is to provide a convenient and reliable way of calling C code from Python.
                +The interface is based on LuaJIT's FFI and follows a few principles:

                +
                  +
                • The goal is to call C code from Python. You should be able to do so
                  +without learning a 3rd language: every alternative requires you to learn
                  +their own language (Cython, SWIG) or API (ctypes). So we tried to
                  +assume that you know Python and C and minimize the extra bits of API that
                  +you need to learn.
                • +
                • Keep all the Python-related logic in Python so that you don't need to
                  +write much C code (unlike CPython native C extensions).
                • +
                • Work either at the level of the ABI (Application Binary Interface)
                  +or the API (Application Programming Interface). Usually, C
                  +libraries have a specified C API but often not an ABI (e.g. they may
                  +document a "struct" as having at least these fields, but maybe more).
                  +(ctypes works at the ABI level, whereas Cython or native C extensions
                  +work at the API level.)
                • +
                • We try to be complete. For now some C99 constructs are not supported,
                  +but all C89 should be, including macros (and including macro "abuses",
                  +which you can manually wrap in saner-looking C functions).
                • +
                • We attempt to support both PyPy and CPython (although PyPy support is not
                  +complete yet) with a reasonable path for other Python implementations like
                  +IronPython and Jython.
                • +
                • Note that this project is not about embedding executable C code in
                  +Python, unlike Weave. This is about calling existing C libraries
                  +from Python.
                • +
                +
                +

                Status of the project

                +

                Consider this as a beta release. Creating CPython extensions is fully supported and the API should
                +be relatively stable; however, minor adjustments of the API are possible.

                +

                PyPy support is not yet done and this is a goal for the next release. There are vague plans to make this the
                +preferred way to call C from Python that can reliably work between PyPy and CPython.

                +

                Right now CFFI's verify() requires a C compiler and header files to be available at run-time.
                +This limitation will be lifted in the near future and it'll contain a way to cache the resulting binary.

                +

                Cheers,

                +Armin Rigo and Maciej Fijałkowski

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + intgr wrote on 2012-06-19 00:28: +
                +
                +

                Will the CFFI be any JIT-friendlier than PyPy's ctypes?

                +
                +
                +
                +
                + + Anonymous wrote on 2012-06-19 16:46: +
                +
                +

                What's the difference between CFFI and CPyExt?

                +
                +
                +
                +
                + + RonnyPfannschmidt wrote on 2012-06-19 18:04: +
                +
                +

                @intgr yes

                @anon cffi is an FFI, cpyext is an API emulation; they are completely different things

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/06/stm-with-threads-7818875111634541910.html b/posts/2012/06/stm-with-threads-7818875111634541910.html new file mode 100644 index 000000000..6ebe05d9d --- /dev/null +++ b/posts/2012/06/stm-with-threads-7818875111634541910.html @@ -0,0 +1,386 @@ + + + + + +STM with threads | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                STM with threads

                + + + +
                +

                Hi all,

                +

                A quick update. The first version of pypy-stm based on regular
                +threads
                is ready. Still having no JIT and a 4-or-5-times performance
                +hit, it is not particularly fast, but I am happy that it turns out not
                +to be much slower than the previous thread-less attempts. It is at
                +least fast enough to run faster (in real time) than an equivalent no-STM
                +PyPy, if fed with an eight-threaded program on an eight-core machine
                +(provided, of course, you don't mind it eating all 8 cores' CPU power
                +instead of just one :-).

                +

                You can download and play around with this binary for Linux 64. It
                +was made from the stm-thread branch of the PyPy repository (translate.py --stm -O2 targetpypystandalone.py). (Be sure
                +to put it where it can find its stdlib, e.g. by putting it inside the
                +directory from the official 1.9 release.)

                +

                This binary supports the thread module and runs without the GIL.
                +So, despite the factor-of-4 slow-down issue, it should be the fourth
                +complete Python interpreter in which we can reasonably claim to have
                +resolved the problem of the GIL. (The first one was Greg Stein's Python
                +1.4, re-explored here; the second one is Jython; the third one is
                IronPython.) Unlike the previous three, it is also the first one to
                +offer full GIL semantics to the programmer, and additionally
                thread.atomic (see below). I should also add that we're likely to
                +see in the next year a 5th such interpreter, too, based on Hardware
                +Transactional Memory (same approach as with STM, but using e.g.
                Intel's HTM).

                +

                The binary I linked to above supports all built-in modules from PyPy,
                +apart from signal, still being worked on (which can be a bit
                +annoying because standard library modules like subprocess depend on
                +it). The sys.get/setcheckinterval() functions can be used to tweak
                +the frequency of the automatic commits. Additionally, it offers
                thread.atomic, described in the previous blog post as a way to
                +create longer atomic sections (with the observable effect of preventing
                +the "GIL" to be released during that time). A complete
                transaction.py module based on it is available from the sources.

                +

                The main missing features are:

                +
                  +
                • the signal module;
                • +
                • the Garbage Collector, which does not do major collections so far, only
                  +minor ones;
                • +
                • and finally, the JIT, which needs some amount of integration to generate
                  +the correctly-tweaked assembler.
                • +
                +

                Have fun!

                +

                Armin.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-06-12 08:11: +
                +
                +

                STM has so much potential. I wonder if it gets the attention of the hacker community it deserves. And if not, why not? I hope this is getting more recognition in the future.

                +
                +
                +
                +
                + + Paul Jaros wrote on 2012-06-12 08:12: +
                +
                +

                Ah... didn't mean to post it anonymously.

                +
                +
                +
                +
                + + Unknown wrote on 2012-06-13 11:21: +
                +
                +

                Nice!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-06-13 15:19: +
                +
                +

                @Paul: my guess would be that the majority of people that know STM are still looking at it from the point of view of short or very short transactions, as a replacement of locking. Even gcc 4.7 got an STM extension, but it cannot be used with long-running transactions: the performance is not at all tuned for this case, and you cannot express things you need in real long-running transactions, like interrupting them for I/O.

                Moreover the single-core 4x performance hit is usually far more than what people are willing to accept --- not realizing that in many cases it will soon be outdated, as a way of measuring performance: the future is toward many-cores machines.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-06-14 16:11: +
                +
                +

                For a casual Python programmer like me, how does STM affect the way I write my programs? I know about suggested benefits of STM on multi-core machines. However, what I'm asking is what is it that I have to do differently to get that benefit ?

                Thanks

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-06-15 07:42: +
                +
                +

                @Anonymous: https://foss.heptapod.net/pypy/pypy/-/tree/branch//stm-thread/pypy/doc/stm.rst

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/07/cffi-release-02-4800000428934604295.html b/posts/2012/07/cffi-release-02-4800000428934604295.html new file mode 100644 index 000000000..75c835283 --- /dev/null +++ b/posts/2012/07/cffi-release-02-4800000428934604295.html @@ -0,0 +1,316 @@ + + + + + +CFFI release 0.2.1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                CFFI release 0.2.1

                + + + +
                +

                Hi everybody,

                +

                We released CFFI 0.2.1 (expected to be 1.0 soon). CFFI is a way to call C from Python.

                +

                EDIT: Win32 was broken in 0.2. Fixed.

                +

                This release is only for CPython 2.6 or 2.7. PyPy support is coming in
                +the ffi-backend branch, but not finished yet. CPython 3.x would be
                +easy but requires the help of someone.

                +

                The package is available on bitbucket as well as documented. You
                +can also install it straight from the python package index: pip install cffi

                +
                  +
                • Contains numerous small changes and support for more C-isms.
                • +
                • The biggest news is the support for installing packages that use
                  ffi.verify() on machines without a C compiler. Arguably, this
                  +lifts the last serious restriction for people to use CFFI.
                • +
                • Partial list of smaller changes:
                    +
                  • mappings between 'wchar_t' and Python unicodes
                  • +
                  • the introduction of ffi.NULL
                  • +
                  • a possibly clearer API for ffi.new(): e.g. to allocate a single int and obtain a pointer to it, use ffi.new("int *") instead of the old
                    ffi.new("int") +
                  • +
                  • and of course a plethora of smaller bug fixes
                  • +
                  +
                • +
                • CFFI uses pkg-config to install itself if available. This helps
                  +locate libffi on modern Linuxes. Mac OS/X support is available too
                  +(see the detailed installation instructions). Win32 should work out
                  +of the box. Win64 has not been really tested yet.
                • +
                +

                Cheers,
                +Armin Rigo and Maciej Fijałkowski

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/07/hello-everyone-6869934374873967346.html b/posts/2012/07/hello-everyone-6869934374873967346.html new file mode 100644 index 000000000..d241639a2 --- /dev/null +++ b/posts/2012/07/hello-everyone-6869934374873967346.html @@ -0,0 +1,714 @@ + + + + + +Prototype PHP interpreter using the PyPy toolchain - Hippy VM | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Prototype PHP interpreter using the PyPy toolchain - Hippy VM

                + + + +
                +

                Hello everyone.

                +

                I'm proud to release the result of a Facebook-sponsored study on the feasibility of +using the RPython toolchain to produce a PHP interpreter. The rules were +simple: two months; one person; get as close to PHP as possible, implementing +enough warts and corner cases to be reasonably sure that it answers hard +problems in the PHP language. The outcome is called Hippy VM and implements +most of the PHP 1.0 language (functions, arrays, ints, floats and strings). +This should be considered an alpha release.

                +

                The resulting interpreter is obviously incomplete – it does not support all +modern PHP constructs (classes are completely unimplemented), builtin functions, +grammar productions, web server integration, builtin libraries +etc., etc.. It's just complete enough for me to reasonably be able to +say that – given some engineering effort – it's possible to provide a rock-solid +and fast PHP VM using PyPy technologies.

                +

                The result is available in a Bitbucket repo and is released under the MIT +license.

                +
                +

                Performance

                +

                The table below shows a few benchmarks comparing Hippy VM to Zend (a standard +PHP interpreter available in Linux distributions) and HipHop VM (a PHP-to-C++ +optimizing compiler developed by Facebook). The versions used were Zend 5.3.2 +(Zend Engine v2.3.0) and HipHop VM heads/vm-0-ga4fbb08028493df0f5e44f2bf7c042e859e245ab +(note that you need to check out the vm branch to get the newest version).

                +

                The run was performed on 64-bit Linux running on a Xeon W3580 with 8M of +L2 cache, which was otherwise unoccupied.

                +

                Unfortunately, I was not able to run it on the JITted version of HHVM, the new effort by Facebook, +but people involved with the project told me it's usually slower or comparable with the compiled HipHop. +Their JITted VM is still alpha software, so I'll update it as soon as I have the info.

                +
                + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                benchmark | Zend | HipHop VM | Hippy VM | Hippy / Zend | Hippy / HipHop
                arr | 2.771 | 0.508+-0% | 0.274+-0% | 10.1x | 1.8x
                fannkuch | 21.239 | 7.248+-0% | 1.377+-0% | 15.4x | 5.3x
                heapsort | 1.739 | 0.507+-0% | 0.192+-0% | 9.1x | 2.6x
                binary_trees | 3.223 | 0.641+-0% | 0.460+-0% | 7.0x | 1.4x
                cache_get_scb | 3.350 | 0.614+-0% | 0.267+-2% | 12.6x | 2.3x
                fib | 2.357 | 0.497+-0% | 0.021+-0% | 111.6x | 23.5x
                fasta | 1.499 | 0.233+-4% | 0.177+-0% | 8.5x | 1.3x
                +
                +

                The PyPy compiler toolchain provides a way to implement a dynamic +language interpreter in a high-level language called RPython. This is +a language which is lower-level than Python, but still higher-level than +C or C++: for example, RPython is a garbage-collected language. The killer +feature is that the toolchain will generate a JIT for your interpreter which +will be able to leverage most of the work that has been done on speeding up Python +in the PyPy project. The resulting JIT is generated for your interpreter, and is not Python-specific. +This was one of the toolchain's original design decisions – in contrast to e.g. the JVM, +which was initially only used to interpret Java and later adjusted to serve as a platform for +dynamic languages.

                +

                Another important difference is that there is no common bytecode to which you compile both your +language and Python, so you don't inherit problems presented when implementing language X on top of, +say, Parrot VM or the JVM. The PyPy toolchain does not impose constraints on the semantics of +your language, whereas the benefits of the JVM only apply to languages that map well onto Java concepts.

                +

                To read more about creating your own interpreters using the PyPy toolchain, +read more blog posts or an excellent article by Laurence Tratt.

                +
                +
                +

                PHP deviations

                +

                The project's biggest deviation from the PHP specification is probably +that GC is no longer reference counting. That means that the object finalizer, when +implemented, will not be called directly at the moment of object death, but +at some later point. There are possible future developments to alleviate that +problem, by providing "refcounted" objects when leaving the current scope. +Research has to be done in order to achieve that.

                +
                +
                +

                Assessment

                +

                The RPython toolchain seems to be a cost-effective choice for writing +dynamic language VMs. It both provides a fast JIT and gives you +access to low-level primitives when you need them. A good example is +in the directory hippy/rpython which contains the implementation +of an ordered dictionary. An ordered dictionary is not a primitive +that RPython provides – it's not necessary for the goal of +implementing Python. Now, implementing it on top of a normal dictionary +is possible, but inefficient. RPython provides a way to work +directly at a lower level, if you desire to do so.

                +

                Things that require improvements in RPython:

                +
                  +
                • Lack of mutable strings on the RPython level ended up being a problem. +I ended up using lists of characters; which are efficient, but inconvenient, +since they don't support any string methods.
                • +
                • Frame handling is too conservative and too Python-specific, especially around +the calls. It's possible to implement less general, but simpler and faster +frame handling implementation in RPython.
                • +
                +
                +
                +

                Status of the implementation

                +

                Don't use it! It's a research prototype intended to assess the feasibility +of using RPython to create dynamic language VMs. The most notable +feature that's missing is reasonable error reporting. That said, I'm +confident it implements enough of the PHP language to prove that the full +implementation will present the same performance characteristics.

                +
                +
                +

                Benchmarks

                +

                The benchmarks are a selection of computer language shootout benchmarks, as well +as cache_get_scb, which is a part of old Facebook code. All benchmarks other +than this one (which is not open source, but definitely the most interesting :( ) are +available in the bench directory. The Python program to run them is called +runner.py and is in the same directory. It runs them 10 times, cutting off the first +3 runs (to ignore the JIT warm-up time) and averaging the rest. As you can see +the standard deviation is fairly minimal for all interpreters and runs; if +it's omitted it means it's below 0.5%.

                +

                The benchmarks were not selected for their ease of optimization – the optimizations +in the interpreter were written specifically for this set of benchmarks. No special JIT +optimizations were added, and barring what's mentioned below a vanilla PyPy 1.9 checkout +was used for compilation.

                +
                +
                +

                So, how fast will my website run if this is completed?

                +

                The truth is that I lack the benchmarks to be able to answer that right now. The core +of the PHP language is implemented up to the point where I'm confident +that the performance will not change as we get more of the PHP going.

                +
                +
                +

                How do I run it?

                +

                Get a PyPy checkout, apply the diff if you want to squeeze out the last +bits of performance and run pypy-checkout/pypy/bin/rpython targethippy.py to +get an executable that resembles a PHP interpreter. You can also directly run +python targethippy.py file.php, but this will be about 2000x slower.

                +
                +
                +

                RPython modifications

                +

                There was a modification that I did to the PyPy source code; the diff +is available. It's trivial, and should simply be made optional in the +RPython JIT generator, but it was easier just to do it, given the very constrained time +frame.

                +
                  +
                • +gen_store_back_in_virtualizable was disabled. This feature is +necessary for Python frames but not for PHP frames. PHP frames +do not have to be kept alive after we exit a function.
                • +
                +
                +
                +

                Future

                +

                Hippy is a cool prototype that presents a very interesting path towards a fast +PHP VM. However, at the moment I have too many other open source commitments +to take on the task of completing it in my spare time. I do think that this project +has a lot of potential, but I will not commit to any further development at +this time. If you send pull requests I'll try to review them. I'm also open +to having further development on this project funded, so if you're interested +in this project and the potential of a fast PHP interpreter, please get in +touch.

                +
                +

                Cheers,
                +fijal

                +

                EDIT: Fixed the path to the rpython binary

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-07-13 23:26: +
                +
                +

                it's cool. Next on the list Javascript to Python/PyPy converter...

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-07-13 23:34: +
                +
                +

                please read the blog post first. It's *not* a PHP to Python converter. There is also a started JS implementation in https://bitbucket.org/pypy/lang-js, but JS is kind of useless without a browser.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-07-14 00:30: +
                +
                +

                JS to pypy would be useful when time comes to running all those node based apps in prod ;)

                Also, Java to PyPy would be a cool experiment too - jvm's way too bloated...

                +
                +
                +
                +
                + + Christian Heimes wrote on 2012-07-14 01:42: +
                +
                +

                Do I read the numbers correctly? The fibonacci test runs more than 110 times faster in your experimental, 2 months old VM than in the default Zend VM? That's amazing!

                It took me a while to figure out the meaning of the numbers. Please add units and explain that small is faster.

                Christian

                +
                +
                +
                +
                + + Unknown wrote on 2012-07-14 02:27: +
                +
                +

                Nice, Python surprising when

                +
                +
                +
                +
                + + Konstantine Rybnikov wrote on 2012-07-14 07:25: +
                +
                +

                Cool. When will your pypy converter convert my c++ programs to python? Can't wait until that happens! Anyway, nice work!

                p.s.: sarcasm

                +
                +
                +
                +
                + + Benedikt Morbach wrote on 2012-07-14 10:22: +
                +
                +

                Hey there, nice work.

                Do you have any numbers or estimates how memory consumption compares?

                +
                +
                +
                +
                + + Ole Laursen wrote on 2012-07-14 11:56: +
                +
                +

                I hope you get funding for researching the refcount thing. Being able to predict when something gets whacked is just really convenient and something PyPy Python can benefit from too.

                While GC may be more efficient, the unpredictable nature of it does become a problem in production in some cases.

                For instance, for a webapp written with Django and CPython, when a request is over I know that the stuff that was allocated is now gone unless I put something in a global data structure. I suspect many applications have similar patterns where you perform a big operation after which it's natural to have a clean up.

                +
                +
                +
                +
                + + Inactive Account wrote on 2012-07-15 00:21: +
                +
                +

                Wow, this is wonderful.
                You rock.

                I surely hope you get funding.

                If I didn't live in Brazil, and our currency wasn't so weak, and my income wasn't so low, I would definitely donate some dozens of dollars.

                Keep the good work

                +
                +
                +
                +
                + + Tom wrote on 2012-07-15 19:02: +
                +
                +

                I would like to see how this compares to the Phalanger project. Which runs PHP in the .NET runtime.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-07-15 19:05: +
                +
                +

                About phalanger: the short answer is that I don't have windows and comparisons on mono would be a bit ingenuine. The longer answer is that I don't expect phalanger to particularly excel compared to Zend.

                For example compare the performance of IronPython and CPython. The same reasons apply as they do towards JVM or Parrot - this is IMO not the right way for dynamic languages.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-07-15 20:16: +
                +
                +

                Does the Zend test include APC as well? That's the current standard way to run php scripts...

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-07-15 20:29: +
                +
                +

                Yes, although APC does not change anything in *this* set of benchmarks, precisely because you run everything in-process (within the same interpreter instance even).

                +
                +
                +
                +
                + + Reini Urban wrote on 2012-07-16 16:25: +
                +
                +

                Love this effort and esp. the benchmarks! Great work

                Referring to your mentioning of JVM and parrot:

                You consider as disadvantage to be tied to an existing set of VM opcodes to implement many languages. You were talking about .NET (which had to add Iron-style dynamic reflection later) or the JVM.

                parrot already has all the functionality the JVM or .NET was missing and even more (e.g. dynamic types loadable as plugins) and considers it as advantage to share opcodes and bytecode libraries across different languages.

                But parrot cannot compete with your speed yet.

                +
                +
                +
                +
                + + SM wrote on 2012-07-16 17:36: +
                +
                +

                Very interesting project. It would be nice if you used a recent version of PHP for comparisons - 5.3.2 is over 2 years old and one version behind. Try something like 5.4.4.

                +
                +
                +
                +
                + + Reinis I. wrote on 2012-07-18 20:59: +
                +
                +

                > JS is kind of useless without a browser

                This would have been more true before Node.js, but now it's false.

                +
                +
                +
                +
                + + Arne Babenhauserheide wrote on 2012-07-18 22:18: +
                +
                +

                Wow, 1.5x to 20x faster than a PHP-compiler and 7x to 100x faster than PHP itself… congrats!

                +
                +
                +
                +
                + + Anonymous wrote on 2012-07-24 11:01: +
                +
                +

                Offtopic: not trying to sound offensive or pushy, but what happened to numpypy development? I'm regularly checking https://buildbot.pypy.org/numpy-status/latest.html, and it looks like its development is stale for several months.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-07-24 11:06: +
                +
                +

                @Anonymous not much. I'll write a non-progress blog post some time soon.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-07-24 11:46: +
                +
                +

                @Fijal
                Thank you!

                +
                +
                +
                +
                + + Dima Tisnek wrote on 2012-08-08 09:33: +
                +
                +

                Awesome proof of concept!

                Can you post memory footprint comparison, please?

                And perhaps a quick overview what these test cases cover, arithmetic, function call overhead, dynamic language features?

                Thanks for your hard work, without likes of you OSS would never exist!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-02-03 15:15: +
                +
                +

                Just in case anyone *is* interested in implementing PHP on the Parrot Virtual Machine, you don't have to tie yourself to the PVM bytecodes.

                You can write your PHP compiler entirely in NQP (Not Quite Perl) which in turn produces parrot bytecode for you.

                This is important for two reasons:

                First, NQP is a mid level language, and is relatively easy to write in, and doesn't require you to know anything at all about the PVM.

                Second, although NQP *presently* only targets PVM, there's an in-progress backend which targets the Java Virtual Machine! Early benchmarks suggest that it is already faster than perl5, and there are many optimizations and speedups to come.

                Thus, if you were to write a PHP compiler in NQP, you could target either the Parrot Virtual machine, or (in the future) the Java virtual machine.

                +
                +
                +
                +
                + + Unknown wrote on 2013-02-03 15:16: +
                +
                +

                Just in case anyone *is* interested in implementing PHP on the Parrot Virtual Machine, you don't have to tie yourself to the PVM bytecodes.

                You can write your PHP compiler entirely in NQP (Not Quite Perl) which in turn produces parrot bytecode for you.

                This is important for two reasons:

                First, NQP is a mid level language, and is relatively easy to write in, and doesn't require you to know anything at all about the PVM.

                Second, although NQP *presently* only targets PVM, there's an in-progress backend which targets the Java Virtual Machine! Early benchmarks suggest that it is already faster than perl5, and there are many optimizations and speedups to come.

                Thus, if you were to write a PHP compiler in NQP, you could target either the Parrot Virtual machine, or (in the future) the Java virtual machine.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/07/py3k-status-update-5-359698189825543897.html b/posts/2012/07/py3k-status-update-5-359698189825543897.html new file mode 100644 index 000000000..ed8290858 --- /dev/null +++ b/posts/2012/07/py3k-status-update-5-359698189825543897.html @@ -0,0 +1,398 @@ + + + + + +Py3k status update #5 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #5

                + + + +
                +

                This is the fifth status update about our work on the py3k branch, which we
                +can work on thanks to all of the people who donated to the py3k proposal.

                +

                Apart from the usual "fix shallow py3k-related bugs" part, most of my work in
                +this iteration has been to fix the bootstrap logic of the interpreter, in
                +particular to setup the initial sys.path.

                +

                Until a few weeks ago, the logic to determine sys.path was written entirely
                +at app-level in pypy/translator/goal/app_main.py, which is automatically
                +included inside the executable during translation. The algorithm is more or
                +less like this:

                +
                  +
                1. find the absolute path of the executable by looking at sys.argv[0]
                  +and cycling through all the directories in PATH +
                2. +
                3. starting from there, go up in the directory hierarchy until we find a
                  +directory which contains lib-python and lib_pypy +
                4. +
                +

                This works fine for Python 2 where the paths and filenames are represented as
                +8-bit strings, but it is a problem for Python 3 where we want to use unicode
                +instead. In particular, whenever we try to decode an 8-bit string into a
                +unicode string, PyPy asks the _codecs built-in module to find the suitable
                +codec. Then, _codecs tries to import the encodings package, to list
                +all the available encodings. encodings is a package of the standard
                +library written in pure Python, so it is located inside
                lib-python/3.2. But at this point in time we have yet to add
                lib-python/3.2 to sys.path, so the import fails. Bootstrap problem!

                +

                The hard part was to find the problem: since it is an error which happens so
                +early, the interpreter is not even able to display a traceback, because it
                +cannot yet import traceback.py. The only way to debug it was through some
                +carefully placed print statements and the help of gdb. Once the problem
                +was found, the solution was as easy as moving part of the logic to RPython,
                +where we don't have bootstrap problems.

                +

                Once the problem was fixed, I was able to finally run all the CPython tests
                +against the compiled PyPy. As expected there are lots of failures, and fixing
                +them will be the focus of my work over the next months.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-07-10 17:10: +
                +
                +

                Would be nice to have a PyPy distribution embedded in OpenOffice 3.4.2

                +
                +
                +
                +
                + + haypo wrote on 2012-07-11 10:18: +
                +
                +

                I solved a similar issue in Python 3.2. Python 3 did use the wrong encoding to encode/decode filenames. When I tried to use the filesystem encoding instead, I had an ugly bootstrap issue with encodings implemented in Python (whereas ASCII, latin1 and utf-8 are implemented in C with a fast-path).

                The solution is to use C function to encode to/decode from the locale encoding, because the filesystem encoding is the locale encoding. mbstowcs() and wcstombs() are used until the Python codec machinery is ready.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-07-13 15:58: +
                +
                +

                Did you try to compare PyPy to Pythran? According to his author, Pythran is on some benchmarks 30x faster than PyPy: https://linuxfr.org/users/serge_ss_paille/journaux/pythran-python-c#comment-1366988

                see also the manual here: https://github.com/serge-sans-paille/pythran/blob/master/MANUAL

                What do you think of this approach of translating Python to C++ ?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-07-13 17:54: +
                +
                +

                @Anonymous - there is extremely little point in comparing python with whatever-looks-like-python-but-is-not. It's beyond the scope of this blog for sure.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-07-13 21:11: +
                +
                +

                To be fair to @Anonymous, the pypy developers commonly compare pypy to C in benchmarks so it's not so unreasonable. The point is only that one should understand that they are different languages, not that all comparisons between languages are pointless.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-07-13 21:19: +
                +
                +

                Oh yes sure. It's as productive to compare pypy to shedskin as it is to compare pypy with g77. It still *is* or might be a valuable comparison, but it is important to keep in mind that those languages are different.

                +
                +
                +
                +
                + + Unknown wrote on 2012-08-13 17:30: +
                +
                +

                Any news on the py3k side?

                That’s actually what’s most interesting to me on a practical level and it would be nice to know how long it will still take till I can test it :)

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2012-08-14 10:06: +
                +
                +

                @arne due to EuroPython and some personal issues not much has happened on the py3k side in the past month.

                It is hard to give estimates about when things will be ready, because it depends a lot on how much time I'll be able to dedicate on it. At this point, most of the major features are implemented and I am fixing all the smaller ones which are highlighted by failing CPython tests. However, sometimes a small feature might take much more time to fix than a big one

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/08/c-objects-in-cppyy-part-1-data-members-1105848719513737614.html b/posts/2012/08/c-objects-in-cppyy-part-1-data-members-1105848719513737614.html new file mode 100644 index 000000000..6ff5c548c --- /dev/null +++ b/posts/2012/08/c-objects-in-cppyy-part-1-data-members-1105848719513737614.html @@ -0,0 +1,838 @@ + + + + + +C++ objects in cppyy, part 1: Data Members | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                C++ objects in cppyy, part 1: Data Members

                + + + +
                +

                The cppyy module makes it possible to call into C++ from PyPy through the +Reflex package. +Documentation and setup instructions are +available here. +Recent work has focused on STL, low-level buffers, and code quality, but also +a lot on pythonizations for the +CINT backend, which is +mostly for High Energy Physics (HEP) use only. +A +previous posting walked +through the high-level structure and organization of the module, where it was +argued why it is necessary to write cppyy in RPython and generate bindings at +run-time for the best performance. +This posting details how access to C++ data structures is provided and is part +of a series of 3 postings on C++ object representation in Python: the second +posting will be about method dispatching, the third will tie up several odds +and ends by showing how the choices presented here and in part 2 work together +to make features such as auto-casting possible. + + +

                +

                Wrapping Choices

                + +

                Say we have a plain old data type (POD), which is the simplest possible +data structure in C++. +Like for example: + +

                +
                    struct A {
                +        int    m_i;
                +        double m_d;
                +    };
                + +

                What should such a POD look like when represented in Python? +Let's start by looking at a Python data structure that is functionally +similar, in that it also carries two public data members of the desired +types. +Something like this: + +

                +
                    class A(object):
                +        def __init__(self):
                +            self.m_i = 0
                +            self.m_d = 0.
                + +

                Alright, now how to go about connecting this Python class with the former +C++ POD? +Or rather, how to connect instances of either. +The exact memory layout of a Python +A +instance is up to Python, and likewise the layout of a C++ +A instance is up +to C++. +Both layouts are implementation details of the underlying language, language +implementation, language version, and the platform used. +It should be no surprise then, that for example an +int in C++ looks +nothing like a +PyIntObject, even +though it is perfectly possible, in both cases, to point out in memory where +the integer value is. +The two representations can thus not make use of the same block of memory +internally. +However, the requirement is that the access to C++ from Python looks and feels +natural in its use, not that the mapping is exact. +Another requirement is that we want access to the actual object from both +Python and C++. +In practice, it is easier to provide natural access to C++ from Python than +the other way around, because the choices of memory layout in C++ are far more +restrictive: the memory layout defines the access, as the actual class +definition is gone at run-time. +The best choice then, is that the Python object will act as a proxy to the C++ +object, with the actual data always being in C++. + +

                +

                From here it follows that if the +m_i data member +lives in C++, then Python needs some kind of helper to access it. +Conveniently, since version 2.2, Python has a +property construct +that can take a getter and setter function that are called when the property +is used in Python code, and present it to the programmer as if it were a data +member. +So we arrive at this (note how the +property instance +is a variable at the class level): + +

                +
                    class A(object):
                +        def __init__(self):
                +            self._cppthis = construct_new_A()
                +        m_i = property(get_m_i, set_m_i)
                +        m_d = property(get_m_d, set_m_d)
                + +

                The +construct_new_A +helper is not very interesting (the reflection layer can provide for it +directly), and methods are a subject for part 2 of this posting, so focus on +get_m_i +and set_m_i. +In order for the getter to work, the method needs to have access to the C++ +instance for which the Python object is a proxy. +On access, Python will call the getter function with the proxy instance for +which it is called. +The proxy has a +_cppthis data +member from which the C++ instance can be accessed (think of it as a pointer) +and all is good, at least for +m_i. +The second data member +m_d, however, +requires some more work: it is located at some offset into +_cppthis. +This offset can be obtained from the reflection information, which lets the +C++ compiler calculate it, so details such as +byte padding +are fully accounted for. +Since the setter also needs the offset, and since both share some more details +such as the containing class and type information of the data member, it is +natural to create a custom property class. +The getter and setter methods then become bound methods of an instance of that +custom property, +CPPDataMember, and +there is one such instance per data member. +Think of something along these lines: + +

                +
                    def make_datamember(cppclass, name):
                +        cppdm = cppyy.CPPDataMember(cppclass, name)
                +        return property(cppdm.get, cppdm.set)
                + +where the +make_datamember +function replaces the call to +property in the +class definition above. + +

                Now hold on a minute! +Before it was argued that Python and C++ can not share the same underlying +memory structure, because of choices internal to the language. +But if on the Python side choices are being made by the developer of the +language bindings, that is no longer a limitation. +In other words, why not go through e.g. the Python extension API, and do +this: + +

                +
                    struct A_pyproxy {
                +        PyObject_HEAD
                +        int    m_i;
                +        double m_d;
                +    };
                + +

                Doing so would save on +malloc overhead and remove +a pointer indirection. +There are some technical issues specific to PyPy for such a choice: there is +no such thing as +PyPyObject_HEAD +and the layout of objects is not a given as that is decided only at +translation time. +But assume that those issues can be solved, and also accept that there is no +problem in creating structure definitions like this at run-time, since the +reflection layer can provide both the required size and access to the +placement +new operator +(compare e.g. CPython's +struct module). +There is then still a more fundamental problem: it must be possible to take +over ownership in Python from instances created in C++ and vice-versa. +With a proxy scheme, that is trivial: just pass the pointer and do the +necessary bookkeeping. +With an embedded object, however, not every use case can be implemented: e.g. +if an object is created in Python, passed to C++, and deleted in C++, it +must have been allocated independently. +The proxy approach is therefore still the best choice, although embedding +objects may provide for optimizations in some use cases. + + +

                +

                Inheritance

                + +

                The next step, is to take a more complicated C++ class, one with inheritance +(I'm leaving out details such as constructors etc., for brevity): + +

                +
                    class A {
                +    public:
                +        virtual ~A() {}
                +        int    m_i;
                +        double m_d;
                +    };
                +
                +    class B : public A {
                +    public:
                +        virtual ~B() {}
                +        int    m_j;
                +    };
                + +

                From the previous discussion, it should already be clear what this will look +like in Python: + +

                +
                    class A(object):
                +        def __init__(self):
                +            self._cppthis = construct_new_A()
                +        m_i = make_datamember('A', 'm_i')
                +        m_d = make_datamember('A', 'm_d')
                +
                +    class B(A):
                +        def __init__(self):
                +            self._cppthis = construct_new_B()
                +        m_j = make_datamember('B', 'm_j')
                + +

                There are some minor adjustments needed, however. +For one, the offset of the +m_i data member +may be no longer zero: it is possible that a virtual function dispatch table +(vtable) +pointer is added at the beginning of +A (an alternative +is to have the vtable pointer at the end of the object). +But if +m_i is handled the +same way as +m_d, with the +offset provided by the compiler, then the compiler will add the bits, if any, +for the vtable pointer and all is still fine. +A real problem could come in however, with a call of the +m_i property on +an instance of +B: in that case, +the _cppthis +points to a B +instance, whereas the getter/setter pair expect an +A instance. +In practice, this is usually not a problem: compilers will align +A and +B and calculate +an offset for +m_j from the start +of A. +Still, that is an implementation detail (even though it is one that can be +determined at run-time and thus taken advantage of by the JIT), so it can not +be relied upon. +The m_i getter +thus needs to take into account that it can be called with a derived type, +and so it needs to add an additional offset. +With that modification, the code looks something like this (as you would have +guessed, this is getting more and more into pseudo-code territory, although it +is conceptually close to the actual implementation in cppyy): + +

                +
                    def get_m_i(self):
                +        return int(self._cppthis + offset(A, m_i) + offset(self.__class__, A))
                + +

                Which is a shame, really, because the offset between +B and +A is going +to be zero most of the time in practice, and the JIT can not completely +elide +the offset calculation (as we will see later; it is easy enough to elide if +self.__class__ is +A, though). +One possible solution is to repeat the properties for each derived class, i.e. +to have a +get_B_m_i etc., but +that looks ugly on the Python side and anyway +does not work in all cases: e.g. with multiple inheritance where there are +data members with the same name in both bases, or if +B itself has a +public data member called +m_i that shadows +the one from A. +The optimization then, is achieved by making +B in charge of the +offset calculations, by making +offset a method of +B, like so: + +

                +
                    def get_m_i(self):
                +        return int(self._cppthis + offset(A, m_i) + self.offset(A))
                + +

                The insight is that by scanning the inheritance hierarchy of a derived +class like B, you +can know statically whether it may sometimes need offsets, or whether the +offsets are always going to be zero. +Hence, if the offsets are always zero, the method +offset on +B will +simply return the literal +0 as its +implementation, with the JIT taking care of the rest through inlining and +constant folding. +If the offset could be non-zero, then the method will perform an actual +calculation, and it will let the JIT elide the call only if possible. + + +

                +

                Multiple Virtual Inheritance

                + +

                Next up would be multiple inheritance, but that is not very interesting: we +already have the offset calculation between the actual and base class, which +is all that is needed to resolve any multiple inheritance hierarchy. +So, skip that and move on to multiple virtual inheritance. +That that is going to be a tad more complicated will be clear if you show the +following code snippet to any old C++ hand and see how they respond. +Most likely you will be told: "Don't ever do that." +But if code can be written, it will be written, and so for the sake of the +argument, what would this look like in Python: + +

                +
                    class A {
                +    public:
                +        virtual ~A() {}
                +        int m_a;
                +    };
                +
                +    class B : public virtual A {
                +    public:
                +        virtual ~B() {}
                +        int m_b;
                +    };
                +
                +    class C : public virtual A {
                +    public:
                +        virtual ~C() {}
                +        int m_c;
                +    };
                +
                +    class D : public virtual B, public virtual C {
                +    public:
                +        virtual ~D() {}
                +        int m_d;
                +    };
                + +

                Actually, nothing changes from what we have seen so far: the scheme as laid +out above is fully sufficient. +For example, D +would simply look like: + +

                +
                    class D(B, C):
                +        def __init__(self):
                +            self._cppthis = construct_new_D()
                +        m_d = make_datamember('D', 'm_d')
                + +

                Point being, the only complication added by the multiple virtual +inheritance, is that navigation of the C++ instance happens with pointers +internal to the instance rather than with offsets. +However, it is still a fixed offset from any location to any other location +within the instance as its parts are laid out consecutively in memory (this is +not a requirement, but it is the most efficient, so it is what is used in +practice). +But what you can not do, is determine the offset statically: you need a live +(i.e. constructed) object for any offset calculations. +In Python, everything is always done dynamically, so that is of itself not a +limitation. +Furthermore, +self is already +passed to the offset calculation (remember that this was done to put the +calculation in the derived class, to optimize the common case of zero +offset), thus a live C++ instance is there precisely when it is needed. +The call to the offset calculation is hard to elide, since the instance will +be passed to a C++ helper and so the most the JIT can do is guard on the +instance's memory address, which is likely to change between traces. +Instead, explicit caching is needed on the base and derived types, allowing +the JIT to elide the lookup in the explicit cache. + + +

                +

                Static Data Members and Global Variables

                + +

                That, so far, covers all access to instance data members. +Next up are static data members and global variables. +A complication here is that a Python +property needs to +live on the class in order to work its magic. +Otherwise, if you get the property, it will simply return the getter function, +and if you set it, it will disappear. +The logical conclusion then, is that a +property +representing a static or global variable, needs to live on the class of the +class, or the metaclass. +If done directly though, that would mean that every static data member is +available from every class, since all Python classes have the same metaclass, +which is class +type (and which is +its own metaclass). +To prevent that from happening and because +type is actually +immutable, each proxy class needs to have its own custom metaclass. +Furthermore, since static data can also be accessed on the instance, the +class, too, gets a +property object +for each static data member. +Expressed in code, for a basic C++ class, this looks as follows: + +

                +
                    class A {
                +    public:
                +        static int s_i;
                +    };
                + +

                Paired with some Python code such as this, needed to expose the static +variable both on the class and the instance level: + +

                +
                    meta_A = type(CppClassMeta, 'meta_A', [CPPMetaBase], {})
                +    meta_A.s_i = make_datamember('A', 's_i')
                +
                +    class A(object):
                +        __metaclass__ = meta_A
                +        s_i = make_datamember('A', 's_i')
                + +

                Inheritance adds no complications for the access of static data per se, but +there is the issue that the metaclasses must follow the same hierarchy as the +proxy classes, for the Python method resolution order (MRO) to work. +In other words, there are two complete, parallel class hierarchies that map +one-to-one: a hierarchy for the proxy classes and one for their metaclasses. + +

                +

                A parallel class hierarchy is used also in other highly dynamic, +object-oriented environments, such as for example +Smalltalk. +In Smalltalk as well, class-level constructs, such as class methods and data +members, are defined for the class in the metaclass. +A metaclass hierarchy has further uses, such as lazy loading of nested +classes and member templates (this would be coded up in the base class of all +metaclasses: +CPPMetaBase), and +makes it possible to distribute these over different reflection libraries. +With this in place, you can write Python code like so: + +

                +
                    >>>> from cppyy.gbl import A
                +    >>>> a = A()
                +    >>>> a.s_i = 42
                +    >>>> print A.s_i == a.s_i
                +    True
                +    >>>> # etc.
                + +

                The implementation of the getter for +s_i is a lot +easier than for instance data: the static data lives at a fixed, global, +address, so no offset calculations are needed. +The same is done for global data or global data living in namespaces: +namespaces are represented as Python classes, and global data are implemented +as properties on them. +The need for a metaclass is one of the reasons why it is easier for namespaces +to be classes: module objects are too restrictive. +And even though namespaces are not modules, you still can, with +some limitations, +import from +them anyway. + +

                +

                It is common that global objects themselves are pointers, and therefore it +is allowed that the stored +_cppthis is not a +pointer to a C++ object, but rather a pointer to a pointer to a C++ object. +A double pointer, as it were. +This way, if the C++ code updates the global pointer, it will automatically +reflect on the Python side in the proxy. +Likewise, if on the Python side the pointer gets set to a different variable, +it is the pointer that gets updated, and this will be visible on the C++ side. +In general, however, the same caveat as for normal Python code applies: in +order to set a global object, it needs to be set within the scope of that +global object. +As an example, consider the following code for a C++ namespace +NS with +global variable +g_a, which behaves +the same as Python code for what concerns the visibility of changes to the +global variable: + +

                +
                    >>>> from cppyy.gbl import NS, A
                +    >>>> from NS import g_a
                +    >>>> g_a = A(42)                     # does NOT update C++ side
                +    >>>> print NS.g_a.m_i
                +    13                                   # the old value happens to be 13
                +    >>>> NS.g_a = A(42)                  # does update C++ side
                +    >>>> print NS.g_a.m_i
                +    42
                +    >>>> # etc.
                + + +

                Conclusion

                + +

                That covers all there is to know about data member access of C++ classes in +Python through a reflection layer! +A few final notes: RPython does not support metaclasses, and so the +construction of proxy classes (code like +make_datamember +above) happens in Python code instead. +There is an overhead penalty of about 2x over pure RPython code associated +with that, due to extra guards that get inserted by the JIT. +A factor of 2 sounds like a lot, but the overhead is tiny to begin with, and +2x of tiny is still tiny and it's not easy to measure. +The class definition of the custom property, +CPPDataMember, is +in RPython code, to be transparent to the JIT. +The actual offset calculations are in the reflection layer. +Having the proxy class creation in Python, with structural code in RPython, +complicates matters if proxy classes need to be constructed on-demand. +For example, if an instance of an as-of-yet unseen type is returned by a +method. +Explaining how that is solved is a topic of part 2, method calls, so stay +tuned. + +

                +

                This posting laid out the reasoning behind the object representation of C++ +objects in Python by cppyy for the purpose of data member access. +It explained how the chosen representation of offsets gives rise to a very +pythonic representation, which allows Python introspection tools to work as +expected. +It also explained some of the optimizations done for the benefit of the JIT. +Next up are method calls, which will be described in part 2.

                +
                +

                Comments

                +
                +
                +
                + + Sindwiller wrote on 2012-09-12 13:50: +
                +
                +

                On a related note, do you know when Reflex will discard gccxml? I'm using Boost.Python with Ogre3D (among other things) right now and I'm looking into the pypy option. Gccxml, however, complains about some C++11 related stuff (which is somewhat odd, to the least, as I don't expose any Ogre-internal class or anything like that).

                +
                +
                +
                +
                + + Wim Lavrijsen wrote on 2013-02-27 23:28: +
                +
                +

                Reflex itself will be discarded in favor of clang from llvm. That is, however, still experimental, but we're getting there.

                +
                +
                +
                +
                + + heemanshu bhalla wrote on 2013-10-03 14:18: +
                +
                +

                Complete explanation of static data members with classes and program go to link :-

                https://geeksprogrammings.blogspot.in/2013/09/static-data-members.html

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/08/cffi-release-03-4740491796308953732.html b/posts/2012/08/cffi-release-03-4740491796308953732.html new file mode 100644 index 000000000..cffb80d02 --- /dev/null +++ b/posts/2012/08/cffi-release-03-4740491796308953732.html @@ -0,0 +1,342 @@ + + + + + +CFFI release 0.3 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                CFFI release 0.3

                + + + +
                +

                Hi everybody,

                +

                We released CFFI 0.3. This is the first release that supports more +than CPython 2.x :-)

                +
                  +
                • CPython 2.6, 2.7, and 3.x are supported (3.3 definitely, but maybe 3.2 or earlier too)
                • +
                • +PyPy trunk is supported.
                • +
                +

                In more detail, the main news is:

                +
                  +
                • support for PyPy. You need to get a trunk version of PyPy, which +comes with the built-in module _cffi_backend to use with the CFFI +release. For testing, you can download the Linux 32/64 versions of +PyPy trunk. The OS/X and Windows versions of _cffi_backend +are not tested at all so far, so probably don't work yet.
                • +
                • support for Python 3. It is unknown which exact version is +required; probably 3.2 or even earlier, but we need 3.3 to run the +tests. The 3.x version is not a separate source; it runs out of the same sources. Thanks Amaury for starting this port.
                • +
                • the main change in the API is that you need to use ffi.string(cdata) +instead of str(cdata) or unicode(cdata). The motivation for this +change was the Python 3 compatibility. If your Python 2 code used to +contain str(<cdata 'char *'>), it would interpret the memory content +as a null-terminated string; but on Python 3 it would just return a +different string, namely "<cdata 'char *'>", and proceed without even +a crash, which is bad. So ffi.string() solves it by always returning +the memory content as an 8-bit string (which is a str in Python 2 and +a bytes in Python 3).
                • +
                • other minor API changes are documented at +https://cffi.readthedocs.org/ (grep for version 0.3).
                • +
                +

                Upcoming work, to be done before release 1.0:

                +
                  +
                • expose to the user the module cffi.model in a possibly refactored +way, for people that don't like (or for some reason can't easily use) +strings containing snippets of C declarations. We are thinking about +refactoring it in such a way that it has a ctypes-compatible +interface, to ease porting existing code from ctypes to cffi. Note +that this would concern only the C type and function declarations, not +all the rest of ctypes.
                • +
                • CFFI 1.0 will also have a corresponding PyPy release. We are thinking +about calling it PyPy 2.0 and including the whole of CFFI (instead of +just the _cffi_backend module like now). In other words it will +support CFFI out of the box --- we want to push forward usage of CFFI +in PyPy :-) +
                • +
                +

                Cheers,

                +

                Armin Rigo and Maciej Fijałkowski

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/08/hello-everyone-5492331040603503642.html b/posts/2012/08/hello-everyone-5492331040603503642.html new file mode 100644 index 000000000..1f2083b88 --- /dev/null +++ b/posts/2012/08/hello-everyone-5492331040603503642.html @@ -0,0 +1,328 @@ + + + + + +NumPyPy non-progress report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPyPy non-progress report

                + + + +
                +
                +

                Hello everyone.

                +

                Not much has happened in the past few months with numpypy development. Part +of the reason was that I was doing other stuff, part of the reason was +various unexpected visa-related admin, part of the reason was EuroPython, +and part was a long-awaited holiday.

                +

                The thing that's maybe worth mentioning is that it does not mean the donations +disappeared in the mist. PyPy developers are being paid to work on NumPyPy on +an hourly basis - that means if I decide to take holidays or work on something +else, the money is simply staying in the account until later.

                +

                Thanks again for all the donations, I hope to get back to this topic soon!

                +

                Cheers,
                +fijal

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Stephen Weber wrote on 2012-08-09 00:37: +
                +
                +

                Thanks for the non-update, I trust you that all is well. Rest helps us work better!

                +
                +
                +
                +
                + + Unknown wrote on 2012-08-13 13:25: +
                +
                +

                Please don’t worry too much about the money lost/not-lost. The important part is that you enjoy the programming. For you, because that’s more fun and for us because more fun for the programmer means better code.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/08/multicore-programming-in-pypy-and-6595343388141556320.html b/posts/2012/08/multicore-programming-in-pypy-and-6595343388141556320.html new file mode 100644 index 000000000..474294b2e --- /dev/null +++ b/posts/2012/08/multicore-programming-in-pypy-and-6595343388141556320.html @@ -0,0 +1,745 @@ + + + + + +Multicore Programming in PyPy and CPython | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Multicore Programming in PyPy and CPython

                + + + +
                +

                Hi all,

                +

                This is a short "position paper" kind of post about my view (Armin +Rigo's) on the future of multicore programming in high-level languages. +It is a summary of the +keynote presentation at EuroPython. As I learned by talking with people +afterwards, I am not a good enough speaker to manage to convey a deeper +message in a 20-minute talk. I will try instead to convey it in a +250-line post...

                +

                This is about three points:

                +
                  +
                1. We often hear about people wanting a version of Python running without +the Global Interpreter Lock (GIL): a "GIL-less Python". But what we +programmers really need is not just a GIL-less Python --- we need a +higher-level way to write multithreaded programs than directly using +threads and locks. One way is Automatic Mutual Exclusion (AME), which +would give us an "AME Python".
                2. +
                3. A good enough Software Transactional Memory (STM) system can be used +as an internal tool to do that. +This is what we are building into an "AME PyPy".
                4. +
                5. The picture is darker for CPython, though there is a way too. The +problem is that when we say STM, we think about either GCC 4.7's STM +support, or Hardware Transactional Memory (HTM). However, both +solutions are enough for a "GIL-less CPython", but not +for "AME CPython", due to capacity limitations. For the latter, we +need somehow to add some large-scale STM into the compiler.
                6. +
                +

                Let me explain these points in more detail.

                +
                +

                GIL-less versus AME

                +

                The first point is in favor of the so-called Automatic Mutual Exclusion +approach. The issue with using threads (in any language with or without +a GIL) is that threads are fundamentally non-deterministic. In other +words, the programs' behaviors are not reproducible at all, and worse, +we cannot even reason about them --- it quickly becomes messy. We would +have to consider all possible combinations of code paths and timings, +and we cannot hope to write tests that cover all combinations. This +fact is often documented as one of the main blockers towards writing +successful multithreaded applications.

                +

                We need to solve this issue with a higher-level solution. Such +solutions exist theoretically, and Automatic Mutual Exclusion (AME) is +one of them. The idea of AME is that we divide the execution of each +thread into a number of "atomic blocks". Each block is well-delimited +and typically large. Each block runs atomically, as if it acquired a +GIL for its whole duration. The trick is that internally we use +Transactional Memory, which is a technique that lets the system run the +atomic blocks from each thread in parallel, while giving the programmer +the illusion that the blocks have been run in some global serialized +order.

                +

                This doesn't magically solve all possible issues, but it helps a lot: it +is far easier to reason in terms of a random ordering of large atomic +blocks than in terms of a random ordering of lines of code --- not to +mention the mess that multithreaded C is, where even a random ordering +of instructions is not a sufficient model any more.

                +

                What do such atomic blocks look like? For example, a program might +contain a loop over all keys of a dictionary, performing some +"mostly-independent" work on each value. This is a typical example: +each atomic block is one iteration through the loop. By using the +technique described here, we can run the iterations in parallel +(e.g. using a thread pool) but using AME to ensure that they appear to +run serially.

                +

                In Python, we don't care about the order in which the loop iterations +are done, because we are anyway iterating over the keys of a dictionary. +So we get exactly the same effect as before: the iterations still run in +some random order, but --- and that's the important point --- they +appear to run in a +global serialized order. In other words, we introduced parallelism, but +only under the hood: from the programmer's point of view, his program +still appears to run completely serially. Parallelisation as a +theoretically invisible optimization... more about the "theoretically" +in the next paragraph.

                +

                Note that randomness of order is not fundamental: there are techniques +building on top of AME that can be used to force the order of the +atomic blocks, if needed.

                +
                +
                +

                PyPy and STM/AME

                +

                Talking more precisely about PyPy: the current prototype pypy-stm is +doing precisely this. In pypy-stm, the length of the atomic blocks is +selected in one of two ways: either explicitly or automatically.

                +

                The automatic selection gives blocks corresponding to some small number +of bytecodes, in which case we have merely a GIL-less Python: multiple +threads will appear to run serially, with the execution randomly +switching from one thread to another at bytecode boundaries, just like +in CPython.

                +

                The explicit selection is closer to what was described in the previous +section: someone --- the programmer or the author of some library that +the programmer uses --- will explicitly put with thread.atomic: in +the source, which delimits an atomic block. For example, we can use +it to build a library that can be used to iterate over the keys of a +dictionary: instead of iterating over the dictionary directly, we would +use some custom utility which gives the elements "in parallel". It +would give them by using internally a pool of threads, but enclosing +every handling of an element into such a with thread.atomic block.

                +

                This gives the nice illusion of a global serialized order, and thus +gives us a well-behaving model of the program's behavior.

                +

                Restating this differently, +the only semantical difference between pypy-stm and +a regular PyPy or CPython is that it has thread.atomic, which is a +context manager that gives the illusion of forcing the GIL to not be +released during the execution of the corresponding block of code. Apart +from this addition, they are apparently identical.

                +

                Of course they are only semantically identical if we ignore performance: +pypy-stm uses multiple threads and can potentially benefit from that +on multicore machines. The drawback is: when does it benefit, and how +much? The answer to this question is not immediate. The programmer +will usually have to detect and locate places that cause too many +"conflicts" in the Transactional Memory sense. A conflict occurs when +two atomic blocks write to the same location, or when one block A reads it +and another block B writes it, but B finishes first and commits. A conflict +causes the execution of one atomic block to be aborted and restarted, +due to another block committing. Although the process is transparent, +if it occurs more than occasionally, then it has a negative impact on +performance.

                +

                There is no out-of-the-box perfect solution for solving all conflicts. +What we will need is more tools to detect them and deal with them, data +structures that are made aware of the risks of "internal" conflicts when +externally there shouldn't be one, and so on. There is some work ahead.

                +

                The point here is that from the point of view of the final programmer, +we get conflicts that we should resolve --- but at any point, our +program is correct, even if it may not be yet as efficient as it could +be. This is the opposite of regular multithreading, where programs are +efficient but not as correct as they could be. In other words, as we +all know, we only have resources to do the easy 80% of the work and not +the remaining hard 20%. So in this model we get a program that has 80% +of the theoretical maximum of performance and it's fine. In the regular +multithreading model we would instead only manage to remove 80% of the +bugs, and we are left with obscure rare crashes.

                +
                +
                +

                CPython and HTM

                +

                Couldn't we do the same for CPython? The problem here is that +pypy-stm is implemented as a transformation step during translation, +which is not directly possible in CPython. Here are our options:

                +
                  +
                • We could review and change the C code everywhere in CPython.
                • +
                • We use GCC 4.7, which supports some form of STM.
                • +
                • We wait until Intel's next generation of CPUs comes out ("Haswell") +and use HTM.
                • +
                • We write our own C code transformation within a compiler (e.g. LLVM).
                • +
                +

                I will personally file the first solution in the "thanks but no thanks" +category. If anything, it will give us another fork of CPython that +will painfully struggle to keep not more than 3-4 versions behind, and +then eventually die. It is very unlikely to be ever merged into the +CPython trunk, because it would need changes everywhere. Not to +mention that these changes would be very experimental: tomorrow we might +figure out that different changes would have been better, and have to +start from scratch again.

                +

                Let us turn instead to the next two solutions. Both of these solutions +are geared toward small-scale transactions, but not long-running ones. +For example, I have no clue how to give GCC rules about performing I/O +in a transaction --- this seems not supported at all; and moreover +looking at the STM library that is available so far to be linked with +the compiled program, it assumes short transactions only. By contrast, +when I say "long transaction" I mean transactions that can run for 0.1 +seconds or more. To give you an idea, in 0.1 seconds a PyPy program +allocates and frees on the order of ~50MB of memory.

                +

                Intel's Hardware Transactional Memory solution is both more flexible and +comes with a stricter limit. In one word, the transaction boundaries +are given by a pair of special CPU instructions that make the CPU enter +or leave "transactional" mode. If the transaction aborts, the CPU +cancels any change, rolls back to the "enter" instruction and causes +this instruction to return an error code instead of re-entering +transactional mode (a bit like a fork()). The software then detects +the error code. Typically, if transactions are rarely cancelled, it is +fine to fall back to a GIL-like solution just to redo these cancelled +transactions.

                +

                About the implementation: this is done by recording all the changes that +a transaction wants to do to the main memory, and keeping them invisible +to other CPUs. This is "easily" achieved by keeping them inside this +CPU's local cache; rolling back is then just a matter of discarding a +part of this cache without committing it to memory. From this point of +view, it is a safe bet that we are actually talking about the +regular per-core Level 1 and Level 2 caches --- so any transaction that +cannot fully store its read and written data in the 64+256KB of the L1+L2 +caches will abort.

                +

                So what does it mean? A Python interpreter overflows the L1 cache of +the CPU very quickly: just creating new Python function frames takes a +lot of memory (on the order of magnitude of 1/100 of the whole L1 +cache). Adding a 256KB L2 cache into the picture helps, particularly +because it is highly associative and thus avoids a lot of fake conflicts. +However, as long as the HTM support is limited to L1+L2 caches, +it is not going to be enough to run an "AME Python" with any sort of +medium-to-long transaction. It can +run a "GIL-less Python", though: just running a few hundred or even +thousand bytecodes at a time should fit in the L1+L2 caches, for most +bytecodes.

                +

                I would vaguely guess that it will take on the order of 10 years until +CPU cache sizes grow enough for a CPU in HTM mode to actually be able to +run 0.1-second transactions. (Of course in 10 years' time a lot of other +things may occur too, including the whole Transactional Memory model +being displaced by something else.)

                +
                +
                +

                Write your own STM for C

                +

                Let's discuss now the last option: if neither GCC 4.7 nor HTM is +sufficient for an "AME CPython", then we might want to +write our own C compiler patch (as either extra work on GCC 4.7, or an +extra pass to LLVM, for example).

                +

                We would have to deal with the fact that we get low-level information, +and somehow need to preserve interesting high-level bits through the +compiler up to the point at which our pass runs: for example, whether +the field we read is immutable or not. (This is important because some +common objects are immutable, e.g. PyIntObject. Immutable reads don't +need to be recorded, whereas reads of mutable data must be protected +against other threads modifying them.) We can also have custom code to +handle the reference counters: e.g. not consider it a conflict if +multiple transactions have changed the same reference counter, but just +resolve it automatically at commit time. We are also free to handle I/O +in the way we want.

                +

                More generally, the advantage of this approach over both the current GCC +4.7 and over HTM is that we control the whole process. While this still +looks like a lot of work, it looks doable. It would be possible to come +up with a minimal patch of CPython that can be accepted into core +without too much trouble (e.g. to mark immutable fields and tweak the +refcounting macros), and keep all the cleverness inside the compiler +extension.

                +
                +
                +

                Conclusion

                +

                I would assume that a programming model specific to PyPy and not +applicable to CPython has little chance of catching on, as long as PyPy is +not the main Python interpreter (which looks unlikely to change anytime +soon). Thus as long as only PyPy has AME, it looks like it will not +become the main model of multicore usage in Python. However, I can +conclude with a more positive note than during the EuroPython +conference: it is a lot of work, but there is a more-or-less reasonable +way forward to have an AME version of CPython too.

                +

                In the meantime, pypy-stm is around the corner, and together with +tools developed on top of it, it might become really useful and used. I +hope that in the next few years this work will trigger enough motivation +for CPython to follow the ideas.

                +
                +
                +

                Comments

                +
                +
                +
                + + JohnLenton wrote on 2012-08-09 12:29: +
                +
                +

                A question: does a “donate towards STM/AME in pypy” also count as a donation towards the CPython work? Getting the hooks in CPython to allow exploration and implementation of this seems at least as important as the pypy work. In fact, I think it’s quite a bit more important.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-08-09 12:55: +
                +
                +

                @John: I didn't foresee this development at the start of the year, so I don't know. It's a topic that would need to be discussed internally, likely with feedback from past donators.

                Right now of course I'm finishing the basics of pypy-stm (working on the JIT now), and from there on there is a lot that can be done as pure Python, like libraries of better-suited data structures --- and generally gaining experience that would anyway be needed for CPython's work.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-08-09 15:53: +
                +
                +

                With HTM you don't have to have a one-to-one mapping between your application transactions and the hardware interface. You can also have an STM, that is implemented using HTM. So you may do all the book-keeping yourself in software, but then at commit time use HTM.

                +
                +
                +
                +
                + + Nat Tuck wrote on 2012-08-09 16:37: +
                +
                +

                No. We really do want a GIL-free Python. Even if that means we sometimes need to deal with locks.

                Right now a high end server can have 64 cores. That means that parallel python code could run faster than serial C code.

                STM and other high level abstractions are neat, but they're no substitute for just killing the damn GIL.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-08-09 17:32: +
                +
                +

                What does 'just killing the damn GIL' mean without something like STM? Do you consider it acceptable for Python primitives not to be threadsafe?

                If you intend to run 64 cores, then what is the exact reason you need threading and can't use multiprocessing?

                +
                +
                +
                +
                + + Anonymous wrote on 2012-08-09 19:54: +
                +
                +

                Jesus Christ why don't we all just spend 5 min fiddling with the multiprocessing module and learn how to partition execution and queues like we partition sequences of statements into functions? So sick of GIL articles and the obsession with not learning how to divide up the work and communicate. In some ways the need to recognize narrow channels where relatively small amounts of data are being channeled through relatively intense blocks of execution and create readable, explicit structure around those blocks might actually improve the comprehensibility of some code I've seen. Getting a little tired of seeing so much effort by excellent, essential, dedicated Python devs getting sucked up by users who won't get it.

                I think users are driving this speed-for-free obsession way to far. If anything bugs in a magical system are harder to find than understanding explicit structure and explicit structure that's elegant is neither crufty nor slow. Eventually, no interpreter will save a bad programmer. Are we next going to enable the novice "Pythonista" to forego any knowledge of algorithms?

                We -need- JIT on production systems to get response times down for template processing without micro-caching out the wazoo. These types of services are already parallel by nature of the servers and usually I/O bound except for the few slow parts. Cython already serves such an excellent roll for both C/C++ API's AND speed AND optimizing existing python code with minimal changes. JIT PyPy playing well with Cython would make Python very generally uber. Users who actually get multiprocessing and can divide up the workflow won't want a slower implementation of any other kind. Getting a somewhat good solution for 'free' is not nearly as appealing as the additional headroom afforded by an incremental user cost (adding some strong typing or patching a function to work with pypy/py3k).

                +
                +
                +
                +
                + + Unknown wrote on 2012-08-09 19:59: +
                +
                +

                template processing. lol.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-08-09 21:27: +
                +
                +

                @Anonymous.

                I welcome you to work out how to make pypy translation process parallel using any techniques you described.

                +
                +
                +
                +
                + + Benjamin wrote on 2012-08-10 07:27: +
                +
                +

                I get the overall goals and desires and I think they are fabulous. However, one notion that seems counterintuitive to me is the desire for large atomic operations.

                Aside from the nomenclature (atomic generally means smallest possible), my intuition is that STM would generally operate more efficiently by having fewer roll-backs with small atomic operations and frequent commits. This leads me to assume there is some sort of significant overhead involved with the setup or teardown of the STM 'wrapper'.

                From a broader perspective, I get that understanding interlacing is much easier with larger pieces, but larger pieces of code don't lend themselves to wide distribution across many cores like small pieces do.

                It seems, to me, that you're focusing heavily on the idea of linearly written code magically functioning in parallel and neglecting the idea of simple, low-cost concurrency, which might have a much bigger short-term impact; and which, through use, may shed light on better frameworks for reducing the complexity inherent in concurrency.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-08-10 08:57: +
                +
                +

                @Anonymous: "So you may do all the book-keeping yourself in software, but then at commit time use HTM.": I don't see how (or the point), can you be more explicit or post a link?

                @Anonymous: I'm not saying that STM is the final solution to all problems. Some classes of problems have other solutions that work well so far and I'm not proposing to change them. Big servers can naturally handle big loads just by having enough processes. What I'm describing instead is a pure language feature that may or may not help in particular cases --- and there are other cases than the one you describe where the situation is very different and multiprocessing doesn't help at all. Also, you have to realise that any argument "we will never need feature X because we can work around it using hack Y" is bound to lose eventually: at least some people in some cases will need the clean feature X because the hack Y is too complicated to learn or use correctly.

                @Benjamin: "atomic" actually means "not decomposable", not necessarily "as small as possible". This focus on smallness of transaction IMO is an artefact of last decade's research focus. In my posts I tend to focus on large transaction as a counterpoint: in the use cases I have in mind there is no guarantee that all transactions will be small. Some of them may be, but others not, and this is a restriction. In things like "one iteration through this loop = one transaction", some of these iterations go away and do a lot of stuff.

                +
                +
                +
                +
                + + Unknown wrote on 2012-08-10 18:15: +
                +
                +

                Transactional programming is neat. So are Goroutines and functional-style parallelism. On the other hand, I think that C and C++ (or at least C1x and C++11) get one thing completely right: they don't try to enforce any particular threading model. For some problems (like reference counts, as you mention), you really do want a different model. As long as other languages force me to choose a single model, my big projects will stay in C/C++.

                +
                +
                +
                +
                + + Benjamin wrote on 2012-08-10 21:17: +
                +
                +

                @Armin I'd love to hear your thoughts (benefits, costs, entrenched ideas, etc.) on large vs small transactions at some point. Though I suspect that would be a post unto itself.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-08-10 22:04: +
                +
                +

                @Benjamin: a user program might be optimized to reduce its memory usage, for example by carefully reusing objects instead of throwing them away, finding more memory-efficient constructs, and so on. But in many cases in Python you don't care too much. Similarly, I expect that it's possible to reduce the size of transactions by splitting them up carefully, hoping to get some extras in performance. But most importantly I'd like a system where the programmer didn't have to care overmuch about that. It should still work reasonably well for *any* size, just like a reasonable GC should work for any heap size.

                If I had to describe the main issue I have against HTM, it is that beyond some transaction size we lose all parallelism because it has to fall back on the GIL.

                Well, now that I think about it, it's the same in memory usage: if you grow past the RAM size, the program is suddenly swapping, and performance becomes terrible. But RAM sizes are so far much more generous than maximum hardware transaction sizes.

                +
                +
                +
                +
                + + Unknown wrote on 2012-08-12 08:26: +
                +
                +

                There are two key concurrency patterns to keep in mind when considering Armin's STM work:

                1. Event-loop based applications that spend a lot of time idling waiting for events.

                2. Map-reduce style applications where only the reduce step is particularly prone to resource contention, but the map step is read-heavy (and thus hard to split amongst multiple processes)

                For both of those use cases, splitting out multiple processes often won't pay off due to either the serialisation overhead or the additional complexity needed to make serialisation possible at all.

                Coarse-grained STM, however, should pay off handsomely in both of those scenarios: if the CPU bound parts of the application are touching different data structures, or are only *reading* any shared data, with any writes being batched for later application, then the STM interaction can be built in to the event loop or parallel execution framework.

                Will STM help with threading use cases where multiple threads are simultaneously reading and writing the same data structure? No, it won't. However, such applications don't exploit multiple cores effectively even with free threading, because their *lock* contention will also be high.

                As far as "just kill the GIL" goes, I've already written extensively on that topic: https://python-notes.boredomandlaziness.org/en/latest/python3/questions_and_answers.html#but-but-surely-fixing-the-gil-is-more-important-than-fixing-unicode

                +
                +
                +
                +
                + + klaussfreire wrote on 2012-08-13 23:35: +
                +
                +

                Option 5, implement STM on the operating system. Linux already has COW for processes, imagine COW-MERGE for threads.

                When you start transactional mode, all pages are marked read-only, thread-private and COW. When you commit, dirty pages are merged with the processes' page maps, unless conflicts arise (the process already has dirty pages).

                A simple versioning system and version checks would take care of conflict detection.

                I just wonder how difficult it would be designing applications that can run on this model (conflicts at page level vs object level).

                Thread-private allocation arenas are entirely possible to avoid new objects from creating conflicts all the time, so it would be a matter of making read-only use of objects really read-only, something I've done incrementally in patches already. Reference counts have to be externalized (taken out of PyObject), for instance.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-08-14 09:12: +
                +
                +

                @klaussfreire: that approach is a cool hack but unlikely to work in practice in a language like Python, because the user doesn't control at all what objects are together with what other objects on the same pages. Even with the reference counts moved out of the way I guess you'd have far too many spurious conflicts.

                +
                +
                +
                +
                + + klaussfreire wrote on 2012-08-14 15:43: +
                +
                +

                @Armin, well, Python itself does know.

                In my half-formed idea in my head, python would use thread-local versions of the integer pool and the various free lists, and allocation of new objects would be served from an also thread-local arena (while in a transaction).

                Read-write access to shared objects, yes, would be a little bit unpredictable. That's why I was wondering how good (if at all) it would work for Python.

                +
                +
                +
                +
                + + Wim Lavrijsen wrote on 2012-08-14 20:18: +
                +
                +

                @klaussfreire

                is this perhaps what you are looking for: https://plasma.cs.umass.edu/emery/grace

                Cheers,
                Wim

                +
                +
                +
                +
                + + klaussfreire wrote on 2012-08-14 21:50: +
                +
                +

                Damn. And I thought I was being original. I can already spot a few key places where kernel-based support would be superior (not only raw performance, but also transparency), but in general, that's exactly what I was talking about, sans transaction retrials.

                +
                +
                +
                +
                + + Mark D. wrote on 2012-08-16 04:23: +
                +
                +

                0.1 second transactions? With hardware transactional memory the general idea is transactions about ten thousand times smaller. A dozen memory modifications maybe.

                It would be prohibitively expensive, hardware wise, to implement conflict detection for transactions much larger than that, to say nothing of the occurrence of conflicts requiring rollback and re-execution if such enormously large transactions were executed optimistically.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2012-08-19 11:58: +
                +
                +

                @Mark D.: I don't know if "a dozen memory modification" comes from real work in the field or is just a guess. My own guess would be that Intel Haswell supports easily hundreds of modifications, possibly thousands. Moreover the built-in cache coherency mechanisms should be used here too, in a way that scales with the cache size; this means they should not be "prohibitively expensive".
                Of course I know that in 0.1 seconds we do far more than thousands of writes, but I think that nothing strictly limits the progression of future processors in that respect.

                The occurrence of conflicts in large transactions depends on two factors. First, "true conflicts", which is the hard problem, but which I think should be relatively deterministic and debuggable with new tools. Second, "false conflicts", which is the HTM/STM mechanism detecting a conflict when there is none. To handle large transactions this should occur with a probability very, very close to 0% for each memory access. In pypy-stm it is 0%, but indeed, with HTM it depends on how close to 0% they can get. I have no data on that.

                +
                +
                +
                +
                + + Ole Laursen wrote on 2012-09-06 15:04: +
                +
                +

                I'm a little late, but regarding the simple let's-do-the-loop-concurrently example, if pypy-stm ends up working out as hoped, would it be relatively easy for pypy to do it automatically without having to use parallel loop thing explicitly?

                I have a hunch the answer would be yes, but that the hard part is figuring out when it makes sense and how to do the split (each thread needs a good chunk to work on).

                On the other hand, GCC has OpenMP which does seem really convenient and also looks like it has (or rather an implementation of that would have to have) solved part of this problem.

                Many years ago, I read about research in auto-parallelising compilers and it struck me as a really hard problem. But if you can just do some magic with the loops, perhaps it's an attainable goal?

                +
                +
                +
                +
                + + Unknown wrote on 2012-09-06 21:02: +
                +
                +

                I really believe that concurrency - like memory allocation, GC and safe arrays - should be done without the user thinking about it...

                Languages like Erlang, ABCL and Concurrent Object Oriented C solve this quite elegantly.

                Just make every Object a "process" (thread/greenlet) and every return value a Future and you are done :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2015-09-22 07:53: +
                +
                +

                Ammm... Jython 2.7.0 !

                All pure Python syntax using threading instantly go MULTI-CORE! All you need to do is replace the 'p' with a 'j' in your command and voila!

                ;)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/09/numpy-on-pypy-status-update-1605312600799448094.html b/posts/2012/09/numpy-on-pypy-status-update-1605312600799448094.html new file mode 100644 index 000000000..fc6737fc4 --- /dev/null +++ b/posts/2012/09/numpy-on-pypy-status-update-1605312600799448094.html @@ -0,0 +1,358 @@ + + + + + +NumPy on PyPy status update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy on PyPy status update

                + + + +
                +
                +

                Hello everyone.

                +

                It's been a while since we posted a numpy work update, but I'm pleased to +inform you that work on it has been restarted. A lot of the work has been +done by Matti Picus, who is one of the newest contributors to the PyPy +project. None of the work below has been merged so far, it's work in progress:

                +
                  +
                • Complex dtype support.
                • +
                • Fixing incompatibilities between numpy and pypy's version.
                • +
                • Refactoring numpypy to simplify the code and make it easier for new +contributors.
                • +
                • Reuse most of the numpy's pure python code without modifications.
                • +
                +

                Finishing this is also the plan for the next month.

                +

                Cheers,
                +fijal

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-09-05 20:59: +
                +
                +

                Exciting stuff!

                It would be great to see a write-up of what, if anything, still remains to be done after this merge to have full compatibility with numpy.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-09-05 21:31: +
                +
                +

                Once we have a better idea about the numpy's test status we'll post it. That would be probably on the next month's update report.

                +
                +
                +
                +
                + + Unknown wrote on 2012-09-07 15:03: +
                +
                +

                Great to hear that!

                I’m anxious to see numpy on pypy bear fruit, so I can test it with some model of a model I experiment with.

                +
                +
                +
                +
                + + Raul Durand wrote on 2012-12-04 12:31: +
                +
                +

                Pypy and numpypy are just great!
                I will be able to move some projects completely to pypy after Linalg implementation.
                In the meanwhile I just noticed that vectorized operations as dot product in numpypy are not yet as fast as in numpy.

                +
                +
                +
                +
                + + Raul Durand wrote on 2012-12-04 12:32: +
                +
                +

                Pypy and numpypy are just great!
                I will be able to move some projects completely to pypy after Linalg implementation.
                In the meanwhile I just noticed that vectorized operations as dot product in numpypy are not yet as fast as in numpy.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/09/py3k-status-update-6-4049281716377789914.html b/posts/2012/09/py3k-status-update-6-4049281716377789914.html new file mode 100644 index 000000000..cf1e40831 --- /dev/null +++ b/posts/2012/09/py3k-status-update-6-4049281716377789914.html @@ -0,0 +1,406 @@ + + + + + +Py3k status update #6 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #6

                + + + +
                +

                This is the sixth status update about our work on the py3k branch, which we
                +can work on thanks to all of the people who donated to the py3k proposal.

                +

                The coolest news is not about what we did in the past weeks, but what we will
                +do in the next: I am pleased to announce that Philip Jenvey has been
                +selected by the PyPy community to be funded for his upcoming work on py3k,
                +thanks to your generous donations. He will start to work on it shortly, and he
                +will surely help the branch to make faster progress. I am also particularly
                +happy about this because Philip is the first non-core developer who is getting
                +paid with donations: he demonstrated over the past months to be able to work
                +effectively on PyPy, and so we were happy to approve his application for the
                +job. This means that anyone can potentially be selected in the future, the
                +only strict requirement is to have a deep interest in working on PyPy and to
                +prove to be able to do so by contributing to the project.

                +

                Back to the status of the branch. Most of the work since the last status
                +update has been done in the area of, guess what? Unicode strings. As usual,
                +this is one of the most important changes between Python 2 and Python 3, so
                +it's not surprising. The biggest news is that now PyPy internally supports
                +unicode identifiers (such as names of variables, functions, attributes, etc.),
                +whereas earlier it supported only ASCII byte strings. The change is still
                +barely visible from the outside, because the parser still rejects non-ASCII
                +identifiers, however you can see it with a bit of creativity:

                +
                >>>> def foo(x): pass
                +>>>> foo(**{'àèìòù': 42})
                +Traceback (most recent call last):
                +  File "<console>", line 1, in <module>
                +TypeError: foo() got an unexpected keyword argument 'àèìòù'
                +
                +

                Before the latest changes, you used to get question marks instead of the
                +proper name for the keyword argument. Although this might seem like a small
                +detail, it is a big step towards a proper working Python 3 interpreter and it
                +required a couple of days of headaches. A spin-off of this work is that now
                +RPython has better built-in support for unicode (also in the default branch):
                +for example, it now supports unicode string formatting (using the percent
                +operator) and the methods .encode/.decode('utf-8').

                +

                Other than that there is the usual list of smaller issues and bugs that got
                +fixed, including (but not limited to):

                +
                  +
                • teach the compiler when to emit the new opcode DELETE_DEREF (and
                  +implement it!)
                • +
                • detect when we use spaces and TABs inconsistently in the source code, as
                  +CPython does
                • +
                • fix yet another bug related to the new lexically scoped exceptions (this
                  +is the last one, hopefully)
                • +
                • port some of the changes that we did to the standard CPython 2.7 tests to
                  +3.2, to mark those which are implementation details and should not be run on
                  +PyPy
                • +
                +

                Finally, I would like to thank Amaury Forgeot d'Arc and Ariel Ben-Yehuda for
                +their work on the branch; among other things, Amaury recently worked on
                cpyext and on the PyPy _cffi_backend, while Ariel submitted a patch to
                +implement PEP 3138.

                +
                +

                Comments

                +
                +
                +
                + + Ernst Sjöstrand wrote on 2012-09-26 10:48: +
                +
                +

                Following your work, great to see progress!

                +
                +
                +
                +
                + + Anonymous wrote on 2012-10-05 16:43: +
                +
                +

                Python 3.3 has some absolutely crucial fixes (finally! to Unicode). I'd go as far as to say that Python 3.3 is the first Python version of all that is truly suitable for the full range of internationalized apps. So I wonder a bit about the set target for the PyPy3 work being 3.2. Any chance it can be 3.2 with the 3.3 Unicode implementation?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2012-10-05 17:35: +
                +
                +

                we chose to target 3.2 because at the time 3.3 was a moving target. Now we could indeed decide to retarget 3.3, but I'm not sure it's a good idea. There is still a lot of work to be done for 3.2, and adding more features would only shift the end to later.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-10-07 19:41: +
                +
                +

                I know this might be an odd request but... Has the pypy team ever considered inquiring Mozilla about embedding pypy into their browser? I say this because Google is embedding a vm from chromium to support dart. I do not think this is ideal for an open web. Pypy, on the other hand, would be ideal as an open web vm! Think about it!

                +
                +
                +
                +
                + + rental mobil jakarta wrote on 2012-10-08 15:15: +
                +
                +

                Nice article, thanks for the information.

                +
                +
                +
                +
                + + Arne Babenhauserheide wrote on 2012-10-17 09:08: +
                +
                +

                That sounds great!

                Thank you for your work - and for keeping us up to date!

                +
                +
                +
                +
                + + Unknown wrote on 2012-10-17 09:11: +
                +
                +

                I think main change in 3.3 is that they allow u'' as syntax for indicating a string (just inactive syntax for easing the porting of python2 code: '' is exactly equal to u'').

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/09/pypy-cape-town-sprint-oct-7th-oct-21st-5757682347636918027.html b/posts/2012/09/pypy-cape-town-sprint-oct-7th-oct-21st-5757682347636918027.html new file mode 100644 index 000000000..0258e5dc6 --- /dev/null +++ b/posts/2012/09/pypy-cape-town-sprint-oct-7th-oct-21st-5757682347636918027.html @@ -0,0 +1,436 @@ + + + + + +PyPy Cape Town Sprint Oct 7th - Oct 21st 2012 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Cape Town Sprint Oct 7th - Oct 21st 2012

                + + + +
                +
                +

                Hello everyone!

                +

                The next PyPy sprint will be in Cape Town, South Africa. It is a +public sprint, suitable for newcomers. It starts a couple of days +after PyCon South Africa, which is on the 4th and 5th of October. +This is a relatively unusual sprint in that it is hosted halfway +across the world from where most contributors live, so we plan to +spend some time during those two weeks doing sprinting and some time +doing touristy stuff. The goals for the sprint are general progress +and whatever people are interested in.

                +

                Possible topics:

                +
                  +
                • PyPy release 2.0
                • +
                • running your software on PyPy
                • +
                • work on PyPy's numpy (status)
                • +
                • work on STM (status)
                • +
                • JIT improvements
                • +
                • any exciting stuff you can think of
                • +
                +

                If there are newcomers, we'll run the usual introduction to hacking on +PyPy.

                +
                +

                Location

                +

                The sprint will be held either in the apartment of fijal, which is in +Tamboerskloof, Cape Town, or in the offices of the Praekelt +Foundation, located in Woodstock, Cape Town. The Praekelt Foundation +has offered to host us, if needed.

                +

                Cape Town, as a very touristy place, has tons of accommodation ranging +in quality from good to amazing. Depending on the sprint location you +might need a car.

                +
                +
                +

                Good to Know

                +

                You probably don't need a visa for South Africa -- consult Wikipedia. +South Africa is a lovely place with lots of stuff to do. You can see +penguins, elephants, lions and sharks all on one day (or better yet, +on multiple days).

                +

                There is a wide selection of good restaurants within a reasonable +distance of the sprint venue (depending on the venue, either walking +or driving).

                +

                The power plug is some weird derivative of an old-english standard, +but adapters are easily acquired.

                +
                +
                +

                Who's Coming?

                +

                If you'd like to come, please let us know when you will be arriving +and leaving, as well as what your interests are. We'll keep a list of +people which we'll update (or you can do so yourself if you have +bitbucket pypy commit rights).

                +
                +

                Cheers,
                +fijal +

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-09-07 11:16: +
                +
                +

                Why pypy is three times slower than python2.6 + psyco2 ??

                # text parser:
                # python2.7 - 0.94s
                # python2.7 + cython - 0.73s
                # pypy1.9 - 0.68s
                # python2.5 + psyco1.6 - 0.31s
                # python2.6 + psyco2 - 0.23s

                "python2.6 + psyco2" is 3.3 times faster than pypy1.9, why ??

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-09-07 13:48: +
                +
                +

                Obviously if you don't provide a benchmark we're completely clueless.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-09-09 13:31: +
                +
                +

                I found that "cStringIO" is extremely slow in pypy1.9 (almost three times slower than python2.7), I'm using a lot of cStringIO in my text parser. here is my benchmark:

                import time, cStringIO

                def test1():
                text = '1234567890' * 1024 * 256
                sio = cStringIO.StringIO()
                ts = time.time()
                for ch in text: sio.write(ch)
                print 'ts', time.time() - ts

                try:
                import psyco
                psyco.full()
                except:
                pass


                test1()
                test1()
                test1()

                # python2.7 0.45s
                # psyco2 0.26s
                # pypy-1.9 1.30s

                +
                +
                +
                +
                + + Arne Babenhauserheide wrote on 2012-09-12 15:29: +
                +
                +

                You could try using StringIO instead of cStringIO. pypy can optimize that much better.

                Here’s an adapted example:

                ------ ------ ------

                import time, StringIO, cStringIO

                def csio():
                text = '1234567890' * 1024 * 256
                sio = cStringIO.StringIO()
                ts = time.time()
                for ch in text: sio.write(ch)
                print 'ts', time.time() - ts

                def nsio():
                text = '1234567890' * 1024 * 256
                sio = StringIO.StringIO()
                ts = time.time()
                for ch in text: sio.write(ch)
                print 'ts', time.time() - ts


                print "cStringIO"
                csio()
                csio()
                csio()

                print "StringIO"
                nsio()
                nsio()
                nsio()

                ------ ------ ------

                Results for me with pypy 1.9:

                $ python stringiotest.py
                cStringIO
                ts 0.636300086975
                ts 0.63633108139
                ts 0.636710882187
                StringIO
                ts 3.35502791405
                ts 3.34557986259
                ts 3.33949017525
                $ bin/pypy stringiotest.py
                cStringIO
                ts 1.05391597748
                ts 0.528824090958
                ts 0.530929803848
                StringIO
                ts 0.359623908997
                ts 0.277186870575
                ts 0.273662090302

                +
                +
                +
                +
                + + Anonymous wrote on 2012-09-13 13:25: +
                +
                +

                thanks, it works with StringIO.

                +
                +
                +
                +
                + + Unknown wrote on 2012-09-13 13:26: +
                +
                +

                Increase the amount of iterations for even higher speedups:

                text = '1234567890' * 1024 * 256 * 16





                $ bin/pypy stringiotest.py
                cStringIO
                ts 224.367353201
                ts 140.621050835
                ts 140.672322035
                StringIO
                ts 5.80670285225
                ts 4.95937395096
                ts 4.82084798813

                $ python stringiotest.py
                cStringIO
                ts 9.54650998116
                ts 9.60773801804
                ts 9.56916093826
                StringIO
                ts 47.1465728283
                ts 47.145359993
                ts 47.1618230343


                Interestingly pypy with StringIO is twice as fast as python with cStringIO. But pypy with cStringIO is slow.

                So pypy with StringIO might still require 2x as much time as python2.6+psyco2.

                But remember that this compares pure python code on pypy with hand-optimized C-code+psyco.

                +
                +
                +
                +
                + + Unknown wrote on 2012-09-13 13:29: +
                +
                +

                Glad to help :)

                The cool part here is that pypy allows us to replace many C-modules with nicely readable python-code and still get a fast program.

                And that your custom code gets the same speedups.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-09-13 13:32: +
                +
                +

                in order to import StringIO as cStringIO. how to confirm my script is running pypy? not python ?

                +
                +
                +
                +
                + + how to climb wrote on 2012-09-13 16:12: +
                +
                +

                thanks for the post dear. nice blog.

                +
                +
                +
                +
                + + Unknown wrote on 2012-09-14 10:04: +
                +
                +

                you could just import sys:

                import sys
                ispypy = hasattr(sys, "pypy_version_info")

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/10/cape-town-2012-sprint-report-1612771358321767072.html b/posts/2012/10/cape-town-2012-sprint-report-1612771358321767072.html new file mode 100644 index 000000000..71f9d5af1 --- /dev/null +++ b/posts/2012/10/cape-town-2012-sprint-report-1612771358321767072.html @@ -0,0 +1,367 @@ + + + + + +Cape Town 2012 sprint report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Cape Town 2012 sprint report

                + + + +
                +
                +

                Hello.

                +

                We're about to finish a PyPy sprint in Cape Town, South Africa that was +one of the smallest done so far, only having Armin Rigo and Maciej Fijalkowski +with Alex Gaynor joining briefly at the beginning, however also one of the +longest, lasting almost 3 weeks. The sprint theme seems to be predominantly +"no new features" and "spring cleaning". We overall removed about 20k lines +of code in the PyPy source tree. The breakdown of things done and worked on:

                +
                  +
                • +

                  We killed SomeObject support in annotation and rtyper. This is a modest +code saving, however, it reduces the complexity of RPython and also, +hopefully, improves compile errors from RPython. We're far from done +on the path to have comprehensible compile-time errors, but the first +step is always the hardest :)

                  +
                • +
                • +

                  We killed some magic in specifying the interface between builtin functions +and Python code. It used to be possible to write builtin functions like this:

                  +
                  +def f(space, w_x='xyz'):
                  +
                  +

                  which will magically wrap 'xyz' into a W_StringObject. Right now, instead, +you have to write:

                  +
                  +@unwrap_spec(w_x=WrappedDefault('xyz'))
                  +def f(space, w_x):
                  +
                  +

                  which is more verbose, but less magical.

                  +
                • +
                • +

                  We killed the CExtModuleBuilder which is the last remaining part of +the infamous extension compiler that could in theory build C extensions +for CPython in RPython. This was never working very well and the main +part was killed long ago.

                  +
                • +
                • +

                  We killed various code duplications in the C backend.

                  +
                • +
                • +

                  We killed microbench and a bunch of other small-to-medium unused +directories.

                  +
                • +
                • +

                  We killed llgraph JIT backend and rewrote it from scratch. Now the llgraph +backend is not translatable, but this feature was rarely used and caused +a great deal of complexity.

                  +
                • +
                • +

                  We progressed on continulet-jit-3 branch, up to the point of merging +it into result-in-resops branch, which also has seen a bit of progress.

                  +

                  Purpose of those two branches:

                  +
                    +
                  • +continulet-jit-3: enable stackless to interact with the JIT by killing +global state while resuming from the JIT into the interpreter. This has +multiple benefits. For example it's one of the stones on the path to +enable STM for PyPy. It also opens new possibilities for other optimizations +including Python-Python calls and generators.
                  • +
                  • +result-in-resops: the main goal is to speed up the tracing time of PyPy. +We found out the majority of time is spent in the optimizer chain, +which faces an almost complete rewrite. It also simplifies the storage +of the operations as well as the number of implicit invariants that have +to be kept in mind while developing.
                  • +
                  +
                • +
                • +

                  We finished and merged the excellent work by Ronan Lamy which makes the +flow object space (used for abstract interpretation during RPython +compilation) independent from the Python interpreter. This means +we've achieved an important milestone on the path of separating the RPython +translation toolchain from the PyPy Python interpreter.

                  +
                • +
                +

                Cheers,
                +fijal & armin

                +
                +
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/11/numpy-status-update-5-5489198414356844587.html b/posts/2012/11/numpy-status-update-5-5489198414356844587.html new file mode 100644 index 000000000..0e2ee3f38 --- /dev/null +++ b/posts/2012/11/numpy-status-update-5-5489198414356844587.html @@ -0,0 +1,362 @@ + + + + + +NumPy status update #5 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy status update #5

                + + + +
                +
                + +

                Hello.

                +

                I'm quite excited to inform that work on NumPy in PyPy has been restarted +and there has been quite a bit of progress on the NumPy front in PyPy in the +past two months. Things that happened:

                +
                  +
                • +complex dtype support - thanks to matti picus, NumPy on PyPy now supports +complex dtype (only complex128 so far, there is work on the other part)
                • +
                • +big refactoring - probably the biggest issue we did was finishing +a big refactoring that disabled some speedups (notably lazy computation +of arrays), but lowered the barrier of implementing cool new features.
                • +
                • +fancy indexing support - all fancy indexing tricks should now work, +including a[b] where b is an array of integers.
                • +
                • +newaxis support - now you can use newaxis features
                • +
                • improvements to ``intp``, ``uintp``, ``void``, ``string`` and record dtypes
                • +
                +

                Features that have active branches, but haven't been merged:

                +
                  +
                • float16 dtype support
                • +
                • +missing ndarray attributes - this is a branch to finish all attributes +on ndarray, hence ending one chapter.
                • +
                • +pickling support for numarray - hasn't started yet, but next on the list
                • +
                +

                More importantly, we're getting very close to being able to import the python part +of the original numpy with only import modifications and running its tests. +Most tests will fail at this point, however it'll be a good start for another +chapter :-)

                +

                Cheers,
                +fijal

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Dmitrey wrote on 2012-11-01 17:11: +
                +
                +

                Hi,
                are sort() and argsort(), preferably with axis parameter, in nearest future plans?

                Regards, Dmitrey.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-11-01 17:13: +
                +
                +

                Hi Dmitrey.

                argsort (with axis) is already implemented on a branch, sort coming later (it's further in the alphabet, I'm at g now ;-)

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-01 17:14: +
                +
                +

                hey, cool progress!
                numpypy.complex64(complex(4., 3.)) works for me on nightlies, FWIW

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/11/py3k-status-update-7-6182140595418083307.html b/posts/2012/11/py3k-status-update-7-6182140595418083307.html new file mode 100644 index 000000000..94dcd1172 --- /dev/null +++ b/posts/2012/11/py3k-status-update-7-6182140595418083307.html @@ -0,0 +1,424 @@ + + + + + +Py3k status update #7 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #7

                + + + +
                +

                This is the seventh status update about our work on the py3k branch, which
                +we can work on thanks to all of the people who donated to the py3k
                +proposal
                .

                +

                The biggest news is that this month Philip started to work on py3k in parallel
                +to Antonio. As such, there was an increased amount of activity.

                +

                The py3k buildbots now fully translate the branch every night and run the
                +Python standard library tests.

                +

                We currently pass 160 out of approximately 355 modules of CPython's standard
                +test suite, fail 144 and skip approximately 51.

                +

                Some highlights:

                +
                  +
                • dictviews (the objects returned by dict.keys/values/items) have been greatly
                  +improved, and now they fully support set operators
                • +
                • a lot of tests have been fixed wrt complex numbers (and in particular the
                  __complex__ method)
                • +
                • _csv has been fixed and now it correctly handles unicode instead of bytes
                • +
                • more parser fixes, py3k list comprehension semantics; now you can no longer
                  +access the list comprehension variable after it finishes
                • +
                • 2to3'd most of the lib_pypy modules (pypy's custom standard lib
                  +replacements/additions)
                • +
                • py3-enabled pyrepl: this means that finally readline works at the command
                  +prompt, as well as builtins.input(). pdb seems to work, as well as
                  fancycompleter to get colorful TAB completions :-)
                • +
                • py3 round
                • +
                • further tightening/cleanup of the unicode handling (more usage of
                  +surrogateescape, surrogatepass among other things)
                • +
                • as well as keeping up with some big changes happening on the default branch
                  +and of course various other fixes.
                • +
                +

                Finally, we would like to thank Amaury Forgeot d'Arc for his significant
                +contributions.

                +

                cheers,
                +Philip&Antonio

                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2012-11-03 20:23: +
                +
                +

                Very cool!

                Thank you for your work!

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-04 05:32: +
                +
                +

                Great work!

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-06 05:22: +
                +
                +

                thanks for sharing.

                +
                +
                +
                +
                + + Unknown wrote on 2012-11-08 21:26: +
                +
                +

                How do I compile/translate it for testing the py3k branch?

                How much optimization is already possible?

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2012-11-13 08:59: +
                +
                +

                @arne: you can just use the usual translate.py command inside the py3k branch.
                Or download one of the nightly builds:
                https://buildbot.pypy.org/nightly/py3k/

                however, note that:
                - JIT is not enabled (yet)
                - no focus has been put on performances (yet :)) so it is probably slower than even the non-jitted python2

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-22 07:14: +
                +
                +

                when will pypy-2.0 be available ?

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-22 07:15: +
                +
                +

                when will pypy-2.0 be avaliable ?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-11-22 08:12: +
                +
                +

                2.0 beta 1 - today. 2.0 final - no date yet.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-22 10:19: +
                +
                +

                looking forward to see the release note of pypy-2.0 b1

                +
                +
                +
                +
                + + Unknown wrote on 2012-11-22 12:17: +
                +
                +

                @antonio: for me the translate with goal pypy (interpreter) did not work, so I asked.

                I’ll try again. Thanks!

                +
                +
                +
                +
                + + Antonio Cuni wrote on 2012-11-22 15:05: +
                +
                +

                @arne: it's surely possible that translation is broken at some revision, it's all work in progress :). If you go to the nightly build page, you can see which for which revision translation did work

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/11/pypy-20-beta-1-2702952243260181341.html b/posts/2012/11/pypy-20-beta-1-2702952243260181341.html new file mode 100644 index 000000000..5846d4049 --- /dev/null +++ b/posts/2012/11/pypy-20-beta-1-2702952243260181341.html @@ -0,0 +1,470 @@ + + + + + +PyPy 2.0 beta 1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.0 beta 1

                + + + +
                +
                +

                We're pleased to announce the 2.0 beta 1 release of PyPy. This release is +not a typical beta, in a sense the stability is the same or better than 1.9 +and can be used in production. It does however include a few performance +regressions documented below that don't allow us to label it as 2.0 final. +(It also contains many performance improvements.)

                +

                The main features of this release are support for ARM processor and +compatibility with CFFI. It also includes +numerous improvements to the numpy in pypy effort, cpyext and performance.

                +

                You can download the PyPy 2.0 beta 1 release here:

                +
                +https://pypy.org/download.html +
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.3. It's fast (pypy 2.0 beta 1 and cpython 2.7.3 +performance comparison) due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. It also supports ARM machines running Linux. +Windows 64 work is still stalling, we would welcome a volunteer +to handle that.

                +
                +
                +

                How to use PyPy?

                +

                We suggest using PyPy from a virtualenv. Once you have a virtualenv +installed, you can follow instructions from pypy documentation on how +to proceed. This document also covers other installation schemes.

                +
                +
                +

                Regressions

                +

                Reasons why this is not PyPy 2.0:

                +
                  +
                • the ctypes fast path is now slower than it used to be. In PyPy +1.9 ctypes was either incredibly faster or slower than CPython depending whether +you hit the fast path or not. Right now it's usually simply slower. We're +probably going to rewrite ctypes using cffi, which will make it +universally faster.
                • +
                • +cffi (an alternative to interfacing with C code) is very fast, but +it is missing one optimization that will make it as fast as a native +call from C.
                • +
                • +numpypy lazy computation was disabled for the sake of simplicity. +We should reenable this for the final 2.0 release.
                • +
                +
                +
                +

                Highlights

                +
                  +
                • +cffi is officially supported by PyPy. You can install it normally by +using pip install cffi once you have installed PyPy and pip. +The corresponding 0.4 version of cffi has been released.
                • +
                • ARM is now an officially supported processor architecture. +PyPy now works on soft-float ARM/Linux builds. Currently ARM processors +supporting the ARMv7 and later ISA that include a floating-point unit are +supported.
                • +
                • This release contains the latest Python standard library 2.7.3 and is fully +compatible with Python 2.7.3.
                • +
                • It does not however contain hash randomization, since the solution present +in CPython is not solving the problem anyway. The reason can be +found on the CPython issue tracker.
                • +
                • +gc.get_referrers() is now faster.
                • +
                • Various numpy improvements. The list includes:
                    +
                  • axis argument support in many places
                  • +
                  • full support for fancy indexing
                  • +
                  • +complex128 and complex64 dtypes
                  • +
                  +
                • +
                • +JIT hooks are now a powerful tool to introspect the JITting process that +PyPy performs.
                • +
                • +**kwds usage is much faster in the typical scenario
                • +
                • operations on long objects are now as fast as in CPython (from +roughly 2x slower)
                • +
                • We now have special strategies for dict/set/list which contain +unicode strings, which means that now such collections will be both faster +and more compact.
                • +
                +
                +
                +

                Things we're working on

                +

                There are a few things that did not make it to the 2.0 beta 1, which +are being actively worked on. Greenlets support in the JIT is one +that we would like to have before 2.0 final. Two important items that +will not make it to 2.0, but are being actively worked on, are:

                +
                  +
                • Faster JIT warmup time.
                • +
                • Software Transactional Memory.
                • +
                +

                Cheers,
                +Maciej Fijalkowski, Armin Rigo and the PyPy team

                +
                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-11-22 16:51: +
                +
                +

                Good job! 2 things:
                1) the link to the .tar.bz for Linux 64 (libc 2.13) links to a corrupted file (bz2 claims it is corrupted, and its MD5 hash doesn't match the one on the page)

                2) the link to the benchmark on this page: https://speed.pypy.org/comparison/?exe=1%2B785,2%2B472&ben=1,34,27,2,25,3,46,4,5,41,42,22,44,6,39,7,8,45,23,24,9,10,11,12,13,40,14,15,35,36,37,38,16,28,30,32,29,33,17,18,19,20,43&env=1,2&hor=true&bas=2%2B472&chart=normal+bars

                is empty -- no charts were plotted. (I've turned off all my adblocking).

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-22 16:52: +
                +
                +

                Oops, the chart appears now -- it took a long time to load.

                +
                +
                +
                +
                + + Unknown wrote on 2012-11-22 17:46: +
                +
                +

                The OSX binary segfaults on a Lion 64bit. I tried both 2.0-beta1 and a nightly build. Notice, 1.9 works perfectly.

                +
                +
                +
                +
                + + Unknown wrote on 2012-11-23 05:51: +
                +
                +

                I would be more than happy to give it a shot if there was solid PostgreSQL support - otherwise it is a no-go for me.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-23 19:34: +
                +
                +

                Issue 1257 still not fixed (memory leak when using web.py framework).

                +
                +
                +
                +
                + + Anonymous wrote on 2012-11-23 19:35: +
                +
                +

                For PostgreSQL it works with psycopg2ct.

                +
                +
                +
                +
                + + Gabriel wrote on 2012-11-30 09:26: +
                +
                +

                Just announced on the IRC channel: psycopg2cffi. They ported it for speed, but from my CFFI experience, I think the biggest advantage is maintainability.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-12-04 19:44: +
                +
                +

                I think I should give a try to this.
                Goona give a shot
                ---------------
                www.insecuregeek.blogspot.com

                +
                +
                +
                +
                + + Unknown wrote on 2013-01-07 15:07: +
                +
                +

                If we can get greenlet support in the JIT that'd be fantastic - my non-blocking driver for MongoDB, Motor, will need it before it's usable with PyPy. Thanks for the amazing work!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/11/pypy-san-francisco-sprint-dec-1st-dec-5133109101989613355.html b/posts/2012/11/pypy-san-francisco-sprint-dec-1st-dec-5133109101989613355.html new file mode 100644 index 000000000..8d543f9c5 --- /dev/null +++ b/posts/2012/11/pypy-san-francisco-sprint-dec-1st-dec-5133109101989613355.html @@ -0,0 +1,365 @@ + + + + + +PyPy San Francisco Sprint Dec 1st - Dec 2nd 2012 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy San Francisco Sprint Dec 1st - Dec 2nd 2012

                + + + +
                +

                The next PyPy sprint will be in San Francisco, California. It is a
                +public sprint, suitable for newcomers. It will run on Saturday December 1st and
                +Sunday December 2nd. The goals for the sprint are continued work towards the
                +2.0 release as well as code cleanup, we of course welcome any topic which
                +contributors are interested in working on.

                +

                Some other possible topics are:

                +
                  +
                • running your software on PyPy
                • +
                • work on PyPy's numpy (status)
                • +
                • work on STM (status)
                • +
                • JIT improvements
                • +
                • any exciting stuff you can think of
                • +
                +

                If there are newcomers, we'll run the usual introduction to hacking on
                +PyPy.

                +

                Location

                +

                The sprint will be held at the Rackspace Office:

                +

                620 Folsom St, Ste 100
                +San Francisco

                +

                The doors will open at 10AM both days, and run until 6PM both days.

                +

                Thanks to David Reid for helping get everything set up!

                +
                +

                Comments

                +
                +
                +
                + + Mike Pavone wrote on 2012-11-29 22:49: +
                +
                +

                Hi, I'm interested in getting involved with PyPy development and would love to attend the sprint to get started, but I'm not sure I can make it both days. Would it be okay to just participate Sunday or would that not make sense?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-11-29 23:06: +
                +
                +

                absolutely

                +
                +
                +
                +
                + + Jean-Paul Calderone wrote on 2012-11-30 19:16: +
                +
                +

                Awww jeez, you guys couldn't wait a couple more weeks? Have fun. If anyone's still in the bay area after Dec 10th give a holler.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-12-03 01:17: +
                +
                +

                It would have helped a lot if this sprint was announced more in advance. I just missed it because I didn't bother to check the PyPy blog last week.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2012-12-03 16:35: +
                +
                +

                I'm sorry, but we didn't know more in advance.

                +
                +
                +
                +
                + + Anonymous wrote on 2012-12-25 20:48: +
                +
                +

                STM update looks interesting and promising!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/12/py3k-status-update-8-3932232806458251730.html b/posts/2012/12/py3k-status-update-8-3932232806458251730.html new file mode 100644 index 000000000..4311cdad0 --- /dev/null +++ b/posts/2012/12/py3k-status-update-8-3932232806458251730.html @@ -0,0 +1,347 @@ + + + + + +Py3k status update #8 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #8

                + + + +
                +

                This is the eighth status update about our work on the py3k branch, which
                +we can work on thanks to all of the people who donated to the py3k
                +proposal
                .

                +

                Just a short update on November's work: we're now passing about 194 of
                +approximately 355 modules of CPython's regression test suite, up from passing
                +160 last month. Many test modules only fail a small number of individual tests
                +now.

                +

                We'd like to thank Amaury Forgeot d'Arc for his contributions, in particular he
                +has made significant progress on updating CPyExt for Python 3 this month.

                +

                Some other highlights:

                +
                  +
                • +test_marshal now passes, and there's been significant progress on
                  +pickling (thanks Kenny Levinsen and Amaury for implementing
                  int.{to,from}_bytes)
                • +
                • We now have a _posixsubprocess module
                • +
                • More encoding related fixes, which affects many failing tests
                • +
                • +_sre was updated and now test_re almost passes
                • +
                • Exception behavior is almost complete per the Python 3 specs, what's mostly
                  +missing now are the new __context__ and __traceback__ attributes (PEP
                  +3134
                  )
                • +
                • Fixed some crashes and deadlocks occurring during the regression tests
                • +
                • We merged the unicode-strategies branch both to default and to py3k: now we
                  +have versions of lists, dictionaries and sets specialized for unicode
                  +elements, as we already had for strings.
                • +
                • However, string-specialized containers are still faster in some cases
                  +because there are shortcuts which have not been implemented for unicode yet
                  +(e.g., constructing a set of strings from a list of strings). The plan is to
                  +completely kill the shortcuts and improve the JIT to produce the fast
                  +version automatically for both the string and unicode versions, to have a
                  +more maintainable codebase without sacrificing the speed. The autoreds
                  +branch (already merged) was a first step in this direction.
                • +
                +

                cheers,
                +Philip&Antonio

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-12-05 22:14: +
                +
                +

                Well done. PyPy is one the most interesting projects out there today.
                Keep up the amazing work guys!

                J.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-01-03 14:56: +
                +
                +

                thank you for your work!!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2012/12/pypy-related-internship-at-ncar-7412729710421119926.html b/posts/2012/12/pypy-related-internship-at-ncar-7412729710421119926.html new file mode 100644 index 000000000..4e8e26702 --- /dev/null +++ b/posts/2012/12/pypy-related-internship-at-ncar-7412729710421119926.html @@ -0,0 +1,322 @@ + + + + + +PyPy related internship at NCAR | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy related internship at NCAR

                + + + +
                +
                +

                Hello everyone

                +

                I would like to advertise a PyPy-related summer internship at +the National Center for Atmospheric Research, which is located in lovely +Boulder, Colorado. As for the last year, the mentor will be Davide del Vento, +with my possible support on the PyPy side.

                +

                The full details of the application are to be found on +the internship description and make sure you read the requirements +first. Important requirements:

                +
                  +
                • Must currently be enrolled in a United States university.
                • +
                • Only students authorized to work for any employer in the United +States will be considered for the SIParCS program.
                • +
                • Must be a graduate or undergraduate who has completed their sophomore year.
                • +
                +

                If you happen to fulfill the requirements, to me this sounds like +a great opportunity to spend a summer at NCAR in Boulder hacking on atmospheric +models using PyPy.

                +

                Cheers, +fijal

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2012-12-07 23:35: +
                +
                +

                You can post it on https://jobs.pythonweekly.com/

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/01/numpypy-2013-developer-position-1547805593757893630.html b/posts/2013/01/numpypy-2013-developer-position-1547805593757893630.html new file mode 100644 index 000000000..7a69039b7 --- /dev/null +++ b/posts/2013/01/numpypy-2013-developer-position-1547805593757893630.html @@ -0,0 +1,500 @@ + + + + + +NumPyPy 2013 Developer Position | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPyPy 2013 Developer Position

                + + + +
                +
                +
                +

                Introduction

                +

                Proposed herein is a part-time fellowship for developing NumPy in PyPy. +The work will initially consist of 100 hours +with the possibility of extension, until the funds run out. +Development and improvement of PyPy's NumPyPy (as +with most Open Source and Free Software) is done as a collaborative process +between volunteer, paid, and academic contributors. Due to a successful funding +drive but a lack of contributors willing to work directly for PyPy, we find +ourselves in the enviable situation of being able to offer this position.

                +
                +
                +

                Background

                +

                PyPy's developers make all PyPy software available to the public +without charge, under PyPy's Open Source copyright license, the +permissive MIT License. PyPy's license assures that PyPy is equally +available to everyone freely on terms that allow both non-commercial +and commercial activity. This license allows for academics, for-profit +software developers, volunteers and enthusiasts alike to collaborate +together to make a better Python implementation for everyone.

                +

                NumPy support for PyPy is licensed similarly, and therefore NumPy in +PyPy support can directly help researchers and developers who seek to +do numeric computing but want an easier programming language to use +than Fortran or C, which are typically used for these +applications. Being licensed freely to the general public means that +opportunities to use, improve and learn about how NumPy in PyPy works +itself will be generally available to everyone.

                +
                +
                +

                The Need for a Part-Time Developer

                +

                NumPy project in PyPy has seen some slow, but steady progress since we started +working about a year ago. On one hand, +it's actually impressive what we could deliver with the effort undertaken, +on the other hand, we would like to see the development accelerated.

                +

                PyPy has strict coding, testing, documentation, and review standards, +which ensures excellent code quality, continually improving +documentation and code test coverage, and minimal regressions. A +part-time developer will be able to bring us closer to the goal of +full numpy-api implementation and speed improvements.

                +
                +
                +

                Work Plan

                +

                The current proposal is split into two parts:

                +
                  +
                • +

                  Compatibility:

                  +

                  This part covers the core NumPy Python API. We'll implement most NumPy APIs +that are officially documented and we'll pass most of NumPy's tests that +cover documented APIs and are not implementation details. +Specifically, we don't plan to:

                  +
                    +
                  • implement NumPy's C API
                  • +
                  • implement other scientific libraries, like SciPy, matplotlib or biopython
                  • +
                  • implement details that are otherwise agreed by consensus to not have a place +in PyPy's implementation of NumPy or agreed with NumPy community +to be implementation details
                  • +
                  +
                • +
                • +

                  Speed:

                  +

                  This part will cover significant speed improvements in the JIT that would +make numeric computations faster. This includes, but is not necessarily +limited to:

                  +
                    +
                  • write a set of benchmarks covering various use cases
                  • +
                  • teaching the JIT backend (or multiple backends) how to deal with vector +operations, like SSE
                  • +
                  • experiments with automatic parallelization using multiple threads, akin +to numexpr
                  • +
                  • improving the JIT register allocator that will make a difference, especially +for tight loops
                  • +
                  +

                  As with all speed improvements, it's relatively hard to predict exactly +how it'll cope, however we expect the results to be within an order +of magnitude of handwritten C equivalent.

                  +
                • +
                +
                +
                +

                Position Candidate

                +

                We would like people who are proficient in NumPy and PyPy (but don't have to be +core developers of either) to step up. The developer selection will be done +by consensus of PyPy core developers and consulted with the Software Freedom +Conservancy for lack of conflict of interest. The main criterion will be +past contributions to the PyPy project, but they don't have to be significant +in size.

                +

                A candidate for the Developer position will demonstrate the following:

                +
                  +
                • The ability to write clear, stable, suitable and tested code
                • +
                • The ability to understand and extend the JIT capabilities used in NumPyPy.
                • +
                • A positive presence in PyPy's online community on IRC and the mailing +list.
                • +
                +

                Ideally the Developer will also:

                +
                  +
                • Have familiarity with the infrastructure of the PyPy project (including +bug tracker and buildbot).
                • +
                • Have worked to provide education or outreach on PyPy in other forums such as +workshops, conferences, and user groups.
                • +
                +

                Conservancy and PyPy are excited to announce the Developer Position. +Remuneration for the position will be at the rate of 60 USD per hour, through +the Software Freedom Conservancy.

                +

                PyPy community is promising to provide necessary guidance and help into +the current codebase, however we expect a successful candidate to be able +to review code and incorporate external patches within two months of the +starting date of the contract.

                +

                Candidates should submit their proposal (including their CV) to:

                +

                pypy-z@python.org

                +

                The deadline for this initial round of proposals is February 1, 2013.

                +
                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-01-26 11:37: +
                +
                +

                I was wondering, why is PyPy so eager to support NumPy of all things? Surely there are things more interesting to a general python/pypy user base. Can someone clarify that for me?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-01-26 11:40: +
                +
                +

                There was a numpy fundraiser due to popular demand. Feel free to suggest a different fundraiser if you want something else. I would be willing to even do a survey.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-01-26 14:56: +
                +
                +

                The thing is, the most interesting use of Python is in science, IMHO at least. And absolute majority of python scientific libraries use numpy as base. So, it would be awesome to have fast and robust numpy compatible library running on pypy.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-01-26 17:28: +
                +
                +

                The deadline seems too tight: it's next Friday.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-01-26 18:31: +
                +
                +

                It's been said before but as a long time NumPy and SciPy user, please please please don't call this project NumPy. It's great for PyPy to have an nd-array lib and for sure NumPy has some of the best semantics and user API for that so by all means make it compatible, but giving it the same name just makes tremendous confusion for users. For scientific users without the C-API which allows most of the widely used scientific extensions it is simply not "numpy".

                +
                +
                +
                +
                + + Wes Turner wrote on 2013-01-26 22:24: +
                +
                +

                @201301261931

                As NumPyPy intends to implement NumPy APIs, as a non-contributor, I feel like NumPyPy is a good name.

                So then the package names would be:

                * https://pypi.python.org/pypi/numpy
                * https://pypi.python.org/pypi/numpypy

                @201301261237

                IMHO, this is not the forum for discussing what sort of pony you would like?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-01-27 16:19: +
                +
                +

                FWIW I think that numpypy to work is hugely important for the acceptance of pypy. Simple things like using matplotlib are crucial to lots of people who aren't using much of the rest of scipy, for example.

                +
                +
                +
                +
                + + Rahul Chaudhary wrote on 2013-01-28 01:31: +
                +
                +

                You can post it on https://jobs.pythonweekly.com/ and it will be included in Python Weekly newsletter too.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-01-30 01:36: +
                +
                +

                I am following each of your announcements with great interest.
                JIT optimization of array manipulations would enormously benefit my daily work.

                Even though I am trying hard to follow the discussion, I have difficulty understanding the issues at hand, and what numpypy is going to be when it is finished.

                Probably I am not the only one, considering the sometimes controversial discussion.

                My current understanding is this:
                All python code in numpy will run much better under pypy.

                The problem are the external libraries. Depending on the type, there will be different approaches.

                I assume that you will re-write a large part of the c-part of numpy directly in python, and then make use of the JIT optimizer. That would be the approach for all of the algorithms that are currently written in c, but could be easily re-implemented in python.
                Something like ufunc_object.c could probably be rewritten in python without a loss of speed.
                Of course, even though this would still run under normal python, it would be far to slow.

                Then you have external dlls, like BLAS. I assume you will call them differently (ctypes?), and not as extension modules. If you use ctypes, it will still run under normal python, maybe a bit slower.

                Then you have parts that are currently written in c, but that you can neither re-implement in python, nor call as a dll. Will you re-write those in c, using a different c-api? Or re-write them, so that they can be called using ctypes?


                Maybe you give a short general overview about the issues with the c-api and what you are doing?

                Something like. "Currently the function numpy.dot is written as a c-extension. It makes extensive use of PyArray_GETITEM. This limits the optimizer. We are therefore completely rewriting the function in python"

                What is the best approach for a user like me, who makes heavy use of numpy, but also scipy and my own extension modules, cython and f2py?

                Should I preferably write future modules as dlls, so that they can be called with ctypes (or cffi or something else), instead of making extension modules?

                Do you think it will be possible at all to use scipy, which makes much more use of non-python libraries, or do you think that scipy will have to be re-written?

                +
                +
                +
                +
                + + Alendit wrote on 2013-02-09 12:09: +
                +
                +

                Just a question - the donation figures on the homepage seem to be the same for the last 6 month or so. Is there really no donation or aren't they updated anymore.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/01/py3k-status-update-9-98332471264591773.html b/posts/2013/01/py3k-status-update-9-98332471264591773.html new file mode 100644 index 000000000..2cde2decb --- /dev/null +++ b/posts/2013/01/py3k-status-update-9-98332471264591773.html @@ -0,0 +1,330 @@ + + + + + +Py3k status update #9 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #9

                + + + +
                +

                This is the ninth status update about our work on the py3k branch, which
                +we can work on thanks to all of the people who donated to the py3k
                +proposal
                .

                +

                Just a very short update on December's work: we're now passing about 223 of
                +approximately 355 modules of CPython's regression test suite, up from passing
                +194 last month.

                +

                Some brief highlights:

                +
                  +
                • More encoding related issues were addressed. e.g. now most if not all the
                  +multibytecodec test modules pass.
                • +
                • Fixed some path handling issues (test_os, test_ntpath and
                  test_posixpath now pass)
                • +
                • We now pass test_class, test_descr and almost test_builtin (among
                  +other things): these are notable as they are fairly extensive test suites of
                  +core aspects of the language.
                • +
                • Amaury Forgeot d'Arc continued making progress on CPyExt (thanks again!)
                • +
                +

                cheers,
                +Phil

                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2013-01-14 10:58: +
                +
                +

                Nice! Thank you for your update!

                +
                +
                +
                +
                + + Kevin S. Smith wrote on 2013-01-24 17:24: +
                +
                +

                The update was expected. Thank you for your update. Hope to see more.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/02/10-years-of-pypy-634401291726575821.html b/posts/2013/02/10-years-of-pypy-634401291726575821.html new file mode 100644 index 000000000..ac337a619 --- /dev/null +++ b/posts/2013/02/10-years-of-pypy-634401291726575821.html @@ -0,0 +1,544 @@ + + + + + +10 years of PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                10 years of PyPy

                + + + +
                +
                + +

                +

                From a software engineering perspective, 10 years is indistinguishable +from infinity, so I don't care what happens 10 years from now -- as +long as you don't blame me. :-)

                - Guido van Rossum, Python creator. +

                10 years is indeed a long time. PyPy was created approximately 10 years ago, +with the exact date being lost in the annals of the version control system. +We've come a long way during those 10 years, from a "minimal Python" that +was supposed to serve mostly as an educational tool, through to a vehicle for +academic research to a high performance VM for Python and beyond.

                +

                Some facts from the PyPy timeline:

                +
                  +
                • In 2007, at the end of the EU funding period, we promised the JIT was just around the corner. +It turned out we misjudged it pretty badly -- the first usable PyPy was released in 2010.
                • +
                • At some point we decided to have a JavaScript backend so one could compile RPython programs +to JavaScript and run them in a browser. Turned out it was a horrible idea.
                • +
                • Another option we tried was using RPython to write CPython C extensions. Again, it turned out RPython +is a bad language and instead we made a fast JIT, so you don't have to write C extensions.
                • +
                • We made N attempts to use LLVM. Seriously, N is 4 or 5. But we haven't fully given up yet :-) +They all run into issues one way or another.
                • +
                • We were huge fans of ctypes at the beginning. Up to the point where we tried to make +a restricted subset with static types, called rctypes for RPython. Turned out to be horrible. +Twice.
                • +
                • We were very hopeful about creating a JIT generator from the beginning. But the first one failed miserably, +generating too much assembler. The second failed too. The third first burned down and then failed. +However, we managed to release a working JIT in 2010, against all odds.
                • +
                • Martijn Faassen used to ask us "how fast is PyPy" so we decided to name an option enabling all +optimizations "--faassen". Then "--no-faassen" was naturally added too. Later we +decided to grow up and renamed it to "-O2", and now "-Ojit".
                • +
                • The first time the Python interpreter successfully compiled to C, it segfaulted because the code generator used signed chars instead of unsigned chars...
                • +
                • To make it more likely to be accepted, the proposal for the EU project contained basically every feature under the sun a language could have. This proved to be annoying, because we had to actually implement all that stuff. Then we had to do a cleanup sprint where we deleted 30% of codebase and 70% of features.
                • +
                • At one sprint someone proposed a new software development methodology: 'Terminology-Driven Programming' means to pick a fancy name, then discuss what it could mean, then implement it. Examples: timeshifter, rainbow interpreter, meta-space bubble, hint annotations (all but one of these really existed).
                • +
                • There is a conspiracy theory that the reason why translation is so slow is because time is stored away during it, which is later retrieved when an actual program runs to make them appear faster
                • +
                +

                Overall, it was a really long road. However, 10 years later we are in +good shape. A quick look at the immediate future: we are approaching +PyPy 2.0 with stackless+JIT and cffi support, +the support for Python 3 is taking shape, non-standard +extensions like STM are slowly getting ready (more soon), and there are +several non-Python interpreters around the corner (Hippy, Topaz and more).

                +

                Cheers,
                +fijal, arigo, hodgestar, cfbolz and the entire pypy team.

                + + +
                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-02-28 22:43: +
                +
                +

                My best wishes to whole PyPy team! And thanks for all the hard work!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-02-28 23:01: +
                +
                +

                You guys rock!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-02-28 23:04: +
                +
                +

                Best blog posting - ever! Heres to another 10 pypy years and N llvm endeavours. -- rxe

                +
                +
                +
                +
                + + Anonymous wrote on 2013-02-28 23:33: +
                +
                +

                You've made a great work so far, please continue with it!!

                +
                +
                +
                +
                + + Vanessa wrote on 2013-03-01 00:37: +
                +
                +

                Only those who dare to fail greatly can ever achieve greatly. --RFK
                Congrats, guys!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-01 01:45: +
                +
                +

                Congratulations and thank you for the great work, looking forward to the next 10 years!

                +
                +
                +
                +
                + + dmatos wrote on 2013-03-01 02:16: +
                +
                +

                Great work!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-01 06:20: +
                +
                +

                How will PyPy impact Python future and it's adoption as preferred language?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-01 08:23: +
                +
                +

                indeed: congratulations and much respect for the perseverance and hard work you have put into this project over the years!

                +
                +
                +
                +
                + + Gaëtan de Menten wrote on 2013-03-01 08:42: +
                +
                +

                First, congratulations for keeping at it for 10 years! PyPy is one of the most interesting project I know of.

                This blog post is also very interesting but by reading it I can't help but think: are all those "failures" documented somewhere in one place? It could be a very interesting read.

                Or more specifically:
                * Why was the JavaScript backend a horrible idea?
                * Why is RPython a bad language (for writing CPython extensions)?
                * What went wrong in the different attempts at using LLVM?
                * What were those "70% of features" that were dropped after the EU project?

                +
                +
                +
                +
                + + glyph wrote on 2013-03-01 09:16: +
                +
                +

                Congratulations! Here's to another 10 years!

                And the JavaScript backend was a great idea - bring it back! It's certainly better than the other Python-to-JS translators out there, at least in terms of actually parsing some Python. I want Python in my browser!

                +
                +
                +
                +
                + + kayhayen wrote on 2013-03-01 11:29: +
                +
                +

                I was and always will be impressed by PyPy. And the self-critic of this post only furthers it. You are cool people, looking forward to meet you again.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-01 12:12: +
                +
                +

                I remember 10 years ago, when I decided to learn to program... I didn't know what language to choose, and someone suggested python. It was someone I approached through a mailing list, and he was passionate explaining why python is so special.

                I remember reading about it being cool but with a "performance problem". However, there were some nerds out there talking about a minimal python, that would eventually become a fast python, so I said "cool, perhaps in a few months there will be a fast python...".

                I spent ten years following silently this story, and I'm happy to say "Happy birthday Pypy!".

                I've never met any of you, but I feel I know you.
                You showed me the value of perseverance, that every failure is one step closer to success.

                Congratulations and a big THANK YOU!
                Luis Gonzalez, from Buenos Aires.

                +
                +
                +
                +
                + + Paul Jaros wrote on 2013-03-01 14:12: +
                +
                +

                PyPy is my favorite open-source project. Best of wishes for the future development.
                May you find all the funding you need, become the leading STM Implementation and become the defacto Python standard.

                +
                +
                +
                +
                + + Stefane Fermigier wrote on 2013-03-01 14:34: +
                +
                +

                +1 on Gaëtan de Menten's comment.

                +
                +
                +
                +
                + + Daniel wrote on 2013-03-01 22:06: +
                +
                +

                One more +1 on Gaëtan de Menten's comment. :)

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-02 01:06: +
                +
                +

                You are incredible people and you do such cool stuff! Best of luck to you and keep up the great work!

                +
                +
                +
                +
                + + Arne Babenhauserheide wrote on 2013-03-02 11:03: +
                +
                +

                Thank you for the great post - and thank you for sticking to it and finding ways to get time to make it work - including to add everything under the sun into that EU project to be able to go full-time!

                You’re a great example how to really do stuff right - by actually doing it and keeping at it through every stumbling block on the way.

                Happy birthday - and thank you for pypy!

                +
                +
                +
                +
                + + Jan Brohl wrote on 2013-03-03 12:32: +
                +
                +

                +1 on Gaëtan de Menten's comment.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-04 14:11: +
                +
                +

                I'd also like to see the failures documented. Trying and failing is a great way to learn - but even better is to learn from other's failures.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-05 11:49: +
                +
                +

                Great work guys! Happy birthday PyPy!

                +
                +
                +
                +
                + + Электроник wrote on 2013-03-10 01:34: +
                +
                +

                Thanks for making fast Python possible and creating a masterpiece in process!
                About Terminology-Driven Programming: let me guess, the only nonexistent thing is a timeshifter? Three other names make a lot of sense in context of PyPy.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-03-23 16:42: +
                +
                +

                Электроник: no :-) Try again.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/02/announcing-topaz-rpython-powered-ruby-6662407703061538341.html b/posts/2013/02/announcing-topaz-rpython-powered-ruby-6662407703061538341.html new file mode 100644 index 000000000..f1381277e --- /dev/null +++ b/posts/2013/02/announcing-topaz-rpython-powered-ruby-6662407703061538341.html @@ -0,0 +1,468 @@ + + + + + +Announcing Topaz, an RPython powered Ruby interpreter | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Announcing Topaz, an RPython powered Ruby interpreter

                + + + +
                +

                Hello everyone

                + +

                Last week, Alex Gaynor announced the first public release of +Topaz, +a Ruby interpreter written in RPython. This is the culmination of a +part-time effort over the past 10 months to provide a Ruby interpreter +that implements enough interesting constructs in Ruby to show that the +RPython toolchain can produce a Ruby implementation fast enough to +beat what is out there.

                + +

                Disclaimer

                + +

                Obviously the implementation is very incomplete currently in terms of +available standard library. We are working on getting it useable. If +you want to try it, grab a +nightly build.

                + +

                We have run some benchmarks from the +Ruby benchmark suite +and the +metatracing VMs experiment. The +preliminary results are promising, but at this point we are missing so +many method implementations that most benchmarks won't run yet. So instead of +performance, I'm going to talk about the high-level structure of the +implementation.

                + +

                Architecture

                + +

                Topaz interprets a custom bytecode set. The basics are similar to +Smalltalk VMs, with bytecodes for loading and storing locals and +instance variables, sending messages, and stack management. Some +syntactical features of Ruby, such as defining classes and modules, +literal regular expressions, hashes, ranges, etc also have their own +bytecodes. The third kind of bytecodes are for control flow constructs +in Ruby, such as loops, exception handling, break, continue, etc.

                + +

                In trying to get from Ruby source code to bytecode, we found that the +easiest way to support all of the Ruby syntax is to write a custom +lexer and use an RPython port of PLY +(fittingly called RPly) to create the +parser from the Ruby yacc grammar.

                + +

                The Topaz interpreter uses an ObjectSpace (similar to how PyPy does +it), to interact with the Ruby world. The object space contains all +the logic for wrapping and interacting with Ruby objects from the +VM. Its __init__ method sets up the core classes, initial globals, +and creates the main thread (the only one right now, as we do not have +threading, yet).

                + +

                Classes are mostly written in Python. We use ClassDef objects to +define the Ruby hierarchy and attach RPython methods to Ruby via +ClassDef decorators. These two points warrant a little explanation.

                + +
                Hierarchies
                + +

                All Ruby classes ultimately inherit from BasicObject. However, most +objects are below Object (which is a direct subclass of +BasicObject). This includes objects of type Fixnum, Float, +Class, and Module, which may not need all of the facilities of +full objects most of the time.

                + +

                Most VMs treat such objects specially, using tagged pointers to +represent Fixnums, for example. Other VMs (for example from the +SOM Family) +don't. In the latter case, the implementation hierarchy matches the +language hierarchy, which means that objects like Fixnum share a +representation with all other objects (e.g. they have class pointers +and some kind of instance variable storage).

                + +

                In Topaz, implementation hierarchy and language hierarchy are +separate. The first is defined through the Python inheritance. The +other is defined through the ClassDef for each Python class, where the +appropriate Ruby superclass is chosen. The diagram below shows how the +implementation class W_FixnumObject inherits directly from +W_RootObject. Note that W_RootObject doesn't have any attrs, +specifically no storage for instance variables and no map (for +determining the class - we'll get to that). These attributes are +instead defined on W_Object, which is what most other implementation +classes inherit from. However, on the Ruby side, Fixnum correctly +inherits (via Numeric and Integer) from Object.

                + +
                + +
                + +

                This simple structural optimization gives a huge speed boost, but +there are VMs out there that do not have it and suffer performance +hits for it.

                + +
                Decorators
                + +

                Ruby methods can have symbols in their names that are not allowed as +part of Python method names, for example !, ?, or =, so we +cannot simply define Python methods and expose them to Ruby by the +same name.

                + +

                For defining the Ruby method name of a function, as well as argument +number checking, Ruby type coercion and unwrapping of Ruby objects to +their Python equivalents, we use decorators defined on ClassDef. When +the ObjectSpace initializes, it builds all Ruby classes from their +respective ClassDef objects. For each method in an implementation +class that has a ClassDef decorator, a wrapper method is generated and +exposed to Ruby. These wrappers define the name of the Ruby method, +coerce Ruby arguments, and unwrap them for the Python method.

                + +

                Here is a simple example:

                + +
                @classdef.method("*", times="int")
                +def method_times(self, space, times):
                +    return self.strategy.mul(space, self.str_storage, times)
                +
                + +

                This defines the method * on the Ruby String class. When this is +called, the first argument is converted into a Ruby Fixnum object +using the appropriate coercion method, and then unwrapped into a plain +Python int and passed as argument to method_times. The wrapper +method also supplies the space argument.

                + +

                Object Structure

                + +

                Ruby objects have dynamically defined instance variables and may +change their class at any time in the program (a concept called +singleton class +in Ruby - it allows each object to have unique behaviour). To still +efficiently access instance variables, you want to avoid dictionary +lookups and let the JIT know about objects of the same class that have +the same instance variables. Topaz, like PyPy (which got it from +Self), implements instances using maps, which transforms dictionary +lookups into array accesses. See the +blog post +for the details.

                + +

                This is only a rough overview of the architecture. If you're +interested, get in touch on +#topaz.freenode.net, follow the +Topaz Twitter account or contribute +on GitHub.

                + +Tim Felgentreff +
                +

                Comments

                +
                +
                +
                + + Shin Guey wrote on 2013-02-12 19:25: +
                +
                +

                Interesting. Although I code a lot in python but still quite like Ruby. Am looking forward for a fast ruby...

                +
                +
                +
                +
                + + Unknown wrote on 2013-02-12 20:37: +
                +
                +

                Does this mean that JVM is now obsolete?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-02-13 14:36: +
                +
                +

                Don't worry. JVM will outlive you and your grandgrandchildren.

                +
                +
                +
                +
                + + smurfix wrote on 2013-02-17 09:05: +
                +
                +

                "Its __init__ method", not "It's".

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/02/cffi-05-1630643916751622710.html b/posts/2013/02/cffi-05-1630643916751622710.html new file mode 100644 index 000000000..e69fff530 --- /dev/null +++ b/posts/2013/02/cffi-05-1630643916751622710.html @@ -0,0 +1,320 @@ + + + + + +CFFI 0.5 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                CFFI 0.5

                + + + +
                +

                Hi all,

                + +

                A short notice to tell you that CFFI 0.5 was released. This +contains a number of small improvements over 0.4, but has otherwise +been quite stable for a couple of months --- no change since January 10, +apart from the usual last-minute fixes for Python 3 and for Windows.

                + +

                Have fun!

                + +

                Armin

                +
                +

                Comments

                +
                +
                +
                + + Dirkjan Ochtman wrote on 2013-02-08 11:53: +
                +
                +

                Nice! I've added it to the Gentoo package repository; all the tests passed without any issues, this time.

                +
                +
                +
                +
                + + mattip wrote on 2013-03-31 14:41: +
                +
                +

                Note that pypy uses a builtin cffi_backend which must match the cffi version. As of March 31 for instance nightly builds work with cffi 0.6

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/02/cppyy-status-update-808802896237239604.html b/posts/2013/02/cppyy-status-update-808802896237239604.html new file mode 100644 index 000000000..7b790ff54 --- /dev/null +++ b/posts/2013/02/cppyy-status-update-808802896237239604.html @@ -0,0 +1,399 @@ + + + + + +cppyy status update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                cppyy status update

                + + + +
                +

                The cppyy module +provides C++ bindings for PyPy by using the reflection information extracted +from C++ header files by means of the +Reflex package. +In order to support C++11, the goal is to move away from Reflex and instead use +cling, an interactive +C++ interpreter, as the backend. +Cling is based on llvm's +clang. + +The use of a real compiler under the hood has the advantage that it is now +possible to cover every conceivable corner case. +The disadvantage, however, is that every corner case actually has to be +covered. +Life is somewhat easier when calls come in from the python interpreter, as +those calls have already been vetted for syntax errors and all lookups are +well scoped. +Furthermore, the real hard work of getting sane responses from and for C++ +in an interactive environment is done in cling, not in the bindings. +Nevertheless, it is proving a long road (but for that matter clang does not +support all of C++11 yet), so here's a quick status update showing that good +progress is being made. + +

                +

                The following example is on CPython, not PyPy, but moving a third +(after Reflex and +CINT) backend into place +underneath cppyy is straightforward compared to developing the backend +in the first place. + +Take this snippet of C++11 code +(cpp11.C): + +

                +

                +
                    constexpr int data_size() { return 5; }
                +
                +    auto N = data_size();
                +
                +    template<class L, class R>
                +    struct MyMath {
                +       static auto add(L l, R r) -> decltype(l+r) { return l + r; }
                +    };
                +
                +    template class MyMath<int, int>;
                + +

                As a practical matter, most usage of new C++11 features will live in +implementations, not in declarations, and are thus never seen by the bindings. +The above example is therefore somewhat contrived, but it will serve to show +that these new declarations actually work. +The new features used here are +constexpr, +auto, and +decltype. +Here is how you could use these from CPython, using the +PyROOT +package, which has more than a passing resemblance to cppyy, as one is based +on the other: + +

                +

                +
                    import ROOT as gbl
                +    gbl.gROOT.LoadMacro('cpp11.C')
                +
                +    print 'N =', gbl.N
                +    print '1+1 =', gbl.MyMath(int, int).add(1,1)
                + +which, when entered into a file +(cpp11.py) and executed, +prints the expected results: + +

                +
                    $ python cpp11.py
                +    N = 5
                +    1+1 = 2
                + +In the example, the C++ code is compiled on-the-fly, rather than first generating +a dictionary as is needed with Reflex. +A deployment model that utilizes stored pre-compiled information is foreseen +to work with larger projects, which may have to pull in headers from many places. + +

                Work is going to continue first on C++03 on cling with CPython (about 85% of +unit tests currently pass), with a bit of work on C++11 support on the side. +Once fully in place, it can be brought into a new backend for cppyy, after +which the remaining parts of C++11 can be fleshed out for both interpreters. + +

                +

                Cheers,
                +Wim Lavrijsen

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-02-28 00:17: +
                +
                +

                How would memory management work for C++ objects which own PyPy objects? In CPython, or any similar reference counting system, a C++ class can hold only references via special smart pointers. These smart pointers don't need to be registered in any way with the outer class, since there's no need for a garbage collector to traverse from the outer object to the inner smart pointer instances.

                For decent garbage collection to work, presumably one needs to be able to enumerate the PyPy objects pointed to by a C++ object. How would this work?

                +
                +
                +
                +
                + + Wim Lavrijsen wrote on 2013-02-28 00:34: +
                +
                +

                Right now, there are no PyPy objects exposed as such, but only PyObjects through cpyext in support of the python C-API. In cppyy, cpyext is used for any interface that has a PyObject* as argument or return value. It is cpyext that takes care of marrying the ref-count API with the garbage collector.

                Don't pin me down on the details, but from what I understand of cpyext, a wrapper object with the proper C layout is created, and given a life line by putting it in an internal container holding all such objects safe from the gc simply by existing. When the ref count hits zero, the life line gets removed. Object identity is preserved by finding objects in the internal container and reusing them.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/02/hello-everyone-4718797989680066222.html b/posts/2013/02/hello-everyone-4718797989680066222.html new file mode 100644 index 000000000..28af561fa --- /dev/null +++ b/posts/2013/02/hello-everyone-4718797989680066222.html @@ -0,0 +1,302 @@ + + + + + +PyCon Silicon Valley and San Francisco visit | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyCon Silicon Valley and San Francisco visit

                + + + +
                +
                + +

                Hello everyone.

                +

                We (Armin Rigo and Maciej Fijalkowski) are visiting San Francisco/Silicon Valley +for PyCon and beyond. Alex Gaynor, another core PyPy dev, is living there +permanently. My visiting dates are 12-28 of March, Armin's 11-21st. +If you want us to give a talk at your company or simply catch up with us +for dinner, +please get in touch. Write to pypy-dev@python.org if you want this publicly +known, or simply send me a mail at fijall@gmail.com if you don't want it public.

                +

                Cheers,
                +fijal

                +
                +
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/03/numpy-status-update-and-developer-1503421654591696377.html b/posts/2013/03/numpy-status-update-and-developer-1503421654591696377.html new file mode 100644 index 000000000..4737b22e8 --- /dev/null +++ b/posts/2013/03/numpy-status-update-and-developer-1503421654591696377.html @@ -0,0 +1,348 @@ + + + + + +Numpy status update and developer announcement | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Numpy status update and developer announcement

                + + + +
                +
                + +

                Hello, some good news!

                +

                First the update:

                +
                  +
                • +dtype support - NumPy on PyPy now supports non-native storage formats. +Due to a lack of true support for longdoubles in rpython, we decided to back +out the support of longdouble-as-double which was misleading.
                • +
                • +missing ndarray attributes - work has been done toward supporting the +complete set of attributes +on ndarrays. We are progressing alphabetically, and have made it to d. +Unsupported attributes, and unsupported arguments to attribute calls +will raise a NotImplementedError.
                • +
                • +pickling support for numarray - hasn't started yet, but next on the list
                • +
                • There has been some work on exposing FFI routines in numpypy.
                • +
                • Brian Kearns has made progress in improving the numpypy namespace. +The python numpypy submodules now more closely resemble their numpy +counterparts. Also, translated _numpypy submodules are now more properly +mapped to the numpy core c-based submodules, furthering the goal of being +able to install numpy as a pure-python module with few modifications.
                • +
                +

                And now the good news:

                +

                While our funding drive over 2012 did not reach our goal, we still managed to +raise a fair amount of money in donations. So far we only managed to spend around $10 000 of it. +We issued a call for additional developers, and are glad to welcome Romain Guillebert and Ronan Lamy +to the numpypy team. Hopefully we will be able to report on speedier progress soon.

                +

                Cheers,
                +Matti Picus, Maciej Fijalkowski

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + cournape wrote on 2013-03-19 08:46: +
                +
                +

                Regarding long double, that's clearly something you should not waste your time on. I think the way it was implemented in numpy is not good, and I generally advise against it (the only real use I can see is if you need to interoperate with binary formats that use it, but even there, the complete platform specificity of it is a killer).

                +
                +
                +
                +
                + + Power Cords wrote on 2013-03-20 06:15: +
                +
                +

                Joining of two additional developers is a good sign for Numpy and so we hope that they will now focus on speedier progress soon.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/03/py3k-status-update-10-6681398990092286007.html b/posts/2013/03/py3k-status-update-10-6681398990092286007.html new file mode 100644 index 000000000..734d63c1e --- /dev/null +++ b/posts/2013/03/py3k-status-update-10-6681398990092286007.html @@ -0,0 +1,427 @@ + + + + + +Py3k status update #10 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #10

                + + + +
                +

                This is the tenth status update about our work on the py3k branch, which we
                +can work on thanks to all of the people who donated to the py3k proposal.

                +

                There's been significant progress since the last update: the linux x86-32
                +buildbot
                now passes 289 out of approximately 354 modules (with 39 skips) of
                +CPython's regression test suite.

                +

                That means there are only 26 test module failures left! The list of major items
                +remaining for 3.2 compatibility is now short enough to list here, with their
                +related tests:

                +
                  +
                • Tokenizer support for non-ascii identifiers
                • +
                +
                  +
                • test_importlib
                • +
                • test_pep263
                • +
                + +
                  +
                • test_memoryview
                • +
                +
                  +
                • multiprocessing module currently deadlocks
                • +
                +
                  +
                • test_multiprocessing
                • +
                +
                  +
                • Buggy handling of the new extended unpacking syntax by the compiler:
                • +
                +
                  +
                • test_unpack_ex
                • +
                +
                  +
                • The new Global Interpreter Lock and new thread signal handling
                • +
                +
                  +
                • test_threading
                • +
                • test_threadsignals
                • +
                • test_sys
                • +
                +
                  +
                • Upgrade unicodedata to 6.0.0 (requires updates to the actual unicodedata
                  +generation script)
                • +
                +
                  +
                • test_ucn
                • +
                • test_unicode
                • +
                • test_unicodedata
                • +
                + +
                  +
                • test_capi (currently crashes)
                • +
                +
                  +
                • Update int's hash code to match CPython (float's is already updated on the
                  py3k-newhash branch. Note that PyPy 2.x doesn't even totally match
                  +CPython's hashing)
                • +
                +
                  +
                • test_decimal
                • +
                • test_fractions
                • +
                • test_numeric_tower
                • +
                +
                  +
                • Miscellaneous:
                • +
                +
                  +
                • test_complex
                • +
                • test_float
                • +
                • test_peepholer
                • +
                • test_range
                • +
                • test_sqlite (a new cffi based version seems to be coming)
                • +
                • test_ssl
                • +
                • test_struct
                • +
                • test_subprocess
                • +
                • test_sys_settrace
                • +
                • test_time
                • +
                +

                Additionally there are still a number of failures in PyPy's internal test
                +suite. These tests are usually run against untranslated versions of PyPy during
                +development. However, we've now begun running them against a fully translated
                +version of PyPy on the buildbot too (thanks to Amaury for setting this
                +up). This further ensures that our tests and implementation are sane.

                +

                We're getting closer to producing an initial alpha release. Before that happens
                +we'd like to see:

                +
                  +
                • further test fixes
                • +
                • the results of test runs on other major platforms (e.g. linux x86-64 and osx
                  +seem to have some additional failures as of now)
                • +
                • some basic real world testing
                • +
                +

                Finally I'd like to thank Manuel Jacob for his various contributions over the
                +past month, including fixing the array and ctypes modules among other things,
                +and also Amaury Forgeot d'Arc for his ongoing excellent contributions.

                +

                cheers,
                +Phil

                +
                +

                Comments

                +
                +
                +
                + + Ernst Sjöstrand wrote on 2013-03-05 20:47: +
                +
                +

                A chart with failing tests over time would be cool. Or, just work on fixing those tests! :-)

                +
                +
                +
                +
                + + René Dudfield wrote on 2013-03-06 10:54: +
                +
                +

                Congrats!

                +
                +
                +
                +
                + + Arne Babenhauserheide wrote on 2013-03-07 10:59: +
                +
                +

                That’s really, really, REALLY COOL!

                +
                +
                +
                +
                + + Power Cords wrote on 2013-03-12 13:57: +
                +
                +

                Cool. How many errors have been fixed in current update? Is there any log available?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/03/so-you-want-to-try-pypy-4702482800824669595.html b/posts/2013/03/so-you-want-to-try-pypy-4702482800824669595.html new file mode 100644 index 000000000..bd0dc868d --- /dev/null +++ b/posts/2013/03/so-you-want-to-try-pypy-4702482800824669595.html @@ -0,0 +1,442 @@ + + + + + +So, you want to try PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                So, you want to try PyPy

                + + + +
                +
                +

                Hello.

                +

                During the PyCon trip multiple people asked me how exactly they could run +their stuff on PyPy to get the speedups. Now, in an ideal world, +you would just swap CPython with PyPy, everything would run tons of times +faster and everyone would live happily ever after. However, we don't live in +an ideal world and PyPy does not speed up everything you could +potentially run. Chances are that you can run your stuff quite a bit faster, but +it requires quite a bit more R&D than just that. This blog post is an attempt to +explain certain steps that might help. So here we go:

                +
                  +
                • Download and install PyPy. 2.0 beta 1 or upcoming 2.0 beta 2 would be a good +candidate; it's not called a beta for stability reasons.
                • +
                • Run your tests on PyPy. There is absolutely no need for fast software that +does not work. There might be some failures. Usually they're harmless (e.g. +you forgot to close the file); either fix them or at least inspect them. In +short, make sure stuff works.
                • +
                • Inspect your stack. In particular, C extensions, while sometimes working, are +a potential source of instability and slowness. Fortunately, +since the introduction of cffi, the ecosystem of PyPy-compatible software +has been growing. Things I know are written with PyPy in mind:
                    +
                  • the new version of pyOpenSSL will support PyPy via cffi
                  • +
                  • +psycopg2cffi is the most actively maintained postgres binding for PyPy, +with pg8000 reported working
                  • +
                  • mysql has a ctypes based implementation (although a cffi-based one would +be definitely better)
                  • +
                  • PyPy 2.0 beta 2 will come with sqlite-using-cffi
                  • +
                  • lxml-cffi
                  • +
                  • +uWSGI, while working, is almost certainly not the best choice. Try +tornado, twisted.web, cyclone.io, gunicorn or gevent +(note: gevent support for PyPy is not quite finished; will write about it +in a separate blog post, but you can't just use the main branch of gevent)
                  • +
                  • consult (and contribute to) pypy compatibility wiki for details (note +that it's community maintained, might be out of date)
                  • +
                  +
                • +
                +
                  +
                • Have benchmarks. If you don't have benchmarks, then performance does not +matter for you. Since PyPy's warm-up time is bad (and yes, we know, we're +working on it), you should leave ample time for warm-ups. Five to ten seconds +of continuous computation should be enough.
                • +
                • Try them. If you get lucky, the next step might be to deploy and be happy. +If you're unlucky, profile and try to isolate bottlenecks. They might be in +a specific library or they might be in your code. The better you can isolate +them, the higher your chances of understanding what's going on.
                • +
                • Don't take it for granted. PyPy's JIT is very good, but there is a variety +of reasons that it might not work how you expect it to. A lot of times it +starts off slow, but a little optimization can improve the speed as much as +10x. Since PyPy's runtime is less mature than CPython, there are higher +chances of finding an obscure corner of the standard library that might be +atrociously slow.
                • +
                • Most importantly, if you run out of options and you have a reproducible +example, please report it. A pypy-dev email, popping into #pypy +on irc.freenode.net, or getting hold of me on twitter are good ways. +You can also contact me directly at fijall at gmail.com. While +it's cool if the example is small, a lot of problems only show up on large +and convoluted examples. As long as I can reproduce it on my machine or I can +log in somewhere, I am usually happy to help.
                • +
                • I typically use a combination of jitviewer, valgrind and +lsprofcalltree to try to guess what's going on. These tools are all +useful, but use them with care. They usually require quite a bit of +understanding before being useful. Also sometimes they're just plain useless +and you need to write your own analysis.
                • +
                +

                I hope this summary of steps to take is useful. We hear a lot of stories +of people trying PyPy, most of them positive, but some of them negative. +If you just post "PyPy didn't work for me" on your blog, that's +cool too, but you're missing an opportunity. The reasons may vary from +something serious like "this is a bad pattern for PyPy GC" to something +completely hilarious like "oh, I left this sys._getframe() somewhere +in my hot loops for debugging" or "I used the logging module which uses +sys._getframe() all over the place".

                +

                Cheers,
                +fijal

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2013-03-28 09:45: +
                +
                +

                waiting for gevent's support

                +
                +
                +
                +
                + + Anonymous wrote on 2013-03-28 13:39: +
                +
                +

                Just curious, why is uwsgi not the best choice?

                +
                +
                +
                +
                + + Unknown wrote on 2013-03-28 21:28: +
                +
                +

                I'm also curious what are the issues with uWSGI.

                +
                +
                +
                +
                + + Unknown wrote on 2013-03-28 22:12: +
                +
                +

                As the main uWSGI author i can only confirm the post. Embedding pypy in c applications (not the inverse) is still hacky, and afaik uWSGi is the only project trying to do it. So albeit the combo works, it is only a proof of concept that require still lot of effort (both from pypy and uWSGI) to be production-ready.

                +
                +
                +
                +
                + + Jacob Stoner wrote on 2013-03-28 23:06: +
                +
                +

                looking forward to the post on gevent with pypy

                +
                +
                +
                +
                + + Josell wrote on 2013-03-29 05:04: +
                +
                +

                Ruby or nothing. Sorry.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-04-02 13:05: +
                +
                +

                thanks for share...

                +
                +
                +
                +
                + + Anonymous wrote on 2013-04-02 14:46: +
                +
                +

                will there maybe be an asm.js backend for pypy? :) that would be kind of nice. finally python in the browser.

                to me it seems like asm.js will be more successful than google's native client since it is much simpler to implement and since it is a subset of javascript it already works everywhere, just slower.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/04/pypy-20-beta-2-released-4858660312787995512.html b/posts/2013/04/pypy-20-beta-2-released-4858660312787995512.html new file mode 100644 index 000000000..3dd79c03f --- /dev/null +++ b/posts/2013/04/pypy-20-beta-2-released-4858660312787995512.html @@ -0,0 +1,439 @@ + + + + + +PyPy 2.0 beta 2 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.0 beta 2 released

                + + + +
                +
                +

                We're pleased to announce the 2.0 beta 2 release of PyPy. This is a major +release of PyPy and we're getting very close to 2.0 final, however it includes +quite a few new features that require further testing. Please test and report +issues, so we can have a rock-solid 2.0 final. It also includes a performance +regression of about 5% compared to 2.0 beta 1 that we hope to fix before +2.0 final. The ARM support is not working yet and we're working hard to +make it happen before the 2.0 final. The new major features are:

                +
                  +
                • JIT now supports stackless features, that is greenlets and stacklets. This +means that JIT can now optimize the code that switches the context. It enables +running eventlet and gevent on PyPy (although gevent requires some +special support that's not quite finished, read below).
                • +
                • This is the first PyPy release that includes cffi as a core library. +Version 0.6 comes included in the PyPy library. cffi has seen a lot of +adoption among library authors and we believe it's the best way to wrap +C libraries. You can see examples of cffi usage in _curses.py and +_sqlite3.py in the PyPy source code.
                • +
                +

                You can download the PyPy 2.0 beta 2 release here:

                +
                +https://pypy.org/download.html +
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.3. It's fast (pypy 2.0 beta 2 and cpython 2.7.3 +performance comparison) due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. It also supports ARM machines running Linux, however this is +disabled for the beta 2 release. +Windows 64 work is still stalling, we would welcome a volunteer +to handle that.

                +
                +
                +

                How to use PyPy?

                +

                We suggest using PyPy from a virtualenv. Once you have a virtualenv +installed, you can follow instructions from pypy documentation on how +to proceed. This document also covers other installation schemes.

                +
                +
                +

                Highlights

                +
                  +
                • +cffi is officially supported by PyPy. It comes included in the standard +library, just use import cffi +
                • +
                • stackless support - eventlet just works and gevent requires pypycore +and pypy-hacks branch of gevent (which mostly disables cython-based +modules)
                • +
                • callbacks from C are now much faster. pyexpat is about 3x faster, cffi +callbacks around the same
                • +
                • +__length_hint__ is implemented (PEP 424)
                • +
                • a lot of numpy improvements
                • +
                +
                +
                +

                Improvements since 1.9

                +
                  +
                • +JIT hooks are now a powerful tool to introspect the JITting process that +PyPy performs
                • +
                • various performance improvements compared to 1.9 and 2.0 beta 1
                • +
                • operations on long objects are now as fast as in CPython (from +roughly 2x slower)
                • +
                • we now have special strategies for dict/set/list which contain +unicode strings, which means that now such collections will be both faster +and more compact.
                • +
                +
                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-04-08 08:30: +
                +
                +

                why do you ship with pypy sqlite version 3.5.9 (windows version),
                this is an old version which doesn't support wal mode

                2008-May-12 - Version 3.5.9

                +
                +
                +
                +
                + + Anonymous wrote on 2013-04-08 16:40: +
                +
                +

                Congratulations! And hope the ARM version of PyPy together with ARM v6 support will also coming soon.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-04-08 17:26: +
                +
                +

                Can you explain "performance regression of about 5% " and also "various performance improvements compared to 1.9 and 2.0 beta 1"?

                What is faster and what is slower?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-04-12 11:59: +
                +
                +

                And we've got a lot of segfaults with beta2…

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-04-12 12:11: +
                +
                +

                @Anonymous - please report those. It's impossible for us to determine what's going on without reporting back.

                +
                +
                +
                +
                + + Mak Sim wrote on 2013-04-18 10:59: +
                +
                +

                Thank you for great job.
                Do you plan to release 64-bit binaries for Windows?
                I'm trying to build from tag "pypy-2.0-beta2" under Windows7 x64, with MSVC compiler AMD-64, and I've got an exception:

                [translation:ERROR] TypeError: <Struct PyTypeObject { c_ob_refcnt, c__pad0, c__pad1, c__pad2, c__pad3, c_ob_type, c_ob_size, c__pad4, c__pad5, c__pad6, c__pad7, c_tp_name, c_tp_basicsize, c_tp_itemsize, c_tp_dealloc, c_tp_print, c_tp_getattr, c_tp_setattr, c_tp_compare, c_tp_repr, c_tp_as_number, c_tp_as_sequence, c_tp_as_mapping, c_tp_hash, c_tp_call, c_tp_str, c_tp_getattro, c_tp_setattro, c_tp_as_buffer, c_tp_flags, c__pad8, c__pad9, c__pad10, c__pad11, c_tp_doc, c_tp_traverse, c_tp_clear, c_tp_richcompare, c_tp_weaklistoffset, c__pad12, c__pad13, c__pad14, c__pad15, c_tp_iter, c_tp_iternext, c_tp_methods, c_tp_members, c_tp_getset, c_tp_base, c_tp_dict, c_tp_descr_get, c_tp_descr_set, c_tp_dictoffset, c__pad16, c__pad17, c__pad18, c__pad19, c_tp_init, c_tp_alloc, c_tp_new, c_tp_free, c_tp_is_gc, c_tp_bases, c_tp_mro, c_tp_cache, c_tp_subclasses, c_tp_weaklist, c_tp_del, c__pad20, c__pad21, c__pad22, c__pad23, c__pad24, c__pad25, c__pad26, c__pad27 }> instance field 'c_ob_refcnt':
                [translation:ERROR] expects <INT>
                [translation:ERROR] got <Signed>

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-04-18 11:01: +
                +
                +

                As it says in the release announcement, win64 is not supported. You need to build a 32bit binary (using 32bit Python)

                +
                +
                +
                +
                + + Egypt News wrote on 2013-04-22 05:04: +
                +
                +

                great news, waiting for the final v2.0

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/05/numpy-status-update-4176018422530420763.html b/posts/2013/05/numpy-status-update-4176018422530420763.html new file mode 100644 index 000000000..e32098778 --- /dev/null +++ b/posts/2013/05/numpy-status-update-4176018422530420763.html @@ -0,0 +1,369 @@ + + + + + +Numpy Status Update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Numpy Status Update

                + + + +
                +

                Hello Everyone,

                +I've been working on NumPyPy since the end of April and here is a short update:

                +
                  +
                • I implemented pickling support on ndarrays and dtypes; it will be compatible with numpy's pickling protocol once the "numpypy" module is renamed to "numpy".
                • +
                • I am now working on subarrays.
                • +
                +
                +
                +
                +
                +I would also like to thank everyone who donated and allowed me to work on this.
                +
                +
                +
                +
                +Cheers,
                +
                +Romain Guillebert
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-05-12 11:09: +
                +
                +

                No, thank you! Cannot wait till the day PyPy fully supports NumPy.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-05-13 00:19: +
                +
                +

                I second the anonymous comment above. The day PyPy fully supports NumPy is the day I switch from CPython.

                +
                +
                +
                +
                + + Paul Jaros wrote on 2013-05-13 08:32: +
                +
                +

                Aww... Anonymous.

                @Romain Guillebert Thank you for the hard work you are putting into it. I will be testing my code with the current release.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-05-13 18:38: +
                +
                +

                This (and to a lesser extent Python 3 support) is the only thing holding me back from switching to PyPy for all of my python programming. Thank you very much for this fantastic project!

                +
                +
                +
                +
                + + Paul Jaros wrote on 2013-05-14 22:44: +
                +
                +

                Results from running my own little Benchmark: Labyrinth Generator
                Array Size: 77711x711:

                C-Code:
                4.45 Seconds, ~50M Memory Usage.

                Pypy with standard List:
                14.5 Seconds, ~750M Memory Usage.

                Pypy with Numpypy:
                11.0 Seconds, ~78M Memory Usage.

                Pretty impressive if you ask me. Older Numpypy where about as fast as the standard List. Also Pypy is approaching C-Performance with bigger steps than I dared hoping for.

                CPython Benchmark intentionally left out... it takes ages.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-05-15 14:50: +
                +
                +

                It's great to see a progress in important libraries support.

                Speed is important, but when we get acceptable speed then library support is what we need.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.html b/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.html new file mode 100644 index 000000000..a86dc5c70 --- /dev/null +++ b/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.html @@ -0,0 +1,701 @@ + + + + + +PyPy 2.0 alpha for ARM | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.0 alpha for ARM

                + + + +
                +
                + +

                Hello.

                +

                We're pleased to announce an alpha release of PyPy 2.0 for ARM. This is mostly +a technology preview, as we know the JIT is not yet stable enough for the +full release. However please try your stuff on ARM and report back.

                +

                This is the first release that supports a range of ARM devices - anything with +ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard, Chromebook, +Cubieboard, etc.) that supports VFPv3 should work. We provide builds with +support for both ARM EABI variants: hard-float and, for some older operating +systems, soft-float.

                +

                This release comes with a list of limitations, consider it alpha quality, +not suitable for production:

                +
                  +
                • stackless support is missing.
                • +
                • assembler produced is not always correct, but we successfully managed to +run large parts of our extensive benchmark suite, so most stuff should work.
                • +
                +

                You can download the PyPy 2.0 alpha ARM release here (including a deb for raspbian):

                +
                +https://pypy.org/download.html +
                +

                Part of the work was sponsored by the Raspberry Pi foundation.

                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7.3. It's fast due to its integrated tracing JIT compiler.

                +

                This release supports ARM machines running Linux 32bit. Both hard-float +armhf and soft-float armel builds are provided. armhf builds are +created using the Raspberry Pi custom cross-compilation toolchain based on +gcc-arm-linux-gnueabihf and should work on ARMv6 and ARMv7 devices running at +least debian or ubuntu. armel builds are built using gcc-arm-linux-gnueabi +toolchain provided by ubuntu and currently target ARMv7. If there is interest +in other builds, such as gnueabi for ARMv6 or without requiring a VFP let us +know in the comments or in IRC.

                +
                +
                +

                Benchmarks

                +

                Everybody loves benchmarks. Here is a table of our benchmark suite +(for ARM we don't provide it yet on https://speed.pypy.org, +unfortunately).

                +

                This is a comparison of Cortex A9 processor with 4M cache and Xeon W3580 with +8M of L3 cache. The set of benchmarks is a subset of what we run for +https://speed.pypy.org that finishes in reasonable time. The ARM machine +was provided by Calxeda. +Columns are respectively:

                +
                  +
                • benchmark name
                • +
                • PyPy speedup over CPython on ARM (Cortex A9)
                • +
                • PyPy speedup over CPython on x86 (Xeon)
                • +
                • speedup on Xeon vs Cortex A9, as measured on PyPy
                • +
                • speedup on Xeon vs Cortex A9, as measured on CPython
                • +
                • relative speedup (how much bigger the x86 speedup is over ARM speedup)
                • +
                + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Benchmark | PyPy vs CPython (arm) | PyPy vs CPython (x86) | x86 vs arm (pypy) | x86 vs arm (cpython) | relative speedup
                ai | 3.61 | 3.16 | 7.70 | 8.82 | 0.87
                bm_mako | 3.41 | 2.11 | 8.56 | 13.82 | 0.62
                chaos | 21.82 | 17.80 | 6.93 | 8.50 | 0.82
                crypto_pyaes | 22.53 | 19.48 | 6.53 | 7.56 | 0.86
                django | 13.43 | 11.16 | 7.90 | 9.51 | 0.83
                eparse | 1.43 | 1.17 | 6.61 | 8.12 | 0.81
                fannkuch | 6.22 | 5.36 | 6.18 | 7.16 | 0.86
                float | 5.22 | 6.00 | 9.68 | 8.43 | 1.15
                go | 4.72 | 3.34 | 5.91 | 8.37 | 0.71
                hexiom2 | 8.70 | 7.00 | 7.69 | 9.56 | 0.80
                html5lib | 2.35 | 2.13 | 6.59 | 7.26 | 0.91
                json_bench | 1.12 | 0.93 | 7.19 | 8.68 | 0.83
                meteor-contest | 2.13 | 1.68 | 5.95 | 7.54 | 0.79
                nbody_modified | 8.19 | 7.78 | 6.08 | 6.40 | 0.95
                pidigits | 1.27 | 0.95 | 14.67 | 19.66 | 0.75
                pyflate-fast | 3.30 | 3.57 | 10.64 | 9.84 | 1.08
                raytrace-simple | 46.41 | 29.00 | 5.14 | 8.23 | 0.62
                richards | 31.48 | 28.51 | 6.95 | 7.68 | 0.91
                slowspitfire | 1.28 | 1.14 | 5.91 | 6.61 | 0.89
                spambayes | 1.93 | 1.27 | 4.15 | 6.30 | 0.66
                sphinx | 1.01 | 1.05 | 7.76 | 7.45 | 1.04
                spitfire | 1.55 | 1.58 | 5.62 | 5.49 | 1.02
                spitfire_cstringio | 9.61 | 5.74 | 5.43 | 9.09 | 0.60
                sympy_expand | 1.42 | 0.97 | 3.86 | 5.66 | 0.68
                sympy_integrate | 1.60 | 0.95 | 4.24 | 7.12 | 0.60
                sympy_str | 0.72 | 0.48 | 3.68 | 5.56 | 0.66
                sympy_sum | 1.99 | 1.19 | 3.83 | 6.38 | 0.60
                telco | 14.28 | 9.36 | 3.94 | 6.02 | 0.66
                twisted_iteration | 11.60 | 7.33 | 6.04 | 9.55 | 0.63
                twisted_names | 3.68 | 2.83 | 5.01 | 6.50 | 0.77
                twisted_pb | 4.94 | 3.02 | 5.10 | 8.34 | 0.61
                +

                It seems that Cortex A9, while significantly slower than Xeon, has higher +slowdowns with a large interpreter (CPython) than a JIT compiler (PyPy). This +comes as a surprise to me, especially since our ARM assembler is not nearly +as polished as our x86 assembler. As for the causes, various people mentioned +branch predictor, but I would not like to speculate without actually knowing.

                +
                +
                +

                How to use PyPy?

                +

                We suggest using PyPy from a virtualenv. Once you have a virtualenv +installed, you can follow instructions from pypy documentation on how +to proceed. This document also covers other installation schemes.

                +

                We would not recommend using PyPy on ARM in production just yet, +however the day of a stable PyPy ARM release is not far off.

                +

                Cheers,
                +fijal, bivab, arigo and the whole PyPy team

                +
                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-05-08 14:43: +
                +
                +

                Congratulations!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-05-08 14:43: +
                +
                +

                Congratulations!

                +
                +
                +
                +
                + + Rasmus wrote on 2013-05-09 12:43: +
                +
                +

                This is truly amazing! Great work and I'm very interested about the future.

                +
                +
                +
                +
                + + João Magalhães wrote on 2013-05-10 22:48: +
                +
                +

                This is really great news especially for the raspberry pi guys.

                Congratulations !!!

                +
                +
                +
                +
                + + Verona wrote on 2013-07-30 06:19: +
                +
                +

                This is cool!

                +
                +
                +
                +
                + + xaRD wrote on 2015-01-25 12:13: +
                +
                +

                Where I get the source code for the benchmark's you have used?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2015-01-26 09:59: +
                +
                +

                https://foss.heptapod.net/pypy/benchmarks/

                + +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/05/pypy-20-einstein-sandwich-635158782365435530.html b/posts/2013/05/pypy-20-einstein-sandwich-635158782365435530.html new file mode 100644 index 000000000..813da271f --- /dev/null +++ b/posts/2013/05/pypy-20-einstein-sandwich-635158782365435530.html @@ -0,0 +1,397 @@ + + + + + +PyPy 2.0 - Einstein Sandwich | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.0 - Einstein Sandwich

                + + + +
                +
                +

                We're pleased to announce PyPy 2.0. This is a stable release that brings +a swath of bugfixes, small performance improvements and compatibility fixes. +PyPy 2.0 is a big step for us and we hope in the future we'll be able to +provide stable releases more often.

                +

                You can download the PyPy 2.0 release here:

                +
                +https://pypy.org/download.html +
                +

                The two biggest changes since PyPy 1.9 are:

                +
                  +
                • stackless is now supported including greenlets, which means eventlet +and gevent should work (but read below about gevent)
                • +
                • PyPy now contains release 0.6 of cffi as a builtin module, which +is the preferred way of calling C from Python that works well on PyPy
                • +
                +

                If you're using PyPy for anything, it would help us immensely if you fill out +the following survey: https://bit.ly/pypysurvey This is for the developers' +eyes only and we will not make any information public without your agreement.

                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.0 and cpython 2.7.3 performance comparison) +due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. Windows 64 work is still stalling, we would welcome a volunteer +to handle that. ARM support is on the way, as you can see from the recently +released alpha for ARM.

                +
                +
                +

                Highlights

                +
                  +
                • Stackless including greenlets should work. For gevent, you need to check +out pypycore and use the pypy-hacks branch of gevent.
                • +
                • cffi is now a module included with PyPy. (cffi also exists for +CPython; the two versions should be fully compatible.) It is the +preferred way of calling C from Python that works on PyPy.
                • +
                • Callbacks from C are now JITted, which means XML parsing is much faster.
                • +
                • A lot of speed improvements in various language corners, most of them small, +but speeding up some particular corners a lot.
                • +
                • The JIT was refactored to emit machine code which manipulates a "frame" +that lives on the heap rather than on the stack. This is what makes +Stackless work, and it could bring another future speed-up (not done yet).
                • +
                • A lot of stability issues fixed.
                • +
                • Refactoring much of the numpypy array classes, which resulted in removal of +lazy expression evaluation. On the other hand, we now have more complete +dtype support and support more array attributes.
                • +
                +

                Cheers,
                +fijal, arigo and the PyPy team

                +
                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2013-05-09 20:01: +
                +
                +

                I read this as gevent needs a special branch but eventlet doesn't. Is that correct, or does eventlet require you to use that branch as well?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-05-10 01:04: +
                +
                +

                Congrats guys! Thanks so much for all your hard work. Python is awesome, and PyPy makes it more awesome!

                +
                +
                +
                +
                + + Robert wrote on 2013-05-10 13:39: +
                +
                +

                Are we going to get lazy expression evaluation in numpypy back sometime?

                +
                +
                +
                +
                + + Wim Lavrijsen wrote on 2013-05-10 17:26: +
                +
                +

                Another thing that's new, is that cppyy is enabled, albeit that you need to install the Reflex library separately. See (Linux only, sorry): https://doc.pypy.org/en/latest/cppyy.html#installation

                +
                +
                +
                +
                + + Unknown wrote on 2013-07-20 15:24: +
                +
                +

                I'd not say eventlet just works. In this example: https://eventlet.net/doc/examples.html#web-crawler I keep receiving:

                File "/usr/lib/pypy/lib-python/2.7/socket.py", line 430, in read
                data = self._sock.recv(left)
                File "/home/divius/Projects/!demo/eventlet/env/site-packages/eventlet/greenio.py", line 251, in recv
                return fd.recv(buflen, flags)
                File "/usr/lib/pypy/lib-python/2.7/socket.py", line 188, in recv
                return self._sock.recv(buffersize, flags=flags)
                error: [Errno 9] Bad file descriptor

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-07-21 22:24: +
                +
                +

                See https://bugs.pypy.org/issue1492. This was reported and we believe we fixed it on trunk.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/05/pypy-201-bohr-smrrebrd-6316445093061941482.html b/posts/2013/05/pypy-201-bohr-smrrebrd-6316445093061941482.html new file mode 100644 index 000000000..84dc21882 --- /dev/null +++ b/posts/2013/05/pypy-201-bohr-smrrebrd-6316445093061941482.html @@ -0,0 +1,320 @@ + + + + + +PyPy 2.0.1 - Bohr Smørrebrød | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.0.1 - Bohr Smørrebrød

                + + + +
                +

                We're pleased to announce PyPy 2.0.1. This is a stable bugfix release +over 2.0. You can download it here:

                +
                +https://pypy.org/download.html +
                +

                The fixes are mainly about fatal errors or crashes in our stdlib. See +below for more details.

                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.0 and cpython 2.7.3 performance comparison) +due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. Support for ARM is progressing but not bug-free yet.

                +
                +
                +

                Highlights

                + +

                Cheers, +arigo et al. for the PyPy team

                +
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/05/pypy-202-fermi-panini-1917947221142595738.html b/posts/2013/05/pypy-202-fermi-panini-1917947221142595738.html new file mode 100644 index 000000000..c49858132 --- /dev/null +++ b/posts/2013/05/pypy-202-fermi-panini-1917947221142595738.html @@ -0,0 +1,330 @@ + + + + + +PyPy 2.0.2 - Fermi Panini | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.0.2 - Fermi Panini

                + + + +
                +

                We're pleased to announce PyPy 2.0.2. This is a stable bugfix release +over 2.0 and 2.0.1. You can download it here:

                +
                +https://pypy.org/download.html +
                +

                It fixes a crash in the JIT when calling external C functions (with +ctypes/cffi) in a multithreaded context.

                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.0 and cpython 2.7.3 performance comparison) +due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64 or +Windows 32. Support for ARM is progressing but not bug-free yet.

                +
                +
                +

                Highlights

                +

                This release contains only the fix described above. A crash (or wrong +results) used to occur if all these conditions were true:

                +
                  +
                • your program is multithreaded;
                • +
                • it runs on a single-core machine or a heavily-loaded multi-core one;
                • +
                • it uses ctypes or cffi to issue external calls to C functions.
                • +
                +

                This was fixed in the branch emit-call-x86 (see the example file +bug1.py).

                +

                Cheers, +arigo et al. for the PyPy team

                +
                +
                +

                Comments

                +
                +
                +
                + + Valentin wrote on 2013-07-22 09:12: +
                +
                +

                This is cool!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/06/numpypy-status-update-3846626188716521472.html b/posts/2013/06/numpypy-status-update-3846626188716521472.html new file mode 100644 index 000000000..a0badcc3e --- /dev/null +++ b/posts/2013/06/numpypy-status-update-3846626188716521472.html @@ -0,0 +1,375 @@ + + + + + +NumPyPy status update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPyPy status update

                + + + +
                +

                Hello everyone,

                +May was the first month I was paid to work on NumPyPy (thanks to all who donated!). Here is what I worked on during this period:

                +
                  +
                • It is now possible to use subarrays.
                • +
                • It is now possible to pickle ndarrays (including those using subarrays), dtypes and scalars, the pickling protocol is the same as numpy's.
                • +
                +
                +
                +
                +
                +For June, I plan to work on the nditer class, it seems that there's enough work for an entire month.
                +
                +Cheers
                +Romain Guillebert +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-06-03 18:49: +
                +
                +

                What's numpypy's recommended way for a C/cffi extension to get a pointer to the data?

                Thanks,
                Andreas

                +
                +
                +
                +
                + + Anonymous wrote on 2013-06-04 08:37: +
                +
                +

                Excellent work!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-06-04 10:34: +
                +
                +

                Thanks! But pickling sliced arrays doesn't work yet (tested with nightly build pypy-c-jit-64739-f556942951f9-linux):

                import cPickle as pickle
                import numpypy as numpy
                a = numpy.arange(10.)[::2]
                print a # [ 0. 2. 4. 6. 8.]
                p = pickle.dumps(a)
                print pickle.loads(p) # [ 0. 1. 2. 3. 4.] oops!

                +
                +
                +
                +
                + + Romain Guillebert wrote on 2013-06-04 19:55: +
                +
                +

                @Anonymous

                Thanks for reporting it, it's fixed

                +
                +
                +
                +
                + + Anonymous wrote on 2013-06-04 21:16: +
                +
                +

                Great to hear about the progress, keep up the good work!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-12-05 22:23: +
                +
                +

                It is working very well for me, thanks!

                Now, is there any way to load the resulting pickle in cPython?

                numpy.save and numpy.load do work between pypy and cPython, but my arrays are embedded in larger data structures.

                The motivation is that I would like to run a numerical program and store some results, and then load the results and plot them with matplotlib (which does not work on pypy).

                Here is the error in cPython:

                >>> pickle.load(open('/tmp/x', 'r+b'))
                Traceback (most recent call last):
                File "", line 1, in
                File "/usr/lib/python2.7/pickle.py", line 1378, in load
                return Unpickler(file).load()
                File "/usr/lib/python2.7/pickle.py", line 858, in load
                dispatch[key](self)
                File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
                klass = self.find_class(module, name)
                File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
                __import__(module)
                ImportError: No module named _numpypy.multiarray

                +
                +
                +
                +
                + + Anonymous wrote on 2013-12-05 22:28: +
                +
                +

                It is working great, thanks!

                Now, is there any way to load the resulting pickle in cPython?

                numpy.save and numpy.load do work between pypy and cPython, but my arrays are embedded in larger data structures.

                The motivation is that I would like to run a numerical program and store some results, and then load the results and plot them with matplotlib (which does not work on pypy).

                Here is the error in cPython:

                >>> pickle.load(open('/tmp/x', 'r+b'))
                Traceback (most recent call last):
                File "", line 1, in
                File "/usr/lib/python2.7/pickle.py", line 1378, in load
                return Unpickler(file).load()
                File "/usr/lib/python2.7/pickle.py", line 858, in load
                dispatch[key](self)
                File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
                klass = self.find_class(module, name)
                File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
                __import__(module)
                ImportError: No module named _numpypy.multiarray

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/06/py3k-status-update-11-133025715908408072.html b/posts/2013/06/py3k-status-update-11-133025715908408072.html new file mode 100644 index 000000000..fb08b1bf4 --- /dev/null +++ b/posts/2013/06/py3k-status-update-11-133025715908408072.html @@ -0,0 +1,412 @@ + + + + + +Py3k status update #11 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #11

                + + + +
                +

                This is the 11th status update about our work on the py3k branch, which we
                +can work on thanks to all of the people who donated to the py3k proposal.

                +

                Here are some highlights of the progress made since the previous update:

                +
                  +
                • PyPy py3k now matches CPython 3's hash code for
                  +int/float/complex/Decimal/Fraction
                • +
                • Various outstanding unicode identifier related issues were
                  +resolved. E.g. test_importlib/pep263/ucn/unicode all now fully pass. Various
                  +usage of identifiers (in particular type and module names) have been fixed to
                  +handle non-ascii names -- mostly around display of reprs and exception
                  +messages.
                • +
                • The unicodedata database has been upgraded to 6.0.0.
                • +
                • Windows support has greatly improved, though it could still use some more
                  +help (but so does the default branch to a certain degree).
                • +
                • Probably the last of the parsing related bugs/features have been taken care
                  +of.
                • +
                • Of course various other smaller miscellaneous fixes
                • +
                +

                This leaves the branch w/ only about 5 outstanding failures of the stdlib test
                +suite:

                +
                  +
                • +

                  test_float

                  +

                  1 failing test about containment of floats in collections.

                  +
                • +
                • +

                  test_memoryview

                  +

                  Various failures: requires some bytes/str changes among other things (Manuel
                  +Jacob has some progress on this on the py3k-memoryview branch)

                  +
                • +
                • +

                  test_multiprocessing

                  +

                  1 or more tests deadlock on some platforms

                  +
                • +
                • +

                  test_sys and test_threading

                  +

                  2 failing tests for the New GIL's new API

                  +
                • +
                +

                Probably the biggest feature left to tackle is the New GIL.

                +

                We're now pretty close to pushing an initial release. We had planned for one
                +around PyCon, but having missed that we've put some more effort into the branch
                +to provide a more fully-fledged initial release.

                +

                Thanks to the following for their contributions: Manuel Jacob, Amaury Forgeot
                +d'Arc, Karl Ramm, Jason Chu and Christian Hudon.

                +

                cheers,
                +Phil

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-06-14 12:20: +
                +
                +

                In my new project I'm using Python3.
                I can't when I will run it with PyPy.

                Thanks for your work!

                +
                +
                +
                +
                + + Unknown wrote on 2013-06-14 20:29: +
                +
                +

                I just donated and found this post :) Great work!

                +
                +
                +
                +
                + + Paul Jaros wrote on 2013-06-17 08:12: +
                +
                +

                The "new GIL" picked my curiosity. Was is it? Is it related to the STM or is it a separate thing?

                Also, thanks for the update.

                +
                +
                +
                +
                + + Philip Jenvey wrote on 2013-06-18 19:35: +
                +
                +

                The new GIL is briefly explained here: https://docs.python.org/3.4/whatsnew/3.2.html#multi-threading

                Additionally, David Beazly has done a couple talks/blog posts about the problems of the old GIL and how the new GIL has improved over the old design.

                +
                +
                +
                +
                + + Paul Jaros wrote on 2013-06-19 12:02: +
                +
                +

                Thanks for the link

                +
                +
                +
                +
                + + randomlessly wrote on 2013-06-22 17:37: +
                +
                +

                kkk @Tom Li

                +
                +
                +
                +
                + + Unknown wrote on 2013-06-23 09:15: +
                +
                +

                Will the pre-release already be optimized?

                +
                +
                +
                +
                + + Tony wrote on 2013-07-31 08:36: +
                +
                +

                This is cool!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/06/stm-on-drawing-board-1028082727566254104.html b/posts/2013/06/stm-on-drawing-board-1028082727566254104.html new file mode 100644 index 000000000..214dcb03e --- /dev/null +++ b/posts/2013/06/stm-on-drawing-board-1028082727566254104.html @@ -0,0 +1,438 @@ + + + + + +STM on the drawing board | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                STM on the drawing board

                + + + +
                +

                Hi all!

                + +

                This is an update about the Software Transactional Memory subproject of +PyPy. I have some good news of progress. Also, +Remi Meier will +likely help me this summer. He did various +investigations with PyPy-STM for his Master's Thesis and contributed back +a lot of ideas and some code. Welcome again Remi!

                + +

                I am also sorry that it seems to advance so slowly. Beyond the usual +excuses --- I was busy with other things, e.g. releasing PyPy 2.0 --- I +would like to reassure people: I'm again working on it, and the financial +contributions are still there and reserved for STM (almost half the money is +left, a big thank you again if you contributed!).

                + +

                The real reason for the apparent slowness, though, is that it is really +a research project. It's possible to either have hard deadlines, or to +follow various tracks and keep improving the basics, but not both at the +same time.

                + +

                During the past month where I have worked again on STM, I worked still on +the second option; and I believe it was worth every second of it. Let me try +to convince you :-)

                + +

                The main blocker was that the STM subsystem, written in C, and the +Garbage Collection (GC) subsystem, written in RPython, were getting +harder and harder to coordinate. So what I did instead is to give up +using RPython in favor of using only C for both. C is a good language +for some things, which includes low-level programming where we must take +care of delicate multithreading issues; RPython is not a good fit in +that case, and wasn't designed to be.

                + +

                I started a fresh Mercurial repo +which is basically a stand-alone C library. This library (in heavy development +right now!) gives any C +program some functions to allocate and track GC-managed objects, and +gives an actual STM+GC combination on these objects. It's possible +(though rather verbose) to use it directly in C programs, like in a +small example interpreter. Of course the eventual purpose is to link it +with PyPy during translation to C, with all the verbose calls +automatically generated.

                + +

                Since I started this, bringing the GC closer to the STM, I kept finding +new ways that the two might interact to improve the performance, maybe +radically. Here is a summary of the current ideas.

                + +

                When we run +multiple threads, there are two common cases: one is to access (read and write) +objects that have only been seen by the current thread; the other is to read +objects seen by all threads, like in Python the modules/functions/classes, +but not to write to them. Of course, writing to the same object from +multiple threads occurs too, and it is handled correctly (that's the whole +point), but it is a relatively rare case.

                + +

                So each object is classified as "public" or "protected" (or "private", +when they belong to the current transaction). Newly created objects, once +they are no longer private, remain protected until +they are read by a different thread. Now, the point is to use very +different mechanisms for public and for protected objects. Public +objects are visible by all threads, but read-only in memory; to change +them, a copy must be made, and the changes are written to the copy (the +"redolog" approach to STM). Protected objects, on the other hand, are +modified in-place, with (if necessary) a copy of them being made +for the sole purpose of a possible abort of the transaction (the "undolog" +approach).

                + +

                This is combined with a generational GC similar to PyPy's --- but here, +each thread gets its own nursery and does its own "minor collections", +independently of the others.

                + +

                So objects are by default protected; when another thread tries to follow a +pointer to them, then it is that other thread's job to carefully "steal" +the object and turn it public (possibly making a copy of it if needed, +e.g. if it was still a young object living in the original nursery).

                + +

                The same object can exist temporarily in multiple versions: any number +of public copies; at most one active protected copy; and optionally one +private copy per thread (this is the copy as currently seen by the +transaction in progress on that thread). The GC cleans up the +unnecessary copies.

                + +

                These ideas are variants and extensions of the same basic idea +of keeping multiple copies with revision numbers to track them. +Moreover, "read barriers" and "write barriers" are used by the C program +calling into this library in order to be sure that it is accessing the +right version of the object. In the currently investigated variant +I believe it should be possible to have rather cheap +read barriers, which would definitely be a major speed improvement over +the previous variants. Actually, as far as I know, it would be a major +improvement over most of the other existing STMs: in them, the typical read barrier +involves following chains of pointers, and checking some dictionary to see if this +thread has a modified local copy of the object. The difference with a +read barrier that can resolve most cases in a few CPU cycles should be +huge.

                + +

                So, this is research :-) It is progressing, and at some point I'll be +satisfied with it and stop rewriting everything; and then the actual +integration into PyPy should be straightforward (there is already code +to detect where the read and write barriers need to be inserted, where +transactions can be split, etc.). Then there is support for the +JIT to be written, and so on. But more about it later.

                + +

                The purpose of this post was to give you some glimpses into what I'm +working on right now. As usual, no plan for release yet. But you can +look forward to seeing the C library progress. I'll probably also start +soon some sample interpreter in C, to test the waters (likely a +revival of duhton). +If you know nothing about Python but all about the C-level +multithreading issues, now is a good time to get involved :-)

                + +

                Thanks for reading!

                + +

                Armin

                +
                +

                Comments

                +
                +
                +
                + + Paul Jaros wrote on 2013-06-06 12:48: +
                +
                +

                Thanks for the update. I was wondering since some time how the progress in STM has come along.
                Good job also :)

                +
                +
                +
                +
                + + Tuure Laurinolli wrote on 2013-06-18 05:27: +
                +
                +

                Do you have a description of the read and write barriers required somewhere? How does requiring a copy to be made of protected objects upon modification work with e.g. large arrays?

                +
                +
                +
                +
                + + David wrote on 2013-08-04 18:06: +
                +
                +

                Check out John Carmack's brainstorm on an integrated STM+GC system concept which is sort of "globally phased compacting GC+STM". He doesn't use the term STM, but the concept is the same.

                https://www.youtube.com/watch?v=1PhArSujR_A&feature=player_detailpage&t=1354

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/07/europython-8992114341185888806.html b/posts/2013/07/europython-8992114341185888806.html new file mode 100644 index 000000000..dfbc6388c --- /dev/null +++ b/posts/2013/07/europython-8992114341185888806.html @@ -0,0 +1,312 @@ + + + + + +EuroPython | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                EuroPython

                + + + +
                +

                Hi all,

                + +

                A short note: if you're at EuroPython right now and wondering if PyPy is +dead because you don't see the obviously expected talk about PyPy, don't +worry. PyPy is still alive and kicking. The truth is two-fold: (1) we +missed the talk deadline (duh!)... but as importantly, (2) for various +reasons we chose not to travel to Florence this year after our trip to +PyCon US. (Antonio Cuni is at Florence but doesn't have a talk about PyPy +either.)

                + +

                Armin

                +
                +

                Comments

                +
                +
                +
                + + rokujyouhitoma wrote on 2013-07-04 20:25: +
                +
                +

                I think of it for a moment. >dead.
                Also...I can not meet you at EuroPython :(

                See you next time!
                From Japanese Pythonista.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/07/pypy-21-beta-1351105697755187196.html b/posts/2013/07/pypy-21-beta-1351105697755187196.html new file mode 100644 index 000000000..152d276b6 --- /dev/null +++ b/posts/2013/07/pypy-21-beta-1351105697755187196.html @@ -0,0 +1,326 @@ + + + + + +PyPy 2.1 beta | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.1 beta

                + + + +
                +

                We're pleased to announce the first beta of the upcoming 2.1 release of PyPy. This beta contains many bugfixes and improvements, including numerous improvements to the numpy in pypy effort. The main feature is that the ARM processor support is no longer considered alpha level.

                +We would like to thank the Raspberry Pi Foundation for supporting the work to finish PyPy's ARM support.


                +You can download the PyPy 2.1 beta release here:

                +
                +https://pypy.org/download.html +
                +
                +

                +
                +

                +

                +Highlights

                +
                  +
                • Bugfixes to the ARM JIT backend, so that ARM is now an officially
                  +supported processor architecture
                • +
                • Stacklet support on ARM
                • +
                • Interpreter improvements
                • +
                • Various numpy improvements
                • +
                • Bugfixes to cffi and ctypes
                • +
                • Bugfixes to the stacklet support
                • +
                • Improved logging performance
                • +
                • Faster sets for objects
                • +
                +
                +
                +

                +
                +

                +

                +What is PyPy?

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7.3. It's fast due to its integrated tracing JIT compiler. This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows 32. Also this release supports ARM machines running Linux 32bit - anything with ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard, Chromebook, Cubieboard, etc.) that supports VFPv3 should work. Both hard-float armhf/gnueabihf and soft-float armel/gnueabi builds are provided. armhf builds for Raspbian are created using the Raspberry Pi
                custom cross-compilation toolchain based on gcc-arm-linux-gnueabihf and should work on ARMv6 and ARMv7 devices running Debian or Raspbian. armel builds are built using the gcc-arm-linux-gnueabi toolchain provided by Ubuntu and currently target ARMv7.

                +Windows 64 work is still stalling, we would welcome a volunteer to handle that.
                +
                +

                +
                +

                +

                +How to use PyPy?

                +We suggest using PyPy from a virtualenv. Once you have a virtualenv installed, you can follow instructions from pypy documentation on how to proceed. This document also covers other installation schemes.

                +Cheers,

                +the PyPy team.
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/07/pypy-21-beta-2-264349571160808803.html b/posts/2013/07/pypy-21-beta-2-264349571160808803.html new file mode 100644 index 000000000..4365cdb3c --- /dev/null +++ b/posts/2013/07/pypy-21-beta-2-264349571160808803.html @@ -0,0 +1,323 @@ + + + + + +PyPy 2.1 beta 2 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.1 beta 2

                + + + +
                +

                We're pleased to announce the second beta of the upcoming 2.1 release of PyPy.
                +This beta adds one new feature to the 2.1 release and contains several bugfixes listed below.

                +

                You can download the PyPy 2.1 beta 2 release here:

                +
                https://pypy.org/download.html
                +
                +

                Highlights

                +
                  +
                • Support for os.statvfs and os.fstatvfs on unix systems.
                • +
                • Fixed issue 1533: fix an RPython-level OverflowError for space.float_w(w_big_long_number).
                • +
                • Fixed issue 1552: GreenletExit should inherit from BaseException.
                • +
                • Fixed issue 1537: numpypy __array_interface__
                • +
                • Fixed issue 1238: Writing to an SSL socket in pypy sometimes failed with a "bad write retry" message.
                • +
                • +distutils: copy CPython's implementation of customize_compiler, don't call
                  +split on environment variables, honour CFLAGS, CPPFLAGS, LDSHARED and
                  +LDFLAGS.
                • +
                • During packaging, compile the CFFI tk extension.
                • +
                +
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for
                +CPython 2.7.3. It's fast due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows
                +32. Also this release supports ARM machines running Linux 32bit - anything with
                ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard,
                +Chromebook, Cubieboard, etc.) that supports VFPv3 should work.

                +

                Windows 64 work is still stalling, we would welcome a volunteer
                +to handle that.

                +
                +
                +

                How to use PyPy?

                +

                We suggest using PyPy from a virtualenv. Once you have a virtualenv
                +installed, you can follow instructions from pypy documentation on how
                +to proceed. This document also covers other installation schemes.

                +

                Cheers,
                +The PyPy Team.

                +
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/07/pypy-demo-evening-in-london-august-27-3640213278969666664.html b/posts/2013/07/pypy-demo-evening-in-london-august-27-3640213278969666664.html new file mode 100644 index 000000000..e514c94e1 --- /dev/null +++ b/posts/2013/07/pypy-demo-evening-in-london-august-27-3640213278969666664.html @@ -0,0 +1,308 @@ + + + + + +PyPy Demo Evening in London, August 27, 2013 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Demo Evening in London, August 27, 2013

                + + + +
                +

                As promised in the London sprint announcement we are organising a PyPy demo +evening during the London sprint on Tuesday, August 27 2013, 18:30-19:30 (BST). The +description of the event is below. If you want to come, please register on the +Eventbrite page.

                +
                +

                PyPy is a fast Python VM. Maybe you've never used PyPy and want to find out +what use it might be for you? Or you and your organisation have been using it +and you want to find out more about how it works under the hood? If so, this +demo session is for you!

                +

                Members of the PyPy team will give a series of lightning talks on PyPy: its +benefits; how it works; research currently being undertaken to make it +faster; and unusual uses it can be put to. Speakers will be available +afterwards for informal discussions. This is the first time an event like +this has been held in the UK, and is a unique opportunity to speak to core +people. Speakers confirmed thus far include: Armin Rigo, Maciej Fijałkowski, +Carl Friedrich Bolz, Lukas Diekmann, Laurence Tratt, Edd Barrett.

                +

                The venue for this talk is the Software Development Team, King's College +London. The main entrance is on the Strand, from where the room for the event +will be clearly signposted. Travel directions can be found at +https://www.kcl.ac.uk/campuslife/campuses/directions/strand.aspx

                +

                If you have any questions about the event, please contact Laurence Tratt

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/07/pypy-london-sprint-august-26-september-5156945690440578388.html b/posts/2013/07/pypy-london-sprint-august-26-september-5156945690440578388.html new file mode 100644 index 000000000..49f24cb19 --- /dev/null +++ b/posts/2013/07/pypy-london-sprint-august-26-september-5156945690440578388.html @@ -0,0 +1,386 @@ + + + + + +PyPy London Sprint (August 26 - September 1 2013) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy London Sprint (August 26 - September 1 2013)

                + + + +
                +

                The next PyPy sprint will be in London, United Kingdom for the first +time. This is a fully public sprint. PyPy sprints are a very good way +to get into PyPy development and no prior PyPy knowledge is necessary.

                +

                Goals and topics of the sprint

                +

                For newcomers:

                +
                  +
                • bring your application/library and we'll help you port it to PyPy, +benchmark and profile
                • +
                • come and write your favorite missing numpy function
                • +
                • help us work on developer tools like jitviewer
                • +
                +

                We'll also work on:

                +
                  +
                • refactoring the JIT optimizations
                • +
                • STM and STM-related topics
                • +
                • anything else attendees are interested in
                • +
                +

                Exact times

                +

                The work days should be August 26 - September 1 2013 (Monday-Sunday). +The official plans are for people to arrive on the 26th, and +to leave on the 2nd. There will be a break day in the middle. +We'll typically start at 10:00 in the morning.

                +

                Location

                +

                The sprint will happen within a room of King's College's Strand +Campus in Central London, UK. There are some travel instructions how to +get there. We are being hosted by Laurence Tratt and the Software +Development Team.

                +

                Demo Session

                +

                If you don't want to come to the full sprint, but still want to chat a +bit, we are planning to have a demo session on Tuesday August 27. We +will announce this separately on the blog. If you are interested, please +leave a comment.

                +

                Registration

                +

                If you want to attend, please register by adding yourself to the +"people.txt" file in Mercurial:

                +
                +https://bitbucket.org/pypy/extradoc/
                +https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/london-2013
                +
                +

                or on the pypy-dev mailing list if you do not yet have check-in rights:

                +
                +https://mail.python.org/mailman/listinfo/pypy-dev
                +
                +

                Remember that you may need a (insert country here)-to-UK power adapter. +Please note that the UK is not within the Schengen zone, so non-EU and +non-Switzerland citizens may require a specific visa. Please check travel +regulations. Also, the UK uses pound sterling (GBP).

                +
                +

                Comments

                +
                +
                +
                + + griff wrote on 2013-07-19 15:05: +
                +
                +

                I'd be up for joining Andrew :)

                +
                +
                +
                +
                + + Unknown wrote on 2013-07-19 15:06: +
                +
                +

                Cannot quite get a week off for this, but would be very interested in the demo session on the Tuesday.

                +
                +
                +
                +
                + + Daniel wrote on 2013-07-22 11:06: +
                +
                +

                I would be very interested in the demo session on the Tuesday 27th.

                +
                +
                +
                +
                + + Carin Robert wrote on 2013-08-24 07:57: +
                +
                +

                Does the demo session happen on August 27th only? What are the timings?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-08-24 10:42: +
                +
                +

                @Carin: https://morepypy.blogspot.ch/2013/08/preliminary-london-demo-evening-agenda.html

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/07/pypy-san-francisco-sprint-july-27th-2012-3064530444396960172.html b/posts/2013/07/pypy-san-francisco-sprint-july-27th-2012-3064530444396960172.html new file mode 100644 index 000000000..3f73b03b7 --- /dev/null +++ b/posts/2013/07/pypy-san-francisco-sprint-july-27th-2012-3064530444396960172.html @@ -0,0 +1,322 @@ + + + + + +PyPy San Francisco Sprint July 27th 2013 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy San Francisco Sprint July 27th 2013

                + + + +
                +

                The next PyPy sprint will be in San Francisco, California. It is a public
                +sprint, suitable for newcomers. It will run on Saturday July 27th.

                +

                Some possible things people will be hacking on the sprint:

                +
                  +
                • running your software on PyPy
                • +
                • making your software fast on PyPy
                • +
                • improving PyPy's JIT
                • +
                • improving Twisted on PyPy
                • +
                • any exciting stuff you can think of
                • +
                +

                If there are newcomers, we'll run an introduction to hacking on PyPy.

                +

                Location
                +The sprint will be held at the Rackspace Office:

                +

                620 Folsom St, Ste 100

                +

                The doors will open at 10AM and run until 6PM.

                +
                +

                Comments

                +
                +
                +
                + + Garen wrote on 2013-07-26 04:29: +
                +
                +

                s/2012/2013/;

                +
                +
                +
                +
                + + Anonymous wrote on 2013-07-30 11:39: +
                +
                +

                You think you might get more folks if you gave more than 24 hours notice?

                Just saying...

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/07/pypy3-21-beta-1-8647445024868663902.html b/posts/2013/07/pypy3-21-beta-1-8647445024868663902.html new file mode 100644 index 000000000..325ee41bd --- /dev/null +++ b/posts/2013/07/pypy3-21-beta-1-8647445024868663902.html @@ -0,0 +1,351 @@ + + + + + +PyPy3 2.1 beta 1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy3 2.1 beta 1

                + + + +
                +

                We're pleased to announce the first beta of the upcoming 2.1 release of
                +PyPy3. This is the first release of PyPy which targets Python 3 (3.2.3)
                +compatibility.

                +

                We would like to thank all of the people who donated to the py3k proposal
                +for supporting the work that went into this and future releases.

                +

                You can download the PyPy3 2.1 beta 1 release here:

                +
                https://pypy.org/download.html#pypy3-2-1-beta-1
                +
                +

                Highlights

                +
                  +
                • The first release of PyPy3: support for Python 3, targeting CPython 3.2.3!
                    +
                  • There are some known issues including performance regressions (issues
                    #1540 & #1541) slated to be resolved before the final release.
                  • +
                  +
                • +
                +
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for
                +CPython 2.7.3 or 3.2.3. It's fast due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows
                +32. Also this release supports ARM machines running Linux 32bit - anything with
                ARMv6 (like the Raspberry Pi) or ARMv7 (like Beagleboard,
                +Chromebook, Cubieboard, etc.) that supports VFPv3 should work.

                +

                Windows 64 work is still stalling and we would welcome a volunteer to handle
                +that.

                +
                +
                +

                How to use PyPy?

                +

                We suggest using PyPy from a virtualenv. Once you have a virtualenv
                +installed, you can follow instructions from pypy documentation on how
                +to proceed. This document also covers other installation schemes.

                +

                Cheers,
                +the PyPy team

                +
                +
                +

                Comments

                +
                +
                +
                + + Arne Babenhauserheide wrote on 2013-07-31 08:47: +
                +
                +

                This is *really* cool!

                Thank you for realizing pypy for python3! This should make it much easier to continue work on one of my projects (it was on hold, because pypy made it much faster, but I had to convert from python3 to python2 for running it, and that became a maintenance nightmare).

                +
                +
                +
                +
                + + Anonymous wrote on 2013-08-02 11:30: +
                +
                +

                So how does one build PyPy3? It doesn't seem to be documented anywhere.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-08-02 12:02: +
                +
                +

                Sorry never mind. I thought it was being developed in the same codebase, but now I realize there's a separate branch for PyPy3 that must be used to build the Python3 version.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/07/software-transactional-memory-lisp-7777576128992250197.html b/posts/2013/07/software-transactional-memory-lisp-7777576128992250197.html new file mode 100644 index 000000000..877a25c22 --- /dev/null +++ b/posts/2013/07/software-transactional-memory-lisp-7777576128992250197.html @@ -0,0 +1,443 @@ + + + + + +Software Transactional Memory lisp experiments | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Software Transactional Memory lisp experiments

                + + + +
                +
                +

                As covered in the previous blog post, the STM subproject of PyPy has been +back on the drawing board. The result of this experiment is an STM-aware +garbage collector written in C. This is finished by now, thanks to Armin's +and Remi's work, we have a fully functional garbage collector and a STM system +that can be used from any C program with enough effort. Using it is more than +a little mundane, since you have to insert write and read barriers by hand +everywhere in your code that reads or writes to garbage collector controlled +memory. In the PyPy integration, this manual work is done automatically +by the STM transformation in the interpreter.

                +

                However, to experiment some more, we created a minimal +lisp-like/scheme-like interpreter +(called Duhton), that follows closely CPython's implementation strategy. +For anyone familiar with CPython's source code, it should be pretty +readable. This interpreter works like a normal and very basic lisp variant, +however it comes with a transaction builtin, that lets you spawn transactions +using the STM system. We implemented a few demos that let you play with the +transaction system. All the demos are running without conflicts, which means +there are no conflicting writes to global memory and hence the demos are very +amenable to parallelization. They exercise:

                +
                  +
                • arithmetics - demo/many_sqare_roots.duh +
                • +
                • read-only access to globals - demo/trees.duh +
                • +
                • read-write access to local objects - demo/trees2.duh +
                • +
                +

                With the latter ones being very similar to the classic gcbench. STM-aware +Duhton can be found in the stmgc repo, while the STM-less Duhton, +that uses refcounting, can be found in the duhton repo under the base +branch.

                +

                Below are some benchmarks. Note that this is a little like comparing apples to +oranges, since the single-threaded duhton uses a refcounting GC vs a generational +GC for the STM version. Future pypy benchmarks will compare more apples to apples. +Moreover none of the benchmarks has any conflicts. Time is the total time +that the benchmark took (not the CPU time) and there was very little variation +in the consecutive runs (definitely below 5%).

                + +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                benchmark | 1 thread (refcount) | 1 thread (stm) | 2 threads | 4 threads
                square | 1.9s | 3.5s | 1.8s | 0.9s
                trees | 0.6s | 1.0s | 0.54s | 0.28s
                trees2 | 1.4s | 2.2s | 1.1s | 0.57s
                +

                As you can see, the slowdown for STM vs single thread is significant +(1.8x, 1.7x, 1.6x respectively), but still lower than 2x. However, running +on multiple threads parallelizes the problem almost perfectly.

                +

                While a significant milestone, we hope the next blog post will cover +STM-enabled pypy that's fully working with JIT work ongoing.

                +

                Cheers,
                +fijal on behalf of Remi Meier and Armin Rigo

                +

                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-07-12 13:06: +
                +
                +

                I hacked a bit; inserted likely hint on early exit on spinlock acquisition, Haswell xacquire/xrelease hints on spinlock acquisition and release, and compiled with Haswell optimized flags.

                Resulting scaling from 1 to 4 threads for tests were 1.92, 1.87 and 1.88. I think that's already quite close to 2.

                I think this is OK, but not extraordinary.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-07-12 13:12: +
                +
                +

                Just to clarify my above comment: those were average factors of scaling per doubling of threads. So, 4-thread version ran actually 3.67, 3.50 and 3.54 times faster than single-threaded version.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-07-12 13:15: +
                +
                +

                Cool that you hacked on it! Note however that spinlock acquisition is not a blocker in these examples --- we implement STM mostly without locks, and locks are acquired rarely. Running independent code without getting STM conflicts means that each thread will in practice only acquire its own lock. And a single global lock is used for major GC --- but there, the large amount of work done means that using the Haswell xacquire/xrelease hints is just counterproductive.

                "Resulting scaling from 1 to 4 threads" doesn't mean anything, as in some examples it scales perfectly, and in other examples it doesn't scale at all (as expected).

                +
                +
                +
                +
                + + Anonymous wrote on 2013-07-12 13:39: +
                +
                +

                All your arguments are valid, and I didn't really expect much from hinting, just decided to try. It would seem that Haswell is still inching towards higher multicore scalability - probably thanks to improved atomic and fence ops in general. It's a benefit for those workloads that should conceptually scale well...

                +
                +
                +
                +
                + + Glen Newton wrote on 2013-07-13 18:19: +
                +
                +

                You really need to go above 4 threads: 8,16,32, and 64 at least. Then plot out the overhead of the STM related to this level of threading. If your benchmark is too small, alter it so that it makes sense to try and solve it with 64 threads.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-07-14 06:31: +
                +
                +

                @glen: we're focusing right now on the machines we have, which are standard Intels with 4, 8, or at most 12 cores. I believe it is interesting too, and it's what people have right now in their own desktop or laptop computers. Obviously the scalability to larger numbers of cores is important as well, but we can't simply disregard any result involving less than 64 cores.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-07-17 17:20: +
                +
                +

                This is a really great news.

                Wish you all the best with further work!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/08/numpy-road-forward-4210065750776753500.html b/posts/2013/08/numpy-road-forward-4210065750776753500.html new file mode 100644 index 000000000..d745a9520 --- /dev/null +++ b/posts/2013/08/numpy-road-forward-4210065750776753500.html @@ -0,0 +1,373 @@ + + + + + +NumPy road forward | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy road forward

                + + + +
                +
                +

                Hello everyone.

                +

                This is the roadmap for the numpy effort in PyPy as discussed at the London sprint. +First, the highest on our priority list is to finish the low-level part +of the numpy module. What +we'll do is to finish the RPython part of numpy and provide a pip installable +numpypy repository that includes the pure python part of Numpy. This would +contain the original Numpy with a few minor changes.

                +

                Second, we need to work on the JIT support that will make NumPy on PyPy +faster. In detail:

                +
                  +
                • reenable the lazy loop evaluation
                • +
                • optimize bridges, which is depending on optimizer refactorings
                • +
                • SSE support
                • +
                +

                On the compatibility front, there were some independent attempts at +making the following stuff work:

                +
                  +
                • f2py
                • +
                • C API (in fact, PyArray_* API is partly present in the nightly builds of +PyPy)
                • +
                • matplotlib (both using PyArray_* API and embedding CPython runtime in PyPy)
                • +
                • scipy
                • +
                +

                In order to make all of the above happen faster, it would be helpful to raise +more funds. You can donate to PyPy's NumPy project on our website. Note +that PyPy is a member of SFC which is a 501(c)(3) US non-profit, so donations +from US companies can be tax-deducted.

                +

                Cheers,
                +fijal, arigo, ronan, rguillebert, anto and others

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Pim wrote on 2013-08-27 16:41: +
                +
                +

                Thanks for the update. I'm hoping the other presentations can also be summarized here for those who couldn't attend this (very interesting) mini-conference.

                +
                +
                +
                +
                + + Dan wrote on 2013-08-28 20:11: +
                +
                +

                Thanks for the info! I can't wait to play with it.
                I only have a very rudimentary understanding of numpypy and pypy, so please forgive if this is a stupid question:

                Will there be a way to do additional high level optimization steps before the JIT level?

                I.e. elimination of temporaries for matrices, expression optimization and so on?

                Basically check if the expression should be handled by the pypy JIT, or if it should be passed on to something like numexpr
                https://code.google.com/p/numexpr/
                that will itself hand over the code to optimized vendor libraries?

                I am a bit concerned that while the pypy JIT optimizations are without question very impressive and probably close to optimal to what can be done for generic code, the performance issues with numerical code are very different.

                Any JIT will (please correct me if I am wrong, this would be a significant breakthrough) never be able to even come close to what a vendor library like the MKL can do.

                The comparison will be even more to the disadvantage of the JIT if one uses a library like Theano that runs the code on the GPU.

                For my work, beating c for speed is not enough anymore, the challenges are how to run the computation in parallel, how to call optimized libraries without pain and how to use a GPU without re-writing the entire program and learning about a completely new system.

                Will libraries like numexpr, numba and theano be able to run under pypy, and will it eventually be possible to automatically hand over numerical expressions automatically to these libraries?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-09-11 14:26: +
                +
                +

                Hi Dan.

                Yes, pypy will do the removal of temporary matrices, this is a very basic optimization that we had, but disabled for a while to simplify development.

                I don't think numba, numexpr or theano would ever work on PyPy (I would ask their authors though), but I personally think we can match their performance or even exceed it, time will tell though.

                Cheers,
                fijal

                +
                +
                +
                +
                + + Anonymous wrote on 2013-09-14 00:40: +
                +
                +

                Hi Maciej,

                Thanks for the answer.

                A pypy that matches what numba or theano can do, all without doing any extra annotation, would not only be a huge breakthrough for pypy, it will be a gigantic step forward for the entire numerics community.

                Thank you and keep up the good work,


                Dan

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-09-14 13:31: +
                +
                +

                @Anonymous: I'd like to point out again that all this NumPy work would get more traction and faster development within PyPy if we could manage to interest (and get contributions from) anyone that comes from the scientific community. Ourselves, we are looking at this topic as a smallish part of the whole Python world, so we disagree (to a point) with your comment "a huge breakthrough for pypy". :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/08/numpypy-status-update-3401163348519734658.html b/posts/2013/08/numpypy-status-update-3401163348519734658.html new file mode 100644 index 000000000..375f95546 --- /dev/null +++ b/posts/2013/08/numpypy-status-update-3401163348519734658.html @@ -0,0 +1,326 @@ + + + + + +NumPyPy Status Update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPyPy Status Update

                + + + +
                +

                Hello everyone

                +As expected, nditer is a lot of work. I'm going to pause my work on it for now and focus on simpler and more important things. Here is a list of what I implemented:

                +
                  +
                • Fixed a bug on 32 bit that made int32(123).dtype == dtype("int32") fail
                • +
                • Fixed a bug on the pickling of array slices
                • +
                • The external loop flag is implemented on the nditer class
                • +
                • The c_index, f_index and multi_index flags are also implemented
                • +
                • Add dtype("double") and dtype("str")
                • +
                • C-style iteration is available for nditer
                • +
                +Cheers
                +Romain Guillebert +
                +

                Comments

                +
                +
                +
                + + René Dudfield wrote on 2013-08-09 10:17: +
                +
                +

                Nice work :)

                +
                +
                +
                +
                + + Arne Babenhauserheide wrote on 2013-08-12 09:38: +
                +
                +

                thanks for the update!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/08/preliminary-london-demo-evening-agenda-5254002451136674320.html b/posts/2013/08/preliminary-london-demo-evening-agenda-5254002451136674320.html new file mode 100644 index 000000000..cfda2d137 --- /dev/null +++ b/posts/2013/08/preliminary-london-demo-evening-agenda-5254002451136674320.html @@ -0,0 +1,334 @@ + + + + + +Preliminary London Demo Evening Agenda | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Preliminary London Demo Evening Agenda

                + + + +
                +

                We now have a preliminary agenda for the demo evening in London next week. It takes place on Tuesday, August 27 2013, 18:30-19:30 (BST) at King's College London, Strand. The preliminary agenda is as follows:

                + + +

                All the talks are lightning talks. Afterwards there will be plenty of time for discussion.

                + +

                There are still free spots; if you want to come, please register on the Eventbrite page. Hope to see you there!

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-08-20 13:35: +
                +
                +

                Will the video of the talks be available online?

                +
                +
                +
                +
                + + Carl Friedrich Bolz-Tereick wrote on 2013-08-20 13:36: +
                +
                +

                @Anonymous: unfortunately there are no plans to film the event, no :-(

                +
                +
                +
                +
                + + Paddy3118 wrote on 2013-08-21 15:17: +
                +
                +

                Another request for videos of the event to be made available. Please.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/08/pypy-21-considered-armful-7177475722033479233.html b/posts/2013/08/pypy-21-considered-armful-7177475722033479233.html new file mode 100644 index 000000000..ac081dc57 --- /dev/null +++ b/posts/2013/08/pypy-21-considered-armful-7177475722033479233.html @@ -0,0 +1,422 @@ + + + + + +PyPy 2.1 - Considered ARMful | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.1 - Considered ARMful

                + + + +
                +

                We're pleased to announce PyPy 2.1, which targets version 2.7.3 of the Python
                +language. This is the first release with official support for ARM processors in the JIT.
                +This release also contains several bugfixes and performance improvements.

                +

                You can download the PyPy 2.1 release here:

                +
                https://pypy.org/download.html
                +

                We would like to thank the Raspberry Pi Foundation for supporting the work
                +to finish PyPy's ARM support.

                +

                The first beta of PyPy3 2.1, targeting version 3 of the Python language, was
                +just released, more details can be found here.

                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.1 and cpython 2.7.2 performance comparison) +due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64 or Windows +32. This release also supports ARM machines running Linux 32bit - anything with +ARMv6 (like the Raspberry Pi) or ARMv7 (like the Beagleboard, +Chromebook, Cubieboard, etc.) that supports VFPv3 should work. Both +hard-float armhf/gnueabihf and soft-float armel/gnueabi builds are +provided. The armhf builds for Raspbian are created using the Raspberry Pi +custom cross-compilation toolchain +based on gcc-arm-linux-gnueabihf and should work on ARMv6 and +ARMv7 devices running Debian or Raspbian. The armel builds are built +using the gcc-arm-linux-gnueabi toolchain provided by Ubuntu and +currently target ARMv7.

                +

                Windows 64 work is still stalling, we would welcome a volunteer +to handle that.

                +
                +
                +

                Highlights

                +
                  +
                • JIT support for ARM, architecture versions 6 and 7, hard- and soft-float ABI
                • +
                • Stacklet support for ARM
                • +
                • Support for os.statvfs and os.fstatvfs on unix systems
                • +
                • Improved logging performance
                • +
                • Faster sets for objects
                • +
                • Interpreter improvements
                • +
                • During packaging, compile the CFFI based TK extension
                • +
                • Pickling of numpy arrays and dtypes
                • +
                • Subarrays for numpy
                • +
                • Bugfixes to numpy
                • +
                • Bugfixes to cffi and ctypes
                • +
                • Bugfixes to the x86 stacklet support
                • +
                • Fixed issue 1533: fix an RPython-level OverflowError for space.float_w(w_big_long_number).
                • +
                • Fixed issue 1552: GreenletExit should inherit from BaseException.
                • +
                • Fixed issue 1537: numpypy __array_interface__
                • +
                • Fixed issue 1238: Writing to an SSL socket in PyPy sometimes failed with a "bad write retry" message.
                • +
                +

                Cheers,

                +

                David Schneider for the PyPy team.

                +
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-08-02 03:42: +
                +
                +

                What about gevent support in this release? i am waiting for full support to switch to pypy on production

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-08-02 08:02: +
                +
                +

                Some issues with gevent were fixed. You need to try it out and report any remaining issues, if any.

                +
                +
                +
                +
                + + Unknown wrote on 2013-08-02 08:43: +
                +
                +

                If i read well, you did not use any ThumbEE instructions for your Arm support ? So there is room for improvement ?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-08-02 09:44: +
                +
                +

                ThumbEE is deprecated nowadays.

                +
                +
                +
                +
                + + Unknown wrote on 2013-08-07 15:12: +
                +
                +

                Has cdecimal been backported into either version of PyPy yet? If not, any near-term plan to do so?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-08-08 08:07: +
                +
                +

                cdecimal is purely a speed gain. On PyPy the pure Python decimal.py is accelerated by the JIT, though it is probably possible to gain some small extra factor by rewriting it directly in RPython.

                If your problem is merely that project X has listed cdecimal in its dependencies, then we could add a "cdecimal.egg-info" file that says "yup, it's installed" and be done (assuming that the API is really the same one as decimal.py).

                +
                +
                +
                +
                + + Amaury Forgeot d'Arc wrote on 2013-08-08 23:18: +
                +
                +

                cdecimal is actually based on a C library (libmpdec). Maybe a ffi-based binding could give interesting results.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-08-16 12:00: +
                +
                +

                Importing sqlite3 incurs a huge delay in the latest armhf jit nightly (15 August).

                +
                +
                +
                +
                + + Anonymous wrote on 2013-08-26 12:55: +
                +
                +

                Will PyPy PPA be updated? https://launchpad.net/~pypy/+archive/ppa

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/08/slides-of-pypy-london-demo-evening-5157052112396009739.html b/posts/2013/08/slides-of-pypy-london-demo-evening-5157052112396009739.html new file mode 100644 index 000000000..443948e4b --- /dev/null +++ b/posts/2013/08/slides-of-pypy-london-demo-evening-5157052112396009739.html @@ -0,0 +1,325 @@ + + + + + +Slides of the PyPy London Demo Evening | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Slides of the PyPy London Demo Evening

                + + + +
                +

                The slides of the London demo evening are now online:

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-08-30 20:41: +
                +
                +

                Is there a better link to the slides? Watching them on the blog is difficult

                +
                +
                +
                +
                + + Anonymous wrote on 2013-08-30 22:28: +
                +
                +

                Clicking the full screen button makes them easy to read for me. Maybe try that?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-09-03 16:53: +
                +
                +

                Could there perhaps be videos from the presentation?

                big up for good work!

                +
                +
                +
                +
                + + Unknown wrote on 2013-09-10 14:22: +
                +
                +

                i know, such questions probably get on your nerves, but do you think you will ever reach a 10x average on speed.pypy.org? :)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/08/update-on-stm-8705514488940872802.html b/posts/2013/08/update-on-stm-8705514488940872802.html new file mode 100644 index 000000000..79fc9b8f3 --- /dev/null +++ b/posts/2013/08/update-on-stm-8705514488940872802.html @@ -0,0 +1,437 @@ + + + + + +Update on STM | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Update on STM

                + + + +
                +

                Hi all,

                + +

                A quick update on Software Transactional Memory. We are +working on two fronts.

                + +

                On the one hand, the integration of the "c4" C library with PyPy is done +and works well, but is still subject to improvements. The "PyPy-STM" +executable (without the JIT) +seems to be stable, as far as it has been tested. It runs a simple +benchmark like Richards with a 3.2x slow-down over a regular JIT-less +PyPy.

                + +

                The main factor of this slow-down: the numerous "barriers" in +the code --- checks that are needed a bit everywhere to verify that a +pointer to an object points to a recent enough version, and if not, to +go to the most recent version. These barriers are inserted automatically +during the translation; there is no need for us to manually put 42 million +barriers in the source code of PyPy. But this automatic insertion uses a +primitive algorithm right now, which usually ends up putting more barriers than the +theoretical optimum. I (Armin) am trying to improve that --- and progressing: +last week the slow-down was around 4.5x. This is done in the branch +stmgc-static-barrier.

                + +

                On the other hand, Remi is progressing on the JIT integration in +the branch stmgc-c4. +This has been working in simple cases for a couple of weeks now, but the +resulting "PyPy-JIT-STM" often crashes. This is because while the +basics are not really hard, we keep hitting new issues that must be +resolved.

                + +

                The basics are that whenever the JIT is about to generate +assembler corresponding to a load or a store in a GC object, it must +first generate a bit of extra assembler that corresponds to the barrier +that we need. This works fine by now (but could benefit from the same +kind of optimizations described above, to reduce the number of barriers). +The additional issues are all more subtle. I will describe the current +one as an example: it is how to write constant pointers inside the assembler.

                + +

                Remember that the STM library classifies objects as either +"public" or "protected/private". A "protected/private" object +is one which has not been seen by another thread so far. +This is essential as an optimization, because we know that no +other thread will access our protected or private objects in parallel, +and thus we are free to modify their content in place. By contrast, +public objects are frozen, and to do any change, we first need to +build a different (protected) copy of the object. See this +blog +post for more details.

                + +

                So far so good, but the JIT will sometimes (actually often) hard-code +constant pointers into the assembler it produces. For example, this is the +case when the Python code being JITted creates an instance of a known class; +the corresponding assembler produced by the JIT will reserve the memory for +the instance and then write the constant type pointer in it. This type +pointer is a GC object (in the simple model, it's the Python class object; +in PyPy it's actually the "map" object, which is +a different story).

                + +

                The problem right now is that this constant pointer may point to a +protected object. This is a problem because the same piece of assembler +can later be executed by a different thread. If it does, then this +different thread will create instances whose type pointer is bogus: looking +like a protected object, but actually protected by a different thread. +Any attempt to use this type pointer to change anything on the class +itself will likely crash: the threads will all think they can safely change it +in-place. To fix this, we need to make sure we only write pointers to +public objects in the assembler. This is a bit involved because we need +to ensure that there is a public version of the object to start with.

                + +

                When this is done, we will likely hit the next problem, and the next one; +but at some point it should converge (hopefully!) and we'll give you our first +PyPy-JIT-STM ready to try. Stay tuned :-)

                + +

                A bientôt,

                + +

                Armin.

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-08-19 11:06: +
                +
                +

                *assembly

                +
                +
                +
                +
                + + Unknown wrote on 2013-08-20 21:31: +
                +
                +

                Thanks for the update; glad it's coming together! I'm really looking forward to seeing how it stacks up once the JIT work is complete.

                Do you think that it'll be possible to ever get better than a 2x slowdown for serial operations? Or is that the minimal possible? Naively, it makes sense that it'll never be as fast, but if 1.5x or lower were possible, that would be very exciting.

                Also, is the end goal that you would have a module you import to "turn on" STM? Or would it always be a separate build of pypy, just like JIT/JIT-less?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-08-21 09:05: +
                +
                +

                @Christopher: the slow-down we'll get is still unknown, but I fear it won't really go well under 2x.

                I see it mainly as a separate build: either you want to run all these barrier instructions everywhere (which gives the slow-down) or not. It could be possible in theory to have a version that has the barriers everywhere, but creates JIT-generated assembler that doesn't, and thus runs almost as fast as a regular PyPy as long as you don't "turn on" STM. We will see if that makes sense.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-08-21 09:12: +
                +
                +

                @Anonymous: ah, thanks :-) I think I now learned the difference between "assembler" and "assembly" in English, which was never quite clear to me. Note that in French the same word ("assembleur") is used to mean both terms.

                +
                +
                +
                +
                + + Unknown wrote on 2013-08-22 17:14: +
                +
                +

                @Armin: Ah, I see. Well, from a user's perspective, what I most write in python these days is either GUI applications (for which I've never been able to use pypy due to lack of bindings, but that's another issue entirely), or for small services, for which pypy has provided a rather nice speed improvement.

                In a perfect world, I'd be able to use pypy for both of these tasks, not using STM for my GUI applications, but turning it on for the services I write (well, once they reach a certain point where I'd gain something from concurrency).

                I suspect having a separate build would make such a use-case awkward.

                Also, my interest is a bit self-motivated; at work we currently use node.js for a lot of our services. Pypy compares decently for a lot of our tasks, but it is not 'clearly better'. Once STM is stable, however, several of our services that we've struggled scaling to multiple cores on node.js could be rewritten in pypy STM, and should scale much easier. (Manual process management is painful!)

                Again, if pypy STM were a separate build, we'd have to manage having both installed in the case where we have servers running services that need concurrency, or ones that work well enough with a very fast async implementation. Not impossible, just a bit awkward. :)

                Either way, I'm pretty excited!

                +
                +
                +
                +
                + + Unknown wrote on 2013-10-16 15:22: +
                +
                +

                Are there any plans or experiments going on related to Hardware Transactional Memory?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-10-16 15:55: +
                +
                +

                @Ignacio Hernandez: for HTM, our position is still as described last year in: https://morepypy.blogspot.com/2012/08/multicore-programming-in-pypy-and.html

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/09/numpy-status-update-5160363918470470887.html b/posts/2013/09/numpy-status-update-5160363918470470887.html new file mode 100644 index 000000000..f595c2681 --- /dev/null +++ b/posts/2013/09/numpy-status-update-5160363918470470887.html @@ -0,0 +1,303 @@ + + + + + +Numpy Status Update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Numpy Status Update

                + + + +
                +

                Hi everyone

                +Thanks to the people who donated money to the numpy proposal, here is what I've been working on recently:

                +- Fixed conversion from a numpy complex number to a python complex number
                +- Implement the rint ufunc
                +- Make numpy.character usable as a dtype
                +- Fix ndarray(dtype=str).fill()
                +- Various fixes on boolean and fancy indexing

                +Cheers
                +Romain

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/09/pycon-south-africa-sprint-6630788654105016762.html b/posts/2013/09/pycon-south-africa-sprint-6630788654105016762.html new file mode 100644 index 000000000..ce4ac4c53 --- /dev/null +++ b/posts/2013/09/pycon-south-africa-sprint-6630788654105016762.html @@ -0,0 +1,343 @@ + + + + + +PyCon South Africa & sprint | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyCon South Africa & sprint

                + + + +
                +

                Hi all,

                + +

                For those of you that happen to be from South Africa: don't miss +PyCon ZA 2013, next October 3rd and 4th! +Like last year, a few of us will be there. There will be the first talk +about STM getting ready (a +blog post about that should follow soon).

                + +

                Moreover, general sprints will continue on the weekend (5th and 6th). +Afterwards, Fijal will host a longer PyPy sprint (marathon?) with me +until around the 21st. You are welcome to it as well! Write to the mailing list or to fijal directly (fijall +at gmail.com), or simply in the comments of this post.

                + +

                --- Armin

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-10-09 21:49: +
                +
                +

                Hey lads, any chance of a 64-bit ARM PyPy build?

                now that hardware is finally generally available...

                I'm sure someone at the conference has the hw, perhaps already rooted?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-10-09 21:52: +
                +
                +

                we don't have access to 64bit ARM. feel free to help us. also it's quite a bit of work

                +
                +
                +
                +
                + + Nickolas wrote on 2013-10-10 06:17: +
                +
                +

                Hey Armin

                Thanks for the awesome presentations :-)

                I'm very excited to try it out soon. I was wondering, would it not be useful to try and get the "with atomic" statement at the very least working on regular CPython? (just operating on the GIL, or simulated with a lock). This could smooth over migration somewhat?

                Also, thanks for your live demo of cffi, It is so much simpler than ctypes :-)

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-10-10 07:06: +
                +
                +

                Hi Nickolas.

                with atomic can be trivially available on CPython (and not do anything beyond have a lock)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html b/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html new file mode 100644 index 000000000..d1281fa18 --- /dev/null +++ b/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html @@ -0,0 +1,727 @@ + + + + + +Incremental Garbage Collector in PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Incremental Garbage Collector in PyPy

                + + + +
                +
                +
                + +

                Hello everyone.

                +

                We're pleased to announce that as of today, +the default PyPy comes with a GC that has much smaller pauses than yesterday.

                +

                Let's start with explaining roughly what GC pauses are. In CPython each +object has a reference count, which is incremented each time we create +references and decremented each time we forget them. This means that objects +are freed each time they become unreachable. That is only half of the story +though. First note that when the last reference to a large tree of +objects goes away, you have a pause: all the objects are freed. Your +program is not progressing at all during this pause, and this pause's +duration can be arbitrarily large. This occurs at deterministic times, +though. But consider code like this:

                +
                +class A(object):
                +     pass
                +
                +a = A()
                +b = A()
                +a.item = b
                +b.item = a
                +del a
                +del b
                +
                +

                This creates a reference cycle. It means that while we deleted references to +a and b from the current scope, they still have a reference count of 1, +because they point to each other, even though the whole group has no references +from the outside. CPython employs a cyclic garbage collector which is used to +find such cycles. It walks over all objects in memory, starting from some known +roots, such as type objects, variables on the stack, etc. This solves the +problem, but can create noticeable, nondeterministic GC pauses as the heap +becomes large and convoluted.

                +

                PyPy essentially has only the cycle finder - it does not bother with reference +counting, instead it walks alive objects every now and then (this is a big +simplification, PyPy's GC is much more complex than this). Although this might +sound like a missing feature, it is really one of the reasons why PyPy is so +fast, because at the end of the day the total time spent in managing the +memory is lower in PyPy than CPython. However, as a result, PyPy also has the +problem of GC pauses.

                +

                To alleviate this problem, which is essential for +applications like games, we started to work on incremental GC, which spreads +the walking of objects and cleaning them across the execution time in smaller +intervals. The work was sponsored by the Raspberry Pi foundation, started +by Andrew Chambers and finished by Armin Rigo and Maciej Fijałkowski.

                +
                +
                +

                Benchmarks

                +

                Everyone loves benchmarks. We did not measure any significant speed difference +on our quite extensive benchmark suite on speed.pypy.org. The main +benchmark that we used for other comparisons was translating the topaz +ruby interpreter using various versions of PyPy and CPython. The exact +command was python <pypy-checkout>/bin/rpython -O2 --rtype targettopaz.py. +Versions:

                +
                  +
                • topaz - dce3eef7b1910fc5600a4cd0afd6220543104823
                • +
                • pypy source - defb5119e3c6
                • +
                • pypy compiled with minimark (non-incremental GC) - d1a0c07b6586
                • +
                • pypy compiled with incminimark (new, incremental GC) - 417a7117f8d7
                • +
                • CPython - 2.7.3
                • +
                +

                The memory usage of CPython, PyPy with minimark and PyPy with incminimark is +shown here. Note that this benchmark is quite bad for PyPy in general, the +memory usage is higher and the amount of time taken is longer. This is due +to the JIT warmup being both memory hungry and inefficient (see below). +But first, the new GC is not worse than the old one.

                + +
                +

                EDIT: Red line is CPython, blue is incminimark (new), green is minimark (old)

                + +

                The image was obtained by graphing the output of memusage.py.

                +

                However, the GC pauses are significantly smaller. For PyPy the way to +get GC pauses is to measure time between start and stop while running stuff +with PYPYLOG=gc-collect:log pypy program.py, for CPython, the magic +incantation is gc.set_debug(gc.DEBUG_STATS) and parsing the output. +For what it's worth, the average and total for CPython, as well as the total +number of events are not directly comparable since it only shows the cyclic +collector, not the reference counts. The only comparable thing is the +amount of long pauses and their duration. In the table below, pause duration +is sorted into 8 buckets, each meaning "below that or equal to the threshold". +The output is generated using the gcanalyze tool.

                +

                CPython:

                + ++++++++++ + + + + + + + + + + + + + + + + + + + + + + +
                150.1ms | 300.2ms | 450.3ms | 600.5ms | 750.6ms | 900.7ms | 1050.8ms | 1200.9ms
                5417 | 5 | 3 | 2 | 1 | 1 | 0 | 1
                +

                PyPy minimark (non-incremental GC):

                + ++++++++++ + + + + + + + + + + + + + + + + + + + + + + +
                216.4ms | 432.8ms | 649.2ms | 865.6ms | 1082.0ms | 1298.4ms | 1514.8ms | 1731.2ms
                27 | 14 | 6 | 4 | 6 | 5 | 3 | 3
                +

                PyPy incminimark (new incremental GC):

                + ++++++++++ + + + + + + + + + + + + + + + + + + + + + + +
                15.7ms | 31.4ms | 47.1ms | 62.8ms | 78.6ms | 94.3ms | 110.0ms | 125.7ms
                25512 | 122 | 4 | 1 | 0 | 0 | 0 | 2
                +

                As we can see, while there is still work to be done (the 100ms ones could +be split among several steps), we did improve the situation quite drastically +without any actual performance difference.

                +

                Note about the benchmark - we know it's a pretty extreme case of JIT +warmup, we know we suck on it, we're working on it and we're not afraid of +showing PyPy is not always the best ;-)

                +
                +
                +

                Nitty gritty details

                +

                Here are some nitty gritty details for people really interested in +Garbage Collection. This was done as a patch to "minimark", our current +GC, and called "incminimark" for now. The former is a generational +stop-the-world GC. New objects are allocated "young", which means that +they initially live in the "nursery", a special zone of a few MB of +memory. When the nursery is full, a "minor collection" step moves the +surviving objects out of the nursery. This can be done quickly (a few +milliseconds) because we only need to walk through the young objects that +survive --- usually a small fraction of all young objects; and also by +far not all objects that are alive at this point, but only the young +ones. However, from time to time this minor collection is followed by a +"major collection": in that step, we really need to walk all objects to +classify which ones are still alive and which ones are now dead +("marking") and free the memory occupied by the dead ones ("sweeping"). +You can read more details here.

                +

                This "major collection" is what gives the long GC pauses. To fix this +problem we made the GC incremental: instead of running one complete +major collection, we split its work into a variable number of pieces and +run each piece after every minor collection for a while, until there are +no more pieces. The pieces are each doing a fraction of marking, or a +fraction of sweeping. It adds a few milliseconds after each of these +minor collections, rather than requiring hundreds of milliseconds in one +go.

                +

                The main issue is that splitting the major collections means that the +main program is actually running between the pieces, and so it can +change the pointers in the objects to point to other objects. This is +not a problem for sweeping: dead objects will remain dead whatever the +main program does. However, it is a problem for marking. Let us see +why.

                +

                In terms of the incremental GC literature, objects are either "white", +"gray" or "black". This is called tri-color marking. See for example +this blog post about Rubinius, or this page about LuaJIT or the Wikipedia description. The +objects start as "white" at the beginning of marking; become "gray" when +they are found to be alive; and become "black" when they have been fully +traversed. Marking proceeds by scanning gray objects for pointers to +white objects. The white objects found are turned gray, and the gray +objects scanned are turned black. When there are no more gray objects, +the marking phase is complete: all remaining white objects are truly +unreachable and can be freed (by the following sweeping phase).

                +

                In this model, the important part is that a black object can never point +to a white object: if the latter remains white until the end, it will be +freed, which is incorrect because the black object itself can still be +reached. How do we ensure that the main program, running in the middle +of marking, will not try to write a pointer to a white object into a black +object? This requires a "write barrier", i.e. a piece of code that runs +every time we set a pointer into an object or array. This piece of code +checks if some (hopefully rare) condition is met, and calls a function +if that is the case.

                +

                The trick we used in PyPy is to consider minor collections as part of +the whole, rather than focus only on major collections. The existing +minimark GC had always used a write barrier of its own to do its job, +like any generational GC. This existing write barrier is used to detect +when an old object (outside the nursery) is modified to point to a young +object (inside the nursery), which is essential information for minor +collections. Actually, although this was the goal, the actual write +barrier code is simpler: it just records all old objects into which we +write any pointer --- to a young or old object. As we found out over +time, doing so is not actually slower, and might actually be a +performance improvement: for example, if the main program does a lot of +writes into the same old object, we don't need to check over and over +again if the written pointer points to a young object or not. We just +record the old object in some list the first time, and that's it.

                +

                The trick is that this unmodified write barrier works for incminimark +too. Imagine that we are in the middle of the marking phase, running +the main program. The write barrier will record all old objects that +are being modified. Then at the next minor collection, all surviving +young objects will be moved out of the nursery. At this point, as we're +about to continue running the major collection's marking phase, we +simply add to the list of pending gray objects all the objects that we +just considered --- both the objects listed as "old objects that are +being modified", and the objects that we just moved out of the nursery. +A fraction of the objects from the former list were black objects; so this means that +they are turned back from the black to the gray color. This technique +implements nicely, if indirectly, what is called a "backward write +barrier" in the literature. The backwardness is about the color that +needs to be changed in the opposite of the usual direction "white -> +gray -> black", thus making more work for the GC. (This is as opposed +to a "forward write barrier", where we would also detect "black -> white" +writes but turn the white object gray.)

                +

                In summary, I realize that this description is less about how we turned +minimark into incminimark, and more about how we differ from the +standard way of making a GC incremental. What we really had to do to +make incminimark was to write logic that says "if the major collection +is in the middle of the marking phase, then add this object to the list +of gray objects", and put it at a few places throughout minor +collection. Then we simply split a major collection into increments, +doing marking or sweeping of some (relatively arbitrary) number of +objects before returning. That's why, after we found that the existing +write barrier would do, it was not much actual work, and could be done +without major changes. For example, not a single line from the JIT +needed adaptation. All in all it was relatively painless work. ;-) +

                +

                Cheers,
                armin and fijal

                +
                +
                +
                +
                +

                Comments

                +
                +
                +
                + + H* wrote on 2013-10-15 14:24: +
                +
                +

                Nice work! :)

                +
                +
                +
                +
                + + Unknown wrote on 2013-10-15 19:10: +
                +
                +

                Thank you for this nice explanation.

                Which mechanism do you use for not adding twice an old object in the list of modified old objects?

                +
                +
                +
                +
                + + René Dudfield wrote on 2013-10-15 21:56: +
                +
                +

                Thank you! thank you! thank you! Game dev on pypy just leveled up!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-10-15 22:08: +
                +
                +

                Very clever! But eh, your graphs show that your program is using 2-3x the memory of CPython. How much faster is your program overall in exchange for this hugely larger memory usage?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-10-16 07:00: +
                +
                +

                @François: a flag on the object. All old objects have this flag initially, and we use it to detect if the write barrier must trigger. We remove it when the write barrier has triggered once. We re-add it during the following minor collection.

                @Anonymous: this program is slower on PyPy too. The point of the benchmark is to show that incminimark gives the same results as minimark, and to show that the JIT has bad cases. Running the same program for a much longer time (5-10x) lets PyPy slowly catch up and eventually beat CPython by a factor 2. The memory usage is evening out at around 4 or 4.5GB (and I'd expect even larger examples to show lower consumption on PyPy, but that's mostly a guess).

                +
                +
                +
                +
                + + Anonymous wrote on 2013-10-16 10:03: +
                +
                +

                Thanks for moving Python forward!

                How does the incminimarc compares to Azul C4 JVM GC and Hotspots G1 GC?
                In other words are there strong guarantees that for big heap sizes e.g. 12 GB the GC pauses will not exceed some value e.g. 100ms?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-10-16 10:21: +
                +
                +

                Sounds like great progress, but I hope you understand that even 15-30ms is way too much for games. That's 1-2 frames. It needs to be an order of magnitude less to ensure smooth FPS.

                Do you have plans to give the program any say in whether the GC should strive for low latency vs. high throughput?

                +
                +
                +
                +
                + + vdp wrote on 2013-10-16 11:13: +
                +
                +

                Great writeup, explaining that kind of concept in a clear way is not easy. And well done on the unequivocal improvements :)

                @annonymous Yes you'll still miss some frames, but compared to a 1 second pause, pypy suddenly became usable for games. 55fps (over what duration did those 25K collections happen ?) is not perfect, but most users won't notice. That said, it *would* be nice to be able to tune latency vs throughput.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-10-16 12:36: +
                +
                +

                @Anonymous: our incminimark comes with no serious strong guarantee. I still think it's enough for most games, say, if "almost all" the pauses are around 10ms. It's also tweakable (see the PYPY_GC_* environment variables documented in rpython/memory/gc/incminimark.py, and try to call something like gc.collect(1) at the end of each frame).

                Anyway, at around the same time scale is the time spent JITting, which also causes apparent pauses in the program. I think that fixing it all with really strong guarantees is a much, much harder problem. CPython doesn't give any guarantee either, as explained at the start of the blog post.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-10-16 15:02: +
                +
                +

                @Armin Rigo: Yeah, it's no use pushing GC pauses much lower than other pauses, but that just means other things need improving as well. ;) If I had to draw an arbitrary line, I'd say half a frame (i.e. 8ms for 60fps, 4ms for 120fps 3D) is probably a good target for the maximum.

                The thing with CPython is that you can turn the GC off and still have everything non-cyclic collected. So with enough attention to detail you can avoid GC pauses completely.

                BTW, is any work ongoing with regard to fully concurrent GC?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-10-16 15:14: +
                +
                +

                You *cannot* avoid GC pauses in CPython: see the first paragraph of the blog post. You can only make the GC pauses deterministic, by disabling the cyclic collector. Then you can hack the program as needed to reduce GC pauses if there are some.

                +
                +
                +
                +
                + + Unknown wrote on 2013-10-16 22:14: +
                +
                +

                Thanks for this really educational and accessible explanation - it's rare to find such a concise and clear piece of writing that a non expert can understand on the subject of GC.

                +
                +
                +
                +
                + + Unknown wrote on 2013-10-21 06:17: +
                +
                +

                This needs visualization of processes to win Wikipedia article of the month.

                +
                +
                +
                +
                + + Michael Hudson-Doyle wrote on 2013-10-22 02:29: +
                +
                +

                You can also get arbitrarily long "gc" pauses in CPython by removing the last reference to some deeply nested data structure...

                +
                +
                +
                +
                + + Dima Q wrote on 2013-11-10 13:07: +
                +
                +

                Wow PyPy keeps paying off!
                I am so glad you guys have time (and hopefully funding) push dynamic language world forward!

                +
                +
                +
                +
                + + Franck wrote on 2013-11-21 12:19: +
                +
                +

                If you fork() the whole process and do the marking on the frozen forked copy (on write) then you can be fully incremental without pauses, as long as you've got enough spare system memory compared to process size (as the main process keeps growing while you're marking and the pathological case of copy on write is 2x, however unlikely).

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/10/making-coveragepy-faster-under-pypy-935409618297062344.html b/posts/2013/10/making-coveragepy-faster-under-pypy-935409618297062344.html new file mode 100644 index 000000000..66692864d --- /dev/null +++ b/posts/2013/10/making-coveragepy-faster-under-pypy-935409618297062344.html @@ -0,0 +1,443 @@ + + + + + +Making coverage.py faster under PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Making coverage.py faster under PyPy

                + + + +
                +

                If you've ever tried to run your programs with coverage.py under PyPy,
                +you've probably experienced some incredible slowness. Take this simple
                +program:

                +
                def f():
                +    return 1
                +
                +
                +def main():
                +    i = 10000000
                +    while i:
                +        i -= f()
                +
                +main()
                +
                +

                Running time coverage.py run test.py five times, and looking at the best
                +run, here's how PyPy 2.1 stacks up against CPython 2.7.5:

                + +++++ + + + + + + + + + + + + + + + + + +
                PythonTimeNormalized to CPython
                CPython 2.7.53.879s1.0x
                PyPy 2.153.330s13.7x slower
                +

                Totally ridiculous. I got turned onto this problem because on one of my
                +projects CPython takes about 1.5 minutes to run our test suite on the build
                +bot, but PyPy takes 8-10 minutes.

                +

                So I sat down to address it. And the results:

                + +++++ + + + + + + + + + + + + + + + + + + + + + + +
                PythonTimeNormalized to CPython
                CPython 2.7.53.879s1.0x
                PyPy 2.153.330s13.7x slower
                PyPy head1.433s2.7x faster
                +

                Not bad.

                +
                +

                Technical details

                +

                So how'd we do it? Previously, using sys.settrace() (which coverage.py
                +uses under the hood) disabled the JIT. Except it didn't just disable the JIT,
                +it did it in a particularly insidious way — the JIT had no idea it was being
                +disabled!

                +

                Instead, every time PyPy discovered that one of your functions was a hotspot,
                +it would start tracing to observe what the program was doing, and right when it
                +was about to finish, coverage would run and cause the JIT to abort. Tracing
                +is a slow process; it makes up for it by generating fast machine code at the
                +end, but tracing is still incredibly slow. But we never actually got to the
                +"generate fast machine code" stage. Instead we'd pay all the cost of tracing,
                +but then we'd abort, and reap none of the benefits.

                +

                To fix this, we adjusted some of the heuristics in the JIT, to better show it
                +how sys.settrace(<tracefunc>) works. Previously the JIT saw it as an opaque
                +function which gets the frame object, and couldn't tell whether or not it
                +messed with the frame object. Now we let the JIT look inside the
                <tracefunc> function, so it's able to see that coverage.py isn't
                +messing with the frame in any weird ways, it's just reading the line number and
                +file path out of it.

                +

                I asked several friends in the VM implementation and research field if they
                +were aware of any other research into making VMs stay fast when debugging tools
                +like coverage.py are running. No one I spoke to was aware of any (but I
                +didn't do a particularly exhaustive review of the literature, I just tweeted at
                +a few people), so I'm pleased to say that PyPy is quite possibly the first VM
                +to work on optimizing code in debugging mode! This is possible because of our
                +years spent investing in meta-tracing research.

                +
                +

                Happy testing,
                +Alex

                +
                +

                Comments

                +
                +
                +
                + + John Doe wrote on 2013-10-26 20:40: +
                +
                +

                No, you're not the first to make this pretentious mistake.

                What's the report for code that was actually eliminated by optimizations? Was it covered? Was it not?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-10-27 17:53: +
                +
                +

                You misunderstand John Doe. The coverage report is for the user's Python code, which isn't optimized, eliminated, or otherwise modified. The PyPy speedups come from a clever reimplementation of the interpreter that runs the user's Python code, and this article was explaining how they found and fixed a big slowdown that happens to be triggered by a common test-related library.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-10-27 19:37: +
                +
                +

                @John Doe: sadly, we fail to understand exactly what part of the blog post you're answering to in your sentence "No, you're not the first to make this pretentious mistake". Can you please give more context and elaborate a bit?

                +
                +
                +
                +
                + + John M. Camara wrote on 2013-10-27 21:09: +
                +
                +

                @Armin: I believe John Doe is talking about the last paragraph as I believe the JVM also does not disable optimizations when using debug tools.

                If this is the case than his comment is silly as Alex clearly stated he didn't do an exhaustive search.

                +
                +
                +
                +
                + + Unknown wrote on 2013-10-30 07:17: +
                +
                +

                This sounds similar to the -Og setting of GCC, which enables all optimizations which do not interfere with debugging.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/10/update-on-stm-7145890443443707910.html b/posts/2013/10/update-on-stm-7145890443443707910.html new file mode 100644 index 000000000..310be72b9 --- /dev/null +++ b/posts/2013/10/update-on-stm-7145890443443707910.html @@ -0,0 +1,461 @@ + + + + + +Update on STM | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Update on STM

                + + + +
                +

                Hi all,

                +

                The sprint in London was a lot of fun and very fruitful. In the last +update on STM, Armin was working on improving and specializing the +automatic barrier placement. There is still a lot to do in that area, +but that work is merged now. Specializing and improving barrier placement +is still to be done for the JIT.

                +

                But that is not all. Right after the sprint, we were able to squash +the last obvious bugs in the STM-JIT combination. However, the performance +was nowhere near what we want. So until now, we fixed some of the most +obvious issues. Many come from RPython erring on the side of caution +and e.g. making a transaction inevitable even if that is not strictly +necessary, thereby limiting parallelism. Another problem came from +increasing counters every time a guard fails, which caused transactions +to conflict on these counter updates. Since these counters do not have +to be completely accurate, we update them non-transactionally now with +a chance of small errors.

                +

                There are still many such performance issues of various complexity left +to tackle: we are nowhere near done. So stay tuned or contribute :)

                + +

                Performance

                +

                Now, since the JIT is all about performance, we want to at least +show you some numbers that are indicative of things to come. +Our set of STM benchmarks is very small unfortunately +(something you can help us out with), so this is +not representative of real-world performance. We tried to +minimize the effect of JIT warm-up in the benchmark results.

                +

                The machine these benchmarks were executed on has 4 physical +cores with Hyper-Threading (8 hardware threads).

                +

                Raytracer from stm-benchmarks: +Render times in seconds for a 1024x1024 image:

                + +++++ + + + + + + + + + + + + + + + + + + + + + + +
                InterpreterBase time: 1 thread8 threads (speedup)
                PyPy-2.12.472.56 (0.96x)
                CPython81.173.4 (1.1x)
                PyPy-STM50.210.8 (4.6x)
                +

                For comparison, disabling the JIT gives 148s on PyPy-2.1 and 87s on +PyPy-STM (with 8 threads).

                +

                Richards from PyPy repository on the stmgc-c4 +branch: +Average time per iteration in milliseconds:

                + +++++ + + + + + + + + + + + + + + + + + + + + + + +
                InterpreterBase time: 1 thread8 threads (speedup)
                PyPy-2.115.615.4 (1.01x)
                CPython239237 (1.01x)
                PyPy-STM371116 (3.2x)
                +

                For comparison, disabling the JIT gives 492ms on PyPy-2.1 and 538ms on +PyPy-STM.

                + +

                Try it!

                +

                All this can be found in the PyPy repository on the stmgc-c4 +branch. +Try it for yourself, but keep in mind that this is still experimental +with a lot of things yet to come. Only Linux x64 is supported right +now, but contributions are welcome.

                +

                You can download a prebuilt binary from here: +https://bitbucket.org/pypy/pypy/downloads/pypy-oct13-stm.tar.bz2 +(Linux x64 Ubuntu >= 12.04). This was made at revision bafcb0cdff48.

                + +

                Summary

                +

                What the numbers tell us is that PyPy-STM is, as expected, +the only one of the three interpreters where multithreading gives a large +improvement in speed. What they also tell us is that, obviously, the +result is not good enough yet: it still takes longer on an 8-threaded +PyPy-STM than on a regular single-threaded PyPy-2.1. However, as you +should know by now, we are good at promising speed and delivering it... +years later :-)

                +

                But it has been two years already since PyPy-STM started, and this is +our first preview of the JIT integration. Expect major improvements +soon: with STM, the JIT generates code that is completely suboptimal in +many cases (barriers, allocation, and more). Once we improve this, the +performance of the STM-JITted code should come much closer to PyPy 2.1.

                +

                Cheers

                +

                Remi & Armin

                +
                +

                Comments

                +
                +
                +
                + + tobami wrote on 2013-10-16 21:14: +
                +
                +

                To see a multithreading speed up in a python interpreter is awesome!

                For next update, I would suggest to do the benchmarking turning off hyperthreading and measuring 1, 2 and 4 threads. That would give a better picture of how the STM implementation scales with threads/cores.

                +
                +
                +
                +
                + + Mak Sim wrote on 2013-10-17 09:22: +
                +
                +

                Guys you are doing great job!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-10-17 13:34: +
                +
                +

                STM | Société de transport de Montréal ?

                +
                +
                +
                +
                + + Anonymous wrote on 2013-10-17 17:07: +
                +
                +

                STM stands for Software Transactional Memory and is a way to run multiple non-conflicting tasks at the same time and make it appear as if they had run in sequence.

                +
                +
                +
                +
                + + LKRaider wrote on 2013-10-23 21:45: +
                +
                +

                A bit off-topic, but just came across this paper:

                "Speculative Staging for Interpreter Optimization
                (...)
                -- we report that our optimization makes the CPython interpreter up to more than four times faster, where our interpreter closes the gap between and sometimes even outperforms PyPy's just-in-time compiler."
                https://arxiv.org/abs/1310.2300

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/11/cffi-08-6086756821078041950.html b/posts/2013/11/cffi-08-6086756821078041950.html new file mode 100644 index 000000000..3468a5f2f --- /dev/null +++ b/posts/2013/11/cffi-08-6086756821078041950.html @@ -0,0 +1,304 @@ + + + + + +CFFI 0.8 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                CFFI 0.8

                + + + +
                +

                Hi all,

                + +

                CFFI 0.8 for CPython (2.6-3.x) has been released.

                + +

                Quick download: pip install cffi --upgrade +
                Documentation: https://cffi.readthedocs.org/en/release-0.8/

                + +

                What's new: a number of small fixes; ffi.getwinerror(); integrated support for C99 variable-sized structures; multi-thread safety.

                + +

                --- Armin

                + +

                Update: CFFI 0.8.1, with fixes on Python 3 on OS/X, and some FreeBSD fixes (thanks Tobias).

                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/11/numpy-status-update-1609808546418002632.html b/posts/2013/11/numpy-status-update-1609808546418002632.html new file mode 100644 index 000000000..acc76bfd5 --- /dev/null +++ b/posts/2013/11/numpy-status-update-1609808546418002632.html @@ -0,0 +1,346 @@ + + + + + +NumPy status update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy status update

                + + + +
                + Here is what has been happening with NumPy in PyPy in October thanks to the people who donated to the NumPyPy proposal:

                The biggest change is that we shifted to using an external fork of numpy rather than a minimal numpypy module. The idea is that we will be able to reuse most of the upstream pure-python numpy components, replacing the C modules with appropriate RPython micronumpy pieces at the correct places in the module namespace.

                The numpy fork should work just as well as the old numpypy for functionality that existed previously, and also include much new functionality from the pure-python numpy pieces that simply hadn't been imported yet in numpypy. However, this new functionality will not have been "hand picked" to only include pieces that work, so you may run into functionality that relies on unimplemented components (which should fail with user-level exceptions).

                This setup also allows us to run the entire numpy test suite, which will help in directing future compatibility development. The recent PyPy release includes these changes, so download it and let us know how it works! And if you want to live on the edge, the nightly includes even more numpy progress made in November.

                To install the fork, download the latest release, and then install numpy either separately with a virtualenv: pip install git+https://bitbucket.org/pypy/numpy.git; or directly: git clone https://bitbucket.org/pypy/numpy.git; cd numpy; pypy setup.py install.

                EDIT: if you install numpy as root, you may need to also import it once as root before it works: sudo pypy -c 'import numpy'

                Along with this change, progress was made in fixing internal micronumpy bugs and increasing compatibility:
                  +
                • Fixed a bug with strings in record dtypes
                • +
                • Fixed a bug where the multiplication of an ndarray with a Python int or float resulted in loss of the array's dtype
                • +
                • Fixed several segfaults encountered in the numpy test suite (suite should run now without segfaulting)
                • +
                +
                We also began working on __array_prepare__ and __array_wrap__, which are necessary pieces for a working matplotlib module.

                Cheers,
                +Romain and Brian +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-11-16 09:28: +
                +
                +

                Hi,

                Thanks for all your efforts on pypy-*, we really appreciate it!

                I'm trying to compile numpy with pypy-2.2-osx64 but the building process (manual and pip) fails with:
                AttributeError: 'module' object has no attribute 'get_makefile_filename'

                Full build log: https://pastebin.com/S4dybCV0

                Any idea how to resolve this?

                Thanks,
                t

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-11-16 10:06: +
                +
                +

                Hey

                Please put such reports to bugs.pypy.org so they don't get lost.

                Thanks!
                fijal

                +
                +
                +
                +
                + + Brian Kearns wrote on 2013-11-16 17:43: +
                +
                +

                Installation on OS X was fixed.

                +
                +
                +
                +
                + + Sau wrote on 2014-03-12 05:47: +
                +
                +

                I am getting an error when installing numpy for pypy 2.2.1:

                https://stackoverflow.com/questions/22342769/error-when-installing-numpy-for-pypy2-2-1

                +
                +
                +
                +
                + + Sau wrote on 2014-03-12 05:47: +
                +
                +

                I am getting an error when installing numpy for pypy 2.2.1:

                https://stackoverflow.com/questions/22342769/error-when-installing-numpy-for-pypy2-2-1

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/11/py3k-status-update-12-5307085693947812769.html b/posts/2013/11/py3k-status-update-12-5307085693947812769.html new file mode 100644 index 000000000..c57fc30db --- /dev/null +++ b/posts/2013/11/py3k-status-update-12-5307085693947812769.html @@ -0,0 +1,347 @@ + + + + + +Py3k status update #12 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #12

                + + + +
                +

                This is the 12th status update about our work on the py3k branch, which we
                +can work on thanks to all of the people who donated to the py3k proposal.

                +

                Here's an update on the recent progress:

                +
                  +
                • Thank you to everyone who has provided initial feedback on the PyPy3 2.1 beta
                  +1 release. We've gotten a number of bug reports, most of which have been
                  +fixed.
                • +
                • As usual, we're continually keeping up with changes from the default
                  +branch. Oftentimes these merges come at a cost (conflicts and/or
                  +reintegration of py3k changes) but occasionally we get goodies for free, such
                  +as the recent JIT optimizations and incremental garbage collection.
                • +
                • We've been focusing on re-optimizing Python 2 int sized (machine sized)
                  +integers:
                • +
                +

                We have a couple of known, notable speed regressions in the PyPy3 beta release
                +vs regular PyPy. The major one is with Python 2.x int sized (or machine
                +sized) integers.

                +

                Python 3 drops the distinction between int and long types. CPython 3.x
                +accomplishes this by removing the old int type entirely and renaming the long
                +type to int. Initially, we've done the same for PyPy3 for the sake of
                +simplicity and getting everything working.

                +

                However, PyPy's JIT is capable of heavily optimizing these machine sized integer
                +operations, so this came with a regression in performance in this area.

                +

                We're now in the process of solving this. Part of this work also involves some
                +house cleaning on these numeric types which also benefits the default branch.

                +

                cheers,
                +Phil

                +
                +

                Comments

                +
                +
                +
                + + Armin Rigo wrote on 2013-11-13 08:33: +
                +
                +

                We should note that the re-optimization is different than CPython's. In the latter they use a "long" implementation which they heavily optimized for the common case of small integers. In PyPy instead we use two really different implementations (like "int" and "long" on Python 2); they just happen to be exposed at the user level with the same Python type in Python 3.

                +
                +
                +
                +
                + + Anonymous wrote on 2013-11-13 21:58: +
                +
                +

                I just have to say, the PyPy team is doing a great job.

                Well done guys!

                +
                +
                +
                +
                + + Alessandro wrote on 2014-01-04 05:47: +
                +
                +

                I know nothing on pypy, but I'm interested. I have a doubt: Will the PyPy version with python 3 support leverage all of the progress of the python 2 pypy version?

                Like for example, will current numpypy be able to work on PyPy3k ?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/11/pypy-22-incrementalism-4723643710897639332.html b/posts/2013/11/pypy-22-incrementalism-4723643710897639332.html new file mode 100644 index 000000000..021df4627 --- /dev/null +++ b/posts/2013/11/pypy-22-incrementalism-4723643710897639332.html @@ -0,0 +1,369 @@ + + + + + +PyPy 2.2 - Incrementalism | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.2 - Incrementalism

                + + + +
                +

                We're pleased to announce PyPy 2.2, which targets version 2.7.3 of the Python language. This release's main highlight is the introduction of the incremental garbage collector, sponsored by the Raspberry Pi Foundation.
                +This release also contains several bugfixes and performance improvements.
                +You can download the PyPy 2.2 release here:

                +
                https://pypy.org/download.html
                We would like to thank our donors for the continued support of the PyPy project. We showed quite a bit of progress on all three projects (see below) and we're slowly running out of funds. Please consider donating more so we can finish those projects! The three projects are:
                  +
                • Py3k (supporting Python 3.x): the release PyPy3 2.2 is imminent.
                • +
                • STM (software transactional memory): a preview will be released very soon, as soon as we fix a few bugs
                • +
                • NumPy: the work done is included in the PyPy 2.2 release. More details below.
                • +
                +
                +

                What is PyPy?

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (pypy 2.2 and cpython 2.7.2 performance comparison) due to its integrated tracing JIT compiler.
                +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows 32, or ARM (ARMv6 or ARMv7, with VFPv3).
                +Work on the native Windows 64 is still stalling, we would welcome a volunteer to handle that.
                +
                +

                Highlights

                +
                  +
                • Our Garbage Collector is now "incremental". It should avoid almost all pauses due to a major collection taking place. Previously, it would pause the program (rarely) to walk all live objects, which could take arbitrarily long if your process is using a whole lot of RAM. Now the same work is done in steps. This should make PyPy more responsive, e.g. in games. There are still other pauses, from the GC and the JIT, but they should be on the order of 5 milliseconds each.
                • +
                • The JIT counters for hot code were never reset, which meant that a process running for long enough would eventually JIT-compile more and more rarely executed code. Not only is it useless to compile such code, but as more compiled code means more memory used, this gives the impression of a memory leak. This has been tentatively fixed by decreasing the counters from time to time.
                • +
                • NumPy has been split: now PyPy only contains the core module, called _numpypy. The numpy module itself has been moved to https://bitbucket.org/pypy/numpy and numpypy disappeared. You need to install NumPy separately with a virtualenv: pip install git+https://bitbucket.org/pypy/numpy.git; or directly: git clone https://bitbucket.org/pypy/numpy.git; cd numpy; pypy setup.py install.
                • +
                • non-inlined calls have less overhead
                • +
                • Things that use sys.settrace are now JITted (like coverage)
                • +
                • JSON decoding is now very fast (JSON encoding was already very fast)
                • +
                • various buffer copying methods experience speedups (like list-of-ints to int[] buffer from cffi)
                • +
                • We finally wrote (hopefully) all the missing os.xxx() functions, including os.startfile() on Windows and a handful of rare ones on Posix.
                • +
                • numpy has a rudimentary C API that cooperates with cpyext +
                • +
                Cheers,
                +Armin Rigo and Maciej Fijalkowski
                +
                +

                Comments

                +
                +
                +
                + + Armin Rigo wrote on 2013-11-14 11:52: +
                +
                +

                The Win32 build is here, thanks Matti! https://bitbucket.org/pypy/pypy/downloads/pypy-2.2-win32.zip

                +
                +
                +
                +
                + + foobie42 wrote on 2013-11-14 14:23: +
                +
                +

                Congrats! adb push pypypypy /sdcard/!

                +
                +
                +
                +
                + + Anonymous wrote on 2013-11-14 19:38: +
                +
                +

                @foobie42 that's what I've done just a second ago! Gotta unpack raspbian chroot zip now...

                +
                +
                +
                +
                + + Wilfred wrote on 2013-11-16 10:55: +
                +
                +

                Is speed.pypy.org still updated? The second graph on https://speed.pypy.org/ only shows 2.0 beta and trunk, and https://speed.pypy.org/comparison/ doesn't offer 2.1 or 2.2 either.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-11-16 11:56: +
                +
                +

                I managed to update it, check it out now

                +
                +
                +
                +
                + + Unknown wrote on 2014-02-24 21:39: +
                +
                +

                Do you have plans to support python 3.3 features?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/11/pypy-221-incrementalism1-9197847629771910947.html b/posts/2013/11/pypy-221-incrementalism1-9197847629771910947.html new file mode 100644 index 000000000..00201257d --- /dev/null +++ b/posts/2013/11/pypy-221-incrementalism1-9197847629771910947.html @@ -0,0 +1,338 @@ + + + + + +PyPy 2.2.1 - Incrementalism.1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.2.1 - Incrementalism.1

                + + + +
                +

                We're pleased to announce PyPy 2.2.1, which targets version 2.7.3 of the Python +language. This is a bugfix release over 2.2.

                +

                You can download the PyPy 2.2.1 release here:

                +
                +https://pypy.org/download.html +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It's fast (pypy 2.2 and cpython 2.7.2 performance comparison) +due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows +32, or ARM (ARMv6 or ARMv7, with VFPv3).

                +

                Work on the native Windows 64 is still stalling, we would welcome a volunteer +to handle that.

                +

                Highlights

                +

                This is a bugfix release. The most important bugs fixed are:

                +
                  +
                • an issue in sockets' reference counting emulation, showing up +notably when using the ssl module and calling makefile().
                • +
                • Tkinter support on Windows.
                • +
                • If sys.maxunicode==65535 (on Windows and maybe OS/X), the json +decoder incorrectly decoded surrogate pairs.
                • +
                • some FreeBSD fixes.
                • +
                +

                Note that CFFI 0.8.1 was released. Both versions 0.8 and 0.8.1 are +compatible with both PyPy 2.2 and 2.2.1.

                +

                Cheers, +Armin Rigo & everybody

                +
                +

                Comments

                +
                +
                +
                + + renaud wrote on 2013-11-27 15:06: +
                +
                +

                what about pypy3-2.2?
                by the way, thank you!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-11-28 08:58: +
                +
                +

                Waiting for answers from https://mail.python.org/pipermail/pypy-dev/2013-November/011965.html.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/11/pypy-leysin-winter-sprint-11-19st-8860782754173653661.html b/posts/2013/11/pypy-leysin-winter-sprint-11-19st-8860782754173653661.html new file mode 100644 index 000000000..ee4ecf357 --- /dev/null +++ b/posts/2013/11/pypy-leysin-winter-sprint-11-19st-8860782754173653661.html @@ -0,0 +1,356 @@ + + + + + +PyPy Leysin Winter Sprint (11-19st January 2014) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy Leysin Winter Sprint (11-19st January 2014)

                + + + +
                +

                The next PyPy sprint will be in Leysin, Switzerland, for the ninth time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.

                +

                Goals and topics of the sprint

                +
                  +
                • Py3k: work towards supporting Python 3 in PyPy
                • +
                • NumPyPy: work towards supporting the numpy module in PyPy
                • +
                • STM: work towards supporting Software Transactional Memory
                • +
                • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off for skiing.
                • +
                +

                Exact times

                +

                For a change, and as an attempt to simplify things, I specified the +dates as 11-19 January 2014, where 11 and 19 are travel days. We will +work full days between the 12th and the 18th. You are of course allowed to +show up for a part of that time only, too.

                +

                Location & Accommodation

                +

                Leysin, Switzerland, "same place as before". Let me refresh your +memory: both the sprint venue and the lodging will be in a very spacious +pair of chalets built specifically for bed & breakfast: +https://www.ermina.ch/. The place has a good ADSL Internet connexion +with wireless installed. You can of course arrange your own lodging +anywhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue), but I definitely recommend +lodging there too -- you won't find a better view anywhere else (though +you probably won't get much worse ones easily, either :-)

                +

                Please confirm that you are coming so that we can adjust the +reservations as appropriate. The rate so far has been around 60 CHF a +night all included in 2-person rooms, with breakfast. There are larger +rooms too (less expensive per person) and maybe the possibility to get a +single room if you really want to.

                +

                Please register by Mercurial:

                +
                +https://bitbucket.org/pypy/extradoc/
                +https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2014
                +
                +

                or on the pypy-dev mailing list if you do not yet have check-in rights:

                +
                +https://mail.python.org/mailman/listinfo/pypy-dev +
                +

                You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around -- bring an EU-format power strip if you +have one.

                +
                +

                Comments

                +
                +
                +
                + + Pim wrote on 2014-01-29 11:57: +
                +
                +

                Very interested to know how far you got, especially STM

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-01-29 21:24: +
                +
                +

                I'll do a proper post about STM, but in the meantime: we progressed on STM-C7, without hitting an obstacle so far, so hopes are high :-)

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/12/numpy-status-update-november-364321959153372759.html b/posts/2013/12/numpy-status-update-november-364321959153372759.html new file mode 100644 index 000000000..18098be64 --- /dev/null +++ b/posts/2013/12/numpy-status-update-november-364321959153372759.html @@ -0,0 +1,312 @@ + + + + + +NumPy Status Update - November | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy Status Update - November

                + + + +
                +

                Since the PyPy 2.2 release last month, more progress has been made on the NumPy compatibility front. Initial work has been directed by running the NumPy test suite and targeting failures that appear most frequently, along with fixing the few bugs reported on the bug tracker.

                +Improvements were made in these areas:
                +- Many missing/broken scalar functionalities were added/fixed. The scalar API should match up more closely with arrays now.
                +- Some missing dtype functionality was added (newbyteorder, hasobject, descr, etc)
                +- Support for optional arguments (axis, order) was added to some ndarray functions
                +- Fixed some corner cases for string/record types

                +Most of these improvements went onto trunk after 2.2 was split, so if you're interested in trying them out or running into problems on 2.2, try the +nightly.

                +Thanks again to the NumPy on PyPy donors who make this continued progress possible.

                +Cheers,
                +Brian

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2013-12-17 13:45: +
                +
                +

                This is fantastic news! I can't wait until I can run my numpy scripts under pypy as easily as I can my standard python scripts.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2013/12/pygame-cffi-8991437796535033699.html b/posts/2013/12/pygame-cffi-8991437796535033699.html new file mode 100644 index 000000000..75fb8c409 --- /dev/null +++ b/posts/2013/12/pygame-cffi-8991437796535033699.html @@ -0,0 +1,408 @@ + + + + + +PyGame CFFI | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyGame CFFI

                + + + +
                +
                + +

                One of the RaspberryPi's goals is to be a fun toolkit for school children (and adults!) to learn programming and electronics with. Python and pygame are part of this toolkit. Recently the RaspberryPi Foundation funded parts of the effort of porting pypy to the Pi -- making Python programs on the Pi faster!

                +

                Unfortunately pygame is written as a Python C extension that wraps SDL which means performance of pygame under pypy remains mediocre. To fix this pygame needs to be rewritten using cffi to wrap SDL instead.

                +

                RaspberryPi sponsored a CTPUG (Cape Town Python User Group) hackathon to put together a proof-of-concept pygame-cffi. The day was quite successful - we got a basic version of the bub'n'bros client working on pygame-cffi (and on PyPy). The results can be found on github with contributions from the five people present at the sprint.

                +

                While far from complete, the proof of concept does show that there are no major obstacles to porting pygame to cffi and that cffi is a great way to bind your Python package to C libraries.

                +

                Amazingly, we managed to have machines running all three major platforms (OS X, Linux and Windows) at the hackathon so the code runs on all of them!

                +

                We would like to thank the Praekelt foundation for providing the venue and The Raspberry Pi foundation for providing food and drinks!

                +

                Cheers,
                +Simon Cross, Jeremy Thurgood, Neil Muller, David Sharpe and fijal.

                +
                +
                +
                +

                Comments

                +
                +
                +
                + + René Dudfield wrote on 2013-12-09 14:21: +
                +
                +

                Why not use the ctypes based pysdl2?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-12-09 16:19: +
                +
                +

                first of all pygame depends on SDL 1. Second ctypes kinda suck and I don't quite buy its stability (especially with changing APIs, though it can be less of an issue with SDL). It's also slow on pypy

                +
                +
                +
                +
                + + René Dudfield wrote on 2013-12-09 17:09: +
                +
                +

                Ah, ok. Very nice work anyway. It's impressive what you all managed to get done in the sprint :)

                Here's some information from pygame land about where the project is heading.

                SDL 1 is the past, and the SDL developers are no longer putting out releases. However, I think many people will continue to patch it up for many years. SDL 2 is the future and after many years finally has a release out (2 now). pysdl2 is part of the future of pygame. pysdl2 matches the SDL 2 API as closely as possible. A pygame API on top of pysdl2 is the future of pygame.

                ctypes is no good for some platforms like iOS, and the web and pypy apparently. Although note, that pysdl2 already 'works' on top of pypy.

                https://bitbucket.org/marcusva/py-sdl2/
                https://pysdl2.readthedocs.org/en/latest/


                Happy hacking :)

                +
                +
                +
                +
                + + Anonymous wrote on 2013-12-09 18:56: +
                +
                +

                Amazing - you consider a messy cffi implementation (sometimes it builds on platform X, sometimes it does not, sometimes it works, sometimes it does not) a better choice over ctypes?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-12-09 19:16: +
                +
                +

                @Anonymous - your comment is pretty loaded, but we do think cffi is better than ctypes on all platforms, that's why we came up with cffi in the first place. I think cffi FAQ contains an answer to that.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2013-12-10 09:30: +
                +
                +

                @Rene: if pysdl2 is a bare-metal ctypes wrapper, writing a similar cffi wrapper instead should be very straightforward (even more than the current pygame-cffi). But do you know if pygame is really going that route, and if so, how soon?

                +
                +
                +
                +
                + + Unknown wrote on 2013-12-10 23:33: +
                +
                +

                I've been looking at cffi since it was first mentioned on our Pygame mailing list. It does look promising. I see only two, buffer related, issues that need to be resolved.

                First, PyPy lacks an array export mechanism comparable to the CPython PEP 3118 buffer protocol. Instead, only the NumPy Array Interface, version: 3 is available. Though Pygame supports both the Python and C sides of the interface, it relies on CPython's reference counting for timely buffer release [1]. Periodic garbage collection is too unpredictable.

                Second, the cffi module does not support CPython api function calls. So a cffi Pygame could not support the buffer protocol on CPython.

                A possible solution to the first issue is for PyPy to use an extended array interface that includes a PEP 3118 like buffer release callback. I am working to resolve the second issue: [Issue13797] Allow objects implemented in pure Python to export PEP 3118 buffers.

                [1] Add PEP 3118 (new) buffer support to Pygame surfaces

                +
                +
                +
                +
                + + Anonymous wrote on 2013-12-15 21:32: +
                +
                +

                Hm, I can't get this to work on Ubuntu 12.04 doing the following

                virtualenv -p /usr/bin/pypy pypy
                cd pypy
                source bin/activate
                pip install git+https://github.com/eliben/pycparser.git
                pip install hg+https://github.com/eliben/pycparser.git
                pip install hg+https://foss.heptapod.net/cffi/cffi
                git clone https://github.com/CTPUG/pygame_cffi.git
                cd pygame_cffi/
                pypy
                import pygame

                >>>> import pygame
                Traceback (most recent call last):
                File "", line 1, in
                File "pygame/__init__.py", line 9, in
                from pygame.color import Color
                File "pygame/color.py", line 3, in
                from pygame._sdl import ffi, sdl
                File "pygame/_sdl.py", line 6, in
                ffi = cffi.FFI()
                File "/home/me/Documents/python/pygame/pypy/site-packages/cffi/api.py", line 56, in __init__
                import _cffi_backend as backend
                ImportError: No module named _cffi_backend


                dpkg -l pypy
                ...
                ii pypy 1.8+dfsg-2 fast alternative implementation of Python - PyPy interpreter


                Do I need a newer pypy? Am I missing something else?

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2013-12-15 21:48: +
                +
                +

                yes, you need a vastly newer pypy

                +
                +
                +
                +
                + + Unknown wrote on 2013-12-16 18:49: +
                +
                +

                I am +1 on porting PySDL2 to CFFI instead of pygame.

                +
                +
                +
                +
                + + Unknown wrote on 2016-05-03 01:01: +
                +
                +
                great! what's current status of it? I really can't wait to use Pygame on a PI through pypy. +
                +
                +
                +
                + + Armin Rigo wrote on 2016-05-04 10:16: +
                +
                +

                Development occurs at https://github.com/CTPUG/pygame_cffi nowadays.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/02/numpy-status-update-decemberjanuary-4292961614234099787.html b/posts/2014/02/numpy-status-update-decemberjanuary-4292961614234099787.html new file mode 100644 index 000000000..4d2ea574f --- /dev/null +++ b/posts/2014/02/numpy-status-update-decemberjanuary-4292961614234099787.html @@ -0,0 +1,346 @@ + + + + + +NumPy Status Update - December/January | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy Status Update - December/January

                + + + +
                +

                Work continued on the NumPy + PyPy front steadily in December and more lightly in January. The continued focus was compatibility, targeting incorrect or unimplemented features that appeared in multiple NumPy test suite failures. We now pass ~2/3 of the NumPy test suite. The biggest improvements were made in these areas:

                +- Bugs in conversions of arrays/scalars to/from native types
                +- Fix cases where we would choose incorrect dtypes when initializing or computing results
                +- Improve handling of subclasses of ndarray through computations
                +- Support some optional arguments for array methods that are used in the pure-python part of NumPy
                +- Support additional attributes in arrays, array.flags, and dtypes
                +- Fix some indexing corner cases that arise in NumPy testing
                +- Implemented part of numpy.fft (cffti and cfftf)

                +Looking forward, we plan to continue improving the correctness of the existing implemented NumPy functionality, while also beginning to look at performance. The initial focus for performance will be to look at areas where we are significantly worse than CPython+NumPy. Those interested in trying these improvements out will need a PyPy nightly, and an install of the PyPy NumPy fork. Thanks again to the NumPy on PyPy donors for funding this work.

                +
                +

                Comments

                +
                +
                +
                + + Anatoly Vostryakov wrote on 2014-02-06 21:38: +
                +
                +

                Many thanks for your work! Looking forward to support a full functionality of numpy in pypy!

                +
                +
                +
                +
                + + Anonymous wrote on 2014-02-06 22:21: +
                +
                +

                > We now pass ~2/3 of the NumPy test suite.

                Is the test coverage of numpy high enough so that a 100% green numpypy can be considered a full port? (Honest question, I have no background information suggesting the opposite.)

                +
                +
                +
                +
                + + Anonymous wrote on 2014-02-08 18:58: +
                +
                +

                Great news that you are making progress on numpy. I can't wait!

                +
                +
                +
                +
                + + Anonymous wrote on 2014-02-13 14:57: +
                +
                +

                I can't wait to use Numpypy to speed up scientific analysis.

                Are there any updates on using numpypy with a plotting package such as matplotlib?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-02-18 13:43: +
                +
                +

                https://mail.python.org/pipermail/pypy-dev/2014-February/012209.html

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/02/py3k-status-update-13-4630607029125647100.html b/posts/2014/02/py3k-status-update-13-4630607029125647100.html new file mode 100644 index 000000000..ce95333b5 --- /dev/null +++ b/posts/2014/02/py3k-status-update-13-4630607029125647100.html @@ -0,0 +1,331 @@ + + + + + +Py3k status update #13 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Py3k status update #13

                + + + +
                +

                This is the 13th status update about our work on the py3k branch, which we
                +can work on thanks to all of the people who donated to the py3k proposal.

                +

                We're just finishing up a cleanup of int/long types. This work helps the py3k
                +branch unify these types into the Python 3 int and restore JIT compilation of
                +machine sized integers
                .

                +

                This cleanup also removes multimethods from these types. PyPy has
                +historically used a clever implementation of multimethod dispatch for declaring
                +methods of the __builtin__ types in RPython.

                +

                This multimethod scheme provides some convenient features for doing this,
                +however we've come to the conclusion that it may be more trouble than it's
                +worth. A major problem of multimethods is that they generate a large amount of
                +stub methods which burden the already lengthy and memory hungry RPython
                +translation process. Also, their implementation and behavior can be somewhat
                +complicated/obscure.

                +

                The alternative to multimethods involves doing the work of the type checking
                +and dispatching rules in a more verbose, manual way. It's a little more work in
                +the end but less magical.

                +

                Recently, Manuel Jacob finished a large cleanup effort of the
                +unicode/string/bytearray types that also removed their multimethods. This work
                +also benefits the py3k branch: it'll help with future PEP 393 (or PEP 393
                +alternative
                ) work. This effort was partly sponsored by Google's Summer of
                +Code: thanks Manuel and Google!

                +

                Now there's only a couple major pieces left in the multimethod removal (the
                +float/complex types and special marshaling code) and a few minor pieces that
                +should be relatively easy.

                +

                In conclusion, there's been some good progress made on py3k and multimethod
                +removal this winter, albeit a bit slower than we would have liked.

                +

                cheers,
                +Phil

                +
                +

                Comments

                +
                +
                +
                + + Armin Rigo wrote on 2014-02-18 09:41: +
                +
                +

                The str/unicode/bytearray refactoring is not completely done yet.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/02/rewrites-of-stm-core-model-again-633249729751034512.html b/posts/2014/02/rewrites-of-stm-core-model-again-633249729751034512.html new file mode 100644 index 000000000..91ba09e54 --- /dev/null +++ b/posts/2014/02/rewrites-of-stm-core-model-again-633249729751034512.html @@ -0,0 +1,396 @@ + + + + + +Rewrites of the STM core model -- again | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                Rewrites of the STM core model -- again

                + + + +
                +

                Hi all,

                + +

                A quick note about the Software Transactional Memory (STM) front.

                + +

                Since the previous +post, we believe we progressed a lot by discovering an alternative +core model for software transactions. Why do I say "believe"? It's +because it means again that we have to rewrite from scratch the C +library handling STM. This is currently work in progress. Once this is +done, we should be able to adapt the existing pypy-stm to run on top of +it without much rewriting efforts; in fact it should simplify the +difficult issues we ran into for the JIT. So while this is basically +yet another restart similar to last +June's, the difference is that the work that we have already put in the PyPy +part (as opposed to the C library) remains.

                + +

                You can read about the basic ideas of this new C library here. +It is still STM-only, not HTM, but because it doesn't constantly move +objects around in memory, it would be easier to adapt an HTM version. +There are even potential ideas about a hybrid TM, like using HTM but +only to speed up the commits. It is based on a Linux-only system call, remap_file_pages() +(poll: who heard about it before? :-). As previously, the work is done +by Remi Meier and myself.

                + +

                Currently, the C library is incomplete, but early experiments show good +results in running duhton, +the interpreter for a minimal language created for the purpose of +testing STM. Good results means we brought down the slow-downs from +60-80% (previous version) to around 15% (current version). This number +measures the slow-down from the non-STM-enabled to the STM-enabled +version, on one CPU core; of course, the idea is that the STM version +scales up when using more than one core.

                + +

                This means that we are looking forward to a result that is much better +than originally predicted. The pypy-stm has chances to run at a +one-thread speed that is only "n%" slower than the regular pypy-jit, for +a value of "n" that is optimistically 15 --- but more likely some number +around 25 or 50. This is seriously better than the original estimate, +which was "between 2x and 5x". It would mean that using pypy-stm is +quite worthwhile even with just two cores.

                + +

                More updates later...

                + +

                Armin

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2014-02-10 17:29: +
                +
                +

                Did you consider existing STM libraries in your implementation? It might be worthwhile to take a look at stasis (https://code.google.com/p/stasis/) which has a pretty complete set of features.

                https://www.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-2.pdf

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-02-10 20:22: +
                +
                +

                Stasis is not really applicable here: it's a Transactional Storage system, which despite the attempt of this paper to generalize it, is not going to apply successfully in the context of PyPy.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-02-10 20:22: +
                +
                +

                More comments on Hacker News.

                +
                +
                +
                +
                + + Dima Tisnek wrote on 2014-02-11 13:32: +
                +
                +

                poll response: I've heard of remap_file_pages! :)

                I was wondering how to use this call when I learnt of it, but couldn't figure anything out except possibly database applications (similar) and sort algorithms (too limited). I think this call may be used when manipulating framebuffer too, there was something about having multiple mappings [to hardware] some readonly, some not.

                I would like to [possibly] disagree with your statement in c7 README "Most probably, this comes with no overhead once the change is done..."

                TLB cache is a limited resource and may easily be contended on large systems. Regular mmap could [in theory] use huge TLB pages, remapped individual pages cannot.

                In addition there is a small penalty during first access to the remapped page, though you may consider it amortized depending on remap/reuse ratio.

                Granted it's still small stuff.

                Reserving one register is a cool trick, and I find it quite acceptable. It too has a small penalty, but the benefits surely outweigh those!

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-02-11 13:39: +
                +
                +

                @Dima: Thanks for the feedback! Note that "%gs" is a special register that is usually not used: there is no direct way to read/write its actual value. It needs to be done with a syscall, at least before very recent CPUs. It can only be used in addressing instructions as an additional offset.

                +
                +
                +
                +
                + + Arne Babenhauserheide wrote on 2014-02-21 08:04: +
                +
                +

                just 15% slower sounds wonderful!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/03/hello-everyone-there-is-interview-with-7561523711224053700.html b/posts/2014/03/hello-everyone-there-is-interview-with-7561523711224053700.html new file mode 100644 index 000000000..8530c6c18 --- /dev/null +++ b/posts/2014/03/hello-everyone-there-is-interview-with-7561523711224053700.html @@ -0,0 +1,293 @@ + + + + + +PyPy on uWSGI | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy on uWSGI

                + + + +
                +
                +Hello everyone +

                There is an interview with Roberto De Ioris (of uWSGI fame) about embedding PyPy in uWSGI that covers the recent addition of a PyPy embedding interface using cffi and the experience with using it. Read the full interview

                +Cheers
                +fijal +
                +
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/03/hi-all-here-is-one-of-first-full-pypys-8725931424559481728.html b/posts/2014/03/hi-all-here-is-one-of-first-full-pypys-8725931424559481728.html new file mode 100644 index 000000000..c70c38bd8 --- /dev/null +++ b/posts/2014/03/hi-all-here-is-one-of-first-full-pypys-8725931424559481728.html @@ -0,0 +1,438 @@ + + + + + +STMGC-C7 with PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                STMGC-C7 with PyPy

                + + + +
                +

                Hi all,

                + +

                Here is one of the first full PyPy's +(edit: it was r69967+, but the general list of versions is currently here) +compiled with the new StmGC-c7 +library. It has no JIT so far, but it runs some small +single-threaded benchmarks by taking around 40% more time than a +corresponding non-STM, no-JIT version of PyPy. It scales --- up to two +threads only, which is the hard-coded maximum so far in the c7 code. +But the scaling looks perfect in these small benchmarks without +conflict: starting two threads each running a copy of the benchmark +takes almost exactly the same amount of total time, simply using two +cores.

                + +

                Feel free to try it! It is not actually useful so far, because it is +limited to two cores and CPython is something like 2.5x faster. One of +the important next steps is to re-enable the JIT. Based on our current +understanding of the "40%" figure, we can probably reduce it with +enough efforts; but also, the JIT should be able to easily produce +machine code that suffers a bit less than the interpreter from these +effects. This seems to mean that we're looking at 20%-ish slow-downs +for the future PyPy-STM-JIT.

                + +

                Interesting times :-)

                + +

                For reference, this is what you get by downloading the +PyPy binary linked above: a Linux 64 binary (Ubuntu 12.04) that +should behave mostly like a regular PyPy. (One main missing feature is +that destructors are never called.) It uses two cores, but obviously +only if the Python program you run is multithreaded. The only new +built-in feature is with __pypy__.thread.atomic: this gives +you a way to enforce that a block of code runs "atomically", which means +without any operation from any other thread randomly interleaved.

                + +

                If you want to translate it yourself, you need a trunk version of clang +with three patches applied. That's the number of bugs that we couldn't +find workarounds for, not the total number of bugs we found by (ab)using +the address_space feature...

                + +

                Stay tuned for more!

                + +

                Armin & Remi

                +
                +

                Comments

                +
                +
                +
                + + Armin Rigo wrote on 2014-03-16 20:32: +
                +
                +

                The provided pypy-c crashes when calling fork(). Sadly fork() is indirectly called by a lot of things, including the subprocess module --- which can be executed just by importing random modules...

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-17 08:39: +
                +
                +

                That sounds pretty huge!

                Do you require clang for that? (why is it named on https://foss.heptapod.net/pypy/pypy/-/tree/branch//stmgc-c7/TODO )

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-03-17 20:42: +
                +
                +

                Only clang has the address_space extension mentioned in the blog post; gcc does not.

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-19 13:51: +
                +
                +

                I want to hear more talks on this. When is your next talk... pycon 2014? It would be hilarious if the pypy group were able to create naive concurrency in python, no one would have seen that coming! Many would have thought, "surely Haskell", or some other immutable, static language would get us there first. But no, it might just be that pypy allows any language that targets it to be concurrent, kiss style...amazing! Anyway, enough gushing, time for a random question. Mainstream vms like the JVM have added ways of speeding up dynamic languages, what advantages does pypy have over these traditional vms(other than the concurrency one that might come to fruition)? I think this would be a good question to answer at the next talk for pypy.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-03-20 06:54: +
                +
                +

                As it turns out there will be no PyPy talk at PyCon 2014.

                The JVM runs Jython at a speed that is around that of CPython. PyPy runs substantially faster than this. One difference is that PyPy contains a small number of annotations targeted specifically towards RPython's JIT generator, whereas the JVM has no support for this.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-03-20 07:37: +
                +
                +

                Update containing the most obvious fixes: https://cobra.cs.uni-duesseldorf.de/~buildmaster/misc/pypy-c-r70103-70091-stm.tbz2 (Ubuntu 12.04 Linux 64-bit)

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-20 15:45: +
                +
                +

                Oh, I do not want to know personally about the superiority of pypy vs the jvm. I was just suggesting a talking point; basically, show others that pypy is a better alternative(for dynamic languages, possibly all languages with naive concurrency working!) than llvm, jvm, etc... I do have a question though, would you suppose that performance of pypy-stm would be better than that of something like the approach clojure has? I have heard that immutable data structures are nice for correctness but that they are bad for performance.

                +
                +
                +
                +
                + + Anonymous wrote on 2014-03-21 17:21: +
                +
                +

                So PyPy-STM is Python without GIL? And it's possible to make it only 20% slower than "regular" PyPy? That would be quite an achievement.

                Could you publish a build of PyPy-STM for Debian Stable?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-03-22 12:24: +
                +
                +

                The PyPy-STM we have so far doesn't include any JIT. If you want to try it out anyway on other Linux platforms than Ubuntu, you need to translate it yourself, or possibly hack around with symlinks and LD_LIBRARY_PATH.

                +
                +
                +
                +
                + + Anonymous wrote on 2014-03-22 12:44: +
                +
                +

                > The PyPy-STM we have so far doesn't include any JIT

                Yep, that's what blog post said :) But also PyPy-STM doesn't include GIL, does it?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-03-23 07:44: +
                +
                +

                Indeed, which is the point :-) You're welcome to try it out, but I'm just saying that I don't want to go to great lengths to provide precompiled binaries that work on Linux XYZ when I could basically release an updated version every couple of days... It's still experimental and in-progress. Early versions are limited to two cores; later versions to 4 cores. We still have to determine the optimal number for this limit; maybe around 8? (higher numbers imply a bit of extra overheads) It's an example of in-progress work. Another example is that so far you don't get feedback from cross-transaction conflicts; you used to in previous versions, but we didn't port it yet.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/03/numpy-status-update-february-1245769841736493525.html b/posts/2014/03/numpy-status-update-february-1245769841736493525.html new file mode 100644 index 000000000..8c2ce64dd --- /dev/null +++ b/posts/2014/03/numpy-status-update-february-1245769841736493525.html @@ -0,0 +1,333 @@ + + + + + +NumPy on PyPy - Progress in February | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy on PyPy - Progress in February

                + + + +
                +

                More progress was made on the NumPy front in the past month. On the compatibility front, we now pass ~130 more tests from NumPy's suite since the end of January. Currently, we pass 2336 tests out of 3265 tests run, with many of the failures representing portions of NumPy that we don't plan to implement in the near future (object dtypes, unicode, etc). There are still some failures that do represent issues, such as special indexing cases and failures to respect subclassed ndarrays in return values, which we do plan to resolve. There are also some unimplemented components and ufuncs remaining which we hope to implement, such as nditer and mtrand. Overall, the most common array functionality should be working.

                +Additionally, I began to take a look at some of the loops generated by our code. One widely used loop is dot, and we were running about 5x slower than NumPy's C version. I was able to optimize the dot loop and also the general array iterator to get us to ~1.5x NumPy C time on dot operations of various sizes. Further progress in this area could be made by using CFFI to tie into BLAS libraries, when available. Also, work remains in examining traces generated for our other loops and checking for potential optimizations.

                +To try out PyPy + NumPy, grab a nightly PyPy and install our NumPy fork. Feel free to report comments/issues to IRC, our mailing list, or bug tracker. Thanks to the contributors to the NumPy on PyPy proposal for supporting this work.

                +Cheers,
                +Brian

                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2014-03-09 06:05: +
                +
                +

                Thanks! It would be easier to repost this if the title contained pypy: "numpy in pypy - progress in February"

                +
                +
                +
                +
                + + Canesin wrote on 2014-03-17 12:33: +
                +
                +

                It would be great if the first performance optimizations were actually wrappers to BLAS, there is an outstanding BSD-licensed BLAS at https://github.com/xianyi/OpenBLAS

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-03-17 20:50: +
                +
                +

                I believe the "performance optimizations" mentioned in the blog post are unrelated to BLAS. BLAS is about calling an external library. You can't optimize that, you just merely call it. The performance optimizations are about things like computing the matrix "a + b + c", which can be done without computing the intermediate result "a + b".

                +
                +
                +
                +
                + + Canesin wrote on 2014-03-18 11:16: +
                +
                +

                Armin, I agree with you. What I'm trying to say is that making the BLAS interface may be very easy and give great performance, and people will use it most of the time if you bundle it.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/03/pygamecffi-pygame-on-pypy-8679802461301121984.html b/posts/2014/03/pygamecffi-pygame-on-pypy-8679802461301121984.html new file mode 100644 index 000000000..8a0e44671 --- /dev/null +++ b/posts/2014/03/pygamecffi-pygame-on-pypy-8679802461301121984.html @@ -0,0 +1,772 @@ + + + + + +pygame_cffi: pygame on PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                pygame_cffi: pygame on PyPy

                + + + +
                +
                +

                The Raspberry Pi aims to be a low-cost educational tool that anyone can use to learn about electronics and programming. Python and pygame are included in the Pi's programming toolkit. And since last year, thanks in part to sponsorship from the Raspberry Pi Foundation, PyPy also works on the Pi (read more here).

                +

                With PyPy working on the Pi, game logic written in Python stands to gain an awesome performance boost. However, the original pygame is a Python C extension. This means it performs poorly on PyPy and negates any speedup in the Python parts of the game code.

                +

                One solution to making pygame games run faster on PyPy, and eventually on the Raspberry Pi, comes in the form of pygame_cffi. pygame_cffi uses CFFI to wrap the underlying SDL library instead of a C extension. A few months ago, the Raspberry Pi Foundation sponsored a Cape Town Python User Group hackathon to build a proof-of-concept pygame using CFFI. This hackathon was a success and it produced an early working version of pygame_cffi.

                +

                So for the last 5 weeks Raspberry Pi has been funding work on pygame_cffi. The goal was a complete implementation of the core modules. We also wanted benchmarks to illuminate performance differences between pygame_cffi on PyPy and pygame on CPython. We are happy to report that those goals were met. So without further ado, here's a rundown of what works.

                +
                +

                Current functionality

                + + +Invention screenshot: + +
                + +Mutable mamba screenshot: + +
                + +

                With the above-mentioned functionality in place we could get 10+ of the pygame examples to work, and a number of PyWeek games. At the time of writing, if a game doesn't work it is most likely due to an unimplemented transform or draw function. That will be remedied soon.

                +
                +
                +

                Performance

                +

                In terms of performance, pygame_cffi on PyPy is showing a lot of promise. It beats pygame on CPython by a significant margin in our events processing and collision detection benchmarks, while blit and fill benchmarks perform similarly. The pygame examples we checked also perform better.

                + +
                + +
                + +

                However, there is still work to be done to identify and eliminate bottlenecks. On the Raspberry Pi performance is markedly worse compared to pygame (barring collision detection). The PyWeek games we tested also performed slightly worse. Fortunately there is room for improvement in various places.

                + +Invention & Mutable Mamba (x86) + +
                + +Standard pygame examples (Raspberry Pi) + +
                + +

                Here's a summary of some of the benchmarks. Relative speed refers to the frame rate obtained in pygame_cffi on PyPy relative to pygame on CPython.

                + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Benchmark | Relative speed (pypy speedup)
                Events (x86) | 1.41
                Events (Pi) | 0.58
                N² collision detection on 100 sprites (x86) | 4.14
                N² collision detection on 100 sprites (Pi) | 1.01
                Blit 100 surfaces (x86) | 1.06
                Blit 100 surfaces (Pi) | 0.60
                Invention (x86) | 0.95
                Mutable Mamba (x86) | 0.72
                stars example (x86) | 1.95
                stars example (Pi) | 0.84
                +
                +

                OpenGL

                +

                Some not-so-great news is that PyOpenGL performs poorly on PyPy since PyOpenGL uses ctypes. This translates into a nasty reduction in frame rate for games that use OpenGL surfaces. It might be worthwhile creating a CFFI-powered version of PyOpenGL as well.

                +
                +
                +
                +

                Where to now?

                +

                Work on pygame_cffi is ongoing. Here are some things that are in the pipeline:

                +
                  +
                • Get pygame_cffi on PyPy to a place where it is consistently faster than pygame on CPython.
                • +
                • Implement the remaining modules and functions, starting with draw and transform.
                • +
                • Improve test coverage.
                • +
                • Reduce the time it takes for CFFI to parse the cdef. This makes the initial pygame import slow.
                • +
                +

                If you want to contribute you can find pygame_cffi on Github. +Feel free to find us on #pypy on freenode or post issues on github.

                +

                Cheers,
                +Rizmari Versfeld

                +
                +
                +
                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2014-03-28 01:04: +
                +
                +

                Pygame should be an excellent way to benchmark the performance of pypy, so this is great! I wanted to let you fellas know of another project that is using pypy that looks really neat as well... https://github.com/rfk/pypyjs

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-28 12:46: +
                +
                +

                pygame seems outdated, because it is based on the first SDL version.

                It will be interesting to see CFFI comparison for newer, SDL2 bindings, such as PySDL2, which is ctypes based at the moment.

                https://pypi.python.org/pypi/PySDL2

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2014-03-28 15:02: +
                +
                +

                Anatoly, pygame is outdated but has no clear replacement. PySDL2 is nice, but it's only a low level binding, it does not really help in the case of writing games.

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-28 18:31: +
                +
                +

                Is it not wrapping the current SDL? I thought that it was... On github it says it's a pygame based wrapper(copies the api) for SDL, would that not make it the current SDL?

                +
                +
                +
                +
                + + Anonymous wrote on 2014-03-29 00:37: +
                +
                +

                I looked into PyOpenGL's code to see if there is an easy way to upgrade to CFFI.

                It's a bag of cats EVERYWHERE.

                ctypes are defined all over the place, unlike most ctypes->cffi projects, where there is a single source file (api.py) that is easy to convert due to it being the raw interface to the C library.

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-29 06:41: +
                +
                +

                @Maciej, pygame includes a lot of helpers and good documentation, but it is not perspective technology to play with. I'd say there are more interesting libs out there that gain more interesting results and speeding up dynamic binding for them would be very cool to make things like these - https://devart.withgoogle.com/ - possible.


                @Anonymous, if I were to provide OpenGL bindings, I'd start with looking at https://github.com/p3/regal project and binding generator in scripts/

                +
                +
                +
                +
                + + Temia Eszteri wrote on 2014-03-29 18:42: +
                +
                +

                I've actually been working to see if I can get my own Pygame release, Sky Eraser, optimised enough to work on a Raspberry Pi -- it'd be worth seeing how implementing it under this configuration would work on top of the optimisations I've been working on in the background (boy are there a lot to make).

                I might also be rewriting the APIs for Allegro 5.1 as an experiment though, to test under both CPython and PyPy.

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-29 21:15: +
                +
                +

                I started to work on a newer and experimental OpenGL wrapper for Python, proudly blessed PyOpenGLng.

                In comparison to PyOpenGL, it generates the requested OpenGL API from the OpenGL XML Registry and use an automatic translator to map the C API to Python. The translator is quite light weight in comparison to PyOpenGL source code. And it is already able to run a couple of examples for OpenGL V3 and V4.

                Actually the wrapper use ctypes. But I am looking for tips to do the same for cffi, as well as feedbacks on performance and comments.

                The project is hosted on https://github.com/FabriceSalvaire/PyOpenGLng.

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-30 08:16: +
                +
                +

                @Fabrice, how is your newer and experimental OpenGL wrapper generator better than existing ones? I am not saying that there is a NIH effect - probably some omission from documentation.

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-30 08:19: +
                +
                +

                I mean that if PyOpenGL doesn't use wrapper generator then there are a couple around not limiting themselves to Python. I am especially interested to know the comparison with regal.

                +
                +
                +
                +
                + + Alecks Gates wrote on 2014-03-30 22:20: +
                +
                +

                It was my impression that OpenGL isn't hardware accelerated on the pi anyway... or am I incorrect?

                +
                +
                +
                +
                + + Unknown wrote on 2014-03-31 10:17: +
                +
                +

                @anatoly: The only real replacement for pygame which I know is pyglet. It is not quite as game-optimized as pygame, but very versatile and a joy to use.

                https://pyglet.org

                +
                +
                +
                +
                + + David wrote on 2014-04-01 20:05: +
                +
                +

                I've actually made a CFFI OpenGL binding, as part of my successor to my old PyGL3Display project. It's not hosted anywhere yet, but I'll see about getting up somewhere soon.

                +
                +
                +
                +
                + + David wrote on 2014-04-02 14:32: +
                +
                +

                And... done. A mostly drop-in replacement for PyOpenGL on CFFI, or at least for OpenGL 3.2 core spec.

                https://www.dropbox.com/s/rd44asge17xjbn2/gl32.zip

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-02 14:35: +
                +
                +

                @Arne, pyglet rocks, because it is just `clone and run` unlike all other engines. But it looks a little outdated, that's why I started to look for alternatives.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-02 14:38: +
                +
                +

                @David, if you want people to comment on this, Bitbucket would be a better way to share sources than Dropbox.

                +
                +
                +
                +
                + + David wrote on 2014-04-02 14:57: +
                +
                +

                @anatoly techtonick:
                Actually, it'll end up on Launchpad in the near future (probably within 2 weeks?). However, it's the output of a wrapper generator and the wrapper generator is in pretty poor shape at the moment, in terms of packaging its output. I just figured people might be able to use it in the near future, even if it is in 'source-code-dump' form. If there's a better temporary home for it somewhere, I'm all ears.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-02 15:08: +
                +
                +

                @David, why reinvent the wheel? There are many wrapper generators around. Also, your project is not a replacement for PyOpenGL, because of GPL restrictions.

                +
                +
                +
                +
                + + David wrote on 2014-04-02 15:39: +
                +
                +

                @anatoly

                I never claimed my project is a replacement for PyOpenGL - it's not API compatible, for a start. Regarding license, it'll probably get changed for the bindings at some point, probably to 3-clause BSD.

                On the wrapper generator: Really, the only actively maintained wrapper generator for Python that I'm aware of (which isn't project specific) is SWIG, which is not appropriate (at the very least, googling for 'python wrapper generator -swig' doesn't seem to give many results). In any case, the wrapper generator isn't a lot of code.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-03 07:28: +
                +
                +

                @anatoly: pyglet seems to be in maintenance mode right now. There are commits every few days, but only small stuff.

                On the other hand I understand that: pyglet supplies everything a backend for a game-engine needs (I use it¹), so the next step should be to use it for many games and see whether shared needs arise.

                ¹: See https://1w6.org/deutsch/anhang/programme/hexbattle-mit-zombies and https://bitbucket.org/ArneBab/hexbattle/

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-03 10:35: +
                +
                +

                @David, I am speaking about OpenGL specific wrapper generators. I've added information to this page - https://www.opengl.org/wiki/Related_toolkits_and_APIs#OpenGL_loading_libraries

                The OpenGL generator in Python is included in regal project here https://github.com/p3/regal/scripts

                pyglet also has one.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-03 10:36: +
                +
                +

                Sorry, the correct link is https://github.com/p3/regal/tree/master/scripts

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-03 10:39: +
                +
                +

                @Arne, kissing elves trick is low. =) Otherwise looks wesnothy and 2D. I don't see why it should use OpenGL. 3D models would be cool.

                I'd try to make it run on PySDL2 with "from sdl2.ext.api import pyglet". There is no pyglet API there, but would be interesting to see if it is possible to provide one.

                +
                +
                +
                +
                + + David wrote on 2014-04-03 15:58: +
                +
                +

                @anatoly

                Pyglet's GL wrapper generator creates a lot of chained functions (fairly slow in cPython). I'm also not sure if there's enough development activity in Pyglet to allow modifying core code, and given the size of the Pyglet project I'm not going to fork it. PyOpenGL has more or less the same issues.

                Regal appears to be a very large project (a 68MB checkout), which has a scope much greater than just its wrapper generator - the sheer scope of the project does cause some barriers to entry. I'm still looking through, but I am fairly certain that it would take more effort to adapt Regals binding generator than I have expended on my own.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-03 21:06: +
                +
                +

                @anatoly: I like kissing elves ☺ (and when I get to write the next part of the story, I intend to keep them as player characters: That someone starts out in an intimate moment does not mean he or she is watchmeat).

                @David: I guess modifying core-code in pyglet is not that big of a problem, especially *because* it is mostly being maintained right now: Little danger of breaking the in-progress work of someone else.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-03 21:09: +
                +
                +

                @anatoly: more specifically, I do not consider intimate moments as cheap (and WTactics has the image, so I could pull this off). Instead I try to rid myself of baseless inhibitions, though that’s not always easy: Killing off no longer needed societal conditioning is among the hardest battles…

                +
                +
                +
                +
                + + David wrote on 2014-04-04 01:23: +
                +
                +

                @Arne: Maybe it'd be worth looking at integrating it then; however, it really is a completely different approach - gl32 is a source code writer, whereas Pyglet uses Pythons inbuilt metaprogramming capabilities - and so it would be completely rewriting a large chunk of Pyglets core. Once I've got the binding generator finalised, it might be worth seeing if it's possible to replace Pyglet's OpenGL bindings with these ones.

                That said, in the interest of full disclosure: I'm not a fan of Pyglets per object draw method, again in the interests of speed. The per object draw method that Pyglet encourages with its API is not very scalable and eliminates a large number of the advantages of using OpenGL. So whilst I might see if gl32 can be plugged in for interesting benchmarks/proof-of-concept, I probably wouldn't try to get it bug-free and integrated into upstream Pyglet.

                +
                +
                +
                +
                + + David wrote on 2014-04-04 15:26: +
                +
                +

                @Arne: Regarding Pyglet integration - it seems it would require a lot of work. There's two major issues - firstly, Pyglet only has raw OpenGL bindings, which are used everywhere and hence the "more pythonic" bindings of gl32 would be hard to integrate without editing every file using GL in Pyglet. Secondly, Pyglet uses GL functions which were removed in 3.2, and hence are not in gl32, so the API generator would have to be extended to handle any special cases on these functions.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-07 17:23: +
                +
                +

                @David: The per-object draw-method is very convenient for programming. As soon as you need more performance, most of the objects are grouped into batches, though. That way only the draw method of the batch is called and the batch can do all kinds of optimizations.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-07 17:25: +
                +
                +

                For Python 3.2 you might find useful stuff in the python-3 port of pyglet, though that hasn’t been released, yet, IIRC.

                +
                +
                +
                +
                + + David wrote on 2014-04-07 21:26: +
                +
                +

                @Arne:

                I'd argue that objects with Z-order would be more convenient programmatically, but frankly that's a matter of opinion. (Incidentally, this is something I'm working on as well, and I think I'm mostly done on it).

                However, per-object-draw is only one concern I have on Pyglets speed credentials, as I do not believe Pyglet was written with speed as a design goal. For a different example, see pyglet.graphics.vertexbuffer; copying a ctypes object into a list in order to get slices to work is not a smart thing to do, performance wise!

                I'm not sure where you got Python 3.2 from, but what I meant was that currently I'm restricting myself to OpenGL 3.2, which means that certain older OpenGL functions do not exist. Pyglet uses some of these removed functions (e.g. glPushClientAttrib), and hence the bindings I'm generating at the moment do not provide all the features Pyglet uses.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-04-08 04:47: +
                +
                +

                I'd like to remind readers of these comments that this thread has gone farther and farther from both the original post and the whole blog -- which is supposed to be related to PyPy. I'm rather sure that you're now discussing performance on CPython, which in this case is very different from performance on PyPy (or would be if it supported all packages involved). Maybe move this discussion somewhere more appropriate?

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-09 11:47: +
                +
                +

                @Armin: You’re right… actually I would be pretty interested, though, whether pypy also has a performance issue with pyglet's chained functions.

                +
                +
                +
                +
                + + David wrote on 2014-04-09 14:30: +
                +
                +

                @Arne: In principle, PyPy seems to handle Pyglets chained functions relatively well (non-scientifically running the Astraea examples title screen sees CPU usage start very high, but eventually drops to about 80% of cPythons after the JIT warms up). There is one caveat preventing better testing: the moment keyboard input is given to Astraea on PyPy, PyPy segfaults.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-10 09:05: +
                +
                +

                @David: That is really important feedback to Armin and Anatoly, I think.

                +
                +
                +
                +
                + + Unknown wrote on 2014-04-10 09:06: +
                +
                +

                @David: Can you give some more background on the error (how to get the code, how to reproduce the segfault)?

                +
                +
                +
                +
                + + David wrote on 2014-04-15 11:35: +
                +
                +

                @Arne: It's as simple as running the Astraea example in Pyglet and pressing a key (under PyPy 2.2, Pyglet 1.2-beta, Ubuntu 14.04). As far as I remember, this has been the case for some time (at least as far back as Ubuntu 12.10/PyPy 2.0 beta - although back then the major issue was PyPy using a lot more CPU; I didn't report this then due to a blog post at the time saying how cTypes would be rewritten). The error reported by Apport is "Cannot access memory at address 0x20"

                Doing a cursory scan through other examples, the noisy and text_input examples also have problems. noisy segfaults when a spawned ball collides with a boundary (occasionally giving a partial rpython traceback); text_input appears to have a random chance of any of the input boxes being selectable.

                Maybe it's time to file a proper bug report on this...

                +
                +
                +
                +
                + + David wrote on 2014-04-15 14:09: +
                +
                +

                @Arne: I've now submitted a bug on the PyPy Bug tracker (Issue 1736), with more detail etc. Probably best to move conversation on any Pyglet related issues over there.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-04-16 11:50: +
                +
                +

                Maybe indeed :-)

                +
                +
                +
                +
                + + Anonymous wrote on 2015-01-24 15:22: +
                +
                +

                I came up with a funny idea about why not making emscripten generates code targeted on RPython, then now we can use C/C++ in PyPy directly? A LLVM to RPython compiler, how about this?

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/04/numpy-on-pypy-status-update-1103134247318103282.html b/posts/2014/04/numpy-on-pypy-status-update-1103134247318103282.html new file mode 100644 index 000000000..47ed79aae --- /dev/null +++ b/posts/2014/04/numpy-on-pypy-status-update-1103134247318103282.html @@ -0,0 +1,367 @@ + + + + + +NumPy on PyPy - Status Update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                NumPy on PyPy - Status Update

                + + + +
                +

                Work on NumPy on PyPy continued in March, though at a lighter pace than the previous few months. Progress was made on both compatibility and speed fronts. Several behavioral issues reported to the bug tracker were resolved. The most significant of these was probably the correction of casting to built-in Python types. Previously, int/long conversions of numpy scalars such as inf/nan/1e100 would return bogus results. Now, they raise or return values, as appropriate.

                +On the speed front, enhancements to the PyPy JIT were made to support virtualizing the raw_store/raw_load memory operations used in numpy arrays. Further work remains here in virtualizing the alloc_raw_storage when possible. This will allow scalars to have storages but still be virtualized when possible in loops.

                +Aside from continued work on compatibility/speed of existing code, we also hope to begin implementing the C-level components of other numpy modules such as mtrand, nditer, linalg, and so on. Several approaches could be taken to get C-level code in these modules working, ranging from reimplementing in RPython to interfacing with existing code with CFFI, if possible. The appropriate approach depends on many factors and will probably vary from module to module.

                To try out PyPy + NumPy, grab a nightly PyPy and install our NumPy fork. Feel free to report comments/issues to IRC, our mailing list, or bug tracker. Thanks to the contributors to the NumPy on PyPy proposal for supporting this work.

                +
                +

                Comments

                +
                +
                +
                + + Unknown wrote on 2014-04-24 23:22: +
                +
                +

                Trying to install scipy on top gives me an error while compiling scipy/cluster/src/vq_module.c; isn't scipy yet supported?

                +
                +
                +
                +
                + + Anonymous wrote on 2014-04-30 12:38: +
                +
                +

                scipy is not supported. Sometimes scipy functions are in fact in numpy in which case you can just copy the code. Otherwise you need to start learning cffi.

                +
                +
                +
                +
                + + Yichao Yu wrote on 2014-05-18 02:07: +
                +
                +

                You mentioned storage and scalar types. Is it related to this bug?

                +
                +
                +
                +
                + + vak wrote on 2014-08-14 09:19: +
                +
                +

                what is the status about incorporating BLAS library?

                +
                +
                +
                +
                + + Anonymous wrote on 2014-09-22 21:52: +
                +
                +

                How far is running Pandas on Pypy? Will it be just a recompile when Numpy is ported, or is it heavy work to port Pandas to Pypy after Numpy is done? Should I look after another solution than plan to run Pandas on Pypy?

                +
                +
                +
                +
                + + Unknown wrote on 2014-11-13 10:07: +
                +
                +

                Pandas on PyPy would indeed be very interesting for huge analysis runs.

                +
                +
                +
                +
                + + Jami wrote on 2014-11-18 17:14: +
                +
                +

                Any news on the NumPy front? I check this blog for such stuff every week and also contributed to the funding drive.

                I fully understand that developers skilled enough to work on such a project are hard to come by even with money, and NumPy support isn't probably the most technologically exciting aspect of PyPy.

                Just even a few lines on the latest development or some milestones would show that the project is alive (although I fully understand that writing blog posts isn't everybody's favorite thing). And some kind of summary that in what shape the developers think the code is in. If you prefer coding to blogging, maybe implementing some kind of time-series graph for the numpypy-status page could be nice also (I keep checking it out but can never remember what was the state last time I checked). Maybe I can see if I can do a quick hack via eg archive.org for this.

                I think also a huge boost would be to have even a hacky temporary way to interface with Matplotlib and/or SciPy, as it's quite hard to do many practical analyses without these. I'd probably try to do my analyses in such an environment and perhaps even implement/fix at least things that are my own itches. There was the 2011 hack, but it doesn't seem to be elaborated anywhere. I could live with (or even prefer, so it definitely won't become the permanent version) a ugly, slow, memory-hungry and unstable hack that would spam the stderr with insulting messages. But without any way of interfacing the existing stuff it's just too much work for the more complicated analyses.

                I'm trying to track the https://bitbucket.org/pypy/numpy branch but it's a bit hard to see the bigger picture just from the commits. Even just some tags and/or meta-issues could be helpful. I'm also a bit confused on where (repo-wise) the development is actually happening. There are some sort of fresh NumPy-branches in the numpy tree. The micronumpy-project is probably dead or merged into the pypy/numpy-branch?

                PS. Please don't take this as too strong criticism. I prefer to just silently code away myself too. Just what would be nice to see as somebody eagerly waiting to use Pypy in numerical stuff.

                +
                +
                +
                +
                + + Maciej Fijalkowski wrote on 2014-11-24 12:00: +
                +
                +

                Hey Jami

                We'll try to write a blog post shortly

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/04/stm-results-and-second-call-for-1767845182888902777.html b/posts/2014/04/stm-results-and-second-call-for-1767845182888902777.html new file mode 100644 index 000000000..9195ce94d --- /dev/null +++ b/posts/2014/04/stm-results-and-second-call-for-1767845182888902777.html @@ -0,0 +1,365 @@ + + + + + +STM results and Second Call for Donations | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                STM results and Second Call for Donations

                + + + +
                +

                Hi all,

                + +

                We now have a preliminary version of PyPy-STM +with the JIT, from the new STM documentation +page. This PyPy-STM is still not quite useful, failing to top the +performance of a regular PyPy by a small margin on most benchmarks, but +it's definitely getting there :-) The overheads with the JIT are still +a bit too high. (I've been tracking an obscure bug for days. +It turned out to be a simple buffer overflow. But if anybody has +a clue about why a hardware watchpoint in gdb, set on one of the garbled +memory locations, fails to trigger but the memory ends up being modified +anyway... and, it turns out, by just a regular pointer write... ideas +welcome.)

                + +

                But I go off-topic :-) The main point of this post is to announce the +2nd Call for Donation about +STM. We achieved most of the goals laid out in the first call. We +even largely overachieved them in terms of raw performance, even if +there are many cases that are unreasonably slow for now. So, after the +successful research, we are launching a second proposal about the +development part of the project:

                + +
                  +
                1. +

                  Polish PyPy-STM to get a consistently reasonable speed, 25%-40% +slower than a regular JITted PyPy when running single-threaded code. Of +course it is supposed to scale nicely as long as there are no +user-visible conflicts.

                  + +
                2. +
                3. +

                  Focus on developing the Python-facing interface: both internal things +(e.g. do dictionaries need to be more TM-friendly in general?) as well +as directly visible things (e.g. some profiler-like interface to explore +common conflicts in a program).

                  + +
                4. +
                5. Regular multithreaded code should benefit out of the box, but the +final goal is to explore and tweak some existing non-multithreaded +frameworks and improve their TM-friendliness. So existing programs +using Twisted or Stackless, for example, should run on multiple cores +without any major change.

                6. +
                +

                See the full call for more +details! I'd like to thank Remi Meier for getting involved. And a big +thank you to everybody who contributed money on the first call. It +took more time than anticipated, but it's there in good but rough shape. +Now it needs a lot of polishing :-)

                + +

                Armin

                +
                +

                Comments

                +
                +
                +
                + + Dmitrey wrote on 2014-05-03 19:48: +
                +
                +

                it would be good to have compiled stm version for something more recent than Ubuntu 12.04, e.g. 14.04, preferably with numpy included, to simplify numpy installation. Or, maybe, that version for 12.04 works with 14.04?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-05-10 14:56: +
                +
                +

                Yes, Ubuntu 14.04 seems to run fine any PyPy compiled for Ubuntu 12.04. Numpy probably works in pypy-stm, but being a module that accesses matrix data as "external" raw memory, it does not support multi-core execution.

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/05/pypy-23-terrestrial-arthropod-trap-9057496904945555741.html b/posts/2014/05/pypy-23-terrestrial-arthropod-trap-9057496904945555741.html new file mode 100644 index 000000000..0bc9b81ad --- /dev/null +++ b/posts/2014/05/pypy-23-terrestrial-arthropod-trap-9057496904945555741.html @@ -0,0 +1,356 @@ + + + + + +PyPy 2.3 - Terrestrial Arthropod Trap | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.3 - Terrestrial Arthropod Trap

                + + + +
                +
                +
                +We’re pleased to announce PyPy 2.3, which targets version 2.7.6 of the Python language. This release updates the stdlib from 2.7.3, jumping directly to 2.7.6.

                +This release also contains several bugfixes and performance improvements, many generated by real users finding corner cases. CFFI has made it easier than ever to use existing C code with both cpython and PyPy, easing the transition for packages like cryptography, Pillow (Python Imaging Library [Fork]), a basic port of pygame-cffi, and others.

                +PyPy can now be embedded in a hosting application, for instance inside uWSGI.

                +You can download the PyPy 2.3 release here:

                https://pypy.org/download.html

                +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (pypy 2.3 and cpython 2.7.x performance comparison; note that cpython's speed has not changed since 2.7.2) due to its integrated tracing JIT compiler.

                +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux. 
                +
                +We would like to thank our donors for the continued support of the PyPy project.

                +The complete release notice is here

                +Cheers, The PyPy Team
                +
                +

                Comments

                +
                +
                +
                + + Anonymous wrote on 2014-05-10 05:20: +
                +
                +

                Hi Why don't you accept Bitcoin as one of donation methods? Bitcoin makes it easier to donate your project

                I believe that you add it and announce it here, there will be several posts in Reddit and others sources that help you to collect funds

                +
                +
                +
                +
                + + Anonymous wrote on 2014-05-10 06:40: +
                +
                +

                right, i think so

                +
                +
                +
                +
                + + Anonymous wrote on 2014-05-10 22:21: +
                +
                +

                Hey,
                Just wondering, does v2.3 contain the fix for issue 1683 titled "BytesIO leaks like hell"?

                https://bugs.pypy.org/issue1683

                +
                +
                +
                +
                + + Eric van Riet Paap wrote on 2014-05-12 21:40: +
                +
                +

                The bug status is set to resolved so one would expect it to be fixed. Please reopen the bug report if you think differently.

                +
                +
                +
                +
                + + Unknown wrote on 2014-05-14 10:59: +
                +
                +

                There is no info about what exactly made CFFI easier in this release.

                +
                +
                +
                +
                + + Unknown wrote on 2014-05-14 20:21: +
                +
                +

                Hello pypy team! If you have not seen this post... https://www.rfk.id.au/blog/entry/pypy-js-faster-than-cpython/ , I think you will find it to be quite interesting!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/06/pypy-231-terrestrial-arthropod-trap-5076300474324870908.html b/posts/2014/06/pypy-231-terrestrial-arthropod-trap-5076300474324870908.html new file mode 100644 index 000000000..5084ffa68 --- /dev/null +++ b/posts/2014/06/pypy-231-terrestrial-arthropod-trap-5076300474324870908.html @@ -0,0 +1,301 @@ + + + + + +PyPy 2.3.1 - Terrestrial Arthropod Trap Revisited | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy 2.3.1 - Terrestrial Arthropod Trap Revisited

                + + + +
                +
                We're pleased to announce PyPy 2.3.1, a feature-and-bugfix improvement over our recent 2.3 release last month.

                +This release contains several bugfixes and enhancements among the user-facing improvements:
                  +
                • The built-in struct module was renamed to _struct, solving issues with IDLE and other modules
                • +
                • Support for compilation with gcc-4.9
                • +
                • A CFFI-based version of the gdbm module is now included in our binary bundle
                • +
                • Many issues were resolved since the 2.3 release on May 8
                • +
                +
                +You can download the PyPy 2.3.1 release here:

                https://pypy.org/download.html

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (pypy 2.3.1 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux. 
                +We would like to thank our donors for the continued support of the PyPy project.

                +The complete release notice is here.

                +Please try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!

                +Cheers, The PyPy Team
                +
                +

                Comments

                +
                +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/06/pypy3-231-fulcrum-3765964217640322884.html b/posts/2014/06/pypy3-231-fulcrum-3765964217640322884.html new file mode 100644 index 000000000..43ba8e52c --- /dev/null +++ b/posts/2014/06/pypy3-231-fulcrum-3765964217640322884.html @@ -0,0 +1,390 @@ + + + + + +PyPy3 2.3.1 - Fulcrum | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy3 2.3.1 - Fulcrum

                + + + +
                +

                We're pleased to announce the first stable release of PyPy3. PyPy3
                +targets Python 3 (3.2.5) compatibility.

                +

                We would like to thank all of the people who donated to the py3k proposal
                +for supporting the work that went into this.

                +

                You can download the PyPy3 2.3.1 release here:

                +
                https://pypy.org/download.html#pypy3-2-3-1
                +
                +

                Highlights

                +
                  +
                • The first stable release of PyPy3: support for Python 3!
                • +
                • The stdlib has been updated to Python 3.2.5
                • +
                • Additional support for the u'unicode' syntax (PEP 414) from Python 3.3
                • +
                • Updates from the default branch, such as incremental GC and various JIT
                  +improvements
                • +
                • Resolved some notable JIT performance regressions from PyPy2:
                • +
                +
                  +
                • Re-enabled the previously disabled collection (list/dict/set) strategies
                • +
                • Resolved performance of iteration over range objects
                • +
                • Resolved handling of Python 3's exception __context__ unnecessarily forcing
                  +frame object overhead
                • +
                +
                +
                +

                What is PyPy?

                +

                PyPy is a very compliant Python interpreter, almost a drop-in replacement for
                +CPython 2.7.6 or 3.2.5. It's fast due to its integrated tracing JIT compiler.

                +

                This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows,
                +and OpenBSD,
                +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.

                +

                While we support 32 bit python on Windows, work on the native Windows 64
                +bit python is still stalling, we would welcome a volunteer
                +to handle that.

                +
                +
                +

                How to use PyPy?

                +

                We suggest using PyPy from a virtualenv. Once you have a virtualenv
                +installed, you can follow instructions from pypy documentation on how
                +to proceed. This document also covers other installation schemes.

                +

                Cheers,
                +the PyPy team

                +
                +
                +

                Comments

                +
                +
                +
                + + Omer Katz wrote on 2014-06-24 08:26: +
                +
                +

                Can we get some benchmarks much like we have for PyPy and CPython 2.7?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-06-24 09:06: +
                +
                +

                As far as I know, a majority of the benchmarks we use have never been ported to Python 3. So it's far more complicated than just push a switch.

                +
                +
                +
                +
                + + jusic wrote on 2014-06-25 08:25: +
                +
                +

                Awesome, congrats on the new release! Finally some stable PyPy goodness for Python 3 as well :)

                +
                +
                +
                +
                + + Anonymous wrote on 2014-06-27 05:37: +
                +
                +

                Woo! This is exciting! (Now we just need to upgrade to 3.4... : ) )

                +
                +
                +
                +
                + + geerk wrote on 2014-06-28 09:06: +
                +
                +

                Glad to hear that PyPy is now for python 3. Great work!

                +
                +
                +
                +
                + + Unknown wrote on 2014-07-03 15:04: +
                +
                +

                This is great!

                Now I can finally test PyPy on some code for which I wanted to test it for years!

                (backporting to py2 was too painful)

                Thank you very much!

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html b/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html new file mode 100644 index 000000000..e90613df3 --- /dev/null +++ b/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html @@ -0,0 +1,679 @@ + + + + + +PyPy-STM: first "interesting" release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                PyPy-STM: first "interesting" release

                + + + +
                +

                Hi all,

                + +

                PyPy-STM is now reaching a point where we can say it's good enough to be +a GIL-less Python. (We don't guarantee there are no more bugs, so please +report them :-) The first official STM release:

                + +
                  +
                • +pypy-stm-2.3-r2-linux64 +
                  (UPDATE: this is release r2, fixing a systematic segfault at start-up on some systems) +
                • +
                +

                This corresponds roughly to PyPy 2.3 (not 2.3.1). It requires 64-bit +Linux. More precisely, this release is built for Ubuntu 12.04 to 14.04; +you can also rebuild it +from source by getting the branch stmgc-c7. You need +clang to compile, and you need a patched +version of llvm.

                + +

                This version's performance can reasonably be compared with a regular +PyPy, where both include the JIT. Thanks for following the meandering progress of PyPy-STM over the past three years --- we're finally getting somewhere really interesting! We cannot thank enough all contributors to the previous PyPy-STM money pot that made this possible. And, although this blog post is focused on the results from that period of time, I have of course to remind you that we're running a second call for donation for future work, which I will briefly mention again later.

                + +

                A recap of what we did to get there: around the start of the year we found a new model, a "redo-log"-based STM which uses a couple of hardware tricks to not require chasing pointers, giving it (in this context) exceptionally cheap read barriers. This idea was developed over the following months and (relatively) easily integrated with the JIT compiler. The most recent improvements on the Garbage Collection side are closing the gap with a regular PyPy (there is still a bit more to do there). There is some preliminary user documentation.

                + +

                Today, the result of this is a PyPy-STM that is capable of running pure Python code on multiple threads in parallel, as we will show in the benchmarks that follow. A quick warning: this is only about pure Python code. We didn't try so far to optimize the case where most of the time is spent in external libraries, or even manipulating "raw" memory like array.array or numpy arrays. To some extent there is no point because the approach of CPython works well for this case, i.e. releasing the GIL around the long-running operations in C. Of course it would be nice if such cases worked as well in PyPy-STM --- which they do to some extent; but checking and optimizing that is future work.

                + +

                As a starting point for our benchmarks, when running code that +only uses one thread, we get a slow-down between 1.2 and 3: at worst, +three times as slow; at best only 20% slower than a regular +PyPy. This worst case has been brought down --it used to be 10x-- by +recent work on "card marking", a useful GC technique that is also +present in the regular PyPy (and about which I don't find any blog post; +maybe we should write one :-) The main remaining issue is fork(), or +any function that creates subprocesses: it works, but is very slow. To +remind you of this fact, it prints a line to stderr when used.

                + +

                Now the real main part: when you run multithreaded code, it scales very nicely with two +threads, and less-than-linearly but still not badly with three or four +threads. Here is an artificial example:

                + +
                    total = 0
                +    lst1 = ["foo"]
                +    for i in range(100000000):
                +        lst1.append(i)
                +        total += lst1.pop()
                + +

                We run this code N times, once in each of N threads +(full +benchmark). Run times, best of three:

                + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Number of threads | Regular PyPy (head) | PyPy-STM
                N = 1 | real 0.92s
                +user+sys 0.92s
                real 1.34s
                +user+sys 1.34s
                N = 2 | real 1.77s
                +user+sys 1.74s
                real 1.39s
                +user+sys 2.47s
                N = 3 | real 2.57s
                +user+sys 2.56s
                real 1.58s
                +user+sys 4.106s
                N = 4 | real 3.38s
                +user+sys 3.38s
                real 1.64s
                +user+sys 5.35s
                +

                (The "real" time is the wall clock time. The "user+sys" time is the +recorded CPU time, which can be larger than the wall clock time if +multiple CPUs run in parallel. This was run on a 4x2 cores machine. +For direct comparison, avoid loops that are so trivial +that the JIT can remove all allocations from them: right now +PyPy-STM does not handle this case well. It has to force a dummy allocation +in such loops, which makes minor collections occur much more frequently.)

                + +

                Four threads is the limit so far: only four threads can be executed in +parallel. Similarly, the memory usage is limited to 2.5 GB of GC +objects. These two limitations are not hard to increase, but at least +increasing the memory limit requires fighting against more LLVM bugs. +(Include here snark remarks about LLVM.)

                + +

                Here are some measurements from more real-world benchmarks. This time, +the amount of work is fixed and we parallelize it on T threads. The first benchmark is just running translate.py on a trunk PyPy. The last +three benchmarks are here.

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                Benchmark | PyPy 2.3 | (PyPy head) | PyPy-STM, T=1 | T=2 | T=3 | T=4
                +translate.py --no-allworkingmodules
                +(annotation step)
                184s | (170s) | 386s (2.10x) | n/a
                multithread-richards
                +5000 iterations
                24.2s | (16.8s) | 52.5s (2.17x) | 37.4s (1.55x) | 25.9s (1.07x) | 32.7s (1.35x)
                mandelbrot
                +divided in 16-18 bands
                22.9s | (18.2s) | 27.5s (1.20x) | 14.4s (0.63x) | 10.3s (0.45x) | 8.71s (0.38x)
                btree | 2.26s | (2.00s) | 2.01s (0.89x) | 2.22s (0.98x) | 2.14s (0.95x) | 2.42s (1.07x)
                +

                This shows various cases that can occur:

                + +
                  +
                • The mandelbrot example runs with minimal overhead and very good parallelization. +It's dividing the plane to compute in bands, and each of the T threads receives the +same number of bands. + +
                • +
                • Richards, a classical benchmark for PyPy (tweaked to run the iterations +in multiple threads), is hard to beat on regular PyPy: +we suspect that the difference is due to the fact that a lot of +paths through the loops don't allocate, triggering the issue already +explained above. Moreover, the speed of Richards was again improved +dramatically recently, in trunk. + +
                • +
                • The translation benchmark measures the time translate.py +takes to run the first phase only, "annotation" (for now it consumes too much memory +to run translate.py to the end). Moreover the timing starts only after the large number of +subprocesses spawned at the beginning (mostly gcc). This benchmark is not parallel, but we +include it for reference here. The slow-down factor of 2.1x is still too much, but +we have some idea about the reasons: most likely, again the Garbage Collector, missing the regular PyPy's +very fast small-object allocator for old objects. Also, translate.py +is an example of application that could, with +reasonable efforts, be made largely parallel in the future using atomic blocks. + +
                • +
                • Atomic blocks are also present in the btree benchmark. I'm not completely sure +but it seems that, in this case, the atomic blocks create too many +conflicts between the threads for actual parallelization: the base time is very good, +but running more threads does not help at all. +
                • +
                +

                As a summary, PyPy-STM looks already useful to run CPU-bound multithreaded +applications. We are certainly still going to fight slow-downs, but it +seems that there are cases where 2 threads are enough to outperform a regular +PyPy, by a large margin. Please try it out on your own small examples!

                + +

                And, at the same time, please don't attempt to retrofit threads inside +an existing large program just to benefit from PyPy-STM! +Our goal is not to send everyone down the obscure route of multithreaded +programming and its dark traps. We are going finally to shift our main +focus on the phase 2 of our +research (donations welcome): how to enable a better way of writing multi-core programs. +The starting point is to fix and test atomic blocks. Then we will have to +debug common causes of conflicts and fix them or work around them; and +try to see how common frameworks like Twisted can be adapted.

                + +

                Lots of work ahead, but lots of work behind too :-)

                + +

                Armin (thanks Remi as well for the work).

                +
                +

                Comments

                +
                +
                +
                + + Armin Rigo wrote on 2014-07-05 16:22: +
                +
                +

                You're just extracting and running the "bin/pypy"? It works for me on a very close configuration, Ubuntu 14.04 too...

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-07-05 20:13: +
                +
                +

                Yes. Sorry, it doesn't make sense to me. You need to debug with gdb, probably with an executable that has got the debugging symbols. You need to either build it yourself, or recompile the pregenerated sources from: https://cobra.cs.uni-duesseldorf.de/~buildmaster/misc/pypy-c-r72356-stm-jit-SOURCE.txz

                +
                +
                +
                +
                + + Ernst Sjöstrand wrote on 2014-07-05 23:40: +
                +
                +

                If I try virtualenv I get:
                virtualenv stmtest -p Projekt/pypy-stm-2.3-linux64/bin/pypy
                Running virtualenv with interpreter Projekt/pypy-stm-2.3-linux64/bin/pypy
                [forking: for now, this operation can take some time]
                [forking: for now, this operation can take some time]
                New pypy executable in stmtest/bin/pypy
                [forking: for now, this operation can take some time]
                ERROR: The executable stmtest/bin/pypy is not functioning
                ERROR: It thinks sys.prefix is u'/home/ernst' (should be u'/home/ernst/stmtest')
                ERROR: virtualenv is not compatible with this system or executable

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-07-06 08:48: +
                +
                +

                @Ernst: sorry, it works fine for me as well. I tried the pypy-stm provided here, both on a Ubuntu 12.04 and a Ubuntu 14.04 machine. Maybe you have a too old virtualenv? Does it work with regular PyPy?

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-07-07 12:37: +
                +
                +

                Thanks to the author of the now-deleted comments, we could track and fix a bug that only shows up on some Linux systems. If pypy-stm systematically segfaults at start-up for you too, try the "2.3-r2" release (see update in the post itself).

                +
                +
                +
                +
                + + Anonymous wrote on 2014-07-07 20:00: +
                +
                +

                This is exciting! One minor bug in the actual post: you can describe slowdown / speedup in two different ways, with total time as a percentage of original time, or with time difference as a percentage of original time. You mention a 20% slowdown (clearly using the latter standard) and then a 300% slowdown, which you describe as 3x (suggesting that you use the former standard). To be consistent, you should either describe them as 120% and 300%, respectively (using the former standard), or 20% and 200%, respectively (using the latter standard).

                Thanks!

                +
                +
                +
                +
                + + Unknown wrote on 2014-07-07 21:35: +
                +
                +

                Hi again,

                just to play around a little I've put together https://github.com/Tinche/stm-playground for myself.

                I picked a generic CPU-bound problem (primality testing) and tried comparing multithreaded implementations in CPython 2.7, ordinary PyPy and PyPy-STM.

                I figured this would be easily parallelizable (low conflicts) but it doesn't seem to be the case - I don't get all my cores pegged using the STM.

                bench-threadpool.py, on my machine, gives about the same time for CPython and PyPy-STM, while ordinary PyPy totally smokes them both (even with the GIL :), one order of magnitude difference (20 sec vs 2 sec).

                bench-threadpool-naive will crash the STM interpreter on my system. :)

                Getting away from threads, CPython will actually beat PyPy in a multi-process scenario by a factor of 2, which I found surprising. CPython does indeed use up all my cores 100% while dealing with a process pool, while PyPy won't even come close.

                For the same workload, PyPy is actually faster running multithreaded with the GIL than multi-process, and fastest running with only 1 thread (expected, with the GIL only being overhead in this scenario).

                +
                +
                +
                +
                + + Pim wrote on 2014-07-07 21:40: +
                +
                +

                This is good news. For many of my applications, an important feature in the next phase will be the optimization for [..] the built-in dictionary type, for which we would like accesses and writes using independent keys to be truly independent [..]. My applications are mostly server applications (Twisted-based and others) that store state information on sessions/transactions in a small number of dictionaries that can have hundreds or thousands of entries concurrently, and would be accessed constantly.

                I'm glad I donated and plan to do so again in the future :-)

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-07-08 10:47: +
                +
                +

                @Tin: I would tweak bench-queue.py to avoid a million inter-thread communications via the queue. For example, run 1000 check_primes instead of just 1 for every number received from the queue.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-07-08 11:17: +
                +
                +

                @Tin: ...no, I tried too and it doesn't seem to help. We'll need to look into this in more details....

                +
                +
                +
                +
                + + Unknown wrote on 2014-07-08 21:04: +
                +
                +

                @Armin I've pushed a version of bench-queue with a tweakable batch size and concurrency level. Doing the work in batches of, say, 1000 does indeed make it go faster with all implementations.

                I've noticed pypy-stm runs have a large variance. It's not like I'm doing scientific measurements here, but for the queue test I'm getting runtimes from ~15 sec to ~27 sec, whereas for example ordinary PyPy is in the range 4.6 sec - 4.9 sec, and CPython ~22.5 - ~24.7, again, relatively close. Again, this is just something I noticed along the way and not the result of serious benchmarking in isolation.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-07-10 08:44: +
                +
                +

                Ooooof. Ok, I found out what is wrong in bench-queue. The issue is pretty technical, but basically if you add "with __pypy__.thread.atomic:" in the main top-level loop in worker(), then it gets vastly faster. On my machine it beats the real-time speed of a regular pypy. See https://bpaste.net/show/450553/

                It clearly needs to be fixed...

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-07-10 09:31: +
                +
                +

                Added an answer to the question "what about PyPy3?": https://pypy.readthedocs.org/en/latest/stm.html#python-3

                +
                +
                +
                +
                + + Unknown wrote on 2014-07-12 01:03: +
                +
                +

                @Armin, cool! I've found that the thread pool version can be sped up ~2-3x by wrapping the contents of check_prime with 'atomic' too.

                One more observation: with the atomic context manager, on PyPy-STM the queue implementation will beat the thread pool implementation (slightly), which isn't the case for CPython or ordinary PyPy.

                +
                +
                +
                +
                + + geerk wrote on 2014-07-16 08:16: +
                +
                +

                This is exciting news! I think pypy is the future of python.

                +
                +
                +
                +
                + + Canesin wrote on 2014-07-19 15:40: +
                +
                +

                If you guys did a facelift on the website like yours HippyVM I believe the project would gain a lot of momentum, it is unfortunate but true that most company managers would visit it and think it is not industrial quality if an employee comes saying that they should sponsor developing something in PyPy.

                +
                +
                +
                +
                + + Anonymous wrote on 2014-07-20 11:26: +
                +
                +

                r2 still doesn't work for me (ubuntu 14.04, intel Core2 CPU T7400)
                bash: ./pypy: cannot execute binary file: Exec format error

                +
                +
                +
                +
                + + isomorph wrote on 2014-07-31 05:46: +
                +
                +

                this is a question for the guys developing PyPy... i am completely new to Python so please bear with me.

                here is what i don't understand: it seems to me that you are reinventing the wheel because doesn't the Oracle or Azul Systems JVM already provide a super performant GC and JIT? even STM is becoming available. and since Jython can run on the JVM, why do PyPy at all?

                wouldn't a JVM compliant implementation of Python be more performant than PyPy or CPython?

                or am i missing something here?

                any pointers greatly appreciated. thanks.

                +
                +
                +
                +
                + + Armin Rigo wrote on 2014-08-04 08:04: +
                +
                +

                Having a JIT in the JVM is very different from having a JIT that can understand Python. For proof, the best (and only) implementation of Python on the JVM, Jython, is running at around CPython speed (generally a bit slower). I suspect that STM is similarly not designed for the purposes to which Jython would put it and would thus perform poorly. The only part that would probably work out of the box would be the GC. A more subtle argument against starting from the JVM is that of semantic mismatch. See for example https://www.stups.uni-duesseldorf.de/mediawiki/images/5/51/Pypy.pdf

                +
                +
                +
                +
                + + isomorph wrote on 2014-08-04 14:44: +
                +
                +

                awesome! thanks a lot armin. :D

                +
                +
                +
                + +
                +
                + +
                +
                + + \ No newline at end of file diff --git a/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.html b/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.html new file mode 100644 index 000000000..d24ace425 --- /dev/null +++ b/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.html @@ -0,0 +1,486 @@ + + + + + +A Field Test of Software Transactional Memory Using the RSqueak Smalltalk VM | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                +
                +

                A Field Test of Software Transactional Memory Using the RSqueak Smalltalk VM

                + + + +
                +

                +Extending the Smalltalk RSqueakVM with STM

                +

                by Conrad Calmez, Hubert Hesse, Patrick Rein and Malte Swart supervised by Tim Felgentreff and Tobias Pape

                +

                +Introduction

                +

                After pypy-stm we can announce that through the RSqueakVM (which used to be called SPyVM) a second VM implementation supports software transactional memory. RSqueakVM is a Smalltalk implementation based on the RPython toolchain. We have added STM support based on the STM tools from RPython (rstm). The benchmarks indicate that linear scale up is possible, however in some situations the STM overhead limits speedup.

                +

                The work was done as a master's project at the Software Architecture Group of Professor Robert Hirschfeld at the Hasso Plattner Institut at the University of Potsdam. We - four students - worked about one and a half days per week for four months on the topic. The RSqueakVM was originally developed during a sprint at the University of Bern. When we started the project we were new to the topic of building VMs / interpreters.

                +

                We would like to thank Armin, Remi and the #pypy IRC channel who supported us over the course of our project. We would also like to thank Toni Mattis and Eric Seckler, who have provided us with an initial code base.

                +

                +Introduction to RSqueakVM

                +

                As the original Smalltalk implementation, the RSqueakVM executes a given Squeak Smalltalk image, containing the Smalltalk code and a snapshot of formerly created objects and active execution contexts. These execution contexts are scheduled inside the image (greenlets) and not mapped to OS threads. Thereby the non-STM RSqueakVM runs on only one OS thread.

                +

                +Changes to RSqueakVM

                +

                The core adjustments to support STM were inside the VM and transparent from the view of a Smalltalk user. Additionally we added Smalltalk code to influence the behavior of the STM. As the RSqueakVM has run in one OS thread so far, we added the capability to start OS threads. Essentially, we added an additional way to launch a new Smalltalk execution context (thread). But in contrast to the original one this one creates a new native OS thread, not a Smalltalk internal green thread.

                + +

                STM (with automatic transaction boundaries) already solves the problem of concurrent access on one value as this is protected by the STM transactions (to be more precise one instruction). But there are cases where the application relies on the fact that a bigger group of changes is executed either completely or not at all (atomic). Without further information transaction borders could be in the middle of such a set of atomic statements. rstm allows aggregating multiple statements into one higher level transaction. To let the application mark the beginning and the end of these atomic blocks (high-level transactions), we added two more STM specific extensions to Smalltalk.

                + +

                +Benchmarks

                +

                RSqueak was executed in a single OS thread so far. rstm enables us to execute the VM using several OS threads. Using OS threads we expected a speed-up in benchmarks which use multiple threads. We measured this speed-up by using two benchmarks: a simple parallel summation where each thread sums up a predefined interval and an implementation of Mandelbrot where each thread computes a range of predefined lines.

                + +

                To assess the speed-up, we used one RSqueakVM compiled with rstm enabled, but once running the benchmarks with OS threads and once with Smalltalk green threads. The workload always remained the same and only the number of threads increased. To assess the overhead imposed by the STM transformation we also ran the green threads version on an unmodified RSqueakVM. All VMs were translated with the JIT optimization and all benchmarks were run once before the measurement to warm up the JIT. As the JIT optimization is working it is likely to be adopted by VM creators (the baseline RSqueakVM did that) so that results with this optimization are more relevant in practice than those without it. We measured the execution time by getting the system time in Squeak. The results are:

                +

                +Parallel Sum Ten Million

                + + + +
                + +
                + +
                Benchmark Parallel Sum 10,000,000
                + + + + + + + + + + + + + + + +
                Thread Count RSqueak green threads RSqueak/STM green threads RSqueak/STM OS threads Slow down from RSqueak green threads to RSqueak/STM green threads Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads
                1 168.0 ms 240.0 ms 290.9 ms 0.70 0.83
                2 167.0 ms 244.0 ms 246.1 ms 0.68 0.99
                4 167.8 ms 240.7 ms 366.7 ms 0.70 0.66
                8 168.1 ms 241.1 ms 757.0 ms 0.70 0.32
                16 168.5 ms 244.5 ms 1460.0 ms 0.69 0.17
                +

                +Parallel Sum One Billion

                + + + +
                + +
                + +
                Benchmark Parallel Sum 1,000,000,000
                +
                + + + + + + + + + + + + + + + + + + + + +
                Thread Count RSqueak green threads RSqueak/STM green threads RSqueak/STM OS threads Slow down from RSqueak green threads to RSqueak/STM green threads Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads
                1 16831.0 ms 24111.0 ms 23346.0 ms 0.70 1.03
                2 17059.9 ms 24229.4 ms 16102.1 ms 0.70 1.50
                4 16959.9 ms 24365.6 ms 12099.5 ms 0.70 2.01
                8 16758.4 ms 24228.1 ms 14076.9 ms 0.69 1.72
                16 16748.7 ms 24266.6 ms 55502.9 ms 0.69 0.44
                +

                +Mandelbrot Iterative

                + + + +
                + +
                + +
                Benchmark Mandelbrot
                + + + + + + + + + + + + + + +
                Thread Count RSqueak green threads RSqueak/STM green threads RSqueak/STM OS threads Slow down from RSqueak green threads to RSqueak/STM green threads Speed up from RSqueak/STM green threads to RSqueak/STM OS Threads
                1 724.0 ms 983.0 ms 1565.5 ms 0.74 0.63
                2 780.5 ms 973.5 ms 5555.0 ms 0.80 0.18
                4 781.0 ms 982.5 ms 20107.5 ms 0.79 0.05
                8 779.5 ms 980.0 ms 113067.0 ms 0.80 0.01
                +

                +Discussion of benchmark results

                +

                First of all, the ParallelSum benchmarks show that the parallelism is actually paying off, at least for sufficiently large embarrassingly parallel problems. Thus RSqueak can also benefit from rstm.

                +

                On the other hand, our Mandelbrot implementation shows the limits of our current rstm integration. We implemented two versions of the algorithm: one using one low-level array and one using two nested collections. In both versions, one job only calculates a distinct range of rows and both lead to a slowdown. The summary of the state of rstm transactions shows that there are a lot of inevitable transactions (transactions which must be completed). One reason might be the interactions between the VM and its low-level extensions, so-called plugins. We have to investigate this further.

                +

                +Limitations

                +

                Although the current VM setup is working well enough to support our benchmarks, the VM still has limitations. First of all, as it is based on rstm, it has the current limitation of only running on 64-bit Linux.

                +

                Besides this, we also have two major limitations regarding the VM itself. First, the atomic interface exposed in Smalltalk is currently not working, when the VM is compiled using the just-in-time compiler transformation. Simple examples such as concurrent parallel sum work fine while more complex benchmarks such as chameneos fail. The reasons for this are currently beyond our understanding. Second, Smalltalk supports green threads, which are threads which are managed by the VM and are not mapped to OS threads. We currently support starting new Smalltalk threads as OS threads instead of starting them as green threads. However, existing threads in a Smalltalk image are not migrated to OS threads, but remain running as green threads.

                +

                +Future work for STM in RSqueak

                +The work we presented showed interesting problems, we propose the following problem statements for further analysis:
                  +
                • +Inevitable transactions in benchmarks. This looks like it could limit other applications too so it should be solved.
                • +
                • +Collection implementation aware of STM: The current implementation of collections can cause a lot of STM collisions due to their internal memory structure. We believe it could bear potential for performance improvements, if we replace these collections in an STM enabled interpreter with implementations with less STM collisions. As already proposed by Remi Meier, bags, sets and lists are of particular interest.
                • +
                • Finally, we exposed STM through language features such as the atomic method, which is provided through the VM. Originally, it was possible to model STM transaction barriers implicitly by using clever locks, now it's exposed via the atomic keyword. From a language design point of view, the question arises whether this is a good solution and what features an stm-enabled interpreter must provide to the user in general? Of particular interest are for example, access to the transaction length and hints for transaction borders and their performance impact.
                • +
                +
                  +

                  +Details for the technically inclined

                  +
                    +
                  • +Adjustments to the interpreter loop were minimal.
                  • +
                  • STM works at bytecode granularity; that means there is an implicit transaction border after every bytecode executed. Possible alternatives: only break transactions after certain bytecodes, break transactions on one abstraction layer above, e.g. object methods (setter, getter).
                  • +
                  • rstm calls were exposed using primitives (a way to expose native code in Smalltalk), this was mainly used for atomic.
                  • +
                  • Starting and stopping OS threads is exposed via primitives as well. Threads are started from within the interpreter.
                  • +
                  • For Smalltalk enabled STM code we currently have different image versions. However another way to add, load and replace code to the Smalltalk code base is required to make a switch between STM and non-STM code simple.
                  • +
                  +
                    +

                    +Details on the project setup

                    +

                    From a non-technical perspective, a problem we encountered was the huge roundtrip times (on our machines up to 600s, 900s with JIT enabled). This led to a tendency of bigger code changes ("Before we compile, let's also add this"), lost flow ("What were we doing before?") and different compiled interpreters in parallel testing ("How is this version different from the others?"). As a consequence it was harder to test and correct errors. While this is not as much of a problem for other RPython VMs, RSqueakVM needs to execute the entire image, which makes running it untranslated even slower.

                    +

                    +Summary

                    +

                    The benchmarks show that speed up is possible, but also that the STM overhead in some situations can eat up the speedup. The resulting STM-enabled VM still has some limitations: As rstm is currently only running on 64-bit Linux the RSqueakVM is doing so as well. Even though it is possible for us now to create new threads that map to OS threads within the VM, the migration of existing Smalltalk threads keeps being problematic.

                    +

                    We showed that an existing VM code base can benefit of STM in terms of scaling up. Further it was relatively easy to enable STM support. This may also be valuable to VM developers considering to get STM support for their VMs.

                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Armin Rigo wrote on 2014-08-09 14:10: +
                    +
                    +

                    "We showed that an existing VM code base can benefit of STM in terms of scaling up." I dispute this conclusion: in the benchmarks, it seems that the non-STM version is scaling up well, even better than the STM+OS-threads version. But how can the non-STM version scale at all? It shouldn't: that's a property of RPython. And why is the STM+OS-threads version faster even with just 1 thread? I think you need to answer these questions first. Right now it screams "you are running buggy benchmarks" to me.

                    +
                    +
                    +
                    +
                    + + Stefan Marr wrote on 2014-08-10 09:09: +
                    +
                    +

                    I concur with Armin, the conclusions are problematic in the light of the current numbers.

                    Could you give some more details on the benchmarks? Can I find the Smalltalk code somewhere?

                    Things that come to mind are details about the scheduler. In the RoarVM, that was also one of the issues (which we did not solve). The standard Squeak scheduling data structure remains unchanged I suppose? How does that interact with the STM, is it problematic that each STM thread updates this shared data structure during every scheduling operation?

                    Also, more basic, are you making sure that the benchmark processes are running with highest priority (80, IIRC), to avoid interference with other processes in the image?

                    On the language level, something that could also have an impact on the results is closures. How are they implemented? I suppose similar to the way the CogVM implements them? I suppose, you make sure that closures are not shared between processes?

                    And finally, what kind of benchmark harness are you using? Did you have a look at SMark? (https://smalltalkhub.com/#!/~StefanMarr/SMark)
                    We used that one for the RoarVM, and it provides various options to do different kind of benchmarks, including weak-scaling benchmarks, which I would find more appropriate for scalability tests. Weak-scaling means, you increase the problem size with the number of cores. That replicates the scenario where the problem itself is not really parallelizable, but you can solve more problems at the same time in parallel. It also makes sure that each process/thread does the identical operations (if setup correctly).

                    Well, all those questions aside, interesting work :) Hope to read more soon ;)

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2014-08-10 20:13: +
                    +
                    +

                    You definitely hit a really weak spot in our report... Today we investigated the ParallelSum benchmark again. So far, we've found out that it was indeed partially a problem with the priority of the benchmark process. The preliminary benchmark results make more sense now and as soon as we have stable ones we will update them.

                    I'll still try to address some of your questions right now. :)

                    1. Benchmark code
                    I've just wrapped up the current version of our benchmarks and put them in our repository. You can find the two Squeak4.5 images at the stmgc-c7 branch of the RSqueak Repository . You can find the benchmarks in the CPB package. The Squeak4.5stm image needs the RSqueak/STM VM.

                    2. Scheduler data structures
                    Yes, the scheduling data structure is completely unchanged. We have only added a new subclass of Process which overwrites fork and calls a different primitive. However, these Processes are not managed by the Smalltalk scheduler, so there should be no synchronization issues here.

                    3. Interference of other processes:
                    This is probably the source of the "speed-up" we observe on the normal RSqueakVM. With more threads we might get a bigger portion of the total runtime. So far, the benchmarks already ran in a VM mode which disables the Smalltalk GUI thread, however in the traces we found that the event handler is still scheduled every now and then. We've done it as you suggested, Stefan, and set the priority to 80 (or 79 to not mess up the timer interrupt handler).

                    4. Benchmark harness
                    We actually use SMark and also made sure the timing operations of RSqueak do their job correctly. However we are probably not using SMark at its full potential.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2014-08-11 10:12: +
                    +
                    +

                    I've just updated the benchmarks. All benchmark processes are now running with the Smalltalk process priority of 79 (80 is the highest). The single-threaded VMs now show the expected behavior.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2014-08-11 14:11: +
                    +
                    +

                    To further clarify on the Mandelbrot benchmarks: After a discussion with Stefan, I have changed the Mandelbrot implementation. Each job now only has private data and does not read or write in any shared data structure. Still the benchmark results remain the same and we can still observe a high proportion of inevitable transactions.

                    As Armin pointed out, and which would be a next step, we would need to figure out which parts of the interpreter might cause systematic conflicts.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2014/09/pypy-24-beta-just-in-time-for-psfs-5956090195665204063.html b/posts/2014/09/pypy-24-beta-just-in-time-for-psfs-5956090195665204063.html new file mode 100644 index 000000000..59a6804c4 --- /dev/null +++ b/posts/2014/09/pypy-24-beta-just-in-time-for-psfs-5956090195665204063.html @@ -0,0 +1,366 @@ + + + + + +PyPy 2.4-beta just in time for PSF's funding drive | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy 2.4-beta just in time for PSF's funding drive

                    + + + +
                    +
                    +We're pleased to announce the availability of PyPy 2.4-beta1; faster, fewer bugs, and updated to the python 2.7.8 stdlib.

                    +This release contains several bugfixes and enhancements. Among the user-facing improvements:
                      +
                    • internal refactoring in string and GIL handling which led to significant speedups
                    • +
                    • improved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. In simpler terms - we closed what looked like a memory leak
                    • +
                    • Windows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i
                    • +
                    • Many issues were resolved since the 2.3.1 release in June
                    • +
                    +
                    +You can download the PyPy 2.4-beta1 release here https://pypy.org/download.html.

                    +We would like to also point out that in +September, the Python Software Foundation will match funds for +any donations up to $10k, so head over to our website and help this mostly-volunteer effort out.

                    PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and 3.2.5. It's fast (pypy 2.4 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                    +This + release supports x86 machines running Linux 32/64, Mac OS X 64, +Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, +with VFPv3) running Linux. 
                    +We would like to thank our donors for the continued support of the PyPy project.

                    +The complete release notice is here.

                    +Please + try it out and let us know what you think. We especially welcome +success stories, please tell us about how it has helped you!

                    +Cheers, The PyPy Team

                    +News Flash from the beta release cycle:
                      +
                    • Note that the beta release mistakenly identifies itself in sys.pypy_version_info as releaselevel=='final', please do not mistake this for a final version
                    • +
                    • The beta can hit an "Illegal instruction" exception in jitted code on ARMv6 processors like the RaspberryPi. This will be fixed for the release.
                    • +
                    +
                    +
                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Unknown wrote on 2014-09-09 13:11: +
                    +
                    +

                    Short testing note:

                    ./pypy: error while loading shared libraries: libtinfo.so.5: cannot open shared object file: No such file or directory

                    64 bit Linux version tested on Gentoo GNU/Linux.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2014-09-09 13:14: +
                    +
                    +

                    ah, found it: https://github.com/squeaky-pl/portable-pypy

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2014-09-12 13:51: +
                    +
                    +

                    Is there a chance to get pylab/matplotlib running with pypy as you showed in 2011?

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2014-09-15 22:12: +
                    +
                    +

                    I just had a very, very good experience with pypy:

                    https://github.com/ArneBab/freenet-fundraising/blob/master/slides.org#routing-simulation

                    with cpython it needs a day for a simulation with 100k nodes. In pypy it needs a few minutes!

                    +
                    +
                    +
                    +
                    + + Carlo Pires wrote on 2014-09-27 20:41: +
                    +
                    +

                    Still waiting support for Python3.4.1 to test my "big" application.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2014/09/pypy-240-released-9-days-left-in-7722154416024407111.html b/posts/2014/09/pypy-240-released-9-days-left-in-7722154416024407111.html new file mode 100644 index 000000000..47a659b53 --- /dev/null +++ b/posts/2014/09/pypy-240-released-9-days-left-in-7722154416024407111.html @@ -0,0 +1,320 @@ + + + + + +PyPy 2.4.0 released, 9 days left in funding drive | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy 2.4.0 released, 9 days left in funding drive

                    + + + +
                    +
                    +We're pleased to announce the availability of PyPy 2.4.0; faster, fewer bugs, and updated to the python 2.7.8 stdlib.

                    +This release contains several bugfixes and enhancements. Among the user-facing improvements:
                      +
                    • internal refactoring in string and GIL handling which led to significant speedups
                    • +
                    • improved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. In simpler terms - we closed what looked like a memory leak
                    • +
                    • Windows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i
                    • +
                    • Many issues were resolved since the 2.3.1 release in June
                    • +
                    +
                    +You can download PyPy 2.4.0 here https://pypy.org/download.html.

                    +We would like to also point out that in September, the Python Software Foundation will match funds for any donations up to $10k, so head over to our website and help this mostly-volunteer effort out.

                    PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and 3.2.5. It's fast (pypy 2.4 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                    +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux. 
                    +We would like to thank our donors for the continued support of the PyPy project.

                    +The complete release notice is here.

                    +Please try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!

                    +Cheers, The PyPy Team

                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Unknown wrote on 2014-10-13 18:32: +
                    +
                    +

                    How did the funding drive work out?

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2014/09/python-software-foundation-matching-2230529993193139046.html b/posts/2014/09/python-software-foundation-matching-2230529993193139046.html new file mode 100644 index 000000000..28325ddf4 --- /dev/null +++ b/posts/2014/09/python-software-foundation-matching-2230529993193139046.html @@ -0,0 +1,371 @@ + + + + + +Python Software Foundation Matching Donations this Month | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Python Software Foundation Matching Donations this Month

                    + + + +
                    +

                    We're extremely excited to announce that for the month of September, any amount
                    +you donate to PyPy will be matched (up to $10,000) by the Python Software
                    +Foundation
                    .

                    +

                    This includes any of our ongoing fundraisers: NumPyPy, STM, Python3, or our
                    +general fundraising.

                    +

                    Here are some of the things your previous donations have helped accomplish:

                    +
                      +
                    • Getting PyPy3 completed (currently 3.2, with 3.3 work underway)
                    • +
                    • New research and production engineering on STM for PyPy
                    • +
                    • Lots of progress on NumPy for PyPy
                    • +
                    • Significant performance improvements
                    • +
                    +

                    You can see a preview of what's coming in our next 2.4 release in the draft
                    +release notes
                    .

                    +

                    Thank you to all the individuals and companies which have donated so far.

                    +

                    So please, donate today: https://pypy.org/

                    +

                    (Please be aware that the donation progress bars are not live updating, so
                    +don't be afraid if your donation doesn't show up immediately).

                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Unknown wrote on 2014-09-02 08:51: +
                    +
                    +

                    aaand donated ☺

                    Thank you, Python Software Foundation!

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2014-09-04 10:57: +
                    +
                    +

                    I think you should be careful about your claims for numpy. It's a great idea and I am sure lots of people would be very interested in anything you do but I for one see very little progress on it.

                    +
                    +
                    +
                    +
                    + + handsomegui wrote on 2014-09-05 05:59: +
                    +
                    +

                    It would be nice to have a bitcoin donation address for donation.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2014-09-05 13:30: +
                    +
                    +

                    Donated!

                    +
                    +
                    +
                    +
                    + + Canesin wrote on 2014-09-05 16:00: +
                    +
                    +

                    +1 on the bitcoin address for donation

                    +
                    +
                    +
                    +
                    + + L. Simon wrote on 2014-09-05 20:32: +
                    +
                    +

                    Consider me another request for a Bitcoin address. I'm in for a few millibits if you provide one.

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2014-09-06 16:47: +
                    +
                    +

                    Sorry for the bitcoin requests... setting up a new payment system just for a few millibits is not worth it at all.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2014/10/couchbase-contribution-to-pypy-2360892117372790069.html b/posts/2014/10/couchbase-contribution-to-pypy-2360892117372790069.html new file mode 100644 index 000000000..ec59b8d26 --- /dev/null +++ b/posts/2014/10/couchbase-contribution-to-pypy-2360892117372790069.html @@ -0,0 +1,346 @@ + + + + + +Couchbase contribution to PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Couchbase contribution to PyPy

                    + + + +
                    +
                    +

                    Hello everyone!

                    +

                    We always offer to put on the blog info about our sponsors who donate substantial amounts of money. So far most people decided to stay anonymous, so this is the first blog post describing our sponsor and his relationship to PyPy, hopefully not the last. We'll also publish a full blog post about the PSF-matched fundraiser soon. This is a guest post by Brent Woodruff from Couchbase.

                    +

                    +

                    +
                    + +

                    +Couchbase is a leading NoSQL document database that provides a flexible data model, high performance, scalability, and high availability. Couchbase is a commercially supported open source project. Visit us at https://www.couchbase.com and https://github.com/couchbase. +

                    +

                    +Couchbase Inc. donated $2000.00, and employees of Couchbase personally contributed a disclosed additional $230.00, towards Pypy progress during the September funding drive. These funds will see a match from the Python Software Foundation. +

                    +

                    +Pypy is primarily used by Couchbase employees to perform product analysis and troubleshooting using internally developed tools. Every customer of Couchbase benefits from the use of Pypy; both due to the rapid development provided by Python, and the speed of the resulting tools provided by the Pypy JIT interpreter. +

                    +

                    +“PyPy is great - it gave us a 4x speedup in our CPU-intensive internal application over CPython” +-Dave Rigby and Daniel Owen, Couchbase Engineers +

                    +

                    +Additionally, Couchbase has a preliminary CFFI based Couchbase client available for Pypy users. +

                    + +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Unknown wrote on 2014-10-14 22:42: +
                    +
                    +

                    Definitely wouldn't have thought to put PyPy and Couchbase in the same sentence, but this is very good of them! Glad to see the support.

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2014-10-15 09:34: +
                    +
                    +

                    Thanks for the donation. Could you give a bit more detail of how hard it was to make your code compatible with PyPy?

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2014-10-15 13:28: +
                    +
                    +

                    Hello from Couchbase. With regards to making our code compatible with PyPy, I can only comment on our internal tooling. Those are currently all pure Python, so it was trivial. We used modules that work with PyPy already: namely pyparsing, LEPL, and tornado. The tools all run under both CPython and PyPy unmodified.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2014/10/pypy3-240-released-5007750685927360190.html b/posts/2014/10/pypy3-240-released-5007750685927360190.html new file mode 100644 index 000000000..38c427e6f --- /dev/null +++ b/posts/2014/10/pypy3-240-released-5007750685927360190.html @@ -0,0 +1,353 @@ + + + + + +PyPy3 2.4.0 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy3 2.4.0 released

                    + + + +
                    +
                    We're pleased to announce the availability of PyPy3 2.4.0!

                    +This release contains several bugfixes and enhancements. Among the user-facing improvements specific to PyPy3:
                      +
                    • Better Windows compatibility, e.g. the nt module functions _getfinalpathname & _getfileinformation are now supported (the former is required for the popular pathlib library for example)
                    • +
                    • Various fsencode PEP 383 related fixes to the posix module (readlink, uname, ttyname and ctermid) and improved locale handling
                    • +
                    • Switched the default binary name on POSIX distributions from 'pypy' to 'pypy3' (which symlinks to 'pypy3.2')
                    • +
                    • Fixed a couple different crashes related to parsing Python 3 source code
                    • +
                    +
                    +And improvements shared with the recent PyPy 2.4.0 release:
                      +
                    • internal refactoring in string and GIL handling which led to significant speedups
                    • +
                    • improved handling of multiple objects (like sockets) in long-running programs. They are collected and released more efficiently, reducing memory use. In simpler terms - we closed what looked like a memory leak
                    • +
                    • Windows builds now link statically to zlib, expat, bzip, and openssl-1.0.1i
                    • +
                    • Many issues were resolved since the 2.3.1 release in June
                    • +
                    +
                    +You can download PyPy3 2.4.0 here https://pypy.org/download.html.

                    PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and 3.2.5. It's fast (pypy 2.4 and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                    +This release supports x86 machines running Linux 32/64, Mac OS X 64, Windows, and OpenBSD, as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux. 
                    +We would like to thank our donors for the continued support of the PyPy project.

                    +The complete release notice is here.

                    +Please try it out and let us know what you think. We especially welcome success stories, please tell us about how it has helped you!

                    +Cheers, The PyPy Team

                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Unknown wrote on 2014-10-22 14:39: +
                    +
                    +

                    Great news. Thanks!

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2014-10-28 13:15: +
                    +
                    +

                    Great work, thanks!

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2014-10-30 14:46: +
                    +
                    +

                    That’s great - thanks!

                    And the portable release directly works for my keyboard evolution! (it’s roughly 2.5x faster than cPython).

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2014-10-30 14:58: +
                    +
                    +

                    Correction: After some warmup time, pypy is more than 2.8x faster than cPython.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2014/11/pypy-io-improvements-1042070332447047674.html b/posts/2014/11/pypy-io-improvements-1042070332447047674.html new file mode 100644 index 000000000..34f79444f --- /dev/null +++ b/posts/2014/11/pypy-io-improvements-1042070332447047674.html @@ -0,0 +1,418 @@ + + + + + +PyPy IO improvements | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy IO improvements

                    + + + +
                    +
                    + +
                    +
                    +

                    Hello everyone!

                    +

                    We've wrapped up the Warsaw sprint, so I would like to describe some +branches which have been recently merged and which improved the I/O and the +GC: gc_no_cleanup_nursery and gc-incminimark-pinning.

                    +

                    The first branch was started by Wenzhu Man for her Google Summer of Code +and finished by Maciej Fijałkowski and Armin Rigo. +The PyPy GC works by allocating new objects in the young object +area (the nursery), simply by incrementing a pointer. After each minor +collection, the nursery has to be cleaned up. For simplicity, the GC used +to do it by zeroing the whole nursery.

                    +

                    This approach has bad effects on the cache, since you zero a large piece of +memory at once and do unnecessary work for things that don't require zeroing +like large strings. We mitigated the first problem somewhat with incremental +nursery zeroing, but this branch removes the zeroing completely, thus +improving the string handling and recursive code (since jitframes don't +require zeroed memory either). I measured the effect on two examples: +a recursive implementation of fibonacci and gcbench, +to measure GC performance.

                    +

                    The results for fibonacci and gcbench are below (normalized to cpython +2.7). Benchmarks were run 50 times each (note that the big standard +deviation comes mostly from the warmup at the beginning, true figures +are smaller):

                    +

                    +

                    + ++++++ + + + + + + + + + + + + + + + + + + + + +
                    benchmark | CPython | PyPy 2.4 | PyPy non-zero
                    fibonacci | 4.8+-0.15 (1.0x) | 0.59+-0.07 (8.1x) | 0.45+-0.07 (10.6x)
                    gcbench | 22+-0.36 (1.0x) | 1.34+-0.28 (16.4x) | 1.02+-0.15 (21.6x)
                    +

                    The second branch was done by Gregor Wegberg for his master's thesis and finished +by Maciej Fijałkowski and Armin Rigo. Because of the way it works, the PyPy GC from +time to time moves the objects in memory, meaning that their address can change. +Therefore, if you want to pass pointers to some external C function (for +example, write(2) or read(2)), you need to ensure that the objects they are +pointing to will not be moved by the GC (e.g. when running a different thread). +PyPy up to 2.4 solves the problem by copying the data into or from a non-movable buffer, which +is obviously inefficient. +The branch introduces the concept of "pinning", which allows us to inform the +GC that it is not allowed to move a certain object for a short period of time. +This introduces a bit of extra complexity +in the garbage collector, but improves the I/O performance quite drastically, +because we no longer need the extra copy to and from the non-movable buffers.

                    +

                    In this benchmark, which does I/O in a loop, +we either write a number of bytes from a freshly allocated string into +/dev/null or read a number of bytes from /dev/full. I'm showing the results +for PyPy 2.4, PyPy with non-zero-nursery and PyPy with non-zero-nursery and +object pinning. Those are wall times for cases using os.read/os.write +and file.read/file.write, normalized against CPython 2.7.

                    +

                    Benchmarks were done using PyPy 2.4 and revisions 85646d1d07fb for +non-zero-nursery and 3d8fe96dc4d9 for non-zero-nursery and pinning. +The benchmarks were run once, since the standard deviation was small.

                    +

                    + +

                    +
                    +

                    The Y axis is speed, normalized to CPython, the more the better

                    + +

                    What we can see is that os.read and os.write both improved greatly +and outperform CPython now for each combination. file operations are +a little more tricky, and while those branches improved the situation a bit, +the improvement is not as drastic as in os versions. It really should not +be the case and it showcases how our file buffering is inferior to CPython. +We plan on removing our own buffering and using FILE* in C in the near future, +so we should outperform CPython on those too (since our allocations are cheaper). +If you look carefully in the benchmark, the write function is copied three times. +This hack is intended to avoid JIT overspecializing the assembler code, which happens +because the buffering code was written way before the JIT was done. In fact, our buffering +is hilariously bad, but if stars align correctly it can be JIT-compiled to something +that's not half bad. Try removing the hack and seeing how the performance of the last +benchmark drops :-) Again, this hack should be absolutely unnecessary once we remove +our own buffering, stay tuned for more.

                    +

                    Cheers,
                    +fijal

                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Yichao Yu wrote on 2014-11-05 18:32: +
                    +
                    +

                    Sounds great!!!

                    Just wondering, will the pin-memory also improves the situation when passing strings/other buffers to c functions (e.g. via cffi)?

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2014-11-05 21:54: +
                    +
                    +

                    Hey,

                    In your benchmark, the following loop:
                    for i in range(num):
                    os.write(fd, " " * num2)

                    Is not hoisted out by CPython (whereas I guess PyPy does hoist it).
                    Which means that the buffer written is basically allocated/freed upon each loop.

                    If you want to measure pure I/O performance (so let's say a zero-copy setting), it should be hoisted manually out of the loop for CPython, like this:

                    payload = b" " * num2
                    for i in range(num):
                    os.write(fd, payload)

                    Then, the results go from:

                    fwrite 100 bytes, 1.93us per write
                    fwrite 1000 bytes, 2.57us per write
                    fwrite 10000 bytes, 6.73us per write
                    file_write 100 bytes, 0.99us per write
                    file_write 1000 bytes, 1.68us per write
                    file_write 10000 bytes, 4.71us per write


                    to

                    fwrite 100 bytes, 1.38us per write
                    fwrite 1000 bytes, 1.48us per write
                    fwrite 10000 bytes, 1.38us per write
                    file_write 100 bytes, 0.65us per write
                    file_write 1000 bytes, 0.96us per write
                    file_write 10000 bytes, 2.32us per write

                    Also, might be worth trying wth binary mode.

                    Anyway, keep up the great work!

                    +
                    +
                    +
                    +
                    + + Maciej Fijalkowski wrote on 2014-11-06 06:10: +
                    +
                    +

                    PyPy does not hoist the buffer allocation here. The benchmark specifically allocated/frees the buffer every loop, since we want the object written fresh (otherwise pinning is not needed), but also we think that writing a new object (as opposed to the constant buffer) is really more of a common case. Yes, you get an overhead of allocation measured too, but the case here is that we wanted to measure the IO of fresh objects, not old ones

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2014/11/september-donations-and-thank-you-to-4531550307707104017.html b/posts/2014/11/september-donations-and-thank-you-to-4531550307707104017.html new file mode 100644 index 000000000..a88e67d88 --- /dev/null +++ b/posts/2014/11/september-donations-and-thank-you-to-4531550307707104017.html @@ -0,0 +1,355 @@ + + + + + +September donations and thank you to the Python Software Foundation! | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    September donations and thank you to the Python Software Foundation!

                    + + + +
                    +
                    + +

                    Hello everyone!

                    +

                    We would like to show you a short update on the PyPy funding. +We gathered a total of $15,986 in the month of September and as per +earlier agreement, the Python Software Foundation donated $10,000 +to PyPy. We would like to thank everyone participating and the PSF in +particular for supporting the PyPy project and making our work possible!

                    +

                    We've been working hard on the goals outlined in the funding proposals.

                    +
                      +
                    • +PyPy Python 3 support has been in beta for a while and it's already +being used by many people, as seen per the number of reported bugs. +We're currently supporting 3.2, planning on moving towards 3.4 in the +future.
                    • +
                    • Software Transactional Memory has been a successful research project, +with first real world results shown during the Warsaw sprint.
                    • +
                    • More detailed update on numpy will be published soon. A little spoiler is +that we're planning on addressing matplotlib, scipy and the larger ecosystem +to some extent. Stay tuned!
                    • +
                    +

                    Again, thanks to everyone who donated and happy Thanksgiving to everyone +on that side of the world!

                    +

                    Cheers,
                    +fijal and the entire PyPy team

                    + +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Alessandro wrote on 2014-11-29 02:53: +
                    +
                    +

                    Fantastic work!

                    I'm a Python 3 user, as such the PyPy3 was great for me!

                    And good news for Numpypy, it would indeed be awesome for supporting the numeric ecosystem.

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2014-11-30 08:31: +
                    +
                    +

                    It would be amazing if pypy supported numpy and matplotlib!!

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2014-12-04 12:35: +
                    +
                    +

                    This is great news! I've been waiting for scipy and matplotlib for a while, now it's finally on the roadmap.

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-01-06 12:37: +
                    +
                    +

                    Any news on the Numpy update?

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2014/11/tornado-without-gil-on-pypy-stm-7284102716557557428.html b/posts/2014/11/tornado-without-gil-on-pypy-stm-7284102716557557428.html new file mode 100644 index 000000000..8729d3cb2 --- /dev/null +++ b/posts/2014/11/tornado-without-gil-on-pypy-stm-7284102716557557428.html @@ -0,0 +1,611 @@ + + + + + +Tornado without a GIL on PyPy STM | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Tornado without a GIL on PyPy STM

                    + + + +
                    +

                    This post is by Konstantin Lopuhin, who tried PyPy STM during the +Warsaw sprint.

                    +

                    Python has a GIL, right? Not quite - PyPy STM is a python implementation +without a GIL, so it can scale CPU-bound work to several cores. +PyPy STM is developed by Armin Rigo and Remi Meier, +and supported by community donations. +You can read more about it in the +docs.

                    +

                    Although PyPy STM is still a work in progress, in many cases it can already +run CPU-bound code faster than regular PyPy, when using multiple cores. +Here we will see how to slightly modify Tornado IO loop to use +transaction +module. +This module is described +in the docs and is really simple to use - please see an example there. +An event loop of Tornado, or any other asynchronous +web server, looks like this (with some simplifications):

                    +
                    +while True:
                    +    for callback in list(self._callbacks):
                    +        self._run_callback(callback)
                    +    event_pairs = self._impl.poll()
                    +    self._events.update(event_pairs)
                    +    while self._events:
                    +        fd, events = self._events.popitem()
                    +        handler = self._handlers[fd]
                    +        self._handle_event(fd, handler, events)
                    +
                    +

                    We get IO events, and run handlers for all of them, these handlers can +also register new callbacks, which we run too. When using such a framework, +it is very nice to have a guarantee that all handlers are run serially, +so you do not have to put any locks. This is an ideal case for the +transaction module - it gives us guarantees that things appear +to be run serially, so in user code we do not need any locks. We just +need to change the code above to something like:

                    +
                    +while True:
                    +    for callback in list(self._callbacks):
                    +        transaction.add(                # added
                    +            self._run_callback, callback)
                    +    transaction.run()                   # added
                    +    event_pairs = self._impl.poll()
                    +    self._events.update(event_pairs)
                    +    while self._events:
                    +        fd, events = self._events.popitem()
                    +        handler = self._handlers[fd]
                    +        transaction.add(                # added
                    +            self._handle_event, fd, handler, events)
                    +    transaction.run()                   # added
                    +
                    +

                    The actual commit is +here, +- we had to extract a little function to run the callback.

                    +
                    +

                    Part 1: a simple benchmark: primes

                    +

                    Now we need a simple benchmark, let's start with +this +- just calculate a list of primes up to the given number, and return it +as JSON:

                    +
                    +def is_prime(n):
                    +    for i in xrange(2, n):
                    +        if n % i == 0:
                    +            return False
                    +    return True
                    +
                    +class MainHandler(tornado.web.RequestHandler):
                    +    def get(self, num):
                    +        num = int(num)
                    +        primes = [n for n in xrange(2, num + 1) if is_prime(n)]
                    +        self.write({'primes': primes})
                    +
                    +

                    We can benchmark it with siege:

                    +
                    +siege -c 50 -t 20s https://localhost:8888/10000
                    +
                    +

                    But this does not scale. The CPU load is at 101-104 %, and we handle 30 % +fewer requests per second. The reason for the slowdown is STM overhead, +which needs to keep track of all writes and reads in order to detect conflicts. +And the reason for using only one core is, obviously, conflicts! +Fortunately, we can see what these conflicts are, if we run code like this +(here 4 is the number of cores to use):

                    +
                    +PYPYSTM=stm.log ./primes.py 4
                    +
                    +

                    Then we can use print_stm_log.py +to analyse this log. It lists the most expensive conflicts:

                    +
                    +14.793s lost in aborts, 0.000s paused (1258x STM_CONTENTION_INEVITABLE)
                    +File "/home/ubuntu/tornado-stm/tornado/tornado/httpserver.py", line 455, in __init__
                    +    self._start_time = time.time()
                    +File "/home/ubuntu/tornado-stm/tornado/tornado/httpserver.py", line 455, in __init__
                    +    self._start_time = time.time()
                    +...
                    +
                    +

                    There are only three kinds of conflicts, they are described in +stm source. +Here we see that two threads call into an external function to get the current time, +and we cannot roll back either of them, so one of them must wait till the other +transaction finishes. +For now we can hack around this by disabling this timing - this is only +needed for internal profiling in tornado.

                    +

                    If we do it, we get the following results (but see caveats below):

                    + + + + + +
                    + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Impl. | req/s
                    PyPy 2.4 | 14.4
                    CPython 2.7 | 3.2
                    PyPy-STM 1 | 9.3
                    PyPy-STM 2 | 16.4
                    PyPy-STM 3 | 20.4
                    PyPy STM 4 | 24.2
                    +
                        + +
                    +

                    As we can see, in this benchmark PyPy STM using just two cores +can beat regular PyPy! +This is not linear scaling, there are still conflicts left, and this +is a very simple example but still, it works!

                    +

                    But it's not that simple yet :)

                    +

                    First, these are best-case numbers after long (much longer than for regular +PyPy) warmup. Second, it can sometimes crash (although removing old pyc files +fixes it). Third, benchmark meta-parameters are also tuned.

                    +

                    Here we get relatively good results only when there are a lot of concurrent +clients - as a result, a lot of requests pile up, the server is not keeping +up with the load, and the transaction module is busy with work running these piled up +requests. If we decrease the number of concurrent clients, results get slightly worse. +Another thing we can tune is how heavy each request is - again, if we ask +for primes up to a lower number, then less time is spent doing calculations, +more time is spent in tornado, and results get much worse.

                    +

                    Besides the time.time() conflict described above, there are a lot of others. +The bulk of time is lost in these two conflicts:

                    +
                    +14.153s lost in aborts, 0.000s paused (270x STM_CONTENTION_INEVITABLE)
                    +File "/home/ubuntu/tornado-stm/tornado/tornado/web.py", line 1082, in compute_etag
                    +    hasher = hashlib.sha1()
                    +File "/home/ubuntu/tornado-stm/tornado/tornado/web.py", line 1082, in compute_etag
                    +    hasher = hashlib.sha1()
                    +
                    +13.484s lost in aborts, 0.000s paused (130x STM_CONTENTION_WRITE_READ)
                    +File "/home/ubuntu/pypy/lib_pypy/transaction.py", line 164, in _run_thread
                    +    got_exception)
                    +
                    +

                    The first one is presumably calling into some C function from stdlib, and we get +the same conflict as for time.time() above, but it can be fixed on the PyPy +side, as we can be sure that computing sha1 is pure.

                    +

                    It is easy to hack around this one too, just removing etag support, but if +we do it, performance is much worse, only slightly faster than regular PyPy, +with the top conflict being:

                    +
                    +83.066s lost in aborts, 0.000s paused (459x STM_CONTENTION_WRITE_WRITE)
                    +File "/home/arigo/hg/pypy/stmgc-c7/lib-python/2.7/_weakrefset.py", line 70, in __contains__
                    +File "/home/arigo/hg/pypy/stmgc-c7/lib-python/2.7/_weakrefset.py", line 70, in __contains__
                    +
                    +

                    Comment by Armin: It is unclear why this happens so far. We'll investigate...

                    +

                    The second conflict (without etag tweaks) originates +in the transaction module, from this piece of code:

                    +
                    +while True:
                    +    self._do_it(self._grab_next_thing_to_do(tloc_pending),
                    +                got_exception)
                    +    counter[0] += 1
                    +
                    +

                    Comment by Armin: This is a conflict in the transaction module itself; ideally, +it shouldn't have any, but in order to do that we might need a little bit +of support from RPython or C code. So this is pending improvement.

                    +

                    Tornado modification used in this blog post is based on 3.2.dev2. +As of now, the latest version is 4.0.2, and if we +apply +the same changes to this version, then we no longer get any scaling on this benchmark, +and there are no conflicts that take any substantial time.

                    +

                    Comment by Armin: There are two possible reactions to a conflict. We can either +abort one of the two threads, or (depending on the circumstances) just +pause the current thread until the other one commits, after which the +thread will likely be able to continue. The tool ``print_stm_log.py`` +did not report conflicts that cause pauses. It has been fixed very +recently. Chances are that on this test it would report long pauses and +point to locations that cause them.

                    +
                    +
                    +

                    Part 2: a more interesting benchmark: A-star

                    +

                    Although we have seen that PyPy STM is not all moonlight and roses, +it is interesting to see how it works on a more realistic application.

                    +

                    astar.py +is a simple game where several players move on a map +(represented as a list of lists of integers), +build and destroy walls, and ask the server to give them +shortest paths between two points +using A-star search, adapted from an ActiveState recipe.

                    +

                    The benchmark bench_astar.py +is simulating players, and tries to put the main load on A-star search, +but also does some wall building and destruction. There are no locks +around map modifications, as normal tornado is executing all callbacks +serially, and we can keep this guarantee with atomic blocks of PyPy STM. +This is also an example of a program that is not trivial +to scale to multiple cores with separate processes (assuming +more interesting shared state and logic).

                    +

                    This benchmark is very noisy due to randomness of client interactions +(also it could be not linear), so just lower and upper bounds for +number of requests are reported

                    + ++++ + + + + + + + + + + + + + + + + + + + + + + +
                    Impl.req/s
                    PyPy 2.45 .. 7
                    CPython 2.70.5 .. 0.9
                    PyPy-STM 12 .. 4
                    PyPy STM 42 .. 6
                    +

                    Clearly this is a very bad benchmark, but still we can see that scaling is worse +and STM overhead is sometimes higher. +The bulk of conflicts come from the transaction module (we have seen it +above):

                    +
                    +91.655s lost in aborts, 0.000s paused (249x STM_CONTENTION_WRITE_READ)
                    +File "/home/ubuntu/pypy/lib_pypy/transaction.py", line 164, in _run_thread
                    +    got_exception)
                    +
                    +

                    Although it is definitely not ready for production use, you can already try +to run things, report bugs, and see what is missing in user-facing tools +and libraries.

                    +

                    Benchmarks setup:

                    + +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Anonymous wrote on 2014-11-18 07:00: +
                    +
                    +

                    "Clearly this is a very benchmark" - looks like you've missed a word here ;)

                    +
                    +
                    +
                    +
                    + + crusaderky wrote on 2014-11-19 00:01: +
                    +
                    +

                    in bench_astar.py, you are doing the following queries:
                    - try to move: 85%
                    - build a wall: 10.5% [(1-.85)*.7]
                    - erase something: 0.45% [(1-.85)*(1-.7)*.1]
                    - show map: 4.05% [(1-.85)*(1-.7)*(1-.1)]

                    I doubt that's intentional.... :P

                    +
                    +
                    +
                    +
                    + + crusaderky wrote on 2014-11-19 01:01: +
                    +
                    +

                    Correct me if I misunderstood the theory of PyPy-STM, but in the A* test there's nothing that prevents a get() to read the game map while MapChangeHandler.put() is running (that is, while the system is in an incoherent status)?

                    Shouldn't MapChangeHandler.put() be wrapped in a exclusive write lock, and all the get() handlers be wrapped with a shared read lock?

                    +
                    +
                    +
                    +
                    + + Konstantin Lopuhin wrote on 2014-11-19 20:45: +
                    +
                    +

                    > Clearly this is a very benchmark" - looks like you've missed a word here ;)

                    Oh, yes, that word is "bad" :)

                    > Shouldn't MapChangeHandler.put() be wrapped in a exclusive write lock, and all the get() handlers be wrapped with a shared read lock?

                    Here all request handlers are already wrapped inside atomic blocks, but this is hidden from us in (modified) tornado. So we do not need any locks (as in normal tornado too, because normal tornado is single threaded). If request handlers conflict, then we just lose performance, not correctness. This is one of the main points of PyPy STM: it can support multithreaded code without needing to use locks.

                    Regarding the probabilities: yes, that's not quite intentional)

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/01/faster-more-memory-efficient-and-more-4096950404745375390.html b/posts/2015/01/faster-more-memory-efficient-and-more-4096950404745375390.html new file mode 100644 index 000000000..49b5b9034 --- /dev/null +++ b/posts/2015/01/faster-more-memory-efficient-and-more-4096950404745375390.html @@ -0,0 +1,517 @@ + + + + + +Faster, more memory efficient and more ordered dictionaries on PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Faster, more memory efficient and more ordered dictionaries on PyPy

                    + + + +
                    +
                    +

                    Hello everyone!

                    +

                    As of today, we merged the latest branch that brings better dictionaries to PyPy by default. The work is based on an idea by Raymond Hettinger on python-dev, with prior work done notably in Java.  It was done by Maciej Fijałkowski and Armin Rigo, with Laurence Tratt recently prodding us to finish it.  (Earlier work going in a similar direction includes Alex Gaynor's work on ordered dicts in Topaz, which was also used in the Hippy VM.  Each of these pieces of work is itself based on the original dict implementation in RPython, whose origins fade in the Subversion prehistory of PyPy.)  Coincidentally, a very similar idea has been implemented in Zend PHP very recently. Zend implementation description.

                    +

                    This post covers the basics of design and implementation as well as some basic benchmarks.

                    +
                    +
                    +

                    Dictionaries are now ordered!

                    +

                    One surprising part is that the new design, besides being more +memory efficient, is ordered by design: it preserves the +insertion order.  This is not forbidden by the Python language, which allows any order.  It makes the collections.OrderedDict subclass much faster than before: it is now a thin subclass of dict.  Obviously, we recommend that any portable Python program continues to use OrderedDict when ordering is important.  Note that a non-portable program might rely on more: for example, a **keywords argument now receives the keywords in the same order as the one in which they were given in the call.  (Whether such a thing might be called a language design change or not is a bit borderline.)  The point is that Python programs that work on CPython or previous versions of PyPy should continue to work on PyPy.

                    +

                    There is one exception, though.  The iterators of the OrderedDict subclass are now working just like the ones of the dict builtin: they will raise RuntimeError when iterating if the dictionary was modified.  In the CPython design, the class OrderedDict explicitly doesn't worry about that, and instead you get some result that might range from correct to incorrect to crashes (i.e. random Python exceptions).

                    +
                    +
                    +

                    Original PyPy dictionary design

                    +

                    Originally, PyPy dictionaries, as well as CPython dictionaries +are implemented as follows (simplified view):

                    +
                    +struct dict {
                    +   long num_items;
                    +   dict_entry* items;   /* pointer to array */
                    +}
                    +
                    +struct dict_entry {
                    +   long hash;
                    +   PyObject* key;
                    +   PyObject* value;
                    +}
                    +
                    +

                    Where items is a sparse array, with 1/3 to 1/2 of the items being NULL. +The average space occupied by a dictionary is 3 * WORD * 12/7 plus some small constant (the smallest dict has 8 entries, which is +8 * 3 * WORD + 2 * WORD = 26 WORDs).

                    +
                    +
                    +

                    New PyPy dictionary design

                    +

                    The new PyPy dictionary is split in two arrays:

                    +
                    +struct dict {
                    +    long num_items;
                    +    variable_int *sparse_array;
                    +    dict_entry* compact_array;
                    +}
                    +
                    +struct dict_entry {
                    +    long hash;
                    +    PyObject *key;
                    +    PyObject *value;
                    +}
                    +
                    +

                    Here, compact_array stores all the items in order of insertion, while sparse_array is a 1/2 to 2/3 full array of integers. The integers themselves are of the smallest size necessary for indexing the compact_array. So if compact_array has less than 256 items, then sparse_array will be made of bytes; if less than 2^16, it'll be two-byte integers; and so on.

                    +

                    This design saves quite a bit of memory. For example, on 64bit systems we can, but almost never do, index more than 4 billion elements; and for small dicts, the extra sparse_array takes very little space.  For example, a 100-element dict on 64bit would take on average 100 * 12/7 * WORD * 3 =~ 4100 bytes with the original design, while with the new design it takes 100 * 12/7 + 3 * WORD * 100 =~ 2600 bytes, quite a significant saving.

                    +
                    +
                    +

                    GC friendliness

                    +

                    The obvious benefit of having more compact dictionaries is an increased cache friendliness. In modern CPUs cache misses are much more costly than doing additional simple work, like having an additional level of (in-cache) indirection. Additionally, there is a GC benefit coming from it. When doing a minor collection, the GC has to visit all the GC fields in old objects that can point to young objects. In the case of large arrays, this can prove problematic since the array grows and with each minor collection we need to visit more and more GC pointers. In order to avoid it, large arrays in PyPy employ a technique called "card marking" where the GC only visits "cards" or subsets of arrays that were modified between collections. The problem with dictionaries was that by design modifications in a dictionary occur randomly, hence a lot of cards used to get invalidated. In the new design, however, new items are typically appended to the compact_array, hence invalidate much fewer cards --- which improves GC performance.  (The new sparse_array is an array of integers, so it does not suffer from the same problems.)

                    +
                    +
                    +

                    Deletion

                    +

                    Deleting entries from dictionaries is not very common, but important in a few use cases.  To preserve order, when we delete an entry, we mark the entry as removed but don't otherwise shuffle the remaining entries.  If we repeat this operation often enough, there will be a lot of removed entries in the (originally compact) array.  At this point, we need to do a "packing" operation, which moves all live entries to the start of the array (and then reindexes the sparse array, as the positions changed).  This works well, but there are use cases where previously no reindexing was ever needed, so it makes these cases a bit slower (for example when repeatedly adding and removing keys in equal number).

                    +
                    +
                    +

                    Benchmarks

                    +

                    The PyPy speed benchmarks show mostly small effects, see changes. The microbenchmarks that we did show large improvements on large and very large dictionaries (particularly, building dictionaries of at least a couple 100s of items is now twice as fast) and break-even on small ones (between 20% slower and 20% faster depending very much on the usage patterns and sizes of dictionaries). The new dictionaries enable various optimization possibilities which we're going to explore in the near future.

                    +

                    Cheers,
                    +fijal, arigo and the PyPy team

                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Unknown wrote on 2015-01-22 16:26: +
                    +
                    +

                    This is outstanding work, PyPy team. Keep on keeping on!

                    +
                    +
                    +
                    +
                    + + Wilfred Hughes wrote on 2015-01-22 16:41: +
                    +
                    +

                    Fantastic!

                    https://pypy.org/performance.html states that large dicts are a weakness of pypy -- is still the case overall, or is this work sufficient to favour pypy over cpython for large dict work in general?

                    +
                    +
                    +
                    +
                    + + John M. Camara wrote on 2015-01-23 01:35: +
                    +
                    +

                    Wilfred - With the ordered dict changes that bullet item is no longer true.

                    +
                    +
                    +
                    +
                    + + EM Lazzarin wrote on 2015-01-23 23:20: +
                    +
                    +

                    Awesome work and thanks. Pypy would be ahead of the game if PEP 468 were accepted.

                    +
                    +
                    +
                    +
                    + + JSZ wrote on 2015-01-24 19:04: +
                    +
                    +

                    How is deleting an element implemented? It sounds like it would take O(n) work to remove an element from the middle of the compact array.

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-01-25 06:58: +
                    +
                    +

                    JSZ: the array gets holes. If a lot of items are deleted it can no longer be called "compact", but if it becomes too sparse it is recompacted and rehashed.

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-01-28 11:09: +
                    +
                    +

                    There are lots of things to like about this approach!

                    Did you find any problems with cache misses? With linear probing, the keys are accessed sequentially (cache friendly), but with this method the keys are accessed in random order.

                    +
                    +
                    +
                    +
                    + + Carl Friedrich Bolz-Tereick wrote on 2015-01-28 11:13: +
                    +
                    +

                    @Anonymous: The old approach didn't use linear probing either, so in that regard nothing changed.

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-01-28 11:45: +
                    +
                    +

                    @carl - ah I see, thats interesting.

                    Well then, what about storing the hashes with the indices?
                    * Another chunk of memory saved. Only the lowest N bits need be stored that way instead of the full 64 bits. (Big assumption that rehashing on bit size change is ok)

                    * The nice thing is that the dense part (cache miss!) need only be accessed if the hash matches.

                    I think if I was doing this, I'd skip 8 bit indices and have 16 bit minimum so rehashing would be very rare.

                    +
                    +
                    +
                    +
                    + + Carl Friedrich Bolz-Tereick wrote on 2015-01-28 12:04: +
                    +
                    +

                    two problems with that:

                    - since the hash functions can be written in python, recomputing a hash from a key is potentially expensive

                    - why would you want to throw away bits from the hash? comparing the full hashes as a first check to see whether equality has a chance to succeed is very useful. the equality function can again be written in python, so is potentially very slow.

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-01-28 16:03: +
                    +
                    +

                    @Anonymous: about starting at 16-bit instead of 8-bit: it doesn't give any benefit, because rehashing is needed anyway to grow the sparse table. As long as its size is at most 256, then there is no point in storing 16-bit numbers instead of 8-bit numbers. In theory we could store N-bit numbers for the optimal value of N (= 4, 6, 8, 10...) and pay only the cost of additional complexity for individual reads and writes, not for rehashing.

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-01-28 21:39: +
                    +
                    +

                    Ah indeed. I am thinking of implementing this in C++ which has coloured my thoughts somewhat. In my case, key equality checks are for the most part cheap. Thus the size/compute tradeoffs may be a bit different.

                    Thanks for your thoughts.

                    +
                    +
                    +
                    +
                    + + Dustin Boswell wrote on 2015-02-04 23:05: +
                    +
                    +

                    Just curious, was there no slowdown from adding this extra level of indirection? For the case of accessing a random key from a cold dictionary, won't the lookup incur 2 cache misses now (one on each array), compared to just 1 for the original design?

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-02-05 15:11: +
                    +
                    +

                    @Durtin: there are certainly slow-downs in some cases. If the dictionary is cold, then indeed there is one extra cache miss. It seems to be quickly compensated, though, by the fact that if then you do a few more accesses to the same dict, you are likely to get less cache misses, simply because of the more compact layout. Also, the index array is often single bytes, so it can be fully in the cache very quickly.

                    +
                    +
                    +
                    +
                    + + Alhabshi3k wrote on 2015-02-11 08:44: +
                    +
                    +

                    Thank you for improving pypy performance and features. Your project and method is promising in improvement weakness aspect of dynamic languages. At the same time, pypy should provide an simplicity of Python rather than diversity , where diversity is the reality but simplicity is the case.

                    Making dictionaries ordered by default is part of simplicity; in this effort I wish integrating the features of "defaultdict" as method and properties of the the default basic dictionary.

                    similar case , integrating "deque" features (as well ,method and properties) as part of pypy list datatype.

                    Usually I wonder why python team didn't integrate the features of these "collections" ( as they say "High-performance container datatypes" ) within original python basic datatype, as we all know , everything in Python is an Object. and I don't think it is a pythonic way to do things in diversity.

                    Anyhow , keep on your development and team spirit.

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-02-11 09:07: +
                    +
                    +

                    @Alhabshi3k: indeed, you're right in that "defaultdict" could be replaced with an alternate constructor of the regular dicts. I'm not sure why it is not so. For deques, it is maybe a question of performance, but particularly of underlying C-level memory layout: CPython can't easily add appendleft() and popleft() to regular lists while still keeping the same C API, notably PyList_GET_ITEM() and PySequence_Fast_ITEMS() --- though that is debatable.

                    We could support that in PyPy, but that is arguably more of a language change than just making dicts ordered with no new user-visible API.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2018-02-06 20:31: +
                    +
                    +

                    You say for 100 elements, the new design's compact array uses 3 * WORD * 100 memory, right? So no extra capacity whatsoever? Then what do you do when I insert another element? Allocate a new array with 3 * WORD * 101 memory and copy all data there (and write the new element at the end)? That would be highly inefficient. So I don't believe you're honest about the memory usage.

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2018-02-06 21:08: +
                    +
                    +

                    The actual items are stored in a list which, like a list object, is slightly overallocated. Maybe the text in the blog post missed that and it should add a "k": the average is "100 * 12/7 + 3 * WORD * 100 * k" for an average value of k around 17/16. That's around 2700 instead of 2600.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/01/leysin-winter-sprint-20-28th-february-2590212640945547308.html b/posts/2015/01/leysin-winter-sprint-20-28th-february-2590212640945547308.html new file mode 100644 index 000000000..4a0270765 --- /dev/null +++ b/posts/2015/01/leysin-winter-sprint-20-28th-february-2590212640945547308.html @@ -0,0 +1,389 @@ + + + + + +Leysin Winter Sprint (20-28th February 2015) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Leysin Winter Sprint (20-28th February 2015)

                    + + + +
                    +

                    The next PyPy sprint will be in Leysin, Switzerland, for the tenth time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.

                    + +

                    Goals and topics of the sprint

                    + +

                    The details depend on who is here and ready to work. We might touch +topics such as:

                    + +

                    +
                      +
                    • cleaning up the optimization step in the JIT, change the register +allocation done by the JIT's backend, or improvements to the +warm-up time + +

                      +
                    • +
                    • STM (Software Transaction Memory), notably: try to come up with +benchmarks, and measure them carefully in order to test and improve +the conflict reporting tools, and more generally to figure out how +practical it is in large projects to avoid conflicts + +

                      +
                    • +
                    • vmprof - a statistical profiler for CPython and PyPy work, including +making it more user friendly. + +

                      +
                    • +
                    • Py3k (Python 3.x support), NumPyPy (the numpy module) + +

                      +
                    • +
                    • +added: cffi 1.0, trying out pygame+cffi on Raspberry Pi devices + +
                    • +
                    • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off for skiing. +
                    • +
                    +

                    Exact times

                    + +

                    For a change, and as an attempt to simplify things, I specified the +dates as 20-28 February 2015, where 20 and 28 are travel days. We will +work full days between the 21 and the 27. You are of course allowed to +show up for a part of that time only, too.

                    + +

                    Location and Accommodation

                    + +

                    Leysin, Switzerland, "same place as before". Let me refresh your +memory: both the sprint venue and the lodging will be in a very spacious +pair of chalets built specifically for bed & breakfast: +Ermina. The place has a good ADSL Internet connection +with wireless installed. You can of course arrange your own lodging +anywhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue), but I definitely recommend +lodging there too -- you won't find a better view anywhere else (though +you probably won't get much worse ones easily, either :-)

                    + +

                    Please confirm that you are coming so that we can adjust the +reservations as appropriate. In the past, the rates were around 60 CHF a +night all included in 2-person rooms, with breakfast. Now, the rooms +available are either single-person (or couple), or rooms for 3 persons. +The latter choice is recommended and should be under 60 CHF per person.

                    + +

                    Please register by Mercurial, or on the pypy-dev mailing list if you do not yet have check-in rights.

                    + +

                    You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around, and at least one EU-format power strip.

                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Unknown wrote on 2015-01-16 21:07: +
                    +
                    +

                    Hi,

                    During this sprint, ss it plan to work on yield form syntax, or more generally, Python 3.3 support ?

                    I'm very interested to test PyPy with AsyncIO.

                    Regards

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-01-17 10:21: +
                    +
                    +

                    @Ludovic, we don't have precise plans. If there is someone also interested in Python 3, then yes, this kind of work would be nice. (Note that I see some tests about "yield from" in the py3.3 branch, which may mean that it was implemented already.)

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-01-24 07:38: +
                    +
                    +

                    Great news. Thanks! PyPy 3

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/02/experiments-in-pyrlang-with-rpython-8103387814587972227.html b/posts/2015/02/experiments-in-pyrlang-with-rpython-8103387814587972227.html new file mode 100644 index 000000000..00c4dfc11 --- /dev/null +++ b/posts/2015/02/experiments-in-pyrlang-with-rpython-8103387814587972227.html @@ -0,0 +1,484 @@ + + + + + +Experiments in Pyrlang with RPython | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Experiments in Pyrlang with RPython

                    + + + +
                    +

                    Pyrlang is an Erlang BEAM bytecode interpreter written in RPython.

                    +

                    It implements approximately 25% of BEAM instructions. It can support +integer calculations (but not bigint), closures, exception handling, +some operators to atom, list and tuple, user modules, and multi-process +in single core. Pyrlang is still in development.

                    +

                    There are some differences between BEAM and the VM of PyPy:

                    +
                      +
                    • BEAM is a register-based VM, whereas the VM in PyPy is stack-based.
                    • +
                    • There is no traditional call-stack in BEAM. The Y register in BEAM is +similar to a call-stack, but the Y register can sometimes store some +variables.
                    • +
                    • There are no typical language-level threads and OS-level threads in +BEAM; only language-level processes, whose behavior is very similar +to the actor model.
                    • +
                    +

                    Regarding bytecode dispatch loop, Pyrlang uses a while loop to fetch +instructions and operands, call the function corresponding to every +instruction, and jump back to the head of the while loop. Due to the +differences between the RPython call-stack and BEAM’s Y register, we +decided to implement and manage the Y register by hand. On the other +hand, PyPy uses RPython’s call stack to implement Python’s call stack. +As a result, the function for the dispatch loop in PyPy calls itself +recursively. This does not happen in Pyrlang.

                    +

                    The Erlang compiler (erlc) usually compiles the bytecode instructions +for function invocation into CALL (for normal invocation) and CALL_ONLY +(for tail recursive invocation). You can use a trampoline semantic to +implement it:

                    +
                      +
                    • CALL instruction: The VM pushes the current instruction pointer (called +the program counter in PyPy) to the Y register, and jumps to the +destination label. When encountering a RETURN instruction, the VM +pops the instruction pointer from the Y register and returns to the +location of the instruction pointer to continue executing the outer +function.
                    • +
                    • CALL_ONLY instruction: The VM simply jumps to the destination label, +without any modification of the Y register. As a result, the tail +recursive invocation never increases the Y register.
                    • +
                    +

                    The current implementation only inserts the JIT hint of can_enter_jit +following the CALL_ONLY instruction. This means that the JIT only +traces the tail-recursive invocation in Erlang code, which has a very +similar semantic to the loop in imperative programming languages like +Python.

                    +

                    We have also written a single scheduler to implement the language level +process in a single core. There is a runnable queue in the scheduler. On +each iteration, the scheduler pops one element (which is a process +object with dispatch loop) from the queue, and executes the dispatch +loop of the process object. In the dispatch loop, however, there is a +counter called “reduction”. The reduction +decrements during the execution of the loop, and when the reduction +becomes 0, the dispatch loop terminates. Then the scheduler pushes that +element into the runnable queue again, and pops the next element from the +queue, and so on.

                    +

                    We are planning to implement a multi-process scheduler for multi-core +CPUs, which will require multiple schedulers and even multiple runnable +queues for each core, but that will be another story. :-)

                    +
                    +

                    Methods

                    +

                    We wrote two benchmark programs of Erlang:

                    +
                      +
                    • FACT: A benchmark to calculate the factorial in a tail-recursive +style, but because we haven’t implemented big int, we do a remainder +calculation to the argument for the next iteration, so the number +never overflows.
                    • +
                    • REVERSE: The benchmark creates a reversed list of numbers, such as +[20000, 19999, 19998, …], and applies a bubble sort to it.
                    • +
                    +
                    +
                    +

                    Results

                    +
                    +

                    The Value of Reduction

                    +

                    We used REVERSE to evaluate the JIT with different values of +reduction:

                    + + +

                    The X axis is the value of reduction, and the Y axis is the execution +time (by second).

                    +

                    It seems that when the value of reduction is small, the reduction +influences the performance significantly, but when reduction becomes +larger, it only increases the speed very slightly. In fact, we use 2000 +as the default reduction value (as well as the reduction value in the +official Erlang interpreter).

                    +

                    Surprisingly, the trace is always generated even when the reduction is +very small, such as 0, which means the dispatch loop can only run for a +very limited number of iterations, and the language level process +executes fewer instructions than an entire loop in one switch of the +scheduler. The generated trace is almost the same, regardless of +different reduction values.

                    +

                    Actually, the RPython JIT only cares what code it meets, but does not +care who executes it, thus the JIT always generates the results above. +The trace even can be shared among different threads if they execute the +same code.

                    +

                    The overhead at low reduction value may be due to the scheduler, which +switches between different processes too frequently, or from the +too-frequent switching between bytecode interpreter and native code, but +not from JIT itself.

                    +

                    Here is more explanation from Armin Rigo:

                    +
                    +“The JIT works well because you’re using a scheme where some counter +is decremented (and the soft-thread interrupted when it reaches +zero) only once in each app-level loop. The soft-thread switch is +done by returning to some scheduler, which will resume a different +soft-thread by calling it. It means the JIT can still compile each +of the loops as usual, with the generated machine code containing +the decrease-and-check-for-zero operation which, when true, exits +the assembler.”
                    +
                    +
                    +

                    Fair Process Switching vs. Unfair Process Switching

                    +

                    We are also concerned about the timing for decreasing reduction value. +In our initial version of Pyrlang, we decrease reduction value at every +local function invocation, module function invocation, and BIF (built-in +function) invocation, since this is what the official Erlang interpreter +does. However, since the JIT in RPython basically traces the target +language loop (which is the tail recursive invocation in Pyrlang) it is +typically better to keep the loop whole during a switch of the language +level process. We modified Pyrlang, and made the reduction decrement +only occur after CALL_ONLY, which is actually the loop boundary of the +target language.

                    +

                    Of course, this strategy may cause an “unfair” execution among language +level processes. For example, if one process has only a single +long-sequence code, it executes until the end of the code. On the other +hand, if a process has a very short loop, it may be executed by very +limited steps then be switched out by the scheduler. However, in the +real world, this “unfairness” is usually considered acceptable, and is +used in many VM implementations including PyPy for improving the overall +performance.

                    +

                    We compared these two versions of Pyrlang in the FACT benchmark. The +reduction decrement is quite different because there are some BIF +invocations inside the loop. In the old version the process can be +suspended at loop boundaries or other function invocation, but in the +new version, it can be suspended only at loop boundaries.

                    +

                    We show that the strategy is effective, removing around 7% of the +overhead. We have also compared it in REVERSE, but since there are no +extra invocations inside the trace, it cannot provide any performance +improvement. In the real world, we believe there is usually more than +one extra invocation inside a single loop, so this strategy is effective +for most cases.

                    +
                    +
                    +

                    Comparison with Default Erlang and HiPE

                    +

                    We compared the performance of Pyrlang with the default Erlang +interpreter and the HiPE (High Performance Erlang) compiler. HiPE is an +official Erlang compiler that can compile Erlang source code to native +code. The speed of Erlang programs obviously improves but loses its +generality instead.

                    +

                    Please note that Pyrlang is still in development, so in some situations +it does less work than the default Erlang interpreter, such as not +checking integer overflow when dealing with big integer, and not +checking and adding locks when accessing message queues in the +language-level process, so is therefore faster. The final version of +Pyrlang may be slower.

                    +

                    We used the two benchmark programs above, and made sure both of them are +executed for more than five seconds to cover the JIT warm-up time for +RPython. The experiment environment is an OS X 10.10 machine with a 3.5 GHz +6-core Intel Xeon E5 CPU and 14GB 1866 MHz DDR3 ECC memory.

                    +

                    Let’s look at the result of FACT. The graph shows that Pyrlang runs +177.41% faster on average than Erlang, and runs at almost the same speed +as HiPE. However, since we haven’t implemented big integer in Pyrlang, +the arithmetical operators do not do any extra overflow checking. It is +reasonable that the final version for Pyrlang will be slower than the +current version and HiPE.

                    + +

                    As for REVERSE, the graph shows that Pyrlang runs 45.09% faster than +Erlang, but 63.45% slower than HiPE on average. We think this is +reasonable because there are only few arithmetical operators in this +benchmark so the speeds of these three implementations are closer. +However, we observed that at the scale of 40,000, the speed of Pyrlang +slowed down significantly (111.35% slower than HiPE) compared with the +other two scales (56.38% and 22.63% slower than HiPE).

                    +

                    Until now we can only hypothesize why Pyrlang slows down at that scale. +We guess that the overhead might be from GC. This is because the BEAM +bytecode provides some GC hints to help the default Erlang compiler to +perform some GC operations immediately. For example, using GC_BIF +instead of a BIF instruction tells the VM that there may be a GC +opportunity, and tells the VM how many live variables should be around +one instruction. In Pyrlang we do not use these kinds of hints but rely +on RPython’s GC totally. When there are a huge number of objects during +runtime, (as for REVERSE, it should be the Erlang list object) the speed +therefore slows down.

                    + +

                    Ruochen Huang

                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + peterfirefly wrote on 2015-02-26 12:14: +
                    +
                    +

                    'there is a counter-call “reduction”' should probably be:

                    'there is a counter called “reduction”'.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/02/linalg-support-in-pypynumpy-1131217944329711855.html b/posts/2015/02/linalg-support-in-pypynumpy-1131217944329711855.html new file mode 100644 index 000000000..13309fb21 --- /dev/null +++ b/posts/2015/02/linalg-support-in-pypynumpy-1131217944329711855.html @@ -0,0 +1,594 @@ + + + + + +linalg support in pypy/numpy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    linalg support in pypy/numpy

                    + + + +
                    +
                    +

                    +

                    +

                    +Introduction

                    +PyPy's numpy support has matured enough that it can now support the lapack/blas libraries through the numpy.linalg module. To install the version of numpy this blog post refers to, install PyPy version 2.5.0 or newer, and run this:

                    pypy -m pip install git+https://bitbucket.org/pypy/numpy.git
                    +
                    +
                    +This update is a major step forward for PyPy's numpy support. Many of the basic matrix operations depend on linalg, even matplotlib requires it to display legends (a pypy-friendly version of matplotlib 1.3 is available at https://github.com/mattip/matplotlib).

                    +A number of improvements and adaptations, some of which are in the newly-released PyPy 2.5.0, made this possible:
                      +
                    • Support for an extended frompyfunc(), which in the PyPy version supports much of the ufunc API (signatures, multiple dtypes) allowing creation of pure-python, jit-friendly ufuncs. An additional keyword allows choosing between out = func(in) or func(in, out) ufunc signatures. More explanation follows.
                    • +
                    • Support for GenericUfuncs via PyPy's (slow) capi-compatibility layer. The underlying mechanism actually calls the internal implementation of frompyfunc().
                    • +
                    • A cffi version of _umath_linalg. Since cffi uses dlopen() to call into shared objects, we added support in the numpy build system to create non-python shared libraries from source code in the numpy tree. We also rewrote parts of the c-based _umath_linalg.c.src in python, renamed numpy's umath_linalg capi module to umath_linag_capi, and use it as a shared object through cffi.
                    • +
                    +
                    +

                    +Status

                    +We have not completely implemented all the linalg features. dtype resolution via casting is missing, especially for complex ndarrays, which leads to slight numerical errors where numpy uses a more precise type for intermediate calculations. Other missing features in PyPy's numpy support may have implications for complete linalg support.

                    +Some OSX users have noticed they need to update pip to version 6.0.8 to overcome a regression in pip, and it is not clear if we support all combinations of blas/lapack implementations on all platforms.

                    +Over the next few weeks we will be ironing out these issues.
                    +
                    +

                    +Performance

                    +A simple benchmark is shown below, but let's state the obvious: PyPy's JIT and the iterators built into PyPy's ndarray implementation will in most cases be no faster than CPython's numpy. The JIT can help where there is a mixture of python and numpy-array code. We do have plans to implement lazy evaluation and to further optimize PyPy's support for numeric python, but numpy is quite good at what it does.
                    +
                    +

                    +HowTo for PyPy's extended frompyfunc

                    +The magic enabling blas support is a rewrite of the _umath_linalg c-based module as a cffi-python module that creates ufuncs via frompyfunc. We extended the numpy frompyfunc to allow it to function as a replacement for the generic ufunc available in numpy only through the c-api.

                    +We start with the basic frompyfunc, which wraps a python function into a ufunc:
                     
                    +
                    def times2(in0):
                    +    return in0 * 2
                    +ufunc = frompyfunc(times2, 1, 1)
                    +
                    +
                    +In cpython's numpy the dtype of the result is always object, which is not implemented (yet) in PyPy, so this example will fail. While the utility of object dtypes can be debated, in the meantime we add a non-numpy-compatible keyword argument dtypes to frompyfunc. If dtypes=['match'] the output dtype will match the dtype of the first input ndarray:

                    ufunc = frompyfunc(times2, 1, 1, dtypes=['match'])
                    +ai = arange(24).reshape(3, 4, 2)
                    +ao = ufunc(ai)
                    +assert  (ao == ai * 2).all()
                    +
                    +
                    +I hear you ask "why is the dtypes keyword argument a list?" This is so we can support the Generalized Universal Function API, which allows specifying a number of specialized functions and the input-output dtypes each specialized function accepts.
                    +Note that the function feeds the values of ai one at a time, the function operates on scalar values. To support more complicated ufunc calls, the generalized ufunc API allows defining a signature, which specifies the layout of the ndarray inputs and outputs. So we extended frompyfunc with a signature keyword as well.
                    +We add one further extension to frompyfunc: we allow a Boolean keyword stack_inputs to specify the argument layout of the function itself. If the function is of the form:
                     
                    +
                    out0, out1, ... = func(in0, in1,...)
                    +
                    +
                    +then stack_inputs is False. If it is True the function is of the form:
                     
                    +
                    func(in0, in1, ... out0, out1, ...)
                    +
                    +
                    +Here is a complete example of using frompyfunc to create a ufunc, based on this link:
                     
                    +
                    def times2(in_array, out_array):
                    +    in_flat = in_array.flat
                    +    out_flat = out_array.flat
                    +    for i in range(in_array.size):
                    +        out_flat[i] = in_flat[i] * 2
                    +ufunc = frompyfunc([times2, times2], 1, 1,
                    +                signature='(i)->(i)',
                    +                dtypes=[dtype(int), dtype(int),
                    +                        dtype(float), dtype(float),
                    +                       ],
                    +                stack_inputs=True,
                    +                )
                    +ai = arange(10, dtype=int)
                    +ai2 = ufunc(ai)
                    +assert all(ai2 == ai * 2)
                    +
                    +
                    +Using this extended syntax, we rewrote the lapack calls into the blas functions in pure python, no c needed. Benchmarking this approach actually was much slower than using the upstream umath_linalg module via cpyext, as can be seen in the following benchmarks. This is due to the need to copy c-aligned data into Fortran-aligned format. Our __getitem__ and __setitem__ iterators are not as fast as pointer arithmetic in C. So we next tried a hybrid approach: compile and use numpy's umath_linalg python module as a shared object, and call the optimized specific wrapper function from it.
                    +
                    +

                    +Benchmarks

                    +Here are some benchmarks, running a tight loop of the different versions of linalg.inv(a), where a is a 10x10 double ndarray. The benchmark ran on an i7 processor running ubuntu 14.04 64 bit:
                    + + + + + + + + + + + + + +
                    Impl. Time after warmup
                    CPython 2.7 + numpy 1.10.dev + lapack 8.9 msec/1000 loops
                    PyPy 2.5.0 + numpy + lapack via cpyext 8.6 msec/1000 loops
                    PyPy 2.5.0 + numpy + lapack via pure python + cffi 19.9 msec/1000 loops
                    PyPy 2.5.0 + numpy + lapack via python + c + cffi 9.5 msec/1000 loops
                    +
                    +
                    +
                    +
                    +While no general conclusions may be drawn from a single micro-benchmark, it does indicate that there is some merit in the approach taken.

                    +Conclusion

                    +PyPy's numpy now includes a working linalg module. There are still some rough corners, but hopefully we have implemented the parts you need. While the speed of the isolated linalg function is no faster than CPython and upstream numpy, it should not be significantly slower either. Your use case may see an improvement if you use a mix of python and lapack, which is the usual case.

                    +Please let us know how it goes. We love to hear success stories too.

                    +We still have challenges at all levels of programming, and are always looking for people willing to contribute, so stop by on IRC at #pypy.

                    +mattip and the PyPy Team
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Olivier Grisel wrote on 2015-02-24 10:20: +
                    +
                    +

                    Interesting work although benchmarking linear algebra routines on 10x10 arrays feels wrong: typical linear algebra applications use hundreds or thousands of dimensions. Would you mind re-rerunning those benchmarks on 1000x1000 arrays instead? The use of the CPU cache and multiple threads can be very impacting for such workloads.

                    Also some numpy / scipy developers are working on supporting OpenBLAS as the default BLAS/LAPACK by default for the Windows wheel packages and maybe later for the OSX packages as well.

                    Under Linux (Debian / Ubuntu) it's pretty easy to have libblas.so / liblapack.so be symlinks to either ATLAS or OpenBLAS using the update-alternative syste,

                    +
                    +
                    +
                    +
                    + + Maciej Fijalkowski wrote on 2015-02-24 10:23: +
                    +
                    +

                    What blog post somehow fails to mention is that we do not reimplement those but reuse whatever underlaying library is there. The measurments of the actual speed is then not that interesting, because we're only interested in the overhead of call.

                    +
                    +
                    +
                    +
                    + + Olivier Grisel wrote on 2015-02-24 10:26: +
                    +
                    +

                    It might still be interesting to run that kind of benchmarks on more realistic workloads (maybe in addition to some micro-workloads) to see the importance of the remaining overhead in a typical usage scenario.

                    +
                    +
                    +
                    +
                    + + mattip wrote on 2015-02-24 16:39: +
                    +
                    +

                    The most interesting benchmark is probably the one only you can run, i.e. how does pypy perform for you on your workload.

                    As far as lapack vs openblas, we will try to imitate what numpy does. If cpython/numpy supports a variation of lapack and pypy/numpy doesn't, that should be considered a bug.

                    Please let us know how it works for you.

                    +
                    +
                    +
                    +
                    + + Olivier Grisel wrote on 2015-02-24 17:50: +
                    +
                    +

                    > The most interesting benchmark is probably the one only you can run, i.e. how does pypy perform for you on your workload.

                    I agree, but inverting a 10x10 matrix is probably not representative of anybody's workload.

                    While it's important not to introduce too much overhead in the bindings, I think it's also good to keep in mind that an overhead of the order of the micro-second is completely negligible compared to the execution time of a typical linear algebra operation running on realistically sized data. Hence my original remark.

                    > As far as lapack vs openblas, we will try to imitate what numpy does. If cpython/numpy supports a variation of lapack and pypy/numpy doesn't, that should be considered a bug.

                    Just to clarify OpenBLAS is an implementation of the standard BLAS API that also includes the official LAPACK implementation from netlib linked against its own optimized BLAS routines. The 2 main open source optimized implementations of BLAS/LAPACK supported by numpy & scipy are ATLAS and OpenBLAS.

                    +
                    +
                    +
                    +
                    + + Romain Guillebert wrote on 2015-02-24 19:12: +
                    +
                    +

                    > While it's important not to introduce too much overhead in the bindings, I think it's also good to keep in mind that an overhead of the order of the micro-second is completely negligible compared to the execution time of a typical linear algebra operation running on realistically sized data. Hence my original remark.

                    But then you're just benchmarking the underlying library, which is the exact same library as numpy.

                    +
                    +
                    +
                    +
                    + + Olivier Grisel wrote on 2015-02-24 20:21: +
                    +
                    +

                    > But then you're just benchmarking the underlying library, which is the exact same library as numpy.

                    Yes I agree. I just want to highlight that for most common real life use cases, a small performance overhead in those those LAPACK bindings are almost never a problem.

                    Otherwise your readers might be mislead into thinking that the "PyPy 2.5.0 + numpy + lapack via pure python + cffi" version is significantly suboptimal (2x slowdown!) while in practice a couple of additional microseconds might be completely undetectable compared to the actual execution time of the "inv" function that typically lasts more than a millisecond on anything that is non-toy data.

                    +
                    +
                    +
                    +
                    + + Romain Guillebert wrote on 2015-02-24 20:57: +
                    +
                    +

                    ok, makes sense :)

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-02-24 21:12: +
                    +
                    +

                    Additional data point: repeatedly inverting a ~10x10 matrix is exactly what I need performance on - for running an extended Kalman Filter. : )

                    +
                    +
                    +
                    +
                    + + Olivier Grisel wrote on 2015-02-24 21:18: +
                    +
                    +

                    > Additional data point: repeatedly inverting a ~10x10 matrix is exactly what I need performance on - for running an extended Kalman Filter. : )

                    Fair enough: so there actually exists a use case for that benchmark. Optimizing the bindings overhead might thus be worthy in the end.

                    +
                    +
                    +
                    +
                    + + Yaacov wrote on 2015-03-01 22:07: +
                    +
                    +

                    I love hearing about the progress and wish I could test on my benchmarks. Any chance of windows support?

                    +
                    +
                    +
                    +
                    + + mattip wrote on 2015-03-02 11:01: +
                    +
                    +

                    Yaacov what is missing for you to try it?
                    Here is the way I verify that the code works on windows 7 64 bit and windows 8.1:

                    download and install compiler
                    https://www.microsoft.com/en-us/download/details.aspx?id=44266

                    download pypy and open the zip
                    https://bitbucket.org/pypy/pypy/downloads/pypy-2.5.0-win32.zip

                    install pip into pypy
                    https://bootstrap.pypa.io/get-pip.py

                    install numpy into pypy
                    pip install git+https://bitbucket.org/pypy/numpy.git

                    +
                    +
                    +
                    +
                    + + Yaacov wrote on 2015-03-06 04:14: +
                    +
                    +

                    I get a tracback ending in

                    \appdata\local\temp\pip-wdyqtr-build\numpy\distutils\mi
                    sc_util.py", line 872, in _get_configuration_from_setup_py

                    config = setup_module.configuration(*args)

                    File "numpy\linalg\setup.py", line 85, in configuration

                    library_dirs = [sys.real_prefix + '/include',

                    AttributeError: 'module' object has no attribute 'real_prefix'

                    in a warning "using unoptimized lapack"

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-03-06 19:24: +
                    +
                    +

                    I still get non-existing conjugate method error when using, e.g., linalg.pinv. Any plan on getting this working?

                    +
                    +
                    +
                    +
                    + + mattip wrote on 2015-03-07 22:01: +
                    +
                    +

                    fixed, try a nightly (from tomorrow)

                    +
                    +
                    +
                    +
                    + + Derek Z wrote on 2015-03-13 14:56: +
                    +
                    +

                    I got an error message:

                    OSError: Cannot load library /usr/local/Cellar/pypy/2.5.0/libexec/site-packages/numpy/linalg/libumath_linalg_cffi.so: dlopen(/usr/local/Cellar/pypy/2.5.0/libexec/site-packages/numpy/linalg/libumath_linalg_cffi.so, 2): image not found

                    Is there anything I am not doing right for the installation? I have pypy 2.5, and Mac OS 10.10.

                    +
                    +
                    +
                    +
                    + + mattip wrote on 2015-03-13 15:18: +
                    +
                    +

                    Are you installing via pip, if so we have had reports of older versions of pip failing. You should have pip 6.0.8 or later. See https://bitbucket.org/pypy/numpy/issue/21

                    +
                    +
                    +
                    +
                    + + melbic wrote on 2015-03-19 11:13: +
                    +
                    +

                    Same problem here. (OSX 10.10) I've got the newest pip (6.0.8) and setuptools (14.0.3) version installed.

                    +
                    +
                    +
                    +
                    + + melbic wrote on 2015-03-19 11:13: +
                    +
                    +

                    Same problem here. (OSX 10.10) I've got the newest pip (6.0.8) and setuptools (14.0.3) version installed.

                    +
                    +
                    +
                    +
                    + + mattip wrote on 2015-03-19 17:01: +
                    +
                    +

                    I can't reproduce this as I do not have a MacOS machine. The place to follow this up is on our issue tracker, https://bitbucket.org/pypy/numpy/issue/21

                    It would be most helpful to attach a full log from "pip install" and 'pypy -c "import numpy"' to that issue

                    +
                    +
                    +
                    +
                    + + Nimrod wrote on 2015-03-29 15:34: +
                    +
                    +

                    One way pypy might be able to outperform numpy is by eliminating temporaries.

                    Just converting the BLAS functions to chain operations efficiently and sometimes update in-place rather than allocating and de-allocating arrays should help a lot.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-04-05 23:07: +
                    +
                    +

                    This is great! But I can't use this for almost any of my code before np.einsum is supported :/ IMO, it is a super useful function for almost anything. Any plans for supporting it?

                    +
                    +
                    +
                    +
                    + + mattip wrote on 2015-04-07 09:01: +
                    +
                    +

                    Koos Zevenhoven - we have plans to implement all of numpy. With that, it looks like einsum will take quite a bit of work

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/02/numpypy-status-january-2015-5092986229783279944.html b/posts/2015/02/numpypy-status-january-2015-5092986229783279944.html new file mode 100644 index 000000000..661ec1d55 --- /dev/null +++ b/posts/2015/02/numpypy-status-january-2015-5092986229783279944.html @@ -0,0 +1,347 @@ + + + + + +NumPyPy status - January 2015 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    NumPyPy status - January 2015

                    + + + +
                    +

                    Hi Everyone

                    +Here is what has been done in January thanks to the funding of NumPyPy, I would like to thank all the donors and tell you that you can still donate :

                    +
                      +
                    • I have focused on implementing the object dtype this month, it is now possible to store objects inside ndarrays using the object dtype
                    • +
                    • It is also possible to add an object ndarray to any other ndarray (implementing other operators is trivial)
                    • +
                    +
                    +The things I plan on working on next are :
                    +
                    +
                      +
                    • Implementing the missing operations for object arrays
                    • +
                    • Implementing garbage collection support for object arrays (currently, storing an object inside an ndarray doesn't keep the object alive)
                    • +
                    • Packaging NumPyPy on PyPI
                    • +
                    +
                    +Cheers
                    +
                    +
                    +Romain
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Anonymous wrote on 2015-02-12 02:15: +
                    +
                    +

                    Thanks for the post! This sounds pretty cool.

                    The previous post suggested that there would be an update in regards to linalg. Does this mean linalg is working? Is having a working linalg what stands in the way of a working matplotlib? Thanks for answering what might be a naive question!

                    +
                    +
                    +
                    +
                    + + mattip wrote on 2015-02-12 22:27: +
                    +
                    +

                    Linalg is basically usable with the usual caveats: use PyPy 2.5.0 or later, use pypy/numpy from the bitbucket repo, you can even use matplotlib from my fork at https://github.com/mattip/matplotlib but there is no gui backend available yet, so you can only save the plots to files. Watch this space for the promised blog post, hopefully next week.

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-02-13 10:34: +
                    +
                    +

                    Great to hear there is some progress on numpy!

                    About matplotlib @mattip. Maybe a GSoC project for the GUI?

                    +
                    +
                    +
                    +
                    + + Jami wrote on 2015-03-06 20:01: +
                    +
                    +

                    Regarding matplotlib, I whipped up a quick hack that can do at least very simple matplotlib stuff. Based on running a "slave" CPython using RpyC, as I recall was already done in 2011 or so demos.

                    Simple stuff can run unmodified, although can be of course slow if there's a lot or frequent data passing from PyPy to CPython.

                    Could probably be done quite easily in the other direction too, i.e. running PyPy from CPython.

                    https://github.com/jampekka/cpyproxy

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/02/pypy-250-released-247160062953533060.html b/posts/2015/02/pypy-250-released-247160062953533060.html new file mode 100644 index 000000000..ab5337d21 --- /dev/null +++ b/posts/2015/02/pypy-250-released-247160062953533060.html @@ -0,0 +1,422 @@ + + + + + +PyPy 2.5.0 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy 2.5.0 released

                    + + + +
                    +
                    +
                    +
                    +

                    +PyPy 2.5.0 - Pincushion Protea +

                    +We’re pleased to announce PyPy 2.5, which contains significant performance +enhancements and bug fixes.
                    +You can download the PyPy 2.5.0 release here:
                    + +
                    +We would like to thank our donors for the continued support of the PyPy +project, and for those who donate to our three sub-projects, as well as our +volunteers and contributors (10 new committers joined PyPy since the last +release). +We’ve shown quite a bit of progress, but we’re slowly running out of funds. +Please consider donating more, or even better convince your employer to donate, +so we can finish those projects! The three sub-projects are:
                      +
                    • +
                      +
                      +Py3k (supporting Python 3.x): We have released a Python 3.2.5 compatible version
                      +
                      +
                      +we call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version
                      +
                      +
                      +
                    • +
                    • +
                      +STM (software transactional memory): We have released a first working version, +and continue to try out new promising paths of achieving a fast multithreaded Python
                      +
                    • +
                    • +
                      +NumPy which requires installation of our fork of upstream numpy, +available on bitbucket +
                      +
                    • +
                    +
                    +

                    +What is PyPy?

                    +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.
                    +This release supports x86 machines on most common operating systems +(Linux 32/64, Mac OS X 64, Windows, and OpenBSD), +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.
                    +While we support 32 bit python on Windows, work on the native Windows 64 +bit python is still stalling, we would welcome a volunteer +to handle that.
                    +
                    +

                    +Highlights

                    +
                      +
                    • The past months have seen pypy mature and grow, as rpython becomes the goto +solution for writing fast dynamic language interpreters. Our separation of +rpython and the python interpreter PyPy is now much clearer in the +PyPy documentation and we now have separate RPython documentation.
                    • +
                    • We have improved warmup time as well as jitted code performance: more than 10% +compared to pypy-2.4.0. +We no longer zero-out memory allocated in the gc nursery by default, work that +was started during a GSoC.
                    • +
                    • Passing objects between C and PyPy has been improved. We are now able to pass +raw pointers to C (without copying) using pinning. This improves I/O; +benchmarks that use networking intensively improved by about 50%. File() +operations still need some refactoring but are already showing a 20% +improvement on our benchmarks. Let us know if you see similar improvements.
                    • +
                    • Our integrated numpy support gained much of the GenericUfunc api in order to +support the lapack/blas linalg module of numpy. This dovetails with work in the +pypy/numpy repository to support linalg both through the (slower) cpyext capi +interface and also via (the faster) pure python cffi interface, using an +extended frompyfunc() api. We will soon post a separate blog post specifically +about linalg and PyPy.
                    • +
                    • Dictionaries are now ordered by default, see the blog post +
                    • +
                    • Our nightly translations use --shared by default, including on OS/X and linux
                    • +
                    • We now more carefully handle errno (and GetLastError, WSAGetLastError) tying +the handlers as close as possible to the external function call, in non-jitted +as well as jitted code.
                    • +
                    • Issues reported with our previous release were resolved after reports from users on +our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at +#pypy.
                    • +
                    +We have further improvements on the way: rpython file handling, +finishing numpy linalg compatibility, numpy object dtypes, a better profiler, +as well as support for Python stdlib 2.7.9.
                    +Please try it out and let us know what you think. We especially welcome +success stories, we know you are using PyPy, please tell us about it!
                    +Cheers
                    +The PyPy Team
                    +
                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Anonymous wrote on 2015-02-04 07:48: +
                    +
                    +

                    Many-many thanks for your work!

                    +
                    +
                    +
                    +
                    + + rndblnch wrote on 2015-02-04 15:59: +
                    +
                    +

                    any release schedule for pypy3-2.5?
                    how can we help with pypy3?

                    +
                    +
                    +
                    +
                    + + Jami wrote on 2015-02-09 11:48: +
                    +
                    +

                    Sorry to nag, but when are the news about Scipy/Matplotlib compatibility plans coming? I've been checking daily since the November 28th teaser!

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html b/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html new file mode 100644 index 000000000..e3dadb126 --- /dev/null +++ b/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html @@ -0,0 +1,862 @@ + + + + + +Pydgin: Using RPython to Generate Fast Instruction-Set Simulators | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Pydgin: Using RPython to Generate Fast Instruction-Set Simulators

                    + + + +
                    +
                    + +

                    Note: This is a guest blog post by Derek Lockhart and Berkin Ilbeyi from +Computer Systems Laboratory of Cornell University.

                    +

                    In this blog post I'd like to describe some recent work on using the RPython +translation toolchain to generate fast instruction set simulators. +Our open-source framework, Pydgin [a], provides a domain-specific +language (DSL) embedded in Python for concisely describing instruction set +architectures [b] and then uses these descriptions to generate fast, +JIT-enabled simulators. +Pydgin will be presented at the IEEE International Symposium on Performance +Analysis of Systems and Software (ISPASS) and in this post we provide a +preview of that work. +In addition, we discuss some additional progress updates that occurred after +the publishing deadline and will not appear in the final paper [1].

                    +

                    Our area of research expertise is computer architecture, which is perhaps an +unfamiliar topic for some readers of the PyPy blog. +Below we provide some brief background on hardware simulation in the field of +computer architecture, as well as some context as to why instruction set +simulators in particular are such an important tool.

                    +
                    +

                    Simulators: Designing Hardware with Software

                    +

                    For computer architects in both academia and industry, a key step in designing +new computational hardware (e.g., CPUs, GPUs, and mobile system-on-chips) is +simulation [c] of the target system. +While numerous models for simulation exist, three classes are particularly +important in hardware design.

                    +

                    Functional Level models simulate the behavior of the target system. +These models are useful for creating a "golden" reference which can serve as an +executable specification or alternatively as an emulation platform for software +development.

                    +

                    Cycle Level models aim to simulate both the behavior and the approximate +timing of a hardware component. +These models help computer architects explore design tradeoffs and quickly +determine things like how big caches should be, how many functional units are +needed to meet throughput targets, and how the addition of a custom accelerator +block may impact total system performance.

                    +

                    Register-Transfer Level (RTL) models specify the behavior, timing, and +resources (e.g., registers, wires, logic gates) of a hardware component. +RTL models are bit-accurate hardware specifications typically written in a +hardware description language (HDL) such as Verilog or VHDL. +Once verified through extensive simulation, HDL specifications can be passed +into synthesis and place-and-route tools to estimate area/energy/timing or to +create FPGA or ASIC prototypes.

                    +

                    An instruction set simulator (ISS) is a special kind of +functional-level model that simulates the behavior of a processor or +system-on-chip (SOC). ISSs serve an important role in hardware design +because they model the instruction set architecture (ISA) interface: the +contractual boundary between hardware designers and software developers. +ISSs allow hardware designers to quickly experiment with adding new processor +instructions while also allowing software developers to build new compilers, +libraries, and applications long before physical silicon is available.

                    +
                    +
                    +

                    Instruction-Set Simulators Must be Fast and Productive

                    +

                    Instruction-set simulators are more important than ever because the ISA +boundary has become increasingly fluid. +While Moore's law has continued to deliver larger numbers of transistors +which computer architects can use to build increasingly complex chips, limits +in Dennard scaling have restricted how these transistors can be used [d]. +In more simple terms, thermal constraints (and energy constraints in mobile +devices) have resulted in a growing interest in pervasive specialization: +using custom accelerators to more efficiently perform compute intensive tasks. +This is already a reality for designers of mobile SOCs who continually add new +accelerator blocks and custom processor instructions in order to achieve higher +performance with less energy consumption. +ISSs are indispensable tools in this SOC design process for both hardware +architects building the silicon and software engineers developing the software +stack on top of it.

                    +

                    An instruction set simulator has two primary responsibilities: 1) accurately +emulating the external execution behavior of the target, and 2) providing +observability by accurately reproducing the target's internal state (e.g., +register values, program counter, status flags) at each time step. +However, other qualities critical to an effective ISS are simulation +performance and designer productivity. +Simulation performance is important because shorter simulation times allow +developers to more quickly execute and verify large software applications. +Designer productivity is important because it allows hardware architects to +easily experiment with adding new instructions and estimate their impact on +application performance.

                    +

                    To improve simulation performance, high-performance ISSs use dynamic binary +translation (DBT) as a mechanism to translate frequently visited blocks of +target instructions into optimized sequences of host instructions. +To improve designer productivity, many design toolchains automatically generate +ISSs from an architectural description language (ADL): a special +domain-specific language for succinctly specifying instruction encodings and +instruction semantics of an ISA. +Very few existing systems have managed to encapsulate the design complexity of +DBT engines such that high-performance, DBT-accelerated ISSs could be +automatically generated from ADLs [e]. +Unfortunately, tools which have done so are either proprietary software or +leave much to be desired in terms of performance or productivity.

                    +
                    +
                    +

                    Why RPython?

                    +

                    Our research group learned of the RPython translation toolchain through our +experiences with PyPy, which we had used in conjunction with our Python +hardware modeling framework to achieve significant improvements in simulation +performance [2]. +We realized that the RPython translation toolchain could potentially be adapted +to create fast instruction set simulators since the process of interpreting +executables comprised of binary instructions shared many similarities with the +process of interpreting bytecodes in a dynamic-language VM. +In addition, we were inspired by PyPy's meta-tracing approach to JIT-optimizing +VM design which effectively separates the process of specifying a language +interpreter from the optimization machinery needed to achieve good performance.

                    +

                    Existing ADL-driven ISS generators have tended to use domain-specific +languages that require custom parsers or verbose C-based syntax that +distracts from the instruction specification. +Creating an embedded-ADL within Python provides several benefits over these +existing approaches including a gentler learning curve for new users, access to +better debugging tools, and easier maintenance and extension by avoiding a +custom parser. +Additionally, we have found that the ability to directly execute Pydgin +ISA descriptions in a standard Python interpreter such as CPython or PyPy +significantly helps debugging and testing during initial ISA exploration. +Python's concise, pseudocode-like syntax also manages to map quite closely to +the pseudocode specifications provided by many ISA manuals [f].

                    +
                    +
                    +

                    The Pydgin embedded-ADL

                    +

                    Defining a new ISA in the Pydgin embedded-ADL requires four primary pieces of +information: the architectural state (e.g. register file, program counter, +control registers), the bit encodings of each instruction, the instruction +fields, and the semantic definitions for each instruction. Pydgin aims to make +this process as painless as possible by providing helper classes and functions +where possible.

                    +

                    For example, below we provide a truncated example of the ARMv5 instruction +encoding table. Pydgin maintains encodings of all instructions in a centralized +encodings data structure for easy maintenance and quick lookup. The +user-provided instruction names and bit encodings are used to automatically +generate decoders for the simulator. Unlike many ADLs, Pydgin does not require +that the user explicitly specify instruction types or mask bits for field +matching because the Pydgin decoder generator can automatically infer decoder +fields from the encoding table.

                    +
                    +encodings = [
                    +  ['adc',      'xxxx00x0101xxxxxxxxxxxxxxxxxxxxx'],
                    +  ['add',      'xxxx00x0100xxxxxxxxxxxxxxxxxxxxx'],
                    +  ['and',      'xxxx00x0000xxxxxxxxxxxxxxxxxxxxx'],
                    +  ['b',        'xxxx1010xxxxxxxxxxxxxxxxxxxxxxxx'],
                    +  ['bl',       'xxxx1011xxxxxxxxxxxxxxxxxxxxxxxx'],
                    +  ['bic',      'xxxx00x1110xxxxxxxxxxxxxxxxxxxxx'],
                    +  ['bkpt',     '111000010010xxxxxxxxxxxx0111xxxx'],
                    +  ['blx1',     '1111101xxxxxxxxxxxxxxxxxxxxxxxxx'],
                    +  ['blx2',     'xxxx00010010xxxxxxxxxxxx0011xxxx'],
                    +  # ...
                    +  ['teq',      'xxxx00x10011xxxxxxxxxxxxxxxxxxxx'],
                    +  ['tst',      'xxxx00x10001xxxxxxxxxxxxxxxxxxxx'],
                    +]
                    +
                    +

                    A major goal of Pydgin was ensuring instruction semantic definitions map to ISA +manual specifications as much as possible. The code below shows one such +definition for the ARMv5 add instruction. +A user-defined Instruction class (not shown) specifies field names that can +be used to conveniently access bit positions within an instruction (e.g. +rd, rn, S). +Additionally, users can choose to define their own helper functions, such as +the condition_passed function, to create more concise syntax that better +matches the ISA manual.

                    +
                    +def execute_add( s, inst ):
                    +  if condition_passed( s, inst.cond() ):
                    +    a,   = s.rf[ inst.rn() ]
                    +    b, _ = shifter_operand( s, inst )
                    +    result = a + b
                    +    s.rf[ inst.rd() ] = trim_32( result )
                    +
                    +    if inst.S():
                    +      if inst.rd() == 15:
                    +        raise FatalError('Writing SPSR not implemented!')
                    +      s.N = (result >> 31)&1
                    +      s.Z = trim_32( result ) == 0
                    +      s.C = carry_from( result )
                    +      s.V = overflow_from_add( a, b, result )
                    +
                    +    if inst.rd() == 15:
                    +      return
                    +
                    +  s.rf[PC] = s.fetch_pc() + 4
                    +
                    +

                    Compared to the ARM ISA Reference manual shown below, the Pydgin instruction +definition is a fairly close match. Pydgin's definitions could certainly be +made more concise by using a custom DSL, however, this would lose many of the +debugging benefits afforded to a well-supported language such as Python and +additionally require using a custom parser that would likely need modification +for each new ISA.

                    +
                    +if ConditionPassed(cond) then
                    +   Rd = Rn + shifter_operand
                    +   if S == 1 and Rd == R15 then
                    +     if CurrentModeHasSPSR() then CPSR = SPSR
                    +   else UNPREDICTABLE else if S == 1 then
                    +     N Flag = Rd[31]
                    +     Z Flag = if Rd == 0 then 1 else 0
                    +     C Flag = CarryFrom(Rn + shifter_operand)
                    +     V Flag = OverflowFrom(Rn + shifter_operand)
                    +
                    +

                    Creating an ISS that can run real applications is a rather complex task, even +for a bare metal simulator with no operating system such as Pydgin. +Each system call in the C library must be properly implemented, and +bootstrapping code must be provided to set up the program stack and +architectural state. +This is a very tedious and error prone process which Pydgin tries to +encapsulate so that it remains as transparent to the end user as possible. +In future versions of Pydgin we hope to make bootstrapping more painless and +support a wider variety of C libraries.

                    + + +
                    +
                    +

                    Pydgin Performance

                    +

                    In order to achieve good simulation performance from Pydgin ISSs, significant +work went into adding appropriate JIT annotations to the Pydgin library +components. +These optimization hints, which allow the JIT generated by the RPython +translation toolchain to produce more efficient code, have been specifically +selected for the unique properties of ISSs. +For the sake of brevity, we do not talk about the exact optimizations here but +a detailed discussion can be found in the ISPASS paper [1]. +In the paper we evaluate two ISSs, one for a simplified MIPS ISA and another +for the ARMv5 ISA, whereas below we only discuss results for the ARMv5 ISS.

                    +

                    The performance of Pydgin-generated ARMv5 ISSs was compared against +several reference ISSs: the gem5 ARM atomic simulator (gem5), +interpretive and JIT-enabled versions of SimIt-ARM (simit-nojit and +simit-jit), and QEMU. +Atomic models from the gem5 simulator were chosen for comparison due to their wide +usage amongst computer architects [g]. +SimIt-ARM was selected because it is currently the highest performance +ADL-generated DBT-ISS publicly available. +QEMU has long been held as the gold-standard for DBT simulators due to its +extremely high performance, however, QEMU is generally intended for usage as an +emulator rather than a simulator [c] and therefore achieves its excellent +performance at the cost of observability. +Unlike QEMU, all other simulators in our study faithfully track architectural +state at an instruction level rather than block level. +Pydgin ISSs were generated with and without JITs using the RPython translation +toolchain in order to help quantify the performance benefit of the meta-tracing +JIT.

                    +

                    The figure below shows the performance of each ISS executing applications from +the SPEC CINT2006 benchmark suite [h]. +Benchmarks were run to completion on the high-performance DBT-ISSs +(simit-jit, pydgin-jit, and QEMU), but were terminated after only +10 billion simulated instructions for the non-JITed interpretive ISSs +(these would require many hours, in some cases days, to run to completion). +Simulation performance is measured in MIPS [i] and plotted on a log +scale due to the wide variance in performance. +The WHMEAN group summarizes each ISS's performance across all benchmarks +using the weighted harmonic mean.

                    + +
                    + +

                    A few points to take away from these results:

                    +
                      +
                    • ISSs without JITs (gem5, simit-nojit, and pydgin-nojit) demonstrate +relatively consistent performance across applications, whereas ISSs with JITs +(simit-jit, pydgin-jit, and QEMU) demonstrate much greater +performance variability from application-to-application.
                    • +
                    • The gem5 atomic model demonstrates particularly miserable performance, only +2-3 MIPS!
                    • +
                    • QEMU lives up to its reputation as a gold-standard for simulator performance, +leading the pack on nearly every benchmark and reaching speeds of 240-1120 +MIPS.
                    • +
                    • +pydgin-jit is able to outperform simit-jit on four of the +applications, including considerable performance improvements of 1.44–1.52× +for the applications 456.hmmer, 462.libquantum, and 471.omnetpp +(managing to even outperform QEMU on 471.omnetpp).
                    • +
                    • +simit-jit is able to obtain much more consistent performance (230-459 +MIPS across all applications) than pydgin-jit (9.6-659 MIPS). This is +due to simit-jit's page-based approach to JIT optimization compared to +pydgin-jit's tracing-based approach.
                    • +
                    • +464.h264ref displays particularly bad pathological behavior in Pydgin’s +tracing JIT and is the only application to perform worse on pydgin-jit +than pydgin-nojit (9.6 MIPS vs. 21 MIPS).
                    • +
                    +

                    The pathological behavior demonstrated by 464.h264ref was of particular +concern because it caused pydgin-jit to perform even worse than having no +JIT at all. RPython JIT logs indicated that the reason for this performance +degradation was a large number of tracing aborts due to JIT traces growing too +long. However, time limitations before the publication deadline prevented us +from investigating this issue thoroughly.

                    +

                    Since the deadline we've applied some minor bug fixes and made some small +improvements in the memory representation. +More importantly, we've addressed the performance degradation in 464.h264ref +by increasing trace lengths for the JIT. +Below we show how the performance of 464.h264ref changes as the +trace_limit parameter exposed by the RPython JIT is varied from the default +size of 6000 operations.

                    + + +
                    + +

                    By quadrupling the trace limit we achieve an 11x performance improvement in +464.h264ref. +The larger trace limit allows the JIT to optimize long code paths that were +previously triggering trace aborts, greatly helping amortize the costs of +tracing. +Note that arbitrarily increasing this limit can potentially hurt performance if +longer traces are not able to detect optimizable code sequences.

                    +

                    After performing similar experiments across the applications in the SPEC +CINT2006 benchmark suite, we settled on a trace limit of 400,000 operations. +In the figure below we show how the updated Pydgin ISS (pydgin-400K) improves +performance across all benchmarks and fixes the performance degradation +previously seen in 464.h264ref. Note that the non-JITted simulators have been +removed for clarity, and simulation performance is now plotted on a +linear scale to more clearly distinguish the performance gap between +each ISS.

                    + +
                    + +

                    With these improvements, we are now able to beat simit-jit on all but two +benchmarks. In future work we hope to further close the gap with QEMU as well.

                    +
                    +
                    +

                    Conclusions and Future Work

                    +

                    Pydgin demonstrates that the impressive work put into the RPython translation +toolchain, designed to simplify the process of building fast dynamic-language +VMs, can also be leveraged to build fast instruction set simulators. +Our prototype ARMv5 ISS shows that Pydgin can generate ISSs with performance +competitive to SimIt-ARM while also providing a more productive development +experience: RPython allowed us to develop Pydgin with only four person-months +of work. +Another significant benefit of the Pydgin approach is that any performance +improvements applied to the RPython translation toolchain immediately benefit +Pydgin ISSs after a simple software download and retranslation. +This allows Pydgin to track the continual advances in JIT technology introduced +by the PyPy development team.

                    +

                    Pydgin is very much a work in progress. There are many features we would like +to add, including:

                    +
                      +
                    • more concise syntax for accessing arbitrary instruction bits
                    • +
                    • support for other C libraries such as glibc, uClibc, and musl +(we currently only support binaries compiled with newlib)
                    • +
                    • support for self-modifying code
                    • +
                    • features for more productive debugging of target applications
                    • +
                    • ISS descriptions for other ISAs such as RISC-V, ARMv8, and x86
                    • +
                    • automatic generation of compilers and toolchains from Pydgin descriptions
                    • +
                    +

                    In addition, we think there are opportunities for even greater performance +improvements with more advanced techniques such as:

                    +
                      +
                    • automatic generation of optimized instruction decoders
                    • +
                    • optimizations for floating-point intensive applications
                    • +
                    • multiple tracing-JITs for parallel simulation of multicore SOCs
                    • +
                    • a parallel JIT compilation engine as proposed by Böhm et al. [3] +
                    • +
                    +

                    We hope that Pydgin can be of use to others, so if you try it out please let us +know what you think. Feel free to contact us if you find any of the above +development projects interesting, or simply fork the project on GitHub and hack +away!

                    +

                    -- Derek Lockhart and Berkin Ilbeyi

                    +
                    +
                    +

                    Acknowledgements

                    +

                    We would like to sincerely thank Carl Friedrich Bolz and Maciej Fijalkowski for their feedback on the Pydgin publication and their guidance on improving the JIT performance of our simulators. We would also like to thank the whole PyPy team for their incredible work on PyPy and the RPython translation toolchain. Finally, thank you to our research advisor, Prof. Christopher Batten, and the sponsors of this work which include the National Science Foundation, the Defense Advanced Research Projects Agency, and Intel Corporation.

                    +
                    +
                    +

                    Footnotes

                    + ++++ + + + + +
                    [a]Pydgin loosely stands for [Py]thon [D]SL for [G]enerating +[In]struction set simulators and is pronounced the same as “pigeon”. The +name is inspired by the word “pidgin” which is a grammatically simplified +form of language and captures the intent of the Pydgin embedded-ADL. +https://github.com/cornell-brg/pydgin +
                    + ++++ + + + + +
                    [b]Popular instruction set architectures (ISAs) include MIPS, ARM, +x86, and more recently RISC-V
                    + ++++ + + + + +
                    [c] +(1, 2) For a good discussion of simulators vs. emulators, please see the +following post on StackOverflow: +https://stackoverflow.com/questions/1584617/simulator-or-emulator-what-is-the-difference +
                    + ++++ + + + + +
                    [d]https://en.wikipedia.org/wiki/Dark_silicon
                    + ++++ + + + + +
                    [e]Please see the Pydgin paper for a more detailed discussion of prior work.
                    + ++++ + + + + +
                    [f] +

                    For more examples of Pydgin ISA specifications, please see the ISPASS +paper [1] or the Pydgin source code on GitHub.

                    +

                    Pydgin instruction definitions for a simple MIPS-inspired ISA can be +found here:

                    + +

                    Pydgin instruction definitions for a simplified ARMv5 ISA can be found +here:

                    + +
                    + ++++ + + + + +
                    [g] +

                    gem5 is a cycle-level simulation framework that contains both +functional-level (atomic) and cycle-level processor models. Although +primarily used for detailed, cycle-approximate processor simulation, +gem5's atomic model is a popular tool for many ISS tasks.

                    + +
                    + ++++ + + + + +
                    [h]All performance measurements were taken on an unloaded server-class +machine.
                    + ++++ + + + + +
                    [i]Millions of instructions per second.
                    +
                    +
                    +

                    References

                    + ++++ + + + + +
                    [1] +(1, 2, 3)

                    Derek Lockhart, Berkin Ilbeyi, and Christopher Batten. "Pydgin: +Generating Fast Instruction Set Simulators from Simple Architecture +Descriptions with Meta-Tracing JIT Compilers." IEEE Int'l Symp. on +Performance Analysis of Systems and Software (ISPASS), Mar. 2015.

                    + +
                    + ++++ + + + + +
                    [2] +

                    Derek Lockhart, Gary Zibrat, and Christopher Batten. "PyMTL: A Unified +Framework for Vertically Integrated Computer Architecture Research." 47th +ACM/IEEE Int'l Symp. on Microarchitecture (MICRO-47), Dec. 2014.

                    + +
                    + ++++ + + + + +
                    [3]I. Böhm, B. Franke, and N. Topham. Generalized Just-In-Time Trace +Compilation Using a Parallel Task Farm in a Dynamic Binary Translator. +ACM SIGPLAN Conference on Programming Language Design and Implementation +(PLDI), Jun 2011.
                    +
                    + +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Anonymous wrote on 2015-03-30 12:14: +
                    + +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/03/pypy-251-released-5657064769385723517.html b/posts/2015/03/pypy-251-released-5657064769385723517.html new file mode 100644 index 000000000..a0d1869c3 --- /dev/null +++ b/posts/2015/03/pypy-251-released-5657064769385723517.html @@ -0,0 +1,431 @@ + + + + + +PyPy 2.5.1 Released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy 2.5.1 Released

                    + + + +
                    +
                    +
                    + +
                    +

                    +PyPy 2.5.1 - Pineapple Bromeliad

                    +We’re pleased to announce PyPy 2.5.1, Pineapple Bromeliad following on the heels of 2.5.0. You can download the PyPy 2.5.1 release here:
                    + +
                    +We would like to thank our donors for the continued support of the PyPy +project, and for those who donate to our three sub-projects, as well as our +volunteers and contributors. +We’ve shown quite a bit of progress, but we’re slowly running out of funds. +Please consider donating more, or even better convince your employer to donate, +so we can finish those projects! The three sub-projects are:
                      +
                    • +
                      +
                      +Py3k (supporting Python 3.x): We have released a Python 3.2.5 compatible version we call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version
                      +
                       
                      +
                      +
                    • +
                    • +
                      +STM (software transactional memory): We have released a first working version, +and continue to try out new promising paths of achieving a fast multithreaded Python
                      +
                      +
                      +
                      +
                    • +
                    • +
                      +NumPy which requires installation of our fork of upstream numpy, +available on bitbucket +
                      +
                    • +
                    +We would also like to encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on pypy, or general help with making +RPython’s JIT even better.

                    +

                    +What is PyPy?

                    +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.

                    + +This release supports x86 machines on most common operating systems +(Linux 32/64, Mac OS X 64, Windows, and OpenBSD), +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.

                    +While we support 32 bit python on Windows, work on the native Windows 64 +bit python is still stalling, we would welcome a volunteer +to handle that.

                    +
                    +
                    +

                    +Highlights

                    +
                      +
                    • The past months have seen pypy mature and grow, as rpython becomes the go-to +solution for writing fast dynamic language interpreters. Our separation of +RPython from the python interpreter PyPy is now much clearer in the +PyPy documentation and we now have separate RPython documentation. +Tell us what still isn’t clear, or even better help us improve the documentation.
                    • +
                    +
                    +
                    +
                      +
                    • We merged version 2.7.9 of python’s stdlib. From the python release notice:
                        +
                      • The entirety of Python 3.4’s ssl module has been backported. +See PEP 466 for justification.
                      • +
                      • HTTPS certificate validation using the system’s certificate store is now +enabled by default. See PEP 476 for details.
                      • +
                      • SSLv3 has been disabled by default in httplib and its reverse dependencies +due to the POODLE attack.
                      • +
                      • The ensurepip module has been backported, which provides the pip +package manager in every Python 2.7 installation. See PEP 477.
                      • +
                      +
                      +
                    • +
                    • The garbage collector now ignores parts of the stack which did not change +since the last collection, another performance boost
                    • +
                    +
                      +
                    • errno and LastError are saved around cffi calls so things like pdb will not +overwrite it
                    • +
                    +
                      +
                    • We continue to asymptotically approach a score of 7 times faster than cpython +on our benchmark suite, we now rank 6.98 on latest runs
                    • +
                    + +Please try it out and let us know what you think. We welcome +success stories, experiments, or benchmarks, we know you are using PyPy, please tell us about it!
                    + +Cheers
                    + +The PyPy Team
                    +
                    +
                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Anonymous wrote on 2015-03-26 11:10: +
                    +
                    +

                    You mentioned about speed of PyPy over CPython. I'm interesting in memory footprint too in addition to speed up. Please, add to speed.pypy.org memory footprint metric. It's harder to find cheap and huge amount of memory for VPS than slow old cpu. Nice to know minimal memory requirements for django sites on pypy.

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-03-26 11:15: +
                    +
                    +

                    Is scores from speed.pypy.org applied to PyPy3 too? Later it was written PyPy3 was not fast as PyPy2.

                    +
                    +
                    +
                    +
                    + + Maciej Fijalkowski wrote on 2015-03-26 11:56: +
                    +
                    +

                    Memory footprint is tricky to measure. PyPy usually starts at 60M (as opposed to say 6 for cpython), but then data structures are smaller. We'll try to get some measurments going on some point. Benchmarking is hard :-)

                    No, PyPy3 is not as fast as PyPy2. We should really look into it at some point.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/03/pypy-stm-251-released-1342113838236225773.html b/posts/2015/03/pypy-stm-251-released-1342113838236225773.html new file mode 100644 index 000000000..631f03feb --- /dev/null +++ b/posts/2015/03/pypy-stm-251-released-1342113838236225773.html @@ -0,0 +1,434 @@ + + + + + +PyPy-STM 2.5.1 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy-STM 2.5.1 released

                    + + + +
                    +

                    PyPy-STM 2.5.1 - Mawhrin-Skel

                    + +

                    We're pleased to announce PyPy-STM 2.5.1, codenamed Mawhrin-Skel. +This is the second official release of PyPy-STM. You can download +this release here (64-bit Linux only):

                    +
                    +https://pypy.org/download.html +
                    +

                    Documentation:

                    +
                    +https://pypy.readthedocs.org/en/latest/stm.html +
                    +

                    PyPy is an implementation of the Python programming language which focuses +on performance. So far we've been relentlessly optimizing for the single +core/process scenario. PyPy STM brings to the table a version of PyPy +that does not have the infamous Global Interpreter Lock, hence can run +multiple threads on multiple cores. Additionally it comes with a set +of primitives that make writing multithreaded applications a lot easier, +as explained below (see TransactionQueue) and in the documentation.

                    +

                    Internally, PyPy-STM is based on the Software Transactional Memory +plug-in called stmgc-c7. This version comes with a relatively +reasonable single-core overhead but scales only up to around 4 cores +on some examples; the next version of the plug-in, stmgc-c8, is in +development and should address that limitation (as well as reduce the +overhead). These versions only support 64-bit Linux; we'd welcome +someone to port the upcoming stmgc-c8 to other (64-bit) platforms.

                    +

                    This release passes all regular PyPy tests, except for a few +special cases. In other words, you should be able to drop in +PyPy-STM instead of the regular PyPy and your program should still +work. See current status for more information.

                    +

                    This work was done by Remi Meier and Armin Rigo. Thanks to all donors +for crowd-funding the STM work so far! As usual, it took longer +than we would have thought. I really want to thank the people that +kept making donations anyway. Your trust is greatly appreciated!

                    +
                    +

                    What's new?

                    +

                    Compared to the July 2014 release, the main addition is a way to +get reports about STM conflicts. This is an essential new feature.

                    +

                    To understand why this is so important, consider that if you already +played around with the previous release, chances are that you didn't +get very far. It probably felt like a toy: on very small examples it +would nicely scale, but on any larger example it would not scale at +all. You didn't get any feedback about why, but the underlying reason +is that, in a typical large example, there are some STM conflicts that +occur all the time and that won't be immediately found just by +thinking. This prevents any parallelization.

                    +

                    Now PyPy-STM is no longer a black box: you have a way to learn about +these conflicts, fix them, and try again. The tl;dr version is to run:

                    +
                    +    PYPYSTM=stmlog ./pypy-stm example.py
                    +    ./print_stm_log.py stmlog
                    +
                    +

                    More details in the STM user guide.

                    +
                    +
                    +

                    Performance

                    +

                    The performance is now more stable than it used to be. More +precisely, the best case is still "25%-40% single-core slow-down with +very good scaling up to 4 threads", but the average performance seems +not too far from that. There are still dark spots --- notably, the +JIT is still slower to warm up, though it was improved a lot. These +are documented in the current status section. Apart from +that, we should not get more than 2x single-core slow-down in the +worst case. Please report such cases as bugs!

                    +
                    +
                    +

                    TransactionQueue

                    +

                    As explained before, PyPy-STM is more than "just" a Python without +GIL. It is a Python in which you can do minor tweaks to your +existing, non-multithreaded programs and get them to use multiple +cores. You identify medium- or large-sized, likely-independent parts +of the code and ask PyPy-STM to run these parts in parallel. An +example would be every iteration of some outermost loop over all items +of a dictionary. This is done with a new API: +transaction.TransactionQueue(). See help(TransactionQueue) or +read more about it in the STM user guide.

                    +

                    This is not a 100% mechanical change: very likely, you need to hunt +for and fix "STM conflicts" that prevent parallel execution (see +docs). However, at all points your program runs correctly, and you +can stop the hunt when you get acceptable performance. You don't get +deadlocks or corrupted state.

                    +
                    +

                    Thanks for reading!
                    +Armin, Remi, Fijal

                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Unknown wrote on 2015-03-31 09:45: +
                    +
                    +

                    From your explanation in this post, STM sounds similar to OpenMP. Can you explain the differences?

                    → https://openmp.org/wp/openmp-specifications/

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-03-31 10:20: +
                    +
                    +

                    This is explained in https://pypy.readthedocs.org/en/latest/stm.html#how-to-write-multithreaded-programs-the-10-000-feet-view

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-03-31 15:14: +
                    +
                    +

                    Nice - thanks!

                    »TransactionQueue is in part similar: your program needs to have “some chances” of parallelization before you can apply it. But I believe that the scope of applicability is much larger with TransactionQueue than with other approaches. It usually works without forcing a complete reorganization of your existing code, and it works on any Python program which has got latent and imperfect parallelism. Ideally, it only requires that the end programmer identifies where this parallelism is likely to be found«

                    If I understand that correctly, for STM the parallelism only needs to be likely and can be imperfect, because it can recover from errors.

                    This would fix a whole class of problems I experienced in OpenMP Fortran code: Turning a crash or (worse) undefined behavior into a mere performance loss - and that’s really cool!

                    Thank you for working on that!

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-04-23 13:07: +
                    +
                    +

                    Why do you always ask for money if nothing actually works?

                    +
                    +
                    +
                    +
                    + + Maciej Fijalkowski wrote on 2015-04-23 15:17: +
                    +
                    +

                    the alternative is to ask for money for stuff that already works, and that's a terrible strategy. suggest better alternatives

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-04-24 00:53: +
                    +
                    +

                    Your comment suggests PyPy-STM doesn't actually work for you. If you have found a bug, please contribute a bug report, even if only if you have an example of program that should parallelize and doesn't; such bug reports are very useful. Alternatively, you're complaining that PyPy-STM is useless for you. Maybe I've been bad at explaining what you should expect and not expect from it in the first place, so I've given you wrong expectations. In that case, sorry. (The 3rd alternative would be that you're just trolling, but let's discard it for now.)

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/05/cffi-10-beta-1-4375652711495636911.html b/posts/2015/05/cffi-10-beta-1-4375652711495636911.html new file mode 100644 index 000000000..9cf1a1b08 --- /dev/null +++ b/posts/2015/05/cffi-10-beta-1-4375652711495636911.html @@ -0,0 +1,476 @@ + + + + + +CFFI 1.0 beta 1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    CFFI 1.0 beta 1

                    + + + +
                    +

                    Finally! CFFI 1.0 is almost ready. CFFI gives Python developers a convenient way to call external C libraries. Here "Python" == "CPython or PyPy", but this post is mostly about the CPython side of CFFI, as the PyPy version is not ready yet.

                    +

                    On CPython, you can download the version +"1.0.0b1" either by looking for the cffi-1.0 branch in +the repository, or by +saying

                    + +pip install "cffi>=1.0.dev0" + +

                    (Until 1.0 final is ready, +pip install cffi will still give you version 0.9.2.)

                    +

                    The main news: you can now explicitly generate and compile a CPython C +extension module from a "build" script. Then in the rest of your +program or library, you no longer need to import cffi at all. +Instead, you simply say:

                    +
                    +from _my_custom_module import ffi, lib
                    +
                    +

                    Then you use ffi and lib just like you did in your +verify()-based project in CFFI 0.9.2. (The lib is what used to +be the result of verify().) The details of how you use them +should not have changed at all, so that the rest of your program should +not need any update.

                    +
                    +

                    Benefits

                    +

                    This is a big step towards standard practices for making and +distributing Python packages with C extension modules:

                    +
                      +
                    • on the one hand, you need an explicit compilation step, triggered +here by running the "build" script;
                    • +
                    • on the other hand, what you gain in return is better control over +when and why the C compilation occurs, and more standard ways to write +distutils- or setuptools-based setup.py files (see below).
                    • +
                    +

                    Additionally, this completely removes one of the main drawbacks of using +CFFI to interface with large C APIs: the start-up time. In some cases +it could be extreme on slow machines (cases of 10-20 seconds on ARM +boards occur commonly). Now, the import above is instantaneous.

                    +

                    In fact, none of the pure Python cffi package is needed any more at +runtime (it needs only an internal extension module from CFFI, which +can be installed by doing "pip install cffi-runtime" [*] if you only need that). +The ffi object you get by the import above is of a +completely different class written entirely in C. The two +implementations might get merged in the future; for now they are +independent, but give two compatible APIs. The differences are that +some methods like cdef() and verify() and set_source() are +omitted from the C version, because it is supposed to be a complete FFI +already; and other methods like new(), which take as parameter a +string describing a C type, are faster now because that string is parsed +using a custom small-subset-of-C parser, written in C too.

                    +
                    +
                    +

                    In practice

                    +

                    CFFI 1.0 beta 1 was tested on CPython 2.7 and 3.3/3.4, on Linux and to +some extent on Windows and OS/X. Its PyPy version is not ready yet, +and the only docs available so far are those below.

                    +

                    This is beta software, so there might be bugs and details may change. We are interested in hearing any feedback (irc.freenode.net #pypy) or bug reports.

                    +

                    To use the new features, create a source file that is not imported by the rest of +your project, in which you place (or move) the code to build the FFI +object:

                    +
                    +# foo_build.py
                    +import cffi
                    +ffi = cffi.FFI()
                    +
                    +ffi.cdef("""
                    +    int printf(const char *format, ...);
                    +""")
                    +
                    +ffi.set_source("_foo", """
                    +    #include <stdio.h>
                    +""")   # and other arguments like libraries=[...]
                    +
                    +if __name__ == '__main__':
                    +    ffi.compile()
                    +
                    +

                    The ffi.set_source() replaces the ffi.verify() of CFFI 0.9.2. +Calling it attaches the given source code to the ffi object, but this call doesn't +compile or return anything by itself. It may be placed above the ffi.cdef() +if you prefer. Its first argument is the name of the C extension module +that will be produced.

                    +

                    Actual compilation (including generating the complete C sources) occurs +later, in one of two places: either in ffi.compile(), shown above, +or indirectly from the setup.py, shown next.

                    +

                    If you directly execute the file foo_build.py above, it will +generate a local file _foo.c and compile it to _foo.so (or the +appropriate extension, like _foo.pyd on Windows). This is the +extension module that can be used in the rest of your program by saying +"from _foo import ffi, lib".

                    +
                    +
                    +

                    Distutils

                    +

                    If you want to distribute your program, you write a setup.py using +either distutils or setuptools. Using setuptools is generally +recommended nowadays, but using distutils is possible too. We show the distutils version +first:

                    +
                    +# setup.py
                    +from distutils.core import setup
                    +import foo_build
                    +
                    +setup(
                    +    name="example",
                    +    version="0.1",
                    +    py_modules=["example"],
                    +    ext_modules=[foo_build.ffi.distutils_extension()],
                    +)
                    +
                    +

                    This is similar to the CFFI 0.9.2 way. It only works if cffi was +installed previously, because otherwise foo_build cannot be +imported. The difference is that you use ffi.distutils_extension() +instead of ffi.verifier.get_extension(), because there is no longer +any verifier object if you use set_source().

                    +
                    +
                    +

                    Setuptools

                    +

                    The modern way is to write setup.py files based on setuptools, which +can (among lots of other things) handle dependencies. It is what you +normally get with pip install, too. Here is how you'd write it:

                    +
                    +# setup.py
                    +from setuptools import setup
                    +
                    +setup(
                    +    name="example",
                    +    version="0.1",
                    +    py_modules=["example"],
                    +    setup_requires=["cffi>=1.0.dev0"],
                    +    cffi_modules=["foo_build:ffi"],
                    +    install_requires=["cffi-runtime"],    # see [*] below
                    +)
                    +
                    +

                    Note that "cffi" is mentioned on three lines here:

                    +
                      +
                    • the first time is in setup_requires, which means that cffi will +be locally downloaded and used for the setup.
                    • +
                    • the second mention is a custom cffi_modules argument. This +argument is handled by cffi as soon as it is locally downloaded. It +should be a list of "module:ffi" strings, where the ffi part +is the name of the global variable in that module.
                    • +
                    • the third mention is in install_requires. It means that in +order to install this example package, "cffi-runtime" must also be +installed. This is (or will be) a PyPI entry that only contains a +trimmed down version of CFFI, one that does not include the pure +Python "cffi" package and its dependencies. None of it is needed at +runtime.
                    • +
                    +

                    [*] NOTE: The "cffi-runtime" PyPI entry is not ready yet. For now, use "cffi>=1.0.dev0" instead. Considering PyPy, which has got a built-in "_cffi_backend" module, the "cffi-runtime" package could never be upgraded there; but it would still be nice if we were able to upgrade the "cffi" pure Python package on PyPy. This might require some extra care in writing the interaction code. We need to sort it out now...

                    +
                    +
                    +

                    Thanks

                    +

                    Special thanks go to the PSF (Python Software Foundation) for their +financial support, without which this work---er... it might likely have occurred anyway, but at an unknown future date :-)

                    +

                    (For reference, the amount I asked for (and got) is equal to one +month of what a Google Summer of Code student gets, for work that will +take a bit longer than one month. At least I personally am running mostly +on such money, and so I want to thank the PSF again for their +contribution to CFFI---and while I'm at it, thanks to all other +contributors to PyPy---for making this job more than an unpaid hobby on +the side :-)

                    +

                    Armin Rigo

                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Mahmoud wrote on 2015-05-05 20:59: +
                    +
                    +

                    This is great news! We're loving using CFFI via cryptography and PyOpenSSL.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-05-05 21:37: +
                    +
                    +

                    An easier way to install cffi 1.0 beta releases is with

                    pip install --pre cffi

                    The --pre flag indicates pre-releases are acceptable for installation.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-05-06 08:54: +
                    +
                    +

                    That's great news! Hard to read though if you're not familiar with CFFI behaviour from before.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/05/cffi-101-released-756545636419794802.html b/posts/2015/05/cffi-101-released-756545636419794802.html new file mode 100644 index 000000000..4e445807c --- /dev/null +++ b/posts/2015/05/cffi-101-released-756545636419794802.html @@ -0,0 +1,345 @@ + + + + + +CFFI 1.0.1 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    CFFI 1.0.1 released

                    + + + +
                    +

                    CFFI 1.0.1 final has now been released for CPython! CFFI is a (CPython and PyPy) module to interact with C code from Python.

                    +

                    The main news from CFFI 0.9 is the new way to build extension modules: +the "out-of-line" mode, where you have a separate build script. When +this script is executed, it produces the extension module. This comes +with associated Setuptools support that fixes the headache of +distributing your own CFFI-using packages. It also massively cuts +down the import times.

                    +

                    Although this is a major new version, it should be fully +backward-compatible: existing projects should continue to work, in +what is now called the "in-line mode".

                    +

                    The documentation has been reorganized and split into a few pages. +For more information about this new "out-of-line" mode, as well as +more general information about what CFFI is and how to use it, read the Goals and proceed to +the Overview.

                    +

                    Unlike the 1.0 beta 1 version, the final version also supports an out-of-line mode for projects using ffi.dlopen(), instead of only +ffi.verify().

                    +

                    PyPy support: PyPy needs integrated support for efficient JITting, +so you cannot install a different version of CFFI on top of an +existing PyPy. You need to wait for the upcoming PyPy 2.6 to use +CFFI 1.0---or get a nightly build.

                    +

                    My thanks again to the PSF (Python Software Foundation) for their +financial support!

                    + +UPDATE:

                    Bug with the first example "ABI out-of-line": variadic functions (like printf, ending in a "..." argument) crash. Fixed in CFFI 1.0.2.

                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Unknown wrote on 2015-05-22 17:21: +
                    +
                    +

                    it's really great!

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-05-22 23:32: +
                    +
                    +

                    Awesome! Thanks for this. I think is the best way to make extension modules for cpython and pypy.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-05-22 23:33: +
                    +
                    +

                    Awesome! Thanks for this. I think is the best way to make extension modules for cpython and pypy.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/06/pypy-260-release-8983050552628070433.html b/posts/2015/06/pypy-260-release-8983050552628070433.html new file mode 100644 index 000000000..8a3c3801b --- /dev/null +++ b/posts/2015/06/pypy-260-release-8983050552628070433.html @@ -0,0 +1,438 @@ + + + + + +PyPy 2.6.0 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy 2.6.0 release

                    + + + +
                    +
                    +
                    +
                    +

                    +PyPy 2.6.0 - Cameo Charm

                    +
                    +We’re pleased to announce PyPy 2.6.0, only two months after PyPy 2.5.1. We are particularly happy to update cffi to version 1.1, which makes the popular ctypes-alternative even easier to use, and to support the new vmprof statistical profiler.
                    +
                    +
                    +
                    +You can download the PyPy 2.6.0 release here:
                    +
                    +
                    +
                    + +
                    +
                    +
                    +
                    +We would like to thank our donors for the continued support of the PyPy project, and for those who donate to our three sub-projects, as well as our volunteers and contributors.
                    +
                    +
                    +
                    +Thanks also to Yury V. Zaytsev and David Wilson who recently started running nightly builds on Windows and MacOSX buildbots.
                    +
                    +
                    +
                    +We’ve shown quite a bit of progress, but we’re slowly running out of funds. Please consider donating more, or even better convince your employer to donate, so we can finish those projects! The three sub-projects are:
                    +
                    +
                    +
                      +
                    • +Py3k (supporting Python 3.x): We have released a Python 3.2.5 compatible version we call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version
                    • +
                    • +STM (software transactional memory): We have released a first working version, and continue to try out new promising paths of achieving a fast multithreaded Python
                    • +
                    • +NumPy which requires installation of our fork of upstream numpy, available on bitbucket +
                    • +
                    +
                    +
                    +
                    +We would also like to encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better. Nine new people contributed since the last release, you too could be one of them.
                    +
                    +

                    +What is PyPy?

                    +
                    +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                    +
                    +
                    +
                    +This release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows, OpenBSD, freebsd), as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.
                    +
                    +
                    +
                    +While we support 32 bit python on Windows, work on the native Windows 64 bit python is still stalling, we would welcome a volunteer to handle that. We also welcome developers with other operating systems or dynamic languages to see what RPython can do for them.
                    +
                    +
                    +
                    +
                    +

                    +Highlights

                    +
                      +
                    • Python compatibility:
                        +
                      • Improve support for TLS 1.1 and 1.2
                      • +
                      • Windows downloads now package a pypyw.exe in addition to pypy.exe
                      • +
                      • Support for the PYTHONOPTIMIZE environment variable (impacting builtin’s __debug__ property)
                      • +
                      • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy.
                      • +
                      +
                    • +
                    • New features:
                        +
                      • Add preliminary support for a new lightweight statistical profiler vmprof, which has been designed to accommodate profiling JITted code
                      • +
                      +
                    • +
                    • Numpy:
                        +
                      • Support for object dtype via a garbage collector hook
                      • +
                      • Support for .can_cast and .min_scalar_type as well as beginning a refactoring of the internal casting rules
                      • +
                      • Better support for subtypes, via the __array_interface__, __array_priority__, and __array_wrap__ methods (still a work-in-progress)
                      • +
                      • Better support for ndarray.flags
                      • +
                      +
                    • +
                    • Performance improvements:
                        +
                      • Slight improvement in frame sizes, improving some benchmarks
                      • +
                      • Internal refactoring and cleanups leading to improved JIT performance
                      • +
                      +
                        +
                      • Improved IO performance of zlib and bz2 modules
                      • +
                      • We continue to improve the JIT’s optimizations. Our benchmark suite is now over 7 times faster than cpython
                      • +
                      +
                    • +
                    +
                    +
                    +
                    +
                    +Please try it out and let us know what you think. We welcome success stories, experiments, or benchmarks, we know you are using PyPy, please tell us about it!
                    +Cheers
                    +The PyPy Team
                    +
                    +
                    +

                    +
                    +
                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + mattip wrote on 2015-06-01 16:32: +
                    +
                    +

                    PyPy 2.6.0 - Cameo Charm since PyPy looks best in profile (well, vmprof anyway)

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-06-01 17:57: +
                    +
                    +

                    How is matplotlib state in numpypy ?

                    +
                    +
                    +
                    +
                    + + mattip wrote on 2015-06-02 10:51: +
                    +
                    +

                    No GUI backend, but this fork should work (version 1.4) for non-interactive plotting
                    https://github.com/mattip/matplotlib
                    You will need to install our fork of numpy as a prerequisite
                    https://bitbucket.org/pypy/numpy

                    Help with the cffi port of WxPython could get us a GUI backend (or a updated matplotlib)
                    https://doc.pypy.org/en/latest/project-ideas.html#make-more-python-modules-pypy-friendly

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-06-02 12:07: +
                    +
                    +

                    Thanks for the information

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/06/pypy-and-ijson-guest-blog-post-8143007374752482637.html b/posts/2015/06/pypy-and-ijson-guest-blog-post-8143007374752482637.html new file mode 100644 index 000000000..b624152e9 --- /dev/null +++ b/posts/2015/06/pypy-and-ijson-guest-blog-post-8143007374752482637.html @@ -0,0 +1,345 @@ + + + + + +PyPy and ijson - a guest blog post | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy and ijson - a guest blog post

                    + + + +
                    +
                    +This gem was posted in the ijson issue tracker after some discussion on #pypy, and Dav1dde kindly allowed us to repost it here:

                    "So, I was playing around with parsing huge JSON files (19GiB, testfile is ~520MiB) and wanted to try a sample code with PyPy, turns out, PyPy needed ~1:30-2:00 whereas CPython 2.7 needed ~13 seconds (the pure python implementation on both pythons was equivalent at ~8 minutes).

                    "Apparently ctypes is really bad performance-wise, especially on PyPy. So I made a quick CFFI mockup: https://gist.github.com/Dav1dde/c509d472085f9374fc1d

                    +Before:

                    CPython 2.7:
                        python -m emfas.server size dumps/echoprint-dump-1.json
                        11.89s user 0.36s system 98% cpu 12.390 total 

                    +PYPY:
                        python -m emfas.server size dumps/echoprint-dump-1.json
                        117.19s user 2.36s system 99% cpu 1:59.95 total


                    +After (CFFI):

                    CPython 2.7:
                         python jsonsize.py ../dumps/echoprint-dump-1.json
                         8.63s user 0.28s system 99% cpu 8.945 total 

                    +PyPy:
                         python jsonsize.py ../dumps/echoprint-dump-1.json
                         4.04s user 0.34s system 99% cpu 4.392 total +

                    "
                    +
                    +
                    +

                    Dav1dde goes into more detail in the issue itself, but we just want to emphasize a few significant points from this brief interchange:
                      +
                    • His CFFI implementation is faster than the ctypes one even on CPython 2.7.
                    • +
                    • PyPy + CFFI is faster than CPython even when using C code to do the heavy parsing.
                    • +
                    + The PyPy Team
                    +
                    +
                    +
                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Alendit wrote on 2015-06-18 08:38: +
                    +
                    +

                    Maybe it's time to discuss inclusion of CFFI into stdandard library again?

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-06-18 09:52: +
                    +
                    +

                    If CPython decides to include it in its stdlib, I can make sure it is updated as needed. I don't have the energy to discuss its inclusion myself, so if it happens it will be "championed" by someone else. Nowadays, I personally think inclusion has as many drawbacks as advantages, even if CFFI 1.x shouldn't evolve a lot in the foreseeable future after the 1.0 step.

                    +
                    +
                    +
                    +
                    + + v3ss wrote on 2015-07-18 22:14: +
                    +
                    +

                    The problem is converting existing libs to use cffi. Only very few percent of Libs are ready for python3.x and with this trend , not even 1% of libs will be converted to work with CFFI.
                    That makes PyPy adoption a lot slower.

                    Is there really no chance of improving ctypes?

                    +
                    +
                    +
                    +
                    + + Maciej Fijalkowski wrote on 2015-07-19 05:39: +
                    +
                    +

                    you would think, but these days vast majority of popular C bindings come with cffi equivalents. In fact cffi is vastly more popular than ctypes ever was.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/08/pypy-261-released-3638960649983103796.html b/posts/2015/08/pypy-261-released-3638960649983103796.html new file mode 100644 index 000000000..ba4949bbf --- /dev/null +++ b/posts/2015/08/pypy-261-released-3638960649983103796.html @@ -0,0 +1,429 @@ + + + + + +PyPy 2.6.1 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy 2.6.1 released

                    + + + +
                    +
                    +
                    +
                    +
                    +

                    +PyPy 2.6.1

                    +We’re pleased to announce PyPy 2.6.1, an update to PyPy 2.6.0 released June 1. +We have fixed many issues, updated stdlib to 2.7.10, cffi to version 1.3, extended support for +the new vmprof statistical profiler for multiple threads, and increased +functionality of numpy.
                    +You can download the PyPy 2.6.1 release here:
                    + +
                    +We would like to thank our donors for the continued support of the PyPy +project, and our volunteers and contributors.

                    +We would also like to encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on pypy, or general help with making +RPython’s JIT even better.

                    +

                    +What is PyPy?

                    +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.

                    +This release supports x86 machines on most common operating systems +(Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), +as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.

                    +We also welcome developers of other +dynamic languages to see what RPython can do for them.
                    +
                    +

                    +Highlights

                    +
                      +
                    • Bug Fixes
                        +
                      • Revive non-SSE2 support
                      • +
                      • Fixes for detaching _io.Buffer*
                      • +
                      • On Windows, close (and flush) all open sockets on exiting
                      • +
                      • Drop support for ancient macOS v10.4 and before
                      • +
                      • Clear up contention in the garbage collector between trace-me-later and pinning
                      • +
                      • Issues reported with our previous release were resolved after reports from users on +our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at +#pypy.
                      • +
                      +
                    • +
                    • New features:
                        +
                      • cffi was updated to version 1.3
                      • +
                      • The python stdlib was updated to 2.7.10 from 2.7.9
                      • +
                      • vmprof now supports multiple threads and OS X
                      • +
                      • The translation process builds cffi import libraries for some stdlib +packages, which should prevent confusion when package.py is not used
                      • +
                      • better support for gdb debugging
                      • +
                      • freebsd should be able to translate PyPy “out of the box” with no patches
                      • +
                      +
                    • +
                    • Numpy:
                        +
                      • Better support for record dtypes, including the align keyword
                      • +
                      • Implement casting and create output arrays accordingly (still missing some corner cases)
                      • +
                      • Support creation of unicode ndarrays
                      • +
                      • Better support ndarray.flags
                      • +
                      • Support axis argument in more functions
                      • +
                      • Refactor array indexing to support ellipses
                      • +
                      • Allow the docstrings of built-in numpy objects to be set at run-time
                      • +
                      • Support the buffered nditer creation keyword
                      • +
                      +
                    • +
                    • Performance improvements:
                        +
                      • Delay recursive calls to make them non-recursive
                      • +
                      • Skip loop unrolling if it compiles too much code
                      • +
                      • Tweak the heapcache
                      • +
                      • Add a list strategy for lists that store both floats and 32-bit integers. +The latter are encoded as nonstandard NaNs. Benchmarks show that the speed +of such lists is now very close to the speed of purely-int or purely-float +lists.
                      • +
                      • Simplify implementation of ffi.gc() to avoid most weakrefs
                      • +
                      • Massively improve the performance of map() with more than +one sequence argument
                      • +
                      +
                    • +
                    +Please try it out and let us know what you think. We welcome +success stories, experiments, or benchmarks, we know you are using PyPy, please tell us about it!
                    +Cheers
                    +The PyPy Team
                    +
                    +
                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Anonymous wrote on 2015-09-02 13:28: +
                    +
                    +

                    Cool! Really nice, thank you. Any ETA for Python 3.3 compatibility in pypy?

                    +
                    +
                    +
                    +
                    + + xndxn wrote on 2015-09-03 17:37: +
                    +
                    +

                    Thanks!

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-09-03 18:56: +
                    +
                    +

                    Thanks!

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-09-04 05:01: +
                    +
                    +

                    Still waiting for PyPy3's update. The latest version of PyPy is much faster than the latest version of PyPy3. Please update soon. :)

                    +
                    +
                    +
                    +
                    + + PeteVine wrote on 2015-09-14 00:03: +
                    +
                    +

                    Contrary to what the front page is still saying, the non-SSE2 backend for older x86 processors is fully working and can be built from source, which takes almost 7h on a 2.2GHz Athlon XP.

                    You can download a 2.6.1 build from here:

                    https://www.dropbox.com/sh/6i7ktwv9551asfc/AADOd55Br0lDJRH8HsKpbIwTa?dl=0

                    It should work on any P2 class processor.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/09/pypy-warmup-improvements-8349465374608676233.html b/posts/2015/09/pypy-warmup-improvements-8349465374608676233.html new file mode 100644 index 000000000..f6d795ec2 --- /dev/null +++ b/posts/2015/09/pypy-warmup-improvements-8349465374608676233.html @@ -0,0 +1,414 @@ + + + + + +PyPy warmup improvements | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy warmup improvements

                    + + + +
                    +
                    + +

                    Hello everyone!

                    +

                    I'm very pleased to announce that we've just managed to merge +the optresult branch. +Under this cryptic name is the biggest JIT refactoring we've done in a couple +years, mostly focused on the warmup time and memory impact of PyPy.

                    +

                    To understand why we did that, let's look back in time - back when we +got the first working JIT prototype in 2009 we were focused exclusively +on achieving peak performance with some consideration towards memory usage, but +without serious consideration towards warmup time. This means we accumulated +quite a bit of technical debt over time that we're trying, with difficulty, +to address right now. This branch mostly does not affect the peak performance +- it should however help you with short-lived scripts, like test runs.

                    +

                    We identified warmup time to be one of the major pain points for pypy users, +along with memory impact and compatibility issues with the CPython C extension +world. While we can't address all the issues at once, we're trying to address +the first two in the work contributing to this blog post. I will write +a separate article on the last item.

                    +

                    To see how much of a problem warmup is for your program, you can run your +program with PYPYLOG=jit-summary:- environment variable set. +This should show you something like this:

                    +
                    +(pypy-optresult)fijal@hermann:~/src/botbot-web$ PYPYLOG=jit-summary:- python orm.py 1500
                    +[d195a2fcecc] {jit-summary
                    +Tracing:            781     2.924965
                    +Backend:            737     0.722710
                    +TOTAL:                      35.912011
                    +ops:                1860596
                    +recorded ops:       493138
                    +  calls:            81022
                    +guards:             131238
                    +opt ops:            137263
                    +opt guards:         35166
                    +forcings:           4196
                    +abort: trace too long:      22
                    +abort: compiling:   0
                    +abort: vable escape:        22
                    +abort: bad loop:    0
                    +abort: force quasi-immut:   0
                    +nvirtuals:          183672
                    +nvholes:            25797
                    +nvreused:           116131
                    +Total # of loops:   193
                    +Total # of bridges: 575
                    +Freed # of loops:   6
                    +Freed # of bridges: 75
                    +[d195a48de18] jit-summary}
                    +
                    +

                    This means that the total (wall clock) time was 35.9s, out of which we spent +2.9s tracing 781 loops and 0.72s compiling them. The remaining couple were +aborted (trace too long is normal, vable escape means someone called +sys._getframe() or equivalent). You can do the following things:

                    +
                      +
                    • compare the numbers with pypy --jit off and see at which number of +iterations pypy jit kicks in
                    • +
                    • play with the thresholds: +pypy --jit threshold=500,function_threshold=400,trace_eagerness=50 was +much better in this example. What this does is to lower the threshold +for tracing loops from the default of 1039 to 500, the threshold for tracing +functions from the start from 1619 to 400 and the threshold for tracing bridges +from 200 to 50. Bridges are "alternative paths" that JIT did not take that +are being additionally traced. We believe in sane defaults, so we'll try +to improve upon those numbers, but generally speaking there is no one-size-fits-all +here.
                    • +
                    • if the tracing/backend time stays high, come and complain to us with +benchmarks, we'll try to look at them
                    • +
                    +

                    Warmup, as a number, is notoriously hard to measure. It's a combination of:

                    +
                      +
                    • pypy running interpreter before jitting
                    • +
                    • pypy needing time to JIT the traces
                    • +
                    • additional memory allocations needed during tracing to accommodate bookkeeping +data
                    • +
                    • exiting and entering assembler until there is enough coverage of assembler
                    • +
                    +

                    We're working hard on making a better assessment of this number, stay tuned :-)

                    +
                    +

                    Speedups

                    +

                    Overall we measured about 50% speed improvement in the optimizer, which reduces +the overall warmup time between 10% and 30%. The very +obvious warmup benchmark got a speedup from 4.5s to 3.5s, almost +30% improvement. Obviously the speedups on benchmarks would vastly +depend on how much warmup time there is in those benchmarks. We observed +annotation of pypy decreasing by about 30% and the overall translation +time by about 7%, so your mileage may vary.

                    +

                    Of course, as usual with the large refactoring of a crucial piece of PyPy, +there are expected to be bugs. We are going to wait for the default branch +to stabilize so you should see warmup improvements in the next release. +If you're not afraid to try, nightlies will already have them.

                    +

                    We're hoping to continue improving upon warmup time and memory impact in the +future, stay tuned for improvements.

                    +
                    +
                    +

                    Technical details

                    +

                    The branch does "one" thing - it changes the underlying model of how operations +are represented during tracing and optimizations. Let's consider a simple +loop like:

                    +
                    +[i0, i1]
                    +i2 = int_add(i0, i1)
                    +i3 = int_add(i2, 1)
                    +i4 = int_is_true(i3)
                    +guard_true(i4)
                    +jump(i3, i2)
                    +
                    +

                    The original representation would allocate a Box for each of i0 - i4 +and then store those boxes in instances of ResOperation. The list of such +operations would then go to the optimizer. Those lists are big - we usually +remove 90% of them during optimizations, but they can be a couple thousand +elements. Overall, allocating those big lists takes a toll on warmup time, +especially due to the GC pressure. The branch removes the existence of Box +completely, instead using a link to ResOperation itself. So say in the above +example, i2 would refer to its producer - i2 = int_add(i0, i1) with +arguments getting special treatment.

                    +

                    That alone reduces the GC pressure slightly, but a reduced number +of instances also lets us store references on them directly instead +of going through expensive dictionaries, which were used to store optimizing +information about the boxes.

                    +

                    Cheers!
                    +fijal & arigo

                    +
                    + +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/10/automatic-simd-vectorization-support-in-639063580401330508.html b/posts/2015/10/automatic-simd-vectorization-support-in-639063580401330508.html new file mode 100644 index 000000000..4456f6748 --- /dev/null +++ b/posts/2015/10/automatic-simd-vectorization-support-in-639063580401330508.html @@ -0,0 +1,382 @@ + + + + + +Automatic SIMD vectorization support in PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Automatic SIMD vectorization support in PyPy

                    + + + +
                    +
                    +Hi everyone,

                    +it took some time to catch up with the JIT refactorings merged this summer. But, (drums) we are happy to announce that:

                    +The next release of PyPy,  "PyPy 4.0.0", will ship the new auto vectorizer +

                    +The goal of this project was to increase the speed of numerical applications in both the NumPyPy library and for arbitrary Python programs. In PyPy we have focused a lot on improvements in the 'typical python workload', which usually involves object and string manipulations, mostly for web development. We're hoping with this work that we'll continue improving the other very important Python use case - numerics.

                    +What it can do! +

                    +It targets numerics only. It +will not execute object manipulations faster, but it is capable of +enhancing common vector and matrix operations.
                    +Good news is that it is not specifically targeted for the NumPy library and the PyPy +virtual machine. Any interpreter (written in RPython) is able to make use +of the vectorization. For more information about that take a look here, or consult the documentation. For the time being it is not turned on by default, so be sure to enable it by specifying --jit vec=1 before running your program.

                    +If your language (written in RPython) contains many array/matrix operations, you can easily integrate the optimization by adding the parameter 'vec=1' to the JitDriver.

                    +NumPyPy Improvements +

                    + +Let's take a look at the core functions of the NumPyPy library (*).
                    The following tests show the speedup of the core functions commonly used in Python code interfacing with NumPy, on CPython with NumPy, on the PyPy 2.6.1 released several weeks ago, and on PyPy 15.11 to be released soon. Timeit was used to test the time needed to run the operation in the plot title on various vector (lower case) and square matrix (upper case) sizes displayed on the X axis. The Y axis shows the speedup compared to CPython 2.7.10. This means that higher is better.

                    + +
                    +
                    +
                    +
                    +
                    +In comparison to PyPy 2.6.1, the speedup greatly improved. The hardware support really strips down the runtime of the vector and matrix operations. There is another operation we would like to highlight: the dot product.
                    It is a very common operation in numerics and PyPy now (given a moderate sized matrix and vector) decreases the time spent in that operation. See for yourself:

                    + +
                    +
                    +
                    +These are nice improvements in the NumPyPy library and we got to a competitive level only making use of SSE4.1.

                    +Future work   +

                    +
                    This is not the end of the road. The GSoC project showed that it is possible to implement this optimization in PyPy. There might be other improvements we can make to carry this further:
                      +
                    • Check alignment at runtime to increase the memory throughput of the CPU
                    • +
                    • Support the AVX vector extension which (at least) doubles the size of the vector register
                    • +
                    • Handle each and every corner case in Python traces to enable it  globally
                    • +
                    • Do not rely only on loading operations to trigger the analysis, there might be cases where combination of floating point values could be done in parallel
                    • +
                    +Cheers,
                    +The PyPy Team

                    +(*) The benchmark code can be found here; it was run using this configuration: i7-2600 CPU @ 3.40GHz (4 cores). +

                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Nax wrote on 2015-10-20 20:27: +
                    +
                    +

                    Which BLAS are u using for CPython Numpy? OpenBlas?

                    +
                    +
                    +
                    +
                    + + crusaderky wrote on 2015-10-20 22:20: +
                    +
                    +

                    How does it compare to numexpr on those benchmarks?

                    Also, any plan of addressing one of the killer features of numexpr, that is the fact that an operation like y += a1*x1 + a2*x2 + a3*x3 will create 5 temporary vectors and make a horrible usage of the CPU cache?

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2015-10-21 05:03: +
                    +
                    +

                    I don't know anyone who uses NumPy for arrays with less than 128 elements.

                    Your own benchmark shows NumPypy is much slower than NumPy for large arrays...

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-10-21 08:44: +
                    +
                    +

                    NumPyPy is currently not complete. Trying to evaluate any numexpr gives a strange error. I guess the problem is a missing field not exported by NumPyPy.
                    However we will see how far we can get with this approach. I have made some thoughts on how we could make good use of graphics cards, but this is future work.

                    +
                    +
                    +
                    +
                    + + René Dudfield wrote on 2015-10-21 11:14: +
                    +
                    +

                    Nice work!

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/10/powerpc-backend-for-jit-3014100267884692148.html b/posts/2015/10/powerpc-backend-for-jit-3014100267884692148.html new file mode 100644 index 000000000..132bbdc4e --- /dev/null +++ b/posts/2015/10/powerpc-backend-for-jit-3014100267884692148.html @@ -0,0 +1,413 @@ + + + + + +PowerPC backend for the JIT | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PowerPC backend for the JIT

                    + + + +
                    +

                    Hi all,

                    + +

                    PyPy's JIT now supports the 64-bit PowerPC architecture! This is the +third architecture supported, in addition to x86 (32 and 64) and ARM +(32-bit only). More precisely, we support Linux running the big- and the +little-endian variants of ppc64. Thanks to IBM for funding this work!

                    + +

                    The new JIT backend has been merged into "default". You should be able +to translate PPC versions +as usual +directly on the machines. For +the foreseeable future, I will compile and distribute binary versions +corresponding to the official releases (for Fedora), but of course I'd +welcome it if someone else could step in and do it. Also, it is unclear +yet if we will run a buildbot.

                    + +

                    To check that the result performs well, I logged in to a ppc64le machine +and ran the usual benchmark suite of PyPy (minus sqlitesynth: sqlite +was not installed on that machine). I ran it twice at a difference of +12 hours, as an attempt to reduce risks caused by other users suddenly +using the machine. The machine was overall relatively quiet. Of +course, this is scientifically not good enough; it is what I could come +up with given the limited resources.

                    + +

                    Here are the results, where the numbers are speed-up factors between the +non-jit and the jit version of PyPy. The first column is x86-64, for +reference. The second and third columns are the two ppc64le runs. All +are Linux. A few benchmarks are not reported here because the runner +doesn't execute them on non-jit (however, apart from sqlitesynth, they +all worked).

                    + +
                    +    ai                        13.7342        16.1659     14.9091
                    +    bm_chameleon               8.5944         8.5858        8.66
                    +    bm_dulwich_log             5.1256         5.4368      5.5928
                    +    bm_krakatau                5.5201         2.3915      2.3452
                    +    bm_mako                    8.4802         6.8937      6.9335
                    +    bm_mdp                     2.0315         1.7162      1.9131
                    +    chaos                     56.9705        57.2608     56.2374
                    +    sphinx
                    +    crypto_pyaes               62.505         80.149     79.7801
                    +    deltablue                  3.3403         5.1199      4.7872
                    +    django                    28.9829         23.206       23.47
                    +    eparse                     2.3164         2.6281       2.589
                    +    fannkuch                   9.1242        15.1768     11.3906
                    +    float                     13.8145        17.2582     17.2451
                    +    genshi_text               16.4608        13.9398     13.7998
                    +    genshi_xml                 8.2782         8.0879      9.2315
                    +    go                         6.7458        11.8226     15.4183
                    +    hexiom2                   24.3612        34.7991     33.4734
                    +    html5lib                   5.4515         5.5186       5.365
                    +    json_bench                28.8774        29.5022     28.8897
                    +    meteor-contest             5.1518         5.6567      5.7514
                    +    nbody_modified            20.6138        22.5466     21.3992
                    +    pidigits                   1.0118          1.022      1.0829
                    +    pyflate-fast               9.0684        10.0168     10.3119
                    +    pypy_interp                3.3977         3.9307      3.8798
                    +    raytrace-simple           69.0114       108.8875    127.1518
                    +    richards                  94.1863       118.1257    102.1906
                    +    rietveld                   3.2421         3.0126      3.1592
                    +    scimark_fft
                    +    scimark_lu
                    +    scimark_montecarlo
                    +    scimark_sor
                    +    scimark_sparsematmul
                    +    slowspitfire               2.8539         3.3924      3.5541
                    +    spambayes                  5.0646         6.3446       6.237
                    +    spectral-norm             41.9148        42.1831     43.2913
                    +    spitfire                   3.8788         4.8214       4.701
                    +    spitfire_cstringio          7.606         9.1809      9.1691
                    +    sqlitesynth
                    +    sympy_expand               2.9537         2.0705      1.9299
                    +    sympy_integrate            4.3805         4.3467      4.7052
                    +    sympy_str                  1.5431         1.6248      1.5825
                    +    sympy_sum                  6.2519          6.096      5.6643
                    +    telco                     61.2416        54.7187     55.1705
                    +    trans2_annotate
                    +    trans2_rtype
                    +    trans2_backendopt
                    +    trans2_database
                    +    trans2_source
                    +    twisted_iteration         55.5019        51.5127     63.0592
                    +    twisted_names              8.2262         9.0062      10.306
                    +    twisted_pb                12.1134         13.644     12.1177
                    +    twisted_tcp                4.9778          1.934      5.4931
                    +
                    +    GEOMETRIC MEAN               9.31           9.70       10.01
                    +
                    + +

                    The last line reports the geometric mean of each column. We see that +the goal was reached: PyPy's JIT actually improves performance by a +factor of around 9.7 to 10 times on ppc64le. By comparison, it "only" +improves performance by a factor 9.3 on Intel x86-64. I don't know why, +but I'd guess it mostly means that a non-jitted PyPy performs slightly +better on Intel than it does on PowerPC.

                    + +

                    Why is that? Actually, if we do the same comparison with an ARM +column too, we also get higher numbers there than on Intel. +When we discovered that a few years ago, we guessed that +on ARM running the whole interpreter in +PyPy takes up a lot of resources, e.g. of instruction cache, which the +JIT's assembler doesn't need any more after the process is warmed up. +And caches are much bigger on Intel. However, PowerPC is much closer +to Intel, so this argument doesn't work for PowerPC. +But there are other more subtle +variants of it. Notably, Intel is doing crazy things about branch +prediction, which likely helps a big interpreter---both the non-JITted +PyPy and CPython, and both for the interpreter's main loop itself and +for the numerous indirect branches that depend on the types of the +objects. Maybe the PowerPC is as good as Intel, and so this argument +doesn't work either. Another one would be: +on PowerPC I did notice that gcc itself is not +perfect at optimization. During development of this backend, I often +looked at assembler produced by gcc, and there are a number of small +inefficiencies there. All these are factors that slow down the +non-JITted version of PyPy, but don't influence the speed of the +assembler produced just-in-time.

                    + +

                    Anyway, this is just guessing. The fact remains that PyPy can now +be used on PowerPC machines. Have fun!

                    + +

                    A bientôt,

                    + +

                    Armin.

                    +
                    +

                    Comments

                    +
                    +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/10/pypy-400-released-jit-with-simd-8282134928733384063.html b/posts/2015/10/pypy-400-released-jit-with-simd-8282134928733384063.html new file mode 100644 index 000000000..81103fd16 --- /dev/null +++ b/posts/2015/10/pypy-400-released-jit-with-simd-8282134928733384063.html @@ -0,0 +1,579 @@ + + + + + +PyPy 4.0.0 Released - A Jit with SIMD Vectorization and More | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy 4.0.0 Released - A Jit with SIMD Vectorization and More

                    + + + +
                    +
                    +

                    +PyPy 4.0.0

                    +We’re pleased and proud to unleash PyPy 4.0.0, a major update of the PyPy python 2.7.10 compatible interpreter with a Just In Time compiler. We have improved warmup time and memory overhead used for tracing, added vectorization for numpy and general loops where possible on x86 hardware (disabled by default), refactored rough edges in rpython, and increased functionality of numpy.
                    +You can download the PyPy 4.0.0 release here:
                    + +
                    +We would like to thank our donors for the continued support of the PyPy project.
                    +We would also like to thank our contributors (7 new ones since PyPy 2.6.0) and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on PyPy, or general help with making RPython’s JIT even better.

                    +New Version Numbering

                    +
                    +Since the past release, PyPy 2.6.1, we decided to update the PyPy 2.x.x versioning directly to PyPy 4.x.x, to avoid confusion with CPython 2.7 and 3.5. Note that this version of PyPy uses the stdlib and implements the syntax of CPython 2.7.10.
                    +

                    +Vectorization

                    +
                    +Richard Plangger began work in March and continued over a Google Summer of Code to add a vectorization step to the trace optimizer. The step recognizes common constructs and emits SIMD code where possible, much as any modern compiler does. This vectorization happens while tracing running code, so it is actually easier at run-time to determine the availability of possible vectorization than it is for ahead-of-time compilers.
                    +Availability of SIMD hardware is detected at run time, without needing to precompile various code paths into the executable.
                    +The first version of the vectorization has been merged in this release; since it is so new, it is off by default. To enable the vectorization in built-in JIT drivers (like numpy ufuncs), add --jit vec=1; to enable all implemented vectorization, add --jit vec_all=1.
                    +Benchmarks and a summary of this work appear here +
                    +

                    +Internal Refactoring: Warmup Time Improvement and Reduced Memory Usage

                    +
                    +Maciej Fijalkowski and Armin Rigo refactored internals of RPython that now allow PyPy to more efficiently use guards in jitted code. They also rewrote unrolling, leading to a warmup time improvement of 20% or so. The reduction in guards also means a reduction in the use of memory, also a savings of around 20%.
                    +
                    +

                    +Numpy

                    +
                    +Our implementation of numpy continues to improve. ndarray and the numeric dtypes are very close to feature-complete; record, string and unicode dtypes are mostly supported. We have reimplemented numpy linalg, random and fft as cffi-1.0 modules that call out to the same underlying libraries that upstream numpy uses. Please try it out, especially using the new vectorization (via --jit vec=1 on the command line) and let us know what is missing for your code.
                    +
                    +

                    +CFFI

                    +
                    +While not applicable only to PyPy, cffi is arguably our most significant contribution to the python ecosystem. Armin Rigo continued improving it, and PyPy reaps the benefits of cffi-1.3: improved management of object lifetimes, __stdcall on Win32, ffi.memmove(), and percolate const, restrict keywords from cdef to C code.
                    +
                    +

                    +What is PyPy?

                    +
                    +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                    +We also welcome developers of other dynamic languages to see what RPython can do for them.
                    +This release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), as well as newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux.
                    +We also introduce support for the 64 bit PowerPC hardware, specifically Linux running the big- and little-endian variants of ppc64.
                    +
                    +

                    +Other Highlights (since 2.6.1 release two months ago)

                    +
                      +
                    • +Bug Fixes
                        +
                      • Applied OPENBSD downstream fixes
                      • +
                      • Fix a crash on non-linux when running more than 20 threads
                      • +
                      • In cffi, ffi.new_handle() is more cpython compliant
                      • +
                      • Accept unicode in functions inside the _curses cffi backend exactly like cpython
                      • +
                      • Fix a segfault in itertools.islice()
                      • +
                      • Use gcrootfinder=shadowstack by default, asmgcc on linux only
                      • +
                      • Fix ndarray.copy() for upstream compatibility when copying non-contiguous arrays
                      • +
                      • Fix assumption that lltype.UniChar is unsigned
                      • +
                      • Fix a subtle bug with stacklets on shadowstack
                      • +
                      • Improve support for the cpython capi in cpyext (our capi compatibility layer). Fixing these issues inspired some thought about cpyext in general, stay tuned for more improvements
                      • +
                      • When loading dynamic libraries, in case of a certain loading error, retry loading the library assuming it is actually a linker script, like on Arch and Gentoo
                      • +
                      • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy
                      • +
                      +
                    • +
                    • +New features:
                        +
                      • Add an optimization pass to vectorize loops using x86 SIMD intrinsics.
                      • +
                      • Support __stdcall on Windows in CFFI
                      • +
                      • Improve debug logging when using PYPYLOG=???
                      • +
                      • Deal with platforms with no RAND_egd() in OpenSSL
                      • +
                      +
                    • +
                    • +Numpy:
                        +
                      • Add support for ndarray.ctypes
                      • +
                      • Fast path for mixing numpy scalars and floats
                      • +
                      • Add support for creating Fortran-ordered ndarrays
                      • +
                      • Fix casting failures in linalg (by extending ufunc casting)
                      • +
                      • Recognize and disallow (for now) pickling of ndarrays with objects embedded in them
                      • +
                      +
                    • +
                    • +Performance improvements and refactorings:
                        +
                      • Reuse hashed keys across dictionaries and sets
                      • +
                      • Refactor JIT internals to improve warmup time by 20% or so at the cost of a minor regression in JIT speed
                      • +
                      • Recognize patterns of common sequences in the JIT backends and optimize them
                      • +
                      • Make the garbage collector more incremental over external_malloc() calls
                      • +
                      • Share guard resume data where possible which reduces memory usage
                      • +
                      • Fast path for zip(list, list)
                      • +
                      • Reduce the number of checks in the JIT for lst[a:]
                      • +
                      • Move the non-optimizable part of callbacks outside the JIT
                      • +
                      • Factor in field immutability when invalidating heap information
                      • +
                      • Unroll itertools.izip_longest() with two sequences
                      • +
                      • Minor optimizations after analyzing output from vmprof and trace logs
                      • +
                      • Remove many class attributes in rpython classes
                      • +
                      • Handle getfield_gc_pure* and getfield_gc_* uniformly in heap.py
                      • +
                      • Improve simple trace function performance by lazily calling fast2locals and locals2fast only if truly necessary
                      • +
                      +
                    • +
                    +
                    +
                    +
                    +
                    +
                    +Please try it out and let us know what you think. We welcome feedback, we know you are using PyPy, please tell us about it!
                    +Cheers
                    +The PyPy Team
                    +
                    +
                    +
                    +



                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Gerrit Slonzer wrote on 2015-10-29 14:17: +
                    +
                    +

                    With the SIMD run-time detection implemented, has the --jit-backend option become redundant?

                    +
                    +
                    +
                    +
                    + + stryker wrote on 2015-10-29 18:07: +
                    +
                    +

                    Will a similar release be coming for Python 3.5?

                    +
                    +
                    +
                    +
                    + + John M. Camara wrote on 2015-10-29 21:44: +
                    +
                    +

                    @Gerrit, they are 2 different things. One is the option to say you are interested in the SIMD support and the other is a check if SIMD support is available in the HW if you are interested in using it. I'm sure once SIMD support has been used for some time it will eventually be enabled by default but since it is new and potential could have some unknown issues at this time you have to explicitly enable it at this time.

                    +
                    +
                    +
                    +
                    + + Niklas B wrote on 2015-10-30 10:07: +
                    +
                    +

                    Awesome, can't wait to try it

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-10-30 19:31: +
                    +
                    +

                    Well done, thx!

                    +
                    +
                    +
                    +
                    + + Travis Griggs wrote on 2015-10-31 00:31: +
                    +
                    +

                    I keep watching the progress of PyPy with excitement. Cool things happening here. But I continue to be disappointed that it doesn't tip towards Python3. It's dead to me until that becomes the majority effort. :(

                    +
                    +
                    +
                    +
                    + + Carl Friedrich Bolz-Tereick wrote on 2015-10-31 00:35: +
                    +
                    +

                    The PyPy project contains a large plurality of interests. A lot of the people working on it are volunteers. So PyPy3 will happen if people within the project become interested in that part, or if new people with that interest join the project. At the moment, this seems not happening, which we can all be sad about. However, blaming anybody with differing interest for that situation feels a bit annoying to me.

                    +
                    +
                    +
                    +
                    + + Travis Griggs wrote on 2015-10-31 07:15: +
                    +
                    +

                    Well said, I apologize for any whining tone. It was not my intent to blame or complain. It really was just meant as a lamentation. Thanks for all you do.

                    +
                    +
                    +
                    +
                    + + PeteVine wrote on 2015-10-31 17:47: +
                    +
                    +

                    What happened to my comment? Surely the benchmark I was proposing is not censorable...

                    +
                    +
                    +
                    +
                    + + Maciej Fijalkowski wrote on 2015-10-31 18:14: +
                    +
                    +

                    @PeteVine you posted a random executable from dropbox claiming to have pypy with x87 backend. PyPy does not have an x87 backend and this raises suspicions that this was just some malware. Now if you want someone to compare one thing against some other thing, please link to sources and not random binaries so the person comparing can look themselves. Additionally you did not post a benchmark, just a link to the binary

                    +
                    +
                    +
                    +
                    + + PeteVine wrote on 2015-10-31 19:29: +
                    +
                    +

                    Well, I was suggesting benchmarking the 32-bit backends to see how much difference SIMD makes - x87 means the standard fpu whereas the default uses SSE2. I know it's processor archaeology so you may have forgotten pypy even had it ;)

                    The ready-to-use pypy distro (built by me) was meant for anyone in possession of a real set of benchmarks (not synthetic vector stuff) to be able to try it quickly.

                    And btw, you could have simply edited the dropbox link out. I'd already tested py3k using this backend and mentioned it in one of the issues on bitbucket so it's far from random.

                    @ all the people asking about pypy3 - you have the python 3.2 compatible pypy (py3k) at your disposal even now.

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-10-31 20:54: +
                    +
                    +

                    @PeteVine: to clarify, PyPy has no JIT backend emitting the old-style x87 fpu instructions. What you are posting is very likely a PyPy whose JIT doesn't support floats at all. It emits calls to already-compiled functions, like the one doing addition of float objects, instead of writing a SSE2 float addition on unboxed objects.

                    Instead, use the official PyPy and run it with vectorization turned on and off (as documented) on the same modern machine. This allows an apple-to-apple comparison.

                    +
                    +
                    +
                    +
                    + + PeteVine wrote on 2015-10-31 22:18: +
                    +
                    +

                    Thanks for clarifying, I must have confused myself after seeing it was i486 compatible.

                    Are you saying the only difference between the backends I wanted to benchmark would boil down to jit-emitting performance and not actual pypy performance? (I must admit I tried this a while ago with fibonacci and there was no difference at all).

                    In other words, even before vectorization functionality was added, shouldn't it be possible to detect that the non-SSE2 backend is running on newer hardware and use the available SIMD? (e.g. for max. compatibility)

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-11-03 11:33: +
                    +
                    +

                    @PeteVine Sorry, I don't understand your questions. Why do you bring the JIT-emitting performance to the table? And why fibonacci (it's not a benchmark with floats at all)? And I don't get the last question either ("SIMD" = "vectorization").

                    To some people, merely dropping the word "SIMD" into a performance discussion makes them go "ooh nice" even if they don't have a clue what it is. I hope you're more knowledgeable than that and that I'm merely missing your point :-)

                    +
                    +
                    +
                    +
                    + + PeteVine wrote on 2015-11-03 13:49: +
                    +
                    +

                    The last part should have been pretty clear as it was referring to the newly added --jit vec=1 so it's not me who's dropping SIMD here (shorthand for different instruction sets) as can be seen in the title of this blog post.

                    All this time I was merely interested in comparing the two 32-bit backends, that's all. One's using the i486/x87 instruction set regardless of any jit codes, the other is able to take advantage of anything up to SSE2. The quick fibonacci test was all I did so you could have pointed me to a real set of benchmarks instead of throwing these little jabs :)

                    +
                    +
                    +
                    +
                    + + Carl Friedrich Bolz-Tereick wrote on 2015-11-03 15:12: +
                    +
                    +

                    @PeteVine: ok, there is a misunderstanding somewhere, I think. Let me try to clarify: PyPy's JIT has always used non-SIMD SSE2 instructions to implement floating point operations. We have a slow mode where only x87 instructions are used, but usually don't fall back to that, and it does not make sense to compare against that mode.

                    What the new release experimentally added is support for SIMD SSE instructions using autoparallelization when --jit vec=1 is given. This only works if your program uses numpy arrays or other simple list processing code. For details on that (and for benchmarks) it's probably best to read Richard Plangger's blog.

                    Does that make sense?

                    +
                    +
                    +
                    +
                    + + PeteVine wrote on 2015-11-03 15:46: +
                    +
                    +

                    Great, love that explanation! :)

                    But please, I'd really like to see how much of a handicap the much-maligned non-SSE2 backend incurs. Could you recommend a set of python (not purely computational) benchmarks so I can put this peevee of mine to rest/test?

                    Anyways, @Armin Rigo is a great educator himself judging from his patient replies in the bugtracker! So yeah, kudos to you guys!

                    +
                    +
                    +
                    +
                    + + Carl Friedrich Bolz-Tereick wrote on 2015-11-03 15:59: +
                    +
                    +

                    If you want to try a proper performance evaluation, the official benchmark set is probably the right one: https://bitbucket.org/pypy/benchmarks/

                    However, none of these benchmarks are exercising the new autovectorization. If you're particularly interested in that part, use the benchmarks from Richard's blog.

                    +
                    +
                    +
                    +
                    + + NortonCommander4ever wrote on 2015-11-09 14:33: +
                    +
                    +

                    Is there a readme on how to use these benchmarks somewhere? (preferably written with windows users in mind, if you know what I mean:))

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.html b/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.html new file mode 100644 index 000000000..3a9dee87a --- /dev/null +++ b/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.html @@ -0,0 +1,422 @@ + + + + + +PyPy memory and warmup improvements (2) - Sharing of Guards | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy memory and warmup improvements (2) - Sharing of Guards

                    + + + +
                    +
                    +

                    Hello everyone!

                    +

                    This is the second part of the series of improvements in warmup time and +memory consumption in the PyPy JIT. This post covers recent work on sharing guard +resume data that was recently merged to trunk. It will be a part +of the next official PyPy release. To understand what it does, let's +start with a loop for a simple example:

                    +
                    +class A(object):
                    +    def __init__(self, x, y):
                    +        self.x = x
                    +        self.y = y
                    +
                    +    def call_method(self, z):
                    +        return self.x + self.y + z
                    +
                    +def f():
                    +    s = 0
                    +    for i in range(100000):
                    +        a = A(i, 1 + i)
                    +        s += a.call_method(i)
                    +
                    +

                    At the entrance of the loop, we have the following set of operations:

                    +
                    +
                    guard(i5 == 4)
                    +
                    guard(p3 is null)
                    +p27 = p2.co_cellvars +p28 = p2.co_freevars +
                    guard_class(p17, 4316866008, descr=<Guard0x104295e08>)
                    +p30 = p17.w_seq +
                    guard_nonnull(p30, descr=<Guard0x104295db0>)
                    +i31 = p17.index +p32 = p30.strategy +
                    guard_class(p32, 4317041344, descr=<Guard0x104295d58>)
                    +p34 = p30.lstorage +i35 = p34..item0 +
                    +

                    The above operations get executed at the entrance, so each time we call f(). They ensure +all the optimizations done below stay valid. Now, as long as nothing +out of the ordinary happens, they only ensure that the world around us never changed. However, if e.g. someone puts new +methods on class A, any of the above guards might fail. Despite the fact that it's a very unlikely +case, PyPy needs to track how to recover from such a situation. Each of those points needs to keep the full +state of the optimizations performed, so we can safely deoptimize them and reenter the interpreter. +This is vastly wasteful since most of those guards never fail, hence some sharing between guards +has been performed.

                    +

                    We went a step further - when two guards are next to each other or the +operations in between them don't have side effects, we can safely redo the operations or to simply +put, resume in the previous guard. That means every now and again we execute a few +operations extra, but not storing extra info saves quite a bit of time and memory. This is similar to the approach that LuaJIT takes, which is called sparse snapshots.

                    + +

                    +I've done some measurements on annotating & rtyping translation of pypy, which +is a pretty memory hungry program that compiles a fair bit. I measured, respectively:

                    +
                      +
                    • total time the translation step took (annotating or rtyping)
                    • +
                    • time it took for tracing (that excludes backend time for the total JIT time) at +the end of rtyping.
                    • +
                    • memory the GC feels responsible for after the step. The real amount of memory +consumed will always be larger and the coefficient of savings is around the 1.5-2x mark
                    • +
                    +

                    Here is the table:

                    + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    branchtime annotationtime rtypingmemory annotationmemory rtypingtracing time
                    default317s454s707M1349M60s
                    sharing302s430s595M1070M51s
                    win4.8%5.5%19%26%17%
                    +

                    Obviously pypy translation is an extreme example - the vast majority of the code out there +does not have that many lines of code to be jitted. However, it's at the very least +a good win for us :-)

                    +

                    We will continue to improve the warmup performance and keep you posted!

                    +

                    Cheers,
                    +fijal

                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Ernst Sjöstrand wrote on 2015-10-05 20:14: +
                    +
                    +

                    "when two guards are next to each other or the operations in between them don't have side effects, we can safely redo the operations or to simply put, resume in the previous guard"
                    Wait... "side effects", "redo"... Does this have synergies with STM?

                    +
                    +
                    +
                    +
                    + + Maciej Fijalkowski wrote on 2015-10-06 05:45: +
                    +
                    +

                    A side-effect-free operation is one that does not have any side effects. This means that you can execute the operation again (e.g. reading a field or adding two numbers) and it will affect nothing but its result. As for redo - well, it has nothing to do with STM, but doing pure operations again can be sometimes useful (in short - if you have i = a + b, you don't remember the i, just a, b and that i = a + b)

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2015/11/pypy-401-released-please-update-2652340737298251005.html b/posts/2015/11/pypy-401-released-please-update-2652340737298251005.html new file mode 100644 index 000000000..162e879a5 --- /dev/null +++ b/posts/2015/11/pypy-401-released-please-update-2652340737298251005.html @@ -0,0 +1,443 @@ + + + + + +PyPy 4.0.1 released please update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy 4.0.1 released please update

                    + + + +
                    +
                    +
                    +
                    +
                    +

                    +PyPy 4.0.1

                    +
                    +We have released PyPy 4.0.1, three weeks after PyPy 4.0.0. We have fixed a few critical bugs in the JIT compiled code, reported by users. We therefore encourage all users of PyPy to update to this version. There are a few minor enhancements in this version as well.

                    +You can download the PyPy 4.0.1 release here:
                    + +
                    +We would like to thank our donors for the continued support of the PyPy project.
                    +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.

                    +

                    +
                    +

                    +

                    +CFFI update

                    +
                    +While not applicable only to PyPy, cffi is arguably our most significant contribution to the python ecosystem. PyPy 4.0.1 ships with cffi-1.3.1 with the improvements it brings.
                    +
                    +

                    +

                    +What is PyPy?

                    +
                    +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                    +We also welcome developers of other dynamic languages to see what RPython can do for them.
                    +This release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, freebsd), newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux, and the big- and little-endian variants of ppc64 running Linux.
                    +
                    +

                    +

                    +Other Highlights (since 4.0.0 released three weeks ago)

                    +

                    +

                    +
                      +
                    • +Bug Fixes
                        +
                      • Fix a bug when unrolling double loops in JITted code
                      • +
                      • Fix multiple memory leaks in the ssl module, one of which affected CPython as well (thanks to Alex Gaynor for pointing those out)
                      • +
                      • Use pkg-config to find ssl headers on OS-X
                      • +
                      • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy
                      • +
                      +
                    • +
                    • +New features
                        +
                      • Internal cleanup of RPython class handling
                      • +
                      • Support stackless and greenlets on PPC machines
                      • +
                      • Improve debug logging in subprocesses: use PYPYLOG=jit:log.%d for example to have all subprocesses write the JIT log to a file called ‘log.%d’, with ‘%d’ replaced with the subprocess’ PID.
                      • +
                      • Support PyOS_double_to_string in our cpyext capi compatibility layer
                      • +
                      +
                    • +
                    • +Numpy
                        +
                      • Improve support for __array_interface__
                      • +
                      • Propagate most NAN mantissas through float16-float32-float64 conversions
                      • +
                      +
                    • +
                    • +Performance improvements and refactorings
                        +
                      • Improvements in slicing byte arrays
                      • +
                      • Improvements in enumerate()
                      • +
                      • Silence some warnings while translating
                      • +
                      +
                    • +
                    +Please update, and continue to help us make PyPy better.

                    +Cheers
                    +The PyPy Team
                    +
                    +
                    +
                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Marius Gedminas wrote on 2015-11-20 11:20: +
                    +
                    +

                    I'd love to upgrade and see if that makes my segfault go away, but the builds at https://launchpad.net/~pypy/+archive/ubuntu/ppa?field.series_filter=precise are two weeks old?

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-11-20 12:06: +
                    +
                    +

                    Hi Marius! How about directing such complaints to the maintainer of the PPA instead of us? :-)

                    +
                    +
                    +
                    +
                    + + Gerd Puin wrote on 2015-11-27 05:46: +
                    +
                    +

                    Where are the benchmark instructions for the official set?

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-11-27 09:57: +
                    +
                    +

                    https://bitbucket.org/pypy/benchmarks , file runner.py. This file has various options; try this: ``python runner.py --changed /path/to/pypy``. This example would compare the speed on top of your system's python and on top of /path/to/pypy. Try also ``--fast`` if you're not patient enough.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2015-11-28 02:54: +
                    +
                    +

                    can I run pandas in PyPy. I am using Python for Data Science

                    +
                    +
                    +
                    +
                    + + Gerd Puin wrote on 2015-11-28 04:20: +
                    +
                    +

                    Thanks Armin, that got me a result.json file - is there a tool to present the data in a more human-readable way?

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2015-11-29 08:03: +
                    +
                    +

                    The command itself prints a human-readable result at the end; you can ignore result.json.

                    +
                    +
                    +
                    +
                    + + Gerd Puin wrote on 2015-11-29 18:48: +
                    +
                    +

                    I see. Just an idea - maybe the results could be reviewed on speed.pypy.org via a web interface?

                    Cheers!

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2016/01/leysin-winter-sprint-20-27th-february-1737200016169608469.html b/posts/2016/01/leysin-winter-sprint-20-27th-february-1737200016169608469.html new file mode 100644 index 000000000..5bfc5d467 --- /dev/null +++ b/posts/2016/01/leysin-winter-sprint-20-27th-february-1737200016169608469.html @@ -0,0 +1,350 @@ + + + + + +Leysin Winter Sprint (20-27th February 2016) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Leysin Winter Sprint (20-27th February 2016)

                    + + + +
                    +

                    The next PyPy sprint will be in Leysin, Switzerland, for the eleventh time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.

                    +
                    +

                    Goals and topics of the sprint

                    +

                    The details depend on who is here and ready to work. The list of +topics is mostly the same as last year (did PyPy become a mature +project with only long-term goals?):

                    +
                      +
                    • cpyext (CPython C API emulation layer): various speed and +completeness topics
                    • +
                    • cleaning up the optimization step in the JIT, change the register +allocation done by the JIT's backend, or more improvements to the +warm-up time
                    • +
                    • finish vmprof - a statistical profiler for CPython and PyPy
                    • +
                    • Py3k (Python 3.x support), NumPyPy (the numpy module)
                    • +
                    • STM (Software Transactional Memory), notably: try to come up with +benchmarks, and measure them carefully in order to test and improve +the conflict reporting tools, and more generally to figure out how +practical it is in large projects to avoid conflicts
                    • +
                    • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off for skiing.
                    • +
                    +
                    +
                    +

                    Exact times

                    +

                    I have booked the week from Saturday 20 to Saturday 27. It is fine to +leave either the 27 or the 28, or even stay a few +more days on either side. The plan is to work full days between the 21 +and the 27. You are of course allowed to show up for a part of that +time only, too.

                    +
                    +
                    +

                    Location & Accommodation

                    +

                    Leysin, Switzerland, "same place as before". Let me refresh your +memory: both the sprint venue and the lodging will be in a +pair of chalets built specifically for bed & breakfast: +https://www.ermina.ch/. The place has a good ADSL Internet connection +with wireless installed. You can also arrange your own lodging +elsewhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue).

                    +

                    Please confirm that you are coming so that we can adjust the +reservations as appropriate.

                    +

                    The options of rooms are a bit more limited than in previous years +because the place for bed-and-breakfast is shrinking: what is +guaranteed is only one double-bed room and a bigger room with 5-6 +individual beds (the latter at 50-60 CHF per night, breakfast +included). If there are more people that would prefer a single room, +please contact me and we'll see what choices you have. There is a +choice of hotels, many of them reasonably priced for Switzerland.

                    +

                    Please register by Mercurial:

                    +
                    +https://bitbucket.org/pypy/extradoc/
                    https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2016 +
                    +

                    or on the pypy-dev mailing list if you do not yet have check-in rights:

                    +
                    +https://mail.python.org/mailman/listinfo/pypy-dev +
                    +

                    You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around, and at least one EU-format power strip.

                    +
                    +
                    +

                    Comments

                    +
                    +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2016/01/using-cffi-for-embedding-8493496761738752124.html b/posts/2016/01/using-cffi-for-embedding-8493496761738752124.html new file mode 100644 index 000000000..72c606aa9 --- /dev/null +++ b/posts/2016/01/using-cffi-for-embedding-8493496761738752124.html @@ -0,0 +1,501 @@ + + + + + +Using CFFI for embedding | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    Using CFFI for embedding

                    + + + +
                    +

                    Introduction

                    + +

                    CFFI has been a great success so far to call C libraries in your +Python programs, in a way that is both simple and that works across +CPython 2.x and 3.x and PyPy.

                    + +

                    This post assumes that you know what CFFI is and how to use it in +API mode (ffi.cdef(), ffi.set_source(), ffi.compile()). +A quick overview can be found in this paragraph.

                    + +

                    The major news of CFFI 1.4, released last December, was that you can +now declare C functions with extern "Python" in the cdef(). +These magic keywords make the function callable from C (where it is +defined automatically), but calling it will call some Python code +(which you attach with the @ffi.def_extern() decorator). This is +useful because it gives a more straightforward, faster and +libffi-independent way to write callbacks. For more details, see the +documentation.

                    + +

                    You are, in effect, declaring a static family of C functions which +call Python code. The idea is to take pointers to them, and pass them +around to other C functions, as callbacks. However, the idea of a set +of C functions which call Python code opens another path: embedding +Python code inside non-Python programs.

                    + +

                    Embedding

                    + +

                    Embedding is traditionally done using the CPython C API: from C code, +you call Py_Initialize() and then some other functions like +PyRun_SimpleString(). In the simple cases it is, indeed, simple +enough; but it can become a complicated story if you throw in +supporting application-dependent object types; and a messy story if +you add correctly running on multiple threads, for example.

                    +

                    Moreover, this approach is specific to CPython (2.x or 3.x). It does +not work at all on PyPy, which has its own very different, minimal +embedding API.

                    + +

                    The new-and-coming thing about CFFI 1.5, meant as replacement of the +above solutions, is direct embedding support---with no fixed API at +all. The idea is to write some Python script with a cdef() which +declares a number of extern "Python" functions. When running the +script, it creates the C source code and compiles it to a +dynamically-linked library (.so on Linux). This is the same as in +the regular API-mode usage. What is new is that these extern +"Python" can now also be exported from the .so, in the C +sense. You also give a bit of initialization-time Python code +directly in the script, which will be compiled into the .so too.

                    +

                    This library can now be used directly from any C program (and it is +still importable in Python). It exposes the C API of your choice, +which you specified with the extern "Python" declarations. You +can use it to make whatever custom API makes sense in your particular +case. You can even directly make a "plug-in" for any program that +supports them, just by exporting the API expected for such plugins.

                    + +

                    Trying it out on CPython

                    + +

                    This is still being finalized, but please try it out. You can +see embedding.py directly online for a quick glance. Or +see below the instructions on Linux with CPython 2.7 (CPython 3.x and +non-Linux platforms are still a work in progress right now, but this +should be quickly fixed):

                    +
                      +
                    • +

                      get the branch static-callback-embedding of CFFI:

                      +
                      +hg clone https://foss.heptapod.net/cffi/cffi
                      +hg up static-callback-embedding
                      +
                      +
                    • +
                    • +

                      make the _cffi_backend.so:

                      +
                      +python setup_base.py build_ext -f -i
                      +
                      +
                    • +
                    • +

                      run embedding.py in the demo directory:

                      +
                      +cd demo
                      +PYTHONPATH=.. python embedding.py
                      +
                      +
                    • +
                    • +

                      this produces _embedding_cffi.c. Run gcc to build it. On Linux:

                      +
                      +gcc -shared -fPIC _embedding_cffi.c -o _embedding_cffi.so  \
                      +    -lpython2.7 -I/usr/include/python2.7
                      +
                      +
                    • +
                    • +

                      try out the demo C program in embedding_test.c:

                      +
                      +gcc embedding_test.c _embedding_cffi.so
                      +PYTHONPATH=.. LD_LIBRARY_PATH=. ./a.out
                      +
                      +
                    • +
                    +

                    Note that if you get ImportError: cffi extension module +'_embedding_cffi' has unknown version 0x2701, it means that the +_cffi_backend module loaded is a pre-installed one instead of the +more recent one in "..". Be sure to use PYTHONPATH=.. for now. (Some installations manage to be confused enough to load the system-wide cffi even if another version is in the PYTHONPATH. I think a virtualenv can be used to work around this issue.)

                    + +

                    Try it out on PyPy

                    + +

                    Very similar steps can be followed on PyPy, but it requires the +cffi-static-callback-embedding branch of PyPy, which you must +first translate from sources. The difference is then that you need to +adapt the first gcc command line: replace -lpython2.7 with +-lpypy-c and to fix the -I path (and possibly add a -L +path).

                    + +

                    More details

                    + +

                    How it works, more precisely, is by automatically initializing CPython/PyPy +the first time any of the extern "Python" +functions is called from the C program. This is done using locks in case of multi-threading, +so several threads can concurrently do this "first call". This should work even if two +different threads call the first time a function from two different +embedded CFFI extensions that happen to be linked with the same program. Explicit initialization is +never needed.

                    + +

                    The custom initialization-time Python code you put in +ffi.embedding_init_code() is executed at that time. If this code +starts to be big, you can move it to independent modules or packages. +Then the initialization-time Python code only needs to import them. In +that case, you have to carefully set up sys.path if the modules are +not installed in the usual Python way.

                    +

                    If the Python code is big and full of dependencies, a better alternative +would be to use virtualenv. How to do that is not fully fleshed out so +far. You can certainly run the whole program with the environment +variables set up by the virtualenv's activate script first. There +are probably other solutions that involve using gcc's +-Wl,-rpath=\$ORIGIN/ or -Wl,-rpath=/fixed/path/ options to load +a specific libpython or libpypy-c library. If you try it out and it +doesn't work the way you would like, please complain :-)

                    +

                    Another point: right now this does not support CPython's notion of +multiple subinterpreters. The logic creates a single global Python +interpreter, and runs everything in that context. Maybe a future +version would have an explicit API to do that — or maybe it should be +the job of a 3rd-party extension module to provide a Python interface +over the notion of subinterpreters...

                    +

                    More generally, any feedback is appreciated.

                    +

                    Have fun,

                    +

                    Armin

                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + Armin Rigo wrote on 2016-01-07 13:29: +
                    +
                    +

                    Thanks to apollo13 on irc for early feedback. Main change: the code put in embedding_init_code() should now start with "from xx import ffi", where "xx" is the name of the module (first argument to set_source()). The goal is to clearly say that you need the same line in other modules imported from there.

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2016-01-11 14:42: +
                    +
                    +

                    This is very exciting! Just waiting for Python 3.x support now. :)

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2016-01-12 07:54: +
                    +
                    +

                    Python 3 is implemented and tested now.

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2016-01-12 18:48: +
                    +
                    +

                    Windows support is now done (tested on Python 2.7). Expect a release soon :-)

                    +
                    +
                    +
                    +
                    + + Armin Rigo wrote on 2016-01-16 18:02: +
                    + +
                    +
                    +
                    + + Anonymous wrote on 2016-02-08 00:10: +
                    +
                    +

                    Excelent feature!!

                    CFFI rocks, and the documentation keeps improving :)

                    +
                    +
                    +
                    +
                    + + d.q. wrote on 2016-03-10 10:51: +
                    +
                    +

                    Awesome, pypyInstaller in cross-hairs!

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2016/02/c-api-support-update-8582726091670983181.html b/posts/2016/02/c-api-support-update-8582726091670983181.html new file mode 100644 index 000000000..8fb92d8da --- /dev/null +++ b/posts/2016/02/c-api-support-update-8582726091670983181.html @@ -0,0 +1,372 @@ + + + + + +C-API Support update | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    C-API Support update

                    + + + +
                    +

                    As you know, PyPy can emulate the CPython C API to some extent. In this post I will describe an important optimization that we merged to improve the performance and stability of the C-API emulation layer.

                    + +

                    The C-API is implemented by passing around PyObject * pointers in the C code. The problem with providing the same interface with PyPy is that +objects don't natively have the same PyObject * structure at all; and +additionally their memory address can change. PyPy handles the +difference by maintaining two sets of objects. More precisely, starting +from a PyPy object, it can allocate on demand a PyObject structure +and fill it with information that points back to the original PyPy +objects; and conversely, starting from a C-level object, it can allocate +a PyPy-level object and fill it with information in the opposite +direction.

                    + +

                    I have merged a rewrite of the interaction between C-API C-level objects +and PyPy's interpreter level objects. This is mostly a simplification +based on a small hack in our garbage collector. This hack makes the +garbage collector aware of the reference-counted PyObject +structures. When it considers a pair consisting of a PyPy object and a +PyObject, it will always free either none or both of them at the +same time. They both stay alive if either there is a regular GC +reference to the PyPy object, or the reference counter in the +PyObject is bigger than zero.

                    + +

                    This gives a more stable result. Previously, a PyPy object might grow a +corresponding PyObject, lose it (when its reference counter goes to +zero), and later have another corresponding PyObject re-created at a +different address. Now, once a link is created, it remains alive until +both objects die.

                    + +

                    The rewrite significantly simplifies our previous code (which used to be +based on at least 4 different dictionaries), and should make using the +C-API somewhat faster (though it is still slower than using pure +python or cffi).

                    + +

                    A side effect of this work is that now PyPy actually supports the upstream lxml package---which is one +of the most popular packages on PyPI. (Specifically, you need version +3.5.0 with this pull +request to remove old PyPy-specific hacks that were not really +working. See +details.) At this point, we no longer recommend using the +cffi-lxml alternative: although it may still be faster, it might be +incomplete and old.

                    + +

                    We are actively working on extending our C-API support, and hope to soon +merge a branch to support more of the C-API functions (some numpy news +coming!). Please try +it out and let us know how it works for you.

                    + +

                    Armin Rigo and the PyPy team

                    +
                    +

                    Comments

                    +
                    +
                    +
                    + + mathgl wrote on 2016-02-25 16:40: +
                    +
                    +

                    wow, s good news. When trying to pick up a new lib, I always check whether it supports pypy first.

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2016-02-26 14:57: +
                    +
                    +

                    Really looking forward to hearing news from the numpy front!

                    +
                    +
                    +
                    +
                    + + Unknown wrote on 2016-02-26 18:42: +
                    +
                    +

                    Great. Maybe now Odoo will work with PyPy!

                    +
                    +
                    +
                    +
                    + + Anonymous wrote on 2016-02-28 09:58: +
                    +
                    +

                    Great, in particular the native lxml. This is used in many large production systems that will now be even more interested in PyPy.

                    +
                    +
                    +
                    + +
                    +
                    + +
                    +
                    + + \ No newline at end of file diff --git a/posts/2016/03/pypy-50-released-5730569530415927220.html b/posts/2016/03/pypy-50-released-5730569530415927220.html new file mode 100644 index 000000000..fa063c225 --- /dev/null +++ b/posts/2016/03/pypy-50-released-5730569530415927220.html @@ -0,0 +1,565 @@ + + + + + +PyPy 5.0 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                    +
                    +

                    PyPy 5.0 released

                    + + + +
                    +
                    +

                    +PyPy 5.0

                    +We have released PyPy 5.0, about three months after PyPy 4.0.1. We encourage all users of PyPy to update to this version.

                    +You can download the PyPy 5.0 release here:
                    + +
                    +We would like to thank our donors for the continued support of the PyPy project.
                    +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.

                    +

                    +Faster and Leaner

                    +
                    +We continue to improve the warmup time and memory usage of JIT-related metadata. The exact effects depend vastly on the program you’re running and can range from insignificant to warmup being up to 30% faster and memory dropping by about 30%.
                    +

                    +

                    +C-API Upgrade

                    +
                    +We also merged a major upgrade to our C-API layer (cpyext), simplifying the interaction between c-level objects and PyPy interpreter level objects. As a result, lxml (prerelease) with its cython compiled component passes all tests on PyPy. The new cpyext is also much faster. This major refactoring will soon be followed by an expansion of our C-API compatibility.
                    +

                    +

                    +Profiling with vmprof supported on more platforms

                    +
                    +vmprof has been a go-to profiler for PyPy on Linux for a few releases and we’re happy to announce that thanks to the cooperation with JetBrains, vmprof now works on Linux, OS X and Windows on both PyPy and CPython.
                    +
                    +

                    +

                    +CFFI

                    +While not applicable only to PyPy, cffi is arguably our most significant contribution to the python ecosystem. PyPy 5.0 ships with cffi-1.5.2 which now allows embedding PyPy (or CPython) in a C program.
                    +
                    +

                    +

                    +What is PyPy?

                    +
                    +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (pypy and cpython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                    +We also welcome developers of other dynamic languages to see what RPython can do for them.
                    +This release supports x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD), newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux, and 64 bit PowerPC hardware, specifically Linux running the big- and little-endian variants of ppc64.
                    +
                    +

                    +

                    +Other Highlights (since 4.0.1 released in November 2015)

                    +
                      +
                    • New features:
                        +
                      • Support embedding PyPy in a C-program via cffi and static callbacks in cffi.
                        +This deprecates the old method of embedding PyPy
                      • +
                      • Refactor vmprof to work cross-operating-system, deprecate using buggy
                        +libunwind on Linux platforms. Vmprof even works on Windows now.
                      • +
                      • Support more of the C-API type slots, like tp_getattro, and fix C-API
                        +macros, functions, and structs such as _PyLong_FromByteArray(),
                        +PyString_GET_SIZE, f_locals in PyFrameObject, Py_NAN, co_filename in
                        +PyCodeObject
                      • +
                      • Use a more stable approach for allocating PyObjects in cpyext. (see
                        blog post). Once the PyObject corresponding to a PyPy object is created,
                        +it stays around at the same location until the death of the PyPy object.
                        +Done with a little bit of custom GC support. It allows us to kill the
                        +notion of “borrowing” inside cpyext, reduces 4 dictionaries down to 1, and
                        +significantly simplifies the whole approach (which is why it is a new
                        +feature while technically a refactoring) and allows PyPy to support the
                        +popular lxml module (as of the next release) with no PyPy specific
                        +patches needed
                      • +
                      • Make the default filesystem encoding ASCII, like CPython
                      • +
                      • Use hypothesis in test creation, which is great for randomizing tests
                      • +
                        +
                      • +
                      • Bug Fixes
                          +
                        • Backport always using os.urandom for uuid4 from cpython and fix the JIT as well
                          +(issue #2202)
                        • +
                        • More completely support datetime, optimize timedelta creation
                        • +
                        • Fix for issue #2185 which caused an inconsistent list of operations to be
                          +generated by the unroller, appeared in a complicated Django app
                        • +
                        • Fix an elusive issue with stacklets on shadowstack which showed up when
                          +forgetting stacklets without resuming them
                        • +
                        • Fix entrypoint() which now acquires the GIL
                        • +
                        • Fix direct_ffi_call() so failure does not bail out before setting CALL_MAY_FORCE
                        • +
                        • Fix (de)pickling long values by simplifying the implementation
                        • +
                        • Fix RPython rthread so that objects stored as threadlocal do not force minor
                          +GC collection and are kept alive automatically. This improves performance of
                          +short-running Python callbacks and prevents resetting such object between
                          +calls
                        • +
                        • Support floats as parameters to itertools.islice()
                        • +
                        • Check for the existence of CODESET, ignoring it should have prevented PyPy
                          +from working on FreeBSD
                        • +
                        • Fix for corner case (likely shown by Krakatau) for consecutive guards with
                          +interdependencies
                        • +
                        • Fix applevel bare class method comparisons which should fix pretty printing
                          +in IPython
                        • +
                        • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy
                        • +
                          +
                        • +
                        • Numpy:
                            +
                          • Updates to numpy 1.10.2 (incompatibilities and not-implemented features
                            +still exist)
                          • +
                          • Support dtype=(('O', spec)) union while disallowing record arrays with
                            +mixed object, non-object values
                          • +
                          • Remove all traces of micronumpy from cpyext if the --withoutmod-micronumpy option is used
                          • +
                          • Support indexing filtering with a boolean ndarray
                          • +
                          • Support partition() as an app-level function, together with a cffi wrapper
                            +in pypy/numpy, this now provides partial support for partition()
                          • +
                            +
                          • +
                          • Performance improvements:
                              +
                            • Optimize global lookups
                            • +
                            • Improve the memory signature of numbering instances in the JIT. This should
                              +massively decrease the amount of memory consumed by the JIT, which is
                              +significant for most programs. Also compress the numberings using variable-
                              +size encoding
                            • +
                            • Optimize string concatenation
                            • +
                            • Use INT_LSHIFT instead of INT_MUL when possible
                            • +
                            • Improve struct.unpack by casting directly from the underlying buffer.
                              +Unpacking floats and doubles is about 15 times faster, and integer types
                              +about 50% faster (on 64 bit integers). This was then subsequently
                              +improved further in optimizeopt.py.
                            • +
                            • Optimize two-tuple lookups in mapdict, which improves warmup of instance
                              +variable access somewhat
                            • +
                            • Reduce all guards from int_floordiv_ovf if one of the arguments is constant
                            • +
                            • Identify permutations of attributes at instance creation, reducing the
                              +number of bridges created
                            • +
                            • Greatly improve re.sub() performance
                            • +
                              +
                            • +
                            • Internal refactorings:
                                +
                              • Refactor and improve exception analysis in the annotator
                              • +
                              • Remove unnecessary special handling of space.wrap().
                              • +
                              • Support list-resizing setslice operations in RPython
                              • +
                              • Tweak the trace-too-long heuristic for multiple jit drivers
                              • +
                              • Refactor bookkeeping (such a cool word - three double letters) in the
                                +annotator
                              • +
                              • Refactor wrappers for OS functions from rtyper to rlib and simplify them
                              • +
                              • Simplify backend loading instructions to only use four variants
                              • +
                              • Simplify GIL handling in non-jitted code
                              • +
                              • Refactor naming in optimizeopt
                              • +
                              • Change GraphAnalyzer to use a more precise way to recognize external
                                +functions and fix null pointer handling, generally clean up external
                                +function handling
                              • +
                              • Remove pure variants of getfield_gc_* operations from the JIT by
                                +determining purity while tracing
                              • +
                              • Refactor databasing
                              • +
                              • Simplify bootstrapping in cpyext
                              • +
                              • Refactor rtyper debug code into python.rtyper.debug
                              • +
                              • Separate structmember.h from Python.h. Also enhance creating api functions
                                +to specify which header file they appear in (previously only pypy_decl.h)
                              • +
                              • Fix tokenizer to enforce universal newlines, needed for Python 3 support
                              • +
                              +
                            • +
                            +Please try it out and let us know what you think. We welcome feedback, we know you are using PyPy, please tell us about it!
                            +Cheers
                            +The PyPy Team
                            +
                            +
                            +

                            Comments

                            +
                            +
                            +
                            + + HelpingHand wrote on 2016-03-10 22:30: +
                            +
                            +

                            What is the status on finally getting a functional x64 build for windows? I am mainly interested in embedding PyPy and unless there is support for it, I will continue to avoid it.

                            +
                            +
                            +
                            +
                            + + mathgl wrote on 2016-03-11 05:05: +
                            +
                            +

                            does new cpyext help for supporting numpy?

                            +
                            +
                            +
                            +
                            + + mattip wrote on 2016-03-11 08:06: +
                            +
                            +

                            HelpingHand: work on x64 for windows [0] is awaiting a champion, with either the skill to do it or with the deep pockets to sponsor it. If you are interested, please come to #pypy on IRC to discuss it

                            [0] https://doc.pypy.org/en/latest/windows.html#what-is-missing-for-a-full-64-bit-translation

                            +
                            +
                            +
                            +
                            + + mattip wrote on 2016-03-11 08:09: +
                            +
                            +

                            mathgl: yes, we are cautiously optimistic that if we now flesh out cpyext to support enough of the C-API that vanilla numpy might just work. Stay tuned for further developments

                            +
                            +
                            +
                            +
                            + + Martin Gfeller wrote on 2016-03-11 08:57: +
                            +
                            +

                            I've asked Brett Cannon, well-know Pythonista working at Microsoft about whether they could sponsor or undertake Windows 64-bit work.

                            If you have a substantial use cause requiring the speed of PyPy, large address spaces and Windows, it might help.

                            +
                            +
                            +
                            +
                            + + Unknown wrote on 2016-03-11 10:52: +
                            +
                            +

                            What happened to the speed graph on speed.pypy.org? The speedups for earlier versions of PyPy before 5.0 suddenly are much higher than they used to be. Compare for example against the graph of a couple of weeks ago (https://web.archive.org/web/20160228102615/https://speed.pypy.org/)

                            Version 28/2 11/3
                            1.5 3.18x 4.86x
                            2.1 6.12x 7.50x
                            2.4.0 6.22x 7.61x
                            2.6.1 7.05x 8.58x

                            Has the benchmark been changed, the timing method, the speed computation, hardware used, etc? More importantly, which version is "correct"?

                            +
                            +
                            +
                            +
                            + + Maciej Fijalkowski wrote on 2016-03-11 10:56: +
                            +
                            +

                            Hi Paul.

                            We rerun all benchmarks on old Pythons and it shows now a different subset of benchmarks. I must admit I don't know why the main site chooses some benchmarks and not others, it's certainly not deliberate. Any single number you use is not correct, a bit by definition - we suggest you look in details what the benchmarks do or even better, benchmark yourself. We'll look why it's showing a different subset

                            +
                            +
                            +
                            +
                            + + Unknown wrote on 2016-03-11 11:07: +
                            +
                            +

                            Great news! Awesome!

                            +
                            +
                            +
                            +
                            + + mattip wrote on 2016-03-11 12:40: +
                            +
                            +

                            Paul Melis, Maciej Fjalkowski - indeed there was a bug; I reran the old benchmarks but only ~half ran to completion. I reverted the bad run, now results are like they used to be. Thanks for pointing it out

                            +
                            +
                            +
                            +
                            + + Unknown wrote on 2016-03-14 03:52: +
                            +
                            +

                            When is release of pypy3 5.0?
                            I'd like also to get the profit of pypy5.0 by a condition of support of python 3.2.5.

                            +
                            +
                            +
                            +
                            + + Armin Rigo wrote on 2016-03-17 15:07: +
                            +
                            +

                            lxml 3.6.0 released with support for PyPy 5.x.

                            +
                            +
                            +
                            +
                            + + Armin Rigo wrote on 2016-03-20 11:10: +
                            +
                            +

                            Before trying out lxml 3.6.0, upgrade to PyPy 5.0.1: the release 5.0.0 does not reliably work with it.

                            +
                            +
                            +
                            + +
                            +
                            + +
                            +
                            + + \ No newline at end of file diff --git a/posts/2016/03/pypy-501-bugfix-released-2218405735970044084.html b/posts/2016/03/pypy-501-bugfix-released-2218405735970044084.html new file mode 100644 index 000000000..abb31e249 --- /dev/null +++ b/posts/2016/03/pypy-501-bugfix-released-2218405735970044084.html @@ -0,0 +1,337 @@ + + + + + +PyPy 5.0.1 bugfix released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                            +
                            +

                            PyPy 5.0.1 bugfix released

                            + + + +
                            +
                            +

                            +PyPy 5.0.1 +

                            +
                            +We have released a bugfix for PyPy 5.0, after reports that the newly released +lxml 3.6.0, which now supports PyPy 5.0 +, can crash on large files. +Thanks to those who reported the crash. Please update, downloads are available +at

                            pypy.org/download.html

                            +The changes between PyPy 5.0 and 5.0.1 are only two bug fixes: one in +cpyext, which fixes notably (but not only) lxml; and another for a +corner case of the JIT.

                            +What is PyPy?

                            +
                            +
                            +
                            +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7. It’s fast (PyPy and CPython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.
                            + +We also welcome developers of other +dynamic languages to see what RPython can do for them.
                            + +This release supports x86 machines on most common operating systems +(Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD), +newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux, and the +big- and little-endian variants of PPC64 running Linux.

                            +Please update, and continue to help us make PyPy better.

                            +Cheers
                            + +The PyPy Team
                            +
                            +
                            +

                            Comments

                            +
                            +
                            +
                            + + Armin Rigo wrote on 2016-03-28 03:39: +
                            +
                            +

                            ppc64 released four days ago, and big-endian updated just now to fix an important bug. There are other big-endian bugs left which we're fixing as we go along; they will be in the next official release.

                            +
                            +
                            +
                            + +
                            +
                            + +
                            +
                            + + \ No newline at end of file diff --git a/posts/2016/04/pypy-51-released-4979856639628970409.html b/posts/2016/04/pypy-51-released-4979856639628970409.html new file mode 100644 index 000000000..f5bf6661e --- /dev/null +++ b/posts/2016/04/pypy-51-released-4979856639628970409.html @@ -0,0 +1,409 @@ + + + + + +PyPy 5.1 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                            +
                            +

                            PyPy 5.1 released

                            + + + +
                            +
                            +
                            +
                            +We have released PyPy 5.1, about a month after PyPy 5.0.

                            +This release includes more improvements to warmup time and memory requirements, extending the work done on PyPy 5.0. We have seen an additional reduction of about 20% in memory requirements, and up to 30% warmup time improvement; more detail is in the blog post.

                            +We also now have full support for the IBM s390x. Since this support is in RPython, any dynamic language written using RPython, like PyPy, will automagically be supported on that architecture.

                            +We updated cffi to 1.6 (cffi 1.6 itself will be released shortly), and continue to improve support for the wider python ecosystem using the PyPy interpreter.

                            +You can download the PyPy 5.1 release here:
                            + +
                            +We would like to thank our donors for the continued support of the PyPy project.
                            +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.
                            +

                            +What is PyPy?

                            +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                            +We also welcome developers of other dynamic languages to see what RPython can do for them.

                            +This release supports:
                              +
                            • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),
                            • +
                            • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                            • +
                            • big- and little-endian variants of PPC64 running Linux,
                            • +
                            • +s390x running Linux
                            • +
                            +
                            +
                            +
                            +

                            +Other Highlights

                            +

                            +(since the release of PyPy 5.0 in March, 2016)

                            +
                              +
                            • +

                              +New features:

                              +
                                +
                              • A new jit backend for the IBM s390x, which was a large effort over the past few months.
                              • +
                              • Add better support for PyUnicodeObject in the C-API compatibility layer
                              • +
                              • Support GNU/kFreeBSD Debian ports in vmprof
                              • +
                              • Add __pypy__._promote
                              • +
                              • Make attrgetter a single type for CPython compatibility
                              • +
                              +
                              +
                            • +
                            • +

                              +Bug Fixes

                              +
                                +
                              • Catch exceptions raised in an exit function
                              • +
                              • Fix a corner case in the JIT
                              • +
                              • Fix edge cases in the cpyext refcounting-compatible semantics (more work on cpyext compatibility is coming in the cpyext-ext branch, but isn’t ready yet)
                              • +
                              • Try harder to not emit NEON instructions on ARM processors without NEON support
                              • +
                              • Improve the rpython posix module system interaction function calls
                              • +
                              • Detect a missing class function implementation instead of calling a random function
                              • +
                              • Check that PyTupleObjects do not contain any NULLs at the point of conversion to W_TupleObjects
                              • +
                              • In ctypes, fix _anonymous_ fields of instances
                              • +
                              • Fix JIT issue with unpack() on a Trace which contains half-written operations
                              • +
                              • Fix sandbox startup (a regression in 5.0)
                              • +
                              • Fix possible segfault for classes with mangled mro or __metaclass__
                              • +
                              • Fix isinstance(deque(), Hashable) on the pure python deque
                              • +
                              • Fix an issue with forkpty()
                              • +
                              • Issues reported with our previous release were resolved after reports from users on our issue tracker at https://foss.heptapod.net/pypy/pypy/-/issues or on IRC at #pypy
                              • +
                              +
                              +
                            • +
                            • +

                              +Numpy:

                              +
                                +
                              • Implemented numpy.where for a single argument
                              • +
                              • Indexing by a numpy scalar now returns a scalar
                              • +
                              • Fix transpose(arg) when arg is a sequence
                              • +
                              • Refactor include file handling, now all numpy ndarray, ufunc, and umath functions exported from libpypy.so are declared in pypy_numpy.h, which is included only when building our fork of numpy
                              • +
                              • Add broadcast
                              • +
                              +
                              +
                            • +
                            • +

                              +Performance improvements:

                              +
                                +
                              • Improve str.endswith([tuple]) and str.startswith([tuple]) to allow JITting
                              • +
                              • Merge another round of improvements to the warmup performance
                              • +
                              • Cleanup history rewriting in pyjitpl
                              • +
                              • Remove the forced minor collection that occurs when rewriting the assembler at the start of the JIT backend
                              • +
                              • Port the resource module to cffi
                              • +
                                +
                              • +
                              • +

                                +Internal refactorings:

                                +
                                  +
                                • Use a simpler logger to speed up translation
                                • +
                                • Drop vestiges of Python 2.5 support in testing
                                • +
                                • Update rpython functions with ones needed for py3k
                                • +
                                +
                              • +
                              +
                              +
                              +
                              +
                              +
                              +
                              +
                              +Please update, and continue to help us make PyPy better.
                              +Cheers
                              +The PyPy Team
                              +
                              +
                              +
                              +






                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/04/pypy-enterprise-edition-3688275697656890948.html b/posts/2016/04/pypy-enterprise-edition-3688275697656890948.html new file mode 100644 index 000000000..07dcff421 --- /dev/null +++ b/posts/2016/04/pypy-enterprise-edition-3688275697656890948.html @@ -0,0 +1,306 @@ + + + + + +PyPy Enterprise Edition | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy Enterprise Edition

                              + + + +
                              +

                              With the latest additions, PyPy's JIT now supports the Z architecture on Linux. The newest architecture revision (also known as s390x, or colloquially referred to as "big iron") is the 64-bit extension for IBM mainframes. Currently only Linux 64 bit is supported (not z/OS nor TPF).
                              +This is the fourth assembler backend supported by PyPy in addition to x86 (32 and 64), ARM (32-bit only) and PPC64 (both little- and big-endian). It might seem that we are kind of getting the hang of new architectures. Thanks to IBM for funding this work!

                              +

                              +History

                              +When I went to university one lecture covered the prediction of Thomas Watson in 1943. His famous quote "I think there is a world market for maybe five computers ...", turned out not to be true.

                              +However, even 70 years later, mainframes are used more often than you think. They back critical tasks requiring a high level of stability/security and offer high hardware and computational utilization rates by virtualization.

                              +With the new PyPy JIT backend we are happy to present a fast Python virtual machine for mainframes and contribute more free software running on s390x.

                              +Meta tracing +

                              +Even though the JIT backend has been tested on PyPy, it is not restricted to  the Python programming language. Do you have a great idea for a DSL, or another language that should run on mainframes? Go ahead and just implement your interpreter using RPython.

                              +How do I get a copy? +

                              +PyPy can be built using the usual instructions found here. As soon as the next PyPy version has been released we will provide binaries. Until then you can just grab a nightly here. We are currently busy getting the next version of PyPy ready, so an official release will be rolled out soon.

                              +Comparing s390x to x86 +

                              +The goal of this comparison is not to scientifically evaluate the benefits/disadvantages on s390x, but rather to see that PyPy's architecture delivers the same benefits as it does on other platforms. Similar to the comparison done for PPC I ran the benchmarks using the same setup. The first column is the speedup of the PyPy JIT VM compared to the speedup of a pure PyPy interpreter 1). Note that the s390x's OS was virtualized.

                              +  Label               x86     s390x      s390x (run 2)

                                ai                 13.7      12.4       11.9
                                bm_chameleon        8.5       6.3        6.8
                                bm_dulwich_log      5.1       5.0        5.1
                                bm_krakatau         5.5       2.0        2.0
                                bm_mako             8.4       5.8        5.9
                                bm_mdp              2.0       3.8        3.8
                                chaos              56.9      52.6       53.4
                                crypto_pyaes       62.5      64.2       64.2
                                deltablue           3.3       3.9        3.6
                                django             28.8      22.6       21.7
                                eparse              2.3       2.5        2.6
                                fannkuch            9.1       9.9       10.1
                                float              13.8      12.8       13.8
                                genshi_text        16.4      10.5       10.9
                                genshi_xml          8.2       7.9        8.2
                                go                  6.7       6.2       11.2
                                hexiom2            24.3      23.8       23.5
                                html5lib            5.4       5.8        5.7
                                json_bench         28.8      27.8       28.1
                                meteor-contest      5.1       4.2        4.4
                                nbody_modified     20.6      19.3       19.4
                                pidigits            1.0      -1.1       -1.0
                                pyflate-fast        9.0       8.7        8.5
                                pypy_interp         3.3       4.2        4.4
                                raytrace-simple    69.0     100.9       93.4
                                richards           94.1      96.6       84.3
                                rietveld            3.2       2.5        2.7
                                slowspitfire        2.8       3.3        4.2
                                spambayes           5.0       4.8        4.8
                                spectral-norm      41.9      39.8       42.6
                                spitfire            3.8       3.9        4.3
                                spitfire_cstringio  7.6       7.9        8.2
                                sympy_expand        2.9       1.8        1.8
                                sympy_integrate     4.3       3.9        4.0
                                sympy_str           1.5       1.3        1.3
                                sympy_sum           6.2       5.8        5.9
                                telco              61.2      48.5       54.8
                                twisted_iteration  55.5      41.9       43.8
                                twisted_names       8.2       9.3        9.7
                                twisted_pb         12.1      10.4       10.2
                                twisted_tcp         4.9       4.8        5.2


                                Geometric mean:    9.31      9.10       9.43


                              +As you can see the benefits are comparable on both platforms.
                              +Of course this is scientifically not good enough, but it shows a tendency. s390x can achieve the same results as you can get on x86.

                              +Are you running your business application on a mainframe? We would love to get some feedback. Join us on IRC and tell us if PyPy made your application faster!

                              +plan_rich & the PyPy Team

                              1) PyPy revision for the benchmarks: 4b386bcfee54 +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/04/warmup-improvements-more-efficient-7082900097299909512.html b/posts/2016/04/warmup-improvements-more-efficient-7082900097299909512.html new file mode 100644 index 000000000..39db83d82 --- /dev/null +++ b/posts/2016/04/warmup-improvements-more-efficient-7082900097299909512.html @@ -0,0 +1,416 @@ + + + + + +Warmup improvements: more efficient trace representation | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Warmup improvements: more efficient trace representation

                              + + + +
                              +
                              +

                              Hello everyone.

                              +

                              I'm pleased to inform you that we've finished another round of +improvements to the warmup performance of PyPy. Before I go +into details, I'll recap the achievements that we've made since we started +working on the warmup performance. I picked a random PyPy from November 2014 +(which is definitely before we started the warmup work) and compared it with +a recent one, after 5.0. The exact revisions are respectively ffce4c795283 +and cfbb442ae368. First let's compare pure warmup benchmarks that +can be found in our benchmarking suite. Out of those, +pypy-graph-alloc-removal numbers should be taken with a grain of salt, +since other work could have influenced the results. +The rest of the benchmarks mentioned are bottlenecked purely by warmup times.

                              +

                              You can see how much time your program spends in warmup by running +PYPYLOG=jit-summary:- pypy your-program.py under "tracing" and "backend" +fields (in the first three lines). An example looks like this:

                              +
                              +[e00c145a41] {jit-summary
                              +Tracing:        71      0.053645 <- time spent tracing & optimizing
                              +Backend:        71      0.028659 <- time spent compiling to assembler
                              +TOTAL:                  0.252217 <- total run time of the program
                              +
                              +

                              The results of the benchmarks

                              + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              benchmark | time - old | time - new | speedup | JIT time - old | JIT time - new
                              function_call | 1.86 | 1.42 | 1.3x | 1.12s | 0.57s
                              function_call2 | 5.17s | 2.73s | 1.9x | 4.2s | 1.6s
                              bridges | 2.77s | 2.07s | 1.3x | 1.5s | 0.8s
                              pypy-graph-alloc-removal | 2.06s | 1.65s | 1.25x | 1.25s | 0.79s
                              +

                              As we can see, the overall warmup benchmarks got up to 90% faster with +JIT time dropping by up to 2.5x. We have more optimizations in the pipeline, +with an idea how to transfer some of the JIT gains into more of a total program +runtime by jitting earlier and more eagerly.

                              +
                              +

                              Details of the last round of optimizations

                              +

                              Now the nitty gritty details - what did we actually do? I covered a lot of +warmup improvements in the past blog posts so I'm going to focus on +the last change, the jit-leaner-frontend branch. This last change is simple: instead of using +pointers to store the "operations" objects created during tracing, we use a compact list of +16-bit integers (with 16bit pointers in between). On a 64bit machine the memory wins are +tremendous - using 16bit pointers makes the new representation 4x more compact than using full 64bit pointers. +Additionally, the smaller representation has much better cache behavior and much less +pointer chasing in memory. It also has a better defined lifespan, so we don't need to +bother tracking them by the GC, which also saves quite a bit of time.

                              +

                              The change sounds simple, but the details in the underlying data mean that +everything in the JIT had to be changed which took quite a bit of effort :-)

                              +

                              Going into the future on the JIT front, we have an exciting set of optimizations, +ranging from faster loops through faster warmup to using better code generation +techniques and broadening the kind of program that PyPy speeds up. Stay tuned +for the updates.

                              +

                              We would like to thank our commercial partners for making all of this possible. +The work has been performed by baroquesoftware and would not be possible +without support from people using PyPy in production. If your company uses +PyPy and wants it to do more or does not use PyPy but has performance problems +with the Python installation, feel free to get in touch with me; trust me, using +PyPy ends up being a lot cheaper than rewriting everything in go :-)

                              +

                              Best regards,
                              +Maciej Fijalkowski

                              +
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Peter wrote on 2016-04-08 08:55: +
                              +
                              +

                              It would be nice to compare speed with C-Python and on short benchmarks, as that is where warmup time matters the most

                              +
                              +
                              +
                              +
                              + + Maciej Fijalkowski wrote on 2016-04-08 09:14: +
                              +
                              +

                              Those benchmarks are very synthetic warmup-oriented ones. It means you exec() piece of code and then run it 2000 times and then exec again. Any other short-running programs have a lot more noise where you have multiple effects taking place and it would be really hard to compare between old and new pypy. That said it's a fair requirement, we have one more branch in the pipeline and I'll try to get more real world data.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/05/pypy-511-bugfix-released-7586640750680293200.html b/posts/2016/05/pypy-511-bugfix-released-7586640750680293200.html new file mode 100644 index 000000000..ad13c1eed --- /dev/null +++ b/posts/2016/05/pypy-511-bugfix-released-7586640750680293200.html @@ -0,0 +1,309 @@ + + + + + +PyPy 5.1.1 bugfix released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy 5.1.1 bugfix released

                              + + + +
                              +
                              +
                              +We have released a bugfix for PyPy 5.1, due to a regression in installing third-party packages depending on numpy (using our numpy fork available at https://bitbucket.org/pypy/numpy ).

                              Thanks to those who reported the issue. We also fixed a regression in translating PyPy which increased the memory required to translate. Improvement will be noticed by downstream packagers and those who translate rather than
                              download pre-built binaries.
                              +
                              +

                              +What is PyPy?

                              +
                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                              We also welcome developers of other dynamic languages to see what RPython can do for them.

                              This release supports:
                              +
                                +
                              • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +Please update, and continue to help us make PyPy better.

                              Cheers

                              The PyPy Team
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/05/pypy33-v52-alpha-1-released-1725927506363370346.html b/posts/2016/05/pypy33-v52-alpha-1-released-1725927506363370346.html new file mode 100644 index 000000000..3a73d632d --- /dev/null +++ b/posts/2016/05/pypy33-v52-alpha-1-released-1725927506363370346.html @@ -0,0 +1,411 @@ + + + + + +PyPy3.3 v5.2 alpha 1 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy3.3 v5.2 alpha 1 released

                              + + + +
                              +

                              We're pleased to announce the first alpha release of PyPy3.3 v5.2. This is the
                              +first release of PyPy which targets Python 3.3 (3.3.5) compatibility.

                              +

                              We would like to thank all of the people who donated to the py3k proposal
                              +for supporting the work that went into this and future releases.

                              +

                              You can download the PyPy3.3 v5.2 alpha 1 release here:

                              +
                              https://pypy.org/download.html#python-3-3-5-compatible-pypy3-3-v5-2
                              +
                              +

                              Highlights

                              + +
                              +
                              +

                              What is PyPy?

                              +

                              PyPy is a very compliant Python interpreter, almost a drop-in replacement for
                              +CPython 2.7.10 and one day 3.3.5. It's fast due to its integrated tracing JIT
                              +compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython
                              +can do for them.

                              +

                              This release supports:

                              +
                                +
                              • +x86 machines on most common operating systems except Windows
                                +(Linux 32/64, Mac OS X 64, OpenBSD, FreeBSD),
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +

                              Please try it out and let us know what you think. We welcome feedback, we know
                              +you are using PyPy, please tell us about it!

                              +

                              We'd especially like to thank these people for their contributions to this
                              +release:

                              +

                              Manuel Jacob, Ronan Lamy, Mark Young, Amaury Forgeot d'Arc, Philip Jenvey,
                              +Martin Matusiak, Vasily Kuznetsov, Matti Picus, Armin Rigo and many others.

                              +

                              Cheers

                              +

                              The PyPy Team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + rnbdlnch wrote on 2016-05-31 09:33: +
                              +
                              +

                              thank you!!!

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-05-31 12:47: +
                              +
                              +

                              Many, many thanks!

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-05-31 13:30: +
                              +
                              +

                              As a follow-up: Did asyncio work previously? Anyway, it does now as 'yield from' is there. Beautiful!

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-06-01 01:52: +
                              +
                              +

                              Great News!!! Thank you!!!

                              +
                              +
                              +
                              +
                              + + Robert wrote on 2016-06-03 23:04: +
                              +
                              +

                              Excited! Can't wait for the 3.4 compatibility!

                              +
                              +
                              +
                              +
                              + + Sean Vieira wrote on 2016-07-05 19:33: +
                              +
                              +

                              Hip, hip, huzzah!

                              +
                              +
                              +
                              +
                              + + Hai Zaar wrote on 2016-07-25 14:59: +
                              +
                              +

                              Great news guys! Did you consider skipping 3.3/3.4 support all together and going straight for 3.5 compatibility?

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2016-07-31 14:22: +
                              +
                              +

                              We'll be working next on 3.5 support.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/06/pypy2-v53-released-major-c-extension-7708576047190172431.html b/posts/2016/06/pypy2-v53-released-major-c-extension-7708576047190172431.html new file mode 100644 index 000000000..086970f6b --- /dev/null +++ b/posts/2016/06/pypy2-v53-released-major-c-extension-7708576047190172431.html @@ -0,0 +1,435 @@ + + + + + +PyPy2 v5.3 released - major C-extension support improvements | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy2 v5.3 released - major C-extension support improvements

                              + + + +
                              +
                              +We have released PyPy2.7 v5.3, about six weeks after PyPy 5.1 and a week after +PyPy3.3 v5.2 alpha 1, the first PyPy release targeting 3.3 +compatibility. This new PyPy2.7 release includes major improvements for the +C-API compatibility layer. In addition to complete support +for lxml, we now pass most (more than 95%) of the upstream numpy test suite. We can build and run scipy and matplotlib as well. Most of the failures have to do with (ab)use of the C-API, for instance writing to a read-only pointer obtained from PyString_AsString().

                              +Note that the C-API compatibility layer is significantly slower than CPython, as explained in the blog post about the new strategy for reflection of C objects into the PyPy interpreter.

                              +We updated cffi to version 1.7 (incremental changes which provide a nicer developer experience, documented here). We would encourage developers to move their C-extension modules to cffi, but are willing to help you work through issues with existing code; come to #pypy on IRC and let us know how we can help you help us do better.

                              +You can download the PyPy2 v5.3 release here:
                              + +
                              +We would like to thank our donors for their continued support of the PyPy +project. We would also like to thank our contributors and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on PyPy, or general help +with making RPython’s JIT even better.

                              +

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (PyPy and CPython 2.7 performance comparison) due to its integrated tracing JIT compiler.

                              +We also welcome developers of other dynamic languages to see what RPython can do for them.

                              +This release supports:
                                +
                              • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux
                              • +
                              • big- and little-endian variants of PPC64 running Linux
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +

                              +Other Highlights

                              +

                              +(since the release of PyPy 5.1 in April, 2016)

                              +
                                +
                              • +
                                +New features: +
                                +
                                  +
                                • +
                                  +Merge a major expansion of the C-API support in cpyext, also expand cpyext tests to allow running them after translation as well as untranslated
                                  +
                                • +
                                • +
                                  +Instead of “GIL not held when a CPython C extension module +calls PyXxx”, we now silently acquire/release the GIL. Helps with +C extension modules that call some PyXxx() functions without +holding the GIL (arguably, they are theoretically buggy).
                                  +
                                • +
                                • +
                                  +Support command line -v to trace import statements
                                  +
                                • +
                                • +
                                  +Revive traceviewer, a tool to use pygame to view traces
                                  +
                                  +
                                  +
                                  +
                                • +
                                +
                              • +
                              • +
                                +Numpy via our internal _numpypy module: +
                                +
                                  +
                                • Implement ufunc.outer
                                • +
                                • Move PyPy-specific numpypy headers to a subdirectory (also changed the repo +accordingly)
                                • +

                                +
                              • +
                              • +
                                +Performance improvements: +
                                +
                                  +
                                • Use bitstrings to compress lists of descriptors that are attached to an +EffectInfo
                                • +
                                • Remove most of the _ovf, _zer and _val operations from RPython. Kills +quite some code internally, and allows the JIT to do better +optimizations: for example, app-level code like x / 2 or x % 2 +can now be turned into x >> 1 or x & 1, even if x is possibly +negative.
                                • +
                                • Rework the way registers are moved/spilled in before_call()
                                • +
                                +
                                +
                              • +
                              • +
                                +Internal refactorings: +
                                +
                                  +
                                • Refactor code to better support Python3-compatible syntax
                                • +
                                • Reduce the size of generated C sources during translation by +eliminating many many unused struct declarations (Issue #2281)
                                • +
                                • Reduce the size of generated code by using the same function objects in +all generated subclasses
                                • +
                                • Share cpyext Py* function wrappers according to the signature, shrinking the +translated libpypy.so by about 10% (without the JIT)
                                • +
                                +
                              • +
                              +Please update, and continue to help us make PyPy better. +Cheers
                              + +The PyPy Team
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2016-06-09 19:48: +
                              +
                              +

                              "We can build and run scipy and matplotlib as well."

                              That's exciting. Are there special instructions needed to build and run scipy and matplotlib with PyPy to see how well it presently works for particular applications? Or is it not even really ready for outsiders to knock it around yet?

                              +
                              +
                              +
                              +
                              + + mattip wrote on 2016-06-09 23:26: +
                              +
                              +

                              No special instructions, just build from source (binaries precompiled for cpython will not work) using "pypy setup.py install", and let us know how it goes. The order should be numpy, matplotlib, scipy (we have reports that pygtk works too fwiw).

                              There have already been some bug reports, so you might want to patch your pymem.h header in pypy/include with this changeset https://bitbucket.org/pypy/pypy/commits/68486f0f79c649514, and if you are on OSX you may need to patch numpy/distutils/fcompiler/gnu.py with this patch https://bitbucket.org/pypy/numpy/commits/50bff5807e09721acc4d778ce8ffdef86e2f4c50

                              +
                              +
                              +
                              +
                              + + Canesin wrote on 2016-06-12 17:38: +
                              +
                              +

                              Great work as usual!

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html b/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html new file mode 100644 index 000000000..34f946d79 --- /dev/null +++ b/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html @@ -0,0 +1,806 @@ + + + + + +Reverse debugging for Python | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Reverse debugging for Python

                              + + + +
                              +
                              +

                              RevPDB

                              +

                              A "reverse debugger" is a debugger where you can go forward and +backward in time. It is an uncommon feature, at least in the open +source world, but I have no idea why. I have used undodb-gdb and +rr, which are reverse debuggers for C code, and I can only say that +they saved me many, many days of poking around blindly in gdb.

                              +

                              The PyPy team is pleased to give you "RevPDB", a reverse-debugger +similar to rr but for Python.

                              +

                              An example is worth a thousand words. Let's say your big Python +program has a bug that shows up inconsistently. You have nailed it +down to something like:

                              +
                                +
                              • start x.py, which does stuff (maybe involving processing files, +answering some web requests that you simulate from another terminal, +etc.);
                              • +
                              • sometimes, after a few minutes, your program's state becomes +inconsistent and you get a failing assert or another exception.
                              • +
                              +

                              This is the case where RevPDB is useful.

                              +

                              RevPDB is available only on 64-bit Linux and OS/X right now, but should +not be too hard to port to other OSes. It is very much alpha-level! +(It is a debugger full of bugs. Sorry about that.) I believe it is +still useful---it helped me in one real use case already.

                              +
                              +
                              +

                              How to get RevPDB

                              +

                              The following demo was done with an alpha version for 64-bit Linux, +compiled for Arch Linux. I won't provide the binary; it should be +easy enough to retranslate (much faster than a regular PyPy because it +contains neither a JIT nor a custom GC). Grab the PyPy sources from +Mercurial, and then:

                              +
                              +hg update reverse-debugger
                              +# or "hg update ff376ccacb36" for exactly this demo
                              +cd pypy/goal
                              +../../rpython/bin/rpython -O2 --revdb targetpypystandalone.py  \
                              +                  --withoutmod-cpyext --withoutmod-micronumpy
                              +
                              +

                              and possibly rename the final pypy-c to pypy-revdb to avoid +confusion.

                              +

                              Other platforms than 64-bit Linux and OS/X need some fixes before they work.

                              +
                              +
                              +

                              Demo

                              +

                              For this demo, we're going to use this x.py as the "big program":

                              +
                              +import os
                              +
                              +class Foo(object):
                              +    value = 5
                              +
                              +lst1 = [Foo() for i in range(100)]
                              +lst1[50].value += 1
                              +for x in lst1:
                              +    x.value += 1
                              +
                              +for x in lst1:
                              +    if x.value != 6:
                              +        print 'oops!'
                              +        os._exit(1)
                              +
                              +

                              Of course, it is clear what occurs in this small example: the check +fails on item 50. For this demo, the check has been written with +os._exit(1), because this exits the program immediately. If it +were written with an assert, then its failure would execute things +in the traceback module afterwards, to print the traceback; it +would be a minor mess just to find the exact point of the failing +assert. (This and other issues are supposed to be fixed in the +future, but for now it is alpha-level.)

                              +

                              Anyway, with a regular assert and a regular post-mortem pdb, +we could observe that x.value is indeed 7 instead of 6 when the +assert fails. Imagine that the program is much bigger: how would we +find the exact chain of events that caused this value 7 to show up on +this particular Foo object? This is what RevPDB is for.

                              +

                              First, we need for now to disable Address Space Layout Randomization +(ASLR), otherwise replaying will not work. This is done once with the +following command line, which changes the state until the next +reboot:

                              +
                              +echo 0 | sudo tee /proc/sys/kernel/randomize_va_space
                              +
                              +

                              UPDATE: the above is no longer necessary from revision ff376ccacb36.

                              +

                              Run x.py with RevPDB's version of PyPy instead of the regular +interpreter (CPython or PyPy):

                              +
                              +PYPYRDB=log.rdb ./pypy-revdb x.py
                              +
                              +

                              This pypy-revdb executable is like a slow PyPy executable, running +(for now) without a JIT. This produces a file log.rdb which +contains a complete log of this execution. (If the bug we are +tracking occurs rarely, we need to re-run it several times until we +get the failure. But once we got the failure, then we're done with +this step.)

                              +

                              Start:

                              +
                              +rpython/translator/revdb/revdb.py log.rdb
                              +
                              +

                              We get a pdb-style debugger. This revdb.py is a normal Python +program, which you run with an unmodified Python; internally, it looks +inside the log for the path to pypy-revdb and runs it as needed (as +one forking subprocess, in a special mode).

                              +

                              Initially, we are at the start of the program---not at the end, like +we'd get in a regular debugger:

                              +
                              +File "<builtin>/app_main.py", line 787 in setup_bootstrap_path:
                              +(1)$
                              +
                              +

                              The list of commands is available with help.

                              +

                              Go to the end with continue (or c):

                              +
                              +(1)$ continue
                              +File "/tmp/x.py", line 14 in <module>:
                              +...
                              +  lst1 = [Foo() for i in range(100)]
                              +  lst1[50].value += 1
                              +  for x in lst1:
                              +      x.value += 1
                              +
                              +  for x in lst1:
                              +      if x.value != 6:
                              +          print 'oops!'
                              +>         os._exit(1)
                              +(19727)$
                              +
                              +

                              We are now at the beginning of the last executed line. The number +19727 is the "time", measured in number of lines executed. We can go +backward with the bstep command (backward step, or bs), line +by line, and forward again with the step command. There are also +commands bnext, bcontinue and bfinish and their forward +equivalents. There is also "go TIME" to jump directly to the specified +time. (Right now the debugger only stops at "line start" +events, not at function entry or exit, which makes some cases a bit +surprising: for example, a step from the return statement of +function foo() will jump directly to the caller's caller, if the +caller's current line was return foo() + 2, because no "line +start" event occurs in the caller after foo() returns to it.)

                              +

                              We can print Python expressions and statements using the p +command:

                              +
                              +(19727)$ p x
                              +$0 = <__main__.Foo object at 0xfffffffffffeab3e>
                              +(19727)$ p x.value
                              +$1 = 7
                              +(19727)$ p x.value + 1
                              +8
                              +
                              +

                              The "$NUM =" prefix is only shown when we print an object that +really exists in the debugged program; that's why the last line does +not contain it. Once a $NUM has been printed, then we can use +it in further expressions---even at a different point in time. It +becomes an anchor that always refers to the same object:

                              +
                              +(19727)$ bstep
                              +
                              +File "/tmp/x.py", line 13 in <module>:
                              +...
                              +
                              +  lst1 = [Foo() for i in range(100)]
                              +  lst1[50].value += 1
                              +  for x in lst1:
                              +      x.value += 1
                              +
                              +  for x in lst1:
                              +      if x.value != 6:
                              +>         print 'oops!'
                              +          os._exit(1)
                              +(19726)$ p $0.value
                              +$1 = 7
                              +
                              +

                              In this case, we want to know when this value 7 was put in this +attribute. This is the job of a watchpoint:

                              +
                              +(19726)$ watch $0.value
                              +Watchpoint 1 added
                              +updating watchpoint value: $0.value => 7
                              +
                              +

                              This watchpoint means that $0.value will be evaluated at each line. +When the repr() of this expression changes, the watchpoint activates +and execution stops:

                              +
                              +(19726)$ bcontinue
                              +[searching 19629..19726]
                              +[searching 19338..19629]
                              +
                              +updating watchpoint value: $0.value => 6
                              +Reverse-hit watchpoint 1: $0.value
                              +File "/tmp/x.py", line 9 in <module>:
                              +  import os
                              +
                              +  class Foo(object):
                              +      value = 5
                              +
                              +  lst1 = [Foo() for i in range(100)]
                              +  lst1[50].value += 1
                              +  for x in lst1:
                              +>     x.value += 1
                              +
                              +  for x in lst1:
                              +      if x.value != 6:
                              +          print 'oops!'
                              +          os._exit(1)
                              +(19524)$
                              +
                              +

                              Note that using the $NUM syntax is essential in watchpoints. You +can't say "watch x.value", because the variable x will go out +of scope very soon when we move forward or backward in time. In fact +the watchpoint expression is always evaluated inside an environment +that contains the builtins but not the current locals and globals. +But it also contains all the $NUM, which can be used to refer to +known objects. It is thus common to watch $0.attribute if $0 +is an object, or to watch len($1) if $1 is some list. The +watch expression can also be a simple boolean: for example, "watch +$2 in $3" where $3 is some dict and $2 is some object that +you find now in the dict; you would use this to find out the time when +$2 was put inside $3, or removed from it.

                              +

                              Use "info watchpoints" and "delete <watchpointnum>" to manage +watchpoints.

                              +

                              There are also regular breakpoints, which you set with "b +FUNCNAME". It breaks whenever there is a call to a function that +happens to have the given name. (It might be annoying to use for a +function like __init__() which has many homonyms. There is no +support for breaking on a fully-qualified name or at a given line +number for now.)

                              +

                              In our demo, we stop at the line x.value += 1, which is where the +value was changed from 6 to 7. Use bcontinue again to stop at the +line lst1[50].value += 1, which is where the value was changed from +5 to 6. Now we know how this value attribute ends up being 7.

                              +
                              +(19524)$ bcontinue
                              +[searching 19427..19524]
                              +[searching 19136..19427]
                              +
                              +updating watchpoint value: $0.value => 5
                              +Reverse-hit watchpoint 1: $0.value
                              +File "/tmp/x.py", line 7 in <module>:
                              +  import os
                              +
                              +  class Foo(object):
                              +      value = 5
                              +
                              +  lst1 = [Foo() for i in range(100)]
                              +> lst1[50].value += 1
                              +  for x in lst1:
                              +      x.value += 1
                              +
                              +  for x in lst1:
                              +      if x.value != 6:
                              +...
                              +(19422)$
                              +
                              +

                              Try to use bcontinue yet another time. It will stop now just before +$0 is created. At that point in time, $0 refers to +an object that does not exist yet, so the watchpoint now evaluates to +an error message (but it continues to work as before, with that error +message as the string it currently evaluates to).

                              +
                              +(19422)$ bcontinue
                              +[searching 19325..19422]
                              +
                              +updating watchpoint value: $0.value => RuntimeError:
                              +               '$0' refers to an object created later in time
                              +Reverse-hit watchpoint 1: $0.value
                              +File "/tmp/x.py", line 6 in <module>:
                              +  import os
                              +
                              +  class Foo(object):
                              +      value = 5
                              +
                              +> lst1 = [Foo() for i in range(100)]
                              +  lst1[50].value += 1
                              +  for x in lst1:
                              +      x.value += 1
                              +
                              +  for x in lst1:
                              +...
                              +(19371)$
                              +
                              +

                              In big programs, the workflow is similar, just more complex. Usually +it works this way: we find interesting points in time with some +combination of watchpoints and some direct commands to move around. +We write down on a piece of (real or virtual) paper these points in +history, including most importantly their time, so that we can +construct an ordered understanding of what is going on.

                              +

                              The current revdb can be annoying and sometimes even crash; but +the history you reconstruct can be kept. All the times and +expressions printed are still valid when you restart revdb. The +only thing "lost" is the $NUM objects, which you need to print +again. (Maybe instead of $0, $1, ... we should use $<big +number>, where the big number uniquely identifies the object by its +creation time. These numbers would continue to be valid even after +revdb is restarted. They are more annoying to use than just +$0 though.)

                              +

                              Screencast: Here's a (slightly typo-y) screencast of cfbolz using the reverse debugger: +

                              +
                              +
                              +

                              Current issues

                              +

                              General issues:

                              +
                                +
                              • If you are using revdb on a log that took more than a few +minutes to record, then it can be painfully slow. This is because +revdb needs to replay again big parts of the log for some +operations.
                              • +
                              • The pypy-revdb is currently missing the following modules:
                                  +
                                • +thread (implementing multithreading is possible, but not done +yet);
                                • +
                                • +cpyext (the CPython C API compatibility layer);
                                • +
                                • +micronumpy (minor issue only);
                                • +
                                • +_continuation (for greenlets).
                                • +
                                +
                              • +
                              • Does not contain a JIT, and does not use our fast garbage +collectors. You can expect pypy-revdb to be maybe 3 times +slower than CPython.
                              • +
                              • Only works on Linux and OS/X. There is no fundamental reason for +this restriction, but it is some work to fix.
                              • +
                              • Replaying a program uses a lot more memory; maybe 15x as much as +during the recording. This is because it creates many forks. If +you have a program that consumes 10% of your RAM or more, you will +need to reduce MAX_SUBPROCESSES in process.py.
                              • +
                              +

                              Replaying also comes with a bunch of user interface issues:

                              +
                                +
                              • +Attempted to do I/O or access raw memory: we get this whenever +trying to print some expression that cannot be evaluated with +only the GC memory---or which can, but then the __repr__() +method of the result cannot. We need to reset the state with +bstep + step before we can print anything else. However, +if only the __repr__() crashes, you still see the $NUM = +prefix, and you can use that $NUM afterwards.
                              • +
                              • +id() is globally unique, returning a reproducible 64-bit number, +so sometimes using id(x) is a workaround for when using x +doesn't work because of Attempted to do I/O issues (e.g. p +[id(x) for x in somelist]).
                              • +
                              • as explained in the demo, next/bnext/finish/bfinish might jump +around somewhat unpredictably.
                              • +
                              • similarly, breaks on watchpoints can stop at apparently unexpected +places (when going backward, try to do "step" once). The issue is +that it can only stop at the beginning of every line. In the +extreme example, if a line is foo(somelist.pop(getindex())), +then somelist is modified in the middle. Immediately before +this modification occurs, we are in getindex(), and +immediately afterwards we are in foo(). The watchpoint will +stop the program at the end of getindex() if running backward, +and at the start of foo() if running forward, but never +actually on the line doing the change.
                              • +
                              • watchpoint expressions must not have any side-effect at all. If +they do, the replaying will get out of sync and revdb.py will +complain about that. Regular p expressions and statements can +have side-effects; these effects are discarded as soon as you move +in time again.
                              • +
                              • sometimes even "p import foo" will fail with Attempted to do +I/O. Use instead "p import sys; foo = sys.modules['foo']".
                              • +
                              • use help to see all commands. backtrace can be useful. +There is no up command; you have to move in time instead, +e.g. using bfinish to go back to the point where the current +function was called.
                              • +
                              +
                              +
                              +

                              How RevPDB is done

                              +

                              If I had to pick the main advantage of PyPy over CPython, it is that, +with the RPython translation toolchain, we have got a real place for +experimentation. Every now and then, we build inside RPython some +feature that gives us an optionally tweaked version of the PyPy +interpreter---tweaked in a way that would be hard to do with CPython, +because it would require systematic changes everywhere. The most +obvious and successful examples are the GC and the JIT. But there +have been many other experiments along the same lines, from the +so-called stackless transformation in the early days, to the STM +version of PyPy.

                              +

                              RevPDB works in a similar way. It is a version of PyPy in which some +operations are systematically replaced with other operations.

                              +

                              To keep the log file at a reasonable size, we duplicate the content of +all GC objects during replaying---by repeating the same actions on +them, without writing anything in the log file. So that means that in +the pypy-revdb binary, the operations that do arithmetic or +read/write GC-managed memory are not modified. Most operations are +like that. However, the other operations, the ones that involve +either non-GC memory or calls to external C functions, are tweaked. +Each of these operations is replaced with code that works in two +modes, based on a global flag:

                              +
                                +
                              • in "recording" mode, we log the result of the operation (but not the +arguments);
                              • +
                              • in "replaying" mode, we don't really do the operation at all, but +instead just fetch the result from the log.
                              • +
                              +

                              Hopefully, all remaining unmodified operations (arithmetic and GC +load/store) are completely deterministic. So during replaying, every +integer or non-GC pointer variable will have exactly the same value as +it had during recording. Interestingly, it means that if the +recording process had a big array in non-GC memory, then in the +replaying process, the array is not allocated at all; it is just +represented by the same address, but there is nothing there. When we +record "read item 123 from the array", we record the result of the +read (but not the "123"). When we replay, we're seeing again the same +"read item 123 from the array" operation. At that point, we don't +read anything; we just return the result from the log. Similarly, +when recording a "write" to the array, we record nothing (this write +operation has no result); so that when replaying, we redo nothing.

                              +

                              Note how that differs from anything managed by GC memory: GC objects +(including GC arrays) are really allocated, writes really occur, and +reads are redone. We don't touch the log in this case.

                              +
                              +
                              +

                              Other reverse debuggers for Python

                              +

                              There are already some Python experiments about reverse debugging. +This is also known as "omniscient debugging". However, I claim that +the result they get to is not very useful (for the purpose presented +here). How they work is typically by recording changes to some +objects, like lists and dictionaries, in addition to recording the +history of where your program passed through. However, the problem with +Python is that lists and dictionaries are not the end of the story. +There are many, many, many types of objects written in C which are +mutable---in fact, the immutable ones are the exception. You can try +to systematically record all changes, but it is a huge task and easy +to forget a detail.

                              +

                              In other words it is a typical use case for tweaking the RPython +translation toolchain, rather than tweaking the CPython (or PyPy) +interpreter directly. The result that we get here with RevPDB is more +similar to rr anyway, in that only a relatively small number of +external events are recorded---not every single change to every single +list and dictionary.

                              +

                              Some links:

                              + +

                              For C:

                              + +
                              +
                              +

                              Future work

                              +

                              As mentioned above, it is alpha-level, and only works on Linux and OS/X. +So the plans for the immediate future are to fix the various +issues described above, and port to more operating systems. The core of the system +is in the C file and headers in rpython/translator/revdb/src-revdb.

                              +

                              For interested people, there is also the Duhton interpreter and its +reverse-debugger branch, which is where I prototyped the RPython +concept before moving to PyPy. The basics should work for any +interpreter written in RPython, but they require some specific code to +interface with the language; in the case of PyPy, it is in +pypy/interpreter/reverse_debugging.py.

                              +

                              In parallel, there are various user interface improvements that people +could be interested in, like a more "pdb++" experience. (And the script +at rpython/translator/revdb/revdb.py should be moved out into some +more "official" place, and the reverse-debugger branch should be +merged back to default.)

                              +

                              I would certainly welcome any help!

                              +

                              -+- Armin

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Rachmad Imam Tarecha wrote on 2016-07-08 13:57: +
                              +
                              +

                              I think python is hard programming language, :D

                              +
                              +
                              +
                              +
                              + + mrh1997 wrote on 2016-07-09 22:59: +
                              +
                              +

                              I am really impressed!
                              Especially of the fact that you did the Job within one month.

                              I had the idea of such a tool, too some time ago (with exactly the same approach, but in CPython instead of PyPy).
                              But I failed to implement it, as in CPython I had to do a lot more modifications...

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2016-07-10 18:31: +
                              +
                              +

                              Seems to work out of the box on OS/X. I've updated it in the blog post.

                              +
                              +
                              +
                              +
                              + + Ron Barak wrote on 2016-07-14 22:50: +
                              +
                              +

                              Erratum:
                              RevPDB is only available only on 64-bit Linux -> RevPDB is available only on 64-bit Linux

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2016-07-15 08:55: +
                              +
                              +

                              Thanks for the typo.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.html b/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.html new file mode 100644 index 000000000..ecd7b3c38 --- /dev/null +++ b/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.html @@ -0,0 +1,517 @@ + + + + + +PyPy gets funding from Mozilla for Python 3.5 support | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy gets funding from Mozilla for Python 3.5 support

                              + + + +
                              +

                              "Python 2.x versus Python 3.x": this is by now an old question. In the eyes of some people Python 2 is here to stay, and in the eyes of others Python has long been 3 only.

                              + +

                              PyPy's own position is that PyPy will support Python 2.7 forever---the RPython language in which PyPy is written is a subset of 2.7, and we have no plan to upgrade that. But at the same time, we want to support 3.x. This is particularly true now: a relatively recent development is that Python 3.5 seems to attract more and more people. The "switch" to Python 3.x might be starting to happen.

                              + +

                              Correspondingly, PyPy has been searching for a while for a way to support a larger-scale development effort. The goal is to support not just any old version of Python 3.x, but Python 3.5, as this seems to be the version that people are switching to. PyPy is close to supporting all of Python 3.3 now; but the list of what is new in Python 3.4 and 3.5 is far, far longer than anyone imagines. The long-term goal is also to get a version of "PyPy3" that is as good as "PyPy2" is, including its performance and its cpyext layer (CPython C API interoperability), for example.

                              + +

                              So, the end result: Mozilla recently decided to award $200,000 to Baroque Software to work on PyPy as part of its Mozilla Open Source Support (MOSS) initiative. This money will be used to implement the Python 3.5 features in PyPy. Within the next year, we plan to use the money to pay four core PyPy developers half-time to work on the missing features and on some of the big performance and cpyext issues. This should speed up the progress of catching up with Python 3.x significantly. We are extremely thankful to Mozilla for supporting us in this way, and will keep you updated on the progress via this blog.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Dave wrote on 2016-08-09 17:46: +
                              +
                              +

                              Great to hear of this development. I'm one of those "Python has long been 3 only" developers, but have had an eye on PyPy for a long time and even donated several times. Planning to switch to PyPy when 3.5 support lands.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-08-09 17:56: +
                              +
                              +

                              glad to hear that.

                              To me, the time to switch to py3 depends upon the maturity of pypy 3.

                              I have used pypy 2 for a while in production, so far so good.

                              +
                              +
                              +
                              +
                              + + Alessandro wrote on 2016-08-09 18:05: +
                              +
                              +

                              Great news!

                              I'm one of those "Python 3 only". The switch was terrible for the community, but Python 3 is superior to 2 in my opinion.

                              RPython 3 would be great too, but it's probably completely unviable.

                              +
                              +
                              +
                              +
                              + + Ronan wrote on 2016-08-09 18:29: +
                              +
                              +

                              Fantastic news! Thanks for your work, and thanks Mozilla for their support :)

                              +
                              +
                              +
                              +
                              + + Shen wrote on 2016-08-09 21:36: +
                              +
                              +

                              Awesome !

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-08-10 00:07: +
                              +
                              +

                              Is there any chance optional typing information will be used to help the JIT?

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-08-10 00:16: +
                              +
                              +

                              200,000 sounds like a lot of money but it is two developers for a year at less than Silicon Valley wages.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-08-10 00:45: +
                              +
                              +

                              I think you are overpaying yourselves. But hey, it's your money.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-08-10 06:06: +
                              +
                              +

                              2.7 forever!

                              +
                              +
                              +
                              +
                              + + cclauss wrote on 2016-08-10 07:38: +
                              +
                              +

                              This is huge news! Corporate sponsorship of open source projects is a beautiful thing for us all. Congrats to the PyPy team. You really deserve this kind of support for all of your hard work and perseverance over the years.

                              Given that Python 3.6 will be going beta next month, perhaps that should be your target instead of 3.5 but you know your craft better than I do.

                              Those of you who would be interested to pay Mozilla back for this investment might want to help port the following 9 Mozilla projects to Python 3:
                              * mozrunner, moznetwork, mozdevice, mozprocess, mozprofile, mozfile, mozinfo, mozlog, mozcrash

                              These nine Python 2 projects are all in the Top 150 PyPI downloads of all time and each of them has been downloaded with pip more than 5 million times. Currently 92% of the Top 200 PyPI packages are Python3 compatible. Converting these 9 Mozbase modules to Python 3 would bump that number up to 96.5%. It would also probably push us over the line where 50% of the Top 4,000 PyPI packages are Python 3 compatible. This kind of momentum would be welcome news as the Python community continues our move to Python 3.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2016-08-10 09:46: +
                              +
                              +

                              Why Python 3.5 instead of 3.6? That's because 3.5 is the version that attracts people. Python 3.6 will only be out of beta in December and my own wild guess is that it won't immediately attract all the 2.7 people, who grew accustomed to sticking to a known unchanging version. So the deal with Mozilla is to get a PyPy 3.5 working as well as possible; it is better than getting a PyPy 3.6 that (like current versions of PyPy 3) has downsides in completeness and performance.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-08-10 12:23: +
                              +
                              +

                              Great news! I can't wait for the moment we get a stable Python3 PyPy! Congratulations!

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-08-10 12:24: +
                              +
                              +

                              Great news! I can't wait for the moment we get a stable Python3 PyPy! Congratulations!

                              +
                              +
                              +
                              +
                              + + guillem wrote on 2016-08-10 14:08: +
                              +
                              +

                              TL;DR. Python 3.5 is a "good enough" and seems a future-proof language.

                              I was in charge of deciding which version of Python to use at my job. We started the development of a framework supporting Python 2.7 and Python >=3.4, but we recently switched to a Python 3 only development. The whole python 2 vs 3 thing was quite confusing to the operations department and developers that are not proficient with Python.

                              There was a quite thorough assessment of the features, and we decided to stick to Python 3.5, at least for the next decade or so. On the Python 3 side, one of the reasons was that the async/await syntax allows junior developers to understand (more or less) asynchronous programming, while coroutines+decorators are quite a mess. We still have some Red Hat instances that use Python 3.4, but as soon as we get rid of them, everything will be Python 3.5.

                              +
                              +
                              +
                              +
                              + + touilleMan wrote on 2016-08-10 15:24: +
                              +
                              +

                              Awesome news ! Long live Pypy ;-)

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-08-10 18:28: +
                              +
                              +

                              Super great news!

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-08-10 21:12: +
                              +
                              +

                              This is amazing news! I use Python3 extensively in private research projects. Unfortunately, I have been in the position of choosing between Python3 and having a high-performance interpreter implementation. Testing the PyPy 3.3.5 alpha shows all-around disappointing performance in our string-manipulation-heavy projects making intense use of unicode, so I can only recommend our team to stay with CPython for both performance and compatibility.

                              I am very excited to hear that PyPy3 will be getting more of the specific attention it deserves, with Python 3.5 support to boot!

                              +
                              +
                              +
                              +
                              + + PvdE wrote on 2016-08-11 13:03: +
                              +
                              +

                              Good news, and you definitely deserve it. But I guess this will take virtually all of the PyPy team's resources for the next one to two years; what does this mean for other in-progress innovations, in particular STM-GC? I donated but it looks like the money is not being spent.

                              If continuing the improvements to CPython support means PyPy will become an option for more programs, that would be great. The (few) red lines in https://packages.pypy.org/ and lower performance for others are still blockers for many users.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2016-08-11 16:40: +
                              +
                              +

                              @PvdE: I'll admit that the STM-GC is not actively going anywhere right now. STM is hard, unsurprisingly. There is still a bit being done by Remi (working at an academic institution where he doesn't need the money). For me, I am part of the Python 3.5 team, but I might also come back to STM-GC soon. We expect not all our resources to be consumed by Python 3.5. In fact, the money covers four half-time jobs (and flexibility allows someone to do less than half-time while someone else does more).

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-08-17 04:04: +
                              +
                              +

                              This is great news.

                              A developer's decision to target Python 3 over Python 2 in their projects is more fundamental than deciding which interpreter to use. People use Python 3 because it's future-proof, to take advantage of its new features and to do their bit in driving the Python ecosystem forward. For me and I imagine many others, being curious about PyPy hasn't been enough to override all those factors.

                              I think there's a huge untapped audience out there waiting for first-class support for modern Python in PyPy before giving it a shot. I hope to see a big bump in PyPy adoption with this move, and a corresponding bump in funding and support for PyPy's development.

                              Thanks for your fantastic work!

                              +
                              +
                              +
                              +
                              + + JP wrote on 2016-08-31 17:45: +
                              +
                              +

                              Great news! Will this include making numpy work with Python3 pypy? That's the main thing preventing me from evaluating pypy for my Python3-only OpenGL application.

                              +
                              +
                              +
                              +
                              + + mattip wrote on 2016-09-07 09:05: +
                              +
                              +

                              @JP cpyext compatibility is one of the milestones, and we currently pass over 99% of the upstream NumPy test suite using PyPy 2.7, so it all should Just Work

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-10-04 16:54: +
                              +
                              +

                              Getting PyPy3 to 3.5 status is a good start considering that the current LTS version of Ubuntu (16.04) has 3.5 and that is going to be supported for a while.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-12-17 22:26: +
                              +
                              +

                              Great news! Pypy's lack of Python 3 support is the biggest reason I haven't switched yet. It technically supports most of 3.3 already, but since Pypy3 is slower than CPython, it may as well not exist. Hopefully you'll also work out the performance issues in Pypy 3 as well.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/08/pypy-tooling-upgrade-jitviewer-and-5107430577468391432.html b/posts/2016/08/pypy-tooling-upgrade-jitviewer-and-5107430577468391432.html new file mode 100644 index 000000000..ce8f68e25 --- /dev/null +++ b/posts/2016/08/pypy-tooling-upgrade-jitviewer-and-5107430577468391432.html @@ -0,0 +1,362 @@ + + + + + +PyPy Tooling Upgrade: JitViewer and VMProf | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy Tooling Upgrade: JitViewer and VMProf

                              + + + +
                              +

                              We are happy to announce a major JitViewer (JV) update.
                              +JV allows you to inspect the internal compiler representation of RPython (the language in which PyPy is implemented), including the generated machine code of your program. It can graphically show you details of the JIT compiled code and helps you pinpoint issues in your program.

                              +VMProf is a statistical CPU profiler for Python that imposes very little overhead at runtime.

                              +Both VMProf and JitViewer share a common goal: to present useful information about your Python program.
                              +The combination of both can reveal more information than either alone.
                              +That is the reason why they are now both packaged together.
                              +We also updated vmprof.com with various bug fixes and changes including an all new interface to JV.

                              +This work was done with the goal of improving tooling and libraries around the Python/PyPy/RPython ecosystem.
                              +Some of the tools we have developed:

                              +
                                +
                              • +CFFI - Foreign Function Interface that avoids CPyExt (CFFI docs)
                              • +
                              • +RevDB - A reverse debugger for python (RevDB blog post)
                              • +
                              +
                              +and of course the tools we discuss here:

                                +
                              • +VMProf - A statistical CPU profiler (VMProf docs)
                              • +
                              • +JitViewer - Visualization of the log file produced by RPython (JitLog docs)
                              • +
                              +

                              +A "brand new" JitViewer

                              +
                              +JitViewer has two pieces: you create a log file when running your program, and then use a graphic tool to view what happened.

                              +The old logging format was a hard-to-maintain, plain-text-logging facility. Frequent changes often broke internal tools.
                              +Additionally, the logging output of a long running program required a lot of disk space.

                              +Our new binary format encodes data densely, makes use of some compression (gzip), and tries to remove repetition where possible.
                              +It also supports versioning for future proofing and can be extended easily.

                              +And *drumroll* you no longer need to install a tool to view the log
                              +yourself! The whole system moved to vmprof.com and you can use it any time.

                              +Sounds great. But what can you do with it? Here are two examples for a PyPy user:

                              +
                              PyPy crashed? Did you discover a bug?

                              +
                              +For some hard to find bugs it is often necessary to look at the compiled code. The old
                              +procedure often required you to upload a plain text file which was hard to parse and to look through.

                              +A better way to share a crash report is to install the ``vmprof`` module from PyPI and execute either of the two commands:

                              +# this program does not crash, but has some weird behaviour
                              $ pypy -m jitlog --web <your program args>
                              ...
                              PyPy Jitlog: https://vmprof.com/#/<hash>/traces
                              # this program segfaults
                              $ pypy -m jitlog -o /tmp/log <your program args>
                              ...
                              <Segfault>
                              $ pypy -m jitlog --upload /tmp/log
                              PyPy Jitlog: https://vmprof.com/#/<hash>/traces


                              +Providing the link in the bug report allows PyPy developers to browse and identify potential issues.

                              +Speed issues

                              +
                              +VMProf is a great tool to find hot spots that consume a lot of time in your program. As soon as you have identified code that runs slowly, you can switch to jitlog and maybe pinpoint certain aspects that do not behave as expected. You will find an overview, and are able to browse the generated code. If you cannot make sense of all that, you can just share the link with us and we can have a look too.

                              +
                              Future direction

                              +
                              +We hope that the new release will help both PyPy developers and PyPy users resolve potential issues and easily point them out.

                              +Here are a few ideas of what might come in the next few releases:


                                +
                              •  Combination of CPU profiles and the JITLOG (sadly did not make it into the current release).
                              • +
                              • Extend vmprof.com to be able to query vmprof/jitlog.
                                An example query for vmprof: 'methods.callsites() > 5' and
                                for the jitlog would be 'traces.contains('call_assembler').hasbridge('*my_func_name*')'.
                              • +
                              • Extend the jitlog to capture the information of the optimization stage.
                              • +
                              +

                              +Richard Plangger (plan_rich) and the PyPy team
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + phd wrote on 2016-08-11 20:29: +
                              +
                              +

                              https://www.vmprof.com/ doesn't work, but https://vmprof.com/ does. Please fix your DNS.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2016-08-14 08:43: +
                              +
                              +

                              @phd: thanks, fixed the link inside the blog post. ``www.vmprof.com`` was never intended to work---but that could be considered as a bug; if you feel like it should, please open an issue.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/08/pypy2-v54-released-incremental-3611318295736669599.html b/posts/2016/08/pypy2-v54-released-incremental-3611318295736669599.html new file mode 100644 index 000000000..610b28691 --- /dev/null +++ b/posts/2016/08/pypy2-v54-released-incremental-3611318295736669599.html @@ -0,0 +1,357 @@ + + + + + +PyPy2 v5.4 released - incremental improvements and enhancements | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy2 v5.4 released - incremental improvements and enhancements

                              + + + +
                              +
                              +We have released PyPy2.7 v5.4, a little under two months after PyPy2.7 v5.3. +This new PyPy2.7 release includes incremental improvements to our C-API +compatibility layer (cpyext), enabling us to pass over 99% of the upstream +numpy test suite.

                              +We updated built-in cffi support to version 1.8, +which now supports the “limited API” mode for c-extensions on +CPython >=3.2.

                              +We improved tooling for the PyPy JIT, and expanded VMProf +support to OpenBSD and DragonFly BSD.

                              +As always, this release fixed many issues and bugs raised by the +growing community of PyPy users. We strongly recommend updating.

                              +You can download the PyPy2 v5.4 release here:
                              + +
                              +We would like to thank our donors for their continued support of the PyPy +project. We would also like to thank our contributors and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, testing and adapting popular modules to run on PyPy, or general help +with making RPython’s JIT even better.

                              +

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (PyPy and CPython 2.7 performance comparison) due to its integrated tracing JIT compiler.

                              +We also welcome developers of other dynamic languages to see what RPython can do for them.

                              +This release supports:
                                +
                              • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux
                              • +
                              • big- and little-endian variants of PPC64 running Linux
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +

                              +What is New?

                              +

                              +(since the release of PyPy 5.3 in June, 2016)

                              +There are many incremental improvements to RPython and PyPy, the complete listing is here. Mozilla generously sponsored work toward python 3.5 compatibility, and we are beginning to see some cross-over improvements of RPython and PyPy2.7 as a result.

                              +Please update, and continue to help us make PyPy better. +Cheers

                              +The PyPy Team
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + stuaxo wrote on 2016-08-31 22:52: +
                              +
                              +

                              Is this available on the PPA ?

                              (if it is, which one, possibly I have the wrong one) - at the moment I have

                              Get:2 https://ppa.launchpad.net/pypy/ppa/ubuntu xenial/main amd64 pypy amd64 5.3.1+dfsg-1~ppa1~ubuntu16.04 [7,754 kB]

                              ?

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2016-09-01 09:11: +
                              +
                              +

                              This is outside our control and should be checked with the 3rd-party provider for the particular platform (in this case, the PPA).

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/09/pypy-541-bugfix-released-3217566297258542810.html b/posts/2016/09/pypy-541-bugfix-released-3217566297258542810.html new file mode 100644 index 000000000..41d132a55 --- /dev/null +++ b/posts/2016/09/pypy-541-bugfix-released-3217566297258542810.html @@ -0,0 +1,326 @@ + + + + + +PyPy 5.4.1 bugfix released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy 5.4.1 bugfix released

                              + + + +
                              +
                              +We have released a bugfix for PyPy2.7-v5.4.0, released last week, due to the following issues:

                              +
                                +
                              • Update list of contributors in documentation and LICENSE file, this was unfortunately left out of 5.4.0. My apologies to the new contributors
                              • +
                              • Allow tests run with -A to find libm.so even if it is a script not a dynamically loadable file
                              • +
                              • Bump sys.setrecursionlimit() when translating PyPy, for translating with CPython
                              • +
                              • Tweak a float comparison with 0 in backendopt.inline to avoid rounding errors
                              • +
                              • Fix for an issue for translating the sandbox
                              • +
                              • Fix for an issue where unicode.decode('utf8', 'custom_replace') messed up the last byte of a unicode string sometimes
                              • +
                              • Update built-in cffi to version 1.8.1
                              • +
                              • Explicitly detect that we found as-yet-unsupported OpenSSL 1.1, and crash translation with a message asking for help porting it
                              • +
                              • Fix a regression where a PyBytesObject was forced (converted to a RPython object) when not required, reported as issue #2395
                              • +
                              +
                              +Thanks to those who reported the issues.

                              +What is PyPy?

                              +
                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It's fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                              +We also welcome developers of other dynamic languages to see what RPython can do for them.

                              +This release supports:
                              +
                                +
                              • +x86 machines on most common operating systems (Linux 32/64, Mac OS X 64, Windows 32, OpenBSD, FreeBSD),
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +Please update, and continue to help us make PyPy better.

                              +Cheers

                              +The PyPy Team
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/09/revdb-released-v541-6719768292347391304.html b/posts/2016/09/revdb-released-v541-6719768292347391304.html new file mode 100644 index 000000000..bbc59c2bd --- /dev/null +++ b/posts/2016/09/revdb-released-v541-6719768292347391304.html @@ -0,0 +1,319 @@ + + + + + +RevDB released, v5.4.1 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              RevDB released, v5.4.1

                              + + + +
                              +

                              Hi all,

                              + +

                              +The first beta version of RevDB is out! Remember that RevDB is a reverse debugger for Python. The idea is that it is a debugger that can run forward and backward in time, letting you more easily understand your subtle bug in your big Python program.

                              + +

                              +RevDB should work on almost any Python program. Even if you are normally only using CPython, trying to reproduce the bug with RevDB is similar to trying to run the program on a regular PyPy---usually it just works, even if not quite always. + +

                              +

                              +News from the alpha version in the previous blog post include notably support for: +

                              +
                                +
                              • Threads. +
                              • +
                              • CPyExt, the compatibility layer of PyPy that can run CPython C extension modules. +
                              • +
                              +as well as many other improvements. + +

                              +You need to build it yourself for now. It is tested on 64-bit Linux. 32-bit Linux, OS/X, and other POSIX platforms should all either work out of the box or be just a few fixes away (contributions welcome). Win32 support is a lot more involved but not impossible.

                              + +

                              +See https://bitbucket.org/pypy/revdb/ for more information!

                              + +

                              Armin

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/10/pypy3-550-released-8069558680221199646.html b/posts/2016/10/pypy3-550-released-8069558680221199646.html new file mode 100644 index 000000000..69950292c --- /dev/null +++ b/posts/2016/10/pypy3-550-released-8069558680221199646.html @@ -0,0 +1,397 @@ + + + + + +PyPy3 5.5.0 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy3 5.5.0 released

                              + + + +
                              +

                              We're pleased to announce the release of PyPy3 v5.5.0. Coming four months after PyPy3.3 v5.2, it improves compatibility with Python 3.3 (3.3.5). We strongly recommend updating from previous PyPy3 versions.

                              +We would like to thank all of the people who donated to the py3k proposal for supporting the work that went into this release.

                              +You can download the PyPy3.3 v5.5.0 release here: https://pypy.org/download.html

                              +
                                +
                              • Improved Python 3.3.5 support.
                              • +
                                  +
                                • os.get_terminal_size(), time.monotonic(), str.casefold() 
                                • +
                                • faulthandler module
                                • +
                                • There are still some missing features, such as a PEP 393-like space-efficient string representation, as well as some performance regressions (e.g. issue #2305). The focus for this release has been updating to 3.3 compatibility. Windows is also not yet supported.
                                • +
                                +
                              • +ensurepip is also included (it's only included in CPython 3 >= 3.4).
                              • +
                              • Buffer interface improvements (numpy on top of cpyext)
                              • +
                              • Several JIT improvements (force-virtual-state, residual calls)
                              • +
                              • Search path for libpypy-c.so has changed (helps with cffi embedding on linux distributions)
                              • +
                              • Improve the error message when the user forgot the "self" argument of a method
                              • +
                              • Many more small improvements, please head over to our documentation for more information
                              • +
                              +

                              +Towards Python 3.5

                              +
                              +
                              +We have started to work on Python 3.5, which is a version used by many software projects. It seems to get wide adoption. We are happy to be part of the Mozilla Open Source Support (MOSS) initiative.
                              +
                              +
                              +
                              +
                              +Nevertheless we want to give our users the chance to use PyPy in their Python 3 projects, thus we have prepared this release.
                              +
                              +

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7.10 and 3.3.5. It's fast due to its integrated tracing JIT compiler.

                              We also welcome developers of other dynamic languages to see what RPython can do for them.

                              +This release supports:
                                +
                              • x86 machines on most common operating systems except Windows 
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux 
                              • +
                              • big- and little-endian variants of PPC64 running Linux 
                              • +
                              • s390x running Linux
                              • +
                              +Please try it out and let us know what you think. We welcome feedback, we know
                              +you are using PyPy, please tell us about it!

                              +Cheers

                              +The PyPy Team +
                              +

                              Comments

                              +
                              +
                              +
                              + + Mak Sim wrote on 2016-10-13 07:51: +
                              +
                              +

                              Great! Wayting for windows build.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-10-14 05:08: +
                              +
                              +

                              Excellent news. Thank you!

                              +
                              +
                              +
                              +
                              + + Butla wrote on 2016-10-17 15:52: +
                              +
                              +

                              Wow! 3.5? That would be incredible. Shouldn't there be more hype around JITted asyncio applications?

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-10-21 07:49: +
                              +
                              +

                              I was really touched.
                              \(^o^)/

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-11-06 01:19: +
                              +
                              +

                              Butla: I do totally agree, pypy not only for numeric code anymore but also for parallel production servers.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-12-02 15:49: +
                              +
                              +

                              The performance difference between 5.5 and 5.2 is awesome! For my heavy string and lists-of-strings processing tool, 5.5 needs about 25% less time for the same task. Thank you so much!

                              +
                              +
                              +
                              +
                              + + Dagur wrote on 2017-05-24 14:08: +
                              +
                              +

                              What is the status on windows support?

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/11/pypy27-v56-released-stdlib-2712-support-5671090852400583673.html b/posts/2016/11/pypy27-v56-released-stdlib-2712-support-5671090852400583673.html new file mode 100644 index 000000000..4f8245751 --- /dev/null +++ b/posts/2016/11/pypy27-v56-released-stdlib-2712-support-5671090852400583673.html @@ -0,0 +1,402 @@ + + + + + +PyPy2.7 v5.6 released - stdlib 2.7.12 support, C-API improvements, and more | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy2.7 v5.6 released - stdlib 2.7.12 support, C-API improvements, and more

                              + + + +
                              +
                              +
                              +
                              +
                              +

                              +

                              +We have released PyPy2.7 v5.6 [0], about two months after PyPy2.7 v5.4. This new PyPy2.7 release includes the upstream stdlib version 2.7.12.

                              +We continue to make incremental improvements to our C-API compatibility layer (cpyext). We pass all but 12 of the over-6000 tests in the upstream NumPy test suite, and have begun examining what it would take to support Pandas and PyQt.

                              +Work proceeds at a good pace on the PyPy3.5 version due to a grant from the Mozilla Foundation, and some of those changes have been backported to PyPy2.7 where relevant.

                              +The PowerPC and s390x backends have been enhanced with the capability to use SIMD instructions for micronumpy loops.

                              +We changed timeit to now report average +/- standard deviation, which is better than the misleading minimum value reported in CPython.

                              +We now support building PyPy with OpenSSL 1.1 in our built-in _ssl module, as well as maintaining support for previous versions.

                              CFFI has been updated to 1.9, improving an already great package for interfacing with C.

                              +As always, this release fixed many issues and bugs raised by the growing community of PyPy users. We strongly recommend updating. You can download the PyPy2.7 v5.6 release here:
                              + +
                              +Downstream packagers have been hard at work. The Debian package is already available, and the portable PyPy versions are also ready, for those who wish to run PyPy on other Linux distributions like RHEL/Centos 5.

                              +We would like to thank our donors for the continued support of the PyPy project.

                              +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.
                              +

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                              +We also welcome developers of other dynamic languages to see what RPython can do for them.
                              +This release supports:
                              +
                              +
                                +
                              • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +
                              +
                              +

                              +What else is new?

                              +
                              +(since the release of PyPy 5.4 in August, 2016)
                              +
                              +There are many incremental improvements to RPython and PyPy, the complete listing is here. +
                              +
                              +Please update, and continue to help us make PyPy better.

                              +Cheers, The PyPy team

                              +[0] We skipped 5.5 since we share a code base with PyPy3, and PyPy3.3-v.5.5-alpha was released last month
                              +
                              +
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2016-11-13 01:32: +
                              +
                              +

                              I am really liking the regular updates! Nice to hear about cpyext and PyQt! Do desktop ui's apps gain alot of performance from being on pypy? Would kivy go faster seeing as it has a large chunk of widgets implemented in python?

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2016-11-13 07:09: +
                              +
                              +

                              All core features in Kivy are implemented in Cython. PyPy is slower with Cython.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-11-13 10:55: +
                              +
                              +

                              isn't the cpyext going to be the answer for pyQt and cython? Or are you saying pyQt should perform greater?

                              +
                              +
                              +
                              +
                              + + mathgl wrote on 2016-11-17 07:00: +
                              +
                              +

                              cpyext make them work instead of faster at the moment.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2018-04-16 07:23: +
                              +
                              +

                              The python interpreter size is 3.5MB where as pypy intepreter size is almost 40MB. As it has huge size difference it is impossible to replace in embedded projects

                              Is there any way to reduce it or any suggestions to implement in embedded area.Why is this difference.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2018-04-16 16:05: +
                              +
                              +

                              Please ask on pypy's irc channel: #pypy at freenode.net, or the pypy-dev mailing list. This blog post is old, it is pointless to ask questions here about it---you're unlikely to get an answer.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2016/11/vectorization-extended-powerpc-and-s390x-4042433015460084057.html b/posts/2016/11/vectorization-extended-powerpc-and-s390x-4042433015460084057.html new file mode 100644 index 000000000..9f8224e95 --- /dev/null +++ b/posts/2016/11/vectorization-extended-powerpc-and-s390x-4042433015460084057.html @@ -0,0 +1,358 @@ + + + + + +Vectorization extended. PowerPC and s390x | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Vectorization extended. PowerPC and s390x

                              + + + +
                              +
                              +We are happy to announce that JIT support in both the PowerPC backend and the
                              +s390x backend have been enhanced. Both can now vectorize loops via SIMD
                              +instructions. Special thanks to IBM for funding this work.

                              +If you are not familiar with this topic you can read more details here.
                              +
                              +There are many more enhancements under the hood. Most notably, all pure operations are now delayed until the latest possible point. In some cases indices have been calculated more than once or they needed an additional register, because the old value is still used. Additionally it is now possible to load quadword-aligned memory in both PPC and s390x (x86 currently cannot do that).

                              +NumPy & CPyExt +

                              +The community and core developers have been moving CPyExt towards a complete, but emulated, layer for CPython C extensions. This is great, because the one restriction preventing the wider deployment of PyPy in several scenarios will hopefully be removed. However, we advocate not to use CPyExt, but rather to not write C code at all (let PyPy speed up your Python code) or use cffi.

                              +The work done here to support vectorization helps micronumpy (NumPyPy) to speed up operations for PPC and s390x. So why is PyPy supporting both NumPyPy and NumPy, do we actually need both? Yes, there are places where gcc can beat the JIT, and places where the tight integration between NumPyPy and PyPy is more performant. We do have plans to integrate both, hijacking the C-extension method calls to use NumPyPy where we know NumPyPy can be faster.

                              +Just to give you an idea why this is a benefit:

                              +NumPy arrays can carry custom dtypes and apply user defined python functions on the arrays. How could one optimize this kind of scenario? In a traditional setup, you cannot. But as soon as NumPyPy is turned on, you can suddenly JIT compile this code and vectorize it.

                              +Another example is element access that occurs frequently, or any other calls that cross between Python and the C level frequently.

                              +Benchmarks +

                              +Let's have a look at some benchmarks reusing mikefc's numpy benchmark suite (find the forked version here). I only ran a subset of microbenchmarks, showing that the core functionality is
                              functioning properly. Additionally it has been rewritten to use perf instead of the timeit stdlib module.

                              +Setup +

                              +x86 runs on an Intel i7-2600 clocked at 3.40GHz using 4 cores. PowerPC runs on the Power 8 clocked at 3.425GHz providing 160 cores. Last but not least the mainframe machine is clocked at up to 4 GHz, but fully virtualized (as is common for such machines). Note that PowerPC is a non-private remote machine. It is used by many users and it is crowded with processes. It is hard to extract a stable benchmark there.

                              +x86 ran on Fedora 24 (kernel version of 4.8.4), PPC ran on Fedora 21 (kernel version 3.17.4) and s390x ran on Redhat Linux 7.2 (kernel version 3.10.0). Respectively, numpy on cpython had openblas available on x86, no blas implementation was present on s390x and PPC provided blas and lapack.

                              +As you can see all machines run very different configurations. It does not make sense to compare across platforms, but rather implementations on the same platform.







                              +Blue shows CPython 2.7.10+ available on that platform using the latest NumPy (1.11). Micro NumPy is used for PyPy. PyPy+ indicates that the vectorization optimization is turned on.
                              +All bar charts show the median value of all runs (5 samples, 100 loops, 10 inner loops, for the operations on vectors (not matrices) the loops are set to 1000). PyPy additionally gets 3 extra executions to warmup the JIT.

                              +The comparison is really comparing speed of machine code. It compares the PyPy's JIT output vs GCC's output. It has little to do with the speed of the interpreter.

                              +Both new SIMD backends speed up the numeric kernels. Sometimes it is near to the speed of CPython, sometimes it is faster. The maximum parallelism very much depends on the extension emitted by the compiler. All three SIMD backends have the same vector register size (which is 128 bit). This means that all three behave similarly but ppc and s390x gain more because they can load 128bit of memory from quadword aligned memory.

                              +Future directions

                              +Python is achieving rapid adoption in data science. This is currently a trend emerging in Europe, and Python is already heavily used for data science in the USA and many other places around the world.


                              +PyPy can make a valuable contribution for data scientists, helping them to rapidly write scientific programs in Python and run them at near native speed. If you happen to be in that situation, we are eager to hear your feedback or resolve your issues and also work together to improve the performance of your
                              +code. Just get in touch!


                              +Richard Plangger (plan_rich) and the PyPy team
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2016-11-03 20:06: +
                              +
                              +

                              As you are talking about GCC beating your JIT, you are using your own vectorizing compiler right?
                              I wonder if this is a feasible approach. Can you really compete with the years if not decades of work that went into the vectorizers of GCC and LLVM?
                              Wouldn't it make more sense to plug into GCC's and LLVM's JIT API's (yes GCC has a JIT) for this type of code?
                              What does PyPy bring to the table that the existing JIT's do not for numerical code?

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2016-11-07 06:44: +
                              +
                              +

                              It's good to see pypy making progress on using python as a toolkit for data science. In addition to numpy, pandas/scipy also needs to work well for me to switch.

                              Also, a lot of data science is currently being run on windows and the x64 port of pypy hasn't had much traction in the last several years. If these 2 issues are solved (pandas/scipy being supported on a x64 windows pypy) then there should be no reason to keep using CPython.

                              +
                              +
                              +
                              +
                              + + mathgl wrote on 2016-11-08 05:22: +
                              +
                              +

                              I think most of pypy dev/users use Linux/MacOsx only, so there is no strong motivation to support win64 at the moment.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2016-11-09 16:16: +
                              +
                              +

                              Not necessarily the users---there are some on Windows. But the point is that we have not a single developer on Windows. Until someone comes forward with a serious offer for either code or money, Win64 will not get magically done.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/01/leysin-winter-sprint-2526th-feb-4th-3831779797804484935.html b/posts/2017/01/leysin-winter-sprint-2526th-feb-4th-3831779797804484935.html new file mode 100644 index 000000000..a1436d965 --- /dev/null +++ b/posts/2017/01/leysin-winter-sprint-2526th-feb-4th-3831779797804484935.html @@ -0,0 +1,352 @@ + + + + + +Leysin Winter Sprint: 25/26th Feb. - 4th March 2017 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Leysin Winter Sprint: 25/26th Feb. - 4th March 2017

                              + + + +
                              +

                              The next PyPy sprint will be in Leysin, Switzerland, for the twelfth time. +This is a fully public sprint: newcomers and topics other than those +proposed below are welcome.

                              +
                              +

                              Goals and topics of the sprint

                              +

                              The list of topics is very open.

                              +
                                +
                              • The main topic is Python 3.5 support in PyPy, as most py3.5 +contributors should be present. It is also a good topic if you have +no or limited experience with PyPy contribution: we can easily find +something semi-independent that is not done in py3.5 so far, and +do pair-programming with you.
                              • +
                              • Any other topic is fine too: JIT compiler optimizations, CFFI, +the RevDB reverse debugger, improving the speed of your program on +PyPy, etc.
                              • +
                              • And as usual, the main side goal is to have fun in winter sports :-) +We can take a day off (for ski or anything else).
                              • +
                              +
                              +
                              +

                              Exact times

                              +

                              Work days: starting 26th Feb (~noon), ending March 4th (~noon).

                              +

                              I have pre-booked the week from Saturday Feb 25th to Saturday March 4th. +If it is possible for you to arrive Sunday before mid-afternoon, then +you should get a booking from Sunday only. The break day should be +around Wednesday.

                              +

                              It is fine to stay a few more days on either side, or conversely to book +for a part of that time only.

                              +
                              +
                              +

                              Location & Accommodation

                              + +

                              Leysin, Switzerland, "same place as before".

                              + +
                              + +

                              Let me refresh your +memory: both the sprint venue and the lodging will be in a +pair of chalets built specifically for bed & breakfast: +https://www.ermina.ch/. The place has a good ADSL Internet connection +with wireless installed. You can also arrange your own lodging +elsewhere (as long as you are in Leysin, you cannot be more than a 15 +minutes walk away from the sprint venue).

                              +

                              Please confirm that you are coming so that we can adjust the +reservations as appropriate.

                              +

                              The options of rooms are a bit more limited than in previous years +because the place for bed-and-breakfast is shrinking; but we should +still have enough room for us. The price is around 60 CHF, breakfast +included, in shared rooms (3 or 4 people). If there are people that +would prefer a double or single room, please contact me and we'll see +what choices you have. There is also a choice of hotels in Leysin.

                              +

                              Please register by Mercurial:

                              +
                              +https://bitbucket.org/pypy/extradoc/ +https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/leysin-winter-2017/ +
                              +

                              or on the pypy-dev mailing list if you do not yet have check-in rights:

                              +
                              +https://mail.python.org/mailman/listinfo/pypy-dev +
                              +

                              You need a Swiss-to-(insert country here) power adapter. There will be +some Swiss-to-EU adapters around, and at least one EU-format power strip.

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/03/async-http-benchmarks-on-pypy3-1092124994927894138.html b/posts/2017/03/async-http-benchmarks-on-pypy3-1092124994927894138.html new file mode 100644 index 000000000..b14a2b4a0 --- /dev/null +++ b/posts/2017/03/async-http-benchmarks-on-pypy3-1092124994927894138.html @@ -0,0 +1,485 @@ + + + + + +Async HTTP benchmarks on PyPy3 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Async HTTP benchmarks on PyPy3

                              + + + +
                              +
                              +Hello everyone, +
                              +
                              +
                              +
                              +
                              +Since Mozilla announced funding, we've been working quite hard on delivering you a working Python 3.5. +
                              +
                              +  +
                              +
                              +We are almost ready to release an alpha version of PyPy 3.5. Our goal is to release it shortly after the sprint. Many modules have already been ported and  it can probably run many Python 3 programs already. We are happy to receive any feedback after the next release.  +
                              +
                              +
                              +
                              +
                              +To show that the heart (asyncio) of Python 3 is already working we have prepared some benchmarks. They are done by Paweł Piotr Przeradowski @squeaky_pl for an HTTP workload on several asynchronous IO libraries, namely the relatively new asyncio and curio libraries and the battle-tested tornado, gevent and Twisted libraries. To see the benchmarks check out https://github.com/squeaky-pl/zenchmarks and the instructions for reproducing can be found inside README.md in the repository. Raw results can be obtained from https://github.com/squeaky-pl/zenchmarks/blob/master/results.csv. +
                              +
                              +
                              +
                              +
                              +The + purpose of the presented benchmarks is showing that the upcoming PyPy release +is already working with unmodified code that runs on CPython 3.5. PyPy +also manages to make them run significantly faster. +
                              +
                              +
                              +
                              +
                              +The + benchmarks consist of HTTP servers implemented on the top of the mentioned +libraries. All the servers are single-threaded relying on underlying +event loops to provide concurrency. Access logging was disabled to +exclude terminal I/O from the results. The view code consists of a +lookup in a dictionary mapping ASCII letters to verses from the famous +Zen of Python. If a verse is found the view returns it, otherwise a 404 +Not Found response is served. The 400 Bad Request and 500 Internal +Server Error cases are also handled. +
                              +
                              +
                              +
                              +
                              +The workload was generated with the wrk HTTP benchmarking tool. It is run with one thread opening up to 100 +concurrent connections for 2 seconds and repeated 1010 times to get +consecutive measures. There is a Lua script provided + that instructs wrk to continuously send 24 different requests that hit +different execution paths (200, 404, 400) in the view code. Also it is +worth noting that wrk will only count 200 responses as successful so the actual request per second throughput is higher. +
                              +
                              +
                              +
                              +
                              +For your convenience all the used libraries versions are vendored into the benchmark repository. There is also a precompiled portable version of wrk provided + that should run on any reasonably recent (10 year old or newer) Linux +x86_64 distribution. The benchmark was performed on a public cloud scaleway x86_64 server launched in a Paris data center. The server was running +Ubuntu 16.04.01 LTS and reported Intel(R) Xeon(R) CPU D-1531 @ 2.20GHz +CPU. CPython 3.5.2 (shipped by default in Ubuntu) was benchmarked +against a pypy-c-jit-90326-88ef793308eb-linux64 snapshot of the 3.5 compatibility branch of PyPy. +
                              +
                              +
                              +
                              +
                              + +
                              +
                              +  +
                              +
                              +  +
                              +
                              +  +
                              +
                              +  +
                              +
                              +We want to thank Mozilla for supporting our work! +
                              +
                              +
                              +
                              +
                              +Cheers, +
                              +
                              +fijal, squeaky_pl and the PyPy Team +
                              +
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Benjamin wrote on 2017-03-02 00:37: +
                              +
                              +

                              This is fantastic! How close to ready is the async/await syntax? Any chance it could be snuck in the 3.5 release?

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2017-03-02 07:55: +
                              +
                              +

                              As far as I know, curio (and maybe asyncio) wouldn't run if we didn't properly support async/await already.

                              +
                              +
                              +
                              +
                              + + Konstantin Lopuhin wrote on 2017-03-02 09:44: +
                              +
                              +

                              Great news, you are doing awesome work! Any chance cpyext will be included in the alpha?

                              +
                              +
                              +
                              +
                              + + Ronan Lamy wrote on 2017-03-02 21:49: +
                              +
                              +

                              cpyext will be included. We expect C-API support to be approximately on par with pypy2, e.g. the pypy3 nightlies have nearly complete support for numpy.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-03-03 18:35: +
                              +
                              +

                              Awesome work!

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-03-03 22:30: +
                              +
                              +

                              @Benjamin, async def / async for / async with / await were all introduced in Python 3.5.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-03-03 22:30: +
                              +
                              +

                              This is wonderful work, congrats!

                              +
                              +
                              +
                              +
                              + + stuaxo wrote on 2017-03-04 16:22: +
                              +
                              +

                              This is great. It would be good to include some alternate asyncio back-ends as well if they work with pypy.

                              For instance, my current project uses libuv and gbulb in different components.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2017-03-07 17:03: +
                              +
                              +

                              Will this work with uvloop? I'm curious as I would like to get Sanic running on this! :-)

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2017-03-08 22:49: +
                              +
                              +

                              From what I've read on #pypy (sorry if I'm messing something up): uvloop is a drop-in replacement of asyncio, but asyncio is faster on PyPy. PyPy's JIT for pure Python code beats the overheads of the CPython API compatibility layer (in this case, via Cython). Moreover, considering the whole application, using asyncio on PyPy easily beats using uvloop on CPython. So, as long as it remains a fully compatible replacement, you can "drop it out" and use asyncio instead on PyPy.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/03/leysin-winter-sprint-summary-4587213628578490701.html b/posts/2017/03/leysin-winter-sprint-summary-4587213628578490701.html new file mode 100644 index 000000000..e762d97b0 --- /dev/null +++ b/posts/2017/03/leysin-winter-sprint-summary-4587213628578490701.html @@ -0,0 +1,454 @@ + + + + + +Leysin Winter Sprint Summary | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Leysin Winter Sprint Summary

                              + + + +
                              + Today + is the last day of our yearly sprint event in Leysin. We had lots of +ideas on how to enhance the current state of PyPy, we went skiing and +had interesting discussions around virtual machines, the Python +ecosystem, and other real world problems.
                              +  +
                              +
                              +

                              +Why don't you join us next time? +

                              +
                              +
                              +A usual PyPy sprint day goes through the following stages: +
                              +
                              +
                              +
                              +
                              +
                                +
                              1.  Planning Session: Tasks from previous days that have seen progress or +are completed are noted in a shared document. Everyone adds new tasks +and then assigns themselves to one or more tasks (usually in pairs). As +soon as everybody is happy with their task and has a partner to work +with, the planning session is concluded and the work can start.
                              2. +
                              3. +Discussions: A sprint is a good occasion to discuss difficult +and important topics in person. We usually sit down in a separate area +in the sprint room and discuss until a) nobody wants to discuss anymore +or b) we found a solution to the problem. The good thing is that usually +the outcome is b). +
                              4. +
                              5. +Lunch: For lunch we prepare sandwiches and other finger food. +
                              6. +
                              7. +Continue working until dinner, which we eat at a random restaurant in Leysin. +
                              8. +
                              9. +Goto 1 the next day, if sprint has not ended.
                              10. +
                              +
                              +
                              +Sprints + are open to everybody and help newcomers to get started with PyPy (we usually + pair you with a developer familiar with PyPy). They are perfect to +discuss and find solutions to problems we currently face. If you are +eager to join next year, please don't hesitate to register next year +around January. +
                              +
                              +  +
                              +
                              +

                              +Sprint Summary    +

                              +Sprint goals included to work on the following topics: +
                              +
                              +
                                +
                              • Work towards releasing PyPy 3.5 (it will be released soon)
                              • +
                              • +CPython Extension (CPyExt) modules on PyPy +
                              • +
                              • Have fun in winter sports (a side goal)
                              • +
                              +
                              +
                              +

                              +Highlights +

                              +

                              +

                              +

                              + +

                              +
                              +
                              +
                                +
                              • +We have spent lots of time debugging and fixing memory issues on CPyExt. + In particular, we fixed a serious memory leak where taking a memoryview + would prevent numpy arrays from ever being freed. More work is still required to ensure that our GC always releases arrays in a timely +manner. +
                              • +
                              • +Fruitful discussions and progress about how to flesh out some details about the unicode representation in PyPy. Our current goal is to use utf-8 as the unicode representation internally and have fast vectorized operations (indexing, check if valid, ...). +
                              • +
                              • +PyPy will participate in GSoC 2017 and we will try to allocate more resources to that than last year. +
                              • +
                              • +Profile and think about some details how to reduce the starting size of the interpreter. The starting point would be to look at the parser and reduce the amount of strings to keep alive. +
                              • +
                              • Found a topic for a student's master thesis: correctly freeing cpyext reference cycles.
                              • +
                              • Run lots of Python3 code on top of PyPy3 and resolve issues we found along the way.
                              • +
                              • +Initial work on making RPython thread-safe without a GIL. +
                              • +
                              +
                              +
                              +

                              +List of attendees +

                              +
                              +
                              +- Stefan Beyer +
                              +
                              +- Antonio Cuni +
                              +
                              +- Maciej Fijalkowski +
                              +
                              +- Manuel Jacob +
                              +
                              +- Ronan Lamy +
                              +
                              +- Remi Meier +
                              +
                              +- Richard Plangger +
                              +
                              +- Armin Rigo +
                              +
                              +- Robert Zaremba +
                              +
                              +  +
                              +
                              +  +
                              +
                              +
                              +
                              +
                              +
                              + +
                              +
                              +
                              +
                              +
                              +We + would like to thank our donors for the continued support of the PyPy +project and we are looking forward to next year's sprint in Leysin. +
                              +
                              +
                              +
                              +
                              +
                              +The PyPy Team +
                              +
                              +
                              +
                              +



                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/03/pypy27-and-pypy35-v57-two-in-one-release-4736633226245374150.html b/posts/2017/03/pypy27-and-pypy35-v57-two-in-one-release-4736633226245374150.html new file mode 100644 index 000000000..2afb14036 --- /dev/null +++ b/posts/2017/03/pypy27-and-pypy35-v57-two-in-one-release-4736633226245374150.html @@ -0,0 +1,436 @@ + + + + + +PyPy2.7 and PyPy3.5 v5.7 - two in one release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy2.7 and PyPy3.5 v5.7 - two in one release

                              + + + +
                              +
                              +
                              +The PyPy team is proud to release both PyPy2.7 v5.7 (an interpreter supporting +Python v2.7 syntax), and a beta-quality PyPy3.5 v5.7 (an interpreter for Python +v3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release. Note that PyPy3.5 only supports Linux 64bit for now.

                              +This new PyPy2.7 release includes the upstream stdlib version 2.7.13, and PyPy3.5 (our first in the 3.5 series) includes the upstream stdlib version 3.5.3.

                              +We continue to make incremental improvements to our C-API compatibility layer (cpyext). PyPy2 can now import and run many C-extension packages, among the most notable are Numpy, Cython, and Pandas. Performance may be slower than CPython, especially for frequently-called short C functions. Please let us know if your use case is slow, we have ideas how to make things faster but need real-world examples (not micro-benchmarks) of problematic code.

                              +Work proceeds at a good pace on the PyPy3.5 version due to a grant from the Mozilla Foundation, hence our first 3.5.3 beta release. Thanks Mozilla !!! While we do not pass all tests yet, asyncio works and as these benchmarks show it already gives a nice speed bump. We also backported the f"" formatting from 3.6 (as an exception; otherwise “PyPy3.5” supports the Python 3.5 language).

                              CFFI has been updated to 1.10, improving an already great package for interfacing with C.

                              +We now use shadowstack as our default gcrootfinder even on Linux. The alternative, asmgcc, will be deprecated at some future point. While about 3% slower, shadowstack is much more easily maintained and debuggable. Also, the performance of shadowstack has been improved in general: this should close the speed gap between other platforms and Linux.

                              +As always, this release fixed many issues and bugs raised by the growing community of PyPy users. We strongly recommend updating.

                              +You can download the v5.7 release here:
                              + +
                              +We would like to thank our donors for the continued support of the PyPy project.
                              +We would also like to thank our contributors and encourage new people to join the project. PyPy has many layers and we need help with all of them: PyPy and RPython documentation improvements, tweaking popular modules to run on pypy, or general help with making RPython’s JIT even better.
                              +

                              +

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                              +We also welcome developers of other dynamic languages to see what RPython can do for them.
                              +The PyPy 2.7 release supports:
                              +
                              +
                                +
                              • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +

                              +

                              +What else is new?

                              +
                              +(since the releases of PyPy 2.7 and 3.3 at the end of 2016)
                              +
                              +There are many incremental improvements to RPython and PyPy, the complete listing is here. +
                              +
                              +Please update, and continue to help us make PyPy better.

                              +Cheers, The PyPy team

                              +
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Sergei wrote on 2017-03-21 10:33: +
                              +
                              +

                              Awesome! Thanks, guys.

                              +
                              +
                              +
                              +
                              + + Baczek wrote on 2017-03-21 11:19: +
                              +
                              +

                              > We also backported the f"" formatting from 3.6 (as an exception; otherwise “PyPy3.5” supports the Python 3.5 language).

                              Could you also support just the syntax part of variable type declarations? It'll make using mypy that much nicer.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-03-21 12:16: +
                              +
                              +

                              Awesome! Thanks a lot!

                              +
                              +
                              +
                              +
                              + + Mike wrote on 2017-03-23 10:06: +
                              +
                              +

                              Hello.
                              Thanks for pypy!
                              I have a question: Is there any big company who using pypy in production?
                              Thanks

                              +
                              +
                              +
                              +
                              + + Canesin wrote on 2017-03-23 14:40: +
                              +
                              +

                              Great work as usual! Is there any plan to benefit from programs using PEP 484 syntax ?

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2017-03-23 16:16: +
                              +
                              +

                              @Canesin: benefit for performance? No. The PEP itself says "Using type hints for performance optimizations is left as an exercise for the reader". But that's a misleading comment. There is no useful optimization that we can apply from the knowledge "argument 1 is an int", because that could also be an arbitrarily-large integer and/or an instance of a subclass of int. And if it really turns out to be almost always a regular machine-sized integer, then PyPy's JIT will figure it out by itself. PEP 484 is totally pointless for performance. (It is probably useful for other reasons outside the scope of this comment.)

                              +
                              +
                              +
                              +
                              + + Miro Hrončok wrote on 2017-03-29 17:13: +
                              +
                              +

                              Excellent news! Is PyPy3 support for 32bit Linux planned? Thanks for info.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2017-04-01 08:13: +
                              +
                              +

                              Miro: yes, we plan to have support for the same set of platforms. The various Posix platforms are not too much work, and Windows will follow, too.

                              +
                              +
                              +
                              +
                              + + Gaëtan de Menten wrote on 2017-04-02 09:44: +
                              +
                              +

                              Is there anybody working on win64? It is a bit frustrating to see pypy maturing quickly to the point that I could probably use it soon in production... if only it worked on win64.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2017-04-02 09:48: +
                              +
                              +

                              Gaëtan: no. We need either outside contributions or, more likely, money to make it happen. Just like what occurred with Mozilla for Python 3.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/04/native-profiling-in-vmprof-6949065546884243105.html b/posts/2017/04/native-profiling-in-vmprof-6949065546884243105.html new file mode 100644 index 000000000..6ad6c4791 --- /dev/null +++ b/posts/2017/04/native-profiling-in-vmprof-6949065546884243105.html @@ -0,0 +1,387 @@ + + + + + +Native profiling in VMProf | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Native profiling in VMProf

                              + + + +
                              +

                              We are happy to announce a new release for the PyPI package vmprof.
                              +It is now able to capture native stack frames on Linux and Mac OS X to show you bottlenecks in compiled code (such as CFFI modules, Cython or C Python extensions). It supports PyPy and CPython versions 2.7, 3.4, 3.5 and 3.6. Special thanks to Jetbrains for funding the native profiling support.

                              +
                              +vmprof logo +
                              +
                              +
                              +
                              What is vmprof?

                              If you have already worked with vmprof you can skip the next two sections. If not, here is a short introduction:

                              The goal of the vmprof package is to give you more insight into your program. It is a statistical profiler. Another prominent profiler you might already have worked with is cProfile. It is bundled with the Python standard library.

                              vmprof's distinct feature (from most other profilers) is that it does not significantly slow down your program execution. The employed strategy is statistical, rather than deterministic. Not every function call is intercepted, but it samples stack traces and memory usage at a configured sample rate (usually around 100hz). You can imagine that this creates a lot less contention than doing work before and after each function call.

                              As mentioned earlier cProfile gives you a complete profile, but it needs to intercept every function call (it is a deterministic profiler). Usually this means that you have to capture and record every function call, but this takes a significant amount of time.

                              The overhead vmprof consumes is roughly 3-4% of your total program runtime or even less if you reduce the sampling frequency. Indeed it lets you sample and inspect much larger programs. If you failed to profile a large application with cProfile, please give vmprof a shot.

                              vmprof.com or PyCharm

                              +
                              +There are two major alternatives to the command-line tools shipped with vmprof:
                              +
                                +
                              • A web service on vmprof.com +
                              • +
                              • PyCharm Professional Edition
                              • +
                              +
                              +While the command line tool is only good for quick inspections, vmprof.com + and PyCharm complement each other providing deeper insight into your +program. With PyCharm you can view the per-line profiling results inside + the editor. With vmprof.com you get a handy visualization of the profiling results as a flame chart and memory usage graph.
                              +
                              +
                              +
                              +
                              +
                              +
                              +Since the PyPy Team runs and maintains the service on vmprof.com (which is by the way free and open-source), I’ll explain some more details here. On vmprof.com you can inspect the generated profile interactively instead of looking at console output. What is sent to vmprof.com? You can find details here.
                              +
                              +
                              Flamegraph: Accumulates and displays the most frequent codepaths. It allows you to quickly and accurately identify hot spots in your code. The flame graph below is a very short run of richards.py (Thus it shows a lot of time spent in PyPy's JIT compiler).

                              + +
                              +

                              List all functions (optionally sorted): the equivalent of the vmprof command line output in the web.

                              + +
                              +
                              Memory curve: A line plot that shows how many MBytes have been consumed over the lifetime of your program (see more info in the section below).

                              + +
                              +Native programs

                              The new feature introduced in vmprof 0.4.x allows you to look beyond the Python level. As you might know, Python maintains a stack of frames to save the execution. Up to now the vmprof profiles only contained that level of information. But what if your program jumps to native code (such as calling gzip compression on a large file)? Up to now you would not see that information.

                              +Many packages make use of the CPython C API (which we discourage; please look up cffi for a better way to call C). Have you ever had the issue that you know that your performance problems reach down to native code, but you could not profile it properly? Now you can!

                              Let's inspect a very simple Python program to find out why a program is significantly slower on Linux than on Mac:

                              import numpy as np
                              +n = 1000
                              +a = np.random.random((n, n))
                              +b = np.random.random((n, n))
                              +c = np.dot(np.abs(a), b)



                              +Take two NxN random matrix objects and create a dot product. The first argument to the dot product provides the absolute value of the random matrix.

                              + + + + + + + + + + + + + + + + + + + + + + +
                              RunPythonNumPyOSn=... Took
                              [1]CPython 3.5.2NumPy 1.12.1Mac OS X, 10.12.3n=5000~9 sec
                              [2]CPython 3.6.0NumPy 1.12.1Linux 64, Kernel 4.9.14n=1000~26 sec
                              +
                              +Note that the Linux machine operates on a 5 times smaller matrix, still it takes much longer. What is wrong? Is Linux slow? CPython 3.6.0? Well no, let's inspect [1] and [2] (shown below in that order).
                              + +
                              +
                              + +
                              +
                              [2] runs on Linux, spends nearly all of the time in PyArray_MatrixProduct2, if you compare to [1] on Mac OS X, you'll see that a lot of time is spent in generating the random numbers and the rest in cblas_matrixproduct.

                              +Blas has a very efficient implementation so you can achieve the same on Linux if you install a blas implementation (such as openblas).

                              +Usually you can spot potential program source locations that take a lot of time and might be the first starting point to resolve performance issues.

                              Beyond Python programs

                              +It is not unthinkable that the strategy can be reused for native programs. Indeed this can already be done by creating a small cffi wrapper around an entry point of a compiled C program. It would even work for programs compiled from other languages (e.g. C++ or Fortran). The resulting function names are the full symbol name embedded into either the executable symboltable or extracted from the dwarf debugging information. Most of those will be compiler specific and contain some cryptic information.

                              Memory profiling
                              +We thankfully received a code contribution from the company Blue Yonder. They have built a memory profiler (for Linux and Mac OS X) on top of vmprof.com that displays the memory consumption for the runtime of your process.

                              +You can run it the following way:

                              $ python -m vmprof --mem --web script.py

                              +By adding --mem, vmprof will capture memory information and display it in the dedicated view on vmprof.com. You can reach that view by clicking the 'Memory' switch in the flamegraph view.

                              There is more

                              +Some more minor highlights contained in 0.4.x:
                                +
                              • VMProf support for Windows 64 bit (No native profiling)
                              • +
                              • VMProf can read profiles generated by another host system
                              • +
                              • VMProf is now bundled in several binary wheels for fast and easy installation (Mac OS X, Linux 32/64 for CPython 2.7, 3.4, 3.5, 3.6)
                              • +
                              +Future plans - Profile Streaming

                              +vmprof has not reached the end of development. There are many features we could implement. But there is one feature that could be a great asset to many Python developers.

                              +Continuous delivery of your statistical profile, or in short, profile streaming. One of the great strengths of vmprof is that it consumes very little overhead. It is not a crazy idea to run this in production.

                              +It would require a smart way to stream the profile in the background to vmprof.com and new visualizations to look at much more data your Python service produces.

                              +If that sounds like a solid vmprof improvement, don't hesitate to get in touch with us (e.g. IRC #pypy, mailing list pypy-dev, or comment below)

                              You can help!

                              +There are some immediate things other people could help with. Either by donating time or money (yes we have occasional contributors which is great)!
                                +
                              • We gladly received a code contribution for the memory profiler. But there was not enough time to finish the migration completely. Sadly it is a bit brittle right now.
                              • +
                              • We would like to spend more time on other visualizations. This should include giving a much better user experience on vmprof.com (like a tutorial that explains the visualization that we already have). 
                              • +
                              • Build Windows 32/64 bit wheels (for all CPython versions we currently support)
                              • +
                              +We are also happy to accept google summer of code projects on vmprof for new visualizations and other improvements. If you qualify and are interested, don't hesitate to ask!

                              +Richard Plangger (plan_rich) and the PyPy Team

                              +[1] Mac OS X https://vmprof.com/#/567aa150-5927-4867-b22d-dbb67ac824ac
                              +[2] Linux64 https://vmprof.com/#/097fded2-b350-4d68-ae93-7956cd10150c +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/04/pypy-571-bugfix-released-8519267986159880133.html b/posts/2017/04/pypy-571-bugfix-released-8519267986159880133.html new file mode 100644 index 000000000..785b2304d --- /dev/null +++ b/posts/2017/04/pypy-571-bugfix-released-8519267986159880133.html @@ -0,0 +1,372 @@ + + + + + +PyPy 5.7.1 bugfix released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy 5.7.1 bugfix released

                              + + + +
                              +
                              +We have released a bugfix PyPy2.7-v5.7.1 and PyPy3.5-v5.7.1 beta (Linux 64bit), +due to the following issues:
                              +
                              +
                                +
                              • correctly handle an edge case in dict.pop (issue 2508)
                              • +
                              • fix a regression to correctly handle multiple inheritance in a C-API type +where the second base is an app-level class with a __new__ function
                              • +
                              • fix a regression to fill a C-API type’s tp_getattr slot from a +__getattr__ method (issue 2523)
                              • +
                              +
                              +
                              +Thanks to those who reported issues and helped test out the fixes

                              +You can download the v5.7.1 release here:
                              + +
                              +

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                              +We also welcome developers of other dynamic languages to see what RPython can do for them.
                              +The PyPy 2.7 release supports:
                              +
                              +
                                +
                              • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +Please update, and continue to help us make PyPy better.

                              +Cheers, The PyPy team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2017-04-04 14:20: +
                              +
                              +

                              any chance for a Mac OS X PyPy3 distribution?
                              compilation from sources fails …

                              thanks for the great work by the way !

                              +
                              +
                              +
                              +
                              + + aiguy wrote on 2017-05-15 11:24: +
                              +
                              +

                              Tried looking for a pypy wishlist but couldn't find one. So hopefully somebody reads comments.

                              My three biggest pypy wishes are for...

                              1. faster csv file reading by replacing Python library code with compiled C code which I understand from 4 years ago is still slower than cPython so is still on the todo list.

                              2. Update SQLite to latest version in pypy distribution since they have made some great speed enhancements in recent releases.

                              3. Create an containerized downloadable Docker distribution for PyPy which allows for easy deployment of PyPy projects to other machines. platforms and thumbs drives. This would also allow easier setup of multiple PyPy microservices and encapsulation of multiple pypy environments on the same machine.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2017-05-15 16:40: +
                              +
                              +

                              @aiguy: csv is written in C already nowadays. Please report an issue with reproducible examples if you find that PyPy is still a lot slower than CPython at reading large-ish csv files.

                              For SQLite, I guess you're talking about Windows. We have plans to update it at some point.

                              For Docker, that's outside the scope of the PyPy team and should be done (or is done already?) by other people.

                              +
                              +
                              +
                              +
                              + + Carl Friedrich Bolz-Tereick wrote on 2017-05-15 18:24: +
                              +
                              +

                              There are maintained Docker files here, IIRC: https://hub.docker.com/_/pypy/

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/06/pypy-v58-released-739876359584854017.html b/posts/2017/06/pypy-v58-released-739876359584854017.html new file mode 100644 index 000000000..19e5e4cd6 --- /dev/null +++ b/posts/2017/06/pypy-v58-released-739876359584854017.html @@ -0,0 +1,407 @@ + + + + + +PyPy v5.8 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v5.8 released

                              + + + +
                              +
                              +The PyPy team is proud to release both PyPy2.7 v5.8 (an interpreter supporting +Python 2.7 syntax), and a beta-quality PyPy3.5 v5.8 (an interpreter for Python +3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release. Note that PyPy3.5 supports Linux 64bit only for now.

                              +This new PyPy2.7 release includes the upstream stdlib version 2.7.13, and +PyPy3.5 includes the upstream stdlib version 3.5.3.

                              +We fixed critical bugs in the shadowstack rootfinder garbage collector +strategy that crashed multithreaded programs and very rarely showed up +even in single threaded programs.

                              +We added native PyPy support to profile frames in the vmprof statistical +profiler.

                              +The struct module functions pack* and unpack* are now much faster, +especially on raw buffers and bytearrays. Microbenchmarks show a 2x to 10x +speedup. Thanks to Gambit Research for sponsoring this work.

                              +This release adds (but disables by default) link-time optimization and +profile guided optimization of the base interpreter, which may make +unjitted code run faster. To use these, translate with appropriate +options. Be aware of issues with gcc toolchains, though.

                              +Please let us know if your use case is slow; we have ideas on how to make things +faster but need real-world examples (not micro-benchmarks) of problematic code.

                              +Work sponsored by a Mozilla grant continues on PyPy3.5; numerous fixes from +CPython were ported to PyPy and PEP 489 was fully implemented. Of course the +bug fixes and performance enhancements mentioned above are part of both PyPy +2.7 and PyPy 3.5.

                              CFFI, which is part of the PyPy release, has been updated to an unreleased 1.10.1, +improving an already great package for interfacing with C.

                              +Anyone using NumPy 1.13.0 must upgrade PyPy to this release, since we implemented some previously missing C-API functionality. Many other C-extension modules now work with PyPy; let us know if yours does not.

                              +As always, this release fixed many issues and bugs raised by the +growing community of PyPy users. We strongly recommend updating.

                              +You can download the v5.8 release here:
                              + +
                              +We would like to thank our donors and contributors, and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on PyPy, or general help +with making RPython’s JIT even better.

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.
                              +We also welcome developers of other dynamic languages to see what RPython can do for them.
                              +The PyPy 2.7 release supports:
                              +
                              +
                                +
                              • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +

                              +What else is new?

                              +
                              +PyPy 5.7 was released in March, 2017.
                              +
                              +There are many incremental improvements to RPython and PyPy, the complete listing is here. +
                              +
                              +Please update, and continue to help us make PyPy better.

                              +Cheers, The PyPy team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Unknown wrote on 2017-06-09 12:11: +
                              +
                              +

                              Great news! Thank you!

                              +
                              +
                              +
                              +
                              + + Albert Le Blanc wrote on 2017-06-09 12:37: +
                              +
                              +

                              Can we get a comprehensive update on Numpypy? It has gone really quiet since the days when Alex Gaynor used to talk at Pycon etc about the work which has been going on since what 2010/11? The repo has issues that are not looked at. I would really like an honest appraisal of what was learned in the Numpypy project and what is the future of Numpy (Scipy too) & PyPy because the situation for developers like myself is that we're caught between a rock and a hard place. PyPy consistently allows us to write code and explore algorithms in Python!! Whereas CPython forces you into C/Cython continually. PyPy is a great dream in my heart. What you guys are doing - allowing me to write Python and it be fast. What other language forces you so much to write in another language when performance is a consideration? The speed difference between Node.js and Python 3 is laughable. PyPy for the win!!!!

                              But....and it's a big but I am one of those devs who extensively is addicted to numeric arrays, not because I'm a 'quant' or an astronomer or rocket scientist but because Numpy arrays are simply better for many solutions than Python's other data structures. And once leveraged, giving that up to go to PyPy is impossible. It forces you to choose between numpy + slower python (CPython) or slower Numpy and faster python (PyPy).

                              Numpypy was a great dream, the best of both. But it seems to have failed, proven to be too difficult or does it simply need more money? I would appreciate a public update (if one exists, please link to it). Because the sadness for me is that a genuinely fast Python runtime will never be usable until the Numpy/Scipy world works and you get the fast python and as fast numpy.

                              I would really like to help, raise money whatever but maybe I'm out of the loop and the plan has changed?

                              +
                              +
                              +
                              +
                              + + Johny JKJK wrote on 2017-06-09 12:38: +
                              +
                              +

                              Is it possible to resurrect pypy uwsgi integration for pypy3.5?

                              +
                              +
                              +
                              +
                              + + mattip wrote on 2017-06-10 18:37: +
                              +
                              +

                              Hi Albert. We have decided that a better route is to use upstream NumPy for compatibility. We are a small group, and reimplementing all of the c code in NumPy for Numpypy would be a never ending, close to impossible task.

                              However, we do have a different long-term plan to combine numpy and python. Since our c-api emulation layer is slow, perhaps we can "hijack" the most common python calls that cross that emulation border and make them fast. This would utilize much of NumPyPy but would mean that only a subset of the extensive NumPy library would need to be implemented and maintained. We have a branch that demonstrates a proof-of-concept for simple item access (ctypedef struct). Help on the PyPy project is always welcome, come to #pypy on IRC and we can discuss it further.

                              +
                              +
                              +
                              +
                              + + v3ss wrote on 2017-06-27 18:51: +
                              +
                              +

                              Regarding Beta Status on PyPy 5.8 3.5.x , What are the main missing points?
                              What are the current know issues for PyPy 5.8-3.5.x ?

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/07/binary-wheels-for-pypy-8718353804433344916.html b/posts/2017/07/binary-wheels-for-pypy-8718353804433344916.html new file mode 100644 index 000000000..0198b1c3b --- /dev/null +++ b/posts/2017/07/binary-wheels-for-pypy-8718353804433344916.html @@ -0,0 +1,329 @@ + + + + + +Binary wheels for PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Binary wheels for PyPy

                              + + + +
                              +

                              Hi,

                              +this is a short blog post, just to announce the existence of this Github repository, which contains binary PyPy wheels for some selected packages. The availability of binary wheels means that you can install the packages much more quickly, without having to wait for compilation.

                              +
                              +
                              +
                              +At the moment of writing, these packages are available:

                                +
                              • numpy
                              • +
                              • scipy
                              • +
                              • pandas
                              • +
                              • psutil
                              • +
                              • netifaces
                              • +
                              +
                              +For now, we provide only wheels built on Ubuntu, compiled for PyPy 5.8.
                              +In particular, it is worth noting that they are not manylinux1 wheels, which means they might not work on other Linux distributions. For more information, see the explanation in the README of the above repo.

                              +Moreover, the existence of the wheels does not guarantee that they work correctly 100% of the time. They still depend on cpyext, our C-API emulation layer, which is still a work in progress, although it has become better and better during the last months. Again, the wheels are there only to save compilation time.

                              +To install a package from the wheel repository, you can invoke pip like this:

                              $ pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy
                              +
                              +
                              +
                              +Happy installing!
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Unknown wrote on 2017-07-27 11:16: +
                              +
                              +

                              Very nice. The main reason I can't actively recommend PyPy to others is that I would have to help them install all packages, where for CPython I can just say "conda install foo". Working on efforts like this is extremely useful for the community.

                              +
                              +
                              +
                              +
                              + + Gaëtan de Menten wrote on 2017-10-02 08:56: +
                              +
                              +

                              Speaking of which if those were conda packages, that would make it much easier for me. And if pytables and pyyaml worked in pypy (a few years ago they did not and I have no idea what is their current state) and were packaged too, I could finally try pypy on my real projects, and not just toy examples.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.html b/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.html new file mode 100644 index 000000000..19d3aef78 --- /dev/null +++ b/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.html @@ -0,0 +1,491 @@ + + + + + +Let's remove the Global Interpreter Lock | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Let's remove the Global Interpreter Lock

                              + + + +
                              +
                              +

                              Hello everyone

                              +

                              The Python community has been discussing removing the Global Interpreter Lock for +a long time. +There have been various attempts at removing it: +Jython or IronPython successfully removed it with the help of the underlying +platform, and some have yet to bear fruit, like gilectomy. Since our February sprint in Leysin, +we have experimented with the topic of GIL removal in the PyPy project. +We believe that the work done in IronPython or Jython can be reproduced with +only a bit more effort in PyPy. Compared to that, removing the GIL in CPython is a much +harder topic, since it also requires tackling the problem of multi-threaded reference +counting. See the section below for further details.

                              +

                              As we announced at EuroPython, what we have so far is a GIL-less PyPy +which can run very simple multi-threaded, nicely parallelized, programs. +At the moment, more complicated programs probably segfault. The +remaining 90% (and another 90%) of work is with putting locks in strategic +places so PyPy does not segfault during concurrent accesses to +data structures.

                              +

                              Since such work would complicate the PyPy code base and our day-to-day work, +we would like to judge the interest of the community and the commercial +partners to make it happen (we are not looking for individual +donations at this point). We estimate a total cost of $50k, +out of which we already have backing for about 1/3 (with a possible 1/3 +extra from the STM money, see below). This would give us a good +shot at delivering a good proof-of-concept working PyPy with no GIL. If we can get a $100k +contract, we will deliver a fully working PyPy interpreter with no GIL as a release, +possibly separate from the default PyPy release.

                              +

                              People asked several questions, so I'll try to answer the technical parts +here.

                              +

                              What would the plan entail?

                              +

                              We've already done the work on the Garbage Collector to allow running +multi-threaded programs in RPython. "All" that is left is adding locks on mutable +data structures everywhere in the PyPy codebase. Since it would significantly complicate +our workflow, we require real interest in that topic, backed up by +commercial contracts in order to justify the added maintenance burden.

                              +

                              Why did the STM effort not work out?

                              +

                              STM was a research project that proved that the idea is possible. However, +the amount of user effort that is required to make programs run in a +parallelizable way is significant, and we never managed to develop tools +that would help in doing so. At the moment we're not sure if more work +spent on tooling would improve the situation or if the whole idea is really doomed. +The approach also ended up adding significant overhead on single threaded programs, +so in the end it is very easy to make your programs slower. (We have some money +left in the donation pot for STM which we are not using; according to the rules, we +could declare the STM attempt failed and channel that money towards the present +GIL removal proposal.)

                              +

                              Wouldn't subinterpreters be a better idea?

                              +

                              Python is a very mutable language - there are tons of mutable state and +basic objects (classes, functions,...) that are compile-time in other +languages but runtime and fully mutable in Python. In the end, sharing +things between subinterpreters would be restricted to basic immutable +data structures, which defeats the point. Subinterpreters suffer from the same problems as +multiprocessing with no additional benefits. +We believe that reducing mutability to implement subinterpreters is not viable without seriously impacting the +semantics of the language (a conclusion which applies to many other +approaches too).

                              +

                              Why is it easier to do in PyPy than CPython?

                              +

                              Removing the GIL in CPython has two problems:

                              +
                                +
                              • how do we guard access to mutable data structures with locks and
                              • +
                              • what to do with reference counting that needs to be guarded.
                              • +
                              +

                              PyPy only has the former problem; the latter doesn't exist, +due to a different garbage collector approach. Of course the first problem +is a mess too, but at least we are already half-way there. Compared to Jython +or IronPython, PyPy lacks some data structures that are provided by JVM or .NET, +which we would need to implement, hence the problem is a little harder +than on an existing multithreaded platform. However, there is good research +and we know how that problem can be solved.

                              +

                              Best regards,
                              +Maciej Fijalkowski

                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Patrick wrote on 2017-08-14 18:03: +
                              +
                              +

                              Where can one donate? Is there a specific page for it? :)

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2017-08-14 20:12: +
                              +
                              +

                              Where can we we donate or forward a link to managing directors for corporate donations?

                              +
                              +
                              +
                              +
                              + + funny_falcon wrote on 2017-08-14 21:29: +
                              +
                              +

                              Neither .Net, nor Java put locks around every mutable access. Why the hell PyPy should?

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-08-15 00:29: +
                              +
                              +

                              It sounds to me like you are just looking for money to spend. I see no reliable or commercial deliverable coming out of this effort (you listed a bucketload of caveats already). If it were doable in $100k, it would have been done long ago, no? Caveat Emptor to those who toss their money at this.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-08-15 06:05: +
                              +
                              +

                              200+ comments about this article are at: https://news.ycombinator.com/item?id=15008636

                              +
                              +
                              +
                              +
                              + + Zunzster wrote on 2017-08-15 06:20: +
                              +
                              +

                              @funny_falcon: I don't read this as them arguing for putting "putting locks around *every* mutable access". Rather, just the core shared-mutable pieces of the run-time library and infrastructure, which in .NET and the JVM are provided by the VM itself for Jython and IronPython but which PyPy has to implement.

                              @scott_taggart: Your vision seems limited. Perhaps you aren't familiar with the PyPy team's strong history of delivering. It may well be 'doable in $100K' but how is that supposed to have spontaneously happened already without a viable plan and a trusted team which is exactly what the PyPy project is?

                              I always thought the STM concept was really clever and elegant in theory but that the overhead involved, both in recording and rollback-retries, could impact forward progress too much to be viable in practice. Essentially, STM and locks are dual's of each other, with STM having better composition and locks less overhead.

                              At least with a more traditional locking approach, the locks are still being inserted by the interpreter/library, so they can be reasoned about more carefully (and even instrumented programmatically) to avoid some of the classic problems with lock-based designs whilst regaining the performance lost to STM overhead.

                              If anyone can pull it off, the PyPy team can :-)

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-08-15 08:31: +
                              +
                              +

                              +1

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-08-15 09:19: +
                              +
                              +

                              Why not rather implement immutable datastructures like Clojure does?

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2017-08-15 12:42: +
                              +
                              +

                              Oh, just shut up and take my money.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2017-08-15 14:10: +
                              +
                              +

                              I have been very impressed with the PyPy developers accomplishments to date and sincerely hope that they find corporate sponsors for this worthwhile endeavor.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-08-15 20:23: +
                              +
                              +

                              How can people donate? $50k seems a bargain for such an important achievement. That's pocket change to most moderately sized companies.

                              +
                              +
                              +
                              +
                              + + Joce wrote on 2017-08-16 05:17: +
                              +
                              +

                              Sounds good, perhaps time to mark the STM effort as stale?

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-09-13 23:22: +
                              +
                              +

                              This would be awesome, please. :(

                              +
                              +
                              +
                              +
                              + + PvdE wrote on 2017-10-04 06:59: +
                              +
                              +

                              I donated to the original STM and would be happy if it were reallocated to this.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html b/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html new file mode 100644 index 000000000..b0c8ea80d --- /dev/null +++ b/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html @@ -0,0 +1,464 @@ + + + + + +(Cape of) Good Hope for PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              (Cape of) Good Hope for PyPy

                              + + + +
                              +
                              +
                              +
                              +Hello from the other side of the world (for most of you)!

                              +With the excuse of coming to PyCon ZA, during the last two weeks Armin, +Ronan, Antonio and sometimes Maciek had a very nice and productive sprint in +Cape Town, as pictures show :). We would like to say a big thank you to +Kiwi.com, which sponsored part of the travel costs via its awesome Sourcelift +program to help Open Source projects.

                              + + +
                              Armin, Anto and Ronan at Cape Point
                              +
                              +Armin, Ronan and Anto spent most of the time hacking at cpyext, our CPython +C-API compatibility layer: during the last few years, the focus was to make it +work and be compatible with CPython, in order to run existing libraries such +as numpy and pandas. However, we never paid too much attention to performance, +so the net result is that with the latest released version of PyPy, C +extensions generally work but their speed ranges from "slow" to "horribly +slow".

                              +For example, these very simple microbenchmarks measure the speed of +calling (empty) C functions, i.e. the time you spend to "cross the border" +between RPython and C. (Note: this includes the time spent doing the loop in regular Python code.) These are the results on CPython, on PyPy 5.8, and on +our newest in-progress version:

                              $ python bench.py     # CPython
                              +noargs      : 0.41 secs
                              +onearg(None): 0.44 secs
                              +onearg(i)   : 0.44 secs
                              +varargs     : 0.58 secs
                              +
                              +
                              +
                              +
                              +
                              $ pypy-5.8 bench.py   # PyPy 5.8
                              +noargs      : 1.01 secs
                              +onearg(None): 1.31 secs
                              +onearg(i)   : 2.57 secs
                              +varargs     : 2.79 secs
                              +
                              +
                              +
                              +
                              +
                              $ pypy bench.py       # cpyext-refactor-methodobject branch
                              +noargs      : 0.17 secs
                              +onearg(None): 0.21 secs
                              +onearg(i)   : 0.22 secs
                              +varargs     : 0.47 secs
                              +
                              +
                              +
                              +
                              +
                              
                              +
                              
                              +So yes: before the sprint, we were ~2-6x slower than CPython. Now, we are
                              +faster than it!
                              +To reach this result, we made various improvements, such as:
                              +
                              +
                                +
                              1. teach the JIT how to look (a bit) inside the cpyext module;
                              2. +
                              3. write specialized code for calling METH_NOARGS, METH_O and +METH_VARARGS functions; previously, we always used a very general and +slow logic;
                              4. +
                              5. implement freelists to allocate the cpyext versions of int and +tuple objects, as CPython does;
                              6. +
                              7. the cpyext-avoid-roundtrip branch: crossing the RPython/C border is +slowish, but the real problem was (and still is for many cases) we often +cross it many times for no good reason. So, depending on the actual API +call, you might end up in the C land, which calls back into the RPython +land, which goes to C, etc. etc. (ad libitum).
                              8. +
                              +
                              +The branch tries to fix such nonsense: so far, we fixed only some cases, which +are enough to speed up the benchmarks shown above. But most importantly, we +now have a clear path and an actual plan to improve cpyext more and +more. Ideally, we would like to reach a point in which cpyext-intensive +programs run at worst at the same speed as CPython.

                              +The other big topic of the sprint was Armin and Maciej doing a lot of work on the +unicode-utf8 branch: the goal of the branch is to always use UTF-8 as the +internal representation of unicode strings. The advantages are various: +
                              +
                                +
                              • decoding a UTF-8 stream is super fast, as you just need to check that the +stream is valid;
                              • +
                              • encoding to UTF-8 is almost a no-op;
                              • +
                              • UTF-8 is always a more compact representation than the currently +used UCS-4. It's also almost always more compact than the CPython 3.5 latin1/UCS2/UCS4 combo;
                              • +
                              • smaller representation means everything becomes quite a bit faster due to lower cache pressure.
                              • +
                              +
                              +Before you ask: yes, this branch contains special logic to ensure that random +access of single unicode chars is still O(1), as it is on both CPython and the +current PyPy.
                              +We also plan to improve the speed of decoding even more by using modern processor features, like SSE and AVX. Preliminary results show that decoding can be done 100x faster than the current setup. +

                              +In summary, this was a long and profitable sprint, in which we achieved lots +of interesting results. However, what we liked even more was the privilege of +doing commits from awesome places such as the top of Table Mountain:

                              + + +
                              + + +
                              The panorama we looked at instead of staring at cpyext code
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Nickolas wrote on 2017-10-18 22:59: +
                              +
                              +

                              It was awesome meeting you all, and I'm so stoked about the recent PyPy improvements :-D

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2017-10-19 06:31: +
                              +
                              +

                              Fantastic news. Many Python users need to use some of these many specialized CPython-based extension modules for which there is no CFFI alternative extensively and as a result have not benefited much, or not at all, from PyPy's speed advantages. These improvements could make PyPy the default Python for many of us.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2017-10-19 07:57: +
                              +
                              +

                              Could you give a hint to how you're doing O(1) individual character access in UTF-8 strings? Not that I'd find such a requirement particularly necessary (might be handy for all-ASCII strings, but easy to flat those cases), but how is it done? I can figure O(log(n)) ways with up to O(n) storage overhead or O(sqrt(n)) with up to O(sqrt(n)) storage overhead, but O(1) w/o the O(n) storage overhead of having UTF-32 around?

                              +
                              +
                              +
                              +
                              + + Maciej Fijalkowski wrote on 2017-10-19 09:51: +
                              +
                              +

                              Hi Anonymous.

                              It's O(1) time with O(n) storage overhead, but the constants can be manipulated to have 10% or 25% overhead, and only if ever indexed and not ascii at that.

                              +
                              +
                              +
                              +
                              + + intgr wrote on 2017-10-20 15:33: +
                              +
                              +

                              Really excited to hear about the Unicode representation changes; it should finally make PyPy significantly faster at Unicode manipulation than CPython 3.6 is. It seems this has been bogging down PyPy's advantage at Unicode-heavy workloads like webapp template rendering.

                              Even without O(1) access to characters by index, I think it's a great idea to use UTF-8 internally, since that's the prevalent encoding for input/output pretty much everywhere. Accessing Unicode characters by index is an antipattern in most situations and UCS-2/UTF-16 is becoming irrelevant.

                              +
                              +
                              +
                              +
                              + + Oscar Smith wrote on 2017-10-20 16:10: +
                              +
                              +

                              I would also be really interested in a quick blogpost at some point about how to get O(1) indexing without greater storage overhead than just using UTF-32

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.html b/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.html new file mode 100644 index 000000000..e3e1fa44c --- /dev/null +++ b/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.html @@ -0,0 +1,547 @@ + + + + + +How to make your code 80 times faster | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              How to make your code 80 times faster

                              + + + +
                              +
                              +I often hear people who are happy because PyPy makes their code 2 times faster +or so. Here is a short personal story which shows PyPy can go well beyond +that.

                              DISCLAIMER: this is not a silver bullet or a general recipe: it worked in +this particular case, it might not work so well in other cases. But I think it +is still an interesting technique. Moreover, the various steps and +implementations are shown in the same order as I tried them during the +development, so it is a real-life example of how to proceed when optimizing +for PyPy.

                              +Some months ago I played a bit with evolutionary algorithms: the ambitious +plan was to automatically evolve a logic which could control a (simulated) +quadcopter, i.e. a PID controller (spoiler: it doesn't fly).

                              +The idea is to have an initial population of random creatures: at each +generation, the ones with the best fitness survive and reproduce with small, +random variations.

                              +However, for the scope of this post, the actual task at hand is not so +important, so let's jump straight to the code. To drive the quadcopter, a +Creature has a run_step method which runs at each delta_t (full +code):
                              class Creature(object):
                              +    INPUTS = 2  # z_setpoint, current z position
                              +    OUTPUTS = 1 # PWM for all 4 motors
                              +    STATE_VARS = 1
                              +    ...
                              +
                              +    def run_step(self, inputs):
                              +        # state: [state_vars ... inputs]
                              +        # out_values: [state_vars, ... outputs]
                              +        self.state[self.STATE_VARS:] = inputs
                              +        out_values = np.dot(self.matrix, self.state) + self.constant
                              +        self.state[:self.STATE_VARS] = out_values[:self.STATE_VARS]
                              +        outputs = out_values[self.STATE_VARS:]
                              +        return outputs
                              +
                              +
                                +
                              • +inputs is a numpy array containing the desired setpoint and the current +position on the Z axis;
                              • +
                              • +outputs is a numpy array containing the thrust to give to the motors. To +start easy, all the 4 motors are constrained to have the same thrust, so +that the quadcopter only travels up and down the Z axis;
                              • +
                              • +self.state contains arbitrary values of unknown size which are passed from +one step to the next;
                              • +
                              • +self.matrix and self.constant contain the actual logic. By putting +the "right" values there, in theory we could get a perfectly tuned PID +controller. These are randomly mutated between generations.
                              • +
                              +run_step is called at 100Hz (in the virtual time frame of the simulation). At each +generation, we test 500 creatures for a total of 12 virtual seconds each. So, +we have a total of 600,000 executions of run_step at each generation.

                              +At first, I simply tried to run this code on CPython; here is the result:
                              $ python -m ev.main
                              +Generation   1: ... [population = 500]  [12.06 secs]
                              +Generation   2: ... [population = 500]  [6.13 secs]
                              +Generation   3: ... [population = 500]  [6.11 secs]
                              +Generation   4: ... [population = 500]  [6.09 secs]
                              +Generation   5: ... [population = 500]  [6.18 secs]
                              +Generation   6: ... [population = 500]  [6.26 secs]
                              +
                              +Which means ~6.15 seconds/generation, excluding the first.

                              +Then I tried with PyPy 5.9:
                              $ pypy -m ev.main
                              +Generation   1: ... [population = 500]  [63.90 secs]
                              +Generation   2: ... [population = 500]  [33.92 secs]
                              +Generation   3: ... [population = 500]  [34.21 secs]
                              +Generation   4: ... [population = 500]  [33.75 secs]
                              +
                              +Ouch! We are ~5.5x slower than CPython. This was kind of expected: numpy is +based on cpyext, which is infamously slow. (Actually, we are working on +that and on the cpyext-avoid-roundtrip branch we are already faster than +CPython, but this will be the subject of another blog post.)

                              +So, let's try to avoid cpyext. The first obvious step is to use numpypy +instead of numpy (actually, there is a hack to use just the micronumpy +part). Let's see if the speed improves:
                              $ pypy -m ev.main   # using numpypy
                              +Generation   1: ... [population = 500]  [5.60 secs]
                              +Generation   2: ... [population = 500]  [2.90 secs]
                              +Generation   3: ... [population = 500]  [2.78 secs]
                              +Generation   4: ... [population = 500]  [2.69 secs]
                              +Generation   5: ... [population = 500]  [2.72 secs]
                              +Generation   6: ... [population = 500]  [2.73 secs]
                              +
                              +So, ~2.7 seconds on average: this is 12x faster than PyPy+numpy, and more than +2x faster than the original CPython. At this point, most people would be happy +and go tweeting how PyPy is great.

                              +In general, when talking of CPython vs PyPy, I am rarely satisfied with a 2x +speedup: I know that PyPy can do much better than this, especially if you +write code which is specifically optimized for the JIT. For a real-life +example, have a look at capnpy benchmarks, in which the PyPy version is +~15x faster than the heavily optimized CPython+Cython version (both have been +written by me, and I tried hard to write the fastest code for both +implementations).

                              +So, let's try to do better. As usual, the first thing to do is to profile and +see where we spend most of the time. Here is the vmprof profile. We spend a +lot of time inside the internals of numpypy, and allocating tons of temporary +arrays to store the results of the various operations.

                              +Also, let's look at the jit traces and search for the function run: +this is the loop in which we spend most of the time, and it is composed of 1796 +operations. The operations emitted for the line np.dot(...) + +self.constant are listed between lines 1217 and 1456. Here is the excerpt +which calls np.dot(...); most of the ops are cheap, but at line 1232 we +see a call to the RPython function descr_dot; by looking at the +implementation we see that it creates a new W_NDimArray to store the +result, which means it has to do a malloc():
                              + +
                              +
                              +The implementation of the + self.constant part is also interesting: +contrary to the former, the call to W_NDimArray.descr_add has been inlined by +the JIT, so we have a better picture of what's happening; in particular, we +can see the call to __0_alloc_with_del____ which allocates the +W_NDimArray for the result, and the raw_malloc which allocates the +actual array. Then we have a long list of 149 simple operations which set the +fields of the resulting array, construct an iterator, and finally do a +call_assembler: this is the actual logic to do the addition, which was +JITted independently; call_assembler is one of the operations to do +JIT-to-JIT calls:
                              + +
                              +
                              +All of this is very suboptimal: in this particular case, we know that the +shape of self.matrix is always (2, 3): so, we are doing an incredible +amount of work, including calling malloc() twice for the temporary arrays, just to +call two functions which ultimately do a total of 6 multiplications +and 6 additions. Note also that this is not a fault of the JIT: CPython+numpy +has to do the same amount of work, just hidden inside C calls.

                              +One possible solution to this nonsense is a well known compiler optimization: +loop unrolling. From the compiler point of view, unrolling the loop is always +risky because if the matrix is too big you might end up emitting a huge blob +of code, possibly useless if the shape of the matrices changes frequently: this +is the main reason why the PyPy JIT does not even try to do it in this case.

                              +However, we know that the matrix is small, and always of the same +shape. So, let's unroll the loop manually:
                              class SpecializedCreature(Creature):
                              +
                              +    def __init__(self, *args, **kwargs):
                              +        Creature.__init__(self, *args, **kwargs)
                              +        # store the data in a plain Python list
                              +        self.data = list(self.matrix.ravel()) + list(self.constant)
                              +        self.data_state = [0.0]
                              +        assert self.matrix.shape == (2, 3)
                              +        assert len(self.data) == 8
                              +
                              +    def run_step(self, inputs):
                              +        # state: [state_vars ... inputs]
                              +        # out_values: [state_vars, ... outputs]
                              +        k0, k1, k2, q0, q1, q2, c0, c1 = self.data
                              +        s0 = self.data_state[0]
                              +        z_sp, z = inputs
                              +        #
                              +        # compute the output
                              +        out0 = s0*k0 + z_sp*k1 + z*k2 + c0
                              +        out1 = s0*q0 + z_sp*q1 + z*q2 + c1
                              +        #
                              +        self.data_state[0] = out0
                              +        outputs = [out1]
                              +        return outputs
                              +
                              +In the actual code there is also a sanity check which asserts that the +computed output is the very same as the one returned by Creature.run_step.

                              +So, let's try to see how it performs. First, with CPython:
                              $ python -m ev.main
                              +Generation   1: ... [population = 500]  [7.61 secs]
                              +Generation   2: ... [population = 500]  [3.96 secs]
                              +Generation   3: ... [population = 500]  [3.79 secs]
                              +Generation   4: ... [population = 500]  [3.74 secs]
                              +Generation   5: ... [population = 500]  [3.84 secs]
                              +Generation   6: ... [population = 500]  [3.69 secs]
                              +
                              +This looks good: 60% faster than the original CPython+numpy +implementation. Let's try on PyPy:
                              Generation   1: ... [population = 500]  [0.39 secs]
                              +Generation   2: ... [population = 500]  [0.10 secs]
                              +Generation   3: ... [population = 500]  [0.11 secs]
                              +Generation   4: ... [population = 500]  [0.09 secs]
                              +Generation   5: ... [population = 500]  [0.08 secs]
                              +Generation   6: ... [population = 500]  [0.12 secs]
                              +Generation   7: ... [population = 500]  [0.09 secs]
                              +Generation   8: ... [population = 500]  [0.08 secs]
                              +Generation   9: ... [population = 500]  [0.08 secs]
                              +Generation  10: ... [population = 500]  [0.08 secs]
                              +Generation  11: ... [population = 500]  [0.08 secs]
                              +Generation  12: ... [population = 500]  [0.07 secs]
                              +Generation  13: ... [population = 500]  [0.07 secs]
                              +Generation  14: ... [population = 500]  [0.08 secs]
                              +Generation  15: ... [population = 500]  [0.07 secs]
                              +
                              +Yes, it's not an error. After a couple of generations, it stabilizes at around +~0.07-0.08 seconds per generation. This is around 80 (eighty) times faster +than the original CPython+numpy implementation, and around 35-40x faster than +the naive PyPy+numpypy one.

                              +Let's look at the trace again: it no longer contains expensive calls, and +certainly no more temporary malloc() s. The core of the logic is between +lines 386-416, where we can see that it does fast C-level multiplications and +additions: float_mul and float_add are translated straight into +mulsd and addsd x86 instructions.

                              +As I said before, this is a very particular example, and the techniques +described here do not always apply: it is not realistic to expect an 80x +speedup on arbitrary code, unfortunately. However, it clearly shows the potential of PyPy when +it comes to high-speed computing. And most importantly, it's not a toy +benchmark which was designed specifically to have good performance on PyPy: +it's a real world example, albeit small.

                              +You might be also interested in the talk I gave at last EuroPython, in which I +talk about a similar topic: "The Joy of PyPy JIT: abstractions for free" +(abstract, slides and video).

                              +

                              +How to reproduce the results

                              +
                              $ git clone https://github.com/antocuni/evolvingcopter
                              +$ cd evolvingcopter
                              +$ {python,pypy} -m ev.main --no-specialized --no-numpypy
                              +$ {python,pypy} -m ev.main --no-specialized
                              +$ {python,pypy} -m ev.main
                              +
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Unknown wrote on 2017-11-02 21:23: +
                              +
                              +

                              Isn't this a factor 80 slowdown because of a design error? Normally, one should store all creatures in a big numpy array and evaluate run_step on all creatures at once.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-11-27 10:48: +
                              +
                              +

                              I don't understand - how do you figure out that line 1232 is not cheap?

                              +
                              +
                              +
                              +
                              + + Antonio Cuni wrote on 2017-11-28 09:40: +
                              +
                              +

                              @anatoly: line 1232 is a call to descr_dot: if you look at the implementation, you see that it does lots of things including mallocs, and those we know are not cheap at all

                              +
                              +
                              +
                              +
                              + + homm wrote on 2018-05-21 15:17: +
                              +
                              +

                              Have you tried the third argument of numpy.dot, out to avoid memory alocation?

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/10/pypy-v59-released-now-supports-pandas-2261195727261691228.html b/posts/2017/10/pypy-v59-released-now-supports-pandas-2261195727261691228.html new file mode 100644 index 000000000..a8e58146c --- /dev/null +++ b/posts/2017/10/pypy-v59-released-now-supports-pandas-2261195727261691228.html @@ -0,0 +1,426 @@ + + + + + +PyPy v5.9 Released, Now Supports Pandas, NumPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v5.9 Released, Now Supports Pandas, NumPy

                              + + + +
                              +
                              +The PyPy team is proud to release both PyPy3.5 v5.9 (a beta-quality interpreter for Python +3.5 syntax) and PyPy2.7 v5.9 (an interpreter supporting +Python 2.7 syntax).

                                +
                              • NumPy and Pandas now work on PyPy2.7 (together with Cython 0.27.1). Many other modules +based on C-API extensions work on PyPy as well.
                              • +
                              +
                                +
                              • Cython 0.27.1 (released very recently) supports more projects with PyPy, both +on PyPy2.7 and PyPy3.5 beta. Note version 0.27.1 is now the minimum +version that supports this version of PyPy, due to some interactions with +updated C-API interface code.
                              • +
                              +
                                +
                              • We optimized the JSON parser for recurring string keys, which should decrease +memory use by up to 50% and increase parsing speed by up to 15% for large JSON files +with many repeating dictionary keys (which is quite common).
                              • +
                              +
                                +
                              • +CFFI, which is part of the PyPy release, has been updated to 1.11.1, +improving an already great package for interfacing with C. CFFI now supports +complex arguments in API mode, as well as char16_t and char32_t and has +improved support for callbacks.
                              • +
                              +
                                +
                              • Issues in the C-API compatibility layer that appeared as excessive memory +use were cleared up and other incompatibilities were resolved. The C-API +compatibility layer does slow down code which crosses the python-c interface +often. Some fixes are in the pipelines for some of the performance issues, and we still recommend +using pure python on PyPy or interfacing via CFFI
                              • +
                              +
                              +Please let us know if your use case is slow, we have ideas how to make things +faster but need real-world examples (not micro-benchmarks) of problematic code.

                              +Work sponsored by a Mozilla grant continues on PyPy3.5; we continue on the path to the goal of a complete python 3.5 implementation. Of course the bug fixes and performance enhancements +mentioned above are part of both PyPy2.7 and PyPy3.5 beta.

                              +As always, this release fixed many other issues and bugs raised by the +growing community of PyPy users. We strongly recommend updating.

                              +You can download the v5.9 releases here (note that we provide PyPy3.5 binaries for only Linux 64bit for now):

                              + +
                              +We would like to thank our donors and contributors, and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on PyPy, or general help +with making RPython’s JIT even better.

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for CPython 2.7 (stdlib version 2.7.13), and CPython 3.5 (stdlib version 3.5.3). It’s fast (PyPy and CPython 2.7.x performance comparison) due to its integrated tracing JIT compiler.

                              +We also welcome developers of other dynamic languages to see what RPython can do for them.

                              +The PyPy 2.7 release supports:
                              +
                              +
                                +
                              • +x86 machines on most common operating systems (Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +

                              +What else is new?

                              +
                              +PyPy 5.8 was released in June, 2017.
                              +
                              +There are many incremental improvements to RPython and PyPy, the complete listing is here.
                              +
                              +Please update, and continue to help us make PyPy better.

                              +Cheers, The PyPy team
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2017-10-05 19:36: +
                              +
                              +

                              Pypy3 works very well with flask. Good Job and thanx.



                              cheers Rob

                              +
                              +
                              +
                              +
                              + + Carlos Vega wrote on 2017-10-08 16:56: +
                              +
                              +

                              Good job ! 😀🎉
                              It would be great if you could update https://packages.pypy.org/
                              I'm going to donate again ! your work is awesome.

                              +
                              +
                              +
                              +
                              + + melin wrote on 2017-11-02 13:57: +
                              +
                              +

                              Pypy test run pands two or three times slower than pyhon

                              df = sparkSession.sql("select * from test_users_dt").toPandas()
                              for index, row in df.iterrows():
                              result = 0

                              for key in range(0, 10000000):
                              event_type = row.event_type
                              if key > 234:
                              result = result + 1
                              len(event_type + "123")

                              print(result)

                              +
                              +
                              +
                              +
                              + + Maciej Fijalkowski wrote on 2017-11-02 14:00: +
                              +
                              +

                              @melin

                              We know that. We're in the process of improving that by merging various cpyext improvement branches. Stay tuned.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2017-11-20 09:14: +
                              +
                              +

                              So this means the numpy port for pypy is redundant now right? We can use the original python numpy package?

                              +
                              +
                              +
                              +
                              + + Antonio Cuni wrote on 2017-11-20 09:48: +
                              +
                              +

                              @Eitan

                              yes, but look at this FAQ for a longer explanation: https://doc.pypy.org/en/latest/faq.html#what-about-numpy-numpypy-micronumpy

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2017/12/pypy27-and-pypy35-v510-dual-release-3223396318213306071.html b/posts/2017/12/pypy27-and-pypy35-v510-dual-release-3223396318213306071.html new file mode 100644 index 000000000..eed309380 --- /dev/null +++ b/posts/2017/12/pypy27-and-pypy35-v510-dual-release-3223396318213306071.html @@ -0,0 +1,411 @@ + + + + + +PyPy2.7 and PyPy3.5 v5.10 dual release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy2.7 and PyPy3.5 v5.10 dual release

                              + + + +
                              +
                              +

                              The PyPy team is proud to release both PyPy2.7 v5.10 (an interpreter supporting +Python 2.7 syntax), and a final PyPy3.5 v5.10 (an interpreter for Python +3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release.

                              +

                              This release is an incremental release with very few new features, the main +feature being the final PyPy3.5 release that works on linux and OS X with beta +windows support. It also includes fixes for vmprof cooperation with greenlets.

                              +

                              Compared to 5.9, the 5.10 release contains mostly bugfixes and small improvements. +We have in the pipeline big new features coming for PyPy 6.0 that did not make +the release cut and should be available within the next couple months.

                              +

                              As always, this release is 100% compatible with the previous one and fixes +several issues and bugs raised by the growing community of PyPy users. +As always, we strongly recommend updating.

                              +

                              There are quite a few important changes that are in the pipeline that did not +make it into the 5.10 release. Most important are speed improvements to cpyext +(which will make numpy and pandas a bit faster) and the utf8 branch that changes +the internal representation of unicode to utf8, which should help especially the +Python 3.5 version of PyPy.

                              +

                              This release concludes the Mozilla Open Source grant for having a compatible +PyPy 3.5 release and we're very grateful for that. Of course, we will continue +to improve PyPy 3.5 and probably move to 3.6 during the course of 2018.

                              +

                              You can download the v5.10 releases here:

                              +
                              +https://pypy.org/download.html +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project.

                              +

                              We would also like to thank our contributors and +encourage new people to join the project. PyPy has many +layers and we need help with all of them: PyPy and RPython documentation +improvements, tweaking popular modules to run on pypy, or general help +with making RPython's JIT even better.

                              +
                              +

                              What is PyPy?

                              +

                              PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7 and CPython 3.5. It's fast (PyPy and CPython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              The PyPy release supports:

                              +
                              +
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +
                              +

                              Changelog

                              +
                                +
                              • improve ssl handling on windows for pypy3 (makes pip work)
                              • +
                              • improve unicode handling in various error reporters
                              • +
                              • fix vmprof cooperation with greenlets
                              • +
                              • fix some things in cpyext
                              • +
                              • test and document the cmp(nan, nan) == 0 behaviour
                              • +
                              • don't crash when calling sleep with inf or nan
                              • +
                              • fix bugs in _io module
                              • +
                              • inspect.isbuiltin() now returns True for functions implemented in C
                              • +
                              • allow the sequences future-import, docstring, future-import for CPython bug compatibility
                              • +
                              • Issue #2699: non-ascii messages in warnings
                              • +
                              • posix.lockf
                              • +
                              • fixes for FreeBSD platform
                              • +
                              • add .debug files, so builds contain debugging info, instead of being stripped
                              • +
                              • improvements to cppyy
                              • +
                              • issue #2677 copy pure c PyBuffer_{From,To}Contiguous from cpython
                              • +
                              • issue #2682, split firstword on any whitespace in sqlite3
                              • +
                              • ctypes: allow ptr[0] = foo when ptr is a pointer to struct
                              • +
                              • matplotlib will work with tkagg backend once matplotlib pr #9356 is merged
                              • +
                              • improvements to utf32 surrogate handling
                              • +
                              • cffi version bump to 1.11.2
                              • +
                              +Maciej Fijalkowski, Matti Picus and the whole PyPy team +
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Unknown wrote on 2017-12-25 21:25: +
                              +
                              +

                              Thanks a lot! What a christmas present!

                              +
                              +
                              +
                              +
                              + + stuaxo wrote on 2017-12-25 23:32: +
                              +
                              +

                              Great work and happy Xmas :)

                              +
                              +
                              +
                              +
                              + + Joce wrote on 2017-12-26 04:40: +
                              +
                              +

                              Thanks for the Christmas present of a new release!

                              I'm having issues with the official builds of 5.10 for macOS x64 (the "high sierra" version for pypy3):

                              With pypy3: dyld: Library not loaded: /usr/local/opt/libffi/lib/libffi.6.dylib
                              Referenced from: /Users/joce/devtools/python/pypy3-v5.10.0-osx64/bin/libpypy3-c.dylib
                              Reason: image not found
                              Abort trap: 6

                              With pypy2: dyld: Library not loaded: /usr/local/opt/openssl/lib/libssl.1.0.0.dylib
                              Referenced from: /Users/joce/devtools/python/pypy2-v5.10.0-osx64/bin/libpypy-c.dylib
                              Reason: image not found
                              Abort trap: 6

                              Given that I have no homebrew or macports installed (and never have on that fresh-ish install of high sierra), /usr/local/opt/ doesn't even exist, so it's not an appropriate folder for the linker to search in.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2018-01-05 21:23: +
                              +
                              +

                              Excellent work! I look forward to using PyPy with my 3.5-compatible applications!

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/01/leysin-winter-sprint-17-24-march-2018-7141092581585849418.html b/posts/2018/01/leysin-winter-sprint-17-24-march-2018-7141092581585849418.html new file mode 100644 index 000000000..7fd4ac7f0 --- /dev/null +++ b/posts/2018/01/leysin-winter-sprint-17-24-march-2018-7141092581585849418.html @@ -0,0 +1,370 @@ + + + + + +Leysin Winter sprint: 17-24 March 2018 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Leysin Winter sprint: 17-24 March 2018

                              + + + +
                              + + + +
                              + +

                              The next PyPy sprint will be in Leysin, Switzerland, for the thirteenth +time. This is a fully public sprint: newcomers and topics other than +those proposed below are welcome.

                              + +

                              (Note: this sprint is independent from the suggested April-May sprint in +Poland.)

                              + +

                              Goals and topics of the sprint

                              + +

                              The list of topics is open, but here is our current list:

                              + +
                              +
                              +
                              +
                                +
                              • cffi tutorial/overview rewrite +
                              • +
                              • py3 test runners are too complicated +
                              • +
                              • make win32 builds green +
                              • +
                              • make packaging more like cpython/portable builds +
                              • +
                              • get CI builders for PyPy into mainstream projects (Numpy, Scipy, lxml, uwsgi) +
                              • +
                              • get more of scientific stack working (tensorflow?) +
                              • +
                              • cpyext performance improvements +
                              • +
                              • General 3.5 and 3.6 improvements +
                              • +
                              • JIT topics: guard-compatible, and the subsequent research project to save and reuse traces across processes +
                              • +
                              • finish unicode-utf8 +
                              • +
                              • update www.pypy.org, speed.pypy.org (web devs needed) +
                              • +
                              +

                              As usual, the main side goal is to have fun in winter sports :-) +We can take a day off (for ski or anything else).

                              + +

                              Exact times

                              + +

                              Work days: starting March 18th (~noon), ending March 24th (~noon).

                              + +

                              Please see announcement.txt for more information.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Евгений Демченко wrote on 2018-01-09 05:49: +
                              +
                              +

                              Can we expect a python 3.6 support released anytime soon? Thanks!

                              +
                              +
                              +
                              +
                              + + Oscar Smith wrote on 2018-01-13 01:34: +
                              +
                              +

                              It would be nice to have tensorflow working on pypy, even if there aren't many real world cases where this is useful, as most tensorflow does not use python for much heavy lifting.

                              +
                              +
                              +
                              +
                              + + Eric van Riet Paap wrote on 2018-02-09 22:24: +
                              +
                              +

                              Hi PyPy-team!

                              While I was checking out a reinforcement learning repo I thought it would benefit a lot from having the games it was learning in something faster than CPython. So I had another look at PyPy. Tensorflow I could not install so I am really happy that this is on the agenda for the next sprint!

                              good luck and have fun!

                              Eric

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/01/pypy-5101-bugfix-release-for-python-35-8485250762789380657.html b/posts/2018/01/pypy-5101-bugfix-release-for-python-35-8485250762789380657.html new file mode 100644 index 000000000..4f2d11096 --- /dev/null +++ b/posts/2018/01/pypy-5101-bugfix-release-for-python-35-8485250762789380657.html @@ -0,0 +1,345 @@ + + + + + +PyPy 5.10.1 bugfix release for python 3.5 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy 5.10.1 bugfix release for python 3.5

                              + + + +
                              +
                              +We have released a bug fix PyPy3.5-v5.10.1 +due to the following issues:
                              +
                              +
                                +
                              • Fix time.sleep(float('nan')) which would hang on Windows
                              • +
                              • Fix missing errno constants on Windows
                              • +
                              • Fix issue 2718 for the REPL on Linux
                              • +
                              • Fix an overflow in converting int secs to nanosecs (issue 2717 )
                              • +
                              • Using kwarg 'flag' to os.setxattr had no effect
                              • +
                              • Fix the winreg module for unicode entries in the registry on Windows
                              • +
                              +
                              +
                              +Note that many of these fixes are for our new beta version of PyPy3.5 on Windows. There may be more unicode problems in the Windows beta version, +especially concerning directory- and file-names with non-ASCII +characters.

                              +On macOS, we recommend you wait for the +Homebrew package to prevent issues with third-party packages. For other supported platforms our downloads are available now.
                              +Thanks to those who reported the issues.

                              +

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.
                              + +We also welcome developers of other dynamic languages to see what RPython +can do for them.
                              + +This PyPy 3.5 release supports:
                              +
                              +
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bits, macOS 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +Please update, and continue to help us make PyPy better.
                              + +Cheers
                              + +The PyPy Team
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/03/leysin-winter-sprint-2018-review-3988364248531980164.html b/posts/2018/03/leysin-winter-sprint-2018-review-3988364248531980164.html new file mode 100644 index 000000000..14ce07684 --- /dev/null +++ b/posts/2018/03/leysin-winter-sprint-2018-review-3988364248531980164.html @@ -0,0 +1,346 @@ + + + + + +Leysin Winter Sprint 2018: review | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Leysin Winter Sprint 2018: review

                              + + + +
                              +

                              Like every year, the PyPy developers and a couple of newcomers + gathered in Leysin, Switzerland, to share their thoughts and + contribute to the development of PyPy.

                              +

                              As always, we had interesting discussions about how we could + improve PyPy, to make it the first choice for even more + developers. We also made some progress with current issues, like + compatibility with Python 3.6 and improving the performance of + CPython extension modules, where we fixed a lot of bugs and gained + new insights about where and how we could tweak PyPy.

                              +

                              We were very happy about the number of new people who joined us + for the first time, and hope they enjoyed it as much as everyone + else.

                              +

                              Topics

                              + We worked on the following topics (and more!):
                                +
                              • Introductions for newcomers
                              • +
                              • Python 3.5 and 3.6 improvements
                              • +
                              • CPyExt performance improvements and GC implementation
                                +
                              • +
                              • JIT: guard-compatible implementation
                                +
                              • +
                              • Pygame performance improvements
                              • +
                              • Unicode/UTF8 implementation
                                +
                              • +
                              • CFFI tutorial/overview rewrite +
                              • +
                              • py3 test runners refactoring
                              • +
                              • RevDB improvements
                                +
                              • +
                              + The weather was really fine for most of the week, with only + occasional snow and fog. We started our days with a short (and + sometimes not so short) planning session and enjoyed our dinners in + the great restaurants in the area. Some of us even started earlier + and continued till late night. It was a relaxed, but also very + productive atmosphere. On our break day on Wednesday, we enjoyed the + great conditions and went skiing and hiking. +

                              Attendees

                              +
                                +
                              • Arianna
                              • +
                              • Jean-Daniel
                                +
                              • +
                              • Stefan Beyer
                              • +
                              • Floris Bruynooghe
                                +
                              • +
                              • Antonio Cuni
                              • +
                              • René Dudfield
                              • +
                              • Manuel Jacob
                              • +
                              • Ronan Lamy
                              • +
                              • Remi Meier
                              • +
                              • Matti Picus
                                +
                              • +
                              • Armin Rigo
                              • +
                              • Alexander Schremmer
                                +
                              • +
                              + Leysin is easily reachable from Geneva Airport, so feel free to join + us next time!


                              Cheers,
                              + Stefan

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/04/how-to-ignore-annoying-cython-warnings-1007636731207810779.html b/posts/2018/04/how-to-ignore-annoying-cython-warnings-1007636731207810779.html new file mode 100644 index 000000000..0e2501bcd --- /dev/null +++ b/posts/2018/04/how-to-ignore-annoying-cython-warnings-1007636731207810779.html @@ -0,0 +1,347 @@ + + + + + +How to ignore the annoying Cython warnings in PyPy 6.0 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              How to ignore the annoying Cython warnings in PyPy 6.0

                              + + + +
                              +
                              +
                              +
                              +
                              +If you install any Cython-based module in PyPy 6.0.0, it is very likely that you get a warning like this:
                              +
                              >>>> import numpy
                              +/data/extra/pypy/6.0.0/site-packages/numpy/random/__init__.py:99: UserWarning: __builtin__.type size changed, may indicate binary incompatibility. Expected 888, got 408
                              +  from .mtrand import *
                              +
                              +
                              +The TL;DR version is: the warning is a false alarm, and you can hide it by doing:
                              +
                              $ pypy -m pip install pypy-fix-cython-warning
                              +
                              +
                              +The package does not contain any module, only a .pth file which installs a warning filter at startup.
                              +

                              +Technical details

                              +
                              +This happens because whenever Cython compiles a pyx file, it generates C code which does a sanity check on the C size of PyType_Type. PyPy versions up to 5.10 are buggy and report the incorrect size, so Cython includes a workaround to compare it with the incorrect value, when on PyPy.
                              +
                              +PyPy 6 fixed the bug and now PyType_Type reports the correct size; however, Cython still tries to compare it with the old, buggy value, so it (wrongly) emits the warning.
                              +
                              +Cython 0.28.2 includes a fix for it, so that C files generated by it no longer emit the warning. However, most packages are distributed with pre-cythonized C files. For example, numpy-1.14.2.zip includes C files which were generated by Cython 0.26.1: if you compile it you still get the warning, even if you locally installed a newer version of Cython.

                              +There is not much that we can do on the PyPy side, apart from waiting for all the Cython-based packages to do a new release which includes C files generated by a newer Cython.  In the meantime, installing this module will silence the warning. +
                              +
                              +
                              +
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Ralf Gommers wrote on 2018-04-29 04:42: +
                              +
                              +

                              I've opened an issue to allow easier workarounds via a Cython compile flag: https://github.com/cython/cython/issues/2221

                              +
                              +
                              +
                              +
                              + + Antonio Cuni wrote on 2018-05-02 10:34: +
                              +
                              +

                              Thanks, although note that this flag would not help much in this case. Even if it were there, the package author would still have to recompile/republish each package in order to get rid of them. And once you do that, the warning vanishes anyway in the case of PyPy :)

                              +
                              +
                              +
                              +
                              + + Amaroq Starwind wrote on 2018-05-10 19:12: +
                              +
                              +

                              PyPy looks awesome. I can't wait for Python 3.6.5 support and/or a Windows x86-64 version! Though it would be unlikely, Anaconda and/or IdleX support would be awesome too.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/04/improving-syntaxerror-in-pypy-5733639208090522433.html b/posts/2018/04/improving-syntaxerror-in-pypy-5733639208090522433.html new file mode 100644 index 000000000..f9d5fe3d9 --- /dev/null +++ b/posts/2018/04/improving-syntaxerror-in-pypy-5733639208090522433.html @@ -0,0 +1,509 @@ + + + + + +Improving SyntaxError in PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Improving SyntaxError in PyPy

                              + + + +
                              +

                              For the last year, my halftime job has been to teach non-CS uni students +to program in Python. While doing that, I have been trying to see what common +stumbling blocks exist for novice programmers. There are many +things that could be said here, but a common theme that emerges is +hard-to-understand error messages. One source of such error messages, +particularly when starting out, is SyntaxErrors.

                              +

                              PyPy's parser (mostly following the architecture of CPython) uses a +regular-expression-based tokenizer with some cleverness to deal with +indentation, and a simple LR(1) parser. Both of these components obviously +produce errors for invalid syntax, but the messages are not very helpful. Often, +the message is just "invalid syntax", without any hint of what exactly is wrong. +In the last couple of weeks I have invested a little bit of effort to make them a +tiny bit better. They will be part of the upcoming PyPy 6.0 release. Here are +some examples of what changed.

                              +
                              +

                              Missing Characters

                              +

                              The first class of errors occurs when a token is missing, often there is only one +valid token that the parser expects. This happens most commonly by leaving out +the ':' after control flow statements (which is the syntax error I personally +still make at least a few times a day). In such situations, the parser will now +tell you which character it expected:

                              +
                              +>>>> # before
                              +>>>> if 1
                              +  File "<stdin>", line 1
                              +    if 1
                              +       ^
                              +SyntaxError: invalid syntax
                              +>>>>
                              +
                              +>>>> # after
                              +>>>> if 1
                              +  File "<stdin>", line 1
                              +    if 1
                              +       ^
                              +SyntaxError: invalid syntax (expected ':')
                              +>>>>
                              +
                              +

                              Another example of this feature:

                              +
                              +>>>> # before
                              +>>>> def f:
                              +  File "<stdin>", line 1
                              +    def f:
                              +        ^
                              +SyntaxError: invalid syntax
                              +>>>>
                              +
                              +>>>> # after
                              +>>>> def f:
                              +  File "<stdin>", line 1
                              +    def f:
                              +         ^
                              +SyntaxError: invalid syntax (expected '(')
                              +>>>>
                              +
                              +
                              +
                              +

                              Parentheses

                              +

                              Another source of errors is unmatched parentheses. Here, PyPy has always had +slightly better error messages than CPython:

                              +
                              +>>> # CPython
                              +>>> )
                              +  File "<stdin>", line 1
                              +    )
                              +    ^
                              +SyntaxError: invalid syntax
                              +>>>
                              +
                              +>>>> # PyPy
                              +>>>> )
                              +  File "<stdin>", line 1
                              +    )
                              +    ^
                              +SyntaxError: unmatched ')'
                              +>>>>
                              +
                              +

                              The same is true for parentheses that are never closed (the call to eval is +needed to get the error, otherwise the repl will just wait for more input):

                              +
                              +>>> # CPython
                              +>>> eval('(')
                              +  File "<string>", line 1
                              +    (
                              +    ^
                              +SyntaxError: unexpected EOF while parsing
                              +>>>
                              +
                              +>>>> # PyPy
                              +>>>> eval('(')
                              +  File "<string>", line 1
                              +    (
                              +    ^
                              +SyntaxError: parenthesis is never closed
                              +>>>>
                              +
                              +

                              What I have now improved is the case of parentheses that are matched wrongly:

                              +
                              +>>>> # before
                              +>>>> (1,
                              +.... 2,
                              +.... ]
                              +  File "<stdin>", line 3
                              +    ]
                              +    ^
                              +SyntaxError: invalid syntax
                              +>>>>
                              +
                              +>>>> # after
                              +>>>> (1,
                              +.... 2,
                              +.... ]
                              +  File "<stdin>", line 3
                              +    ]
                              +    ^
                              +SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' on line 1
                              +>>>>
                              +
                              +
                              +
                              +

                              Conclusion

                              +

                              Obviously these are just some very simple cases, and there is still a lot of +room for improvement (one huge problem is that only a single SyntaxError is +ever shown per parse attempt, but fixing that is rather hard).

                              +

                              If you have a favorite unhelpful SyntaxError message you love to hate, please +tell us in the comments and we might try to improve it. Other kinds of +non-informative error messages are also always welcome!

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + stuaxo wrote on 2018-04-10 11:36: +
                              +
                              +

                              This is great, I've been thinking along these lines when it comes to python errors for a while.

                              This kind of improvements would be great for the long-suffering python web developers too.

                              +
                              +
                              +
                              +
                              + + stuaxo wrote on 2018-04-10 11:38: +
                              +
                              +

                              Despite my typo-ridden comment, English is my first language :(

                              +
                              +
                              +
                              +
                              + + René Dudfield wrote on 2018-04-10 16:00: +
                              +
                              +

                              I've seen people struggle with lambda.

                              >>> lambda x:
                              File "", line 1
                              lambda x:
                              ^
                              SyntaxError: invalid syntax

                              +
                              +
                              +
                              +
                              + + smurfix wrote on 2018-04-10 16:55: +
                              +
                              +

                              Upon a syntax error, you might want to scan forward until the next line with the current(ly-broken) statement's indent (or maybe until there's a dedent to below that level (except when already at top level, obviously)), then resume parsing.

                              +
                              +
                              +
                              +
                              + + André Roberge wrote on 2018-04-11 00:12: +
                              +
                              +

                              I applaud this initiative. This is something that I have attempted to do on https://reeborg.ca/reeborg.html (only for code run in the the editor, not for the repl). I also tried to provide translations when using languages other than English. I think it would be great if you could somehow provide a hook to easily add translations.

                              +
                              +
                              +
                              +
                              + + Benjamin wrote on 2018-04-11 07:07: +
                              +
                              +

                              Missing commas between elements in data structures is probably my most common syntax error, especially when dealing with nested data structures or structures split across multiple lines. And while they're something I can recognize very easily, the actual error message isn't especially helpful, particularly when the next element after a missing comma is on the following line.

                              +
                              +
                              +
                              +
                              + + Unknown wrote on 2018-04-11 14:38: +
                              +
                              +

                              Thanks for the explanation. It all makes sense now that I know Python uses regular expressions in its parser. When Idle points to a random space character within the indentation, off to the left of a code block implemented in compliance with every recognized convention, boldly proclaiming "syntax error", I know precisely which vestigial anti-Pythonic Bell Labs holdover to resent. Again.

                              +
                              +
                              +
                              +
                              + + Carl Friedrich Bolz-Tereick wrote on 2018-04-12 20:23: +
                              +
                              +

                              Everybody thanks for the suggestions! I've added these to my collections of things I might want to fix.

                              @smurfix there is a huge amount of scientific papers on approaches how to do stuff like that, I am currently working through them (slowly)

                              @Unknown do you have an example for this behaviour?

                              +
                              +
                              +
                              +
                              + + Noah F. San Tsorvutz wrote on 2018-04-13 19:39: +
                              +
                              +

                              Sorry for the 'unknown' status ... In fact, it happened again today. I can send a screenshot, if that will help, confirming the presence of a red highlighted space, among many seemingly non-offending spaces, within the left margin indentation. Let me see if it is still happening when I try to run that code ... No, that exact SNAFU has moved on, but I now have an example of a syntax error being highlighted within a comment. Is that interesting?

                              +
                              +
                              +
                              +
                              + + Amaroq Starwind wrote on 2018-05-10 19:06: +
                              +
                              +

                              I would love to see this get updated to Python 3.6.5. I'm currently using that for my programs, and even after looking at the changelogs between Python versions, I'm not sure what I'd lose by moving down to 3.5.3 so that I could use PyPy.

                              I'm also curious about things like IdleX and Anaconda. Would those be, hypothetically speaking, mergeable with PyPy?

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/04/pypy27-and-pypy35-v60-dual-release-7416552143474607997.html b/posts/2018/04/pypy27-and-pypy35-v60-dual-release-7416552143474607997.html new file mode 100644 index 000000000..388ba9a81 --- /dev/null +++ b/posts/2018/04/pypy27-and-pypy35-v60-dual-release-7416552143474607997.html @@ -0,0 +1,388 @@ + + + + + +PyPy2.7 and PyPy3.5 v6.0 dual release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy2.7 and PyPy3.5 v6.0 dual release

                              + + + +
                              +
                              +The PyPy team is proud to release both PyPy2.7 v6.0 (an interpreter supporting +Python 2.7 syntax), and a PyPy3.5 v6.0 (an interpreter supporting Python +3.5 syntax). The two releases are both based on much the same codebase, thus +the dual release.
                              +This release is a feature release following our previous 5.10 incremental +release in late December 2017. Our C-API compatibility layer cpyext is +now much faster (see the blog post) as well as more complete. We have made +many other improvements in speed and CPython compatibility. Since the changes +affect the included python development header files, all c-extension modules must +be recompiled for this version.
                              +Until we can work with downstream providers to distribute builds with PyPy, we +have made wheels available for some common packages. You may also +compile them yourself using pip install --no-build-isolation <package>; the +no-build-isolation option is currently needed for pip v10.
                              +First-time python users are often stumped by silly typos and omissions when +getting started writing code. We have improved our parser to emit more friendly +syntax errors, making PyPy not only faster but more friendly.
                              +The GC now has hooks to gain more insights into its performance.
                              +The default Matplotlib TkAgg backend now works with PyPy, as do pygame and pygobject.
                              +We updated the cffi module included in PyPy to version 1.11.5, and the +cppyy backend to 0.6.0. Please use these to wrap your C and C++ code, +respectively, for a JIT friendly experience.
                              +As always, this release is 100% compatible with the previous one and fixes +several issues and bugs raised by the growing community of PyPy users. +We strongly recommend updating.
                              +The Windows PyPy3.5 release is still considered beta-quality. There are open +issues with unicode handling especially around system calls and c-extensions.
                              +The utf8 branch that changes the internal representation of unicode to utf8 did not +make it into the release, so there is still more goodness coming. We also +began working on a Python3.6 implementation; help is welcome.
                              +You can download the v6.0 releases here:
                              + +
                              +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.
                              +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython’s JIT even better.
                              +

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7 and CPython 3.5. It’s fast (PyPy and CPython 2.7.x performance comparison) +due to its integrated tracing JIT compiler.
                              +We also welcome developers of other dynamic languages to see what RPython +can do for them.
                              +The PyPy release supports:
                              +
                              +
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • newer ARM hardware (ARMv6 or ARMv7, with VFPv3) running Linux,
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +
                              +
                              +

                              +What else is new?

                              +
                              +PyPy 5.10 was released in Dec, 2017.
                              +
                              +There are many incremental improvements to RPython and PyPy, the complete listing is here.
                              +
                              +Please update, and continue to help us make PyPy better.

                              +Cheers, The PyPy team
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2018-04-27 10:51: +
                              +
                              +

                              Good news! Gratz PyPy Dev Core people!

                              +
                              +
                              +
                              +
                              + + Gaëtan de Menten wrote on 2018-05-02 10:13: +
                              +
                              +

                              Congratulations to the team! This is getting more interesting with each release!

                              FWIW (not much, I know), I personally need two more things to start using pypy at work:
                              * Windows 64bit support
                              * pypy-specific conda packages for a few popular third-party packages (numpy, pandas, pytables, xlwings, ...)

                              If you would do a funding campaign specifically for either of those, I would donate, as I guess many people would.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/06/repeating-matrix-multiplication-8641748742577945875.html b/posts/2018/06/repeating-matrix-multiplication-8641748742577945875.html new file mode 100644 index 000000000..a9f4d5619 --- /dev/null +++ b/posts/2018/06/repeating-matrix-multiplication-8641748742577945875.html @@ -0,0 +1,358 @@ + + + + + +Repeating a Matrix Multiplication Benchmark | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Repeating a Matrix Multiplication Benchmark

                              + + + +
                              +

                              I watched Hennessy & Patterson's Turing award lecture recently:

                              + + + +

                              In it, there's a slide comparing the performance of various matrix +multiplication implementations, using Python (presumably CPython) as a baseline +and comparing that against various C implementations (I couldn't find the +linked paper yet):

                              + + + +

                              I expected the baseline speedup of switching from CPython to C to be +higher and I also wanted to know what performance PyPy gets, so I did my own +benchmarks. This is a problem that Python is completely unsuited for, so it +should give very exaggerated results.

                              +

                              The usual disclaimers apply: All benchmarks are lies, benchmarking of +synthetic workloads even more so. My implementation is really naive (though I +did optimize it a little bit to help CPython), don't use any +of this code +for anything real. The benchmarks ran on my rather old Intel i5-3230M laptop +under Ubuntu 17.10.

                              +

                              With that said, my results were as follows:

                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              Implementationtimespeedup over CPythonspeedup over PyPy
                              CPython512.588 ± 2.362 s1 ×
                              PyPy8.167 ± 0.007 s62.761 ± 0.295 ×1 ×
                              'naive' C2.164 ± 0.025 s236.817 ± 2.918 ×3.773 ± 0.044 ×
                              NumPy0.171 ± 0.002 s2992.286 ± 42.308 ×47.678 ± 0.634 ×
                              +

                              This is running 1500x1500 matrix multiplications with (the same) random matrices. Every +implementation is run 50 times in a fresh process. The results are averaged, +the errors are bootstrapped 99% confidence intervals.

                              +

                              So indeed the speedup that I got from switching from CPython to C is quite a bit higher than +47x! PyPy is much better than CPython, but of course can't really compete +against GCC. And then the real professionals (numpy/OpenBLAS) are in a whole +'nother league. The speedup of the AVX numbers in the slide above is even +higher than my NumPy numbers, which I assume is the result of my old CPU with +two cores, vs. the 18 core CPU with AVX support. +Lesson confirmed: leave matrix multiplication to people who +actually know what they are doing.

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.html b/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.html new file mode 100644 index 000000000..a1acd94cb --- /dev/null +++ b/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.html @@ -0,0 +1,798 @@ + + + + + +Inside cpyext: Why emulating CPython C API is so Hard | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Inside cpyext: Why emulating CPython C API is so Hard

                              + + + +
                              +
                              +cpyext is PyPy's subsystem which provides a compatibility +layer to compile and run CPython C extensions inside PyPy. Often people ask +why a particular C extension doesn't work or is very slow on PyPy. +Usually it is hard to answer without going into technical details. The goal of +this blog post is to explain some of these technical details, so that we can +simply link here instead of explaining again and again :).
                              +From a 10,000-foot view, cpyext is PyPy's version of "Python.h". Every time +you compile an extension which uses that header file, you are using cpyext. +This includes extensions explicitly written in C (such as numpy) and +extensions which are generated from other compilers/preprocessors +(e.g. Cython).
                              +At the time of writing, the current status is that most C extensions "just +work". Generally speaking, you can simply pip install them, +provided they use the public, official C API instead of poking at private +implementation details. However, the performance of cpyext is generally +poor. A Python program which makes heavy use of cpyext extensions +is likely to be slower on PyPy than on CPython.
                              +Note: in this blog post we are talking about Python 2.7 because it is still +the default version of PyPy: however most of the implementation of cpyext is +shared with PyPy3, so everything applies to that as well.
                              +

                              +C API Overview

                              +In CPython, which is written in C, Python objects are represented as PyObject*, +i.e. (mostly) opaque pointers to some common "base struct".
                              +CPython uses a very simple memory management scheme: when you create an +object, you allocate a block of memory of the appropriate size on the heap. +Depending on the details, you might end up calling different allocators, but +for the sake of simplicity, you can think that this ends up being a call to +malloc(). The resulting block of memory is initialized and cast to +PyObject*: this address never changes during the object lifetime, and the +C code can freely pass it around, store it inside containers, retrieve it +later, etc.
                              +Memory is managed using reference counting. When you create a new reference to +an object, or you discard a reference you own, you have to increment or +decrement the reference counter accordingly. When the reference counter goes to +0, it means that the object is no longer used and can safely be +destroyed. Again, we can simplify and say that this results in a call to +free(), which finally releases the memory which was allocated by malloc().
                              +Generally speaking, the only way to operate on a PyObject* is to call the +appropriate API functions. For example, to convert a given PyObject* to a C +integer, you can use PyInt_AsLong(); to add two objects together, you can +call PyNumber_Add().
                              +Internally, PyPy uses a similar approach. All Python objects are subclasses of +the RPython W_Root class, and they are operated by calling methods on the +space singleton, which represents the interpreter.
                              +At first, it looks very easy to write a compatibility layer: just make +PyObject* an alias for W_Root, and write simple RPython functions +(which will be translated to C by the RPython compiler) which call the +space accordingly:
                              def PyInt_AsLong(space, o):
                              +    return space.int_w(o)
                              +
                              +def PyNumber_Add(space, o1, o2):
                              +    return space.add(o1, o2)
                              +
                              +Actually, the code above is not too far from the real +implementation. However, there are tons of gory details which make it much +harder than it looks, and much slower unless you pay a lot of attention +to performance.
                              +
                              +

                              +The PyPy GC

                              +To understand some of cpyext's challenges, you need to have at least a rough +idea of how the PyPy GC works.
                              +Contrary to popular belief, the "Garbage Collector" is not only about +collecting garbage: instead, it is generally responsible for all memory +management, including allocation and deallocation.
                              +Whereas CPython uses a combination of malloc/free/refcounting to manage +memory, the PyPy GC uses a completely different approach. It is designed +assuming that a dynamic language like Python behaves the following way:
                              +
                                +
                              • You create, either directly or indirectly, lots of objects.
                              • +
                              • Most of these objects are temporary and very short-lived. Think e.g. of +doing a + b + c: you need to allocate an object to hold the temporary +result of a + b, then it dies very quickly because you no longer need it +when you do the final + c part.
                              • +
                              • Only a small fraction of the objects survive and stay around for a while.
                              • +
                              +
                              +So, the strategy is: make allocation as fast as possible; make deallocation of +short-lived objects as fast as possible; find a way to handle the remaining +small set of objects which actually survive long enough to be important.
                              +This is done using a Generational GC: the basic idea is the following:
                              +
                                +
                              1. We have a nursery, where we allocate "young objects" very quickly.
                              2. +
                              3. When the nursery is full, we start what we call a "minor collection".
                                  +
                                • We do a quick scan to determine the small set of objects which survived so +far
                                • +
                                • We move these objects out of the nursery, and we place them in the +area of memory which contains the "old objects". Since the address of the +objects changes, we fix all the references to them accordingly.
                                • +
                                +
                              4. +
                              +
                                +
                              1. Now the nursery contains only objects which "died young". We can +discard all of them very quickly, reset the nursery, and use the same area +of memory to allocate new objects from now on.
                              2. +
                              +
                              +In practice, this scheme works very well and it is one of the reasons why PyPy +is much faster than CPython. However, careful readers have surely noticed +that this is a problem for cpyext. On one hand, we have PyPy objects which +can potentially move and change their underlying memory address; on the other +hand, we need a way to represent them as fixed-address PyObject* when we +pass them to C extensions. We surely need a way to handle that.
                              +
                              +

                              +PyObject* in PyPy

                              +Another challenge is that sometimes, PyObject* structs are not completely +opaque: there are parts of the public API which expose to the user specific +fields of some concrete C struct. For example, the definition of PyTypeObject +exposes many of the tp_* slots to the user. +Since the low-level layout of PyPy W_Root objects is completely different +than the one used by CPython, we cannot simply pass RPython objects to C; we +need a way to handle the difference.
                              +So, we have two issues so far: objects can move, and incompatible +low-level layouts. cpyext solves both by decoupling the RPython and the C +representations. We have two "views" of the same entity, depending on whether +we are in the PyPy world (the movable W_Root subclass) or in the C world +(the non-movable PyObject*).
                              PyObject* are created lazily, only when they are actually needed. The +vast majority of PyPy objects are never passed to any C extension, so we don't +pay any penalty in that case. However, the first time we pass a W_Root to +C, we allocate and initialize its PyObject* counterpart.
                              +The same idea applies also to objects which are created in C, e.g. by calling +PyObject_New(). At first, only the PyObject* exists and it is +exclusively managed by reference counting. As soon as we pass it to the PyPy +world (e.g. as a return value of a function call), we create its W_Root +counterpart, which is managed by the GC as usual.
                              +Here we start to see why calling cpyext modules is more costly in PyPy than in +CPython. We need to pay some penalty for all the conversions between +W_Root and PyObject*.
                              +Moreover, the first time we pass a W_Root to C we also need to allocate +the memory for the PyObject* using a slowish "CPython-style" memory +allocator. In practice, for all the objects which are passed to C we pay more +or less the same costs as CPython, thus effectively "undoing" the speedup +guaranteed by PyPy's Generational GC under normal circumstances.
                              + +
                              +

                              +Crossing the border between RPython and C

                              +There are two other things we need to care about whenever we cross the border +between RPython and C, and vice-versa: exception handling and the GIL.
                              +In the C API, exceptions are raised by calling PyErr_SetString() (or one of +many other functions which have a similar effect), which basically works by +creating an exception value and storing it in some global variable. The +function then signals that an exception has occurred by returning an error value, +usually NULL.
                              +On the other hand, in the PyPy interpreter, exceptions are propagated by raising the +RPython-level OperationError exception, which wraps the actual app-level +exception values. To harmonize the two worlds, whenever we return from C to +RPython, we need to check whether a C API exception was raised and if so turn it +into an OperationError.
                              +We won't dig into details of how the GIL is handled in cpyext. +For the purpose of this post, it is enough to know that whenever we enter +C land, we store the current thread id into a global variable which is +accessible also from C; conversely, whenever we go back from C to RPython, we +restore this value to 0.
                              +Similarly, we need to do the inverse operations whenever you need to cross the +border between C and RPython, e.g. by calling a Python callback from C code.
                              +All this complexity is automatically handled by the RPython function +generic_cpy_call. If you look at the code you see that it takes care of 4 +things:
                              +
                                +
                              1. Handling the GIL as explained above.
                              2. +
                              3. Handling exceptions, if they are raised.
                              4. +
                              5. Converting arguments from W_Root to PyObject*.
                              6. +
                              7. Converting the return value from PyObject* to W_Root.
                              8. +
                              +
                              +So, we can see that calling C from RPython introduces some overhead. +Can we measure it?
                              +Assuming that the conversion between W_Root and PyObject* has a +reasonable cost (as explained by the previous section), the overhead +introduced by a single border-cross is still acceptable, especially if the +callee is doing some non-negligible amount of work.
                              +However this is not always the case. There are basically three problems that +make (or used to make) cpyext super slow:
                              +
                                +
                              1. Paying the border-crossing cost for trivial operations which are called +very often, such as Py_INCREF.
                              2. +
                              3. Crossing the border back and forth many times, even if it's not strictly +needed.
                              4. +
                              5. Paying an excessive cost for argument and return value conversions.
                              6. +
                              +
                              +The next sections explain in more detail each of these problems.
                              +
                              +

                              +Avoiding unnecessary roundtrips

                              +Prior to the 2017 Cape Town Sprint, cpyext was horribly slow, and we were +well aware of it: the main reason was that we never really paid too much +attention to performance. As explained in the blog post, emulating all the +CPython quirks is basically a nightmare, so better to concentrate on +correctness first.
                              +However, we didn't really know why it was so slow. We had theories and +assumptions, usually pointing at the cost of conversions between W_Root +and PyObject*, but we never actually measured it.
                              +So, we decided to write a set of cpyext microbenchmarks to measure the +performance of various operations. The result was somewhat surprising: the +theory suggests that when you do a cpyext C call, you should pay the +border-crossing costs only once, but what the profiler told us was that we +were paying the cost of generic_cpy_call several times more than what we expected.
                              +After a bit of investigation, we discovered this was ultimately caused by our +"correctness-first" approach. For simplicity of development and testing, when +we started cpyext we wrote everything in RPython: thus, every single API call +made from C (like the omnipresent PyArg_ParseTuple(), PyInt_AsLong(), etc.) +had to cross back the C-to-RPython border. This was especially daunting for +very simple and frequent operations like Py_INCREF and Py_DECREF, +which CPython implements as a single assembly instruction!
                              +Another source of slow down was the implementation of PyTypeObject slots. +At the C level, these are function pointers which the interpreter calls to do +certain operations, e.g. tp_new to allocate a new instance of that type.
                              +As usual, we have some magic to implement slots in RPython; in particular, +_make_wrapper does the opposite of generic_cpy_call: it takes a +RPython function and wraps it into a C function which can be safely called +from C, handling the GIL, exceptions and argument conversions automatically.
                              +This was very handy during the development of cpyext, but it might result in +some bad nonsense; consider what happens when you call the following C +function:
                              static PyObject* foo(PyObject* self, PyObject* args)
                              +{
                              +    PyObject* result = PyInt_FromLong(1234);
                              +    return result;
                              +}
                              +
                              +
                                +
                              1. you are in RPython and do a cpyext call to foo: RPython-to-C;
                              2. +
                              3. +foo calls PyInt_FromLong(1234), which is implemented in RPython: +C-to-RPython;
                              4. +
                              5. the implementation of PyInt_FromLong indirectly calls +PyIntType.tp_new, which is a C function pointer: RPython-to-C;
                              6. +
                              7. however, tp_new is just a wrapper around an RPython function, created +by _make_wrapper: C-to-RPython;
                              8. +
                              9. finally, we create our RPython W_IntObject(1234); at some point +during the RPython-to-C crossing, its PyObject* equivalent is +created;
                              10. +
                              11. after many layers of wrappers, we are again in foo: after we do +return result, during the C-to-RPython step we convert it from +PyObject* to W_IntObject(1234).
                              12. +
                              +Phew! After we realized this, it was not so surprising that cpyext was very +slow :). And this was a simplified example, since we are not passing a +PyObject* to the API call. When we do, we need to convert it back and +forth at every step. Actually, I am not even sure that what I described was +the exact sequence of steps which used to happen, but you get the general +idea.
                              +The solution is simple: rewrite as much as we can in C instead of RPython, +to avoid unnecessary roundtrips. This was the topic of most of the Cape Town +sprint and resulted in the cpyext-avoid-roundtrip branch, which was +eventually merged.
                              +Of course, it is not possible to move everything to C: there are still +operations which need to be implemented in RPython. For example, think of +PyList_Append: the logic to append an item to a list is complex and +involves list strategies, so we cannot replicate it in C. However, we +discovered that a large subset of the C API can benefit from this.
                              +Moreover, the C API is huge. While we invented this new way of writing +cpyext code, we still need to +convert many of the functions to the new paradigm. Sometimes the rewrite is +not automatic +or straightforward. cpyext is a delicate piece of software, so it happens often +that we make a mistake and end up staring at a segfault in gdb.
                              +However, the most important takeaway is that the performance improvements we got +from this optimization are impressive, as we will detail later.
                              +
                              +

                              +Conversion costs

                              +The other potential big source of slowdown is the conversion of arguments +between W_Root and PyObject*.
                              +As explained earlier, the first time you pass a W_Root to C, you need to +allocate its PyObject* counterpart. Suppose you have a foo function +defined in C, which takes a single int argument:
                              for i in range(N):
                              +    foo(i)
                              +
                              +To run this code, you need to create a different PyObject* for each value +of i: if implemented naively, it means calling N times malloc() +and free(), which kills performance.
                              +CPython has the very same problem, which is solved by using a free list to +allocate ints. So, what we did was to simply steal the code from CPython +and do the exact same thing. This was also done in the +cpyext-avoid-roundtrip branch, and the benchmarks show that it worked +perfectly.
                              +Every type which is converted often to PyObject* must have a very fast +allocator. At the moment of writing, PyPy uses free lists only for ints and +tuples: one of the next steps on our TODO list is certainly to use this +technique with more types, like float.
                              +Conversely, we also need to optimize the conversion from PyObject* to +W_Root: this happens when an object is originally allocated in C and +returned to Python. Consider for example the following code:
                              import numpy as np
                              +myarray = np.random.random(N)
                              +for i in range(len(myarray)):
                              +    myarray[i]
                              +
                              +At every iteration, we get an item out of the array: the return type is an +instance of numpy.float64 (a numpy scalar), i.e. a PyObject*: this is +something which is implemented by numpy entirely in C, so completely +opaque to cpyext. We don't have any control on how it is allocated, +managed, etc., and we can assume that allocation costs are the same as on +CPython.
                              +As soon as we return these PyObject* to Python, we need to allocate +their W_Root equivalent. If you do it in a small loop like in the example +above, you end up allocating all these W_Root inside the nursery, which is +a good thing since allocation is super fast (see the section above about the +PyPy GC).
                              +However, we also need to keep track of the W_Root to PyObject* link. +Currently, we do this by putting all of them in a dictionary, but it is very +inefficient, especially because most of these objects die young and thus it +is wasted work to do that for them. Currently, this is one of the biggest +unresolved problems in cpyext, and it is what causes the two microbenchmarks +allocate_int and allocate_tuple to be very slow.
                              +We are well aware of the problem, and we have a plan for how to fix it. The +explanation is too technical for the scope of this blog post as it requires a +deep knowledge of the GC internals to be understood, but the details are +here.
                              +
                              +

                              +C API quirks

                              +Finally, there is another source of slowdown which is beyond our control. Some +parts of the CPython C API are badly designed and expose some of the +implementation details of CPython.
                              +The major example is reference counting. The Py_INCREF / Py_DECREF API +is designed in a way that forces other implementations to emulate +refcounting even in the presence of other GC management schemes, as explained +above.
                              +Another example is borrowed references. There are API functions which do +not incref an object before returning it, e.g. PyList_GetItem(). This is +done for performance reasons because we can avoid a whole incref/decref pair, +if the caller needs to handle the returned item only temporarily: the item is +kept alive because it is in the list anyway.
                              +For PyPy, this is a challenge: thanks to list strategies, lists are often +represented in a compact way. For example, a list containing only integers is +stored as a C array of long. How to implement PyList_GetItem? We +cannot simply create a PyObject* on the fly, because the caller will never +decref it and it will result in a memory leak.
                              +The current solution is very inefficient. The first time we do a +PyList_GetItem, we convert the whole list to a list of +PyObject*. This is bad in two ways: the first is that we potentially pay a +lot of unneeded conversion cost in case we will never access the other items +of the list. The second is that by doing that we lose all the performance +benefit granted by the original list strategy, making it slower for the +rest of the pure-python code which will manipulate the list later.
                              PyList_GetItem is an example of a bad API because it assumes that the list +is implemented as an array of PyObject*: after all, in order to return a +borrowed reference, we need a reference to borrow, don't we?
                              +Fortunately, (some) CPython developers are aware of these problems, and there +is an ongoing project to design a better C API which aims to fix exactly +this kind of problem.
                              +Nonetheless, in the meantime we still need to implement the current +half-broken APIs. There is no easy solution for that, and it is likely that +we will always need to pay some performance penalty in order to implement them +correctly.
                              +However, what we could potentially do is to provide alternative functions +which do the same job but are more PyPy friendly: for example, we could think +of implementing PyList_GetItemNonBorrowed or something like that: then, C +extensions could choose to use it (possibly hidden inside some macro and +#ifdef) if they want to be fast on PyPy.
                              +
                              +

                              +Current performance

                              +During the whole blog post we claimed cpyext is slow. How +slow is it, exactly?
                              +We decided to concentrate on microbenchmarks for now. It should be evident +by now there are simply too many issues which can slow down a cpyext +program, and microbenchmarks help us to concentrate on one (or few) at a +time.
                              +The microbenchmarks measure very simple things, like calling functions and +methods with the various calling conventions (no arguments, one argument, +multiple arguments); passing various types as arguments (to measure conversion +costs); allocating objects from C, and so on.
                              +Here are the results from the old PyPy 5.8 relative and normalized to CPython +2.7, the lower the better:

                              + +
                              +
                              + +
                              +
                              + +
                              +
                              +PyPy was horribly slow everywhere, ranging from 2.5x to 10x slower. It is +particularly interesting to compare simple.noargs, which measures the cost +of calling an empty function with no arguments, and simple.onearg(i), +which measures the cost of calling an empty function passing an integer argument: +the latter is ~2x slower than the former, indicating that the conversion cost +of integers is huge.
                              +PyPy 5.8 was the last release before the famous Cape Town sprint, when we +started to look at cpyext performance seriously. Here are the performance data for +PyPy 6.0, the latest release at the time of writing:
                              + +
                              +

                              +The results are amazing! PyPy is now massively faster than before, and for +most benchmarks it is even faster than CPython: yes, you read it correctly: +PyPy is faster than CPython at doing CPython's job, even considering all the +extra work it has to do to emulate the C API. This happens thanks to the JIT, +which produces speedups high enough to counterbalance the slowdown caused by +cpyext.
                              +There are two microbenchmarks which are still slower though: allocate_int +and allocate_tuple, for the reasons explained in the section about +Conversion costs.
                              +
                              +

                              +Next steps

                              +Despite the spectacular results we got so far, cpyext is still slow enough to +kill performance in most real-world code which uses C extensions extensively +(e.g., the omnipresent numpy).
                              +Our current approach is something along these lines:
                              +
                                +
                              1. run a real-world small benchmark which exercises cpyext
                              2. +
                              3. measure and find the major bottleneck
                              4. +
                              5. write a corresponding microbenchmark
                              6. +
                              7. optimize it
                              8. +
                              9. repeat
                              10. +
                              +
                              +On one hand, this is a daunting task because the C API is huge and we need to +tackle functions one by one. On the other hand, not all the functions are +equally important, and it is enough to optimize a relatively small subset to +improve many different use cases.
                              +Where a year ago we announced we have a working answer to run c-extension in +PyPy, we now have a clear picture of what are the performance bottlenecks, and +we have developed some technical solutions to fix them. It is "only" a matter +of tackling them, one by one. It is worth noting that most of the work was +done during two sprints, for a total of 2-3 person-months of work.
                              +We think this work is important for the Python ecosystem. PyPy has established +a baseline for performance in pure python code, providing an answer for the +"Python is slow" detractors. The techniques used to make cpyext performant +will let PyPy become an alternative for people who mix C extensions with +Python, which, it turns out, is just about everyone, in particular those using +the various scientific libraries. Today, many developers are forced to seek +performance by converting code from Python to a lower language. We feel there +is no reason to do this, but in order to prove it we must be able to run both +their python and their C extensions performantly, then we can begin to educate +them how to write JIT-friendly code in the first place.
                              +We envision a future in which you can run arbitrary Python programs on PyPy, +with the JIT speeding up the pure Python parts and the C parts running as fast +as today: the best of both worlds!
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + AlbertMietus wrote on 2018-09-22 08:37: +
                              +
                              +

                              Thanks fo this nice article!

                              —Albert

                              +
                              +
                              +
                              +
                              + + Pixy Misa wrote on 2018-09-22 09:58: +
                              +
                              +

                              Great work guys! I should benchmark some of my apps again - a couple of things that were dependent on C extensions didn't show much speedup previously.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2018-09-22 15:55: +
                              +
                              +

                              Great work man !

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html b/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html new file mode 100644 index 000000000..f7dfdb8da --- /dev/null +++ b/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html @@ -0,0 +1,1077 @@ + + + + + +The First 15 Years of PyPy — a Personal Retrospective | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              The First 15 Years of PyPy — a Personal Retrospective

                              + + + +
                              +

                              A few weeks ago I (=Carl Friedrich Bolz-Tereick) gave a keynote at ICOOOLPS in +Amsterdam with the above title. I was very happy to have been given that +opportunity, since a number of our papers have been published at ICOOOLPS, +including the very first one I published when I'd just started my PhD. I decided +to turn the talk manuscript into a (longish) blog post, to make it available to a wider audience. +Note that this blog post describes my personal recollections and research, it is +thus necessarily incomplete and coloured by my own experiences.

                              +

                              PyPy has turned 15 years old this year, so I decided that that's a good reason +to dig into and talk about the history of the project so far. I'm going to do +that using the lens of how performance developed over time, which is from +something like 2000x slower than CPython, to roughly 7x faster. In this post +I am going to present the history of the project, and also talk about some +lessons that we learned.

                              +

                              The post does not make too many assumptions about any prior knowledge of what +PyPy is, so if this is your first interaction with it, welcome! I have tried to +sprinkle links to earlier blog posts and papers into the writing, in case you +want to dive deeper into some of the topics.

                              +

                              As a disclaimer, in this post I am going to mostly focus on ideas, and not +explain who had or implemented them. A huge amount of people contributed to the +design, the implementation, the funding and the organization of PyPy over the +years, and it would be impossible to do them all justice.

                              + +
                              +

                              2003: Starting the Project

                              +

                              On the technical level PyPy is a Python interpreter written in Python, which is +where the name comes from. It also has an automatically generated JIT compiler, +but I'm going to introduce that gradually over the rest of the blog post, so +let's not worry about it too much yet. On the social level PyPy is an +interesting mixture of an open source project, that sometimes had research done +in it.

                              +

                              The project got started in late 2002 and early 2003. To set the stage, at that +point Python was a significantly less popular language than it is today. Python +2.2 was the version at the time, Python didn't even have a bool type yet.

                              +

                              In fall 2002 the PyPy project was started by a number of Python programmers on a +mailing list who said +something like (I am exaggerating somewhat) "Python is the greatest most +wonderful most perfect language ever, we should use it for absolutely +everything. Well, what aren't we using it for? The Python virtual machine itself +is written in C, that's bad. Let's start a project to fix that."

                              +

                              Originally that project was called "minimal python", or "ptn", later gradually +renamed to PyPy. Here's the mailing list post to announce the project more +formally:

                              +
                              Minimal Python Discussion, Coding and Sprint
                              +--------------------------------------------
                              +
                              +We announce a mailinglist dedicated to developing
                              +a "Minimal Python" version.  Minimal means that
                              +we want to have a very small C-core and as much
                              +as possible (re)implemented in python itself.  This
                              +includes (parts of) the VM-Code.
                              +

                              Why would that kind of project be useful? Originally it wasn't necessarily meant +to be useful as a real implementation at all, it was more meant as a kind of +executable explanation of how Python works, free of the low level details of +CPython. But pretty soon there were then also plans for how the virtual machine +(VM) could be bootstrapped to be runnable without an existing Python +implementation, but I'll get to that further down.

                              +
                              + + +
                              +

                              2003: Implementing the Interpreter

                              +

                              In early 2003 a group of Python people met in Hildesheim (Germany) for the first +of many week long development sprints, organized by Holger Krekel. During that +week a group of people showed up and started working on the core interpreter. +In May 2003 a second sprint was organized by Laura Creighton and Jacob Halén in +Gothenburg (Sweden). And already at that sprint enough of the Python bytecodes +and data structures were implemented to make it possible to run a program that +computed how much money everybody had to pay for the food bills of the week. And +everybody who's tried that for a large group of people knows that that’s an +amazingly complex mathematical problem.

                              +

                              In the next two years, the project continued as an open source project with +various contributors working on it in their free time, and meeting for the +occasional sprint. In that time, the rest of the core interpreter and the core +data types were implemented.

                              +

                              There's not going to be any other code in this post, but to give a bit of a +flavor of what the Python interpreter at that time looked like, here's the +implementation of the DUP_TOP bytecode after these first sprints. As you can +see, it's in Python, obviously, and it has high level constructs such as method +calls to do the stack manipulations:

                              +
                              def DUP_TOP(f):
                              +    w_1 = f.valuestack.top()
                              +    f.valuestack.push(w_1)
                              +

                              Here's the early code for integer addition:

                              +
                              def int_int_add(space, w_int1, w_int2):
                              +    x = w_int1.intval
                              +    y = w_int2.intval
                              +    try:
                              +        z = x + y
                              +    except OverflowError:
                              +        raise FailedToImplement(space.w_OverflowError,
                              +                                space.wrap("integer addition"))
                              +    return W_IntObject(space, z)
                              +

                              (the current implementations look slightly but not fundamentally different.)

                              +
                              + + +
                              +

                              Early organizational ideas

                              +

                              Some of the early organizational ideas of the project were as follows. Since the +project was started on a sprint and people really liked that style of working +PyPy continued to be developed on various subsequent sprints.

                              +

                              From early on there was a very heavy emphasis on testing. All the parts of the +interpreter that were implemented had a very careful set of unit tests to make +sure that they worked correctly. From early on, there was a continuous +integration infrastructure, which grew over time (nowadays it is very natural +for people to have automated tests, and the concept of green/red builds: but +embracing this workflow in the early 2000s was not really mainstream yet, and +it is probably one of the reasons behind PyPy's success).

                              +

                              At the sprints there was also an emphasis on doing pair programming to make +sure that everybody understood the codebase +equally. There was also a heavy emphasis on writing good code and on regularly +doing refactorings to make sure that the codebase remained nice, clean and +understandable. Those ideas followed from the early thoughts that PyPy would be +a sort of readable explanation of the language.

                              +

                              There was also a pretty fundamental design decision made at the time. That was +that the project should stay out of language design completely. Instead it would +follow CPython's lead and behave exactly like that implementation in all cases. +The project therefore committed to being almost quirk-to-quirk compatible and to +implement even the more obscure (and partially unnecessary) corner cases of +CPython.

                              +

                              All of these principles continue pretty much still today (There are a few places +where we had to deviate from being completely compatible, they are documented +here).

                              +
                              + + +
                              +

                              2004-2007: EU-Funding

                              +

                              While all this coding was going on it became clear pretty soon that the goals +that various participants had for the project would be very hard to achieve with +just open source volunteers working on the project in their spare time. +Particularly also the sprints became expensive given that those were just +volunteers doing this as a kind of weird hobby. Therefore a couple of people of +the project got together to apply for an EU grant in the framework programme 6 +to solve these money problems. In mid-2004 that application proved to be +successful. And so the project got a grant of 1.3 million Euro for +two years to be able to employ some of the core developers and to make it +possible for them to work on the project full time. The EU grant went to seven +small-to-medium companies and Uni Düsseldorf. The budget also contained money to +fund sprints, both for the employed core devs as well as other open source +contributors.

                              + +

                              The EU project started in December 2004 and that was a fairly heavy change in +pace for the project. Suddenly a lot of people were working full time on it, and +the pace and the pressure picked up quite a lot. Originally it had been a +leisurely project people worked on for fun. But afterwards people discovered +that doing this kind of work full time becomes slightly less fun, particularly +also if you have to fulfill the ambitious technical goals that the EU proposal +contained. And the proposal indeed contained a bit of everything to increase its +chance of acceptance, such as aspect oriented programming, semantic web, logic +programming, constraint programming, and so on. Unfortunately it +turned out that those things then have to be implemented, which can be called +the first thing we learned: if you promise something to the EU, you'll have to +actually go do it (After the funding ended, a lot of these features were +actually removed from the project again, at a cleanup sprint).

                              +
                              + + +
                              +

                              2005: Bootstrapping PyPy

                              +

                              So what were the actually useful things done as part of the EU project?

                              +

                              One of the most important goals that the EU project was meant to solve was the +question of how to turn PyPy into an actually useful VM for Python. The +bootstrapping plans were taken quite directly from Squeak, which is a Smalltalk +VM written in a subset of Smalltalk called Slang, which can then be bootstrapped +to C code. The plan for PyPy was to do something similar, to define a restricted +subset of Python called RPython, restricted in such a way that it should be +possible to statically compile RPython programs to C code. Then the Python +interpreter should only use that subset, of course.

                              +

                              The main difference from the Squeak approach is that Slang, the subset of Squeak +used there, is actually quite a low level language. In a way, you could almost +describe it as C with Smalltalk syntax. RPython was really meant to be a +much higher level language, much closer to Python, with full support for single +inheritance classes, and most of Python's built-in data structures.

                              + +
                              +

                              (BTW, you don’t have to understand any of the illustrations in this blog post, +they are taken from talks and project reports we did over the years so they are +of archaeological interest only and I don’t understand most of them myself.)

                              +

                              From 2005 on, work on the RPython type inference engine and C backend started in +earnest, which was sort of co-developed with the RPython language definition and +the PyPy Python interpreter. This is also roughly the time that I joined the +project as a volunteer.

                              +

                              And at the second sprint I went to, in July 2005, two and a half years after the +project got started, we managed to bootstrap the PyPy interpreter to C for the +first time. When we ran the compiled program, it of course immediately +segfaulted. The reason for that was that the C backend had turned characters +into signed chars in C, while the rest of the infrastructure assumed that they +were unsigned chars. After we fixed that, the second attempt worked and we +managed to run an incredibly complex program, something like 6 * 7. That +first bootstrapped version was really really slow, a couple of hundred times +slower than CPython.

                              + +
                              +

                              The bootstrapping process of RPython has a number of nice benefits, a big one +being that a number of the properties of the generated virtual machine don't +have to be expressed in the interpreter. The biggest example of this is garbage +collection. RPython is a garbage collected language, and the interpreter does +not have to care much about GC in most cases. When the C source code is +generated, a GC is automatically inserted. This is a source of great +flexibility. Over time we experimented with a number of different GC +approaches, from reference counting to Boehm to our current incremental +generational collector. As an aside, for a long time we were also working on +other backends to the RPython language and hoped to be able to target Java and +.NET as well. Eventually we abandoned this strand of work, however.

                              +
                              + + +
                              +

                              RPython's Modularity Problems

                              +

                              Now we come to the first thing I would say we learned in the project, which is +that the quality of tools we thought of as internal things still matters a lot. +One of the biggest technical mistakes we've made in the project was that we +designed RPython without any kind of story for modularity. There is no concept +of modules in the language or any other way to break up programs into smaller +components. We always thought that it would be ok for RPython to be a little bit +crappy. It was meant to be this sort of internal language with not too many +external users. And of course that turned out to be completely wrong later.

                              +

                              That lack of modularity led to various problems that persist until today. The +biggest one is that there is no separate compilation for RPython programs at +all! You always need to compile all the parts of your VM together, which leads +to infamously bad compilation times.

                              +

                              Also by not considering the modularity question we were never forced to fix +some internal structuring issues of the RPython compiler itself. +Various layers of the compiler keep very badly defined and porous interfaces between +them. This was made possible by being able to work with all the program information in one heap, +making the compiler less approachable and maintainable than it maybe could be.

                              +

                              Of course this mistake just got more and more costly to fix over time, +and so it means that so far nobody has actually done it. +Not thinking more carefully about RPython's design, particularly its +modularity story, is in my opinion the biggest technical mistake the project +made.

                              +
                              + + +
                              +

                              2006: The Meta-JIT

                              +

                              After successfully bootstrapping the VM we did some fairly straightforward +optimizations on the interpreter and the C backend and managed to reduce the +slowdown versus CPython to something like 2-5 times slower. That's great! But of +course not actually useful in practice. So where do we go from here?

                              +

                              One of the not so secret goals of Armin Rigo, one of the PyPy founders, was to +use PyPy together with some advanced partial evaluation magic sauce to +somehow automatically generate a JIT compiler from the interpreter. The goal was +something like, "you write your interpreter in RPython, add a few annotations +and then we give you a JIT for free for the language that that interpreter +implements."

                              +

                              Where did the wish for that approach come from, why not just write a JIT for +Python manually in the first place? Armin had actually done just that before he +co-founded PyPy, in a project called Psyco. Psyco was an extension module for +CPython that contained a method-based JIT compiler for Python code. And Psyco +proved to be an amazingly frustrating compiler to write. There were two main +reasons for that. The first reason was that Python is actually quite a complex +language underneath its apparent simplicity. The second reason for the +frustration was that Python was and is very much an alive language, that gains +new features in the language core in every version. So every time a new Python +version came out, Armin had to do fundamental changes and rewrites to Psyco, and +he was getting pretty frustrated with it. So he hoped that that effort could be +diminished by not writing the JIT for PyPy by hand at all. Instead, the goal was +to generate a method-based JIT from the interpreter automatically. By taking the +interpreter, and applying a kind of advanced transformation to it, that would +turn it into a method-based JIT. And all that would still be translated into a +C-based VM, of course.

                              +
                              +

                              Slide from Psyco presentation at EuroPython 2002

                              +
                              + + +
                              +

                              The First JIT Generator

                              +

                              From early 2006 on until the end of the EU project a lot of work went into +writing such a JIT generator. The idea was to base it on runtime partial +evaluation. Partial evaluation is an old idea in computer science. It's supposed +to be a way to automatically turn interpreters for a language into a compiler +for that same language. Since PyPy was trying to generate a JIT compiler, which +is in any case necessary to get good performance for a dynamic language like +Python, the partial evaluation was going to happen at runtime.

                              +

                              There are various ways to look at partial evaluation, but if you've never heard +of it before, a simple way to view it is that it will compile a Python function +by gluing together the implementations of the bytecodes of that function and +optimizing the result.

                              +

                              The main new ideas of PyPy's partial-evaluation based JIT generator as opposed +to earlier partial-evaluation approaches are the ideas of "promote" and the idea +of "virtuals". Both of these techniques had already been present (in a slightly +less general form) in Psyco, and the goal was to keep using them in PyPy. Both +of these techniques also still remain in use today in PyPy. I'm +going on a slight technical diversion now, to give a high level explanation of +what those ideas are for.

                              +
                              +
                              + + +
                              +

                              Promote

                              +

                              One important ingredient of any JIT compiler is the ability to do runtime +feedback. Runtime feedback is most commonly used to know something about which +concrete types are used by a program in practice. Promote is basically a way to +easily introduce runtime feedback into the JIT produced by the JIT generator. +It's an annotation the implementer of a language can use to express their wish +that specialization should happen at this point. This mechanism can be used to +express all kinds of runtime feedback, moving values from the interpreter +into the compiler, whether they be types or other things.

                              +
                              + + +
                              +

                              Virtuals

                              +

                              Virtuals are a very aggressive form of partial escape analysis. A dynamic +language often puts a lot of pressure on the garbage collector, since most +primitive types (like integers, floats and strings) are boxed in the heap, and +new boxes are allocated all the time.

                              +

                              With the help of virtuals a very significant portion of all allocations in the +generated machine code can be completely removed. Even if they can't be removed, +often the allocation can be delayed or moved into an error path, or even +into a deoptimization path, and thus disappear from the generated machine code +completely.

                              +

                              This optimization really is the super-power of PyPy's optimizer, since it +doesn't work only for primitive boxes but for any kind of object allocated on +the heap with a predictable lifetime.

                              +

                              As an aside, while this kind of partial escape analysis is sort of new for +object-oriented languages, it has actually existed in Prolog-based partial +evaluation systems since the 80s, because it's just extremely natural there.

                              +
                              + + +
                              +

                              JIT Status 2007

                              +

                              So, back to our history. We're now in 2007, at the end of the EU project (you +can find the EU-reports we wrote during the projects here). The EU project +successfully finished, we survived the final review with the EU. So, what's the +2007 status of the JIT generator? It works kind of, it can be applied to PyPy. It +produces a VM with a JIT that will turn Python code into machine code at runtime +and run it. However, that machine code is not particularly fast. Also, it tends +to generate many megabytes of machine code even for small Python programs. While +it's always faster than PyPy without JIT, it's only sometimes faster than +CPython, and most of the time Psyco still beats it. On the one hand, this is +still an amazing achievement! It's arguably the biggest application of partial +evaluation at this point in time! On the other hand, it was still quite +disappointing in practice, particularly since some of us had believed at the +time that it should have been possible to reach and then surpass the speed of +Psyco with this approach.

                              +
                              + + +
                              +

                              2007: RSqueak and other languages

                              +

                              After the EU project ended we did all kinds of things. Like sleep for a month +for example, and have the cleanup sprint that I already mentioned. We also had a +slightly unusual sprint in Bern, with members of the Software Composition +Group of Oscar Nierstrasz. As I wrote above, PyPy had been heavily influenced +by Squeak Smalltalk, and that group is a heavy user of Squeak, so we wanted to +see how to collaborate with them. At the beginning of the sprint, we decided +together that the goal of that week should be to try to write a Squeak virtual +machine in RPython, and at the end of the week we'd gotten surprisingly far with +that goal. Basically most of the bytecodes and the Smalltalk object system +worked, we had written an image loader and could run some benchmarks (during the +sprint we also regularly updated a blog, the success of which led us to start +the PyPy blog).

                              + +
                              +

                              The development of the Squeak interpreter was very interesting for the project, +because it was the first real step that moved RPython from being an +implementation detail of PyPy to being a more interesting project in its own right. +Basically a language to write interpreters in, with the eventual promise to get +a JIT for that language almost for free. That Squeak implementation is now +called RSqueak ("Research Squeak").

                              +

                              I'll not go into more details about any of the other language implementations in +RPython in this post, but over the years we've had a large variety +of them done by various people and groups, most of them as research vehicles, +but also some as real language implementations. Some very cool research results +came out of these efforts, here's a slightly outdated list of some of them.

                              +

                              The use of RPython for other languages complicated the PyPy narrative a lot, and +in a way we never managed to recover the simplicity of the original project +description "PyPy is Python in Python". Because now it's something like "we have +this somewhat strange language, a subset of Python, that's called RPython, and +it's good to write interpreters in. And if you do that, we'll give you a JIT for +almost free. And also, we used that language to write a Python implementation, +called PyPy.". It just doesn't roll off the tongue as nicely.

                              +
                              + + +
                              +

                              2008-2009: Four More JIT Generators

                              +

                              Back to the JIT. After writing the first JIT generator as part of the EU +project, with somewhat mixed results, we actually wrote several more JIT +generator prototypes with different architectures to try to solve some of the +problems of the first approach. To give an impression of these prototypes, +here’s a list of them.

                              +
                                +
                              • The second JIT generator we started working on in 2008 behaved exactly like +the first one, but had a meta-interpreter based architecture, to make it more +flexible and easier to experiment with. The meta-interpreter was called +the "rainbow interpreter", and in general the JIT is an area where we went +somewhat overboard with borderline silly terminology, with notable +occurrences of "timeshifter", "blackhole interpreter" etc.

                              • +
                              • The third JIT generator was an experiment based on the second one which +changed +compilation strategy. While the previous two had compiled many control flow +paths of the currently compiled function eagerly, that third JIT was sort of +maximally lazy and stopped compilation at every control flow split to avoid +guessing which path would actually be useful later when executing the code. +This was an attempt to reduce the problem of the first JIT generating way too +much machine code. Only later, when execution went down one of the not yet +compiled paths would it continue compiling more code. This gives an effect +similar to that of lazy basic block versioning.

                              • +
                              • The fourth JIT generator was a pretty strange prototype, a runtime partial +evaluator for Prolog, to experiment with various specialization trade-offs. It +had an approach that we gave a not at all humble name, called "perfect +specialization".

                              • +
                              • The fifth JIT generator is the one that we are still using today. Instead of +generating a method-based JIT compiler from our interpreter we switched to +generating a tracing JIT compiler. Tracing JIT compilers were sort of the +latest fashion at the time, at least for a little while.

                              • +
                              +
                              + + +
                              +

                              2009: Meta-Tracing

                              +

                              So, how did that tracing JIT generator work? A tracing JIT generates code by +observing and logging the execution of the running program. This yields a +straight-line trace of operations, which are then optimized and compiled into +machine code. Of course most tracing systems mostly focus on tracing loops.

                              +

                              As we discovered, it's actually quite simple to apply a tracing JIT to a generic +interpreter, by not tracing the execution of the user program directly, but by +instead tracing the execution of the interpreter while it is running the user +program (here's the paper we wrote about this approach).

                              +

                              So that's what we implemented. Of course we kept the two successful parts of the +first JIT, promote and virtuals (both links go to the papers about these +features in the meta-tracing context).

                              +
                              +
                              + + +
                              +

                              Why did we Abandon Partial Evaluation?

                              +

                              So one question I sometimes get asked when telling this story is, why did +we think that tracing would work better than partial evaluation (PE)? One of the +hardest parts of compilers in general and partial evaluation based systems in +particular is the decision when and how much to inline, how much to specialize, +as well as the decision when to split control flow paths. In the PE based JIT +generator we never managed to control that question. Either the JIT would +inline too much, leading to useless compilation of all kinds of unlikely error +cases. Or it wouldn't inline enough, preventing necessary optimizations.

                              +

                              Meta tracing solves this problem with a hammer, it doesn't make particularly +complex inlining decisions at all. It instead decides what to inline by +precisely following what a real execution through the program is doing. Its +inlining decisions are therefore very understandable and predictable, and it +basically only has one heuristic based on whether the called function contains a +loop or not: If the called function contains a loop, we'll never inline it, if +it doesn't we always try to inline it. That predictability is actually what was +the most helpful, since it makes it possible for interpreter authors to +understand why the JIT did what it did and to actually influence its inlining +decisions by changing the annotations in the interpreter source. It turns out +that simple is better than complex.

                              +
                              + + +
                              +

                              2009-2011: The PyJIT Eurostars Project

                              +

                              While we were writing all these JIT prototypes, PyPy had sort of reverted back +to being a volunteer-driven open source project (although some of us, like +Antonio Cuni and I, had started working for universities and other project +members had other sources of funding). But again, while we did the work it +became clear that to get an actually working fast PyPy with generated JIT we +would need actual funding again for the project. So we applied to the EU again, +this time for a much smaller project with less money, in the Eurostars +framework. We got a grant for three participants, merlinux, OpenEnd and Uni +Düsseldorf, on the order of a bit more than half a million euro. That money was +specifically for JIT development and JIT testing infrastructure.

                              +
                              +
                              + + +
                              +

                              Tracing JIT improvements

                              +

                              When writing the grant we had sat together at a sprint and discussed extensively +and decided that we would not switch JIT generation approaches any more. We all +liked the tracing approach well enough and thought it was promising. So instead +we agreed to try in earnest to make the tracing JIT really practical. So in the +Eurostars project we started with implementing sort of fairly standard JIT +compiler optimizations for the meta-tracing JIT, such as:

                              +
                                +
                              • constant folding

                              • +
                              • dead code elimination

                              • +
                              • loop invariant code motion (using LuaJIT's approach)

                              • +
                              • better heap optimizations

                              • +
                              • faster deoptimization (which is actually a bit of a mess in the +meta-approach)

                              • +
                              • and dealing more efficiently with Python frame objects and the +features of Python's debugging facilities

                              • +
                              +
                              + + +
                              +

                              2010: speed.pypy.org

                              +

                              In 2010, to make sure that we wouldn't accidentally introduce speed regressions +while working on the JIT, we implemented infrastructure to build PyPy and run +our benchmarks nightly. Then, the https://speed.pypy.org website was implemented +by Miquel Torres, a volunteer. The website shows the changes in benchmark +performance compared to the previous n days. It didn't sound too important at +first, but this was (and is) a fantastic tool, and an amazing motivator over the +next years, to keep continually improving performance.

                              +
                              +
                              + + +
                              +

                              Continuous Integration

                              +

                              This actually leads me to something else that I'd say we learned, which is that +continuous integration is really awesome, and completely transformative to have +for a project. This is not a particularly surprising insight nowadays in the +open source community, it's easy to set up continuous integration on GitHub +using Travis or some other CI service. But I still see a lot of research +projects that don't have tests, that don't use CI, so I wanted to mention it +anyway. As I mentioned earlier in the post, PyPy has a quite serious testing +culture, with unit tests written for new code, regression tests for all bugs, +and integration tests using the CPython test suite. Those tests are run +nightly on a number of architectures and operating systems.

                              +

                              Having all this kind of careful testing is of course necessary, since PyPy is +really trying to be a Python implementation that people actually use, not just +write papers about. But having all this infrastructure also had other benefits, +for example it allows us to trust newcomers to the project very quickly. +Basically after your first patch gets accepted, you immediately get commit +rights to the PyPy repository. If you screw up, the tests (or the code reviews) +are probably going to catch it, and that reduction to the barrier to +contributing is just super great.

                              +

                              This concludes my advertisement for testing in this post.

                              +
                              + + +
                              +

                              2010: Implementing Python Objects with Maps

                              +

                              So, what else did we do in the Eurostars project, apart from adding traditional +compiler optimizations to the tracing JIT and setting up CI infrastructure? +Another strand of work, that went on sort of concurrently to the JIT generator +improvements, consisted of deep rewrites in the Python runtime, and the Python data +structures. I am going to write about two exemplary ones here, maps and storage strategies.

                              +

                              The first such rewrite is fairly standard. Python instances are similar to +Javascript objects, in that you can add arbitrary attributes to them at runtime. +Originally Python instances were backed by a dictionary in PyPy, but of course +in practice most instances of the same class have the same set of attribute +names. Therefore we went and implemented Self style maps, which are often +called hidden classes in the JS world to represent instances instead. This +has two big benefits, it allows you to generate much better machine code for +instance attribute access and makes instances use a lot less memory.

                              +
                              +
                              + + +
                              +

                              2011: Container Storage Strategies

                              +

                              Another important change in the PyPy runtime was rewriting the Python container +data structures, such as lists, dictionaries and sets. A fairly straightforward +observation about how those are used is that in a significant percentage of +cases they contain type-homogeneous data. As an example it's quite common to +have lists of only integers, or lists of only strings. So we changed the list, +dict and set implementations to use something we called storage strategies. With +storage strategies these data structures use a more efficient representation if +they contain only primitives of the same type, such as ints, floats, strings. +This makes it possible to store the values without boxing them in the underlying +data structure. Therefore read and write access are much faster for such type +homogeneous containers. Of course when later another data type gets added to +such a list, the existing elements need to all be boxed at that point, which is +expensive. But we did a study and found out that that happens quite rarely in +practice. A lot of that work was done by Lukas Diekmann.

                              +
                              +
                              + + +
                              +

                              Deep Changes in the Runtime are Necessary

                              +

                              These two are just two examples for a number of fairly fundamental changes in +the PyPy runtime and PyPy data structures, probably the two most important ones, +but we did many others. That leads me to another thing we learned. If you want +to generate good code for a complex dynamic language such as Python, it's +actually not enough at all to have a good code generator and good compiler +optimizations. That's not going to help you, if your runtime data-structures +aren't in a shape where it's possible to generate efficient machine code to +access them.

                              +

                              Maybe this is well known in the VM and research community. However it's the main +mistake that in my opinion every other Python JIT effort has made in the last 10 +years, where most projects said something along the lines of "we're not +changing the existing CPython data structures at all, we'll just let LLVM +inline enough C code of the runtime and then it will optimize all the overhead +away". That never works very well.

                              +
                              + + +
                              +

                              JIT Status 2011

                              +

                              So, here we are at the end of the Eurostars project, what's the status of the JIT? Well, it +seems this meta-tracing stuff really works! We finally started actually +believing in it, when we reached the point in 2010 where self-hosting PyPy was +actually faster than bootstrapping the VM on CPython. Speeding up the +bootstrapping process is something that Psyco never managed at all, so we +considered this a quite important achievement. At the end of +Eurostars, we were about 4x faster than CPython on our set of benchmarks.

                              +
                              + + +
                              +

                              2012-2017: Engineering and Incremental Progress

                              +

                              In 2012 the Eurostars project was finished and PyPy reverted yet again +to being an open source project. From then on, we've had a more diverse set of +sources of funding: we received some crowd funding via the Software Freedom +Conservancy and contracts of various sizes from companies to implement various +specific features, often handled by Baroque Software. Over the next couple of +years +we revamped various parts of the VM. We improved the GC in major ways. We +optimized the implementation of the JIT compiler to improve warmup times. We +implemented backends for various CPU architectures (including PowerPC and +s390x). We tried to reduce the number of performance cliffs and make the JIT +useful in a broader set of cases.

                              +

                              Another strand of work was to push quite significantly to be more +compatible with CPython, particularly the Python 3 line as well as extension +module support. Other compatibility improvements we made were making sure that +virtualenv works with PyPy, better support for distutils and setuptools and +similar improvements. The continually improving performance as well as better +compatibility with the ecosystem tools led to the first few users of PyPy in +industry.

                              +
                              +
                              + + +

                              CPyExt

                              +

                              Another very important strand of work that took a lot of effort in recent years +was CPyExt. One of the main blockers of PyPy adoption had always been the fact +that a lot of people need specific C-extension modules at least in some parts of +their program, and telling them to reimplement everything in Python is just not +a practical solution. Therefore we worked on CPyExt, an emulation layer to make +it possible to run CPython C-extension modules in PyPy. Doing that was a very +painful process, since the CPython extension API leaks a lot of CPython +implementation details, so we had to painstakingly emulate all of these details +to make it possible to run extensions. That this works at all remains completely +amazing to me! But nowadays CPyExt is even getting quite good: a lot of the big +numerical libraries such as Numpy and Pandas are now supported (for a while +we had worked hard on a reimplementation of Numpy called NumPyPy, but +eventually realized that it would never be complete and useful enough). +However, calling CPyExt modules from PyPy can still be very slow, +which makes it impractical for some applications; +that's why we are working on it.

                              +

                              Not thinking about C-extension module emulation earlier in the project history +was a pretty bad strategic mistake. It had been clear for a long time that +getting people to just stop using all their C-extension modules was never going +to work, despite our efforts to give them alternatives, such as cffi. So we +should have thought of a story for all the existing C-extension modules earlier +in the project. Not starting CPyExt earlier was mostly a failure of our +imagination (and maybe a too high pain threshold): We didn't believe this kind +of emulation was going to be practical, until somebody went and tried it.

                              +
                              + + +
                              +

                              Python 3

                              +

                              Another main +focus of the last couple of years has been to catch up with the CPython 3 line. +Originally we had ignored Python 3 for a little bit too long, and were trailing +several versions behind. In 2016 and 2017 we had a grant from the Mozilla open +source support program of $200'000 to be able to catch up with Python 3.5. This +work is now basically done, and we are starting to target CPython 3.6 and will +have to look into 3.7 in the near future.

                              +
                              + + +
                              +

                              Incentives of OSS compared to Academia

                              +

                              So, what can be learned from those more recent years? One thing we can observe +is that a lot of the engineering work we did in that time is not really science +as such. A lot of the VM techniques we implemented are kind of well known, and +catching up with new Python features is also not particularly deep researchy +work. Of course this kind of work is obviously super necessary if you want +people to use your VM, but it would be very hard to try to get research funding +for it. PyPy managed quite well over its history to balance phases of more +research oriented work, and more product oriented ones. But getting this balance +somewhat right is not easy, and definitely also involves a lot of luck. And, as +has been discussed a lot, it's actually very hard to find funding for open +source work, both within and outside of academia.

                              +
                              +
                              +

                              Meta-Tracing really works!

                              +

                              Let me end with what, in my opinion, is the main positive technical result of PyPy the +project. Which is that the whole idea of using a meta-tracing JIT can really +work! Currently PyPy is about 7 times faster than CPython on a broad set of +benchmarks. Also, one of the very early motivations for using a meta-jitting +approach in PyPy, which was to not have to adapt the JIT to new versions of +CPython, proved to work: indeed we didn't have to change anything in the JIT +infrastructure to support Python 3.

                              +

                              RPython has also worked and improved performance for a number of other +languages. Some of these interpreters had wildly different architectures. +AST-based interpreters, bytecode based, CPU emulators, really inefficient +high-level ones that allocate continuation objects all the time, and so on. This +shows that RPython also gives you a lot of freedom in deciding how you want to +structure the interpreter and that it can be applied to languages of quite +different paradigms.

                              +

                              I'll end with a list of the people that have contributed code to PyPy over its +history, more than 350 of them. I'd like to thank all of them and the various +roles they played. To the next 15 years!

                              + +
                              + +
                              +
                              + +

                              Acknowledgements

                              +

                              A lot of people helped me with this blog post. Tim Felgentreff made me give the +keynote, which led me to start collecting the material. Samuele Pedroni +gave essential early input when I just started planning the talk, and also gave +feedback on the blog post. Maciej Fijałkowski gave me feedback on the post, in +particular important insight about the more recent years of the project. Armin +Rigo discussed the talk slides with me, and provided details about the early +expectations about the first JIT's hoped-for performance. Antonio Cuni gave +substantial feedback and many very helpful suggestions for the blog post. +Michael Hudson-Doyle also fixed a number of mistakes in the post and rightfully +complained about the lack of mention of the GC. Christian Tismer provided +access to his copy of early Python-de mailing list posts. Matti Picus pointed +out a number of things I had forgotten and fixed a huge number of typos and +awkward English, including my absolute inability to put commas correctly. +All remaining errors are of course my own.

                              +
                              + +

                              update: fixed confusing wording in the maps section.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Peterino wrote on 2018-09-14 00:24: +
                              +
                              +

                              Congratulations on your story, and the fantastic achievements!

                              Interestingly, from my personal experience I can't confirm the "PyPy is faster than CPython" claim. Maybe you can help me understand. I'm running a simple set of tests against a subset of Python versions of CPython and against PyPy, for a few years. For what I saw in that time, PyPy - including PyPy3 now - was always the slowest, usually by a factor of 2 compared to the mean of all CPython versions. See the results on Travis, for example: https://travis-ci.org/painless-software/painless-continuous-delivery

                              Why is it that way? When is PyPy really faster? Are the benchmarks you run tailored to a specific area of software development?

                              And then the final thing I've not yet understood about PyPy: What is the ultimate plan? Should it ever replace CPython, one day? When it proves to be both faster and less laborious to maintain, maybe?

                              +
                              +
                              +
                              +
                              + + Carl Friedrich Bolz-Tereick wrote on 2018-09-14 07:58: +
                              +
                              +

                              Hi Peterino! Thanks for testing your project on PyPy! PyPy is often slower on unit tests, as is explained here:
                              https://alexgaynor.net/2013/jul/15/your-tests-are-not-benchmark/

                              PyPy is best at CPU bound Python code. It will never replace CPython but keep existing as an alternative Python implementation.

                              +
                              +
                              +
                              +
                              + + MikeABKK wrote on 2020-04-18 18:47: +
                              +
                              +

                              > ...AST-based interpreters, ... really inefficient high-level ones that allocate continuation objects all the time, and so on.

                              Does anyone have any links to these sorts of interpreters? I am interested in both. I'd very much appreciate any references.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/11/guest-post-implementing-calculator-repl-6271483514675006846.html b/posts/2018/11/guest-post-implementing-calculator-repl-6271483514675006846.html new file mode 100644 index 000000000..6a88fb774 --- /dev/null +++ b/posts/2018/11/guest-post-implementing-calculator-repl-6271483514675006846.html @@ -0,0 +1,1537 @@ + + + + + +Guest Post: Implementing a Calculator REPL in RPython | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Guest Post: Implementing a Calculator REPL in RPython

                              + + + +
                              +

                              This is a tutorial style post that walks through using the RPython translation +toolchain to create a REPL that executes basic math expressions.

                              + +

                              We will do that by scanning the user's input into tokens, compiling those +tokens into bytecode and running that bytecode in our own virtual machine. Don't +worry if that sounds horribly complicated, we are going to explain it step by +step.

                              + +

                              This post is a bit of a diversion while on my journey to create a compliant +lox implementation +using the RPython translation toolchain. The +majority of this work is a direct RPython translation of the low level C +guide from Bob Nystrom (@munificentbob) in the +excellent book craftinginterpreters.com, +specifically chapters 14 – 17.

                              + +

                              The road ahead

                              + +

                              As this post is rather long I'll break it into a few major sections. In each section we will +have something that translates with RPython, and at the end it all comes together.

                              + + +

                              A REPL

                              + +

                              So if you're a Python programmer you might be thinking this is pretty trivial, right?

                              + +

                              I mean if we ignore input errors, injection attacks, etc., couldn't we just do something +like this:

                              + +
                              """
                              +A pure python REPL that can parse simple math expressions
                              +"""
                              +while True:
                              +    print(eval(raw_input("> ")))
                              +
                              + +

                              Well it does appear to do the trick:

                              + +
                              $ python2 section-1-repl/main.py
                              +> 3 + 4 * ((1.0/(2 * 3 * 4)) + (1.0/(4 * 5 * 6)) - (1.0/(6 * 7 * 8)))
                              +3.1880952381
                              +
                              + +

                              So can we just ask RPython to translate this into a binary that runs magically +faster?

                              + +

                              Let's see what happens. We need to add two functions for RPython to +get its bearings (entry_point and target) and call the file targetXXX:

                              + +

                              targetrepl1.py

                              + +
                              def repl():
                              +    while True:
                              +        print eval(raw_input('> '))
                              +
                              +
                              +def entry_point(argv):
                              +    repl()
                              +    return 0
                              +
                              +
                              +def target(driver, *args):
                              +    return entry_point, None
                              +
                              + +

                              Which at translation time gives us this admonishment that accurately tells us +we are trying to call a Python built-in raw_input that is unfortunately not +valid RPython.

                              + +
                              $ rpython ./section-1-repl/targetrepl1.py
                              +...SNIP...
                              +[translation:ERROR] AnnotatorError: 
                              +
                              +object with a __call__ is not RPython: <built-in function raw_input>
                              +Processing block:
                              + block@18 is a <class 'rpython.flowspace.flowcontext.SpamBlock'> 
                              + in (target1:2)repl 
                              + containing the following operations: 
                              +       v0 = simple_call((builtin_function raw_input), ('> ')) 
                              +       v1 = simple_call((builtin_function eval), v0) 
                              +       v2 = str(v1) 
                              +       v3 = simple_call((function rpython_print_item), v2) 
                              +       v4 = simple_call((function rpython_print_newline)) 
                              +
                              + +

                              Ok so we can't use raw_input or eval but that doesn't faze us. Let's get +the input from a stdin stream and just print it out (no evaluation).

                              + +

                              targetrepl2.py

                              + +
                              from rpython.rlib import rfile
                              +
                              +LINE_BUFFER_LENGTH = 1024
                              +
                              +
                              +def repl(stdin):
                              +    while True:
                              +        print "> ",
                              +        line = stdin.readline(LINE_BUFFER_LENGTH)
                              +        print line
                              +
                              +
                              +def entry_point(argv):
                              +    stdin, stdout, stderr = rfile.create_stdio()
                              +    try:
                              +        repl(stdin)
                              +    except:
                              +        return 0
                              +
                              +
                              +def target(driver, *args):
                              +    return entry_point, None
                              +
                              + +

                              Translate targetrepl2.py – we can add an optimization level if we +are so inclined:

                              + +
                              $ rpython --opt=2 section-1-repl/targetrepl2.py
                              +...SNIP...
                              +[Timer] Timings:
                              +[Timer] annotate                       ---  1.2 s
                              +[Timer] rtype_lltype                   ---  0.9 s
                              +[Timer] backendopt_lltype              ---  0.6 s
                              +[Timer] stackcheckinsertion_lltype     ---  0.0 s
                              +[Timer] database_c                     --- 15.0 s
                              +[Timer] source_c                       ---  1.6 s
                              +[Timer] compile_c                      ---  1.9 s
                              +[Timer] =========================================
                              +[Timer] Total:                         --- 21.2 s
                              +
                              + +

                              No errors!? Let's try it out:

                              + +
                              $ ./target2-c 
                              +1 + 2
                              +>  1 + 2
                              +
                              +^C
                              +
                              + +

                              Ahh our first success – let's quickly deal with the flushing fail by using the +stdout stream directly as well. Let's print out the input in quotes:

                              + +
                              from rpython.rlib import rfile
                              +
                              +LINE_BUFFER_LENGTH = 1024
                              +
                              +
                              +def repl(stdin, stdout):
                              +    while True:
                              +        stdout.write("> ")
                              +        line = stdin.readline(LINE_BUFFER_LENGTH)
                              +        print '"%s"' % line.strip()
                              +
                              +
                              +def entry_point(argv):
                              +    stdin, stdout, stderr = rfile.create_stdio()
                              +    try:
                              +        repl(stdin, stdout)
                              +    except:
                              +        pass
                              +    return 0
                              +
                              +
                              +def target(driver, *args):
                              +    return entry_point, None
                              +
                              + +

                              Translation works, and the test run too:

                              + +
                              $ ./target3-c 
                              +> hello this seems better
                              +"hello this seems better"
                              +> ^C
                              +
                              + +

                              So we are in a good place with taking user input and printing output... What about +the whole math evaluation thing we were promised? For that we can probably leave +our RPython REPL behind for a while and connect it up at the end.

                              + +

                              A virtual machine

                              + +

                              A virtual machine is the execution engine of our basic math interpreter. It will be very simple, +only able to do simple tasks like addition. I won't go into any depth to describe why we want +a virtual machine, but it is worth noting that many languages including Java and Python make +this decision to compile to an intermediate bytecode representation and then execute that with +a virtual machine. Alternatives are compiling directly to native machine code like (earlier versions of) the V8 +JavaScript engine, or at the other end of the spectrum executing an abstract syntax tree – +which is what the Truffle approach to building VMs is based on.

                              + +

                              We are going to keep things very simple. We will have a stack where we can push and pop values, +we will only support floats, and our VM will only implement a few very basic operations.

                              + +

                              OpCodes

                              + +

                              In fact our entire instruction set is:

                              + +
                              OP_CONSTANT
                              +OP_RETURN
                              +OP_NEGATE
                              +OP_ADD
                              +OP_SUBTRACT
                              +OP_MULTIPLY
                              +OP_DIVIDE
                              +
                              + +

                              Since we are targeting RPython we can't use the nice enum module from the Python standard +library, so instead we just define a simple class with class attributes.

                              + +

                              We should start to get organized, so we will create a new file +opcodes.py and add this:

                              + +
                              class OpCode:
                              +    OP_CONSTANT = 0
                              +    OP_RETURN = 1
                              +    OP_NEGATE = 2
                              +    OP_ADD = 3
                              +    OP_SUBTRACT = 4
                              +    OP_MULTIPLY = 5
                              +    OP_DIVIDE = 6
                              +
                              + +

                              Chunks

                              + +

                              To start with we need to get some infrastructure in place before we write the VM engine.

                              + +

                              Following craftinginterpreters.com +we start with a Chunk object which will represent our bytecode. In RPython we have access +to Python-esque lists so our code object will just be a list of OpCode values – which are +just integers. A list of ints, couldn't get much simpler.

                              + +

                              section-2-vm/chunk.py

                              + +
                              class Chunk:
                              +    code = None
                              +
                              +    def __init__(self):
                              +        self.code = []
                              +
                              +    def write_chunk(self, byte):
                              +        self.code.append(byte)
                              +
                              +    def disassemble(self, name):
                              +        print "== %s ==\n" % name
                              +        i = 0
                              +        while i < len(self.code):
                              +            i = disassemble_instruction(self, i)
                              +
                              + +

                              From here on I'll only present minimal snippets of code instead of the whole lot, but +I'll link to the repository with the complete example code. For example the +various debugging helpers, including disassemble_instruction, aren't particularly interesting +to include verbatim. See the GitHub repo for full details.

                              + +

                              We need to check that we can create a chunk and disassemble it. The quickest way to do this +is to use Python during development and debugging then every so often try to translate it.

                              + +

                              Getting the disassemble part through the RPython translator was a hurdle for me as I +quickly found that many str methods such as format are not supported, and only very basic +% based formatting is supported. I ended up creating helper functions for string manipulation +such as:

                              + +
                              def leftpad_string(string, width, char=" "):
                              +    l = len(string)
                              +    if l > width:
                              +        return string
                              +    return char * (width - l) + string
                              +
                              + +

                              Let's write a new entry_point that creates and disassembles a chunk of bytecode. We can +set the target output name to vm1 at the same time:

                              + +

                              targetvm1.py

                              + +
                              def entry_point(argv):
                              +    bytecode = Chunk()
                              +    bytecode.write_chunk(OpCode.OP_ADD)
                              +    bytecode.write_chunk(OpCode.OP_RETURN)
                              +    bytecode.disassemble("hello world")
                              +    return 0
                              +
                              +def target(driver, *args):
                              +    driver.exe_name = "vm1"
                              +    return entry_point, None
                              +
                              + +

                              Running this isn't going to be terribly interesting, but it is always nice to +know that it is doing what you expect:

                              + +
                              $ ./vm1 
                              +== hello world ==
                              +
                              +0000 OP_ADD       
                              +0001 OP_RETURN    
                              +
                              + +

                              Chunks of data

                              + +

                              Ref: https://www.craftinginterpreters.com/chunks-of-bytecode.html#constants

                              + +

                              So our bytecode is missing a very crucial element – the values to operate on!

                              + +

                              As with the bytecode we can store these constant values as part of the chunk +directly in a list. Each chunk will therefore have a constant data component, +and a code component.

                              + +

                              Edit the chunk.py file and add the new instance attribute constants as an +empty list, and a new method add_constant.

                              + +
                                  def add_constant(self, value):
                              +        self.constants.append(value)
                              +        return len(self.constants) - 1
                              +
                              + +

                              Now to use this new capability we can modify our example chunk +to write in some constants before the OP_ADD:

                              + +
                                  bytecode = Chunk()
                              +    constant = bytecode.add_constant(1.0)
                              +    bytecode.write_chunk(OpCode.OP_CONSTANT)
                              +    bytecode.write_chunk(constant)
                              +
                              +    constant = bytecode.add_constant(2.0)
                              +    bytecode.write_chunk(OpCode.OP_CONSTANT)
                              +    bytecode.write_chunk(constant)
                              +
                              +    bytecode.write_chunk(OpCode.OP_ADD)
                              +    bytecode.write_chunk(OpCode.OP_RETURN)
                              +
                              +    bytecode.disassemble("adding constants")
                              +
                              + +

                              Which still translates with RPython and when run gives us the following disassembled +bytecode:

                              + +
                              == adding constants ==
                              +
                              +0000 OP_CONSTANT  (00)        '1'
                              +0002 OP_CONSTANT  (01)        '2'
                              +0004 OP_ADD       
                              +0005 OP_RETURN
                              +
                              + +

                              We won't go down the route of serializing the bytecode to disk, but this bytecode chunk +(including the constant data) could be saved and executed on our VM later – like a Java +.class file. Instead we will pass the bytecode directly to our VM after we've created +it during the compilation process.

                              + +

                              Emulation

                              + +

                              So those four instructions of bytecode combined with the constant value mapping +00 -> 1.0 and 01 -> 2.0 describe individual steps for our virtual machine +to execute. One major point in favor of defining our own bytecode is we can +design it to be really simple to execute – this makes the VM really easy to implement.

                              + +

                              As I mentioned earlier this virtual machine will have a stack, so let's begin with that. +Now the stack is going to be a busy little beast – as our VM takes instructions like +OP_ADD it will pop off the top two values from the stack, and push the result of adding +them together back onto the stack. Although dynamically resizing Python lists +are marvelous, they can be a little slow. RPython can take advantage of a constant sized +list which doesn't make our code much more complicated.

                              + +

                              To do this we will define a constant sized list and track the stack_top directly. Note +how we can give the RPython translator hints by adding assertions about the state that +the stack_top will be in.

                              + +
                              class VM(object):
                              +    STACK_MAX_SIZE = 256
                              +    stack = None
                              +    stack_top = 0
                              +
                              +    def __init__(self):
                              +        self._reset_stack()
                              +
                              +    def _reset_stack(self):
                              +        self.stack = [0] * self.STACK_MAX_SIZE
                              +        self.stack_top = 0
                              +
                              +    def _stack_push(self, value):
                              +        assert self.stack_top < self.STACK_MAX_SIZE
                              +        self.stack[self.stack_top] = value
                              +        self.stack_top += 1
                              +
                              +    def _stack_pop(self):
                              +        assert self.stack_top > 0
                              +        self.stack_top -= 1
                              +        return self.stack[self.stack_top]
                              +
                              +    def _print_stack(self):
                              +        print "         ",
                              +        if self.stack_top <= 0:
                              +            print "[]",
                              +        else:
                              +            for i in range(self.stack_top):
                              +                print "[ %s ]" % self.stack[i],
                              +        print
                              +
                              + +

                              Now we get to the main event, the hot loop, the VM engine. Hope I haven't built it up too +much, it is actually really simple! We loop until the instructions tell us to stop +(OP_RETURN), and dispatch to other simple methods based on the instruction.

                              + +
                                  def _run(self):
                              +        while True:
                              +            instruction = self._read_byte()
                              +
                              +            if instruction == OpCode.OP_RETURN:
                              +                print "%s" % self._stack_pop()
                              +                return InterpretResultCode.INTERPRET_OK
                              +            elif instruction == OpCode.OP_CONSTANT:
                              +                constant = self._read_constant()
                              +                self._stack_push(constant)
                              +            elif instruction == OpCode.OP_ADD:
                              +                self._binary_op(self._stack_add)    
                              +
                              + +

                              Now the _read_byte method will have to keep track of which instruction we are up +to. So add an instruction pointer (ip) to the VM with an initial value of 0. +Then _read_byte is simply getting the next bytecode (int) from the chunk's code:

                              + +
                                  def _read_byte(self):
                              +        instruction = self.chunk.code[self.ip]
                              +        self.ip += 1
                              +        return instruction
                              +
                              + +

                              + +

                              If the instruction is OP_CONSTANT we take the constant's address from the next byte +of the chunk's code, retrieve that constant value and add it to the VM's stack.

                              + +
                                  def _read_constant(self):
                              +        constant_index = self._read_byte()
                              +        return self.chunk.constants[constant_index]
                              +
                              + +

                              Finally our first arithmetic operation OP_ADD, what it has to achieve doesn't +require much explanation: pop two values from the stack, add them together, push +the result. But since a few operations all have the same template we introduce a +layer of indirection – or abstraction – by introducing a reusable _binary_op +helper method.

                              + +
                                  @specialize.arg(1)
                              +    def _binary_op(self, operator):
                              +        op2 = self._stack_pop()
                              +        op1 = self._stack_pop()
                              +        result = operator(op1, op2)
                              +        self._stack_push(result)
                              +
                              +    @staticmethod
                              +    def _stack_add(op1, op2):
                              +        return op1 + op2
                              +
                              + +

                              + +

                              Note we tell RPython to specialize _binary_op on its operator argument (index 1, since self is index 0). This causes +RPython to make a copy of _binary_op for every operator passed to it, +which means that each copy contains a call to a particular operator, which can then be +inlined.

                              + +

                              To be able to run our bytecode the only thing left to do is to pass in the chunk +and call _run():

                              + +
                                  def interpret_chunk(self, chunk):
                              +        if self.debug_trace:
                              +            print "== VM TRACE =="
                              +        self.chunk = chunk
                              +        self.ip = 0
                              +        try:
                              +            result = self._run()
                              +            return result
                              +        except:
                              +            return InterpretResultCode.INTERPRET_RUNTIME_ERROR
                              +
                              + +

                              targetvm3.py connects the pieces:

                              + +
                              def entry_point(argv):
                              +    bytecode = Chunk()
                              +    constant = bytecode.add_constant(1)
                              +    bytecode.write_chunk(OpCode.OP_CONSTANT)
                              +    bytecode.write_chunk(constant)
                              +    constant = bytecode.add_constant(2)
                              +    bytecode.write_chunk(OpCode.OP_CONSTANT)
                              +    bytecode.write_chunk(constant)
                              +    bytecode.write_chunk(OpCode.OP_ADD)
                              +    bytecode.write_chunk(OpCode.OP_RETURN)
                              +
                              +    vm = VM()
                              +    vm.interpret_chunk(bytecode)
                              +
                              +    return 0
                              +
                              + +

                              I've added some trace debugging so we can see what the VM and stack are doing.

                              + +

                              The whole thing translates with RPython, and when run gives us:

                              + +
                              ./vm3
                              +== VM TRACE ==
                              +          []
                              +0000 OP_CONSTANT  (00)        '1'
                              +          [ 1 ]
                              +0002 OP_CONSTANT  (01)        '2'
                              +          [ 1 ] [ 2 ]
                              +0004 OP_ADD       
                              +          [ 3 ]
                              +0005 OP_RETURN    
                              +3
                              +
                              + +

                              Yes we just computed the result of 1+2. Pat yourself on the back.

                              + +

                              At this point it is probably valid to check that the translated executable is actually +faster than running our program directly in Python. For this trivial example under +Python2/pypy this targetvm3.py file runs in the 20ms – 90ms region, and the +compiled vm3 runs in <5ms. Something useful must be happening during the translation.

                              + +

                              I won't go through the code adding support for our other instructions as they are +very similar and straightforward. Our VM is ready to execute our chunks of bytecode, +but we haven't yet worked out how to take the entered expression and turn that into +this simple bytecode. This is broken into two steps, scanning and compiling.

                              + +

                              Scanning the source

                              + +

                              All the source for this section can be found in +section-3-scanning.

                              + +

                              The job of the scanner is to take the raw expression string and transform it into +a sequence of tokens. This scanning step will strip out whitespace and comments, +catch errors with invalid tokens and tokenize the string. For example the input +"( 1 + 2 )" would get tokenized into LEFT_PAREN, NUMBER(1), PLUS, NUMBER(2), RIGHT_PAREN.

                              + +

                              As with our OpCodes we will just define a simple Python class to define an int +for each type of token:

                              + +
                              class TokenTypes:
                              +    ERROR = 0
                              +    EOF = 1
                              +    LEFT_PAREN = 2
                              +    RIGHT_PAREN = 3
                              +    MINUS = 4
                              +    PLUS = 5
                              +    SLASH = 6
                              +    STAR = 7
                              +    NUMBER = 8
                              +
                              + +

                              A token has to keep some other information as well – keeping track of the location and +length of the token will be helpful for error reporting. The NUMBER token clearly needs +some data about the value it is representing: we could include a copy of the source lexeme +(e.g. the string 2.0), or parse the value and store that, or – what we will do in this +blog – use the location and length information as pointers into the original source +string. Every token type (except perhaps ERROR) will use this simple data structure:

                              + +
                              class Token(object):
                              +
                              +    def __init__(self, start, length, token_type):
                              +        self.start = start
                              +        self.length = length
                              +        self.type = token_type
                              +
                              + +

                              Our soon to be created scanner will create these Token objects which refer back to +addresses in some source. If the scanner sees the source "( 1 + 2.0 )" it would emit +the following tokens:

                              + +
                              Token(0, 1, TokenTypes.LEFT_PAREN)
                              +Token(2, 1, TokenTypes.NUMBER)
                              +Token(4, 1, TokenTypes.PLUS)
                              +Token(6, 3, TokenTypes.NUMBER)
                              +Token(10, 1, TokenTypes.RIGHT_PAREN)
                              +
                              + +

                              Scanner

                              + +

                              Let's walk through the scanner implementation method +by method. The scanner will take the source and pass through it once, creating tokens +as it goes.

                              + +
                              class Scanner(object):
                              +
                              +    def __init__(self, source):
                              +        self.source = source
                              +        self.start = 0
                              +        self.current = 0
                              +
                              + +

                              The start and current variables are character indices in the source string that point to +the current substring being considered as a token.

                              + +

                              For example in the string "(51.05+2)" while we are tokenizing the number 51.05 +we will have start pointing at the 5, and advance current character by character +until the character is no longer part of a number. Midway through scanning the number +the start and current values might point to 1 and 4 respectively:

                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              012345678
                              "(""5""1"".""0""5""+""2"")"
                               ^ ^
                              +

                              From current=4 the scanner peeks ahead and sees that the next character (5) is +a digit, so will continue to advance.

                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              012345678
                              "(""5""1"".""0""5""+""2"")"
                               ^ ^
                              +

                              When the scanner peeks ahead and sees the "+" it will create the number +token and emit it. The method that carries out this tokenizing is _number:

                              + +
                                  def _number(self):
                              +        while self._peek().isdigit():
                              +            self.advance()
                              +
                              +        # Look for decimal point
                              +        if self._peek() == '.' and self._peek_next().isdigit():
                              +            self.advance()
                              +            while self._peek().isdigit():
                              +                self.advance()
                              +
                              +        return self._make_token(TokenTypes.NUMBER)
                              +
                              + +

                              It relies on a few helpers to look ahead at the upcoming characters:

                              + +
                                  def _peek(self):
                              +        if self._is_at_end():
                              +            return '\0'
                              +        return self.source[self.current]
                              +
                              +    def _peek_next(self):
                              +        if self.current + 1 >= len(self.source):
                              +            return '\0'
                              +        return self.source[self.current+1]
                              +
                              +    def _is_at_end(self):
                              +        return len(self.source) == self.current
                              +
                              + +

                              If the character at current is still part of the number we want to call advance +to move on by one character.

                              + +
                                  def advance(self):
                              +        self.current += 1
                              +        return self.source[self.current - 1]
                              +
                              + +

                              Once the isdigit() check fails in _number() we call _make_token() to emit the +token with the NUMBER type.

                              + +
                                  def _make_token(self, token_type):
                              +        return Token(
                              +            start=self.start,
                              +            length=(self.current - self.start),
                              +            token_type=token_type
                              +        )
                              +
                              + +

                              Note again that the token is linked to an index address in the source, rather than +including the string value.

                              + +

                              Our scanner is pull based, a token will be requested via scan_token. First we skip +past whitespace and depending on the characters emit the correct token:

                              + +
                                  def scan_token(self):
                              +        # skip any whitespace
                              +        while True:
                              +            char = self._peek()
                              +            if char in ' \r\t\n':
                              +                self.advance()
                              +            else:
                              +                break
                              +
                              +        self.start = self.current
                              +
                              +        if self._is_at_end():
                              +            return self._make_token(TokenTypes.EOF)
                              +
                              +        char = self.advance()
                              +
                              +        if char.isdigit():
                              +            return self._number()
                              +
                              +        if char == '(':
                              +            return self._make_token(TokenTypes.LEFT_PAREN)
                              +        if char == ')':
                              +            return self._make_token(TokenTypes.RIGHT_PAREN)
                              +        if char == '-':
                              +            return self._make_token(TokenTypes.MINUS)
                              +        if char == '+':
                              +            return self._make_token(TokenTypes.PLUS)
                              +        if char == '/':
                              +            return self._make_token(TokenTypes.SLASH)
                              +        if char == '*':
                              +            return self._make_token(TokenTypes.STAR)
                              +
                              +        return ErrorToken("Unexpected character", self.current)
                              +
                              + +

                              + +

                              If this was a real programming language we were scanning, this would be the point where we +add support for different types of literals and any language identifiers/reserved words.

                              + +

                              At some point we will need to parse the literal value for our numbers, but we leave that +job for some later component, for now we'll just add a get_token_string helper. To make +sure that RPython is happy to index arbitrary slices of source we add range assertions:

                              + +
                                  def get_token_string(self, token):
                              +        if isinstance(token, ErrorToken):
                              +            return token.message
                              +        else:
                              +            end_loc = token.start + token.length
                              +            assert end_loc <= len(self.source)
                              +            assert end_loc > 0
                              +            return self.source[token.start:end_loc]
                              +
                              + +

                              A simple entry point can be used to test our scanner with a hard coded +source string:

                              + +

                              targetscanner1.py

                              + +
                              from scanner import Scanner, TokenTypes, TokenTypeToName
                              +
                              +
                              +def entry_point(argv):
                              +
                              +    source = "(   1   + 2.0 )"
                              +
                              +    scanner = Scanner(source)
                              +    t = scanner.scan_token()
                              +    while t.type != TokenTypes.EOF and t.type != TokenTypes.ERROR:
                              +        print TokenTypeToName[t.type],
                              +        if t.type == TokenTypes.NUMBER:
                              +            print "(%s)" % scanner.get_token_string(t),
                              +        print
                              +        t = scanner.scan_token()
                              +    return 0
                              +
                              + +

                              RPython didn't complain, and lo it works:

                              + +
                              $ ./scanner1 
                              +LEFT_PAREN
                              +NUMBER (1)
                              +PLUS
                              +NUMBER (2.0)
                              +RIGHT_PAREN
                              +
                              + +

                              Let's connect our REPL to the scanner.

                              + +

                              targetscanner2.py

                              + +
                              from rpython.rlib import rfile
                              +from scanner import Scanner, TokenTypes, TokenTypeToName
                              +
                              +LINE_BUFFER_LENGTH = 1024
                              +
                              +
                              +def repl(stdin, stdout):
                              +    while True:
                              +        stdout.write("> ")
                              +        source = stdin.readline(LINE_BUFFER_LENGTH)
                              +
                              +        scanner = Scanner(source)
                              +        t = scanner.scan_token()
                              +        while t.type != TokenTypes.EOF and t.type != TokenTypes.ERROR:
                              +            print TokenTypeToName[t.type],
                              +            if t.type == TokenTypes.NUMBER:
                              +                print "(%s)" % scanner.get_token_string(t),
                              +            print
                              +            t = scanner.scan_token()
                              +
                              +
                              +def entry_point(argv):
                              +    stdin, stdout, stderr = rfile.create_stdio()
                              +    try:
                              +        repl(stdin, stdout)
                              +    except:
                              +        pass
                              +    return 0
                              +
                              + +

                              With our REPL hooked up we can now scan tokens from arbitrary input:

                              + +
                              $ ./scanner2
                              +> (3 *4) - -3
                              +LEFT_PAREN
                              +NUMBER (3)
                              +STAR
                              +NUMBER (4)
                              +RIGHT_PAREN
                              +MINUS
                              +MINUS
                              +NUMBER (3)
                              +> ^C
                              +
                              + +

                              Compiling expressions

                              + +

                              References

                              + +
                                +
                              • https://www.craftinginterpreters.com/compiling-expressions.html
                              • + +
                              • https://effbot.org/zone/simple-top-down-parsing.htm
                              • +
                              +

                              The final piece is to turn this sequence of tokens into our low level +bytecode instructions for the virtual machine to execute. Buckle up, +we are about to write us a compiler.

                              + +

                              Our compiler will take a single pass over the tokens using +Vaughan Pratt’s +parsing technique, and output a chunk of bytecode – if we do it +right it will be compatible with our existing virtual machine.

                              + +

                              Remember the bytecode we defined above is really simple – by relying +on our stack we can transform a nested expression into a sequence of +our bytecode operations.

                              + +

                              To make this more concrete let's go through by hand translating an +expression into bytecode.

                              + +

                              Our source expression:

                              + +
                              (3 + 2) - (7 * 2)
                              +
                              + +

                              If we were to make an abstract syntax tree we'd get something +like this:

                              + +

                              + +

                              Now if we start at the first sub expression (3+2) we can clearly +note from the first open bracket that we must see a close bracket, +and that the expression inside that bracket must be valid on its +own. Not only that but regardless of the inside we know that the whole +expression still has to be valid. Let's focus on this first bracketed +expression, let our attention recurse into it so to speak.

                              + +

                              This gives us a much easier problem – we just want to get our virtual +machine to compute 3 + 2. In this bytecode dialect we would load the +two constants, and then add them with OP_ADD like so:

                              + +
                              OP_CONSTANT  (00) '3.000000'
                              +OP_CONSTANT  (01) '2.000000'
                              +OP_ADD
                              +
                              + +

                              The effect of our vm executing these three instructions is that sitting +pretty at the top of the stack is the result of the addition. Winning.

                              + +

                              Jumping back out from our bracketed expression, our next token is MINUS, +at this point we have a fair idea that it must be used in an infix position. +In fact whatever token followed the bracketed expression it must be a +valid infix operator, if not the expression is over or had a syntax error.

                              + +

                              Assuming the best from our user (naive), we handle MINUS the same way +we handled the first PLUS. We've already got the first operand on the +stack, now we compile the right operand and then write out the bytecode +for OP_SUBTRACT.

                              + +

                              The right operand is another simple three instructions:

                              + +
                              OP_CONSTANT  (02) '7.000000'
                              +OP_CONSTANT  (03) '2.000000'
                              +OP_MULTIPLY
                              +
                              + +

                              Then we finish our top level binary expression and write an OP_RETURN to +return the value at the top of the stack as the execution's result. Our +final hand compiled program is:

                              + +
                              OP_CONSTANT  (00) '3.000000'
                              +OP_CONSTANT  (01) '2.000000'
                              +OP_ADD
                              +OP_CONSTANT  (02) '7.000000'
                              +OP_CONSTANT  (03) '2.000000'
                              +OP_MULTIPLY
                              +OP_SUBTRACT
                              +OP_RETURN
                              +
                              + +

                              Ok that wasn't so hard was it? Let's try to make our code do that.

                              + +

                              We define a parser object which will keep track of where we are, and +whether things have all gone horribly wrong:

                              + +
                              class Parser(object):
                              +    def __init__(self):
                              +        self.had_error = False
                              +        self.panic_mode = False
                              +        self.current = None
                              +        self.previous = None
                              +
                              + +

                              The compiler will also be a class, we'll need one of our Scanner instances +to pull tokens from, and since the output is a bytecode Chunk let's go ahead +and make one of those in our compiler initializer:

                              + +
                              class Compiler(object):
                              +
                              +    def __init__(self, source):
                              +        self.parser = Parser()
                              +        self.scanner = Scanner(source)
                              +        self.chunk = Chunk()
                              +
                              + +

                              Since we have this (empty) chunk of bytecode we will make a helper method +to add individual bytes. Every instruction will pass from our compiler into +an executable program through this simple method.

                              + +
                                  def emit_byte(self, byte):
                              +        self.current_chunk().write_chunk(byte)
                              +
                              + +

                              To quote from Bob Nystrom on the Pratt parsing technique:

                              + +
                              +

                              the implementation is a deceptively-simple handful of deeply intertwined code

                              +
                              + +

                              I don't actually think I can do justice to this section. Instead I suggest +reading his treatment in +Pratt Parsers: Expression Parsing Made Easy +which explains the magic behind the parsing component. Our only major difference is +instead of creating an AST we are going to directly emit bytecode for our VM.

                              + +

                              Now that I've absolved myself from taking responsibility in explaining this somewhat +tricky concept, I'll discuss some of the code from +compiler.py, and walk through what happens +for a particular rule.

                              + +

                              I'll jump straight to the juicy bit: the table of parse rules. We define a ParseRule +for each token, and each rule comprises:

                              + +
                                +
                              • an optional handler for when the token is used as a prefix (e.g. the minus in (-2)),
                              • + +
                              • an optional handler for when the token is used infix (e.g. the slash in 2/47)
                              • + +
                              • a precedence value (a number that determines what is of higher precedence)
                              • +
                              +
                              rules = [
                              +    ParseRule(None,              None,            Precedence.NONE),   # ERROR
                              +    ParseRule(None,              None,            Precedence.NONE),   # EOF
                              +    ParseRule(Compiler.grouping, None,            Precedence.CALL),   # LEFT_PAREN
                              +    ParseRule(None,              None,            Precedence.NONE),   # RIGHT_PAREN
                              +    ParseRule(Compiler.unary,    Compiler.binary, Precedence.TERM),   # MINUS
                              +    ParseRule(None,              Compiler.binary, Precedence.TERM),   # PLUS
                              +    ParseRule(None,              Compiler.binary, Precedence.FACTOR), # SLASH
                              +    ParseRule(None,              Compiler.binary, Precedence.FACTOR), # STAR
                              +    ParseRule(Compiler.number,   None,            Precedence.NONE),   # NUMBER
                              +]
                              +
                              + +

                              These rules really are the magic of our compiler. When we get to a particular +token such as MINUS we see if it is an infix operator and if so we've gone and +got its first operand ready. At all times we rely on the relative precedence; consuming +everything with higher precedence than the operator we are currently evaluating.

                              + +

                              In the expression:

                              + +
                              2 + 3 * 4
                              +
                              + +

                              The * has higher precedence than the +, so 3 * 4 will be parsed together +as the second operand to the first infix operator (the +) which follows +the BEDMAS +order of operations I was taught at high school.

                              + +

                              To encode these precedence values we make another Python object moonlighting +as an enum:

                              + +
                              class Precedence(object):
                              +    NONE = 0
                              +    DEFAULT = 1
                              +    TERM = 2        # + -
                              +    FACTOR = 3      # * /
                              +    UNARY = 4       # ! - +
                              +    CALL = 5        # ()
                              +    PRIMARY = 6
                              +
                              + +

                              What happens in our compiler when turning -2.0 into bytecode? Assume we've just +pulled the token MINUS from the scanner. Every expression has to start with some +type of prefix – whether that is:

                              + +
                                +
                              • a bracket group (,
                              • + +
                              • a number 2,
                              • + +
                              • or a prefix unary operator -.
                              • +
                              +

                              Knowing that, our compiler assumes there is a prefix handler in the rule table – in +this case it points us at the unary handler.

                              + +
                                  def parse_precedence(self, precedence):
                              +        # parses any expression of a given precedence level or higher
                              +        self.advance()
                              +        prefix_rule = self._get_rule(self.parser.previous.type).prefix
                              +        prefix_rule(self)
                              +
                              + +

                              + +

                              unary is called:

                              + +
                                  def unary(self):
                              +        op_type = self.parser.previous.type
                              +        # Compile the operand
                              +        self.parse_precedence(Precedence.UNARY)
                              +        # Emit the operator instruction
                              +        if op_type == TokenTypes.MINUS:
                              +            self.emit_byte(OpCode.OP_NEGATE)
                              +
                              + +

                              Here – before writing the OP_NEGATE opcode we recurse back into parse_precedence +to ensure that whatever follows the MINUS token is compiled – provided it has +higher precedence than unary – e.g. a bracketed group. +Crucially at run time this recursive call will ensure that the result is left +on top of our stack. Armed with this knowledge, the unary method just +has to emit a single byte with the OP_NEGATE opcode.

                              + +

                              Test compilation

                              + +

                              Now we can test our compiler by outputting disassembled bytecode +of our user entered expressions. Create a new entry_point +targetcompiler:

                              + +
                              from rpython.rlib import rfile
                              +from compiler import Compiler
                              +
                              +LINE_BUFFER_LENGTH = 1024
                              +
                              +
                              +def entry_point(argv):
                              +    stdin, stdout, stderr = rfile.create_stdio()
                              +
                              +    try:
                              +        while True:
                              +            stdout.write("> ")
                              +            source = stdin.readline(LINE_BUFFER_LENGTH)
                              +            compiler = Compiler(source, debugging=True)
                              +            compiler.compile()
                              +    except:
                              +        pass
                              +    return 0
                              +
                              + +

                              Translate it and test it out:

                              + +
                              $ ./compiler1 
                              +> (2/4 + 1/2)
                              +== code ==
                              +
                              +0000 OP_CONSTANT  (00) '2.000000'
                              +0002 OP_CONSTANT  (01) '4.000000'
                              +0004 OP_DIVIDE    
                              +0005 OP_CONSTANT  (02) '1.000000'
                              +0007 OP_CONSTANT  (00) '2.000000'
                              +0009 OP_DIVIDE    
                              +0010 OP_ADD       
                              +0011 OP_RETURN
                              +
                              + +

                              Now if you've made it this far you'll be eager to finally connect everything +together by executing this bytecode with the virtual machine.

                              + +

                              End to end

                              + +

                              All the pieces slot together rather easily at this point, create a new +file targetcalc.py and define our +entry point:

                              + +
                              from rpython.rlib import rfile
                              +from compiler import Compiler
                              +from vm import VM
                              +
                              +LINE_BUFFER_LENGTH = 4096
                              +
                              +
                              +def entry_point(argv):
                              +    stdin, stdout, stderr = rfile.create_stdio()
                              +    vm = VM()
                              +    try:
                              +        while True:
                              +            stdout.write("> ")
                              +            source = stdin.readline(LINE_BUFFER_LENGTH)
                              +            if source:
                              +                compiler = Compiler(source, debugging=False)
                              +                compiler.compile()
                              +                vm.interpret_chunk(compiler.chunk)
                              +    except:
                              +        pass
                              +    return 0
                              +
                              +
                              +def target(driver, *args):
                              +    driver.exe_name = "calc"
                              +    return entry_point, None
                              +
                              + +

                              + +

                              Let's try to catch it out with a double negative:

                              + +
                              $ ./calc 
                              +> 2--3
                              +== VM TRACE ==
                              +          []
                              +0000 OP_CONSTANT  (00) '2.000000'
                              +          [ 2.000000 ]
                              +0002 OP_CONSTANT  (01) '3.000000'
                              +          [ 2.000000 ] [ 3.000000 ]
                              +0004 OP_NEGATE    
                              +          [ 2.000000 ] [ -3.000000 ]
                              +0005 OP_SUBTRACT  
                              +          [ 5.000000 ]
                              +0006 OP_RETURN    
                              +5.000000
                              +
                              + +

                              Ok well let's evaluate the first 50 terms of the +Nilakantha Series:

                              + +
                              $ ./calc
                              +> 3 + 4 * ((1/(2 * 3 * 4)) + (1/(4 * 5 * 6)) - (1/(6 * 7 * 8)) + (1/(8 * 9 * 10)) - (1/(10 * 11 * 12)) + (1/(12 * 13 * 14)) - (1/(14 * 15 * 16)) + (1/(16 * 17 * 18)) - (1/(18 * 19 * 20)) + (1/(20 * 21 * 22)) - (1/(22 * 23 * 24)) + (1/(24 * 25 * 26)) - (1/(26 * 27 * 28)) + (1/(28 * 29 * 30)) - (1/(30 * 31 * 32)) + (1/(32 * 33 * 34)) - (1/(34 * 35 * 36)) + (1/(36 * 37 * 38)) - (1/(38 * 39 * 40)) + (1/(40 * 41 * 42)) - (1/(42 * 43 * 44)) + (1/(44 * 45 * 46)) - (1/(46 * 47 * 48)) + (1/(48 * 49 * 50)) - (1/(50 * 51 * 52)) + (1/(52 * 53 * 54)) - (1/(54 * 55 * 56)) + (1/(56 * 57 * 58)) - (1/(58 * 59 * 60)) + (1/(60 * 61 * 62)) - (1/(62 * 63 * 64)) + (1/(64 * 65 * 66)) - (1/(66 * 67 * 68)) + (1/(68 * 69 * 70)) - (1/(70 * 71 * 72)) + (1/(72 * 73 * 74)) - (1/(74 * 75 * 76)) + (1/(76 * 77 * 78)) - (1/(78 * 79 * 80)) + (1/(80 * 81 * 82)) - (1/(82 * 83 * 84)) + (1/(84 * 85 * 86)) - (1/(86 * 87 * 88)) + (1/(88 * 89 * 90)) - (1/(90 * 91 * 92)) + (1/(92 * 93 * 94)) - (1/(94 * 95 * 96)) + (1/(96 * 97 * 98)) - (1/(98 * 99 * 100)) + (1/(100 * 101 * 102)))
                              +
                              +== VM TRACE ==
                              +          []
                              +0000 OP_CONSTANT  (00) '3.000000'
                              +          [ 3.000000 ]
                              +0002 OP_CONSTANT  (01) '4.000000'
                              +...SNIP...
                              +0598 OP_CONSTANT  (101) '102.000000'
                              +          [ 3.000000 ] [ 4.000000 ] [ 0.047935 ] [ 1.000000 ] [ 10100.000000 ] [ 102.000000 ]
                              +0600 OP_MULTIPLY  
                              +          [ 3.000000 ] [ 4.000000 ] [ 0.047935 ] [ 1.000000 ] [ 1030200.000000 ]
                              +0601 OP_DIVIDE    
                              +          [ 3.000000 ] [ 4.000000 ] [ 0.047935 ] [ 0.000001 ]
                              +0602 OP_ADD       
                              +          [ 3.000000 ] [ 4.000000 ] [ 0.047936 ]
                              +0603 OP_MULTIPLY  
                              +          [ 3.000000 ] [ 0.191743 ]
                              +0604 OP_ADD       
                              +          [ 3.191743 ]
                              +0605 OP_RETURN    
                              +3.191743
                              +
                              + +

                              We just executed 605 virtual machine instructions to compute pi to 1dp!

                              + +

                              This brings us to the end of this tutorial. To recap we've walked through the whole +compilation process: from the user providing an expression string on the REPL, scanning +the source string into tokens, parsing the tokens while accounting for relative +precedence via a Pratt parser, generating bytecode, and finally executing the bytecode +on our own VM. RPython translated what we wrote into C and compiled it, meaning +our resulting calc REPL is really fast.

                              + +
                              +

                              “The world is a thing of utter inordinate complexity and richness and strangeness that is absolutely awesome.”

                              + +

                              ― Douglas Adams

                              +
                              + +

                              Many thanks to Bob Nystrom for writing the book that inspired this post, and thanks to +Carl Friedrich and Matt Halverson for reviewing.

                              + +

                              ― Brian (@thorneynzb)

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/11/hello-everyone-at-pypy-we-are-trying-to-5336557946798583063.html b/posts/2018/11/hello-everyone-at-pypy-we-are-trying-to-5336557946798583063.html new file mode 100644 index 000000000..c5e09516e --- /dev/null +++ b/posts/2018/11/hello-everyone-at-pypy-we-are-trying-to-5336557946798583063.html @@ -0,0 +1,322 @@ + + + + + +Funding for 64-bit Armv8-a support in PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Funding for 64-bit Armv8-a support in PyPy

                              + + + +
                              +
                              + +

                              Hello everyone

                              + +

                              At PyPy we are trying to support a relatively wide range of platforms. We have PyPy working on OS X, Windows and various flavors of linux (and unofficially various flavors of BSD) on the software side, with the hardware side having x86, x86_64, PPC, 32-bit Arm (v7) and even zarch. This is harder than for other projects, since PyPy emits assembler on the fly from the just in time compiler and it requires a significant amount of work to port it to a new platform.

                              + +

                              We are pleased to inform that Arm Limited, together with Crossbar.io GmbH, are sponsoring the development of 64-bit Armv8-a architecture support through Baroque Software OU, which would allow PyPy to run on a new variety of low-power, high-density servers with that architecture. We believe this will be beneficial for the funders, for the PyPy project as well as to the wider community.

                              + +

                              The work will commence soon and will be done some time early next year with expected speedups either comparable to x86 speedups or, if our current experience with ARM holds, more significant than x86 speedups.

                              + +

                              Best,
                              +Maciej Fijalkowski and the PyPy team

                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + GG boy wrote on 2018-12-01 13:59: +
                              +
                              +

                              Good job

                              +
                              +
                              +
                              +
                              + + Mahmoud Hashemi wrote on 2018-12-09 19:44: +
                              +
                              +

                              Nice! Congrats!

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2018/12/pypy-winter-sprint-feb-4-9-in-dusseldorf-7199110498451574074.html b/posts/2018/12/pypy-winter-sprint-feb-4-9-in-dusseldorf-7199110498451574074.html new file mode 100644 index 000000000..e2f2e3956 --- /dev/null +++ b/posts/2018/12/pypy-winter-sprint-feb-4-9-in-dusseldorf-7199110498451574074.html @@ -0,0 +1,370 @@ + + + + + +PyPy Winter Sprint Feb 4-9 in Düsseldorf | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy Winter Sprint Feb 4-9 in Düsseldorf

                              + + + +
                              +
                              +
                              +
                              +

                              + PyPy Sprint February 4th-9th 2019 in Düsseldorf

                              +
                              +
                              +The next PyPy sprint will be held in the Computer Science department of Heinrich-Heine Universität Düsseldorf from the 4th to the 9th of February 2019 (nine years after the last sprint there). This is a fully public sprint, everyone is welcome to join us.
                              +

                              +Topics and goals

                              +
                              +
                              +
                                +
                              • improve Python 3.6 support
                              • +
                              • discuss benchmarking situation
                              • +
                              • progress on utf-8 branches
                              • +
                              • cpyext performance and completeness
                              • +
                              • packaging: are we ready to upload to PyPI?
                              • +
                                  +
                                • +issue 2617  - we expose too many functions from lib-pypy.so
                                • +
                                • +manylinux2010 - will it solve our build issues?
                                • +
                                • formulate an ABI name and upgrade policy
                                • +
                                +
                              +
                                +
                              • +memoryview(ctypes.Structure) does not create the correct format string
                              • +
                              • discussing the state and future of PyPy and the wider Python ecosystem
                              • +
                              +
                              +
                              +

                              +Location

                              +
                              +The sprint will take place in seminar room 25.12.02.55 of the computer science department.  It is in the building 25.12 of the university campus, second floor. Travel instructions
                              +
                              +

                              +Exact times

                              +
                              +Work days: starting February 4th (10:00), ending February 9th (~afternoon). The break day will probably be Thursday.
                              +

                              +Registration

                              +
                              +
                              +Please register by Mercurial::
                              +https://bitbucket.org/pypy/extradoc/
                              +
                              +https://foss.heptapod.net/pypy/extradoc/-/blob/branch/default/extradoc/sprintinfo/ddorf2019/people.txt

                              +or on the pypy-dev mailing list if you do not yet have check-in rights:
                              +
                              + +
                              +
                              +
                              +
                              +Looking forward to seeing everyone there!
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2018-12-27 13:00: +
                              +
                              +

                              The travel instructions link is a redirect to a 404 page.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2018-12-27 13:33: +
                              +
                              +

                              Thanks! Fixed.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.html b/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.html new file mode 100644 index 000000000..6f052d44e --- /dev/null +++ b/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.html @@ -0,0 +1,423 @@ + + + + + +PyPy for low-latency systems | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy for low-latency systems

                              + + + +
                              +

                              +PyPy for low-latency systems

                              +Recently I have merged the gc-disable branch, introducing a couple of features +which are useful when you need to respond to certain events with the lowest +possible latency. This work has been kindly sponsored by Gambit Research +(which, by the way, is a very cool and geeky place where to work, in case you +are interested). Note also that this is a very specialized use case, so these +features might not be useful for the average PyPy user, unless you have the +same problems as described here.

                              +The PyPy VM manages memory using a generational, moving Garbage Collector. +Periodically, the GC scans the whole heap to find unreachable objects and +frees the corresponding memory. Although at a first look this strategy might +sound expensive, in practice the total cost of memory management is far less +than e.g. on CPython, which is based on reference counting. While maybe +counter-intuitive, the main advantage of a non-refcount strategy is +that allocation is very fast (especially compared to malloc-based allocators), +and deallocation of objects which die young is basically for free. More +information about the PyPy GC is available here.

                              +As we said, the total cost of memory management is less on PyPy than on +CPython, and it's one of the reasons why PyPy is so fast. However, one big +disadvantage is that while on CPython the cost of memory management is spread +all over the execution of the program, on PyPy it is concentrated into GC +runs, causing observable pauses which interrupt the execution of the user +program.
                              +To avoid excessively long pauses, the PyPy GC has been using an incremental +strategy since 2013. The GC runs as a series of "steps", letting the user +program progress between each step.

                              +The following chart shows the behavior of a real-world, long-running process:
                              + +
                              +

                              +The orange line shows the total memory used by the program, which +increases linearly while the program progresses. Every ~5 minutes, the GC +kicks in and the memory usage drops from ~5.2GB to ~2.8GB (this ratio is controlled +by the PYPY_GC_MAJOR_COLLECT env variable).
                              +The purple line shows aggregated data about the GC timing: the whole +collection takes ~1400 individual steps over the course of ~1 minute: each +point represents the maximum time a single step took during the past 10 +seconds. Most steps take ~10-20 ms, although we see a horrible peak of ~100 ms +towards the end. We have not investigated yet what it is caused by, but we +suspect it is related to the deallocation of raw objects.

                              +These multi-millisecond pauses are a problem for systems where it is important +to respond to certain events with a latency which is both low and consistent. +If the GC kicks in at the wrong time, it might cause unacceptable pauses during +the collection cycle.

                              +Let's look again at our real-world example. This is a system which +continuously monitors an external stream; when a certain event occurs, we want +to take an action. The following chart shows the maximum time it takes to +complete one of such actions, aggregated every minute:

                              + +
                              +
                              +You can clearly see that the baseline response time is around ~20-30 +ms. However, we can also see periodic spikes around ~50-100 ms, with peaks up +to ~350-450 ms! After a bit of investigation, we concluded that most (although +not all) of the spikes were caused by the GC kicking in at the wrong time.

                              +The work I did in the gc-disable branch aims to fix this problem by +introducing two new features to the gc module:
                              +
                                +
                              • +gc.disable(), which previously only inhibited the execution of +finalizers without actually touching the GC, now disables the GC major +collections. After a call to it, you will see the memory usage grow +indefinitely.
                              • +
                              • +gc.collect_step() is a new function which you can use to manually +execute a single incremental GC collection step.
                              • +
                              +
                              +It is worth specifying that gc.disable() disables only the major +collections, while minor collections still run. Moreover, thanks to the +JIT's virtuals, many objects with a short and predictable lifetime are not +allocated at all. The end result is that most objects with short lifetime are +still collected as usual, so the impact of gc.disable() on memory growth +is not as bad as it could sound.

                              +Combining these two functions, it is possible to take control of the GC to +make sure it runs only when it is acceptable to do so. For an example of +usage, you can look at the implementation of a custom GC inside pypytools. +The peculiarity is that it also defines a "with nogc():" context manager +which you can use to mark performance-critical sections where the GC is not +allowed to run.

                              +The following chart compares the behavior of the default PyPy GC and the new +custom GC, after a careful placing of nogc() sections:

                              + +
                              +
                              +The yellow line is the same as before, while the purple line shows the new +system: almost all spikes have gone, and the baseline performance is about 10% +better. There is still one spike towards the end, but after some investigation +we concluded that it was not caused by the GC.

                              +Note that this does not mean that the whole program became magically +faster: we simply moved the GC pauses in some other place which is not +shown in the graph: in this specific use case this technique was useful +because it allowed us to shift the GC work in places where pauses are more +acceptable.

                              +All in all, a pretty big success, I think. These functionalities are already +available in the nightly builds of PyPy, and will be included in the next +release: take this as a New Year present :)

                              +Antonio Cuni and the PyPy team +
                              +

                              Comments

                              +
                              +
                              +
                              + + stuaxo wrote on 2019-01-08 18:47: +
                              +
                              +

                              Could see this being handy for python game libraries too.

                              +
                              +
                              +
                              +
                              + + samantha wrote on 2019-01-08 22:40: +
                              +
                              +

                              I am a bit surprised as these functions have been available for a long time in python gc module. So I suppose the news is a better performing one in pypy?

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2019-01-09 02:46: +
                              +
                              +

                              @samantha: ``gc.collect_step()`` is new.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/02/dusseldorf-sprint-report-2019-6107623654916313905.html b/posts/2019/02/dusseldorf-sprint-report-2019-6107623654916313905.html new file mode 100644 index 000000000..7327c09c6 --- /dev/null +++ b/posts/2019/02/dusseldorf-sprint-report-2019-6107623654916313905.html @@ -0,0 +1,385 @@ + + + + + +Düsseldorf Sprint Report 2019 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Düsseldorf Sprint Report 2019

                              + + + +
                              +

                              Hello everyone!

                              +

                              We are happy to report a successful and well attended sprint that is wrapping up +in Düsseldorf, Germany. In the last week we had eighteen people sprinting +at the Heinrich-Heine-Universität Düsseldorf on various topics.

                              +
                              +

                              Totally serious work going on here constantly.

                              +
                              +

                              A big +chunk of the sprint was dedicated to various discussions, since we did not +manage to gather the core developers in one room in quite a while. +Discussion topics included:

                              +
                                +
                              • Funding and general sustainability of open source.
                              • +
                              • Catching up with CPython 3.7/3.8 – we are planning to release 3.6 some time +in the next few months and we will continue working on 3.7/3.8.
                              • +
                              • What to do with VMprof
                              • +
                              • How can we support Cython inside PyPy in a way that will be understood +by the JIT, hence fast.
                              • +
                              • The future of supporting the numeric stack on pypy – we have made significant +progress in the past few years and most of the numeric stack works out of the box, +but deployment and performance remain problems. Improving on those problems +remains a very important focus for PyPy as a project.
                              • +
                              • Using the presence of a CPython developer (Łukasz Langa) and a Graal Python developer +(Tim Felgentreff) we discussed ways to collaborate in order to improve the Python +ecosystem across implementations.
                              • +
                              • Pierre-Yves David and Georges Racinet from octobus gave us an exciting demo +on Heptapod, which adds mercurial support to gitlab.
                              • +
                              • Maciej and Armin gave demos of their current (non-PyPy-related) project VRSketch.
                              • +
                              +
                              + +

                              Visiting the Landschaftspark Duisburg Nord on the break day

                              +
                              + +

                              Some highlights of the coding tasks worked on:

                              +
                                +
                              • Aarch64 (ARM64) JIT backend work has been started, we are able to run the first +test! Tobias Oberstein from Crossbar GmbH and Rodolph Perfetta from ARM joined the +sprint to help kickstart the project.
                              • +
                              • The long running math-improvements branch that was started by Stian Andreassen got merged +after bugfixes done by Alexander Schremmer. It should improve operations on large integers.
                              • +
                              • The arcane art of necromancy was used to revive the long-dormant regalloc branch started +and nearly finished by Carl Friedrich Bolz-Tereick. The branch got merged and gives +some modest speedups across the board.
                              • +
                              • Andrew Lawrence worked on an MSI installer for PyPy on Windows.
                              • +
                              • Łukasz worked on improving failing tests on the PyPy 3.6 branch. He knows very obscure +details of CPython (e.g. how pickling works), hence we managed to progress very quickly.
                              • +
                              • Matti Picus set up a new benchmarking server for PyPy 3 branches.
                              • +
                              • The Utf8 branch, which changes the internal representation of unicode, might finally be +merged at some point very soon. We discussed and improved upon the last few +blockers. It gives significant speedups in a lot of cases handling strings.
                              • +
                              • Zlib was missing a couple of methods, which were added by Ronan Lamy and Julian Berman.
                              • +
                              • Manuel Jacob fixed RevDB failures.
                              • +
                              • Antonio Cuni and Matti Picus worked on the 7.0 release, which should happen in a few days.
                              • +
                              +

                              Now we are all quite exhausted, and are looking forward to catching up on sleep.

                              +

                              Best regards, +Maciej Fijałkowski, Carl Friedrich Bolz-Tereick and the whole PyPy team.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Juan Luis Cano wrote on 2019-02-09 18:19: +
                              +
                              +

                              Congratulations for the sprint, folks! Any plans to leverage the manylinux2010 infrastructure and about producing PyPy compatible wheels soon?

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-02-10 15:29: +
                              +
                              +

                              Nice work, looking forward to Python 3.6 and beyond! Is there anywhere to view the Python 3 benchmarks like there is for PyPy2?

                              +
                              +
                              +
                              +
                              + + Carl Friedrich Bolz-Tereick wrote on 2019-02-11 08:22: +
                              +
                              +

                              Hi Juan! Yes, we are going to work on manylinux2010 support to have PyPy wheels soon.

                              +
                              +
                              +
                              +
                              + + Carl Friedrich Bolz-Tereick wrote on 2019-02-11 08:24: +
                              +
                              +

                              @Anonymous yes, being able to view PyPy3 benchmarking results is the goal of the new benchmarking server, will still take a bit of work to hook everything up.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/02/pypy-v700-triple-release-of-27-35-and-606875333356156076.html b/posts/2019/02/pypy-v700-triple-release-of-27-35-and-606875333356156076.html new file mode 100644 index 000000000..a6a08e5b7 --- /dev/null +++ b/posts/2019/02/pypy-v700-triple-release-of-27-35-and-606875333356156076.html @@ -0,0 +1,402 @@ + + + + + +PyPy v7.0.0: triple release of 2.7, 3.5 and 3.6-alpha | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.0.0: triple release of 2.7, 3.5 and 3.6-alpha

                              + + + +
                              +
                              +The PyPy team is proud to release the version 7.0.0 of PyPy, which includes +three different interpreters:
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7
                              • +
                              • PyPy3.5, which supports Python 3.5
                              • +
                              • PyPy3.6-alpha: this is the first official release of PyPy to support 3.6 +features, although it is still considered alpha quality.
                              • +
                              +
                              +All the interpreters are based on much the same codebase, thus the triple +release.
                              +Until we can work with downstream providers to distribute builds with PyPy, we +have made builds of some common packages available as wheels.
                              +The GC hooks, which can be used to gain more insights into GC +performance, have been improved and it is now possible to manually manage the +GC by using a combination of gc.disable and gc.collect_step. See the +GC blog post.
                              +We updated the cffi module included in PyPy to version 1.12, and the +cppyy backend to 1.4. Please use these to wrap your C and C++ code, +respectively, for a JIT friendly experience.
                              +As always, this release is 100% compatible with the previous one and fixed +several issues and bugs raised by the growing community of PyPy users. +We strongly recommend updating.
                              +The PyPy3.6 release and the Windows PyPy3.5 release are still not production +quality so your mileage may vary. There are open issues with incomplete +compatibility and c-extension support.
                              +The utf8 branch that changes internal representation of unicode to utf8 did not +make it into the release, so there is still more goodness coming. +You can download the v7.0 releases here:
                              +https://pypy.org/download.html +
                              +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.
                              +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython's JIT even better.
                              +

                              +What is PyPy?

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.5 and 3.6. It's fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.
                              +We also welcome developers of other dynamic languages to see what RPython +can do for them.
                              +The PyPy release supports:
                              +
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +Unfortunately at the moment of writing our ARM buildbots are out of service, +so for now we are not releasing any binary for the ARM architecture.
                              +
                              +

                              +What else is new?

                              +PyPy 6.0 was released in April, 2018. +There are many incremental improvements to RPython and PyPy, the complete listing is here.

                              +Please update, and continue to help us make PyPy better.


                              +Cheers, The PyPy team +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2019-02-11 20:18: +
                              +
                              +
                              I would be very happy, if at some point request-html would work. Thank you for your great work.


                              cheers
                              Rob +
                              +
                              +
                              +
                              + + Carl Friedrich Bolz-Tereick wrote on 2019-02-11 22:06: +
                              +
                              +

                              @Rob can you please file an issue with how we can reproduce the problem?

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-02-15 13:54: +
                              +
                              +

                              requests-html seems to work with pypy 3.6 -v7.0, but the normal requests not.


                              This Code works with cpython

                              from requests_html import HTMLSession
                              import requests

                              def get_url():
                                  session = HTMLSession()
                                  #r = session.get('https://www.kernel.org/', verify='kernel.org.crt')
                                  r = session.get('https://www.kernel.org/')
                                  url = r.html.xpath('//*[@id="latest_link"]/a/@href')
                                  return url[0]

                              def download():
                                  with open('last_stable_kernel.txt', 'rt') as last_kernel:
                                      last_kernel = last_kernel.read()
                                  url = get_url()
                                  if url != last_kernel:
                                      print('New kernel found !!!\n')
                                      print('Downloading from this url: \n' + url )
                                      res = requests.get(url, stream = True)
                                      if res.status_code == requests.codes.ok: # Check the download
                                          print('Download complete\n')
                                      print('Writing file to disk.')
                                      kernel = open('latest_kernel.tar.xz', 'wb')
                                      for file in res.iter_content(1024):
                                          kernel.write(file)
                                      kernel.close()
                                      with open('last_stable_kernel.txt','wt') as last_kernel:
                                          last_kernel.write(url)
                                      return True

                                  else:
                                      print('I have allready the newest kernel !')
                                      return False

                              if __name__ == "__main__":
                              download()

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-02-15 14:01: +
                              +
                              +

                              The pybench2.0 looks good. (except string mapping)


                              Test minimum average operation overhead
                              -------------------------------------------------------------------------------
                              BuiltinFunctionCalls: 0ms 5ms 0.01us 0.005ms
                              BuiltinMethodLookup: 0ms 1ms 0.00us 0.006ms
                              CompareFloats: 0ms 1ms 0.00us 0.005ms
                              CompareFloatsIntegers: 0ms 1ms 0.00us 0.003ms
                              CompareIntegers: 0ms 1ms 0.00us 0.007ms
                              CompareInternedStrings: 0ms 1ms 0.00us 0.023ms
                              CompareLongs: 0ms 1ms 0.00us 0.004ms
                              CompareStrings: 0ms 0ms 0.00us 0.016ms
                              ComplexPythonFunctionCalls: 12ms 14ms 0.07us 0.007ms
                              ConcatStrings: 0ms 1ms 0.00us 0.017ms
                              CreateInstances: 8ms 12ms 0.11us 0.013ms
                              CreateNewInstances: 8ms 13ms 0.16us 0.012ms
                              CreateStringsWithConcat: 0ms 1ms 0.00us 0.014ms
                              DictCreation: 11ms 13ms 0.03us 0.005ms
                              DictWithFloatKeys: 48ms 50ms 0.06us 0.010ms
                              DictWithIntegerKeys: 10ms 11ms 0.01us 0.016ms
                              DictWithStringKeys: 11ms 13ms 0.01us 0.016ms
                              ForLoops: 3ms 7ms 0.28us 0.003ms
                              IfThenElse: 0ms 1ms 0.00us 0.012ms
                              ListSlicing: 22ms 24ms 1.69us 0.004ms
                              NestedForLoops: 9ms 10ms 0.01us 0.002ms
                              NestedListComprehensions: 8ms 11ms 0.92us 0.002ms
                              NormalClassAttribute: 5ms 6ms 0.01us 0.011ms
                              NormalInstanceAttribute: 4ms 5ms 0.00us 0.022ms
                              PythonFunctionCalls: 0ms 2ms 0.01us 0.007ms
                              PythonMethodCalls: 59ms 66ms 0.29us 0.012ms
                              Recursion: 6ms 7ms 0.15us 0.009ms
                              SecondImport: 65ms 74ms 0.74us 0.003ms
                              SecondPackageImport: 67ms 70ms 0.70us 0.003ms
                              SecondSubmoduleImport: 89ms 92ms 0.92us 0.004ms
                              SimpleComplexArithmetic: 0ms 1ms 0.00us 0.007ms
                              SimpleDictManipulation: 12ms 16ms 0.01us 0.008ms
                              SimpleFloatArithmetic: 0ms 1ms 0.00us 0.010ms
                              SimpleIntFloatArithmetic: 0ms 1ms 0.00us 0.010ms
                              SimpleIntegerArithmetic: 0ms 1ms 0.00us 0.010ms
                              SimpleListComprehensions: 6ms 9ms 0.72us 0.003ms
                              SimpleListManipulation: 3ms 5ms 0.00us 0.011ms
                              SimpleLongArithmetic: 0ms 1ms 0.00us 0.007ms
                              SmallLists: 3ms 4ms 0.01us 0.007ms
                              SmallTuples: 0ms 1ms 0.00us 0.007ms
                              SpecialClassAttribute: 5ms 6ms 0.01us 0.011ms
                              SpecialInstanceAttribute: 4ms 5ms 0.00us 0.022ms
                              StringMappings: 838ms 846ms 3.36us 0.017ms
                              StringPredicates: 5ms 6ms 0.01us 0.144ms
                              StringSlicing: 0ms 1ms 0.00us 0.019ms
                              TryExcept: 0ms 0ms 0.00us 0.012ms
                              TryFinally: 0ms 2ms 0.01us 0.007ms
                              TryRaiseExcept: 0ms 1ms 0.01us 0.009ms
                              TupleSlicing: 36ms 38ms 0.15us 0.003ms
                              WithFinally: 0ms 2ms 0.01us 0.007ms
                              WithRaiseExcept: 0ms 1ms 0.02us 0.013ms
                              -------------------------------------------------------------------------------
                              Totals: 1359ms 1461ms



                              Best regards
                              Rob

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/03/pypy-v71-released-now-uses-utf-8-451324088028792912.html b/posts/2019/03/pypy-v71-released-now-uses-utf-8-451324088028792912.html new file mode 100644 index 000000000..2897a6fac --- /dev/null +++ b/posts/2019/03/pypy-v71-released-now-uses-utf-8-451324088028792912.html @@ -0,0 +1,448 @@ + + + + + +PyPy v7.1 released; now uses utf-8 internally for unicode strings | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.1 released; now uses utf-8 internally for unicode strings

                              + + + +
                              +
                              +The PyPy team is proud to release version 7.1.0 of PyPy, which includes +two different interpreters:
                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7
                              • +
                              • PyPy3.6-beta: this is the second official release of PyPy to support 3.6 +features, although it is still considered beta quality.
                              • +
                              +
                              +
                              +The interpreters are based on much the same codebase, thus the double +release.

                              +This release, coming fast on the heels of 7.0 in February, finally merges the +internal refactoring of unicode representation as UTF-8. Removing the +conversions from strings to unicode internally led to a nice speed bump. We merged the utf-8 changes to the py3.5 branch (Python3.5.3) but will concentrate on 3.6 going forward.

                              +We also improved the ability to use the buffer protocol with ctype structures +and arrays.

                              +The CFFI backend has been updated to version 1.12.2. We recommend using CFFI +rather than c-extensions to interact with C, and cppyy for interacting with +C++ code.
                              + You can download the v7.1 releases here:
                              + +
                              +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.

                              +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython’s JIT even better.
                              +

                              +What is PyPy? +

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                              +We also welcome developers of other dynamic languages to see what RPython +can do for them.
                              + +This PyPy release supports:
                                +
                              +
                              +
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • big- and little-endian variants of PPC64 running Linux
                              • +
                              •  ARM32 although we do not supply downloadable binaries at this time
                              • +
                              • +s390x running Linux
                              • +
                              +
                              +

                              +What else is new? +

                              +PyPy 7.0 was released in February, 2019. +There are many incremental improvements to RPython and PyPy, for more information see the changelog.

                              +Please update, and continue to help us make PyPy better.


                              +Cheers, The PyPy team +
                              +
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2019-03-28 09:52: +
                              +
                              +

                              Hi,

                              I get this error when trying to run my app with the new PyPy release (pypy 2.7 syntax on Windows):

                              'C:\pypy2\lib_pypy\_sqlite3_cffi.pypy-41.pyd': The specified module could not be found


                              The file specified in the error message (\lib_pypy\_sqlite3_cffi.pypy-41.pyd) is in the folder so whatever is missing is not quite so obvious.

                              +
                              +
                              +
                              +
                              + + Noah F. San Tsorvutz wrote on 2019-03-29 14:27: +
                              +
                              +

                              One question about using utf8 text encoding, internally.

                              Is text handling code much different now, in PyPy, vs. cPython?

                              If handling characters ( code points ) within the ASCII range
                              is more like Python v.2.x, that would be very good news to
                              at least one old fart who is having trouble even treating
                              print as a function ...

                              Thanks!

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2019-03-31 08:00: +
                              +
                              +

                              @Noah The answer is complicated because CPython changed its internals more than once. The current CPython 3.x stores unicode strings as an array of same-sized characters; if your string contains even one character over 0xffff then it's an array of 4 bytes for all the characters. Sometimes CPython *also* caches the UTF8 string, but doesn't use it much. The new PyPy is very different: it uses the UTF8 string *only*, and it works for both PyPy 2.7 or 3.x.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2019-03-31 08:04: +
                              +
                              +

                              @Anonymous It works for me. Please open a bug report on https://bugs.pypy.org and give more details...

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-03-31 12:09: +
                              +
                              +

                              Hi Armin,

                              I can't log in to bugs.pypy.org but the problem is very easy to replicate, you only need to test this and it fails (v6.0.0 works fine but both v7.0.0 and 7.1.0 fail):

                              try:
                              import sqlite3
                              except Exception as e:
                              print str(e)

                              The error is:
                              'C:\pypy27v710\lib_pypy\_sqlite3_cffi.pypy-41.pyd': The specified module could not be found

                              I've tested it on two different Win10 PCs (32bit PyPy on 64bit Win10) and both exhibit the same behaviour.

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2019-03-31 16:29: +
                              +
                              +

                              It is not so easy, because it works fine for me (win10 too). Please file a regular bug report. If you can't then we have another problem to solve first...

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-03-31 18:06: +
                              +
                              +

                              Hi Armin,

                              I've got the answer: With PyPy version >= 7.0.0 you have to add PyPy's root folder to PATH in Environment Variables, that wasn't required with versions <= 6.0.0

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2019-04-01 08:15: +
                              +
                              +

                              https://foss.heptapod.net/pypy/pypy/-/issues/2988/windows-cant-find-_sqlite3_cffipypy-41pyd

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-04-02 19:10: +
                              +
                              +

                              Hi Armin,

                              Moving the dlls to lib_pypy is a nice easy workaround, thank you.

                              And thanks to everybody in the PyPy team for their excellent work.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/04/an-rpython-jit-for-lpegs-4779548053359386284.html b/posts/2019/04/an-rpython-jit-for-lpegs-4779548053359386284.html new file mode 100644 index 000000000..973f11847 --- /dev/null +++ b/posts/2019/04/an-rpython-jit-for-lpegs-4779548053359386284.html @@ -0,0 +1,668 @@ + + + + + +An RPython JIT for LPegs | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              An RPython JIT for LPegs

                              + + + +
                              +

                              The following is a guest post by Stefan Troost, he describes the work he did in his bachelor thesis:

                              + +

                              In this project we have used the RPython infrastructure to generate an RPython +JIT for a +less-typical use-case: string pattern matching. The work in this project is +based on Parsing Expression Grammars and +LPeg, an implementation of PEGs +designed to be used in Lua. In this post I will showcase some of the work that +went into this project, explain PEGs in general and LPeg in particular, and +show some benchmarking results.

                              +

                              +Parsing Expression Grammars

                              +

                              Parsing Expression Grammars (PEGs) are a type of formal grammar similar to +context-free grammars, with the main difference being that they are unambiguous. +This is achieved by redefining the ambiguous choice operator of CFGs (usually +noted as |) as an ordered choice operator. In practice this means that if a +rule in a PEG presents a choice, a PEG parser should prioritize the leftmost +choice. Practical uses include parsing and pattern-searching. In comparison to +regular expressions PEGs stand out as being able to be parsed in linear time, +being strictly more powerful than REs, as well as being arguably more readable.

                              +

                              +LPeg

                              +

                              LPeg is an implementation of PEGs written in C to be used in the Lua +programming language. A crucial detail of this implementation is that it parses +high level function calls, translating them to bytecode, and interpreting that +bytecode. Therefore, we are able to improve that implementation by replacing +LPeg's C-interpreter with an RPython JIT. I use a modified version of LPeg to +parse PEGs and pass the generated Intermediate Representation, the LPeg +bytecode, to my VM.

                              +

                              +The LPeg Library

                              +

                              The LPeg Interpreter executes bytecodes created by parsing a string of commands +using the LPeg library. Our JIT supports a subset of the LPeg library, with +some of the more advanced or obscure features being left out. Note that this +subset is still powerful enough to do things like parse JSON.

                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              OperatorDescription
                              lpeg.P(string)Matches string literally
                              lpeg.P(n)Matches exactly n characters
                              lpeg.P(-n)Matches at most n characters
                              lpeg.S(string)Matches any character in string (Set)
                              lpeg.R(“xy”)Matches any character between x and y (Range)
                              pattern^nMatches at least n repetitions of pattern
                              pattern^-nMatches at most n repetitions of pattern
                              pattern1 * pattern2Matches pattern1 followed by pattern2
                              pattern1 + pattern2Matches pattern1 or pattern2 (ordered choice)
                              pattern1 - pattern2Matches pattern1 if pattern2 does not match
                              -patternEquivalent to ("" - pattern)
                              +

                              As a simple example, the pattern lpeg.P"ab"+lpeg.P"cd" would match either the +string ab or the string cd.

                              +

                              To extract semantic information from a pattern, captures are needed. These are +the following operations supported for capture creation.

                              + + + + + + + + + + + + + + + +
                              OperationWhat it produces
                              lpeg.C(pattern)the match for pattern plus all captures made by pattern
                              lpeg.Cp()the current position (matches the empty string)
                              +

                              (tables taken from the LPeg documentation)

                              +

                              These patterns are translated into bytecode by LPeg, at which point we are able +to pass them into our own VM.

                              +

                              +The VM

                              +

                              The state of the VM at any point is defined by the following variables:

                              +
                                +
                              • +PC: program counter indicating the current instruction
                              • +
                              • +fail: an indicator that some match failed and the VM must backtrack
                              • +
                              • +index: counter indicating the current character of the input string
                              • +
                              • +stackentries: stack of return addresses and choice points
                              • +
                              • +captures: stack of capture objects
                              • +
                              +

                              The execution of bytecode manipulates the values of these variables in order to +produce some output. How that works and what that output looks like will be +explained now.

                              +

                              +The Bytecode

                              +

                              For simplicity’s sake I will not go over every individual bytecode, but instead +choose some that exemplify the core concepts of the bytecode set.

                              +

                              +generic character matching bytecodes

                              +
                                +
                              • +

                                any: Checks if there are any characters left in the inputstring. If it succeeds +it advances the index and PC by 1, if not the bytecode fails.

                                +
                              • +
                              • +

                                char c: Checks if there is another character in the input and if that +character is equal to c. Otherwise the bytecode fails.

                                +
                              • +
                              • +

                                set c1-c2: Checks if there is another character in the input and if that +character is between (including) c1 and c2. Otherwise the bytecode fails.

                                +
                              • +
                              +

                              These bytecodes are the easiest to understand with very little impact on the +VM. What it means for a bytecode to fail will be explained when +we get to control flow bytecodes.

                              +

                              To get back to the example, the first half of the pattern lpeg.P"ab" could be +compiled to the following bytecodes:

                              +
                              char a
                              +char b
                              +
                              +

                              +control flow bytecodes

                              +
                                +
                              • +

                                jmp n: Sets PC to n, effectively jumping to the n’th bytecode. Has no defined +failure case.

                                +
                              • +
                              • +

                                testchar c n: This is a lookahead bytecode. If the current character is equal +to c it advances the PC but not the index. Otherwise it jumps to n.

                                +
                              • +
                              • +

                                call n: Puts a return address (the current PC + 1) on the stackentries stack +and sets the PC to n. Has no defined failure case.

                                +
                              • +
                              • +

                                ret: Opposite of call. Removes the top value of the stackentries stack (if +the string of bytecodes is valid this will always be a return address) and +sets the PC to the removed value. Has no defined failure case.

                                +
                              • +
                              • +

                                choice n: Puts a choice point on the stackentries stack. Has no defined +failure case.

                                +
                              • +
                              • +

                                commit n: Removes the top value of the stackentries stack (if the string of +bytecodes is valid this will always be a choice point) and jumps to n. Has no +defined failure case.

                                +
                              • +
                              +

                              Using testchar we can implement the full pattern lpeg.P"ab"+lpeg.P"cd" with +bytecode as follows:

                              +
                              testchar a -> L1
                              +any
                              +char b
                              +end
                              +any
                              +L1: char c
                              +char d
                              +end
                              +
                              +

                              The any bytecode is needed because testchar does not consume a character +from the input.

                              +

                              +Failure Handling, Backtracking and Choice Points

                              +

                              A choice point consists of the VM’s current index and capturestack as well as a +PC. This is not the VM’s PC at the time of creating the +choicepoint, but rather the PC where we should continue trying to find +matches when a failure occurs later.

                              +

                              Now that we have talked about choice points, we can talk about how the VM +behaves in the fail state. If the VM is in the fail state, it removes entries +from the stackentries stack until it finds a choice point. Then it backtracks +by restoring the VM to the state defined by the choice point. If no choice +point is found this way, no match was found in the string and the VM halts.

                              +

                              Using choice points we could implement the example lpeg.P"ab" + lpeg.P"cd" in +bytecodes in a different way (LPEG uses the simpler way shown above, but for +more complex patterns it can’t use the lookahead solution using testchar):

                              +
                              choice L1
                              +char a
                              +char b
                              +commit
                              +end
                              +L1: char c
                              +char d
                              +end
                              +
                              +

                              +Captures

                              +

                              Some patterns require the VM to produce more output than just “the pattern +matched” or “the pattern did not match”. Imagine searching a document for an +IPv4 address and all your program responded was “I found one”. In order to +receive additional information about our inputstring, captures are used.

                              +

                              +The capture object

                              +

                              In my VM, two types of capture objects are supported, one of them being the +position capture. It consists of a single index referencing the point in the +inputstring where the object was created.

                              +

                              The other type of capture object is called simplecapture. It consists of an +index and a size value, which are used to reference a substring of the +inputstring. In addition, simplecaptures have a variable status indicating they +are either open or full. If a simplecapture object is open, that means that its +size is not yet determined, since the pattern we are capturing is of variable +length.

                              +

                              Capture objects are created using the following bytecodes:

                              +
                                +
                              • +

                                Fullcapture Position: Pushes a positioncapture object with the current index +value to the capture stack.

                                +
                              • +
                              • +

                                Fullcapture Simple n: Pushes a simplecapture object with current index value +and size=n to the capture stack.

                                +
                              • +
                              • +

                                Opencapture Simple: Pushes an open simplecapture object with current index +value and undetermined size to the capture stack.

                                +
                              • +
                              • +

                                closecapture: Sets the top element of the capturestack to full and sets its +size value using the difference between the current index and the index of +the capture object.

                                +
                              • +
                              +

                              +The RPython Implementation

                              +

                              These, and many more bytecodes were implemented in an RPython-interpreter. +By adding jit hints, we were able to generate an efficient JIT. +We will now take a closer look at some implementations of bytecodes.

                              +
                              ...
                              +        elif instruction.name == "any":
                              +            if index >= len(inputstring):
                              +                fail = True
                              +            else:
                              +                pc += 1
                              +                index += 1
                              +
                              +...
                              +
                              +

                              The code for the any-bytecode is relatively straight-forward. It either +advances the pc and index or sets the VM into the fail state, +depending on whether the end of the inputstring has been reached or not.

                              +
                              ...
                              +        if instruction.name == "char":
                              +            if index >= len(inputstring):
                              +                fail = True
                              +            elif instruction.character == inputstring[index]:
                              +                pc += 1
                              +                index += 1
                              +            else:
                              +                fail = True
                              +...
                              +
                              +

                              The char-bytecode also looks as one would expect. If the VM’s string index is +out of range or the character comparison fails, the VM is put into the +fail state, otherwise the pc and index are advanced by 1. As you can see, the +character we’re comparing the current inputstring to is stored in the +instruction object (note that this code-example has been simplified for +clarity, since the actual implementation includes a jit-optimization that +allows the VM to execute multiple successive char-bytecodes at once).

                              +
                              ...
                              +        elif instruction.name == "jmp":
                              +            pc = instruction.goto
                              +...
                              +
                              +

                              The jmp-bytecode comes with a goto value which is a pc that we want +execution to continue at.

                              +
                              ...
                              +        elif instruction.name == "choice":
                              +            pc += 1
                              +            choice_points = choice_points.push_choice_point(
                              +                instruction.goto, index, captures)
                              +...
                              +
                              +

                              As we can see here, the choice-bytecode puts a choice point onto the stack that +may be backtracked to if the VM is in the fail-state. This choice point +consists of a pc to jump to which is determined by the bytecode. +But it also includes the current index and captures values at the time the choice +point was created. An ongoing topic of jit optimization is which data structure +is best suited to store choice points and return addresses. Besides naive +implementations of stacks and single-linked lists, more case-specific +structures are also being tested for performance.

                              +

                              +Benchmarking Result

                              +

                              In order to find out how much it helps to JIT LPeg patterns we ran a small +number of benchmarks. We used an otherwise idle Intel Core i5-2430M CPU with +3072 KiB of cache and 8 GiB of RAM, running with 2.40GHz. The machine was +running Ubuntu 14.04 LTS, Lua 5.2.3 and we used GNU grep 2.16 as a point of +comparison for one of the benchmarks. The benchmarks were run 100 times in +a new process each. We measured the full runtime of the called process, +including starting the process.

                              +

                              Now we will take a look at some plots generated by measuring the runtime of +different iterations of my JIT compared to lua and using bootstrapping to +generate a sampling distribution of mean values. The plots contain a few different +variants of pypeg, only the one called "fullops" is important for this blog post, however.

                              + +
                              + +

                              This is the plot for a search pattern that searches a text file for valid URLs. +As we can see, if the input file is as small as 100 kb, the benefits of JIT +optimizations do not outweigh the time required to generate the +machine code. As a result, all of our attempts perform significantly slower +than LPeg.

                              + +
                              + +

                              This is the plot for the same search pattern on a larger input file. As we can +see, for input files as small as 500 kb our VM already outperforms LPeg’s. An +ongoing goal of continued development is to get this lower boundary as small as +possible.

                              + +
                              + +

                              The benefits of a JIT compared to an Interpreter become more and more relevant +for larger input files. Searching a file as large as 5 MB makes this fairly +obvious and is exactly the behavior we expect.

                              + +
                              + +

                              This time we are looking at a different more complicated pattern, one that parses JSON used on a +50 kb input file. As expected, LPeg outperforms us, however, something +unexpected happens as we increase the filesize.

                              + +
                              + +

                              Since LPeg has a defined maximum depth of 400 for the choicepoints and +returnaddresses Stack, LPeg by default refuses to parse files as small as +100kb. This raises the question if LPeg was intended to be used for parsing. +Until a way to increase LPeg’s maximum stack depth is found, no comparisons to +LPeg can be performed at this scale. This has been a low priority in the past +but may be addressed in the future.

                              +

                              To conclude, we see that at sufficiently high filesizes, our JIT outperforms +the native LPeg-interpreter. This lower boundary is currently as low as 100kb +in filesize.

                              +

                              +Conclusion

                              +

                              Writing a JIT for PEGs has proven itself to be a challenge worth pursuing, as +the expected benefits of a JIT compared to an Interpreter have been achieved. +Future goals include getting LPeg to be able to use parsing patterns on larger +files, further increasing the performance of our JIT and comparing it to other +well-known programs serving a similar purpose, like grep.

                              +

                              The prototype implementation that I described in this post can be found +on Github +(it's a bit of a hack in some places, though).

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/04/pypy-711-bug-fix-release-6539023630991217367.html b/posts/2019/04/pypy-711-bug-fix-release-6539023630991217367.html new file mode 100644 index 000000000..f8e1f70e0 --- /dev/null +++ b/posts/2019/04/pypy-711-bug-fix-release-6539023630991217367.html @@ -0,0 +1,319 @@ + + + + + +PyPy 7.1.1 Bug Fix Release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy 7.1.1 Bug Fix Release

                              + + + +
                              +
                              +The PyPy team is proud to release a bug-fix release version 7.1.1 of PyPy, which +includes two different interpreters:
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.
                              • +
                              • PyPy3.6-beta: the second official release of PyPy to support 3.6 +features.
                              • +
                              +
                              +
                              +
                              +
                              +The interpreters are based on much the same codebase, thus the double +release.

                              +This bugfix fixes bugs related to large lists, dictionaries, and sets, some corner cases with unicode, and PEP 3118 memory views of ctype structures. It also fixes a few issues related to the ARM 32-bit backend. For the complete list see the changelog.

                              +You can download the v7.1.1 releases here:
                              + +
                              +
                              +As always, this release is 100% compatible with the previous one and fixes +several issues and bugs raised by the growing community of PyPy users. +We strongly recommend updating.

                              +The PyPy3.6 release is rapidly maturing, but is still considered beta-quality.

                              +The PyPy team
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.html b/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.html new file mode 100644 index 000000000..07efb902f --- /dev/null +++ b/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.html @@ -0,0 +1,389 @@ + + + + + +PyPy JIT for Aarch64 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy JIT for Aarch64

                              + + + +
                              +
                              + +

                              Hello everyone.

                              +

                              We are pleased to announce the availability of the new PyPy for AArch64. This +port brings PyPy's high-performance just-in-time compiler to the AArch64 +platform, also known as 64-bit ARM. With the addition of AArch64, PyPy now +supports a total of 6 architectures: x86 (32 & 64bit), ARM (32 & 64bit), PPC64, +and s390x. The AArch64 work was funded by ARM Holdings Ltd. and Crossbar.io.

                              +

                              PyPy has a good record of boosting the performance of Python programs on the +existing platforms. To show how well the new PyPy port performs, we compare the +performance of PyPy against CPython on a set of benchmarks. As a point of +comparison, we include the results of PyPy on x86_64.

                              +

                              Note, however, that the results presented here were measured on a Graviton A1 +machine from AWS, which comes with a very serious word of warning: Graviton A1's +are virtual machines, and, as such, they are not suitable for benchmarking. If +someone has access to a beefy enough (16G) ARM64 server and is willing to give +us access to it, we are happy to redo the benchmarks on a real machine. One +major concern is that while a virtual CPU is 1-to-1 with a real CPU, it is not +clear to us how CPU caches are shared across virtual CPUs. Also, note that by no +means is this benchmark suite representative enough to average the results. Read +the numbers individually per benchmark.

                              +

                              The following graph shows the speedups on AArch64 of PyPy (hg id 2417f925ce94) compared to +CPython (2.7.15), as well as the speedups on a x86_64 Linux laptop +comparing the most recent release, PyPy 7.1.1, to CPython 2.7.16.

                              + +
                              + +

                              In the majority of benchmarks, the speedups achieved on AArch64 match those +achieved on the x86_64 laptop. Over CPython, PyPy on AArch64 achieves speedups +between 0.6x and 44.9x. These speedups are comparable to x86_64, where the +numbers are between 0.6x and 58.9x.

                              +

                              The next graph compares the speedups achieved on AArch64 to the speedups +achieved on x86_64, i.e., how great the speedup is on AArch64 vs. the same +benchmark on x86_64. This comparison should give a rough idea about the +quality of the generated code for the new platform.

                              + +
                              + +

                              Note that we see a large variance: There are generally three groups of +benchmarks - those that run at more or less the same speed, those that +run at 2x the speed, and those that run at 0.5x the speed of x86_64.

                              +

                              The variance and disparity are likely related to a variety of issues, mostly due +to differences in architecture. What is however interesting is that, compared +to measurements performed on older ARM boards, the branch predictor on the +Graviton A1 machine appears to have improved. As a result, the speedups achieved +by PyPy over CPython are smaller than on older ARM boards: sufficiently branchy +code, like CPython itself, simply runs a lot faster. Hence, the advantage +of the non-branchy code generated by PyPy's just-in-time compiler is smaller.

                              +

                              One takeaway here is that many possible improvements for PyPy have yet to be +implemented. This is true for both of the above platforms, but probably more so +for AArch64, which comes with a large number of CPU registers. The PyPy backend +was written with x86 (the 32-bit variant) in mind, which has a really low number +of registers. We think that we can improve in the area of emitting more modern +machine code, which may have a higher impact on AArch64 than on x86_64. There are +also a number of missing features in the AArch64 backend. These features are +currently implemented as expensive function calls instead of inlined native +instructions, something we intend to improve.

                              +

                              Best,

                              +

                              Maciej Fijalkowski, Armin Rigo and the PyPy team

                              + +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Unknown wrote on 2019-07-25 18:59: +
                              +
                              +

                              Hey - I can provide access to several flavors of beefy bare-metal arm64 hardware as part of the Works on Arm project, for your benchmark efforts.

                              +
                              +
                              +
                              +
                              + + Maciej Fijalkowski wrote on 2019-07-25 21:22: +
                              +
                              +

                              Awesome! Send me an email - fijall at gmail

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-07-29 15:57: +
                              +
                              +
                              Does this work well with pypy3 ? +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2019-07-29 20:02: +
                              +
                              +

                              Yes, it works with any RPython-based interpreter (including pypy2 and pypy3).

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/08/a-second-life-for-sandbox-6848726729476245390.html b/posts/2019/08/a-second-life-for-sandbox-6848726729476245390.html new file mode 100644 index 000000000..05f596302 --- /dev/null +++ b/posts/2019/08/a-second-life-for-sandbox-6848726729476245390.html @@ -0,0 +1,333 @@ + + + + + +A second life for the Sandbox | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              A second life for the Sandbox

                              + + + +
                              +

                              Hi all,

                              Anvil is a UK-based company sponsoring one month of work to revive PyPy's +"sandbox" mode and upgrade it to PyPy3. Thanks to them, sandboxing will be +given a second life!

                              +The sandboxed PyPy is a special version of PyPy that runs +fully isolated. It gives a safe way to execute arbitrary Python +programs (whole programs, not small bits of code inside your larger Python +program). Such scripts can be fully untrusted, and they can try to do +anything—there are no syntax-based restrictions, for example—but whatever +they do, any communication with the external world is not actually done but +delegated to the parent process. This is similar to, but much more flexible than, +Linux's Seccomp approach, and it is more lightweight than setting up a full +virtual machine. It also works without operating system support.

                              +However, during the course of the years the sandbox mode of PyPy has been +mostly unmaintained and unsupported by the core developers, mostly because of +a lack of interest by users and because it took too much effort to maintain +it.

                              +Now we have found that we have an actual user, Anvil. As far as I can tell +they are still using a very old version of PyPy, the last one that supported +sandboxing. This is where this contract comes from: the goal is to modernize sandboxing and port it to PyPy3.

                              +Part of my motivation for accepting this work is that I may have found a way to +tweak the protocol on the pipe between the sandboxed PyPy and the parent +controller process. This should make the sandboxed PyPy more resilient against +future developments and easier to maintain; at most, in the future some tweaks will be needed in the +controller process but hopefully not deep inside the guts of the sandboxed +PyPy. Among the advantages, such a more robust solution should mean that we +can actually get a working sandboxed PyPy—or sandboxed PyPy3 or sandboxed +version of any other interpreter written in RPython—with just an extra +argument when calling rpython to translate this interpreter. If everything +works as planned, sandboxing may be given a second life.

                              +Armin Rigo

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + mark wrote on 2020-03-16 11:10: +
                              +
                              +

                              Hi Armin,

                              I like your initiative a lot - I think it is very useful to have a safe execution environment for python scripts (a lot can be done, once this is achieved).
                              Please keep us updated about the state of development.
                              I am wondering, if it is already in a usable condition - descriptions diverge here.
                              Thanks, mark

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/10/pypy-v72-released-1090406556726313495.html b/posts/2019/10/pypy-v72-released-1090406556726313495.html new file mode 100644 index 000000000..35cda3327 --- /dev/null +++ b/posts/2019/10/pypy-v72-released-1090406556726313495.html @@ -0,0 +1,407 @@ + + + + + +PyPy v7.2 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.2 released

                              + + + +
                              +
                              +The PyPy team is proud to release the version 7.2.0 of PyPy, which includes +two different interpreters:
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.13
                              • +
                              +
                                +
                              • PyPy3.6: which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.9.
                              • +
                              +
                              +
                              +
                              +
                              +The interpreters are based on much the same codebase, thus the double +release.

                              +As always, this release is 100% compatible with the previous one and fixed +several issues and bugs raised by the growing community of PyPy users. +We strongly recommend updating. Many of the fixes are the direct result of +end-user bug reports, so please continue reporting issues as they crop up.

                              +You can download the v7.2 releases here:
                              + +
                              +With the support of Arm Holdings Ltd. and Crossbar.io, this release supports +the 64-bit aarch64 ARM architecture. More about the work and the +performance data around this welcome development can be found in the blog +post.

                              + +This release removes the “beta” tag from PyPy3.6. While there may still be some +small corner-case incompatibilities (around the exact error messages in +exceptions and the handling of faulty codec errorhandlers) we are happy with +the quality of the 3.6 series and are looking forward to working on a Python +3.7 interpreter.

                              +We updated our benchmark runner at https://speed.pypy.org to a more modern +machine and updated the baseline python to CPython 2.7.11. Thanks to Baroque +Software for maintaining the benchmark runner.

                              +The CFFI-based _ssl module was backported to PyPy2.7 and updated to use +cryptography version 2.7. Additionally, the _hashlib and crypt (or +_crypt on Python3) modules were converted to CFFI. This has two +consequences: end users and packagers can more easily update these libraries +for their platform by executing (cd lib_pypy; ../bin/pypy _*_build.py). +More significantly, since PyPy itself links to fewer system shared objects +(DLLs), on platforms with a single runtime namespace like Linux, different CFFI +and c-extension modules can load different versions of the same shared object +into PyPy without collision (issue 2617).

                              +Until downstream providers begin to distribute c-extension builds with PyPy, we +have made packages for some common packages available as wheels.

                              +The CFFI backend has been updated to version 1.13.0. We recommend using CFFI +rather than c-extensions to interact with C, and cppyy for interacting with +C++ code.

                              +Thanks to Anvil, we revived the PyPy Sandbox (soon to be released), which allows total control +over a Python interpreter’s interactions with the external world.

                              +We implemented a new JSON decoder that is much faster, uses less memory, and +uses a JIT-friendly specialized dictionary. More about that in the recent blog post.

                              +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. +
                              +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 27 new contributors, +so thanks for pitching in.

                              +

                              +What is PyPy? +

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                              +We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +This PyPy release supports:
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bit, Mac OS X 64-bit, Windows 32-bit, OpenBSD, FreeBSD)
                              • +
                              +
                              +
                              +
                                +
                              • big- and little-endian variants of PPC64 running Linux +
                              • +
                              +
                              +
                              +
                                +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +
                                +
                              • 64-bit ARM machines running Linux
                              • +
                              +
                              +
                              +
                              +
                              +Unfortunately at the moment of writing our ARM buildbots are out of service, +so for now we are not releasing any binary for the ARM architecture (32-bit), although PyPy does support ARM 32-bit processors.

                              +

                              +What else is new? +

                              +PyPy 7.1 was released in March, 2019. +There are many incremental improvements to RPython and PyPy. For more information about the 7.2.0 release, see the full changelog.

                              +Please update, and continue to help us make PyPy better.

                              +Cheers,
                              +The PyPy team +
                              +

                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/10/pypys-new-json-parser-492911724084305501.html b/posts/2019/10/pypys-new-json-parser-492911724084305501.html new file mode 100644 index 000000000..35a4701c0 --- /dev/null +++ b/posts/2019/10/pypys-new-json-parser-492911724084305501.html @@ -0,0 +1,862 @@ + + + + + +PyPy's new JSON parser | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy's new JSON parser

                              + + + +
                              +

                              +Introduction

                              +In the last year or two I have worked on and off on making PyPy's +JSON faster, particularly when parsing large +JSON files. In this post I am going to document those techniques and +measure their performance impact. Note that I am quite a lot more +constrained in what optimizations I can apply here, compared to some of +the much more advanced approaches like +Mison, +Sparser or +SimdJSON because I don't want to +change the json.loads API that Python programs expect, and because I +don't want to only support CPUs with wide SIMD extensions. With a more +expressive API, more optimizations would be possible.
                              +There are a number of problems of working with huge JSON files: +deserialization takes a long time on the one hand, and the resulting +data structures often take a lot of memory (usually they can be many +times bigger than the size of the file they originated from). Of course +these problems are related, because allocating and initializing a big +data structure takes longer than a smaller data structure. Therefore I +always tried to attack both of these problems at the same time.
                              +One common theme of the techniques I am describing is that of optimizing +the parser for how JSON files are typically used, not how they could +theoretically be used. This is a similar approach to the way dynamic +languages are optimized more generally: most JITs will optimize for +typical patterns of usage, at the cost of less common usage patterns, +which might even become slower as a result of the optimizations.

                              +Maps

                              +The first technique I investigated is to use maps in the JSON parser. +Maps, also called hidden classes or shapes, are a fairly common way to +(generally, not just in the context of JSON parsing) optimize instances +of +classes +in dynamic language VMs. Maps exploit the fact that while it is in +theory possible to add arbitrary fields to an instance, in practice most +instances of a class are going to have the same set of fields (or one of +a small number of different sets). Since JSON dictionaries or objects +often come from serialized instances of some kind, this property often +holds in JSON files as well: dictionaries often have the same fields in +the same order, within a JSON file.
                              +This property can be exploited in two ways: on the one hand, it can be +used to again store the deserialized dictionaries in a more memory +efficient way by not using a hashmap in most cases, but instead +splitting the dictionary into a shared description of the set of keys +(the map) and an array of storage with the values. This makes the +deserialized dictionaries smaller if the same set of keys is repeated a +lot. This is completely transparent to the Python programmer, the +dictionary will look completely normal to the Python program but its +internal representation is different.
                              +One downside of using maps is that sometimes files will contain many +dictionaries that have unique key sets. Since maps themselves are quite +large data structures and since dictionaries that use maps contain an +extra level of indirection we want to fall back to using normal hashmaps +to represent the dictionaries where that is the case. To detect this we +keep statistics at runtime about how often every map (i.e. set of +keys) is used in the file. For uncommonly used maps, the map is +discarded and the dictionaries that used the map are converted into using a +regular hashmap.

                              +Using Maps to Speed up Parsing

                              +Another benefit of using maps to store deserialized dictionaries is that +we can use them to speed up the parsing process itself. To see how this +works, we need to understand maps a bit better. All the maps produced as +a side-effect of parsing JSON form a tree. The tree root is a map that +describes the object without any attributes. From every tree node we +have a number of edges going to other nodes, each edge for a specific +new attribute added:

                              +This map tree is the result of parsing a file that has dictionaries with +the keys a, b, c many times, the keys a, b, f less often, and also some +objects with the keys x, y.
                              +When parsing a dictionary we traverse this tree from the root, according +to the keys that we see in the input file. While doing this, we +potentially add new nodes, if we get key combinations that we have never +seen before. The set of keys of a dictionary parsed so far is +represented by the current tree node, while we can store the values into +an array. We can use the tree of nodes to speed up parsing. A lot of the +nodes only have one child, because after reading the first few keys of +an object, the remaining ones are often uniquely determined in a given +file. If we have only one child map node, we can speculatively parse the +next key by doing a memcmp between the key that the map tree says is +likely to come next and the characters that follow the ',' that started +the next entry in the dictionary. If the memcmp reports a match this +means that the speculation paid off, and we can transition to the new map +that the edge points to, and parse the corresponding value. If not, we +fall back to general code that parses the string, handles escaping rules +etc. This trick was explained to me by some V8 engineers; the same trick +is supposedly used as part of the V8 JSON parser.
                              +This scheme doesn't immediately work for map tree nodes that have more +than one child. However, since we keep statistics anyway about how often +each map is used as the map of a parsed dictionary, we can speculate +that the most common map transition is taken more often than the others +in the future, and use that as the speculated next node.
                              +So for the example transition tree shown in the figure above the key +speculation would succeed for objects with keys a, b, c. For objects +with keys a, b, f the speculation would succeed for the first two +keys, but not for the third key f. For objects with the keys +x, y the speculation would fail for the first key x but succeed +for the second key y.
                              +For real-world datasets these transition trees can become a lot more +complicated, for example here is a visualization of a part of the +transition tree generated for parsing a New York Times dataset:
                              + +

                              +Caching Strings

                              +A rather obvious observation we can use to improve performance of the +parser is the fact that string values repeat a lot in most JSON files. +For strings that are used as dictionary keys this is pretty obvious. +However it happens also for strings that are used as values in +dictionaries (or are stored in lists). We can use this fact to +intern/memoize strings and save memory. This is an approach that many +JSON parsers use, including +CPython's. +To do this, I keep a dictionary of strings that we have seen so far +during parsing and look up new strings that are deserialized. If we have +seen the string before, we can re-use the deserialized previous string. +Right now I only consider utf-8 strings for caching that do not contain +any escapes (whether stuff like \", \n or escaped unicode chars).
                              +This simple approach works extremely well for dictionary keys, but needs +a number of improvements to be a win in general. The first observation +is that computing the hash to look up the string in the dictionary of +strings we've seen so far is basically free. We can compute the hash +while scanning the input for the end of the string we are currently +deserializing. Computing the hash while scanning doesn't increase the +time spent scanning much. This is not a new idea, I am sure many other +parsers do the same thing (but CPython doesn't seem to).
                              +Another improvement follows from the observation that inserting every +single deserialized non-key string into a hashmap is too expensive. +Instead, we insert strings into the cache more conservatively, by +keeping a small ring buffer of hashes of recently deserialized strings. +The hash is looked for in the ring buffer, and only if the hash is +present we insert the string into the memoization hashmap. This has the +effect of only inserting strings into the memoization hashmap that +re-occur a second time not too far into the file. This seems to give a +good trade-off between still re-using a lot of strings but keeping the +time spent updating and the size of the memoization hashmap low.
                              +Another twist is that in a lot of situations caching strings is not +useful at all, because it will almost never succeed. Examples of this +are UUIDs (which are unique), or the content of a tweet in a JSON file +with many tweets (which is usually unique). However, in the same file it +might be useful to cache e.g. the user name of the Twitter user, because +many tweets from the same person could be in such a file. Therefore the +usefulness of the string cache depends on which fields of objects we are +deserializing the value of. For this reason we keep statistics per map field +and disable string memoization per individual field if the cache hit +rate falls below a certain threshold. This gives the best of both +worlds: in the cases where string values repeat a lot in certain fields +we use the cache to save time and memory. But for those fields that +mostly contain unique strings we don't waste time looking up and adding +strings in the memoization table. Strings outside of dictionaries are +quite rare anyway, so we just always try to use the cache for them.
                              +The following pseudocode sketches the code to deserialize a string in +the input at a given position. The function also takes a map, which is +the point in the map tree that we are currently deserializing a field +of (if we are deserializing a string in another context, some kind of +dummy map can be used there).
                              
                              +def deserialize_string(pos, input, map):
                              +    # input is the input string, pos is the position of the starting " of
                              +    # the string
                              +
                              +    # find end of string, check whether it contains escape codes,
                              +    # compute hash, all at the same time
                              +    end, escapes, hash = find_end_of_string(pos + 1, input)
                              +    if end == -1:
                              +        raise ParseError
                              +    if escapes:
                              +        # need to be much more careful with escaping
                              +        return deserialize_string_escapes(pos, input)
                              +    
                              +    # should we cache at all?
                              +    if map.cache_disabled():
                              +        return input[pos + 1:end]
                              +
                              +    # if string is in cache, return it
                              +    if hash in cache:
                              +        map.cache_hit += 1
                              +        return cache[hash]
                              +
                              +    result = input[pos + 1:end]
                              +    map.cache_miss += 1
                              +
                              +    # if hash is in the ring buffer of recently seen hashes,
                              +    # add the string to the cache
                              +    if hash in ring_buffer:
                              +        cache[hash] = result
                              +    else:
                              +        ring_buffer.write(hash)
                              +    return result
                              +
                              +
                              +
                              +

                              +Evaluation

                              +To find out how much the various techniques help, I implemented a number +of JSON parsers in PyPy with different combinations of the techniques +enabled. I compared the numbers with the JSON parser of CPython 3.7.3 +(simplejson), with ujson, with the JSON parser of Node 12.11.1 (V8) and with +RapidJSON (in DOM mode).
                              +I collected a number of medium-to-large JSON files to try the JSON +parsers on:
                                +
                              • +Censys: A subset of the Censys port and +protocol scan data for websites in the Alexa top million domains
                              • +
                              • +Gharchive: Github activity from +January 15-23, 2015 from Github Archive
                              • +
                              • +Reddit: Reddit +comments from May 2009
                              • +
                              • Rosie: The nested matches produced using the Rosie pattern +language all.things pattern on a log +file
                              • +
                              • Nytimes: Metadata of a collection of New York Times articles
                              • +
                              • Tpch: The TPC-H database benchmark's deals table as a JSON file
                              • +
                              • Twitter: A JSON export of the @pypyproject Twitter account data
                              • +
                              • Wikidata: A file storing a subset of the Wikidata fact dump from Nov +11, 2014
                              • +
                              • +Yelp: A file of yelp +businesses
                              • +
                              +Here are the file sizes of the benchmarks:
                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              BenchmarkFile Size [MiB]
                              Censys898.45
                              Gharchive276.34
                              NYTimes12.98
                              Reddit931.65
                              Rosie388.88
                              TPCH173.86
                              Wikidata119.75
                              Yelp167.61
                              +I measured the times of each benchmark with a number of variations +of the improved PyPy algorithms:
                                +
                              • PyPyBaseline: The PyPy JSON parser as it was before my work with JSON +parsing started (PyPy version 5.8)
                              • +
                              • PyPyKeyStringCaching: Memoizing the key strings of dictionaries, but +not the other strings in a json file, and not using maps to represent +dictionaries (this is the JSON parser that PyPy has been shipping since +version 5.9, in the benchmarks I used 7.1).
                              • +
                              • PyPyMapNoCache: Like PyPyKeyStringCaching, but using maps to +represent dictionaries. This includes speculatively parsing the next +key using memcmp, but does not use string caching of non-key strings.
                              • +
                              • PyPyFull: Like PyPyMapNoCache but uses a string cache for all +strings, not just keys. This is equivalent to what will be released soon as part of PyPy 7.2
                              • +
                              +In addition to wall clock time of parsing, I also measured the increase +in memory use of each implementation after the input string has been +deserialized, i.e. the size of the in-memory representation of every +JSON file.

                              +Contributions of Individual Optimizations

                              +Let's first look at the contributions of the individual optimizations to the +overall performance and memory usage.

                              +All the benchmarks were run 30 times in new processes, all the numbers are +normalized to PyPyFull.
                              +The biggest individual improvement to both parsing time and memory used comes +from caching just the keys in parsed dictionaries. This is the optimization in +PyPy's JSON parser that has been implemented for a while already. To understand +why this optimization is so useful, let's look at some numbers about each +benchmark, namely the number of total keys across all dictionaries in each +file, as well as the number of unique keys. As we can see, for all benchmarks +the number of unique keys is significantly smaller than the number of keys in +total.
                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              BenchmarkNumber of keysNumber of unique keys
                              Censys14 404 234163
                              Gharchive6 637 881169
                              NYTimes417 33760
                              Reddit25 226 39721
                              Rosie28 500 1015
                              TPCH6 700 00045
                              Wikidata6 235 0881 602
                              Yelp5 133 91461
                              +The next big jump in deserialization time and memory comes from introducing +maps to represent deserialized dictionaries. With PyPyMapNoCache +deserialization time goes down because it's much cheaper to walk the tree +of maps and store all deserialized objects into an array of values than to +build hashmaps with the same keys again and again. Memory use goes down +for the same reason: it takes a lot less memory to store the shared +structure of each set of keys in the map, as opposed to repeating it again +and again in every hashmap.
                              +We can look at some numbers about every benchmark again. The table shows how +many map-based dictionaries are deserialized for every benchmark, and how many +hashmap-backed dictionaries. We see that the number of hashmap-backed +dictionaries is often zero, or at most a small percentage of all dictionaries +in each benchmark. Yelp has the biggest number of hashmap-backed dictionaries. +The reason for this is that the input file contains hashmaps that store +combinations of various features of Yelp businesses, and a lot of these +combinations are totally unique to a business. Therefore the heuristics +determine that it's better to store these using hashmaps.
                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              BenchmarkMap DictsRegular Dicts% Regular Dicts
                              Censys4 049 2351 0420.03
                              Gharchive955 30100.00
                              NYTimes80 39300.00
                              Reddit1 201 25700.00
                              Rosie6 248 96600.00
                              TPCH1 000 00000.00
                              Wikidata1 923 46046 9052.38
                              Yelp443 14052 05110.51
                              + +We can also look at numbers about how often the memcmp-based speculative +parsing of the next key of a given map succeeds. Looking at statistics +about each benchmark, we can see that the speculation of what key we +expect next pays off in a significant percentage of cases, between 63% for +Wikidata where the dictionary structures are quite irregular, and 99% for +Reddit, where all the dictionaries have the same set of keys.
                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              BenchmarkNumber of KeysMap Transitions% Successful Speculation
                              Censys14 404 23414 403 24365.79
                              Gharchive6 637 8816 637 88186.71
                              NYTimes417 337417 33779.85
                              Reddit25 226 39725 226 397100.00
                              Rosie28 500 10128 500 10190.37
                              TPCH6 700 0006 700 00086.57
                              Wikidata6 235 0885 267 74463.68
                              Yelp5 133 9144 593 98090.43
                              geomean82.04
                              +General string caching is the most unclear optimization. On the one hand its +impact on memory usage is quite substantial, leading to a 20% reduction for +Gharchive and Reddit, up to a 2× improvement for Yelp. On the other hand, the +effect on performance is less clear, since it even leads to a slowdown in +Gharchive and Reddit, and generally only a small improvement. Choosing the +right heuristic for when to disable the cache also has somewhat unclear effects +and is definitely a topic worthy of further investigation.

                              +Comparison against other JSON Decoders

                              +To get a more general feeling of the performance and memory usage of the +improved PyPy parser, we compare it against CPython's built-in json +parser, ujson for CPython, Node's (V8) JSON parser and RapidJSON. For +better context for the memory usage I also show the file size of the input +files.
                              +These benchmarks are not really an apples-to-apples comparison. All of the +implementations use different in-memory representations of strings in +the deserialized data-structure (Node uses two bytes per character in +a string, in CPython it +depends but 4 bytes on my +machine, PyPyBaseline uses four bytes, PyPy and RapidJSON use utf-8). But +it's still interesting to get some ballpark numbers. The results are as +follows:

                              +As we can see, PyPyFull handily beats CPython and ujson, with a geometric +mean of the improvement of about 2.5×. The memory improvement can be even +more extreme, with an improvement of over 4× against CPython/ujson in some +cases (CPython gives better memory sizes, because its parser caches the +keys of dictionaries as well). Node is often more than 50% slower, whereas +RapidJSON beats us easily, by a factor of 2× on average.

                              +Conclusions

                              +While the speedup I managed to achieve over the course of this project is +nice and I am certainly happy to beat both CPython and Node, I am +ultimately still annoyed that RapidJSON manages to maintain such a clear +lead over PyPyFull, and would like to get closer to it. One problem that +PyPy suffers compared to RapidJSON is the overhead of garbage collection. +Deserializing large JSON files is pretty much the worst case for the +generational GC that PyPy uses, since none of the deserialized objects die +young (and the GC expects that most objects do). That means that a lot of +the deserialization time of PyPy is wasted allocating the resulting +objects in the nursery, and then copying them into the old generation. +Somehow, this should be done in better ways, but all my attempts to not +have to do the copy did not seem to help much. So maybe more improvements +are possible, if I can come up with more ideas.
                              +On the memory side of things, Node/V8 is beating PyPy clearly which might +indicate more general problems in how we represent Python objects in +memory. On the other hand, I think it's cool that we are competitive with +RapidJSON in terms of memory and often within 2× of the file size.
                              +An effect that I didn't consider at all in this blog post is the fact that +accessing the deserialized objects with constant strings is also faster +than with regular dictionaries, due to them being represented with maps. +More benchmarking work to do in the future!
                              +If you have your own programs that run on PyPy and use the json parser +a lot, please measure them on the new code and let me know whether you see +any difference! +
                              +

                              Comments

                              +
                              +
                              +
                              + + Unknown wrote on 2019-10-09 09:49: +
                              +
                              +

                              Great work! Excited for the new release.

                              This makes me wonder if maps are (or can) be used for identical dicts which are constructed in a tight loop (e.g. from a CSV or SQLAlchemy rows).

                              +
                              +
                              +
                              +
                              + + Carl Friedrich Bolz-Tereick wrote on 2019-10-09 11:20: +
                              +
                              +

                              thanks! yes, that should be possible somehow and would indeed be quite cool. If you give us a real benchmark, we can think about it (maybe I should start with csv.DictReader?)

                              +
                              +
                              +
                              +
                              + + Alexander Hultnér | Hultnér Technologies wrote on 2019-10-09 12:52: +
                              +
                              +

                              Excellent work!

                              These posts are always a pleasure to read, and the improvements in PyPy are wounderful.
                              Thanks to you and everyone who's involved in making PyPy the great product it is toady, do keep the great work up!

                              +
                              +
                              +
                              +
                              + + peak wrote on 2019-10-15 03:12: +
                              +
                              +

                              Great work, but could you please provide links (preferably of the permalink variety) to the datasets you used for benchmarking? Thanks.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/12/hpy-kick-off-sprint-report-1840829336092490938.html b/posts/2019/12/hpy-kick-off-sprint-report-1840829336092490938.html new file mode 100644 index 000000000..9cf14890b --- /dev/null +++ b/posts/2019/12/hpy-kick-off-sprint-report-1840829336092490938.html @@ -0,0 +1,651 @@ + + + + + +HPy kick-off sprint report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              HPy kick-off sprint report

                              + + + +
                              +

                              Recently Antonio, Armin and Ronan had a small internal sprint in the beautiful +city of Gdańsk to kick-off the development of HPy. Here is a brief report of +what was accomplished during the sprint.

                              +
                              +

                              What is HPy?

                              +

                              The TL;DR answer is "a better way to write C extensions for Python".

                              +

                              The idea of HPy was born during EuroPython 2019 in Basel, where there was an +informal meeting which included core developers of PyPy, CPython (Victor +Stinner and Mark Shannon) and Cython (Stefan Behnel). The ideas were later also +discussed with Tim Felgentreff of GraalPython, to make sure they would also be +applicable to this very different implementation. Windel Bouwman of RustPython +is following the project as well.

                              +

                              All of us agreed that the current design of the CPython C API is problematic +for various reasons and, in particular, because it is too tied to the current +internal design of CPython. The end result is that:

                              + +
                                +
                              • alternative implementations of Python (such as PyPy, but not only) have a +hard time loading and executing existing C extensions;
                              • +
                              • CPython itself is unable to change some of its internal implementation +details without breaking the world. For example, as of today it would be +impossible to switch from using reference counting to using a real GC, +which in turn makes it hard, for example, to remove the GIL, as gilectomy +attempted.
                              • +
                              +

                              HPy tries to address these issues by following two major design guidelines:

                              +
                                +
                              1. objects are referenced and passed around using opaque handles, which are +similar to e.g., file descriptors in spirit. Multiple, different handles +can point to the same underlying object, handles can be duplicated and +each handle must be released independently of any other duplicate.
                              2. +
                              3. The internal data structures and C-level layout of objects are not +visible nor accessible using the API, so each implementation is free to +use what fits best.
                              4. +
                              +

                              The other major design goal of HPy is to allow incremental transition and +porting, so existing modules can migrate their codebase one method at a time. +Moreover, Cython is considering to optionally generate HPy code, so extension +modules written in Cython would be able to benefit from HPy automatically.

                              +

                              More details can be found in the README of the official HPy repository.

                              +
                              +
                              +

                              Target ABI

                              +

                              When compiling an HPy extension you can choose one of two different target ABIs:

                              + +
                                +
                              • +HPy/CPython ABI: in this case, hpy.h contains a set of macros and +static inline functions. At compilation time this translates the HPy API +into the standard C-API. The compiled module will have no performance +penalty, and it will have a "standard" filename like +foo.cpython-37m-x86_64-linux-gnu.so.
                              • +
                              • +Universal HPy ABI: as the name implies, extension modules compiled +this way are "universal" and can be loaded unmodified by multiple Python +interpreters and versions. Moreover, it will be possible to dynamically +enable a special debug mode which will make it easy to find e.g., open +handles or memory leaks, without having to recompile the extension.
                              • +
                              +

                              Universal modules can also be loaded on CPython, thanks to the +hpy_universal module which is under development. An extra layer of +indirection enables loading extensions compiled with the universal ABI. Users +of hpy_universal will face a small performance penalty compared to the ones +using the HPy/CPython ABI.

                              +

                              This setup gives several benefits:

                              + +
                                +
                              • Extension developers can use the extra debug features given by the +Universal ABI with no need to use a special debug version of Python.
                              • +
                              • Projects which need the maximum level of performance can compile their +extension for each relevant version of CPython, as they are doing now.
                              • +
                              • Projects for which runtime speed is less important will have the choice of +distributing a single binary which will work on any version and +implementation of Python.
                              • +
                              +
                              +
                              +

                              A simple example

                              +

                              The HPy repo contains a proof of concept module. Here is a simplified +version which illustrates what a HPy module looks like:

                              +
                              +#include "hpy.h"
                              +
                              +HPy_DEF_METH_VARARGS(add_ints)
                              +static HPy add_ints_impl(HPyContext ctx, HPy self, HPy *args, HPy_ssize_t nargs)
                              +{
                              +    long a, b;
                              +    if (!HPyArg_Parse(ctx, args, nargs, "ll", &a, &b))
                              +        return HPy_NULL;
                              +    return HPyLong_FromLong(ctx, a+b);
                              +}
                              +
                              +
                              +static HPyMethodDef PofMethods[] = {
                              +    {"add_ints", add_ints, HPy_METH_VARARGS, ""},
                              +    {NULL, NULL, 0, NULL}
                              +};
                              +
                              +static HPyModuleDef moduledef = {
                              +    HPyModuleDef_HEAD_INIT,
                              +    .m_name = "pof",
                              +    .m_doc = "HPy Proof of Concept",
                              +    .m_size = -1,
                              +    .m_methods = PofMethods
                              +};
                              +
                              +
                              +HPy_MODINIT(pof)
                              +static HPy init_pof_impl(HPyContext ctx)
                              +{
                              +    HPy m;
                              +    m = HPyModule_Create(ctx, &moduledef);
                              +    if (HPy_IsNull(m))
                              +        return HPy_NULL;
                              +    return m;
                              +}
                              +
                              +

                              People who are familiar with the current C-API will surely notice many +similarities. The biggest differences are:

                              + +
                                +
                              • Instead of PyObject *, objects have the type HPy, which as +explained above represents a handle.
                              • +
                              • You need to explicitly pass an HPyContext around: the intent is +primarily to be future-proof and make it easier to implement things like +sub-interpreters.
                              • +
                              • +HPy_METH_VARARGS is implemented differently than CPython's +METH_VARARGS: in particular, these methods receive an array of HPy +and its length, instead of a fully constructed tuple: passing a tuple +makes sense on CPython where you have it anyway, but it might be an +unnecessary burden for alternate implementations. Note that this is +similar to the new METH_FASTCALL which was introduced in CPython.
                              • +
                              • HPy relies a lot on C macros, which most of the time are needed to support +the HPy/CPython ABI compilation mode. For example, HPy_DEF_METH_VARARGS +expands into a trampoline which has the correct C signature that CPython +expects (i.e., PyObject *(*)(PyObject *self, PyObject *args)) and +which calls add_ints_impl.
                              • +
                              +
                              +
                              +

                              Sprint report and current status

                              +

                              After this long preamble, here is a rough list of what we accomplished during +the week-long sprint and the days immediately after.

                              +

                              On the HPy side, we kicked-off the code in the repo: at the moment of writing +the layout of the directories is a bit messy because we moved things around +several times, but we identified several main sections:

                              + +
                                +
                              1. +

                                A specification of the API which serves both as documentation and as an +input for parts of the projects which are automatically +generated. Currently, this lives in public_api.h.

                                +
                              2. +
                              3. +

                                A set of header files which can be used to compile extension modules: +depending on whether the flag -DHPY_UNIVERSAL_ABI is passed to the +compiler, the extension can target the HPy/CPython ABI or the HPy +Universal ABI

                                +
                              4. +
                              5. +

                                A CPython extension module called hpy_universal which makes it +possible to import universal modules on CPython

                                +
                              6. +
                              7. +

                                A set of tests which are independent of the implementation and are meant +to be an "executable specification" of the semantics. Currently, these +tests are run against three different implementations of the HPy API:

                                + +
                                  +
                                • the headers which implement the "HPy/CPython ABI"
                                • +
                                • the hpy_universal module for CPython
                                • +
                                • the hpy_universal module for PyPy (these tests are run in the PyPy repo)
                                • +
                                +
                              8. +
                              +

                              Moreover, we started a PyPy branch in which to implement the +hpy_universal module: at the moment of writing PyPy can pass all the HPy +tests apart from the ones which allow conversion to and from PyObject *. +Among the other things, this means that it is already possible to load the +very same binary module in both CPython and PyPy, which is impressive on its +own :).

                              +

                              Finally, we wanted a real-life use case to show how to port a module to HPy +and to do benchmarks. After some searching, we chose ultrajson, for the +following reasons:

                              + +
                                +
                              • it is a real-world extension module which was written with performance in +mind
                              • +
                              • when parsing a JSON file it does a lot of calls to the Python API to +construct the various parts of the result message
                              • +
                              • it uses only a small subset of the Python API
                              • +
                              +

                              This repo contains the HPy port of ultrajson. This commit shows an example +of what the porting looks like.

                              +

                              ujson_hpy is also a very good example of incremental migration: so far +only ujson.loads is implemented using the HPy API, while ujson.dumps +is still implemented using the old C-API, and both can coexist nicely in the +same compiled module.

                              +
                              +
                              +

                              Benchmarks

                              +

                              Once we have a fully working ujson_hpy module, we can finally run +benchmarks! We tested several different versions of the module:

                              + +
                                +
                              • +ujson: this is the vanilla implementation of ultrajson using the +C-API. On PyPy this is executed by the infamous cpyext compatibility +layer, so we expect it to be much slower than on CPython
                              • +
                              • +ujson_hpy: our HPy port compiled to target the HPy/CPython ABI. We +expect it to be as fast as ujson +
                              • +
                              • +ujson_hpy_universal: same as above but compiled to target the +Universal HPy ABI. We expect it to be slightly slower than ujson on +CPython, and much faster on PyPy.
                              • +
                              +

                              Finally, we also ran the benchmark using the builtin json module. This is +not really relevant to HPy, but it might still be interesting as a +reference data point.

                              +

                              The benchmark is very simple and consists of parsing a big JSON file 100 +times. Here is the average time per iteration (in milliseconds) using the +various versions of the module, CPython 3.7 and the latest version of the hpy +PyPy branch:

                              + +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                               CPythonPyPy
                              ujson154.32633.97
                              ujson_hpy152.19 
                              ujson_hpy_universal168.78207.68
                              json224.59135.43
                              +

                              As expected, the benchmark proves that when targeting the HPy/CPython ABI, HPy +doesn't impose any performance penalty on CPython. The universal version is +~10% slower on CPython, but gives an impressive 3x speedup on PyPy! It is +worth noting that the PyPy hpy module is not fully optimized yet, and we +expect to be able to reach the same performance as CPython for this particular +example (or even more, thanks to our better GC).

                              +

                              All in all, not a bad result for two weeks of intense hacking :)

                              +

                              It is also worth noting that PyPy's builtin json module does really +well in this benchmark, thanks to the recent optimizations that were described +in an earlier blog post.

                              +
                              +
                              +

                              Conclusion and future directions

                              +

                              We think we can be very satisfied about what we have got so far. The +development of HPy is quite new, but these early results seem to indicate that +we are on the right track to bring Python extensions into the future.

                              +

                              At the moment, we can anticipate some of the next steps in the development of +HPy:

                              + +
                                +
                              • Think about a proper API design: what we have done so far has +been a "dumb" translation of the API we needed to run ujson. However, +one of the declared goals of HPy is to improve the design of the API. There +will be a trade-off between the desire of having a clean, fresh new API +and the need to be not too different than the old one, to make porting +easier. Finding the sweet spot will not be easy!
                              • +
                              • Implement the "debug" mode, which will help developers to find +bugs such as leaking handles or using invalid handles.
                              • +
                              • Instruct Cython to emit HPy code on request.
                              • +
                              • Eventually, we will also want to try to port parts of numpy to HPy to +finally solve the long-standing problem of sub-optimal numpy +performance in PyPy.
                              • +
                              +

                              Stay tuned!

                              + +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Anonymous wrote on 2019-12-18 16:22: +
                              +
                              +

                              Is HPy going to be C(++)-specific? Will you consider the feasibility of implementing that API in other languages, such as Rust? Extensive usage of macros is something that's more difficult to generate bindings for.

                              +
                              +
                              +
                              +
                              + + Antonio Cuni wrote on 2019-12-18 19:00: +
                              +
                              +

                              At the moment HPy is two thing:

                              - A C API: here the goal is to have something which is easy to write and to migrate from existing C extensions. The macros are mostly needed to overcome limitations of C as a language

                              - an ABI: this is independent from C: any language can decide what is the best API to generate extensions compatible with such an ABI

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-12-18 23:53: +
                              +
                              +

                              This sounds really interesting.

                              What does this mean for the future of CFFI?

                              +
                              +
                              +
                              +
                              + + René Dudfield wrote on 2019-12-19 07:41: +
                              +
                              +

                              Great work!

                              Especially happy with the consideration of incremental adoption.

                              +
                              +
                              +
                              +
                              + + Antonio Cuni wrote on 2019-12-19 09:35: +
                              +
                              +

                              @Unknown: CFFI solves a different problem, which is how to wrap an existing C library which does not need to manipulate Python objects. As such, it will continue its development independently than HPy, as far as I can see

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-12-20 08:39: +
                              +
                              +
                              Hi PyPy team, thanks for your great work but I found this:

                              import sqlite3
                              print sqlite3.version

                              2.6.0


                              sqlite3 is SQLite 2?

                              Any chance of a dot release to bring sqlite3 up to date? +
                              +
                              +
                              +
                              + + Anonymous wrote on 2019-12-20 09:09: +
                              +
                              +

                              import sqlite3
                              print sqlite3.sqlite_version

                              D'oh!
                              Sorry about that :-)

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2019/12/pypy-730-released-3614026620096963655.html b/posts/2019/12/pypy-730-released-3614026620096963655.html new file mode 100644 index 000000000..0dcead3f2 --- /dev/null +++ b/posts/2019/12/pypy-730-released-3614026620096963655.html @@ -0,0 +1,389 @@ + + + + + +PyPy 7.3.0 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy 7.3.0 released

                              + + + +
                              +
                              +The PyPy team is proud to release the version 7.3.0 of PyPy, which includes +two different interpreters:
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.13
                              • +
                              • PyPy3.6, which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.9.
                              • +
                              +
                              +
                              +
                              +
                              +The interpreters are based on much the same codebase, thus the double +release.

                              +We have worked with the python packaging group to support tooling around +building third party packages for python, so this release changes the ABI tag +for PyPy.

                              +Based on the great work done in portable-pypy, the linux downloads we +provide are now built on top of the manylinux2010 CentOS6 docker image. +The tarballs include the needed shared objects to run on any platform that +supports manylinux2010 wheels, which should include all supported versions of +debian- and RedHat-based distributions (including Ubuntu, CentOS, and Fedora).

                              +The CFFI backend has been updated to version 1.13.1. We recommend using CFFI +rather than c-extensions to interact with C.
                              + +The built-in cppyy module was upgraded to 1.10.6, which +provides, among others, better template resolution, stricter enum handling, +anonymous struct/unions, cmake fragments for distribution, optimizations for +PODs, and faster wrapper calls. We recommend using cppyy for performant +wrapping of C++ code for Python.

                              +The vendored pyrepl package for interaction inside the REPL was updated.

                              +Support for codepage encoding and decoding was added for Windows.

                              +As always, this release fixed several issues and bugs raised by the growing +community of PyPy users. We strongly recommend updating. Many of the fixes are +the direct result of end-user bug reports, so please continue reporting issues +as they crop up.
                              + +You can download the v7.3 releases here:
                              + +
                              +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.

                              +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular packages to run +on pypy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 3 new contributors, +thanks for pitching in.

                              +If you are a python library maintainer and use c-extensions, please consider making a cffi / cppyy version of your library that would be performant on PyPy. If you are stuck with using the C-API, you can use docker images with PyPy built in or the multibuild system to build wheels.

                              +

                              +What is PyPy? +

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                              +We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +This PyPy release supports:
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bit, Mac OS X 64-bit, Windows 32-bit, OpenBSD, FreeBSD)
                              • +
                              +
                              +
                              +
                                +
                              • big- and little-endian variants of PPC64 running Linux +
                              • +
                              +
                              +
                              +
                                +
                              • +s390x running Linux
                              • +
                              +
                              +
                              +
                                +
                              • 64-bit ARM machines running Linux
                              • +
                              +Unfortunately at the moment of writing our ARM buildbots are out of service, +so for now we are not releasing any binary for the ARM architecture (32-bit), although PyPy does support ARM 32-bit processors.

                              +

                              +What else is new? +

                              +PyPy 7.2 was released in October, 2019. +There are many incremental improvements to RPython and PyPy. For more information about the 7.3.0 release, see the full changelog.

                              +Please update, and continue to help us make PyPy better.

                              +Cheers,
                              +The PyPy team +
                              +

                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2020/01/leysin-winter-sprint-2020-feb-28-march-6349761524797409012.html b/posts/2020/01/leysin-winter-sprint-2020-feb-28-march-6349761524797409012.html new file mode 100644 index 000000000..5c53ca84c --- /dev/null +++ b/posts/2020/01/leysin-winter-sprint-2020-feb-28-march-6349761524797409012.html @@ -0,0 +1,306 @@ + + + + + +Leysin Winter sprint 2020: Feb 29 - March 8th | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Leysin Winter sprint 2020: Feb 29 - March 8th

                              + + + +
                              + The next PyPy sprint will be in Leysin, Switzerland, for the fourteenth +time. This is a fully public sprint: newcomers and topics other than +those proposed below are welcome.



                              +Goals and topics of the sprint

                              +The list of topics is open.  For reference, we would like to work at least partially on the following topics:
                              +As usual, the main side goal is to have fun in winter sports :-) +We can take a day off (for ski or anything else).

                              +Times and accommodation

                              +The sprint will occur for one week starting on Saturday, the 29th of February, to Sunday, the 8th of March 2020 (dates were pushed back one day!)  It will occur in Les Airelles, a different bed-and-breakfast place from the traditional one in Leysin.  It is a nice old house at the top of the village.

                              We have a 4- or 5-people room as well as up to three double-rooms.  Please register early!  These rooms are not booked for the sprint in advance, and might be already taken if you end up announcing yourself late.  We have a big room for up to 7 people with nice view, which might be split in two or three sub-rooms; plus possibly separately-booked double rooms if needed. (But it is of course always possible to book at a different place in Leysin.)

                              +For more information, see our repository or write to me directly at armin.rigo@gmail.com. +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2020/02/pypy-and-cffi-have-moved-to-heptapod-5791595152472747032.html b/posts/2020/02/pypy-and-cffi-have-moved-to-heptapod-5791595152472747032.html new file mode 100644 index 000000000..3a003ec42 --- /dev/null +++ b/posts/2020/02/pypy-and-cffi-have-moved-to-heptapod-5791595152472747032.html @@ -0,0 +1,382 @@ + + + + + +PyPy and CFFI have moved to Heptapod | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy and CFFI have moved to Heptapod

                              + + + +
                              +
                              +
                              +
                              +
                              +It has been a very busy month, not so much because of deep changes in the JIT of PyPy but more around the development, deployment, and packaging of the project.
                              +

                              +  +

                              +

                              +Hosting +

                              +The biggest news is that we have moved the center of our development off Bitbucket and to the new https://foss.heptapod.net/pypy. This is a friendly fork of Gitlab called heptapod that understands Mercurial and is hosted by Clever Cloud. When Atlassian decided to close down Mercurial hosting on bitbucket.org, PyPy debated what to do. Our development model is based on long-lived branches, and we want to keep the ability to immediately see which branch each commit came from. Mercurial has this, git does not (see our FAQ). Octobus, whose business is Mercurial, developed a way to use Mercurial with Gitlab called heptapod. The product is still under development, but quite usable (i.e., it doesn't get in the way). Octobus partnered with Clever Cloud hosting to offer community FOSS projects hosted on Bitbucket who wish to remain with Mercurial a new home. PyPy took them up on the offer, and migrated its repos to https://foss.heptapod.net/pypy. We were very happy with how smooth it was to import the repos to heptapod/GitLab, and are learning the small differences between Bitbucket and GitLab. All the pull requests, issues, and commits kept the same ids, but work is still being done to attribute the issues, pull requests, and comments to the correct users. So from now on, when you want to contribute to PyPy, you do so at the new home.

                              +CFFI, which previously was also hosted on Bitbucket, has joined the PyPy group at https://foss.heptapod.net/pypy/cffi.
                              +
                              +

                              +  +

                              +

                              +Website +

                              +Secondly, thanks to work by https://baroquesoftware.com/ in leading a redesign and updating the logo, the https://www.pypy.org website has undergone a facelift. It should now be easier to use on small-screen devices. Thanks also to the PSF for hosting the site.
                              +
                              +

                              +  +

                              +

                              +Packaging +

                              +Also, building PyPy from source takes a fair amount of time. While we provide downloads in the form of tarballs or zipfiles, and some platforms such as debian and Homebrew provide packages, traditionally the downloads have only worked on a specific flavor of operating system. A few years ago squeaky-pl started providing portable builds. We have adopted that build system for our linux offerings, so the nightly downloads and release downloads should now work on any glibc platform that has not gone EndOfLife. So there goes another excuse not to use PyPy. And the "but does it run scipy" excuse also no longer holds, although "does it speed up scipy" still has the wrong answer. For that we are working on HPy, and will be sprinting soon.
                              +The latest versions of pip, wheel, and setuptools, together with the manylinux2010 standard for linux wheels and tools such as multibuild or cibuildwheel (well, from the next version) make it easier for library developers to build binary wheels for PyPy. If you are having problems getting going with this, please reach out.
                              +
                              +
                              +

                              +  +

                              +

                              +Give it a try +

                              +Thanks to all the folks who provide the infrastructure PyPy depends on. We hope the new look will encourage more involvement and engagement. Help prove us right!

                              +The PyPy Team
                              +
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Matěj Cepl wrote on 2020-02-16 21:20: +
                              +
                              +

                              Could you elaborate on “this is not always possible with Git”, please? This is too brief statement for my taste, and it doesn't make much sense (merging a branch certainly doesn't make it go away, and it is certainly possible to find to which branch a commit used to belong before merging).

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2020-02-16 22:35: +
                              +
                              +

                              https://doc.pypy.org/en/latest/faq.html#why-doesn-t-pypy-use-git-and-move-to-github

                              +
                              +
                              +
                              +
                              + + Miro Hrončok wrote on 2020-02-18 07:25: +
                              +
                              +

                              Were issue attachments migrated properly?

                              +
                              +
                              +
                              +
                              + + mattip wrote on 2020-02-19 07:34: +
                              +
                              +

                              > Were issue attachments migrated properly?

                              No. They should be migrated in a follow-up. Here is the heptapod issue about attachments https://foss.heptapod.net/heptapod/foss.heptapod.net/issues/37

                              +
                              +
                              +
                              +
                              + + Carl Friedrich Bolz-Tereick wrote on 2020-02-19 08:19: +
                              +
                              +

                              I deleted one anonymous comment that was insulting the project decision. Please make your points constructively or don't make them.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2020/03/leysin-2020-sprint-report-764567777353955897.html b/posts/2020/03/leysin-2020-sprint-report-764567777353955897.html new file mode 100644 index 000000000..832a02450 --- /dev/null +++ b/posts/2020/03/leysin-2020-sprint-report-764567777353955897.html @@ -0,0 +1,395 @@ + + + + + +Leysin 2020 Sprint Report | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Leysin 2020 Sprint Report

                              + + + +
                              +

                              At the end of February ten of us gathered in Leysin, Switzerland to work on
                              +a variety of topics including HPy, PyPy Python 3.7 support and the PyPy
                              +migration to Heptapod.

                              +
                              + +
                              +
                              +We had a fun and productive week. The snow was beautiful. There was skiing
                              +and lunch at the top of Berneuse, cooking together, some late nights at
                              +the pub next door, some even later nights coding, and of course the
                              +obligatory cheese fondue outing.

                              +There were a few of us participating in a PyPy sprint for the first time
                              +and a few familiar faces who had attended many sprints. Many different
                              +projects were represented including PyPy, HPy, GraalPython,
                              Heptapod, and rust-cpython. The atmosphere was relaxed and welcoming, so if
                              +you're thinking of attending the next one -- please do!

                              +Topics worked on:

                              +HPy

                              +HPy is a new project to design and implement a better API for extending
                              +Python in C. If you're unfamiliar with it you can read more about it at
                              HPy.

                              +A lot of attention was devoted to the Big HPy Design Discussion which
                              +took up two full mornings. So much was decided that this will likely
                              +get its own detailed write-up, but bigger topics included:
                                +
                              • the HPy GetAttr, SetAttr, GetItem and SetItem methods,
                              • +
                              • HPy_FromVoidP and HPy_AsVoidP for passing HPy handles to C functions
                                +that pass void* pointers to callbacks,
                              • +
                              • avoiding having va_args as part of the ABI,
                              • +
                              • exception handling,
                              • +
                              • support for creating custom types.
                              • +
                              +Quite a few things got worked on too:
                                +
                              • implemented support for writing methods that take keyword arguments with
                                +HPy_METH_KEYWORDS,
                              • +
                              • implemented HPy_GetAttr, HPy_SetAttr, HPy_GetItem, and HPy_SetItem,
                              • +
                              • started implementing support for adding custom types,
                              • +
                              • started implementing dumping JSON objects in ultrajson-hpy,
                              • +
                              • refactored the PyPy GIL to improve the interaction between HPy and
                                +PyPy's cpyext,
                              • +
                              • experimented with adding HPy support to rust-cpython.
                              • +
                              +And there was some discussion of the next steps of the HPy initiative
                              +including writing documentation, setting up websites and funding, and
                              +possibly organising another HPy gathering later in the year.

                              +PyPy

                              +
                                +
                              • Georges gave a presentation on the Heptapod topic and branch workflows
                                +and showed everyone how to use hg-evolve.
                              • +
                              • Work was done on improving the PyPy CI buildbot post the move to
                                +heptapod, including a light-weight pre-merge CI and restricting
                                +when the full CI is run to only branch commits.
                              • +
                              • A lot of work was done improving the -D tests.
                              • +
                              +

                              +Miscellaneous

                              +
                                +
                              • Armin demoed VRSketch and NaN Industries in VR, including an implementation
                                +of the Game of Life within NaN Industries!
                              • +
                              • Skiing!
                              • +
                              +

                              +Aftermath

                              +Immediately after the sprint large parts of Europe and the world were
                              +hit by the COVID-19 epidemic. It was good to spend time together before
                              +travelling ceased to be a sensible idea and many gatherings were cancelled.

                              +Keep safe out there everyone.

                              +The HPy & PyPy Team & Friends

                              In joke for those who attended the sprint: Please don't replace this blog post
                              +with its Swedish translation (or indeed a translation to any other language :).
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Pim wrote on 2020-03-30 13:04: +
                              +
                              +

                              How does HPY relate to CFFI?

                              +
                              +
                              +
                              +
                              + + Antonio Cuni wrote on 2020-03-30 16:44: +
                              +
                              +

                              @Pim: CFFI allows to wrap C code in pure Python.
                              HPy allows to write Python extensions in C.

                              For example, you can't write a new class in CFFI, and CFFI functions can't receive Python objects as arguments

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2020/04/pypy-731-released-6266451647387657480.html b/posts/2020/04/pypy-731-released-6266451647387657480.html new file mode 100644 index 000000000..933a98f64 --- /dev/null +++ b/posts/2020/04/pypy-731-released-6266451647387657480.html @@ -0,0 +1,397 @@ + + + + + +PyPy 7.3.1 released | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy 7.3.1 released

                              + + + +
                              +
                              +The PyPy team is proud to release the version 7.3.1 of PyPy, which includes +two different interpreters:
                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.13
                              • +
                              • PyPy3.6, which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.9.
                              • +
                              +
                              +
                              +The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, no APIs have changed since the 7.3.0 release +in December, but read on to find out what is new.

                              +Conda Forge now supports PyPy as a Python interpreter. The support right now +is being built out. After this release, many more c-extension-based +packages can be successfully built and uploaded. This is the result of a lot of +hard work and good will on the part of the Conda Forge team. A big shout out +to them for taking this on.

                              +We have worked with the Python packaging group to support tooling around +building third party packages for Python, so this release updates the pip and +setuptools installed when executing pypy -mensurepip to pip>=20. This +completes the work done to update the PEP 425 python tag from pp373, +meaning “PyPy 7.3 running python3”, to pp36, meaning “PyPy running Python +3.6” (the format is recommended in the PEP). The tag itself was +changed in 7.3.0, but older pip versions build their own tag without querying +PyPy. This means that wheels built for the previous tag format will not be +discovered by pip from this version, so library authors should update their +PyPy-specific wheels on PyPI.

                              +Development of PyPy is transitioning to https://foss.heptapod.net/pypy/pypy. +This move was covered more extensively in the blog post from last month.

                              +The CFFI backend has been updated to version 1.14.0. We recommend using CFFI +rather than c-extensions to interact with C, and using cppyy for performant +wrapping of C++ code for Python. The cppyy backend has been enabled +experimentally for win32, try it out and let us know how it works.

                              +Enabling cppyy requires a more modern C compiler, so win32 is now built +with MSVC160 (Visual Studio 2019). This is true for PyPy 3.6 as well as for 2.7.

                              +We have improved warmup time by up to 20%, performance of io.StringIO to +match if not be faster than CPython, and improved JIT code generation for +generators (and generator expressions in particular) when passing them to +functions like sum and map that consume them. Performance of closures has also been improved in certain situations.

                              +As always, this release fixed several issues and bugs raised by the growing +community of PyPy users. We strongly recommend updating. Many of the fixes are +the direct result of end-user bug reports, so please continue reporting issues +as they crop up.
                              + +You can find links to download the v7.3.1 releases here:
                              + +
                              +We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.

                              +We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 13 new contributors, +thanks for pitching in.

                              +If you are a Python library maintainer and use c-extensions, please consider +making a cffi / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.
                              +

                              +

                              +What is PyPy? +

                              +PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6, and soon 3.7. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                              +We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +This PyPy release supports:
                              +
                              +
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              • 64-bit ARM machines running Linux.
                              • +
                              +
                              +
                              +
                              +

                              +What else is new? +

                              +For more information about the 7.3.1 release, see the full changelog.

                              +Please update, and continue to help us make PyPy better.

                              +Cheers,
                              +The PyPy team +
                              +


                              +The PyPy Team 
                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2020/08/a-new-chapter-for-pypy-8388322709667328389.html b/posts/2020/08/a-new-chapter-for-pypy-8388322709667328389.html new file mode 100644 index 000000000..976c5af9b --- /dev/null +++ b/posts/2020/08/a-new-chapter-for-pypy-8388322709667328389.html @@ -0,0 +1,404 @@ + + + + + +A new chapter for PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              A new chapter for PyPy

                              + + + +
                              +

                              PyPy winds down its membership in the Software Freedom Conservancy

                              + +

                              Conservancy and PyPy's great work together

                              + +

                              PyPy joined Conservancy in +the second half of 2010, shortly after the release of +PyPy 1.2, the first version to contain a fully functional JIT. In 2013, PyPy +started supporting ARM, bringing its just-in-time speediness to many more devices and began working toward supporting NumPy to help +scientists crunch their numbers faster. Together, PyPy and Conservancy ran successful fundraising drives and facilitated payment +and oversight for contractors and code sprints.

                              + +

                              Conservancy supported PyPy's impressive growth as it expanded support for +different hardware platforms, greatly improved the performance of C extensions, +and added support for Python 3 as the language itself evolved.

                              + +

                              The road ahead

                              + +

                              Conservancy provides a fiscal and organizational home for projects that find the +freedoms and guardrails that come along with a charitable home advantageous for +their community goals. While this framework was a great fit for the early PyPy +community, times change and all good things must come to an end.

                              + +

                              PyPy will remain a free and open source project, but the community's structure +and organizational underpinnings will be changing and the PyPy community will be +exploring options outside of the charitable realm for its next phase of growth +("charitable" in the legal sense -- PyPy will remain a community project).

                              + +

                              During the last year PyPy and Conservancy have worked together to properly +utilise the generous donations made by stalwart PyPy enthusiasts over the years +and to wrap up PyPy's remaining charitable obligations. PyPy is grateful for +the Conservancy's help in shepherding the project toward its next chapter.

                              + +

                              Thank yous

                              +

                              From Conservancy:

                              +

                              +
                              "We are happy that Conservancy was able to help PyPy bring important software +for the public good during a critical time in its history. We wish the +community well and look forward to seeing it develop and succeed in new ways."
                              +
                              +
                              — Karen Sandler, Conservancy's Executive Director
                              +

                              +

                              From PyPy:

                              +

                              +
                              +

                              "PyPy would like to thank Conservancy for their decade long support in +building the community and wishes Conservancy continued success in their +journey promoting, improving, developing and defending free and open source +software."

                              +

                              — Simon Cross & Carl Friedrich Bolz-Tereick, on behalf of PyPy.

                              +
                              +

                              +
                              +
                              + +

                              About

                              + +

                              PyPy is a multi-layer python interpreter with a built-in JIT compiler that runs +Python quickly across different computing environments. +Software Freedom Conservancy (Conservancy) is a charity that provides a home +to over forty free and open source software projects.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + intgr wrote on 2020-08-12 23:36: +
                              +
                              +

                              This post has lots of words but unfortunately contains almost no information. What impact does this change have on PyPy? What is the new chapter?

                              +
                              +
                              +
                              +
                              + + Rick Sanchez wrote on 2020-08-13 06:33: +
                              +
                              +

                              What does PyPy do? Why should I use it over other Python compilers?

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2020-08-13 08:38: +
                              +
                              +

                              @intgr the wind-down with the SFC hasn't been smooth and this is the politically-neutral, agreed-by-both-parties post. PyPy remains the same free and open-source project. Essentially we just switched to a different money-handler. We're announcing it in the next blog post.

                              +
                              +
                              +
                              +
                              + + Florian wrote on 2020-08-19 11:10: +
                              +
                              +

                              As https://bitbucket.org/pypy/pypy/downloads/pypy2.7-v7.3.1-linux32.tar.bz2 is down (due to heptapod move ?)

                              Where can we download pypy binaries ?

                              + +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2020-08-19 22:44: +
                              +
                              +

                              The page https://pypy.org/download.html contains the updated links.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2020/08/pypy-is-on-open-collective-5673322428814364737.html b/posts/2020/08/pypy-is-on-open-collective-5673322428814364737.html new file mode 100644 index 000000000..45b6b4d30 --- /dev/null +++ b/posts/2020/08/pypy-is-on-open-collective-5673322428814364737.html @@ -0,0 +1,300 @@ + + + + + +PyPy is on Open Collective | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy is on Open Collective

                              + + + +
                              +

                              Hi all,

                              + +

                              PyPy is now a member of Open Collective, a fiscal host. We have been thinking about switching to this organization for a couple of years; we like it for various reasons, like the budget transparency and the lightweight touch. We can now officially announce our membership!

                              + +

                              With this, we are now again free to use PyPy for all financial issues, like receiving funds professionally, paying parts of sprint budgets as we like, and so on. We will shortly be reintroducing buttons that link to Open Collective from the PyPy web site.

                              + +

                              Although the old donation buttons were removed last year, we believe that there are still a few people that regularly send money to the SFC, the not-for-profit charity we were affiliated with. If you do, please stop doing it now (and, if you like to do so, please set up an equivalent donation to PyPy on Open Collective).

                              + +

                              And by the way, sorry to all of you who were getting mixed feelings from the previous blog post (co-written with the SFC). PyPy is committed to continue being Open Source just like before. This was never in question. What these two blog posts mean is only that we switched to a different organization for our internal finances.

                              + +

                              We're looking forward to how this new relationship will go!

                              + +

                              Armin Rigo, for the PyPy team

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2020/09/pypy-732-triple-release-python-27-36-3980901335490872787.html b/posts/2020/09/pypy-732-triple-release-python-27-36-3980901335490872787.html new file mode 100644 index 000000000..6ff3ae5ef --- /dev/null +++ b/posts/2020/09/pypy-732-triple-release-python-27-36-3980901335490872787.html @@ -0,0 +1,425 @@ + + + + + +PyPy 7.3.2 triple release: python 2.7, 3.6, and 3.7 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy 7.3.2 triple release: python 2.7, 3.6, and 3.7

                              + + + +
                              +

                               

                              +
                              The PyPy team is proud to release version 7.3.2 of PyPy, which includes +three different interpreters: +
                              +
                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.13
                              • +
                              • PyPy3.6: which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.9.
                              • +
                              • PyPy3.7 alpha: which is our first release of an interpreter supporting the +syntax and the features of Python 3.7, including the stdlib for CPython +3.7.9. We call this an alpha release since it is our first. It is based off PyPy 3.6 so +issues should be around compatibility and not stability. Please try it out +and let us know what is broken or missing. We have not implemented some of the +documented changes in the re module, and other pieces are also +missing. For more information, see the PyPy 3.7 wiki page
                              • +
                              +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the 7.3.0 (Dec +2019) and 7.3.1 (April 2020) releases, but read on to find out what is new.

                              +

                              Conda Forge now supports PyPy as a python interpreter. The support is quite +complete for linux and macOS. This is the result of a lot of +hard work and good will on the part of the Conda Forge team. A big shout out +to them for taking this on.

                              +

                              Development of PyPy has transitioned to https://foss.heptapod.net/pypy/pypy. +This move was covered more extensively in this blog post. We have seen an +increase in the number of drive-by contributors who are able to use gitlab + +mercurial to create merge requests.

                              +

                              The CFFI backend has been updated to version 1.14.2. We recommend using CFFI +rather than c-extensions to interact with C, and using cppyy for performant +wrapping of C++ code for Python.

                              +

                              NumPy has begun shipping wheels on PyPI for PyPy, currently for linux 64-bit +only. Wheels for PyPy windows will be available from the next NumPy release. Thanks to NumPy for their support.

                              +

                              A new contributor took us up on the challenge to get windows 64-bit support. +The work is proceeding on the win64 branch, more help in coding or +sponsorship is welcome.

                              +

                              As always, this release fixed several issues and bugs. We strongly recommend +updating. Many of the fixes are the direct result of end-user bug reports, so +please continue reporting issues as they crop up.

                              +

                              You can find links to download the v7.3.2 releases here:

                              +
                              + +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. Please help support us at Open Collective. If PyPy is not yet good enough for your needs, we are available for +direct consulting work.

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 8 new contributors, +thanks for pitching in.

                              +

                              If you are a python library maintainer and use c-extensions, please consider +making a cffi / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +
                              +

                              What is PyPy?

                              +

                              PyPy is a very compliant Python interpreter, almost a drop-in replacement for +CPython 2.7, 3.6, and 3.7. It’s fast (PyPy and CPython 2.7.x performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              This PyPy release supports:

                              +
                              +
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              • 64-bit ARM machines running Linux.
                              • +
                              +
                              +

                              PyPy does support ARM 32 bit processors, but does not release binaries.

                              +
                              + +
                              +

                              +What else is new? +

                              +For more information about the 7.3.2 release, see the full changelog.

                              +Please update, and continue to help us make PyPy better.

                              +Cheers,
                              +The PyPy team +
                              +

                               

                              +

                               

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Marius Gedminas wrote on 2020-09-25 09:47: +
                              +
                              +

                              The SHA256 checksum for pypy3.6-v7.3.2-aarch64.tar.bz2 is one character too short on the https://www.pypy.org/download.html. Was it accidentally truncated during a copy/paste?

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2020-09-25 14:03: +
                              +
                              +

                              Better work

                              +
                              +
                              +
                              +
                              + + Gaëtan de Menten wrote on 2020-09-25 14:45: +
                              +
                              +

                              Concerning sponsorship for the win64 branch, I am in no position to sponsor the entire thing but I would happily give a few euros if you opened a specific fund raiser for that. Note that the donation link (on the blog) is currently broken.

                              +
                              +
                              +
                              +
                              + + mattip wrote on 2020-09-26 17:32: +
                              +
                              +

                              Marius Gedminas: thanks. Indeed a copy-paste truncation. Should be fixed now, try to refresh (may take 30 minutes or so to propagate)

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2020/11/pypy-733-triple-release-python-37-36-3446596804408262749.html b/posts/2020/11/pypy-733-triple-release-python-37-36-3446596804408262749.html new file mode 100644 index 000000000..c88a0998d --- /dev/null +++ b/posts/2020/11/pypy-733-triple-release-python-37-36-3446596804408262749.html @@ -0,0 +1,380 @@ + + + + + +PyPy 7.3.3 triple release: python 3.7, 3.6, and 2.7 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy 7.3.3 triple release: python 3.7, 3.6, and 2.7

                              + + + +
                              +

                               The PyPy team is proud to release the version 7.3.3 of PyPy, which includes +three different interpreters: +

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18 (updated from the +previous version)
                              • +
                              • PyPy3.6: which is an interpreter supporting the syntax and the features of +Python 3.6, including the stdlib for CPython 3.6.12 (updated from the +previous version).
                              • +
                              • PyPy3.7 beta: which is our second release of an interpreter supporting the +syntax and the features of Python 3.7, including the stdlib for CPython +3.7.9. We call this beta quality software, there may be issues about +compatibility with new and changed features in CPython 3.7. +Please let us know what is broken or missing. We have not implemented the +documented changes in the re module, and a few other pieces are also +missing. For more information, see the PyPy 3.7 wiki page
                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the 7.3 +releases, but read on to find out what is new.

                              +

                              Several issues found in the 7.3.2 release were fixed. Many of them came from the +great work by conda-forge to ship PyPy binary packages. A big shout out +to them for taking this on.

                              +

                              Development of PyPy has moved to https://foss.heptapod.net/pypy/pypy. +This was covered more extensively in this blog post. We have seen an +increase in the number of drive-by contributors who are able to use gitlab + +mercurial to create merge requests.

                              +

                              The CFFI backend has been updated to version 1.14.3. We recommend using CFFI +rather than c-extensions to interact with C, and using cppyy for performant +wrapping of C++ code for Python.

                              +

                              A new contributor took us up on the challenge to get windows 64-bit support. +The work is proceeding on the win64 branch, more help in coding or +sponsorship is welcome. In anticipation of merging this large change, we fixed +many test failures on windows.

                              +

                              As always, this release fixed several issues and bugs. We strongly recommend +updating. Many of the fixes are the direct result of end-user bug reports, so +please continue reporting issues as they crop up.

                              +

                              You can find links to download the v7.3.3 releases here:

                              +
                              + +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work.

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on pypy, or general help with making RPython’s JIT even better. Since the +previous release, we have accepted contributions from 2 new contributors, +thanks for pitching in.

                              +

                              If you are a python library maintainer and use c-extensions, please consider +making a cffi / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +
                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.6, and +3.7. It’s fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              This PyPy release supports:

                              +
                              +
                                +
                              • +x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32 bits, OpenBSD, FreeBSD)
                              • +
                              • big- and little-endian variants of PPC64 running Linux,
                              • +
                              • +s390x running Linux
                              • +
                              • 64-bit ARM machines running Linux.
                              • +
                              +
                              +

                              PyPy does support ARM 32 bit processors, but does not release binaries.

                              +

                               

                              +

                              +What else is new? +

                              +For more information about the 7.3.3 release, see the full changelog.

                              +Please update, and continue to help us make PyPy better.

                              +Cheers,
                              +The PyPy team +

                               

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2020/12/mac-meets-arm64-940822335619099039.html b/posts/2020/12/mac-meets-arm64-940822335619099039.html new file mode 100644 index 000000000..f79451576 --- /dev/null +++ b/posts/2020/12/mac-meets-arm64-940822335619099039.html @@ -0,0 +1,423 @@ + + + + + +Mac meets Arm64 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Mac meets Arm64

                              + + + +
                              + Looking for sponsorship + +

                              Apple now ships Macs which are running on an arm64 variant machine with the +latest version of MacOS, Big Sur M1. We are getting requests for PyPy to +support this new architecture. Here is our position on this topic (or at least +mine, Armin Rigo's), and how you can help.

                              + +

                              Porting PyPy is harder than just re-running the compiler, because PyPy contains +a few big architecture-dependent "details", like the JIT compiler and the +foreign function interfaces (CFFI and ctypes).

                              + +

                              Fixing the JIT compiler should not be too much work: we already support arm64, +just the Linux one. But Apple made various details different (like the calling +conventions). A few other parts need to be fixed too, notably CFFI and ctypes, +again because of the calling conventions.

                              + +

                              Fixing that would be a reasonable amount of work. I would do it myself for a +small amount of money. However, the story doesn't finish here. Obviously, the +start of the story would be to get ssh access to a Big Sur M1 machine. (If at +this point you're thinking "sure, I can give you ssh access for three months", +then please read on.) The next part of the story is that we need a machine +available long term. It can be either a machine provided and maintained by a +third party, or alternatively a pot of money big enough to support the +acquisition of a machine and ongoing work of one of us.

                              + +

                              If we go with the provided-machine solution: What we need isn't a lot of +resources. Our CI requires maybe 10 GB of disk space, and a few hours of CPU +per run. It should fit into 8 GB of RAM. We normally do a run every night but +we can certainly lower the frequency a bit if that would help. However, we'd +ideally like some kind of assurance that you are invested in maintaining the +machine for the next 3-5 years (I guess, see below). We had far too many +machines that disappeared after a few months.

                              + +

                              If we go with the money-supported solution: it's likely that after 3-5 years +the whole Mac base will have switched to arm64, we'll drop x86-64 support for +Mac, and we'll be back to the situation of the past where there was only one +kind of Mac machine to care about. In the meantime, we are looking at 3-5 +years of lightweight extra maintenance. We have someone that has said he would +do it, but not for free.

                              + +

                              If either of these two solutions occurs, we'll still have, I quote, "probably +some changes in distutils-type stuff to make python happy", and then some +packaging/deployment changes to support the "universal2" architecture, i.e. +including both versions inside a single executable (which will not be just an +extra switch to clang, because the two versions need a different JIT backend +and so must be translated separately).

                              + +

                              So, now all the factors are on the table. We won't do the minimal "just the +JIT compiler fixes" if we don't have a plan that goes farther. Either we get +sufficient money, and maybe support, and then we can do it quickly; or PyPy +will just remain not natively available on M1 hardware for the next 3-5 years. +We are looking forward to supporting M1, and view resources contributed by +the community as a vote of confidence in assuring the future of PyPy on this +hardware. Contact us: pypy-dev@python.org, or our private mailing +list pypy-z@python.org.

                              + +

                              Thanks for reading!

                              + +

                              Armin Rigo

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + + Adam Sah wrote on 2020-12-31 14:16: +
                              +
                              +

                              if you post a crowdsourcing link (e.g. gofundme, etc) I'd be happy to contribute, and now that it's hit the front page of HN, I'm sure lots of other people would join. M1 macs are pretty inexpensive.

                              p.s. thanks!!! for all the work - I use pypy regularly.

                              +
                              +
                              +
                              +
                              + + Joshua Herman wrote on 2020-12-31 16:47: +
                              +
                              +

                              I have an M1 MacBook Air that I could give you SSH access to but it will come to me in mid January.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2020-12-31 21:51: +
                              +
                              +

                              ditto on the crowdsource

                              +
                              +
                              +
                              +
                              + + Michael wrote on 2021-01-01 00:03: +
                              +
                              +

                              You can contribute to PyPy on their Open Collective page:

                              https://opencollective.com/pypy

                              +
                              +
                              +
                              +
                              + + Adam Sah wrote on 2021-01-01 00:25: +
                              +
                              +

                              done.

                              +
                              +
                              +
                              +
                              + + Anonymous wrote on 2021-01-02 20:03: +
                              +
                              +

                              M1 Macs for CI are available for free for open source developers. See: https://www.macstadium.com/opensource

                              +
                              +
                              +
                              +
                              + + Armin Rigo wrote on 2021-01-02 20:29: +
                              +
                              +

                              @Anonymous: like many others, MacStadium is conflating "open source" with "hobbyist" by adding this clause: "Open source project may not (...)receive funding from commercial companies or organizations (NGO, education, research or governmental). (...) Contributors who are paid to work on the project are not eligible." The point of my blog post was precisely that I won't do it for free.

                              +
                              +
                              +
                              +
                              + + glyph wrote on 2021-01-04 05:39: +
                              +
                              +

                              It seems like it might be worth reaching out to MacStadium about it regardless. They've got Golang, Rust, Node, NumFocus, and Monero listed on their support page https://www.macstadium.com/opensource-members which suggests to me that this language might just be a hamfistedly awkward attempt to avoid somebody at Facebook trying to get a free fleet of mac minis out of open sourcing their SDK or something.

                              +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/03/new-hpy-blog.html b/posts/2021/03/new-hpy-blog.html new file mode 100644 index 000000000..4c27076fb --- /dev/null +++ b/posts/2021/03/new-hpy-blog.html @@ -0,0 +1,297 @@ + + + + + +New HPy blog | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              New HPy blog

                              + + + +
                              +

                              Regular readers of this blog +already know +about HPy, a project which aims to develop a new C +API for Python to make it easier/faster to support C extensions on alternative +Python implementations, including PyPy.

                              +

                              The HPy team just published the +first post of HPy's new +blog, so if you are interested in its development, make sure to check it out!

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/03/pypys-blog-has-moved.html b/posts/2021/03/pypys-blog-has-moved.html new file mode 100644 index 000000000..d933613c6 --- /dev/null +++ b/posts/2021/03/pypys-blog-has-moved.html @@ -0,0 +1,331 @@ + + + + + +PyPy's blog has moved | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy's blog has moved

                              + + + +
                              +

                              For many years, PyPy has been publishing blog posts at +https://morepypy.blogspot.com. From now on, +the posts will be here, at https://pypy.org/blog. The +RSS feed is https://pypy.org/rss.xml. The original +content has been migrated to the newer site, including comments.

                              + + +

                              Among the motivations for the move were:

                              +

                              One site to rule them all

                              +

                              Adding the blog posts here seems like a natural extension of the web site +rather than outsourcing it to a third-party. Since the site is generated using +the static site generator nikola from the github repo +https://github.com/pypy/pypy.org, we also +have good source control for the content.

                              +

                              CI previews, and github

                              +

                              Those of you who follow PyPy may note something new in the URL for the repo: +until now PyPy has been using mercurial as hosted +on https://foss.heptapod.net. While +heptapod (a community driven effort to bring mercurial +support to GitLab™) does provide a GitLab CI runner for the open source +offering, on github it is easier to integrate netlify +for previews. Hopefully the move to the more popular github platform will +encourage new contributors to publish their success stories around using PyPy +and the RPython toolchain.

                              +

                              Comments

                              +

                              Comments to blog posts are generated via the utterances +javascript plugin. The comments appear as issues in the repo. +When viewing the site, a query is made to fetch the comments to the issue with +that name. To comment, users must authorize the utterances app to post on their +behalf using the GitHub +OAuth flow. +Alternatively, users can comment on the GitHub issue directly. The interaction +with github for authentication and moderation seems more natural than the +manual moderation required on blogspot.

                              +

                              Please prove to us that the move is worth it

                              +

                              Help us with guest blog posts, and PRs to improve the styling of the site. One +already open issue is that the +navbar needlessly uses javascript, help to keep the responsive style in pure +CSS is welcome. The theme could also use tweaking.

                              +

                              But more importantly, we want to hear from you. Guest blog posts about +PyPy are welcome. Just follow the directions in the repo's README to create a +PR with your favorite PyPy story.

                              +

                              The PyPy Team

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/04/pypy-v734-release-of-python-27-and-37.html b/posts/2021/04/pypy-v734-release-of-python-27-and-37.html new file mode 100644 index 000000000..3bb94aff1 --- /dev/null +++ b/posts/2021/04/pypy-v734-release-of-python-27-and-37.html @@ -0,0 +1,395 @@ + + + + + +PyPy v7.3.4: release of python 2.7 and 3.7 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.4: release of python 2.7 and 3.7

                              + + + +
                              +

                              PyPy v7.3.4: release of python 2.7 and 3.7

                              +

                              The PyPy team is proud to release the version 7.3.4 of PyPy, which includes +two different interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.10. We no longer refer to +this as beta-quality as the last incompatibilities with CPython (in the +re module) have been fixed.

                              • +
                              +
                              +

                              We are no longer releasing a Python3.6 version, as we focus on updating to +Python 3.8. We have begun streaming the advances towards this goal on Saturday +evenings European time on https://www.twitch.tv/pypyproject. If Python3.6 is +important to you, please reach out as we could offer sponsored longer term +support.

                              +

                              The two interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release include binary Windows 64 support, +faster numerical instance fields, and a preliminary HPy backend.

                              +

                              A new contributor (Ondrej Baranovič - thanks!) took us up on the challenge to get +windows 64-bit support. The work has been merged and for the first time we +are releasing a 64-bit Windows binary package.

                              +

                              The release contains the biggest change to PyPy's implementation of the +instances of user-defined classes in many years. The optimization was +motivated by the report of performance problems running a numerical particle +emulation. We implemented an optimization that stores int and float +instance fields in an unboxed way, as long as these fields are type-stable +(meaning that the same field always stores the same type, using the principle +of type freezing). This gives significant performance improvements on +numerical pure-Python code, and other code where instances store many integers +or floating point numbers.

                              +

                              There were also a number of optimizations for methods around strings and bytes, +following user reported performance problems. If you are unhappy with PyPy's +performance on some code of yours, please report an issue!

                              +

                              A major new feature is preliminary support for the Universal mode of HPy: a +new way of writing c-extension modules to totally encapsulate PyObject*. +The goal, as laid out in the HPy documentation and recent HPy blog post, +is to enable a migration path +for c-extension authors who wish their code to be performant on alternative +interpreters like GraalPython (written on top of the Java virtual machine), +RustPython, and PyPy. Thanks to Oracle and IBM for sponsoring work on HPy.

                              +

                              Support for the vmprof statistical profiler has been extended to ARM64 via a +built-in backend.

                              +

                              Several issues exposed in the 7.3.3 release were fixed. Many of them came from the +great work ongoing to ship PyPy-compatible binary packages in conda-forge. +A big shout out to them for taking this on.

                              +

                              Development of PyPy takes place on https://foss.heptapod.net/pypy/pypy. +We have seen an increase in the number of drive-by contributors who are able to +use gitlab + mercurial to create merge requests.

                              +

                              The CFFI backend has been updated to version 1.14.5 and the cppyy backend +to 1.14.2. We recommend using CFFI rather than C-extensions to interact with C, +and using cppyy for performant wrapping of C++ code for Python.

                              +

                              As always, we strongly recommend updating to the latest versions. Many fixes +are the direct result of end-user bug reports, so please continue reporting +issues as they crop up.

                              +

                              You can find links to download the v7.3.4 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our renovated blog site via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better. Since the +previous release, we have accepted contributions from 10 new contributors, +thanks for pitching in, and welcome to the project!

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a cffi / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +soon 3.8. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              This PyPy release supports:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32/64 bits, OpenBSD, FreeBSD)

                              • +
                              • big- and little-endian variants of PPC64 running Linux,

                              • +
                              • s390x running Linux

                              • +
                              • 64-bit ARM machines running Linux.

                              • +
                              +
                              +

                              PyPy does support ARM 32 bit processors, but does not release binaries.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.4 release, see the full changelog.

                              +

                              Please update, and continue to help us make PyPy better.

                              +

                              Cheers, +The PyPy team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/04/ways-pypy-graphviz.html b/posts/2021/04/ways-pypy-graphviz.html new file mode 100644 index 000000000..43a3dc14a --- /dev/null +++ b/posts/2021/04/ways-pypy-graphviz.html @@ -0,0 +1,334 @@ + + + + + +Some Ways that PyPy uses Graphviz | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Some Ways that PyPy uses Graphviz

                              + + + +
                              +

                              Some ways that PyPy uses Graphviz

                              +

                              Somebody wrote this super cool thread on Twitter about using Graphviz to make +software visualize its internal state:

                              +

                              PyPy is using this approach a lot too and I collected a few screenshots of that +technique on Twitter and I thought it would make a nice blog post too!

                              +

                              The most important view early in the project, and the way that our Graphviz +visualizations got started was that we implemented a way to look at the control +flow graphs of our RPython functions after type inference. They are in static +single information form (SSI), a variant of SSA form. Hovering over the +variables shows the inferred types in the footer:

                              +/images/2021-graphviz-02-cfg-types.png

                              There's another view that shows the inferred call graph of the program:

                              +/images/2021-graphviz-05-call-graph.png

                              A related viewer shows the inferred class hierarchy (in this case the exception +hierarchy) and you can focus on a single class, which will show you its base +classes and all the methods and instance attributes that were found:

                              +/images/2021-graphviz-03-classhier.png/images/2021-graphviz-04-classhier-detailed.png

                              We also have a view to show us the traces that are produced by the tracing JIT +tests. This viewer doesn't really scale to the big traces that the full Python +interpreter produces, but it's really useful during testing:

                              +/images/2021-graphviz-06-trace.png

                              Then there are more traditional tree views, eg here is a parse tree for a small +piece of Python source code:

                              +/images/2021-graphviz-07-parse-tree.png

                              Parsing-related we have visualized the DFAs of the parser in the past, +though the code is unfortunately lost.

                              +

                              All these visualizations are made by walking the relevant data structures and +producing a Graphviz input file using a bit of string manipulation, which is +quite easy to do. Knowing a bit of Graphviz is a really useful skill, it's +super easy to make throwaway visualizations.

                              +

                              For example here is a one-off thing I did when debugging our JSON parser to +show the properties of the objects used in a huge example json file:

                              +/images/2021-graphviz-08-json-parser.png

                              On top of graphviz, we have a custom tool called the dotviewer, which is +written in Python and uses Pygame to give you a zoomable, pannable, searchable +way to look at huge Graphviz graphs. All the images in this post are +screenshots of that tool. In its simplest form it takes any .dot files as +input.

                              +

                              Here's a small video of the dotviewer, moving around and searching in the json graph. +By writing a bit of extra Python code the dotviewer can also be extended to add +hyperlinks in the graphs to navigate to different views (for example, we did +that for the callgraphs above).

                              +

                              All in all this is a really powerful approach to understand the behaviour of +some piece of code, or when debugging complicated problems and we have gotten a +huge amount of mileage out of this over the years. It can be seen as an instance +of moldable development ("a way of programming through which you construct +custom tools for each problem"). And it's really easy to get into! The Graphviz +language is quite a simple text-based language that can be applied to a huge +number of different visualization situations.

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/05/pypy-irc-moves-to-libera-chat.html b/posts/2021/05/pypy-irc-moves-to-libera-chat.html new file mode 100644 index 000000000..c353e1f93 --- /dev/null +++ b/posts/2021/05/pypy-irc-moves-to-libera-chat.html @@ -0,0 +1,297 @@ + + + + + +#pypy IRC moves to Libera.Chat | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              #pypy IRC moves to Libera.Chat

                              + + + +
                              +

                              Following the example of many other FOSS projects, the PyPy team has +decided to move its official #pypy IRC channel from Freenode to +Libera.Chat: irc.libera.chat/pypy

                              +

                              The core devs will no longer be present on the Freenode channel, so we recommend +joining the new channel as soon as possible.

                              +

                              wikimedia.org has a +nice guide on +how to set up your client to migrate from Freenode to Libera.Chat.

                              + +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/05/pypy-v735-release.html b/posts/2021/05/pypy-v735-release.html new file mode 100644 index 000000000..ceaa00a5a --- /dev/null +++ b/posts/2021/05/pypy-v735-release.html @@ -0,0 +1,369 @@ + + + + + +PyPy v7.3.5: bugfix release of python 2.7 and 3.7 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.5: bugfix release of python 2.7 and 3.7

                              + + + +
                              +

                              PyPy v7.3.5: release of 2.7 and 3.7

                              +

                              We are releasing a PyPy 7.3.5 with bugfixes for PyPy 7.3.4, released April 4. +PyPy 7.3.4 was the first release that runs on windows 64-bit, so that support +is still "beta". We are releasing it in the hopes that we can garner momentum +for its continued support, but are already aware of some problems, for instance +it errors in the NumPy test suite (issue 3462). Please help out with testing +the release and reporting successes and failures, financially supporting our +ongoing work, and helping us find the source of these problems.

                              +
                                +
                              • The new windows 64-bit builds improperly named c-extension modules +with the same extension as the 32-bit build (issue 3443)

                              • +
                              • Use the windows-specific PC/pyconfig.h rather than the posix one

                              • +
                              • Fix the return type for _Py_HashDouble which impacts 64-bit windows

                              • +
                              • A change to the python 3.7 sysconfig.get_config_var('LIBDIR') was wrong, +leading to problems finding libpypy3-c.so for embedded PyPy (issue 3442).

                              • +
                              • Instantiate distutils.command.install schema for PyPy-specific +implementation_lower

                              • +
                              • Delay thread-checking logic in greenlets until the thread is actually started +(continuation of issue 3441)

                              • +
                              • +

                                Four upstream (CPython) security patches were applied:

                                +
                                  +
                                • BPO 42988 to remove pydoc.getfile

                                • +
                                • BPO 43285 to not trust the PASV response in ftplib.

                                • +
                                • BPO 43075 to remove a possible ReDoS in urllib AbstractBasicAuthHandler

                                • +
                                • BPO 43882 to sanitize urls containing ASCII newline and tabs in +urllib.parse

                                • +
                                +
                              • +
                              • Fix for json-specialized dicts (issue 3460)

                              • +
                              • Specialize ByteBuffer.setslice which speeds up binary file reading by a +factor of 3

                              • +
                              • When assigning the full slice of a list, evaluate the rhs before clearing the +list (issue 3440)

                              • +
                              • On Python2, PyUnicode_Contains accepts bytes as well as unicode.

                              • +
                              • Finish fixing _sqlite3 - untested _reset() was missing an argument +(issue 3432)

                              • +
                              • Update the packaged sqlite3 to 3.35.5 on windows. While not a bugfix, this +seems like an easy win.

                              • +
                              +

                              We recommend updating. These fixes are the direct result of end-user bug +reports, so please continue reporting issues as they crop up.

                              +

                              You can find links to download the v7.3.5 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our renovated blog site via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better.

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a CFFI / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +soon 3.8. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              This PyPy release supports:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 32/64 bits, OpenBSD, FreeBSD)

                              • +
                              • big- and little-endian variants of PPC64 running Linux,

                              • +
                              • s390x running Linux

                              • +
                              • 64-bit ARM machines running Linux.

                              • +
                              +
                              +

                              PyPy does support ARM 32 bit processors, but does not release binaries.

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/09/jit-auto-generated-code.html b/posts/2021/09/jit-auto-generated-code.html new file mode 100644 index 000000000..9faea860c --- /dev/null +++ b/posts/2021/09/jit-auto-generated-code.html @@ -0,0 +1,553 @@ + + + + + +Better JIT Support for Auto-Generated Python Code | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Better JIT Support for Auto-Generated Python Code

                              + + + +
                              +

                              Performance Cliffs

                              +

                              A common bad property of many different JIT compilers is that of a "performance +cliff": A seemingly reasonable code change, leading to massively reduced +performance due to hitting some weird property of the JIT compiler that's not +easy to understand for the programmer (e.g. here's a blog post about the fix of +a performance cliff when running React on +V8). Hitting a performance cliff as a +programmer can be intensely frustrating and turn people off from using PyPy +altogether. Recently we've been working on trying to remove some of PyPy's +performance cliffs, and this post describes one such effort.

                              +

                              The problem showed up in an issue +where somebody found the performance +of their website using Tornado a lot +worse than what various benchmarks suggested. It took some careful digging to +figure out what caused the problem: The slow performance was caused by the huge +functions that the Tornado templating engine creates. These functions lead the +JIT to behave in unproductive ways. In this blog post I'll describe why the +problem occurs and how we fixed it.

                              +

                              Problem

                              +

                              After quite a bit of debugging we narrowed down the problem to the following +reproducer: If you render a big HTML template +(example) +using the Tornado templating engine, the template rendering is really not any +faster than CPython. A small template doesn't show this behavior, and other +parts of Tornado seem to perform well. So we looked into how the templating +engine works, and it turns out that the templates are compiled into Python +functions. This means that a big template can turn into a really enormous Python +function (Python version of the +example). +For some reason really enormous Python functions aren't handled particularly +well by the JIT, and in the next section I'll explain some of the background that's +necessary to understand why this happens.

                              +

                              Trace Limits and Inlining

                              +

                              To understand why the problem occurs, it's necessary to understand how PyPy's +trace limit and inlining work. The tracing JIT has a maximum trace length built +in, the reason for that is some limitation in the compact encoding of traces in +the JIT. Another reason is that we don't want to generate arbitrarily large chunks +of machine code. Usually, when we hit the trace limit, it is due to inlining. +While tracing, the JIT will inline many of the functions called from the +outermost one. This is usually good and improves performance greatly, however, +inlining can also lead to the trace being too long. If that happens, we +will mark a called function as uninlinable. The next time we trace the outer +function we won't inline it, leading to a shorter trace, which hopefully fits +the trace limit.

                              +

                              Diagram illustrating the interaction of the trace limit and inlining

                              +

                              In the diagram above we trace a function f, which calls a function g, which +is inlined into the trace. The trace ends up being too long, so the JIT +disables inlining of g. The next time we try to trace f the trace will +contain a call to g instead of inlining it. The trace ends up being not too +long, so we can turn it into machine code when tracing finishes.

                              +

                              Now we know enough to understand what the problem with automatically generated +code is: sometimes, the outermost function itself +doesn't fit the trace limit, without any inlining going on at all. This is +usually not the case for normal, hand-written Python functions. However, it can +happen for automatically generated Python code, such as the code that the +Tornado templating engine produces.

                              +

                              So, what happens when the JIT hits such a huge function? The function is traced +until the trace is too long. Then the trace limit stops further tracing. Since +nothing was inlined, we cannot make the trace shorter the next time by disabling +inlining. Therefore, this happens again and again, the next time we trace the +function we run into exactly the same problem. The net effect is that the +function is even slowed down: we spend time tracing it, then stop tracing and +throw the trace away. Therefore, that effort is never useful, so the resulting +execution can be slower than not using the JIT at all!

                              +

                              Solution

                              +

                              To get out of the endless cycle of useless retracing we first had the idea of +simply disabling all code generation for such huge functions, that produce too long +traces even if there is no inlining at all. However, that led to disappointing +performance in the example Tornado program, because important parts of the code +remain always interpreted.

                              +

                              Instead, our solution is now as follows: After we have hit the trace limit and +no inlining has happened so far, we mark the outermost function as a source of huge +traces. The next time we trace such a function, we do so in a special mode. In +that mode, hitting the trace limit behaves differently: Instead of stopping the +tracer and throwing away the trace produced so far, we will use the unfinished +trace to produce machine code. This trace corresponds to the first part of the +function, but stops at a basically arbitrary point in the middle of the +function.

                              +

                              The question is what should happen when execution +reaches the end of this unfinished trace. We want to be able to cover more of +the function with machine code and therefore need to extend the trace +from that point on. But we don't want to do that too +eagerly to prevent lots and lots of machine code being generated. To achieve +this behaviour we add a guard to the end of the unfinished trace, which will +always fail. This has the right behaviour: a failing guard will transfer control +to the interpreter, but if it fails often enough, we can patch it to jump to +more machine code, that starts from this position. In that way, we can slowly +explore the full gigantic function and add all those parts of the control flow +graph that are actually commonly executed at runtime.

                              +

                              Diagram showing what happens in the new jit when tracing a huge function

                              +

                              In the diagram we are trying to trace a huge function f, which leads to +hitting the trace limit. However, nothing was inlined into the trace, so +disabling inlining won't ensure a successful trace attempt the next time. +Instead, we mark f as "huge". This has the effect that when we trace it again +and are about to hit the trace limit, we end the trace at an arbitrary point by +inserting a guard that always fails.

                              +

                              Diagram showing what happens in the new jit when tracing a huge function until completion

                              +

                              If this guard failure is executed often enough, we might patch the guard and +add a jump to a further part of the function f. This can continue potentially +several times, until the trace really hits an end point (for example by +closing the loop and jumping back to trace 1, or by returning from f).

                              +

                              Evaluation

                              +

                              Since this is a performance cliff that we didn't observe in any of our +benchmarks ourselves, it's pointless to look at the +effect that this improvement has on existing benchmarks – there shouldn't and +indeed there isn't any.

                              +

                              Instead, we are going to look at a micro-benchmark that came out of the +original bug report, one that simply renders a big artificial Tornado template +200 times. The code of the micro-benchmark can be found +here.

                              +

                              All benchmarks were run 10 times in new processes. The means and standard +deviations of the benchmark runs are:

                              + + + + + + + + + + + + + + + + + + + + + + + +
                              ImplementationTime taken (lower is better)
                              CPython 3.9.514.19 ± 0.35s
                              PyPy3 without JIT59.48 ± 5.41s
                              PyPy3 JIT old14.47 ± 0.35s
                              PyPy3 JIT new4.89 ± 0.10s
                              +

                              What we can see is that while the old JIT is very helpful for this +micro-benchmark, it only brings the performance up to CPython levels, not +providing any extra benefit. The new JIT gives an almost 3x speedup.

                              +

                              Another interesting number we can look at is how often the JIT started a trace, +and for how many traces we produced actual machine code:

                              + + + + + + + + + + + + + + + + + + + + + +
                              ImplementationTraces StartedTraces sent to backendTime spent in JIT
                              PyPy3 JIT old216240.65s
                              PyPy3 JIT new30250.06s
                              +

                              Here we can clearly see the problem: The old JIT would try tracing the +auto-generated templating code again and again, but would never actually produce +any machine code, wasting lots of time in the process. The new JIT still traces a +few times uselessly, but then eventually converges and stops, emitting machine +code for all the paths through the auto-generated Python code.

                              + + + +

                              Tim Felgentreff pointed me to the fact that +Truffle also has a +mechanism +to slice huge methods into smaller compilation units (and I am sure other JITs +have such mechanisms as well).

                              +

                              Conclusion

                              +

                              In this post we've described a performance cliff in PyPy's JIT, that of really +big auto-generated functions which hit the trace limit without inlining, that we +still want to generate machine code for. We achieve this by chunking up the +trace into several smaller traces, which we compile piece by piece. This is not +a super common thing to be happening – otherwise we would have run into and +fixed it earlier – but it's still good to have a fix now.

                              +

                              The work +described in this post is a tiny bit experimental still, but we will release it as +part of the upcoming 3.8 beta release, to get some more experience with it. +Please grab a 3.8 release +candidate, +try it out and let us know your observations, good and bad!

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/10/pypy-v736-release.html b/posts/2021/10/pypy-v736-release.html new file mode 100644 index 000000000..0f2d387af --- /dev/null +++ b/posts/2021/10/pypy-v736-release.html @@ -0,0 +1,386 @@ + + + + + +PyPy v7.3.6: release of python 2.7, 3.7, and 3.8 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.6: release of python 2.7, 3.7, and 3.8

                              + + + +
                              +

                              PyPy v7.3.6: release of python 2.7, 3.7, and 3.8-beta

                              +

                              The PyPy team is proud to release version 7.3.6 of PyPy, which includes +three different interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.12.

                              • +
                              • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.12. Since this is our +first release of the interpreter, we relate to this as "beta" quality. We +welcome testing of this version, if you discover incompatibilities, please +report them so we can gain confidence in the version.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.5 in May 2021, +include:

                              +
                              +
                                +
                              • We have merged a backend for HPy, the better C-API interface. The backend +implements HPy version 0.0.3.

                              • +
                              • Translation of PyPy into a binary, known to be slow, is now about 40% +faster. On a modern machine, PyPy3.8 can translate in about 20 minutes.

                              • +
                              • PyPy Windows 64 is now available on conda-forge, along with nearly 700 +commonly used binary packages. This new offering joins the more than 1000 +conda packages for PyPy on Linux and macOS. Many thanks to the conda-forge +maintainers for pushing this forward over the past 18 months.

                              • +
                              • Speed improvements were made to io, sum, _ssl and more. These +were done in response to user feedback.

                              • +
                              • The 3.8 version of the release contains a beta-quality improvement to the +JIT to better support compiling huge Python functions by breaking them +up into smaller pieces.

                              • +
                              • The release of Python3.8 required a concerted effort. We were greatly +helped by @isidentical (Batuhan Taskaya) and other new contributors.

                              • +
                              • The 3.8 package now uses the same layout as CPython, and many of the +PyPy-specific changes to sysconfig, distutils.sysconfig, and +distutils.commands.install.py have been removed. The stdlib now +is located in <base>/lib/pypy3.8 on posix systems, and in +<base>/Lib on Windows. The include files on windows remain the same. +On posix they are in <base>/include/pypy3.8. Note we still use the +pypy prefix to prevent mixing the files with CPython (which uses +python).

                              • +
                              +
                              +

                              We recommend updating. You can find links to download the v7.3.6 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better. Since the +previous release, we have accepted contributions from 7 new contributors, +thanks for pitching in, and welcome to the project!

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a CFFI / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +soon 3.8. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              This PyPy release supports:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)

                              • +
                              • big- and little-endian variants of PPC64 running Linux,

                              • +
                              • s390x running Linux

                              • +
                              • 64-bit ARM machines running Linux.

                              • +
                              +
                              +

                              PyPy does support Windows 32-bit and ARM 32 bit processors, but does not +release binaries. Please reach out to us if you wish to sponsor releases for +those platforms.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.6 release, see the full changelog.

                              +

                              Please update, and continue to help us make PyPy better.

                              +

                              Cheers, +The PyPy team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/10/pypy-v737-release.html b/posts/2021/10/pypy-v737-release.html new file mode 100644 index 000000000..a88fdebf9 --- /dev/null +++ b/posts/2021/10/pypy-v737-release.html @@ -0,0 +1,343 @@ + + + + + +PyPy v7.3.7: bugfix release of python 3.7 and 3.8 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.7: bugfix release of python 3.7 and 3.8

                              + + + +
                              +

                              PyPy v7.3.7: bug-fix release of 3.7, 3.8

                              +

                              We are releasing a PyPy 7.3.7 to fix the recent 7.3.6 release's binary +incompatibility with the previous 7.3.x releases. We mistakenly added fields +to PyFrameObject and PyDateTime_CAPI that broke the promise of binary +compatibility, which means that c-extension wheels compiled for 7.3.5 will not +work with 7.3.6 and vice versa. Please do not use 7.3.6.

                              +

                              We have added a cursory test for binary API breakage to the +https://github.com/pypy/binary-testing repo which hopefully will prevent such +mistakes in the future.

                              +

                              Additionally, a few smaller bugs were fixed:

                              +
                                +
                              • Use uint for the request argument of fcntl.ioctl (issue 3568)

                              • +
                              • Fix incorrect tracing of while True body in 3.8 (issue 3577)

                              • +
                              • Properly close resources when using a concurrent.futures.ProcessPool +(issue 3317)

                              • +
                              • Fix the value of LIBDIR in _sysconfigdata in 3.8 (issue 3582)

                              • +
                              +

                              You can find links to download the v7.3.7 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog site via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better.

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a CFFI / cppyy version of your library that would be performant on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, and +3.8. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              This PyPy release supports:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)

                              • +
                              • 64-bit ARM machines running Linux.

                              • +
                              • s390x running Linux

                              • +
                              +
                              +

                              PyPy does support ARM 32 bit and PPC64 processors, but does not release binaries.

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2021/12/error-message-style-guides.html b/posts/2021/12/error-message-style-guides.html new file mode 100644 index 000000000..ac63d11d9 --- /dev/null +++ b/posts/2021/12/error-message-style-guides.html @@ -0,0 +1,354 @@ + + + + + +Error Message Style Guides of Various Languages | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Error Message Style Guides of Various Languages

                              + + + +
                              +

                              Error Message Style Guides of Various Languages

                              +

                              PyPy has been trying to produce good SyntaxErrors and other errors for +a long time. CPython has also made an enormous push to improve its +SyntaxErrors in the last few releases. These improvements are great, but the process +feels somewhat arbitrary sometimes. To see what other languages are doing, I +asked people on Twitter whether they know of error message style guides for +other programming languages.

                              +

                              Wonderfully, people answered me with lots of helpful links (full list at the +end of the post), thank you everybody! All those sources are very interesting +and contain many great points, I recommend reading them directly! In this +post, I'll try to summarize some common themes or topics that I thought were +particularly interesting.

                              +

                              Language Use

                              +

                              Almost all guides stress the need for plain and simple English, as well as +conciseness and clarity [Flix, Racket, Rust, Flow]. Flow suggests to put coding +effort into making the grammar correct, for example in the case of plurals or +to distinguish between "a" and "an".

                              +

                              The suggested tone should be friendly and neutral, the messages should not +blame the Programmer [Flow]. Rust and Flix suggest to not use the term +'illegal' and use something like 'invalid' instead.

                              +

                              Flow suggests to avoid "compiler speak". For example terms like 'token' and +'identifier' should be avoided and terms that are more familiar to programmers +be used (eg "name" is better). The Racket guide goes further and has a list of +allowed technical terms and some prohibited terms.

                              +

                              Structure

                              +

                              Several guides (such as Flix and Flow) point out a 80/20 rule: 80% of the times an error message is +read, the developer knows that message well and knows exactly what to do. For +this use case it's important that the message is short. On the other hand, 20% +of the times this same message will have to be understood by a developer who +has never seen it before and is confused, and so the message needs to contain +enough information +to allow them to find out what is going on. So the error message needs to strike +a balance between brevity and clarity.

                              +

                              The Racket guide proposes to use the following general structure for errors: +'State the constraint that was violated ("expected a"), followed by what was +found instead.'

                              +

                              The Rust guide says to avoid "Did you mean?" and questions in general, and +wants the compiler to instead be explicit about why something was suggested. The +example the Rust guide gives is: 'Compare "did you mean: Foo" vs. "there is a +struct with a similar name: Foo".' Racket goes further and forbids +suggestions altogether because "Students will follow well‐meaning‐but‐wrong +advice uncritically, if only because they have no reason to doubt the +authoritative voice of the tool."

                              +

                              Formatting and Source Positions

                              +

                              The Rust guide suggests to put all identifiers into backticks (like in +Markdown), Flow formats the error messages using full Markdown.

                              +

                              The Clang, Flow and Rust guides point out the importance of using precise +source code spans to point to errors, which is especially important if the +compiler information is used in the context of an IDE to show a red squiggly +underline or some other highlighting. The spans should be as small as possible to point out the source of +the error [Flow].

                              +

                              Conclusion

                              +

                              I am quite impressed how advanced and well-thought out the approaches are. I wonder whether it would make sense for +Python to adopt a (probably minimal, to get started) subset of these ideas as guidelines for its own errors.

                              +

                              Sources

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/02/nlp-icelandic-case-study.html b/posts/2022/02/nlp-icelandic-case-study.html new file mode 100644 index 000000000..e89cd2e7f --- /dev/null +++ b/posts/2022/02/nlp-icelandic-case-study.html @@ -0,0 +1,435 @@ + + + + + +Natural Language Processing for Icelandic with PyPy: A Case Study | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Natural Language Processing for Icelandic with PyPy: A Case Study

                              + + + +
                              +

                              Natural Language Processing for Icelandic with PyPy: A Case Study

                              +

                              Icelandic is one +of the smallest languages of the world, with about 370,000 speakers. It +is a language in the Germanic family, most similar to Norwegian, Danish +and Swedish, but closer to the original Old +Norse spoken throughout +Scandinavia until about the 14th century CE.

                              +

                              As with other small languages, there are worries that the language may +not +survive +in a digital world, where all kinds of fancy applications are developed +first - and perhaps only - for the major languages. Voice assistants, +chatbots, spelling and grammar checking utilities, machine translation, +etc., are increasingly becoming staples of our personal and professional +lives, but if they don’t exist for Icelandic, Icelanders will gravitate +towards English or other languages where such tools are readily +available.

                              +

                              Iceland is a technology-savvy country, with world-leading adoption +rates of the +Internet, +PCs and smart devices, and a thriving software industry. So the +government figured that it would be worthwhile to fund a 5-year +plan to build natural +language processing (NLP) resources and other infrastructure for the +Icelandic language. The project focuses on collecting data and +developing open source software for a range of core applications, such +as tokenization, vocabulary lookup, n-gram statistics, part-of-speech +tagging, named entity recognition, spelling and grammar checking, neural +language models and speech processing.

                              +
                              +

                              My name is Vilhjálmur Þorsteinsson, and I’m the founder and CEO of a +software startup Miðeind in Reykjavík, +Iceland, that employs 10 software engineers and linguists and focuses on +NLP and AI for the Icelandic language. The company participates in the +government’s language technology program, and has contributed +significantly to the program’s core tools (e.g., a tokenizer and a +parser), spelling and grammar checking modules, and a neural machine +translation stack.

                              +

                              When it came to a choice of programming languages and development tools +for the government program, the requirements were for a major, well +supported, vendor-and-OS-agnostic FOSS platform with a large and diverse +community, including in the NLP space. The decision to select Python as +a foundational language for the project was a relatively easy one. That +said, there was a bit of trepidation around the well known fact that +CPython can be slow for inner-core tasks, such as tokenization and +parsing, that can see heavy workloads in production.

                              +

                              I first became aware of PyPy in early 2016 when I was developing a +crossword game Netskrafl in Python 2.7 +for Google App Engine. I had a utility program that compressed a +dictionary into a Directed Acyclic Word Graph and was taking 160 +seconds  to run on CPython 2.7, so I tried PyPy and to my amazement saw +a 4x speedup (down to 38 seconds), with literally no effort besides +downloading the PyPy runtime.

                              +

                              This led me to select PyPy as the default Python interpreter for my +company’s Python development efforts as well as for our production +websites and API servers, a role in which it remains to this day. We +have followed PyPy’s upgrades along the way, being just about to migrate +our minimally required language version from 3.6 to 3.7.

                              +

                              In NLP, speed and memory requirements can be quite important for +software usability. On the other hand, NLP logic and algorithms are +often complex and challenging to program, so programmer productivity and +code clarity are also critical success factors. A pragmatic approach +balances these factors, avoids premature optimization and seeks a +careful compromise between maximal run-time efficiency and minimal +programming and maintenance effort.

                              +

                              Turning to our use cases, our Icelandic text +tokenizer "Tokenizer" is fairly light, +runs tight loops and performs a large number of small, repetitive +operations. It runs very well on PyPy’s JIT and has not required further +optimization.

                              +

                              Our Icelandic parser Greynir +(known on PyPI as reynir) is, +if I may say so myself, a piece of work. It parses natural language +text according to a +hand-written context-free +grammar, +using an Earley-type +algorithm as enhanced +by Scott and +Johnstone. +The CFG contains almost 7,000 nonterminals and 6,000 terminals, and the +parser handles ambiguity as well as left, right and middle recursion. It +returns a packed parse forest for each input sentence, which is then +pruned by a scoring heuristic down to a single best result tree.

                              +

                              This parser was originally coded in pure Python and turned out to be +unusably slow when run on CPython - but usable on PyPy, where it was +3-4x faster. However, when we started applying it to heavier production +workloads, it  became apparent that it needed to be faster still. We +then proceeded to convert the innermost Earley parsing loop from Python +to tight +C++ +and to call it from PyPy via +CFFI, with callbacks for +token-terminal matching functions (“business logic”) that remained on +the Python side. This made the parser much faster (on the order of 100x +faster than the original on CPython) and quick enough for our production +use cases. Even after moving much of the heavy processing to C++ and using CFFI, PyPy still gives a significant speed boost over CPython.

                              +

                              Connecting C++ code with PyPy proved to be quite painless using CFFI, +although we had to figure out a few magic incantations in our build +module +to make it compile smoothly during setup from source on Windows and +MacOS in addition to Linux. Of course, we build binary PyPy and CPython +wheels for the most common targets so most users don’t have to worry +about setup requirements.

                              +

                              With the positive experience from the parser project, we proceeded to +take a similar approach for two other core NLP packages: our compressed +vocabulary package BinPackage +(known on PyPI as islenska) and our +trigrams database package Icegrams. +These packages both take large text input (3.1 million word forms with +inflection data in the vocabulary case; 100 million tokens in the +trigrams case) and compress it into packed binary structures. These +structures are then memory-mapped at run-time using +mmap and queried via +Python functions with a lookup time in the microseconds range. The +low-level data structure navigation is done in +C++, +called from Python via CFFI. The ex-ante preparation, packing, +bit-fiddling and data structure generation is fast enough with PyPy, so +we haven’t seen a need to optimize that part further.

                              +

                              To showcase our tools, we host public (and open source) websites such as +greynir.is for our parsing, named entity +recognition and query stack and +yfirlestur.is for our spell and grammar +checking stack. The server code on these sites is all Python running on +PyPy using Flask, +wrapped in gunicorn and hosted on +nginx. The underlying database is +PostgreSQL accessed via +SQLAlchemy and +psycopg2cffi. This setup +has served us well for 6 years and counting, being fast, reliable and +having helpful and supporting communities.

                              +

                              As can be inferred from the above, we are avid fans of PyPy and +commensurately thankful for the great work by the PyPy team over the +years. PyPy has enabled us to use Python for a larger part of our +toolset than CPython alone would have supported, and its smooth +integration with C/C++ through CFFI has helped us attain a better +tradeoff between performance and programmer productivity in our +projects. We wish for PyPy a great and bright future and also look +forward to exciting related developments on the horizon, such as +HPy.

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/02/pypy-v738-release.html b/posts/2022/02/pypy-v738-release.html new file mode 100644 index 000000000..975a347a0 --- /dev/null +++ b/posts/2022/02/pypy-v738-release.html @@ -0,0 +1,387 @@ + + + + + +PyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9

                              + + + +
                              +

                              PyPy v7.3.8: release of python 2.7, 3.7, 3.8, and 3.9-beta

                              +

                              The PyPy team is proud to release version 7.3.8 of PyPy. It has been only a few +months since our last release, but we have some nice speedups and bugfixes we +wish to share. The release includes four different interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.12. This will be the last +release of PyPy3.7.

                              • +
                              • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.12. This is our third +release of this interpreter, and we are removing the "beta" tag.

                              • +
                              • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.10. As this is our first +release of this interpreter, we relate to this as "beta" quality. We +welcome testing of this version, if you discover incompatibilities, please +report them so we can gain confidence in the version.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.7 in late October 2021, +include:

                              +
                              +
                                +
                              • PyPy3.9 uses an RPython version of the PEG parser which brought with it a +cleanup of the lexer and parser in general

                              • +
                              • Fixed a regression in PyPy3.8 when JITting empty list comprehensions

                              • +
                              • Tweaked some issues around changing the file layout after packaging to make +the on-disk layout of PyPy3.8 more compatible with CPython. This requires +setuptools>=58.1.0

                              • +
                              • RPython now allows the target executable to have a . in its name, so +PyPy3.9 will produce a pypy3.9-c and libpypy3.9-c.so. Changing the +name of the shared object to be version-specific (it used to be +libpypy3-c.so) will allow it to live alongside other versions.

                              • +
                              • Building PyPy3.9+ accepts a --platlibdir argument like CPython.

                              • +
                              • Improvement in ssl's use of CFFI buffers to speed up recv and recvinto

                              • +
                              • Update the packaged OpenSSL to 1.1.1m

                              • +
                              +
                              +

                              We recommend updating. You can find links to download the v7.3.8 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better. Since the +previous release, we have accepted contributions from 6 new contributors, +thanks for pitching in, and welcome to the project!

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and +3.9. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              This PyPy release supports:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)

                              • +
                              • 64-bit ARM machines running Linux. A shoutout to Huawei for sponsoring +the VM running the tests.

                              • +
                              • s390x running Linux

                              • +
                              • big- and little-endian variants of PPC64 running Linux

                              • +
                              +
                              +

                              PyPy supports Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but +does not release binaries. Please reach out to us if you wish to sponsor +releases for those platforms.

                              +

                              Known Issues with PyPy3.9

                              +
                                +
                              • There is still a known speed regression around **kwargs handling

                              • +
                              • We slightly modified concurrent.futures' ProcessPoolExecutor to +start all the worker processes when the first task is received (like on +Python3.8) to avoid an apparent race condition when using fork and +threads (issue 3650).

                              • +

                              What else is new?

                              +

                              For more information about the 7.3.8 release, see the full changelog.

                              +

                              Please update, and continue to help us make PyPy better.

                              +

                              Cheers, +The PyPy team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/03/pypy-v738-release.html b/posts/2022/03/pypy-v738-release.html new file mode 100644 index 000000000..2ae38e7e0 --- /dev/null +++ b/posts/2022/03/pypy-v738-release.html @@ -0,0 +1,376 @@ + + + + + +PyPy v7.3.9 security release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.9 security release

                              + + + +
                              +

                              PyPy v7.3.9 security release

                              +

                              The PyPy team is proud to release version 7.3.9 of PyPy. This is a security +release to match the recent CPython release and updates the portable pypy +tarballs with bzip2 1.0.8, openssl 1.1.1n, and libexpat 2.4.7. Along +the way this release fixes some issues discovered after the 7.3.8 release and +updates sqlite3 to 3.38.2. It includes:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.7, which is an interpreter supporting the syntax and the features of +Python 3.7, including the stdlib for CPython 3.7.13. This will be the last +release of PyPy3.7.

                              • +
                              • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.13.

                              • +
                              • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.12. We relate to this as +"beta" quality. We welcome testing of this version, if you discover +incompatibilities, please report them so we can gain confidence in the version.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.8 in February 2022, +include:

                              +
                              +
                                +
                              • Fixed some failing stdlib tests on PyPy3.9

                              • +
                              • Update the bundled libexpat to 2.4.6 and sqlite3 to 3.38.2

                              • +
                              +
                              +

                              We recommend updating. You can find links to download the v7.3.9 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: PyPy +and RPython documentation improvements, tweaking popular modules to run +on PyPy, or general help with making RPython's JIT even better. Since the +7.3.7 release, we have accepted contributions from 6 new contributors, +thanks for pitching in, and welcome to the project!

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. +In any case both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.7, 3.8 and +3.9. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              This PyPy release supports:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS X 64 bits, Windows 64 bits, OpenBSD, FreeBSD)

                              • +
                              • 64-bit ARM machines running Linux. A shoutout to Huawei for sponsoring +the VM running the tests.

                              • +
                              • s390x running Linux

                              • +
                              • big- and little-endian variants of PPC64 running Linux

                              • +
                              +
                              +

                              PyPy supports Windows 32-bit, PPC64 big- and little-endian, and ARM 32 bit, but +does not release binaries. Please reach out to us if you wish to sponsor +releases for those platforms.

                              +

                              Known Issues with PyPy3.9

                              +
                                +
                              • We slightly modified concurrent.futures' ProcessPoolExecutor to +start all the worker processes when the first task is received (like on +Python3.8) to avoid an apparent race condition when using fork and +threads (issue 3650).

                              • +

                              What else is new?

                              +

                              For more information about the 7.3.9 release, see the full changelog.

                              +

                              Please update, and continue to help us make PyPy better.

                              +

                              Cheers, +The PyPy team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/04/how-is-pypy-tested.html b/posts/2022/04/how-is-pypy-tested.html new file mode 100644 index 000000000..7706a30b0 --- /dev/null +++ b/posts/2022/04/how-is-pypy-tested.html @@ -0,0 +1,506 @@ + + + + + +How is PyPy Tested? | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              How is PyPy Tested?

                              + + + +
                              +

                              How is PyPy Tested?

                              +

                              In this post I want to give an overview of how the PyPy project does and thinks +about testing. PyPy takes testing quite seriously and has done some from the +start of the project. Here I want to present the different styles of +tests that PyPy has, when we use them and how I think about them.

                              +

                              Background

                              +

                              To make the blog post self-contained, I am going to start with a small overview +about PyPy's architecture. If you already know what PyPy is and how it works, +you can skip this section.

                              +

                              PyPy means "Python in Python". It is an alternative implementation of the Python +language. Usually, when we speak of "Python", we can mean two different things. +On the one hand it means "Python as an abstract programming language". On the +other hand, the main implementation of that language is also often called +"Python". To more clearly distinguish the two, the implementation is often also +called "CPython", because it is an interpreter implemented in C code.

                              +

                              Now we can make the statement "PyPy is Python in Python" more precise: PyPy is +an interpreter for Python 3.9, implemented in RPython. RPython ("Restricted +Python") is a subset of Python 2, which is statically typed (using type +inference, not type annotations) and can be compiled +to C code. That means we can take our Python 3.9 interpreter, and compile it +into a C binary that can run Python 3.9 code. The final binary behaves pretty +similarly to CPython.

                              +

                              The main thing that makes PyPy interesting is that during the translation of our +interpreter to C, a number of components are automatically inserted into the +final binary. One component is a reasonably good garbage collector.

                              +

                              The more exciting component that is inserted into the binary is a just-in-time +compiler. The insertion of this component is not fully automatic, instead it is +guided by a small number of annotations in the source code of the interpreter. +The effect of inserting this JIT compiler into the binary is that the resulting +binary can run Python code significantly faster than CPython, in many cases. +How this works is not important for the rest of the post, if you want to see an +example of concretely doing that to a small interpreter you can look at this +video.

                              +

                              PyPy Testing History

                              +

                              A few historical notes on the PyPy project and its relationship to testing: The +PyPy project was started in 2004. At the time when the project was started, +Extreme Programming and Agile Software Development were up and coming. On the +methodology side, PyPy was heavily influenced by these, and started using +Test-Driven Development and pair programming right from the start.

                              +

                              Also technologically, PyPy has been influential on testing in the Python world. +Originally, PyPy had used the unittest testing framework, but pretty soon +the developers got frustrated with it. Holger Krekel, one of the original +developers who started PyPy, started the pytest testing framework soon +afterwards.

                              +

                              Interpreter-Level Tests

                              +

                              So, how are tests for PyPy written, concretely? The tests for the interpreter +are split into two different kinds, which we call "interpreter level tests" and +"application level tests". The former are tests that can be used to test the +objects and functions that are used in the implementation of the Python +interpreter. Since the interpreter is written in Python 2, those tests are also +written in Python 2, using pytest. They tend to be more on the unit test side of +things. They are in files with the pattern test_*.py.

                              +

                              Here is an example that tests the implementation of integers (very slightly +simplified):

                              +
                              class TestW_IntObject:
                              +    ...
                              +
                              +    def test_hash(self):
                              +        w_x = W_IntObject(42)
                              +        w_result = w_x.descr_hash(self.space)
                              +        assert isinstance(w_result, W_IntObject)
                              +        assert w_result.intval == 42
                              +
                              +

                              This test checks that if you take an object that represents integers in the +Python language (using the class W_IntObject, a "wrapped integer object") +with the value 42, computing the hash of that object returns another instance of +the same class, also with the value 42.

                              +

                              These tests can be run on top of any Python 2 implementation, either CPython or +PyPy. We can then test and debug the internals of the PyPy interpreter using +familiar tools like pytest itself and the Python debuggers. They can be run this way +because all the involved code, like the tests and the class W_IntObject, is +just completely regular Python 2 code that behaves in the regular way when +run on top of a Python interpreter.

                              +

                              In CPython, these tests don't really have an equivalent. They would correspond +to tests that are written in C and that can test the logic of all the C +functions of CPython that execute certain functionality, accessing the internals +of C structs in the process. ¹

                              +

                              Application-Level Tests

                              +

                              There is also a second class of tests for the interpreter. Those are tests that +don't run on the level of the implementation. Instead, they are executed by +the PyPy Python interpreter, thus running on the level of the applications run +by PyPy. Since the interpreter is running Python 3, the tests are also written +in Python 3. They are stored in files with the pattern apptest_*.py and +look like "regular" Python 3 tests. ²

                              +

                              Here's an example of how you could write a test equivalent to the one above:

                              +
                              def test_hash():
                              +    assert hash(42) == 42
                              +
                              +

                              This style of test looks more "natural" and is the preferred one in cases where +the test does not need to access the internals of the logic or the objects of +the interpreter.

                              +

                              Application level tests can be run in two different ways. On the one hand, we +can simply run them on CPython 3. This is very useful! Since we want PyPy to +behave like CPython, running the tests that we write on CPython is useful to +make sure that the tests themselves aren't wrong.

                              +

                              On the other hand, the main way to run these tests is on top of PyPy, itself +running on top of a Python 2 implementation. This makes it possible to run the +test without first bootstrapping PyPy to C. Since bootstrapping to C is a +relatively slow operation (can take up to an hour) it is crucially important to +be able to run tests without bootstrapping first. It also again makes it +possible to debug crashes in the interpreter using the regular Python 2 +debugger. Of course running tests in this way is unfortunately itself not super +fast, given that they run on a stack of two different interpreters.

                              +

                              Application-level tests correspond quite closely to CPython's test suite (which +is using the unittest framework). Of course in CPython it is not possible to run +the test suite without building the CPython binary using a C compiler. ³

                              +

                              So when do we write application-level tests, and when interpreter-level tests? +Interpreter-level tests are necessary to test internal data structures that +touch data and logic that is not directly exposed to the Python language. If +that is not necessary, we try to write application-level tests. App-level tests +are however by their nature always more on the integration test side of things. +To be able to run the test_hash function above, many parts of PyPy need to +work correctly, the parser, the bytecode compiler, the bytecode interpreter, the +hash builtin, calling the __hash__ special method, etc, etc.

                              +

                              This observation is also true for CPython! One could argue that CPython has no +unit tests at all, because in order to be able to even run the tests, most of +Python needs to be in working order already, so all the tests are really +implicitly integration tests.

                              +

                              The CPython Test Suite

                              +

                              We also use the CPython test suite as a final check to see whether our +interpreter correctly implements all the features of the Python language. In +that sense it acts as some kind of compliance test suite that checks whether we +implement the language correctly. The test suite is not perfect for this. +Since it is written for CPython's purposes during its development, a +lot of the tests check really specific CPython implementation details. Examples +of these are tests that check that __del__ is called immediately after +objects go out of scope (which only happens if you use reference counting as a +garbage collection strategy; PyPy uses a different approach to garbage +collection). Other examples are checking +for exception error messages very explicitly. However, the CPython test suite +has gotten a lot better in these regards over time, by adding +support.gc_collect() calls to fix the former problem, and by marking some +very specific tests with the @impl_detail decorator. Thanks to all the +CPython developers who have worked on this!

                              +

                              In the process of re-implementing CPython's functionality and running CPython's +test suite, PyPy can often also be a good way to find bugs in CPython. While we +think about the corner cases of some Python feature we occasionally find +situations where CPython didn't get everything completely correct either, which +we then report back.

                              +

                              Testing for Performance Regressions

                              +

                              All the tests we described so far are checking behaviour. But one of PyPy's +important goals is to be a fast implementation not "just" a correct one. Some +aspects of performance can be tested by regular unit tests, either application- +or interpreter-level. In order to check whether some performance shortcut is +taken in the interpreter, we sometimes can write tests that monkeypatch the slow +default implementation to always error. Then, if the fast path is taken +properly, that slow default implementation is never reached.

                              +

                              But we also have additional tests that test the correct interaction with the JIT +explicitly. For that, we have a special style of test that checks that the JIT +will produce the correct machine code for a small snippet of Python code. To +make this kind of test somewhat more robust, we don't check the machine code +directly, but instead the architecture independent intermediate +representation that the JIT uses to produce machine code from.

                              +

                              As an example, here is a small test that loading the attribute of a constant +global instance can be completely constant folded away:

                              +
                              def test_load_attr(self):
                              +    src = '''
                              +        class A(object):
                              +            pass
                              +        a = A()
                              +        a.x = 1
                              +        def main(n):
                              +            i = 0
                              +            while i < n:
                              +                i = i + a.x
                              +            return i
                              +    '''
                              +    log = self.run(src, [1000])
                              +    assert log.result == 1000
                              +    loop, = log.loops_by_filename(self.filepath)
                              +    assert loop.match("""
                              +        i9 = int_lt(i5, i6)
                              +        guard_true(i9, descr=...)
                              +        guard_not_invalidated(descr=...)
                              +        i10 = int_add(i5, 1)
                              +        --TICK--
                              +        jump(..., descr=...)
                              +    """)
                              +
                              +

                              The string passed to the loop.match function is a string representation of +the intermediate representation code that is generated for the while loop in +the main function given in the source. The important part of that +intermediate representation is that the i = i + a.x addition is optimized +into an int_add(x, 1) operation. The second argument for the addition is the +constant 1, because the JIT noted that the global a is a constant, and +the attribute x of that instance is always 1. The test thus checks that +this optimization still works.

                              +

                              Those tests are again more on the unit test side of things (and can thus +unfortunately be a bit brittle sometimes and break). The integration test +equivalent for performance is the PyPy Speed Center which tracks the +performance of micro- and macro-benchmarks over time and lets us see when big +performance regressions are happening. The speed center is not really an +automatic test and does not produce pass/fail outcomes. Instead, it requires +human judgement and intervention in order to interpret the performance changes. +Having a real pass/fail mechanism is something that would be great to have +but is probably quite tricky in practice.

                              +

                              Conclusion

                              +

                              This concludes my overview of some of the different styles of tests that we use +to develop the PyPy Python interpreter.

                              +

                              There is a whole other set of tests for the development of the RPython language, +the garbage collectors it provides as well as the code that does the automatic +JIT insertion, maybe I'll cover these in a future post.

                              +

                              Footnotes

                              +

                              ¹ CPython has the _testcapimodule.c and related modules, that are used to +unit-test the C-API. However, these are still driven from Python tests using +the unittest framework and wouldn't run without the Python interpreter +already working.

                              +

                              ² There is also a deprecated different way to write these tests, by putting +them in the test_*.py files that interpreter level tests are using and +then having a test class with the pattern class AppTest*. We haven't +converted all of them to the new style yet, even though the old style is +quite weird: since the test_*.py files are themselves parsed by +Python 2, the test methods in AppTest* classes need to be written in the +subset of Python 3 syntax that is also valid Python 2 syntax, leading to a lot +of confusion.

                              +

                              ³ Nit-picky side-note: C interpreters are a thing! But not that +widely used in practice, or only in very specific situations.

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/07/ddorf-sprint-sep-2022.html b/posts/2022/07/ddorf-sprint-sep-2022.html new file mode 100644 index 000000000..925422839 --- /dev/null +++ b/posts/2022/07/ddorf-sprint-sep-2022.html @@ -0,0 +1,334 @@ + + + + + +Düsseldorf HPy/PyPy/GraalPy sprint September 19-23rd 2022 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Düsseldorf HPy/PyPy/GraalPy sprint September 19-23rd 2022

                              + + + +
                              +

                              The programming language group of the Computer Science department of +Heinrich-Heine Universität Düsseldorf is happy to invite everybody to another +sprint in Düsseldorf, from the 19th to the 23rd of September 2022. This is a +fully public sprint, everyone and particularly newcomers are welcome to join +us! The goal is to bring together people from the HPy, PyPy, GraalPy and +CPython communities.

                              +

                              Topics and goals

                              +
                                +
                              • work on HPy APIs, discussions around next steps for the project

                              • +
                              • continuing new and ongoing ports to HPy, including Cython, NumPy, Pillow, Matplotlib

                              • +
                              • 3.10 support on PyPy and GraalPy

                              • +
                              • preparing the next PyPy release

                              • +
                              • discussions around ways to improve collaboration between the different Python +implementations

                              • +

                              What is a sprint?

                              +

                              The experience of the PyPy project has shown the benefits of regular +sprints. They are focussed one week physical meetings where people pair-program +on new features and discuss future plans. Coming to one is a great way to get +started with a project!

                              +

                              Location

                              +

                              The sprint will take place in a seminar room of the computer science +department. It is in the building 25.12, room 02.50 (second floor) of the +university campus. For travel instructions see

                              +
                              +

                              https://www.cs.hhu.de/lehrstuehle-und-arbeitsgruppen/softwaretechnik-und-programmiersprachen/kontakt/service/lage-und-anreise

                              +
                              +

                              We ask participants to wear masks during the indoor working hours.

                              +
                              +Photograph of the statue of Heinrich Heine in front of the University library on the campus in Düsseldorf +

                              Wiegels, CC BY 3.0, via Wikimedia Commons

                              +

                              Exact times

                              +

                              Work days: starting September 19th (~morning), ending September 23rd (~afternoon). +We will do a to-be-planned social activity on Wednesday afternoon.

                              +

                              Registration

                              +

                              Please register by editing this file or by opening a pull request:

                              +
                              +

                              https://foss.heptapod.net/pypy/extradoc/-/blob/branch/extradoc/sprintinfo/ddorf2022/people.txt

                              +
                              +

                              or by sending a quick mail to the pypy-dev mailing list:

                              +
                              +

                              http://mail.python.org/mailman/listinfo/pypy-dev

                              +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/07/m1-support-for-pypy.html b/posts/2022/07/m1-support-for-pypy.html new file mode 100644 index 000000000..ef2ff09f7 --- /dev/null +++ b/posts/2022/07/m1-support-for-pypy.html @@ -0,0 +1,299 @@ + + + + + +M1 support for PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              M1 support for PyPy

                              + + + +
                              +

                              The PyPy team is happy to announce that we can now target the macOS ARM64 +platform. Much of the work was executed by Maciej Fijałkowski (fijal) and +funded via a generous contribution to our OpenCollective. The work is based +on our existing support for aarch64 (arm64 on linux) with some twists +to support the differences between the CPUs and the operating system. There +are nightly builds for pypy3.8 and pypy3.9 (look for macos_arm64), and +the architecture will be part of our next release.

                              +

                              Please try it out and let us know how it is useful for you or how we could +improve.

                              +

                              We still need help improving our macOS support. We have an open issue to +help our packaging story. Help is welcome.

                              +

                              The PyPy team.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/07/toy-optimizer.html b/posts/2022/07/toy-optimizer.html new file mode 100644 index 000000000..8607d989a --- /dev/null +++ b/posts/2022/07/toy-optimizer.html @@ -0,0 +1,1078 @@ + + + + + +Implementing a Toy Optimizer | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Implementing a Toy Optimizer

                              + + + +
                              +

                              In this blog post I want to show the complete code (in Python3) of how a very +simple optimizer for sequences of operations can work. These algorithms could +be part of a (really simple) compiler, or a JIT. The architecture of the code in +this blog post is very similar to that of the trace optimizer of the PyPy JIT: +After a trace is produced, it is optimized before being sent to the machine code +backend that produces binary instructions for the CPU architecture that PyPy is +running on.

                              +

                              To get started, the first thing we need to do is define how our operations are +stored. The +format that a compiler uses to store the program while it is being optimized +is usually called its intermediate representation (IR). Many production +compilers use IRs that are in the Static Single-Assignment Form (SSA), and +we will also use that. SSA form has the property that every variable is +assigned to exactly once, and every variable is defined before it is used. This +simplifies many things.

                              +

                              Let's make this concrete. If our input program is a complex expression, such +as a * (b + 17) + (b + 17), the intermediate representation of that (or at +least its text representation) would maybe be something like:

                              +
                              var1 = add(b, 17)
                              +var2 = mul(a, var1)
                              +var3 = add(b, 17)
                              +var4 = add(var2, var3)
                              +

                              This sequence of instructions is inefficient. The operation add(b, 17) is +computed twice and we can save time by removing the second one and only +computing it once. In this post I want to show an optimizer that can do this +(and some related) optimizations.

                              +

                              Looking at the IR we notice that the input expression has been linearized +into a sequence of operations, and all the intermediate results have been given +unique variable names. The value that every variable is assigned is computed +by the right hand side, which is some operation consisting of an operator and an +arbitrary number of arguments. The arguments of an operation are either +themselves variables or constants.

                              +

                              I will not at all talk about the process of translating the input program +into the IR. Instead, I will assume we have some component that does this +translation already. The tests in this blog post will construct small +snippets of IR by hand. I also won't talk about what happens after the +optimization (usually the optimized IR is translated into machine code).

                              +

                              Implementing the Intermediate Representation

                              +

                              Let's start modelling the intermediate representation with Python classes. +First we define a base class of all values that can be used as arguments in +operations, and let's also add a class that represents constants:

                              +
                              import pytest
                              +from typing import Optional, Any
                              +
                              +class Value:
                              +    pass
                              +
                              +class Constant(Value):
                              +    def __init__(self, value: Any):
                              +        self.value = value
                              +
                              +    def __repr__(self):
                              +        return f"Constant({self.value})"
                              +
                              +

                              One consequence of the fact that every variable is assigned to only once is +that variables are in a one-to-one correspondence with the right-hand-side of +their unique assignments. That means that we don't need a class that represents +variables at all. Instead, it's sufficient to have a class that represents an +operation (the right-hand side), and that by definition is the same as the variable (left-hand side) that it defines:

                              +
                              class Operation(Value):
                              +    def __init__(self, name: str, args: list[Value]):
                              +        self.name = name
                              +        self.args = args
                              +
                              +    def __repr__(self):
                              +        return f"Operation({self.name}, {self.args})"
                              +
                              +    def arg(self, index: int):
                              +        return self.args[index]
                              +
                              +

                              Now we can instantiate these two classes to represent the example sequence of +operations above:

                              +
                              def test_construct_example():
                              +    # first we need something to represent
                              +    # "a" and "b". In our limited view, we don't
                              +    # know where they come from, so we will define
                              +    # them with a pseudo-operation called "getarg"
                              +    # which takes a number n as an argument and
                              +    # returns the n-th input argument. The proper
                              +    # SSA way to do this would be phi-nodes.
                              +
                              +    a = Operation("getarg", [Constant(0)])
                              +    b = Operation("getarg", [Constant(1)])
                              +    # var1 = add(b, 17)
                              +    var1 = Operation("add", [b, Constant(17)])
                              +    # var2 = mul(a, var1)
                              +    var2 = Operation("mul", [a, var1])
                              +    # var3 = add(b, 17)
                              +    var3 = Operation("add", [b, Constant(17)])
                              +    # var4 = add(var2, var3)
                              +    var4 = Operation("add", [var2, var3])
                              +
                              +    sequence = [a, b, var1, var2, var3, var4]
                              +    # nothing to test really, it shouldn't crash
                              +
                              +

                              Usually, complicated programs are represented as a control flow graph in a +compiler, which represents all the possible paths that control can take while +executing the program. Every node in the control flow graph is a basic +block. A basic block is a linear sequence of operations with no control flow +inside of it.

                              +

                              When optimizing a program, a compiler usually looks at the whole control flow +graph of a function. However, that is still too complicated! So let's +simplify further and look only at optimizations we can do when looking at +a single basic block and its sequence of instructions (they are called local +optimizations).

                              +

                              Let's define a class representing basic blocks and let's also add some +convenience functions for constructing sequences of operations, because the +code in test_construct_example is a bit annoying.

                              +
                              class Block(list):
                              +    def opbuilder(opname):
                              +        def wraparg(arg):
                              +            if not isinstance(arg, Value):
                              +                arg = Constant(arg)
                              +            return arg
                              +        def build(self, *args):
                              +            # construct an Operation, wrap the
                              +            # arguments in Constants if necessary
                              +            op = Operation(opname,
                              +                [wraparg(arg) for arg in args])
                              +            # add it to self, the basic block
                              +            self.append(op)
                              +            return op
                              +        return build
                              +
                              +    # a bunch of operations we support
                              +    add = opbuilder("add")
                              +    mul = opbuilder("mul")
                              +    getarg = opbuilder("getarg")
                              +    dummy = opbuilder("dummy")
                              +    lshift = opbuilder("lshift")
                              +
                              +def test_convencience_block_construction():
                              +    bb = Block()
                              +    # a again with getarg, the following line
                              +    # defines the Operation instance and
                              +    # immediately adds it to the basic block bb
                              +    a = bb.getarg(0)
                              +    assert len(bb) == 1
                              +    assert bb[0].name == "getarg"
                              +
                              +    # it's a Constant
                              +    assert bb[0].args[0].value == 0
                              +
                              +    # b with getarg
                              +    b = bb.getarg(1)
                              +    # var1 = add(b, 17)
                              +    var1 = bb.add(b, 17)
                              +    # var2 = mul(a, var1)
                              +    var2 = bb.mul(a, var1)
                              +    # var3 = add(b, 17)
                              +    var3 = bb.add(b, 17)
                              +    # var4 = add(var2, var3)
                              +    var4 = bb.add(var2, var3)
                              +    assert len(bb) == 6
                              +
                              +

                              That's a good bit of infrastructure to make the tests easy to write. One +thing we are lacking though is a way to print the basic blocks into a nicely +readable textual representation. Because in the current form, the repr of a +Block is very annoying, the output of pretty-printing bb in the test above +looks like this:

                              +
                              [Operation('getarg', [Constant(0)]),
                              + Operation('getarg', [Constant(1)]),
                              + Operation('add',
                              +           [Operation('getarg',
                              +                      [Constant(1)]),
                              +                 Constant(17)]),
                              + Operation('mul',
                              +           [Operation('getarg',
                              +                      [Constant(0)]),
                              +                 Operation('add',
                              +                           [Operation('getarg',
                              +                                      [Constant(1)]),
                              +                            Constant(17)])]),
                              + Operation('add',
                              +           [Operation('getarg',
                              +                      [Constant(1)]),
                              +            Constant(17)]),
                              + Operation('add',
                              +           [Operation('mul',
                              +                       [Operation('getarg',
                              +                                  [Constant(0)]),
                              +                             Operation('add',
                              +                                       [Operation('getarg',
                              +                                                  [Constant(1)]),
                              +                                        Constant(17)])]),
                              +                 Operation('add',
                              +                           [Operation('getarg',
                              +                                           [Constant(1)]),
                              +                                 Constant(17)])])]
                              +
                              +

                              It's impossible to see what is going on here, because the Operations in the +basic block appear several times, once as elements of the list but then also as +arguments to operations further down in the list. So we need some code that +turns things back into a readable textual representation, so we have a chance +to debug.

                              +
                              def bb_to_str(bb: Block, varprefix: str = "var"):
                              +    # the implementation is not too important,
                              +    # look at the test below to see what the
                              +    # result looks like
                              +
                              +    def arg_to_str(arg: Value):
                              +        if isinstance(arg, Constant):
                              +            return str(arg.value)
                              +        else:
                              +            # the key must exist, otherwise it's
                              +            # not a valid SSA basic block:
                              +            # the variable must be defined before
                              +            # its first use
                              +            return varnames[arg]
                              +
                              +    varnames = {}
                              +    res = []
                              +    for index, op in enumerate(bb):
                              +        # give the operation a name used while
                              +        # printing:
                              +        var = f"{varprefix}{index}"
                              +        varnames[op] = var
                              +        arguments = ", ".join(
                              +            arg_to_str(op.arg(i))
                              +                for i in range(len(op.args))
                              +        )
                              +        strop = f"{var} = {op.name}({arguments})"
                              +        res.append(strop)
                              +    return "\n".join(res)
                              +
                              +def test_basicblock_to_str():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.add(5, 4)
                              +    var2 = bb.add(var1, var0)
                              +
                              +    assert bb_to_str(bb) == """\
                              +var0 = getarg(0)
                              +var1 = add(5, 4)
                              +var2 = add(var1, var0)"""
                              +
                              +    # with a different prefix for the invented
                              +    # variable names:
                              +    assert bb_to_str(bb, "x") == """\
                              +x0 = getarg(0)
                              +x1 = add(5, 4)
                              +x2 = add(x1, x0)"""
                              +
                              +    # and our running example:
                              +    bb = Block()
                              +    a = bb.getarg(0)
                              +    b = bb.getarg(1)
                              +    var1 = bb.add(b, 17)
                              +    var2 = bb.mul(a, var1)
                              +    var3 = bb.add(b, 17)
                              +    var4 = bb.add(var2, var3)
                              +
                              +    assert bb_to_str(bb, "v") == """\
                              +v0 = getarg(0)
                              +v1 = getarg(1)
                              +v2 = add(v1, 17)
                              +v3 = mul(v0, v2)
                              +v4 = add(v1, 17)
                              +v5 = add(v3, v4)"""
                              +    # Note the re-numbering of the variables! We
                              +    # don't attach names to Operations at all, so
                              +    # the printing will just number them in
                              +    # sequence, can sometimes be a source of
                              +    # confusion.
                              +
                              +

                              This is much better. Now we're done with the basic infrastructure, we can +define sequences of operations and print them in a readable way. Next we need a +central data structure that is used when actually optimizing basic blocks.

                              +

                              Storing Equivalences between Operations Using a Union-Find Data Structure

                              +

                              When optimizing a sequence of operations, we want to make it less costly to +execute. For that we typically want to remove operations (and sometimes +replace operations with less expensive ones). We can remove operations if +they do redundant computation, as in the case of the duplicate add(v1, 17) in +the example. So what we want to do is to turn the running input sequence:

                              +
                              v0 = getarg(0)
                              +v1 = getarg(1)
                              +v2 = add(v1, 17)
                              +v3 = mul(v0, v2)
                              +v4 = add(v1, 17)
                              +v5 = add(v3, v4)
                              +

                              Into the following optimized output sequence:

                              +
                              optvar0 = getarg(0)
                              +optvar1 = getarg(1)
                              +optvar2 = add(optvar1, 17)
                              +optvar3 = mul(optvar0, optvar2)
                              +optvar4 = add(optvar3, optvar2)
                              +

                              We left out the second add (which defines v4), and then replaced the +usage of v4 with v2 in the final operation that defines v5.

                              +

                              What we effectively did was discover that v2 and v4 are equivalent and then +replaced v4 with v2. In general, we might discover more such equivalences, +and we need a data structure to store them. A good data structure to store +these equivalences is Union Find (also called Disjoint-set data structure), +which stores a collection of disjoint sets. Disjoint means that no operation +can appear in more than one set. The sets in our concrete case are the sets of +operations that compute the same result.

                              +

                              When we start out, every operation is in its own singleton set, with no other +member. As we discover more equivalences, we will unify sets into larger sets +of operations that all compute the same result. So one operation the data +structure supports is union, to unify two sets, we'll call that +make_equal_to in the code below.

                              +

                              The other operation the data structure supports is find, which takes an +operation and returns a "representative" of the set of all equivalent +operations. Two operations are in the same set, if the representative that +find returns for them is the same.

                              +

                              The exact details of how the data structure works are only sort of important +(even though it's very cool, I promise!). It's OK to skip over the +implementation. We will add the data structure right into our Value, +Constant and Operation classes:

                              +
                              class Value:
                              +    def find(self):
                              +        raise NotImplementedError("abstract")
                              +    def _set_forwarded(self, value):
                              +        raise NotImplementedError("abstract")
                              +
                              +
                              +class Operation(Value):
                              +    def __init__(self, name: str, args: list[Value]):
                              +        self.name = name
                              +        self.args = args
                              +        self.forwarded = None
                              +
                              +    def __repr__(self):
                              +        return (
                              +            f"Operation({self.name},"
                              +            f"{self.args}, {self.forwarded})"
                              +        )
                              +
                              +    def find(self) -> Value:
                              +        # returns the "representative" value of
                              +        # self, in the union-find sense
                              +        op = self
                              +        while isinstance(op, Operation):
                              +            # could do path compression here too
                              +            # but not essential
                              +            next = op.forwarded
                              +            if next is None:
                              +                return op
                              +            op = next
                              +        return op
                              +
                              +    def arg(self, index):
                              +        # change to above: return the
                              +        # representative of argument 'index'
                              +        return self.args[index].find()
                              +
                              +    def make_equal_to(self, value: Value):
                              +        # this is "union" in the union-find sense,
                              +        # but the direction is important! The
                              +        # representative of the union of Operations
                              +        # must be either a Constant or an operation
                              +        # that we know for sure is not optimized
                              +        # away.
                              +
                              +        self.find()._set_forwarded(value)
                              +
                              +    def _set_forwarded(self, value: Value):
                              +        self.forwarded = value
                              +
                              +
                              +class Constant(Value):
                              +    def __init__(self, value: Any):
                              +        self.value = value
                              +
                              +    def __repr__(self):
                              +        return f"Constant({self.value})"
                              +
                              +    def find(self):
                              +        return self
                              +
                              +    def _set_forwarded(self, value: Value):
                              +        # if we found out that an Operation is
                              +        # equal to a constant, it's a compiler bug
                              +        # to find out that it's equal to another
                              +        # constant
                              +        assert isinstance(value, Constant) and \
                              +            value.value == self.value
                              +
                              +def test_union_find():
                              +    # construct three operations, and unify them
                              +    # step by step
                              +    bb = Block()
                              +    a1 = bb.dummy(1)
                              +    a2 = bb.dummy(2)
                              +    a3 = bb.dummy(3)
                              +
                              +    # at the beginning, every op is its own
                              +    # representative, that means every
                              +    # operation is in a singleton set
                              +    # {a1} {a2} {a3}
                              +    assert a1.find() is a1
                              +    assert a2.find() is a2
                              +    assert a3.find() is a3
                              +
                              +    # now we unify a2 and a1, then the sets are
                              +    # {a1, a2} {a3}
                              +    a2.make_equal_to(a1)
                              +    # they both return a1 as the representative
                              +    assert a1.find() is a1
                              +    assert a2.find() is a1
                              +    # a3 is still different
                              +    assert a3.find() is a3
                              +
                              +    # now they are all in the same set {a1, a2, a3}
                              +    a3.make_equal_to(a2)
                              +    assert a1.find() is a1
                              +    assert a2.find() is a1
                              +    assert a3.find() is a1
                              +
                              +    # now they are still all the same, and we
                              +    # also learned that they are the same as the
                              +    # constant 6
                              +    # the single remaining set then is
                              +    # {6, a1, a2, a3}
                              +    c = Constant(6)
                              +    a2.make_equal_to(c)
                              +    assert a1.find() is c
                              +    assert a2.find() is c
                              +    assert a3.find() is c
                              +
                              +    # union with the same constant again is fine
                              +    a2.make_equal_to(c)
                              +
                              +

                              Constant Folding

                              +

                              Now comes the first actual optimization, a simple constant folding pass. It +will remove operations where all the arguments are constants and replace them +with the constant result.

                              +

                              Every pass has the same structure: we go over all operations in the basic +block in order and decide for each operation whether it can be removed. For the +constant folding pass, we can remove all the operations with constant +arguments (but we'll implement only the add case here).

                              +

                              I will show a buggy version of the constant folding pass first. It has a +problem that is related to why we need the union-find data structure. We will +fix it a bit further down.

                              +
                              def constfold_buggy(bb: Block) -> Block:
                              +    opt_bb = Block()
                              +
                              +    for op in bb:
                              +        # basic idea: go over the list and do
                              +        # constant folding of add where possible
                              +        if op.name == "add":
                              +            arg0 = op.args[0]
                              +            arg1 = op.args[1]
                              +            if isinstance(arg0, Constant) and \
                              +                    isinstance(arg1, Constant):
                              +                # can constant-fold! that means we
                              +                # learned a new equality, namely
                              +                # that op is equal to a specific
                              +                # constant
                              +                value = arg0.value + arg1.value
                              +                op.make_equal_to(Constant(value))
                              +                # don't need to have the operation
                              +                # in the optimized basic block
                              +                continue
                              +        # otherwise the operation is not
                              +        # constant-foldable and we put it into the
                              +        # output list
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              +
                              +def test_constfold_simple():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.add(5, 4)
                              +    var2 = bb.add(var1, var0)
                              +
                              +    opt_bb = constfold_buggy(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = add(9, optvar0)"""
                              +
                              +@pytest.mark.xfail
                              +def test_constfold_buggy_limitation():
                              +    # this test fails! it shows the problem with
                              +    # the above simple constfold_buggy pass
                              +
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    # this is folded
                              +    var1 = bb.add(5, 4)
                              +    # we want this folded too, but it doesn't work
                              +    var2 = bb.add(var1, 10)
                              +    var3 = bb.add(var2, var0)
                              +
                              +    opt_bb = constfold_buggy(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = add(19, optvar0)"""
                              +
                              +

                              Why does the test fail? The opt_bb printed output looks like this:

                              +
                              optvar0 = getarg(0)
                              +optvar1 = add(9, 10)
                              +optvar2 = add(optvar1, optvar0)
                              +

                              The problem is that when we optimize the second addition in constfold_buggy, +the argument of that operation is an Operation not a Constant, so +constant-folding is not applied to the second add. However, we have already +learned that the argument var1 to the operation var2 is equal to +Constant(9). This information is stored in the union-find data structure. +So what we are missing are suitable find calls in the constant folding pass, to +make use of the previously learned equalities.

                              +

                              Here's the fixed version:

                              +
                              def constfold(bb: Block) -> Block:
                              +    opt_bb = Block()
                              +
                              +    for op in bb:
                              +        # basic idea: go over the list and do
                              +        # constant folding of add where possible
                              +        if op.name == "add":
                              +            # >>> changed
                              +            arg0 = op.arg(0) # uses .find()
                              +            arg1 = op.arg(1) # uses .find()
                              +            # <<< end changes
                              +            if isinstance(arg0, Constant) and \
                              +                    isinstance(arg1, Constant):
                              +                # can constant-fold! that means we
                              +                # learned a new equality, namely
                              +                # that op is equal to a specific
                              +                # constant
                              +                value = arg0.value + arg1.value
                              +                op.make_equal_to(Constant(value))
                              +                # don't need to have the operation
                              +                # in the optimized basic block
                              +                continue
                              +        # otherwise the operation is not
                              +        # constant-foldable and we put it into the
                              +        # output list
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              +
                              +def test_constfold_two_ops():
                              +    # now it works!
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.add(5, 4)
                              +    var2 = bb.add(var1, 10)
                              +    var3 = bb.add(var2, var0)
                              +    opt_bb = constfold(bb)
                              +
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = add(19, optvar0)"""
                              +
                              +

                              Common Subexpression Elimination

                              +

                              The constfold pass only discovers equalities between Operations and +Constants. Let's do a second pass that also discovers equalities between +Operations and other Operations.

                              +

                              A simple optimization that does this is common subexpression +elimination (CSE), which will finally optimize away the problem in the +introductory example code that we had above.

                              +
                              def cse(bb: Block) -> Block:
                              +    # structure is the same, loop over the input,
                              +    # add some but not all operations to the
                              +    # output
                              +
                              +    opt_bb = Block()
                              +
                              +    for op in bb:
                              +        # only do CSE for add here, but it
                              +        # generalizes
                              +        if op.name == "add":
                              +            arg0 = op.arg(0)
                              +            arg1 = op.arg(1)
                              +            # Check whether we have emitted the
                              +            # same operation already
                              +            prev_op = find_prev_add_op(
                              +                arg0, arg1, opt_bb)
                              +            if prev_op is not None:
                              +                # if yes, we can optimize op away
                              +                # and replace it with the earlier
                              +                # result, which is an Operation
                              +                # that was already emitted to
                              +                # opt_bb
                              +                op.make_equal_to(prev_op)
                              +                continue
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              +
                              +def eq_value(val0, val1):
                              +    if isinstance(val0, Constant) and \
                              +            isinstance(val1, Constant):
                              +        # constants compare by their value
                              +        return val0.value == val1.value
                              +    # everything else by identity
                              +    return val0 is val1
                              +
                              +
                              +def find_prev_add_op(arg0: Value, arg1: Value,
                              +        opt_bb: Block) -> Optional[Operation]:
                              +    # Really naive and quadratic implementation.
                              +    # What we do is walk over the already emitted
                              +    # operations and see whether we emitted an add
                              +    # with the current arguments already. A real
                              +    # implementation might use a hashmap of some
                              +    # kind, or at least only look at a limited
                              +    # window of instructions.
                              +    for opt_op in opt_bb:
                              +        if opt_op.name != "add":
                              +            continue
                              +        # It's important to call arg here,
                              +        # for the same reason why we
                              +        # needed it in constfold: we need to
                              +        # make sure .find() is called
                              +        if eq_value(arg0, opt_op.arg(0)) and \
                              +                eq_value(arg1, opt_op.arg(1)):
                              +            return opt_op
                              +    return None
                              +
                              +
                              +def test_cse():
                              +    bb = Block()
                              +    a = bb.getarg(0)
                              +    b = bb.getarg(1)
                              +    var1 = bb.add(b, 17)
                              +    var2 = bb.mul(a, var1)
                              +    var3 = bb.add(b, 17)
                              +    var4 = bb.add(var2, var3)
                              +
                              +    opt_bb = cse(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = getarg(1)
                              +optvar2 = add(optvar1, 17)
                              +optvar3 = mul(optvar0, optvar2)
                              +optvar4 = add(optvar3, optvar2)"""
                              +
                              +

                              Strength Reduction

                              +

                              Now we have one pass that replaces Operations with Constants and one that +replaces Operations with previously existing Operations. Let's now do one +final pass that replaces Operations by newly invented Operations, a simple +strength reduction. This one will be simple.

                              +
                              def strength_reduce(bb: Block) -> Block:
                              +    opt_bb = Block()
                              +    for op in bb:
                              +        if op.name == "add":
                              +            arg0 = op.arg(0)
                              +            arg1 = op.arg(1)
                              +            if arg0 is arg1:
                              +                # x + x turns into x << 1
                              +                newop = opt_bb.lshift(arg0, 1)
                              +                op.make_equal_to(newop)
                              +                continue
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              +def test_strength_reduce():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.add(var0, var0)
                              +
                              +    opt_bb = strength_reduce(bb)
                              +
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = lshift(optvar0, 1)"""
                              +
                              +

                              Putting Things Together

                              +

                              Let's combine the passes into one single pass, so that we are going over all +the operations exactly once, instead of having to look at every operation +once for all the different passes.

                              +
                              def optimize(bb: Block) -> Block:
                              +    opt_bb = Block()
                              +
                              +    for op in bb:
                              +        if op.name == "add":
                              +            arg0 = op.arg(0)
                              +            arg1 = op.arg(1)
                              +
                              +            # constant folding
                              +            if isinstance(arg0, Constant) and \
                              +                    isinstance(arg1, Constant):
                              +                value = arg0.value + arg1.value
                              +                op.make_equal_to(Constant(value))
                              +                continue
                              +
                              +            # cse
                              +            prev_op = find_prev_add_op(
                              +                arg0, arg1, opt_bb)
                              +            if prev_op is not None:
                              +                op.make_equal_to(prev_op)
                              +                continue
                              +
                              +            # strength reduce:
                              +            # x + x turns into x << 1
                              +            if arg0 is arg1:
                              +                newop = opt_bb.lshift(arg0, 1)
                              +                op.make_equal_to(newop)
                              +                continue
                              +
                              +            # and while we are at it, let's do some
                              +            # arithmetic simplification:
                              +            # a + 0 => a
                              +            if eq_value(arg0, Constant(0)):
                              +                op.make_equal_to(arg1)
                              +                continue
                              +            if eq_value(arg1, Constant(0)):
                              +                op.make_equal_to(arg0)
                              +                continue
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              +
                              +def test_single_pass():
                              +    bb = Block()
                              +    # constant folding
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.add(5, 4)
                              +    var2 = bb.add(var1, 10)
                              +    var3 = bb.add(var2, var0)
                              +
                              +    opt_bb = optimize(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = add(19, optvar0)"""
                              +
                              +    # cse + strength reduction
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.getarg(1)
                              +    var2 = bb.add(var0, var1)
                              +    var3 = bb.add(var0, var1) # the same as var2
                              +    var4 = bb.add(var2, 2)
                              +    var5 = bb.add(var3, 2) # the same as var4
                              +    var6 = bb.add(var4, var5)
                              +
                              +    opt_bb = optimize(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = getarg(1)
                              +optvar2 = add(optvar0, optvar1)
                              +optvar3 = add(optvar2, 2)
                              +optvar4 = lshift(optvar3, 1)"""
                              +
                              +    # removing + 0
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.add(16, -16)
                              +    var2 = bb.add(var0, var1)
                              +    var3 = bb.add(0, var2)
                              +    var4 = bb.add(var2, var3)
                              +
                              +    opt_bb = optimize(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = lshift(optvar0, 1)"""
                              +
                              +

                              Conclusion

                              +

                              That's it for now. Why is this architecture cool? From a software engineering +point of view, sticking everything into a single function like in optimize +above is obviously not great, and if you wanted to do this for real you would +try to split the cases into different functions that are individually +digestible, or even use a DSL that makes the pattern matching much more +readable. But the advantage of the architecture is that it's quite efficient, +it makes it possible to pack a lot of good optimizations into a single pass +over a basic block.

                              +

                              Of course this works even better if you are in a tracing context, where +everything is put into a trace, which is basically one incredibly long basic +block. In a JIT context it's also quite important that the +optimizer itself runs quickly.

                              +

                              Various other optimizations are possible in this model. There is a +follow-up post that shows how to implement what is arguably PyPy's most +important optimization.

                              +

                              Some Further Pointers

                              +

                              This post is only a short introduction and is taking some shortcuts, so I also wanted to +give some (non-exhaustive) pointers to more general literature about the +touched topics.

                              +

                              The approach to CSE described here can be seen as value +numbering; it's normally implemented with a hashmap though. Here's a +paper that describes various styles of implementing that, even beyond a +single basic block. The paper also partly takes the perspective of discovering +equivalence classes of operations that compute the same result.

                              +

                              A technique that leans even more fully into finding equivalences between +operations is using e-graphs and then applying equality saturation (this is +significantly more advanced than what I described here though). A cool modern +project that applies this technique is egg.

                              +

                              If you squint a bit, you can generally view a constant folding pass as a very +simple form of Partial Evaluation: every operation that has constant +arguments is constant-folded away, and the remaining ones are "residualized", +i.e. put into the output program. This point of view is not super important for +the current post, but will become important in the next one.

                              +

                              Acknowledgements: Thanks to Thorsten Ball for getting me to write +this and for his enthusiastic feedback. I also got great feedback from Max +Bernstein, Matti Picus and Per Vognsen. A conversation with Peng Wu that +we had many many years ago and that stuck with me made me keep thinking about +various ways to view compiler optimizations.

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/10/blog-15-years.html b/posts/2022/10/blog-15-years.html new file mode 100644 index 000000000..c656ec90f --- /dev/null +++ b/posts/2022/10/blog-15-years.html @@ -0,0 +1,371 @@ + + + + + +The PyPy Blog Turns 15 Years | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              The PyPy Blog Turns 15 Years

                              + + + +
                              +

                              Exactly 15 years ago today we wrote the first blog post on the PyPy blog! +Over the years, we have written 423 posts, from the shortest to the +longest. In 2021 we moved from Blogger to our own domain.

                              +

                              The topics over the years varied widely: we published release announcements; +roadmaps; JIT, GC and STM updates; benchmarks; sprint, trip and +conference reports; technical deep dives; case studies; April Fools' +jokes; research projects; other languages using RPython; finished PhD, +Bachelor and Master theses; pictures:

                              + +a collage of photos taken at PyPy sprints +

                              and diagrams:

                              + +a collage of diagrams from previous blog posts +

                              Quite a number of blog posts were very early iterations of papers that we +published later, here are a few that I can remember:

                              + +

                              Greatest Hits

                              +

                              In terms of visitors, the top five posts on the old blog were – on the new blog +we simply don't have stats (yet?):

                              +
                                +
                              1. Let's remove the global interpreter lock

                              2. +
                              3. Tutorial: Writing an Interpreter with PyPy, Part 1

                              4. +
                              5. PyPy's new JSON parser

                              6. +
                              7. PyPy gets funding from Mozilla for Python 3.5 support

                              8. +
                              9. How to make your code 80 times faster

                              10. +
                              +

                              The number of posts per year developed like this:

                              +/images/2022-pypy-posts-per-year.svg

                              The most prolific authors are:

                              +
                                +
                              1. Maciej Fijałkowski

                              2. +
                              3. Carl Friedrich Bolz-Tereick

                              4. +
                              5. Armin Rigo

                              6. +
                              7. Antonio Cuni

                              8. +
                              9. Matti Picus

                              10. +
                              +

                              Several blog posts have made it to the Hacker News front page, three of them to +number 1:

                              +

                              Personal Favourites

                              +

                              While looking through the posts, there were a few that stood out to me in some +way, so here's a subjective list of ones that I had fun looking at again:

                              + +

                              We'd like to thank our authors, guest authors, commenters, users and readers who +have stuck with us through one and a half decades! If there are any particular +topics you would like to read something about, or any guest posts you'd like to +write, let us know!

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/10/toy-optimizer-allocation-removal.html b/posts/2022/10/toy-optimizer-allocation-removal.html new file mode 100644 index 000000000..ab68e1c74 --- /dev/null +++ b/posts/2022/10/toy-optimizer-allocation-removal.html @@ -0,0 +1,1207 @@ + + + + + +Allocation Removal in the Toy Optimizer | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Allocation Removal in the Toy Optimizer

                              + + + +
                              +

                              One of the workhorse optimizations of RPython's tracing JIT is allocation +removal, which removes short-lived object allocations from traces. Many Python +programs create a lot of objects that only live for a short time, and whose +lifespan is fully predictable (common examples are integer and float boxes, but +also tuples, frames, intermediate string results, etc). Allocation removal will +try (and very often succeed) to remove these allocations from traces. In +this blog post I want to show a toy version of how allocation removal is +implemented.

                              +

                              In the previous blog post of this series I showed the complete code for +writing a toy one-pass optimizer that does constant folding, common +subexpression elimination and strength reduction. In this +second post, I want to use allocation removal as a more advanced optimization +pass. The basic optimization framework is the same, we will use the same +data structures for intermediate representation and also keep using the same +union find data structure to store equivalences between IR operations. Here's +the infrastructure code from the last post:

                              +
                              import pytest
                              +from typing import Optional, Any
                              +
                              +
                              +class Value:
                              +    def find(self):
                              +        raise NotImplementedError("abstract")
                              +
                              +    def _set_forwarded(self, value):
                              +        raise NotImplementedError("abstract")
                              +
                              +
                              +class Operation(Value):
                              +    def __init__(
                              +        self, name: str, args: list[Value]
                              +    ):
                              +        self.name = name
                              +        self.args = args
                              +        self.forwarded = None
                              +        self.info = None
                              +
                              +    def __repr__(self):
                              +        return (
                              +            f"Operation({self.name}, "
                              +            f"{self.args}, {self.forwarded}, "
                              +            f"{self.info})"
                              +        )
                              +
                              +    def find(self) -> Value:
                              +        op = self
                              +        while isinstance(op, Operation):
                              +            next = op.forwarded
                              +            if next is None:
                              +                return op
                              +            op = next
                              +        return op
                              +
                              +    def arg(self, index):
                              +        return self.args[index].find()
                              +
                              +    def make_equal_to(self, value: Value):
                              +        self.find()._set_forwarded(value)
                              +
                              +    def _set_forwarded(self, value: Value):
                              +        self.forwarded = value
                              +
                              +
                              +class Constant(Value):
                              +    def __init__(self, value: Any):
                              +        self.value = value
                              +
                              +    def __repr__(self):
                              +        return f"Constant({self.value})"
                              +
                              +    def find(self):
                              +        return self
                              +
                              +    def _set_forwarded(self, value: Value):
                              +        assert (
                              +            isinstance(value, Constant)
                              +            and value.value == self.value
                              +        )
                              +
                              +class Block(list):
                              +    def opbuilder(opname):
                              +        def wraparg(arg):
                              +            if not isinstance(arg, Value):
                              +                arg = Constant(arg)
                              +            return arg
                              +        def build(self, *args):
                              +            # construct an Operation, wrap the
                              +            # arguments in Constants if necessary
                              +            op = Operation(opname,
                              +                [wraparg(arg) for arg in args])
                              +            # add it to self, the basic block
                              +            self.append(op)
                              +            return op
                              +        return build
                              +
                              +    # a bunch of operations we support
                              +    add = opbuilder("add")
                              +    mul = opbuilder("mul")
                              +    getarg = opbuilder("getarg")
                              +    dummy = opbuilder("dummy")
                              +    lshift = opbuilder("lshift")
                              +    # some new one for this post
                              +    alloc = opbuilder("alloc")
                              +    load = opbuilder("load")
                              +    store = opbuilder("store")
                              +    print = opbuilder("print")
                              +
                              +def bb_to_str(bb: Block, varprefix: str = "var"):
                              +    def arg_to_str(arg: Value):
                              +        if isinstance(arg, Constant):
                              +            return str(arg.value)
                              +        else:
                              +            return varnames[arg]
                              +
                              +    varnames = {}
                              +    res = []
                              +    for index, op in enumerate(bb):
                              +        var = f"{varprefix}{index}"
                              +        varnames[op] = var
                              +        arguments = ", ".join(
                              +            arg_to_str(op.arg(i))
                              +                for i in range(len(op.args))
                              +        )
                              +        strop = f"{var} = {op.name}({arguments})"
                              +        res.append(strop)
                              +    return "\n".join(res)
                              +
                              +

                              There are two changes to the code from the last post: Operation instances +have a new .info field, which is set to None by default. We will learn +how the info field is used a bit further down. Also, we define some new +operations.

                              +

                              Interpreter

                              +

                              In this post we will mainly concern ourselves with optimizing +programs that allocate memory. We assume that our language is garbage collected +and memory safe. The new operations that we will optimize are alloc +(allocates some new object), store (stores a value into a fixed field of an +object) and load (loads the value from a field in the object).

                              +

                              We are leaving out a lot of details of a "real" system here. Usually an +alloc operation would get some extra information, for example the type of +the freshly allocated object or at least its size. load and store would +typically have some kind of field offset and maybe some information about the +field's type.

                              +

                              Here's a simple program that uses these operations:

                              +
                              var0 = getarg(0)
                              +obj0 = alloc()
                              +store(obj0, 0, var0)
                              +var1 = load(obj0, 0)
                              +print(var1)
                              +

                              The code allocates a new object obj0, stores var0 into field 0 of +the object, then loads the same field and prints the result of the load.

                              +

                              Before we get started in writing the optimizer for these operations, let's try +to understand the semantics of the new operations a bit better. To do this, we +can sketch a small interpreter for basic blocks, supporting only getarg, +alloc, store, load, print:

                              +
                              def test_interpret():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    obj = bb.alloc()
                              +    sto = bb.store(obj, 0, var0)
                              +    var1 = bb.load(obj, 0)
                              +    bb.print(var1)
                              +    assert interpret(bb, 17) == 17
                              +
                              +class Object:
                              +    def __init__(self):
                              +        self.contents: dict[int, Any] = {}
                              +
                              +    def store(self, idx : int, value : Any):
                              +        self.contents[idx] = value
                              +
                              +    def load(self, idx : int):
                              +        return self.contents[idx]
                              +
                              +def get_num(op, index=1):
                              +    assert isinstance(op.arg(index), Constant)
                              +    return op.arg(index).value
                              +
                              +def interpret(bb : Block, *args : tuple[Any]):
                              +    def argval(op, i):
                              +        arg = op.arg(i)
                              +        if isinstance(arg, Constant):
                              +            return arg.value
                              +        else:
                              +            assert isinstance(arg, Operation)
                              +            return arg.info
                              +
                              +    for index, op in enumerate(bb):
                              +        if op.name == "getarg":
                              +            res = args[get_num(op, 0)]
                              +        elif op.name == "alloc":
                              +            res = Object()
                              +        elif op.name == "load":
                              +            fieldnum = get_num(op)
                              +            res = argval(op, 0).load(fieldnum)
                              +        elif op.name == "store":
                              +            obj = argval(op, 0)
                              +            fieldnum = get_num(op)
                              +            fieldvalue = argval(op, 2)
                              +            obj.store(fieldnum, fieldvalue)
                              +            # no result, only side effect
                              +            continue
                              +        elif op.name == "print":
                              +            res = argval(op, 0)
                              +            print(res)
                              +            return res
                              +        else:
                              +            raise NotImplementedError(
                              +                f"{op.name} not supported")
                              +        op.info = res
                              +
                              +

                              The interpreter walks the operations of a block, executing each one in turn. It +uses the info field to store the result of each already executed +Operation. In this interpreter sketch we stop at the first print that +we execute and return its argument for the simple but bad reason that it makes +test_interpret easier to write.

                              +

                              Objects in the interpreter are represented using a class Object, which +stores the object's fields into a Python dictionary. As written above, this is a +simplification, in a real system the alloc operation might for example take +some kind of type as an argument, that describes which kinds of fields an +object has and how they are laid out in memory, which would allow more +efficient storage of the content. But we don't want to care about this level of +detail in the post, so using a dict in the interpreter is good enough.

                              +

                              Version 1: Naive Attempt

                              +

                              In many programs, some allocated objects don't live for very long and have a +completely predictable lifetime. They get allocated, used for a while, and then +there is no way to reference them any more, so the garbage collector will +reclaim them. The very first example block had such an allocation:

                              +
                              var0 = getarg(0)
                              +obj0 = alloc()
                              +store(obj0, 0, var0)
                              +var1 = load(obj0, 0)
                              +print(var1)
                              +

                              Here obj0 is written to, then read from, and then it's no longer used. We +want to optimize such programs to remove this alloc operation. The optimized +version of this program would look like this:

                              +
                              var0 = getarg(0)
                              +print(var0)
                              +

                              The alloc, store and load operations have been completely removed. +This is a pretty important optimization for PyPy's JIT: Allocations, memory +reads and writes are quite costly and occur a lot in Python, so getting rid +of as many of them as possible is instrumental for performance.

                              +

                              Implementing the optimization is not a lot of code! However, understanding all +the corner cases of the +optimization and making sure that the resulting program behaves correctly is not +completely trivial. Therefore we will develop the optimization step by step, in +a test driven fashion: I will start each section with a new test that shows a +bug in the version of the optimization that we have so far.

                              +

                              Let's start in a really naive way. Here's the first test we would like to +pass, using the example program above:

                              +
                              def test_remove_unused_allocation():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    obj = bb.alloc()
                              +    sto = bb.store(obj, 0, var0)
                              +    var1 = bb.load(obj, 0)
                              +    bb.print(var1)
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    # the virtual object looks like this:
                              +    #  obj
                              +    # ┌──────────┐
                              +    # │ 0: var0  │
                              +    # └──────────┘
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = print(optvar0)"""
                              +
                              +

                              We will define a class VirtualObject that is basically identical to +Object above. But it will not be used by the interpreter, instead we will +use it during optimization.

                              +
                              class VirtualObject:
                              +    def __init__(self):
                              +        self.contents: dict[int, Value] = {}
                              +
                              +    def store(self, idx, value):
                              +        self.contents[idx] = value
                              +
                              +    def load(self, idx):
                              +        return self.contents[idx]
                              +
                              +

                              The structure of the optimizer is going to be like those in the first blog post. +The optimizer makes a single pass over all operations. It removes some and +emits others.

                              +

                              This first version of the allocation removal optimizer is going to be extremely +optimistic. It simply assumes that all the allocations in the program can be +optimized away. That is not realistic in practice. We will have to +refine this approach later, but it's a good way to start. That means whenever +the optimizer sees an alloc operation, it removes it and creates a +VirtualObject object which stores the information that is known during +optimization about the result of the alloc. Like in the interpreter, the +VirtualObject is stored in the .info field of the Operation instance +that represents the alloc.

                              +

                              When the optimizer sees a store operation, it will also remove it and +instead execute the store by calling the VirtualObject.store method. +Here is one important difference between the interpreter and the optimizer: In +the interpreter, the values that were stored into an Object (and thus +put into the object's .contents dictionary) were runtime values, for +example integers or other objects. In the optimizer however, the +fields of the VirtualObject store Value instances, either Constant +instances or Operation instances.

                              +

                              When the optimizer sees a load operation, it also removes it, and replaces +the load with the Operation (or Constant) that is stored in the +VirtualObject at that point:

                              +
                              def optimize_alloc_removal(bb):
                              +    opt_bb = Block()
                              +    for op in bb:
                              +        if op.name == "alloc":
                              +            op.info = VirtualObject()
                              +            continue
                              +        if op.name == "load":
                              +            info = op.arg(0).info
                              +            field = get_num(op)
                              +            op.make_equal_to(info.load(field))
                              +            continue
                              +        if op.name == "store":
                              +            info = op.arg(0).info
                              +            field = get_num(op)
                              +            info.store(field, op.arg(2))
                              +            continue
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              +

                              This is the first version of the optimization. It doesn't handle all kinds of +difficult cases, and we'll have to do something about its optimism. +But, already in this minimalistic form, we can write a slightly more complicated +test with two allocations, one object pointing to the other. It works correctly +too, both allocations are removed:

                              +
                              def test_remove_two_allocations():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    obj0 = bb.alloc()
                              +    sto1 = bb.store(obj0, 0, var0)
                              +    obj1 = bb.alloc()
                              +    sto2 = bb.store(obj1, 0, obj0)
                              +    var1 = bb.load(obj1, 0)
                              +    var2 = bb.load(var1, 0)
                              +    bb.print(var2)
                              +    # the virtual objects look like this:
                              +    #  obj1
                              +    # ┌──────┐
                              +    # │ 0: ╷ │
                              +    # └────┼─┘
                              +    #      │
                              +    #      ▼
                              +    #     obj0
                              +    #   ┌─────────┐
                              +    #   │ 0: var0 │
                              +    #   └─────────┘
                              +    # therefore
                              +    # var1 is the same as obj0
                              +    # var2 is the same as var0
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = print(optvar0)"""
                              +
                              +

                              Version 2: Re-Materializing Allocations

                              +

                              To make it easier to talk about how the optimizer operates, let's introduce +some terminology. As already seen by the choice +of the class name VirtualObject, we will call an object virtual if the +optimizer has optimized away the alloc operation that creates the object. +Other objects are equivalently not virtual, for example those that have +existed before we enter the current code block.

                              +

                              The first problem that we need to fix is the assumption that every +allocation can be removed. So far we only looked at small programs where every +allocation could be removed, or equivalently, where every object is virtual. +These programs create virtual objects, store into and load from them, and +then forget the objects. In this simple case removing the allocations is fine. +As we saw in the previous section, it's also fine to have a virtual object +reference another virtual, both allocations can be removed.

                              +

                              What are the cases where we can't remove an allocation? +The first version of the optimizer simply assumed that every allocation can be +removed. This can't work. We will replace this assumption with the following +simple heuristic:

                              +

                              If a reference to a virtual object a is stored into an object b +that is not virtual, then a will also stop being virtual. If an object a +that was virtual stops being virtual, we say that it escapes. ¹

                              +

                              The simplest test case for this happening looks like this:

                              +
                              def test_materialize():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    obj = bb.alloc()
                              +    sto = bb.store(var0, 0, obj)
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    #  obj is virtual, without any fields
                              +    # ┌───────┐
                              +    # │ empty │
                              +    # └───────┘
                              +    # then we store a reference to obj into
                              +    # field 0 of var0. Since var0 is not virtual,
                              +    # obj escapes, so we have to put it back
                              +    # into the optimized basic block
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = alloc()
                              +optvar2 = store(optvar0, 0, optvar1)"""
                              +    # so far, fails like this:
                              +    # the line:
                              +    # info.store(field, op.arg(2))
                              +    # produces an AttributeError because info
                              +    # is None
                              +
                              +

                              If the optimizer reaches a point where a virtual object escapes (like the +store operation in the test), the optimizer has already removed the alloc +operation that created the virtual object. If the object escapes, we don't want +to go back in the operations list and re-insert the alloc operation, that +sounds potentially very complicated. Instead, we re-insert the alloc +operation that will recreate the virtual object at the point of escape using a +helper function materialize.

                              +
                              def materialize(opt_bb, value: Operation) -> None:
                              +    assert not isinstance(value, Constant)
                              +    assert isinstance(value, Operation)
                              +    info = value.info
                              +    assert isinstance(info, VirtualObject)
                              +    assert value.name == "alloc"
                              +    # put the alloc operation back into the trace
                              +    opt_bb.append(value)
                              +
                              +

                              I've added a number of fairly strong assertions to materialize to encode our +current assumptions about the situations in which it expects to be called. We +will remove some of them later as we generalize the code.

                              +

                              Now that we have materialize we need to change optimize_alloc_removal to +recognize the case of storing a virtual object into a non-virtual one. We can +recognize Operation instances that produced a virtual object by looking at +their .info field. If it is None, the object is not virtual, otherwise +it is. If we store something into a virtual object, we leave the code as above. +If we store a virtual object into an object that is not virtual, we will first +materialize the virtual object, and then emit the store.

                              +
                              def optimize_alloc_removal(bb):
                              +    opt_bb = Block()
                              +    for op in bb:
                              +        if op.name == "alloc":
                              +            op.info = VirtualObject()
                              +            continue
                              +        if op.name == "load":
                              +            info = op.arg(0).info
                              +            field = get_num(op)
                              +            op.make_equal_to(info.load(field))
                              +            continue
                              +        if op.name == "store":
                              +            info = op.arg(0).info
                              +            if info: # virtual
                              +                field = get_num(op)
                              +                info.store(field, op.arg(2))
                              +                continue
                              +            else: # not virtual
                              +                # first materialize the
                              +                # right hand side
                              +                materialize(opt_bb, op.arg(2))
                              +                # then emit the store via
                              +                # the general path below
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              +

                              This is the general idea, and it is enough to pass test_materialize. But of +course there are still a number of further problems that we now need to solve.

                              +

                              Version 3: Don't Materialize Twice

                              +

                              The first problem is the fact that after we materialize a virtual object, it is +no longer virtual. So if it escapes a second time, it should not be +materialized a second time. A test for that case could simply repeat the +store operation:

                              +
                              def test_dont_materialize_twice():
                              +    # obj is again an empty virtual object,
                              +    # and we store it into var0 *twice*.
                              +    # this should only materialize it once
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    obj = bb.alloc()
                              +    sto0 = bb.store(var0, 0, obj)
                              +    sto1 = bb.store(var0, 0, obj)
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = alloc()
                              +optvar2 = store(optvar0, 0, optvar1)
                              +optvar3 = store(optvar0, 0, optvar1)"""
                              +    # fails so far: the operations that we get
                              +    # at the moment are:
                              +    # optvar0 = getarg(0)
                              +    # optvar1 = alloc()
                              +    # optvar2 = store(optvar0, 0, optvar1)
                              +    # optvar3 = alloc()
                              +    # optvar4 = store(optvar0, 0, optvar3)
                              +    # ie the object is materialized twice,
                              +    # which is incorrect
                              +
                              +

                              We solve the problem by setting the .info field of an object that we +materialize to None to mark it as no longer being virtual.

                              +
                              def materialize(opt_bb, value: Operation) -> None:
                              +    assert not isinstance(value, Constant)
                              +    assert isinstance(value, Operation)
                              +    info = value.info
                              +    if info is None:
                              +        return # already materialized
                              +    assert value.name == "alloc"
                              +    # put the alloc operation back into the trace
                              +    opt_bb.append(value)
                              +    # but only once
                              +    value.info = None
                              +
                              +# optimize_alloc_removal unchanged
                              +
                              +

                              This fixes the problem, only one alloc is created. This fix also allows +another test case to pass, one where we store a non-virtual into another +non-virtual, code which we cannot optimize at all:

                              +
                              def test_materialize_non_virtuals():
                              +    # in this example we store a non-virtual var1
                              +    # into another non-virtual var0
                              +    # this should just lead to no optimization at
                              +    # all
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.getarg(1)
                              +    sto = bb.store(var0, 0, var1)
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = getarg(1)
                              +optvar2 = store(optvar0, 0, optvar1)"""
                              +
                              +

                              Version 4: Materialization of Constants

                              +

                              Another straightforward extension is to support materializing constants. A +constant is never virtual, so materializing it should do nothing.

                              +
                              def test_materialization_constants():
                              +    # in this example we store the constant 17
                              +    # into the non-virtual var0
                              +    # again, this will not be optimized
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    sto = bb.store(var0, 0, 17)
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    # the previous line fails so far, triggering
                              +    # the assert:
                              +    # assert not isinstance(value, Constant)
                              +    # in materialize
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = store(optvar0, 0, 17)"""
                              +
                              +

                              To implement that case, we check for value being a constant and return +early:

                              +
                              def materialize(opt_bb, value: Operation) -> None:
                              +    if isinstance(value, Constant):
                              +        return
                              +    assert isinstance(value, Operation)
                              +    info = value.info
                              +    if info is None:
                              +        return # already materialized
                              +    assert value.name == "alloc"
                              +    # put the alloc operation back into the trace
                              +    opt_bb.append(value)
                              +    # but only once
                              +    value.info = None
                              +
                              +# optimize_alloc_removal unchanged
                              +
                              +

                              Version 5: Materializing Fields

                              +

                              Now we need to solve a more difficult problem. So far, the virtual objects that +we have materialized have all been empty, meaning they didn't have any fields +written to at the point of materialization. Let's write a test for this:

                              +
                              def test_materialize_fields():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.getarg(1)
                              +    obj = bb.alloc()
                              +    contents0 = bb.store(obj, 0, 8)
                              +    contents1 = bb.store(obj, 1, var1)
                              +    sto = bb.store(var0, 0, obj)
                              +
                              +    # the virtual obj looks like this
                              +    #  obj
                              +    # ┌──────┬──────────┐
                              +    # │ 0: 8 │ 1: var1  │
                              +    # └──────┴──────────┘
                              +    # then it needs to be materialized
                              +    # this is the first example where a virtual
                              +    # object that we want to materialize has any
                              +    # content and is not just an empty object
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = getarg(1)
                              +optvar2 = alloc()
                              +optvar3 = store(optvar2, 0, 8)
                              +optvar4 = store(optvar2, 1, optvar1)
                              +optvar5 = store(optvar0, 0, optvar2)"""
                              +    # fails so far! the operations we get
                              +    # at the moment are:
                              +    # optvar0 = getarg(0)
                              +    # optvar1 = getarg(1)
                              +    # optvar2 = alloc()
                              +    # optvar3 = store(optvar0, 0, optvar2)
                              +    # which is wrong, because the store operations
                              +    # into optvar2 got lost
                              +
                              +

                              To fix this problem, we need to re-create a store operation for every +element of the .contents dictionary of the virtual object we are +materializing. ²

                              +
                              def materialize(opt_bb, value: Operation) -> None:
                              +    if isinstance(value, Constant):
                              +        return
                              +    assert isinstance(value, Operation)
                              +    info = value.info
                              +    if info is None:
                              +        return # already materialized
                              +    assert value.name == "alloc"
                              +    # put the alloc operation back into the trace
                              +    opt_bb.append(value)
                              +    # put the content back
                              +    for idx, val in info.contents.items():
                              +        # re-create store operation
                              +        opt_bb.store(value, idx, val)
                              +    # only materialize once
                              +    value.info = None
                              +
                              +# optimize_alloc_removal unchanged
                              +
                              +

                              This is enough to pass the test.

                              +

                              Version 6: Recursive Materialization

                              +

                              In the above example, the fields of the virtual objects contained +only constants or non-virtual objects. However, we could have a situation where +a whole tree of virtual objects is built, and then the root of the tree escapes. +This makes it necessary to escape the whole tree. Let's write a test for a small +tree of two virtual objects:

                              +
                              def test_materialize_chained_objects():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    obj0 = bb.alloc()
                              +    obj1 = bb.alloc()
                              +    contents = bb.store(obj0, 0, obj1)
                              +    const = bb.store(obj1, 0, 1337)
                              +    sto = bb.store(var0, 0, obj0)
                              +    #  obj0
                              +    # ┌──────┐
                              +    # │ 0: ╷ │
                              +    # └────┼─┘
                              +    #      │
                              +    #      ▼
                              +    #     obj1
                              +    #   ┌─────────┐
                              +    #   │ 0: 1337 │
                              +    #   └─────────┘
                              +    # now obj0 escapes
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = alloc()
                              +optvar2 = alloc()
                              +optvar3 = store(optvar2, 0, 1337)
                              +optvar4 = store(optvar1, 0, optvar2)
                              +optvar5 = store(optvar0, 0, optvar1)"""
                              +    # fails in an annoying way! the resulting
                              +    # basic block is not in proper SSA form
                              +    # so printing it fails. The optimized
                              +    # block would look like this:
                              +    # optvar0 = getarg(0)
                              +    # optvar1 = alloc()
                              +    # optvar3 = store(optvar1, 0, optvar2)
                              +    # optvar4 = store(optvar0, 0, optvar1)
                              +    # where optvar2 is an ``alloc`` Operation
                              +    # that is not itself in the output block
                              +
                              +

                              To fix it, materialize needs to call itself recursively for all the field +values of the virtual object:

                              +
                              def materialize(opt_bb, value: Operation) -> None:
                              +    if isinstance(value, Constant):
                              +        return
                              +    assert isinstance(value, Operation)
                              +    info = value.info
                              +    if info is None:
                              +        return # already materialized
                              +    assert value.name == "alloc"
                              +    # put the alloc operation back into the trace
                              +    opt_bb.append(value)
                              +    # put the content back
                              +    for idx, val in sorted(info.contents.items()):
                              +        # materialize recursively
                              +        materialize(opt_bb, val)
                              +        opt_bb.store(value, idx, val)
                              +    # only materialize once
                              +    value.info = None
                              +
                              +# optimize_alloc_removal unchanged
                              +
                              +

                              Getting there, the materialization logic is almost done. We need to fix a +subtle remaining problem though.

                              +

                              Version 7: Dealing with Object Cycles

                              +

                              The bug we need to fix in this section is a bit tricky, and does not immediately +occur in a lot of programs. In +fact, in PyPy a variant of it was hiding out in our optimizer +until we found it much later (despite us being aware of the general problem and +correctly dealing with it in other cases).

                              +

                              The problem is this: a virtual object can (directly or indirectly) point to +itself, and we must carefully deal with that case to avoid infinite recursion in +materialize. Here's the simplest test:

                              +
                              def test_object_graph_cycles():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.alloc()
                              +    var2 = bb.store(var1, 0, var1)
                              +    var3 = bb.store(var0, 1, var1)
                              +    #   ┌────────┐
                              +    #   ▼        │
                              +    #  obj0      │
                              +    # ┌──────┐   │
                              +    # │ 0: ╷ │   │
                              +    # └────┼─┘   │
                              +    #      │     │
                              +    #      └─────┘
                              +    # obj0 points to itself, and then it is
                              +    # escaped
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    # the previous line fails with an
                              +    # RecursionError
                              +    # materialize calls itself, infinitely
                              +
                              +    # what we want is instead this output:
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = alloc()
                              +optvar2 = store(optvar1, 0, optvar1)
                              +optvar3 = store(optvar0, 1, optvar1)"""
                              +
                              +

                              The fix is not a big change, but a little bit subtle nevertheless. +We have to change the +order in which things are done in materialize. Right after emitting the +alloc, we set the .info to None, to mark the object as not virtual. +Only afterwards do we re-create the stores and call materialize recursively. +If a recursive call reaches the same object, it's already marked as non-virtual, +so materialize won't recurse further:

                              +
                              def materialize(opt_bb, value: Operation) -> None:
                              +    if isinstance(value, Constant):
                              +        return
                              +    assert isinstance(value, Operation)
                              +    info = value.info
                              +    if info is None:
                              +        return # already materialized
                              +    assert value.name == "alloc"
                              +    # put the alloc operation back into the trace
                              +    opt_bb.append(value)
                              +    # only materialize once
                              +    value.info = None
                              +    # put the content back
                              +    for idx, val in sorted(info.contents.items()):
                              +        # materialize recursively
                              +        materialize(opt_bb, val)
                              +        opt_bb.store(value, idx, val)
                              +
                              +

                              Version 8: Loading from non-virtual objects

                              +

                              Now materialize is done. We need to go back to optimize_alloc_removal and +improve it further. The last time we changed it, we added a case analysis to the +code dealing with store, distinguishing between storing to a virtual and to +a non-virtual object. We need to add an equivalent distinction to the load +case, because right now loading from a non-virtual crashes.

                              +
                              def test_load_non_virtual():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.load(var0, 0)
                              +    bb.print(var1)
                              +    # the next line fails in the line
                              +    # op.make_equal_to(info.load(field))
                              +    # because info is None
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = load(optvar0, 0)
                              +optvar2 = print(optvar1)"""
                              +
                              +

                              To fix it, we split the load code into two cases, leaving the virtual path +as before, and letting the load from a non-virtual fall through to the +general code at the end of the function.

                              +
                              def optimize_alloc_removal(bb):
                              +    opt_bb = Block()
                              +    for op in bb:
                              +        if op.name == "alloc":
                              +            op.info = VirtualObject()
                              +            continue
                              +        if op.name == "load":
                              +            info = op.arg(0).info
                              +            if info: # virtual
                              +                field = get_num(op)
                              +                op.make_equal_to(info.load(field))
                              +                continue
                              +            # otherwise not virtual, use the
                              +            # general path below
                              +        if op.name == "store":
                              +            info = op.arg(0).info
                              +            if info: # virtual
                              +                field = get_num(op)
                              +                info.store(field, op.arg(2))
                              +                continue
                              +            else: # not virtual
                              +                # first materialize the
                              +                # right hand side
                              +                materialize(opt_bb, op.arg(2))
                              +                # then emit the store via
                              +                # the general path below
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              +

                              Version 9 (Final): Materialize on Other Operations

                              +

                              We're almost at the end now. There's one final generalization left to do. We +started with the heuristic that storing a virtual into a non-virtual would +escape it. This should be generalized. Passing a virtual into any +operation where it is not the first argument of a load or a store +should also escape it (imagine passing the virtual to some function call). +Let's test this as usual with our print operation:

                              +
                              def test_materialize_on_other_ops():
                              +    # materialize not just on store
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.alloc()
                              +    var2 = bb.print(var1)
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = alloc()
                              +optvar2 = print(optvar1)"""
                              +    # again, the resulting basic block is not in
                              +    # valid SSA form
                              +
                              +

                              To fix this, we will take the call to materialize out of the store code +path and instead put it into the generic code path at the end of the for +loop:

                              +
                              # materialize is unchanged
                              +def materialize(opt_bb, value: Value) -> None:
                              +    if isinstance(value, Constant):
                              +        return
                              +    assert isinstance(value, Operation)
                              +    info = value.info
                              +    if not info:
                              +        # Already materialized
                              +        return
                              +    assert value.name == "alloc"
                              +    opt_bb.append(value)
                              +    value.info = None
                              +    for idx, val in sorted(info.contents.items()):
                              +        materialize(opt_bb, val)
                              +        opt_bb.store(value, idx, val)
                              +
                              +def optimize_alloc_removal(bb):
                              +    opt_bb = Block()
                              +    for op in bb:
                              +        if op.name == "alloc":
                              +            op.info = VirtualObject()
                              +            continue
                              +        if op.name == "load":
                              +            info = op.arg(0).info
                              +            if info: # virtual
                              +                field = get_num(op)
                              +                op.make_equal_to(info.load(field))
                              +                continue
                              +        if op.name == "store":
                              +            info = op.arg(0).info
                              +            if info: # virtual
                              +                field = get_num(op)
                              +                info.store(field, op.arg(2))
                              +                continue
                              +        # materialize all the arguments of
                              +        # operations that are put into the
                              +        # output basic block
                              +        for arg in op.args:
                              +            materialize(opt_bb, arg.find())
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              +

                              That's it, we're done. It's not a lot of code, but actually quite a powerful +optimization. In addition to removing allocations for objects that are only used +briefly and in predictable ways, it also has another effect. If an object is +allocated, used in a number of operations and then escapes further down in the +block, the operations in between can often be optimized away. This is +demonstrated by the next test (which already passes):

                              +
                              def test_sink_allocations():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.alloc()
                              +    var2 = bb.store(var1, 0, 123)
                              +    var3 = bb.store(var1, 1, 456)
                              +    var4 = bb.load(var1, 0)
                              +    var5 = bb.load(var1, 1)
                              +    var6 = bb.add(var4, var5)
                              +    var7 = bb.store(var1, 0, var6)
                              +    var8 = bb.store(var0, 1, var1)
                              +    opt_bb = optimize_alloc_removal(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = add(123, 456)
                              +optvar2 = alloc()
                              +optvar3 = store(optvar2, 0, optvar1)
                              +optvar4 = store(optvar2, 1, 456)
                              +optvar5 = store(optvar0, 1, optvar2)"""
                              +
                              +

                              Note that the addition is not optimized away, because the code from this blog +post does not contain constant folding and the other optimizations from +the last one. Combining them would not be too hard though.

                              +

                              Conclusion

                              +

                              That's it! The core idea of PyPy's allocation removal optimization in one or +two screens of code. The real implementation has a number of refinements, +but the core ideas are all here.

                              +

                              I'm not going to show any benchmark numbers or anything like that here. If you +are interested in numbers, you could look at the evaluation in Section 6, +"Implementation and Evaluation", of the paper that describes the work.

                              +

                              There's a complementary optimization that improves load and store +operations for objects that are not virtual. I'll probably not write that +down as another post, but Max Bernstein and I developed that together on a +PyPy Twitch channel a few weeks ago. Here's the recording:

                              +

                              Footnotes

                              +

                              ¹ This is how PyPy uses the terminology, not really used consistently by other +projects. The term "escape" is fairly standard throughout the escape +analysis literature. The term "virtual" was used originally in Armin Rigo's +Psyco but is e.g. also used by the paper Partial Escape Analysis and Scalar +Replacement for Java.

                              +

                              ² The order in which we put the store operations back is relying on +dictionary iteration order, which is insertion order. That's not a bad +ordering, we could also be explicit and sort the fields in some order (ideally +the order in which the object lays them out in memory).

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/11/pypy-and-conda-forge.html b/posts/2022/11/pypy-and-conda-forge.html new file mode 100644 index 000000000..948acbaa9 --- /dev/null +++ b/posts/2022/11/pypy-and-conda-forge.html @@ -0,0 +1,334 @@ + + + + + +PyPy and conda-forge | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy and conda-forge

                              + + + +
                              +

                              You can use PyPy as your python interpreter in a conda environment. The +conda-forge team has graciously provided this service.

                              +

                              The conda-forge tips-and-tricks +page says:

                              +
                              +

                              The conda-forge channel supports creating and installing packages into +environments using the PyPy interpreter. Many packages are already available. +You need to enable the conda-forge channel and use the pypy identifier when +creating your environment:

                              +
                              +
                                $ conda create -c conda-forge -n my-pypy-env pypy python=3.8
                              +  $ conda activate my-pypy-env
                              +
                              + +
                              +

                              Currently supported python versions are 3.8 and 3.9. Support for pypy3.7 has +been dropped. While you can still create a python 3.7 environment, you +will not be getting updates as new package versions are released (including +pypy itself).

                              +

                              if you are using defaults as a low priority channel, then you need to use +strict channel priority as the metadata in defaults has not been patched yet +which allows cpython extension packages to be installed alongside pypy.

                              +
                              +
                                $ conda config --set channel_priority strict
                              +
                              + +

                              The work required some out-of-the-box thinking on the part of conda-forge since +they needed to add the idea of a pypy identifier to the python version and +the whole conda team has been very supportive of the effort needed. Binary +packages are on offer for the usual platforms:

                              +
                                +
                              • +x86_64 windows, macos, linux
                              • +
                              • +ppc64le and aarch64 linux.
                              • +
                              +

                              There are currently over 1000 packages available for download via the +conda-forge channel, and more are being added as the kind package maintainers +work around various differences between CPython and PyPy. Please let us know if +your favorite package is not supported.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/12/jit-bug-finding-smt-fuzzing.html b/posts/2022/12/jit-bug-finding-smt-fuzzing.html new file mode 100644 index 000000000..f4565a982 --- /dev/null +++ b/posts/2022/12/jit-bug-finding-smt-fuzzing.html @@ -0,0 +1,866 @@ + + + + + +Finding JIT Optimizer Bugs using SMT Solvers and Fuzzing | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Finding JIT Optimizer Bugs using SMT Solvers and Fuzzing

                              + + + +
                              +

                              In this blog post I want to describe a recent bug finding technique that I've +added to the PyPy JIT testing infrastructure. This technique uses the Z3 +theorem prover to find bugs in the optimizer of PyPy's JIT, in particular its +integer operation optimizations. The approach is +based on things I have learned from John Regehr's blog (this post is a +good first one to read), Twitter, and on +his (et al) paper Alive2: Bounded Translation Validation for LLVM. The work +was triggered by a recent miscompilation bug my current bachelor student Nico +Rittinghaus found.

                              +

                              Background: Python Integers in the PyPy JIT

                              +

                              The optimizer of PyPy's JITs operates on traces, which are linear sequences of +instructions with guards. The instructions in the traces operate on different +machine-level data types, machine integers, doubles, pointers, bools, etc. In +this post we'll be mostly concerned with machine integers.

                              +

                              To give some wider context I'll explain a bit how Python ints in the user code +relate to the types that are used in traces when the PyPy Python implementation +is used. +When PyPy turns a regular Python 3 function into a trace, there is a lot of work +happening in the JIT frontend to try to observe and infer the types that the +Python function concretely uses at runtime. The traces are generated under these +typing assumptions. Therefore, code that uses ints in the Python code can +typically be translated into traces that operate on machine integers. In order +to make sure that the Python integer semantics are upheld, many of the +operations in the traces need to check that the integer results of some +operations still fit into a machine integer. If that is not the case (a rare +situation for most programs), the trace is left via a guard, execution falls +back to the interpreter, and there a big integer representation is chosen for +the too big value (the big integer representation is done via a pointer and +some storage on the heap).

                              +

                              All of this machinery is not going to be too relevant for the rest of the +post. For the post it's important to know that trace instructions operate on +machine integers and other low-level types, and some of the operations can +optionally check whether the +results still fit into a machine integer. These trace operations are improved by +the optimizer, which tries to transform the trace into one that behaves the +same, but is less costly to execute.

                              +

                              Background: Bounds Analysis in PyPy's JIT

                              +

                              The optimizer of PyPy's JIT has an analysis based on abstract interpretation +that tries to find out whether the integer values stored in a variable are +actually not using the full 64 bit (or 32 bit) range, but instead fit into some +smaller range. This means that for every integer variable x in a trace, the +JIT compiler tracks upper and lower bounds of the runtime value of that +variable: a range [a, b] such that for every concrete runtime value v +that gets stored in variable x, a <= v <= b must be true. +a and b start out +as the most general MININT and MAXINT, but sometimes there is extra +information that makes it possible to improve these known bounds, and that is +often useful to optimize the code.

                              +

                              A typical example is that the JIT knows that the length of a string is +non-negative, so for this kind of code: x = len(s) where s is a string, +x gets a range [0, MAXINT] assigned. With this information we could for +example remove a check x + 10 < 0 completely, because it can never be true.

                              +

                              The bounds information is useful for optimization, but the analysis of the +bounds is also a source of bugs in the JIT, because the reasoning is often +subtle and easy to get wrong in corner cases. We already use a number of testing +techniques to try to make sure that it is correct. A simple one is +property-based testing using Hypothesis on the operations on bounds. Even +though Hypothesis is fantastic, it unfortunately does not catch +absolutely all the bugs even if we'd like it to, as we'll see in the next +section.

                              +

                              Motivation: A JIT Miscompilation

                              +

                              I am currently supervising a Bachelor thesis by Nico Rittinghaus, who is +extending the integer analysis in the JIT. He'll probably write a separate blog +post about that soon. In the process of his work, the current bounds analysis +code got a lot of scrutiny, and we found out that one of the unit tests of the +bounds analysis was actually incorrect, and the example code in that unit test +was optimized incorrectly. This case of incorrect optimization is not a big deal +for regular Python code, because it involved a "wrapping integer addition +operation", i.e. one where overflowing results just wrap around to negative +values. All the additions and other arithmetic operations that the PyPy Python +frontend generates actually have +overflow checks (to be able to switch to a big integer representation if +needed). +However, it's still possible to trigger the problem with the +__pypy__.intop.int_add API which is a function that exposes wraparound +arithmetic on Python ints.

                              +

                              Here's the miscompilation. The JIT optimizes the following function:

                              +
                              import __pypy__
                              +
                              +def wrong(x):
                              +    a = __pypy__.intop.int_add(x, 10)
                              +    if a < 15:
                              +        if x < 6:
                              +            return 0
                              +        return 1
                              +    return 2
                              +
                              +

                              Into the following code:

                              +
                              import __pypy__
                              +
                              +def wrong(x):
                              +    a = __pypy__.intop.int_add(x, 10)
                              +    if a < 15:
                              +        return 0
                              +    return 2
                              +
                              +

                              Basically the faulty reasoning of the JIT looks like this: if int_add(x, 10) < 15 +then it must follow that x < 5, which is stronger than x < 6, so the +second if is always true. This sounds good, but is actually wrong +if the addition + 10 wrapped around. So if x == MAXINT, then +int_add(x, 10) == MININT + 9 < 15. But MAXINT < 5 is not +correct.

                              +

                              Note how the same reasoning with overflow-checking addition is correct! If x + +10 < 15 and the + didn't overflow, then indeed x < 6. And if your +mind bends starting to think about all this, you understand some of the +difficulty of getting the JIT correct in this area.

                              +

                              How could we have avoided this bug?

                              +

                              One exercise I try to do after finding bugs is to reflect on ways that the +bug could have been avoided. I think this is particularly important in the JIT, +where bugs are potentially really annoying to find and can cause very strange +behaviour in basically arbitrary Python code.

                              +

                              It's easy to always answer this question with "try to think more carefully +when working", but that approach cannot be relied on in complicated situations, +because humans don't concentrate perfectly for long stretches of time.

                              +

                              A situation-specific problem I identified was the bad design of the range analysis API. +A range is not just represented by two numbers, instead it's two numbers +and two bools that are supposed to represent that some operation did or did not +underflow/overflow. The meaning of these bools was quite hard to grasp and easy +to get wrong, so probably they should never have been introduced in the first +place (and my bugfix indeed removed them).

                              +

                              But in the rest of this blog post I want to talk about another, systematic +approach that can be applied to the problem of mis-optimizations of integer +operations, and that is done by applying an SMT solver to the problem.

                              +

                              An SMT solver (Satisfiability Modulo Theories) is a tool that can be used to +find out whether mathematical formulas are "satisfiable", i.e. whether +some chosen set of inputs exists that will make the formulas evaluate to true. SMT solvers are +commonly used in a wide range of CS applications including program correctness +proofs, program synthesis, etc. The most widely known one is probably Z3 by +Microsoft Research which has the nice advantage of coming with an easy-to-use +Python binding.

                              +

                              Going into this I basically knew next to nothing about SMT solvers (despite +having been embedded in a formal methods research group for years!) so it was an +interesting new world to learn about.

                              +

                              As briefly mentioned in the introduction, the approach I took followed a similar +(but much more properly executed) one applied to LLVM operations, called +Alive2. Krister Walfridsson has done similar work for GCC recently, +described on his blog.

                              +

                              Z3 Proof of Concept

                              +

                              The first thing I did was to try to get Z3 to find the above bug, by encoding the +input program into an SMT formula by hand and trying to get Z3 to prove the condition +that the JIT thinks is always true. The Z3 code for this looks as follows:

                              +
                              from z3 import BitVec, Implies, prove
                              +x = BitVec('x', 64)
                              +a = x + 10
                              +cond1 = a < 15
                              +cond2 = x < 6
                              +prove(Implies(cond1, cond2))
                              +
                              +

                              Here, x is defined to be a bit vector variable of width 64, which is a +datatype that can be used to represent bounded machine integers. Addition on +bit vectors performs wraparound arithmetic, like the __pypy__.intop.int_add +call in the original code. The JIT optimized the second condition away, so +essentially it was convinced that the first condition implies the second one. +The above snippet tries to get Z3 to confirm this.

                              +

                              When run, the above program prints:

                              +
                              counterexample
                              +[x = 9223372036854775803]
                              +

                              Which shows the bug. As a small side-note, I thought it was cool that the +process of "proving" something in Z3 basically means trying to find an example +for the negation of the formula. If no counterexample can be found for the +negation, the original formula is true. If the original formula turns out to be +false (like here) we get a nice example that shows the problem to go with it.

                              +

                              It's not realistic to hand-translate all the hundreds of +unit-tests into Z3 formulas and then ask Z3 to prove the optimizations. Instead, +we want to have a program that does this for us.

                              +

                              SMT Checking of the JIT Optimizer

                              +

                              What we want from this program is the following: given an unoptimized trace and +its optimized version, we want to use Z3 to check whether the optimized trace +behaves identically to the unoptimized one. One question is what "behaves +identically" means. What we care about is the outputs of the trace being the +same values, no matter how they are computed. Also, for every guard we want to +make sure that it fails in identical ways in the optimized and unoptimized +versions. A guard is only allowed to be optimized away if it can never fail. +The code that comes after a guard can assume that the guard has not failed, +because otherwise execution would have left the trace. All of this should be +true regardless of the values of the input variables of the trace.

                              +

                              So in order to check that the two traces are behaving identically, we do the +following:

                              +
                                +
                              • We create Z3 variables for every input variable. We use the same input +variables both for the unoptimized as well as the optimized trace.

                              • +
                              • We align the two traces at the corresponding guards. Thankfully the optimizer +keeps track of which optimized guard corresponds to which unoptimized input +guard.

                              • +
                              • All the operations before a guard are translated into Z3 formulas, for both +versions of the trace.

                              • +
                              • For two corresponding guards, we ask Z3 to prove that the guard conditions are +identical.

                              • +
                              • For a guard that was optimized away we ask Z3 to prove that the condition is +always true.

                              • +
                              • After a guard, we tell Z3 that from now on it can assume that the guard +condition is true.

                              • +
                              • We repeat this, guard for guard, until we reach the end of the trace. There, +we ask Z3 to prove that the output variables in the unoptimized trace and the +optimized trace are identical (every trace can return one or many values).

                              • +
                              +

                              I implemented this, it's not a lot of code, basically a couple of hundred lines +of (somewhat hacky) Python code. So far I only support integer +operations. Here are some parts of the code to give you a flavor of what this +looks like.

                              +

                              This is the code that translates operations into Z3 formulas:

                              +
                              def add_to_solver(self, ops, state):
                              +    for op in ops:
                              +        if op.type != 'v': # is it an operation with a result
                              +            res = self.newvar(op)
                              +        else: # or does it return void
                              +            res = None
                              +
                              +       # ...
                              +
                              +        # convert arguments
                              +        if op.numargs() == 1:
                              +            arg0 = self.convertarg(op, 0)
                              +        elif op.numargs() == 2:
                              +            arg0 = self.convertarg(op, 0)
                              +            arg1 = self.convertarg(op, 1)
                              +
                              +        # compute results
                              +        if opname == "int_add":
                              +            expr = arg0 + arg1
                              +        elif opname == "int_sub":
                              +            expr = arg0 - arg1
                              +        elif opname == "int_mul":
                              +            expr = arg0 * arg1
                              +        elif opname == "int_and":
                              +            expr = arg0 & arg1
                              +        elif opname == "int_or":
                              +            expr = arg0 | arg1
                              +        elif opname == "int_xor":
                              +            expr = arg0 ^ arg1
                              +
                              +        # ...  more operations, some shown below
                              +
                              +        self.solver.add(res == expr)
                              +
                              +

                              New Z3 variables are defined by the helper function newvar, which adds the +operation to a dictionary box_to_z3 mapping boxes (=variables) to Z3 +variables. Due to the SSA property that traces have, a variable must be defined +before its first use.

                              +

                              Here's what newvar looks like (LONG_BIT is a constant that is either +64 or 32, depending on the target architecture):

                              +
                              def newvar(self, box, repr=None):
                              +    # ... some logic around making the string representation
                              +    # somewhat nicer omitted
                              +    result = z3.BitVec(repr, LONG_BIT)
                              +    self.box_to_z3[box] = result
                              +    return result
                              +
                              +

                              The convert method turns an operation argument (either a constant or a +variable) into a Z3 formula (either a constant bit vector or an already defined +Z3 variable). convertarg is a helper function that takes an operation, reads +its nth argument and converts it.

                              +
                              def convert(self, box):
                              +    if isinstance(box, ConstInt):
                              +        return z3.BitVecVal(box.getint(), LONG_BIT)
                              +    return self.box_to_z3[box]
                              +
                              +def convertarg(self, box, arg):
                              +    return self.convert(box.getarg(arg))
                              +
                              +

                              The lookup of variables in box_to_z3 that convert does cannot fail, +because the variable must have been defined before use.

                              +

                              Comparisons return the bit vector 0 or bit vector 1, we use a helper function +cond to turn the Z3 truth value of the comparison into a bit vector:

                              +
                              def cond(self, z3expr):
                              +    return z3.If(z3expr, TRUEBV, FALSEBV)
                              +
                              +
                              +def add_to_solver(self, ops, state):
                              +        # ... start as above
                              +
                              +        # more cases
                              +        elif opname == "int_eq":
                              +            expr = self.cond(arg0 == arg1)
                              +        elif opname == "int_ne":
                              +            expr = self.cond(arg0 != arg1)
                              +        elif opname == "int_lt":
                              +            expr = self.cond(arg0 < arg1)
                              +        elif opname == "int_le":
                              +            expr = self.cond(arg0 <= arg1)
                              +        elif opname == "int_gt":
                              +            expr = self.cond(arg0 > arg1)
                              +        elif opname == "int_ge":
                              +            expr = self.cond(arg0 >= arg1)
                              +        elif opname == "int_is_true":
                              +            expr = self.cond(arg0 != FALSEBV)
                              +        elif opname == "uint_lt":
                              +            expr = self.cond(z3.ULT(arg0, arg1))
                              +        elif opname == "uint_le":
                              +            expr = self.cond(z3.ULE(arg0, arg1))
                              +        elif opname == "uint_gt":
                              +            expr = self.cond(z3.UGT(arg0, arg1))
                              +        elif opname == "uint_ge":
                              +            expr = self.cond(z3.UGE(arg0, arg1))
                              +        elif opname == "int_is_zero":
                              +            expr = self.cond(arg0 == FALSEBV)
                              +
                              +        # ... rest as above
                              +
                              +

                              So basically for every trace operation that operates on integers I had to give a +translation into Z3 formulas, which is mostly straightforward.

                              +

                              Guard operations get converted into a Z3 boolean by their own helper function, +which looks like this:

                              +
                              def guard_to_condition(self, guard, state):
                              +    opname = guard.getopname()
                              +    if opname == "guard_true":
                              +        return self.convertarg(guard, 0) == TRUEBV
                              +    elif opname == "guard_false":
                              +        return self.convertarg(guard, 0) == FALSEBV
                              +    elif opname == "guard_value":
                              +        return self.convertarg(guard, 0) == self.convertarg(guard, 1)
                              +
                              +    # ... some more exist, shown below
                              +
                              +

                              Some operations are a bit trickier. An important example in the context of +this blog post are integer operations that check for overflow. The overflow +operations return a result, but also a boolean indicating whether the operation overflowed +or not.

                              +
                              def add_to_solver(self, ops, state):
                              +
                              +        # ... more cases
                              +
                              +        elif opname == "int_add_ovf":
                              +            expr = arg0 + arg1
                              +            m = z3.SignExt(LONG_BIT, arg0) + z3.SignExt(LONG_BIT, arg1)
                              +            state.no_ovf = m == z3.SignExt(LONG_BIT, expr)
                              +        elif opname == "int_sub_ovf":
                              +            expr = arg0 - arg1
                              +            m = z3.SignExt(LONG_BIT, arg0) - z3.SignExt(LONG_BIT, arg1)
                              +            state.no_ovf = m == z3.SignExt(LONG_BIT, expr)
                              +        elif opname == "int_mul_ovf":
                              +            expr = arg0 * arg1
                              +            m = z3.SignExt(LONG_BIT, arg0) * z3.SignExt(LONG_BIT, arg1)
                              +            state.no_ovf = m == z3.SignExt(LONG_BIT, expr)
                              +
                              +        # ...
                              +
                              +

                              The boolean is computed by comparing the result of the bit vector operation with +the result of converting the input bit vectors into an abstract (arbitrary +precision) integer and the result back to bit vectors. Let's go through the +addition case step by step, the other cases work analogously.

                              +

                              The addition in the first elif that computes expr is an addition on bit +vectors, therefore it is performing wraparound arithmetic. +z3.SignExt(LONG_BIT, arg0) sign-extends arg0 from a bit vector of +LONG_BIT bits to a bit vector of 2 * LONG_BIT bits, which is wide enough to play the role of an abstract, arbitrary precision integer here. The addition in +the second line is therefore effectively an addition between abstract integers, so it will +never overflow and just compute the correct result as an integer.

                              +

                              The condition to check for overflow is now: if the results of the two different +ways to do the addition are the same, then overflow did not occur. So in order +to compute state.no_ovf in the addition case the +code converts the result of the bit vector wraparound addition to +an abstract integer (using SignExt again), and then compares that to the integer +result.

                              +

                              This boolean can then be checked by the guard operations guard_no_overflow +and guard_overflow.

                              +
                              def guard_to_condition(self, guard, state):
                              +
                              +    # ... more cases
                              +
                              +    elif opname == "guard_no_overflow":
                              +        assert state.no_ovf is not None
                              +        return state.no_ovf
                              +    elif opname == "guard_overflow":
                              +        assert state.no_ovf is not None
                              +        return z3.Not(state.no_ovf)
                              +
                              +    # ... more cases
                              +
                              +

                              Finding the Bug, Again

                              +

                              Let's actually make all of this more concrete by applying it to the trace of our +original bug. The input trace and the incorrectly optimized trace for that look +like this (differences highlighted):

                              +
                              # input                       # optimized
                              +[i0]                          [i0]
                              +i1 = int_add(i0, 10)          i1 = int_add(i0, 10)
                              +i2 = int_lt(i1, 15)           i2 = int_lt(i1, 15)
                              +guard_true(i2)                guard_true(i2)
                              +i3 = int_lt(i0, 6)            jump(0)
                              +guard_true(i3)
                              +jump(0)
                              +
                              +

                              Note that the trace represents just one of the paths through the control flow +graph of the original function, which is typical for tracing JITs (the other +paths could incrementally get added later).

                              +

                              The first guards in both these traces correspond to each other, so the first +chunks to check are the first three operations (lines 1-4). Those operations +don't get changed by the optimizer at all.

                              +

                              These two identical traces get translated to the following Z3 formulas:

                              +
                              i1unoptimized == input_i0 + 10
                              +i2unoptimized == If(i1unoptimized < 15, 1, 0)
                              +i1optimized == input_i0 + 10
                              +i2optimized == If(i1optimized < 15, 1, 0)
                              +
                              +

                              To check that the two corresponding guards are the same, the solver is asked to +prove that (i2unoptimized == 1) == (i2optimized == 1). This is +correct, because the formulas for i2unoptimized and i2optimized are +completely identical.

                              +

                              After checking that the guards behave the same, we add the knowledge to the +solver that the guards passed. So the Z3 formulas become:

                              +
                              i1unoptimized == input_i0 + 10
                              +i2unoptimized == If(i1unoptimized < 15, 1, 0)
                              +i1optimized == input_i0 + 10
                              +i2optimized == If(i1optimized < 15, 1, 0)
                              +i2unoptimized == 1
                              +i2optimized == 1
                              +
                              +

                              Now we continue with the remaining operations of the two traces (lines 6-8).

                              +

                              We start by adding the int_lt operation in the unoptimized trace to the Z3 +formulas:

                              +
                              ...
                              +i3unoptimized == If(input_i0 < 6, 1, 0)
                              +
                              +

                              Because the second guard was optimized away, we need to ask Z3 to prove that +i3unoptimized == 1 is always true, which fails and gives the following +counterexample:

                              +
                              input_i0 = 9223372036854775800
                              +i1unoptimized = 9223372036854775810
                              +i2unoptimized = 1
                              +i1optimized = 9223372036854775810
                              +i2optimized = 1
                              +i3unoptimized = 0
                              +
                              +

                              Thus demonstrating the bug. The fact that the Z3-based equivalence check also +managed to find the original motivating bug without manually translating it to +a formula is a good confirmation that the approach works.

                              +

                              Second bug

                              +

                              So with this code I applied the Z3-based equivalence check to all our optimizer +unit tests. In addition to the bug we've been discussing the whole post, it also +found another buggy test! I had found it too by hand by staring at all the tests +in the process of writing all the Z3 infrastructure, but it was still a good +confirmation that the process worked. This bug was in the range analysis for +int_neg, integer negation. It failed to account for the fact that -MININT == MININT +and therefore did a mis-optimization along the following lines:

                              +
                              import __pypy__
                              +
                              +def wrong(x):
                              +    a = __pypy__.intop.int_sub(0, x)
                              +    if a < 0:
                              +        if x > 0:
                              +            return 0
                              +        return 1
                              +    return 2
                              +
                              +

                              Which was wrongly optimized into:

                              +
                              import __pypy__
                              +
                              +def wrong(x):
                              +    a = __pypy__.intop.int_sub(0, x)
                              +    if a < 0:
                              +        return 0
                              +    return 2
                              +
                              +

                              This is wrong precisely for x == MININT.

                              +

                              Generating Random Traces

                              +

                              These two bugs were the only two that the Z3 checker found for existing unit +tests. To try to find some more bugs I combined PyPy's existing random trace +generator with the Z3 optimization checker. The random trace generator has so +far been mostly used to find bugs in the machine code backends, particularly +also in the register allocator. So far we haven't used it with our optimizer, +but my experiments show that we should have!

                              +

                              I'm going to describe a little bit how the random trace generator works. It's +actually not that complicated, but there's one neat trick to it.

                              +

                              The basic idea is straightforward, it starts out with an empty trace with a +random number of input variables. Then it adds some number of operations to the +trace, either regular operations or guards. Every operation takes already +existing variables as input.

                              +

                              The neat trick is that our random trace generator keeps a concrete random +example value for every one of the input variables, and an example result for +every operation. In this way, it is possible to generate guards that are +consistent with the example values to ensure that running the trace to its end +is possible with at least one set of values.

                              +

                              Here's an example random trace that is generated, together with the random +example inputs and the results of every operation at the end of every line:

                              +
                              [i0, i1, i2, i3, i4, i5] # example values: 9, 11, -8, -95, 46, 57
                              +i6 = int_add_ovf(i3, i0) # -86
                              +guard_no_overflow()
                              +i7 = int_sub(i2, -35/ci) # 27
                              +i8 = uint_ge(i3, i5) # 1
                              +guard_true(i8)
                              +i9 = int_lt(i7, i8) # 0
                              +i10 = int_mul_ovf(34/ci, i7) # 918
                              +guard_no_overflow()
                              +i11 = int_and(i10, 63/ci) # 22
                              +i12 = int_rshift(i3, i11) # -1
                              +i13 = int_is_zero(i7) # 0
                              +i14 = int_is_true(i13) # 0
                              +guard_false(i13)
                              +i15 = int_lt(i8, i4) # 1
                              +i16 = int_and(i6, i0) # 8
                              +i17 = uint_ge(i6, -6/ci) # 0
                              +finish()
                              +

                              Note how every guard generated is true for the example values.

                              +

                              I have been running this combination of random trace generation and Z3 checking +for many nights and it has found some bugs, which I'll describe in the next +section. It should probably be run for a lot longer, but it has already been a +useful exercise.

                              +

                              In this mode, I'm giving every Z3 call a time limit to make sure that the random +tests don't just take arbitrarily long. This means that asking Z3 to prove +something can have three outcomes, either it's proved, or Z3 finds a +counterexample, or Z3 times out.

                              +

                              Bugs Found

                              +

                              In addition to the two bugs I've already described, I'll briefly list the +additional bugs that were found by optimizing random traces and then trying to +prove the equivalence with Z3.

                              +

                              Most of the bugs were actually identified by optimizing random traces alone, not +by the Z3 component. They manifested as assert failures in the JIT compiler.

                              +
                                +
                              • The JIT concluded after 12 == int_mul(x, 12) that x == 1, which is +incorrect if overflow occurred (a counterexample is 0x8000000000000001).

                              • +
                              • An amusing bug, where from 0 == int_lshift(0x1000000000000000, x) with +0 <= x <= 15, the JIT concluded that 0x1000000000000000 == 0, +triggering an assert. This wrong conclusion was again caused by not taking the +possibility of overflow into account.

                              • +
                              • A corner case in an optimization for chained integer additions with a +constant, where in complex enough expressions, the wrong IR API was used +(which works correctly in simple cases). Again, this triggered an assert.

                              • +
                              +

                              This shows that we should have been fuzzing our JIT optimizer already (not a +surprising observation in hindsight, fuzz all the things!).

                              +

                              Thankfully, there was also one further bug that really failed in the Z3 +verifier. It's a bug in common subexpression elimination / arithmetic +simplification, which again does not take overflow correctly into account.

                              +

                              The buggy trace looks like this (unfortunately it's not easily possible to show +this bug in Python code).

                              +
                              [a, b]
                              +c = int_add(a, b)
                              +r = int_sub_ovf(c, b)
                              +guard_no_ovf()
                              +finish(r)
                              +
                              +

                              This was optimized to:

                              +
                              [a, b]
                              +finish(a)
                              +
                              +

                              Which is incorrect, because the guard can fail given the right inputs. +But the optimizer concluded that the subtraction is safe, because it's the +inverse of an earlier addition, not taking into account that this earlier +addition can have overflowed.

                              +

                              Note that a related optimization is actually correct. Given this code:

                              +
                              [a, b]
                              +c = int_add_ovf(a, b)
                              +guard_no_ovf()
                              +r = int_sub(c, b)
                              +finish(r)
                              +
                              +

                              It can be optimized to:

                              +
                              [a, b]
                              +c = int_add_ovf(a, b)
                              +guard_no_ovf()
                              +finish(a)
                              +
                              +

                              Future Work and Conclusion

                              +

                              In the current form the Z3 checker is only a start, even though it has already +been concretely useful. There are various directions into which we could extend +it. In addition to generating random tests completely from scratch, we could also +start from the existing manually written unit-tests and randomly mutate those.

                              +

                              I also want to extend the Z3 checker with support for more operations, heap +operations in particular (but it's not quite clear to me how to model garbage +collection).

                              +

                              I also want to try to switch the code away from the Z3 API and use the more +general smtlib interface directly, in order to be able to use other SMT +checkers than Z3, e.g. CVC4.

                              +

                              But all in all this was a fun and not too hard way to find a bunch of bugs in +our optimizer! And the infrastructure is now in place, which means that we run +some random test cases every time we execute our tests. This is going to be +particularly useful when we do further work on the integer reasoning of the JIT +(like Nico is doing, for example). As of time of writing of this post, all the +bugs mentioned have been fixed and the Z3 code has landed on the default branch +and runs as part of PyPy's CI infrastructure.

                              +

                              Acknowledgements

                              +

                              Thanks to Saam Barati, Max Bernstein, Joshua Schmidt and Martin +Berger, for great feedback on drafts of this post!

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/12/pypy-v7310-release.html b/posts/2022/12/pypy-v7310-release.html new file mode 100644 index 000000000..76bf01cc6 --- /dev/null +++ b/posts/2022/12/pypy-v7310-release.html @@ -0,0 +1,370 @@ + + + + + +PyPy v7.3.10 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.10 release

                              + + + +
                              +

                              PyPy v7.3.10: release of python 2.7, 3.8, and 3.9

                              +

                              The PyPy team is proud to release version 7.3.10 of PyPy. We have some nice +speedups and bugfixes we wish to share. The release includes three different +interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.15.

                              • +
                              • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.15. We have gained +confidence in the stability of this version, and are removing the "beta" +label.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. Highlights of the release, since the release of 7.3.9 in March 2022 +include:

                              +
                              +
                                +
                              • A release of Apple Silicon M1 arm64 versions. This work was sponsored by +an anonymous donor and is tested on our buildbots.

                              • +
                              • Many improvements to the basic interpreter to make it 15-20% faster

                              • +
                              • The conda-forge community has built over 1000 packages for PyPy3.8 and 3.9, +making it easier than ever to use PyPy.

                              • +
                              • Update the packaged OpenSSL to 1.1.1s, sqlite3 to 3.39.4, and apply +applicable security fixes from CPython 3.9.15 to PyPy2.7

                              • +
                              • Update the HPy backend in PyPy3.8 and PyPy3.9 to 0.0.4

                              • +
                              +
                              +

                              We recommend updating. You can find links to download the v7.3.10 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with making +RPython's JIT even better. Since the previous release, we have accepted +contributions from five new contributors, thanks for pitching in, and welcome +to the project!

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. +In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and +3.9. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              We provide binary builds for:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                              • +
                              • 64-bit ARM machines running Linux (aarch64).

                              • +
                              • Apple M1 arm64 machines (macos_arm64).

                              • +
                              • s390x running Linux

                              • +
                              +
                              +

                              PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.10 release, see the full changelog.

                              +

                              Please update, and continue to help us make pypy better.

                              +

                              Cheers, +The PyPy Team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2022/12/pypy-v7311-release.html b/posts/2022/12/pypy-v7311-release.html new file mode 100644 index 000000000..50ba6e47e --- /dev/null +++ b/posts/2022/12/pypy-v7311-release.html @@ -0,0 +1,360 @@ + + + + + +PyPy v7.3.11 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.11 release

                              + + + +
                              +

                              PyPy v7.3.11: release of python 2.7, 3.8, and 3.9

                              +

                              The PyPy team is proud to release version 7.3.11 of PyPy. As could be expected, +the first release of macOS arm64 impacted the macOS x86-64 build, so this is +a bug release to restore the ability of macOS users to run PyPy on +macOS < 11.0. It also incorporates the latest CPython stdlib updates +released the day after 7.3.10 went out, and a few more bug fixes. The release +includes three different interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.8, which is an interpreter supporting the syntax and the features of +Python 3.8, including the stdlib for CPython 3.8.16. Note we intend to drop +support for this version in an upcoming release as soon as we release +Python 3.10.

                              • +
                              • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.16.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases and follows quickly on the heels of the 7.3.10 release on Dec 6.

                              +

                              We recommend updating. You can find links to download the v7.3.11 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with making +RPython's JIT even better. Since the previous release, we have accepted +contributions from one new contributor, thanks for pitching in, and welcome +to the project!

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. +In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.8 and +3.9. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              We provide binary builds for:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                              • +
                              • 64-bit ARM machines running Linux (aarch64).

                              • +
                              • Apple M1 arm64 machines (macos_arm64).

                              • +
                              • s390x running Linux

                              • +
                              +
                              +

                              PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.11 release, see the full changelog.

                              +

                              Please update, and continue to help us make pypy better.

                              +

                              Cheers, +The PyPy Team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2023/01/string-concatenation-quadratic.html b/posts/2023/01/string-concatenation-quadratic.html new file mode 100644 index 000000000..891a5dfbe --- /dev/null +++ b/posts/2023/01/string-concatenation-quadratic.html @@ -0,0 +1,365 @@ + + + + + +Repeated string concatenation is quadratic in PyPy (and CPython) | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Repeated string concatenation is quadratic in PyPy (and CPython)

                              + + + +
                              +

                              This is a super brief blog post responding to an issue that we got on the PyPy +issue tracker. I am moving my response to the blog (with permission of the +submitter) to have a post to point to, since it's a problem that comes up with +some regularity. It's also documented on our page of differences between PyPy +and CPython but I thought an additional blog post might be good.

                              +

                              The issue pointed out that a small program that operates on strings is much +slower on PyPy compared to CPython. The program is a solution for 2016's +Advent of Code Day 16 and looks like this:

                              +
                              def dragon(a):
                              +    b = a[::-1].replace('0','r').replace('1','0').replace('r','1')
                              +    return a+'0'+b
                              +
                              +def diffstr(a):
                              +    b = ""
                              +    for i in range(0,len(a),2):
                              +        b += ['0','1'][a[i] == a[i+1]]
                              +    return b
                              +
                              +def iterdiff(a):
                              +    b = a
                              +    while(len(b) % 2 == 0):
                              +        b = diffstr(b)
                              +    return b
                              +
                              +size = 35651584
                              +initstate = '10010000000110000'
                              +while(len(initstate) < size):
                              +    initstate = dragon(initstate)
                              +initstate = initstate[:size]
                              +print(iterdiff(initstate))
                              +
                              +

                              The submitter pointed out that the program is fast on CPython (~8s on my +laptop) and slow (didn't finish) on PyPy.

                              +

                              The reason for the performance difference is that += on strings in a loop +has quadratic complexity in PyPy, which is what diffstr does. To see the +quadraticness, consider that to add a character at the end of the string, the +beginning of the string needs to be copied into a new chunk of memory. If the +loop runs n times, that means there are

                              +

                              1 + 2 + 3 + ... + n = n * (n + 1) // 2

                              +

                              character copies.

                              +

                              Repeated string concatenations are in principle also quadratic in CPython, but +CPython has an optimization that makes them sometimes not quadratic, which is +what makes this program not too slow in CPython.

                              +

                              In order to fix the problem on PyPy it's best to use a list for the string +parts, which has the right amortized O(1) complexity for .append calls, and +then use str.join after the loop:

                              +
                              def diffstr(a):
                              +    b = []
                              +    for i in range(0,len(a),2):
                              +        b.append(['0','1'][a[i] == a[i+1]])
                              +    return "".join(b)
                              +
                              +

                              With this change the program becomes a little bit faster on CPython for me, and +on PyPy it stops being quadratic and runs in ~3.5s.

                              +

                              In general, it's best not to rely on the presence of this optimization in +CPython either. Sometimes, a small innocent-looking change will break CPython's +optimization. E.g. this useless change makes CPython also take ages:

                              +
                              def diffstr(a):
                              +    b = ""
                              +    for i in range(0,len(a),2):
                              +        b += ['0','1'][a[i] == a[i+1]]
                              +        c = b
                              +    return b
                              +
                              +

                              The reason why this change breaks the optimization in CPython is that it only +triggers if the reference count of b is 1, in which case it uses realloc +on the string. The change is unrealistic of course, but you could imagine a +related change that keeps an extra reference to b for a sensible reason.

                              +

                              Another situation in which the optimization doesn't work is discussed in this +StackOverflow question with an answer by Tim Peters.

                              +

                              It's unlikely that PyPy will fix this. We had a prototype of how to do it, but it +seems very little "production" code uses += on strings in a loop, and the fix +makes the strings implementation quite a bit more complex.

                              +

                              So, in summary, don't use repeated concatenations in a loop!

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.html b/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.html new file mode 100644 index 000000000..19e5a44bb --- /dev/null +++ b/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.html @@ -0,0 +1,312 @@ + + + + + +RPython-based emulator speeds up RISC-V simulation over 15x | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              RPython-based emulator speeds up RISC-V simulation over 15x

                              + + + +
                              +

                              In cooperation with RISC-V International, who funded a part of this project, +we recently created a workflow to +use RPython to take a Sail RISC-V model and automatically create a RISC-V ISA +emulator from it, which we call Pydrofoil. The simulator sped up booting a +linux emulator from 35 minutes (using the standard Sail-generated emulator in +C) to 2 minutes, a speedup of 17.5x. More details about the process are in the +RISC-V blog post.

                              +

                              A few take-aways from the project:

                              +
                                +
                              • While PyPy has shown it can speed up generic python code about 4x, the +technology behind PyPy can really shine in other areas.

                              • +
                              • RPython is malleable and can be molded to many tasks, the RPython meta-JIT is +very flexible.

                              • +
                              • A JIT is well-suited for the problem of emulation, because it can +perform dynamic binary translation.

                              • +
                              +

                              PyPy can solve real world performance problems, even somewhat unusual ones. +Please get in touch and let us know how we can help you solve yours!

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2023/06/pypy-v7312-release.html b/posts/2023/06/pypy-v7312-release.html new file mode 100644 index 000000000..d310570a0 --- /dev/null +++ b/posts/2023/06/pypy-v7312-release.html @@ -0,0 +1,361 @@ + + + + + +PyPy v7.3.12 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.12 release

                              + + + +
                              +

                              PyPy v7.3.12: release of python 2.7, 3.9, and 3.10.

                              +

                              The PyPy team is proud to release version 7.3.12 of PyPy. +This release includes a new string-to-int algorithm (also appearing in CPython +3.12) that is faster than the older one; support for symlinks in Windows; and +our first Python3.10 version.

                              +

                              The release includes three different interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.17.

                              • +
                              • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.12. This is our first +release of 3.10, but based on past experience we are quite confident in +its compatibility with upstream. Of course, we recommend testing your code +with this new version before putting it into production. Note it does +require at least cython 0.29.35 or cython 3.0.0b3.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.11 release on Dec 29, 2022.

                              +

                              We recommend updating. You can find links to download the v7.3.12 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with making +RPython's JIT even better. Since the previous release, we have accepted +contributions from one new contributor, thanks for pitching in, and welcome +to the project!

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.9 and +3.10. It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              We provide binary builds for:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                              • +
                              • 64-bit ARM machines running Linux (aarch64).

                              • +
                              • Apple M1 arm64 machines (macos_arm64).

                              • +
                              • s390x running Linux

                              • +
                              +
                              +

                              PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.12 release, see the full changelog.

                              +

                              Please update, and continue to help us make pypy better.

                              +

                              Cheers, +The PyPy Team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2023/09/pypy-v7313-release.html b/posts/2023/09/pypy-v7313-release.html new file mode 100644 index 000000000..afd5ac120 --- /dev/null +++ b/posts/2023/09/pypy-v7313-release.html @@ -0,0 +1,357 @@ + + + + + +PyPy v7.3.13 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.13 release

                              + + + +
                              +

                              PyPy v7.3.13: release of python 2.7, 3.9, and 3.10

                              +

                              The PyPy team is proud to release version 7.3.13 of PyPy. +This is primarily a security/bug-fix release. CPython released security +patches, and this release also improves the ability to use type +specifications via PyType_FromSpec and friends. There are also some +small speed-ups.

                              +

                              The release includes three different interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.

                              • +
                              • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13. Note it requires at +least cython 0.29.35 or cython 3.0.0b3.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.12 release on June 16, 2023.

                              +

                              We recommend updating. You can find links to download the v7.3.13 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with making +RPython's JIT even better.

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython 2.7, 3.9 and 3.10. +It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              We provide binary builds for:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                              • +
                              • 64-bit ARM machines running Linux (aarch64).

                              • +
                              • Apple M1 arm64 machines (macos_arm64).

                              • +
                              • s390x running Linux

                              • +
                              +
                              +

                              PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.13 release, see the full changelog.

                              +

                              Please update, and continue to help us make pypy better.

                              +

                              Cheers, +The PyPy Team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2023/12/pypy-moved-to-git-github.html b/posts/2023/12/pypy-moved-to-git-github.html new file mode 100644 index 000000000..791bc5522 --- /dev/null +++ b/posts/2023/12/pypy-moved-to-git-github.html @@ -0,0 +1,468 @@ + + + + + +PyPy has moved to Git, GitHub | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy has moved to Git, GitHub

                              + + + +
                              +

                              PyPy has moved its canonical repo and issue tracker from +https://foss.heptapod.net/pypy/pypy to https://github.com/pypy/pypy. Obviously, +this means development will now be tracked in Git rather than Mercurial.

                              +

                              Motivation

                              +

                              We still feel Mercurial is a better version control system. The named branch +model and user interface are superior. But

                              +
                                +
                              • +

                                foss.heptapod.net is not well indexed in google/bing/duckduckgo + search, so people find it harder to search for issues in the project.

                                +
                              • +
                              • +

                                Since Heptapod has tightened its spam control, we get reports that + users create issues only to have them flagged as spam.

                                +
                              • +
                              • +

                                Open Source has become synonymous with GitHub, and we are too small to + change that.

                                +
                              • +
                              • +

                                Much of the current development comes as a reaction to fixing issues. + Tracking interlocking issues is easier if all the code is on the same + platform.

                                +
                              • +
                              • +

                                The FAQ + presents two arguments against the move. GitHub notes + solve much of point (1): the difficulty of discovering provenance of + commits, although not entirely. But the main problem is point (2), it turns + out that not moving to GitHub is an impediment to contribution and issue + reporting.

                                +
                              • +
                              • +

                                People who wish to continue to use Mercurial can use the same method below to + push to GitHub.

                                +
                              • +
                              • +

                                GitHub is more resource rich than foss.heptapod.net. We could add CI + jobs to replace some of our aging buildbot + infrastructure.

                                +
                              • +
                              +

                              Method

                              +

                              The migration required two parts: migrating the code and then migrating the +issues and merge requests.

                              +

                              Code migration 1: code and notes

                              +

                              I used a fork of git-remote-hg to +create a local Git repo with all the changesets. Then I wanted to add a Git +note to each commit with the branch it came from. So I prepared a file with two +columns: the Git commit hash, and the corresponding branch from Mercurial. +Mercurial can describe each commit in two ways: either the commit hash or by a +number index. I used hg log to convert an index i to the Mercurial hash, +and then git-hg-helper from git-remote-hg to convert the Mercurial hash to +a Git hash:

                              +
                              $(cd pypy-git; git-hg-helper git-rev $(cd ../pypy-hg; hg log -r $i -T"{node}\n"))
                              +
                              + +

                              Then I used hg log again to print the Mercurial branch for the index i:

                              +
                              $(cd pypy-hg; hg log -r $i -T'{branch}\n')
                              +
                              + +

                              Putting these two together, I could loop over all the commits by their +numerical index to prepare the file. Then I iterated over each line in the +file, and added the Git note. Since the git notes add command works on the +current HEAD, I needed to check out each commit in turn and then add the note:

                              +
                              git checkout -q <hash> && git notes --ref refs/notes/branch add -m branch:<branch>
                              +
                              + +

                              I could then use git push --all to push to GitHub.

                              +

                              Code migration 2: prepare the branches

                              +

                              PyPy has almost 500 open branches. The code migration created all the branch +HEADs, but git push --all did not push them. I needed to check them out and +push each one. So I created a file with all the branch names

                              +
                              cd pypy-hg; hg branches | cut -f1 -d" " > branches.txt
                              +
                              + +

                              and then push each one to the GitHub repo

                              +
                              while read branch; do git checkout branches/$branch && git push origin branches/$branch; done < branches.txt
                              +
                              + +

                              Note that the branches were named branches/XXX by the migration, not branch/XXX. This confuses the merge request migration, more about that later.

                              +

                              Issue and merge request migration

                              +

                              I used the solution from +node-gitlab-2-github which +worked almost perfectly. It is important to do the conversion on a private +repo otherwise every mention of a successfully mapped user name notifies +the user about the transfer. This can be quite annoying for a repo the size of +PyPy with 600 merge requests and over 4000 issues. Issues transferred without a +problem: the script properly retained the issue numbers. However the script +does not convert the Mercurial hashes to Git hashes, so the bare hashes in +comments show up without a link to the commit. Merge requests are more of a problem:

                              +
                                +
                              • The Mercurial named branch "disappears" once it is merged, so a merge request + to a merged branch does not find the target branch name in Git. The + conversion creates an issue instead with the label gitlab merge request.
                              • +
                              • For some reason, the branches created by git-remote-hg are called + branches/XXX and not branch/XXX as expected by GitLab. This messes up the + merge request/PR conversion. For some of the branches (open PRs and main + target branches) I manually created additional branches without the es. The + net result is that open merge requests became open PRs, merged merge requests + became issues, and closed-not-merged merge requests were not migrated.
                              • +
                              +

                              Layered conversions

                              +

                              PyPy already migrated once from Bitbucket to Heptapod. Many of the issues +reflect the multiple transitions: they have lines like "Created originally on +Bitbucket by XXX" from the first transition, and an additional line "In +Heptapod" from this transition.

                              +

                              Credits

                              +

                              We would like to express our gratitude to the Octobus +team who support Heptapod. The transition from Bitbucket was quite an effort, +and they have generously hosted our development since then. We wish them all +the best, and still believe that Mercurial should have "won".

                              +

                              Next steps

                              +

                              While the repo at GitHub is live, there are still a few more things we need to +do:

                              +
                                +
                              • Documentation needs an update for the new repo and the build automation from + readthedocs must be adjusted.
                              • +
                              • The wiki should be copied from Heptapod.
                              • +
                              • buildbot.pypy.org should also look at the new repo. I hope the code is up to + the task of interacting with a Git repo.
                              • +
                              • speed.pypy.org tracks changes; it too needs to reference the new location.
                              • +
                              • To keep tracking branches with Git notes on new commits, I activated a + github action by Julian to + add a Git branch note to each commit. Please see the README there for + directions on using Git notes.
                              • +
                              • Some of the merge requests were not migrated. If someone wants to, they could + migrate those once they figure out the branch naming problems.
                              • +
                              +

                              Additionally, now is the time for all of you to prove the move is worthwhile:

                              +
                                +
                              • Star the repo, let others know how to find it,
                              • +
                              • Help fix some of the open issues or file new ones,
                              • +
                              • Take advantage of the more familiar workflow to get involved in the project,
                              • +
                              • Suggest ways to improve the migration: are there things I missed or could + have done better?
                              • +
                              +

                              How will development change?

                              +

                              Heptapod did not allow personal forks, so we were generous with a commit bit to +the main repo. Additionally, we (well, me) have been using a +commit-directly-to-main workflow. We will now be adopting a more structured +workflow. Please fork the repo and submit a pull request for any changes. We +can now add some pre-merge CI to check that the PR at least passes the first +stage of translation. The live and active branches will be:

                              +
                                +
                              • +main: what was "default" in Mercurial, it is the Python2.7 interpreter and + the base of the RPython interpreter,
                              • +
                              • +py3.9: the Python3.9 interpreter, which also includes all RPython changes + from main. This is exactly like on Mercurial, and
                              • +
                              • +py3.10: the Python3.10 interpreter, which also includes all RPython changes + from main and all bugfixes from py3.9. This is exactly like on Mercurial.
                              • +
                              +

                              Working between the repos

                              +
                              Finding commits
                              +

                              If you want to figure out how a Mercurial commit relates to a Git commit, you +can use git-hg-helper. You run it in the Git repo. It takes the full long +hash from one repo and gives you the corresponding hash of the other repo:

                              +
                              $ git-hg-helper git-rev d64027c4c2b903403ceeef2c301f5132454491df
                              +4527e62ad94b0e940a5b0f9f20d29428672f93f7
                              +$ git-hg-helper hg-rev 4527e62ad94b0e940a5b0f9f20d29428672f93f7
                              +d64027c4c2b903403ceeef2c301f5132454491df
                              +
                              + +
                              Finding branches
                              +

                              Branches migrated from Mercurial will have a branches prefix, not branch. +While GitLab uses branch for its prefix, the git-remote-hg script uses +branches. New work should be in a PR targeting main, py3.9 or py3.10.

                              +

                              Thanks for helping to make PyPy better.

                              +

                              Matti

                              +

                              Update

                              +

                              In the meantime we found out that unfortunately something went wrong in the +migration of the issues. The old issue +3655 got lost in the +migration. This means that after number 3655 the numbers are different between +github and heptapod, with heptapod being one larger. E.g. issue 3700 on +heptapod is issue 3699 on +github. We are investigating +options.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2023/12/pypy-v7314-release.html b/posts/2023/12/pypy-v7314-release.html new file mode 100644 index 000000000..7ccfb7dc8 --- /dev/null +++ b/posts/2023/12/pypy-v7314-release.html @@ -0,0 +1,355 @@ + + + + + +PyPy v7.3.14 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.14 release

                              + + + +
                              +

                              PyPy v7.3.14: release of python 2.7, 3.9, and 3.10

                              +

                              The PyPy team is proud to release version 7.3.14 of PyPy.

                              +

                              Highlights of this release are compatibility with HPy-0.9, cffi 1.16, +additional C-API interfaces, and more python3.10 fixes.

                              +

                              The release includes three different interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.

                              • +
                              • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.13 release on Sept 29, 2023.

                              +

                              We recommend updating. You can find links to download the v7.3.14 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. Since the last release we have contributions from three new +contributors. PyPy has many layers and we need help with all of them: bug +fixes, PyPy and RPython documentation improvements, or general help +with making RPython's JIT even better.

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              We provide binary builds for:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                              • +
                              • 64-bit ARM machines running Linux (aarch64).

                              • +
                              • Apple M1 arm64 machines (macos_arm64).

                              • +
                              • s390x running Linux

                              • +
                              +
                              +

                              PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.14 release, see the full changelog.

                              +

                              Please update, and continue to help us make pypy better.

                              +

                              Cheers, +The PyPy Team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/01/pypy-v7315-release.html b/posts/2024/01/pypy-v7315-release.html new file mode 100644 index 000000000..4d2bc726a --- /dev/null +++ b/posts/2024/01/pypy-v7315-release.html @@ -0,0 +1,354 @@ + + + + + +PyPy v7.3.15 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.15 release

                              + + + +
                              +

                              PyPy v7.3.15: release of python 2.7, 3.9, and 3.10

                              +

                              The PyPy team is proud to release version 7.3.15 of PyPy.

                              +

                              This is primarily a bug-fix release, and includes work done to migrate PyPy to +Git and Github.

                              +

                              The release includes three different interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.18.

                              • +
                              • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.13.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after the 7.3.14 release on Dec 25, 2023.

                              +

                              We recommend updating. You can find links to download the v7.3.15 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear about +it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with +making RPython's JIT even better.

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              We provide binary builds for:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                              • +
                              • 64-bit ARM machines running Linux (aarch64).

                              • +
                              • Apple M1 arm64 machines (macos_arm64).

                              • +
                              • s390x running Linux

                              • +
                              +
                              +

                              PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.15 release, see the full changelog.

                              +

                              Please update, and continue to help us make pypy better.

                              +

                              Cheers, +The PyPy Team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/03/fixing-bug-incremental-gc.html b/posts/2024/03/fixing-bug-incremental-gc.html new file mode 100644 index 000000000..cc43e5ec2 --- /dev/null +++ b/posts/2024/03/fixing-bug-incremental-gc.html @@ -0,0 +1,756 @@ + + + + + +Fixing a Bug in PyPy's Incremental GC | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Fixing a Bug in PyPy's Incremental GC

                              + + + +
                              +

                              Introduction

                              +

                              Since last summer, I've been looking on and off into a weird and hard to +reproduce crash bug in PyPy. It was +manifesting only on CI, and it seemed to always happen in the AST rewriting +phase of pytest, the symptoms being that PyPy would crash +with a segfault. All my attempts to reproduce it locally failed, and my +attempts to try to understand the problem by dumping the involved ASTs led +nowhere.

                              +

                              A few weeks ago, we got two more +bug reports, the last one by +the authors of the nanobind binding +generator, with the same symptoms: crash in AST rewriting, only on CI. I +decided to make a more serious push to try to find the bug this time. +Ultimately the problem turned out to be several bugs in PyPy's garbage +collector (GC) that had been there since its inception in +2013. +Understanding the +situation turned out to be quite involved, additionally complicated by this +being the first time that I was working on this particular aspect of PyPy's GC. +Since the bug was so much work to find, I thought I'd write a blog post about +it.

                              +

                              The blog post consists of three parts: first a chronological description of +what I did to find the bug, then a technical explanation of what goes wrong, and finally some +reflections on the bug (and then a bonus bug I also found in the process).

                              +

                              Finding the Bug

                              +

                              I started from the failing nanobind CI +runs +that ended with a segfault of the PyPy interpreter. This was only an +intermittent problem, not every run was failing. When I tried to just run the +test suite locally, I couldn't get it to fail. Therefore at first I tried to +learn more about what was happening by looking on the CI runners.

                              +

                              Running on CI

                              +

                              I forked the nanobind repo and hacked the CI script in order to get it to use a +PyPy build with full debug information and more assertions turned on. In order +to increase the probability of seeing the crash I added an otherwise unused +matrix +variable to the CI script that just contained 32 parameters. This means every +build is done 32 times (sorry Github for wasting your CPUs 😕). With that +amount of repetition, I got at least one job of every build that was crashing.

                              +

                              Then I added the -Xfaulthandler option to the PyPy command which will use the +faulthandler module +to try to print a Python stacktrace if the VM segfaults, to confirm that PyPy was +indeed crashing in the AST +rewriting +phase +of pytest, which pytest uses for nicer +assertions. +I experimented with hacking our faulthandler implementation to also give me a +C-level callstack, but that didn't work as well as I hoped.

                              +

                              Then I tried to run gdb on CI to try to get it +to print a C callstack at the crash point. You can get gdb to execute commands +as if typed at the prompt with the -ex commandline option, I used something +like this:

                              +
                              gdb -ex "set confirm off" -ex "set pagination off" -ex \
                              +    "set debuginfod enabled off" -ex run -ex where -ex quit \
                              +    --args <command> <arguments>
                              +
                              + +

                              But unfortunately the crash never occurred when running in gdb.

                              +

                              Afterwards I tried the next best thing, which was configuring the CI runner to +dump a core file and upload it as a build +artifact, which worked. Looking +at the cores locally only sort of worked, because I am running a different +version of Ubuntu than the CI runners. So I used +tmate to be able to log into the +CI runner after a crash and interactively used gdb there. Unfortunately what I +learned from that was that the bug was some kind of memory corruption, +which is always incredibly unpleasant to debug. Basically the header word of a +Python object had been corrupted somehow at the point of the crash, which means +that its vtable wasn't +usable any more.

                              +

                              (Sidenote: PyPy doesn't really use a vtable +pointer, +instead it uses half a word in the header for the vtable, and the other half +for flags that the GC needs to keep track of the state of the object. +Corrupting all this is still bad.)

                              +

                              Reproducing Locally

                              +

                              At that point it was clear that I had to push to reproduce the problem on my +laptop, to allow me to work on the problem more directly and not to always have +to go via the CI runner. Memory corruption bugs often have a lot of randomness +(depending on which part of memory gets modified, things might crash or more +likely just happily keep running). Therefore I decided to try to brute-force +reproducing the crash by simply running the tests many many times. Since the +crash happened in the AST rewriting phase of pytest, and that happens only if +no pyc +files +of the bytecode-compiled rewritten ASTs exist, I made sure to delete them +before every test run.

                              +

                              To repeat the test runs I used +multitime, which is a simple program +that runs a command repeatedly. It's meant for lightweight benchmarking +purposes, but it also halts the execution of the command if that command exits +with an error (and it sleeps a small random time between runs, which might help +with randomizing the situation, maybe). Here's a demo:

                              +

                              (Max pointed out +autoclave to me when reviewing +this post, which is a more dedicated tool for this job.)

                              +

                              Thankfully, running the tests repeatedly eventually led to a crash, solving my +"only happens on CI" problem. I then tried various variants to exclude possible +sources of errors. The first source of errors to exclude in PyPy bugs is the +just-in-time compiler, so I reran the tests with --jit off to see whether I +could still get it to crash, and thankfully I eventually could (JIT bugs are +often very annoying).

                              +

                              The next source of bugs to exclude were C-extensions. Since those were the tests +of nanobind, a framework for creating C-extension modules, I was a bit worried +that the bug might be in our emulation of CPython's C-API. But running PyPy +with the -v option (which will print all the imports as they happen) +confirmed that at the point of crash no C-extension had been imported yet.

                              +

                              Using rr +

                              +

                              I still couldn't get the bug to happen in GDB, so the tool I tried next was +rr, the "reverse debugger". rr can record the execution of a program and +later replay it arbitrarily often. This gives you a time-traveling debugger +that allows you to execute the program backwards in addition to forwards. +Eventually I managed to get the crash to happen when running the tests with +rr record --chaos (--chaos randomizes some decisions that rr takes, to try to +increase the chance of reproducing bugs).

                              +

                              Using rr well is quite hard, and I'm not very good at it. The main approach I +use with rr to debug memory corruption is to replay the crash, then set a +watchpoint +for the corrupted memory location, then use the command reverse-continue to +find the place in the code that mutated the memory location. reverse-continue +is like continue, except that it will execute the program backwards from the +current point. Here's a little demo of this:

                              +

                              Doing this for my bug revealed that the object that was being corrupted was +erroneously collected by the garbage collector. For some reason the GC had +wrongly decided that the object was no longer reachable and therefore put the +object into a freelist by writing a pointer to the next entry in the freelist +into the first word of the object, overwriting the object's header. The next +time the object was used things crashed.

                              +

                              Side-quest: wrong GC assertions

                              +

                              At this point in the process, I got massively side-tracked. PyPy's GC has a +number of debug modes that you can optionally turn on. Those slow down the +program execution a lot, but they should in theory help to understand why the +GC goes wrong. When I turned them on, I was getting a failing assertion really +early in the test execution, complaining about an invariant violation in the GC +logic. At first this made me very happy. I thought that this would help me fix +the bug more quickly.

                              +

                              Extremely frustratingly, after two days of work I concluded that the assertion +logic itself was wrong. I have fixed that in the meantime too, the details +of that are in the bonus section at the end of the post.

                              +

                              Using GDB scripting to find the real bug

                              +

                              After that disaster I went back to the earlier rr recording without GC assertions +and tried to understand in more detail why the GC decided to free an object +that was still being referenced. To be able to do that I used the GDB Python +scripting +API to +write some helper commands to understand the state of the GC heap (rr is an +extension of GDB, so the GDB scripting API works in rr too).

                              +

                              The first (small) helper command I wrote with the GDB scripting API was a way +to pretty-print the currently active GC flags of a random PyPy object, starting +just from the pointer. The more complex command I wrote was an object tracer, +which follows pointers to GC objects starting from a root object to explore the +object graph. The object tracer isn't complete, it doesn't deal with all the +complexities of PyPy's GC. But it was good enough to help me with my problem, I +found out that the corrupted object was stored in an array.

                              +

                              As an example, here's a function that uses the GDB API to walk one of the +helper data structures of the GC, a stack of pointers:

                              +
                              def walk_addr_stack(obj):
                              +    """ walk an instance of the AddressStack class (which is a linked list of
                              +    arrays of 1019 pointers).
                              +
                              +    the first of the arrays is only partially filled with used_in_last_chunk
                              +    items, all the other chunks are full."""
                              +    if obj.type.code == gdb.TYPE_CODE_PTR:
                              +        obj = obj.dereference()
                              +    used_in_last_chunk = lookup(obj, "used_in_last_chunk")
                              +    chunk = lookup(obj, "inst_chunk").dereference()
                              +    while 1:
                              +        items = lookup(chunk, "items")
                              +        for i in range(used_in_last_chunk):
                              +            yield items[i]
                              +        chunk = lookup(chunk, "next")
                              +        if not chunk:
                              +            break
                              +        chunk = chunk.dereference()
                              +        used_in_last_chunk = 1019
                              +
                              + +

                              The full file of supporting code I wrote can be found in this +gist. This is +pretty rough throw-away code, however.

                              +

                              In the following recording I show a staged debugging session with some of the +extra commands I wrote with the Python API. The details aren't important, I +just wanted to give a bit of a flavor of what inspecting objects looks like:

                              +

                              The next step was to understand why the array content wasn't being correctly +traced by the GC, which I eventually managed with some conditional +breakpoints, +more watchpoints, and using reverse-continue. It turned out to be a bug that +occurs when the content of one array was memcopied into another array. The +technical details of why the array wasn't traced correctly are described in +detail in the next section.

                              +

                              Writing a unit test

                              +

                              To try to make sure I really understood the bug correctly I then wrote a GC +unit test that shows the problem. Like most of PyPy, our GC is written in +RPython, a (somewhat strange) subset/dialect of Python2, which can be compiled +to C code. However, since it is also valid Python2 code, it can be unit-tested +on top of a Python2 +implementation +(which is one of the reasons why we keep maintaining PyPy2).

                              +

                              In the GC unit tests you have a lot of control over what order things happen +in, e.g. how objects are allocated, when garbage collection phases happen, etc. +After some trying I managed to write a test that crashes with the same kind of +memory corruption that my original crash exhibited: an object that is still +reachable via an array is collected by the GC. To give you a flavor of what +this kind of test looks like, here's an (edited for clarity) version of the +test I eventually managed to write:

                              +
                              def test_incrementality_bug_arraycopy(self):
                              +    source = self.malloc(VAR, 8) # first array
                              +    # the stackroots list emulates the C stack
                              +    self.stackroots.append(source)
                              +    target = self.malloc(VAR, 8) # second array
                              +    self.stackroots.append(target)
                              +    node = self.malloc(S) # unrelated object, will be collected
                              +    node.x = 5
                              +    # store reference into source array, calling the write barrier
                              +    self.writearray(source, 0, node)
                              +    val = self.gc.collect_step()
                              +    source = self.stackroots[0] # reload arrays, they might have moved
                              +    target = self.stackroots[1]
                              +    # this GC step traces target
                              +    val = self.gc.collect_step()
                              +
                              +    # emulate what a memcopy of arrays does
                              +    res = self.gc.writebarrier_before_copy(source, target, 0, 0, 2)
                              +    assert res
                              +    target[0] = source[0] # copy two elements of the arrays
                              +    target[1] = source[1]
                              +    # now overwrite the reference to node in source
                              +    self.writearray(source, 0, lltype.nullptr(S))
                              +    # this GC step traces source
                              +    self.gc.collect_step()
                              +    # some more collection steps, crucially target isn't traced again
                              +    # but node is deleted
                              +    for i in range(3):
                              +        self.gc.collect_step()
                              +    # used to crash, node got collected
                              +    assert target[0].x == 5
                              +
                              + +

                              One of the good properties of testing our GC that way is that all the memory is +emulated. The crash in the last line of the test isn't a segfault at all, +instead you get a nice exception saying that you tried to access a freed chunk +of memory and you can then debug this with a python2 debugger.

                              +

                              Fixing the Bug

                              +

                              With the unit test in hand, fixing the bug was relatively straightforward (the +diff in its simplest form is anyway only a single line +change). +After this first version of my fix, I +talked to Armin +Rigo who +helped me find a different case that was still wrong, in the same area of the +code.

                              +

                              I also got help from the developers at PortaOne +who are using PyPy on their servers and had seen some mysterious PyPy +crashes +recently, that looked related to the GC. They did test deployments of my fixes +in their various stages to their servers to try to see whether stability +improved for them. Unfortunately in the end it turned out that their crashes +are an unrelated GC bug related to object pinning, which we haven't resolved +yet.

                              +

                              Writing a GC fuzzer/property based test

                              +

                              Finding bugs in the GC is always extremely disconcerting, particularly since +this one managed to hide for so long (more than ten years!). Therefore I wanted +to use these bugs as motivation to try to find more problems in PyPy's GC. Given +the ridiculous effectiveness of fuzzing, I used +hypothesis to write a +property-based test. Every test performs a sequence of randomly chosen steps +from the following list:

                              +
                                +
                              • allocate an object
                              • +
                              • read a random field from a random object
                              • +
                              • write a random reference into a random object
                              • +
                              • drop a random stack reference
                              • +
                              • perform one GC step
                              • +
                              • allocate an array
                              • +
                              • read a random index from a random array
                              • +
                              • write to an array
                              • +
                              • memcopy between two arrays
                              • +
                              +

                              This approach of doing a sequence of steps is pretty close to the stateful +testing approach of +hypothesis, but I just implemented it manually with the data +strategy.

                              +

                              Every one of those steps is always performed on both the tested GC, and on some +regular Python objects. The Python objects provide the "ground truth" of what +the heap should look like, so we can compare the state of the GC objects +with the state of the Python objects to find out whether the GC made a mistake.

                              +

                              In order to check whether the test is actually useful, I reverted my bug fixes +and made sure that the test re-finds both the spurious GC assertion error and the +problems with memcopying an array.

                              +

                              In addition, the test also found corner cases in my fix. There was a situation +that I hadn't accounted for, which the test eventually found. +I also plan on adding a bunch of other GC features as steps in the +test to stress them too (for example weakrefs, identity hashes, pinning, maybe +finalization).

                              +

                              At the point of publishing this post, the fixes got merged to the 2.7/3.9/3.10 +branches of PyPy, and will be part of the next release (v7.3.16).

                              +

                              The technical details of the bug

                              +

                              In order to understand the technical details of the bug, I need to give some +background explanations about PyPy's GC.

                              +

                              PyPy's incremental GC

                              +

                              PyPy uses an incremental generational mark-sweep GC. It's +generational +and therefore has minor collections (where only young objects get collected) +and major collections (collecting long-lived objects eventually, using a +mark-and-sweep +algorithm). Young objects are allocated in a nursery using a +bump-pointer allocator, which makes allocation quite efficient. They are moved +out of the nursery by minor collections. In order to find references from old +to young objects the GC uses a write barrier to detect writes into old objects.

                              +

                              The GC is also +incremental, +which means that its major collections aren't done all at once (which would +lead to long pauses). Instead, major collections are sliced up into small +steps, which are done directly after a minor collection (the GC isn't +concurrent though, which would mean that the GC does work in a separate +thread).

                              +

                              The incremental GC uses tri-color +marking +to reason about the reachable part of the heap during the marking phase, where +every old object can be:

                              +
                                +
                              • black: already marked, reachable, definitely survives the collection
                              • +
                              • grey: will survive, but still needs to be marked
                              • +
                              • white: potentially dead
                              • +
                              +

                              The color of every object is encoded by setting flags +in the object header.

                              +

                              The GC maintains the invariant that black objects must never point to white +objects. At the start of a major collection cycle the stack roots are turned +gray. During the mark phase of a major collection cycle, the GC will trace gray +objects, until +none are left. To trace a gray object, all the objects it references have to be +marked grey if they are white so far. After a grey object is traced, it can be +marked black (because all the referenced objects are now either black or gray). +Eventually, there are no gray objects left. At that point (because no white +object can be reached from a black one) all the white objects are known to be +unreachable and can therefore be freed.

                              +

                              The GC is incremental because every collection step will only trace a limited +number of gray objects, before giving control back to the program. This leads to +a problem: if an already traced (black) object is changed between two marking +steps of the GC, the program can mutate that object and write a new reference +into one of its fields. This could lead to an invariant violation, if the +referenced object is white. Therefore, the GC uses the write barrier (which it +needs anyway to find references from old to young objects) to mark all black +objects that are modified gray, and then trace them again at one of the +later collection steps.

                              +

                              The special write barrier of memcopy

                              +

                              Arrays use a different kind of write barrier than normal objects. Since they +can be arbitrarily large, tracing them can take a long time. Therefore it's +potentially wasteful to trace them fully at a minor collection. To fix this, +the array write barrier keeps more granular information about which parts of +the array have been modified since the last collection step. Then only the +modified parts of the array need to be traced, not the whole array.

                              +

                              In addition, there is another optimization for arrays, which is that memcopy is +treated specially by the GC. If memcopy is implemented by simply writing a loop +that copies the content of one array to the other, that will invoke the write +barrier every single loop iteration for the write of every array element, +costing a lot of overhead. Here's some pseudo-code:

                              +
                              def arraycopy(source, dest, source_start, dest_start, length):
                              +    for i in range(length):
                              +        value = source[source_start + i]
                              +        dest[dest_start + i] = value # <- write barrier inserted here
                              +
                              + +

                              Therefore the GC has a special memcopy-specific +write barrier that will perform the GC logic once before the memcopy loop, and +then use a regular (typically SIMD-optimized) memcopy implementation from +libc. Roughly like this:

                              +
                              def arraycopy(source, dest, source_start, dest_start, length):
                              +    gc_writebarrier_before_array_copy(source, dest, source_start, dest_start, length)
                              +    raw_memcopy(cast_to_voidp(source) + source_start,
                              +                cast_to_voidp(dest) + dest_start,
                              +                sizeof(itemtype(source)) * length)
                              +
                              + +

                              (this is really a rough sketch. The real +code +is much more complicated.)

                              +

                              The bug

                              +

                              The bugs turned out to be precisely in this memcopy write barrier. When we +implemented the current GC, we adapted our previous GC, which was a +generational mark-sweep GC but not incremental. We started with most of the +previous GC's code, including the write barriers. The regular write barriers +were adapted to the new incremental assumptions, in particular the need for the +write barrier to also turn black objects back to gray when they are modified +during a marking phase. This was simply not done at all for the memcopy write +barrier, at least in two of the code paths. Fixing this problem fixes the unit +tests and stops the crashes.

                              +

                              Reflections

                              +

                              The way the bug was introduced is really typical. A piece of code (the memcopy +write barrier) was written under a set of assumptions. Then those assumptions +changed later. Not all the code pieces that relied on these assumptions to be +correct were updated. It's pretty hard to prevent this in all situations.

                              +

                              I still think we could have done more to prevent the bug occurring. Writing a +property-based test for the GC would have been a good idea given the complexity +of the GC, and definitely something we did in other parts of our code at the +time (just using the random module mostly, we started using hypothesis +later).

                              +

                              It's a bit of a mystery to me why this bug managed to be undetected for so +long. Memcopy happens in a lot of pretty core operations of e.g. lists in +Python (list.extend, to name just one example). To speculate, I would suspect +that all the other preconditions for the bug occurring made it pretty rare:

                              +
                                +
                              • the content of an old list that is not yet marked needs to be copied into + another old list that is marked already
                              • +
                              • the source of the copy needs to also store an object that has no other + references
                              • +
                              • the source of the copy then needs to be overwritten with other data
                              • +
                              • then the next collection steps need to be happening at the right points
                              • +
                              • ...
                              • +
                              +

                              Given the complexity of the GC logic I also wonder whether some lightweight +formal methods would have been a good idea. Formalizing some of the core +invariants in B or +TLA+ and then model +checking them up to some number +of +objects would have found this problem pretty quickly. There are also correctness +proofs for GC algorithms in some research papers, but I don't have a good +overview of the literature to point to any that are particularly good or bad. +Going such a more formal route might have fixed this and probably a whole bunch +of other bugs, but of course it's a pretty expensive (and tedious) approach.

                              +

                              While it was super annoying to track this down, it was definitely good to learn +a bit more about how to use rr and the GDB scripting interface.

                              +

                              Bonus Section: The Wrong Assertion

                              +

                              Some more technical information about the wrong assertion is in this section.

                              +

                              Background: pre-built objects

                              +

                              PyPy's VM-building bootstrapping process can "freeze" a bunch of heap objects +into the final binary. This allows the VM to start up quickly, because those +frozen objects are loaded by the OS as part of the binary.

                              +

                              Those frozen pre-built objects are parts of the 'roots' of the garbage +collector and need to be traced. However, tracing all the pre-built objects at +every collection would be very expensive, because there are a lot of them +(about 150,000 in a PyPy 3.10 binary). Tracing them all is also not necessary, +because most of them are never modified. Unmodified pre-built objects can only reference +other pre-built objects, which can never be deallocated anyway. Therefore we +have an optimization that uses the write barrier (which we need anyway to find +old-to-young pointers) to notice when a pre-built object gets modified for the +very first time. If that happens, it gets added to the set of pre-built objects +that gets counted as a root, and is traced as a root at collections +from then on.

                              +

                              The wrong assertion

                              +

                              The assertion that triggered when I turned on the GC debug mode was saying that +the GC found a reference from a black to a white object, violating its +invariant. Unmodified pre-built objects count as black, and they aren't roots, +because they can only ever reference other pre-built objects. However, when a +pre-built object gets modified for the first time, it becomes part of the root +set and will be marked gray. This logic works fine.

                              +

                              The wrong assertion triggers if a pre-built object is mutated for the very +first time in the middle of an incremental marking phase. While the pre-built +object gets added to the root set just fine, and will get traced before the +marking phase ends, this is encoded slightly differently for pre-built objects, +compared to "regular" old objects. Therefore, the invariant checking code +wrongly reported a black->white pointer in this situation.

                              +

                              To fix it I also wrote a unit test checking the problem, made sure that the GC +hypothesis test also found the bug, and then fixed the wrong assertion to take +the color encoding of pre-built objects into account.

                              +

                              The bug managed to be invisible because we don't tend to turn on the GC +assertions very often. We only do that when we find a GC bug, which is of +course also when we need it the most to be correct.

                              +

                              Acknowledgements

                              +

                              Thanks to Matti Picus, Max Bernstein, Wouter van Heyst for giving me feedback on drafts of the +post. Thanks to Armin Rigo for reviewing the code and pointing out holes in my +thinking. Thanks to the original reporters of the various forms of the bug, +including Lily Foote, David Hewitt, Wenzel Jakob.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/04/pypy-v7316-release.html b/posts/2024/04/pypy-v7316-release.html new file mode 100644 index 000000000..4e3127c5f --- /dev/null +++ b/posts/2024/04/pypy-v7316-release.html @@ -0,0 +1,354 @@ + + + + + +PyPy v7.3.16 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.16 release

                              + + + +
                              +

                              PyPy v7.3.16: release of python 2.7, 3.9, and 3.10

                              +

                              The PyPy team is proud to release version 7.3.16 of PyPy.

                              +

                              This release includes security fixes from upstream CPython, and bugfixes to the +garbage collector, described in a gc bug-hunt blog post.

                              +

                              The release includes three different interpreters:

                              +
                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.19.

                              • +
                              • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.

                              • +
                              +
                              +

                              The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows the 7.3.15 release on Jan 15, 2024.

                              +

                              We recommend updating. You can find links to download the v7.3.16 releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with +making RPython's JIT even better.

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (PyPy and CPython 3.7.4 performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              We provide binary builds for:

                              +
                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                              • +
                              • 64-bit ARM machines running Linux (aarch64).

                              • +
                              • Apple M1 arm64 machines (macos_arm64).

                              • +
                              • s390x running Linux

                              • +
                              +
                              +

                              PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.16 release, see the full changelog.

                              +

                              Please update, and continue to help us make pypy better.

                              +

                              Cheers, +The PyPy Team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/05/vmprof-firefox-converter.html b/posts/2024/05/vmprof-firefox-converter.html new file mode 100644 index 000000000..95dedb8fa --- /dev/null +++ b/posts/2024/05/vmprof-firefox-converter.html @@ -0,0 +1,363 @@ + + + + + +Profiling PyPy using the Firefox profiler user interface | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Profiling PyPy using the Firefox profiler user interface

                              + + + +
                              +

                              Introduction

                              +

                              If you ever wanted to profile your Python code on PyPy, you probably came across VMProf — a statistical profiler for PyPy.

                              +

                              VMProf's console output can already give some insights into where your code spends time, +but it is far from showing all the information captured while profiling.

                              +

                              There have been some tools around to visualize VMProf's output. +Unfortunately, the vmprof.com user interface is no longer available, and vmprof-server is not as easy to use, so you may want to take a look at a local viewer or converter. +Those so far could give you some general visualizations of your profile, but do not show any PyPy related context like PyPy's log output (PyPyLog, which is output when using the PYPYLOG environment variable to log JIT actions).

                              +

                              To bring all of those features together in one tool, you may take a look at the vmprof-firefox-converter.

                              +

                              Created in the context of my bachelor's thesis, the vmprof-firefox-converter is a tool for analyzing VMProf profiles with the Firefox profiler user interface. +Instead of building a new user interface from scratch, this allows us to reuse the user interface work Mozilla put into the Firefox profiler. +The Firefox profiler offers a timeline where you can zoom into profiles and work with different visualizations like a flame graph or a stack chart. +To understand why there is time spent inside a function, you can revisit the source code and even dive into the intermediate representation of functions executed by PyPy's just-in-time compiler. +Additionally, there is a visualization for PyPy's log output, to keep track whether PyPy spent time inside the interpreter, JIT or GC throughout the profiling time.

                              +

                              Profiling word count

                              +

                              In this blog post, I want to show an example of how to use the vmprof-firefox-converter for a simple Python program. +Based on Ben Hoyt's blog Performance comparison: counting words in Python, Go, C++, C, AWK, Forth, and Rust we will profile two python versions of a word counter running on PyPy. One being a bit more optimized. For this, VMProf will be used, but instead of just going with the console output, we will use the Firefox profiler user interface.

                              +

                              At first, we are going to look at a simple way of counting words with collections.Counter. +This will read one line from the standard input at a time and count the words with counter.update().

                              +
                              counts = collections.Counter()
                              +for line in sys.stdin:
                              +    words = line.lower().split()
                              +    counts.update(words)
                              +
                              +for word, count in counts.most_common():
                              +    print(word, count)
                              +
                              + +

                              To start profiling, simply execute: +pypy -m vmprofconvert -run simple.py <kjvbible_x10.txt

                              +

                              This will run the above code with vmprof, automatically capture and convert the results and finally open the Firefox profiler.

                              +

                              The input file is the King James Version of the Bible concatenated ten times.

                              +

                              To get started, we take a look at the call stack.

                              +

                              +Here we see that most of the time is spent in native code (marked as blue) e.g., the counter.update() or split() C implementation.

                              +

                              Now let's proceed with the more optimized version. +This time we read 64 KiB of data from the standard input and count the words with counter.update().

                              +
                              counts = collections.Counter()
                              +remaining = ''
                              +while True:
                              +    chunk = remaining + sys.stdin.read(64*1024)
                              +    if not chunk:
                              +        break
                              +    last_lf = chunk.rfind('\n')  # process to last LF character
                              +    if last_lf == -1:
                              +        remaining = ''
                              +    else:
                              +        remaining = chunk[last_lf+1:]
                              +        chunk = chunk[:last_lf]
                              +    counts.update(chunk.lower().split())
                              +
                              +for word, count in counts.most_common():
                              +    print(word, count)
                              +
                              + +

                              As we did before, we are going to take a peek at the call stack.

                              +

                              +

                              Now there is more time spent in native code, caused by larger chunks of text passed to counter.update().

                              +

                              This becomes even more clear by comparing the stack charts.

                              +

                              +

                              Here, in the unoptimized case, we only read in one line at each loop iteration. +This results in small "spikes" in the stack chart.

                              +

                              But let's take an even closer look.

                              +

                              +

                              Zoomed in, we see the call stack alternating between _count_elements() and (unfortunately unsymbolized) native calls coming from reading and splitting the input text (e.g., decode()).

                              +

                              Let us now take a look at the optimized case.

                              +

                              +

                              And if we look closer at the same interval as before, we see some spikes, but slightly different.

                              +

                              +

                              Even though we do not want to compare the (amount of) milliseconds directly, we clearly see that the spikes are wider, i.e. the time spent in those function calls is longer. +You may already know where this comes from. +We read a 64 KiB chunk of data from stdin and pass that to counter.update(), so both these tasks do more work and take longer. +Bigger chunks mean there is less alternating between reading and counting, so there is more time spent doing work than "doing" loop iterations.

                              +

                              Getting started

                              +

                              You can get the converter from GitHub.

                              +

                              Both VMProf and the vmprof-firefox-converter were created for profiling PyPy, but you can also use them with CPython.

                              +

                              This project is still somewhat experimental, so if you want to try it out, please let us know whether it worked for you.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html b/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html new file mode 100644 index 000000000..997a0663a --- /dev/null +++ b/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html @@ -0,0 +1,900 @@ + + + + + +Finding Simple Rewrite Rules for the JIT with Z3 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Finding Simple Rewrite Rules for the JIT with Z3

                              + + + +
                              +

                              In June I was at the PLDI conference in +Copenhagen to present a paper +I co-authored with Max Bernstein. I also finally +met John Regehr, who I'd been talking to on social +media for ages but had never met. John has been working on compiler correctness +and better techniques for building compilers and optimizers for a very long +time. The blog post Finding JIT Optimizer Bugs using SMT Solvers and +Fuzzing +was heavily inspired by this work. We talked a lot about his and his group's +work on using Z3 for +superoptimization and for +finding missing optimizations. I have applied some of the things John told me +about to the traces of PyPy's JIT, and wanted to blog about that. However, my +draft felt quite hard to understand. Therefore I have now written this current +post, to at least try to provide a somewhat gentler on-ramp to the topic.

                              +

                              In this post we will use the Python-API to Z3 to find local peephole rewrite +rules for the operations in the intermediate representation of PyPy's tracing +JIT. The code for this is simple enough that we can go through all of it.

                              +

                              The PyPy JIT produces traces of machine level instructions, which are optimized +and then turned into machine code. The optimizer uses a number of approaches to +make the traces more efficient. For integer operations it applies a number of +arithmetic simplification rules, for example int_add(x, 0) -> x. When +implementing these rules in the JIT there are two problems: How do we know +that the rules are correct? And how do we know that we haven't forgotten any +rules? We'll try to answer both of these, but the first one in particular.

                              +

                              We'll be using Z3, a satisfiability modulo theories (SMT) solver which has good +bitvector support and most importantly an excellent Python API. We can use the +solver to reason about bitvectors, which are how we will model machine +integers.

                              +

                              To find rewrite rules, we will consider the binary operations (i.e. those +taking two arguments) in PyPy traces that take and produce integers. The +completely general form op(x, y) is not simplifiable on its own. But if +either x == y +or one of the arguments is a constant, we can potentially simplify the +operation into a simpler form. The results are either the variable x, or a +(potentially different) constant. We'll ignore constant-folding where both +arguments of the binary operation are constants. The possible results for a +simplifiable binary operation are the variable x or another constant. This +leaves the following patterns as possibilities:

                              +
                                +
                              • op(x, x) == x
                              • +
                              • op(x, x) == c1
                              • +
                              • op(x, c1) == x
                              • +
                              • op(c1, x) == x
                              • +
                              • op(x, c1) == c2
                              • +
                              • op(c1, x) == c2
                              • +
                              +

                              Our approach will be to take every single supported binary integer operation, +instantiate all of these patterns, and try to ask Z3 whether the resulting +simplification is valid for all values of x.

                              +

                              Quick intro to the Z3 Python-API

                              +

                              Here's a terminal session showing the use of the Z3 Python API:

                              +
                              >>>> import z3
                              +>>>> # construct a Z3 bitvector variable of width 8, with name x:
                              +>>>> x = z3.BitVec('x', 8)
                              +>>>> # construct a more complicated formula by using operator overloading:
                              +>>>> x + x
                              +x + x
                              +>>>> x + 1
                              +x + 1
                              +
                              + +

                              Z3 checks the "satisfiability" of a formula. This means that it tries to find +an example set of concrete values for the variables that occur in a formula, +such that the formula becomes true. Examples:

                              +
                              >>>> solver = z3.Solver()
                              +>>>> solver.check(x * x == 3)
                              +unsat
                              +>>>> # meaning no x fulfils this property
                              +>>>>
                              +>>>> solver.check(x * x == 9)
                              +sat
                              +>>>> model = solver.model()
                              +>>>> model
                              +[x = 253]
                              +>>>> model[x].as_signed_long()
                              +-3
                              +>>>> # 253 is the same as -3 in two's complement arithmetic with 8 bits
                              +
                              + +

                              In order to use Z3 to prove something, we can ask Z3 to find counterexamples +for the statement, meaning concrete values that would make the negation of the +statement true:

                              +
                              >>>> solver.check(z3.Not(x ^ -1 == ~x))
                              +unsat
                              +
                              + +

                              The result unsat means that we just proved that x ^ -1 == ~x is true for +all x, because there is no value for x that makes not (x ^ -1 == ~x) +true (this works because -1 has all the bits set).

                              +

                              If we try to prove something incorrect in this way, the following happens:

                              +
                              >>>> solver.check(z3.Not(x ^ -1 == x))
                              +sat
                              +
                              + +

                              sat shows that x ^ -1 == x is (unsurprisingly) not always true, and we can +ask for a counterexample:

                              +
                              >>>> solver.model()
                              +[x = 0]
                              +
                              + +

                              This way of proving things works because the check calls try to solve an +(implicit) "exists" quantifier, over all the Z3 variables used in the formula. +check will either return z3.unsat, which means that no concrete values make +the formula true; or z3.sat, which means that you can get some concrete +values that make the formula true by calling solver.model().

                              +

                              In math terms we prove things using check by de-Morgan's rules for quantifiers:

                              +

                              $$ \lnot \exists x: \lnot f(x) \implies \forall x: f(x) $$

                              +

                              Now that we've seen the basics of using the Z3 API on a few small examples, +we'll use it in a bigger program.

                              +

                              Encoding the integer operations of RPython's JIT into Z3 formulas

                              +

                              Now we'll use the API to reason about the integer operations of the PyPy JIT +intermediate representation (IR). The binary integer operations are:

                              +
                              opnames2 = [
                              +# names of the binary integer operations that the rewrite search iterates over
                              +# arithmetic
                              +"int_add",
                              +"int_sub",
                              +"int_mul",
                              +# bitwise
                              +"int_and",
                              +"int_or",
                              +"int_xor",
                              +# equality
                              +"int_eq",
                              +"int_ne",
                              +# signed order comparisons
                              +"int_lt",
                              +"int_le",
                              +"int_gt",
                              +"int_ge",
                              +# unsigned order comparisons
                              +"uint_lt",
                              +"uint_le",
                              +"uint_gt",
                              +"uint_ge",
                              +# shifts
                              +"int_lshift",
                              +"int_rshift",
                              +"uint_rshift",
                              +# high word of the double-width unsigned product
                              +"uint_mul_high",
                              +# division and modulo, only valid for a non-zero second argument
                              +"int_pydiv",
                              +"int_pymod",
                              +]
                              +
                              + +

                              There's not much special about the integer operations. Like in LLVM, most of +them are signedness-independent: int_add, int_sub, int_mul, ... work +correctly for unsigned integers but also for +two's-complement signed +integers. Exceptions to that are order comparisons like int_lt etc. for +which we have unsigned variants uint_lt etc. All operations that produce a +boolean result return a full-width integer 0 or 1 (the PyPy JIT supports +only word-sized integers in its intermediate representation).

                              +

                              In order to reason about the IR operations, some ground work:

                              +
                              import z3
                              +
                              +# width in bits of all bitvectors used to model integers
                              +INTEGER_WIDTH = 64
                              +# a single solver instance, used by every query below
                              +solver = z3.Solver()
                              +solver.set("timeout", 10000) # milliseconds, ie 10s
                              +# the variable x that the rewrite patterns are checked for
                              +xvar = z3.BitVec('x', INTEGER_WIDTH)
                              +# variables standing for the constants that Z3 is asked to find
                              +constvar = z3.BitVec('const', INTEGER_WIDTH)
                              +constvar2 = z3.BitVec('const2', INTEGER_WIDTH)
                              +# full-width bitvector values 1 and 0, the results of boolean operations
                              +TRUEBV = z3.BitVecVal(1, INTEGER_WIDTH)
                              +FALSEBV = z3.BitVecVal(0, INTEGER_WIDTH)
                              +
                              + +

                              And here's a function to turn an integer IR operation of PyPy's JIT into Z3 +formulas:

                              +
                              def z3_expression(opname, arg0, arg1=None):
                              +    """ computes a tuple of (result, valid_if) of Z3 formulas. `result` is the
                              +    formula representing the result of the operation, given argument formulas
                              +    arg0 and arg1. `valid_if` is a pre-condition that must be true for the
                              +    result to be meaningful. """
                              +    result = None
                              +    valid_if = True # the precondition is mostly True, with few exceptions
                              +    if opname == "int_add":
                              +        result = arg0 + arg1
                              +    elif opname == "int_sub":
                              +        result = arg0 - arg1
                              +    elif opname == "int_mul":
                              +        result = arg0 * arg1
                              +    elif opname == "int_and":
                              +        result = arg0 & arg1
                              +    elif opname == "int_or":
                              +        result = arg0 | arg1
                              +    elif opname == "int_xor":
                              +        result = arg0 ^ arg1
                              +    elif opname == "int_eq":
                              +        result = cond(arg0 == arg1)
                              +    elif opname == "int_ne":
                              +        result = cond(arg0 != arg1)
                              +    elif opname == "int_lt":
                              +        result = cond(arg0 < arg1)
                              +    elif opname == "int_le":
                              +        result = cond(arg0 <= arg1)
                              +    elif opname == "int_gt":
                              +        result = cond(arg0 > arg1)
                              +    elif opname == "int_ge":
                              +        result = cond(arg0 >= arg1)
                              +    elif opname == "uint_lt":
                              +        result = cond(z3.ULT(arg0, arg1))
                              +    elif opname == "uint_le":
                              +        result = cond(z3.ULE(arg0, arg1))
                              +    elif opname == "uint_gt":
                              +        result = cond(z3.UGT(arg0, arg1))
                              +    elif opname == "uint_ge":
                              +        result = cond(z3.UGE(arg0, arg1))
                              +    elif opname == "int_lshift":
                              +        result = arg0 << arg1
                              +        valid_if = z3.And(arg1 >= 0, arg1 < INTEGER_WIDTH)
                              +    elif opname == "int_rshift":
                              +        result = arg0 << arg1
                              +        valid_if = z3.And(arg1 >= 0, arg1 < INTEGER_WIDTH)
                              +    elif opname == "uint_rshift":
                              +        result = z3.LShR(arg0, arg1)
                              +        valid_if = z3.And(arg1 >= 0, arg1 < INTEGER_WIDTH)
                              +    elif opname == "uint_mul_high":
                              +        # zero-extend args to 2*INTEGER_WIDTH bit, then multiply and extract
                              +        # highest INTEGER_WIDTH bits
                              +        zarg0 = z3.ZeroExt(INTEGER_WIDTH, arg0)
                              +        zarg1 = z3.ZeroExt(INTEGER_WIDTH, arg1)
                              +        result = z3.Extract(INTEGER_WIDTH * 2 - 1, INTEGER_WIDTH, zarg0 * zarg1)
                              +    elif opname == "int_pydiv":
                              +        valid_if = arg1 != 0
                              +        r = arg0 / arg1
                              +        psubx = r * arg1 - arg0
                              +        result = r + (z3.If(arg1 < 0, psubx, -psubx) >> (INTEGER_WIDTH - 1))
                              +    elif opname == "int_pymod":
                              +        valid_if = arg1 != 0
                              +        r = arg0 % arg1
                              +        result = r + (arg1 & z3.If(arg1 < 0, -r, r) >> (INTEGER_WIDTH - 1))
                              +    elif opname == "int_is_true":
                              +        result = cond(arg0 != FALSEBV)
                              +    elif opname == "int_is_zero":
                              +        result = cond(arg0 == FALSEBV)
                              +    elif opname == "int_neg":
                              +        result = -arg0
                              +    elif opname == "int_invert":
                              +        result = ~arg0
                              +    else:
                              +        assert 0, "unknown operation " + opname
                              +    return result, valid_if
                              +
                              +def cond(z3expr):
                              +    """ helper function to turn a Z3 boolean result z3expr into a 1 or 0
                              +    bitvector, using z3.If """
                              +    return z3.If(z3expr, TRUEBV, FALSEBV)
                              +
                              + +

                              We map the semantics of a PyPy JIT operation to Z3 with the z3_expression +function. It takes the name of a JIT operation and its two (or one) arguments +into a pair of Z3 formulas, result and valid_if. The resulting formulas are +constructed with the operator overloading of Z3 variables/formulas.

                              +

                              The first element result of the result of z3_expression represents the result +of performing the operation. valid_if is a bool that represents a condition that +needs to be True in order for the result of the operation to be defined. E.g. +int_pydiv(a, b) is only valid if b != 0. Most operations are always valid, +so they return True as that condition (we'll ignore valid_if for a bit, but it +will become more relevant further down in the post).

                              +

                              We can define a helper function to prove things by finding counterexamples:

                              +
                              def prove(cond):
                              +    """ Try to prove a condition cond by searching for counterexamples of its negation. """
                              +    z3res = solver.check(z3.Not(cond))
                              +    if z3res == z3.unsat:
                              +        return True
                              +    elif z3res == z3.unknown: # eg on timeout
                              +        return False
                              +    elif z3res == z3.sat:
                              +        return False
                              +    assert 0, "should be unreachable"
                              +
                              + +

                              Finding rewrite rules

                              +

                              Now we can start finding our first rewrite rules, following the first pattern +op(x, x) -> x. We do this by iterating over all the supported binary +operation names, getting the z3 expression for op(x, x) and then asking Z3 to +prove op(x, x) == x.

                              +
                              for opname in opnames2:
                              +    result, valid_if = z3_expression(opname, xvar, xvar)
                              +    if prove(result == xvar):
                              +        print(f"{opname}(x, x) -> x, {result}")
                              +
                              + +

                              This yields the simplifications:

                              +
                              int_and(x, x) -> x
                              +int_or(x, x) -> x
                              +
                              + +

                              Synthesizing constants

                              +

                              Supporting the next patterns is harder: op(x, x) == c1, op(x, c1) == x, and +op(c1, x) == x. We don't know which constants to pick to try to get Z3 to +prove the equality. We could iterate over common constants like 0, 1, +MAXINT, etc, or even over all the 256 values for a bitvector of length 8. +However, we will instead ask Z3 to find the constants for us too.

                              +

                              This can be done by using quantifiers, in this case z3.ForAll. The query we +pose to Z3 is "does there exist a constant c1 such that for all x the +following is true: op(x, c1) == x?" Note that the constant c1 is not +necessarily unique, there could be many of them. We generate several matching +constants: for the second and further queries, we add the condition that the +new constant must be different from all the ones found so far.

                              +

                              We can express this in a helper function:

                              +
                              def find_constant(z3expr, number_of_results=5):
                              +    """ generator that yields values for constvar for which the formula
                              +    z3expr is true for all values of xvar. The values are yielded as signed
                              +    Python integers, and at most number_of_results different ones are
                              +    produced. """
                              +    # xvar is universally quantified, so only constvar is left for the
                              +    # solver to pick a value for
                              +    condition = z3.ForAll(
                              +        [xvar],
                              +        z3expr
                              +    )
                              +    for i in range(number_of_results):
                              +        checkres = solver.check(condition)
                              +        if checkres == z3.sat:
                              +            # if a solver check succeeds, we can ask for a model, which is
                              +            # concrete values for the variables constvar
                              +            model = solver.model()
                              +            const = model[constvar].as_signed_long()
                              +            yield const
                              +            # make sure we don't generate the same constant again on the
                              +            # next call
                              +            condition = z3.And(constvar != const, condition)
                              +        else:
                              +            # any result other than sat: no (more) constants found
                              +            break
                              +
                              + +

                              We can use this new function for the three mentioned patterns:

                              +
                              # try to find constants for op(x, x) == c
                              +for opname in opnames2:
                              +    result, valid_if = z3_expression(opname, xvar, xvar)
                              +    for const in find_constant(result == constvar):
                              +        print(f"{opname}(x, x) -> {const}")
                              +# try to find constants for op(x, c) == x and op(c, x) == x
                              +for opname in opnames2:
                              +    result, valid_if = z3_expression(opname, xvar, constvar)
                              +    for const in find_constant(result == xvar):
                              +        print(f"{opname}(x, {const}) -> x")
                              +    result, valid_if = z3_expression(opname, constvar, xvar)
                              +    for const in find_constant(result == xvar):
                              +        print(f"{opname}({const}, x) -> x")
                              +# this code is not quite correct: the precondition valid_if is computed but
                              +# never used in the queries. we'll correct it later
                              +
                              + +

                              Together this yields the following new simplifications:

                              +
                              # careful, these are not all correct!
                              +int_sub(x, x) -> 0
                              +int_xor(x, x) -> 0
                              +int_eq(x, x) -> 1
                              +int_ne(x, x) -> 0
                              +int_lt(x, x) -> 0
                              +int_le(x, x) -> 1
                              +int_gt(x, x) -> 0
                              +int_ge(x, x) -> 1
                              +uint_lt(x, x) -> 0
                              +uint_le(x, x) -> 1
                              +uint_gt(x, x) -> 0
                              +uint_ge(x, x) -> 1
                              +uint_rshift(x, x) -> 0
                              +int_pymod(x, x) -> 0
                              +int_add(x, 0) -> x
                              +int_add(0, x) -> x
                              +int_sub(x, 0) -> x
                              +int_mul(x, 1) -> x
                              +int_mul(1, x) -> x
                              +int_and(x, -1) -> x
                              +int_and(-1, x) -> x
                              +int_or(x, 0) -> x
                              +int_or(0, x) -> x
                              +int_xor(x, 0) -> x
                              +int_xor(0, x) -> x
                              +int_lshift(x, 0) -> x
                              +int_rshift(x, 0) -> x
                              +uint_rshift(x, 0) -> x
                              +int_pydiv(x, 1) -> x
                              +int_pymod(x, 0) -> x
                              +
                              + +

                              Most of these look good at first glance, but the last one reveals a problem: +we've been ignoring the valid_if expression up to now. We can stop doing that by +changing the code like this, which adds z3.And(valid_if, ...) to the argument of +the calls to find_constant:

                              +
                              # try to find constants for op(x, x) == c, now also requiring valid_if
                              +for opname in opnames2:
                              +    result, valid_if = z3_expression(opname, xvar, xvar)
                              +    for const in find_constant(z3.And(valid_if, result == constvar)):
                              +        print(f"{opname}(x, x) -> {const}")
                              +# try to find constants for op(x, c) == x and op(c, x) == x
                              +for opname in opnames2:
                              +    result, valid_if = z3_expression(opname, xvar, constvar)
                              +    for const in find_constant(z3.And(result == xvar, valid_if)):
                              +        print(f"{opname}(x, {const}) -> x")
                              +    result, valid_if = z3_expression(opname, constvar, xvar)
                              +    for const in find_constant(z3.And(result == xvar, valid_if)):
                              +        print(f"{opname}({const}, x) -> x")
                              +
                              + +

                              And we get this list instead:

                              +
                              int_sub(x, x) -> 0
                              +int_xor(x, x) -> 0
                              +int_eq(x, x) -> 1
                              +int_ne(x, x) -> 0
                              +int_lt(x, x) -> 0
                              +int_le(x, x) -> 1
                              +int_gt(x, x) -> 0
                              +int_ge(x, x) -> 1
                              +uint_lt(x, x) -> 0
                              +uint_le(x, x) -> 1
                              +uint_gt(x, x) -> 0
                              +uint_ge(x, x) -> 1
                              +int_add(x, 0) -> x
                              +int_add(0, x) -> x
                              +int_sub(x, 0) -> x
                              +int_mul(x, 1) -> x
                              +int_mul(1, x) -> x
                              +int_and(x, -1) -> x
                              +int_and(-1, x) -> x
                              +int_or(x, 0) -> x
                              +int_or(0, x) -> x
                              +int_xor(x, 0) -> x
                              +int_xor(0, x) -> x
                              +int_lshift(x, 0) -> x
                              +int_rshift(x, 0) -> x
                              +uint_rshift(x, 0) -> x
                              +int_pydiv(x, 1) -> x
                              +
                              + +

                              Synthesizing two constants

                              +

                              For the patterns op(x, c1) == c2 and op(c1, x) == c2 we need to synthesize +two constants. We can again write a helper method for that:

                              +
                              def find_2consts(z3expr, number_of_results=5):
                              +    """ generator that yields pairs of values for (constvar, constvar2) for
                              +    which the formula z3expr is true for all values of xvar. The values are
                              +    yielded as signed Python integers, and at most number_of_results
                              +    different pairs are produced. """
                              +    # xvar is universally quantified, so only the two constants are left
                              +    # for the solver to pick values for
                              +    condition = z3.ForAll(
                              +        [xvar],
                              +        z3expr
                              +    )
                              +    for i in range(number_of_results):
                              +        checkres = solver.check(condition)
                              +        if checkres == z3.sat:
                              +            model = solver.model()
                              +            const = model[constvar].as_signed_long()
                              +            const2 = model[constvar2].as_signed_long()
                              +            yield const, const2
                              +            # exclude exactly this pair from the next query: at least one of
                              +            # the two constants has to differ
                              +            condition = z3.And(z3.Or(constvar != const, constvar2 != const2), condition)
                              +        else:
                              +            # any result other than sat: no (more) pairs found
                              +            return
                              +
                              + +

                              And then use it like this:

                              +
                              for opname in opnames2:
                              +    # try to find constants c1, c2 such that op(c1, x) -> c2
                              +    result, valid_if = z3_expression(opname, constvar, xvar)
                              +    consts = find_2consts(z3.And(valid_if, result == constvar2))
                              +    for const, const2 in consts:
                              +        print(f"{opname}({const}, x) -> {const2}")
                              +    # try to find constants c1, c2 such that op(x, c1) -> c2
                              +    result, valid_if = z3_expression(opname, xvar, constvar)
                              +    consts = find_2consts(z3.And(valid_if, result == constvar2))
                              +    for const, const2 in consts:
                              +        print("%s(x, %s) -> %s" % (opname, const, const2))
                              +
                              + +

                              Which yields some straightforward simplifications:

                              +
                              int_mul(0, x) -> 0
                              +int_mul(x, 0) -> 0
                              +int_and(0, x) -> 0
                              +int_and(x, 0) -> 0
                              +uint_lt(x, 0) -> 0
                              +uint_le(0, x) -> 1
                              +uint_gt(0, x) -> 0
                              +uint_ge(x, 0) -> 1
                              +int_lshift(0, x) -> 0
                              +int_rshift(0, x) -> 0
                              +uint_rshift(0, x) -> 0
                              +uint_mul_high(0, x) -> 0
                              +uint_mul_high(1, x) -> 0
                              +uint_mul_high(x, 0) -> 0
                              +uint_mul_high(x, 1) -> 0
                              +int_pymod(x, 1) -> 0
                              +int_pymod(x, -1) -> 0
                              +
                              + +

                              A few require a bit more thinking:

                              +
                              int_or(-1, x) -> -1
                              +int_or(x, -1) -> -1
                              +
                              + +

                              These are true because in two's complement, -1 has all bits set.

                              +

                              The following ones require recognizing that -9223372036854775808 == -2**63 is +the most negative signed 64-bit integer, and 9223372036854775807 == 2 ** 63 - +1 is the most positive one:

                              +
                              int_lt(9223372036854775807, x) -> 0
                              +int_lt(x, -9223372036854775808) -> 0
                              +int_le(-9223372036854775808, x) -> 1
                              +int_le(x, 9223372036854775807) -> 1
                              +int_gt(-9223372036854775808, x) -> 0
                              +int_gt(x, 9223372036854775807) -> 0
                              +int_ge(9223372036854775807, x) -> 1
                              +int_ge(x, -9223372036854775808) -> 1
                              +
                              + +

                              The following ones are true because the bitpattern for -1 is the largest +unsigned number:

                              +
                              uint_lt(-1, x) -> 0
                              +uint_le(x, -1) -> 1
                              +uint_gt(x, -1) -> 0
                              +uint_ge(-1, x) -> 1
                              +
                              + +

                              Strength Reductions

                              +

                              All the patterns so far only had a variable or a constant on the target of the +rewrite. We can also use the machinery to do strength reductions where we +generate a single-argument operation op1(x) for input operations op(x, c1) +or op(c1, x). To achieve this, we try all combinations of binary and unary +operations. (We won't consider strength reductions where a binary operation +gets turned into a "cheaper" other binary operation here.)

                              +
                              opnames1 = [
                              +# names of the unary integer operations that are tried as rewrite targets
                              +"int_is_true",
                              +"int_is_zero",
                              +"int_neg",
                              +"int_invert",
                              +]
                              +
                              +for opname in opnames2:
                              +    for opname1 in opnames1:
                              +        result, valid_if = z3_expression(opname, xvar, constvar)
                              +        # try to find a constant op(x, c) == g(x)
                              +        result1, valid_if1 = z3_expression(opname1, xvar)
                              +        consts = find_constant(z3.And(valid_if, valid_if1, result == result1))
                              +        for const in consts:
                              +            print(f"{opname}(x, {const}) -> {opname1}(x)")
                              +
                              +        # try to find a constant op(c, x) == g(x)
                              +        result, valid_if = z3_expression(opname, constvar, xvar)
                              +        result1, valid_if1 = z3_expression(opname1, xvar)
                              +        consts = find_constant(z3.And(valid_if, valid_if1, result == result1))
                              +        for const in consts:
                              +            print(f"{opname}({const}, x) -> {opname1}(x)")
                              +
                              + +

                              Which yields the following new simplifications:

                              +
                              int_sub(0, x) -> int_neg(x)
                              +int_sub(-1, x) -> int_invert(x)
                              +int_mul(x, -1) -> int_neg(x)
                              +int_mul(-1, x) -> int_neg(x)
                              +int_xor(x, -1) -> int_invert(x)
                              +int_xor(-1, x) -> int_invert(x)
                              +int_eq(x, 0) -> int_is_zero(x)
                              +int_eq(0, x) -> int_is_zero(x)
                              +int_ne(x, 0) -> int_is_true(x)
                              +int_ne(0, x) -> int_is_true(x)
                              +uint_lt(0, x) -> int_is_true(x)
                              +uint_lt(x, 1) -> int_is_zero(x)
                              +uint_le(1, x) -> int_is_true(x)
                              +uint_le(x, 0) -> int_is_zero(x)
                              +uint_gt(x, 0) -> int_is_true(x)
                              +uint_gt(1, x) -> int_is_zero(x)
                              +uint_ge(x, 1) -> int_is_true(x)
                              +uint_ge(0, x) -> int_is_zero(x)
                              +int_pydiv(x, -1) -> int_neg(x)
                              +
                              + +

                              Conclusions

                              +

                              With not very much code we managed to generate a whole lot of local +simplifications for integer operations in the IR of PyPy's JIT. The rules +discovered that way are "simple", in the sense that they only require looking +at a single instruction, and not where the arguments of that instruction came +from. They also don't require any knowledge about the properties of the +arguments of the instructions (e.g. that they are positive).

                              +

                              The rewrites in this post have mostly been in PyPy's JIT already. But now we +mechanically confirmed that they are correct. I've also added the remaining +useful looking ones, in particular int_eq(x, 0) -> int_is_zero(x) etc.

                              +

                              If we wanted to scale this approach up, we would have to work much harder! +There are a bunch of problems that come with generalizing the approach to +looking at sequences of instructions:

                              +
                                +
                              • +

                                Combinatorial explosion: if we look at sequences of instructions, we very + quickly get a combinatorial explosion and it becomes intractable to try all + combinations.

                                +
                              • +
                              • +

                                Finding non-minimal patterns: Some complicated simplifications can be + instances of simpler ones. For example, because int_add(x, 0) -> x, it's + also true that int_add(int_sub(x, y), 0) -> int_sub(x, y). If we simply + generate all possible sequences, we will find the latter simplification rule, + which we would usually not care about.

                                +
                              • +
                              • +

                                Unclear usefulness: if we simply generate all rewrites up to a certain number + of instructions, we will get a lot of patterns that are useless in the sense + that they typically aren't found in realistic programs. It would be much + better to somehow focus on the patterns that real benchmarks are using.

                                +
                              • +
                              +

                              In the next blog post I'll discuss an alternative approach to simply generating +all possible sequences of instructions, that tries to address these problems. +This works by analyzing the real traces of benchmarks and mining those for +inefficiencies, which only shows problems that occur in actual programs.

                              +

                              Sources

                              +

                              I've been re-reading a lot of blog posts from John's blog:

                              + +

                              but also papers:

                              + +

                              Another of my favorite blogs has been Philip Zucker's +blog in the last year or two, lots of excellent +posts about/using Z3 on there.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html b/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html new file mode 100644 index 000000000..cf9da1dae --- /dev/null +++ b/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html @@ -0,0 +1,647 @@ + + + + + +Mining JIT traces for missing optimizations with Z3 | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Mining JIT traces for missing optimizations with Z3

                              + + + +
                              +

                              In my last post I've described how to use Z3 to find simple local peephole +optimization patterns +for the integer operations in PyPy's JIT. An example is int_and(x, 0) -> +0. In this post I want to scale up the problem of identifying possible +optimizations to much bigger instruction sequences, also using Z3. For that, I +am starting with the JIT traces of real benchmarks, after they have been +optimized by the optimizer of PyPy's JIT. Then we can ask Z3 to find +inefficient integer operations in those traces.

                              +

                              Starting from the optimized traces of real programs has some big +advantages over the "classical" superoptimization approach of generating and +then trying all possible sequences of instructions. It avoids the +combinatorial explosion that happens with the latter approach. Also, starting +from the traces of benchmarks or (even better) actual programs makes sure that +we actually care about the missing optimizations +that are found in this way. And because the traces are analyzed after they have +been optimized by PyPy's optimizer, we only get reports for missing +optimizations, that the JIT isn't able to do (yet).

                              +

                              The techniques and experiments I describe in this post are again the result of +a bunch of discussions with John Regehr at a conference a few weeks ago, as +well as reading his blog posts and papers. Thanks John! Also thanks to Max +Bernstein for super helpful feedback on the drafts +of this blog post (and for poking me to write things in general).

                              +

                              High-Level Approach

                              +

                              The approach that I took works as follows:

                              +
                                +
                              • Run benchmarks or other interesting programs and then dump the IR of the JIT + traces into a file. The traces have at that point been already optimized by + the PyPy JIT's optimizer.
                              • +
                              • For every trace, ignore all the operations on non-integer variables.
                              • +
                              • Translate every integer operation into a Z3 formula.
                              • +
                              • For every operation, use Z3 to find out whether the operation is redundant + (how that is done is described below).
                              • +
                              • If the operation is redundant, the trace is less efficient than it could have + been, because the optimizer could also have removed the operation. Report the + inefficiency.
                              • +
                              • Minimize the inefficient programs by removing as many operations as possible + to make the problem easier to understand.
                              • +
                              +

                              In the post I will describe the details and show some pseudocode of the +approach. I'll also make the proper code public eventually (but it needs a +healthy dose of cleanups first).

                              +

                              Dumping PyPy Traces

                              +

                              PyPy will write its JIT traces into the file out if the environment variable +PYPYLOG is set as follows:

                              +
                              PYPYLOG=jit-log-opt:out pypy <program.py>
                              +
                              + +

                              This environment variable works for PyPy, but also for other virtual machines +built with RPython.

                              +

                              (This is really a side point for the rest of the blog post, but since the +question came up I wanted to clarify it: Operations on integers in the Python +program that the JIT is running don't all correspond 1-to-1 with the int_... +operations in the traces. The int_... trace operations always operate on +machine words. The Python int type supports arbitrarily large integers. PyPy +will optimistically try to lower the operations on Python integers into machine +word operations, but adds the necessary guards into the trace to make sure that +overflow outside of the range of machine words is caught. In case one of these +guards fails the interpreter switches to a big integer heap-allocated +representation.)

                              +

                              Encoding Traces as Z3 formulas

                              +

                              The last blog post already contained the code to encode the results of +individual trace operations into Z3 formulas, so we don't need to repeat that +here. To encode traces of operations we introduce a Z3 variable for every +operation in the trace and then call the z3_expression function for every +single one of the operations in the trace.

                              +

                              For example, for the following trace:

                              +
                              [i1]
                              +i2 = uint_rshift(i1, 32)
                              +i3 = int_and(i2, 65535)
                              +i4 = uint_rshift(i1, 48)
                              +i5 = int_lshift(i4, 16)
                              +i6 = int_or(i5, i3)
                              +jump(i6, i2) # equal
                              +
                              + +

                              We would get the Z3 formula:

                              +
                              z3.And(i2 == LShR(i1, 32),
                              +       i3 == i2 & 65535,
                              +       i4 == LShR(i1, 48),
                              +       i5 == i4 << 16,
                              +       i6 == i5 | i3)
                              +
                              + +

                              Usually we won't ask for the formula of the whole trace at once. Instead we go +through the trace operation by operation and try to find inefficiencies in the +current one we are looking at. Roughly like this (pseudo-)code:

                              +
                              def newz3var(name):
                              +    return z3.BitVec(name, INTEGER_WIDTH)
                              +
                              +def find_inefficiencies(trace):
                              +    solver = z3.Solver()
                              +    var_to_z3var = {}
                              +    for input_argument in trace.inputargs:
                              +        var_to_z3var[input_argument] = newz3var(input_argument)
                              +    for op in trace:
                              +        var_to_z3var[op] = z3resultvar = newz3var(op.resultvarname)
                              +        arg0 = op.args[0]
                              +        z3arg0 = var_to_z3var[arg0]
                              +        if len(op.args) == 2:
                              +            arg1 = op.args[1]
                              +            z3arg1 = var_to_z3var[arg1]
                              +        else:
                              +            z3arg1 = None
                              +        res, valid_if = z3_expression(op.name, z3arg0, z3arg1)
                              +        # checking for inefficiencies, see the next sections
                              +        ...
                              +        if ...:
                              +            return "inefficient", op
                              +
                              +        # not inefficient, assert op into the solver and continue with the next op
                              +        solver.add(z3resultvar == res)
                              +    return None # no inefficiency found
                              +
                              + +

                              Identifying constant booleans with Z3

                              +

                              To get started finding inefficiencies in a trace, we can +first focus on boolean variables. For every operation in the trace that +returns a bool we can ask Z3 to prove that this variable must be always True or +always False. Most of the time, neither of these proofs will succeed. But if Z3 +manages to prove one of them, we know we have found an inefficiency: instead of +computing the boolean result (e.g. by executing a comparison) the JIT's optimizer +could have replaced the operation with the corresponding boolean constant.

                              +

                              Here's an example of an inefficiency found that way: if x < y and y < z are +both true, PyPy's JIT could conclude that x < z must also +be true. However, currently the JIT cannot make that conclusion because it +only reasons about the concrete ranges (lower and upper bounds) for every +integer variable, but it has no way to remember anything about relationships +between different variables. This kind of reasoning would quite often be useful +to remove list/string bounds checks. Here's a talk about how LLVM does +this (but it might be +too heavyweight for a JIT setting).

                              +

                              Here are some more examples found that way:

                              +
                                +
                              • +x - 1 == x is always False
                              • +
                              • +x - (x == -1) == -1 is always False. The pattern x - (x == -1) happens a + lot in PyPy's hash computations: To be compatible with the CPython hashes we + need to make sure that no object's hash is -1 (CPython uses -1 as an error + value on the C level).
                              • +
                              +

                              Here's pseudo-code for how to implement checking boolean operations for +inefficiencies:

                              +
                              def find_inefficiencies(trace):
                              +    ...
                              +    for op in trace:
                              +        ...
                              +        res, valid_if = z3_expression(op.name, z3arg0, z3arg1)
                              +        # check for boolean constant result
                              +        if op.has_boolean_result():
                              +            if prove(solver, res == 0):
                              +                return "inefficient", op, 0
                              +            if prove(solver, res == 1):
                              +                return "inefficient", op, 1
                              +        # checking for other inefficiencies, see the next sections
                              +        ...
                              +
                              +        # not inefficient, add op to the solver and continue with the next op
                              +        solver.add(z3resultvar == res)
                              +    return None # no inefficiency found
                              +
                              + +

                              Identifying redundant operations

                              +

                              A more interesting class of redundancy is to try to find two operations in a +trace that compute the same result. We can do that by asking Z3 to prove, for +each pair of different operations in the trace, that the result is +always the same. If a previous operation returns the same result, the JIT could +have re-used that result instead of re-computing it, saving time. Doing this +search for equivalent operations with Z3 is quadratic in the number of +operations, but since traces have a maximum length it is not too bad in +practice.

                              +

                              This is the real workhorse of my script so far, it's what finds most of the +inefficiencies. Here's a few examples:

                              +
                                +
                              • The very first and super useful example the script found is int_eq(b, 1) == + b if b is known to be a boolean (i.e. an integer 0 or 1). I have already + implemented this optimization in the JIT.
                              • +
                              • Similarly, int_and(b, 1) == b for booleans.
                              • +
                              • (x << 4) & -0xf == x << 4
                              • +
                              • +(((x >> 63) << 1) << 2) >> 3 == x >> 63. In general the JIT is quite bad at + optimizing repeated shifts (the infrastructure for doing better with that is + already in place, so this will be a relatively easy fix).
                              • +
                              • +(x & 0xffffffff) | ((x >> 32) << 32) == x. Having the JIT optimize this + would maybe require first recognizing that (x >> 32) << 32 can be expressed + as a mask: (x & 0xffffffff00000000), and then using (x & c1) | (x & c2) == + x & (c1 | c2) +
                              • +
                              • A commonly occurring pattern is variations of this one: + ((x & 1345) ^ 2048) - 2048 == x & 1345 (with different constants, of + course). xor is add without carry, and x & 1345 does not have the bit + 2048 set. Therefore the ^ 2048 is equivalent to + 2048, which the - + 2048 cancels. More generally, if a & b == 0, then a + b == a | b == a ^ b. + I don't understand at all why this appears so often in the traces, but I + see variations of it a lot. LLVM can optimize this, but GCC + can't, thanks to + Andrew Pinski for filing the + bug!
                              • +
                              +

                              And here's some implementation pseudo-code again:

                              +
                              def find_inefficiencies(trace):
                              +    ...
                              +    for op in trace:
                              +        ...
                              +        res, valid_if = z3_expression(op.name, z3arg0, z3arg1)
                              +        # check for boolean constant result
                              +        ...
                              +        # searching for redundant operations
                              +        for previous_op in trace:
                              +            if previous_op is op:
                              +                break # done, reached the current op
                              +            previous_op_z3var = var_to_z3var[previous_op]
                              +            if prove(solver, previous_op_z3var == res):
                              +                return "inefficient", op, previous_op
                              +        ...
                              +        # more code here later
                              +        ...
                              +
                              +        # not inefficient, add op to the solver and continue with the next op
                              +        solver.add(z3resultvar == res)
                              +    return None # no inefficiency found
                              +
                              + +

                              Synthesizing more complicated constants with exists-forall

                              +

                              To find out whether some integer operations always return a constant result, we +can't simply use the same trick as for those operations that return boolean +results, because enumerating 2⁶⁴ possible constants and checking them all +would take too long. Like in the last post, we can use z3.ForAll to find out +whether Z3 can synthesize a constant for the result of an operation for us. +If such a constant exists, the JIT could have removed the operation, +and replaced it with the constant that Z3 provides.

                              +

                              Here are a few examples of inefficiencies found this way:

                              +
                                +
                              • +(x ^ 1) ^ x == 1 (or, more generally: (x ^ y) ^ x == y)
                              • +
                              • if x | y == 0, it follows that x == 0 and y == 0 +
                              • +
                              • if x != MAXINT, then x + 1 > x +
                              • +
                              +

                              Implementing this is actually slightly annoying. The solver.add calls for +non-inefficient ops add assertions to the solver, which are now confusing the +z3.ForAll query. We could remove all assertions from the solver, then do the +ForAll query, then add the assertions back. What I ended up doing instead was +instantiating a second solver object that I'm using for the ForAll queries, +that remains empty the whole time.

                              +
                              def find_inefficiencies(trace):
                              +    solver = z3.Solver()
                              +    empty_solver = z3.Solver()
                              +    var_to_z3var = {}
                              +    ...
                              +    for op in trace:
                              +        ...
                              +        res, valid_if = z3_expression(op.name, z3arg0, z3arg1)
                              +        # check for boolean constant result
                              +        ...
                              +        # searching for redundant operations
                              +        ...
                              +        # checking for constant results
                              +        constvar = z3.BitVec('find_const', INTEGER_WIDTH)
                              +        condition = z3.ForAll(
                              +            var_to_z3var.values(),
                              +            z3.Implies(
                              +                z3.And(*solver.assertions()),
                              +                res == constvar
                              +            )
                              +        )
                              +        if empty_solver.check(condition) == z3.sat:
                              +            model = empty_solver.model()
                              +            const = model[constvar].as_signed_long()
                              +            return "inefficient", op, const
                              +
                              +        # not inefficient, add op to the solver and continue with the next op
                              +        solver.add(z3resultvar == res)
                              +    return None # no inefficiency found
                              +
                              + +

                              Minimization

                              +

                              Analyzing an inefficiency by hand in the context of a larger trace is quite +tedious. Therefore I've implemented a (super inefficient) script to try to make +the examples smaller. Here's how that works:

                              +
                                +
                              • First throw out all the operations that occur after the inefficient operation + in the trace.
                              • +
                              • Then we remove all "dead" operations, ie operations that don't have their + results used (all the operations that we can analyze with Z3 are without side + effects).
                              • +
                              • Now we try to remove every guard in the trace one by one and check + afterwards, whether the resulting trace still has an inefficiency.
                              • +
                              • We also try to replace every single operation with a new argument to the + trace, to see whether the inefficiency is still present.
                              • +
                              +

                              The minimization process is sort of inefficient and I should probably be using + shrinkray or + C-Reduce instead. However, it + seems to work well in practice and the runtime isn't too bad.

                              +

                              Results

                              +

                              So far I am using the JIT traces of three programs: 1) Booting Linux on the +Pydrofoil RISC-V emulator, 2) booting Linux on the Pydrofoil ARM emulator, and 3) +running the PyPy bootstrap process on top of PyPy.

                              +

                              I picked these programs because most Python programs don't contain interesting +amounts of integer operations, and the traces of the emulators +contain a lot of them. I also used the bootstrap process because I still wanted +to try a big Python program and personally care about the runtime of this +program a lot.

                              +

                              The script identifies 94 +inefficiencies in the traces, a lot of them come from repeating +patterns. My next steps will be to manually inspect them all, categorize them, and +implement easy optimizations identified that way. I also want a way to sort the +examples by execution count in the benchmarks, to get a feeling for which of +them are most important.

                              +

                              I didn't investigate the full set of Python +benchmarks that PyPy uses yet, because I don't expect +them to contain interesting amounts of integer operations, but maybe I am wrong +about that? Will have to try eventually.

                              +

                              Conclusion

                              +

                              This was again much easier to do than I would have expected! Given that I had +the translation of trace ops to Z3 already in place, it was a matter of about a +day of programming to use this infrastructure to find the first problems and +minimize them.

                              +

                              Reusing the results of existing operations or replacing operations by constants +can be seen as "zero-instruction superoptimization". I'll probably be rather +busy for a while to add the missing optimizations identified by my simple +script. But later extensions to actually synthesize one or several operations +in the attempt to optimize the traces more and find more opportunities should +be possible.

                              +

                              Finding inefficiencies in traces with Z3 is significantly less +annoying and also less error-prone than just manually inspecting traces and +trying to spot optimization opportunities.

                              +

                              Random Notes and Sources

                              +

                              Again, John's blog posts:

                              + +

                              and papers:

                              + +

                              I remembered recently that I had seen the approach of optimizing the traces of +a tracing JIT with Z3 a long time ago, as part of the (now long dead, I think) +SPUR +project. +There's a workshop +paper +from 2010 about this. SPUR was trying to use Z3 built into the actual JIT (as +opposed to using Z3 only to find places where the regular optimizers could be +improved). In addition to bitvectors, SPUR also used the Z3 support for arrays +to model the C# heap and remove redundant stores. This is still another future +extension for all the Z3 work I've been doing in the context of the PyPy JIT.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/07/toy-abstract-interpretation.html b/posts/2024/07/toy-abstract-interpretation.html new file mode 100644 index 000000000..39ff130cb --- /dev/null +++ b/posts/2024/07/toy-abstract-interpretation.html @@ -0,0 +1,794 @@ + + + + + +Abstract interpretation in the Toy Optimizer | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Abstract interpretation in the Toy Optimizer

                              + + + +
                              +

                              This is a cross-post +from Max Bernstein from his excellent blog where he writes about programming +languages, compilers, optimizations, virtual machines. He's looking for a +(dynamic language runtime or compiler related) job too.

                              +
                              +

                              CF Bolz-Tereick wrote some excellent posts in which they introduce a small IR +and optimizer and extend it with allocation +removal. We also did a live stream together in which +we did some more heap optimizations.

                              +

                              In this blog post, I'm going to write a small abstract interpreter for the Toy +IR and then show how we can use it to do some simple optimizations. It assumes +that you are familiar with the little IR, which I have reproduced unchanged in +a GitHub Gist.

                              +

                              Abstract interpretation is a general framework for efficiently computing +properties that must be true for all possible executions of a program. It's a +widely used approach both in compiler optimizations as well as offline static +analysis for finding bugs. I'm writing this post to pave the way for CF's next +post on proving abstract interpreters correct for range analysis and known bits +analysis inside PyPy.

                              +

                              Before we begin, I want to note a couple of things:

                              +
                                +
                              • The Toy IR is in SSA form, which means that every variable is defined exactly + once. This means that abstract properties of each variable are easy to track.
                              • +
                              • The Toy IR represents a linear trace without control flow, meaning we won't + talk about meet/join or fixpoints. They only make sense if the IR has a + notion of conditional branches or back edges (loops).
                              • +
                              +

                              Alright, let's get started.

                              +

                              Welcome to abstract interpretation

                              +

                              Abstract interpretation means a couple different things to different people. +There's rigorous mathematical formalism thanks to Patrick and Radhia Cousot, +our favorite power couple, and there's also sketchy hand-wavy stuff like what +will follow in this post. In the end, all people are trying to do is reason +about program behavior without running it.

                              +

                              In particular, abstract interpretation is an over-approximation of the +behavior of a program. Correctly implemented abstract interpreters never lie, +but they might be a little bit pessimistic. This is because instead of using +real values and running the program---which would produce a concrete result and +some real-world behavior---we "run" the program with a parallel universe of +abstract values. This abstract run gives us information about all possible +runs of the program.1

                              +

                              Abstract values always represent sets of concrete values. Instead of literally +storing a set (in the world of integers, for example, it could get pretty +big...there are a lot of integers), we group them into a finite number of named +subsets.2

                              +

                              Let's learn a little about abstract interpretation with an example program and +example abstract domain. Here's the example program:

                              +
                              v0 = 1
                              +v1 = 2
                              +v2 = add(v0, v1)
                              +
                              + +

                              And our abstract domain is "is the number positive" (where "positive" means +nonnegative, but I wanted to keep the words distinct):

                              +
                                     top
                              +    /       \
                              +positive    negative
                              +    \       /
                              +      bottom
                              +
                              + +

                              The special top value means "I don't know" and the special bottom value +means "empty set" or "unreachable". The positive and negative values +represent the sets of all positive and negative numbers, respectively.

                              +

                              We initialize all the variables v0, v1, and v2 to bottom and then walk +our IR, updating our knowledge as we go.

                              +
                              # here
                              +v0:bottom = 1
                              +v1:bottom = 2
                              +v2:bottom = add(v0, v1)
                              +
                              + +

                              In order to do that, we have to have transfer functions for each operation. +For constants, the transfer function is easy: determine if the constant is +positive or negative. For other operations, we have to define a function that +takes the abstract values of the operands and returns the abstract value of the +result.

                              +

                              In order to be correct, transfer functions for operations have to be compatible +with the behavior of their corresponding concrete implementations. You can +think of them having an implicit universal quantifier forall in front of +them.

                              +

                              Let's step through the constants at least:

                              +
                              v0:positive = 1
                              +v1:positive = 2
                              +# here
                              +v2:bottom = add(v0, v1)
                              +
                              + +

                              Now we need to figure out the transfer function for add. It's kind of tricky +right now because we haven't specified our abstract domain very well. I keep +saying "numbers", but what kinds of numbers? Integers? Real numbers? Floating +point? Some kind of fixed-width bit vector (int8, uint32, ...) like an +actual machine "integer"?

                              +

                              For this post, I am going to use the mathematical definition of integer, which +means that the values are not bounded in size and therefore do not overflow. +Actual hardware memory constraints aside, this is kind of like a Python int.

                              +

                              So let's look at what happens when we add two abstract numbers:

                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              toppositivenegativebottom
                              toptoptoptopbottom
                              positivetoppositivetopbottom
                              negativetoptopnegativebottom
                              bottombottombottombottombottom
                              +

                              As an example, let's try to add two numbers a and b, where a is positive +and b is negative. We don't know anything about their values other than their +signs. They could be 5 and -3, where the result is 2, or they could be +1 and -100, where the result is -99. This is why we can't say anything +about the result of this operation and have to return top.

                              +

                              The short of this table is that we only really know the result of an addition +if both operands are positive or both operands are negative. Thankfully, in +this example, both operands are known positive. So we can learn something about +v2:

                              +
                              v0:positive = 1
                              +v1:positive = 2
                              +v2:positive = add(v0, v1)
                              +# here
                              +
                              + +

                              This may not seem useful in isolation, but analyzing more complex programs even +with this simple domain may be able to remove checks such as if (v2 < 0) { ... }.

                              +

                              Let's take a look at another example using a sample absval (absolute value) +IR operation:

                              +
                              v0 = getarg(0)
                              +v1 = getarg(1)
                              +v2 = absval(v0)
                              +v3 = absval(v1)
                              +v4 = add(v2, v3)
                              +v5 = absval(v4)
                              +
                              + +

                              Even though we have no constant/concrete values, we can still learn something +about the states of values throughout the program. Since we know that absval +always returns a positive number, we learn that v2, v3, and v4 are all +positive. This means that we can optimize out the absval operation on v5:

                              +
                              v0:top = getarg(0)
                              +v1:top = getarg(1)
                              +v2:positive = absval(v0)
                              +v3:positive = absval(v1)
                              +v4:positive = add(v2, v3)
                              +v5:positive = v4
                              +
                              + +

                              Other interesting lattices include:

                              +
                                +
                              • Constants (where the middle row is pretty wide)
                              • +
                              • Range analysis (bounds on min and max of a number)
                              • +
                              • Known bits (using a bitvector representation of a number, which bits are + always 0 or 1)
                              • +
                              +

                              For the rest of this blog post, we are going to do a very limited version of +"known bits", called parity. This analysis only tracks the least significant +bit of a number, which indicates if it is even or odd.

                              +

                              Parity

                              +

                              The lattice is pretty similar to the positive/negative lattice:

                              +
                                  top
                              +  /     \
                              +even    odd
                              +  \     /
                              +   bottom
                              +
                              + +

                              Let's define a data structure to represent this in Python code:

                              +
                              class Parity:
                              +    def __init__(self, name):
                              +        self.name = name
                              +
                              +    def __repr__(self):
                              +        return self.name
                              +
                              + +

                              And instantiate the members of the lattice:

                              +
                              TOP = Parity("top")
                              +EVEN = Parity("even")
                              +ODD = Parity("odd")
                              +BOTTOM = Parity("bottom")
                              +
                              + +

                              Now let's write a forward flow analysis of a basic block using this lattice. +We'll do that by assuming that a method on Parity is defined for each IR +operation. For example, Parity.add, Parity.lshift, etc.

                              +
                              def analyze(block: Block) -> None:
                              +    parity = {v: BOTTOM for v in block}
                              +
                              +    def parity_of(value):
                              +        if isinstance(value, Constant):
                              +            return Parity.const(value)
                              +        return parity[value]
                              +
                              +    for op in block:
                              +        transfer = getattr(Parity, op.name)
                              +        args = [parity_of(arg.find()) for arg in op.args]
                              +        parity[op] = transfer(*args)
                              +
                              + +

                              For every operation, we compute the abstract value---the parity---of the +arguments and then call the corresponding method on Parity to get the +abstract result.

                              + +

                              We need to special case Constants due to a quirk of how the Toy IR is +constructed: the constants don't appear in the instruction stream and instead +are free-floating.

                              +

                              Let's start by looking at the abstraction function for concrete +values---constants:

                              +
                              class Parity:
                              +    # ...
                              +    @staticmethod
                              +    def const(value):
                              +        if value.value % 2 == 0:
                              +            return EVEN
                              +        else:
                              +            return ODD
                              +
                              + +

                              Seems reasonable enough. Let's pause on operations for a moment and consider an +example program:

                              +
                              v0 = getarg(0)
                              +v1 = getarg(1)
                              +v2 = lshift(v0, 1)
                              +v3 = lshift(v1, 1)
                              +v4 = add(v2, v3)
                              +v5 = dummy(v4)
                              +
                              + +

                              This function (which is admittedly a little contrived) takes two inputs, shifts +them left by one bit, and adds the results. It then passes that sum into a +dummy function, which you can think of as "return" or "escape".

                              +

                              To do some abstract interpretation on this program, we'll need to implement the +transfer functions for lshift and add (dummy will just always return +TOP). We'll start with add. Remember that adding two even numbers returns +an even number, adding two odd numbers returns an even number, and mixing even +and odd returns an odd number.

                              +
                              class Parity:
                              +    # ...
                              +    def add(self, other):
                              +        if self is BOTTOM or other is BOTTOM:
                              +            return BOTTOM
                              +        if self is TOP or other is TOP:
                              +            return TOP
                              +        if self is EVEN and other is EVEN:
                              +            return EVEN
                              +        if self is ODD and other is ODD:
                              +            return EVEN
                              +        return ODD
                              +
                              + +

                              We also need to fill in the other cases where the operands are top or +bottom. In this case, they are both "contagious"; if either operand is +bottom, the result is as well. If neither is bottom but either operand is top, +the result is as well.

                              +

                              Now let's look at lshift. Shifting any number left by a non-zero number of +bits will always result in an even number, but we need to be careful about the +zero case! Shifting by zero doesn't change the number at all. Unfortunately, +since our lattice has no notion of zero, we have to over-approximate here:

                              +
                              class Parity:
                              +    # ...
                              +    def lshift(self, other):
                              +        # self << other
                              +        if other is ODD:
                              +            return EVEN
                              +        return TOP
                              +
                              + +

                              This means that we will miss some opportunities to optimize, but it's a +tradeoff that's just part of the game. (We could also add more elements to our +lattice, but that's a topic for another day.)

                              +

                              Now, if we run our abstract interpretation, we'll collect some interesting +properties about the program. If we temporarily hack on the internals of +bb_to_str, we can print out parity information alongside the IR operations:

                              +
                              v0:top = getarg(0)
                              +v1:top = getarg(1)
                              +v2:even = lshift(v0, 1)
                              +v3:even = lshift(v1, 1)
                              +v4:even = add(v2, v3)
                              +v5:top = dummy(v4)
                              +
                              + +

                              This is pretty awesome, because we can see that v4, the result of the +addition, is always even. Maybe we can do something with that information.

                              +

                              Optimization

                              +

                              One way that a program might check if a number is odd is by checking the least +significant bit. This is a common pattern in C code, where you might see code +like y = x & 1. Let's introduce a bitand IR operation that acts like the +& operator in C/Python. Here is an example of use of it in our program:

                              +
                              v0 = getarg(0)
                              +v1 = getarg(1)
                              +v2 = lshift(v0, 1)
                              +v3 = lshift(v1, 1)
                              +v4 = add(v2, v3)
                              +v5 = bitand(v4, 1)  # new!
                              +v6 = dummy(v5)
                              +
                              + +

                              We'll hold off on implementing the transfer function for it---that's left as an +exercise for the reader---and instead do something different.

                              +

                              Instead, we'll see if we can optimize operations of the form bitand(X, 1). If +we statically know the parity as a result of abstract interpretation, we can +replace the bitand with a constant 0 or 1.

                              +

                              We'll first modify the analyze function (and rename it) to return a new +Block containing optimized instructions:

                              +
                              def simplify(block: Block) -> Block:
                              +    parity = {v: BOTTOM for v in block}
                              +
                              +    def parity_of(value):
                              +        if isinstance(value, Constant):
                              +            return Parity.const(value)
                              +        return parity[value]
                              +
                              +    result = Block()
                              +    for op in block:
                              +        # TODO: Optimize op
                              +        # Emit
                              +        result.append(op)
                              +        # Analyze
                              +        transfer = getattr(Parity, op.name)
                              +        args = [parity_of(arg.find()) for arg in op.args]
                              +        parity[op] = transfer(*args)
                              +    return result
                              +
                              + +

                              We're approaching this the way that PyPy does things under the hood, which is +all in roughly a single pass. It tries to optimize an instruction away, and if +it can't, it copies it into the new block.

                              +

                              Now let's add in the bitand optimization. It's mostly some gross-looking +pattern matching that checks if the right hand side of a bitwise and +operation is 1 (TODO: the left hand side, too). CF had some neat ideas on how +to make this more ergonomic, which I might save for later.3

                              +

                              Then, if we know the parity, optimize the bitand into a constant.

                              +
                              def simplify(block: Block) -> Block:
                              +    parity = {v: BOTTOM for v in block}
                              +
                              +    def parity_of(value):
                              +        if isinstance(value, Constant):
                              +            return Parity.const(value)
                              +        return parity[value]
                              +
                              +    result = Block()
                              +    for op in block:
                              +        # Try to simplify
                              +        if isinstance(op, Operation) and op.name == "bitand":
                              +            arg = op.arg(0)
                              +            mask = op.arg(1)
                              +            if isinstance(mask, Constant) and mask.value == 1:
                              +                if parity_of(arg) is EVEN:
                              +                    op.make_equal_to(Constant(0))
                              +                    continue
                              +                elif parity_of(arg) is ODD:
                              +                    op.make_equal_to(Constant(1))
                              +                    continue
                              +        # Emit
                              +        result.append(op)
                              +        # Analyze
                              +        transfer = getattr(Parity, op.name)
                              +        args = [parity_of(arg.find()) for arg in op.args]
                              +        parity[op] = transfer(*args)
                              +    return result
                              +
                              + +

                              Remember: because we use union-find to rewrite instructions in the optimizer +(make_equal_to), later uses of the same instruction get the new +optimized version "for free" (find).

                              +

                              Let's see how it works on our IR:

                              +
                              v0 = getarg(0)
                              +v1 = getarg(1)
                              +v2 = lshift(v0, 1)
                              +v3 = lshift(v1, 1)
                              +v4 = add(v2, v3)
                              +v6 = dummy(0)
                              +
                              + +

                              Hey, neat! bitand disappeared and the argument to dummy is now the constant +0 because we know the lowest bit.

                              +

                              Wrapping up

                              +

                              Hopefully you have gained a little bit of an intuitive understanding of +abstract interpretation. Last year, being able to write some code made me more +comfortable with the math. Now being more comfortable with the math is helping +me write the code. It's a nice upward spiral.

                              +

                              The two abstract domains we used in this post are simple and not very useful in +practice but it's possible to get very far using slightly more complicated +abstract domains. Common domains include: constant propagation, type inference, +range analysis, effect inference, liveness, etc. For example, here is a +sample lattice for constant propagation:

                              +
                              + +

                              It has multiple levels to indicate more and less precision. For example, you +might learn that a variable is either 1 or 2 and be able to encode that as +nonnegative instead of just going straight to top.

                              +

                              Check out some real-world abstract interpretation in open source projects:

                              + +

                              If you have some readable examples, please share them so I can add.

                              +

                              Acknowledgements

                              +

                              Thank you to CF Bolz-Tereick for the toy optimizer and +helping edit this post!

                              +
                              +
                              +
                                +
                              1. +

                                In the words of abstract interpretation researchers Vincent Laviron +and Francesco Logozzo in their paper Refining Abstract +Interpretation-based Static Analyses with Hints (APLAS 2009):

                                +
                                +

                                The three main elements of an abstract interpretation are: (i) the +abstract elements ("which properties am I interested in?"); (ii) the +abstract transfer functions ("which is the abstract semantics of basic +statements?"); and (iii) the abstract operations ("how do I combine the +abstract elements?").

                                +
                                +

                                We don't have any of these "abstract operations" in this post because +there's no control flow but you can read about them elsewhere! 

                                +
                              2. +
                              3. +

                                These abstract values are arranged in a lattice, which is a +mathematical structure with some properties but the most important ones are +that it has a top, a bottom, a partial order, a meet operation, and values +can only move in one direction on the lattice.

                                +

                                Using abstract values from a lattice promises two things:

                                +
                                  +
                                • The analysis will terminate
                                • +
                                • The analysis will be correct for any run of the program, not just one + sample run
                                • +
                                +

                                +
                              4. +
                              5. +

                                Something about __match_args__ and @property... 

                                +
                              6. +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.html b/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.html new file mode 100644 index 000000000..16e99e7f4 --- /dev/null +++ b/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.html @@ -0,0 +1,307 @@ + + + + + +Conda-forge proposes sunsetting support for PyPy | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Conda-forge proposes sunsetting support for PyPy

                              + + + +
                              +

                              Conda-forge has kindly been providing support for PyPy since 2019. The +conda-forge team has been very patient and generous with resources, but it +seems the uptake of PyPy has not justified the effort. Major packages still +are not available on PyPy, +others find it hard to update +versions. We don't +get much feedback at all about people using PyPy, and even less about PyPy on +conda-forge. The conda-forge team has proposed sunsetting +PyPy going +forward, which means current packages would remain but no new packages would be +built. If you have an opinion, you can comment on that PR, or on this blog post.

                              +

                              Since conda-forge supports PyPy3.9 but not PyPy3.10, we have continued +releasing PyPy3.9 even though we typically support only one version of PyPy3. +With the sunsetting proposal, we will not release any more updates to PyPy3.9. +I opened a poll about the +intention to drop PyPy3.9. If you have an opinion, please chime in.

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/08/portaone.html b/posts/2024/08/portaone.html new file mode 100644 index 000000000..f2d450ac7 --- /dev/null +++ b/posts/2024/08/portaone.html @@ -0,0 +1,394 @@ + + + + + +Guest Post: How PortaOne uses PyPy for high-performance processing, connecting over 1B of phone calls every month | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              Guest Post: How PortaOne uses PyPy for high-performance processing, connecting over 1B of phone calls every month

                              + + + +
                              +

                              The PyPy project is always happy to hear about industrial use and deployments +of PyPy. For the GC bug +finding +task earlier this year, we collaborated with PortaOne and we're super happy +that Serhii Titov, head of the QA department at PortaOne, was up to writing +this guest post to describe their use and experience with the project.

                              +
                              +

                              What does PortaOne do?

                              +

                              We at PortaOne Inc. allow telecom operators to +launch new services (or provide existing services more efficiently) using our +VoIP platform (PortaSIP) and our real-time charging system (PortaBilling), +which provides additional features for cloud PBX, such as call transfer, +queues, interactive voice response (IVR) and more. At this moment our support +team manages several thousand servers with our software installed in 100 +countries, through which over 500 telecommunication service providers connect +millions of end users every day. The unique thing about PortaOne is that we +supply the source code of our product to our customers - something unheard of +in the telecom world! Thus we attract "telco innovators", who use our APIs to +build around the system and the source code to create unique tweaks of +functionality, which produces amazing products.

                              +

                              At the core of PortaSIP is the middle-ware component (the proper name for it is +"B2BUA", but that probably does not say much to anyone outside of experts in +VoIP), which implements the actual handling of SIP calls, messages, etc. and +all added features (for instance, trying to send a call via telco operators +through which the cost per minute is lower). It has to be fast (since even a +small delay in establishing a call is noticed by a customer), reliable +(everyone hates when a call drops or cannot be completed) and yet easily +expandable with new functionality. This is why we decided to use Python as +opposed to C/C++ or similar programming languages, which are often used in +telecom equipment.

                              +

                              The B2BUA component is a batch of similar Python processes that are looped +inside an +asyncore.dispatcher +wrapper. The load balancing between these Python processes is done by our +stateless SIP proxy server written in C++. All our sockets are served by this +B2BUA. We have our custom client-wrappers around pymysql, redis, +cassandra-driver and requests to communicate with external services. Some +of the Python processes use cffi +wrappers around C-code to improve their performance (examples: an Oracle DB +driver, a client to a radius server, a custom C logger).

                              +

                              The I/O operations that block the main thread of the Python processes are +processed in sub-threads. We have custom wrappers around threading.Thread +and also asyncore.dispatcher. The results of such operations are returned to +the main thread.

                              +

                              Improving our performance with PyPy

                              +

                              We started with CPython and then in 2014 switched to PyPy because it was +faster. Here's an exact quote from our first testing notes: "PyPy gives +significant performance boost, ~50%". Nowadays, after years of changes in all +the software involved, PyPy still gives us a +50% boost compared to CPython.

                              +

                              Taking care of real time traffic for so many people around the globe is +something we're really proud of. I hope the PyPy team can be proud of it as +well, as the PyPy product is a part of this solution.

                              +

                              Finding a garbage collector bug: stage 1, the GC hooks

                              +

                              However our path with PyPy wasn't perfectly smooth. There were very rare cases +of crashes on PyPy that we weren't able to catch. That's because to make +coredump useful we needed to switch to PyPy with debug, but we cannot let it +run in that mode on a production system for an extended period of time, and we +did not have any STR (steps-to-reproduce) to make PyPy crash again in our lab. +That's why we kept (and still keep) both interpreters installed just in case, +and we would switch to CPython if we noticed it happening.

                              +

                              At the time of updating PyPy from 3.5 to 3.6 our QA started noticing those +crashes more often, but we still had no luck with STR or collecting proper +coredumps with debug symbols. Then it became even worse after our development +played with the Garbage Collector's +options to increase performance +of our middleware component. The crashes started to affect our regular +performance testing (controlled by QA manager Yevhenii Bovda). At that point it +was decided that we can no longer live like that and so we started an intense +investigation.

                              +

                              During the first stage of our investigation (following the best practice of +troubleshooting) we narrowed down the issue as much as we could. So, it was not +our code, it was definitely somewhere in PyPy. Eventually our SIP software +engineer Yevhenii Yatchenko found out +that this bug is connected with the use of our custom hooks in the +GC. Yevhenii created +ticket #4899 and within 2-3 days we +got a fix from a member of the PyPy team, in true open-source fashion.

                              +

                              Finding a garbage collector bug: stage 2, the real bug

                              +

                              Then came stage 2. In parallel with the previous ticket, Yevhenii created +#4900 that we still see failing +with coredumps quite often, and they are not connected to GC custom hooks. In a +nutshell, it took us dozens of back-and-forth emails, three Zoom sessions and +four versions of a patch to solve the issue. During the last iteration we got a +new set of options to try and a new version of the patch. Surprisingly, that +helped! What a relief! So, the next logical step was to remove all debug +options and run PyPy only with the patch. Unfortunately, it started to fail +again and we came to the obvious conclusion that what will help us is not a +patch, but one of the options we were testing out. At that point we found out that +PYPY_GC_MAX_PINNED=0 +is a necessary and sufficient condition to solve our issue. This points to +another bug in the garbage collector, somehow related to object pinning.

                              +

                              Here's our current state: we have to add PYPY_GC_MAX_PINNED=0, but we do not +face the crashes anymore.

                              +

                              Conclusion and next steps

                              +

                              Gratitude is extended to Carl for his invaluable assistance in resolving the +nasty bugs, because it seems we're the only ones who suffered from the last +one and we really did not want to fall back to CPython due to its performance +disadvantage.

                              +

                              Serhii Titov, head of the QA department at PortaOne Inc.

                              +

                              P.S. If you are a perfectionist and at this point you have mixed feelings and +you are still bothered by the question "But there might still be a bug in the +GC, what about that?" - Carl has some ideas about it and he will sort it out +(we will help with the testing/verification part).

                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/08/pypy-v7317-release.html b/posts/2024/08/pypy-v7317-release.html new file mode 100644 index 000000000..4afc741af --- /dev/null +++ b/posts/2024/08/pypy-v7317-release.html @@ -0,0 +1,405 @@ + + + + + +PyPy v7.3.17 release | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              PyPy v7.3.17 release

                              + + + +
                              +

                              PyPy v7.3.17: release of python 2.7 and 3.10

                              +

                              The PyPy team is proud to release version 7.3.17 of PyPy.

                              +

                              This release includes a new RISC-V JIT backend, an improved REPL based on +work by the CPython team, and better JIT optimizations of integer +operations. Special shout-outs to Logan Chien for the RISC-V backend +work, to Nico Rittinghaus for better integer optimization in the JIT, and +the CPython team that has worked on the repl.

                              +

                              The release includes two different interpreters:

                              +
                                +
                              • PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the + is for +backported security updates)

                              • +
                              • PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.

                              • +
                              +

                              The interpreters are based on much the same codebase, thus the dual +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows the 7.3.16 release on April 23, 2024.

                              +

                              We recommend updating. You can find links to download the releases here:

                              +
                              +

                              https://pypy.org/download.html

                              +
                              +

                              We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +direct consulting work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our blog via a pull request +to https://github.com/pypy/pypy.org

                              +

                              We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +PyPy and RPython documentation improvements, or general help with +making RPython's JIT even better.

                              +

                              If you are a python library maintainer and use C-extensions, please consider +making a HPy / CFFI / cppyy version of your library that would be performant +on PyPy. In any case, both cibuildwheel and the multibuild system support +building wheels for PyPy.

                              +

                              RISC-V backend for the JIT

                              +

                              PyPy's JIT has added support for generating 64-bit RISC-V machine code at +runtime (RV64-IMAD, specifically). So far we are not releasing binaries for any +RISC-V platforms, but there are instructions on how to cross-compile binaries.

                              +

                              REPL Improvements

                              +

                              The biggest user-visible change of the release is new features in the repl of +PyPy3.10. CPython 3.13 has adopted and extended PyPy's pure-Python repl, adding +a number of features and fixing a number of bugs in the process. We have +backported and added the following features:

                              +
                                +
                              • Prompts and tracebacks use terminal colors, as well as terminal hyperlinks +for file names.

                              • +
                              • Bracketed paste enables pasting several lines of input into the terminal +without auto-indentation getting in the way.

                              • +
                              • A special interactive help browser (F1), history browser (F2), explicit paste +mode (F3).

                              • +
                              • Support for Ctrl-<left/right> to jump over whole words at a time.

                              • +
                              +

                              See the CPython documentation for further details. Thanks to Łukasz Langa, +Pablo Galindo Salgado and the other CPython devs involved in this work.

                              +

                              Better JIT optimizations of integer operations

                              +

                              The optimizers of PyPy's JIT have become much better at reasoning about and +optimizing integer operations. This is done with a new "knownbits" abstract +domain. In many programs that do bit-manipulation of integers, some of the +bits of the integer variables of the program can be statically known. Here's a +simple example:

                              +
                              x = a | 1
                              +...
                              +if x & 1:
                              +    ...
                              +else:
                              +    ...
                              +
                              +

                              With the new abstract domain, the JIT can optimize the if-condition to +True, because it already knows that the lowest bit of x must be set. +This optimization applies to all Python-integers that fit into a machine word +(PyPy optimistically picks between two different representations for int, +depending on the size of the value). Unfortunately there is very little impact +of this change on almost all Python code, because intensive bit-manipulation is +rare in Python. However, the change leads to significant performance +improvements in Pydrofoil (the RPython-based RISC-V/ARM emulators that are +automatically generated from high-level Sail specifications of the respective +ISAs, and that use the RPython JIT to improve performance).

                              +

                              PyPy versions and speed.pypy.org

                              +

                              The keen-eyed will have noticed no mention of Python version 3.9 in the +releases above. Typically we will maintain only one version of Python3, but due +to PyPy3.9 support on conda-forge we maintained multiple versions from the +first release of PyPy3.10 in PyPy v7.3.12 (Dec 2022). Conda-forge is +sunsetting its PyPy support, which means we can drop PyPy3.9. Since that was +the major driver of benchmarks at https://speed.pypy.org, we revamped the site +to showcase PyPy3.9, PyPy3.10, and various versions of cpython on the home +page. For historical reasons, the "baseline" for comparison is still cpython +3.7.19.

                              +

                              We will keep the buildbots building PyPy3.9 until the end of August; these +builds will still be available on the nightly builds tab of the buildbot.

                              +

                              What is PyPy?

                              +

                              PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (PyPy and CPython performance +comparison) due to its integrated tracing JIT compiler.

                              +

                              We also welcome developers of other dynamic languages to see what RPython +can do for them.

                              +

                              We provide binary builds for:

                              +
                                +
                              • x86 machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)

                              • +
                              • 64-bit ARM machines running Linux (aarch64) and macos (macos_arm64).

                              • +
                              +

                              PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, Linux ARM +32 bit, RISC-V RV64IMAFD Linux, and s390x Linux but does not release binaries. +Please reach out to us if you wish to sponsor binary releases for those +platforms. Downstream packagers provide binary builds for debian, Fedora, +conda, OpenBSD, FreeBSD, Gentoo, and more.

                              +

                              What else is new?

                              +

                              For more information about the 7.3.17 release, see the full changelog.

                              +

                              Please update, and continue to help us make pypy better.

                              +

                              Cheers, +The PyPy Team

                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/posts/2024/08/toy-knownbits.html b/posts/2024/08/toy-knownbits.html new file mode 100644 index 000000000..dc50b62c8 --- /dev/null +++ b/posts/2024/08/toy-knownbits.html @@ -0,0 +1,1681 @@ + + + + + +A Knownbits Abstract Domain for the Toy Optimizer, Correctly | PyPy + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +
                              +

                              A Knownbits Abstract Domain for the Toy Optimizer, Correctly

                              + + + +
                              +

                              After Max' introduction to abstract interpretation for the toy optimizer in the +last post, I want to present a more complicated abstract domain in this post. +This abstract domain reasons about the individual bits of a variable in a trace. +Every bit can be either "known zero", "known one" or "unknown". The abstract +domain is useful for optimizing integer operations, particularly the bitwise operations. +The abstract domain follows quite closely the tristate abstract domain of the +eBPF verifier in the Linux +Kernel, as +described by the paper +Sound, Precise, and Fast Abstract Interpretation with Tristate +Numbers by Harishankar Vishwanathan, Matan +Shachnai, Srinivas Narayana, and Santosh Nagarakatte.

                              +

                              The presentation in this post will still be in the context of the +toy optimizer. We'll spend a significant part of +the post convincing ourselves that the abstract domain transfer functions that +we're writing are really correct, using both property-based testing and +automated proofs (again using Z3).

                              +

                              PyPy has implemented and merged a more complicated version of the same abstract +domain for the "real" PyPy JIT. A more thorough explanation of that real world +implementation will follow.

                              +

                              I'd like to thank Max Bernstein and Armin Rigo for lots of great feedback on +drafts of this post. The PyPy implementation was mainly done by Nico +Rittinghaus and me.

                              +

                              Contents:

                              + +

                              Motivation

                              +

                              In many programs that do bit-manipulation of integers, some of the bits of the +integer variables of the program can be statically known. Here's a simple +example:

                              +
                              x = a | 1
                              +...
                              +if x & 1:
                              +    ...
                              +else:
                              +    ...
                              +
                              + +

                              After the assignment x = a | 1, we know that the lowest bit of x must be 1 +(the other bits are unknown) and an optimizer could remove the condition x & 1 by +constant-folding it to 1.

                              +

                              Another (more complicated) example is:

                              +
                              assert i & 0b111 == 0 # check that i is a multiple of 8
                              +j = i + 16
                              +assert j & 0b111 == 0
                              +
                              + +

                              This kind of code could e.g. happen in a CPU +emulator, where i and j are +integers that represent emulated pointers, and the asserts are alignment +checks. The first assert implies that the lowest three bits of i must be 0. +Adding 16 to such a number produces a result where the lowest three bits are +again all 0, therefore the second assert is always true. So we would like a +compiler to remove the second assert.

                              +

                              Both of these optimizations are doable with the help of the knownbits +abstract domain that we'll discuss in the rest of the post.

                              +

                              The Knownbits Abstract Domain

                              +

                              An abstract value of the knownbits domain needs to be able to store, for every +bit of an integer variable in a program, whether it is known 0, known 1, or +unknown. To represent +three different states, we need 2 bits, which we will call one and unknown. +Here's the encoding:

                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              one | unknown | knownbit
                              0   | 0       | 0
                              1   | 0       | 1
                              0   | 1       | ?
                              1   | 1       | illegal
                              +

                              The unknown bit is set if we don't know the value of the bit ("?"), the one +bit is set if the bit is known to be a 1. Since two bits are enough to encode +four different states, but we only need three, the combination of a set one +bit and a set unknown bit is not allowed.

                              +

                              We don't just want to encode a single bit, however. Instead, we want to do this +for all the bits of an integer variable. Therefore the instances of the abstract +domain get two integer fields ones and unknowns, where each pair of +corresponding bits encodes the knowledge about the corresponding bit of the +integer variable in the program.

                              +

                              We can start implementing a Python class that works like this:

                              +
                              from dataclasses import dataclass
                              +
                              +@dataclass(eq=False)
                              +class KnownBits:
                              +    ones : int
                              +    unknowns : int
                              +
                              +    def __post_init__(self):
                              +        if isinstance(self.ones, int):
                              +            assert self.is_well_formed()
                              +
                              +    def is_well_formed(self):
                              +        # a bit cannot be both 1 and unknown
                              +        return self.ones & self.unknowns == 0
                              +
                              +    @staticmethod
                              +    def from_constant(const : int):
                              +        """ Construct a KnownBits corresponding to a constant, where all bits
                              +        are known."""
                              +        return KnownBits(const, 0)
                              +
                              +    def is_constant(self):
                              +        """ Check if the KnownBits instance represents a constant. """
                              +        # it's a constant if there are no unknowns
                              +        return self.unknowns == 0
                              +
                              + +

                              We can also add some convenience properties. Sometimes it is easier to work +with an integer where all the known bits are set, or one where the positions +of all the known zeros have a set bit:

                              +
                              class KnownBits:
                              +    ...
                              +
                              +    @property
                              +    def knowns(self):
                              +        """ return an integer where the known bits are set. """
                              +        # the knowns are just the unknowns, inverted
                              +        return ~self.unknowns
                              +
                              +    @property
                              +    def zeros(self):
                              +        """ return an integer where the places that are known zeros have a bit
                              +        set. """
                              +        # it's a 0 if it is known, but not 1
                              +        return self.knowns & ~self.ones
                              +
                              + +

                              Also, for debugging and for writing tests we want a way to print the known bits +in a human-readable form, and also to have a way to construct a KnownBits +instance from a string. It's not important to understand the details of +__str__ or from_str for the rest of the post, so I'm putting them into a fold:

                              +
                              KnownBits from and to string conversions
                              class KnownBits:
                              +    ...
                              +
                              +    def __repr__(self):
                              +        if self.is_constant():
                              +            return f"KnownBits.from_constant({self.ones})"
                              +        return f"KnownBits({self.ones}, {self.unknowns})"
                              +
                              +    def __str__(self):
                              +        res = []
                              +        ones, unknowns = self.ones, self.unknowns
                              +        # construct the string representation right to left
                              +        while 1:
                              +            if not ones and not unknowns:
                              +                break # we leave off the leading known 0s
                              +            if ones == -1 and not unknowns:
                              +                # -1 has all bits set in two's complement, so the leading
                              +                # bits are all 1
                              +                res.append('1')
                              +                res.append("...")
                              +                break
                              +            if unknowns == -1:
                              +                # -1 has all bits set in two's complement, so the leading bits
                              +                # are all ?
                              +                assert not ones
                              +                res.append("?")
                              +                res.append("...")
                              +                break
                              +            if unknowns & 1:
                              +                res.append('?')
                              +            elif ones & 1:
                              +                res.append('1')
                              +            else:
                              +                res.append('0')
                              +            ones >>= 1
                              +            unknowns >>= 1
                              +        if not res:
                              +            res.append('0')
                              +        res.reverse()
                              +        return "".join(res)
                              +
                              +    @staticmethod
                              +    def from_str(s):
                              +        """ Construct a KnownBits instance that from a string. String can start
                              +        with ...1 to mean that all higher bits are 1, or ...? to mean that all
                              +        higher bits are unknown. Otherwise it is assumed that the higher bits
                              +        are all 0. """
                              +        ones, unknowns = 0, 0
                              +        startindex = 0
                              +        if s.startswith("...?"):
                              +            unknowns = -1
                              +            startindex = 4
                              +        elif s.startswith("...1"):
                              +            ones = -1
                              +            startindex = 4
                              +        for index in range(startindex, len(s)):
                              +            ones <<= 1
                              +            unknowns <<= 1
                              +            c = s[index]
                              +            if c == '1':
                              +                ones |= 1
                              +            elif c == '?':
                              +                unknowns |= 1
                              +        return KnownBits(ones, unknowns)
                              +
                              +    @staticmethod
                              +    def all_unknown():
                              +        """ convenience constructor for the "all bits unknown" abstract value
                              +        """
                              +        return KnownBits.from_str("...?")
                              +
                              + + + +

                              And here's a pytest-style unit test for str:

                              +
                              def test_str():
                              +    assert str(KnownBits.from_constant(0)) == '0'
                              +    assert str(KnownBits.from_constant(5)) == '101'
                              +    assert str(KnownBits(5, 0b10)) == '1?1'
                              +    assert str(KnownBits(~0b1111, 0b10)) == '...100?0'
                              +    assert str(KnownBits(1, ~0b1)) == '...?1'
                              +
                              + +

                              An instance of KnownBits represents a set of integers, namely those that match +the known bits stored in the instance. We can write a method contains that +takes a concrete int value and returns True if the value matches the +pattern of the known bits:

                              +
                              class KnownBits:
                              +    ...
                              +
                              +    def contains(self, value : int):
                              +        """ Check whether the KnownBits instance contains the concrete integer
                              +        `value`. """
                              +        # check whether value matches the bit pattern. in the places where we
                              +        # know the bits, the value must agree with ones.
                              +        return value & self.knowns == self.ones
                              +
                              + +

                              and a test:

                              +
                              def test_contains():
                              +    k1 = KnownBits.from_str('1?1')
                              +    assert k1.contains(0b111)
                              +    assert k1.contains(0b101)
                              +    assert not k1.contains(0b110)
                              +    assert not k1.contains(0b011)
                              +
                              +    k2 = KnownBits.from_str('...?1') # all odd numbers
                              +    for i in range(-101, 100):
                              +        assert k2.contains(i) == (i & 1)
                              +
                              + +

                              Transfer Functions

                              +

                              Now that we have implemented the basics of the KnownBits class, we need to +start implementing the transfer functions. They are for computing what we know +about the results of an operation, given the knowledge we have about the bits +of the arguments.

                              +

                              We'll start with a simple unary operation, invert(x) (which is ~x in Python +and C syntax), which flips all the bits of an integer. If we know some bits of +the arguments, we can compute the corresponding bits of the result. The unknown +bits remain unknown.

                              +

                              Here's the code:

                              +
                              class KnownBits:
                              +    ...
                              +
                              +    def abstract_invert(self):
                              +        # self.zeros has bits set where the known 0s are in self
                              +        return KnownBits(self.zeros, self.unknowns)
                              +
                              + +

                              And a unit-test:

                              +
                              def test_invert():
                              +    k1 = KnownBits.from_str('01?01?01?')
                              +    k2 = k1.abstract_invert()
                              +    assert str(k2) == '...10?10?10?'
                              +
                              +    k1 = KnownBits.from_str('...?')
                              +    k2 = k1.abstract_invert()
                              +    assert str(k2) == '...?'
                              +
                              + +

                              Before we continue with further transfer functions, we'll think about +correctness of the transfer functions and build up some test infrastructure. To +test transfer functions, it's quite important to move beyond simple example-style +unit tests. The state-space for more complicated binary transfer functions is +extremely large and it's too easy to do something wrong in a corner case. +Therefore we'll look at property-based tests for KnownBits next.

                              +

                              Property-based Tests with Hypothesis

                              +

                              We want to do property-based tests of KnownBits, to try to +make it less likely that we'll get a corner-case in the implementation wrong. +We'll use Hypothesis for that.

                              +

                              I can't give a decent introduction to Hypothesis here, but want to give a few +hints about the API. Hypothesis is a way to run unit tests with randomly +generated input. It provides strategies to describe the data that the test +functions expect. Hypothesis provides primitive strategies (for things like +integers, strings, floats, etc) and ways to build composite strategies out of +the primitive ones.

                              +

                              To be able to write the tests, we need to generate random KnownBits instances, +and we also want an int instance that is a member of the KnownBits instance. +We generate tuples of (KnownBits, int) together, to ensure this property. +We'll ask Hypothesis to generate us a random concrete int as the concrete +value, and then we'll also generate a second random int to use as the +unknown masks (i.e. which bits of the concrete int we don't know in the +KnownBits instance). Here's a function that takes two such ints and builds the +tuple:

                              +
                              def build_knownbits_and_contained_number(concrete_value : int, unknowns : int):
                              +    # to construct a valid KnownBits instance, we need to mask off the unknown
                              +    # bits
                              +    ones = concrete_value & ~unknowns
                              +    return KnownBits(ones, unknowns), concrete_value
                              +
                              + +

                              We can turn this function into a hypothesis strategy to generate input data +using the strategies.builds function:

                              +
                              from hypothesis import strategies, given, settings
                              +
                              +ints = strategies.integers()
                              +
                              +random_knownbits_and_contained_number = strategies.builds(
                              +    build_knownbits_and_contained_number,
                              +    ints, ints
                              +)
                              +
                              + +

                              One important special case of KnownBits are the constants, which contain only +a single concrete value. We'll also generate some of those specifically, and +then combine the random_knownbits_and_contained_number strategy with it:

                              +
                              constant_knownbits = strategies.builds(
                              +    lambda value: (KnownBits.from_constant(value), value),
                              +    ints
                              +)
                              +
                              +knownbits_and_contained_number = constant_knownbits | random_knownbits_and_contained_number
                              +
                              + +

                              Now we can write the first property-based tests, for the KnownBits.contains +method:

                              +
                              @given(knownbits_and_contained_number)
                              +def test_contains(t):
                              +    k, n = t
                              +    assert k.contains(n)
                              +
                              + +

                              The @given decorator is used to tell Hypothesis which strategy to use to +generate random data for the test function. Hypothesis will run the test with a +number of random examples (100 by default). If it finds an error, it will try to +minimize the example needed that demonstrates the problem, to try to make it +easier to understand what is going wrong. It also saves all failing cases into +an example database and tries them again on subsequent runs.

                              +

                              This test is as much a check for whether we got the strategies right as it is +for the logic in KnownBits.contains. Here's an example output of random +concrete and abstract values that we are getting here:

                              +
                              110000011001101 ...?0???1
                              +...1011011 ...1011011
                              +...1001101110101000010010011111011 ...1001101110101000010010011111011
                              +...1001101110101000010010011111011 ...100110111010100001?010?1??1??11
                              +1000001101111101001011010011111101000011000111011001011111101 1000001101111101001011010011111101000011000111011001011111101
                              +1000001101111101001011010011111101000011000111011001011111101 1000001101111101001011010011111101000011000111????01?11?????1
                              +1111100000010 1111100000010
                              +1111100000010 ...?11111?00000??
                              +110110 110110
                              +110110 ...?00?00????11??10
                              +110110 ??0??0
                              +...100010111011111 ...?100?10111??111?
                              +...1000100000110001 ...?000?00000??000?
                              +110000001110 ...?0?0??000?00?0?0000000?00???0000?????00???000?0?00?01?000?0??1??
                              +110000001110 ??000000???0
                              +1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000 1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000
                              +...1011010010010100 ...1011010010010100
                              +...1011111110110011 ...1011111110110011
                              +101000011110110 101000011?10?1?
                              +100101 ?00?0?
                              +
                              + +

                              That looks suitably random, but we might want to bias our random numbers a +little bit towards common error values like small constants, powers of two, etc. +Like this:

                              +
                              INTEGER_WIDTH = 64
                              +# some small integers
                              +ints_special = set(range(100))
                              +# powers of two
                              +ints_special = ints_special.union(1 << i for i in range(INTEGER_WIDTH - 2))
                              +# powers of two - 1
                              +ints_special = ints_special.union((1 << i) - 1 for i in range(INTEGER_WIDTH - 2))
                              +# negative versions of what we have so far
                              +ints_special = ints_special.union(-x for x in ints_special)
                              +# bit-flipped versions of what we have so far
                              +ints_special = ints_special.union(~x for x in ints_special)
                              +ints_special = list(ints_special)
                              +# sort them (because hypothesis simplifies towards earlier elements in the list)
                              +ints_special.sort(key=lambda element: (abs(element), element < 0))
                              +
                              +ints = strategies.sampled_from(ints_special) | strategies.integers()
                              +
                              + +

                              Now we get data like this:

                              +
                              1110 1110
                              +...10000000000000000001 ...10000??0??0000??00?1
                              +1 ??0??0000??00?1
                              +1 ?
                              +...10101100 ...10101100
                              +110000000011001010111011111111111111011110010001001100110001011 ...?0?101?
                              +110000000011001010111011111111111111011110010001001100110001011 ??00000000??00?0?0???0??????????????0????00?000?00??00??000?0??
                              +...1011111111111111111111111111 ...?11?11??
                              +...1011111111111111111111111111 ...?0??????????????????????????
                              +0 ...?0??????????????????????????
                              +101101 101101
                              +111111111111111111111111111111111111111111111 111111111111111111111111111111111111111111111
                              +10111 10111
                              +...101100 ...1?111011?0
                              +101000 ?001010?0
                              +101000 ?0?000
                              +110010 110010
                              +...100111 ...100111
                              +1111011010010 1111011010010
                              +...1000000000000000000000000000000000000 ...1000000000000000000000000000000000000
                              +
                              + +

                              We can also write a test that checks that the somewhat tricky logic in +__str__ and from_str is correct, by making sure that the two functions +round-trip (i.e. converting a KnownBits to a string and then back to a +KnownBits instance produces the same abstract value).

                              +
                              @given(knownbits_and_contained_number)
                              +def test_hypothesis_str_roundtrips(t1):
                              +    k1, n1 = t1
                              +    s = str(k1)
                              +    k2 = KnownBits.from_str(s)
                              +    assert k1.ones == k2.ones
                              +    assert k1.unknowns == k2.unknowns
                              +
                              + +

                              Now let's actually apply this infrastructure to test abstract_invert.

                              +

                              When are Transfer Functions Correct? How do we test them?

                              +

                              Abstract values, i.e. instances of KnownBits represent sets of concrete +values. We want the transfer functions to compute overapproximations of the +concrete values. So if we have an arbitrary abstract value k, with a concrete +number n that is a member of the abstract value (i.e. +k.contains(n) == True) then the result of the concrete operation op(n) +must be a member of the result of the abstract operation k.abstract_op() +(i.e. k.abstract_op().contains(op(n)) == True).

                              +

                              Checking the correctness/overapproximation property is a good match for +hypothesis. Here's what the test for abstract_invert looks like:

                              +
                              @given(knownbits_and_contained_number)
                              +def test_hypothesis_invert(t):
                              +    k1, n1 = t
                              +    n2 = ~n1 # compute the real result
                              +    k2 = k1.abstract_invert() # compute the abstract result
                              +    assert k2.contains(n2) # the abstract result must contain the real result
                              +
                              + +

                              This is the only condition needed for abstract_invert to be correct. If +abstract_invert fulfils this property for every combination of abstract and +concrete value then abstract_invert is correct. Note however, that this test +does not actually check whether abstract_invert gives us precise results. A +correct (but imprecise) implementation of abstract_invert would simply return +a completely unknown result, regardless of what is known about the input +KnownBits.

                              +

                              The "proper" CS term for this notion of correctness is soundness. The +correctness condition on the transfer functions is called a Galois +connection. I won't go into any mathematical/technical details here, but +wanted to at least mention the terms. I found Martin +Kellogg's +slides +to be quite an approachable introduction to the Galois connection and how to +show soundness.

                              +

                              Implementing Binary Transfer Functions

                              +

                              Now we have infrastructure in place for testing transfer functions with random +inputs. With that we can start thinking about the more complicated case, that of +binary operations. Let's start with the simpler ones, and and or. For and, +we can know a 0 bit in the result if either of the input bits are known 0; +or we can know a 1 bit in the result if both input bits are known 1. +Otherwise the resulting bit is unknown. Let's look at all the combinations:

                              +
                              and
                              +input1: 000111???
                              +input2: 01?01?01? 
                              +result: 00001?0??
                              +
                              + +
                              class KnownBits:
                              +    ...
                              +
                              +    def abstract_and(self, other):
                              +        ones = self.ones & other.ones # known ones
                              +        knowns = self.zeros | other.zeros | ones
                              +        return KnownBits(ones, ~knowns)
                              +
                              + +

                              Here's an example unit-test and a property-based test for and:

                              +
                              def test_and():
                              +    # test all combinations of 0, 1, ? in one example
                              +    k1 = KnownBits.from_str('01?01?01?')
                              +    k2 = KnownBits.from_str('000111???')
                              +    res = k1.abstract_and(k2)     # should be: 0...00001?0??
                              +    assert str(res) ==   "1?0??"
                              +
                              +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                              +def test_hypothesis_and(t1, t2):
                              +    k1, n1 = t1
                              +    k2, n2 = t2
                              +    k3 = k1.abstract_and(k2)
                              +    n3 = n1 & n2
                              +    assert k3.contains(n3)
                              +
                              + +

                              To implement or is pretty similar. The result is known 1 where either of the +inputs is 1. The result is known 0 where both inputs are known 0, and ? +otherwise.

                              +
                              or
                              +input1: 000111???
                              +input2: 01?01?01? 
                              +result: 01?111?1?
                              +
                              + +
                              class KnownBits:
                              +    ...
                              +
                              +    def abstract_or(self, other):
                              +        ones = self.ones | other.ones
                              +        zeros = self.zeros & other.zeros
                              +        knowns = ones | zeros
                              +        return KnownBits(ones, ~knowns)
                              +
                              + +

                              Here's an example unit-test and a property-based test for or:

                              +
                              def test_or():
                              +    k1 = KnownBits.from_str('01?01?01?')
                              +    k2 = KnownBits.from_str('000111???')
                              +    res = k1.abstract_or(k2)     # should be:  0...01?111?1?
                              +    assert str(res) ==   "1?111?1?"
                              +
                              +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                              +def test_hypothesis_or(t1, t2):
                              +    k1, n1 = t1
                              +    k2, n2 = t2
                              +    k3 = k1.abstract_or(k2)
                              +    n3 = n1 | n2
                              +    assert k3.contains(n3)
                              +
                              + +

                              Implementing support for abstract_xor is relatively simple, and left as an +exercise :-).

                              +

                              Addition and Subtraction

                              +

                              invert, and, and or are relatively simple transfer functions to write, +because they compose over the individual bits of the integers. The arithmetic +functions add and sub are significantly harder, because of carries and +borrows. Coming up with the formulas for them and gaining an intuitive +understanding is quite tricky and involves carefully going through a few +examples with pen and paper. When implementing this in PyPy, Nico and I didn't +come up with the implementations ourselves, but instead took them from the +Tristate Numbers paper. Here's the code, +with example tests and hypothesis tests:

                              +
                              class KnownBits:
                              +    ...
                              +
                              +    def abstract_add(self, other):
                              +        sum_ones = self.ones + other.ones
                              +        sum_unknowns = self.unknowns + other.unknowns
                              +        all_carries = sum_ones + sum_unknowns
                              +        ones_carries = all_carries ^ sum_ones
                              +        unknowns = self.unknowns | other.unknowns | ones_carries
                              +        ones = sum_ones & ~unknowns
                              +        return KnownBits(ones, unknowns)
                              +
                              +    def abstract_sub(self, other):
                              +        diff_ones = self.ones - other.ones
                              +        val_borrows = (diff_ones + self.unknowns) ^ (diff_ones - other.unknowns)
                              +        unknowns = self.unknowns | other.unknowns | val_borrows
                              +        ones = diff_ones & ~unknowns
                              +        return KnownBits(ones, unknowns)
                              +
                              +
                              +def test_add():
                              +    k1 = KnownBits.from_str('0?10?10?10')
                              +    k2 = KnownBits.from_str('0???111000')
                              +    res = k1.abstract_add(k2)
                              +    assert str(res) ==   "?????01?10"
                              +
                              +def test_sub():
                              +    k1 = KnownBits.from_str('0?10?10?10')
                              +    k2 = KnownBits.from_str('0???111000')
                              +    res = k1.abstract_sub(k2)
                              +    assert str(res) ==   "...?11?10"
                              +    k1 = KnownBits.from_str(    '...1?10?10?10')
                              +    k2 = KnownBits.from_str('...10000???111000')
                              +    res = k1.abstract_sub(k2)
                              +    assert str(res) ==   "111?????11?10"
                              +
                              +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                              +def test_hypothesis_add(t1, t2):
                              +    k1, n1 = t1
                              +    k2, n2 = t2
                              +    k3 = k1.abstract_add(k2)
                              +    n3 = n1 + n2
                              +    assert k3.contains(n3)
                              +
                              +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                              +def test_hypothesis_sub(t1, t2):
                              +    k1, n1 = t1
                              +    k2, n2 = t2
                              +    k3 = k1.abstract_sub(k2)
                              +    n3 = n1 - n2
                              +    assert k3.contains(n3)
                              +
                              + +

                              Now we are in a pretty good situation, and have implemented abstract versions +for a bunch of important arithmetic and binary functions. What's also surprising +is that the implementation of all of the transfer functions is quite efficient. +We didn't have to write loops over the individual bits at all, instead we found +closed form expressions using primitive operations on the underlying integers +ones and unknowns. This means that computing the results of abstract +operations is quite efficient, which is important when using the abstract domain +in the context of a JIT compiler.

                              +

                              Proving correctness of the transfer functions with Z3

                              +

                              As one can probably tell from my recent posts, I've been thinking about +compiler correctness a lot. Getting the transfer functions absolutely +correct is really crucial, because a bug in them would lead to miscompilation of +Python code when the abstract domain is added to the JIT. While the randomized +tests are great, it's still entirely possible for them to miss bugs. The state +space for the arguments of a binary transfer function is 3**64 * 3**64, and if +only a small part of that contains wrong behaviour it would be really unlikely +for us to find it with random tests by chance. Therefore I was reluctant to +merge the PyPy branch that contained the new abstract domain for a long time.

                              +

                              To increase our confidence in the correctness of the transfer functions further, +we can use Z3 to prove their correctness, which gives us much stronger +guarantees (not 100%, obviously). In this subsection I will show how to do that.

                              +

                              Here's an attempt to do this manually in the Python repl:

                              +
                              >>>> import z3
                              +>>>> solver = z3.Solver()
                              +>>>> # like last blog post, proof by failing to find counterexamples
                              +>>>> def prove(cond): assert solver.check(z3.Not(cond)) == z3.unsat
                              +>>>>
                              +>>>> # let's set up a z3 bitvector variable for an arbitrary concrete value
                              +>>>> n1 = z3.BitVec('concrete_value', 64)
                              +>>>> n1
                              +concrete_value
                              +>>>> # due to operator overloading we can manipulate z3 formulas
                              +>>>> n2 = ~n1
                              +>>>> n2
                              +~concrete_value
                              +>>>> 
                              +>>>> # now z3 bitvector variables for the ones and unknowns fields
                              +>>>> ones = z3.BitVec('abstract_ones', 64)
                              +>>>> unknowns = z3.BitVec('abstract_unknowns', 64)
                              +>>>> # we construct a KnownBits instance with the z3 variables
                              +>>>> k1 = KnownBits(ones, unknowns)
                              +>>>> # due to operator overloading we can call the methods on k1:
                              +>>>> k2 = k1.abstract_invert()
                              +>>>> k2.ones
                              +~abstract_unknowns & ~abstract_ones
                              +>>>> k2.unknowns
                              +abstract_unknowns
                              +>>>> # here's the correctness condition that we want to prove:
                              +>>>> k2.contains(n2)
                              +~concrete_value & ~abstract_unknowns ==
                              +~abstract_unknowns & ~abstract_ones
                              +>>>> # let's try
                              +>>>> prove(k2.contains(n2))
                              +Traceback (most recent call last):
                              +  File "<stdin>", line 1, in <module>
                              +  File "<stdin>", line 1, in prove
                              +AssertionError
                              +>>>> # it doesn't work! let's look at the counterexample to see why:
                              +>>>> solver.model()
                              +[abstract_unknowns = 0,
                              + abstract_ones = 0,
                              + concrete_value = 1]
                              +>>>> # we can build a KnownBits instance with the values in the
                              +>>>> # counterexample:
                              +>>>> ~1 # concrete result
                              +-2
                              +>>>> counter_example_k1 = KnownBits(0, 0)
                              +>>>> counter_example_k1
                              +KnownBits.from_constant(0)
                              +>>>> counter_example_k2 = counter_example_k1.abstract_invert()
                              +>>>> counter_example_k2
                              +KnownBits.from_constant(-1)
                              +>>>> # let's check the failing condition
                              +>>>> counter_example_k2.contains(~1)
                              +False
                              +
                              + +

                              What is the problem here? We didn't tell Z3 that n1 was supposed to be a +member of k1. We can add this as a precondition to the solver, and then the +proof works:

                              +
                              >>>> solver.add(k1.contains(n1))
                              +>>>> prove(k2.contains(n2)) # works!
                              +
                              + +

                              This is super cool! It's really a proof about the actual implementation, because +we call the implementation methods directly, and due to the operator overloading +that Z3 does we can be sure that we are actually checking a formula that +corresponds to the Python code. This eliminates one source of errors in formal +methods.

                              +

                              Doing the proof manually on the Python REPL is kind of annoying though, and we +also would like to make sure that the proofs are re-done when we change the +code. What we would really like to do is to write the proofs as unit tests that +we can run while developing and in CI. Doing this is possible, and the unit +tests that really perform proofs look pleasingly similar to the +Hypothesis-based ones.

                              +

                              First we need to set up a bit of infrastructure:

                              +
                              INTEGER_WIDTH = 64
                              +
                              +def BitVec(name):
                              +    return z3.BitVec(name, INTEGER_WIDTH)
                              +
                              +def BitVecVal(val):
                              +    return z3.BitVecVal(val, INTEGER_WIDTH)
                              +
                              +def z3_setup_variables():
                              +    # instantiate a solver
                              +    solver = z3.Solver()
                              +
                              +    # a Z3 variable for the first concrete value
                              +    n1 = BitVec("n1")
                              +    # a KnownBits instance that uses Z3 variables as its ones and unknowns,
                              +    # representing the first abstract value
                              +    k1 = KnownBits(BitVec("n1_ones"), BitVec("n1_unkowns"))
                              +    # add the precondition to the solver that the concrete value n1 must be a
                              +    # member of the abstract value k1
                              +    solver.add(k1.contains(n1))
                              +
                              +    # a Z3 variable for the second concrete value
                              +    n2 = BitVec("n2")
                              +    # a KnownBits instance for the second abstract value
                              +    k2 = KnownBits(BitVec("n2_ones"), BitVec("n2_unkowns"))
                              +    # add the precondition linking n2 and k2 to the solver
                              +    solver.add(k2.contains(n2))
                              +    return solver, k1, n1, k2, n2
                              +
                              +def prove(cond, solver):
                              +    z3res = solver.check(z3.Not(cond))
                              +    if z3res != z3.unsat:
                              +        assert z3res == z3.sat # can't be timeout, we set no timeout
                              +        # make the model with the counterexample global, to make inspecting the
                              +        # bug easier when running pytest --pdb
                              +        global model
                              +        model = solver.model()
                              +        print(f"n1={model.eval(n1)}, n2={model.eval(n2)}")
                              +        counter_example_k1 = KnownBits(model.eval(k1.ones).as_signed_long(),
                              +                                       model.eval(k1.unknowns).as_signed_long())
                              +        counter_example_k2 = KnownBits(model.eval(k2.ones).as_signed_long(),
                              +                                       model.eval(k2.unknowns).as_signed_long())
                              +        print(f"k1={counter_example_k1}, k2={counter_example_k2}")
                              +        print(f"but {cond=} evaluates to {model.eval(cond)}")
                              +        raise ValueError(solver.model())
                              +
                              + +

                              And then we can write proof-unit-tests like this:

                              +
                              def test_z3_abstract_invert():
                              +    solver, k1, n1, _, _ = z3_setup_variables()
                              +    k2 = k1.abstract_invert()
                              +    n2 = ~n1
                              +    prove(k2.contains(n2), solver)
                              +
                              +def test_z3_abstract_and():
                              +    solver, k1, n1, k2, n2 = z3_setup_variables()
                              +    k3 = k1.abstract_and(k2)
                              +    n3 = n1 & n2
                              +    prove(k3.contains(n3), solver)
                              +
                              +def test_z3_abstract_or():
                              +    solver, k1, n1, k2, n2 = z3_setup_variables()
                              +    k3 = k1.abstract_or(k2)
                              +    n3 = n1 | n2
                              +    prove(k3.contains(n3), solver)
                              +
                              +def test_z3_abstract_add():
                              +    solver, k1, n1, k2, n2 = z3_setup_variables()
                              +    k3 = k1.abstract_add(k2)
                              +    n3 = n1 + n2
                              +    prove(k3.contains(n3), solver)
                              +
                              +def test_z3_abstract_sub():
                              +    solver, k1, n1, k2, n2 = z3_setup_variables()
                              +    k3 = k1.abstract_sub(k2)
                              +    n3 = n1 - n2
                              +    prove(k3.contains(n3), solver)
                              +
                              + +

                              It's possible to write a bit more Python-metaprogramming-magic and unify the +Hypothesis and Z3 tests into the same test definition.1

                              +

                              Cases where this style of Z3 proof doesn't work

                              +

                              Unfortunately the approach described in the previous section only works for a +very small number of cases. It breaks down as soon as the KnownBits methods +that we're calling contain any if conditions (including hidden ones like +the short-circuiting and and or in Python). Let's look at an example and +implement abstract_eq. eq is supposed to be an operation that compares two +integers and returns 0 or 1 if they are different or equal, respectively. +Implementing this in knownbits looks like this (with example and hypothesis +tests):

                              +
                              class KnownBits:
                              +    ...
                              +
                              +    def abstract_eq(self, other):
                              +        # the result is a 0, 1, or ?
                              +
                              +        # if they are both the same constant, they must be equal
                              +        if self.is_constant() and other.is_constant() and self.ones == other.ones:
                              +            return KnownBits.from_constant(1)
                              +        # check whether we have known disagreeing bits, then we know the result
                              +        # is 0
                              +        if self._disagrees(other):
                              +            return KnownBits.from_constant(0)
                              +        return KnownBits(0, 1) # an unknown boolean
                              +
                              +    def _disagrees(self, other):
                              +        # check whether the bits disagree in any place where both are known
                              +        both_known = self.knowns & other.knowns
                              +        return self.ones & both_known != other.ones & both_known
                              +
                              +def test_eq():
                              +    k1 = KnownBits.from_str('...?')
                              +    k2 = KnownBits.from_str('...?')
                              +    assert str(k1.abstract_eq(k2)) == '?'
                              +    k1 = KnownBits.from_constant(10)
                              +    assert str(k1.abstract_eq(k1)) == '1'
                              +    k1 = KnownBits.from_constant(10)
                              +    k2 = KnownBits.from_constant(20)
                              +    assert str(k1.abstract_eq(k2)) == '0'
                              +
                              +@given(knownbits_and_contained_number, knownbits_and_contained_number)
                              +def test_hypothesis_eq(t1, t2):
                              +    k1, n1 = t1
                              +    k2, n2 = t2
                              +    k3 = k1.abstract_eq(k2)
                              +    assert k3.contains(int(n1 == n2))
                              +
                              + +

                              Trying to do the proof in the same style as before breaks:

                              +
                              >>>> k3 = k1.abstract_eq(k2)
                              +Traceback (most recent call last):
                              +  File "<stdin>", line 1, in <module>
                              +  File "knownbits.py", line 246, in abstract_eq
                              +    if self._disagrees(other):
                              +  File "venv/site-packages/z3/z3.py", line 381, in __bool__
                              +    raise Z3Exception("Symbolic expressions cannot be cast to concrete Boolean values.")
                              +z3.z3types.Z3Exception: Symbolic expressions cannot be cast to concrete Boolean values.
                              +
                              + +

                              We cannot call abstract_eq on a KnownBits with Z3 variables as fields, +because once we hit an if statement, the whole approach of relying on the +operator overloading breaks down. Z3 doesn't actually parse the Python code or +anything advanced like that, we rather build an expression only by running the +code and letting the Z3 formulas build up.

                              +

                              To still prove the correctness of abstract_eq we need to manually transform +the control flow logic of the function into a Z3 formula that uses the z3.If +expression, using a small helper function:

                              +
                              def z3_cond(b, trueval=1, falseval=0):
                              +    return z3.If(b, BitVecVal(trueval), BitVecVal(falseval))
                              +
                              +def z3_abstract_eq(k1, k2):
                              +    # follow the *logic* of abstract_eq, we can't call it due to the ifs in it
                              +    case1cond = z3.And(k1.is_constant(), k2.is_constant(), k1.ones == k2.ones)
                              +    case2cond = k1._disagrees(k2)
                              +
                              +    # ones is 1 in the first case, 0 otherwise
                              +    ones = z3_cond(case1cond, 1, 0)
                              +
                              +    # in the first two cases, unknowns is 0, 1 otherwise
                              +    unknowns = z3_cond(z3.Or(case1cond, case2cond), 0, 1)
                              +    return KnownBits(ones, unknowns)
                              +
                              +def test_z3_abstract_eq_logic():
                              +    solver, k1, n1, k2, n2 = z3_setup_variables()
                              +    n3 = z3_cond(n1 == n2) # concrete result
                              +    k3 = z3_abstract_eq(k1, k2)
                              +    prove(k3.contains(n3), solver)
                              +
                              + +

                              This proof works. It is a lot less satisfying than the previous ones though, +because we could have made an error in the manual transcription from Python code +to Z3 formulas (there are possibly more heavy-handed approaches where we do +this transformation more automatically using e.g. the ast module to analyze +the source code, but that's a much more complicated researchy project). To +lessen this problem somewhat we can factor out the parts of the logic that don't +have any conditions into small helper methods (like _disagrees in this +example) and use them in the manual conversion of the code to Z3 formulas.2

                              +

                              The final condition that Z3 checks, btw, is this one:

                              +
                              If(n1 == n2, 1, 0) &
                              +~If(Or(And(n1_unkowns == 0,
                              +           n2_unkowns == 0,
                              +           n1_ones == n2_ones),
                              +       n1_ones & ~n1_unkowns & ~n2_unkowns !=
                              +       n2_ones & ~n1_unkowns & ~n2_unkowns),
                              +    0, 1) ==
                              +If(And(n1_unkowns == 0, n2_unkowns == 0, n1_ones == n2_ones),
                              +   1, 0)
                              +
                              + +

                              Making Statements about Precision

                              +

                              So far we have only used Z3 to prove statements about correctness, i.e. that +our abstract operations overapproximate what can happen with concrete values. +While proving this property is essential if we want to avoid miscompilation, +correctness alone is not a very strong constraint on the implementation of our +abstract transfer functions. We could simply return KnownBits.all_unknown() for +every abstract_* method and the resulting overapproximation would be correct, +but useless in practice.

                              +

                              It's much harder to make statements about whether the transfer functions are +maximally precise. There are two aspects of precision I want to discuss in this +section, however.

                              +

                              The first aspect is that we would really like it if the transfer functions +compute the maximally precise results for singleton sets. If all abstract +arguments of an operation are constants, i.e. contain only a single concrete +element, then we know that the resulting set also has only a single element. We +can prove that all our transfer functions have this property:

                              +
                              def test_z3_prove_constant_folding():
                              +    solver, k1, n1, k2, n2 = z3_setup_variables()
                              +    k3 = k1.abstract_invert()
                              +    prove(z3.Implies(k1.is_constant(),
                              +                     k3.is_constant()), solver)
                              +
                              +    k3 = k1.abstract_and(k2)
                              +    prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),
                              +                     k3.is_constant()), solver)
                              +
                              +    k3 = k1.abstract_or(k2)
                              +    prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),
                              +                     k3.is_constant()), solver)
                              +
                              +    k3 = k1.abstract_sub(k2)
                              +    prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),
                              +                     k3.is_constant()), solver)
                              +
                              +    k3 = z3_abstract_eq(k1, k2)
                              +    prove(z3.Implies(z3.And(k1.is_constant(), k2.is_constant()),
                              +                     k3.is_constant()), solver)
                              +
                              + +

                              Proving with Z3 that the transfer functions are maximally precise for +non-constant arguments seems to be relatively hard. I tried a few completely +rigorous approaches and failed. The paper Sound, Precise, and Fast Abstract +Interpretation with Tristate Numbers +contains an optimality proof for the transfer functions of addition and +subtraction, so we can be certain that they are as precise as is +possible.

                              +

                              I still want to show an approach for trying to find concrete examples of +abstract values that are less precise than they could be, using a combination +of Hypothesis and Z3. The idea is to use hypothesis to pick random abstract +values. Then we compute the abstract result using our transfer function. +Afterwards we can ask Z3 to find us an abstract result that is better than the +one our transfer function produced. If Z3 finds a better abstract result, we +have a concrete example of imprecision for our transfer function. Those tests +aren't strict proofs, because they rely on generating random abstract values, +but they can still be valuable (not for the transfer functions in this blog +post, which are all optimal).

                              +

                              Here is what the code looks like (this is a little bit of bonus content, I'll not +explain the details and can only hope that the comments are somewhat helpful):

                              +
                              @given(random_knownbits_and_contained_number, random_knownbits_and_contained_number)
                              +@settings(deadline=None)
                              +def test_check_precision(t1, t2):
                              +    k1, n1 = t1
                              +    k2, n2 = t2
                              +    # apply transfer function
                              +    k3 = k1.abstract_add(k2)
                              +    example_res = n1 + n2
                              +
                              +    # try to find a better version of k3 with Z3
                              +    solver = z3.Solver()
                              +    solver.set("timeout", 8000)
                              +
                              +    var1 = BitVec('v1')
                              +    var2 = BitVec('v2')
                              +
                              +    ones = BitVec('ones')
                              +    unknowns = BitVec('unknowns')
                              +    better_k3 = KnownBits(ones, unknowns)
                              +    print(k1, k2, k3)
                              +
                              +    # we're trying to find an example for a better k3, so we use check, without
                              +    # negation:
                              +    res = solver.check(z3.And(
                              +        # better_k3 should be a valid knownbits instance
                              +        better_k3.is_well_formed(),
                              +        # it should be better than k3, ie there are known bits in better_k3
                              +        # that we don't have in k3
                              +        better_k3.knowns & ~k3.knowns != 0,
                              +        # now encode the correctness condition for better_k3 with a ForAll:
                              +        # for all concrete values var1 and var2, it must hold that if
                              +        # var1 is in k1 and var2 is in k2 it follows that var1 + var2 is in
                              +        # better_k3
                              +        z3.ForAll(
                              +        [var1, var2],
                              +        z3.Implies(
                              +            z3.And(k1.contains(var1), k2.contains(var2)),
                              +            better_k3.contains(var1 + var2)))))
                              +    # if this query is satisfiable, we have found a better result for the
                              +    # abstract_add
                              +    if res == z3.sat:
                              +        model = solver.model()
                              +        rk3 = KnownBits(model.eval(ones).as_signed_long(), model.eval(unknowns).as_signed_long())
                              +        print("better", rk3)
                              +        assert 0
                              +    if res == z3.unknown:
                              +        print("timeout")
                              +
                              + +

                              It does not actually fail for abstract_add (nor the other abstract +functions). To see the test failing we can add some imprecision to the +implementation of abstract_add to see Hypothesis and Z3 find examples of +values that are not optimally precise (for example by setting some bits +of unknowns in the implementation of abstract_add unconditionally).

                              +

                              Using the Abstract Domain in the Toy Optimizer for Generalized Constant Folding

                              +

                              Now after all this work we can finally actually use the knownbits abstract +domain in the toy optimizer. The code for this follows Max' intro post about +abstract interpretation +quite closely.

                              +

                              For completeness' sake, in the fold there are the basic infrastructure classes +that make up the IR again (they are identical or at least extremely close to +the previous toy posts).

                              +
                              toy infrastructure
                              class Value:
                              +    def find(self):
                              +        raise NotImplementedError("abstract")
                              +
                              +
                              +@dataclass(eq=False)
                              +class Operation(Value):
                              +    name : str
                              +    args : list[Value]
                              +
                              +    forwarded : Optional[Value] = None
                              +
                              +    def find(self) -> Value:
                              +        op = self
                              +        while isinstance(op, Operation):
                              +            next = op.forwarded
                              +            if next is None:
                              +                return op
                              +            op = next
                              +        return op
                              +
                              +    def arg(self, index):
                              +        return self.args[index].find()
                              +
                              +    def make_equal_to(self, value : Value):
                              +        self.find().forwarded = value
                              +
                              +
                              +@dataclass(eq=False)
                              +class Constant(Value):
                              +    value : object
                              +
                              +    def find(self):
                              +        return self
                              +
                              +
                              +class Block(list):
                              +    def __getattr__(self, opname):
                              +        def wraparg(arg):
                              +            if not isinstance(arg, Value):
                              +                arg = Constant(arg)
                              +            return arg
                              +        def make_op(*args):
                              +            op = Operation(opname,
                              +                [wraparg(arg) for arg in args])
                              +            self.append(op)
                              +            return op
                              +        return make_op
                              +
                              +
                              +def bb_to_str(l : Block, varprefix : str = "var"):
                              +    def arg_to_str(arg : Value):
                              +        if isinstance(arg, Constant):
                              +            return str(arg.value)
                              +        else:
                              +            return varnames[arg]
                              +
                              +    varnames = {}
                              +    res = []
                              +    for index, op in enumerate(l):
                              +        # give the operation a name used while
                              +        # printing:
                              +        var =  f"{varprefix}{index}"
                              +        varnames[op] = var
                              +        arguments = ", ".join(
                              +            arg_to_str(op.arg(i))
                              +                for i in range(len(op.args))
                              +        )
                              +        strop = f"{var} = {op.name}({arguments})"
                              +        res.append(strop)
                              +    return "\n".join(res)
                              +
                              + + + +

                              Now we can write some first tests, the first one simply checking constant +folding:

                              +
                              def test_constfold_two_ops():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.int_add(5, 4)
                              +    var2 = bb.int_add(var1, 10)
                              +    var3 = bb.int_add(var2, var0)
                              +
                              +    opt_bb = simplify(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = int_add(19, optvar0)"""
                              +
                              + +

                              Calling the transfer functions on constant KnownBits produces a constant +result, as we have seen. Therefore "regular" constant folding should hopefully +be achieved by optimizing with the KnownBits abstract domain too.

                              +

                              The next two tests are slightly more complicated and can't be optimized by +regular constant-folding. They follow the motivating examples from the start of +this blog post, a hundred years ago:

                              +
                              def test_constfold_via_knownbits():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.int_or(var0, 1)
                              +    var2 = bb.int_and(var1, 1)
                              +    var3 = bb.dummy(var2)
                              +
                              +    opt_bb = simplify(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = int_or(optvar0, 1)
                              +optvar2 = dummy(1)"""
                              +
                              +def test_constfold_alignment_check():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.int_invert(0b111)
                              +    # mask off the lowest three bits, thus var2 is aligned
                              +    var2 = bb.int_and(var0, var1)
                              +    # add 16 to aligned quantity
                              +    var3 = bb.int_add(var2, 16)
                              +    # check alignment of result
                              +    var4 = bb.int_and(var3, 0b111)
                              +    var5 = bb.int_eq(var4, 0)
                              +    # var5 should be const-folded to 1
                              +    var6 = bb.dummy(var5)
                              +
                              +    opt_bb = simplify(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = int_and(optvar0, -8)
                              +optvar2 = int_add(optvar1, 16)
                              +optvar3 = dummy(1)"""
                              +
                              + +

                              Here is simplify to make these tests pass:

                              +
                              def unknown_transfer_functions(*abstract_args):
                              +    return KnownBits.all_unknown()
                              +
                              +
                              +def simplify(bb: Block) -> Block:
                              +    abstract_values = {} # dict mapping Operation to KnownBits
                              +
                              +    def knownbits_of(val : Value):
                              +        if isinstance(val, Constant):
                              +            return KnownBits.from_constant(val.value)
                              +        return abstract_values[val]
                              +
                              +    opt_bb = Block()
                              +    for op in bb:
                              +        # apply the transfer function on the abstract arguments
                              +        name_without_prefix = op.name.removeprefix("int_")
                              +        method_name = f"abstract_{name_without_prefix}"
                              +        transfer_function = getattr(KnownBits, method_name, unknown_transfer_functions)
                              +        abstract_args = [knownbits_of(arg.find()) for arg in op.args]
                              +        abstract_res = abstract_values[op] = transfer_function(*abstract_args)
                              +        # if the result is a constant, we optimize the operation away and make
                              +        # it equal to the constant result
                              +        if abstract_res.is_constant():
                              +            op.make_equal_to(Constant(abstract_res.ones))
                              +            continue
                              +        # otherwise emit the op
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              + +

                              The code follows the approach from the previous blog post very closely. The +only difference is that we apply the transfer function first, to be able to +detect whether the abstract domain can tell us that the result has to always be +a constant. This code makes all three tests pass.

                              +

                              Using the KnownBits Domain for Conditional Peephole Rewrites

                              +

                              So far we are only using the KnownBits domain to find out that certain +operations have to produce a constant. We can also use the KnownBits domain +to check whether certain operation rewrites are correct. Let's use one of the +examples from the Mining JIT traces for missing optimizations with +Z3 +post, where Z3 found the inefficiency (x << 4) & -0xf == x << 4 in PyPy JIT +traces. We don't have shift operations, but we want to generalize this optimization +anyway. The general form of this rewrite is that under some circumstances x & +y == x, and we can use the KnownBits domain to detect situations where this +must be true.

                              +

                              To understand when x & y == x is true, we can think about individual pairs of +bits a and b. If a == 0, then a & b == 0 & b == 0 == a. If b == 1 +then a & b == a & 1 == a. So if either a == 0 or b == 1 is true, +a & b == a follows. And if either of these conditions is true for all the +bits of x and y, we can know that x & y == x.

                              +

                              We can write a method on KnownBits to check for this condition:

                              +
                              class KnownBits:
                              +    ...
                              +
                              +    def is_and_identity(self, other):
                              +        """ Return True if n1 & n2 == n1 for any n1 in self and n2 in other.
                              +        (or, equivalently, return True if n1 | n2 == n2)"""
                              +        return self.zeros | other.ones == -1
                              +
                              + +

                              Since my reasoning about this feels ripe for errors, let's check that our +understanding is correct with Z3:

                              +
                              def test_prove_is_and_identity():
                              +    solver, k1, n1, k2, n2 = z3_setup_variables()
                              +    prove(z3.Implies(k1.is_and_identity(k2), n1 & n2 == n1), solver)
                              +
                              + +

                              Now let's use this in the toy optimizer. Here are two tests for this rewrite:

                              +
                              def test_remove_redundant_and():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.int_invert(0b1111)
                              +    # mask off the lowest four bits
                              +    var2 = bb.int_and(var0, var1)
                              +    # applying the same mask is now redundant
                              +    var3 = bb.int_and(var2, var1)
                              +    var4 = bb.dummy(var3)
                              +
                              +    opt_bb = simplify(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = int_and(optvar0, -16)
                              +optvar2 = dummy(optvar1)"""
                              +
                              +def test_remove_redundant_and_more_complex():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.getarg(1)
                              +    # var2 has bit pattern ????
                              +    var2 = bb.int_and(var0, 0b1111)
                              +    # var3 has bit pattern ...?1111
                              +    var3 = bb.int_or(var1, 0b1111)
                              +    # var4 is just var2
                              +    var4 = bb.int_and(var2, var3)
                              +    var5 = bb.dummy(var4)
                              +
                              +    opt_bb = simplify(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = getarg(1)
                              +optvar2 = int_and(optvar0, 15)
                              +optvar3 = int_or(optvar1, 15)
                              +optvar4 = dummy(optvar2)"""
                              +
                              + +

                              The first test could also be made to pass by implementing a reassociation +optimization that turns (x & c1) & c2 into x & (c1 & c2) and then constant-folds the second and. But here we want to +use KnownBits and conditionally rewrite int_and to its first argument. So to make the tests pass, +we can change simplify like this:

                              +
                              def simplify(bb: Block) -> Block:
                              +    abstract_values = {} # dict mapping Operation to KnownBits
                              +
                              +    def knownbits_of(val : Value):
                              +        ...
                              +
                              +    opt_bb = Block()
                              +    for op in bb:
                              +        # apply the transfer function on the abstract arguments
                              +        name_without_prefix = op.name.removeprefix("int_")
                              +        method_name = f"abstract_{name_without_prefix}"
                              +        transfer_function = getattr(KnownBits, method_name, unknown_transfer_functions)
                              +        abstract_args = [knownbits_of(arg.find()) for arg in op.args]
                              +        abstract_res = abstract_values[op] = transfer_function(*abstract_args)
                              +        # if the result is a constant, we optimize the operation away and make
                              +        # it equal to the constant result
                              +        if abstract_res.is_constant():
                              +            op.make_equal_to(Constant(abstract_res.ones))
                              +            continue
                              +        # <<<< new code
                              +        # conditionally rewrite int_and(x, y) to x
                              +        if op.name == "int_and":
                              +            k1, k2 = abstract_args
                              +            if k1.is_and_identity(k2):
                              +                op.make_equal_to(op.arg(0))
                              +                continue
                              +        # >>>> end changes
                              +        opt_bb.append(op)
                              +    return opt_bb
                              +
                              + +

                              And with that, the new tests pass as well. A real implementation would also +check the other argument order, but we leave that out for the sake of brevity.

                              +

                              This rewrite also generalizes the rewrites int_and(0, x) -> 0 and +int_and(-1, x) -> x, let's add a test for those:

                              +
                              def test_remove_and_simple():
                              +    bb = Block()
                              +    var0 = bb.getarg(0)
                              +    var1 = bb.getarg(1)
                              +    var2 = bb.int_and(0, var0) # == 0
                              +    var3 = bb.int_invert(var2) # == -1
                              +    var4 = bb.int_and(var1, var3) # == var1
                              +    var5 = bb.dummy(var4)
                              +
                              +    opt_bb = simplify(bb)
                              +    assert bb_to_str(opt_bb, "optvar") == """\
                              +optvar0 = getarg(0)
                              +optvar1 = getarg(1)
                              +optvar2 = dummy(optvar1)"""
                              +
                              + +

                              This test just passes. And that's it for this post!

                              +

                              Conclusion

                              +

                              In this post we've seen the implementation, testing and proofs about a 'known +bits' abstract domain, as well as its use in the toy optimizer to generalize +constant folding, and to implement conditional peephole rewrites.

                              +

                              In the next posts I'll write about the real implementation of a knownbits +domain in PyPy's JIT, its combination with the existing interval abstract +domain, how to deal with gaining information from conditions in the program, +and some loose ends.

                              +

                              Sources:

                              + +
                              +
                              +
                                +
                              1. +

                                There's a subtlety about the Z3 proofs that I'm sort of +glossing over here. Python integers are of arbitrary width, and the +KnownBits code is actually carefully written to work for integers of any +size. This property is tested by the Hypothesis tests, which don't limit +the sizes of the generated random integers. However, the Z3 proofs only +check bitvectors of a fixed bitwidth of 64. There are various ways to deal +with this situation. For most "real" compilers, the bitwidth of integers +would be fixed anyway. Then the components ones and unknowns of the +KnownBits class would use the number of bits the corresponding integer +variable has, and the Z3 proofs would use the same width. This is what we +do in the PyPy JIT. 

                                +
                              2. +
                              3. +

                                The less close connection between implementation and proof +for abstract_eq is one of the reasons why it makes sense to do +unit-testing in addition to proofs. For a more detailed explanation of +why both tests and proofs are good to +have, see Jeremy Siek's blog +post, +as well as the Knuth +quote

                                +
                              4. +
                              +
                              +
                              +

                              Comments

                              +
                              +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/pypy-sponsors.html b/pypy-sponsors.html new file mode 100644 index 000000000..fe1642e97 --- /dev/null +++ b/pypy-sponsors.html @@ -0,0 +1,152 @@ + + + + + +PyPy Sponsors and Consultants | PyPy + + + + + + + + + + + + + + + + + + + + + + + Skip to main content +
                              +

                              PyPy Sponsors and Consultants

                              + + +
                              +

                              Keeping a project as ambitious as PyPy going requires resources. Sometimes the +problems encountered are general, like updating Python versions or supporting +various c-extensions. Sometimes the problems are specific and require precise +solutions that may not generalize to all users. Likewise, sponsorship of PyPy +can be general or specific.

                              +

                              General PyPy Sponsorship

                              +

                              PyPy has had many financial contributors in the +past. We are grateful to them, and to the following current sponsors:

                              + +

                              PyPy Consulting Work

                              +
                                +
                              • +

                                Baroque Software is an innovative company that + has been doing performance-oriented consulting for a variety of the biggest + players on the market since 2007. Please reach out to their team for + help making PyPy fulfill its potential in your application.

                                +
                              • +
                              • +

                                Matti Picus, the PyPy release manager, has been + pushing PyPy into the Python ecosystem since 2016: dealing with + packaging, compatibility, and performance. He works at + Quansight and is available for projects.

                                +
                              • +
                              +
                              + +
                              +
                              + + \ No newline at end of file diff --git a/robots.txt b/robots.txt new file mode 100644 index 000000000..df6e3c222 --- /dev/null +++ b/robots.txt @@ -0,0 +1,4 @@ +Sitemap: https://www.pypy.org/sitemapindex.xml + +User-Agent: * +Host: www.pypy.org diff --git a/rss.xml b/rss.xml new file mode 100644 index 000000000..ec96b661f --- /dev/null +++ b/rss.xml @@ -0,0 +1,3704 @@ + +PyPyhttps://www.pypy.org/A Faster PythonenContents © 2024 <a href="mailto:pypy-dev@pypy.org">The PyPy Team</a> Sat, 31 Aug 2024 17:48:12 GMTNikola (getnikola.com)http://blogs.law.harvard.edu/tech/rssGuest Post: How PortaOne uses PyPy for high-performance processing, connecting over 1B of phone calls every monthhttps://www.pypy.org/posts/2024/08/portaone.htmlThe PyPy Team<p>The PyPy project is always happy to hear about industrial use and deployments +of PyPy. For the <a href="https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.html">GC bug +finding</a> +task earlier this year, we collaborated with PortaOne and we're super happy +that Serhii Titov, head of the QA department at PortaOne, was up to writing +this guest post to describe their use and experience with the project.</p> +<hr> +<h3 id="what-does-portaone-do">What does PortaOne do?</h3> +<p>We at <a href="https://www.portaone.com/">PortaOne Inc.</a> allow telecom operators to +launch new services (or provide existing services more efficiently) using our +VoIP platform (PortaSIP) and our real-time charging system (PortaBilling), +which provides additional features for cloud PBX, such as call transfer, +queues, interactive voice response (IVR) and more. At this moment our support +team manages several thousand servers with our software installed in 100 +countries, through which over 500 telecommunication service providers connect +millions of end users every day. 
The unique thing about PortaOne is that we +supply the source code of our product to our customers - something unheard of +in the telecom world! Thus we attract "telco innovators", who use our APIs to +build around the system and the source code to create unique tweaks of +functionality, which produces amazing products.</p> +<p>At the core of PortaSIP is the middle-ware component (the proper name for it is +"B2BUA", but that probably does not say much to anyone outside of experts in +VoIP), which implements the actual handling of SIP calls, messages, etc. and +all added features (for instance, trying to send a call via telco operators +through which the cost per minute is lower). It has to be fast (since even a +small delay in establishing a call is noticed by a customer), reliable +(everyone hates when a call drops or cannot be completed) and yet easily +expandable with new functionality. This is why we decided to use Python as +opposed to C/C++ or similar programming languages, which are often used in +telecom equipment.</p> +<p>The B2BUA component is a batch of similar Python processes that are looped +inside a +<a href="https://docs.python.org/3.10/library/asyncore.html"><code>asyncore.dispatcher</code></a> +wrapper. The load balancing between these Python processes is done by our +stateless SIP proxy server written in C++. All our sockets are served by this +B2BUA. We have our custom client-wrappers around <code>pymysql</code>, <code>redis</code>, +<code>cassandra-driver</code> and <code>requests</code> to communicate with external services. Some +of the Python processes use <a href="https://cffi.readthedocs.io/en/stable/"><code>cffi</code></a> +wrappers around C-code to improve their performance (examples: an Oracle DB +driver, a client to a radius server, a custom C logger).</p> +<p>The I/O operations that block the main thread of the Python processes are +processed in sub-threads. 
We have custom wrappers around <code>threading.Thread</code> +and also <code>asyncore.dispatcher</code>. The results of such operations are returned to +the main thread.</p> +<h3 id="improving-our-performance-with-pypy">Improving our performance with PyPy</h3> +<p>We started with CPython and then in 2014 switched to PyPy because it was +faster. Here's an exact quote from our first testing notes: "PyPy gives +significant performance boost, ~50%". Nowadays, after years of changes in all +the software involved, PyPy still gives us +50% boost compared to CPython.</p> +<p>Taking care of real time traffic for so many people around the globe is +something we're really proud of. I hope the PyPy team can be proud of it as +well, as the PyPy product is a part of this solution.</p> +<h3 id="finding-a-garbage-collector-bug-stage-1-the-gc-hooks">Finding a garbage collector bug: stage 1, the GC hooks</h3> +<p>However our path with PyPy wasn't perfectly smooth. There were very rare cases +of crashes on PyPy that we weren't able to catch. That's because to make +coredump useful we needed to switch to PyPy with debug, but we cannot let it +run in that mode on a production system for an extended period of time, and we +did not have any STR (steps-to-reproduce) to make PyPy crash again in our lab. +That's why we kept (and still keep) both interpreters installed just in case, +and we would switch to CPython if we noticed it happening.</p> +<p>At the time of updating PyPy from 3.5 to 3.6 our QA started noticing those +crashes more often, but we still had no luck with STR or collecting proper +coredumps with debug symbols. Then it became even worse after our development +played with the <a href="https://doc.pypy.org/en/latest/gc_info.html">Garbage Collector's +options</a> to increase performance +of our middleware component. The crashes started to affect our regular +performance testing (controlled by QA manager Yevhenii Bovda). 
At that point it +was decided that we can no longer live like that and so we started an intense +investigation.</p> +<p>During the first stage of our investigation (following the best practice of +troubleshooting) we narrowed down the issue as much as we could. So, it was not +our code, it was definitely somewhere in PyPy. Eventually our SIP software +engineer <a href="https://github.com/Yevhenii-Yatchenko">Yevhenii Yatchenko</a> found out +that this bug is connected with the use of our <a href="https://doc.pypy.org/en/latest/gc_info.html#gc-hooks">custom hooks in the +GC</a>. Yevhenii created +ticket <a href="https://github.com/pypy/pypy/issues/4899">#4899</a> and within 2-3 days we +got a fix from a <a href="https://github.com/cfbolz">member of the PyPy team</a>, in true open-source fashion.</p> +<h3 id="finding-a-garbage-collector-bug-stage-2-the-real-bug">Finding a garbage collector bug: stage 2, the real bug</h3> +<p>Then came stage 2. In parallel with the previous ticket, Yevhenii created +<a href="https://github.com/pypy/pypy/issues/4900">#4900</a> that we still see failing +with coredumps quite often, and they are not connected to GC custom hooks. In a +nutshell, it took us dozens of back and forward emails, three Zoom sessions and +four versions of a patch to solve the issue. During the last iteration we got a +new set of options to try and a new version of the patch. Surprisingly, that +helped! What a relief! So, the next logical step was to remove all debug +options and run PyPy only with the patch. Unfortunately, it started to fail +again and we came to the obvious conclusion that what will help us is not a +patch, but one of options we were testing out. At that point we found out that +<a href="https://doc.pypy.org/en/latest/gc_info.html#environment-variables"><code>PYPY_GC_MAX_PINNED=0</code></a> +is a necessary and sufficient condition to solve our issue. 
This points to +another bug in the garbage collector, somehow related to object pinning.</p> +<p>Here's our current state: we have to add <code>PYPY_GC_MAX_PINNED=0</code>, but we do not +face the crashes anymore.</p> +<h3 id="conclusion-and-next-steps">Conclusion and next steps</h3> +<p>Gratitude is extended to Carl for his invaluable assistance in resolving the +nasty bugss, because it seems we're the only ones who suffered from the last +one and we really did not want to fall back to CPython due to its performance +disadvantage.</p> +<p>Serhii Titov, head of the QA department at PortaOne Inc.</p> +<p>P.S. If you are a perfectionist and at this point you have mixed feelings and +you are still bothered by the question "But there might still be a bug in the +GC, what about that?" - Carl has some ideas about it and he will sort it out +(we will help with the testing/verification part).</p>casestudyguestposthttps://www.pypy.org/posts/2024/08/portaone.htmlThu, 29 Aug 2024 09:00:00 GMTPyPy v7.3.17 releasehttps://www.pypy.org/posts/2024/08/pypy-v7317-release.htmlmattip<section id="pypy-v7-3-17-release-of-python-2-7-and-3-10"> +<h2>PyPy v7.3.17: release of python 2.7 and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.17 of PyPy.</p> +<p>This release includes a new <a class="reference internal" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#risc-v-jit-backend">RISC-V JIT backend</a>, an <a class="reference internal" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#improved-repl">improved REPL</a> based on +work by the CPython team, and <a class="reference internal" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#better-jit-optimizations">better JIT optimizations</a> of integer +operations. 
Special shout-outs to <a class="reference external" href="https://github.com/loganchien">Logan Chien</a> for the <a class="reference external" href="https://github.com/pypy/pypy/pull/5002">RISC-V backend +work</a>, to <a class="reference external" href="https://github.com/nirit100">Nico Rittinghaus</a> for better integer optimization in the JIT, and +the CPython team that has worked on the repl.</p> +<p>The release includes two different interpreters:</p> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.</p></li> +</ul> +<p>The interpreters are based on much the same codebase, thus the dual +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.16 release on April 23, 2024.</p> +<p>We recommend updating. You can find links to download the releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. 
PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2024/08/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2024/08/project-ideas.html">help</a> with +making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="risc-v-backend-for-the-jit"> +<span id="risc-v-jit-backend"></span><h3>RISC-V backend for the JIT</h3> +<p>PyPy's JIT has added support for generating 64-bit RISC-V machine code at +runtime (RV64-IMAD, specifically). So far we are not releasing binaries for any +RISC-V platforms, but there are <a class="reference external" href="https://rpython.readthedocs.io/en/latest/riscv.html">instructions</a> on how to cross-compile binaries.</p> +</section> +<section id="repl-improvements"> +<span id="improved-repl"></span><h3>REPL Improvements</h3> +<p>The biggest user-visible change of the release is new features in the repl of +PyPy3.10. CPython 3.13 has adopted and extended PyPy's pure-Python repl, adding +a number of features and fixing a number or bugs in the process. 
We have +backported and added the following features:</p> +<ul class="simple"> +<li><p>Prompts and tracebacks use terminal colors, as well as <a class="reference external" href="https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda">terminal hyperlinks</a> +for file names.</p></li> +<li><p><a class="reference external" href="https://en.wikipedia.org/wiki/Bracketed-paste">Bracketed paste</a> enable pasting several lines of input into the terminal +without auto-indentation getting in the way.</p></li> +<li><p>A special interactive help browser (F1), history browser (F2), explicit paste +mode (F3).</p></li> +<li><p>Support for Ctrl-&lt;left/right&gt; to jump over whole words at a time.</p></li> +</ul> +<p>See the <a class="reference external" href="https://docs.python.org/3.13/whatsnew/3.13.html#a-better-interactive-interpreter">CPython documentation for further details</a>. Thanks to Łukasz Langa, +Pablo Galindo Salgado and the other CPython devs involved in this work.</p> +</section> +<section id="better-jit-optimizations-of-integer-operations"> +<span id="better-jit-optimizations"></span><h3>Better JIT optimizations of integer operations</h3> +<p>The optimizers of PyPy's JIT have become much better at reasoning about and +optimizing integer operations. This is done with a new <a class="reference external" href="https://pypy.org/posts/2024/08/toy-knownbits.html">"knownbits" abstract +domain</a>. In many programs that do bit-manipulation of integers, some of the +bits of the integer variables of the program can be statically known. 
Here's a +simple example:</p> +<div class="code"><pre class="code python"><a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-1" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-1" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-1"></a><span class="n">x</span> <span class="o">=</span> <span class="n">a</span> <span class="o">|</span> <span class="mi">1</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-2" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-2" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-2"></a><span class="o">...</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-3" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-3" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-3"></a><span class="k">if</span> <span class="n">x</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-4" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-4" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-4"></a> <span class="o">...</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-5" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-5" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-5"></a><span class="k">else</span><span class="p">:</span> +<a id="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-6" name="rest_code_eca6db629fd844478a4ee5bd2ccb11fc-6" href="https://www.pypy.org/posts/2024/08/pypy-v7317-release.html#rest_code_eca6db629fd844478a4ee5bd2ccb11fc-6"></a> <span class="o">...</span> +</pre></div> +<p>With the new abstract domain, the JIT can optimize the <code class="docutils literal">if</code>-condition to +<code class="docutils literal">True</code>, because it already knows that the 
lowest bit of <code class="docutils literal">x</code> must be set. +This optimization applies to all Python-integers that fit into a machine word +(PyPy optimistically picks between two different representations for <code class="docutils literal">int</code>, +depending on the size of the value). Unfortunately there is very little impact +of this change on almost all Python code, because intensive bit-manipulation is +rare in Python. However, the change leads to significant performance +improvements in <a class="reference external" href="https://docs.pydrofoil.org/en/latest/">Pydrofoil</a> (the RPython-based RISC-V/ARM emulators that are +automatically generated from high-level <a class="reference external" href="https://github.com/rems-project/sail/">Sail</a> specifications of the respective +ISAs, and that use the RPython JIT to improve performance).</p> +</section> +<section id="pypy-versions-and-speed-pypy-org"> +<h3>PyPy versions and speed.pypy.org</h3> +<p>The keen-eyed will have noticed no mention of Python version 3.9 in the +releases above. Typically we will maintain only one version of Python3, but due +to PyPy3.9 support on conda-forge we maintained multiple versions from the +first release of PyPy3.10 in PyPy v7.3.12 (Dec 2022). Conda-forge is +<a class="reference external" href="https://pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.html">sunsetting its PyPy support</a>, which means we can drop PyPy3.9. Since that was +the major driver of benchmarks at <a class="reference external" href="https://speed.pypy.org">https://speed.pypy.org</a>, we revamped the site +to showcase PyPy3.9, PyPy3.10, and various versions of cpython on the home +page. 
For historical reasons, the "baseline" for comparison is still cpython +3.7.19.</p> +<p>We will keep the buildbots building PyPY3.9 until the end of August, these +builds will still be available on the <a class="reference external" href="https://buildbot.pypy.org/nightly/">nightly builds</a> tab of the buildbot.</p> +</section> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>) and macos (<code class="docutils literal">macos_arm64</code>).</p></li> +</ul> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, Linux ARM +32 bit, RISC-V RV64IMAFD Linux, and s390x Linux but does not release binaries. +Please reach out to us if you wish to sponsor binary releases for those +platforms. 
Downstream packagers provide binary builds for debian, Fedora, +conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.17 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.17.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2024/08/pypy-v7317-release.htmlWed, 28 Aug 2024 12:22:08 GMTConda-forge proposes sunsetting support for PyPyhttps://www.pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.htmlmattip<p>Conda-forge has kindly been providing support for PyPy since 2019. The +conda-forge team has been very patient and generous with resources, but it +seems the uptake of PyPy has not justified the effort. Major packages still +are not <a href="https://conda-forge.org/status/migration/?name=pypy38">available on PyPy</a>, +others find it hard to <a href="https://github.com/conda-forge/numpy-feedstock/pull/310">update +versions</a>. We don't +get much feedback at all about people using PyPy, and even less about PyPy on +conda-forge. The conda-forge team has proposed <a href="https://github.com/conda-forge/conda-forge.github.io/pull/2259">sunsetting +PyPy</a> going +forward, which means current packages would remain but no new packages would be +built. If you have an opinion, you can comment on that PR, or on this blog post.</p> +<p>Since conda-forge supports PyPy3.9 but not PyPy3.10, we have continued +releasing PyPy3.9 even though we typically support only one version of PyPy3. +With the sunsetting proposal, we will not release any more updates to PyPy3.9. +I opened a <a href="https://github.com/orgs/pypy/discussions/4998">poll</a> about the +intention to drop PyPy3.9. 
If you have an opinion, please chime in.</p>conda-forgehttps://www.pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.htmlFri, 09 Aug 2024 06:27:41 GMTA Knownbits Abstract Domain for the Toy Optimizer, Correctlyhttps://www.pypy.org/posts/2024/08/toy-knownbits.htmlCF Bolz-Tereick<p>After <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/">Max' introduction to abstract interpretation for the toy optimizer</a> in the +last post, I want to present a more complicated abstract domain in this post. +This abstract domain reasons about the individual bits of a variable in a trace. +Every bit can be either "known zero", "known one" or "unknown". The abstract +domain is useful for optimizing integer operations, particularly the bitwise operations. +The abstract domain follows quite closely the <a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">tristate abstract domain of the +eBPF verifier in the Linux +Kernel</a>, as +described by the paper +<a href="https://arxiv.org/abs/2105.05398">Sound, Precise, and Fast Abstract Interpretation with Tristate +Numbers</a> by Harishankar Vishwanathan, Matan +Shachnai, Srinivas Narayana, and Santosh Nagarakatte.</p> +<p>The presentation in this post will still be in the context of the +<a href="https://www.pypy.org/categories/toy-optimizer">toy optimizer</a>. We'll spend a significant part of +the post convincing ourselves that the abstract domain transfer functions that +we're writing are really correct, using both property-based testing and +automated proofs (again using Z3).</p> +<p>PyPy has implemented and merged a more complicated version of the same abstract +domain for the "real" PyPy JIT. A more thorough explanation of that real world +implementation will follow.</p> +<p>I'd like to thank Max Bernstein and Armin Rigo for lots of great feedback on +drafts of this post. 
The PyPy implementation was mainly done by Nico +Rittinghaus and me.</p> +<p><strong>Contents:</strong></p> +<div class="toc"> +<ul> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#motivation">Motivation</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#the-knownbits-abstract-domain">The Knownbits Abstract Domain</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#transfer-functions">Transfer Functions</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#property-based-tests-with-hypothesis">Property-based Tests with Hypothesis</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#when-are-transfer-functions-correct-how-do-we-test-them">When are Transfer Functions Correct? How do we test them?</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#implementing-binary-transfer-functions">Implementing Binary Transfer Functions</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#addition-and-subtraction">Addition and Subtraction</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#proving-correctness-of-the-transfer-functions-with-z3">Proving correctness of the transfer functions with Z3</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#cases-where-this-style-of-z3-proof-doesnt-work">Cases where this style of Z3 proof doesn't work</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#making-statements-about-precision">Making Statements about Precision</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#using-the-abstract-domain-in-the-toy-optimizer-for-generalized-constant-folding">Using the Abstract Domain in the Toy Optimizer for Generalized Constant Folding</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#using-the-knownbits-domain-for-conditional-peephole-rewrites">Using the 
KnownBits Domain for Conditional Peephole Rewrites</a></li> +<li><a href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#conclusion">Conclusion</a></li> +</ul> +</div> +<h3 id="motivation">Motivation</h3> +<p>In many programs that do bit-manipulation of integers, some of the bits of the +integer variables of the program can be statically known. Here's a simple +example:</p> +<div class="code"><pre class="code literal-block"><span class="nv">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nv">a</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="mi">1</span> +... +<span class="k">if</span><span class="w"> </span><span class="nv">x</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mi">1</span>: +<span class="w"> </span>... +<span class="k">else</span>: +<span class="w"> </span>... +</pre></div> + +<p>After the assignment <code>x = a | 1</code>, we know that the lowest bit of <code>x</code> must be <code>1</code> +(the other bits are unknown) and an optimizer could remove the condition <code>x &amp; 1</code> by +constant-folding it to <code>1</code>.</p> +<p>Another (more complicated) example is:</p> +<div class="code"><pre class="code literal-block">assert i &amp; 0b111 == 0 # check that i is a multiple of 8 +j = i + 16 +assert j &amp; 0b111 == 0 +</pre></div> + +<p>This kind of code could e.g. happen in a <a href="https://docs.pydrofoil.org/en/latest/">CPU +emulator</a>, where <code>i</code> and <code>j</code> are +integers that represent emulated pointers, and the <code>assert</code>s are alignment +checks. The first assert implies that the lowest three bits of i must be <code>0</code>. +Adding 16 to such a number produces a result where the lowest three bits are +again all <code>0</code>, therefore the second assert is always true. 
So we would like a +compiler to remove the second assert.</p> +<p>Both of these optimizations are doable with the help of the knownbits +abstract domain that we'll discuss in the rest of the post.</p> +<h3 id="the-knownbits-abstract-domain">The Knownbits Abstract Domain</h3> +<p>An abstract value of the knownbits domain needs to be able to store, for every +bit of an integer variable in a program, whether it is known 0, known 1, or +unknown. To represent +three different states, we need 2 bits, which we will call <code>one</code> and <code>unknown</code>. +Here's the encoding:</p> +<table> +<thead> +<tr> +<th>one</th> +<th>unknown</th> +<th align="right">knownbit</th> +</tr> +</thead> +<tbody> +<tr> +<td>0</td> +<td>0</td> +<td align="right">0</td> +</tr> +<tr> +<td>1</td> +<td>0</td> +<td align="right">1</td> +</tr> +<tr> +<td>0</td> +<td>1</td> +<td align="right">?</td> +</tr> +<tr> +<td>1</td> +<td>1</td> +<td align="right">illegal</td> +</tr> +</tbody> +</table> +<p>The <code>unknown</code> bit is set if we don't know the value of the bit ("?"), the <code>one</code> +bit is set if the bit is known to be a <code>1</code>. Since two bits are enough to encode +four different states, but we only need three, the combination of a set <code>one</code> +bit and a set <code>unknown</code> is not allowed.</p> +<p>We don't just want to encode a single bit, however. Instead, we want to do this +for all the bits of an integer variable. 
Therefore the instances of the abstract +domain get two integer fields <code>ones</code> and <code>unknowns</code>, where each pair of +corresponding bits encodes the knowledge about the corresponding bit of the +integer variable in the program.</p> +<p>We can start implementing a Python class that works like this:</p> +<div class="code"><pre class="code literal-block"><span class="kn">from</span> <span class="nn">dataclasses</span> <span class="kn">import</span> <span class="n">dataclass</span> + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="n">ones</span> <span class="p">:</span> <span class="nb">int</span> + <span class="n">unknowns</span> <span class="p">:</span> <span class="nb">int</span> + + <span class="k">def</span> <span class="nf">__post_init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> + <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_well_formed</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">is_well_formed</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="c1"># a bit cannot be both 1 and unknown</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="mi">0</span> + + <span class="nd">@staticmethod</span> + <span 
class="k">def</span> <span class="nf">from_constant</span><span class="p">(</span><span class="n">const</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Construct a KnownBits corresponding to a constant, where all bits</span> +<span class="sd"> are known."""</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">const</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">is_constant</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Check if the KnownBits instance represents a constant. """</span> + <span class="c1"># it's a constant if there are no unknowns</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="mi">0</span> +</pre></div> + +<p>We can also add some convenience properties. Sometimes it is easier to work +with an integer where all the <em>known</em> bits are set, or one where the positions +of all the known zeros have a set bit:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">knowns</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" return an integer where the known bits are set. 
"""</span> + <span class="c1"># the knowns are just the unknowns, inverted</span> + <span class="k">return</span> <span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> + + <span class="nd">@property</span> + <span class="k">def</span> <span class="nf">zeros</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" return an integer where the places that are known zeros have a bit</span> +<span class="sd"> set. """</span> + <span class="c1"># it's a 0 if it is known, but not 1</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span> +</pre></div> + +<p>Also, for debugging and for writing tests we want a way to print the known bits +in a human-readable form, and also to have a way to construct a <code>KnownBits</code> +instance from a string. 
It's not important to understand the details of +<code>__str__</code> or <code>from_str</code> for the rest of the post, so I'm putting them into a fold:</p> +<details> +<summary><code>KnownBits</code> from and to string conversions</summary> + + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="k">return</span> <span class="sa">f</span><span class="s2">"KnownBits.from_constant(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="si">}</span><span class="s2">)"</span> + <span class="k">return</span> <span class="sa">f</span><span class="s2">"KnownBits(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="si">}</span><span class="s2">)"</span> + + <span class="k">def</span> <span class="fm">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> + <span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> + <span class="c1"># construct the string representation right to left</span> + <span class="k">while</span> <span 
class="mi">1</span><span class="p">:</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">ones</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">unknowns</span><span class="p">:</span> + <span class="k">break</span> <span class="c1"># we leave off the leading known 0s</span> + <span class="k">if</span> <span class="n">ones</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">unknowns</span><span class="p">:</span> + <span class="c1"># -1 has all bits set in two's complement, so the leading</span> + <span class="c1"># bits are all 1</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'1'</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"..."</span><span class="p">)</span> + <span class="k">break</span> + <span class="k">if</span> <span class="n">unknowns</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span><span class="p">:</span> + <span class="c1"># -1 has all bits set in two's complement, so the leading bits</span> + <span class="c1"># are all ?</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">ones</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"?"</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">"..."</span><span class="p">)</span> + <span class="k">break</span> + <span class="k">if</span> <span class="n">unknowns</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span 
class="p">(</span><span class="s1">'?'</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">ones</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'1'</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'0'</span><span class="p">)</span> + <span class="n">ones</span> <span class="o">&gt;&gt;=</span> <span class="mi">1</span> + <span class="n">unknowns</span> <span class="o">&gt;&gt;=</span> <span class="mi">1</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">res</span><span class="p">:</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">'0'</span><span class="p">)</span> + <span class="n">res</span><span class="o">.</span><span class="n">reverse</span><span class="p">()</span> + <span class="k">return</span> <span class="s2">""</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> + + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">from_str</span><span class="p">(</span><span class="n">s</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Construct a KnownBits instance that from a string. String can start</span> +<span class="sd"> with ...1 to mean that all higher bits are 1, or ...? to mean that all</span> +<span class="sd"> higher bits are unknown. Otherwise it is assumed that the higher bits</span> +<span class="sd"> are all 0. 
"""</span> + <span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">s</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"...?"</span><span class="p">):</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">4</span> + <span class="k">elif</span> <span class="n">s</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"...1"</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> + <span class="n">startindex</span> <span class="o">=</span> <span class="mi">4</span> + <span class="k">for</span> <span class="n">index</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">startindex</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">s</span><span class="p">)):</span> + <span class="n">ones</span> <span class="o">&lt;&lt;=</span> <span class="mi">1</span> + <span class="n">unknowns</span> <span class="o">&lt;&lt;=</span> <span class="mi">1</span> + <span class="n">c</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">index</span><span class="p">]</span> + <span class="k">if</span> <span class="n">c</span> <span class="o">==</span> <span class="s1">'1'</span><span class="p">:</span> + <span class="n">ones</span> <span class="o">|=</span> <span class="mi">1</span> + <span class="k">elif</span> <span class="n">c</span> <span class="o">==</span> <span 
class="s1">'?'</span><span class="p">:</span> + <span class="n">unknowns</span> <span class="o">|=</span> <span class="mi">1</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">all_unknown</span><span class="p">():</span> +<span class="w"> </span><span class="sd">""" convenience constructor for the "all bits unknown" abstract value</span> +<span class="sd"> """</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s2">"...?"</span><span class="p">)</span> +</pre></div> + + + +</details> + +<p>And here's a <a href="https://pytest.org">pytest</a>-style unit test for <code>str</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_str</span><span class="p">():</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'0'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">5</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'101'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mb">0b10</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'1?1'</span> + <span 
class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="o">~</span><span class="mb">0b1111</span><span class="p">,</span> <span class="mb">0b10</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'...100?0'</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">~</span><span class="mb">0b1</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'...?1'</span> +</pre></div> + +<p>An instance of <code>KnownBits</code> represents a set of integers, namely those that match +the known bits stored in the instance. We can write a method <code>contains</code> that +takes a concrete <code>int</code> value and returns <code>True</code> if the value matches the +pattern of the known bits:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">contains</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Check whether the KnownBits instance contains the concrete integer</span> +<span class="sd"> `value`. """</span> + <span class="c1"># check whether value matches the bit pattern. 
in the places where we</span> + <span class="c1"># know the bits, the value must agree with ones.</span> + <span class="k">return</span> <span class="n">value</span> <span class="o">&amp;</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> +</pre></div> + +<p>and a test:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_contains</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'1?1'</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b111</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b101</span><span class="p">)</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b110</span><span class="p">)</span> + <span class="k">assert</span> <span class="ow">not</span> <span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="mb">0b011</span><span class="p">)</span> + + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?1'</span><span class="p">)</span> <span class="c1"># all odd numbers</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span 
class="o">-</span><span class="mi">101</span><span class="p">,</span> <span class="mi">100</span><span class="p">):</span> + <span class="k">assert</span> <span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="o">==</span> <span class="p">(</span><span class="n">i</span> <span class="o">&amp;</span> <span class="mi">1</span><span class="p">)</span> +</pre></div> + +<h3 id="transfer-functions">Transfer Functions</h3> +<p>Now that we have implemented the basics of the <code>KnownBits</code> class, we need to +start implementing the transfer functions. They are for computing what we know +about the <em>results</em> of an operation, given the knowledge we have about the bits +of the arguments.</p> +<p>We'll start with a simple unary operation, <code>invert(x)</code> (which is <code>~x</code> in Python +and C syntax), which flips all the bits of an integer. If we know some bits of +the arguments, we can compute the corresponding bits of the result. 
The unknown +bits remain unknown.</p> +<p>Here's the code:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_invert</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="c1"># self.zeros has bits set where the known 0s are in self</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">zeros</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> +</pre></div> + +<p>And a unit-test:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_invert</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="o">==</span> <span class="s1">'...10?10?10?'</span> + + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="k">assert</span> <span class="nb">str</span><span 
class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="o">==</span> <span class="s1">'...?'</span> +</pre></div> + +<p>Before we continue with further transfer functions, we'll think about +correctness of the transfer functions and build up some test infrastructure. To +test transfer functions, it's quite important to move beyond simple example-style +unit tests. The state-space for more complicated binary transfer functions is +extremely large and it's too easy to do something wrong in a corner case. +Therefore we'll look at property-based tests for <code>KnownBits</code> next.</p> +<h3 id="property-based-tests-with-hypothesis">Property-based Tests with Hypothesis</h3> +<p>We want to do property-based tests of <code>KnownBits</code>, to try to +make it less likely that we'll get a corner-case in the implementation wrong. +We'll use <a href="https://hypothesis.readthedocs.io/en/latest/">Hypothesis</a> for that.</p> +<p>I can't give a decent introduction to Hypothesis here, but want to give a few +hints about the API. Hypothesis is a way to run unit tests with randomly +generated input. It provides <em>strategies</em> to describe the data that the test +function expects. Hypothesis provides primitive strategies (for things like +integers, strings, floats, etc) and ways to build composite strategies out of +the primitive ones.</p> +<p>To be able to write the tests, we need to generate random <code>KnownBits</code> instances, +and we also want an <code>int</code> instance that is a member of the <code>KnownBits</code> instance. +We generate tuples of <code>(KnownBits, int)</code> together, to ensure this property. +We'll ask Hypothesis to generate us a random concrete <code>int</code> as the concrete +value, and then we'll also generate a second random <code>int</code> to use as the +<code>unknown</code> masks (i.e. which bits of the concrete int we don't know in the +<code>KnownBits</code> instance). 
Here's a function that takes two such ints and builds the +tuple:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">build_knownbits_and_contained_number</span><span class="p">(</span><span class="n">concrete_value</span> <span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">unknowns</span> <span class="p">:</span> <span class="nb">int</span><span class="p">):</span> + <span class="c1"># to construct a valid KnownBits instance, we need to mask off the unknown</span> + <span class="c1"># bits</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">concrete_value</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">),</span> <span class="n">concrete_value</span> +</pre></div> + +<p>We can turn this function into a hypothesis strategy to generate input data +using the <code>strategies.builds</code> function:</p> +<div class="code"><pre class="code literal-block"><span class="kn">from</span> <span class="nn">hypothesis</span> <span class="kn">import</span> <span class="n">strategies</span><span class="p">,</span> <span class="n">given</span><span class="p">,</span> <span class="n">settings</span> + +<span class="n">ints</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">integers</span><span class="p">()</span> + +<span class="n">random_knownbits_and_contained_number</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">builds</span><span class="p">(</span> + <span class="n">build_knownbits_and_contained_number</span><span class="p">,</span> + <span class="n">ints</span><span class="p">,</span> <span class="n">ints</span> +<span 
class="p">)</span> +</pre></div> + +<p>One important special case of <code>KnownBits</code> are the constants, which contain only +a single concrete value. We'll also generate some of those specifically, and +then combine the <code>random_knownbits_and_contained_number</code> strategy with it:</p> +<div class="code"><pre class="code literal-block"><span class="n">constant_knownbits</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">builds</span><span class="p">(</span> + <span class="k">lambda</span> <span class="n">value</span><span class="p">:</span> <span class="p">(</span><span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="n">value</span><span class="p">),</span> <span class="n">value</span><span class="p">),</span> + <span class="n">ints</span> +<span class="p">)</span> + +<span class="n">knownbits_and_contained_number</span> <span class="o">=</span> <span class="n">constant_knownbits</span> <span class="o">|</span> <span class="n">random_knownbits_and_contained_number</span> +</pre></div> + +<p>Now we can write the first property-based tests, for the <code>KnownBits.contains</code> +method:</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_contains</span><span class="p">(</span><span class="n">t</span><span class="p">):</span> + <span class="n">k</span><span class="p">,</span> <span class="n">n</span> <span class="o">=</span> <span class="n">t</span> + <span class="k">assert</span> <span class="n">k</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> +</pre></div> + +<p>The <code>@given</code> decorator is used to tell Hypothesis which strategy to use to +generate 
random data for the test function. Hypothesis will run the test with a +number of random examples (100 by default). If it finds an error, it will try to +minimize the example needed that demonstrates the problem, to try to make it +easier to understand what is going wrong. It also saves all failing cases into +an example database and tries them again on subsequent runs.</p> +<p>This test is as much a check for whether we got the strategies right as it is +for the logic in <code>KnownBits.contains</code>. Here's an example output of random +concrete and abstract values that we are getting here:</p> +<div class="code"><pre class="code literal-block"><span class="mf">110000011001101</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">???</span><span class="mf">1</span> +<span class="mf">...1011011</span><span class="w"> </span><span class="mf">...1011011</span> +<span class="mf">...1001101110101000010010011111011</span><span class="w"> </span><span class="mf">...1001101110101000010010011111011</span> +<span class="mf">...1001101110101000010010011111011</span><span class="w"> </span><span class="mf">...100110111010100001</span><span class="err">?</span><span class="mf">010</span><span class="err">?</span><span class="mf">1</span><span class="err">??</span><span class="mf">1</span><span class="err">??</span><span class="mf">11</span> +<span class="mf">1000001101111101001011010011111101000011000111011001011111101</span><span class="w"> </span><span class="mf">1000001101111101001011010011111101000011000111011001011111101</span> +<span class="mf">1000001101111101001011010011111101000011000111011001011111101</span><span class="w"> </span><span class="mf">1000001101111101001011010011111101000011000111</span><span class="err">????</span><span class="mf">01</span><span class="err">?</span><span class="mf">11</span><span class="err">?????</span><span class="mf">1</span> +<span 
class="mf">1111100000010</span><span class="w"> </span><span class="mf">1111100000010</span> +<span class="mf">1111100000010</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">11111</span><span class="err">?</span><span class="mf">00000</span><span class="err">??</span> +<span class="mf">110110</span><span class="w"> </span><span class="mf">110110</span> +<span class="mf">110110</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">00</span><span class="err">????</span><span class="mf">11</span><span class="err">??</span><span class="mf">10</span> +<span class="mf">110110</span><span class="w"> </span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0</span> +<span class="mf">...100010111011111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">100</span><span class="err">?</span><span class="mf">10111</span><span class="err">??</span><span class="mf">111</span><span class="err">?</span> +<span class="mf">...1000100000110001</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">00000</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span> +<span class="mf">110000001110</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0000000</span><span class="err">?</span><span class="mf">00</span><span class="err">???</span><span class="mf">0000</span><span class="err">?????</span><span 
class="mf">00</span><span class="err">???</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">01</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span><span class="mf">1</span><span class="err">??</span> +<span class="mf">110000001110</span><span class="w"> </span><span class="err">??</span><span class="mf">000000</span><span class="err">???</span><span class="mf">0</span> +<span class="mf">1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000</span><span class="w"> </span><span class="mf">1011011010000001110101001111000010001001011101010010010001000000010101010010001101110101111111010101010010101100110000011110000</span> +<span class="mf">...1011010010010100</span><span class="w"> </span><span class="mf">...1011010010010100</span> +<span class="mf">...1011111110110011</span><span class="w"> </span><span class="mf">...1011111110110011</span> +<span class="mf">101000011110110</span><span class="w"> </span><span class="mf">101000011</span><span class="err">?</span><span class="mf">10</span><span class="err">?</span><span class="mf">1</span><span class="err">?</span> +<span class="mf">100101</span><span class="w"> </span><span class="err">?</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span> +</pre></div> + +<p>That looks suitably random, but we might want to bias our random numbers a +little bit towards common error values like small constants, powers of two, etc. 
+Like this:</p> +<div class="code"><pre class="code literal-block"><span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> +<span class="c1"># some small integers</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">))</span> +<span class="c1"># powers of two</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">2</span><span class="p">))</span> +<span class="c1"># powers of two - 1</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">((</span><span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="n">i</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">2</span><span class="p">))</span> +<span class="c1"># negative versions of what we have so far</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="o">-</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">ints_special</span><span 
class="p">)</span> +<span class="c1"># bit-flipped versions of what we have so far</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="n">ints_special</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="o">~</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">ints_special</span><span class="p">)</span> +<span class="n">ints_special</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">ints_special</span><span class="p">)</span> +<span class="c1"># sort them (because hypothesis simplifies towards earlier elements in the list)</span> +<span class="n">ints_special</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">element</span><span class="p">:</span> <span class="p">(</span><span class="nb">abs</span><span class="p">(</span><span class="n">element</span><span class="p">),</span> <span class="n">element</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">))</span> + +<span class="n">ints</span> <span class="o">=</span> <span class="n">strategies</span><span class="o">.</span><span class="n">sampled_from</span><span class="p">(</span><span class="n">ints_special</span><span class="p">)</span> <span class="o">|</span> <span class="n">strategies</span><span class="o">.</span><span class="n">integers</span><span class="p">()</span> +</pre></div> + +<p>Now we get data like this:</p> +<div class="code"><pre class="code literal-block"><span class="mf">1110</span><span class="w"> </span><span class="mf">1110</span> +<span class="mf">...10000000000000000001</span><span class="w"> </span><span class="mf">...10000</span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0000</span><span 
class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">1</span> +<span class="mf">1</span><span class="w"> </span><span class="err">??</span><span class="mf">0</span><span class="err">??</span><span class="mf">0000</span><span class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">1</span> +<span class="mf">1</span><span class="w"> </span><span class="err">?</span> +<span class="mf">...10101100</span><span class="w"> </span><span class="mf">...10101100</span> +<span class="mf">110000000011001010111011111111111111011110010001001100110001011</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">101</span><span class="err">?</span> +<span class="mf">110000000011001010111011111111111111011110010001001100110001011</span><span class="w"> </span><span class="err">??</span><span class="mf">00000000</span><span class="err">??</span><span class="mf">00</span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">0</span><span class="err">???</span><span class="mf">0</span><span class="err">??????????????</span><span class="mf">0</span><span class="err">????</span><span class="mf">00</span><span class="err">?</span><span class="mf">000</span><span class="err">?</span><span class="mf">00</span><span class="err">??</span><span class="mf">00</span><span class="err">??</span><span class="mf">000</span><span class="err">?</span><span class="mf">0</span><span class="err">??</span> +<span class="mf">...1011111111111111111111111111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">11</span><span class="err">?</span><span class="mf">11</span><span class="err">??</span> +<span class="mf">...1011111111111111111111111111</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span 
class="err">??????????????????????????</span> +<span class="mf">0</span><span class="w"> </span><span class="mf">...</span><span class="err">?</span><span class="mf">0</span><span class="err">??????????????????????????</span> +<span class="mf">101101</span><span class="w"> </span><span class="mf">101101</span> +<span class="mf">111111111111111111111111111111111111111111111</span><span class="w"> </span><span class="mf">111111111111111111111111111111111111111111111</span> +<span class="mf">10111</span><span class="w"> </span><span class="mf">10111</span> +<span class="mf">...101100</span><span class="w"> </span><span class="mf">...1</span><span class="err">?</span><span class="mf">111011</span><span class="err">?</span><span class="mf">0</span> +<span class="mf">101000</span><span class="w"> </span><span class="err">?</span><span class="mf">001010</span><span class="err">?</span><span class="mf">0</span> +<span class="mf">101000</span><span class="w"> </span><span class="err">?</span><span class="mf">0</span><span class="err">?</span><span class="mf">000</span> +<span class="mf">110010</span><span class="w"> </span><span class="mf">110010</span> +<span class="mf">...100111</span><span class="w"> </span><span class="mf">...100111</span> +<span class="mf">1111011010010</span><span class="w"> </span><span class="mf">1111011010010</span> +<span class="mf">...1000000000000000000000000000000000000</span><span class="w"> </span><span class="mf">...1000000000000000000000000000000000000</span> +</pre></div> + +<p>We can also write a test that checks that the somewhat tricky logic in +<code>__str__</code> and <code>from_str</code> is correct, by making sure that the two functions +round-trip (ie converting a <code>KnownBits</code> to a string and then back to a +<code>KnownBits</code> instance produces the same abstract value).</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span 
class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_str_roundtrips</span><span class="p">(</span><span class="n">t1</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">s</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="n">k2</span><span class="o">.</span><span class="n">ones</span> + <span class="k">assert</span> <span class="n">k1</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">==</span> <span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span> +</pre></div> + +<p>Now let's actually apply this infrastructure to test <code>abstract_invert</code>.</p> +<h3 id="when-are-transfer-functions-correct-how-do-we-test-them">When are Transfer Functions Correct? How do we test them?</h3> +<p>Abstract values, i.e. instances of <code>KnownBits</code> represent <em>sets</em> of concrete +values. We want the transfer functions to compute <em>overapproximations</em> of the +concrete values. So if we have an arbitrary abstract value <code>k</code>, with a concrete +number <code>n</code> that is a member of the abstract values (i.e. +<code>k.contains(n) == True</code>) then the result of the concrete operation <code>op(n)</code> +<strong>must</strong> be a member of the result of the abstract operation <code>k.abstract_op()</code> +(i.e. 
<code>k.abstract_op().contains(op(n)) == True</code>).</p> +<p>Checking the correctness/overapproximation property is a good match for +hypothesis. Here's what the test for <code>abstract_invert</code> looks like:</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_invert</span><span class="p">(</span><span class="n">t1</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">n2</span> <span class="o">=</span> <span class="o">~</span><span class="n">n1</span> <span class="c1"># compute the real result</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> <span class="c1"># compute the abstract result</span> + <span class="k">assert</span> <span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span> <span class="c1"># the abstract result must contain the real result</span> +</pre></div> + +<p>This is the <em>only</em> condition needed for <code>abstract_invert</code> to be correct. If +<code>abstract_invert</code> fulfils this property for every combination of abstract and +concrete value then <code>abstract_invert</code> is correct. Note however, that this test +does not actually check whether <code>abstract_invert</code> gives us precise results. A +correct (but imprecise) implementation of <code>abstract_invert</code> would simply return +a completely unknown result, regardless of what is known about the input +<code>KnownBits</code>.</p> +<p>The "proper" CS term for this notion of correctness is called <em>soundness</em>. 
The +correctness condition on the transfer functions is called a <em>Galois +connection</em>. I won't go into any mathematical/technical details here, but +wanted to at least mention the terms. I found <a href="https://web.njit.edu/~mjk76/">Martin +Kellogg</a>'s +<a href="https://web.njit.edu/~mjk76/teaching/cs684-sp24/assets/lecture-12.pdf#34">slides</a> +to be quite an approachable introduction to the Galois connection and how to +show soundness.</p> +<h3 id="implementing-binary-transfer-functions">Implementing Binary Transfer Functions</h3> +<p>Now we have infrastructure in place for testing transfer functions with random +inputs. With that we can start thinking about the more complicated case, that of +binary operations. Let's start with the simpler ones, <code>and</code> and <code>or</code>. For <code>and</code>, +we can know a <code>0</code> bit in the result if either of the input bits is known <code>0</code>; +or we can know a <code>1</code> bit in the result if both input bits are known <code>1</code>. +Otherwise the resulting bit is unknown. Let's look at all the combinations:</p> +<div class="code"><pre class="code literal-block">and +input1: 000111??? +input2: 01?01?01? +result: 00001?0?? 
+</pre></div> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_and</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="c1"># known ones</span> + <span class="n">knowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">ones</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="o">~</span><span class="n">knowns</span><span class="p">)</span> +</pre></div> + +<p>Here's an example unit-test and a property-based test for <code>and</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_and</span><span class="p">():</span> + <span class="c1"># test all combinations of 0, 1, ? 
in one example</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'000111???'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="c1"># should be: 0...00001?0??</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"1?0??"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_and</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span 
class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>To implement <code>or</code> is pretty similar. The result is known <code>1</code> where either of the +inputs is known <code>1</code>. The result is known <code>0</code> where both inputs are known <code>0</code>, and <code>?</code> +otherwise.</p> +<div class="code"><pre class="code literal-block">or +input1: 000111??? +input2: 01?01?01? +result: 01?111?1? +</pre></div> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_or</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">zeros</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">zeros</span> + <span class="n">knowns</span> <span class="o">=</span> <span class="n">ones</span> <span class="o">|</span> <span class="n">zeros</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="o">~</span><span class="n">knowns</span><span class="p">)</span> +</pre></div> + +<p>Here's an example unit-test and a property-based test for <code>or</code>:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_or</span><span class="p">():</span> + <span class="n">k1</span> <span 
class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'01?01?01?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'000111???'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> <span class="c1"># should be: 0...01?111?1?</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"1?111?1?"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_or</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">|</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span 
class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>Implementing support for <code>abstract_xor</code> is relatively simple, and left as an +exercise :-).</p> +<h3 id="addition-and-subtraction">Addition and Subtraction</h3> +<p><code>invert</code>, <code>and</code>, and <code>or</code> are relatively simple transfer functions to write, +because they compose over the individual bits of the integers. The arithmetic +functions <code>add</code> and <code>sub</code> are significantly harder, because of carries and +borrows. Coming up with the formulas for them and gaining an intuitive +understanding is quite tricky and involves carefully going through a few +examples with pen and paper. When implementing this in PyPy, Nico and I didn't +come up with the implementation ourselves, but instead took them from the +<a href="https://arxiv.org/abs/2105.05398">Tristate Numbers</a> paper. Here's the code, +with example tests and hypothesis tests:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">sum_ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">+</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">sum_unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">+</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span> + <span class="n">all_carries</span> <span class="o">=</span> <span class="n">sum_ones</span> <span class="o">+</span> <span 
class="n">sum_unknowns</span> + <span class="n">ones_carries</span> <span class="o">=</span> <span class="n">all_carries</span> <span class="o">^</span> <span class="n">sum_ones</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">ones_carries</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">sum_ones</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + <span class="k">def</span> <span class="nf">abstract_sub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="n">diff_ones</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">-</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> + <span class="n">val_borrows</span> <span class="o">=</span> <span class="p">(</span><span class="n">diff_ones</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> <span class="o">^</span> <span class="p">(</span><span class="n">diff_ones</span> <span class="o">-</span> <span class="n">other</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">other</span><span 
class="o">.</span><span class="n">unknowns</span> <span class="o">|</span> <span class="n">val_borrows</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">diff_ones</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">unknowns</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + + +<span class="k">def</span> <span class="nf">test_add</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0???111000'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"?????01?10"</span> + +<span class="k">def</span> <span class="nf">test_sub</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'0???111000'</span><span class="p">)</span> + <span class="n">res</span> 
<span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"...?11?10"</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span> <span class="s1">'...1?10?10?10'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...10000???111000'</span><span class="p">)</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"111?????11?10"</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_add</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span 
class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_sub</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">-</span> <span class="n">n2</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">)</span> +</pre></div> + +<p>Now we are in a pretty good situation, and have implemented abstract versions +for a bunch of important arithmetic and binary functions. What's also surprising +is that the implementation of all of the transfer functions is quite efficient. 
+We didn't have to write loops over the individual bits at all; instead, we found +closed-form expressions using primitive operations on the underlying integers +<code>ones</code> and <code>unknowns</code>. This means that computing the results of abstract +operations is quite efficient, which is important when using the abstract domain +in the context of a JIT compiler.</p> +<h3 id="proving-correctness-of-the-transfer-functions-with-z3">Proving correctness of the transfer functions with Z3</h3> +<p>As one can probably tell from my recent posts, I've been thinking about +compiler correctness a lot. Getting the transfer functions absolutely +correct is really crucial, because a bug in them would lead to miscompilation of +Python code when the abstract domain is added to the JIT. While the randomized +tests are great, it's still entirely possible for them to miss bugs. The state +space for the arguments of a binary transfer function is <code>3**64 * 3**64</code>, and if +only a small part of that contains wrong behaviour it would be really unlikely +for us to find it with random tests by chance. Therefore I was reluctant to +merge the PyPy branch that contained the new abstract domain for a long time.</p> +<p>To increase our confidence in the correctness of the transfer functions further, +we can use Z3 to <em>prove</em> their correctness, which gives us much stronger +guarantees (not 100%, obviously). 
In this subsection I will show how to do that.</p> +<p>Here's an attempt to do this manually in the Python repl:</p> +<div class="code"><pre class="code literal-block"><span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="kn">import</span><span class="w"> </span><span class="nn">z3</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">solver</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># like last blog post, proof by failing to find counterexamples</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="k">def</span><span class="w"> </span><span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">):</span><span class="w"> </span><span class="k">assert</span><span class="w"> </span><span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">unsat</span> +<span class="o">&gt;&gt;&gt;&gt;</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's set up a z3 bitvector variable for an arbitrary concrete value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'concrete_value'</span><span class="p">,</span><span class="w"> </span><span 
class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n1</span> +<span class="n">concrete_value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># due to operator overloading we can manipulate z3 formulas</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">~</span><span class="n">n1</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">n2</span> +<span class="o">~</span><span class="n">concrete_value</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># now z3 bitvector variables for the ones and unknowns fields</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">ones</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'abstract_ones'</span><span class="p">,</span><span class="w"> </span><span class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">unknowns</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'abstract_unknowns'</span><span class="p">,</span><span class="w"> </span><span class="mi">64</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># we construct a KnownBits instance with the z3 variables</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k1</span><span class="w"> </span><span 
class="o">=</span><span class="w"> </span><span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span><span class="w"> </span><span class="n">unknowns</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># due to operator overloading we can call the methods on k1:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">ones</span> +<span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_ones</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span> +<span class="n">abstract_unknowns</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># here's the correctness condition that we want to prove:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span> +<span class="o">~</span><span class="n">concrete_value</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">==</span> +<span class="o">~</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="o">~</span><span class="n">abstract_ones</span> 
+<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's try</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">prove</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">))</span> +<span class="n">Traceback</span><span class="w"> </span><span class="p">(</span><span class="n">most</span><span class="w"> </span><span class="n">recent</span><span class="w"> </span><span class="n">call</span><span class="w"> </span><span class="n">last</span><span class="p">):</span> +<span class="w"> </span><span class="n">File</span><span class="w"> </span><span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="ow">in</span><span class="w"> </span><span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span> +<span class="w"> </span><span class="n">File</span><span class="w"> </span><span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="ow">in</span><span class="w"> </span><span class="n">prove</span> +<span class="n">AssertionError</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># it doesn't work! 
let's look at the counterexample to see why:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> +<span class="p">[</span><span class="n">abstract_unknowns</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span> +<span class="w"> </span><span class="n">abstract_ones</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span> +<span class="w"> </span><span class="n">concrete_value</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="p">]</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># we can build a KnownBits instance with the values in the</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># counterexample:</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="o">~</span><span class="mi">1</span><span class="w"> </span><span class="c1"># concrete result</span> +<span class="o">-</span><span class="mi">2</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">KnownBits</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k1</span> +<span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span><span 
class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">counter_example_k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span> +<span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="c1"># let's check the failing condition</span> +<span class="o">&gt;&gt;&gt;&gt;</span><span class="w"> </span><span class="n">counter_example_k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="o">~</span><span class="mi">1</span><span class="p">)</span> +<span class="kc">False</span> +</pre></div> + +<p>What is the problem here? We didn't tell Z3 that <code>n1</code> was supposed to be a +member of <code>k1</code>. We can add this as a precondition to the solver, and then the +proof works:</p> +<div class="code"><pre class="code literal-block">&gt;&gt;&gt;&gt; solver.add(k1.contains(n1)) +&gt;&gt;&gt;&gt; prove(k2.contains(n2)) # works! +</pre></div> + +<p>This is super cool! It's really a proof about the actual implementation, because +we call the implementation methods directly, and due to the operator overloading +that Z3 does we can be sure that we are actually checking a formula that +corresponds to the Python code. This eliminates one source of errors in formal +methods.</p> +<p>Doing the proof manually on the Python REPL is kind of annoying though, and we +also would like to make sure that the proofs are re-done when we change the +code. What we would really like to do is write the proofs as unit tests that +we can run while developing and in CI. 
Doing this is possible, and the unit +tests that really perform proofs look pleasingly similar to the +Hypothesis-based ones.</p> +<p>First we need to set up a bit of infrastructure:</p> +<div class="code"><pre class="code literal-block"><span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> + +<span class="k">def</span> <span class="nf">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">BitVecVal</span><span class="p">(</span><span class="n">val</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">z3_setup_variables</span><span class="p">():</span> + <span class="c1"># instantiate a solver</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + + <span class="c1"># a Z3 variable for the first concrete value</span> + <span class="n">n1</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n1"</span><span class="p">)</span> + <span class="c1"># a KnownBits instances that uses Z3 variables as its ones and unknowns,</span> + <span class="c1"># representing the first abstract value</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">BitVec</span><span class="p">(</span><span 
class="s2">"n1_ones"</span><span class="p">),</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n1_unkowns"</span><span class="p">))</span> + <span class="c1"># add the precondition to the solver that the concrete value n1 must be a</span> + <span class="c1"># member of the abstract value k1</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n1</span><span class="p">))</span> + + <span class="c1"># a Z3 variable for the second concrete value</span> + <span class="n">n2</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2"</span><span class="p">)</span> + <span class="c1"># a KnownBits instances for the second abstract value</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2_ones"</span><span class="p">),</span> <span class="n">BitVec</span><span class="p">(</span><span class="s2">"n2_unkowns"</span><span class="p">))</span> + <span class="c1"># add the precondition linking n2 and k2 to the solver</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">))</span> + <span class="k">return</span> <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> + +<span class="k">def</span> <span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span 
class="n">solver</span><span class="p">):</span> + <span class="n">z3res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> + <span class="k">if</span> <span class="n">z3res</span> <span class="o">!=</span> <span class="n">z3</span><span class="o">.</span><span class="n">unsat</span><span class="p">:</span> + <span class="k">assert</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span> <span class="c1"># can't be timeout, we set no timeout</span> + <span class="c1"># make the model with the counterexample global, to make inspecting the</span> + <span class="c1"># bug easier when running pytest --pdb</span> + <span class="k">global</span> <span class="n">model</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"n1=</span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">n1</span><span class="p">)</span><span class="si">}</span><span class="s2">, n2=</span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">n2</span><span class="p">)</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="n">counter_example_k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span 
class="n">k1</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> + <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="n">counter_example_k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> + <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"k1=</span><span class="si">{</span><span class="n">counter_example_k1</span><span class="si">}</span><span class="s2">, k2=</span><span class="si">{</span><span class="n">counter_example_k2</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"but </span><span class="si">{</span><span class="n">cond</span><span class="si">=}</span><span class="s2"> evaluates to </span><span class="si">{</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span><span class="si">}</span><span 
class="s2">"</span><span class="p">)</span> + <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">())</span> +</pre></div> + +<p>And then we can write proof-unit-tests like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_z3_abstract_invert</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="n">n2</span> <span class="o">=</span> <span class="o">~</span><span class="n">n1</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n2</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_and</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span 
class="o">=</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_or</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">|</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_add</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span 
class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_sub</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">-</span> <span class="n">n2</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>It's possible to write a bit more Python-metaprogramming-magic and unify the +Hypothesis and Z3 tests into the same test definition.<sup id="fnref:proof_bitwidths"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fn:proof_bitwidths">1</a></sup></p> +<h3 id="cases-where-this-style-of-z3-proof-doesnt-work">Cases where this style of Z3 proof doesn't work</h3> +<p>Unfortunately the approach described in the previous section only works for a +very small number of cases. 
It breaks down as soon as the <code>KnownBits</code> methods +that we're calling contain any <code>if</code> conditions (including hidden ones like +the short-circuiting <code>and</code> and <code>or</code> in Python). Let's look at an example and +implement <code>abstract_eq</code>. <code>eq</code> is supposed to be an operation that compares two +integers and returns <code>0</code> or <code>1</code> if they are different or equal, respectively. +Implementing this in knownbits looks like this (with example and hypothesis +tests):</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">abstract_eq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># the result is a 0, 1, or ?</span> + + <span class="c1"># if they are both the same constant, they must be equal</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()</span> <span class="ow">and</span> <span class="n">other</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span><span class="p">:</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="c1"># check whether we have known disagreeing bits, then we know the result</span> + <span class="c1"># is 0</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span 
class="n">_disagrees</span><span class="p">(</span><span class="n">other</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># an unknown boolean</span> + + <span class="k">def</span> <span class="nf">_disagrees</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># check whether the bits disagree in any place where both are known</span> + <span class="n">both_known</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="n">other</span><span class="o">.</span><span class="n">knowns</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">both_known</span> <span class="o">!=</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="o">&amp;</span> <span class="n">both_known</span> + +<span class="k">def</span> <span class="nf">test_eq</span><span class="p">():</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_str</span><span class="p">(</span><span class="s1">'...?'</span><span class="p">)</span> + <span class="k">assert</span> <span 
class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'?'</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'1'</span> + <span class="n">k1</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> + <span class="n">k2</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="mi">20</span><span class="p">)</span> + <span class="k">assert</span> <span class="nb">str</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">))</span> <span class="o">==</span> <span class="s1">'0'</span> + +<span class="nd">@given</span><span class="p">(</span><span class="n">knownbits_and_contained_number</span><span class="p">,</span> <span class="n">knownbits_and_contained_number</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_hypothesis_eq</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span 
class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">n1</span> <span class="o">==</span> <span class="n">n2</span><span class="p">))</span> +</pre></div> + +<p>Trying to do the proof in the same style as before breaks:</p> +<div class="code"><pre class="code literal-block"><span class="o">&gt;&gt;&gt;&gt;</span> <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_eq</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> +<span class="n">Traceback</span> <span class="p">(</span><span class="n">most</span> <span class="n">recent</span> <span class="n">call</span> <span class="n">last</span><span class="p">):</span> + <span class="n">File</span> <span class="s2">"&lt;stdin&gt;"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">1</span><span class="p">,</span> <span class="ow">in</span> <span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span> + <span class="n">File</span> <span class="s2">"knownbits.py"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">246</span><span class="p">,</span> <span class="ow">in</span> <span class="n">abstract_eq</span> + <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_disagrees</span><span class="p">(</span><span class="n">other</span><span class="p">):</span> + <span class="n">File</span> <span 
class="s2">"venv/site-packages/z3/z3.py"</span><span class="p">,</span> <span class="n">line</span> <span class="mi">381</span><span class="p">,</span> <span class="ow">in</span> <span class="fm">__bool__</span> + <span class="k">raise</span> <span class="n">Z3Exception</span><span class="p">(</span><span class="s2">"Symbolic expressions cannot be cast to concrete Boolean values."</span><span class="p">)</span> +<span class="n">z3</span><span class="o">.</span><span class="n">z3types</span><span class="o">.</span><span class="n">Z3Exception</span><span class="p">:</span> <span class="n">Symbolic</span> <span class="n">expressions</span> <span class="n">cannot</span> <span class="n">be</span> <span class="n">cast</span> <span class="n">to</span> <span class="n">concrete</span> <span class="n">Boolean</span> <span class="n">values</span><span class="o">.</span> +</pre></div> + +<p>We cannot call <code>abstract_eq</code> on a <code>KnownBits</code> with Z3 variables as fields, +because once we hit an <code>if</code> statement, the whole approach of relying on the +operator overloading breaks down. 
Z3 doesn't actually parse the Python code or +anything advanced like that; rather, we build up an expression just by running the +code and letting the Z3 formulas accumulate.</p> +<p>To still prove the correctness of <code>abstract_eq</code> we need to manually transform +the control flow logic of the function into a Z3 formula that uses the <code>z3.If</code> +expression, using a small helper function:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">z3_cond</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="n">trueval</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">falseval</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">b</span><span class="p">,</span> <span class="n">BitVecVal</span><span class="p">(</span><span class="n">trueval</span><span class="p">),</span> <span class="n">BitVecVal</span><span class="p">(</span><span class="n">falseval</span><span class="p">))</span> + +<span class="k">def</span> <span class="nf">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">):</span> + <span class="c1"># follow the *logic* of abstract_eq, we can't call it due to the ifs in it</span> + <span class="n">case1cond</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k1</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span 
class="n">k2</span><span class="o">.</span><span class="n">ones</span><span class="p">)</span> + <span class="n">case2cond</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">_disagrees</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + + <span class="c1"># ones is 1 in the first case, 0 otherwise</span> + <span class="n">ones</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">case1cond</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> + + <span class="c1"># in the first two cases, unknowns is 0, 1 otherwise</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Or</span><span class="p">(</span><span class="n">case1cond</span><span class="p">,</span> <span class="n">case2cond</span><span class="p">),</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">test_z3_abstract_eq_logic</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">n3</span> <span class="o">=</span> <span class="n">z3_cond</span><span class="p">(</span><span class="n">n1</span> <span class="o">==</span> <span class="n">n2</span><span class="p">)</span> <span 
class="c1"># concrete result</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">n3</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>This proof works. It is a lot less satisfying than the previous ones though, +because we could have made an error in the manual transcription from Python code +to Z3 formulas (there are possibly more heavy-handed approaches where we do +this transformation more automatically using e.g. the <code>ast</code> module to analyze +the source code, but that's a much more complicated researchy project). To +lessen this problem somewhat we can factor out the parts of the logic that don't +have any conditions into small helper methods (like <code>_disagrees</code> in this +example) and use them in the manual conversion of the code to Z3 formulas.<sup id="fnref:tests_vs_proofs"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fn:tests_vs_proofs">2</a></sup></p> +<p>The final condition that Z3 checks, btw, is this one:</p> +<div class="code"><pre class="code literal-block">If(n1 == n2, 1, 0) &amp; +~If(Or(And(n1_unkowns == 0, + n2_unkowns == 0, + n1_ones == n2_ones), + n1_ones &amp; ~n1_unkowns &amp; ~n2_unkowns != + n2_ones &amp; ~n1_unkowns &amp; ~n2_unkowns), + 0, 1) == +If(And(n1_unkowns == 0, n2_unkowns == 0, n1_ones == n2_ones), + 1, 0) +</pre></div> + +<h3 id="making-statements-about-precision">Making Statements about Precision</h3> +<p>So far we have only used Z3 to prove statements about correctness, i.e. that +our abstract operations overapproximate what can happen with concrete values. 
+While proving this property is essential if we want to avoid miscompilation, +correctness alone is not a very strong constraint on the implementation of our +abstract transfer functions. We could simply return <code>KnownBits.unknowns()</code> for +every <code>abstract_*</code> method and the resulting overapproximation would be correct, +but useless in practice.</p> +<p>It's much harder to make statements about whether the transfer functions are +maximally precise. There are two aspects of precision I want to discuss in this +section, however.</p> +<p>The first aspect is that we would really like it if the transfer functions +compute the maximally precise results for singleton sets. If all abstract +arguments of an operation are constants, i.e. contain only a single concrete +element, then we know that the resulting set also has only a single element. We +can prove that all our transfer functions have this property:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_z3_prove_constant_folding</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_invert</span><span class="p">()</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span 
class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_and</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_or</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_sub</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span 
class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> + + <span class="n">k3</span> <span class="o">=</span> <span class="n">z3_abstract_eq</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">)</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_constant</span><span class="p">(),</span> <span class="n">k2</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> + <span class="n">k3</span><span class="o">.</span><span class="n">is_constant</span><span class="p">()),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>Proving with Z3 that the transfer functions are maximally precise for +non-constant arguments seems to be relatively hard. I tried a few completely +rigorous approaches and failed. 
The paper <a href="https://arxiv.org/pdf/2105.05398">Sound, Precise, and Fast Abstract +Interpretation with Tristate Numbers</a> +contains an optimality proof for the transfer functions of addition and +subtraction, so we can be certain that they are as precise as is +possible.</p> +<p>I still want to show an approach for trying to find concrete examples of +abstract values that are less precise than they could be, using a combination +of Hypothesis and Z3. The idea is to use Hypothesis to pick random abstract +values. Then we compute the abstract result using our transfer function. +Afterwards we can ask Z3 to find us an abstract result that is better than the +one our transfer function produced. If Z3 finds a better abstract result, we +have a concrete example of imprecision for our transfer function. Those tests +aren't strict proofs, because they rely on generating random abstract values, +but they can still be valuable (not for the transfer functions in this blog +post, which are all optimal).</p> +<p>Here is what the code looks like (this is a little bit of bonus content; I won't +explain the details and can only hope that the comments are somewhat helpful):</p> +<div class="code"><pre class="code literal-block"><span class="nd">@given</span><span class="p">(</span><span class="n">random_knownbits_and_contained_number</span><span class="p">,</span> <span class="n">random_knownbits_and_contained_number</span><span class="p">)</span> +<span class="nd">@settings</span><span class="p">(</span><span class="n">deadline</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span> +<span class="k">def</span> <span class="nf">test_check_precision</span><span class="p">(</span><span class="n">t1</span><span class="p">,</span> <span class="n">t2</span><span class="p">):</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span> <span class="o">=</span> <span class="n">t1</span> + <span class="n">k2</span><span 
class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">t2</span> + <span class="c1"># apply transfer function</span> + <span class="n">k3</span> <span class="o">=</span> <span class="n">k1</span><span class="o">.</span><span class="n">abstract_add</span><span class="p">(</span><span class="n">k2</span><span class="p">)</span> + <span class="n">example_res</span> <span class="o">=</span> <span class="n">n1</span> <span class="o">+</span> <span class="n">n2</span> + + <span class="c1"># try to find a better version of k3 with Z3</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">solver</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">,</span> <span class="mi">8000</span><span class="p">)</span> + + <span class="n">var1</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'v1'</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'v2'</span><span class="p">)</span> + + <span class="n">ones</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'ones'</span><span class="p">)</span> + <span class="n">unknowns</span> <span class="o">=</span> <span class="n">BitVec</span><span class="p">(</span><span class="s1">'unknowns'</span><span class="p">)</span> + <span class="n">better_k3</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">ones</span><span class="p">,</span> <span class="n">unknowns</span><span class="p">)</span> + <span class="nb">print</span><span class="p">(</span><span class="n">k1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span 
class="n">k3</span><span class="p">)</span> + + <span class="c1"># we're trying to find an example for a better k3, so we use check, without</span> + <span class="c1"># negation:</span> + <span class="n">res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span> + <span class="c1"># better_k3 should be a valid knownbits instance</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">is_well_formed</span><span class="p">(),</span> + <span class="c1"># it should be better than k3, ie there are known bits in better_k3</span> + <span class="c1"># that we don't have in k3</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">knowns</span> <span class="o">&amp;</span> <span class="o">~</span><span class="n">k3</span><span class="o">.</span><span class="n">knowns</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">,</span> + <span class="c1"># now encode the correctness condition for better_k3 with a ForAll:</span> + <span class="c1"># for all concrete values var1 and var2, it must hold that if</span> + <span class="c1"># var1 is in k1 and var2 is in k2 it follows that var1 + var2 is in</span> + <span class="c1"># better_k3</span> + <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">var1</span><span class="p">,</span> <span class="n">var2</span><span class="p">],</span> + <span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span> + <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var1</span><span class="p">),</span> 
<span class="n">k2</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var2</span><span class="p">)),</span> + <span class="n">better_k3</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">var1</span> <span class="o">+</span> <span class="n">var2</span><span class="p">)))))</span> + <span class="c1"># if this query is satisfiable, we have found a better result for the</span> + <span class="c1"># abstract_add</span> + <span class="k">if</span> <span class="n">res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">rk3</span> <span class="o">=</span> <span class="n">KnownBits</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">ones</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">(),</span> <span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">unknowns</span><span class="p">)</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">())</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"better"</span><span class="p">,</span> <span class="n">rk3</span><span class="p">)</span> + <span class="k">assert</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unknown</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">)</span> +</pre></div> + 
+<p>It does not actually fail for <code>abstract_add</code> (nor for the other abstract +functions). To see the test fail, we can add some imprecision to the +implementation of <code>abstract_add</code> and watch Hypothesis and Z3 find examples of +values that are not optimally precise (for example by setting some bits +of <code>unknowns</code> in the implementation of <code>abstract_add</code> unconditionally).</p> +<h3 id="using-the-abstract-domain-in-the-toy-optimizer-for-generalized-constant-folding">Using the Abstract Domain in the Toy Optimizer for Generalized Constant Folding</h3> +<p>Now after all this work we can finally actually use the knownbits abstract +domain in the toy optimizer. The code for this follows <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/">Max's intro post about +abstract interpretation</a> +quite closely.</p> +<p>For completeness' sake, the fold below contains the basic infrastructure classes +that make up the IR again (they are identical or at least extremely close to +those in the previous toy posts).</p> +<details> +<summary>toy infrastructure</summary> + +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Value</span><span class="p">:</span> + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"abstract"</span><span class="p">)</span> + + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">Operation</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> + <span class="n">name</span> <span class="p">:</span> <span class="nb">str</span> + <span class="n">args</span> <span class="p">:</span> <span 
class="nb">list</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> + + <span class="n">forwarded</span> <span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Value</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> + + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Value</span><span class="p">:</span> + <span class="n">op</span> <span class="o">=</span> <span class="bp">self</span> + <span class="k">while</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">):</span> + <span class="nb">next</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">forwarded</span> + <span class="k">if</span> <span class="nb">next</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> + <span class="k">return</span> <span class="n">op</span> + <span class="n">op</span> <span class="o">=</span> <span class="nb">next</span> + <span class="k">return</span> <span class="n">op</span> + + <span class="k">def</span> <span class="nf">arg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="n">index</span><span class="p">]</span><span class="o">.</span><span class="n">find</span><span class="p">()</span> + + <span class="k">def</span> <span class="nf">make_equal_to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> 
+ <span class="bp">self</span><span class="o">.</span><span class="n">find</span><span class="p">()</span><span class="o">.</span><span class="n">forwarded</span> <span class="o">=</span> <span class="n">value</span> + + +<span class="nd">@dataclass</span><span class="p">(</span><span class="n">eq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> +<span class="k">class</span> <span class="nc">Constant</span><span class="p">(</span><span class="n">Value</span><span class="p">):</span> + <span class="n">value</span> <span class="p">:</span> <span class="nb">object</span> + + <span class="k">def</span> <span class="nf">find</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span> + + +<span class="k">class</span> <span class="nc">Block</span><span class="p">(</span><span class="nb">list</span><span class="p">):</span> + <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">opname</span><span class="p">):</span> + <span class="k">def</span> <span class="nf">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">):</span> + <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Value</span><span class="p">):</span> + <span class="n">arg</span> <span class="o">=</span> <span class="n">Constant</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> + <span class="k">return</span> <span class="n">arg</span> + <span class="k">def</span> <span class="nf">make_op</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">):</span> + <span class="n">op</span> <span class="o">=</span> <span class="n">Operation</span><span class="p">(</span><span 
class="n">opname</span><span class="p">,</span> + <span class="p">[</span><span class="n">wraparg</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span> + <span class="bp">self</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">op</span> + <span class="k">return</span> <span class="n">make_op</span> + + +<span class="k">def</span> <span class="nf">bb_to_str</span><span class="p">(</span><span class="n">l</span> <span class="p">:</span> <span class="n">Block</span><span class="p">,</span> <span class="n">varprefix</span> <span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"var"</span><span class="p">):</span> + <span class="k">def</span> <span class="nf">arg_to_str</span><span class="p">(</span><span class="n">arg</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="nb">str</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> <span class="n">varnames</span><span class="p">[</span><span class="n">arg</span><span class="p">]</span> + + <span class="n">varnames</span> <span class="o">=</span> <span class="p">{}</span> + <span class="n">res</span> <span class="o">=</span> <span class="p">[]</span> + <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">op</span> <span 
class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">l</span><span class="p">):</span> + <span class="c1"># give the operation a name used while</span> + <span class="c1"># printing:</span> + <span class="n">var</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">varprefix</span><span class="si">}{</span><span class="n">index</span><span class="si">}</span><span class="s2">"</span> + <span class="n">varnames</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">var</span> + <span class="n">arguments</span> <span class="o">=</span> <span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> + <span class="n">arg_to_str</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">))</span> + <span class="p">)</span> + <span class="n">strop</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">var</span><span class="si">}</span><span class="s2"> = </span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">arguments</span><span class="si">}</span><span class="s2">)"</span> + <span class="n">res</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">strop</span><span class="p">)</span> + <span 
class="k">return</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">res</span><span class="p">)</span> +</pre></div> + + + +</details> + +<p>Now we can write some first tests, the first one simply checking constant +folding:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_constfold_two_ops</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span 
class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_add(19, optvar0)"""</span> +</pre></div> + +<p>Calling the transfer functions on constant <code>KnownBits</code> produces constant +results, as we have seen. Therefore "regular" constant folding should hopefully +be achieved by optimizing with the <code>KnownBits</code> abstract domain too.</p> +<p>The next two tests are slightly more complicated and can't be optimized by +regular constant-folding. They follow the motivating examples from the start of +this blog post, a hundred years ago:</p> +<div class="code"><pre class="code literal-block"><span class="n">def</span><span class="w"> </span><span class="n">test_constfold_via_knownbits</span><span class="p">():</span> +<span class="w"> </span><span class="n">bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Block</span><span class="p">()</span> +<span class="w"> </span><span class="n">var0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="n">var1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_or</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span> +<span class="w"> </span><span class="n">var2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span> +<span class="w"> </span><span 
class="n">var3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> + +<span class="w"> </span><span class="n">opt_bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<span class="w"> </span><span class="nb">assert</span><span class="w"> </span><span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span><span class="w"> </span><span class="s2">"optvar"</span><span class="p">)</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_or(optvar0, 1)</span> +<span class="s2">optvar2 = dummy(1)"""</span> + +<span class="n">def</span><span class="w"> </span><span class="n">test_constfold_alignment_check</span><span class="p">():</span> +<span class="w"> </span><span class="n">bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Block</span><span class="p">()</span> +<span class="w"> </span><span class="n">var0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="n">var1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="mi">0</span><span class="n">b111</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># mask off the lowest three bits, thus var2 is 
aligned</span> +<span class="w"> </span><span class="n">var2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span><span class="w"> </span><span class="n">var1</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># add 16 to aligned quantity</span> +<span class="w"> </span><span class="n">var3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_add</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span><span class="w"> </span><span class="mi">16</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># check alignment of result</span> +<span class="w"> </span><span class="n">var4</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var3</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="n">b111</span><span class="p">)</span> +<span class="w"> </span><span class="n">var5</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">int_eq</span><span class="p">(</span><span class="n">var4</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span> +<span class="w"> </span><span class="c1"># var5 should be const-folded to 1</span> +<span class="w"> </span><span class="n">var6</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var5</span><span class="p">)</span> + +<span class="w"> 
</span><span class="n">opt_bb</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> +<span class="w"> </span><span class="nb">assert</span><span class="w"> </span><span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span><span class="w"> </span><span class="s2">"optvar"</span><span class="p">)</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_and(optvar0, -8)</span> +<span class="s2">optvar2 = int_add(optvar1, 16)</span> +<span class="s2">optvar3 = dummy(1)"""</span> +</pre></div> + +<p>Here is <code>simplify</code> to make these tests pass:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">unknown_transfer_functions</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">all_unknown</span><span class="p">()</span> + + +<span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">abstract_values</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># dict mapping Operation to KnownBits</span> + + <span class="k">def</span> <span class="nf">knownbits_of</span><span class="p">(</span><span class="n">val</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span 
class="n">val</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">KnownBits</span><span class="o">.</span><span class="n">from_constant</span><span class="p">(</span><span class="n">val</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">val</span><span class="p">]</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> + <span class="c1"># apply the transfer function on the abstract arguments</span> + <span class="n">name_without_prefix</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="o">.</span><span class="n">removeprefix</span><span class="p">(</span><span class="s2">"int_"</span><span class="p">)</span> + <span class="n">method_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"abstract_</span><span class="si">{</span><span class="n">name_without_prefix</span><span class="si">}</span><span class="s2">"</span> + <span class="n">transfer_function</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">,</span> <span class="n">method_name</span><span class="p">,</span> <span class="n">unknown_transfer_functions</span><span class="p">)</span> + <span class="n">abstract_args</span> <span class="o">=</span> <span class="p">[</span><span class="n">knownbits_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span 
class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">abstract_res</span> <span class="o">=</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer_function</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">)</span> + <span class="c1"># if the result is a constant, we optimize the operation away and make</span> + <span class="c1"># it equal to the constant result</span> + <span class="k">if</span> <span class="n">abstract_res</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">abstract_res</span><span class="o">.</span><span class="n">ones</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># otherwise emit the op</span> + <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> + +<p>The code follows the approach from the previous blog post very closely. The +only difference is that we apply the transfer function <em>first</em>, to be able to +detect whether the abstract domain can tell us that the result has to always be +a constant. This code makes all three tests pass.</p> +<h3 id="using-the-knownbits-domain-for-conditional-peephole-rewrites">Using the <code>KnownBits</code> Domain for Conditional Peephole Rewrites</h3> +<p>So far we are only using the <code>KnownBits</code> domain to find out that certain +operations have to produce a constant. 
We can also use the <code>KnownBits</code> domain +to check whether certain operation rewrites are correct. Let's use one of the +examples from the <a href="https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html">Mining JIT traces for missing optimizations with +Z3</a> +post, where Z3 found the inefficiency <code>(x &lt;&lt; 4) &amp; -0xf == x &lt;&lt; 4</code> in PyPy JIT +traces. We don't have shift operations, but we want to generalize this optimization +anyway. The general form of this rewrite is that under some circumstances <code>x &amp; +y == x</code>, and we can use the <code>KnownBits</code> domain to detect situations where this +must be true.</p> +<p>To understand <em>when</em> <code>x &amp; y == x</code> is true, we can think about individual pairs of +bits <code>a</code> and <code>b</code>. If <code>a == 0</code>, then <code>a &amp; b == 0 &amp; b == 0 == a</code>. If <code>b == 1</code> +then <code>a &amp; b == a &amp; 1 == a</code>. So if either <code>a == 0</code> or <code>b == 1</code> is true, +<code>a &amp; b == a</code> follows. 
And if either of these conditions is true for <em>all</em> the +bits of <code>x</code> and <code>y</code>, we can know that <code>x &amp; y == x</code>.</p> +<p>We can write a method on <code>KnownBits</code> to check for this condition:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">KnownBits</span><span class="p">:</span> + <span class="o">...</span> + + <span class="k">def</span> <span class="nf">is_and_identity</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Return True if n1 &amp; n2 == n1 for any n1 in self and n2 in other.</span> +<span class="sd"> (or, equivalently, return True if n1 | n2 == n2)"""</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">zeros</span> <span class="o">|</span> <span class="n">other</span><span class="o">.</span><span class="n">ones</span> <span class="o">==</span> <span class="o">-</span><span class="mi">1</span> +</pre></div> + +<p>Since my reasoning about this feels ripe for errors, let's check that our +understanding is correct with Z3:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_prove_is_and_identity</span><span class="p">():</span> + <span class="n">solver</span><span class="p">,</span> <span class="n">k1</span><span class="p">,</span> <span class="n">n1</span><span class="p">,</span> <span class="n">k2</span><span class="p">,</span> <span class="n">n2</span> <span class="o">=</span> <span class="n">z3_setup_variables</span><span class="p">()</span> + <span class="n">prove</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span><span class="n">k1</span><span class="o">.</span><span class="n">is_and_identity</span><span class="p">(</span><span 
class="n">k2</span><span class="p">),</span> <span class="n">n1</span> <span class="o">&amp;</span> <span class="n">n2</span> <span class="o">==</span> <span class="n">n1</span><span class="p">),</span> <span class="n">solver</span><span class="p">)</span> +</pre></div> + +<p>Now let's use this in the toy optimizer. Here are two tests for this rewrite:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_remove_redundant_and</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># mask off the lowest four bits</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> + <span class="c1"># applying the same mask is now redundant</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var2</span><span class="p">,</span> <span class="n">var1</span><span class="p">)</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var3</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span 
class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = int_and(optvar0, -16)</span> +<span class="s2">optvar2 = dummy(optvar1)"""</span> + +<span class="k">def</span> <span class="nf">test_remove_redundant_and_more_complex</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="c1"># var2 has bit pattern ????</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var0</span><span class="p">,</span> <span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># var3 has bit pattern ...?1111</span> + <span class="n">var3</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_or</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="mb">0b1111</span><span class="p">)</span> + <span class="c1"># var4 is just var2</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span 
class="n">var2</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> + <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var4</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = getarg(1)</span> +<span class="s2">optvar2 = int_and(optvar0, 15)</span> +<span class="s2">optvar3 = int_or(optvar1, 15)</span> +<span class="s2">optvar4 = dummy(optvar2)"""</span> +</pre></div> + +<p>The first test could also be made to pass by implementing a reassociation +optimization that turns <code>(x &amp; c1) &amp; c2</code> into <code>x &amp; (c1 &amp; c2)</code> and then constant-folds the second <code>and</code>. But here we want to +use <code>KnownBits</code> and conditionally rewrite <code>int_and</code> to its first argument. 
So to make the tests pass, +we can change <code>simplify</code> like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">abstract_values</span> <span class="o">=</span> <span class="p">{}</span> <span class="c1"># dict mapping Operation to KnownBits</span> + + <span class="k">def</span> <span class="nf">knownbits_of</span><span class="p">(</span><span class="n">val</span> <span class="p">:</span> <span class="n">Value</span><span class="p">):</span> + <span class="o">...</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">bb</span><span class="p">:</span> + <span class="c1"># apply the transfer function on the abstract arguments</span> + <span class="n">name_without_prefix</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="o">.</span><span class="n">removeprefix</span><span class="p">(</span><span class="s2">"int_"</span><span class="p">)</span> + <span class="n">method_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">"abstract_</span><span class="si">{</span><span class="n">name_without_prefix</span><span class="si">}</span><span class="s2">"</span> + <span class="n">transfer_function</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">KnownBits</span><span class="p">,</span> <span class="n">method_name</span><span class="p">,</span> <span class="n">unknown_transfer_functions</span><span class="p">)</span> + <span class="n">abstract_args</span> <span 
class="o">=</span> <span class="p">[</span><span class="n">knownbits_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">abstract_res</span> <span class="o">=</span> <span class="n">abstract_values</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer_function</span><span class="p">(</span><span class="o">*</span><span class="n">abstract_args</span><span class="p">)</span> + <span class="c1"># if the result is a constant, we optimize the operation away and make</span> + <span class="c1"># it equal to the constant result</span> + <span class="k">if</span> <span class="n">abstract_res</span><span class="o">.</span><span class="n">is_constant</span><span class="p">():</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="n">abstract_res</span><span class="o">.</span><span class="n">ones</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># &lt;&lt;&lt;&lt; new code</span> + <span class="c1"># conditionally rewrite int_and(x, y) to x</span> + <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> + <span class="n">k1</span><span class="p">,</span> <span class="n">k2</span> <span class="o">=</span> <span class="n">abstract_args</span> + <span class="k">if</span> <span class="n">k1</span><span class="o">.</span><span class="n">is_and_identity</span><span class="p">(</span><span class="n">k2</span><span class="p">):</span> + 
<span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> + <span class="k">continue</span> + <span class="c1"># &gt;&gt;&gt;&gt; end changes</span> + <span class="n">opt_bb</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="k">return</span> <span class="n">opt_bb</span> +</pre></div> + +<p>And with that, the new tests pass as well. A real implementation would also +check the other argument order, but we leave that out for the sake of brevity.</p> +<p>This rewrite also generalizes the <a href="https://pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html">rewrites</a> <code>int_and(0, x) -&gt; 0</code> and +<code>int_and(-1, x) -&gt; x</code>, let's add a test for those:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_remove_and_simple</span><span class="p">():</span> + <span class="n">bb</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="n">var0</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">var1</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="n">var2</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">var0</span><span class="p">)</span> <span class="c1"># == 0</span> + <span class="n">var3</span> <span 
class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_invert</span><span class="p">(</span><span class="n">var2</span><span class="p">)</span> <span class="c1"># == -1</span> + <span class="n">var4</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">int_and</span><span class="p">(</span><span class="n">var1</span><span class="p">,</span> <span class="n">var3</span><span class="p">)</span> <span class="c1"># == var1</span> + <span class="n">var5</span> <span class="o">=</span> <span class="n">bb</span><span class="o">.</span><span class="n">dummy</span><span class="p">(</span><span class="n">var4</span><span class="p">)</span> + + <span class="n">opt_bb</span> <span class="o">=</span> <span class="n">simplify</span><span class="p">(</span><span class="n">bb</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">bb_to_str</span><span class="p">(</span><span class="n">opt_bb</span><span class="p">,</span> <span class="s2">"optvar"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"""</span><span class="se">\</span> +<span class="s2">optvar0 = getarg(0)</span> +<span class="s2">optvar1 = getarg(1)</span> +<span class="s2">optvar2 = dummy(optvar1)"""</span> +</pre></div> + +<p>This test just passes. 
And that's it for this post!</p> +<h3 id="conclusion">Conclusion</h3> +<p>In this post we've seen the implementation, testing and proofs about a 'known +bits' abstract domain, as well as its use in the toy optimizer to generalize +constant folding, and to implement conditional peephole rewrites.</p> +<p>In the next posts I'll write about the real implementation of a knownbits +domain in PyPy's JIT, its combination with the existing interval abstract +domain, how to deal with gaining information from conditions in the program, +and some loose ends.</p> +<p>Sources:</p> +<ul> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/Support/KnownBits.cpp">Known bits in LLVM</a></li> +<li><a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">Tristate numbers for known bits in Linux eBPF</a></li> +<li><a href="https://arxiv.org/abs/2105.05398">Sound, Precise, and Fast Abstract Interpretation with Tristate Numbers</a></li> +<li><a href="https://people.cs.rutgers.edu/~sn349/papers/agni-cav2023.pdf">Verifying the Verifier: eBPF Range Analysis Verification</a></li> +<li><a href="https://dougallj.wordpress.com/2020/01/13/bit-twiddling-addition-with-unknown-bits/">Bit-Twiddling: Addition with Unknown + Bits</a> + is a super readable blog post by Dougall J. I've taken the <code>ones</code> and + <code>unknowns</code> naming from this post, which I find significantly clearer than + <code>value</code> and <code>mask</code>, which the Linux kernel uses.</li> +<li><a href="https://bitmath.blogspot.com/">Bits, Math and Performance(?)</a>, a fantastic + blog by <a href="https://mastodon.gamedev.place/@harold">Harold Aptroot</a>. There are a + lot of relevant posts about known bits, range analysis etc. 
Harold is also + the author of <a href="http://haroldbot.nl/">Haroldbot</a>, a website that can be used + for bitvector calculations, and also checks bitvector identities.</li> +<li><a href="https://cea.hal.science/cea-01795779/document">Sharpening Constraint Programming approaches for Bit-Vector Theory</a></li> +<li><a href="https://users.cs.utah.edu/~regehr/papers/lctes06_2/fp019-regehr.pdf">Deriving Abstract Transfer Functions for Analyzing Embedded Software</a></li> +<li><a href="https://arxiv.org/abs/2105.00493">Synthesizing Abstract Transformers</a></li> +</ul> +<div class="footnote"> +<hr> +<ol> +<li id="fn:proof_bitwidths"> +<p>There's a subtlety about the Z3 proofs that I'm sort of +glossing over here. Python integers are of arbitrary width, and the +<code>KnownBits</code> code is actually carefully written to work for integers of any +size. This property is tested by the Hypothesis tests, which don't limit +the sizes of the generated random integers. However, the Z3 proofs only +check bitvectors of a fixed bitwidth of 64. There are various ways to deal +with this situation. For most "real" compilers, the bitwidth of integers +would be fixed anyway. Then the components <code>ones</code> and <code>unknowns</code> of the +<code>KnownBits</code> class would use the number of bits the corresponding integer +variable has, and the Z3 proofs would use the same width. This is what we +do in the PyPy JIT. <a class="footnote-backref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fnref:proof_bitwidths" title="Jump back to footnote 1 in the text">↩</a></p> +</li> +<li id="fn:tests_vs_proofs"> +<p>The less close connection between implementation and proof +for <code>abstract_eq</code> is one of the reasons why it makes sense to do +unit-testing <em>in addition</em> to proofs. 
For a more detailed explanation of +why both tests and proofs are good to +have, see <a href="https://siek.blogspot.com/2024/06/data-structures-and-algorithms-correctly.html#correct-software-via-write-test-and-prove:~:text=We%20recognize%20that%20once%20step,detect%20most%20of%20the%20bugs">Jeremy Siek's blog +post</a>, +as well as the <a href="https://www-cs-faculty.stanford.edu/~knuth/faq.html#:~:text=What's%20the%20exact%20citation%20of%20your%20oft%2Dcited%20comment%20about%20bugs?">Knuth +quote</a>. <a class="footnote-backref" href="https://www.pypy.org/posts/2024/08/toy-knownbits.html#fnref:tests_vs_proofs" title="Jump back to footnote 2 in the text">↩</a></p> +</li> +</ol> +</div>toy-optimizerz3https://www.pypy.org/posts/2024/08/toy-knownbits.htmlSat, 03 Aug 2024 14:00:00 GMTAbstract interpretation in the Toy Optimizerhttps://www.pypy.org/posts/2024/07/toy-abstract-interpretation.htmlMax Bernstein<p>This is a <a href="https://bernsteinbear.com/blog/toy-abstract-interpretation/" rel="canonical">cross-post</a> +from Max Bernstein from his excellent blog where he writes about programming +languages, compilers, optimizations, virtual machines. He's looking for a +(dynamic language runtime or compiler related) job too.</p> +<hr> +<p>CF Bolz-Tereick wrote some excellent posts in which they <a href="https://pypy.org/posts/2022/07/toy-optimizer.html">introduce a small IR +and optimizer</a> and <a href="https://pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html">extend it with allocation +removal</a>. We also did a live stream together in which +we did <a href="https://www.youtube.com/watch?v=w-UHg0yOPSE">some more heap optimizations</a>.</p> +<p>In this blog post, I'm going to write a small abstract interpreter for the Toy +IR and then show how we can use it to do some simple optimizations. 
It assumes +that you are familiar with the little IR, which I have reproduced unchanged in +<a href="https://gist.github.com/tekknolagi/4425b28d5267e7bae8b0d7ef8fb4a671">a GitHub Gist</a>.</p> +<p>Abstract interpretation is a general framework for efficiently computing +properties that must be true for all possible executions of a program. It's a +widely used approach both in compiler optimizations as well as offline static +analysis for finding bugs. I'm writing this post to pave the way for CF's next +post on proving abstract interpreters correct for range analysis and known bits +analysis inside PyPy.</p> +<p>Before we begin, I want to note a couple of things:</p> +<ul> +<li>The Toy IR is in SSA form, which means that every variable is defined exactly + once. This means that abstract properties of each variable are easy to track.</li> +<li>The Toy IR represents a linear trace without control flow, meaning we won't + talk about meet/join or fixpoints. They only make sense if the IR has a + notion of conditional branches or back edges (loops).</li> +</ul> +<p>Alright, let's get started.</p> +<h3 id="welcome-to-abstract-interpretation">Welcome to abstract interpretation</h3> +<p>Abstract interpretation means a couple different things to different people. +There's rigorous mathematical formalism thanks to Patrick and Radhia Cousot, +our favorite power couple, and there's also sketchy hand-wavy stuff like what +will follow in this post. In the end, all people are trying to do is reason +about program behavior without running it.</p> +<p>In particular, abstract interpretation is an <em>over-approximation</em> of the +behavior of a program. Correctly implemented abstract interpreters never lie, +but they might be a little bit pessimistic. This is because instead of using +real values and running the program---which would produce a concrete result and +some real-world behavior---we "run" the program with a parallel universe of +<em>abstract</em> values. 
This abstract run gives us information about all possible +runs of the program.<sup id="fnref:logozzo"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fn:logozzo">1</a></sup></p> +<p>Abstract values always represent sets of concrete values. Instead of literally +storing a set (in the world of integers, for example, it could get pretty +big...there are a lot of integers), we group them into a finite number of named +subsets.<sup id="fnref:lattices"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fn:lattices">2</a></sup></p> +<p>Let's learn a little about abstract interpretation with an example program and +example abstract domain. Here's the example program:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span> <span class="o">=</span> <span class="mi">2</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +</pre></div> + +<p>And our abstract domain is "is the number positive" (where "positive" means +nonnegative, but I wanted to keep the words distinct):</p> +<div class="code"><pre class="code literal-block"> top + / \ +positive negative + \ / + bottom +</pre></div> + +<p>The special <em>top</em> value means "I don't know" and the special <em>bottom</em> value +means "empty set" or "unreachable". 
The <em>positive</em> and <em>negative</em> values +represent the sets of all positive and negative numbers, respectively.</p> +<p>We initialize all the variables <code>v0</code>, <code>v1</code>, and <code>v2</code> to <em>bottom</em> and then walk +our IR, updating our knowledge as we go.</p> +<div class="code"><pre class="code literal-block"><span class="c1"># here</span> +<span class="n">v0</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="mi">2</span> +<span class="n">v2</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +</pre></div> + +<p>In order to do that, we have to have <em>transfer functions</em> for each operation. +For constants, the transfer function is easy: determine if the constant is +positive or negative. For other operations, we have to define a function that +takes the abstract values of the operands and returns the abstract value of the +result.</p> +<p>In order to be correct, transfer functions for operations have to be compatible +with the behavior of their corresponding concrete implementations. 
You can +think of them having an implicit universal quantifier <em>forall</em> in front of +them.</p> +<p>Let's step through the constants at least:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">2</span> +<span class="c1"># here</span> +<span class="n">v2</span><span class="p">:</span><span class="n">bottom</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +</pre></div> + +<p>Now we need to figure out the transfer function for <code>add</code>. It's kind of tricky +right now because we haven't specified our abstract domain very well. I keep +saying "numbers", but what kinds of numbers? Integers? Real numbers? Floating +point? Some kind of fixed-width bit vector (<code>int8</code>, <code>uint32</code>, ...) like an +actual machine "integer"?</p> +<p>For this post, I am going to use the mathematical definition of integer, which +means that the values are not bounded in size and therefore do not overflow. 
+Actual hardware memory constraints aside, this is kind of like a Python <code>int</code>.</p> +<p>So let's look at what happens when we add two abstract numbers:</p> +<table> +<thead> +<tr> +<th></th> +<th>top</th> +<th>positive</th> +<th>negative</th> +<th>bottom</th> +</tr> +</thead> +<tbody> +<tr> +<td><strong>top</strong></td> +<td>top</td> +<td>top</td> +<td>top</td> +<td>bottom</td> +</tr> +<tr> +<td><strong>positive</strong></td> +<td>top</td> +<td>positive</td> +<td>top</td> +<td>bottom</td> +</tr> +<tr> +<td><strong>negative</strong></td> +<td>top</td> +<td>top</td> +<td>negative</td> +<td>bottom</td> +</tr> +<tr> +<td><strong>bottom</strong></td> +<td>bottom</td> +<td>bottom</td> +<td>bottom</td> +<td>bottom</td> +</tr> +</tbody> +</table> +<p>As an example, let's try to add two numbers <code>a</code> and <code>b</code>, where <code>a</code> is positive +and <code>b</code> is negative. We don't know anything about their values other than their +signs. They could be <code>5</code> and <code>-3</code>, where the result is <code>2</code>, or they could be +<code>1</code> and <code>-100</code>, where the result is <code>-99</code>. This is why we can't say anything +about the result of this operation and have to return <em>top</em>.</p> +<p>The short of this table is that we only really know the result of an addition +if both operands are positive or both operands are negative. Thankfully, in +this example, both operands are known positive. 
So we can learn something about +<code>v2</code>:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">1</span> +<span class="n">v1</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="mi">2</span> +<span class="n">v2</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="n">v1</span><span class="p">)</span> +<span class="c1"># here</span> +</pre></div> + +<p>This may not seem useful in isolation, but analyzing more complex programs even +with this simple domain may be able to remove checks such as <code>if (v2 &lt; 0) { ... }</code>.</p> +<p>Let's take a look at another example using a sample <code>absval</code> (absolute value) +IR operation:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v0</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v4</span><span 
class="p">)</span> +</pre></div> + +<p>Even though we have no constant/concrete values, we can still learn something +about the states of values throughout the program. Since we know that <code>absval</code> +always returns a positive number, we learn that <code>v2</code>, <code>v3</code>, and <code>v4</code> are all +positive. This means that we can optimize out the <code>absval</code> operation on <code>v5</code>:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v0</span><span class="p">)</span> +<span class="n">v3</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">absval</span><span class="p">(</span><span class="n">v1</span><span class="p">)</span> +<span class="n">v4</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span><span class="p">:</span><span class="n">positive</span> <span class="o">=</span> <span class="n">v4</span> +</pre></div> + +<p>Other interesting lattices include:</p> +<ul> +<li>Constants (where the middle row is pretty wide)</li> +<li>Range analysis (bounds on min and max of a number)</li> +<li>Known bits (using a bitvector representation of a number, which bits are + always 0 or 1)</li> +</ul> +<p>For the 
rest of this blog post, we are going to do a very limited version of +"known bits", called <em>parity</em>. This analysis only tracks the least significant +bit of a number, which indicates if it is even or odd.</p> +<h3 id="parity">Parity</h3> +<p>The lattice is pretty similar to the positive/negative lattice:</p> +<div class="code"><pre class="code literal-block"> top + / \ +even odd + \ / + bottom +</pre></div> + +<p>Let's define a data structure to represent this in Python code:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> + + <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> +</pre></div> + +<p>And instantiate the members of the lattice:</p> +<div class="code"><pre class="code literal-block"><span class="n">TOP</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"top"</span><span class="p">)</span> +<span class="n">EVEN</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"even"</span><span class="p">)</span> +<span class="n">ODD</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"odd"</span><span class="p">)</span> +<span class="n">BOTTOM</span> <span class="o">=</span> <span class="n">Parity</span><span class="p">(</span><span class="s2">"bottom"</span><span class="p">)</span> +</pre></div> + +<p>Now 
let's write a forward flow analysis of a basic block using this lattice. +We'll do that by assuming that a method on <code>Parity</code> is defined for each IR +operation. For example, <code>Parity.add</code>, <code>Parity.lshift</code>, etc.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">analyze</span><span class="p">(</span><span class="n">block</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span> + <span class="n">parity</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">BOTTOM</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">block</span><span class="p">}</span> + + <span class="k">def</span> <span class="nf">parity_of</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Parity</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">parity</span><span class="p">[</span><span class="n">value</span><span class="p">]</span> + + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">block</span><span class="p">:</span> + <span class="n">transfer</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">Parity</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> + <span class="n">args</span> <span class="o">=</span> <span 
class="p">[</span><span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">parity</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> +</pre></div> + +<p>For every operation, we compute the abstract value---the parity---of the +arguments and then call the corresponding method on <code>Parity</code> to get the +abstract result.</p> +<!-- TODO maybe learn more about different IRs and how they do constants. +apparently pypy/llvm are free-floating; cinder is not --> +<p>We need to special case <code>Constant</code>s due to a quirk of how the Toy IR is +constructed: the constants don't appear in the instruction stream and instead +are free-floating.</p> +<p>Let's start by looking at the abstraction function for concrete +values---constants:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="c1"># ...</span> + <span class="nd">@staticmethod</span> + <span class="k">def</span> <span class="nf">const</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="n">value</span><span class="o">.</span><span class="n">value</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> <span class="n">ODD</span> 
+</pre></div> + +<p>Seems reasonable enough. Let's pause on operations for a moment and consider an +example program:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="n">v4</span><span class="p">)</span> +</pre></div> + +<p>This function (which is admittedly a little contrived) takes two inputs, shifts +them left by one bit, and adds the results. It then passes the sum into a <code>dummy</code> function, +which you can think of as "return" or "escape".</p> +<p>To do some abstract interpretation on this program, we'll need to implement the +transfer functions for <code>lshift</code> and <code>add</code> (<code>dummy</code> will just always return +<code>TOP</code>). We'll start with <code>add</code>. 
Remember that adding two even numbers returns +an even number, adding two odd numbers returns an even number, and mixing even +and odd returns an odd number.</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="c1"># ...</span> + <span class="k">def</span> <span class="nf">add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">BOTTOM</span> <span class="ow">or</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">BOTTOM</span><span class="p">:</span> + <span class="k">return</span> <span class="n">BOTTOM</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">TOP</span> <span class="ow">or</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">TOP</span><span class="p">:</span> + <span class="k">return</span> <span class="n">TOP</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">EVEN</span> <span class="ow">and</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">EVEN</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">ODD</span> <span class="ow">and</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">ODD</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">return</span> <span class="n">ODD</span> +</pre></div> + +<p>We also need to fill in the other cases where the operands are <em>top</em> or +<em>bottom</em>. 
In this case, they are both "contagious"; if either operand is +bottom, the result is as well. If neither is bottom but either operand is top, +the result is as well.</p> +<p>Now let's look at <code>lshift</code>. Shifting any number left by a non-zero number of +bits will always result in an even number, but we need to be careful about the +zero case! Shifting by zero doesn't change the number at all. Unfortunately, +since our lattice has no notion of zero, we have to over-approximate here:</p> +<div class="code"><pre class="code literal-block"><span class="k">class</span> <span class="nc">Parity</span><span class="p">:</span> + <span class="c1"># ...</span> + <span class="k">def</span> <span class="nf">lshift</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> + <span class="c1"># self &lt;&lt; other</span> + <span class="k">if</span> <span class="n">other</span> <span class="ow">is</span> <span class="n">ODD</span><span class="p">:</span> + <span class="k">return</span> <span class="n">EVEN</span> + <span class="k">return</span> <span class="n">TOP</span> +</pre></div> + +<p>This means that we will miss some opportunities to optimize, but it's a +tradeoff that's just part of the game. (We could also add more elements to our +lattice, but that's a topic for another day.)</p> +<p>Now, if we run our abstract interpretation, we'll collect some interesting +properties about the program. 
If we temporarily hack on the internals of +<code>bb_to_str</code>, we can print out parity information alongside the IR operations:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span><span class="p">:</span><span class="n">even</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span><span class="p">:</span><span class="n">even</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span><span class="p">:</span><span class="n">even</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span><span class="p">:</span><span class="n">top</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="n">v4</span><span class="p">)</span> +</pre></div> + +<p>This is pretty awesome, because we can see that <code>v4</code>, the result of the +addition, is <em>always</em> even. Maybe we can do something with that information.</p> +<h3 id="optimization">Optimization</h3> +<p>One way that a program might check if a number is odd is by checking the least +significant bit. This is a common pattern in C code, where you might see code +like <code>y = x &amp; 1</code>. 
Let's introduce a <code>bitand</code> IR operation that acts like the +<code>&amp;</code> operator in C/Python. Here is an example of use of it in our program:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v5</span> <span class="o">=</span> <span class="n">bitand</span><span class="p">(</span><span class="n">v4</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="c1"># new!</span> +<span class="n">v6</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="n">v5</span><span class="p">)</span> +</pre></div> + +<p>We'll hold off on implementing the transfer function for it---that's left as an +exercise for the reader---and instead do something different.</p> +<p>Instead, we'll see if we can optimize operations of the form <code>bitand(X, 1)</code>. 
If +we statically know the parity as a result of abstract interpretation, we can +replace the <code>bitand</code> with a constant <code>0</code> or <code>1</code>.</p> +<p>We'll first modify the <code>analyze</code> function (and rename it) to return a new +<code>Block</code> containing optimized instructions:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">block</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">parity</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">BOTTOM</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">block</span><span class="p">}</span> + + <span class="k">def</span> <span class="nf">parity_of</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Parity</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">parity</span><span class="p">[</span><span class="n">value</span><span class="p">]</span> + + <span class="n">result</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">block</span><span class="p">:</span> + <span class="c1"># TODO: Optimize op</span> + <span class="c1"># Emit</span> + <span class="n">result</span><span class="o">.</span><span 
class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="c1"># Analyze</span> + <span class="n">transfer</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">Parity</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> + <span class="n">args</span> <span class="o">=</span> <span class="p">[</span><span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">parity</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> + <span class="k">return</span> <span class="n">result</span> +</pre></div> + +<p>We're approaching this the way that PyPy does things under the hood, which is +all in roughly a single pass. It tries to optimize an instruction away, and if +it can't, it copies it into the new block.</p> +<p>Now let's add in the <code>bitand</code> optimization. It's mostly some gross-looking +pattern matching that checks if the right hand side of a bitwise <code>and</code> +operation is <code>1</code> (TODO: the left hand side, too). 
CF had some neat ideas on how +to make this more ergonomic, which I might save for later.<sup id="fnref:match-args"><a class="footnote-ref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fn:match-args">3</a></sup></p> +<p>Then, if we know the parity, optimize the <code>bitand</code> into a constant.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">simplify</span><span class="p">(</span><span class="n">block</span><span class="p">:</span> <span class="n">Block</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Block</span><span class="p">:</span> + <span class="n">parity</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">BOTTOM</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">block</span><span class="p">}</span> + + <span class="k">def</span> <span class="nf">parity_of</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Constant</span><span class="p">):</span> + <span class="k">return</span> <span class="n">Parity</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> + <span class="k">return</span> <span class="n">parity</span><span class="p">[</span><span class="n">value</span><span class="p">]</span> + + <span class="n">result</span> <span class="o">=</span> <span class="n">Block</span><span class="p">()</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">block</span><span class="p">:</span> + <span class="c1"># Try to simplify</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span 
class="n">op</span><span class="p">,</span> <span class="n">Operation</span><span class="p">)</span> <span class="ow">and</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="s2">"bitand"</span><span class="p">:</span> + <span class="n">arg</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> + <span class="n">mask</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">arg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> + <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">mask</span><span class="p">,</span> <span class="n">Constant</span><span class="p">)</span> <span class="ow">and</span> <span class="n">mask</span><span class="o">.</span><span class="n">value</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> + <span class="k">if</span> <span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="ow">is</span> <span class="n">EVEN</span><span class="p">:</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> + <span class="k">continue</span> + <span class="k">elif</span> <span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> <span class="ow">is</span> <span class="n">ODD</span><span class="p">:</span> + <span class="n">op</span><span class="o">.</span><span class="n">make_equal_to</span><span class="p">(</span><span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span> + <span 
class="k">continue</span> + <span class="c1"># Emit</span> + <span class="n">result</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> + <span class="c1"># Analyze</span> + <span class="n">transfer</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">Parity</span><span class="p">,</span> <span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> + <span class="n">args</span> <span class="o">=</span> <span class="p">[</span><span class="n">parity_of</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">find</span><span class="p">())</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">]</span> + <span class="n">parity</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">transfer</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> + <span class="k">return</span> <span class="n">result</span> +</pre></div> + +<p>Remember: because we use union-find to rewrite instructions in the optimizer +(<code>make_equal_to</code>), later uses of the same instruction get the new +optimized version "for free" (<code>find</code>).</p> +<p>Let's see how it works on our IR:</p> +<div class="code"><pre class="code literal-block"><span class="n">v0</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +<span class="n">v1</span> <span class="o">=</span> <span class="n">getarg</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> +<span class="n">v2</span> <span class="o">=</span> <span class="n">lshift</span><span 
class="p">(</span><span class="n">v0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v3</span> <span class="o">=</span> <span class="n">lshift</span><span class="p">(</span><span class="n">v1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> +<span class="n">v4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="n">v2</span><span class="p">,</span> <span class="n">v3</span><span class="p">)</span> +<span class="n">v6</span> <span class="o">=</span> <span class="n">dummy</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> +</pre></div> + +<p>Hey, neat! <code>bitand</code> disappeared and the argument to <code>dummy</code> is now the constant +<code>0</code> because we know the lowest bit.</p> +<h3 id="wrapping-up">Wrapping up</h3> +<p>Hopefully you have gained a little bit of an intuitive understanding of +abstract interpretation. Last year, being able to write some code made me more +comfortable with the math. Now being more comfortable with the math is helping +me write the code. It's a nice upward spiral.</p> +<p>The two abstract domains we used in this post are simple and not very useful in +practice but it's possible to get very far using slightly more complicated +abstract domains. Common domains include: constant propagation, type inference, +range analysis, effect inference, liveness, etc. 
For example, here is a +sample lattice for constant propagation:</p> +<figure style="display: block; margin: 0 auto;"> +<!-- +digraph G { + rankdir="BT"; + top [shape=Msquare]; + bottom [shape=Msquare]; + + bottom -> "-inf"; + bottom -> "-2"; + bottom -> "-1"; + bottom -> 0; + bottom -> 1; + bottom -> 2; + bottom -> "+inf"; + + "-inf" -> negative; + "-2" -> negative; + "-1" -> negative; + 0 -> top; + 1 -> nonnegative; + 2 -> nonnegative; + "+inf" -> nonnegative; + + negative -> nonzero; + nonnegative -> nonzero; + nonzero->top; + + {rank=same; "-inf"; "-2"; "-1"; 0; 1; 2; "+inf"} + {rank=same; nonnegative; negative;} +} +--> + <object class="svg" type="image/svg+xml" data="https://www.pypy.org/images/2024-complex-lattice.svg"> + </object> +</figure> + +<p>It has multiple levels to indicate more and less precision. For example, you +might learn that a variable is either <code>1</code> or <code>2</code> and be able to encode that as +<code>nonnegative</code> instead of just going straight to <code>top</code>.</p> +<p>Check out some real-world abstract interpretation in open source projects:</p> +<ul> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/Support/KnownBits.cpp">Known bits in LLVM</a></li> +<li><a href="https://github.com/llvm/llvm-project/blob/main/llvm/lib/IR/ConstantRange.cpp">Constant range in LLVM</a></li> +<li>But I am told that the ranges don't form a lattice (see <a href="https://dl.acm.org/doi/10.1145/2651360">Interval Analysis and Machine Arithmetic: Why Signedness Ignorance Is Bliss</a>)</li> +<li><a href="https://github.com/torvalds/linux/blob/master/kernel/bpf/tnum.c">Tristate numbers for known bits in Linux eBPF</a></li> +<li><a href="https://github.com/torvalds/linux/blob/28bbe4ea686a023929d907cc168430b61094811c/kernel/bpf/verifier.c#L13335">Range analysis in Linux eBPF</a></li> +<li><a href="https://github.com/bminor/binutils-gdb/blob/master/gdb/prologue-value.c">GDB prologue analysis</a> + of assembly to understand the 
stack and find frame pointers without using + DWARF (<a href="https://sourceware.org/gdb/wiki/Internals/Prologue%20Analysis">some + docs</a>)</li> +</ul> +<p>If you have some readable examples, please share them so I can add them.</p> +<h3 id="acknowledgements">Acknowledgements</h3> +<p>Thank you to <a href="https://cfbolz.de/">CF Bolz-Tereick</a> for the toy optimizer and +helping edit this post!</p> +<div class="footnote"> +<hr> +<ol> +<li id="fn:logozzo"> +<p>In the words of abstract interpretation researchers Vincent Laviron +and Francesco Logozzo in their paper <em>Refining Abstract +Interpretation-based Static Analyses with Hints</em> (APLAS 2009):</p> +<blockquote> +<p>The three main elements of an abstract interpretation are: (i) the +abstract elements ("which properties am I interested in?"); (ii) the +abstract transfer functions ("which is the abstract semantics of basic +statements?"); and (iii) the abstract operations ("how do I combine the +abstract elements?").</p> +</blockquote> +<p>We don't have any of these "abstract operations" in this post because +there's no control flow but you can read about them elsewhere! 
<a class="footnote-backref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fnref:logozzo" title="Jump back to footnote 1 in the text">↩</a></p> +</li> +<li id="fn:lattices"> +<p>These abstract values are arranged in a <em>lattice</em>, which is a +mathematical structure with some properties but the most important ones are +that it has a top, a bottom, a partial order, a meet operation, and values +can only move in one direction on the lattice.</p> +<p>Using abstract values from a lattice promises two things:</p> +<ul> +<li>The analysis will terminate</li> +<li>The analysis will be correct for <em>any</em> run of the program, not just one + sample run</li> +</ul> +<p><a class="footnote-backref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fnref:lattices" title="Jump back to footnote 2 in the text">↩</a></p> +</li> +<li id="fn:match-args"> +<p>Something about <code>__match_args__</code> and <code>@property</code>... <a class="footnote-backref" href="https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html#fnref:match-args" title="Jump back to footnote 3 in the text">↩</a></p> +</li> +</ol> +</div>toy-optimizerhttps://www.pypy.org/posts/2024/07/toy-abstract-interpretation.htmlWed, 24 Jul 2024 14:48:00 GMTMining JIT traces for missing optimizations with Z3https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.htmlCF Bolz-Tereick<p>In my last post I've described <a href="https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html">how to use Z3 to find simple local peephole +optimization patterns</a> +for the integer operations in PyPy's JIT. An example is <code>int_and(x, 0) -&gt; +0</code>. In this post I want to scale up the problem of identifying possible +optimizations to much bigger instruction sequences, also using Z3. 
For that, I +am starting with the JIT traces of <strong>real benchmarks</strong>, after they have been +optimized by the optimizer of PyPy's JIT. Then we can ask Z3 to find +inefficient integer operations in those traces.</p> +<p>Starting from the optimized traces of real programs has some big +advantages over the "classical" superoptimization approach of generating and +then trying all possible sequences of instructions. It avoids the +combinatorial explosion that happens with the latter approach. Also, starting +from the traces of benchmarks or (even better) actual programs makes sure that +we actually care about the missing optimizations +that are found in this way. And because the traces are analyzed after they have +been optimized by PyPy's optimizer, we only get reports for <em>missing</em> +optimizations, that the JIT isn't able to do (yet).</p> +<p>The techniques and experiments I describe in this post are again the result of +a bunch of discussions with John Regehr at a conference a few weeks ago, as +well as reading his blog posts and papers. Thanks John! Also thanks to <a href="https://bernsteinbear.com/">Max +Bernstein</a> for super helpful feedback on the drafts +of this blog post (and for poking me to write things in general).</p> +<h3 id="high-level-approach">High-Level Approach</h3> +<p>The approach that I took works as follows:</p> +<ul> +<li>Run benchmarks or other interesting programs and then dump the IR of the JIT + traces into a file. The traces have at that point been already optimized by + the PyPy JIT's optimizer.</li> +<li>For every trace, ignore all the operations on non-integer variables.</li> +<li>Translate every integer operation into a Z3 formula.</li> +<li>For every operation, use Z3 to find out whether the operation is redundant + (how that is done is described below).</li> +<li>If the operation is redundant, the trace is less efficient than it could have + been, because the optimizer could also have removed the operation. 
Report the + inefficiency.</li> +<li>Minimize the inefficient programs by removing as many operations as possible + to make the problem easier to understand.</li> +</ul> +<p>In the post I will describe the details and show some pseudocode of the +approach. I'll also make the proper code public eventually (but it needs a +healthy dose of cleanups first).</p> +<h3 id="dumping-pypy-traces">Dumping PyPy Traces</h3> +<p>PyPy will write its JIT traces into the file <code>out</code> if the environment variable +<a href="https://doc.pypy.org/en/latest/man/pypy.1.html"><code>PYPYLOG</code></a> is set as follows:</p> +<div class="code"><pre class="code literal-block">PYPYLOG=jit-log-opt:out pypy &lt;program.py&gt; +</pre></div> + +<p>This environment variable works for PyPy, but also for other virtual machines +built with RPython.</p> +<p>(This is really a side point for the rest of the blog post, but since the +question came up I wanted to clarify it: Operations on integers in the Python +program that the JIT is running don't all correspond 1-to-1 with the <code>int_...</code> +operations in the traces. The <code>int_...</code> trace operations always operate on +machine words. The Python <code>int</code> type supports arbitrarily large integers. PyPy +will optimistically try to lower the operations on Python integers into machine +word operations, but adds the necessary guards into the trace to make sure that +overflow outside of the range of machine words is caught. In case one of these +guards fails the interpreter switches to a big integer heap-allocated +representation.)</p> +<h3 id="encoding-traces-as-z3-formulas">Encoding Traces as Z3 formulas</h3> +<p>The last blog post already contained the code to encode the results of +individual trace operations into Z3 formulas, so we don't need to repeat that +here. 
To encode traces of operations we introduce a Z3 variable for every +operation in the trace and then call the <code>z3_expression</code> function for every +single one of the operations in the trace.</p> +<p>For example, for the following trace:</p> +<div class="code"><pre class="code literal-block"><span class="k">[i1]</span> +<span class="na">i2</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">uint_rshift(i1, 32)</span> +<span class="na">i3</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_and(i2, 65535)</span> +<span class="na">i4</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">uint_rshift(i1, 48)</span> +<span class="na">i5</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_lshift(i4, 16)</span> +<span class="na">i6</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">int_or(i5, i3)</span> +<span class="na">jump(i6, i2) # equal</span> +</pre></div> + +<p>We would get the Z3 formula:</p> +<div class="code"><pre class="code literal-block">z3.And(i2 == LShR(i1, 32), + i3 == i2 &amp; 65535, + i4 == LShR(i1, 48), + i5 == i4 &lt;&lt; 16) +</pre></div> + +<p>Usually we won't ask for the formula of the whole trace at once. Instead we go +through the trace operation by operation and try to find inefficiencies in the +current one we are looking at. 
Roughly like this (pseudo-)code:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">newvar</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + +<span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">var_to_z3var</span> <span class="o">=</span> <span class="p">{}</span> + <span class="k">for</span> <span class="n">input_argument</span> <span class="ow">in</span> <span class="n">trace</span><span class="o">.</span><span class="n">inputargs</span><span class="p">:</span> + <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">input_argument</span><span class="p">]</span> <span class="o">=</span> <span class="n">newz3var</span><span class="p">(</span><span class="n">input_argument</span><span class="p">)</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">op</span><span class="p">]</span> <span class="o">=</span> <span class="n">z3resultvar</span> <span class="o">=</span> <span class="n">newz3var</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">resultvarname</span><span class="p">)</span> + <span class="n">arg0</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span 
class="mi">0</span><span class="p">]</span> + <span class="n">z3arg0</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">arg0</span><span class="p">]</span> + <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span> + <span class="n">arg1</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> + <span class="n">z3arg1</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">arg1</span><span class="p">]</span> + <span class="k">else</span><span class="p">:</span> + <span class="n">z3arg1</span> <span class="o">=</span> <span class="kc">None</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># checking for inefficiencies, see the next sections</span> + <span class="o">...</span> + <span class="k">if</span> <span class="o">...</span><span class="p">:</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span> + + <span class="c1"># not inefficient, assert op into the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span 
class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="identifying-constant-booleans-with-z3">Identifying constant booleans with Z3</h3> +<p>To get started finding inefficiencies in a trace, we can +first focus on boolean variables. For every operation in the trace that +returns a bool we can ask Z3 to prove that this variable must be always True or +always False. Most of the time, neither of these proofs will succeed. But if Z3 +manages to prove one of them, we know we have found an inefficiency: instead of +computing the boolean result (e.g. by executing a comparison) the JIT's optimizer +could have replaced the operation with the corresponding boolean constant.</p> +<p>Here's an example of an inefficiency found that way: if <code>x &lt; y</code> and <code>y &lt; z</code> are +both true, PyPy's JIT could conclude that <code>x &lt; z</code> must also +be true. However, currently the JIT cannot make that conclusion because it +only reasons about the concrete ranges (lower and upper bounds) for every +integer variable, but it has no way to remember anything about relationships +between different variables. This kind of reasoning would quite often be useful +to remove list/string bounds checks. Here's a <a href="https://www.youtube.com/watch?app=desktop&amp;v=1hm5ZVmBEvo">talk about how LLVM does +this</a> (but it might be +too heavyweight for a JIT setting).</p> +<p>Here are some more examples found that way:</p> +<ul> +<li><code>x - 1 == x</code> is always False</li> +<li><code>x - (x == -1) == -1</code> is always False. 
The pattern <code>x - (x == -1)</code> happens a + lot in PyPy's hash computations: To be compatible with the CPython hashes we + need to make sure that no object's hash is -1 (CPython uses -1 as an error + value on the C level).</li> +</ul> +<p>Here's pseudo-code for how to implement checking boolean operations for +inefficiencies:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="k">if</span> <span class="n">op</span><span class="o">.</span><span class="n">has_boolean_result</span><span class="p">():</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">res</span> <span class="o">==</span> <span class="mi">0</span><span class="p">):</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="mi">0</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">res</span> <span class="o">==</span> <span class="mi">1</span><span class="p">):</span> + <span class="k">return</span> 
<span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="mi">1</span> + <span class="c1"># checking for other inefficiencies, see the next sections</span> + <span class="o">...</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="identifying-redundant-operations">Identifying redundant operations</h3> +<p>A more interesting class of redundancy is to try to find two operations in a +trace that compute the same result. We can do that by asking Z3 to prove for +each pair of different operations in the trace that the result is +always the same. If a previous operation returns the same result, the JIT could +have re-used that result instead of re-computing it, saving time. Doing this +search for equivalent operations with Z3 is quadratic in the number of +operations, but since traces have a maximum length it is not too bad in +practice.</p> +<p>This is the real workhorse of my script so far, it's what finds most of the +inefficiencies. Here's a few examples:</p> +<ul> +<li>The very first and super useful example the script found is <code>int_eq(b, 1) == + b</code> if <code>b</code> is known to be a boolean (i.e. an integer 0 or 1). I have already + implemented this optimization in the JIT.</li> +<li>Similarly, <code>int_and(b, 1) == b</code> for booleans.</li> +<li><code>(x &lt;&lt; 4) &amp; -0xf == x &lt;&lt; 4</code></li> +<li><code>(((x &gt;&gt; 63) &lt;&lt; 1) &lt;&lt; 2) &gt;&gt; 3 == x &gt;&gt; 63</code>. 
In general the JIT is quite bad at + optimizing repeated shifts (the infrastructure for doing better with that is + already in place, so this will be a relatively easy fix).</li> +<li><code>(x &amp; 0xffffffff) | ((x &gt;&gt; 32) &lt;&lt; 32) == x</code>. Having the JIT optimize this + would maybe require first recognizing that <code>(x &gt;&gt; 32) &lt;&lt; 32</code> can be expressed + as a mask: <code>(x &amp; 0xffffffff00000000)</code>, and then using <code>(x &amp; c1) | (x &amp; c2) == + x &amp; (c1 | c2)</code></li> +<li>A commonly occurring pattern is variations of this one: + <code>((x &amp; 1345) ^ 2048) - 2048 == x &amp; 1345</code> (with different constants, of + course). xor is add without carry, and <code>x &amp; 1345</code> does not have the bit + <code>2048</code> set. Therefore the <code>^ 2048</code> is equivalent to <code>+ 2048</code>, which the <code>- + 2048</code> cancels. More generally, if <code>a &amp; b == 0</code>, then <code>a + b == a | b == a ^ b</code>. + I don't understand at all why this appears so often in the traces, but I + see variations of it a lot. 
LLVM can optimize this, but <a href="https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115829">GCC + can't</a>, thanks to + <a href="https://hachyderm.io/@pinskia/112752641328799157">Andrew Pinski for filing the + bug</a>!</li> +</ul> +<p>And here's some implementation pseudo-code again:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="o">...</span> + <span class="c1"># searching for redundant operations</span> + <span class="k">for</span> <span class="n">previous_op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="k">if</span> <span class="n">previous_op</span> <span class="ow">is</span> <span class="n">op</span><span class="p">:</span> + <span class="k">break</span> <span class="c1"># done, reached the current op</span> + <span class="n">previous_op_z3var</span> <span class="o">=</span> <span class="n">var_to_z3var</span><span class="p">[</span><span class="n">previous_op</span><span class="p">]</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">solver</span><span class="p">,</span> <span class="n">previous_op_z3var</span> <span class="o">==</span> <span 
class="n">res</span><span class="p">):</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">previous_op</span> + <span class="o">...</span> + <span class="c1"># more code here later</span> + <span class="o">...</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> +</pre></div> + +<h3 id="synthesizing-more-complicated-constants-with-exists-forall">Synthesizing more complicated constants with exists-forall</h3> +<p>To find out whether some integer operations always return a constant result, we +can't simply use the same trick as for those operations that return boolean +results, because enumerating 2⁶⁴ possible constants and checking them all +would take too long. Like in the last post, we can use <code>z3.ForAll</code> to find out +whether Z3 can synthesize a constant for the result of an operation for us. +If such a constant exists, the JIT could have removed the operation, +and replaced it with the constant that Z3 provides.</p> +<p>Here are a few examples of inefficiencies found this way:</p> +<ul> +<li><code>(x ^ 1) ^ x == 1</code> (or, more generally: <code>(x ^ y) ^ x == y</code>)</li> +<li>if <code>x | y == 0</code>, it follows that <code>x == 0</code> and <code>y == 0</code></li> +<li>if <code>x != MAXINT</code>, then <code>x + 1 &gt; x</code></li> +</ul> +<p>Implementing this is actually slightly annoying. The <code>solver.add</code> calls for +non-inefficient ops add assertions to the solver, which are now confusing the +<code>z3.ForAll</code> query. 
We could remove all assertions from the solver, then do the +<code>ForAll</code> query, then add the assertions back. What I ended up doing instead was +instantiating a second solver object that I'm using for the <code>ForAll</code> queries, +that remains empty the whole time.</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_inefficiencies</span><span class="p">(</span><span class="n">trace</span><span class="p">):</span> + <span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">empty_solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> + <span class="n">var_to_z3var</span> <span class="o">=</span> <span class="p">{}</span> + <span class="o">...</span> + <span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">trace</span><span class="p">:</span> + <span class="o">...</span> + <span class="n">res</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">z3arg0</span><span class="p">,</span> <span class="n">z3arg1</span><span class="p">)</span> + <span class="c1"># check for boolean constant result</span> + <span class="o">...</span> + <span class="c1"># searching for redundant operations</span> + <span class="o">...</span> + <span class="c1"># checking for constant results</span> + <span class="n">constvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'find_const'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span 
class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="n">var_to_z3var</span><span class="o">.</span><span class="n">values</span><span class="p">(),</span> + <span class="n">z3</span><span class="o">.</span><span class="n">Implies</span><span class="p">(</span> + <span class="o">*</span><span class="n">solver</span><span class="o">.</span><span class="n">assertions</span><span class="p">(),</span> + <span class="n">expr</span> <span class="o">==</span> <span class="n">constvar</span> + <span class="p">)</span> + <span class="p">)</span> + <span class="k">if</span> <span class="n">empty_solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">empty_solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">return</span> <span class="s2">"inefficient"</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">const</span> + + <span class="c1"># not inefficient, add op to the solver and continue with the next op</span> + <span class="n">solver</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">z3resultvar</span> <span class="o">==</span> <span class="n">res</span><span class="p">)</span> + <span class="k">return</span> <span class="kc">None</span> <span class="c1"># no inefficiency found</span> 
+</pre></div> + +<h3 id="minimization">Minimization</h3> +<p>Analyzing an inefficiency by hand in the context of a larger trace is quite +tedious. Therefore I've implemented a (super inefficient) script to try to make +the examples smaller. Here's how that works:</p> +<ul> +<li>First throw out all the operations that occur <em>after</em> the inefficient operation + in the trace.</li> +<li>Then we remove all "dead" operations, ie operations that don't have their + results used (all the operations that we can analyze with Z3 are without side + effects).</li> +<li>Now we try to remove every guard in the trace one by one and check + afterwards, whether the resulting trace still has an inefficiency.</li> +<li>We also try to replace every single operation with a new argument to the + trace, to see whether the inefficiency is still present.</li> +</ul> +<p>The minimization process is sort of inefficient and I should probably be using + <a href="https://github.com/DRMacIver/shrinkray">shrinkray</a> or + <a href="https://github.com/csmith-project/creduce">C-Reduce</a> instead. However, it + seems to work well in practice and the runtime isn't too bad.</p> +<h3 id="results">Results</h3> +<p>So far I am using the JIT traces of three programs: 1) Booting Linux on the +<a href="https://docs.pydrofoil.org">Pydrofoil</a> RISC-V emulator, 2) booting Linux on the Pydrofoil ARM emulator, and 3) +running the PyPy bootstrap process on top of PyPy.</p> +<p>I picked these programs because most Python programs don't contain interesting +amounts of integer operations, and the traces of the emulators +contain a lot of them. I also used the bootstrap process because I still wanted +to try a big Python program and personally care about the runtime of this +program a lot.</p> +<p>The script identifies 94 +inefficiencies in the traces, a lot of them come from repeating +patterns. 
My next steps will be to manually inspect them all, categorize them, and +implement easy optimizations identified that way. I also want a way to sort the +examples by execution count in the benchmarks, to get a feeling for which of +them are most important.</p> +<p>I didn't investigate the full set of <a href="https://speed.pypy.org">Python +benchmarks</a> that PyPy uses yet, because I don't expect +them to contain interesting amounts of integer operations, but maybe I am wrong +about that? Will have to try eventually.</p> +<h3 id="conclusion">Conclusion</h3> +<p>This was again much easier to do than I would have expected! Given that I had +the translation of trace ops to Z3 already in place, it was a matter of about a +day of programming to use this infrastructure to find the first problems and +minimize them.</p> +<p>Reusing the results of existing operations or replacing operations by constants +can be seen as "zero-instruction superoptimization". I'll probably be rather +busy for a while to add the missing optimizations identified by my simple +script. 
But later extensions to actually synthesize one or several operations +in the attempt to optimize the traces more and find more opportunities should +be possible.</p> +<p>Finding inefficiencies in traces with Z3 is significantly less +annoying and also less error-prone than just manually inspecting traces and +trying to spot optimization opportunities.</p> +<h3 id="random-notes-and-sources">Random Notes and Sources</h3> +<p>Again, John's blog posts:</p> +<ul> +<li><a href="https://blog.regehr.org/archives/1109">Let’s Work on an LLVM Superoptimizer</a></li> +<li><a href="https://blog.regehr.org/archives/1146">Early Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1252">A Few Synthesizing Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1636">Synthesizing Constants</a></li> +</ul> +<p>and papers:</p> +<ul> +<li><a href="https://arxiv.org/pdf/1711.04422">A Synthesizing Superoptimizer</a></li> +<li><a href="https://dl.acm.org/doi/pdf/10.1145/3649837">Hydra: Generalizing Peephole Optimizations with Program Synthesis</a></li> +</ul> +<p>I remembered recently that I had seen the approach of optimizing the traces of +a tracing JIT with Z3 a long time ago, as part of the (now long dead, I think) +<a href="https://web.archive.org/web/20160304055149/http://research.microsoft.com/en-us/projects/spur/">SPUR +project</a>. +There's a <a href="https://web.archive.org/web/20161029162737/http://csl.stanford.edu/~christos/pldi2010.fit/tillmann.provers4jit.pdf">workshop +paper</a> +from 2010 about this. SPUR was trying to use Z3 built into the actual JIT (as +opposed to using Z3 only to find places where the regular optimizers could be +improved). In addition to bitvectors, SPUR also used the Z3 support for arrays +to model the C# heap and remove redundant stores. 
This is still another future +extension for all the Z3 work I've been doing in the context of the PyPy JIT.</p>jitz3https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.htmlFri, 19 Jul 2024 17:01:09 GMTFinding Simple Rewrite Rules for the JIT with Z3https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.htmlCF Bolz-Tereick<p>In June I was at the <a href="https://pldi24.sigplan.org/">PLDI conference</a> in +Copenhagen to present a <a href="https://dl.acm.org/doi/10.1145/3652588.3663316">paper</a> +I co-authored with <a href="https://bernsteinbear.com/">Max Bernstein</a>. I also finally +met <a href="https://blog.regehr.org/">John Regehr</a>, who I'd been talking to on social +media for ages but had never met. John has been working on compiler correctness +and better techniques for building compilers and optimizers for a very long +time. The blog post <a href="https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html">Finding JIT Optimizer Bugs using SMT Solvers and +Fuzzing</a> +was heavily inspired by this work. We talked a lot about his and his group's +work on using Z3 for +<a href="https://en.wikipedia.org/wiki/Superoptimization">superoptimization</a> and for +finding missing optimizations. I have applied some of the things John told me +about to the traces of PyPy's JIT, and wanted to blog about that. However, my +draft felt quite hard to understand. Therefore I have now written this current +post, to at least try to provide a somewhat gentler on-ramp to the topic.</p> +<p>In <em>this</em> post we will use the Python-API to Z3 to find local peephole rewrite +rules for the operations in the intermediate representation of PyPy's tracing +JIT. The code for this is simple enough that we can go through all of it.</p> +<p>The PyPy JIT produces traces of machine level instructions, which are optimized +and then turned into machine code. The optimizer uses a number of approaches to +make the traces more efficient. 
For integer operations it applies a number of +arithmetic simplification rules, for example <code>int_add(x, 0) -&gt; x</code>. When +implementing these rules in the JIT there are <strong>two problems</strong>: How do we know +that the rules are correct? And how do we know that we haven't forgotten any +rules? We'll try to answer both of these, but the first one in particular.</p> +<p>We'll be using Z3, a satisfiability modulo theories (SMT) solver which has good +bitvector support and most importantly an excellent Python API. We can use the +solver to reason about bitvectors, which are how we will model machine +integers.</p> +<p>To find rewrite rules, we will consider the binary operations (i.e. those +taking two arguments) in PyPy traces that take and produce integers. The +completely general form <code>op(x, y)</code> is not simplifiable on its own. But if +either <code>x == y</code> +or if one of the arguments is a constant, we can potentially simplify the +operation into a simpler form. The results are either the variable <code>x</code>, or a +(potentially different) constant. We'll ignore constant-folding where both +arguments of the binary operation are constants. The possible results for a +simplifiable binary operation are the variable <code>x</code> or another constant. 
This +leaves the following patterns as possibilities:</p> +<ul> +<li><code>op(x, x) == x</code></li> +<li><code>op(x, x) == c1</code></li> +<li><code>op(x, c1) == x</code></li> +<li><code>op(c1, x) == x</code></li> +<li><code>op(x, c1) == c2</code></li> +<li><code>op(c1, x) == c2</code></li> +</ul> +<p>Our approach will be to take every single supported binary integer operation, +instantiate all of these patterns, and try to ask Z3 whether the resulting +simplification is valid for all values of <code>x</code>.</p> +<h3 id="quick-intro-to-the-z3-python-api">Quick intro to the Z3 Python-API</h3> +<p>Here's a terminal session showing the use of the Z3 Python API:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; import z3</span> +<span class="go">&gt;&gt;&gt;&gt; # construct a Z3 bitvector variable of width 8, with name x:</span> +<span class="go">&gt;&gt;&gt;&gt; x = z3.BitVec('x', 8)</span> +<span class="go">&gt;&gt;&gt;&gt; # construct a more complicated formula by using operator overloading:</span> +<span class="go">&gt;&gt;&gt;&gt; x + x</span> +<span class="go">x + x</span> +<span class="go">&gt;&gt;&gt;&gt; x + 1</span> +<span class="go">x + 1</span> +</pre></div> + +<p>Z3 checks the "satisfiability" of a formula. This means that it tries to find +an example set of concrete values for the variables that occur in a formula, +such that the formula becomes true. 
Examples:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver = z3.Solver()</span> +<span class="go">&gt;&gt;&gt;&gt; solver.check(x * x == 3)</span> +<span class="go">unsat</span> +<span class="go">&gt;&gt;&gt;&gt; # meaning no x fulfils this property</span> +<span class="go">&gt;&gt;&gt;&gt;</span> +<span class="go">&gt;&gt;&gt;&gt; solver.check(x * x == 9)</span> +<span class="go">sat</span> +<span class="go">&gt;&gt;&gt;&gt; model = solver.model()</span> +<span class="go">&gt;&gt;&gt;&gt; model</span> +<span class="go">[x = 253]</span> +<span class="go">&gt;&gt;&gt;&gt; model[x].as_signed_long()</span> +<span class="go">-3</span> +<span class="go">&gt;&gt;&gt;&gt; # 253 is the same as -3 in two's complement arithmetic with 8 bits</span> +</pre></div> + +<p>In order to use Z3 to prove something, we can ask Z3 to find counterexamples +for the statement, meaning concrete values that would make the negation of the +statement true:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.check(z3.Not(x ^ -1 == ~x))</span> +<span class="go">unsat</span> +</pre></div> + +<p>The result <code>unsat</code> means that we just proved that <code>x ^ -1 == ~x</code> is true for +all <code>x</code>, because there is no value for <code>x</code> that makes <code>not (x ^ -1 == ~x)</code> +true (this works because -1 has all the bits set).</p> +<p>If we try to prove something incorrect in this way, the following happens:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.check(z3.Not(x ^ -1 == x))</span> +<span class="go">sat</span> +</pre></div> + +<p><code>sat</code> shows that <code>x ^ -1 == x</code> is (unsurprisingly) not always true, and we can +ask for a counterexample:</p> +<div class="code"><pre class="code literal-block"><span class="go">&gt;&gt;&gt;&gt; solver.model()</span> +<span class="go">[x = 0]</span> +</pre></div> + +<p>This way of 
proving this works because the <code>check</code> calls try to solve an +(implicit) "exists" quantifier, over all the Z3 variables used in the formula. +<code>check</code> will either return <code>z3.unsat</code>, which means that no concrete values make +the formula true; or <code>z3.sat</code>, which means that you can get some concrete +values that make the formula true by calling <code>solver.model()</code>.</p> +<p>In math terms we prove things using <code>check</code> by de-Morgan's rules for quantifiers:</p> +<p>$$ \lnot \exists x: \lnot f(x) \implies \forall x: f(x) $$</p> +<p>Now that we've seen the basics of using the Z3 API on a few small examples, +we'll use it in a bigger program.</p> +<h3 id="encoding-the-integer-operations-of-rpythons-jit-into-z3-formulas">Encoding the integer operations of RPython's JIT into Z3 formulas</h3> +<p>Now we'll use the API to reason about the integer operations of the PyPy JIT +intermediate representation (IR). The binary integer operations are:</p> +<div class="code"><pre class="code literal-block"><span class="n">opnames2</span> <span class="o">=</span> <span class="p">[</span> +<span class="s2">"int_add"</span><span class="p">,</span> +<span class="s2">"int_sub"</span><span class="p">,</span> +<span class="s2">"int_mul"</span><span class="p">,</span> +<span class="s2">"int_and"</span><span class="p">,</span> +<span class="s2">"int_or"</span><span class="p">,</span> +<span class="s2">"int_xor"</span><span class="p">,</span> +<span class="s2">"int_eq"</span><span class="p">,</span> +<span class="s2">"int_ne"</span><span class="p">,</span> +<span class="s2">"int_lt"</span><span class="p">,</span> +<span class="s2">"int_le"</span><span class="p">,</span> +<span class="s2">"int_gt"</span><span class="p">,</span> +<span class="s2">"int_ge"</span><span class="p">,</span> +<span class="s2">"uint_lt"</span><span class="p">,</span> +<span class="s2">"uint_le"</span><span class="p">,</span> +<span class="s2">"uint_gt"</span><span 
class="p">,</span> +<span class="s2">"uint_ge"</span><span class="p">,</span> +<span class="s2">"int_lshift"</span><span class="p">,</span> +<span class="s2">"int_rshift"</span><span class="p">,</span> +<span class="s2">"uint_rshift"</span><span class="p">,</span> +<span class="s2">"uint_mul_high"</span><span class="p">,</span> +<span class="s2">"int_pydiv"</span><span class="p">,</span> +<span class="s2">"int_pymod"</span><span class="p">,</span> +<span class="p">]</span> +</pre></div> + +<p>There's not much special about the integer operations. Like in LLVM, most of +them are signedness-independent: <code>int_add</code>, <code>int_sub</code>, <code>int_mul</code>, ... work +correctly for unsigned integers but also for +<a href="https://en.wikipedia.org/wiki/Two%27s_complement">two's-complement</a> signed +integers. Exceptions for that are order comparisons like <code>int_lt</code> etc. for +which we have unsigned variants <code>uint_lt</code> etc. All operations that produce a +boolean result return a full-width integer <code>0</code> or <code>1</code> (the PyPy JIT supports +only word-sized integers in its intermediate representation)</p> +<p>In order to reason about the IR operations, some ground work:</p> +<div class="code"><pre class="code literal-block"><span class="kn">import</span> <span class="nn">z3</span> + +<span class="n">INTEGER_WIDTH</span> <span class="o">=</span> <span class="mi">64</span> +<span class="n">solver</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Solver</span><span class="p">()</span> +<span class="n">solver</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">"timeout"</span><span class="p">,</span> <span class="mi">10000</span><span class="p">)</span> <span class="c1"># milliseconds, ie 10s</span> +<span class="n">xvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span 
class="p">(</span><span class="s1">'x'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">constvar</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'const'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">constvar2</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVec</span><span class="p">(</span><span class="s1">'const2'</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">TRUEBV</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +<span class="n">FALSEBV</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">BitVecVal</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> +</pre></div> + +<p>And here's a function to turn an integer IR operation of PyPy's JIT into Z3 +formulas:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" computes a tuple of (result, valid_if) of Z3 formulas. `result` is the</span> +<span class="sd"> formula representing the result of the operation, given argument formulas</span> +<span class="sd"> arg0 and arg1. 
`valid_if` is a pre-condition that must be true for the</span> +<span class="sd"> result to be meaningful. """</span> + <span class="n">result</span> <span class="o">=</span> <span class="kc">None</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="kc">True</span> <span class="c1"># the precondition is mostly True, with few exceptions</span> + <span class="k">if</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_add"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">+</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_sub"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">-</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_mul"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">*</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_and"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&amp;</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_or"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">|</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_xor"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span 
class="n">arg0</span> <span class="o">^</span> <span class="n">arg1</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_eq"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ne"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_le"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&lt;=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_gt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_ge"</span><span 
class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">&gt;=</span> <span class="n">arg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_lt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_le"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">ULE</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_gt"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGT</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_ge"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">UGE</span><span class="p">(</span><span class="n">arg0</span><span 
class="p">,</span> <span class="n">arg1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_lshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&lt;&lt;</span> <span class="n">arg1</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_rshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">&gt;&gt;</span> <span class="n">arg1</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_rshift"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">LShR</span><span class="p">(</span><span class="n">arg0</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span 
class="n">arg1</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">arg1</span> <span class="o">&lt;</span> <span class="n">INTEGER_WIDTH</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"uint_mul_high"</span><span class="p">:</span> + <span class="c1"># zero-extend args to 2*INTEGER_WIDTH bit, then multiply and extract</span> + <span class="c1"># highest INTEGER_WIDTH bits</span> + <span class="n">zarg0</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ZeroExt</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">arg0</span><span class="p">)</span> + <span class="n">zarg1</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ZeroExt</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">arg1</span><span class="p">)</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">Extract</span><span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">*</span> <span class="mi">2</span> <span class="o">-</span> <span class="mi">1</span><span class="p">,</span> <span class="n">INTEGER_WIDTH</span><span class="p">,</span> <span class="n">zarg0</span> <span class="o">*</span> <span class="n">zarg1</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_pydiv"</span><span class="p">:</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">arg1</span> <span class="o">!=</span> <span class="mi">0</span> + <span class="n">r</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">/</span> <span class="n">arg1</span> + <span 
class="n">psubx</span> <span class="o">=</span> <span class="n">r</span> <span class="o">*</span> <span class="n">arg1</span> <span class="o">-</span> <span class="n">arg0</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">r</span> <span class="o">+</span> <span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="n">psubx</span><span class="p">,</span> <span class="o">-</span><span class="n">psubx</span><span class="p">)</span> <span class="o">&gt;&gt;</span> <span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_pymod"</span><span class="p">:</span> + <span class="n">valid_if</span> <span class="o">=</span> <span class="n">arg1</span> <span class="o">!=</span> <span class="mi">0</span> + <span class="n">r</span> <span class="o">=</span> <span class="n">arg0</span> <span class="o">%</span> <span class="n">arg1</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">r</span> <span class="o">+</span> <span class="p">(</span><span class="n">arg1</span> <span class="o">&amp;</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">arg1</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="o">-</span><span class="n">r</span><span class="p">,</span> <span class="n">r</span><span class="p">)</span> <span class="o">&gt;&gt;</span> <span class="p">(</span><span class="n">INTEGER_WIDTH</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> 
<span class="s2">"int_is_true"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">!=</span> <span class="n">FALSEBV</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_is_zero"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="n">cond</span><span class="p">(</span><span class="n">arg0</span> <span class="o">==</span> <span class="n">FALSEBV</span><span class="p">)</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_neg"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="o">-</span><span class="n">arg0</span> + <span class="k">elif</span> <span class="n">opname</span> <span class="o">==</span> <span class="s2">"int_invert"</span><span class="p">:</span> + <span class="n">result</span> <span class="o">=</span> <span class="o">~</span><span class="n">arg0</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">assert</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"unknown operation "</span> <span class="o">+</span> <span class="n">opname</span> + <span class="k">return</span> <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> + +<span class="k">def</span> <span class="nf">cond</span><span class="p">(</span><span class="n">z3expr</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" helper function to turn a Z3 boolean result z3expr into a 1 or 0</span> +<span class="sd"> bitvector, using z3.If """</span> + <span class="k">return</span> <span class="n">z3</span><span class="o">.</span><span class="n">If</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> 
<span class="n">TRUEBV</span><span class="p">,</span> <span class="n">FALSEBV</span><span class="p">)</span> +</pre></div> + +<p>We map the semantics of a PyPy JIT operation to Z3 with the <code>z3_expression</code> +function. It turns the name of a JIT operation and its two (or one) arguments +into a pair of Z3 formulas, <code>result</code> and <code>valid_if</code>. The resulting formulas are +constructed with the operator overloading of Z3 variables/formulas.</p> +<p>The first element <code>result</code> of the result of <code>z3_expression</code> represents the result +of performing the operation. <code>valid_if</code> is a bool that represents a condition that +needs to be <code>True</code> in order for the result of the operation to be defined. E.g. +<code>int_pydiv(a, b)</code> is only valid if <code>b != 0</code>. Most operations are always valid, +so they return <code>True</code> as that condition (we'll ignore <code>valid_if</code> for a bit, but it +will become more relevant further down in the post).</p> +<p>We can define a helper function to prove things by finding counterexamples:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">prove</span><span class="p">(</span><span class="n">cond</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" Try to prove a condition cond by searching for counterexamples of its negation. 
"""</span> + <span class="n">z3res</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Not</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> + <span class="k">if</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unsat</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">True</span> + <span class="k">elif</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">unknown</span><span class="p">:</span> <span class="c1"># eg on timeout</span> + <span class="k">return</span> <span class="kc">False</span> + <span class="k">elif</span> <span class="n">z3res</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="k">return</span> <span class="kc">False</span> + <span class="k">assert</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"should be unreachable"</span> +</pre></div> + +<h3 id="finding-rewrite-rules">Finding rewrite rules</h3> +<p>Now we can start finding our first rewrite rules, following the first pattern +<code>op(x, x) -&gt; x</code>. 
We do this by iterating over all the supported binary +operation names, getting the z3 expression for <code>op(x, x)</code> and then asking Z3 to +prove <code>op(x, x) == x</code>.</p> +<div class="code"><pre class="code literal-block"><span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">if</span> <span class="n">prove</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; x, </span><span class="si">{</span><span class="n">result</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +</pre></div> + +<p>This yields the simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<h3 id="synthesizing-constants">Synthesizing constants</h3> +<p>Supporting 
the next patterns is harder: <code>op(x, x) == c1</code>, <code>op(x, c1) == x</code>, and +<code>op(c1, x) == x</code>. We don't know which constants to pick to try to get Z3 to +prove the equality. We could iterate over common constants like <code>0</code>, <code>1</code>, +<code>MAXINT</code>, etc., or even over all the 256 values for a bitvector of length 8. +However, we will instead ask Z3 to find the constants for us too.</p> +<p>This can be done by using quantifiers, in this case <code>z3.ForAll</code>. The query we +pose to Z3 is "does there exist a constant <code>c1</code> such that for all <code>x</code> the +following is true: <code>op(x, c1) == x</code>?" Note that the constant <code>c1</code> is not +necessarily unique, there could be many of them. We generate several matching +constants, and add that they must be different to the condition of the second +and further queries.</p> +<p>We can express this in a helper function:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_constant</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">number_of_results</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">xvar</span><span class="p">],</span> + <span class="n">z3expr</span> + <span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">number_of_results</span><span class="p">):</span> + <span class="n">checkres</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> + <span 
class="k">if</span> <span class="n">checkres</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="c1"># if a solver check succeeds, we can ask for a model, which is</span> + <span class="c1"># concrete values for the variables constvar</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">yield</span> <span class="n">const</span> + <span class="c1"># make sure we don't generate the same constant again on the</span> + <span class="c1"># next call</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">constvar</span> <span class="o">!=</span> <span class="n">const</span><span class="p">,</span> <span class="n">condition</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="c1"># no (more) constants found</span> + <span class="k">break</span> +</pre></div> + +<p>We can use this new function for the three mentioned patterns:</p> +<div class="code"><pre class="code literal-block"><span class="c1"># try to find constants for op(x, x) == c</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span 
class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">constvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +<span class="c1"># try to find constants for op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; x"</span><span class="p">)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span 
class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; x"</span><span class="p">)</span> +<span class="c1"># this code is not quite correct, we'll correct it later</span> +</pre></div> + +<p>Together this yields the following new simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="cp"># careful, these are not all correct!</span> +<span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ne</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> 
</span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span 
class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span 
class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<p>Most of these look good at first glance, but the last one reveals a problem: +we've been ignoring the <code>valid_if</code> expression up to now. 
We can stop doing that by +changing the code like this, which adds <code>z3.And(valid_if, ...)</code> to the argument of +the calls to <code>find_constant</code>:</p> +<div class="code"><pre class="code literal-block"><span class="c1"># try to find constants for op(x, x) == c, op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, x) -&gt; </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> +<span class="c1"># try to find constants for op(x, c) == x and op(c, x) == x</span> +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span 
class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">valid_if</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; x"</span><span class="p">)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">result</span> <span class="o">==</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">valid_if</span><span class="p">)):</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; x"</span><span class="p">)</span> +</pre></div> + 
+<p>And we get this list instead:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ne</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> 
</span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_add</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_add</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span 
class="n">int_sub</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_mul</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_and</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_or</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span 
class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_xor</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">x</span> +</pre></div> + +<h3 id="synthesizing-two-constants">Synthesizing two constants</h3> +<p>For the patterns <code>op(x, c1) == c2</code> and <code>op(c1, x) == c2</code> we need to synthesize +two constants. 
We can again write a helper method for that:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">find_2consts</span><span class="p">(</span><span class="n">z3expr</span><span class="p">,</span> <span class="n">number_of_results</span><span class="o">=</span><span class="mi">5</span><span class="p">):</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">ForAll</span><span class="p">(</span> + <span class="p">[</span><span class="n">xvar</span><span class="p">],</span> + <span class="n">z3expr</span> + <span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">number_of_results</span><span class="p">):</span> + <span class="n">checkres</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">check</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span> + <span class="k">if</span> <span class="n">checkres</span> <span class="o">==</span> <span class="n">z3</span><span class="o">.</span><span class="n">sat</span><span class="p">:</span> + <span class="n">model</span> <span class="o">=</span> <span class="n">solver</span><span class="o">.</span><span class="n">model</span><span class="p">()</span> + <span class="n">const</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="n">const2</span> <span class="o">=</span> <span class="n">model</span><span class="p">[</span><span class="n">constvar2</span><span class="p">]</span><span class="o">.</span><span class="n">as_signed_long</span><span class="p">()</span> + <span class="k">yield</span> <span class="n">const</span><span 
class="p">,</span> <span class="n">const2</span> + <span class="n">condition</span> <span class="o">=</span> <span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">Or</span><span class="p">(</span><span class="n">constvar</span> <span class="o">!=</span> <span class="n">const</span><span class="p">,</span> <span class="n">constvar2</span> <span class="o">!=</span> <span class="n">const2</span><span class="p">),</span> <span class="n">condition</span><span class="p">)</span> + <span class="k">else</span><span class="p">:</span> + <span class="k">return</span> +</pre></div> + +<p>And then use it like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="c1"># try to find constants c1, c2 such that op(c1, x) -&gt; c2</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_2consts</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar2</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span 
class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; </span><span class="si">{</span><span class="n">const2</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> + <span class="c1"># try to find constants c1, c2 such that op(x, c1) -&gt; c2</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_2consts</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">constvar2</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2">(x, </span><span class="si">%s</span><span class="s2">) -&gt; </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">const</span><span class="p">,</span> <span class="n">const2</span><span class="p">))</span> +</pre></div> + +<p>Which yields some straightforward simplifications:</p> +<div class="code"><pre class="code literal-block"><span 
class="n">int_mul</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_and</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_and</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_lshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_rshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_rshift</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_mul_high</span><span class="p">(</span><span 
class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_pymod</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +</pre></div> + +<p>A few require a bit more thinking:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_or</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span> +<span class="n">int_or</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span> +</pre></div> + +<p>These are true because in two's complement, <code>-1</code> has all bits set.</p> +<p>The following ones require recognizing that <code>-9223372036854775808 == -2**63</code> is +the most negative signed 64-bit integer, and <code>9223372036854775807 == 2 ** 63 - +1</code> is the most positive one:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_lt</span><span class="p">(</span><span 
class="mi">9223372036854775807</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_le</span><span class="p">(</span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">9223372036854775807</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_gt</span><span class="p">(</span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">9223372036854775807</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">int_ge</span><span class="p">(</span><span class="mi">9223372036854775807</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> 
</span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">int_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">9223372036854775808</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +</pre></div> + +<p>The following ones are true because the bitpattern for <code>-1</code> is the largest +unsigned number:</p> +<div class="code"><pre class="code literal-block"><span class="n">uint_lt</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">0</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="mi">1</span> +</pre></div> + +<h3 id="strength-reductions">Strength Reductions</h3> +<p>All the patterns so far only had a variable or a constant on the target of the +rewrite. 
We can also use the machinery to do strength reductions where we +generate a single-argument operation <code>op1(x)</code> for input operations <code>op(x, c1)</code> +or <code>op(c1, x)</code>. To achieve this, we try all combinations of binary and unary +operations. (We won't consider strength reductions where a binary operation +gets turned into a "cheaper" other binary operation here.)</p> +<div class="code"><pre class="code literal-block"><span class="n">opnames1</span> <span class="o">=</span> <span class="p">[</span> +<span class="s2">"int_is_true"</span><span class="p">,</span> +<span class="s2">"int_is_zero"</span><span class="p">,</span> +<span class="s2">"int_neg"</span><span class="p">,</span> +<span class="s2">"int_invert"</span><span class="p">,</span> +<span class="p">]</span> + +<span class="k">for</span> <span class="n">opname</span> <span class="ow">in</span> <span class="n">opnames2</span><span class="p">:</span> + <span class="k">for</span> <span class="n">opname1</span> <span class="ow">in</span> <span class="n">opnames1</span><span class="p">:</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">xvar</span><span class="p">,</span> <span class="n">constvar</span><span class="p">)</span> + <span class="c1"># try to find a constant op(x, c) == g(x)</span> + <span class="n">result1</span><span class="p">,</span> <span class="n">valid_if1</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname1</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span 
class="n">valid_if</span><span class="p">,</span> <span class="n">valid_if1</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">result1</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span> <span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(x, </span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">) -&gt; </span><span class="si">{</span><span class="n">opname1</span><span class="si">}</span><span class="s2">(x)"</span><span class="p">)</span> + + <span class="c1"># try to find a constant op(c, x) == g(x)</span> + <span class="n">result</span><span class="p">,</span> <span class="n">valid_if</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname</span><span class="p">,</span> <span class="n">constvar</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">result1</span><span class="p">,</span> <span class="n">valid_if1</span> <span class="o">=</span> <span class="n">z3_expression</span><span class="p">(</span><span class="n">opname1</span><span class="p">,</span> <span class="n">xvar</span><span class="p">)</span> + <span class="n">consts</span> <span class="o">=</span> <span class="n">find_constant</span><span class="p">(</span><span class="n">z3</span><span class="o">.</span><span class="n">And</span><span class="p">(</span><span class="n">valid_if</span><span class="p">,</span> <span class="n">valid_if1</span><span class="p">,</span> <span class="n">result</span> <span class="o">==</span> <span class="n">result1</span><span class="p">))</span> + <span class="k">for</span> <span class="n">const</span> 
<span class="ow">in</span> <span class="n">consts</span><span class="p">:</span> + <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">opname</span><span class="si">}</span><span class="s2">(</span><span class="si">{</span><span class="n">const</span><span class="si">}</span><span class="s2">, x) -&gt; </span><span class="si">{</span><span class="n">opname1</span><span class="si">}</span><span class="s2">(x)"</span><span class="p">)</span> +</pre></div> + +<p>Which yields the following new simplifications:</p> +<div class="code"><pre class="code literal-block"><span class="n">int_sub</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_sub</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_mul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_mul</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> 
</span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_xor</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_xor</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_invert</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_eq</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_eq</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_ne</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> 
+<span class="n">int_ne</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_lt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_le</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_le</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span 
class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_gt</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_true</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">uint_ge</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">x</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_is_zero</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +<span class="n">int_pydiv</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="o">-&gt;</span><span class="w"> </span><span class="n">int_neg</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> +</pre></div> + +<h3 id="conclusions">Conclusions</h3> +<p>With very little code we managed to generate a whole lot of local +simplifications for integer operations in the IR of PyPy's JIT. The rules +discovered that way are "simple", in the sense that they only require looking +at a single instruction, and not where the arguments of that instruction came +from. 
They also don't require any knowledge about the properties of the +arguments of the instructions (e.g. that they are positive).</p> +<p>The rewrites in this post have mostly been in PyPy's JIT already. But now we +mechanically confirmed that they are correct. I've also added the remaining +useful-looking ones, in particular <code>int_eq(x, 0) -&gt; int_is_zero(x)</code> etc.</p> +<p>If we wanted to scale this approach up, we would have to work much harder! +There are a bunch of problems that come with generalizing the approach to +looking at sequences of instructions:</p> +<ul> +<li> +<p>Combinatorial explosion: if we look at sequences of instructions, we very + quickly get a combinatorial explosion and it becomes intractable to try all + combinations.</p> +</li> +<li> +<p>Finding non-minimal patterns: Some complicated simplifications can be + instances of simpler ones. For example, because <code>int_add(x, 0) -&gt; x</code>, it's + also true that <code>int_add(int_sub(x, y), 0) -&gt; int_sub(x, y)</code>. If we simply + generate all possible sequences, we will find the latter simplification rule, + which we would usually not care about.</p> +</li> +<li> +<p>Unclear usefulness: if we simply generate all rewrites up to a certain number + of instructions, we will get a lot of patterns that are useless in the sense + that they typically aren't found in realistic programs. It would be much + better to somehow focus on the patterns that real benchmarks are using.</p> +</li> +</ul> +<p>In the <a href="https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html">next blog post</a> I'll discuss an alternative approach to simply generating +all possible sequences of instructions, that tries to address these problems. 
+This works by analyzing the real traces of benchmarks and mining those for +inefficiencies, which only shows problems that occur in actual programs.</p> +<h3 id="sources">Sources</h3> +<p>I've been re-reading a lot of blog posts from John Regehr's blog:</p> +<ul> +<li><a href="https://blog.regehr.org/archives/1109">Let’s Work on an LLVM Superoptimizer</a></li> +<li><a href="https://blog.regehr.org/archives/1146">Early Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1252">A Few Synthesizing Superoptimizer Results</a></li> +<li><a href="https://blog.regehr.org/archives/1636">Synthesizing Constants</a></li> +</ul> +<p>but also papers:</p> +<ul> +<li><a href="https://arxiv.org/pdf/1711.04422">A Synthesizing Superoptimizer</a></li> +<li><a href="https://dl.acm.org/doi/pdf/10.1145/3649837">Hydra: Generalizing Peephole Optimizations with Program Synthesis</a></li> +</ul> +<p>Another of my favorite blogs has been <a href="https://www.philipzucker.com/">Philip Zucker's +blog</a> in the last year or two; there are lots of excellent +posts about/using Z3 on there.</p>jitz3https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.htmlFri, 12 Jul 2024 19:14:09 GMTProfiling PyPy using the Firefox profiler user interfacehttps://www.pypy.org/posts/2024/05/vmprof-firefox-converter.htmlChristoph Jung<h3 id="introduction">Introduction</h3> +<p>If you ever wanted to profile your Python code on PyPy, you probably came across <a href="https://vmprof.readthedocs.io/en/latest/vmprof.html">VMProf</a> — a statistical profiler for PyPy.</p> +<p>VMProf's console output can already give some insights into where your code spends time, +but it is far from showing all the information captured while profiling.</p> +<p>There have been some tools around to visualize VMProf's output. +Unfortunately the vmprof.com user interface is no longer available and vmprof-server is not as easy to use, so you may want to take a look at a local viewer or converter. 
+Those so far could give you some general visualizations of your profile, but do not show any PyPy related context like PyPy's log output (<a href="https://rpython.readthedocs.io/en/latest/logging.html">PyPyLog</a>, which is output when using the PYPYLOG environment variable to log JIT actions).</p> +<p>To bring all of those features together in one tool, you may take a look at the vmprof-firefox-converter.</p> +<p>Created in the context of my bachelor's thesis, the vmprof-firefox-converter is a tool for analyzing VMProf profiles with the <a href="https://profiler.firefox.com/">Firefox profiler</a> user interface. +Instead of building a new user interface from scratch, this allows us to reuse the user interface work Mozilla put into the Firefox profiler. +The Firefox profiler offers a timeline where you can zoom into profiles and work with different visualizations like a flame graph or a stack chart. +To understand why there is time spent inside a function, you can revisit the source code and even dive into the intermediate representation of functions executed by PyPy's just-in-time compiler. +Additionally, there is a visualization for PyPy's log output, to keep track of whether PyPy spent time inside the interpreter, JIT or GC throughout the profiling time.</p> +<h3 id="profiling-word-count">Profiling word count</h3> +<p>In this blog post, I want to show an example of how to use the vmprof-firefox-converter for a simple Python program. +Based on Ben Hoyt's blog <a href="https://benhoyt.com/writings/count-words/">Performance comparison: counting words in Python, Go, C++, C, AWK, Forth, and Rust</a> we will profile two Python versions of a word counter running on PyPy, one being a bit more optimized. For this, VMProf will be used, but instead of just going with the console output, we will use the Firefox profiler user interface.</p> +<p>At first, we are going to look at a simple way of counting words with <code>collections.Counter</code>. 
+This will read one line from the standard input at a time and count the words with <code>counter.update()</code></p> +<div class="code"><pre class="code literal-block">counts = collections.Counter() +for line in sys.stdin: + words = line.lower().split() + counts.update(words) + +for word, count in counts.most_common(): + print(word, count) +</pre></div> + +<p>To start profiling, simply execute: +<code>pypy -m vmprofconvert -run simple.py &lt;kjvbible_x10.txt</code></p> +<p>This will run the above code with vmprof, automatically capture and convert the results and finally open the Firefox profiler. </p> +<p>The input file is the king James version of the bible concatenated ten times.</p> +<p>To get started, we take a look at the call stack.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/simple_call_stack_crp.png?raw=true"> +Here we see that most of the time is spent in native code (marked as blue) e.g., the <code>counter.update()</code> or <code>split()</code> C implementation.</p> +<p>Now let's proceed with the more optimized version. 
+This time we read 64 Kb of data from the standard input and count the words with <code>counter.update()</code>.</p> +<div class="code"><pre class="code literal-block">counts = collections.Counter() +remaining = '' +while True: + chunk = remaining + sys.stdin.read(64*1024) + if not chunk: + break + last_lf = chunk.rfind('\n') # process to last LF character + if last_lf == -1: + remaining = '' + else: + remaining = chunk[last_lf+1:] + chunk = chunk[:last_lf] + counts.update(chunk.lower().split()) + +for word, count in counts.most_common(): + print(word, count) +</pre></div> + +<p>As we did before, we are going to take a peek at the call stack.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/optimized_call_stack_crp.png?raw=true"> </p> +<p>Now there is more time spent in native code, caused by larger chunks of text passed to <code>counter.update()</code>.</p> +<p>This becomes even more clear by comparing the stack charts.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/simple_stack_chart.png?raw=true"></p> +<p>Here, in the unoptimized case, we only read in one line at each loop iteration. +This results in small "spikes" in the stack chart. 
</p> +<p>But let's take an even closer look.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/simple_stack_chart_zoom.png?raw=true"></p> +<p>Zoomed in, we see the call stack alternating between <code>_count_elements()</code> and (unfortunately unsymbolized) native calls coming from reading and splitting the input text (e.g., <code>decode()</code>).</p> +<p>Let us now take a look at the optimized case.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/optimized_stack_chart.png?raw=true"></p> +<p>And if we look closer at the same interval as before, we see some spikes, but slightly different.</p> +<p><img src="https://github.com/Cskorpion/vmprof-firefox-converter/blob/main/images/blog/optimized_stack_chart_zoom.png?raw=true"></p> +<p>Even though we do not want to compare the (amount of) milliseconds directly, we clearly see that the spikes are wider, i.e. the time spent in those function calls is longer. +You may already know where this comes from. +We read a 64 Kb chunk of data from stdin and pass that to <code>counter.update()</code>, so both these tasks do more work and take longer. +Bigger chunks mean there is less alternating between reading and counting, so there is more time spent doing work than "doing" loop iterations.</p> +<h3 id="getting-started">Getting started</h3> +<p>You can get the converter from <a href="https://github.com/Cskorpion/vmprof-firefox-converter">GitHub</a>.</p> +<p>Both VMProf and the vmprof-firefox-converter were created for profiling PyPy, but you can also use them with CPython. 
</p> +<p>This project is still somewhat experimental, so if you want to try it out, please let us know whether it worked for you.</p>https://www.pypy.org/posts/2024/05/vmprof-firefox-converter.htmlFri, 26 Apr 2024 14:38:00 GMTPyPy v7.3.16 releasehttps://www.pypy.org/posts/2024/04/pypy-v7316-release.htmlmattip<section id="pypy-v7-3-16-release-of-python-2-7-3-9-and-3-10"> +<h2>PyPy v7.3.16: release of python 2.7, 3.9, and 3.10</h2> +<p>The PyPy team is proud to release version 7.3.16 of PyPy.</p> +<p>This release includes security fixes from upstream CPython, and bugfixes to the +garbage collector, described in a <a class="reference external" href="https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.html">gc bug-hunt blog post</a>.</p> +<p>The release includes three different interpreters:</p> +<blockquote> +<ul class="simple"> +<li><p>PyPy2.7, which is an interpreter supporting the syntax and the features of +Python 2.7 including the stdlib for CPython 2.7.18+ (the <code class="docutils literal">+</code> is for +backported security updates)</p></li> +<li><p>PyPy3.9, which is an interpreter supporting the syntax and the features of +Python 3.9, including the stdlib for CPython 3.9.19.</p></li> +<li><p>PyPy3.10, which is an interpreter supporting the syntax and the features of +Python 3.10, including the stdlib for CPython 3.10.14.</p></li> +</ul> +</blockquote> +<p>The interpreters are based on much the same codebase, thus the multiple +release. This is a micro release, all APIs are compatible with the other 7.3 +releases. It follows after 7.3.15 release on Jan 15, 2024</p> +<p>We recommend updating. You can find links to download the v7.3.16 releases here:</p> +<blockquote> +<p><a class="reference external" href="https://pypy.org/download.html">https://pypy.org/download.html</a></p> +</blockquote> +<p>We would like to thank our donors for the continued support of the PyPy +project. 
If PyPy is not quite good enough for your needs, we are available for +<a class="reference external" href="https://www.pypy.org/pypy-sponsors.html">direct consulting</a> work. If PyPy is helping you out, we would love to hear +about it and encourage submissions to our <a class="reference external" href="https://pypy.org/blog">blog</a> via a pull request +to <a class="reference external" href="https://github.com/pypy/pypy.org">https://github.com/pypy/pypy.org</a></p> +<p>We would also like to thank our contributors and encourage new people to join +the project. PyPy has many layers and we need help with all of them: bug fixes, +<a class="reference external" href="https://www.pypy.org/posts/2024/04/index.html">PyPy</a> and <a class="reference external" href="https://rpython.readthedocs.org">RPython</a> documentation improvements, or general <a class="reference external" href="https://www.pypy.org/posts/2024/04/project-ideas.html">help</a> with +making RPython's JIT even better.</p> +<p>If you are a python library maintainer and use C-extensions, please consider +making a <a class="reference external" href="https://hpyproject.org/">HPy</a> / <a class="reference external" href="https://cffi.readthedocs.io">CFFI</a> / <a class="reference external" href="https://cppyy.readthedocs.io">cppyy</a> version of your library that would be performant +on PyPy. 
In any case, both <a class="reference external" href="https://github.com/joerick/cibuildwheel">cibuildwheel</a> and the <a class="reference external" href="https://github.com/matthew-brett/multibuild">multibuild system</a> support +building wheels for PyPy.</p> +<section id="what-is-pypy"> +<h3>What is PyPy?</h3> +<p>PyPy is a Python interpreter, a drop-in replacement for CPython. +It's fast (<a class="reference external" href="https://speed.pypy.org">PyPy and CPython 3.7.4</a> performance +comparison) due to its integrated tracing JIT compiler.</p> +<p>We also welcome developers of other <a class="reference external" href="https://rpython.readthedocs.io/en/latest/examples.html">dynamic languages</a> to see what RPython +can do for them.</p> +<p>We provide binary builds for:</p> +<blockquote> +<ul class="simple"> +<li><p><strong>x86</strong> machines on most common operating systems +(Linux 32/64 bits, Mac OS 64 bits, Windows 64 bits)</p></li> +<li><p>64-bit <strong>ARM</strong> machines running Linux (<code class="docutils literal">aarch64</code>).</p></li> +<li><p>Apple <strong>M1 arm64</strong> machines (<code class="docutils literal">macos_arm64</code>).</p></li> +<li><p><strong>s390x</strong> running Linux</p></li> +</ul> +</blockquote> +<p>PyPy supports Windows 32-bit, Linux PPC64 big- and little-endian, and Linux ARM +32 bit, but does not release binaries. Please reach out to us if you wish to +sponsor binary releases for those platforms. 
Downstream packagers provide +binary builds for debian, Fedora, conda, OpenBSD, FreeBSD, Gentoo, and more.</p> +</section> +<section id="what-else-is-new"> +<h3>What else is new?</h3> +<p>For more information about the 7.3.16 release, see the <a class="reference external" href="https://doc.pypy.org/en/latest/release-v7.3.16.html#changelog">full changelog</a>.</p> +<p>Please update, and continue to help us make pypy better.</p> +<p>Cheers, +The PyPy Team</p> +</section> +</section>releasehttps://www.pypy.org/posts/2024/04/pypy-v7316-release.htmlTue, 23 Apr 2024 12:22:08 GMTFixing a Bug in PyPy's Incremental GChttps://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.htmlCarl Friedrich Bolz-Tereick<h2 id="introduction">Introduction</h2> +<p>Since last summer, I've been looking on and off into a weird and hard to +reproduce <a href="https://github.com/pypy/pypy/issues/3959">crash bug in PyPy</a>. It was +manifesting only on CI, and it seemed to always happen in the AST rewriting +phase of <a href="https://pytest.org">pytest</a>, the symptoms being that PyPy would crash +with a segfault. All my attempts to reproduce it locally failed, and my +attempts to try to understand the problem by dumping the involved ASTs lead +nowhere.</p> +<p>A few weeks ago, we got <a href="https://github.com/PyO3/pyo3/issues/3766">two more</a> +<a href="https://github.com/orgs/pypy/discussions/4923">bug reports</a>, the last one by +the authors of the <a href="https://nanobind.readthedocs.io/">nanobind</a> binding +generator, with the same symptoms: crash in AST rewriting, only on CI. I +decided to make a more serious push to try to find the bug this time. +Ultimately the problem turned out to be several bugs in PyPy's garbage +collector (GC) that had been there since its inception in +<a href="https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html">2013</a>. 
+Understanding the +situation turned out to be quite involved, additionally complicated by this +being the first time that I was working on this particular aspect of PyPy's GC. +Since the bug was so much work to find, I thought I'd write a blog post about +it.</p> +<p>The blog post consists of three parts: first a chronological description of +what I did to find the bug, a technical explanation of what goes wrong, some +reflections on the bug (and then a bonus bug I also found in the process).</p> +<h2 id="finding-the-bug">Finding the Bug</h2> +<p>I started from the failing <a href="https://github.com/wjakob/nanobind/actions/runs/8234561874/job/22516568891">nanobind CI +runs</a> +that ended with a segfault of the PyPy interpreter. This was only an +intermittent problem, not every run was failing. When I tried to just run the +test suite locally, I couldn't get it to fail. Therefore at first I tried to +learn more about what was happening by looking on the CI runners.</p> +<h3 id="running-on-ci">Running on CI</h3> +<p>I forked the nanobind repo and hacked the CI script in order to get it to use a +PyPy build with <a href="https://doc.pypy.org/en/latest/build.html#making-a-debug-build-of-pypy">full debug information and more assertions turned on</a>. In order +to increase the probability of seeing the crash I added an otherwise unused +<a href="https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs">matrix</a> +variable to the CI script that just contained 32 parameters. This means every +build is done 32 times (sorry Github for wasting your CPUs 😕). 
With that +amount of repetition, I got at least one job of every build that was crashing.</p> +<p>Then I added the <code>-Xfaulthandler</code> option to the PyPy command which will use the +<a href="https://docs.python.org/3.11/library/faulthandler.html">faulthandler</a> module +to try to print a Python stacktrace if the VM segfaults, to confirm that PyPy was +indeed crashing in the <a href="https://docs.python.org/3/library/ast.html">AST</a> +<a href="https://github.com/pytest-dev/pytest/blob/main/src/_pytest/assertion/rewrite.py">rewriting +phase</a> +of pytest, which pytest uses for <a href="https://docs.pytest.org/en/7.1.x/how-to/assert.html#asserting-with-the-assert-statement">nicer +assertions</a>. +I experimented with hacking our faulthandler implementation to also give me a +C-level callstack, but that didn't work as well as I hoped.</p> +<p>Then I tried to run <a href="https://sourceware.org/gdb/">gdb</a> on CI to try to get it +to print a C callstack at the crash point. You can get gdb to execute commands +as if typed at the prompt with the <code>-ex</code> commandline option, I used something +like this:</p> +<div class="code"><pre class="code literal-block">gdb -ex "set confirm off" -ex "set pagination off" -ex \ + "set debuginfod enabled off" -ex run -ex where -ex quit \ + --args &lt;command&gt; &lt;arguments&gt; +</pre></div> + +<p>But unfortunately the crash never occurred when running in gdb.</p> +<p>Afterwards I tried the next best thing, which was configuring the CI runner to +<a href="https://github.com/itamarst/gha-upload-cores">dump a core file and upload it as a build +artifact</a>, which worked. Looking +at the cores locally only sort of worked, because I am running a different +version of Ubuntu than the CI runners. So I used +<a href="https://mxschmitt.github.io/action-tmate/">tmate</a> to be able to log into the +CI runner after a crash and interactively used gdb there. 
Unfortunately what I +learned from that was that the bug was some kind of <strong>memory corruption</strong>, +which is always incredibly unpleasant to debug. Basically the header word of a +Python object had been corrupted somehow at the point of the crash, which means +that its <a href="https://en.wikipedia.org/wiki/Virtual_method_table">vtable</a> wasn't +usable any more.</p> +<p>(Sidenote: <a href="https://www.pypy.org/posts/2009/10/gc-improvements-6174120095428192954.html#unifying-the-vtable-ptr-with-the-gc-header">PyPy doesn't really use a vtable +pointer</a>, +instead it uses half a word in the header for the vtable, and the other half +for flags that the GC needs to keep track of the state of the object. +Corrupting all this is still bad.)</p> +<h3 id="reproducing-locally">Reproducing Locally</h3> +<p>At that point it was clear that I had to push to reproduce the problem on my +laptop, to allow me to work on the problem more directly and not to always have +to go via the CI runner. Memory corruption bugs often have a lot of randomness +(depending on which part of memory gets modified, things might crash or more +likely just happily keep running). Therefore I decided to try to brute-force +reproducing the crash by simply running the tests many many times. Since the +crash happened in the AST rewriting phase of pytest, and that happens only if +no <a href="https://stackoverflow.com/questions/2998215/if-python-is-interpreted-what-are-pyc-files">pyc +files</a> +of the bytecode-compiled rewritten ASTs exist, I made sure to delete them +before every test run.</p> +<p>To repeat the test runs I used +<a href="https://tratt.net/laurie/src/multitime/">multitime</a>, which is a simple program +that runs a command repeatedly. It's meant for lightweight benchmarking +purposes, but it also halts the execution of the command if that command exits +with an error (and it sleeps a small random time between runs, which might help +with randomizing the situation, maybe). 
Here's a demo:</p> +<script src="https://asciinema.org/a/648877.js" id="asciicast-648877" async="true"></script> + +<p>(<a href="https://bernsteinbear.com/">Max</a> pointed out +<a href="https://github.com/silentbicycle/autoclave">autoclave</a> to me when reviewing +this post, which is a more dedicated tool for this job.)</p> +<p>Thankfully, running the tests repeatedly eventually led to a crash, solving my +"only happens on CI" problem. I then tried various variants to exclude possible +sources of errors. The first source of errors to exclude in PyPy bugs is the +just-in-time compiler, so I reran the tests with <code>--jit off</code> to see whether I +could still get it to crash, and thankfully I eventually could (JIT bugs are +often very annoying).</p> +<p>The next source of bugs to exclude was C-extensions. Since those were the tests +of nanobind, a framework for creating C-extension modules, I was a bit worried +that the bug might be in our emulation of CPython's C-API. But running PyPy +with the <code>-v</code> option (which will print all the imports as they happen) +confirmed that at the point of crash no C-extension had been imported yet.</p> +<h3 id="using-rr">Using <code>rr</code></h3> +<p>I still couldn't get the bug to happen in GDB, so the tool I tried next was +<a href="https://rr-project.org/">rr, the "reverse debugger"</a>. rr can record the execution of a program and +later replay it arbitrarily often. This gives you a time-traveling debugger +that allows you to execute the program backwards in addition to forwards. +Eventually I managed to get the crash to happen when running the tests with +<code>rr record --chaos</code> (<code>--chaos</code> randomizes some decisions that rr takes, to try to +increase the chance of reproducing bugs).</p> +<p>Using rr well is quite hard, and I'm not very good at it. 
The main approach I +use with rr to debug memory corruption is to replay the crash, then set a +<a href="https://sourceware.org/gdb/current/onlinedocs/gdb.html/Set-Watchpoints.html">watchpoint</a> +for the corrupted memory location, then use the command <code>reverse-continue</code> to +find the place in the code that mutated the memory location. <code>reverse-continue</code> +is like <code>continue</code>, except that it will execute the program backwards from the +current point. Here's a little demo of this:</p> +<script src="https://asciinema.org/a/648814.js" id="asciicast-648814" async="true"></script> + +<p>Doing this for my bug revealed that the object that was being corrupted was +erroneously collected by the garbage collector. For some reason the GC had +wrongly decided that the object was no longer reachable and therefore put the +object into a freelist by writing a pointer to the next entry in the freelist +into the first word of the object, overwriting the object's header. The next +time the object was used things crashed.</p> +<h3 id="side-quest-wrong-gc-assertions">Side-quest: wrong GC assertions</h3> +<p>At this point in the process, I got massively side-tracked. PyPy's GC has a +number of debug modes that you can optionally turn on. Those slow down the +program execution a lot, but they should in theory help to understand why the +GC goes wrong. When I turned them on, I was getting a failing assertion really +early in the test execution, complaining about an invariant violation in the GC +logic. At first this made me very happy. I thought that this would help me fix +the bug more quickly.</p> +<p>Extremely frustratingly, after two days of work I concluded that the assertion +logic itself was wrong. 
I have fixed that in the meantime too, the details +of that are in the bonus section at the end of the post.</p> +<h3 id="using-gdb-scripting-to-find-the-real-bug">Using GDB scripting to find the real bug</h3> +<p>After that disaster I went back to the earlier rr recording without GC assertions +and tried to understand in more detail why the GC decided to free an object +that was still being referenced. To be able to do that I used the <a href="https://sourceware.org/gdb/current/onlinedocs/gdb.html/Python-API.html">GDB Python +scripting +API</a> to +write some helper commands to understand the state of the GC heap (rr is an +extension of GDB, so the GDB scripting API works in rr too).</p> +<p>The first (small) helper command I wrote with the GDB scripting API was a way +to pretty-print the currently active GC flags of a random PyPy object, starting +just from the pointer. The more complex command I wrote was an object tracer, +which follows pointers to GC objects starting from a root object to explore the +object graph. The object tracer isn't complete, it doesn't deal with all the +complexities of PyPy's GC. 
But it was good enough to help me with my problem, I +found out that the corrupted object was stored in an array.</p> +<p>As an example, here's a function that uses the GDB API to walk one of the +helper data structures of the GC, a stack of pointers:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">walk_addr_stack</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span> +<span class="w"> </span><span class="sd">""" walk an instance of the AddressStack class (which is a linked list of</span> +<span class="sd"> arrays of 1019 pointers).</span> + +<span class="sd"> the first of the arrays is only partially filled with used_in_last_chunk</span> +<span class="sd"> items, all the other chunks are full."""</span> + <span class="k">if</span> <span class="n">obj</span><span class="o">.</span><span class="n">type</span><span class="o">.</span><span class="n">code</span> <span class="o">==</span> <span class="n">gdb</span><span class="o">.</span><span class="n">TYPE_CODE_PTR</span><span class="p">:</span> + <span class="n">obj</span> <span class="o">=</span> <span class="n">obj</span><span class="o">.</span><span class="n">dereference</span><span class="p">()</span> + <span class="n">used_in_last_chunk</span> <span class="o">=</span> <span class="n">lookup</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="s2">"used_in_last_chunk"</span><span class="p">)</span> + <span class="n">chunk</span> <span class="o">=</span> <span class="n">lookup</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="s2">"inst_chunk"</span><span class="p">)</span><span class="o">.</span><span class="n">dereference</span><span class="p">()</span> + <span class="k">while</span> <span class="mi">1</span><span class="p">:</span> + <span class="n">items</span> <span class="o">=</span> <span class="n">lookup</span><span class="p">(</span><span 
class="n">chunk</span><span class="p">,</span> <span class="s2">"items"</span><span class="p">)</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">used_in_last_chunk</span><span class="p">):</span> + <span class="k">yield</span> <span class="n">items</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> + <span class="n">chunk</span> <span class="o">=</span> <span class="n">lookup</span><span class="p">(</span><span class="n">chunk</span><span class="p">,</span> <span class="s2">"next"</span><span class="p">)</span> + <span class="k">if</span> <span class="ow">not</span> <span class="n">chunk</span><span class="p">:</span> + <span class="k">break</span> + <span class="n">chunk</span> <span class="o">=</span> <span class="n">chunk</span><span class="o">.</span><span class="n">dereference</span><span class="p">()</span> + <span class="n">used_in_last_chunk</span> <span class="o">=</span> <span class="mi">1019</span> +</pre></div> + +<p>The full file of supporting code I wrote can be found in <a href="https://gist.github.com/cfbolz/13cadcbbef321d93fc9790dff6f60a6a">this +gist</a>. This is +pretty rough throw-away code, however.</p> +<p>In the following recording I show a staged debugging session with some of the +extra commands I wrote with the Python API. The details aren't important, I +just wanted to give a bit of a flavor of what inspecting objects looks like:</p> +<script src="https://asciinema.org/a/648889.js" id="asciicast-648889" async="true"></script> + +<p>The next step was to understand why the array content wasn't being correctly +traced by the GC, which I eventually managed with some <a href="https://www.fayewilliams.com/2011/07/13/gdb-conditional-breakpoints/">conditional +breakpoints</a>, +more watchpoints, and using <code>reverse-continue</code>. 
It turned out to be a bug that +occurs when the content of one array was memcopied into another array. The +technical details of why the array wasn't traced correctly are described in +detail in the next section.</p> +<h3 id="writing-a-unit-test">Writing a unit test</h3> +<p>To try to make sure I really understood the bug correctly I then wrote a GC +unit test that shows the problem. Like most of PyPy, our GC is written in +RPython, a (somewhat strange) subset/dialect of Python2, which can be compiled +to C code. However, since it is also valid Python2 code, it can be <a href="https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html">unit-tested +on top of a Python2 +implementation</a> +(which is one of the reasons why we keep maintaining PyPy2).</p> +<p>In the GC unit tests you have a lot of control about what order things happen +in, e.g. how objects are allocated, when garbage collection phases happen, etc. +After some trying I managed to write a test that crashes with the same kind of +memory corruption that my original crash exhibited: an object that is still +reachable via an array is collected by the GC. 
To give you a flavor of what +this kind of test looks like, here's an (edited for clarity) version of the +test I eventually managed to write</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">test_incrementality_bug_arraycopy</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> + <span class="n">source</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">malloc</span><span class="p">(</span><span class="n">VAR</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span> <span class="c1"># first array</span> + <span class="c1"># the stackroots list emulates the C stack</span> + <span class="bp">self</span><span class="o">.</span><span class="n">stackroots</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">source</span><span class="p">)</span> + <span class="n">target</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">malloc</span><span class="p">(</span><span class="n">VAR</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span> <span class="c1"># second array</span> + <span class="bp">self</span><span class="o">.</span><span class="n">stackroots</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">target</span><span class="p">)</span> + <span class="n">node</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">malloc</span><span class="p">(</span><span class="n">S</span><span class="p">)</span> <span class="c1"># unrelated object, will be collected</span> + <span class="n">node</span><span class="o">.</span><span class="n">x</span> <span class="o">=</span> <span class="mi">5</span> + <span class="c1"># store reference into source array, calling the write barrier</span> + <span class="bp">self</span><span 
class="o">.</span><span class="n">writearray</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">node</span><span class="p">)</span> + <span class="n">val</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">collect_step</span><span class="p">()</span> + <span class="n">source</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">stackroots</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># reload arrays, they might have moved</span> + <span class="n">target</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">stackroots</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> + <span class="c1"># this GC step traces target</span> + <span class="n">val</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">collect_step</span><span class="p">()</span> + + <span class="c1"># emulate what a memcopy of arrays does</span> + <span class="n">res</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">writebarrier_before_copy</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">target</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span> + <span class="k">assert</span> <span class="n">res</span> + <span class="n">target</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">=</span> <span class="n">source</span><span 
class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># copy two elements of the arrays</span> + <span class="n">target</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="n">source</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> + <span class="c1"># now overwrite the reference to node in source</span> + <span class="bp">self</span><span class="o">.</span><span class="n">writearray</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">lltype</span><span class="o">.</span><span class="n">nullptr</span><span class="p">(</span><span class="n">S</span><span class="p">))</span> + <span class="c1"># this GC step traces source</span> + <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">collect_step</span><span class="p">()</span> + <span class="c1"># some more collection steps, crucially target isn't traced again</span> + <span class="c1"># but node is deleted</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span> + <span class="bp">self</span><span class="o">.</span><span class="n">gc</span><span class="o">.</span><span class="n">collect_step</span><span class="p">()</span> + <span class="c1"># used to crash, node got collected</span> + <span class="k">assert</span> <span class="n">target</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">x</span> <span class="o">==</span> <span class="mi">5</span> +</pre></div> + +<p>One of the good properties of testing our GC that way is that all the memory is +emulated. 
The crash in the last line of the test isn't a segfault at all, +instead you get a nice exception saying that you tried to access a freed chunk +of memory and you can then debug this with a python2 debugger.</p> +<h3 id="fixing-the-bug">Fixing the Bug</h3> +<p>With the unit test in hand, fixing the test was relatively straightforward (the +diff in its simplest form is anyway only a <a href="https://github.com/pypy/pypy/commit/78bbeb93471b5f38438004e971f4b4f84ab17a84">single line +change</a>). +After this first version of my fix, I +<a href="https://github.com/pypy/pypy/issues/4925#issuecomment-2014459454">talked to Armin +Rigo</a> who +helped me find different case that was still wrong, in the same area of the +code.</p> +<p>I also got help by the developers at <a href="https://portaone.com/">PortaOne</a> +who are using PyPy on their servers and had seen some <a href="https://github.com/pypy/pypy/issues/4900">mysterious PyPy +crashes</a> +recently, that looked related to the GC. They did test deployments of my fixes +in their various stages to their servers to try to see whether stability +improved for them. Unfortunately in the end it turned out that their crashes +are an unrelated GC bug related to object pinning, which we haven't resolved +yet.</p> +<h3 id="writing-a-gc-fuzzerproperty-based-test">Writing a GC fuzzer/property based test</h3> +<p>Finding bugs in the GC is always extremely disconcerting, particularly since +this one managed to hide for so long (more than ten years!). Therefore I wanted +to use these bugs as motivation to try to find more problems in PyPy's GC. Given +the ridiculous effectiveness of fuzzing, I used +<a href="https://hypothesis.readthedocs.io/en/latest/">hypothesis</a> to write a +property-based test. 
Every test performs a sequence of randomly chosen steps +from the following list:</p> +<ul> +<li>allocate an object</li> +<li>read a random field from a random object</li> +<li>write a random reference into a random object</li> +<li>drop a random stack reference</li> +<li>perform one GC step</li> +<li>allocate an array</li> +<li>read a random index from a random array</li> +<li>write to an array</li> +<li>memcopy between two arrays</li> +</ul> +<p>This approach of doing a sequence of steps is pretty close to the <a href="https://hypothesis.readthedocs.io/en/latest/stateful.html">stateful +testing</a> approach of +hypothesis, but I just implemented it manually with the <a href="https://hypothesis.readthedocs.io/en/latest/data.html#drawing-interactively-in-tests">data +strategy</a>.</p> +<p>Every one of those steps is always performed on both the tested GC, and on some +regular Python objects. The Python objects provide the "ground truth" of what +the heap should look like, so we can compare the state of the GC objects +with the state of the Python objects to find out whether the GC made a mistake.</p> +<p>In order to check whether the test is actually useful, I reverted my bug fixes +and made sure that the test re-finds both the spurious GC assertion error and the +problems with memcopying an array.</p> +<p>In addition, the test also found corner cases in my fix. There was a situation +that I hadn't accounted for, which the test eventually found. 
+I also plan on adding a bunch of other GC features as steps in the +test to stress them too (for example weakrefs, identity hashes, pinning, maybe +finalization).</p> +<p>At the point of publishing this post, the fixes got merged to the 2.7/3.9/3.10 +branches of PyPy, and will be part of the next release (v7.3.16).</p> +<h2 id="the-technical-details-of-the-bug">The technical details of the bug</h2> +<p>In order to understand the technical details of the bug, I need to give some +background explanations about PyPy's GC.</p> +<h3 id="pypys-incremental-gc">PyPy's incremental GC</h3> +<p>PyPy uses an incremental generational mark-sweep GC. It's +<a href="https://en.wikipedia.org/wiki/Tracing_garbage_collection#Generational_GC_(ephemeral_GC)">generational</a> +and therefore has minor collections (where only young objects get collected) +and major collections (collecting long-lived objects eventually, using a +<a href="https://en.wikipedia.org/wiki/Tracing_garbage_collection#Na%C3%AFve_mark-and-sweep">mark-and-sweep</a> +algorithm). Young objects are allocated in a nursery using a +bump-pointer allocator, which makes allocation quite efficient. They are moved +out of the nursery by minor collections. In order to find references from old +to young objects the GC uses a write barrier to detect writes into old objects.</p> +<p>The GC is also +<a href="https://en.wikipedia.org/wiki/Tracing_garbage_collection#Stop-the-world_vs._incremental_vs._concurrent">incremental</a>, +which means that its major collections aren't done all at once (which would +lead to long pauses). 
Instead, major collections are sliced up into small +steps, which are done directly after a minor collection (the GC isn't +<em>concurrent</em> though, which would mean that the GC does work in a separate +thread).</p> +<p>The incremental GC uses <a href="https://en.wikipedia.org/wiki/Tracing_garbage_collection#Tri-color_marking">tri-color +marking</a> +to reason about the reachable part of the heap during the marking phase, where +every old object can be:</p> +<ul> +<li>black: already marked, reachable, definitely survives the collection</li> +<li>grey: will survive, but still needs to be marked</li> +<li>white: potentially dead</li> +</ul> +<p>The color of every object is encoded by setting flags +in the object header.</p> +<p>The GC maintains the <strong>invariant</strong> that black objects must never point to white +objects. At the start of a major collection cycle the stack roots are turned +gray. During the mark phase of a major collection cycle, the GC will trace gray +objects, until +none are left. To trace a gray object, all the objects it references have to be +marked grey if they are white so far. After a grey object is traced, it can be +marked black (because all the referenced objects are now either black or gray). +Eventually, there are no gray objects left. At that point (because no white +object can be reached from a black one) all the white objects are known to be +unreachable and can therefore be freed.</p> +<p>The GC is incremental because every collection step will only trace a limited +number of gray objects, before giving control back to the program. This leads to +a problem: if an already traced (black) object is changed between two marking +steps of the GC, the program can mutate that object and write a new reference +into one of its fields. This could lead to an invariant violation, if the +referenced object is white. 
Therefore, the GC uses the write barrier (which it +needs anyway to find references from old to young objects) to mark all black +objects that are modified gray, and then trace them again at one of the +later collection steps.</p> +<h3 id="the-special-write-barrier-of-memcopy">The special write barrier of memcopy</h3> +<p>Arrays use a different kind of write barrier than normal objects. Since they +can be arbitrarily large, tracing them can take a long time. Therefore it's +potentially wasteful to trace them fully at a minor collection. To fix this, +the array write barrier keeps more granular information about which parts of +the array have been modified since the last collection step. Then only the +modified parts of the array need to be traced, not the whole array.</p> +<p>In addition, there is another optimization for arrays, which is that memcopy is +treated specially by the GC. If memcopy is implemented by simply writing a loop +that copies the content of one array to the other, that will invoke the write +barrier every single loop iteration for the write of every array element, +costing a lot of overhead. 
Here's some pseudo-code:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">arraycopy</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">dest</span><span class="p">,</span> <span class="n">source_start</span><span class="p">,</span> <span class="n">dest_start</span><span class="p">,</span> <span class="n">length</span><span class="p">):</span> + <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">length</span><span class="p">):</span> + <span class="n">value</span> <span class="o">=</span> <span class="n">source</span><span class="p">[</span><span class="n">source_start</span> <span class="o">+</span> <span class="n">i</span><span class="p">]</span> + <span class="n">dest</span><span class="p">[</span><span class="n">dest_start</span> <span class="o">+</span> <span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span> <span class="c1"># &lt;- write barrier inserted here</span> +</pre></div> + +<p>Therefore the GC has a special memcopy-specific +write barrier that will perform the GC logic once before the memcopy loop, and +then use a regular (typically SIMD-optimized) memcopy implementation from +<code>libc</code>. 
Roughly like this:</p> +<div class="code"><pre class="code literal-block"><span class="k">def</span> <span class="nf">arraycopy</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">dest</span><span class="p">,</span> <span class="n">source_start</span><span class="p">,</span> <span class="n">dest_start</span><span class="p">,</span> <span class="n">length</span><span class="p">):</span> + <span class="n">gc_writebarrier_before_array_copy</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">dest</span><span class="p">,</span> <span class="n">source_start</span><span class="p">,</span> <span class="n">dest_start</span><span class="p">,</span> <span class="n">length</span><span class="p">)</span> + <span class="n">raw_memcopy</span><span class="p">(</span><span class="n">cast_to_voidp</span><span class="p">(</span><span class="n">source</span><span class="p">)</span> <span class="o">+</span> <span class="n">source_start</span><span class="p">,</span> + <span class="n">cast_to_voidp</span><span class="p">(</span><span class="n">dest</span><span class="p">)</span> <span class="o">+</span> <span class="n">dest_start</span><span class="p">,</span> + <span class="n">sizeof</span><span class="p">(</span><span class="n">itemtype</span><span class="p">(</span><span class="n">source</span><span class="p">))</span> <span class="o">*</span> <span class="n">length</span><span class="p">)</span> +</pre></div> + +<p>(this is really a rough sketch. The <a href="https://github.com/pypy/pypy/blob/789f964fff59c722b0872abcdc56d2b1373a9f3b/rpython/rlib/rgc.py#L365">real +code</a> +is much more complicated.)</p> +<h3 id="the-bug">The bug</h3> +<p>The bugs turned out to be precisely in this memcopy write barrier. When we +implemented the current GC, we adapted our previous GC, which was a +generational mark-sweep GC but <em>not</em> incremental. 
We started with most of the +previous GC's code, including the write barriers. The regular write barriers +were adapted to the new incremental assumptions, in particular the need for the +write barrier to also turn black objects back to gray when they are modified +during a marking phase. This was simply not done at all for the memcopy write +barrier, at least in two of the code paths. Fixing this problem fixes the unit +tests and stops the crashes.</p> +<h2 id="reflections">Reflections</h2> +<p>The way the bug was introduced is really typical. A piece of code (the memcopy +write barrier) was written under a set of assumptions. Then those assumptions +changed later. Not all the code pieces that relied on these assumptions to be +correct were updated. It's pretty hard to prevent this in all situations.</p> +<p>I still think we could have done more to prevent the bug occurring. Writing a +property-based test for the GC would have been a good idea given the complexity +of the GC, and definitely something we did in other parts of our code at the +time (just using the <code>random</code> module mostly, we started using hypothesis +later).</p> +<p>It's a bit of a mystery to me why this bug managed to be undetected for so +long. Memcopy happens in a lot of pretty core operations of e.g. lists in +Python (<code>list.extend</code>, to name just one example). 
To speculate, I would suspect +that all the other preconditions for the bug occurring made it pretty rare:</p> +<ul> +<li>the content of an old list that is not yet marked needs to be copied into + another old list that is marked already</li> +<li>the source of the copy needs to also store an object that has no other + references</li> +<li>the source of the copy then needs to be overwritten with other data</li> +<li>then the next collection steps need to be happening at the right points</li> +<li>...</li> +</ul> +<p>Given the complexity of the GC logic I also wonder whether some lightweight +formal methods would have been a good idea. Formalizing some of the core +invariants in <a href="https://en.wikipedia.org/wiki/B-Method">B</a> or +<a href="https://en.wikipedia.org/wiki/TLA%2B">TLA+</a> and then <a href="https://en.wikipedia.org/wiki/Model_checking">model +checking</a> them up to some number +of +objects would have found this problem pretty quickly. There are also correctness +proofs for GC algorithms in some research papers, but I don't have a good +overview of the literature to point to any that are particularly good or bad. +Going such a more formal route might have fixed this and probably a whole bunch +of other bugs, but of course it's a pretty expensive (and tedious) approach.</p> +<p>While it was super annoying to track this down, it was definitely good to learn +a bit more about how to use rr and the GDB scripting interface.</p> +<h2 id="bonus-section-the-wrong-assertion">Bonus Section: The Wrong Assertion</h2> +<p>Some more technical information about the wrong assertion is in this section.</p> +<h3 id="background-pre-built-objects">Background: pre-built objects</h3> +<p>PyPy's VM-building bootstrapping process can "freeze" a bunch of heap objects +into the final binary. 
This allows the VM to start up quickly, because those +frozen objects are loaded by the OS as part of the binary.</p> +<p>Those frozen pre-built objects are parts of the 'roots' of the garbage +collector and need to be traced. However, tracing all the pre-built objects at +every collection would be very expensive, because there are a lot of them +(about 150,000 in a PyPy 3.10 binary). Tracing them all is also not necessary, +because most of them are never modified. Unmodified pre-built objects can only reference +other pre-built objects, which can never be deallocated anyway. Therefore we +have an optimization that uses the write barrier (which we need anyway to find +old-to-young pointers) to notice when a pre-built object gets modified for the +very first time. If that happens, it gets added to the set of pre-built objects +that gets counted as a root, and is traced as a root at collections +from then on.</p> +<h3 id="the-wrong-assertion">The wrong assertion</h3> +<p>The assertion that triggered when I turned on the GC debug mode was saying that +the GC found a reference from a black to a white object, violating its +invariant. Unmodified pre-built objects count as black, and they aren't roots, +because they can only ever reference other pre-built objects. However, when a +pre-built object gets modified for the first time, it becomes part of the root +set and will be marked gray. This logic works fine.</p> +<p>The wrong assertion triggers if a pre-built object is mutated for the very +first time in the middle of an incremental marking phase. While the pre-built +object gets added to the root set just fine, and will get traced before the +marking phase ends, this is encoded slightly differently for pre-built objects, +compared to "regular" old objects. 
Therefore, the invariant checking code +wrongly reported a black-&gt;white pointer in this situation.</p> +<p>To fix it I also wrote a unit test checking the problem, made sure that the GC +hypothesis test also found the bug, and then fixed the wrong assertion to take +the color encoding of pre-built objects into account.</p> +<p>The bug managed to be invisible because we don't tend to turn on the GC +assertions very often. We only do that when we find a GC bug, which is of +course also when we need it the most to be correct.</p> +<h2 id="acknowledgements">Acknowledgements</h2> +<p>Thanks to Matti Picus, Max Bernstein, Wouter van Heyst for giving me feedback on drafts of the +post. Thanks to Armin Rigo for reviewing the code and pointing out holes in my +thinking. Thanks to the original reporters of the various forms of the bug, +including Lily Foote, David Hewitt, Wenzel Jakob.</p>https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.htmlTue, 26 Mar 2024 19:14:09 GMT \ No newline at end of file diff --git a/search.html b/search.html new file mode 100644 index 000000000..89d348e2a --- /dev/null +++ b/search.html @@ -0,0 +1,112 @@ + + + + + +Search | PyPy + + + + + + + + + + + + + + + + Skip to main content +
                              +

                              Search

                              +

                              More PyPy info can be found at our + reference doc site +

                              + +
                              Use the search box in the navigation bar to search.
                              +

                              +

                              +
                              +
                              + © 2024 The PyPy Team +   + Built with Nikola +   + Last built 2024-08-31T17:48 +
                              + + + + + +
                              +
                              +
                              + + \ No newline at end of file diff --git a/sidebar-en.inc b/sidebar-en.inc new file mode 100644 index 000000000..73a599194 --- /dev/null +++ b/sidebar-en.inc @@ -0,0 +1,143 @@ +
                              +

                              + The PyPy blogposts +

                              +
                              + Create a guest post via a PR to the source repo +
                              +
                              + + +
                              +

                              + Archives +

                              + +
                              + + +
                              +

                              + Tags +

                              + +
                              \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 000000000..272a774f9 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,2380 @@ + + + + https://www.pypy.org/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2007/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2008/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2009/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2010/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2011/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2012/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2013/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2014/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2015/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2016/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2017/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2018/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2019/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2020/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2021/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2022/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2023/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/2024/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/archive.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/alex.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/alexander-schremmer.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/antonio-cuni.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/armin-rigo.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/bea-during.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/benjamin-peterson.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/brian-kearns.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/carl-friedrich-bolz-tereick.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/cf-bolz-tereick.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/christoph-jung.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/david-schneider.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/hakan-ardo.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/hodgestar.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/holger-krekel.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/maciej-fijalkowski.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/mattip.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/max-bernstein.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/michael-foord.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/philip-jenvey.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/richard-plangger.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/romain-guillebert.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/samuele-pedroni.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/the-pypy-team.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/unknown.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/vilhjalmur-thorsteinsson.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/wim-lavrijsen.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-1.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-10.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-11.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-12.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-13.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-14.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-15.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/blog/index-16.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-17.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-18.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-19.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-2.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-20.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-21.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-22.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-23.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-24.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-25.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-26.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-27.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-28.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-29.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-3.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-30.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-31.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-32.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-33.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-34.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-35.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-36.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-37.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-38.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-39.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-4.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-40.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-41.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/blog/index-42.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-43.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-44.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-5.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-6.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-7.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-8.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/blog/index-9.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/ + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/arm.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/casestudy.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/cli.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/compiler.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/conda-forge.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/cpyext.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/cpython.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/ep2008.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/extension-modules.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/gc.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/graalpython.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/guestpost.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/heptapod.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/hpy.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/jit.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/jython.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/kcachegrind.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/meta.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/numpy.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/parser.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/performance.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/profiling.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/pypy.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/pypy3.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/pyqt4.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/releasecffi.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/releaserevdb.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/releasestm.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/revdb.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/roadmap.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/rpyc.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/smalltalk.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/speed.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/sponsors.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/sprint.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/sprints.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/squeak.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/stm.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/sun.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/testing.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/toy-optimizer.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/unicode.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/valgrind.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/z3.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/checksums.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/compat.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/contact.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/download.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/download_advanced.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/features.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/howtohelp.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/people.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/performance.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/10/first-post-8150793557471983289.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/11/pypy-cleanup-sprint-startup-4429006224971155209.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/11/pypy-google-tech-talk-9082134238390123890.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/11/pypy-road-show-1-new-york-and-ibm-7837076523877011699.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/11/pypy-road-show-5790414147905233059.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/11/ropes-branch-merged-8782576892496878598.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/11/sprint-discussions-releases-testing-1126468258904483211.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/11/sprint-discussions-wrapping-external-8731011170537270161.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/11/sprint-pictures-3151912856495869652.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/11/unicode-support-in-rpython-in-recent-1444449848043047640.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/12/faster-implementation-of-classic-1021557618590043616.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/12/faster-than-c-8057790636822502084.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2007/12/german-slides-of-talk-at-python-user-3715884461725333051.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/12/good-news-from-garbage-collection-front-2678138026363485439.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/12/profiling-for-fun-with-valgrind-3215121784705288400.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/12/pypy-talk-at-python-user-group-munich-1952379593354367249.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/12/pypy-tasks-in-ghop-5130253260153218709.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/12/pypy-winter-sports-sprint-from-12-19th-5592383212609773292.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2007/12/various-performance-improvements-7027210611565246190.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/01/buildbots-and-better-platform-support-6965497451398110731.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/01/crashing-other-peoples-compilers-4574453763254909150.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/01/finding-gc-roots-using-llvm-or-parsing-1980376164990001937.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/01/improve-net-integration-2239651503641931440.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/01/leysin-winter-sport-sprint-started-5478612778498579467.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/01/pypy-keyboard-heatmap-4950995633665492453.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/01/pypynet-goes-windows-forms-7031406830502864570.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/01/rpython-can-be-faster-than-c-2559071147541131237.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/01/visualizing-python-tokenizer-5020282079473796926.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2008/02/pypy-presence-on-various-conferences-in-6584680808789191759.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/02/python-finalizers-semantics-part-1-1196956834543115766.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/02/python-finalizers-semantics-part-2-2748812428675325525.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/02/running-pyglet-on-pypy-3191536711417589549.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/03/as-fast-as-cpython-for-carefully-taken-1984440931984637179.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/03/bittorrent-on-pypy-7984272143557948160.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/03/ctypes-configuration-tool-7414864595600362988.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/03/py-lib-091-released-1654797401128918376.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/03/pypy-summer-of-code-participation-3403842530060519982.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/04/float-operations-for-jit-6499693696246367083.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/04/googles-summer-of-code-4911168632727441622.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/04/other-aprils-fools-ideas-955926452383759016.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/04/trying-to-get-pypy-to-run-on-python-30-5082015544752137606.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/04/wrapping-pyrepl-in-readline-api-362730784820949868.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/05/berlin-sprint-day-1-2-8761821946764492267.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/05/berlin-sprint-finished-1597243123548564657.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/05/general-performance-improvements-838741900863354293.html + 2024-08-31T17:48:00Z + 
+ + https://www.pypy.org/posts/2008/05/more-windows-support-1747028151130099034.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/05/next-sprint-berlin-may-17-22nd-may-5362899847460267375.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/05/progresses-on-cli-jit-backend-front-1021772190959551376.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/05/s3-workshop-potsdam-2008-writeup-6610637452403831794.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/05/threads-and-gcs-1126087726480790112.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/better-profiling-support-for-pypy-1848129914083462080.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/german-introductory-podcast-about-3836017753197345761.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/hi-all-some-news-from-jit-front-7534695765973581706.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/list-comprehension-implementation-5289956690288817225.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/next-sprint-vilniuspost-europython-10-3844544842675903586.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/pdb-and-rlcompleterng-2414105295687348881.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/pypy-code-swarm-7038411918926116477.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/pypy-improvements-5272963843122158791.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/running-nevow-on-top-of-pypy-58891137802412513.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/06/running-pylons-on-top-of-pypy-3234492105090025733.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/07/ep2008-pypy-meets-jython-1107070144380217881.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/07/europython-2008-pypy-talks-and-sprint-2255727845041197411.html + 2024-08-31T17:48:00Z 
+ + + https://www.pypy.org/posts/2008/07/finding-bugs-in-pypy-with-fuz-7503072572107631526.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/07/pypy-at-europython-2008-1488914968455397674.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/07/pypys-python-runs-pinax-django-1265543049596913506.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/08/new-translation-option-opt-7737733390438084418.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/08/pylibpytest-092-released-6233865913406513469.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/08/pypy-runs-unmodified-django-10-beta-7105507436425430319.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/09/dsseldorf-pypy-sprint-5-13th-october-8919978872121664955.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/09/pycon-uk-javascript-and-gil-8387247619202094916.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/09/pypypython-at-maemo-summit-6115106472056714072.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/10/dsseldorf-sprint-report-days-1-3-5256639868851086032.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/10/prolog-jit-masters-thesis-finished-5462132148241449867.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/10/sprint-discussions-c-library-bindings-249141169883996521.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/10/sprint-discussions-jit-generator-3301578822967655604.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/10/sprint-discussions-release-planning-7097053444808236145.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/11/one-year-pypy-blog-3267056180369310162.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/11/porting-jit-to-cli-part-1-8712941279840156635.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/11/porting-jit-to-cli-part-2-2456826431882963884.html 
+ 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/12/porting-jit-to-cli-part-3-3519327524638923621.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2008/12/pycon-2009-9090464449197911432.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/01/wroclaw-2009-pypy-sprint-and-talk-8240928228677982487.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/02/wroclaw-2009-sprint-progress-report-2510073170049635489.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/03/applying-tracing-jit-to-interpreter-3287844903778799266.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/03/good-news-everyone-421421336094214242.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/03/jit-bit-of-look-inside-7472130507462677287.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/03/next-leysin-winter-sprint-1791506307881043273.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/03/pypy-on-mobiles-at-openbossa-845760004725129519.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/03/pypy-talk-at-openbossa-09-5135830287297423499.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/03/vm-summit-nice-to-see-friendly-8755773725359396485.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/04/11-final-released-225813777919757859.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/04/4-weeks-of-gdb-522864241041643529.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/04/beta-for-110-released-4604559533184706699.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/04/leysin-sprint-report-1416905818217912359.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/04/leysin-sprint-started-4551365436232104640.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/04/pycon-videos-are-online-909873128878039557.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2009/04/roadmap-for-jit-377358891902851723.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/05/icooolps-submissions-6705901656116873587.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/06/europython-8318355560715932819.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/06/jit-progress-7289127796450840053.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/06/news-from-jit-front-367552118380842303.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/07/ecoop-2009-8415055006373020774.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/07/pypy-numeric-experiments-2221073696038673235.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/08/gothenburg-jit-sprint-report-3309138497953458138.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/08/pypy-gets-new-compiler_25-6401910947439531107.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/09/first-results-of-jit-6674537807334018925.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/09/pypy-sprint-in-dusseldorf-6-nov-13-nov-8153983964308175836.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/10/first-pypy-cli-jit-benchmarks-6698484455072589492.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/10/gc-improvements-6174120095428192954.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/10/pypys-jit-now-supports-floats-7003493323596806737.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/11/dusseldorf-sprint-report-2505348213879053352.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/11/dusseldorf-sprint-started-7608527610228870250.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/11/hi-all-this-week-i-worked-on-improving-6515977421244851229.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/11/pypy-on-rupy-2009-5675275348619189353.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/11/some-benchmarking-9211261260383281459.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/11/using-cpython-extension-modules-with-4951018896657992031.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/12/accelerating-pypy-development-by-8973749020516679741.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/12/leysin-winter-sprint-23-30th-january-7768876505015446348.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2009/12/planning-next-release-of-pypy-4193252449406707091.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/01/nightly-graphs-of-pypys-performance-8360469412941669946.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/02/pycon-2010-report-6986911457623699520.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/03/blog-coverage-of-speedpypyorg-2291955489972824511.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/03/hello-5058108566628405592.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/03/heroes-of-12-release-7211722984024027191.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/03/introducing-nightly-builds-and-ubuntu-3346203966988761264.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/03/introducing-pypy-12-release-2791388655442447862.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/03/introducing-speedpypyorg-1822874891591164256.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/03/state-of-pypy-talk-from-pycon-6748503931490058986.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/04/pypy-on-google-open-source-blog-1192495586835103069.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/04/using-cpython-extension-modules-with-5864754772659599217.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/05/efficient-and-elegant-regular-2727904462179540436.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/05/pypy-in-googles-summer-of-code-2010-5321939902318322352.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/05/running-wxpython-on-top-of-pypy-52246787415886751.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/06/blackhole-interpreter-2752965445510091289.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/06/jit-for-regular-expression-matching-3877859053629057968.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/06/pypy-13-released-8546085566902489304.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/07/cern-sprint-report-wrapping-c-libraries-6547377950791793143.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/07/comparing-spur-to-pypy-8835011873209414462.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/07/europython-2010-report-7803731360759120212.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/07/play-on-regular-expression-9014941705636345998.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/08/call-for-benchmarks-2605012131351543912.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/08/europython-2010-videos-available-8446190660370796142.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/08/pyohio-2568618480482575546.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/08/using-virtualenv-with-pypy-7238942727709530503.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/09/escape-analysis-in-pypys-jit-1780048403046080197.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/09/pypy-in-googles-summer-of-code-2010-1267220161643618015.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/09/using-escape-analysis-across-loop-2887031293132023676.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/10/dusseldorf-sprint-report-2010-371223200425847723.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/10/next-pypy-sprint-4850394963147107623.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/10/peace-of-green-4230271053903469504.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/10/phd-thesis-about-pypys-cli-jit-backend-969267841095296323.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/11/efficiently-implementing-python-objects-3838329944323946932.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/11/improving-memory-behaviour-to-make-self-856966667913962461.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/11/pypy-14-ouroboros-in-practice-5437628000869417542.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/11/running-large-radio-telescope-software-7600337209616168504.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/11/snake-which-bites-its-tail-pypy-jitting-5161284681004717142.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/11/speeding-up-pypy-by-donations-6035529829962326007.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/12/leysin-winter-sprint-8115212435349091722.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/12/oh-and-btw-pypy-gets-funding-through-3568486750776147382.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/12/pypy-14-release-aftermath-2979780282210978576.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/12/pypy-141-7283625923182122073.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/12/pypy-migrates-to-mercurial-3308736161543832134.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2010/12/we-are-not-heroes-just-very-patient-7114408885070101720.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/01/jit-backend-for-arm-processors-5994810755839586463.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2011/01/loop-invariant-code-motion-1998392217676829154.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/01/pypy-wants-you-4543209863582915733.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/02/pypy-faster-than-c-on-carefully-crafted-5614784244310486765.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/02/pypy-san-franciso-bay-area-tour-2011-6179180737090334330.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/02/pypy-winter-sprint-report-4155886720346408516.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/03/bay-area-2011-tour-summary-9117372109664978472.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with-871085470935630424.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_15-3281215865169782921.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_21-6524148550848694588.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/03/controlling-tracing-of-interpreter-with_26-3072929156700508140.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/03/thank-you-to-psf-5934275567667314914.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/03/us-trip-report-popl-microsoft-ibm-3874568000250679204.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/04/pypy-15-released-catching-up-302997959079576809.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/04/pypy-goteborg-post-easter-sprint-april-16274563331982977.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/04/tutorial-part-2-adding-jit-8121732841568309472.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/04/tutorial-writing-interpreter-with-pypy-3785910476193156295.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2011/04/using-tkinter-and-idle-with-pypy-6156563216925585965.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/05/numpy-follow-up-6928627691060102514.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/05/numpy-in-pypy-status-and-roadmap-8332894230779779992.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/05/playing-with-linear-programming-on-pypy-4040572987275633047.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/05/pypy-genova-pegli-post-europython-4004229800858530064.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/05/pypy-usage-survey-1402303968715807009.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/05/server-migration-in-progress-2113491786141182920.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/06/global-interpreter-lock-or-how-to-kill-8270246310848099963.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/06/report-back-from-our-survey-2083371215707583264.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/07/realtime-image-processing-in-python-6985924592886873374.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/08/pypy-16-kickass-panda-559424594592497545.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/08/pypy-is-faster-than-c-again-string-6756589731691762127.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/08/visualization-of-jitted-code-6202490807361942120.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/08/we-need-software-transactional-memory-6513983438425039230.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/08/wrapping-c-libraries-with-reflection-3916959558080483711.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/09/py3k-for-pypy-fundraiser-8139653689520709617.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2011/10/more-compact-lists-with-list-strategies-8229304944653956829.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/10/numpy-funding-and-status-update-2380711174693638392.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/10/pypy-goteborg-post-halloween-sprint-nov-7335004338996313725.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/10/speeding-up-json-encoding-in-pypy-8937643890263223898.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/11/gothenburg-sprint-report-8371395613874909242.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/11/pypy-17-on-win32-4962523601794245248.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/11/pypy-17-widening-sweet-spot-4260962828394182017.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/12/come-see-us-at-pycon-2012-610420698450130659.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/12/leysin-winter-sprint-6862532189897876336.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2011/12/plotting-using-matplotlib-from-pypy-6389240123679375092.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/01/comparing-partial-evaluation-and-7255412724168990164.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/01/numpypy-progress-report-running-3336055571122066974.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/01/numpypy-status-update-6434340612277938795.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/01/py3k-and-numpy-first-stage-thanks-to-3008917396290059758.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/01/pypy-internship-at-ncar-2244162842744077724.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/01/simple-tracer-for-flow-graph-language-6930951890987229484.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/01/transactional-memory-ii-7225309560970774590.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/02/almost-there-pypys-arm-backend_01-3216759488618774525.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/02/introductionary-article-about-rpython-5386281283454207551.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/02/larger-example-for-flow-graph-language-6139699450091061040.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/02/optimizing-traces-of-flow-graph-4169388883059419385.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/02/py3k-status-update-8840622949715145821.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/02/pypy-18-business-as-usual-7266036404915945090.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/03/call-for-donations-for-software-8853699867109654713.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/03/py3k-status-update-2-4018939509128176130.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/04/numpy-on-pypy-progress-report-6048076549081013253.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/04/py3k-status-update-3-6975588144646689872.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/04/pycon-2012-wrap-up-559575896040055505.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/04/pypy-sprint-in-leipzig-june-22-27-6450601012927549960.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/04/stm-update-and-thanks-everybody-6071745734932940294.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/05/stm-update-back-to-threads-6622746581767639355.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/06/architecture-of-cppyy-9077100041707701102.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/06/europython-sprint-5668923199392472912.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/06/py3k-status-update-4-4834053219477515637.html + 2024-08-31T17:48:00Z + 
+ + https://www.pypy.org/posts/2012/06/pypy-19-yard-wolf-7006180436602667005.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/06/release-01-of-cffi-4760622823232463868.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/06/stm-with-threads-7818875111634541910.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/07/cffi-release-02-4800000428934604295.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/07/hello-everyone-6869934374873967346.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/07/py3k-status-update-5-359698189825543897.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/08/c-objects-in-cppyy-part-1-data-members-1105848719513737614.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/08/cffi-release-03-4740491796308953732.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/08/hello-everyone-5492331040603503642.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/08/multicore-programming-in-pypy-and-6595343388141556320.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/09/numpy-on-pypy-status-update-1605312600799448094.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/09/py3k-status-update-6-4049281716377789914.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/09/pypy-cape-town-sprint-oct-7th-oct-21st-5757682347636918027.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/10/cape-town-2012-sprint-report-1612771358321767072.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/11/numpy-status-update-5-5489198414356844587.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/11/py3k-status-update-7-6182140595418083307.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/11/pypy-20-beta-1-2702952243260181341.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/11/pypy-san-francisco-sprint-dec-1st-dec-5133109101989613355.html 
+ 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/12/py3k-status-update-8-3932232806458251730.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2012/12/pypy-related-internship-at-ncar-7412729710421119926.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/01/numpypy-2013-developer-position-1547805593757893630.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/01/py3k-status-update-9-98332471264591773.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/02/10-years-of-pypy-634401291726575821.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/02/announcing-topaz-rpython-powered-ruby-6662407703061538341.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/02/cffi-05-1630643916751622710.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/02/cppyy-status-update-808802896237239604.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/02/hello-everyone-4718797989680066222.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/03/numpy-status-update-and-developer-1503421654591696377.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/03/py3k-status-update-10-6681398990092286007.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/03/so-you-want-to-try-pypy-4702482800824669595.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/04/pypy-20-beta-2-released-4858660312787995512.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/05/numpy-status-update-4176018422530420763.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/05/pypy-20-alpha-for-arm-2318299473927531503.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/05/pypy-20-einstein-sandwich-635158782365435530.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/05/pypy-201-bohr-smrrebrd-6316445093061941482.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2013/05/pypy-202-fermi-panini-1917947221142595738.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/06/numpypy-status-update-3846626188716521472.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/06/py3k-status-update-11-133025715908408072.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/06/stm-on-drawing-board-1028082727566254104.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/07/europython-8992114341185888806.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/07/pypy-21-beta-1351105697755187196.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/07/pypy-21-beta-2-264349571160808803.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/07/pypy-demo-evening-in-london-august-27-3640213278969666664.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/07/pypy-london-sprint-august-26-september-5156945690440578388.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/07/pypy-san-francisco-sprint-july-27th-2012-3064530444396960172.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/07/pypy3-21-beta-1-8647445024868663902.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/07/software-transactional-memory-lisp-7777576128992250197.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/08/numpy-road-forward-4210065750776753500.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/08/numpypy-status-update-3401163348519734658.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/08/preliminary-london-demo-evening-agenda-5254002451136674320.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/08/pypy-21-considered-armful-7177475722033479233.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/08/slides-of-pypy-london-demo-evening-5157052112396009739.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2013/08/update-on-stm-8705514488940872802.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/09/numpy-status-update-5160363918470470887.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/09/pycon-south-africa-sprint-6630788654105016762.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/10/incremental-garbage-collector-in-pypy-8956893523842234676.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/10/making-coveragepy-faster-under-pypy-935409618297062344.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/10/update-on-stm-7145890443443707910.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/11/cffi-08-6086756821078041950.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/11/numpy-status-update-1609808546418002632.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/11/py3k-status-update-12-5307085693947812769.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/11/pypy-22-incrementalism-4723643710897639332.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/11/pypy-221-incrementalism1-9197847629771910947.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/11/pypy-leysin-winter-sprint-11-19st-8860782754173653661.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/12/numpy-status-update-november-364321959153372759.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2013/12/pygame-cffi-8991437796535033699.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/02/numpy-status-update-decemberjanuary-4292961614234099787.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/02/py3k-status-update-13-4630607029125647100.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/02/rewrites-of-stm-core-model-again-633249729751034512.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2014/03/hello-everyone-there-is-interview-with-7561523711224053700.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/03/hi-all-here-is-one-of-first-full-pypys-8725931424559481728.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/03/numpy-status-update-february-1245769841736493525.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/03/pygamecffi-pygame-on-pypy-8679802461301121984.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/04/numpy-on-pypy-status-update-1103134247318103282.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/04/stm-results-and-second-call-for-1767845182888902777.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/05/pypy-23-terrestrial-arthropod-trap-9057496904945555741.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/06/pypy-231-terrestrial-arthropod-trap-5076300474324870908.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/06/pypy3-231-fulcrum-3765964217640322884.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/07/pypy-stm-first-interesting-release-8684276541915333814.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/08/a-field-test-of-software-transactional-5659022209916605798.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/09/pypy-24-beta-just-in-time-for-psfs-5956090195665204063.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/09/pypy-240-released-9-days-left-in-7722154416024407111.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/09/python-software-foundation-matching-2230529993193139046.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/10/couchbase-contribution-to-pypy-2360892117372790069.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/10/pypy3-240-released-5007750685927360190.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2014/11/pypy-io-improvements-1042070332447047674.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/11/september-donations-and-thank-you-to-4531550307707104017.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2014/11/tornado-without-gil-on-pypy-stm-7284102716557557428.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/01/faster-more-memory-efficient-and-more-4096950404745375390.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/01/leysin-winter-sprint-20-28th-february-2590212640945547308.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/02/experiments-in-pyrlang-with-rpython-8103387814587972227.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/02/linalg-support-in-pypynumpy-1131217944329711855.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/02/numpypy-status-january-2015-5092986229783279944.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/02/pypy-250-released-247160062953533060.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/03/pydgin-using-rpython-to-generate-fast-1514065178985838697.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/03/pypy-251-released-5657064769385723517.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/03/pypy-stm-251-released-1342113838236225773.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/05/cffi-10-beta-1-4375652711495636911.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/05/cffi-101-released-756545636419794802.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/06/pypy-260-release-8983050552628070433.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/06/pypy-and-ijson-guest-blog-post-8143007374752482637.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/08/pypy-261-released-3638960649983103796.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2015/09/pypy-warmup-improvements-8349465374608676233.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/10/automatic-simd-vectorization-support-in-639063580401330508.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/10/powerpc-backend-for-jit-3014100267884692148.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/10/pypy-400-released-jit-with-simd-8282134928733384063.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/10/pypy-memory-and-warmup-improvements-2-4598780879518640015.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2015/11/pypy-401-released-please-update-2652340737298251005.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/01/leysin-winter-sprint-20-27th-february-1737200016169608469.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/01/using-cffi-for-embedding-8493496761738752124.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/02/c-api-support-update-8582726091670983181.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/03/pypy-50-released-5730569530415927220.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/03/pypy-501-bugfix-released-2218405735970044084.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/04/pypy-51-released-4979856639628970409.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/04/pypy-enterprise-edition-3688275697656890948.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/04/warmup-improvements-more-efficient-7082900097299909512.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/05/pypy-511-bugfix-released-7586640750680293200.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/05/pypy33-v52-alpha-1-released-1725927506363370346.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/06/pypy2-v53-released-major-c-extension-7708576047190172431.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2016/07/reverse-debugging-for-python-8854823774141612670.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/08/pypy-gets-funding-from-mozilla-for-5569307998787871200.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/08/pypy-tooling-upgrade-jitviewer-and-5107430577468391432.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/08/pypy2-v54-released-incremental-3611318295736669599.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/09/pypy-541-bugfix-released-3217566297258542810.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/09/revdb-released-v541-6719768292347391304.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/10/pypy3-550-released-8069558680221199646.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/11/pypy27-v56-released-stdlib-2712-support-5671090852400583673.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2016/11/vectorization-extended-powerpc-and-s390x-4042433015460084057.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/01/leysin-winter-sprint-2526th-feb-4th-3831779797804484935.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/03/async-http-benchmarks-on-pypy3-1092124994927894138.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/03/leysin-winter-sprint-summary-4587213628578490701.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/03/pypy27-and-pypy35-v57-two-in-one-release-4736633226245374150.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/04/native-profiling-in-vmprof-6949065546884243105.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/04/pypy-571-bugfix-released-8519267986159880133.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/06/pypy-v58-released-739876359584854017.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/07/binary-wheels-for-pypy-8718353804433344916.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/08/lets-remove-global-interpreter-lock-748023554216649595.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/10/cape-of-good-hope-for-pypy-hello-from-3656631725712879033.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/10/how-to-make-your-code-80-times-faster-1424098117108093942.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/10/pypy-v59-released-now-supports-pandas-2261195727261691228.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2017/12/pypy27-and-pypy35-v510-dual-release-3223396318213306071.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/01/leysin-winter-sprint-17-24-march-2018-7141092581585849418.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/01/pypy-5101-bugfix-release-for-python-35-8485250762789380657.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/03/leysin-winter-sprint-2018-review-3988364248531980164.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/04/how-to-ignore-annoying-cython-warnings-1007636731207810779.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/04/improving-syntaxerror-in-pypy-5733639208090522433.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/04/pypy27-and-pypy35-v60-dual-release-7416552143474607997.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/06/repeating-matrix-multiplication-8641748742577945875.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/09/inside-cpyext-why-emulating-cpython-c-8083064623681286567.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/09/the-first-15-years-of-pypy-3412615975376972020.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/11/guest-post-implementing-calculator-repl-6271483514675006846.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/11/hello-everyone-at-pypy-we-are-trying-to-5336557946798583063.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2018/12/pypy-winter-sprint-feb-4-9-in-dusseldorf-7199110498451574074.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/01/pypy-for-low-latency-systems-613165393301401965.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/02/dusseldorf-sprint-report-2019-6107623654916313905.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/02/pypy-v700-triple-release-of-27-35-and-606875333356156076.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/03/pypy-v71-released-now-uses-utf-8-451324088028792912.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/04/an-rpython-jit-for-lpegs-4779548053359386284.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/04/pypy-711-bug-fix-release-6539023630991217367.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/07/pypy-jit-for-aarch64-7161523403247118006.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/08/a-second-life-for-sandbox-6848726729476245390.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/10/pypy-v72-released-1090406556726313495.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/10/pypys-new-json-parser-492911724084305501.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/12/hpy-kick-off-sprint-report-1840829336092490938.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2019/12/pypy-730-released-3614026620096963655.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2020/01/leysin-winter-sprint-2020-feb-28-march-6349761524797409012.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2020/02/pypy-and-cffi-have-moved-to-heptapod-5791595152472747032.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2020/03/leysin-2020-sprint-report-764567777353955897.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2020/04/pypy-731-released-6266451647387657480.html + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2020/08/a-new-chapter-for-pypy-8388322709667328389.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2020/08/pypy-is-on-open-collective-5673322428814364737.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2020/09/pypy-732-triple-release-python-27-36-3980901335490872787.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2020/11/pypy-733-triple-release-python-37-36-3446596804408262749.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2020/12/mac-meets-arm64-940822335619099039.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/03/new-hpy-blog.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/03/pypys-blog-has-moved.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/04/pypy-v734-release-of-python-27-and-37.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/04/ways-pypy-graphviz.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/05/pypy-irc-moves-to-libera-chat.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/05/pypy-v735-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/09/jit-auto-generated-code.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/10/pypy-v736-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/10/pypy-v737-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2021/12/error-message-style-guides.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/02/nlp-icelandic-case-study.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/02/pypy-v738-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/03/pypy-v738-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/04/how-is-pypy-tested.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/07/ddorf-sprint-sep-2022.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2022/07/m1-support-for-pypy.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/07/toy-optimizer.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/10/blog-15-years.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/10/toy-optimizer-allocation-removal.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/11/pypy-and-conda-forge.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/12/jit-bug-finding-smt-fuzzing.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/12/pypy-v7310-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2022/12/pypy-v7311-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2023/01/string-concatenation-quadratic.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2023/05/rpython-used-to-speed-up-risc-v-simulation-over-15x.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2023/06/pypy-v7312-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2023/09/pypy-v7313-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2023/12/pypy-moved-to-git-github.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2023/12/pypy-v7314-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/01/pypy-v7315-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/03/fixing-bug-incremental-gc.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/04/pypy-v7316-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/05/vmprof-firefox-converter.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/07/finding-simple-rewrite-rules-jit-z3.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/07/mining-jit-traces-missing-optimizations-z3.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/07/toy-abstract-interpretation.html + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/posts/2024/08/conda-forge-proposes-dropping-support-for-pypy.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/08/portaone.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/08/pypy-v7317-release.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/posts/2024/08/toy-knownbits.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/pypy-sponsors.html + 2024-08-31T17:48:00Z + + + https://www.pypy.org/search.html + 2024-08-31T17:48:00Z + + \ No newline at end of file diff --git a/sitemapindex.xml b/sitemapindex.xml new file mode 100644 index 000000000..9f80452a8 --- /dev/null +++ b/sitemapindex.xml @@ -0,0 +1,300 @@ + + + + https://www.pypy.org/authors/alex.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/alexander-schremmer.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/antonio-cuni.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/armin-rigo.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/bea-during.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/benjamin-peterson.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/brian-kearns.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/carl-friedrich-bolz-tereick.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/cf-bolz-tereick.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/christoph-jung.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/david-schneider.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/hakan-ardo.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/hodgestar.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/holger-krekel.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/maciej-fijalkowski.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/mattip.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/max-bernstein.xml + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/authors/michael-foord.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/philip-jenvey.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/richard-plangger.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/romain-guillebert.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/samuele-pedroni.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/the-pypy-team.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/unknown.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/vilhjalmur-thorsteinsson.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/authors/wim-lavrijsen.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/arm.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/casestudy.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/cli.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/compiler.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/conda-forge.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/cpyext.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/cpython.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/ep2008.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/extension-modules.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/gc.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/graalpython.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/guestpost.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/heptapod.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/hpy.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/jit.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/jython.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/kcachegrind.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/meta.xml + 
2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/numpy.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/parser.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/performance.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/profiling.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/pypy.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/pypy3.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/pyqt4.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/release.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/releasecffi.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/releaserevdb.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/releasestm.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/revdb.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/roadmap.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/rpyc.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/smalltalk.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/speed.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/sponsors.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/sprint.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/sprints.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/squeak.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/stm.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/sun.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/testing.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/toy-optimizer.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/unicode.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/valgrind.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/categories/z3.xml + 2024-08-31T17:48:00Z + + + 
https://www.pypy.org/rss.xml + 2024-08-31T17:48:00Z + + + https://www.pypy.org/sitemap.xml + 2024-08-31T17:48:00Z + + \ No newline at end of file